像Excel一样使用SQL进行数据分析
<p>Excel是数据分析中最常用的工具 ,利用Excel可以完成数据清洗,预处理,以及最常见的数据分类,数据筛选,分类汇总,以及数据透视等操作,而这些操作用SQL一样可以实现。SQL不仅可以从数据库中读取数据,还能通过不同的SQL函数语句直接返回所需要的结果,从而大大提高了自己在客户端应用程序中计算的效率。</p>
<p>
<img title="像Excel一样使用SQL进行数据分析" alt="像Excel一样使用SQL进行数据分析" border="0" src="https://zhuji.jb51.net/uploads/img/202305/13d1f360828b18b14578de9b191c025f.jpg"></p>
<h3>
1 重复数据处理</h3>
<p>
查找重复记录</p>
<ol class="dp-sql">
<li class="alt">
<span><span class="keyword">SELECT</span><span> * </span><span class="keyword">FROM</span><span> </span><span class="func">user</span><span> </span></span>
</li>
<li>
<span><span class="keyword">Where</span><span> (nick_name,</span><span class="keyword">password</span><span>) </span><span class="op">in</span><span> </span></span>
</li>
<li class="alt">
<span>( </span>
</li>
<li>
<span><span class="keyword">SELECT</span><span> nick_name,</span><span class="keyword">password</span><span> </span></span>
</li>
<li class="alt">
<span><span class="keyword">FROM</span><span> </span><span class="func">user</span><span> </span></span>
</li>
<li>
<span><span class="keyword">group</span><span> </span><span class="keyword">by</span><span> nick_name,</span><span class="keyword">password</span><span> </span></span>
</li>
<li class="alt">
<span><span class="keyword">having</span><span> </span><span class="func">count</span><span>(nick_name)>1 </span></span>
</li>
<li>
<span>); </span>
</li>
</ol>
<p>
查找去重记录</p>
<p>
查找id最大的记录</p>
<ol class="dp-sql">
<li class="alt">
<span><span class="keyword">SELECT</span><span> * </span><span class="keyword">FROM</span><span> </span><span class="func">user</span><span> </span></span>
</li>
<li>
<span><span class="keyword">WHERE</span><span> id </span><span class="op">in</span><span> </span></span>
</li>
<li class="alt">
<span>(<span class="keyword">SELECT</span><span> </span><span class="keyword">max</span><span>(id) </span><span class="keyword">FROM</span><span> </span><span class="func">user</span><span> </span></span>
</li>
<li>
<span><span class="keyword">group</span><span> </span><span class="keyword">by</span><span> nick_name,</span><span class="keyword">password</span><span> </span></span>
</li>
<li class="alt">
<span><span class="keyword">having</span><span> </span><span class="func">count</span><span>(nick_name)>1 </span></span>
</li>
<li>
<span>); </span>
</li>
</ol>
<p>
删除重复记录</p>
<p>
只保留id值最小的记录</p>
<ol class="dp-sql">
<li class="alt">
<span><span class="keyword">DELETE</span><span> c1 </span></span>
</li>
<li>
<span><span class="keyword">FROM</span><span> customer c1,customer c2 </span></span>
</li>
<li class="alt">
<span><span class="keyword">WHERE</span><span> c1.cust_email=c2.cust_email </span></span>
</li>
<li>
<span><span class="op">AND</span><span> c1.id>c2.id; </span></span>
</li>
<li class="alt">
<span><span class="keyword">DELETE</span><span> </span><span class="keyword">FROM</span><span> </span><span class="func">user</span><span> </span><span class="keyword">Where</span><span> (nick_name,</span><span class="keyword">password</span><span>) </span><span class="op">in</span><span> </span></span>
</li>
<li>
<span>(<span class="keyword">SELECT</span><span> nick_name,</span><span class="keyword">password</span><span> </span><span class="keyword">FROM</span><span> </span></span>
</li>
<li class="alt">
<span>(<span class="keyword">SELECT</span><span> nick_name,</span><span class="keyword">password</span><span> </span><span class="keyword">FROM</span><span> </span><span class="func">user</span><span> </span></span>
</li>
<li>
<span><span class="keyword">group</span><span> </span><span class="keyword">by</span><span> nick_name,</span><span class="keyword">password</span><span> </span></span>
</li>
<li class="alt">
<span><span class="keyword">having</span><span> </span><span class="func">count</span><span>(nick_name)>1) </span><span class="keyword">as</span><span> tmp1 </span></span>
</li>
<li>
<span>) </span>
</li>
<li class="alt">
<span><span class="op">and</span><span> id </span><span class="op">not</span><span> </span><span class="op">in</span><span> </span></span>
</li>
<li>
<span>(<span class="keyword">SELECT</span><span> id </span><span class="keyword">FROM</span><span> </span></span>
</li>
<li class="alt">
<span>(<span class="keyword">SELECT</span><span> </span><span class="keyword">min</span><span>(id) id </span><span class="keyword">FROM</span><span> </span><span class="func">user</span><span> </span></span>
</li>
<li>
<span><span class="keyword">group</span><span> </span><span class="keyword">by</span><span> nick_name,</span><span class="keyword">password</span><span> </span></span>
</li>
<li class="alt">
<span><span class="keyword">having</span><span> </span><span class="func">count</span><span>(nick_name)>1) </span><span class="keyword">as</span><span> tmp2 </span></span>
</li>
<li>
<span>); </span>
</li>
</ol>
<h3>
2 缺失值处理</h3>
<p>
查找缺失值记录</p>
<ol class="dp-sql">
<li class="alt">
<span><span class="keyword">SELECT</span><span> * </span><span class="keyword">FROM</span><span> customer </span></span>
</li>
<li>
<span><span class="keyword">WHERE</span><span> cust_email </span><span class="keyword">IS</span><span> </span><span class="op">NULL</span><span>; </span></span>
</li>
</ol>
<p>
更新列填充空值</p>
<ol class="dp-sql">
<li class="alt">
<span><span class="keyword">UPDATE</span><span> sale </span><span class="keyword">set</span><span> city = </span><span class="string">"未知"</span><span> </span></span>
</li>
<li>
<span><span class="keyword">WHERE</span><span> city </span><span class="keyword">IS</span><span> </span><span class="op">NULL</span><span>; </span></span>
</li>
<li class="alt">
</li>
<li>
<span><span class="keyword">UPDATE</span><span> orderitems </span><span class="keyword">set</span><span> </span></span>
</li>
<li class="alt">
<span>price_new=IFNULL(price_new,5.74); </span>
</li>
</ol>
<p>
查询并填充空值列</p>
<ol class="dp-sql">
<li class="alt">
<span><span class="keyword">SELECT</span><span> </span><span class="func">AVG</span><span>(price_new) </span><span class="keyword">FROM</span><span> orderitems; </span></span>
</li>
<li>
</li>
<li class="alt">
<span><span class="keyword">SELECT</span><span> IFNULL(price_new,5.74) </span><span class="keyword">AS</span><span> bus_ifnull </span></span>
</li>
<li>
<span><span class="keyword">FROM</span><span> orderitems; </span></span>
</li>
</ol>
<h3>
3 计算列</h3>
<p>
更新表添加计算列</p>
<ol class="dp-sql">
<li class="alt">
<span><span class="keyword">ALTER</span><span> </span><span class="keyword">TABLE</span><span> orderitems </span><span class="keyword">ADD</span><span> price_new </span><span class="keyword">DECIMAL</span><span>(8,2) </span><span class="op">NOT</span><span> </span><span class="op">NULL</span><span>; </span></span>
</li>
<li>
</li>
<li class="alt">
<span><span class="keyword">UPDATE</span><span> orderitems </span><span class="keyword">set</span><span> price_new= item_price*</span><span class="func">count</span><span>; </span></span>
</li>
</ol>
<p>
查询计算列</p>
<ol class="dp-sql">
<li class="alt">
<span><span class="keyword">SELECT</span><span> item_price*</span><span class="func">count</span><span> </span><span class="keyword">as</span><span> sales </span><span class="keyword">FROM</span><span> orderitems; </span></span>
</li>
</ol>
<h3>
4 排序</h3>
<p>
多列排序</p>
<ol class="dp-sql">
<li class="alt">
<span><span class="keyword">SELECT</span><span> * </span><span class="keyword">FROM</span><span> orderitems </span></span>
</li>
<li>
<span><span class="keyword">ORDER</span><span> </span><span class="keyword">BY</span><span> price_new </span><span class="keyword">DESC</span><span>,quantity; </span></span>
</li>
</ol>
<p>
查询排名前几的记录</p>
<ol class="dp-sql">
<li class="alt">
<span><span class="keyword">SELECT</span><span> * </span><span class="keyword">FROM</span><span> orderitems </span></span>
</li>
<li>
<span><span class="keyword">ORDER</span><span> </span><span class="keyword">BY</span><span> price_new </span><span class="keyword">DESC</span><span> LIMIT 5; </span></span>
</li>
</ol>
<p>
查询第10大的值</p>
<ol class="dp-sql">
<li class="alt">
<span><span class="keyword">SELECT</span><span> </span><span class="keyword">DISTINCT</span><span> price_new </span></span>
</li>
<li>
<span><span class="keyword">FROM</span><span> orderitems </span></span>
</li>
<li class="alt">
<span><span class="keyword">ORDER</span><span> </span><span class="keyword">BY</span><span> price_new </span><span class="keyword">DESC</span><span> LIMIT 9,1; </span></span>
</li>
</ol>
<p>
排名</p>
<p>
数值相同的排名相同且排名连续</p>
<ol class="dp-sql">
<li class="alt">
<span><span class="keyword">SELECT</span><span> prod_price, </span></span>
</li>
<li>
<span>(<span class="keyword">SELECT</span><span> </span><span class="func">COUNT</span><span>(</span><span class="keyword">DISTINCT</span><span> prod_price) </span></span>
</li>
<li class="alt">
<span><span class="keyword">FROM</span><span> products </span></span>
</li>
<li>
<span><span class="keyword">WHERE</span><span> prod_price>=a.prod_price </span></span>
</li>
<li class="alt">
<span>) <span class="keyword">AS</span><span> rank </span></span>
</li>
<li>
<span><span class="keyword">FROM</span><span> products </span><span class="keyword">AS</span><span> a </span></span>
</li>
<li class="alt">
<span><span class="keyword">ORDER</span><span> </span><span class="keyword">BY</span><span> rank ; </span></span>
</li>
</ol>
<h3>
5 字符串处理</h3>
<p>
字符串替换</p>
<ol class="dp-sql">
<li class="alt">
<span><span class="keyword">UPDATE</span><span> data1 </span><span class="keyword">SET</span><span> city=</span><span class="func">REPLACE</span><span>(city,</span><span class="string">'SH'</span><span>,</span><span class="string">'shanghai'</span><span>); </span></span>
</li>
<li>
</li>
<li class="alt">
<span><span class="keyword">SELECT</span><span> city </span><span class="keyword">FROM</span><span> data1; </span></span>
</li>
</ol>
<p>
按位置字符串截取</p>
<p>
字符串截取可用于数据分列</p>
<p>
MySQL 字符串截取函数:left(), right(), substring(), substring_index()</p>
<ol class="dp-sql">
<li class="alt">
<span><span class="keyword">SELECT</span><span> </span><span class="func">left</span><span>(</span><span class="string">'example.com'</span><span>, 3); </span></span>
</li>
</ol>
<p>
从字符串的第 4 个字符位置开始取,直到结束</p>
<ol class="dp-sql">
<li class="alt">
<span><span class="keyword">SELECT</span><span> </span><span class="func">substring</span><span>(</span><span class="string">'example.com'</span><span>, 4); </span></span>
</li>
</ol>
<p>
从字符串的第 4 个字符位置开始取,只取 2 个字符</p>
<ol class="dp-sql">
<li class="alt">
<span><span class="keyword">SELECT</span><span> </span><span class="func">substring</span><span>(</span><span class="string">'example.com'</span><span>, 4, 2); </span></span>
</li>
</ol>
<p>
按关键字截取字符串</p>
<p>
取第一个分隔符之前的所有字符,结果是www</p>
<ol class="dp-sql">
<li class="alt">
<span><span class="keyword">SELECT</span><span> substring_index(</span><span class="string">'www.google.com'</span><span>,</span><span class="string">'.'</span><span>,1); </span></span>
</li>
</ol>
<p>
取倒数第二个分隔符之后的所有字符,结果是google.com;</p>
<ol class="dp-sql">
<li class="alt">
<span><span class="keyword">SELECT</span><span> substring_index(</span><span class="string">'www.google.com'</span><span>,</span><span class="string">'.'</span><span>,-2); </span></span>
</li>
</ol>
<h3>
6 筛选</h3>
<p>
通过操作符实现高级筛选</p>
<p>
使用 AND OR IN NOT 等操作符实现高级筛选过滤</p>
<ol class="dp-sql">
<li class="alt">
<span><span class="keyword">SELECT</span><span> prod_name,prod_price </span><span class="keyword">FROM</span><span> Products </span></span>
</li>
<li>
<span><span class="keyword">WHERE</span><span> vend_id </span><span class="op">IN</span><span>(</span><span class="string">'DLL01'</span><span>,</span><span class="string">'BRS01'</span><span>); </span></span>
</li>
<li class="alt">
<span><span class="keyword">SELECT</span><span> prod_name </span><span class="keyword">FROM</span><span> Products </span><span class="keyword">WHERE</span><span> </span><span class="op">NOT</span><span> vend_id=</span><span class="string">'DLL01'</span><span>; </span></span>
</li>
</ol>
<p>
通配符筛选</p>
<p>
常用通配符有% _ [] ^</p>
<ol class="dp-sql">
<li class="alt">
<span><span class="keyword">SELECT</span><span> * </span><span class="keyword">from</span><span> customers </span><span class="keyword">WHERE</span><span> country </span><span class="op">LIKE</span><span> </span><span class="string">"CH%"</span><span>; </span></span>
</li>
</ol>
<h3>
7 表联结</h3>
<p>
SQL表连接可以实现类似于Excel中的Vlookup函数的功能</p>
<ol class="dp-sql">
<li class="alt">
<span><span class="keyword">SELECT</span><span> vend_id,prod_name,prod_price </span></span>
</li>
<li>
<span><span class="keyword">FROM</span><span> Vendors </span><span class="keyword">INNER</span><span> </span><span class="op">JOIN</span><span> Products </span></span>
</li>
<li class="alt">
<span><span class="keyword">ON</span><span> Vendors.vend_id=Products.vend_id; </span></span>
</li>
<li>
</li>
<li class="alt">
<span><span class="keyword">SELECT</span><span> prod_name,vend_name,prod_price,quantity </span></span>
</li>
<li>
<span><span class="keyword">FROM</span><span> OderItems,Products,Vendors </span></span>
</li>
<li class="alt">
<span><span class="keyword">WHERE</span><span> Products.vend_id=Vendors.vend_id </span></span>
</li>
<li>
<span><span class="op">AND</span><span> OrderItems.prod_id=Products.prod_id </span></span>
</li>
<li class="alt">
<span><span class="op">AND</span><span> order_num=20007; </span></span>
</li>
</ol>
<p>
自联结 在一条SELECT语句中多次使用相同的表</p>
<ol class="dp-sql">
<li class="alt">
<span><span class="keyword">SELECT</span><span> c1.cust_od,c1.cust_name,c1.cust_contact </span></span>
</li>
<li>
<span><span class="keyword">FROM</span><span> Customers </span><span class="keyword">as</span><span> c1,Customers </span><span class="keyword">as</span><span> c2 </span></span>
</li>
<li class="alt">
<span><span class="keyword">WHERE</span><span> c1.cust_name=c2.cust_name </span></span>
</li>
<li>
<span><span class="op">AND</span><span> c2.cust_contact=</span><span class="string">'Jim Jones'</span><span>; </span></span>
</li>
</ol>
<h3>
8 数据透视</h3>
<p>
数据分组可以实现Excel中数据透视表的功能</p>
<p>
数据分组</p>
<p>
group by 用于数据分组 having 用于分组后数据的过滤</p>
<ol class="dp-sql">
<li class="alt">
<span><span class="keyword">SELECT</span><span> order_num,</span><span class="func">COUNT</span><span>(*) </span><span class="keyword">as</span><span> items </span></span>
</li>
<li>
<span><span class="keyword">FROM</span><span> OrderItems </span></span>
</li>
<li class="alt">
<span><span class="keyword">GROUP</span><span> </span><span class="keyword">BY</span><span> order_num </span><span class="keyword">HAVING</span><span> </span><span class="func">COUNT</span><span>(*)>=3; </span></span>
</li>
</ol>
<p>
交叉表</p>
<p>
通过CASE WHEN函数实现</p>
<ol class="dp-sql">
<li class="alt">
<span><span class="keyword">SELECT</span><span> data1.city, </span></span>
</li>
<li>
<span><span class="func">CASE</span><span> </span><span class="keyword">WHEN</span><span> colour = </span><span class="string">"A"</span><span> </span><span class="keyword">THEN</span><span> price </span><span class="keyword">END</span><span> </span><span class="keyword">AS</span><span> A, </span></span>
</li>
<li class="alt">
<span><span class="func">CASE</span><span> </span><span class="keyword">WHEN</span><span> colour = </span><span class="string">"B"</span><span> </span><span class="keyword">THEN</span><span> price </span><span class="keyword">END</span><span> </span><span class="keyword">AS</span><span> B, </span></span>
</li>
<li>
<span><span class="func">CASE</span><span> </span><span class="keyword">WHEN</span><span> colour = </span><span class="string">"C"</span><span> </span><span class="keyword">THEN</span><span> price </span><span class="keyword">END</span><span> </span><span class="keyword">AS</span><span> C, </span></span>
</li>
<li class="alt">
<span><span class="func">CASE</span><span> </span><span class="keyword">WHEN</span><span> colour = </span><span class="string">"F"</span><span> </span><span class="keyword">THEN</span><span> price </span><span class="keyword">END</span><span> </span><span class="keyword">AS</span><span> F </span></span>
</li>
<li>
<span><span class="keyword">FROM</span><span> data1 </span></span>
</li>
</ol>
<p>
原文链接:https://mp.weixin.qq.com/s?__biz=MzA3MTg4NjY4Mw==&mid=2457324446&idx=2&sn=9cbf3a9184e66e115c867a3326ca88e9&chksm=88a5d9aabfd250bc3f12aca480686547379692d25efc1e3b223670f27c1a8473f2329729dc9f&mpshare=1&</p>
頁:
[1]