数据清洗

2020-05-29

本文阅读量：2

本文为 Pandas 库的相关知识。

Pandas库的基本使用

用途：针对二维数据表进行数据处理，包括数据项的拆分、过滤、合并，以及后续的统计与可视化

常用方法：

初始化：DataFrame(json，columns，index)
切片：loc按行列名称切；iloc按行列序号切
拼装：concat 按列名匹配合并
矩阵拼接：concatenate（NumPy 方法）按指定轴对数组做简单的行/列拼接
合并：merge 按列名进行关联合并(重点)
导入csv文件：read_csv(文件名，sep=分隔符)
常见数据窥探方法：shape，info，columns，head
value_counts：统计某一列中各个取值出现的次数
数据处理函数：apply（lambda表达式）
求和：sum

# 引入库函数
import pandas as pd  # pandas库
import numpy as np  # 数组处理库
from pandas import DataFrame  # pandas中支持二维数据的库

# Analogy kept from the original notes: pandas is the handyman, and a
# DataFrame is the tool / container he works with.

data = dict(
    city=["北京", "上海", "杭州", "苏州", "深圳"],
    year=[2017, 2018, 2019, 2020, 2018],
    price=[40000, 50000, 20000, 22000, 30000],
)

# Build DataFrames from the dict.
# Plain conversion: default 0..4 row index, columns taken from the keys.
df = DataFrame(data)
# Explicit row labels and column list; "people" is not a key of `data`,
# so that column starts out empty (NaN).
df2 = DataFrame(
    data,
    index=["one", "two", "three", "four", "five"],
    columns=["city", "year", "price", "people"],
)

# Peek at the first rows (head() defaults to 5 rows).
df.head(3)
df2.head()

# Reading a column: attribute access and item access are equivalent here.
df2.city
df2['city']

# Assigning a whole column at once: a scalar is broadcast to every row,
# an array or list must supply one value per row.
df2["people"] = 1
df2["people"] = np.arange(5)
df2["people"] = [6, 5, 4, 3, 2]

# Slicing a 2-D table.
# loc selects by label ...
df2.loc[['one', 'two']]                                # rows only
df2.loc[['one', 'two'], ['city', 'price', 'people']]   # rows and columns
# ... iloc selects by integer position (end-exclusive).
df2.iloc[1:3, 0:2]

# Side note from the original notes (terminology tightened): statically typed
# languages require a variable to be declared before use, whereas dynamically
# typed languages such as Python create a name on its first assignment.

# Dataset concatenation: a small dict mapping 0..9 to 'abc0'..'abc9'.
d = dict((k, 'abc' + str(k)) for k in range(10))

# Matrix concatenation.
# Two 3x3 integer arrays with random entries drawn from [0, 10).
arr = np.random.randint(low=0, high=10, size=(3, 3))
arr2 = np.random.randint(low=0, high=10, size=(3, 3))
# concatenate: axis=0 (the default) stacks the rows -> 6x3 ...
np.concatenate([arr, arr2], axis=0)
# ... axis=1 puts the arrays side by side -> 3x6.
np.concatenate([arr, arr2], axis=1)
# vstack / hstack give the same two results for 2-D input.
np.vstack([arr, arr2])
np.hstack([arr, arr2])

def mk_df(cols, index):
    """Build a demo DataFrame whose cells spell out their own position.

    Args:
        cols: Column labels (strings); each label is also the text prefix
            of every cell in that column.
        index: Row labels; each one is converted with ``str`` and appended
            to the column prefix.

    Returns:
        A DataFrame with the given columns and row index in which the cell
        at row ``i``, column ``c`` holds ``c + str(i)``.
    """
    cells = {}
    for col in cols:
        cells[col] = [col + str(label) for label in index]
    return DataFrame(cells, columns=cols, index=index)

# Three small frames: df2 and df3 share columns A/B with different row
# labels; df4 has columns C/B and the same row labels as df3.
df2, df3, df4 = (
    mk_df(['A', 'B'], [1, 2, 3]),
    mk_df(['A', 'B'], [4, 5, 6]),
    mk_df(['C', 'B'], [4, 5, 6]),
)
# Vertical concatenation (rows appended, columns matched by name).
pd.concat([df2, df3])
pd.concat([df3, df4])
# Discard the original row labels and renumber the result 0..n-1.
pd.concat([df3, df4], ignore_index=True)
# Inner join: keep only the columns present in both frames.
pd.concat([df3, df4], join='inner')
# Outer join: keep the union of the columns.
pd.concat([df3, df4], join='outer')
# Use df4's columns as the reference layout: df3 is reshaped to those
# columns first, then appended.
pd.concat([df4, df3.reindex(columns=df4.columns)])

# Joining datasets on a key column.
d1 = DataFrame(dict(
    employee=["zhang", "li", "wang"],
    dept=["hr", "marketing", "sales"],
))
d2 = DataFrame(dict(
    employee=["zhang", "li", "wang"],
    work_time=[10, 15, 20],
    height=[174, 180, 185],
))
# Same content as d2, but the key column is called "emp".
d3 = DataFrame(dict(
    emp=["zhang", "li", "wang"],
    work_time=[10, 15, 20],
    height=[174, 180, 185],
))
# Both tables name the key column "employee".
pd.merge(d1, d2, on="employee")
# Key is "employee" on the left table and "emp" on the right one.
pd.merge(d1, d3, left_on="employee", right_on="emp")

练习题

数据合并练习

import numpy as np
import pandas as pd
from pandas import Series, DataFrame

raw_data_1 = {
    'subject_id': ['1', '2', '3', '4', '5'],
    'first_name': ['Alex', 'Amy', 'Allen', 'Alice', 'Ayoung'],
    'last_name': ['Anderson', 'Ackerman', 'Ali', 'Aoni', 'Atiches'],
}
raw_data_2 = {
    'subject_id': ['4', '5', '6', '7', '8'],
    'first_name': ['Billy', 'Brian', 'Bran', 'Bryce', 'Betty'],
    'last_name': ['Bonder', 'Black', 'Balwner', 'Brice', 'Btisan'],
}
raw_data_3 = {
    'subject_id': ['1', '2', '3', '4', '5', '7', '8', '9', '10', '11'],
    'test_id': [51, 15, 15, 61, 16, 14, 15, 1, 61, 16],
}

# Turn the three dicts into DataFrames named data1, data2 and data3.
data1 = pd.DataFrame(raw_data_1)
data2 = pd.DataFrame(raw_data_2)
data3 = pd.DataFrame(raw_data_3)

# Combine data1 and data2 along the rows; ignore_index renumbers the
# stacked rows 0..9 instead of repeating 0..4 twice.
all_data = pd.concat((data1, data2), ignore_index=True)
# Combine data1 and data2 along the columns. ignore_index is deliberately
# NOT used here: with axis=1 it would replace the column names by 0..5.
all_data_col = pd.concat((data1, data2), axis=1)

# Merge all_data with data3 on subject_id (inner join by default).
pd.merge(all_data, data3, on='subject_id')

# Inner join of data1 and data2 on subject_id: only ids present in both.
pd.merge(data1, data2, on='subject_id', how='inner')
# Outer join: every id from either side, with NaN where one side is missing.
pd.merge(data1, data2, on='subject_id', how='outer')

基于二手车的数据清洗

import pandas as pd
import numpy as np

data = pd.read_csv('second_cars_info.csv', encoding='gb2312')

# Goal: pull year and month out of Boarding_time. Some rows hold the text
# "未上牌" (not yet registered) instead of a date, so measure their share first.
unregistered = data.Boarding_time == '未上牌'
np.sum(unregistered) / data.shape[0] * 100   # as a percentage
np.sum(unregistered) / data.Boarding_time.size  # as a fraction
# The share is small, so drop those rows. reset_index returns a new frame
# renumbered 0..n-1, so the column assignments below write to an
# independent object rather than to a slice of the original frame
# (avoids SettingWithCopyWarning / lost writes).
data = data[~unregistered].reset_index(drop=True)

# Extract year and month into their own integer columns.
# NOTE(review): the slices assume values shaped like "2015年7月"
# (4-digit year, one separator character, month, one trailing character).
data["year"] = data.Boarding_time.str[0:4].astype("int")
data["month"] = data.Boarding_time.str[5:-1].astype("int")

# Clean the mileage column.
data["Km"].value_counts()
# Drop the trailing 3-character unit and keep the number.
# NOTE(review): presumably the unit is "万公里" - confirm against the data.
data["km_new"] = data.Km.str[:-3]
# Remove rows whose remaining text still contains "百" (not a plain number).
index_new = data.index[~data.km_new.str.contains("百")]
# .copy() again detaches the filtered rows before the dtype conversion below.
data = data.loc[index_new, :].copy()
data[data.km_new.str.contains("百")]   # sanity check: should be empty
data["km_new"] = data.km_new.astype("float")

基于电商订单的数据清洗

# ---- 1. Load data ----
# 1. Library used for the processing
import pandas as pd
# 2. Read the order workbook; the 'id' column becomes the row index.
df = pd.read_excel('order2019.xlsx', index_col='id')
df.describe()


# ---- 2. Extract data ----
# 2.1 Business rule: keep only orders placed during 2019.
# 1. Boundaries of the year
import datetime
startTime = datetime.datetime(year=2019, month=1, day=1)
endTime = datetime.datetime(year=2019, month=12, day=31, hour=23, minute=59, second=59)
# 2. Parse the two timestamp columns into datetime values.
df['orderTime'] = pd.to_datetime(df['orderTime'])
df['payTime'] = pd.to_datetime(df['payTime'])
# 3. Remove orders placed before 2019-01-01.
df.drop(index=df.index[df['orderTime'] < startTime], inplace=True)
# 4. Remove orders placed after 2019-12-31 23:59:59.
df.drop(index=df.index[df['orderTime'] > endTime], inplace=True)

# 2.2 Leave out orders whose payment delay is implausible.
# 1. Seconds elapsed between placing the order and paying for it.
df['payinterval'] = (df['payTime'] - df['orderTime']).dt.total_seconds()
# 2. Remove orders paid more than 30 minutes (1800 s) after ordering,
#    and orders whose payment time precedes the order time.
df.drop(index=df.index[df['payinterval'] > 1800], inplace=True)
df.drop(index=df.index[df['payinterval'] < 0], inplace=True)

# 2.3 Leave out rows with a negative order amount or a negative payment.
df.drop(index=df.index[df['orderAmount'] < 0], inplace=True)
df.drop(index=df.index[df['payment'] < 0], inplace=True)


# ---- 3. Clean data ----
# 3.1 Order IDs: duplicated() flags every repeat after the first occurrence,
#     so only the first row of each orderID survives.
df.drop(index=df[df.orderID.duplicated()].index, inplace=True)
df.orderID.unique().size

# 3.2 Product IDs: count, then remove, rows carrying the code 'PR000000'.
# NOTE(review): presumably a placeholder / test product code - confirm.
df.goodsID[df.goodsID == 'PR000000'].size
df.drop(index=df[df.goodsID == 'PR000000'].index, inplace=True)

# 3.3 Channel
# 1. Inspect missing values.
df.info()
df[df.chanelID.isnull()]
# 2. Fill missing channels with the most frequent channel. The result is
#    assigned back explicitly: calling fillna(inplace=True) on df['chanelID']
#    operates on an intermediate object and is not guaranteed to update df.
df['chanelID'] = df['chanelID'].fillna(df['chanelID'].mode()[0])

# 3.4 Platform type: strip embedded spaces (literal match, not a regex).
df['platfromType'] = df['platfromType'].str.replace(" ", "", regex=False)
df.platfromType.unique()

# 3.5 Payment amount
# 1. Discount = amount actually paid / order amount.
df['discount'] = df.payment / df.orderAmount
df.describe()
# 2. Average discount over the rows whose discount is plausible (<= 1).
meanDiscount = df.loc[df['discount'] <= 1, 'discount'].mean()
meanDiscount
# 3. Blank out payments that exceed the order amount (discount > 1) ...
df['payment'] = df['payment'].mask(df['discount'] > 1)
# 4. ... and fill every blank payment with orderAmount * average discount.
#    Assigned back explicitly for the same reason as in 3.3.
df['payment'] = df['payment'].fillna(df.orderAmount * meanDiscount)
df.describe()
# 5. Recompute the discount from the repaired payments and round it to two
#    decimals. The original wrote to a misspelled 'dicsount' column, which
#    left 'discount' unrounded and still holding the > 1 values.
df['discount'] = (df['payment'] / df['orderAmount']).round(2)


# ---- 4. Analyse data ----
# 4.1 Overall sales figures (monetary values shown in units of 10,000).
# 1. GMV: sum of all order amounts.
df['orderAmount'].sum() / 10000
# 2. Total amount paid.
df['payment'].sum() / 10000
# 3. Amount actually realised: payments on orders that were not charged back.
df.loc[df['chargeback'] == '否', 'payment'].sum() / 10000
# 4. Number of distinct orders.
len(df['orderID'].unique())
# 5. Number of charged-back (returned) orders.
len(df.loc[df['chargeback'] == '是', 'orderID'])
# 6. Return rate: charged-back orders / distinct orders.
len(df.loc[df['chargeback'] == '是', 'orderID']) / len(df['orderID'].unique())
# 7. Number of distinct users.
len(df['userID'].unique())

'''4.2 销售情况，各月份GMV/成交额趋势'''
# 翻转维度，以月份为坐标轴
# 1.月份
# df.字段名  获取
# df['字段名']  写入
df['month']=df['orderTime'].dt.month
df
# 2.绘制图
import matplotlib.pyplot as plt
from matplotlib import font_manager
%matplotlib inline
my_font=font_manager.FontProperties(fname='C:\Windows\Fonts\msyh.ttc',size=12)
# 图像
plt.figure(figsize=(15,9))
plt.grid(alpha=0.4)
x=df.groupby('month').sum().index
# GMV
y1=df.groupby('month')['orderAmount'].sum().values/10000
# 实际付款
y2=df.groupby('month')['payment'].sum().values/10000
# 不含退单销售额
y3=df[df.chargeback=='否'].groupby('month')['payment'].sum().values/10000
# x轴
x_ticks_label=["{}月份".format(i) for i in x]
plt.xticks(x,x_ticks_label,rotation=45,fontproperties=my_font)
# plot 折线图
plt.plot(x,y1,label='GMV',color='red',marker='o')
plt.plot(x,y2,label='销售额',color='green',marker='*')
plt.plot(x,y3,label='不含退单',color='blue',marker='.')
plt.xlabel('月份',fontproperties=my_font)
plt.ylabel('销售额万元',fontproperties=my_font)
plt.title('销售额走势',fontproperties=my_font,size=20)
# 折点坐标
for a,b in zip(x,y1):
    plt.annotate('(%.2f)'%(b),xy=(a,b),xytext=(-20,10),textcoords='offset points')
for a,b in zip(x,y2):
    plt.annotate('(%.2f)'%(b),xy=(a,b),xytext=(-20,10),textcoords='offset points')
for a,b in zip(x,y3):
    plt.annotate('(%.2f)'%(b),xy=(a,b),xytext=(-20,10),textcoords='offset points')
# 图例
plt.legend(prop=my_font,loc='upper left')
plt.show()

# 4.3 Traffic channel analysis.
# Number of rows with a non-null userID per channel. Note this counts one
# per row of df, not one per distinct user.
custom = df['userID'].groupby(df['chanelID']).count()
# Font able to render the Chinese labels.
plt.rcParams.update({'font.sans-serif': ['SimHei']})
custom.plot(kind='pie', figsize=(10, 9), labels=custom.index, autopct='%.1f%%')
plt.title('各渠道来源用户占比')

# 4.4 User behaviour: orders per day of the week.
# dt.dayofweek numbers the days 0 (Monday) .. 6 (Sunday).
df['dayofweek'] = df['orderTime'].dt.dayofweek
df['dayofweek'].unique()
# Orders per weekday, forced onto all seven days: a weekday with no orders
# gets 0, so bar i always lines up with label i below.
week = df.groupby('dayofweek')['orderID'].count().reindex(range(7), fill_value=0)
weekX = ['周一', '周二', '周三', '周四', '周五', '周六', '周日']
weekY = week.values
plt.xticks(range(len(weekX)), weekX, fontproperties=my_font)
# Bar chart
rects = plt.bar(range(len(weekY)), weekY, width=0.5, color=['b'])
# Print each bar's value just above it, centred horizontally.
for rect in rects:
    height = rect.get_height()
    plt.text(rect.get_x() + rect.get_width() / 2, height + 1.5, str(height), ha="center")
plt.show()

# 4.5 User behaviour: orders per half-hour slot of the day.
# Work on a copy because orderTime is overwritten with a text label.
df1 = df.copy()
# Start of the 30-minute slot each order falls into. '30min' is used instead
# of the deprecated '30T' alias.
s = df1['orderTime'].dt.floor('30min')
# Slot label such as "09:30-09:59": slot start, then slot start + 29 minutes.
df1['orderTime'] = (
    s.dt.strftime('%H:%M')
    + '-'
    + (s + pd.Timedelta(minutes=29)).dt.strftime('%H:%M')
)
df1
# Number of orders in each slot (the date part is ignored by the label).
timedf = df1.groupby('orderTime')['orderID'].count()
timedf
timedfX = timedf.index
timedfY = timedf.values
plt.figure(figsize=(20, 8), dpi=80)
plt.style.use('ggplot')
plt.xticks(range(len(timedfX)), timedfX, rotation=90)
rect = plt.bar(timedfX, timedfY, width=0.5)

# 4.6 User behaviour: customer overview.
# Average spend per customer: total order amount / number of distinct users.
df['orderAmount'].sum() / len(df['userID'].unique())
# Split the userID text; the remainder is used as an index later on.
# First four characters go to a separate 'userid' column.
# NOTE(review): presumably userID looks like "user-123456" - confirm.
df['userid'] = df['userID'].str.slice(0, 4)
df['userid']
# Keep only the part from the sixth character onward (the numeric part,
# per the original note).
df['userID'] = df['userID'].str.slice(5)

# 4.7 Repurchase rate.
# Pivot the order times: one row per userID, one column per month, each cell
# holding that user's number of orders in that month (0 when there are none).
pivoted_counts = df.pivot_table(index='userID', columns='month', values='orderTime', aggfunc='count').fillna(0)
pivoted_counts.head()

# Repurchase rate = users with two or more orders inside the time window
# divided by all users who ordered in that window. The window is one month;
# two orders on the same day also count as a repurchase.
# Encode each cell as 1 (two or more orders), 0 (exactly one order) or
# NaN (no order).
import numpy as np
# Vectorised instead of applymap + np.NaN: applymap is deprecated in recent
# pandas and the np.NaN alias no longer exists in NumPy 2.
pcRepeatBuy = (
    pivoted_counts.gt(1)
    .astype(float)
    .mask(pivoted_counts.eq(0), np.nan)
)
pcRepeatBuy
# Per month: repeat buyers (sum of the 1s) / users who ordered (non-NaN cells).
(pcRepeatBuy.sum() / pcRepeatBuy.count()).plot(figsize=(20, 9))

# 4.8 Customer RFM model.
# R (recency): days between the reference date and the user's last order.
#     Small R -> ordered recently; large R -> dormant / possibly churned.
# F (frequency): number of orders. Large -> active and loyal; small -> inactive.
# M (monetary): amount spent by the user. Larger is better.
customdf = df.copy()
# Charged-back orders do not count towards RFM.
customdf.drop(index=df.index[df['chargeback'] == '是'], inplace=True)
customdf['orderTime'] = pd.to_datetime(customdf['orderTime'], format='%Y-%m-%d')
customdf
# Move userID from a column into the row index.
customdf.set_index('userID', inplace=True)
customdf.count()
# Every row is one order; summing this column per user gives the frequency.
customdf['orders'] = 1
customdf
# One row per user: latest order time, total order amount, number of orders.
rfmdf = customdf.pivot_table(
    index=['userID'],
    values=['orderAmount', 'orderTime', 'orders'],
    aggfunc=dict(orderTime='max', orderAmount='sum', orders='sum'),
)
# Recency in whole days, measured from the most recent order in the data set.
rfmdf['R'] = (rfmdf['orderTime'].max() - rfmdf['orderTime']).dt.days
# Rename to the conventional letters and keep only R, F, M in that order.
rfmdf = rfmdf.rename(columns={'orderAmount': 'M', 'orders': 'F'})[['R', 'F', 'M']]

# Preview: each column centred on its own mean.
rfmdf.sub(rfmdf.mean(), axis='columns')
# Classify one user and return the segment label.
def rfm_func(x):
    """Map one user's mean-centred R/F/M values to a customer segment.

    Args:
        x: A row (pandas Series) with fields ``R``, ``F`` and ``M``, each
            already reduced by the column mean, so the sign says whether the
            user is above or below average.

    Returns:
        The Chinese segment name for the three-flag code, e.g. ``'111'`` ->
        ``'重要价值客户'``.

    Each flag is ``'1'`` when the user is on the *good* side of the average.
    For F and M that means at or above the mean. R counts days since the last
    order, so its good side is at or below the mean; the original code used
    ``>= 0`` for R as well, which labelled long-inactive users as recent.
    """
    r_flag = '1' if x.R <= 0 else '0'   # recent buyer: fewer days than average
    f_flag = '1' if x.F >= 0 else '0'   # orders at least as often as average
    m_flag = '1' if x.M >= 0 else '0'   # spends at least as much as average
    label = r_flag + f_flag + m_flag
    d = {
        '111': '重要价值客户',
        '011': '重要保持客户',
        '101': '重要发展客户',
        '001': '重要挽留客户',
        '110': '一般价值客户',
        '010': '一般保持客户',
        '100': '一般发展客户',
        '000': '一般挽留客户'
    }
    return d[label]
# Centre R, F and M on their column means, then classify every user row.
rfmdf['label'] = rfmdf.sub(rfmdf.mean(), axis='columns').apply(rfm_func, axis=1)
# Number of users in each segment.
rfmdf.groupby('label').count()

# Bar chart of segment sizes, with horizontal tick labels in the CJK font.
rfmdf['label'].value_counts().plot(kind='bar', figsize=(20, 9))
plt.xticks(rotation=0, fontproperties=my_font)