In [17]:
import pandas as pd
import numpy as np
df=pd.DataFrame({"id":[1001,1002,1003,1004,1005,1006],
"date":pd.date_range('20130102', periods=6),
"city":['Beijing ', 'SH', ' guangzhou ', 'Shenzhen', 'shanghai', 'BEIJING '],
"age":[23,44,54,32,34,32],
"category":['100-A','100-B','110-A','110-C','210-A','130-F'],
"price":[1200,np.nan,2133,5433,np.nan,4432]},columns =['id','date','city','category','age','price'])
df
Out[17]:
id date city category age price
0 1001 2013-01-02 Beijing 100-A 23 1200.0
1 1002 2013-01-03 SH 100-B 44 NaN
2 1003 2013-01-04 guangzhou 110-A 54 2133.0
3 1004 2013-01-05 Shenzhen 110-C 32 5433.0
4 1005 2013-01-06 shanghai 210-A 34 NaN
5 1006 2013-01-07 BEIJING 130-F 32 4432.0
In [18]:
df1=pd.DataFrame({"id":[1001,1002,1003,1004,1005,1006,1007,1008],
"gender":['male','female','male','female','male','female','male','female'],
"pay":['Y','N','Y','Y','N','Y','N','Y',],
"m-point":[10,12,20,40,40,40,30,20]})
df1
Out[18]:
id gender pay m-point
0 1001 male Y 10
1 1002 female N 12
2 1003 male Y 20
3 1004 female Y 40
4 1005 male N 40
5 1006 female Y 40
6 1007 male N 30
7 1008 female Y 20
In [19]:
df_inner=pd.merge(left=df,right=df1,how='inner',on='id')
df_inner
Out[19]:
id date city category age price gender pay m-point
0 1001 2013-01-02 Beijing 100-A 23 1200.0 male Y 10
1 1002 2013-01-03 SH 100-B 44 NaN female N 12
2 1003 2013-01-04 guangzhou 110-A 54 2133.0 male Y 20
3 1004 2013-01-05 Shenzhen 110-C 32 5433.0 female Y 40
4 1005 2013-01-06 shanghai 210-A 34 NaN male N 40
5 1006 2013-01-07 BEIJING 130-F 32 4432.0 female Y 40
In [20]:
# 数据清洗,city全部转换为小写,并去除空格
df_inner['city'] = df_inner['city'].str.lower()
df_inner['city'] = df_inner['city'].map(str.strip)
df_inner
Out[20]:
id date city category age price gender pay m-point
0 1001 2013-01-02 beijing 100-A 23 1200.0 male Y 10
1 1002 2013-01-03 sh 100-B 44 NaN female N 12
2 1003 2013-01-04 guangzhou 110-A 54 2133.0 male Y 20
3 1004 2013-01-05 shenzhen 110-C 32 5433.0 female Y 40
4 1005 2013-01-06 shanghai 210-A 34 NaN male N 40
5 1006 2013-01-07 beijing 130-F 32 4432.0 female Y 40
In [21]:
# 数据清洗,将date设置为索引
df_inner.set_index(keys=['date'],inplace=True)
df_inner
Out[21]:
id city category age price gender pay m-point
date
2013-01-02 1001 beijing 100-A 23 1200.0 male Y 10
2013-01-03 1002 sh 100-B 44 NaN female N 12
2013-01-04 1003 guangzhou 110-A 54 2133.0 male Y 20
2013-01-05 1004 shenzhen 110-C 32 5433.0 female Y 40
2013-01-06 1005 shanghai 210-A 34 NaN male N 40
2013-01-07 1006 beijing 130-F 32 4432.0 female Y 40
In [22]:
# 使用与条件进行筛选,条件是年龄大于25岁,并且城市为beijing
df_inner.loc[(df_inner['age']>25) & (df_inner['city']=='beijing')] # 注意运算的优先级
Out[22]:
id city category age price gender pay m-point
date
2013-01-07 1006 beijing 130-F 32 4432.0 female Y 40
In [28]:
# 使用“或”条件进行筛选,年龄大于25岁或城市为beijing,并按age升序排列
df_inner.loc[(df_inner['age']>25) | (df_inner['city']=='beijing')].sort_values(['age'])
Out[28]:
id city category age price gender pay m-point
date
2013-01-02 1001 beijing 100-A 23 1200.0 male Y 10
2013-01-05 1004 shenzhen 110-C 32 5433.0 female Y 40
2013-01-07 1006 beijing 130-F 32 4432.0 female Y 40
2013-01-06 1005 shanghai 210-A 34 NaN male N 40
2013-01-03 1002 sh 100-B 44 NaN female N 12
2013-01-04 1003 guangzhou 110-A 54 2133.0 male Y 20
In [37]:
df_inner.loc[(df_inner['age']>25) | (df_inner['city']=='beijing')].sort_values(['age'])['price'].sum() # price求和
Out[37]:
13198.0
In [38]:
df_inner.loc[(df_inner['age']>25) | (df_inner['city']=='beijing')].sort_values(['age'])['city'].count() # city计数
Out[38]:
6
In [45]:
# 还有一种筛选的方式是用query函数。下面是具体的代码和筛选结果。
df_inner.query('city == ["beijing","shanghai"]')
Out[45]:
id city category age price gender pay m-point
date
2013-01-02 1001 beijing 100-A 23 1200.0 male Y 10
2013-01-06 1005 shanghai 210-A 34 NaN male N 40
2013-01-07 1006 beijing 130-F 32 4432.0 female Y 40
In [50]:
# 分类汇总groupby
df_inner.groupby(by='city').count()
Out[50]:
id category age price gender pay m-point
city
beijing 2 2 2 2 2 2 2
guangzhou 1 1 1 1 1 1 1
sh 1 1 1 0 1 1 1
shanghai 1 1 1 0 1 1 1
shenzhen 1 1 1 1 1 1 1
In [51]:
# 对特定的列进行计数汇总
df_inner.groupby(by='city')['id'].count()
Out[51]:
city
beijing      2
guangzhou    1
sh           1
shanghai     1
shenzhen     1
Name: id, dtype: int64
In [53]:
df_inner.groupby(by=['city','gender']).count()
Out[53]:
id category age price pay m-point
city gender
beijing female 1 1 1 1 1 1
male 1 1 1 1 1 1
guangzhou male 1 1 1 1 1 1
sh female 1 1 1 0 1 1
shanghai male 1 1 1 0 1 1
shenzhen female 1 1 1 1 1 1
In [58]:
# 对汇总后的数据同时按多个维度进行计算
df_inner.groupby(by=['city','gender']).agg([len,np.sum,np.mean])
Out[58]:
id age price m-point
len sum mean len sum mean len sum mean len sum mean
city gender
beijing female 1 1006 1006 1 32 32 1.0 4432.0 4432.0 1 40 40
male 1 1001 1001 1 23 23 1.0 1200.0 1200.0 1 10 10
guangzhou male 1 1003 1003 1 54 54 1.0 2133.0 2133.0 1 20 20
sh female 1 1002 1002 1 44 44 1.0 0.0 NaN 1 12 12
shanghai male 1 1005 1005 1 34 34 1.0 0.0 NaN 1 40 40
shenzhen female 1 1004 1004 1 32 32 1.0 5433.0 5433.0 1 40 40
In [60]:
# 数据透视表
df_pivot = pd.pivot_table(df_inner,index='city',values=['price'],columns=['gender'],aggfunc=[len,np.sum],fill_value=0,margins=True)
df_pivot
Out[60]:
len sum
price price
gender female male All female male All
city
beijing 1 1 2.0 4432 1200 5632.0
guangzhou 0 1 1.0 0 2133 2133.0
sh 1 0 NaN 0 0 NaN
shanghai 0 1 NaN 0 0 NaN
shenzhen 1 0 1.0 5433 0 5433.0
All 2 2 4.0 9865 3333 13198.0
In [66]:
# 这里有个核心的问题,即多行多列索引怎么提取,怎么操作,以df_pivot为例
df_pivot.columns
Out[66]:
MultiIndex([('len', 'price', 'female'),
            ('len', 'price',   'male'),
            ('len', 'price',    'All'),
            ('sum', 'price', 'female'),
            ('sum', 'price',   'male'),
            ('sum', 'price',    'All')],
           names=[None, None, 'gender'])
In [67]:
df_pivot.index
Out[67]:
Index(['beijing', 'guangzhou', 'sh', 'shanghai', 'shenzhen', 'All'], dtype='object', name='city')
In [81]:
df_pivot.iloc[0:2,0:4]
Out[81]:
len sum
price price
gender female male All female
city
beijing 1 1 2.0 4432
guangzhou 0 1 1.0 0