In [1]:
import pandas as pd
import numpy as np
In [5]:
df=pd.DataFrame({"id":[1001,1002,1003,1004,1005,1006],
"date":pd.date_range('20130102', periods=6),
"city":['Beijing ', 'SH', ' guangzhou ', 'Shenzhen', 'shanghai', 'BEIJING '],
"age":[23,44,54,32,34,32],
"category":['100-A','100-B','110-A','110-C','210-A','130-F'],
"price":[1200,np.nan,2133,5433,np.nan,4432]},columns =['id','date','city','category','age','price'])
df
Out[5]:
id date city category age price
0 1001 2013-01-02 Beijing 100-A 23 1200.0
1 1002 2013-01-03 SH 100-B 44 NaN
2 1003 2013-01-04 guangzhou 110-A 54 2133.0
3 1004 2013-01-05 Shenzhen 110-C 32 5433.0
4 1005 2013-01-06 shanghai 210-A 34 NaN
5 1006 2013-01-07 BEIJING 130-F 32 4432.0
In [4]:
df1=pd.DataFrame({"id":[1001,1002,1003,1004,1005,1006,1007,1008],
"gender":['male','female','male','female','male','female','male','female'],
"pay":['Y','N','Y','Y','N','Y','N','Y',],
"m-point":[10,12,20,40,40,40,30,20]})
df1
Out[4]:
id gender pay m-point
0 1001 male Y 10
1 1002 female N 12
2 1003 male Y 20
3 1004 female Y 40
4 1005 male N 40
5 1006 female Y 40
6 1007 male N 30
7 1008 female Y 20
In [41]:
# 数据表inner匹配合并,类比于sql中的join
df_inner=pd.merge(left=df,right=df1,how='inner')
df_inner
Out[41]:
id date city category age price gender pay m-point
0 1001 2013-01-02 Beijing 100-A 23 1200.0 male Y 10
1 1002 2013-01-03 SH 100-B 44 NaN female N 12
2 1003 2013-01-04 guangzhou 110-A 54 2133.0 male Y 20
3 1004 2013-01-05 Shenzhen 110-C 32 5433.0 female Y 40
4 1005 2013-01-06 shanghai 210-A 34 NaN male N 40
5 1006 2013-01-07 BEIJING 130-F 32 4432.0 female Y 40
In [11]:
# 数据表left匹配合并,类比于sql中的left join
df_left=pd.merge(left=df,right=df1,how='left')
df_left
Out[11]:
id date city category age price gender pay m-point
0 1001 2013-01-02 Beijing 100-A 23 1200.0 male Y 10
1 1002 2013-01-03 SH 100-B 44 NaN female N 12
2 1003 2013-01-04 guangzhou 110-A 54 2133.0 male Y 20
3 1004 2013-01-05 Shenzhen 110-C 32 5433.0 female Y 40
4 1005 2013-01-06 shanghai 210-A 34 NaN male N 40
5 1006 2013-01-07 BEIJING 130-F 32 4432.0 female Y 40
In [12]:
# 数据表right匹配合并,类比于sql中的right join
df_right=pd.merge(left=df,right=df1,how='right')
df_right
Out[12]:
id date city category age price gender pay m-point
0 1001 2013-01-02 Beijing 100-A 23.0 1200.0 male Y 10
1 1002 2013-01-03 SH 100-B 44.0 NaN female N 12
2 1003 2013-01-04 guangzhou 110-A 54.0 2133.0 male Y 20
3 1004 2013-01-05 Shenzhen 110-C 32.0 5433.0 female Y 40
4 1005 2013-01-06 shanghai 210-A 34.0 NaN male N 40
5 1006 2013-01-07 BEIJING 130-F 32.0 4432.0 female Y 40
6 1007 NaT NaN NaN NaN NaN male N 30
7 1008 NaT NaN NaN NaN NaN female Y 20
In [14]:
# 设置id列为索引列
df_inner.set_index('id')
Out[14]:
date city category age price gender pay m-point
id
1001 2013-01-02 Beijing 100-A 23 1200.0 male Y 10
1002 2013-01-03 SH 100-B 44 NaN female N 12
1003 2013-01-04 guangzhou 110-A 54 2133.0 male Y 20
1004 2013-01-05 Shenzhen 110-C 32 5433.0 female Y 40
1005 2013-01-06 shanghai 210-A 34 NaN male N 40
1006 2013-01-07 BEIJING 130-F 32 4432.0 female Y 40
In [17]:
# 排序sort_values函数和sort_index
# 按特定列的值排序
df_inner.sort_values(by=['age'],ascending=True)
Out[17]:
id date city category age price gender pay m-point
0 1001 2013-01-02 Beijing 100-A 23 1200.0 male Y 10
3 1004 2013-01-05 Shenzhen 110-C 32 5433.0 female Y 40
5 1006 2013-01-07 BEIJING 130-F 32 4432.0 female Y 40
4 1005 2013-01-06 shanghai 210-A 34 NaN male N 40
1 1002 2013-01-03 SH 100-B 44 NaN female N 12
2 1003 2013-01-04 guangzhou 110-A 54 2133.0 male Y 20
In [18]:
# 按索引列排序
df_inner.sort_index()
Out[18]:
id date city category age price gender pay m-point
0 1001 2013-01-02 Beijing 100-A 23 1200.0 male Y 10
1 1002 2013-01-03 SH 100-B 44 NaN female N 12
2 1003 2013-01-04 guangzhou 110-A 54 2133.0 male Y 20
3 1004 2013-01-05 Shenzhen 110-C 32 5433.0 female Y 40
4 1005 2013-01-06 shanghai 210-A 34 NaN male N 40
5 1006 2013-01-07 BEIJING 130-F 32 4432.0 female Y 40
In [20]:
# 数据判断分组函数where,类似于if函数
#如果price列的值>3000,group列显示high,否则显示low
df_inner['group'] = np.where(df_inner['price']>3000, 'high', 'low')
df_inner
Out[20]:
id date city category age price gender pay m-point group
0 1001 2013-01-02 Beijing 100-A 23 1200.0 male Y 10 low
1 1002 2013-01-03 SH 100-B 44 NaN female N 12 low
2 1003 2013-01-04 guangzhou 110-A 54 2133.0 male Y 20 low
3 1004 2013-01-05 Shenzhen 110-C 32 5433.0 female Y 40 high
4 1005 2013-01-06 shanghai 210-A 34 NaN male N 40 low
5 1006 2013-01-07 BEIJING 130-F 32 4432.0 female Y 40 high
In [42]:
#对复合多个条件的数据进行分组标记,np.where里面组合条件也行
df_inner.loc[(df_inner['city']=='beijing') & (df_inner['price']>=4000), 'sign']=1
df_inner
Out[42]:
id date city category age price gender pay m-point sign
0 1001 2013-01-02 Beijing 100-A 23 1200.0 male Y 10 NaN
1 1002 2013-01-03 SH 100-B 44 NaN female N 12 NaN
2 1003 2013-01-04 guangzhou 110-A 54 2133.0 male Y 20 NaN
3 1004 2013-01-05 Shenzhen 110-C 32 5433.0 female Y 40 NaN
4 1005 2013-01-06 shanghai 210-A 34 NaN male N 40 NaN
5 1006 2013-01-07 BEIJING 130-F 32 4432.0 female Y 40 NaN
In [43]:
# 对category字段的值依次进行分列,并创建数据表,索引值为df_inner的索引列,列名称为category和size
# 这里面嵌套的x.split('-') for x in df_inner['category']我没懂,还能这么写?
split = pd.DataFrame((x.split('-') for x in df_inner['category']),index=df_inner.index,columns=['category','size'])
split
Out[43]:
category size
0 100 A
1 100 B
2 110 A
3 110 C
4 210 A
5 130 F
In [44]:
# for循环遍历series
for x in df_inner['category']:
    print(x)
100-A
100-B
110-A
110-C
210-A
130-F
In [45]:
# 使用原表的index作为merge的主键进行合并
df_inner=pd.merge(df_inner,split,how='inner',left_index=True,right_index=True)
df_inner
Out[45]:
id date city category_x age price gender pay m-point sign category_y size
0 1001 2013-01-02 Beijing 100-A 23 1200.0 male Y 10 NaN 100 A
1 1002 2013-01-03 SH 100-B 44 NaN female N 12 NaN 100 B
2 1003 2013-01-04 guangzhou 110-A 54 2133.0 male Y 20 NaN 110 A
3 1004 2013-01-05 Shenzhen 110-C 32 5433.0 female Y 40 NaN 110 C
4 1005 2013-01-06 shanghai 210-A 34 NaN male N 40 NaN 210 A
5 1006 2013-01-07 BEIJING 130-F 32 4432.0 female Y 40 NaN 130 F