In [11]:
import pandas as pd
import numpy as np
# 创建数据表,np.nan代表null值
df=pd.DataFrame({"id":[1001,1002,1003,1004,1005,1006],
"date":pd.date_range('20130102', periods=6),
"city":['Beijing ', 'SH', ' guangzhou ', 'Shenzhen', 'shanghai', 'BEIJING '],
"age":[23,44,54,32,34,32],
"category":['100-A','100-B','110-A','110-C','210-A','130-F'],
"price":[1200,np.nan,2133,5433,np.nan,4432]},columns =['id','date','city','category','age','price'])
df.head()
Out[11]:
id date city category age price
0 1001 2013-01-02 Beijing 100-A 23 1200.0
1 1002 2013-01-03 SH 100-B 44 NaN
2 1003 2013-01-04 guangzhou 110-A 54 2133.0
3 1004 2013-01-05 Shenzhen 110-C 32 5433.0
4 1005 2013-01-06 shanghai 210-A 34 NaN
In [12]:
# 删除数据表中含有空值的行
df.dropna(how='any') #inplace=True参数代表替换原df
Out[12]:
id date city category age price
0 1001 2013-01-02 Beijing 100-A 23 1200.0
2 1003 2013-01-04 guangzhou 110-A 54 2133.0
3 1004 2013-01-05 Shenzhen 110-C 32 5433.0
5 1006 2013-01-07 BEIJING 130-F 32 4432.0
In [13]:
# 使用数字0填充控制
df.fillna(value=0)
Out[13]:
id date city category age price
0 1001 2013-01-02 Beijing 100-A 23 1200.0
1 1002 2013-01-03 SH 100-B 44 0.0
2 1003 2013-01-04 guangzhou 110-A 54 2133.0
3 1004 2013-01-05 Shenzhen 110-C 32 5433.0
4 1005 2013-01-06 shanghai 210-A 34 0.0
5 1006 2013-01-07 BEIJING 130-F 32 4432.0
In [20]:
# 使用均值对NA进行填充
df['price'].fillna(value=df['price'].mean())
Out[20]:
0    1200.0
1    3299.5
2    2133.0
3    5433.0
4    3299.5
5    4432.0
Name: price, dtype: float64
In [34]:
# 清理city字段中的空格
df['city']=df['city'].map(str.strip)
df
Out[34]:
id date city category age price
0 1001 2013-01-02 Beijing 100-A 23 1200.0
1 1002 2013-01-03 SH 100-B 44 NaN
2 1003 2013-01-04 guangzhou 110-A 54 2133.0
3 1004 2013-01-05 Shenzhen 110-C 32 5433.0
4 1005 2013-01-06 shanghai 210-A 34 NaN
5 1006 2013-01-07 BEIJING 130-F 32 4432.0
In [41]:
# 大小写转换
df['city'].str.lower() #df['city'].str.upper()
Out[41]:
0      beijing
1           sh
2    guangzhou
3     shenzhen
4     shanghai
5      beijing
Name: city, dtype: object
In [43]:
#更改数据格式astype,dtype查看数据格式
df['price'].dtype
Out[43]:
dtype('float64')
In [56]:
df.fillna(value=0,inplace=True) #na无法更改类型,需要先填充
df['price']=df['price'].astype('int')
df['price']
Out[56]:
0    1200
1       0
2    2133
3    5433
4       0
5    4432
Name: price, dtype: int32
In [57]:
df.dtypes #注意dataframe的dtypes是带s的
Out[57]:
id                   int64
date        datetime64[ns]
city                object
category            object
age                  int64
price                int32
dtype: object
In [60]:
# 更改列名称,传入字典代表后一个替换前一个
df.rename(columns={'category':'category-size'})
Out[60]:
id date city category-size age price
0 1001 2013-01-02 beijing 100-A 23 1200
1 1002 2013-01-03 sh 100-B 44 0
2 1003 2013-01-04 guangzhou 110-A 54 2133
3 1004 2013-01-05 shenzhen 110-C 32 5433
4 1005 2013-01-06 shanghai 210-A 34 0
5 1006 2013-01-07 beijing 130-F 32 4432
In [70]:
# 删除重复值
print(len(df.city.unique())) #查看city的唯一值数量
df['city'].drop_duplicates()
5
Out[70]:
0      beijing
1           sh
2    guangzhou
3     shenzhen
4     shanghai
Name: city, dtype: object
In [71]:
# 删除重复值,keep='last',保留最后出现的值
df['city'].drop_duplicates(keep='last')
Out[71]:
1           sh
2    guangzhou
3     shenzhen
4     shanghai
5      beijing
Name: city, dtype: object
In [75]:
# 数据替换
df['city'].replace(to_replace='sh',value='shanghai')
Out[75]:
0      beijing
1     shanghai
2    guangzhou
3     shenzhen
4     shanghai
5      beijing
Name: city, dtype: object