In [52]:
import pandas as pd
import numpy as np
In [53]:
# Load the population table from CSV. pd.read_csv already returns a DataFrame,
# so it does not need to be wrapped in pd.DataFrame(...) again.
# Tip: press Shift+Tab in Jupyter to inspect a method's parameters.
df = pd.read_csv('population_total.csv')
df.head()
Out[53]:
country 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812
0 Afghanistan 3280000 3280000 3280000 3280000 3280000 3280000 3280000 3280000 3280000 3280000 3280000 3280000 3280000
1 Albania 410000 412000 413000 414000 416000 417000 418000 420000 421000 422000 424000 425000 426000
2 Algeria 2500000 2510000 2520000 2530000 2540000 2550000 2560000 2570000 2580000 2590000 2600000 2600000 2610000
3 Andorra 2650 2650 2650 2650 2650 2650 2650 2650 2650 2650 2650 2660 2660
4 Angola 1570000 1570000 1570000 1570000 1570000 1570000 1570000 1570000 1570000 1570000 1570000 1570000 1570000
In [54]:
def _two_decimals(value):
    """Format a number with exactly two digits after the decimal point."""
    return '%.2f' % value


# Display floats in fixed-point form rather than scientific notation,
# so that large numbers are easier to read.
pd.set_option('display.float_format', _two_decimals)

# Basic descriptive statistics of the table.
df.describe()
Out[54]:
1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812
count 10.00 10.00 10.00 10.00 10.00 10.00 10.00 10.00 10.00 10.00 10.00 10.00 10.00
mean 1230765.00 1231865.00 1233865.00 1235865.00 1238065.00 1240065.00 1241065.00 1243165.00 1245165.00 1247165.00 1249265.00 1250466.00 1251566.00
std 1307158.80 1308176.48 1310951.42 1313734.20 1316380.20 1319178.49 1320291.12 1323032.45 1325850.64 1328676.39 1331440.77 1333022.92 1334098.51
min 2650.00 2650.00 2650.00 2650.00 2650.00 2650.00 2650.00 2650.00 2650.00 2650.00 2650.00 2660.00 2660.00
25% 365750.00 365500.00 365000.00 364250.00 364250.00 363500.00 362750.00 362000.00 361250.00 360500.00 359750.00 359250.00 358500.00
50% 473500.00 473500.00 473500.00 474000.00 475000.00 475500.00 476000.00 477000.00 477500.00 478000.00 479000.00 480000.00 481000.00
75% 2267500.00 2275000.00 2282500.00 2290000.00 2297500.00 2305000.00 2312500.00 2320000.00 2327500.00 2335000.00 2342500.00 2342500.00 2350000.00
max 3280000.00 3280000.00 3280000.00 3280000.00 3280000.00 3280000.00 3280000.00 3280000.00 3280000.00 3280000.00 3290000.00 3300000.00 3300000.00
In [55]:
# Build a small sample table; np.nan stands for a null (missing) value.
sample_columns = ['id', 'date', 'city', 'category', 'age', 'price']
sample_data = {
    "id": [1001, 1002, 1003, 1004, 1005, 1006],
    "date": pd.date_range('20130102', periods=6),
    # City names keep their stray spaces and mixed casing exactly as entered.
    "city": ['Beijing ', 'SH', ' guangzhou ', 'Shenzhen', 'shanghai', 'BEIJING '],
    "age": [23, 44, 54, 32, 34, 32],
    "category": ['100-A', '100-B', '110-A', '110-C', '210-A', '130-F'],
    "price": [1200, np.nan, 2133, 5433, np.nan, 4432],
}
# `columns` fixes the column order of the resulting frame.
df = pd.DataFrame(sample_data, columns=sample_columns)
df.head()
Out[55]:
id date city category age price
0 1001 2013-01-02 Beijing 100-A 23 1200.00
1 1002 2013-01-03 SH 100-B 44 nan
2 1003 2013-01-04 guangzhou 110-A 54 2133.00
3 1004 2013-01-05 Shenzhen 110-C 32 5433.00
4 1005 2013-01-06 shanghai 210-A 34 nan
In [56]:
# Inspect the table.
# Show its dimensions as a (rows, columns) tuple.
df.shape
Out[56]:
(6, 6)
In [57]:
# Print an overall summary of the table: index range, per-column non-null counts and dtypes, memory usage.
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6 entries, 0 to 5
Data columns (total 6 columns):
id          6 non-null int64
date        6 non-null datetime64[ns]
city        6 non-null object
category    6 non-null object
age         6 non-null int64
price       4 non-null float64
dtypes: datetime64[ns](1), float64(1), int64(2), object(2)
memory usage: 416.0+ bytes
In [58]:
# Show the data type (dtype) of every column.
df.dtypes
Out[58]:
id                   int64
date        datetime64[ns]
city                object
category            object
age                  int64
price              float64
dtype: object
In [59]:
# Show the data type of a single column.
df['id'].dtype
Out[59]:
dtype('int64')
In [60]:
# Check for missing values: yields a boolean table that is True wherever a cell is null.
df.isnull()
Out[60]:
id date city category age price
0 False False False False False False
1 False False False False False True
2 False False False False False False
3 False False False False False False
4 False False False False False True
5 False False False False False False
In [33]:
# Check one specific column for nulls: yields one boolean per row.
df['price'].isnull() 
Out[33]:
0    False
1     True
2    False
3    False
4     True
5    False
Name: price, dtype: bool
In [35]:
# List the distinct values of one column; unique() is called on a selected column here, not on the whole table.
df['price'].unique()
Out[35]:
array([1200.,   nan, 2133., 5433., 4432.])
In [36]:
# View the table's contents as a NumPy array.
# DataFrame.to_numpy() (available since pandas 0.24) is the accessor the pandas
# documentation recommends over the older .values attribute; for this
# mixed-type table the result is an object-dtype array either way.
df.to_numpy()
Out[36]:
array([[1001, Timestamp('2013-01-02 00:00:00'), 'Beijing ', '100-A', 23,
        1200.0],
       [1002, Timestamp('2013-01-03 00:00:00'), 'SH', '100-B', 44, nan],
       [1003, Timestamp('2013-01-04 00:00:00'), ' guangzhou ', '110-A',
        54, 2133.0],
       [1004, Timestamp('2013-01-05 00:00:00'), 'Shenzhen', '110-C', 32,
        5433.0],
       [1005, Timestamp('2013-01-06 00:00:00'), 'shanghai', '210-A', 34,
        nan],
       [1006, Timestamp('2013-01-07 00:00:00'), 'BEIJING ', '130-F', 32,
        4432.0]], dtype=object)
In [40]:
# Show the column labels.
df.columns
Out[40]:
Index(['id', 'date', 'city', 'category', 'age', 'price'], dtype='object')
In [44]:
# Show the first 3 rows.
df.head(3)
Out[44]:
id date city category age price
0 1001 2013-01-02 Beijing 100-A 23 1200.00
1 1002 2013-01-03 SH 100-B 44 nan
2 1003 2013-01-04 guangzhou 110-A 54 2133.00
In [45]:
# Show the last 3 rows.
df.tail(3)
Out[45]:
id date city category age price
3 1004 2013-01-05 Shenzhen 110-C 32 5433.00
4 1005 2013-01-06 shanghai 210-A 34 nan
5 1006 2013-01-07 BEIJING 130-F 32 4432.00
In [50]:
# Basic descriptive statistics of the table (count, mean, std, min, quartiles, max);
# only the numeric columns id, age and price appear in the output.
df.describe()
Out[50]:
id age price
count 6.00 6.00 4.00
mean 1003.50 36.50 3299.50
std 1.87 10.88 1966.64
min 1001.00 23.00 1200.00
25% 1002.25 32.00 1899.75
50% 1003.50 33.00 3282.50
75% 1004.75 41.50 4682.25
max 1006.00 54.00 5433.00