http://pandas.pydata.org/pandas-docs/stable/index.html
http://pandas.pydata.org/pandas-docs/stable/10min.html
http://blog.csdn.net/claroja/article/category/6919729/1
http://blog.csdn.net/pipisorry/article/category/5916621
http://pandas.pydata.org/pandas-docs/stable/r_interface.html
http://pandas.pydata.org/pandas-docs/stable/comparison_with_r.html
import pandas as pd
import numpy as np

# A Series is a one-dimensional labeled array; np.nan marks the missing entry
# and forces the dtype to float64.
values = [1, 3, 5, np.nan, 6, 8]
s = pd.Series(values)
print(s)
## 0    1.0
## 1    3.0
## 2    5.0
## 3    NaN
## 4    6.0
## 5    8.0
## dtype: float64
import pandas as pd
import numpy as np
dates = pd.date_range('20130101', periods=6)
print(dates)
df = pd.DataFrame(np.random.randn(6,3), index=dates, columns=list('ABC'))
print(df)
## DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
## '2013-01-05', '2013-01-06'],
## dtype='datetime64[ns]', freq='D')
## A B C
## 2013-01-01 0.719050 -1.190380 1.216087
## 2013-01-02 0.634315 -0.638159 1.237365
## 2013-01-03 -0.545393 0.203222 0.458559
## 2013-01-04 -1.627630 -1.688958 -1.392453
## 2013-01-05 -1.364594 0.149654 -0.139358
## 2013-01-06 1.953070 1.027044 -1.311478
import pandas as pd
import numpy as np
df2 = pd.DataFrame({ 'A' : 1.,
'B' : pd.Timestamp('20130102'),
'C' : pd.Series(1,index=list(range(4)),dtype='float32'),
'D' : np.array([3] * 4,dtype='int32'),
'E' : pd.Categorical(["test","train","test","train"]),
'F' : 'foo' })
print(df2)
print(df2.dtypes)
## A B C D E F
## 0 1.0 2013-01-02 1.0 3 test foo
## 1 1.0 2013-01-02 1.0 3 train foo
## 2 1.0 2013-01-02 1.0 3 test foo
## 3 1.0 2013-01-02 1.0 3 train foo
## A float64
## B datetime64[ns]
## C float32
## D int32
## E category
## F object
## dtype: object
import pandas as pd
import numpy as np
dates = pd.date_range('20130101', periods=6)
df = pd.DataFrame(np.random.randn(6,3), index=dates, columns=list('ABC'))
print(df.head())
print(df.tail(2))
## A B C
## 2013-01-01 1.474897 -0.090145 1.318224
## 2013-01-02 1.725293 0.988291 -2.028873
## 2013-01-03 0.588877 -0.249527 -2.231888
## 2013-01-04 -0.762990 -0.356846 1.227532
## 2013-01-05 -2.137339 -0.335290 1.963766
## A B C
## 2013-01-05 -2.137339 -0.33529 1.963766
## 2013-01-06 1.187954 -0.44375 1.342456
import pandas as pd
import numpy as np
dates = pd.date_range('20130101', periods=6)
df = pd.DataFrame(np.random.randn(6,3), index=dates, columns=list('ABC'))
print(df.index)
print(df.columns)
print(df.values)
## DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
## '2013-01-05', '2013-01-06'],
## dtype='datetime64[ns]', freq='D')
## Index(['A', 'B', 'C'], dtype='object')
## [[ 1.31825181 0.26434194 0.43418584]
## [-0.09103258 -1.27462493 -0.2828914 ]
## [-1.91087252 0.50810281 0.76963331]
## [ 0.35981189 0.39978904 -0.11465738]
## [-0.82236698 -1.47789632 1.78549502]
## [-1.71783794 -1.94443651 -1.28818172]]
import pandas as pd
import numpy as np
dates = pd.date_range('20130101', periods=6)
df = pd.DataFrame(np.random.randn(6,3), index=dates, columns=list('ABC'))
print(df.describe())
## A B C
## count 6.000000 6.000000 6.000000
## mean -0.526849 -0.437596 -0.131694
## std 1.150566 0.888248 1.378806
## min -1.863193 -1.976389 -2.003173
## 25% -0.980001 -0.733561 -1.137370
## 50% -0.872928 -0.214403 0.094520
## 75% -0.247964 0.043086 0.815450
## max 1.503289 0.557319 1.516211
import pandas as pd
import numpy as np
dates = pd.date_range('20130101', periods=6)
df = pd.DataFrame(np.random.randn(6,3), index=dates, columns=list('ABC'))
print(df.T)
## 2013-01-01 2013-01-02 2013-01-03 2013-01-04 2013-01-05 2013-01-06
## A -0.432442 0.306032 0.069921 1.108310 -0.377539 -0.422082
## B 0.384943 -1.215627 0.067218 -0.238315 1.150479 0.511353
## C -0.794288 -0.316593 -1.498269 -0.683950 0.152308 1.144845
import pandas as pd
import numpy as np
dates = pd.date_range('20130101', periods=6)
df = pd.DataFrame(np.random.randn(6,3), index=dates, columns=list('ABC'))
# Sort columns in descending order (default is ascending; pass ascending=False to reverse).
print(df.sort_index(axis=1, ascending=False))
# Sort by the row index.
print(df.sort_index())
# Sort by the column index.
print(df.sort_index(axis=1))
# Sort by the values of one or more columns.
# FIX: sort_index(by=...) was deprecated and later removed from pandas;
# sort_values(by=...) is the supported API (the FutureWarning in the original
# transcript said exactly this).
print(df.sort_values(by='A'))
print(df.sort_values(by=['A','B']))
## C B A
## 2013-01-01 2.668079 2.119120 0.056270
## 2013-01-02 -0.016571 -1.687678 -0.937221
## 2013-01-03 1.214240 -1.198925 0.633789
## 2013-01-04 -0.684196 0.418149 0.034202
## 2013-01-05 0.432678 -1.426104 0.502198
## 2013-01-06 1.657599 1.011420 -1.416276
## A B C
## 2013-01-01 0.056270 2.119120 2.668079
## 2013-01-02 -0.937221 -1.687678 -0.016571
## 2013-01-03 0.633789 -1.198925 1.214240
## 2013-01-04 0.034202 0.418149 -0.684196
## 2013-01-05 0.502198 -1.426104 0.432678
## 2013-01-06 -1.416276 1.011420 1.657599
## A B C
## 2013-01-01 0.056270 2.119120 2.668079
## 2013-01-02 -0.937221 -1.687678 -0.016571
## 2013-01-03 0.633789 -1.198925 1.214240
## 2013-01-04 0.034202 0.418149 -0.684196
## 2013-01-05 0.502198 -1.426104 0.432678
## 2013-01-06 -1.416276 1.011420 1.657599
## A B C
## 2013-01-06 -1.416276 1.011420 1.657599
## 2013-01-02 -0.937221 -1.687678 -0.016571
## 2013-01-04 0.034202 0.418149 -0.684196
## 2013-01-01 0.056270 2.119120 2.668079
## 2013-01-05 0.502198 -1.426104 0.432678
## 2013-01-03 0.633789 -1.198925 1.214240
## A B C
## 2013-01-06 -1.416276 1.011420 1.657599
## 2013-01-02 -0.937221 -1.687678 -0.016571
## 2013-01-04 0.034202 0.418149 -0.684196
## 2013-01-01 0.056270 2.119120 2.668079
## 2013-01-05 0.502198 -1.426104 0.432678
## 2013-01-03 0.633789 -1.198925 1.214240
import pandas as pd
import numpy as np
dates = pd.date_range('20130101', periods=6)
df = pd.DataFrame(np.random.randn(6,3), index=dates, columns=list('ABC'))
# 1、选择一个单独的列,这将会返回一个Series,等同于df.A:
print(df['A'])
# 2、通过[]进行选择,这将会对行进行切片
print(df[0:3])
print(df['20130102':'20130104'])
## 2013-01-01 0.334347
## 2013-01-02 -0.400806
## 2013-01-03 -1.640650
## 2013-01-04 0.178510
## 2013-01-05 0.714261
## 2013-01-06 -0.203589
## Freq: D, Name: A, dtype: float64
## A B C
## 2013-01-01 0.334347 -0.488606 1.257202
## 2013-01-02 -0.400806 2.522782 0.568591
## 2013-01-03 -1.640650 -1.343565 1.008803
## A B C
## 2013-01-02 -0.400806 2.522782 0.568591
## 2013-01-03 -1.640650 -1.343565 1.008803
## 2013-01-04 0.178510 -0.411125 0.070722
import pandas as pd
import numpy as np
dates = pd.date_range('20130101', periods=6)
df = pd.DataFrame(np.random.randn(6,3), index=dates, columns=list('ABC'))
# 1、使用标签来获取一个交叉的区域
print(df.loc[dates[0]])
# 2、通过标签来在多个轴上进行选择
print(df.loc[:,['A','B']])
# 3、标签切片
print(df.loc['20130102':'20130104',['A','B']])
# 4、对于返回的对象进行维度缩减
print(df.loc['20130102',['A','B']])
# 5、获取一个标量
print(df.loc[dates[0],'A'])
# 6、快速访问一个标量(与上一个方法等价)
print(df.at[dates[0],'A'])
## A -0.105711
## B -1.553135
## C 1.401940
## Name: 2013-01-01 00:00:00, dtype: float64
## A B
## 2013-01-01 -0.105711 -1.553135
## 2013-01-02 0.516738 -0.395743
## 2013-01-03 0.042964 0.424069
## 2013-01-04 -0.808021 0.503598
## 2013-01-05 0.364841 -0.071134
## 2013-01-06 0.132690 -1.146901
## A B
## 2013-01-02 0.516738 -0.395743
## 2013-01-03 0.042964 0.424069
## 2013-01-04 -0.808021 0.503598
## A 0.516738
## B -0.395743
## Name: 2013-01-02 00:00:00, dtype: float64
## -0.105711464678
## -0.105711464678
import pandas as pd
import numpy as np
dates = pd.date_range('20130101', periods=6)
df = pd.DataFrame(np.random.randn(6,3), index=dates, columns=list('ABC'))
# 1、通过传递数值进行位置选择(选择的是行)
print(df.iloc[3])
# 2、通过数值进行切片,与numpy/python中的情况类似
print(df.iloc[3:5,0:2])
# 3、通过指定一个位置的列表,与numpy/python中的情况类似
print(df.iloc[[1,2,4],[0,2]])
# 4、对行进行切片
print(df.iloc[1:3,:])
# 5、对列进行切片
print(df.iloc[:,1:3])
# 6、获取特定的值
print(df.iloc[1,1])
print(df.iat[1,1])
## A -0.430113
## B 1.166528
## C -0.151003
## Name: 2013-01-04 00:00:00, dtype: float64
## A B
## 2013-01-04 -0.430113 1.166528
## 2013-01-05 0.707701 0.523491
## A C
## 2013-01-02 -0.272539 1.178722
## 2013-01-03 1.349420 -1.980765
## 2013-01-05 0.707701 0.211483
## A B C
## 2013-01-02 -0.272539 -2.579566 1.178722
## 2013-01-03 1.349420 -1.060671 -1.980765
## B C
## 2013-01-01 -0.869997 0.135881
## 2013-01-02 -2.579566 1.178722
## 2013-01-03 -1.060671 -1.980765
## 2013-01-04 1.166528 -0.151003
## 2013-01-05 0.523491 0.211483
## 2013-01-06 -0.914865 0.580675
## -2.57956639873
## -2.57956639873
import pandas as pd
import numpy as np
dates = pd.date_range('20130101', periods=6)
df = pd.DataFrame(np.random.randn(6,3), index=dates, columns=list('ABC'))
# 1、使用一个单独列的值来选择数据:
print(df[df.A > 0])
# 2、使用where操作来选择数据:
print(df[df > 0])
# 3、使用isin()方法来过滤:
df2 = df.copy()
print(df2)
df2['E'] = ['one', 'one','two','three','four','three']
df2
print(df2)
print(df2[df2['E'].isin(['one','four'])])
## A B C
## 2013-01-06 0.719447 0.607389 -0.50302
## A B C
## 2013-01-01 NaN NaN 1.568646
## 2013-01-02 NaN 0.696549 NaN
## 2013-01-03 NaN NaN 1.041927
## 2013-01-04 NaN 1.384888 NaN
## 2013-01-05 NaN NaN 0.881413
## 2013-01-06 0.719447 0.607389 NaN
## A B C
## 2013-01-01 -1.508735 -0.098011 1.568646
## 2013-01-02 -0.584852 0.696549 -0.097488
## 2013-01-03 -0.770481 -1.115255 1.041927
## 2013-01-04 -0.804570 1.384888 -0.067071
## 2013-01-05 -1.060169 -1.229640 0.881413
## 2013-01-06 0.719447 0.607389 -0.503020
## A B C E
## 2013-01-01 -1.508735 -0.098011 1.568646 one
## 2013-01-02 -0.584852 0.696549 -0.097488 one
## 2013-01-03 -0.770481 -1.115255 1.041927 two
## 2013-01-04 -0.804570 1.384888 -0.067071 three
## 2013-01-05 -1.060169 -1.229640 0.881413 four
## 2013-01-06 0.719447 0.607389 -0.503020 three
## A B C E
## 2013-01-01 -1.508735 -0.098011 1.568646 one
## 2013-01-02 -0.584852 0.696549 -0.097488 one
## 2013-01-05 -1.060169 -1.229640 0.881413 four
import pandas as pd
import numpy as np
dates = pd.date_range('20130101', periods=6)
df = pd.DataFrame(np.random.randn(6,3), index=dates, columns=list('ABC'))
# 1、设置一个新的列:
s1 = pd.Series([1,2,3,4,5,6], index=pd.date_range('20130102', periods=6))
print(s1)
print(df)
df['F'] = s1
# 2、通过标签设置新的值:
df.at[dates[0],'A'] = 0
# 3、通过位置设置新的值:
df.iat[0,1] = 0
# 4、通过一个numpy数组设置一组新值:
df.loc[:,'D'] = np.array([5] * len(df))
# 5、通过where操作来设置新的值:(必须数值型,存在字符串列会出错!)
print(df)
df2 = df.copy()
df2[df2 > 0] = -df2
print(df2)
## 2013-01-02 1
## 2013-01-03 2
## 2013-01-04 3
## 2013-01-05 4
## 2013-01-06 5
## 2013-01-07 6
## Freq: D, dtype: int64
## A B C
## 2013-01-01 1.420685 0.752250 -1.270429
## 2013-01-02 -0.819530 0.749631 -0.816112
## 2013-01-03 1.203946 0.082946 -1.361337
## 2013-01-04 0.844981 0.617703 0.544874
## 2013-01-05 -0.847279 -0.786258 -0.880352
## 2013-01-06 -0.126569 -1.887023 0.392236
## A B C F D
## 2013-01-01 0.000000 0.000000 -1.270429 NaN 5
## 2013-01-02 -0.819530 0.749631 -0.816112 1.0 5
## 2013-01-03 1.203946 0.082946 -1.361337 2.0 5
## 2013-01-04 0.844981 0.617703 0.544874 3.0 5
## 2013-01-05 -0.847279 -0.786258 -0.880352 4.0 5
## 2013-01-06 -0.126569 -1.887023 0.392236 5.0 5
## A B C F D
## 2013-01-01 0.000000 0.000000 -1.270429 NaN -5
## 2013-01-02 -0.819530 -0.749631 -0.816112 -1.0 -5
## 2013-01-03 -1.203946 -0.082946 -1.361337 -2.0 -5
## 2013-01-04 -0.844981 -0.617703 -0.544874 -3.0 -5
## 2013-01-05 -0.847279 -0.786258 -0.880352 -4.0 -5
## 2013-01-06 -0.126569 -1.887023 -0.392236 -5.0 -5
在pandas中,使用np.nan来代替缺失值,这些值将默认不会包含在计算中。
import pandas as pd
import numpy as np
dates = pd.date_range('20130101', periods=6)
df = pd.DataFrame(np.random.randn(6,3), index=dates, columns=list('ABC'))
# 1、reindex()方法可以对指定轴上的索引进行改变/增加/删除操作,这将返回原始数据的一个拷贝:
df1 = df.reindex(index=dates[0:4], columns=list(df.columns) + ['E'])
df1.loc[dates[0]:dates[1],'E'] = 1
print(df1)
# 2、去掉包含缺失值的行:
print(df1.dropna(how='any'))
# 3、对缺失值进行填充:
print(df1.fillna(value=5))
# 4、对数据进行布尔填充:
print(pd.isnull(df1))
## A B C E
## 2013-01-01 -1.844647 0.638594 -0.352188 1.0
## 2013-01-02 -0.003575 -0.153742 -0.617550 1.0
## 2013-01-03 -0.795512 -0.154565 0.391559 NaN
## 2013-01-04 -0.411507 -0.505420 0.012720 NaN
## A B C E
## 2013-01-01 -1.844647 0.638594 -0.352188 1.0
## 2013-01-02 -0.003575 -0.153742 -0.617550 1.0
## A B C E
## 2013-01-01 -1.844647 0.638594 -0.352188 1.0
## 2013-01-02 -0.003575 -0.153742 -0.617550 1.0
## 2013-01-03 -0.795512 -0.154565 0.391559 5.0
## 2013-01-04 -0.411507 -0.505420 0.012720 5.0
## A B C E
## 2013-01-01 False False False False
## 2013-01-02 False False False False
## 2013-01-03 False False False True
## 2013-01-04 False False False True
import pandas as pd
import numpy as np
dates = pd.date_range('20130101', periods=6)
df = pd.DataFrame(np.random.randn(6,3), index=dates, columns=list('ABC'))
print(df)
# 1、执行描述性统计:
print(df.mean())
# 2、在其他轴上进行相同的操作:
print(df.mean(1))
# 3、对于拥有不同维度,需要对齐的对象进行操作。Pandas会自动的沿着指定的维度进行。
s = pd.Series([1,3,5,np.nan,6,8], index=dates).shift(2)
print(s)
print(df.sub(s, axis='index'))
## A B C
## 2013-01-01 0.769428 -1.202104 -2.450158
## 2013-01-02 0.998613 1.412680 -0.949217
## 2013-01-03 1.088385 -0.148886 0.179365
## 2013-01-04 -0.838662 -0.769332 -0.555229
## 2013-01-05 -1.008979 -0.853671 1.133233
## 2013-01-06 -0.778389 2.249821 -0.142890
## A 0.038399
## B 0.114751
## C -0.464149
## dtype: float64
## 2013-01-01 -0.960945
## 2013-01-02 0.487359
## 2013-01-03 0.372955
## 2013-01-04 -0.721074
## 2013-01-05 -0.243139
## 2013-01-06 0.442847
## Freq: D, dtype: float64
## 2013-01-01 NaN
## 2013-01-02 NaN
## 2013-01-03 1.0
## 2013-01-04 3.0
## 2013-01-05 5.0
## 2013-01-06 NaN
## Freq: D, dtype: float64
## A B C
## 2013-01-01 NaN NaN NaN
## 2013-01-02 NaN NaN NaN
## 2013-01-03 0.088385 -1.148886 -0.820635
## 2013-01-04 -3.838662 -3.769332 -3.555229
## 2013-01-05 -6.008979 -5.853671 -3.866767
## 2013-01-06 NaN NaN NaN
import pandas as pd
import numpy as np
dates = pd.date_range('20130101', periods=6)
df = pd.DataFrame(np.random.randn(6,3), index=dates, columns=list('ABC'))
print(df)
# 对数据应用函数:
print(df.apply(np.cumsum))
print(df.apply(lambda x: x.max() - x.min()))
print(np.sum(df['A']))
## A B C
## 2013-01-01 -0.441176 -0.211272 -1.031730
## 2013-01-02 -0.820718 1.133895 0.522500
## 2013-01-03 1.085982 -0.205137 -1.300741
## 2013-01-04 0.306680 -2.745015 0.703153
## 2013-01-05 1.340365 -0.521452 0.267131
## 2013-01-06 -0.956634 0.962353 -1.028829
## A B C
## 2013-01-01 -0.441176 -0.211272 -1.031730
## 2013-01-02 -1.261894 0.922623 -0.509230
## 2013-01-03 -0.175912 0.717487 -1.809972
## 2013-01-04 0.130768 -2.027528 -1.106818
## 2013-01-05 1.471132 -2.548980 -0.839688
## 2013-01-06 0.514498 -1.586626 -1.868517
## A 2.296999
## B 3.878910
## C 2.003895
## dtype: float64
## 0.5144983920883905
import pandas as pd
import numpy as np
s = pd.Series(['A', 'B', 'C', 'Aaba', 'Baca', np.nan, 'CABA', 'dog', 'cat'])
print(s)
print(s.str.lower())
## 0 A
## 1 B
## 2 C
## 3 Aaba
## 4 Baca
## 5 NaN
## 6 CABA
## 7 dog
## 8 cat
## dtype: object
## 0 a
## 1 b
## 2 c
## 3 aaba
## 4 baca
## 5 NaN
## 6 caba
## 7 dog
## 8 cat
## dtype: object
Pandas提供了大量的方法能够轻松的对Series,DataFrame和Panel对象进行各种符合各种逻辑关系的合并操作。
import pandas as pd
import numpy as np
# Concat
df = pd.DataFrame(np.random.randn(10, 4))
print(df)
pieces = [df[:3], df[3:7], df[7:]]
print(pieces)
print(pd.concat(pieces))
# Join 类似于SQL类型的合并,具体请参阅:Database style joining
left = pd.DataFrame({'key': ['foo', 'foo'], 'lval': [1, 2]})
right = pd.DataFrame({'key': ['foo', 'foo'], 'rval': [4, 5]})
print(left)
print(right)
print(pd.merge(left, right, on='key'))
left = pd.DataFrame({'key': ['foo', 'bar'], 'lval': [1, 2]})
right = pd.DataFrame({'key': ['foo', 'bar'], 'rval': [4, 5]})
print(left)
print(right)
print(pd.merge(left, right, on='key'))
# Append 将一行连接到一个DataFrame上,具体请参阅Appending:
df = pd.DataFrame(np.random.randn(8, 4), columns=['A','B','C','D'])
print(df)
s = df.iloc[3]
print(s)
print(df.append(s, ignore_index=True))
## 0 1 2 3
## 0 -1.762194 -0.043978 -0.096215 1.196402
## 1 -0.047959 -0.821557 -0.549494 0.488567
## 2 0.350267 0.342583 0.611219 0.073297
## 3 1.050589 -1.071536 -0.882427 -0.056424
## 4 -0.634756 -0.866571 -0.905026 -0.531998
## 5 -0.705706 0.039591 0.611581 0.510896
## 6 0.208323 0.733922 0.157127 -0.618363
## 7 -1.111692 0.353678 -0.128415 0.977743
## 8 -1.087846 -0.657068 -0.375478 -1.566491
## 9 -1.873369 -0.869045 -1.681428 0.357354
## [ 0 1 2 3
## 0 -1.762194 -0.043978 -0.096215 1.196402
## 1 -0.047959 -0.821557 -0.549494 0.488567
## 2 0.350267 0.342583 0.611219 0.073297, 0 1 2 3
## 3 1.050589 -1.071536 -0.882427 -0.056424
## 4 -0.634756 -0.866571 -0.905026 -0.531998
## 5 -0.705706 0.039591 0.611581 0.510896
## 6 0.208323 0.733922 0.157127 -0.618363, 0 1 2 3
## 7 -1.111692 0.353678 -0.128415 0.977743
## 8 -1.087846 -0.657068 -0.375478 -1.566491
## 9 -1.873369 -0.869045 -1.681428 0.357354]
## 0 1 2 3
## 0 -1.762194 -0.043978 -0.096215 1.196402
## 1 -0.047959 -0.821557 -0.549494 0.488567
## 2 0.350267 0.342583 0.611219 0.073297
## 3 1.050589 -1.071536 -0.882427 -0.056424
## 4 -0.634756 -0.866571 -0.905026 -0.531998
## 5 -0.705706 0.039591 0.611581 0.510896
## 6 0.208323 0.733922 0.157127 -0.618363
## 7 -1.111692 0.353678 -0.128415 0.977743
## 8 -1.087846 -0.657068 -0.375478 -1.566491
## 9 -1.873369 -0.869045 -1.681428 0.357354
## key lval
## 0 foo 1
## 1 foo 2
## key rval
## 0 foo 4
## 1 foo 5
## key lval rval
## 0 foo 1 4
## 1 foo 1 5
## 2 foo 2 4
## 3 foo 2 5
## key lval
## 0 foo 1
## 1 bar 2
## key rval
## 0 foo 4
## 1 bar 5
## key lval rval
## 0 foo 1 4
## 1 bar 2 5
## A B C D
## 0 2.471918 0.933723 -1.157861 1.245011
## 1 -0.002935 0.606728 -0.791455 0.252616
## 2 1.046225 -0.678820 -0.958433 0.719773
## 3 0.029483 -0.646972 0.246525 0.064674
## 4 -1.230817 1.887350 1.278275 0.301016
## 5 0.622592 -0.643613 -0.831216 -0.572489
## 6 0.267997 -0.325497 -0.526077 -1.967288
## 7 0.850685 1.420866 0.258023 0.130523
## A 0.029483
## B -0.646972
## C 0.246525
## D 0.064674
## Name: 3, dtype: float64
## A B C D
## 0 2.471918 0.933723 -1.157861 1.245011
## 1 -0.002935 0.606728 -0.791455 0.252616
## 2 1.046225 -0.678820 -0.958433 0.719773
## 3 0.029483 -0.646972 0.246525 0.064674
## 4 -1.230817 1.887350 1.278275 0.301016
## 5 0.622592 -0.643613 -0.831216 -0.572489
## 6 0.267997 -0.325497 -0.526077 -1.967288
## 7 0.850685 1.420866 0.258023 0.130523
## 8 0.029483 -0.646972 0.246525 0.064674
对于"group by"操作,我们通常是指以下一个或多个操作步骤:
- (Splitting)按照一些规则将数据分为不同的组
- (Applying) 对于每组数据分别执行一个函数
- (Combining)将结果组合到一个数据结构中
import pandas as pd
import numpy as np
df = pd.DataFrame({'A' : ['foo', 'bar', 'foo', 'bar','foo', 'bar', 'foo', 'foo'],
'B' : ['one', 'one', 'two', 'three','two', 'two', 'one', 'three'],
'C' : np.random.randn(8),
'D' : np.random.randn(8)})
print(df)
# 分组并对每个分组执行sum函数:
group1 = df.groupby('A').sum()
print(group1)
# print([x for x in group1])
# print(group1.size())
print(group1.count())
# 通过多个列进行分组形成一个层次索引,然后执行函数:
group2 = df.groupby(['A','B']).sum()
print(group2)
## A B C D
## 0 foo one 0.077003 -1.656320
## 1 bar one 0.580323 0.941866
## 2 foo two -0.556902 -0.040688
## 3 bar three -1.367754 -0.236453
## 4 foo two -1.800508 0.475211
## 5 bar two 0.390206 1.823098
## 6 foo one 0.970117 -0.510762
## 7 foo three 0.094226 -0.368662
## C D
## A
## bar -0.397225 2.528511
## foo -1.216064 -2.101221
## C 2
## D 2
## dtype: int64
## C D
## A B
## bar one 0.580323 0.941866
## three -1.367754 -0.236453
## two 0.390206 1.823098
## foo one 1.047119 -2.167082
## three 0.094226 -0.368662
## two -2.357409 0.434523
import pandas as pd
import numpy as np
tuples = list(zip(*[['bar', 'bar', 'baz', 'baz','foo', 'foo', 'qux', 'qux'],
['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two']]))
print(tuples)
index = pd.MultiIndex.from_tuples(tuples, names=['first', 'second'])
print(index)
df = pd.DataFrame(np.random.randn(8, 2), index=index, columns=['A', 'B'])
print(df)
df2 = df[:4]
print(df2)
stacked = df2.stack()
print(stacked)
print(stacked.unstack())
print(stacked.unstack(1))
print(stacked.unstack(0))
print(stacked.unstack(2))
## [('bar', 'one'), ('bar', 'two'), ('baz', 'one'), ('baz', 'two'), ('foo', 'one'), ('foo', 'two'), ('qux', 'one'), ('qux', 'two')]
## MultiIndex(levels=[['bar', 'baz', 'foo', 'qux'], ['one', 'two']],
## labels=[[0, 0, 1, 1, 2, 2, 3, 3], [0, 1, 0, 1, 0, 1, 0, 1]],
## names=['first', 'second'])
## A B
## first second
## bar one 1.105246 1.312284
## two -0.488749 1.203323
## baz one -0.231653 -0.301010
## two 0.772011 1.347532
## foo one -0.363355 1.388029
## two -0.232480 0.335387
## qux one 0.647417 1.879464
## two -0.348406 -0.533293
## A B
## first second
## bar one 1.105246 1.312284
## two -0.488749 1.203323
## baz one -0.231653 -0.301010
## two 0.772011 1.347532
## first second
## bar one A 1.105246
## B 1.312284
## two A -0.488749
## B 1.203323
## baz one A -0.231653
## B -0.301010
## two A 0.772011
## B 1.347532
## dtype: float64
## A B
## first second
## bar one 1.105246 1.312284
## two -0.488749 1.203323
## baz one -0.231653 -0.301010
## two 0.772011 1.347532
## second one two
## first
## bar A 1.105246 -0.488749
## B 1.312284 1.203323
## baz A -0.231653 0.772011
## B -0.301010 1.347532
## first bar baz
## second
## one A 1.105246 -0.231653
## B 1.312284 -0.301010
## two A -0.488749 0.772011
## B 1.203323 1.347532
## A B
## first second
## bar one 1.105246 1.312284
## two -0.488749 1.203323
## baz one -0.231653 -0.301010
## two 0.772011 1.347532
可以产生类似于excel数据透视表的结果,相当的直观
import pandas as pd
import numpy as np
df = pd.DataFrame({'A' : ['one', 'one', 'two', 'three'] * 3,
'B' : ['A', 'B', 'C'] * 4,
'C' : ['foo', 'foo', 'foo', 'bar', 'bar', 'bar'] * 2,
'D' : np.random.randn(12),
'E' : np.random.randn(12)})
print(df)
# 1)分组统计
# 其中参数index指定“行”键,columns指定“列”键。
print(pd.pivot_table(df, values='D', index=['A', 'B'], columns=['C']))
# 2)分项汇总
# 如果将参数margins设置为True,则可以得到分项总计数据。
print(pd.pivot_table(df, values='D', index=['A', 'B'], columns=['C'], margins=True))
# 3)交叉表crosstab
# 可以按照指定的行和列统计分组频数,用起来非常方便;当然同样的功能也可采用groupby实现。
print(pd.crosstab(df.A, df.B, margins=True))
## A B C D E
## 0 one A foo -0.221076 2.059491
## 1 one B foo -1.182770 -0.038186
## 2 two C foo 1.605421 -0.227043
## 3 three A bar -0.685556 -0.187624
## 4 one B bar 0.044209 -0.969931
## 5 one C bar -0.725583 -1.114549
## 6 two A foo 0.456501 1.171362
## 7 three B foo 1.287492 1.080057
## 8 one C foo -0.708959 -0.475746
## 9 one A bar -1.066508 -0.646032
## 10 two B bar -1.691038 0.983874
## 11 three C bar 1.066925 -0.616295
## C bar foo
## A B
## one A -1.066508 -0.221076
## B 0.044209 -1.182770
## C -0.725583 -0.708959
## three A -0.685556 NaN
## B NaN 1.287492
## C 1.066925 NaN
## two A NaN 0.456501
## B -1.691038 NaN
## C NaN 1.605421
## C bar foo All
## A B
## one A -1.066508 -0.221076 -0.643792
## B 0.044209 -1.182770 -0.569280
## C -0.725583 -0.708959 -0.717271
## three A -0.685556 NaN -0.685556
## B NaN 1.287492 1.287492
## C 1.066925 NaN 1.066925
## two A NaN 0.456501 0.456501
## B -1.691038 NaN -1.691038
## C NaN 1.605421 1.605421
## All -0.509592 0.206101 -0.151745
## B A B C All
## A
## one 2 2 2 6
## three 1 1 1 3
## two 1 1 1 3
## All 4 4 4 12
pandas在对频率转换进行重新采样时拥有简单、强大且高效的功能(如将按秒采样的数据转换为按5分钟为单位进行采样的数据)。这种操作在金融领域非常常见。
import pandas as pd
import numpy as np
## 1、时区表示:
rng = pd.date_range('1/1/2012', periods=10, freq='S')
print(rng)
ts = pd.Series(np.random.randint(0, 50, len(rng)), index=rng)
print(ts)
print(ts.resample('5Min').sum())
## 2、时区转换:
rng = pd.date_range('3/6/2012 00:00', periods=5, freq='D')
print(rng)
ts = pd.Series(np.random.randn(len(rng)), rng)
print(ts)
ts_utc = ts.tz_localize('UTC')
print(ts_utc)
## 3、时间跨度转换:
print(ts_utc.tz_convert('US/Eastern'))
## 4、时期和时间戳之间的转换使得可以使用一些方便的算术函数。
rng = pd.date_range('1/1/2012', periods=5, freq='M')
print(rng)
ts = pd.Series(np.random.randn(len(rng)), index=rng)
print(ts)
ps = ts.to_period()
print(ps)
print(ps.to_timestamp())
prng = pd.period_range('1990Q1', '2000Q4', freq='Q-NOV')
print(prng)
ts = pd.Series(np.random.randn(len(prng)), prng)
print(ts)
ts.index = (prng.asfreq('M', 'e') + 1).asfreq('H', 's') + 9
print(ts.head())
## DatetimeIndex(['2012-01-01 00:00:00', '2012-01-01 00:00:01',
## '2012-01-01 00:00:02', '2012-01-01 00:00:03',
## '2012-01-01 00:00:04', '2012-01-01 00:00:05',
## '2012-01-01 00:00:06', '2012-01-01 00:00:07',
## '2012-01-01 00:00:08', '2012-01-01 00:00:09'],
## dtype='datetime64[ns]', freq='S')
## 2012-01-01 00:00:00 12
## 2012-01-01 00:00:01 37
## 2012-01-01 00:00:02 38
## 2012-01-01 00:00:03 45
## 2012-01-01 00:00:04 10
## 2012-01-01 00:00:05 49
## 2012-01-01 00:00:06 39
## 2012-01-01 00:00:07 35
## 2012-01-01 00:00:08 11
## 2012-01-01 00:00:09 6
## Freq: S, dtype: int32
## 2012-01-01 282
## Freq: 5T, dtype: int32
## DatetimeIndex(['2012-03-06', '2012-03-07', '2012-03-08', '2012-03-09',
## '2012-03-10'],
## dtype='datetime64[ns]', freq='D')
## 2012-03-06 -0.022886
## 2012-03-07 1.200976
## 2012-03-08 1.297353
## 2012-03-09 -0.331530
## 2012-03-10 0.530223
## Freq: D, dtype: float64
## 2012-03-06 00:00:00+00:00 -0.022886
## 2012-03-07 00:00:00+00:00 1.200976
## 2012-03-08 00:00:00+00:00 1.297353
## 2012-03-09 00:00:00+00:00 -0.331530
## 2012-03-10 00:00:00+00:00 0.530223
## Freq: D, dtype: float64
## 2012-03-05 19:00:00-05:00 -0.022886
## 2012-03-06 19:00:00-05:00 1.200976
## 2012-03-07 19:00:00-05:00 1.297353
## 2012-03-08 19:00:00-05:00 -0.331530
## 2012-03-09 19:00:00-05:00 0.530223
## Freq: D, dtype: float64
## DatetimeIndex(['2012-01-31', '2012-02-29', '2012-03-31', '2012-04-30',
## '2012-05-31'],
## dtype='datetime64[ns]', freq='M')
## 2012-01-31 -0.684212
## 2012-02-29 0.517506
## 2012-03-31 1.310289
## 2012-04-30 0.990170
## 2012-05-31 0.688944
## Freq: M, dtype: float64
## 2012-01 -0.684212
## 2012-02 0.517506
## 2012-03 1.310289
## 2012-04 0.990170
## 2012-05 0.688944
## Freq: M, dtype: float64
## 2012-01-01 -0.684212
## 2012-02-01 0.517506
## 2012-03-01 1.310289
## 2012-04-01 0.990170
## 2012-05-01 0.688944
## Freq: MS, dtype: float64
## PeriodIndex(['1990Q1', '1990Q2', '1990Q3', '1990Q4', '1991Q1', '1991Q2',
## '1991Q3', '1991Q4', '1992Q1', '1992Q2', '1992Q3', '1992Q4',
## '1993Q1', '1993Q2', '1993Q3', '1993Q4', '1994Q1', '1994Q2',
## '1994Q3', '1994Q4', '1995Q1', '1995Q2', '1995Q3', '1995Q4',
## '1996Q1', '1996Q2', '1996Q3', '1996Q4', '1997Q1', '1997Q2',
## '1997Q3', '1997Q4', '1998Q1', '1998Q2', '1998Q3', '1998Q4',
## '1999Q1', '1999Q2', '1999Q3', '1999Q4', '2000Q1', '2000Q2',
## '2000Q3', '2000Q4'],
## dtype='int64', freq='Q-NOV')
## 1990Q1 -0.867859
## 1990Q2 0.830501
## 1990Q3 -0.105151
## 1990Q4 -1.067168
## 1991Q1 0.199731
## 1991Q2 1.563998
## 1991Q3 0.900670
## 1991Q4 -0.080161
## 1992Q1 -0.235376
## 1992Q2 -0.311618
## 1992Q3 -0.599459
## 1992Q4 0.297456
## 1993Q1 0.378720
## 1993Q2 -0.559001
## 1993Q3 0.021934
## 1993Q4 1.138138
## 1994Q1 -1.030343
## 1994Q2 -0.901108
## 1994Q3 1.927886
## 1994Q4 -1.479484
## 1995Q1 -1.613674
## 1995Q2 0.013176
## 1995Q3 1.063955
## 1995Q4 -1.212704
## 1996Q1 -0.004495
## 1996Q2 -0.327333
## 1996Q3 -0.009903
## 1996Q4 -1.194884
## 1997Q1 0.434205
## 1997Q2 -0.726546
## 1997Q3 1.791200
## 1997Q4 1.072537
## 1998Q1 1.376644
## 1998Q2 -0.408941
## 1998Q3 -0.073592
## 1998Q4 -3.184624
## 1999Q1 0.051400
## 1999Q2 0.249227
## 1999Q3 0.071067
## 1999Q4 0.303785
## 2000Q1 -1.374489
## 2000Q2 -1.485674
## 2000Q3 1.396185
## 2000Q4 1.123512
## Freq: Q-NOV, dtype: float64
## 1990-03-01 09:00 -0.867859
## 1990-06-01 09:00 0.830501
## 1990-09-01 09:00 -0.105151
## 1990-12-01 09:00 -1.067168
## 1991-03-01 09:00 0.199731
## Freq: H, dtype: float64
import pandas as pd
import numpy as np
dates = pd.date_range('20130101', periods=6)
df = pd.DataFrame(np.random.randn(6,3), index=dates, columns=list('ABC'))
print(df)
### CSV
# 写入csv文件:
df.to_csv('D:/foo.csv')
# 从csv文件中读取:
df1 = pd.read_csv('D:/foo.csv')
print(df1)
### HDF5
# 写入HDF5存储:
df.to_hdf('foo.h5', 'df')
# 从HDF5存储中读取:
pd.read_hdf('foo.h5', 'df')
### Excel
# 写入excel文件:
df.to_excel('D:/foo.xlsx', sheet_name='Sheet1')
# 从excel文件中读取:
df2 = pd.read_excel('D:/foo.xlsx', 'Sheet1', index_col=None, na_values=['NA'])
print(df2)
import pandas as pd, numpy as np
dates = pd.date_range('20130101', periods=6)
df = pd.DataFrame(np.random.randn(6,4), index=dates, columns=list('ABCD'))
mutate + ifelse
df['E'] = np.where(df['D'] >= 0, '>=0', '<0')
df['F'] = np.random.randint(0, 2, 6)
df.assign(G = df.A * df.D) # 或者
df['F'] = df['F'].apply(str) #针对单列的
df.applymap(str) #这个相当于是mutate_each
table
pd.value_counts(df["E"])
pd.pivot_table(df,index=['E','F'])
index 也就是取df的rownames;但与R不一样的是,df的index可能是多层的(MultiIndex)
df.index
df.set_index(['A'], drop = 0, append = 1) # 把已有的列设置为index,可保留之前的index,也可以把新的index在原数据中删除
df['dates'] = df.index # 新生成一列dates
df.reset_index(level=0, inplace=True) # 同上
df.reset_index(level=['index']) # 同上
删除列和行
df = df.drop('index', axis = 1) # 可以删除多列
df.drop(df.index[[1,3]])
column names
df.columns
df.columns = ['a', 'b', 'c', 'e', 'd', 'f'] # 重命名
df.rename(columns = {'A':'aa','B':'bb', 'C':'cc', 'D':'dd', 'E':'ee', 'F':'ff'}, inplace=True)
df.rename(columns=lambda x: x[1:].upper(), inplace=True) # 也可以用函数 inplace参数的意思就是代替原来的变量,深拷贝
哑变量 dummy variables
pd.Series(['a|b', np.nan, 'a|c']).str.get_dummies()
纯粹的df的矩阵,即不包含column和index
df.values
# FIX: DataFrame.get_values() was deprecated in pandas 0.25 and removed in 1.0;
# to_numpy() is the documented replacement for extracting the bare ndarray.
df.to_numpy()
summary
df.describe() # 只会针对数值型变量做计算
rbind
df2=pd.DataFrame([[5,6],[7,8]],columns=list('AB'))
df.append(df2, ignore_index=True)
group by 分组汇总计算,和pivot_table类似
df.groupby(['E','F']).mean()
df.groupby(['E','F']).agg(['sum', 'mean'])
pd.pivot_table(df,index=['E','F'], aggfunc=[np.sum, np.mean])
df.pivot_table(index=['E','F'], aggfunc=[np.sum, np.mean]) # 同上
df.groupby(['E','F']).agg({'A':['mean','sum'], 'B':'min'}) # groupby 也可以这样写
排序
# FIX: DataFrame.sort() was deprecated in pandas 0.17 and removed in 0.20;
# sort_values is the supported API. na_position controls where NaNs end up.
df.sort_values(['A','B'], ascending=[True, False])  # sort by column values
df.sort_index(ascending=False)  # sort by the index
筛选
df[(df.A >= -1) & (df.B <= 0)] # 值筛选
df[df.E.str.contains(">")] # 包含某个字符,contains筛选的其实是正则表达式
df[df.F.isin(['1'])] # 在列表内
变量选择
df['A'] # 单个的列
df[0:3] # 行
df['20130102':'20130104'] # 按index筛选
df.loc[:,] # 类似于R里面的dataframe选行和列的方法
df.iloc[:,] # iloc只能用数字了
抽样
df.sample(10, replace = True)
df.sample(3)
df.sample(frac = 0.5) # 按比例抽样
df.sample(frac = 10, replace = True,weights = np.random.randint(1,10,6)) # 对样本加权
df.sample(3, axis = 1) # 变量抽样
join(即 merge)
pd.merge(df.sample(4), df.sample(4), how = "left", on = "A", indicator = True)
随机数
# FIX: the original used `numpy.random.rand`, but this file only ever imports
# the library as `np`, so the name `numpy` is undefined.
np.random.rand(3, 2)           # uniform [0,1) samples with the given shape
np.random.randn(2, 5)          # standard-normal samples with the given shape
# FIX (comment was wrong): randint(low[, high, size]) — high is OPTIONAL; when
# high is omitted the range is [0, low). size defaults to a single value.
np.random.randint(2, size=10)
np.random.bytes(10)            # 10 random bytes
a = np.arange(10)
np.random.shuffle(a)           # in-place shuffle
a = np.arange(9).reshape(3, 3)
np.random.shuffle(a)           # for multi-dimensional arrays only the first axis is shuffled
np.random.permutation(10)      # returns a shuffled copy; also works on sequences/arrays
np.random.permutation(10).reshape(2, 5)
np.random.seed(1000)           # seed the global RNG
np.random.normal(2, 3, [5, 2]) # Gaussian (正态分布) with mean 2, std 3; other distributions available
# http://docs.scipy.org/doc/numpy-1.10.1/reference/routines.random.html
import scipy.stats             # FIX: scipy.stats was used below without being imported
np.random.seed(12345678)
x = scipy.stats.norm.rvs(loc=5, scale=3, size=100)  # scipy can also draw random variates, with tests attached
scipy.stats.shapiro(x)
# http://docs.scipy.org/doc/scipy-0.17.0/reference/stats.html
gather和spread
# gather:
def gather(df, key, value, cols):
    """Long-format reshape like tidyr's gather: melt *cols* into key/value columns.

    df    -- DataFrame to reshape
    key   -- name for the new "variable" column
    value -- name for the new "value" column
    cols  -- columns to melt; every other column becomes an id variable
    """
    id_vars = [col for col in df.columns if col not in cols]
    # FIX: the original called `pandas.melt`, but the name `pandas` is never
    # imported in this file — the library is imported as `pd` throughout.
    # Keyword arguments made explicit for readability.
    return pd.melt(df, id_vars=id_vars, value_vars=cols,
                   var_name=key, value_name=value)
# 以上是定义的一个函数,实际上一样的,横变竖,是gather,竖变横,是spread
pd.melt(df, id_vars=['E','F'], value_vars=['A','C'])
# spread:
pd.pivot(df["D"],df["E"],df['F']) #这个是竖变横
df3=pd.pivot(df2['D'],df2['variable'],df2['value'])
df3.reset_index(level=0, inplace=True) # 再变回df的样子
熵
scipy.stats.entropy(np.arange(10))
字符串拼接
[",".join(['a','b','d'])]
df[['E','F']].groupby('F')['E'].apply(lambda x: "{%s}" % ', '.join(x)) # 分组拼接,前提是这些列都要是字符串
df[['E','F']].applymap(str).groupby('E')['F'].apply(lambda x: "%s" % ', '.join(x)) # 所以可以这样
随机字符串生成
import random, string
df2 = pd.DataFrame(range(10), columns=['y'])
# FIX: string.lowercase is Python 2 only; this file uses print() (Python 3),
# where the constant is string.ascii_lowercase.
df2["x"] = [",".join(random.sample(string.ascii_lowercase, random.randint(2, 5)))
            for i in range(10)]
分列后生成hash表
# 用20 的示例数据
df3=pd.DataFrame(df2.x.str.split(',').tolist(),index=df2.y).stack().reset_index(level=0)
df3.columns=["y","x"]
去重
df[["F","E"]].drop_duplicates()
离散化
pd.cut(df.A,range(-1,2,1))