http://pandas.pydata.org/pandas-docs/stable/index.html

http://pandas.pydata.org/pandas-docs/stable/10min.html

http://blog.csdn.net/claroja/article/category/6919729/1

http://blog.csdn.net/pipisorry/article/category/5916621

DataFrame

http://pandas.pydata.org/pandas-docs/stable/r_interface.html

http://pandas.pydata.org/pandas-docs/stable/comparison_with_r.html

1.创建对象

1、可以通过传递一个list对象来创建一个Series,pandas会默认创建整型索引:

import pandas as pd
import numpy as np

s = pd.Series([1,3,5,np.nan,6,8])
print(s)
## 0    1.0
## 1    3.0
## 2    5.0
## 3    NaN
## 4    6.0
## 5    8.0
## dtype: float64

2、通过传递一个numpy array,时间索引以及列标签来创建一个DataFrame:

import pandas as pd
import numpy as np

dates = pd.date_range('20130101', periods=6)
print(dates)

df = pd.DataFrame(np.random.randn(6,3), index=dates, columns=list('ABC'))
print(df)
## DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
##                '2013-01-05', '2013-01-06'],
##               dtype='datetime64[ns]', freq='D')
##                    A         B         C
## 2013-01-01  0.719050 -1.190380  1.216087
## 2013-01-02  0.634315 -0.638159  1.237365
## 2013-01-03 -0.545393  0.203222  0.458559
## 2013-01-04 -1.627630 -1.688958 -1.392453
## 2013-01-05 -1.364594  0.149654 -0.139358
## 2013-01-06  1.953070  1.027044 -1.311478

3、通过传递一个能够被转换成类似序列结构的字典对象来创建一个DataFrame:

import pandas as pd
import numpy as np

df2 = pd.DataFrame({ 'A' : 1.,
                     'B' : pd.Timestamp('20130102'),
                     'C' : pd.Series(1,index=list(range(4)),dtype='float32'),
                     'D' : np.array([3] * 4,dtype='int32'),
                     'E' : pd.Categorical(["test","train","test","train"]),
                     'F' : 'foo' })

print(df2)
print(df2.dtypes)
##      A          B    C  D      E    F
## 0  1.0 2013-01-02  1.0  3   test  foo
## 1  1.0 2013-01-02  1.0  3  train  foo
## 2  1.0 2013-01-02  1.0  3   test  foo
## 3  1.0 2013-01-02  1.0  3  train  foo
## A           float64
## B    datetime64[ns]
## C           float32
## D             int32
## E          category
## F            object
## dtype: object

2.查看数据

1、查看frame中头部和尾部的行:

import pandas as pd
import numpy as np

dates = pd.date_range('20130101', periods=6)
df = pd.DataFrame(np.random.randn(6,3), index=dates, columns=list('ABC'))
print(df.head())
print(df.tail(2))
##                    A         B         C
## 2013-01-01  1.474897 -0.090145  1.318224
## 2013-01-02  1.725293  0.988291 -2.028873
## 2013-01-03  0.588877 -0.249527 -2.231888
## 2013-01-04 -0.762990 -0.356846  1.227532
## 2013-01-05 -2.137339 -0.335290  1.963766
##                    A        B         C
## 2013-01-05 -2.137339 -0.33529  1.963766
## 2013-01-06  1.187954 -0.44375  1.342456

2、显示索引、列和底层的numpy数据:

import pandas as pd
import numpy as np

dates = pd.date_range('20130101', periods=6)
df = pd.DataFrame(np.random.randn(6,3), index=dates, columns=list('ABC'))
print(df.index)
print(df.columns)
print(df.values)
## DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
##                '2013-01-05', '2013-01-06'],
##               dtype='datetime64[ns]', freq='D')
## Index(['A', 'B', 'C'], dtype='object')
## [[ 1.31825181  0.26434194  0.43418584]
##  [-0.09103258 -1.27462493 -0.2828914 ]
##  [-1.91087252  0.50810281  0.76963331]
##  [ 0.35981189  0.39978904 -0.11465738]
##  [-0.82236698 -1.47789632  1.78549502]
##  [-1.71783794 -1.94443651 -1.28818172]]

3、describe()函数对于数据的快速统计汇总:

import pandas as pd
import numpy as np

dates = pd.date_range('20130101', periods=6)
df = pd.DataFrame(np.random.randn(6,3), index=dates, columns=list('ABC'))

print(df.describe())
##               A         B         C
## count  6.000000  6.000000  6.000000
## mean  -0.526849 -0.437596 -0.131694
## std    1.150566  0.888248  1.378806
## min   -1.863193 -1.976389 -2.003173
## 25%   -0.980001 -0.733561 -1.137370
## 50%   -0.872928 -0.214403  0.094520
## 75%   -0.247964  0.043086  0.815450
## max    1.503289  0.557319  1.516211

4、对数据的转置:

import pandas as pd
import numpy as np

dates = pd.date_range('20130101', periods=6)
df = pd.DataFrame(np.random.randn(6,3), index=dates, columns=list('ABC'))

print(df.T)
##    2013-01-01  2013-01-02  2013-01-03  2013-01-04  2013-01-05  2013-01-06
## A   -0.432442    0.306032    0.069921    1.108310   -0.377539   -0.422082
## B    0.384943   -1.215627    0.067218   -0.238315    1.150479    0.511353
## C   -0.794288   -0.316593   -1.498269   -0.683950    0.152308    1.144845

5、排序

import pandas as pd
import numpy as np

dates = pd.date_range('20130101', periods=6)
df = pd.DataFrame(np.random.randn(6,3), index=dates, columns=list('ABC'))

# 默认按frame升序,降序可在括号加ascending=False
print(df.sort_index(axis=1, ascending=False))

# 按frame的行索引进行排序
print(df.sort_index())

# 按frame的列索引进行排序
print(df.sort_index(axis=1))

# 按frame的一个列或多个列的值进行排序
print(df.sort_index(by='A'))

print(df.sort_index(by=['A','B']))
## -c:17: FutureWarning: by argument to sort_index is deprecated, pls use .sort_values(by=...)
## -c:19: FutureWarning: by argument to sort_index is deprecated, pls use .sort_values(by=...)
##                    C         B         A
## 2013-01-01  2.668079  2.119120  0.056270
## 2013-01-02 -0.016571 -1.687678 -0.937221
## 2013-01-03  1.214240 -1.198925  0.633789
## 2013-01-04 -0.684196  0.418149  0.034202
## 2013-01-05  0.432678 -1.426104  0.502198
## 2013-01-06  1.657599  1.011420 -1.416276
##                    A         B         C
## 2013-01-01  0.056270  2.119120  2.668079
## 2013-01-02 -0.937221 -1.687678 -0.016571
## 2013-01-03  0.633789 -1.198925  1.214240
## 2013-01-04  0.034202  0.418149 -0.684196
## 2013-01-05  0.502198 -1.426104  0.432678
## 2013-01-06 -1.416276  1.011420  1.657599
##                    A         B         C
## 2013-01-01  0.056270  2.119120  2.668079
## 2013-01-02 -0.937221 -1.687678 -0.016571
## 2013-01-03  0.633789 -1.198925  1.214240
## 2013-01-04  0.034202  0.418149 -0.684196
## 2013-01-05  0.502198 -1.426104  0.432678
## 2013-01-06 -1.416276  1.011420  1.657599
##                    A         B         C
## 2013-01-06 -1.416276  1.011420  1.657599
## 2013-01-02 -0.937221 -1.687678 -0.016571
## 2013-01-04  0.034202  0.418149 -0.684196
## 2013-01-01  0.056270  2.119120  2.668079
## 2013-01-05  0.502198 -1.426104  0.432678
## 2013-01-03  0.633789 -1.198925  1.214240
##                    A         B         C
## 2013-01-06 -1.416276  1.011420  1.657599
## 2013-01-02 -0.937221 -1.687678 -0.016571
## 2013-01-04  0.034202  0.418149 -0.684196
## 2013-01-01  0.056270  2.119120  2.668079
## 2013-01-05  0.502198 -1.426104  0.432678
## 2013-01-03  0.633789 -1.198925  1.214240

3、选择

1、获取

import pandas as pd
import numpy as np

dates = pd.date_range('20130101', periods=6)
df = pd.DataFrame(np.random.randn(6,3), index=dates, columns=list('ABC'))

# 1、选择一个单独的列,这将会返回一个Series,等同于df.A:
print(df['A'])

# 2、通过[]进行选择,这将会对行进行切片
print(df[0:3])
print(df['20130102':'20130104'])
## 2013-01-01    0.334347
## 2013-01-02   -0.400806
## 2013-01-03   -1.640650
## 2013-01-04    0.178510
## 2013-01-05    0.714261
## 2013-01-06   -0.203589
## Freq: D, Name: A, dtype: float64
##                    A         B         C
## 2013-01-01  0.334347 -0.488606  1.257202
## 2013-01-02 -0.400806  2.522782  0.568591
## 2013-01-03 -1.640650 -1.343565  1.008803
##                    A         B         C
## 2013-01-02 -0.400806  2.522782  0.568591
## 2013-01-03 -1.640650 -1.343565  1.008803
## 2013-01-04  0.178510 -0.411125  0.070722

2、通过标签选择

import pandas as pd
import numpy as np

dates = pd.date_range('20130101', periods=6)
df = pd.DataFrame(np.random.randn(6,3), index=dates, columns=list('ABC'))

# 1、使用标签来获取一个交叉的区域
print(df.loc[dates[0]])
  
# 2、通过标签来在多个轴上进行选择
print(df.loc[:,['A','B']])

# 3、标签切片
print(df.loc['20130102':'20130104',['A','B']])

# 4、对于返回的对象进行维度缩减
print(df.loc['20130102',['A','B']])

# 5、获取一个标量
print(df.loc[dates[0],'A'])

# 6、快速访问一个标量(与上一个方法等价)
print(df.at[dates[0],'A'])
## A   -0.105711
## B   -1.553135
## C    1.401940
## Name: 2013-01-01 00:00:00, dtype: float64
##                    A         B
## 2013-01-01 -0.105711 -1.553135
## 2013-01-02  0.516738 -0.395743
## 2013-01-03  0.042964  0.424069
## 2013-01-04 -0.808021  0.503598
## 2013-01-05  0.364841 -0.071134
## 2013-01-06  0.132690 -1.146901
##                    A         B
## 2013-01-02  0.516738 -0.395743
## 2013-01-03  0.042964  0.424069
## 2013-01-04 -0.808021  0.503598
## A    0.516738
## B   -0.395743
## Name: 2013-01-02 00:00:00, dtype: float64
## -0.105711464678
## -0.105711464678

3、通过位置选择

import pandas as pd
import numpy as np

dates = pd.date_range('20130101', periods=6)
df = pd.DataFrame(np.random.randn(6,3), index=dates, columns=list('ABC'))

# 1、通过传递数值进行位置选择(选择的是行)
print(df.iloc[3])

# 2、通过数值进行切片,与numpy/python中的情况类似
print(df.iloc[3:5,0:2])

# 3、通过指定一个位置的列表,与numpy/python中的情况类似
print(df.iloc[[1,2,4],[0,2]])

# 4、对行进行切片
print(df.iloc[1:3,:])

# 5、对列进行切片
print(df.iloc[:,1:3])

# 6、获取特定的值
print(df.iloc[1,1])
print(df.iat[1,1])
## A   -0.430113
## B    1.166528
## C   -0.151003
## Name: 2013-01-04 00:00:00, dtype: float64
##                    A         B
## 2013-01-04 -0.430113  1.166528
## 2013-01-05  0.707701  0.523491
##                    A         C
## 2013-01-02 -0.272539  1.178722
## 2013-01-03  1.349420 -1.980765
## 2013-01-05  0.707701  0.211483
##                    A         B         C
## 2013-01-02 -0.272539 -2.579566  1.178722
## 2013-01-03  1.349420 -1.060671 -1.980765
##                    B         C
## 2013-01-01 -0.869997  0.135881
## 2013-01-02 -2.579566  1.178722
## 2013-01-03 -1.060671 -1.980765
## 2013-01-04  1.166528 -0.151003
## 2013-01-05  0.523491  0.211483
## 2013-01-06 -0.914865  0.580675
## -2.57956639873
## -2.57956639873

4、布尔索引

import pandas as pd
import numpy as np

dates = pd.date_range('20130101', periods=6)
df = pd.DataFrame(np.random.randn(6,3), index=dates, columns=list('ABC'))

# 1、使用一个单独列的值来选择数据:
print(df[df.A > 0])

# 2、使用where操作来选择数据:
print(df[df > 0])

# 3、使用isin()方法来过滤:
df2 = df.copy()
print(df2)
df2['E'] = ['one', 'one','two','three','four','three']
df2
print(df2)
print(df2[df2['E'].isin(['one','four'])])
##                    A         B        C
## 2013-01-06  0.719447  0.607389 -0.50302
##                    A         B         C
## 2013-01-01       NaN       NaN  1.568646
## 2013-01-02       NaN  0.696549       NaN
## 2013-01-03       NaN       NaN  1.041927
## 2013-01-04       NaN  1.384888       NaN
## 2013-01-05       NaN       NaN  0.881413
## 2013-01-06  0.719447  0.607389       NaN
##                    A         B         C
## 2013-01-01 -1.508735 -0.098011  1.568646
## 2013-01-02 -0.584852  0.696549 -0.097488
## 2013-01-03 -0.770481 -1.115255  1.041927
## 2013-01-04 -0.804570  1.384888 -0.067071
## 2013-01-05 -1.060169 -1.229640  0.881413
## 2013-01-06  0.719447  0.607389 -0.503020
##                    A         B         C      E
## 2013-01-01 -1.508735 -0.098011  1.568646    one
## 2013-01-02 -0.584852  0.696549 -0.097488    one
## 2013-01-03 -0.770481 -1.115255  1.041927    two
## 2013-01-04 -0.804570  1.384888 -0.067071  three
## 2013-01-05 -1.060169 -1.229640  0.881413   four
## 2013-01-06  0.719447  0.607389 -0.503020  three
##                    A         B         C     E
## 2013-01-01 -1.508735 -0.098011  1.568646   one
## 2013-01-02 -0.584852  0.696549 -0.097488   one
## 2013-01-05 -1.060169 -1.229640  0.881413  four

5、设置(新值)

import pandas as pd
import numpy as np

dates = pd.date_range('20130101', periods=6)
df = pd.DataFrame(np.random.randn(6,3), index=dates, columns=list('ABC'))

# 1、设置一个新的列:
s1 = pd.Series([1,2,3,4,5,6], index=pd.date_range('20130102', periods=6))
print(s1)
print(df)
df['F'] = s1

# 2、通过标签设置新的值:
df.at[dates[0],'A'] = 0

# 3、通过位置设置新的值:
df.iat[0,1] = 0

# 4、通过一个numpy数组设置一组新值:
df.loc[:,'D'] = np.array([5] * len(df))

# 5、通过where操作来设置新的值:(必须数值型,存在字符串列会出错!)
print(df)
df2 = df.copy()
df2[df2 > 0] = -df2
print(df2)
## 2013-01-02    1
## 2013-01-03    2
## 2013-01-04    3
## 2013-01-05    4
## 2013-01-06    5
## 2013-01-07    6
## Freq: D, dtype: int64
##                    A         B         C
## 2013-01-01  1.420685  0.752250 -1.270429
## 2013-01-02 -0.819530  0.749631 -0.816112
## 2013-01-03  1.203946  0.082946 -1.361337
## 2013-01-04  0.844981  0.617703  0.544874
## 2013-01-05 -0.847279 -0.786258 -0.880352
## 2013-01-06 -0.126569 -1.887023  0.392236
##                    A         B         C    F  D
## 2013-01-01  0.000000  0.000000 -1.270429  NaN  5
## 2013-01-02 -0.819530  0.749631 -0.816112  1.0  5
## 2013-01-03  1.203946  0.082946 -1.361337  2.0  5
## 2013-01-04  0.844981  0.617703  0.544874  3.0  5
## 2013-01-05 -0.847279 -0.786258 -0.880352  4.0  5
## 2013-01-06 -0.126569 -1.887023  0.392236  5.0  5
##                    A         B         C    F  D
## 2013-01-01  0.000000  0.000000 -1.270429  NaN -5
## 2013-01-02 -0.819530 -0.749631 -0.816112 -1.0 -5
## 2013-01-03 -1.203946 -0.082946 -1.361337 -2.0 -5
## 2013-01-04 -0.844981 -0.617703 -0.544874 -3.0 -5
## 2013-01-05 -0.847279 -0.786258 -0.880352 -4.0 -5
## 2013-01-06 -0.126569 -1.887023 -0.392236 -5.0 -5

4. 缺失值处理

在pandas中,使用np.nan来代替缺失值,这些值将默认不会包含在计算中。

import pandas as pd
import numpy as np

dates = pd.date_range('20130101', periods=6)
df = pd.DataFrame(np.random.randn(6,3), index=dates, columns=list('ABC'))

# 1、reindex()方法可以对指定轴上的索引进行改变/增加/删除操作,这将返回原始数据的一个拷贝:
df1 = df.reindex(index=dates[0:4], columns=list(df.columns) + ['E'])
df1.loc[dates[0]:dates[1],'E'] = 1
print(df1)

# 2、去掉包含缺失值的行:
print(df1.dropna(how='any'))

# 3、对缺失值进行填充:
print(df1.fillna(value=5))

# 4、对数据进行布尔填充:
print(pd.isnull(df1))
##                    A         B         C    E
## 2013-01-01 -1.844647  0.638594 -0.352188  1.0
## 2013-01-02 -0.003575 -0.153742 -0.617550  1.0
## 2013-01-03 -0.795512 -0.154565  0.391559  NaN
## 2013-01-04 -0.411507 -0.505420  0.012720  NaN
##                    A         B         C    E
## 2013-01-01 -1.844647  0.638594 -0.352188  1.0
## 2013-01-02 -0.003575 -0.153742 -0.617550  1.0
##                    A         B         C    E
## 2013-01-01 -1.844647  0.638594 -0.352188  1.0
## 2013-01-02 -0.003575 -0.153742 -0.617550  1.0
## 2013-01-03 -0.795512 -0.154565  0.391559  5.0
## 2013-01-04 -0.411507 -0.505420  0.012720  5.0
##                 A      B      C      E
## 2013-01-01  False  False  False  False
## 2013-01-02  False  False  False  False
## 2013-01-03  False  False  False   True
## 2013-01-04  False  False  False   True

5. 相关操作

统计(相关操作通常情况下不包括缺失值)

import pandas as pd
import numpy as np

dates = pd.date_range('20130101', periods=6)
df = pd.DataFrame(np.random.randn(6,3), index=dates, columns=list('ABC'))
print(df)

# 1、执行描述性统计:
print(df.mean())

# 2、在其他轴上进行相同的操作:
print(df.mean(1))

# 3、对于拥有不同维度,需要对齐的对象进行操作。Pandas会自动的沿着指定的维度进行。
s = pd.Series([1,3,5,np.nan,6,8], index=dates).shift(2)
print(s)
print(df.sub(s, axis='index'))
##                    A         B         C
## 2013-01-01  0.769428 -1.202104 -2.450158
## 2013-01-02  0.998613  1.412680 -0.949217
## 2013-01-03  1.088385 -0.148886  0.179365
## 2013-01-04 -0.838662 -0.769332 -0.555229
## 2013-01-05 -1.008979 -0.853671  1.133233
## 2013-01-06 -0.778389  2.249821 -0.142890
## A    0.038399
## B    0.114751
## C   -0.464149
## dtype: float64
## 2013-01-01   -0.960945
## 2013-01-02    0.487359
## 2013-01-03    0.372955
## 2013-01-04   -0.721074
## 2013-01-05   -0.243139
## 2013-01-06    0.442847
## Freq: D, dtype: float64
## 2013-01-01    NaN
## 2013-01-02    NaN
## 2013-01-03    1.0
## 2013-01-04    3.0
## 2013-01-05    5.0
## 2013-01-06    NaN
## Freq: D, dtype: float64
##                    A         B         C
## 2013-01-01       NaN       NaN       NaN
## 2013-01-02       NaN       NaN       NaN
## 2013-01-03  0.088385 -1.148886 -0.820635
## 2013-01-04 -3.838662 -3.769332 -3.555229
## 2013-01-05 -6.008979 -5.853671 -3.866767
## 2013-01-06       NaN       NaN       NaN

Apply

import pandas as pd
import numpy as np

dates = pd.date_range('20130101', periods=6)
df = pd.DataFrame(np.random.randn(6,3), index=dates, columns=list('ABC'))
print(df)

# 对数据应用函数:
print(df.apply(np.cumsum))
print(df.apply(lambda x: x.max() - x.min()))
print(np.sum(df['A']))
##                    A         B         C
## 2013-01-01 -0.441176 -0.211272 -1.031730
## 2013-01-02 -0.820718  1.133895  0.522500
## 2013-01-03  1.085982 -0.205137 -1.300741
## 2013-01-04  0.306680 -2.745015  0.703153
## 2013-01-05  1.340365 -0.521452  0.267131
## 2013-01-06 -0.956634  0.962353 -1.028829
##                    A         B         C
## 2013-01-01 -0.441176 -0.211272 -1.031730
## 2013-01-02 -1.261894  0.922623 -0.509230
## 2013-01-03 -0.175912  0.717487 -1.809972
## 2013-01-04  0.130768 -2.027528 -1.106818
## 2013-01-05  1.471132 -2.548980 -0.839688
## 2013-01-06  0.514498 -1.586626 -1.868517
## A    2.296999
## B    3.878910
## C    2.003895
## dtype: float64
## 0.5144983920883905

字符串方法

import pandas as pd
import numpy as np

s = pd.Series(['A', 'B', 'C', 'Aaba', 'Baca', np.nan, 'CABA', 'dog', 'cat'])

print(s)
print(s.str.lower())
## 0       A
## 1       B
## 2       C
## 3    Aaba
## 4    Baca
## 5     NaN
## 6    CABA
## 7     dog
## 8     cat
## dtype: object
## 0       a
## 1       b
## 2       c
## 3    aaba
## 4    baca
## 5     NaN
## 6    caba
## 7     dog
## 8     cat
## dtype: object

6.合并

Pandas提供了大量的方法能够轻松的对Series,DataFrame和Panel对象进行各种符合各种逻辑关系的合并操作。

import pandas as pd
import numpy as np

# Concat
df = pd.DataFrame(np.random.randn(10, 4))
print(df)
pieces = [df[:3], df[3:7], df[7:]]
print(pieces)
print(pd.concat(pieces))

# Join 类似于SQL类型的合并,具体请参阅:Database style joining
left = pd.DataFrame({'key': ['foo', 'foo'], 'lval': [1, 2]})
right = pd.DataFrame({'key': ['foo', 'foo'], 'rval': [4, 5]})
print(left)
print(right)
print(pd.merge(left, right, on='key'))

left = pd.DataFrame({'key': ['foo', 'bar'], 'lval': [1, 2]})
right = pd.DataFrame({'key': ['foo', 'bar'], 'rval': [4, 5]})
print(left)
print(right)
print(pd.merge(left, right, on='key'))

# Append 将一行连接到一个DataFrame上,具体请参阅Appending:
df = pd.DataFrame(np.random.randn(8, 4), columns=['A','B','C','D'])
print(df)
s = df.iloc[3]
print(s)
print(df.append(s, ignore_index=True))
##           0         1         2         3
## 0 -1.762194 -0.043978 -0.096215  1.196402
## 1 -0.047959 -0.821557 -0.549494  0.488567
## 2  0.350267  0.342583  0.611219  0.073297
## 3  1.050589 -1.071536 -0.882427 -0.056424
## 4 -0.634756 -0.866571 -0.905026 -0.531998
## 5 -0.705706  0.039591  0.611581  0.510896
## 6  0.208323  0.733922  0.157127 -0.618363
## 7 -1.111692  0.353678 -0.128415  0.977743
## 8 -1.087846 -0.657068 -0.375478 -1.566491
## 9 -1.873369 -0.869045 -1.681428  0.357354
## [          0         1         2         3
## 0 -1.762194 -0.043978 -0.096215  1.196402
## 1 -0.047959 -0.821557 -0.549494  0.488567
## 2  0.350267  0.342583  0.611219  0.073297,           0         1         2         3
## 3  1.050589 -1.071536 -0.882427 -0.056424
## 4 -0.634756 -0.866571 -0.905026 -0.531998
## 5 -0.705706  0.039591  0.611581  0.510896
## 6  0.208323  0.733922  0.157127 -0.618363,           0         1         2         3
## 7 -1.111692  0.353678 -0.128415  0.977743
## 8 -1.087846 -0.657068 -0.375478 -1.566491
## 9 -1.873369 -0.869045 -1.681428  0.357354]
##           0         1         2         3
## 0 -1.762194 -0.043978 -0.096215  1.196402
## 1 -0.047959 -0.821557 -0.549494  0.488567
## 2  0.350267  0.342583  0.611219  0.073297
## 3  1.050589 -1.071536 -0.882427 -0.056424
## 4 -0.634756 -0.866571 -0.905026 -0.531998
## 5 -0.705706  0.039591  0.611581  0.510896
## 6  0.208323  0.733922  0.157127 -0.618363
## 7 -1.111692  0.353678 -0.128415  0.977743
## 8 -1.087846 -0.657068 -0.375478 -1.566491
## 9 -1.873369 -0.869045 -1.681428  0.357354
##    key  lval
## 0  foo     1
## 1  foo     2
##    key  rval
## 0  foo     4
## 1  foo     5
##    key  lval  rval
## 0  foo     1     4
## 1  foo     1     5
## 2  foo     2     4
## 3  foo     2     5
##    key  lval
## 0  foo     1
## 1  bar     2
##    key  rval
## 0  foo     4
## 1  bar     5
##    key  lval  rval
## 0  foo     1     4
## 1  bar     2     5
##           A         B         C         D
## 0  2.471918  0.933723 -1.157861  1.245011
## 1 -0.002935  0.606728 -0.791455  0.252616
## 2  1.046225 -0.678820 -0.958433  0.719773
## 3  0.029483 -0.646972  0.246525  0.064674
## 4 -1.230817  1.887350  1.278275  0.301016
## 5  0.622592 -0.643613 -0.831216 -0.572489
## 6  0.267997 -0.325497 -0.526077 -1.967288
## 7  0.850685  1.420866  0.258023  0.130523
## A    0.029483
## B   -0.646972
## C    0.246525
## D    0.064674
## Name: 3, dtype: float64
##           A         B         C         D
## 0  2.471918  0.933723 -1.157861  1.245011
## 1 -0.002935  0.606728 -0.791455  0.252616
## 2  1.046225 -0.678820 -0.958433  0.719773
## 3  0.029483 -0.646972  0.246525  0.064674
## 4 -1.230817  1.887350  1.278275  0.301016
## 5  0.622592 -0.643613 -0.831216 -0.572489
## 6  0.267997 -0.325497 -0.526077 -1.967288
## 7  0.850685  1.420866  0.258023  0.130523
## 8  0.029483 -0.646972  0.246525  0.064674

7. 分组

对于"group by"操作,我们通常是指以下一个或多个操作步骤:

  - (Splitting)按照一些规则将数据分为不同的组
  - (Applying) 对于每组数据分别执行一个函数
  - (Combining)将结果组合到一个数据结构中
import pandas as pd
import numpy as np

df = pd.DataFrame({'A' : ['foo', 'bar', 'foo', 'bar','foo', 'bar', 'foo', 'foo'],
                   'B' : ['one', 'one', 'two', 'three','two', 'two', 'one', 'three'],
                   'C' : np.random.randn(8),
                   'D' : np.random.randn(8)})

print(df)

# 分组并对每个分组执行sum函数:
group1 = df.groupby('A').sum()
print(group1)
# print([x for x in group1])
# print(group1.size())
print(group1.count())

# 通过多个列进行分组形成一个层次索引,然后执行函数:
group2 = df.groupby(['A','B']).sum()
print(group2)
##      A      B         C         D
## 0  foo    one  0.077003 -1.656320
## 1  bar    one  0.580323  0.941866
## 2  foo    two -0.556902 -0.040688
## 3  bar  three -1.367754 -0.236453
## 4  foo    two -1.800508  0.475211
## 5  bar    two  0.390206  1.823098
## 6  foo    one  0.970117 -0.510762
## 7  foo  three  0.094226 -0.368662
##             C         D
## A                      
## bar -0.397225  2.528511
## foo -1.216064 -2.101221
## C    2
## D    2
## dtype: int64
##                   C         D
## A   B                        
## bar one    0.580323  0.941866
##     three -1.367754 -0.236453
##     two    0.390206  1.823098
## foo one    1.047119 -2.167082
##     three  0.094226 -0.368662
##     two   -2.357409  0.434523

8.Reshaping

Stack

import pandas as pd
import numpy as np

tuples = list(zip(*[['bar', 'bar', 'baz', 'baz','foo', 'foo', 'qux', 'qux'],
                    ['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two']]))
print(tuples)

index = pd.MultiIndex.from_tuples(tuples, names=['first', 'second'])
print(index)

df = pd.DataFrame(np.random.randn(8, 2), index=index, columns=['A', 'B'])
print(df)

df2 = df[:4]
print(df2)

stacked = df2.stack()
print(stacked)

print(stacked.unstack())

print(stacked.unstack(1))

print(stacked.unstack(0))

print(stacked.unstack(2))
## [('bar', 'one'), ('bar', 'two'), ('baz', 'one'), ('baz', 'two'), ('foo', 'one'), ('foo', 'two'), ('qux', 'one'), ('qux', 'two')]
## MultiIndex(levels=[['bar', 'baz', 'foo', 'qux'], ['one', 'two']],
##            labels=[[0, 0, 1, 1, 2, 2, 3, 3], [0, 1, 0, 1, 0, 1, 0, 1]],
##            names=['first', 'second'])
##                      A         B
## first second                    
## bar   one     1.105246  1.312284
##       two    -0.488749  1.203323
## baz   one    -0.231653 -0.301010
##       two     0.772011  1.347532
## foo   one    -0.363355  1.388029
##       two    -0.232480  0.335387
## qux   one     0.647417  1.879464
##       two    -0.348406 -0.533293
##                      A         B
## first second                    
## bar   one     1.105246  1.312284
##       two    -0.488749  1.203323
## baz   one    -0.231653 -0.301010
##       two     0.772011  1.347532
## first  second   
## bar    one     A    1.105246
##                B    1.312284
##        two     A   -0.488749
##                B    1.203323
## baz    one     A   -0.231653
##                B   -0.301010
##        two     A    0.772011
##                B    1.347532
## dtype: float64
##                      A         B
## first second                    
## bar   one     1.105246  1.312284
##       two    -0.488749  1.203323
## baz   one    -0.231653 -0.301010
##       two     0.772011  1.347532
## second        one       two
## first                      
## bar   A  1.105246 -0.488749
##       B  1.312284  1.203323
## baz   A -0.231653  0.772011
##       B -0.301010  1.347532
## first          bar       baz
## second                      
## one    A  1.105246 -0.231653
##        B  1.312284 -0.301010
## two    A -0.488749  0.772011
##        B  1.203323  1.347532
##                      A         B
## first second                    
## bar   one     1.105246  1.312284
##       two    -0.488749  1.203323
## baz   one    -0.231653 -0.301010
##       two     0.772011  1.347532

数据透视表

可以产生类似于excel数据透视表的结果,相当的直观

import pandas as pd
import numpy as np

df = pd.DataFrame({'A' : ['one', 'one', 'two', 'three'] * 3,
                   'B' : ['A', 'B', 'C'] * 4,
                   'C' : ['foo', 'foo', 'foo', 'bar', 'bar', 'bar'] * 2,
                   'D' : np.random.randn(12),
                   'E' : np.random.randn(12)})

print(df)

# 1)分组统计
# 其中参数index指定“行”键,columns指定“列”键。
print(pd.pivot_table(df, values='D', index=['A', 'B'], columns=['C']))

# 2)分项汇总
# 如果将参数margins设置为True,则可以得到分项总计数据。
print(pd.pivot_table(df, values='D', index=['A', 'B'], columns=['C'], margins=True))

# 3)交叉表crosstab  
# 可以按照指定的行和列统计分组频数,用起来非常方便;当然同样的功能也可采用groupby实现。
print(pd.crosstab(df.A, df.B, margins=True))
##         A  B    C         D         E
## 0     one  A  foo -0.221076  2.059491
## 1     one  B  foo -1.182770 -0.038186
## 2     two  C  foo  1.605421 -0.227043
## 3   three  A  bar -0.685556 -0.187624
## 4     one  B  bar  0.044209 -0.969931
## 5     one  C  bar -0.725583 -1.114549
## 6     two  A  foo  0.456501  1.171362
## 7   three  B  foo  1.287492  1.080057
## 8     one  C  foo -0.708959 -0.475746
## 9     one  A  bar -1.066508 -0.646032
## 10    two  B  bar -1.691038  0.983874
## 11  three  C  bar  1.066925 -0.616295
## C             bar       foo
## A     B                    
## one   A -1.066508 -0.221076
##       B  0.044209 -1.182770
##       C -0.725583 -0.708959
## three A -0.685556       NaN
##       B       NaN  1.287492
##       C  1.066925       NaN
## two   A       NaN  0.456501
##       B -1.691038       NaN
##       C       NaN  1.605421
## C             bar       foo       All
## A     B                              
## one   A -1.066508 -0.221076 -0.643792
##       B  0.044209 -1.182770 -0.569280
##       C -0.725583 -0.708959 -0.717271
## three A -0.685556       NaN -0.685556
##       B       NaN  1.287492  1.287492
##       C  1.066925       NaN  1.066925
## two   A       NaN  0.456501  0.456501
##       B -1.691038       NaN -1.691038
##       C       NaN  1.605421  1.605421
## All     -0.509592  0.206101 -0.151745
## B      A  B  C  All
## A                  
## one    2  2  2    6
## three  1  1  1    3
## two    1  1  1    3
## All    4  4  4   12

9. 时间序列

pandas在对频率转换进行重新采样时拥有简单、强大且高效的功能(如将按秒采样的数据转换为按5分钟为单位进行采样的数据)。这种操作在金融领域非常常见。

import pandas as pd
import numpy as np

## 1、时区表示:
rng = pd.date_range('1/1/2012', periods=10, freq='S')
print(rng)
ts = pd.Series(np.random.randint(0, 50, len(rng)), index=rng)
print(ts)
print(ts.resample('5Min').sum())

## 2、时区转换:
rng = pd.date_range('3/6/2012 00:00', periods=5, freq='D')
print(rng)
ts = pd.Series(np.random.randn(len(rng)), rng)
print(ts)
ts_utc = ts.tz_localize('UTC')
print(ts_utc)

## 3、时间跨度转换:
print(ts_utc.tz_convert('US/Eastern'))

## 4、时期和时间戳之间的转换使得可以使用一些方便的算术函数。
rng = pd.date_range('1/1/2012', periods=5, freq='M')
print(rng)
ts = pd.Series(np.random.randn(len(rng)), index=rng)
print(ts)

ps = ts.to_period()
print(ps)
print(ps.to_timestamp())

prng = pd.period_range('1990Q1', '2000Q4', freq='Q-NOV')
print(prng)
ts = pd.Series(np.random.randn(len(prng)), prng)
print(ts)
ts.index = (prng.asfreq('M', 'e') + 1).asfreq('H', 's') + 9
print(ts.head())
## DatetimeIndex(['2012-01-01 00:00:00', '2012-01-01 00:00:01',
##                '2012-01-01 00:00:02', '2012-01-01 00:00:03',
##                '2012-01-01 00:00:04', '2012-01-01 00:00:05',
##                '2012-01-01 00:00:06', '2012-01-01 00:00:07',
##                '2012-01-01 00:00:08', '2012-01-01 00:00:09'],
##               dtype='datetime64[ns]', freq='S')
## 2012-01-01 00:00:00    12
## 2012-01-01 00:00:01    37
## 2012-01-01 00:00:02    38
## 2012-01-01 00:00:03    45
## 2012-01-01 00:00:04    10
## 2012-01-01 00:00:05    49
## 2012-01-01 00:00:06    39
## 2012-01-01 00:00:07    35
## 2012-01-01 00:00:08    11
## 2012-01-01 00:00:09     6
## Freq: S, dtype: int32
## 2012-01-01    282
## Freq: 5T, dtype: int32
## DatetimeIndex(['2012-03-06', '2012-03-07', '2012-03-08', '2012-03-09',
##                '2012-03-10'],
##               dtype='datetime64[ns]', freq='D')
## 2012-03-06   -0.022886
## 2012-03-07    1.200976
## 2012-03-08    1.297353
## 2012-03-09   -0.331530
## 2012-03-10    0.530223
## Freq: D, dtype: float64
## 2012-03-06 00:00:00+00:00   -0.022886
## 2012-03-07 00:00:00+00:00    1.200976
## 2012-03-08 00:00:00+00:00    1.297353
## 2012-03-09 00:00:00+00:00   -0.331530
## 2012-03-10 00:00:00+00:00    0.530223
## Freq: D, dtype: float64
## 2012-03-05 19:00:00-05:00   -0.022886
## 2012-03-06 19:00:00-05:00    1.200976
## 2012-03-07 19:00:00-05:00    1.297353
## 2012-03-08 19:00:00-05:00   -0.331530
## 2012-03-09 19:00:00-05:00    0.530223
## Freq: D, dtype: float64
## DatetimeIndex(['2012-01-31', '2012-02-29', '2012-03-31', '2012-04-30',
##                '2012-05-31'],
##               dtype='datetime64[ns]', freq='M')
## 2012-01-31   -0.684212
## 2012-02-29    0.517506
## 2012-03-31    1.310289
## 2012-04-30    0.990170
## 2012-05-31    0.688944
## Freq: M, dtype: float64
## 2012-01   -0.684212
## 2012-02    0.517506
## 2012-03    1.310289
## 2012-04    0.990170
## 2012-05    0.688944
## Freq: M, dtype: float64
## 2012-01-01   -0.684212
## 2012-02-01    0.517506
## 2012-03-01    1.310289
## 2012-04-01    0.990170
## 2012-05-01    0.688944
## Freq: MS, dtype: float64
## PeriodIndex(['1990Q1', '1990Q2', '1990Q3', '1990Q4', '1991Q1', '1991Q2',
##              '1991Q3', '1991Q4', '1992Q1', '1992Q2', '1992Q3', '1992Q4',
##              '1993Q1', '1993Q2', '1993Q3', '1993Q4', '1994Q1', '1994Q2',
##              '1994Q3', '1994Q4', '1995Q1', '1995Q2', '1995Q3', '1995Q4',
##              '1996Q1', '1996Q2', '1996Q3', '1996Q4', '1997Q1', '1997Q2',
##              '1997Q3', '1997Q4', '1998Q1', '1998Q2', '1998Q3', '1998Q4',
##              '1999Q1', '1999Q2', '1999Q3', '1999Q4', '2000Q1', '2000Q2',
##              '2000Q3', '2000Q4'],
##             dtype='int64', freq='Q-NOV')
## 1990Q1   -0.867859
## 1990Q2    0.830501
## 1990Q3   -0.105151
## 1990Q4   -1.067168
## 1991Q1    0.199731
## 1991Q2    1.563998
## 1991Q3    0.900670
## 1991Q4   -0.080161
## 1992Q1   -0.235376
## 1992Q2   -0.311618
## 1992Q3   -0.599459
## 1992Q4    0.297456
## 1993Q1    0.378720
## 1993Q2   -0.559001
## 1993Q3    0.021934
## 1993Q4    1.138138
## 1994Q1   -1.030343
## 1994Q2   -0.901108
## 1994Q3    1.927886
## 1994Q4   -1.479484
## 1995Q1   -1.613674
## 1995Q2    0.013176
## 1995Q3    1.063955
## 1995Q4   -1.212704
## 1996Q1   -0.004495
## 1996Q2   -0.327333
## 1996Q3   -0.009903
## 1996Q4   -1.194884
## 1997Q1    0.434205
## 1997Q2   -0.726546
## 1997Q3    1.791200
## 1997Q4    1.072537
## 1998Q1    1.376644
## 1998Q2   -0.408941
## 1998Q3   -0.073592
## 1998Q4   -3.184624
## 1999Q1    0.051400
## 1999Q2    0.249227
## 1999Q3    0.071067
## 1999Q4    0.303785
## 2000Q1   -1.374489
## 2000Q2   -1.485674
## 2000Q3    1.396185
## 2000Q4    1.123512
## Freq: Q-NOV, dtype: float64
## 1990-03-01 09:00   -0.867859
## 1990-06-01 09:00    0.830501
## 1990-09-01 09:00   -0.105151
## 1990-12-01 09:00   -1.067168
## 1991-03-01 09:00    0.199731
## Freq: H, dtype: float64

10. 导入和保存数据

import pandas as pd
import numpy as np

dates = pd.date_range('20130101', periods=6)
df = pd.DataFrame(np.random.randn(6,3), index=dates, columns=list('ABC'))
print(df)

### CSV
# 写入csv文件:
df.to_csv('D:/foo.csv')
# 从csv文件中读取:
df1 = pd.read_csv('D:/foo.csv')
print(df1)

### HDF5
# 写入HDF5存储:
df.to_hdf('foo.h5', 'df')
# 从HDF5存储中读取:
pd.read_hdf('foo.h5', 'df')

### Excel
# 写入excel文件:
df.to_excel('D:/foo.xlsx', sheet_name='Sheet1')
# 从excel文件中读取:
df2 = pd.read_excel('D:/foo.xlsx', 'Sheet1', index_col=None, na_values=['NA'])
print(df2)

常用方法案例举例

import pandas as pd, numpy as np
dates = pd.date_range('20130101', periods=6)
df = pd.DataFrame(np.random.randn(6,4), index=dates, columns=list('ABCD'))

mutate + ifelse

  df['E'] = np.where(df['D'] >= 0, '>=0', '<0')
  df['F'] = np.random.randint(0, 2, 6)
  df.assign(G = df.A * df.D) # 或者
  df['F'] = df['F'].apply(str) #针对单列的
  df.applymap(str) #这个相当于是mutate_each

table

  pd.value_counts(df["E"])
  pd.pivot_table(df,index=['E','F'])

index 也就是取df的rownames,但与R不一样的在于,df可能有多维rownames

  df.index
  df.set_index(['A'], drop = 0, append = 1) # 把已有的列设置为index,可保留之前的index,也可以把新的index在原数据中删除
  df['dates'] = df.index # 新生成一列dates
  df.reset_index(level=0, inplace=True) # 同上
  df.reset_index(level=['index']) # 同上

删除列和行

  df = df.drop('index', axis = 1) # 可以删除多列
  df.drop(df.index[[1,3]])

column names

  df.columns
  df.columns = ['a', 'b', 'c', 'e', 'd', 'f'] # 重命名
  df.rename(columns = {'A':'aa','B':'bb', 'C':'cc', 'D':'dd', 'E':'ee', 'F':'ff'}, inplace=True)
  df.rename(columns=lambda x: x[1:].upper(), inplace=True) # 也可以用函数 inplace参数的意思就是代替原来的变量,深拷贝

哑变量 dummy variables

  pd.Series(['a|b', np.nan, 'a|c']).str.get_dummies()

纯粹的df的矩阵,即不包含column和index

  df.values
  df.get_values()

summary

  df.describe() # 只会针对数值型变量做计算

rbind

  df2=pd.DataFrame([[5,6],[7,8]],columns=list('AB'))
  df.append(df2, ignore_index=True)

group by 分组汇总计算,和pivot_table类似

  df.groupby(['E','F']).mean()
  df.groupby(['E','F']).agg(['sum', 'mean'])
  pd.pivot_table(df,index=['E','F'], aggfunc=[np.sum, np.mean])
  df.pivot_table(index=['E','F'], aggfunc=[np.sum, np.mean]) # 同上
  df.groupby(['E','F']).agg({'A':['mean','sum'], 'B':'min'}) # groupby 也可以这样写

排序

  df.sort(['A','B'],ascending=[1,0]) # 按列排序,na_position控制NAN的位置
  df.sort_index(ascending=0) # 按index排序

筛选

  df[(df.A >= -1) & (df.B <= 0)] # 值筛选
  df[df.E.str.contains(">")] # 包含某个字符,contains筛选的其实是正则表达式
  df[df.F.isin(['1'])] # 在列表内

变量选择

  df['A'] # 单个的列
  df[0:3] # 行
  df['20130102':'20130104'] # 按index筛选
  df.loc[:,] # 类似于R里面的dataframe选行和列的方法
  df.iloc[:,] # iloc只能用数字了

抽样

  df.sample(10, replace = True)
  df.sample(3)
  df.sample(frac = 0.5) # 按比例抽样
  df.sample(frac = 10, replace = True,weights = np.random.randint(1,10,6)) # 对样本加权
  df.sample(3, axis = 1) # 变量抽样

join(即 merge)

  pd.merge(df.sample(4), df.sample(4), how = "left", on = "A", indicator = True)

随机数

  numpy.random.rand(3, 2) # 按维度生成[0,1)之间的均匀分布随机数
  np.random.randn(2,5) # 按维度生成标准正太分布随机数
  np.random.randint(2, size=10) # randint(low[, high, size])生成随机整数,默认low为0,high必填,size默认为1
  np.random.bytes(10) # 返回随机字节
  a=np.arange(10)
  np.random.shuffle(a) # 洗牌
  a=np.arange(9).reshape(3, 3)
  np.random.shuffle(a) # 若是数组,则只会打乱第一维
  np.random.permutation(10) # 随机排列,对于多维序列也适用
  np.random.permutation(10) .reshape(2, 5)
  np.random.seed(1000) # 种子
  np.random.normal(2,3,[5,2]) # 高斯分布,其他分布可查
  # http://docs.scipy.org/doc/numpy-1.10.1/reference/routines.random.html
  np.random.seed(12345678)
  x = scipy.stats.norm.rvs(loc=5, scale=3, size=100) # 另外scipy也有这些随机数的生成,附带检验
  scipy.stats.shapiro(x)
  # http://docs.scipy.org/doc/scipy-0.17.0/reference/stats.html

gather和spread

  # gather:
  def gather( df, key, value, cols ):
      id_vars = [ col for col in df.columns if col not in cols ]
      id_values = cols
      var_name = key
      value_name = value
      return pandas.melt( df, id_vars, id_values, var_name, value_name )
  # 以上是定义的一个函数,实际上一样的,横变竖,是gather,竖变横,是spread
  pd.melt(df, id_vars=['E','F'], value_vars=['A','C'])
  # spread:
  pd.pivot(df["D"],df["E"],df['F']) #这个是竖变横
  df3=pd.pivot(df2['D'],df2['variable'],df2['value'])
  df3.reset_index(level=0, inplace=True) # 再变回df的样子

  scipy.stats.entropy(np.arange(10)) 

字符串拼接

  [",".join(['a','b','d'])]
  df[['E','F']].groupby('F')['E'].apply(lambda x: "{%s}" % ', '.join(x)) # 分组拼接,前提是这些列都要是字符串
  df[['E','F']].applymap(str).groupby('E')['F'].apply(lambda x: "%s" % ', '.join(x)) # 所以可以这样

随机字符串生成

  import random,string
  df2 = pd.DataFrame(range(10),columns=['y'])
  df2["x"] = [",".join(random.sample(string.lowercase,random.randint(2,5))) for i in range(10)]

分列后生成hash表

  # 用20 的示例数据
  df3=pd.DataFrame(df2.x.str.split(',').tolist(),index=df2.y).stack().reset_index(level=0)
  df3.columns=["y","x"]

去重

  df[["F","E"]].drop_duplicates()

离散化

  pd.cut(df.A,range(-1,2,1))