[Python] pandas 基本教學

程式語言:Python
Package:pandas
官方網站
官方文件

功能:數據處理分析

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

df = pd.DataFrame({ 'A' : 1.,
                    'B' : pd.Timestamp('20130102'),
                    'C' : pd.Series(np.linspace(1, 4, num=4),index=list(range(4)),dtype='float32'),
                    'D' : np.array([3] * 4,dtype='int32'),
                    'E' : pd.Categorical(["test","train","test","train"]),
                    'F' : 'foo' })

print(df) 
#      A          B    C  D      E    F
# 0  1.0 2013-01-02  1.0  3   test  foo
# 1  1.0 2013-01-02  2.0  3  train  foo
# 2  1.0 2013-01-02  3.0  3   test  foo
# 3  1.0 2013-01-02  4.0  3  train  foo
                     
print(df.dtypes)
# A           float64
# B    datetime64[ns]
# C           float32
# D             int32
# E          category
# F            object
# dtype: object

df[['A', 'C']].plot()
plt.legend(loc='best')
plt.show()

資料架構

class pandas.Series(data=None, index=None, dtype=None, name=None, copy=False, fastpath=False)
  • pandas.Series 
  • 一維資料
  • data
    • 輸入的資料
      • array-like
      • dict
      • scalar value (like 10)
  • index
    • 加入對應的 row name
  • dtype
    • 資料類型
  • name
    • 資料名字
    • 轉換為 DataFrame 時,會成為 column name
  • copy
    • 是否重新 copy
    • 若無 copy,請小心資料的更改
  • fastpath
    • 內部參數
範例
numpy 建立
import numpy as np
import pandas as pd

data = np.linspace(1, 3, num=3)
print(data) # [ 1.  2.  3.]

s1 = pd.Series(data, index=['a', 'b', 'c'])
print(s1)
# a    1.0
# b    2.0
# c    3.0
# dtype: float64

s2 = pd.Series(data, index=['a', 'b', 'c'], copy=True, name='s2')
print(s2)
# a    1.0
# b    2.0
# c    3.0
# Name: s2, dtype: float64

# 可看到 column name 為 's2'
print(pd.DataFrame(s2))
#     s2
# a  1.0
# b  2.0
# c  3.0

print(s1[0]) # 1
print(s1['a']) # 1

data[0] = 10
# 可看到只有 s2 無改變,因是重新 copy
print(data)
# [ 10.   2.   3.]
print(s1)
# a    10.0
# b     2.0
# c     3.0
# dtype: float64
print(s2)
# a    1.0
# b    2.0
# c    3.0
# dtype: float64

dict 或 純值 建立
import pandas as pd

s3 = pd.Series({'a':2., 'b':5})
print(s3)
# a    2.0
# b    5.0
# dtype: float64

s4 = pd.Series(10, index=['a', 'b', 'c', 'd', 'e'])
print(s4)
# a    10
# b    10
# c    10
# d    10
# e    10
# dtype: int64

class pandas.DataFrame(data=None, index=None, columns=None, dtype=None, copy=False)
  • pandas.DataFrame
  • 二維資料
  • data
    • 輸入的資料
      • array-like
      • Dict of 1D ndarrays, lists, dicts, or Series
      • 單個 Series
      • 另一個 DataFrame
  • index
    • 輸入資料擁有 row name,例:dict of Series, dicts
      為指定讀取的 row name
    • 輸入資料無 row name,例:list、structured arrays
      為加入對應的 row name
  • columns
    • 輸入資料擁有 column name,例:dict of Series, dicts、structured arrays
      為指定讀取的 column name
    • 輸入資料無 column name,例:list
      為加入對應的 column name
  • dtype
    • 資料類型
  • copy
    • 是否重新 copy 
    • 若無 copy,請小心資料的更改 
範例
Series
import numpy as np
import pandas as pd

# Series
d = {'one' : pd.Series([1., 2., 3.], index=['a', 'b', 'c']),
     'two' : pd.Series([1., 2., 3., 4.], index=['a', 'b', 'c', 'd'])}
     
df = pd.DataFrame(d)
print(df)
#    one  two
# a  1.0  1.0
# b  2.0  2.0
# c  3.0  3.0
# d  NaN  4.0

# 指定 index
print(pd.DataFrame(d, index=['d', 'b', 'a']))
#    one  two
# d  NaN  4.0
# b  2.0  2.0
# a  1.0  1.0

# 指定 column name
print(pd.DataFrame(d, index=['d', 'b', 'a'], columns=['two', 'one', 'three']))
#    two three
# d  4.0   NaN
# b  2.0   NaN
# a  1.0   NaN

Dict
import numpy as np
import pandas as pd

# dict
d = {'one' : [1., 2., 3., 4.],
     'two' : [4., 3., 2., 1.]}

print(pd.DataFrame(d))
#    one  two
# 0  1.0  4.0
# 1  2.0  3.0
# 2  3.0  2.0
# 3  4.0  1.0

# 加入 index
print(pd.DataFrame(d, index=['a', 'b', 'c', 'd']))
#    one  two
# a  1.0  4.0
# b  2.0  3.0
# c  3.0  2.0
# d  4.0  1.0

Structured arrays
import numpy as np
import pandas as pd

# structured arrays
# 建立四個空白資料,並擁有三種資料 'A' 'B' 'C',其對應資料類型為 integer 4 bytes, float 4 bytes, string 10 bytes
data = np.zeros((4,), dtype=[('A', 'i4'),('B', 'f4'),('C', 'S10')])

data[:] = [(1,2.,'Hello'), (2,3.,"World"), (6,8.,"!!"), (9,11.,"??")]
print(data)
# [(1,   2., b'Hello') (2,   3., b'World') (6,   8., b'!!') (9,  11., b'??')]

# 印出 A 的資料
print(data['A'])
# [1 2 6 9]

print(pd.DataFrame(data))
#    A    B         C
# 0  1  2.0  b'Hello'
# 1  2  3.0  b'World'
# 2  6  8.0     b'!!'
# 3  9  11.0    b'??'

print(pd.DataFrame(data, index=['first', 'second', 'three', 'four']))
#         A    B         C
# first   1  2.0  b'Hello'
# second  2  3.0  b'World'
# three   6  8.0     b'!!'
# four    9  11.0    b'??'

print(pd.DataFrame(data, columns=['C', 'A', 'B']))
#           C  A    B
# 0  b'Hello'  1  2.0
# 1  b'World'  2  3.0
# 2     b'!!'  6  8.0
# 3     b'??'  9  11.0

Multi-indexed frame
import numpy as np
import pandas as pd

# multi-indexed frame
df = pd.DataFrame({('a', 'b'): {('A', 'B'): 1, ('A', 'C'): 2},
                   ('a', 'a'): {('A', 'C'): 3, ('A', 'B'): 4},
                   ('a', 'c'): {('A', 'B'): 5, ('A', 'C'): 6},
                   ('b', 'a'): {('A', 'C'): 7, ('A', 'B'): 8},
                   ('b', 'b'): {('A', 'D'): 9, ('A', 'B'): 10}})
                  
print(df)
#        a              b
#        a    b    c    a     b
# A B  4.0  1.0  5.0  8.0  10.0
#   C  3.0  2.0  6.0  7.0   NaN
#   D  NaN  NaN  NaN  NaN   9.0

print(df['b']['a']['A']['C'])
# 7.0

class pandas.Panel(data=None, items=None, major_axis=None, minor_axis=None, copy=False, dtype=None)
  • pandas.Panel
  • 三維資料,但不常使用
  • data
    • 輸入的資料
  • items
    • 資料大項的名字
  • major_axis
    • 加入 row name
  • minor_axis
    • 加入 column name 
  • copy
    • 是否重新 copy
    • 若無 copy,請小心資料的更改
  • dtype
    • 資料類型
範例
numpy 建立
import numpy as np
import pandas as pd

# 3D ndarray
wp = pd.Panel(np.random.randn(2, 5, 4), items=['Item1', 'Item2'],
              major_axis=pd.date_range('1/1/2000', periods=5),
              minor_axis=['A', 'B', 'C', 'D'])
print(wp)
# <class 'pandas.core.panel.Panel'>
# Dimensions: 2 (items) x 5 (major_axis) x 4 (minor_axis)
# Items axis: Item1 to Item2
# Major_axis axis: 2000-01-01 00:00:00 to 2000-01-05 00:00:00
# Minor_axis axis: A to D

DataFrame 建立
import numpy as np
import pandas as pd
             
# Series
d = {'item1' : pd.DataFrame({'one' : pd.Series([1., 2., 3.], index=['a', 'b', 'c']),
                             'two' : pd.Series([1., 2., 3., 4.], index=['a', 'b', 'c', 'd'])}),
     'item2' : pd.DataFrame({'thr' : pd.Series([1., 2., 3.], index=['e', 'f', 'g'])})}
     
wp = pd.Panel(d)
print(wp)
# <class 'pandas.core.panel.Panel'>
# Dimensions: 2 (items) x 7 (major_axis) x 3 (minor_axis)
# Items axis: item1 to item2
# Major_axis axis: a to g
# Minor_axis axis: one to two

# filter_observations 預設為 True,表示不顯示含有 NaN 的欄位
# 所以轉換後為 Empty DataFrame
print(wp.to_frame())
# Empty DataFrame
# Columns: [item1, item2]
# Index: []

print(wp.to_frame(filter_observations=False))
#              item1  item2
# major minor
# a     one      1.0    NaN
#       thr      NaN    NaN
#       two      1.0    NaN
# b     one      2.0    NaN
#       thr      NaN    NaN
#       two      2.0    NaN
# c     one      3.0    NaN
#       thr      NaN    NaN
#       two      3.0    NaN
# d     one      NaN    NaN
#       thr      NaN    NaN
#       two      4.0    NaN
# e     one      NaN    NaN
#       thr      NaN    1.0
#       two      NaN    NaN
# f     one      NaN    NaN
#       thr      NaN    2.0
#       two      NaN    NaN
# g     one      NaN    NaN
#       thr      NaN    3.0
#       two      NaN    NaN

基本操作

得值
原始資料
import numpy as np
import pandas as pd

dates = pd.date_range('20130101', periods=6)
df = pd.DataFrame(np.random.randn(6,4), index=dates, columns=list('ABCD'))
print(df)
#                    A         B         C         D
# 2013-01-01  0.254520 -0.263404 -2.210508  0.807513
# 2013-01-02  0.024263 -0.014928 -0.601224  0.561930
# 2013-01-03 -0.628894 -0.419761  1.051119 -0.593262
# 2013-01-04 -1.116615 -0.027055  0.537114 -0.955006
# 2013-01-05  0.637412 -0.188929 -0.354975  1.693871
# 2013-01-06  1.014020  0.812423  0.420242 -0.633087

Getting
# get by [],只能一個 index
print(df['A']) # 等同 df.A
# 2013-01-01    0.254520
# 2013-01-02    0.024263
# 2013-01-03   -0.628894
# 2013-01-04   -1.116615
# 2013-01-05    0.637412
# 2013-01-06    1.014020
# Freq: D, Name: A, dtype: float64
# Freq 表示間隔頻率為一天

# get by [[]],可以多個 columns
print(df[['A', 'B']])
#                    A         B     
# 2013-01-01  0.254520 -0.263404 
# 2013-01-02  0.024263 -0.014928 
# 2013-01-03 -0.628894 -0.419761 
# 2013-01-04 -1.116615 -0.027055 
# 2013-01-05  0.637412 -0.188929 
# 2013-01-06  1.014020  0.812423 

print(df[0:10])
#                    A         B         C         D
# 2013-01-01  0.254520 -0.263404 -2.210508  0.807513
# 2013-01-02  0.024263 -0.014928 -0.601224  0.561930
# 2013-01-03 -0.628894 -0.419761  1.051119 -0.593262
# 2013-01-04 -1.116615 -0.027055  0.537114 -0.955006
# 2013-01-05  0.637412 -0.188929 -0.354975  1.693871
# 2013-01-06  1.014020  0.812423  0.420242 -0.633087

print(df['20130102':'20130104'])
#                    A         B         C         D
# 2013-01-02  0.024263 -0.014928 -0.601224  0.561930
# 2013-01-03 -0.628894 -0.419761  1.051119 -0.593262
# 2013-01-04 -1.116615 -0.027055  0.537114 -0.955006

Selection by Label
print(df.loc['20130102':'20130104', 'A'])
# 2013-01-02    0.024263
# 2013-01-03   -0.628894
# 2013-01-04   -1.116615
# Freq: D, Name: A, dtype: float64

Selection by Position
# get by index
# index 需正確,可用 df.reset_index(drop=True)
print(df.iloc[0:10, 0])
# 2013-01-01    0.254520
# 2013-01-02    0.024263
# 2013-01-03   -0.628894
# 2013-01-04   -1.116615
# 2013-01-05    0.637412
# 2013-01-06    1.014020
# Freq: D, Name: A, dtype: float64

Selection by Mix
# get by index, label
print(df.ix[0:10, 'A'])
# 2013-01-01    0.254520
# 2013-01-02    0.024263
# 2013-01-03   -0.628894
# 2013-01-04   -1.116615
# 2013-01-05    0.637412
# 2013-01-06    1.014020
# Freq: D, Name: A, dtype: float64

Boolean Indexing
print(df[df.A > 0])
#                    A         B         C         D
# 2013-01-01  0.254520 -0.263404 -2.210508  0.807513
# 2013-01-02  0.024263 -0.014928 -0.601224  0.561930
# 2013-01-05  0.637412 -0.188929 -0.354975  1.693871
# 2013-01-06  1.014020  0.812423  0.420242 -0.633087

isin
df2 = df.copy()
df2['E'] = ['one', 'one','two','three','four','three']
print(df2)
#                    A         B         C         D      E
# 2013-01-01  0.254520 -0.263404 -2.210508  0.807513    one
# 2013-01-02  0.024263 -0.014928 -0.601224  0.561930    one
# 2013-01-03 -0.628894 -0.419761  1.051119 -0.593262    two
# 2013-01-04 -1.116615 -0.027055  0.537114 -0.955006  three
# 2013-01-05  0.637412 -0.188929 -0.354975  1.693871   four
# 2013-01-06  1.014020  0.812423  0.420242 -0.633087  three

print(df2[df2['E'].isin(['two','four'])])
#                    A         B         C         D      E
# 2013-01-03 -0.628894 -0.419761  1.051119 -0.593262    two
# 2013-01-05  0.637412 -0.188929 -0.354975  1.693871   four

values
# 轉換為 numpy 格式
print(df.values)
# array([[ 0.254520, -0.263404, -2.210508,  0.807513],
#        [ 0.024263, -0.014928, -0.601224,  0.561930],
#        [-0.628894, -0.419761,  1.051119, -0.593262],
#        [-1.116615, -0.027055,  0.537114, -0.955006],
#        [ 0.637412, -0.188929, -0.354975,  1.693871],
#        [ 1.014020,  0.812423,  0.420242, -0.633087]])

ravel()
# 轉換為 numpy 一維格式
print(df['A'].ravel())
# array([0.254520,  0.024263, -0.628894, -1.116615,  0.637412,  1.014020])
賦值
利用 series, label , index, numpy
import numpy as np
import pandas as pd

df = pd.DataFrame()
date = pd.date_range('20130102', periods=6)

s1 = pd.Series([1,2,3,4,5,6], index=date)

# by series
df['F'] = s1
# by label
df.at[date[2],'A'] = 1000
# by index
df.iat[0,1] = 0
# by np
df.loc[:,'D'] = np.array([5] * len(df))

print(df)
#             F       A  D
# 2013-01-02  1     0.0  5
# 2013-01-03  2     NaN  5
# 2013-01-04  3  1000.0  5
# 2013-01-05  4     NaN  5
# 2013-01-06  5     NaN  5
# 2013-01-07  6     NaN  5

# by 任意得值的方法
df.loc['20130102':'20130103', 'A'] = 1
# index 需正確,可用 df.reset_index(drop=True)
df.iloc[3:5, 0] = 10
df.ix[5, 'A'] = 0

print(df)
#             F       A  D
# 2013-01-02  1     1.0  5
# 2013-01-03  2     1.0  5
# 2013-01-04  3  1000.0  5
# 2013-01-05  4    10.0  5
# 2013-01-06  5    10.0  5
# 2013-01-07  6     0.0  5

# 只有符合條件才會賦值
df[df > 10] = -df
print(df)
#             F       A  D
# 2013-01-02  1     1.0  5
# 2013-01-03  2     1.0  5
# 2013-01-04  3 -1000.0  5
# 2013-01-05  4    10.0  5
# 2013-01-06  5    10.0  5
# 2013-01-07  6     0.0  5
合併表格
Merge, join, and concatenate
import pandas as pd

df1 = pd.DataFrame({'X1':['a', 'b', 'c'],
                    'X2':[1, 2, 3],})
#   X1  X2
# 0  a   1
# 1  b   2
# 2  c   3

df2 = pd.DataFrame({'X1':['a', 'b', 'd'],
                    'X3':[4, None, 6],})
#   X1   X3
# 0  a  4.0
# 1  b  NaN
# 2  d  6.0

df1.merge(df2, how='left', on='X1')
#   X1  X2   X3
# 0  a   1  4.0
# 1  b   2  NaN
# 2  c   3  NaN

df1.merge(df2, how='right', on='X1')
#   X1   X2   X3
# 0  a  1.0  4.0
# 1  b  2.0  NaN
# 2  d  NaN  6.0

df1.merge(df2, how='outer', on='X1')
#   X1   X2   X3
# 0  a  1.0  4.0
# 1  b  2.0  NaN
# 2  c  3.0  NaN
# 3  d  NaN  6.0

# join 必需有相同的 index 才行
df1.set_index('X1').join(df2.set_index('X1'), how='right')
#      X2   X3
# X1
# a   1.0  4.0
# b   2.0  NaN
# d   NaN  6.0

# 方向為 index
df1.append(df2)
#   X1   X2   X3
# 0  a  1.0  NaN
# 1  b  2.0  NaN
# 2  c  3.0  NaN
# 0  a  NaN  4.0
# 1  b  NaN  NaN
# 2  d  NaN  6.0

# 方向為 index
pd.concat([df1, df2])
#   X1   X2   X3
# 0  a  1.0  NaN
# 1  b  2.0  NaN
# 2  c  3.0  NaN
# 0  a  NaN  4.0
# 1  b  NaN  NaN
# 2  d  NaN  6.0

# 方向為 columns
pd.concat([df1, df2], axis='columns')
#   X1  X2 X1   X3
# 0  a   1  a  4.0
# 1  b   2  b  NaN
# 2  c   3  d  6.0
調整表格
import pandas as pd

df = pd.DataFrame({'foo': ['one','one','one','two','two','two'],
                       'bar': ['A', 'B', 'C', 'A', 'B', 'C'],
                       'baz': [1, 2, 3, 4, 5, 6]})
#     foo   bar  baz
# 0   one   A    1
# 1   one   B    2
# 2   one   C    3
# 3   two   A    4
# 4   two   B    5
# 5   two   C    6

# 重新調整表格,指定排列方式
df.pivot(index='foo', columns='bar', values='baz')
#      A   B   C
# one  1   2   3
# two  4   5   6

# pivot 的進階版,指定排列方式,可設定值的處理,預設為平均
df.pivot_table(index='foo', columns='bar', values='baz')
#      A   B   C
# one  1   2   3
# two  4   5   6
import pandas as pd
import numpy as np

df = pd.DataFrame({"A": ["foo", "foo", "foo", "foo", "foo",
                          "bar", "bar", "bar", "bar"],
                   "B": ["one", "one", "one", "two", "two",
                          "one", "one", "two", "two"],
                   "C": ["small", "large", "large", "small",
                          "small", "large", "small", "small",
                          "large"],
                   "D": [1, 2, 2, 3, 3, 4, 5, 6, 7]})
#     A    B      C  D
# 0  foo  one  small  1
# 1  foo  one  large  2
# 2  foo  one  large  2
# 3  foo  two  small  3
# 4  foo  two  small  3
# 5  bar  one  large  4
# 6  bar  one  small  5
# 7  bar  two  small  6
# 8  bar  two  large  7

df.pivot_table(index=['A', 'B'], columns=['C'], values='D', aggfunc=np.sum)
# C        large  small
# A   B
# bar one    4.0    5.0
#     two    7.0    6.0
# foo one    4.0    1.0
#     two    NaN    6.0
import pandas as pd

df = pd.DataFrame({'a': {'one':1.0, 'two':2.0},
                   'b': {'one':3.0, 'two':4.0}})
#       a    b
# one  1.0  3.0
# two  2.0  4.0

# 把 columns 移到 index,可能會得到 Series
df.stack()
# one  a    1.0
#      b    3.0
# two  a    2.0
#      b    4.0

# 把 index 移到 columns,可能會得到 Series
df.stack().unstack()
#       a    b
# one  1.0  3.0
# two  2.0  4.0

# 重新排列 index 並把 index 移至 columns,仍是 DataFrame
df.reset_index()
#   index    a    b
# 0   one  1.0  3.0
# 1   two  2.0  4.0
import pandas as pd

df = pd.DataFrame({'A': {0: 'a', 1: 'b', 2: 'c'},
                   'B': {0: 1, 1: 3, 2: 5},
                   'C': {0: 2, 1: 4, 2: 6}})
#    A  B  C
# 0  a  1  2
# 1  b  3  4
# 2  c  5  6

# 新增對應 B C 的行與其對應的值
df.melt(id_vars=['A'], value_vars=['B', 'C'])
#    A variable  value
# 0  a        B      1
# 1  b        B      3
# 2  c        B      5
# 3  a        C      2
# 4  b        C      4
# 5  c        C      6
統計運算
原始資料
import numpy as np
import pandas as pd

dates = pd.date_range('20130101', periods=6)
df = pd.DataFrame(np.ones((6, 4))*2, index=dates, columns=list('ABCD'), dtype='float64')
print(df)
#               A    B    C    D
# 2013-01-01  2.0  2.0  2.0  2.0
# 2013-01-02  2.0  2.0  2.0  2.0
# 2013-01-03  2.0  2.0  2.0  2.0
# 2013-01-04  2.0  2.0  2.0  2.0
# 2013-01-05  2.0  2.0  2.0  2.0
# 2013-01-06  2.0  2.0  2.0  2.0

數學運算
# 數學運算
print(df+10)
#                A     B     C     D
# 2013-01-01  12.0  12.0  12.0  12.0
# 2013-01-02  12.0  12.0  12.0  12.0
# 2013-01-03  12.0  12.0  12.0  12.0
# 2013-01-04  12.0  12.0  12.0  12.0
# 2013-01-05  12.0  12.0  12.0  12.0
# 2013-01-06  12.0  12.0  12.0  12.0

print(df-10)
#               A    B    C    D
# 2013-01-01 -8.0 -8.0 -8.0 -8.0
# 2013-01-02 -8.0 -8.0 -8.0 -8.0
# 2013-01-03 -8.0 -8.0 -8.0 -8.0
# 2013-01-04 -8.0 -8.0 -8.0 -8.0
# 2013-01-05 -8.0 -8.0 -8.0 -8.0
# 2013-01-06 -8.0 -8.0 -8.0 -8.0

print(df/10)
#               A    B    C    D
# 2013-01-01  0.2  0.2  0.2  0.2
# 2013-01-02  0.2  0.2  0.2  0.2
# 2013-01-03  0.2  0.2  0.2  0.2
# 2013-01-04  0.2  0.2  0.2  0.2
# 2013-01-05  0.2  0.2  0.2  0.2
# 2013-01-06  0.2  0.2  0.2  0.2

print(df*10)
#                A     B     C     D
# 2013-01-01  20.0  20.0  20.0  20.0
# 2013-01-02  20.0  20.0  20.0  20.0
# 2013-01-03  20.0  20.0  20.0  20.0
# 2013-01-04  20.0  20.0  20.0  20.0
# 2013-01-05  20.0  20.0  20.0  20.0
# 2013-01-06  20.0  20.0  20.0  20.0

print(df**2)
#               A    B    C    D
# 2013-01-01  4.0  4.0  4.0  4.0
# 2013-01-02  4.0  4.0  4.0  4.0
# 2013-01-03  4.0  4.0  4.0  4.0
# 2013-01-04  4.0  4.0  4.0  4.0
# 2013-01-05  4.0  4.0  4.0  4.0
# 2013-01-06  4.0  4.0  4.0  4.0

統計計算
# 統計
print(df.describe())
#          A    B    C    D
# count  6.0  6.0  6.0  6.0
# mean   2.0  2.0  2.0  2.0
# std    0.0  0.0  0.0  0.0
# min    2.0  2.0  2.0  2.0
# 25%    2.0  2.0  2.0  2.0
# 50%    2.0  2.0  2.0  2.0
# 75%    2.0  2.0  2.0  2.0
# max    2.0  2.0  2.0  2.0

# mean 值
print(df.mean())
# A    2.0
# B    2.0
# C    2.0
# D    2.0
# dtype: float64

# 累積計算
print(df.cumsum())
#                A     B     C     D
# 2013-01-01   2.0   2.0   2.0   2.0
# 2013-01-02   4.0   4.0   4.0   4.0
# 2013-01-03   6.0   6.0   6.0   6.0
# 2013-01-04   8.0   8.0   8.0   8.0
# 2013-01-05  10.0  10.0  10.0  10.0
# 2013-01-06  12.0  12.0  12.0  12.0
缺值
缺值處理,丟棄或給予其他值
import numpy as np
import pandas as pd

df = pd.DataFrame()
date = pd.date_range('20130102', periods=6)

s1 = pd.Series([1,2,3,4,5,6], index=date)

# by series
df['F'] = s1
# by label
df.at[date[2],'A'] = 1000
# by index
df.iat[0,1] = 0
# by np
df.loc[:,'D'] = np.array([5] * len(df))

print(df)
#             F       A  D
# 2013-01-02  1     0.0  5
# 2013-01-03  2     NaN  5
# 2013-01-04  3  1000.0  5
# 2013-01-05  4     NaN  5
# 2013-01-06  5     NaN  5
# 2013-01-07  6     NaN  5

# DataFrame.dropna(axis=0, how='any', thresh=None, subset=None, inplace=False)
# 預設 axis=0 => "index"
# how='any',只要有 NaN 就丟掉
# how='all',全部為 NaN 才丟掉
print(df.dropna(how='any'))
#             F       A  D
# 2013-01-02  1     0.0  5
# 2013-01-04  3  1000.0  5

# 將 NaN 填入值
print(df.fillna(value='Test'))
#             F     A  D
# 2013-01-02  1     0  5
# 2013-01-03  2  Test  5
# 2013-01-04  3  1000  5
# 2013-01-05  4  Test  5
# 2013-01-06  5  Test  5
# 2013-01-07  6  Test  5
Grouping
分類處理
import numpy as np
import pandas as pd

df = pd.DataFrame({'A' : ['foo', 'bar', 'foo', 'bar',
                       'foo', 'bar', 'foo', 'foo'],
                   'B' : ['one', 'one', 'two', 'three',
                       'two', 'two', 'one', 'three'],
                   'C' : np.linspace(1, 8, num=8),
                   'D' : np.linspace(-1, -8, num=8)})

print(df)
#      A      B    C    D
# 0  foo    one  1.0 -1.0
# 1  bar    one  2.0 -2.0
# 2  foo    two  3.0 -3.0
# 3  bar  three  4.0 -4.0
# 4  foo    two  5.0 -5.0
# 5  bar    two  6.0 -6.0
# 6  foo    one  7.0 -7.0

print(df.groupby('A').sum())
#         C     D
# A
# bar  12.0 -12.0
# foo  24.0 -24.0

print(df.groupby(['A','B']).sum())
#              C    D
# A   B
# bar one    2.0 -2.0
#     three  4.0 -4.0
#     two    6.0 -6.0
# foo one    8.0 -8.0
#     three  8.0 -8.0
#     two    8.0 -8.0
讀寫檔
  • 通常回傳的是 DataFrame
  • 列出比較常用,詳細可參考 IO Tools
    • sep 可用正規表示法
      但若超過一個字且不為 '\s+' 會強迫使用 python parsing engine
    • pandas.read_pickle(path)
    • pandas.read_clipboard(**kwargs)
    • pandas.read_table(filepath_or_buffer[, sep, ...])
    • pandas.read_csv(filepath_or_buffer[, sep, ...])
    • pandas.read_excel(io[, sheetname, header, ...])
    • pandas.read_json([path_or_buf, orient, typ, dtype, ...])
    • pandas.read_html(io[, match, flavor, header, ...])
    • pandas.read_sql(sql, con[, index_col, ...])
範例
import pandas as pd

# 預設 sep=','
pd.read_csv('test.csv')
#      a    b    c
# 0    1    2    3
# 1    4    5    6

# 預設 sep='\t'
# 自定 column names
pd.read_table('test.txt', names=['aa', 'bb', 'cc'])
#     aa    bb    cc
# 0    a    b    c
# 1    1    2    3
# 2    4    5    6

# 需要 xlrd package 
# 資料無標題
pd.read_excel('test.xlsx', header=None)
#      0    1    2    3    4
# 0    1    2    3    4    5
# 1    6    7    8    9    10

# 中文路徑會有問題,可用 open 取代
# sep 可用正規表示法,超過一個字且不為 '\s+' 不指定 engine 會有 warning
with open('新文字文件.txt', 'r') as f:
    pd.read_table(f, sep='[a-z]', engine='python')
#      a    b    c
# 0    1    2    3
# 1    4    5    6
參考
Pandas Cheat Sheet: Data Wrangling in Python

留言