[Python] pandas 基本教學

程式語言：Python
Package：pandas
官方網站
官方文件

功能：數據處理分析

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

df = pd.DataFrame({ 'A' : 1.,
                    'B' : pd.Timestamp('20130102'),
                    'C' : pd.Series(np.linspace(1, 4, num=4),index=list(range(4)),dtype='float32'),
                    'D' : np.array([3] * 4,dtype='int32'),
                    'E' : pd.Categorical(["test","train","test","train"]),
                    'F' : 'foo' })

print(df) 
#      A          B    C  D      E    F
# 0  1.0 2013-01-02  1.0  3   test  foo
# 1  1.0 2013-01-02  2.0  3  train  foo
# 2  1.0 2013-01-02  3.0  3   test  foo
# 3  1.0 2013-01-02  4.0  3  train  foo
                     
print(df.dtypes)
# A           float64
# B    datetime64[ns]
# C           float32
# D             int32
# E          category
# F            object
# dtype: object

df[['A', 'C']].plot()
plt.legend(loc='best')
plt.show()

資料架構

class pandas.Series(data=None, index=None, dtype=None, name=None, copy=False, fastpath=False)

pandas.Series
一維資料
data

輸入的資料

array-like
dict
scalar value (like 10)

index

加入對應的 row name

dtype

資料類型

name

資料名字
轉換為 DataFrame 時，會成為 column name

copy

是否重新 copy
若無 copy，請小心資料的更改

fastpath

內部參數

範例

numpy 建立

import numpy as np
import pandas as pd

data = np.linspace(1, 3, num=3)
print(data) # [ 1.  2.  3.]

s1 = pd.Series(data, index=['a', 'b', 'c'])
print(s1)
# a    1.0
# b    2.0
# c    3.0
# dtype: float64

s2 = pd.Series(data, index=['a', 'b', 'c'], copy=True, name='s2')
print(s2)
# a    1.0
# b    2.0
# c    3.0
# Name: s2, dtype: float64

# 可看到 column name 為 's2'
print(pd.DataFrame(s2))
#     s2
# a  1.0
# b  2.0
# c  3.0

print(s1[0]) # 1
print(s1['a']) # 1

data[0] = 10
# 可看到只有 s2 無改變，因是重新 copy
print(data)
# [ 10.   2.   3.]
print(s1)
# a    10.0
# b     2.0
# c     3.0
# dtype: float64
print(s2)
# a    1.0
# b    2.0
# c    3.0
# dtype: float64

dict 或純值建立

import pandas as pd

s3 = pd.Series({'a':2., 'b':5})
print(s3)
# a    2.0
# b    5.0
# dtype: float64

s4 = pd.Series(10, index=['a', 'b', 'c', 'd', 'e'])
print(s4)
# a    10
# b    10
# c    10
# d    10
# e    10
# dtype: int64

class pandas.DataFrame(data=None, index=None, columns=None, dtype=None, copy=False)

pandas.DataFrame
二維資料
data

輸入的資料

array-like
Dict of 1D ndarrays, lists, dicts, or Series
單個 Series
另一個 DataFrame

index

輸入資料擁有 row name，例：dict of Series, dicts
為指定讀取的 row name
輸入資料無 row name，例：list、structured arrays
為加入對應的 row name

columns

輸入資料擁有 column name，例：dict of Series, dicts、structured arrays
為指定讀取的 column name
輸入資料無 column name，例：list
為加入對應的 column name

dtype

資料類型

copy

是否重新 copy
若無 copy，請小心資料的更改

範例

Series

import numpy as np
import pandas as pd

# Series
d = {'one' : pd.Series([1., 2., 3.], index=['a', 'b', 'c']),
     'two' : pd.Series([1., 2., 3., 4.], index=['a', 'b', 'c', 'd'])}
     
df = pd.DataFrame(d)
print(df)
#    one  two
# a  1.0  1.0
# b  2.0  2.0
# c  3.0  3.0
# d  NaN  4.0

# 指定 index
print(pd.DataFrame(d, index=['d', 'b', 'a']))
#    one  two
# d  NaN  4.0
# b  2.0  2.0
# a  1.0  1.0

# 指定 column name
print(pd.DataFrame(d, index=['d', 'b', 'a'], columns=['two', 'one', 'three']))
#    two three
# d  4.0   NaN
# b  2.0   NaN
# a  1.0   NaN

Dict

import numpy as np
import pandas as pd

# dict
d = {'one' : [1., 2., 3., 4.],
     'two' : [4., 3., 2., 1.]}

print(pd.DataFrame(d))
#    one  two
# 0  1.0  4.0
# 1  2.0  3.0
# 2  3.0  2.0
# 3  4.0  1.0

# 加入 index
print(pd.DataFrame(d, index=['a', 'b', 'c', 'd']))
#    one  two
# a  1.0  4.0
# b  2.0  3.0
# c  3.0  2.0
# d  4.0  1.0

Structured arrays

import numpy as np
import pandas as pd

# structured arrays
# 建立四個空白資料，並擁有三種資料 'A' 'B' 'C'，其對應資料類型為 integer 4 bytes, float 4 bytes, string 10 bytes
data = np.zeros((4,), dtype=[('A', 'i4'),('B', 'f4'),('C', 'S10')])

data[:] = [(1,2.,'Hello'), (2,3.,"World"), (6,8.,"!!"), (9,11.,"??")]
print(data)
# [(1,   2., b'Hello') (2,   3., b'World') (6,   8., b'!!') (9,  11., b'??')]

# 印出 A 的資料
print(data['A'])
# [1 2 6 9]

print(pd.DataFrame(data))
#    A    B         C
# 0  1  2.0  b'Hello'
# 1  2  3.0  b'World'
# 2  6  8.0     b'!!'
# 3  9  11.0    b'??'

print(pd.DataFrame(data, index=['first', 'second', 'three', 'four']))
#         A    B         C
# first   1  2.0  b'Hello'
# second  2  3.0  b'World'
# three   6  8.0     b'!!'
# four    9  11.0    b'??'

print(pd.DataFrame(data, columns=['C', 'A', 'B']))
#           C  A    B
# 0  b'Hello'  1  2.0
# 1  b'World'  2  3.0
# 2     b'!!'  6  8.0
# 3     b'??'  9  11.0

Multi-indexed frame

import numpy as np
import pandas as pd

# multi-indexed frame
df = pd.DataFrame({('a', 'b'): {('A', 'B'): 1, ('A', 'C'): 2},
                   ('a', 'a'): {('A', 'C'): 3, ('A', 'B'): 4},
                   ('a', 'c'): {('A', 'B'): 5, ('A', 'C'): 6},
                   ('b', 'a'): {('A', 'C'): 7, ('A', 'B'): 8},
                   ('b', 'b'): {('A', 'D'): 9, ('A', 'B'): 10}})
                  
print(df)
#        a              b
#        a    b    c    a     b
# A B  4.0  1.0  5.0  8.0  10.0
#   C  3.0  2.0  6.0  7.0   NaN
#   D  NaN  NaN  NaN  NaN   9.0

print(df['b']['a']['A']['C'])
# 7.0

class pandas.Panel(data=None, items=None, major_axis=None, minor_axis=None, copy=False, dtype=None)

pandas.Panel
三維資料，但不常使用
data

輸入的資料

items

資料大項的名字

major_axis

加入 row name

minor_axis

加入 column name

copy

是否重新 copy
若無 copy，請小心資料的更改

dtype

資料類型

範例

numpy 建立

import numpy as np
import pandas as pd

# 3D ndarray
wp = pd.Panel(np.random.randn(2, 5, 4), items=['Item1', 'Item2'],
              major_axis=pd.date_range('1/1/2000', periods=5),
              minor_axis=['A', 'B', 'C', 'D'])
print(wp)
# <class 'pandas.core.panel.Panel'>
# Dimensions: 2 (items) x 5 (major_axis) x 4 (minor_axis)
# Items axis: Item1 to Item2
# Major_axis axis: 2000-01-01 00:00:00 to 2000-01-05 00:00:00
# Minor_axis axis: A to D

DataFrame 建立

import numpy as np
import pandas as pd
             
# Series
d = {'item1' : pd.DataFrame({'one' : pd.Series([1., 2., 3.], index=['a', 'b', 'c']),
                             'two' : pd.Series([1., 2., 3., 4.], index=['a', 'b', 'c', 'd'])}),
     'item2' : pd.DataFrame({'thr' : pd.Series([1., 2., 3.], index=['e', 'f', 'g'])})}
     
wp = pd.Panel(d)
print(wp)
# <class 'pandas.core.panel.Panel'>
# Dimensions: 2 (items) x 7 (major_axis) x 3 (minor_axis)
# Items axis: item1 to item2
# Major_axis axis: a to g
# Minor_axis axis: one to two

# filter_observations 預設為 True，表示不顯示含有 NaN 的欄位
# 所以轉換後為 Empty DataFrame
print(wp.to_frame())
# Empty DataFrame
# Columns: [item1, item2]
# Index: []

print(wp.to_frame(filter_observations=False))
#              item1  item2
# major minor
# a     one      1.0    NaN
#       thr      NaN    NaN
#       two      1.0    NaN
# b     one      2.0    NaN
#       thr      NaN    NaN
#       two      2.0    NaN
# c     one      3.0    NaN
#       thr      NaN    NaN
#       two      3.0    NaN
# d     one      NaN    NaN
#       thr      NaN    NaN
#       two      4.0    NaN
# e     one      NaN    NaN
#       thr      NaN    1.0
#       two      NaN    NaN
# f     one      NaN    NaN
#       thr      NaN    2.0
#       two      NaN    NaN
# g     one      NaN    NaN
#       thr      NaN    3.0
#       two      NaN    NaN

基本操作

得值

原始資料

import numpy as np
import pandas as pd

dates = pd.date_range('20130101', periods=6)
df = pd.DataFrame(np.random.randn(6,4), index=dates, columns=list('ABCD'))
print(df)
#                    A         B         C         D
# 2013-01-01  0.254520 -0.263404 -2.210508  0.807513
# 2013-01-02  0.024263 -0.014928 -0.601224  0.561930
# 2013-01-03 -0.628894 -0.419761  1.051119 -0.593262
# 2013-01-04 -1.116615 -0.027055  0.537114 -0.955006
# 2013-01-05  0.637412 -0.188929 -0.354975  1.693871
# 2013-01-06  1.014020  0.812423  0.420242 -0.633087

Getting

# get by []，只能一個 index
print(df['A']) # 等同 df.A
# 2013-01-01    0.254520
# 2013-01-02    0.024263
# 2013-01-03   -0.628894
# 2013-01-04   -1.116615
# 2013-01-05    0.637412
# 2013-01-06    1.014020
# Freq: D, Name: A, dtype: float64
# Freq 表示間隔頻率為一天

# get by [[]]，可以多個 columns
print(df[['A', 'B']])
#                    A         B     
# 2013-01-01  0.254520 -0.263404 
# 2013-01-02  0.024263 -0.014928 
# 2013-01-03 -0.628894 -0.419761 
# 2013-01-04 -1.116615 -0.027055 
# 2013-01-05  0.637412 -0.188929 
# 2013-01-06  1.014020  0.812423 

print(df[0:10])
#                    A         B         C         D
# 2013-01-01  0.254520 -0.263404 -2.210508  0.807513
# 2013-01-02  0.024263 -0.014928 -0.601224  0.561930
# 2013-01-03 -0.628894 -0.419761  1.051119 -0.593262
# 2013-01-04 -1.116615 -0.027055  0.537114 -0.955006
# 2013-01-05  0.637412 -0.188929 -0.354975  1.693871
# 2013-01-06  1.014020  0.812423  0.420242 -0.633087

print(df['20130102':'20130104'])
#                    A         B         C         D
# 2013-01-02  0.024263 -0.014928 -0.601224  0.561930
# 2013-01-03 -0.628894 -0.419761  1.051119 -0.593262
# 2013-01-04 -1.116615 -0.027055  0.537114 -0.955006

Selection by Label

print(df.loc['20130102':'20130104', 'A'])
# 2013-01-02    0.024263
# 2013-01-03   -0.628894
# 2013-01-04   -1.116615
# Freq: D, Name: A, dtype: float64

Selection by Position

# get by index
# index 需正確，可用 df.reset_index(drop=True)
print(df.iloc[0:10, 0])
# 2013-01-01    0.254520
# 2013-01-02    0.024263
# 2013-01-03   -0.628894
# 2013-01-04   -1.116615
# 2013-01-05    0.637412
# 2013-01-06    1.014020
# Freq: D, Name: A, dtype: float64

Selection by Mix

# get by index, label
print(df.ix[0:10, 'A'])
# 2013-01-01    0.254520
# 2013-01-02    0.024263
# 2013-01-03   -0.628894
# 2013-01-04   -1.116615
# 2013-01-05    0.637412
# 2013-01-06    1.014020
# Freq: D, Name: A, dtype: float64

Boolean Indexing

print(df[df.A > 0])
#                    A         B         C         D
# 2013-01-01  0.254520 -0.263404 -2.210508  0.807513
# 2013-01-02  0.024263 -0.014928 -0.601224  0.561930
# 2013-01-05  0.637412 -0.188929 -0.354975  1.693871
# 2013-01-06  1.014020  0.812423  0.420242 -0.633087

isin

df2 = df.copy()
df2['E'] = ['one', 'one','two','three','four','three']
print(df2)
#                    A         B         C         D      E
# 2013-01-01  0.254520 -0.263404 -2.210508  0.807513    one
# 2013-01-02  0.024263 -0.014928 -0.601224  0.561930    one
# 2013-01-03 -0.628894 -0.419761  1.051119 -0.593262    two
# 2013-01-04 -1.116615 -0.027055  0.537114 -0.955006  three
# 2013-01-05  0.637412 -0.188929 -0.354975  1.693871   four
# 2013-01-06  1.014020  0.812423  0.420242 -0.633087  three

print(df2[df2['E'].isin(['two','four'])])
#                    A         B         C         D      E
# 2013-01-03 -0.628894 -0.419761  1.051119 -0.593262    two
# 2013-01-05  0.637412 -0.188929 -0.354975  1.693871   four

values

# 轉換為 numpy 格式
print(df.values)
# array([[ 0.254520, -0.263404, -2.210508,  0.807513],
#        [ 0.024263, -0.014928, -0.601224,  0.561930],
#        [-0.628894, -0.419761,  1.051119, -0.593262],
#        [-1.116615, -0.027055,  0.537114, -0.955006],
#        [ 0.637412, -0.188929, -0.354975,  1.693871],
#        [ 1.014020,  0.812423,  0.420242, -0.633087]])

ravel()

# 轉換為 numpy 一維格式
print(df['A'].ravel())
# array([0.254520,  0.024263, -0.628894, -1.116615,  0.637412,  1.014020])

賦值

利用 series, label , index, numpy

import numpy as np
import pandas as pd

df = pd.DataFrame()
date = pd.date_range('20130102', periods=6)

s1 = pd.Series([1,2,3,4,5,6], index=date)

# by series
df['F'] = s1
# by label
df.at[date[2],'A'] = 1000
# by index
df.iat[0,1] = 0
# by np
df.loc[:,'D'] = np.array([5] * len(df))

print(df)
#             F       A  D
# 2013-01-02  1     0.0  5
# 2013-01-03  2     NaN  5
# 2013-01-04  3  1000.0  5
# 2013-01-05  4     NaN  5
# 2013-01-06  5     NaN  5
# 2013-01-07  6     NaN  5

# by 任意得值的方法
df.loc['20130102':'20130103', 'A'] = 1
# index 需正確，可用 df.reset_index(drop=True)
df.iloc[3:5, 0] = 10
df.ix[5, 'A'] = 0

print(df)
#             F       A  D
# 2013-01-02  1     1.0  5
# 2013-01-03  2     1.0  5
# 2013-01-04  3  1000.0  5
# 2013-01-05  4    10.0  5
# 2013-01-06  5    10.0  5
# 2013-01-07  6     0.0  5

# 只有符合條件才會賦值
df[df > 10] = -df
print(df)
#             F       A  D
# 2013-01-02  1     1.0  5
# 2013-01-03  2     1.0  5
# 2013-01-04  3 -1000.0  5
# 2013-01-05  4    10.0  5
# 2013-01-06  5    10.0  5
# 2013-01-07  6     0.0  5

合併表格

Merge, join, and concatenate

import pandas as pd

df1 = pd.DataFrame({'X1':['a', 'b', 'c'],
                    'X2':[1, 2, 3],})
#   X1  X2
# 0  a   1
# 1  b   2
# 2  c   3

df2 = pd.DataFrame({'X1':['a', 'b', 'd'],
                    'X3':[4, None, 6],})
#   X1   X3
# 0  a  4.0
# 1  b  NaN
# 2  d  6.0

df1.merge(df2, how='left', on='X1')
#   X1  X2   X3
# 0  a   1  4.0
# 1  b   2  NaN
# 2  c   3  NaN

df1.merge(df2, how='right', on='X1')
#   X1   X2   X3
# 0  a  1.0  4.0
# 1  b  2.0  NaN
# 2  d  NaN  6.0

df1.merge(df2, how='outer', on='X1')
#   X1   X2   X3
# 0  a  1.0  4.0
# 1  b  2.0  NaN
# 2  c  3.0  NaN
# 3  d  NaN  6.0

# join 必需有相同的 index 才行
df1.set_index('X1').join(df2.set_index('X1'), how='right')
#      X2   X3
# X1
# a   1.0  4.0
# b   2.0  NaN
# d   NaN  6.0

# 方向為 index
df1.append(df2)
#   X1   X2   X3
# 0  a  1.0  NaN
# 1  b  2.0  NaN
# 2  c  3.0  NaN
# 0  a  NaN  4.0
# 1  b  NaN  NaN
# 2  d  NaN  6.0

# 方向為 index
pd.concat([df1, df2])
#   X1   X2   X3
# 0  a  1.0  NaN
# 1  b  2.0  NaN
# 2  c  3.0  NaN
# 0  a  NaN  4.0
# 1  b  NaN  NaN
# 2  d  NaN  6.0

# 方向為 columns
pd.concat([df1, df2], axis='columns')
#   X1  X2 X1   X3
# 0  a   1  a  4.0
# 1  b   2  b  NaN
# 2  c   3  d  6.0

調整表格

import pandas as pd

df = pd.DataFrame({'foo': ['one','one','one','two','two','two'],
                       'bar': ['A', 'B', 'C', 'A', 'B', 'C'],
                       'baz': [1, 2, 3, 4, 5, 6]})
#     foo   bar  baz
# 0   one   A    1
# 1   one   B    2
# 2   one   C    3
# 3   two   A    4
# 4   two   B    5
# 5   two   C    6

# 重新調整表格，指定排列方式
df.pivot(index='foo', columns='bar', values='baz')
#      A   B   C
# one  1   2   3
# two  4   5   6

# pivot 的進階版，指定排列方式，可設定值的處理，預設為平均
df.pivot_table(index='foo', columns='bar', values='baz')
#      A   B   C
# one  1   2   3
# two  4   5   6

import pandas as pd
import numpy as np

df = pd.DataFrame({"A": ["foo", "foo", "foo", "foo", "foo",
                          "bar", "bar", "bar", "bar"],
                   "B": ["one", "one", "one", "two", "two",
                          "one", "one", "two", "two"],
                   "C": ["small", "large", "large", "small",
                          "small", "large", "small", "small",
                          "large"],
                   "D": [1, 2, 2, 3, 3, 4, 5, 6, 7]})
#     A    B      C  D
# 0  foo  one  small  1
# 1  foo  one  large  2
# 2  foo  one  large  2
# 3  foo  two  small  3
# 4  foo  two  small  3
# 5  bar  one  large  4
# 6  bar  one  small  5
# 7  bar  two  small  6
# 8  bar  two  large  7

df.pivot_table(index=['A', 'B'], columns=['C'], values='D', aggfunc=np.sum)
# C        large  small
# A   B
# bar one    4.0    5.0
#     two    7.0    6.0
# foo one    4.0    1.0
#     two    NaN    6.0

import pandas as pd

df = pd.DataFrame({'a': {'one':1.0, 'two':2.0},
                   'b': {'one':3.0, 'two':4.0}})
#       a    b
# one  1.0  3.0
# two  2.0  4.0

# 把 columns 移到 index，可能會得到 Series
df.stack()
# one  a    1.0
#      b    3.0
# two  a    2.0
#      b    4.0

# 把 index 移到 columns，可能會得到 Series
df.stack().unstack()
#       a    b
# one  1.0  3.0
# two  2.0  4.0

# 重新排列 index 並把 index 移至 columns，仍是 DataFrame
df.reset_index()
#   index    a    b
# 0   one  1.0  3.0
# 1   two  2.0  4.0

import pandas as pd

df = pd.DataFrame({'A': {0: 'a', 1: 'b', 2: 'c'},
                   'B': {0: 1, 1: 3, 2: 5},
                   'C': {0: 2, 1: 4, 2: 6}})
#    A  B  C
# 0  a  1  2
# 1  b  3  4
# 2  c  5  6

# 新增對應 B C 的行與其對應的值
df.melt(id_vars=['A'], value_vars=['B', 'C'])
#    A variable  value
# 0  a        B      1
# 1  b        B      3
# 2  c        B      5
# 3  a        C      2
# 4  b        C      4
# 5  c        C      6

統計運算

原始資料

import numpy as np
import pandas as pd

dates = pd.date_range('20130101', periods=6)
df = pd.DataFrame(np.ones((6, 4))*2, index=dates, columns=list('ABCD'), dtype='float64')
print(df)
#               A    B    C    D
# 2013-01-01  2.0  2.0  2.0  2.0
# 2013-01-02  2.0  2.0  2.0  2.0
# 2013-01-03  2.0  2.0  2.0  2.0
# 2013-01-04  2.0  2.0  2.0  2.0
# 2013-01-05  2.0  2.0  2.0  2.0
# 2013-01-06  2.0  2.0  2.0  2.0

數學運算

# 數學運算
print(df+10)
#                A     B     C     D
# 2013-01-01  12.0  12.0  12.0  12.0
# 2013-01-02  12.0  12.0  12.0  12.0
# 2013-01-03  12.0  12.0  12.0  12.0
# 2013-01-04  12.0  12.0  12.0  12.0
# 2013-01-05  12.0  12.0  12.0  12.0
# 2013-01-06  12.0  12.0  12.0  12.0

print(df-10)
#               A    B    C    D
# 2013-01-01 -8.0 -8.0 -8.0 -8.0
# 2013-01-02 -8.0 -8.0 -8.0 -8.0
# 2013-01-03 -8.0 -8.0 -8.0 -8.0
# 2013-01-04 -8.0 -8.0 -8.0 -8.0
# 2013-01-05 -8.0 -8.0 -8.0 -8.0
# 2013-01-06 -8.0 -8.0 -8.0 -8.0

print(df/10)
#               A    B    C    D
# 2013-01-01  0.2  0.2  0.2  0.2
# 2013-01-02  0.2  0.2  0.2  0.2
# 2013-01-03  0.2  0.2  0.2  0.2
# 2013-01-04  0.2  0.2  0.2  0.2
# 2013-01-05  0.2  0.2  0.2  0.2
# 2013-01-06  0.2  0.2  0.2  0.2

print(df*10)
#                A     B     C     D
# 2013-01-01  20.0  20.0  20.0  20.0
# 2013-01-02  20.0  20.0  20.0  20.0
# 2013-01-03  20.0  20.0  20.0  20.0
# 2013-01-04  20.0  20.0  20.0  20.0
# 2013-01-05  20.0  20.0  20.0  20.0
# 2013-01-06  20.0  20.0  20.0  20.0

print(df**2)
#               A    B    C    D
# 2013-01-01  4.0  4.0  4.0  4.0
# 2013-01-02  4.0  4.0  4.0  4.0
# 2013-01-03  4.0  4.0  4.0  4.0
# 2013-01-04  4.0  4.0  4.0  4.0
# 2013-01-05  4.0  4.0  4.0  4.0
# 2013-01-06  4.0  4.0  4.0  4.0

統計計算

# 統計
print(df.describe())
#          A    B    C    D
# count  6.0  6.0  6.0  6.0
# mean   2.0  2.0  2.0  2.0
# std    0.0  0.0  0.0  0.0
# min    2.0  2.0  2.0  2.0
# 25%    2.0  2.0  2.0  2.0
# 50%    2.0  2.0  2.0  2.0
# 75%    2.0  2.0  2.0  2.0
# max    2.0  2.0  2.0  2.0

# mean 值
print(df.mean())
# A    2.0
# B    2.0
# C    2.0
# D    2.0
# dtype: float64

# 累積計算
print(df.cumsum())
#                A     B     C     D
# 2013-01-01   2.0   2.0   2.0   2.0
# 2013-01-02   4.0   4.0   4.0   4.0
# 2013-01-03   6.0   6.0   6.0   6.0
# 2013-01-04   8.0   8.0   8.0   8.0
# 2013-01-05  10.0  10.0  10.0  10.0
# 2013-01-06  12.0  12.0  12.0  12.0

缺值

缺值處理，丟棄或給予其他值

import numpy as np
import pandas as pd

df = pd.DataFrame()
date = pd.date_range('20130102', periods=6)

s1 = pd.Series([1,2,3,4,5,6], index=date)

# by series
df['F'] = s1
# by label
df.at[date[2],'A'] = 1000
# by index
df.iat[0,1] = 0
# by np
df.loc[:,'D'] = np.array([5] * len(df))

print(df)
#             F       A  D
# 2013-01-02  1     0.0  5
# 2013-01-03  2     NaN  5
# 2013-01-04  3  1000.0  5
# 2013-01-05  4     NaN  5
# 2013-01-06  5     NaN  5
# 2013-01-07  6     NaN  5

# DataFrame.dropna(axis=0, how='any', thresh=None, subset=None, inplace=False)
# 預設 axis=0 => "index"
# how='any'，只要有 NaN 就丟掉
# how='all'，全部為 NaN 才丟掉
print(df.dropna(how='any'))
#             F       A  D
# 2013-01-02  1     0.0  5
# 2013-01-04  3  1000.0  5

# 將 NaN 填入值
print(df.fillna(value='Test'))
#             F     A  D
# 2013-01-02  1     0  5
# 2013-01-03  2  Test  5
# 2013-01-04  3  1000  5
# 2013-01-05  4  Test  5
# 2013-01-06  5  Test  5
# 2013-01-07  6  Test  5

Grouping

分類處理

import numpy as np
import pandas as pd

df = pd.DataFrame({'A' : ['foo', 'bar', 'foo', 'bar',
                       'foo', 'bar', 'foo', 'foo'],
                   'B' : ['one', 'one', 'two', 'three',
                       'two', 'two', 'one', 'three'],
                   'C' : np.linspace(1, 8, num=8),
                   'D' : np.linspace(-1, -8, num=8)})

print(df)
#      A      B    C    D
# 0  foo    one  1.0 -1.0
# 1  bar    one  2.0 -2.0
# 2  foo    two  3.0 -3.0
# 3  bar  three  4.0 -4.0
# 4  foo    two  5.0 -5.0
# 5  bar    two  6.0 -6.0
# 6  foo    one  7.0 -7.0

print(df.groupby('A').sum())
#         C     D
# A
# bar  12.0 -12.0
# foo  24.0 -24.0

print(df.groupby(['A','B']).sum())
#              C    D
# A   B
# bar one    2.0 -2.0
#     three  4.0 -4.0
#     two    6.0 -6.0
# foo one    8.0 -8.0
#     three  8.0 -8.0
#     two    8.0 -8.0

讀寫檔

通常回傳的是 DataFrame
列出比較常用，詳細可參考 IO Tools

sep 可用正規表示法
但若超過一個字且不為 '\s+' 會強迫使用 python parsing engine
pandas.read_pickle(path)
pandas.read_clipboard(**kwargs)
pandas.read_table(filepath_or_buffer[, sep, ...])
pandas.read_csv(filepath_or_buffer[, sep, ...])
pandas.read_excel(io[, sheetname, header, ...])
pandas.read_json([path_or_buf, orient, typ, dtype, ...])
pandas.read_html(io[, match, flavor, header, ...])
pandas.read_sql(sql, con[, index_col, ...])

範例

import pandas as pd

# 預設 sep=','
pd.read_csv('test.csv')
#      a    b    c
# 0    1    2    3
# 1    4    5    6

# 預設 sep='\t'
# 自定 column names
pd.read_table('test.txt', names=['aa', 'bb', 'cc'])
#     aa    bb    cc
# 0    a    b    c
# 1    1    2    3
# 2    4    5    6

# 需要 xlrd package 
# 資料無標題
pd.read_excel('test.xlsx', header=None)
#      0    1    2    3    4
# 0    1    2    3    4    5
# 1    6    7    8    9    10

# 中文路徑會有問題，可用 open 取代
# sep 可用正規表示法，超過一個字且不為 '\s+' 不指定 engine 會有 warning
with open('新文字文件.txt', 'r') as f:
    pd.read_table(f, sep='[a-z]', engine='python')
#      a    b    c
# 0    1    2    3
# 1    4    5    6

參考
Pandas Cheat Sheet: Data Wrangling in Python

子風的知識庫

搜尋此網誌