Working with Pandas


Working with Pandas
In [1]:
import numpy as np
import pandas as pd

# Build a Series from a plain list; the np.nan entry is the missing value,
# and the printed result is stored as float64.
values = [1, 3, 5, np.nan, 6, 8]
s = pd.Series(values)
print(s)
0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64
In [2]:
# Six consecutive calendar days starting on 2013-01-01, used below as a row index.
dates = pd.date_range(start='20130101', periods=6)
print(dates)
DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')
In [5]:
# 6x4 frame built from a NumPy array of standard-normal draws, indexed by
# `dates`, with columns A-D. No seed is set in the visible cells, so the
# numbers differ on every run.
random_values = np.random.randn(6, 4)
df = pd.DataFrame(random_values, index=dates, columns=list('ABCD'))
print(df)
                   A         B         C         D
2013-01-01  0.375465  1.208336  0.402344  0.620279
2013-01-02 -1.013392  0.697856 -1.108243  0.556585
2013-01-03  1.972607  0.126715 -1.203313  1.335513
2013-01-04  0.898955  1.949423 -1.090135  2.268246
2013-01-05 -1.569889 -0.062766 -0.289277  0.351344
2013-01-06  0.369853 -0.490518  0.654728 -0.683796
In [22]:
# Build a frame from a dict of columns: the scalar entries (A, B, F) are
# repeated for every row, while C, D and E each supply four values.
column_spec = {
    'A': 1.,
    'B': pd.Timestamp('20130102'),
    'C': pd.Series(1, index=[0, 1, 2, 3], dtype='float32'),
    'D': np.array([3, 3, 3, 3], dtype='int32'),
    'E': pd.Categorical(["test", "train", "test", "train"]),
    'F': 'foo',
}
df = pd.DataFrame(column_spec)
print(df)

# The frame has only four rows, so head() prints the same table again.
print(df.head())

print(df.index)
     A          B    C  D      E    F
0  1.0 2013-01-02  1.0  3   test  foo
1  1.0 2013-01-02  1.0  3  train  foo
2  1.0 2013-01-02  1.0  3   test  foo
3  1.0 2013-01-02  1.0  3  train  foo
     A          B    C  D      E    F
0  1.0 2013-01-02  1.0  3   test  foo
1  1.0 2013-01-02  1.0  3  train  foo
2  1.0 2013-01-02  1.0  3   test  foo
3  1.0 2013-01-02  1.0  3  train  foo
Int64Index([0, 1, 2, 3], dtype='int64')
In [21]:
# Show the column labels of the current frame.
print(df.columns)
Index(['A', 'B', 'C', 'D', 'E', 'F'], dtype='object')
In [23]:
# The frame's contents as a NumPy array; for this mixed-type frame the
# displayed result has dtype=object.
df.values
Out[23]:
array([[1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'test', 'foo'],
       [1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'train', 'foo'],
       [1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'test', 'foo'],
       [1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'train', 'foo']], dtype=object)
In [27]:
# Iterating over a string yields its individual characters.
print([ch for ch in '1234'])
['1', '2', '3', '4']
In [29]:
# Fresh 6x4 frame of standard-normal draws indexed by `dates` (unseeded in
# the visible cells, so values change on every run).
df = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=list('ABCD'))
print(df)

# Order the rows by their index labels in descending order (latest date first).
df.sort_index(axis='index', ascending=False)
                   A         B         C         D
2013-01-01 -0.027837  0.077297 -0.124679 -0.089361
2013-01-02 -0.333209 -0.524610 -0.968841  0.705234
2013-01-03 -0.097520 -0.280874  0.468379  0.520813
2013-01-04  0.047438  1.877535 -1.117088  1.213645
2013-01-05 -0.866087 -0.381429 -0.650145 -1.208026
2013-01-06  0.509621  1.447343 -0.089048 -0.320007
Out[29]:
A B C D
2013-01-06 0.509621 1.447343 -0.089048 -0.320007
2013-01-05 -0.866087 -0.381429 -0.650145 -1.208026
2013-01-04 0.047438 1.877535 -1.117088 1.213645
2013-01-03 -0.097520 -0.280874 0.468379 0.520813
2013-01-02 -0.333209 -0.524610 -0.968841 0.705234
2013-01-01 -0.027837 0.077297 -0.124679 -0.089361
In [32]:
# Order the columns by their labels in descending order (D, C, B, A).
df.sort_index(axis='columns', ascending=False)
Out[32]:
D C B A
2013-01-01 -0.089361 -0.124679 0.077297 -0.027837
2013-01-02 0.705234 -0.968841 -0.524610 -0.333209
2013-01-03 0.520813 0.468379 -0.280874 -0.097520
2013-01-04 1.213645 -1.117088 1.877535 0.047438
2013-01-05 -1.208026 -0.650145 -0.381429 -0.866087
2013-01-06 -0.320007 -0.089048 1.447343 0.509621
In [33]:
# Order the rows by the values in column B, smallest first.
df.sort_values(by=['B'])
Out[33]:
A B C D
2013-01-02 -0.333209 -0.524610 -0.968841 0.705234
2013-01-05 -0.866087 -0.381429 -0.650145 -1.208026
2013-01-03 -0.097520 -0.280874 0.468379 0.520813
2013-01-01 -0.027837 0.077297 -0.124679 -0.089361
2013-01-06 0.509621 1.447343 -0.089048 -0.320007
2013-01-04 0.047438 1.877535 -1.117088 1.213645
In [44]:
# An integer slice inside plain [] selects rows: here the first two.
df[:2]
Out[44]:
A B C D
2013-01-01 -0.431875 2.015969 0.634258 0.862669
2013-01-02 2.086694 0.829265 -0.702303 0.657043
In [37]:
# Label-based selection: rows from 2013-01-02 through 2013-01-04 (both ends
# appear in the result), restricted to columns A and B.
date_window = slice('20130102', '20130104')
df.loc[date_window, ['A', 'B']]
Out[37]:
A B
2013-01-02 -0.333209 -0.524610
2013-01-03 -0.097520 -0.280874
2013-01-04 0.047438 1.877535
In [38]:
# Every row, but only columns A and B.
wanted_columns = ['A', 'B']
df.loc[:, wanted_columns]
Out[38]:
A B
2013-01-01 -0.027837 0.077297
2013-01-02 -0.333209 -0.524610
2013-01-03 -0.097520 -0.280874
2013-01-04 0.047438 1.877535
2013-01-05 -0.866087 -0.381429
2013-01-06 0.509621 1.447343
In [39]:
# One row label plus one column label yields the single value stored there.
first_day = dates[0]
df.loc[first_day, 'A']
Out[39]:
-0.027837176903767264
In [49]:
# Fresh 6x4 frame of standard-normal draws indexed by `dates` (unseeded in
# the visible cells, so values change on every run).
df = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=list('ABCD'))
print(df)

# The fourth row fetched two ways: by its label with .loc ...
fourth_label = dates[3]
print(df.loc[fourth_label])

# ... and by its integer position with .iloc.
print(df.iloc[3])
                   A         B         C         D
2013-01-01  0.674418  0.850490 -1.928506 -0.055898
2013-01-02 -1.070952 -0.908185  0.915426 -0.196263
2013-01-03 -0.557899 -0.624848  0.119355  0.773706
2013-01-04 -1.605233  0.367749 -1.345935 -0.776756
2013-01-05  1.054863  0.585973 -0.512838  1.924100
2013-01-06 -0.374367 -0.420607  0.536889  2.099798
A   -1.605233
B    0.367749
C   -1.345935
D   -0.776756
Name: 2013-01-04 00:00:00, dtype: float64
A   -1.605233
B    0.367749
C   -1.345935
D   -0.776756
Name: 2013-01-04 00:00:00, dtype: float64
In [50]:
# Positional slicing: rows at positions 3 and 4, columns at positions 0 and 1.
print(df.iloc[3:5, :2])
                   A         B
2013-01-04 -1.605233  0.367749
2013-01-05  1.054863  0.585973
In [51]:
# Rows at positions 1 and 2, with every column kept.
df.iloc[1:3]
Out[51]:
A B C D
2013-01-02 -1.070952 -0.908185 0.915426 -0.196263
2013-01-03 -0.557899 -0.624848 0.119355 0.773706
In [52]:
 df.iloc[:,1:3]
Out[52]:
B C
2013-01-01 0.850490 -1.928506
2013-01-02 -0.908185 0.915426
2013-01-03 -0.624848 0.119355
2013-01-04 0.367749 -1.345935
2013-01-05 0.585973 -0.512838
2013-01-06 -0.420607 0.536889
In [53]:
# Two integer positions yield the single value at that row and column.
row_pos, col_pos = 1, 1
df.iloc[row_pos, col_pos]
Out[53]:
-0.90818478724555773
In [54]:
# Keep only the rows whose value in column A is greater than zero.
df[df['A'] > 0]
Out[54]:
A B C D
2013-01-01 0.674418 0.850490 -1.928506 -0.055898
2013-01-05 1.054863 0.585973 -0.512838 1.924100
In [55]:
# Element-wise mask: the shape is kept and every value that is not greater
# than zero shows up as NaN.
is_positive = df > 0
df[is_positive]
Out[55]:
A B C D
2013-01-01 0.674418 0.850490 NaN NaN
2013-01-02 NaN NaN 0.915426 NaN
2013-01-03 NaN NaN 0.119355 0.773706
2013-01-04 NaN 0.367749 NaN NaN
2013-01-05 1.054863 0.585973 NaN 1.924100
2013-01-06 NaN NaN 0.536889 2.099798
In [56]:
# Add a label column E to a copy, so the original df keeps only A-D.
labels = ['one', 'one', 'two', 'three', 'four', 'three']
df2 = df.copy()
df2['E'] = labels
print(df2)
                   A         B         C         D      E
2013-01-01  0.674418  0.850490 -1.928506 -0.055898    one
2013-01-02 -1.070952 -0.908185  0.915426 -0.196263    one
2013-01-03 -0.557899 -0.624848  0.119355  0.773706    two
2013-01-04 -1.605233  0.367749 -1.345935 -0.776756  three
2013-01-05  1.054863  0.585973 -0.512838  1.924100   four
2013-01-06 -0.374367 -0.420607  0.536889  2.099798  three
In [57]:
# Print df again; the output below shows it still has only columns A-D.
print(df)
                   A         B         C         D
2013-01-01  0.674418  0.850490 -1.928506 -0.055898
2013-01-02 -1.070952 -0.908185  0.915426 -0.196263
2013-01-03 -0.557899 -0.624848  0.119355  0.773706
2013-01-04 -1.605233  0.367749 -1.345935 -0.776756
2013-01-05  1.054863  0.585973 -0.512838  1.924100
2013-01-06 -0.374367 -0.420607  0.536889  2.099798
In [61]:
# Fresh 6x4 frame built from a NumPy array of standard-normal draws, indexed
# by `dates`, with columns A-D (unseeded in the visible cells, so values
# change on every run).
random_values = np.random.randn(6, 4)
df = pd.DataFrame(random_values, index=dates, columns=list('ABCD'))
print(df)
                   A         B         C         D
2013-01-01  0.165240 -0.446295 -1.473960 -2.272753
2013-01-02  0.849150 -0.848962 -1.255791  0.883076
2013-01-03 -2.605466  0.538571 -1.187388  0.386836
2013-01-04  0.622302 -1.006722  1.116170 -0.826446
2013-01-05  0.444221  1.084396  0.510132 -0.234789
2013-01-06  2.091659  0.969032 -0.167117  0.191899
In [72]:
# reindex builds a new frame aligned to the requested row and column labels.
# Column E is not present in df, so it is added here and then set to 1 in
# every row.
extended_columns = [*df.columns, 'E']
df2 = df.reindex(index=dates, columns=extended_columns)
df2.loc[:, ['E']] = 1
print(df2)
                   A         B         C         D    E
2013-01-01  0.165240 -0.446295 -1.473960 -2.272753  1.0
2013-01-02  0.849150 -0.848962 -1.255791  0.883076  1.0
2013-01-03 -2.605466  0.538571 -1.187388  0.386836  1.0
2013-01-04  0.622302 -1.006722  1.116170 -0.826446  1.0
2013-01-05  0.444221  1.084396  0.510132 -0.234789  1.0
2013-01-06  2.091659  0.969032 -0.167117  0.191899  1.0
In [73]:
# Mean of each column (one value per column label).
df.mean(axis=0)
Out[73]:
A    0.261184
B    0.048337
C   -0.409659
D   -0.312030
dtype: float64
In [74]:
# Mean of each row (one value per date).
df.mean(axis=1)
Out[74]:
2013-01-01   -1.006942
2013-01-02   -0.093132
2013-01-03   -0.716862
2013-01-04   -0.023674
2013-01-05    0.450990
2013-01-06    0.771368
Freq: D, dtype: float64
In [81]:
# Ten rows by four columns of standard-normal draws with default integer labels.
noise = np.random.randn(10, 4)
df = pd.DataFrame(noise)
In [82]:
# Bare expression as the last line of the cell: the notebook displays the frame.
df
Out[82]:
0 1 2 3
0 0.034300 0.562577 -0.793628 0.266447
1 1.164551 -0.889588 0.512698 -0.374190
2 -1.815902 0.887592 -1.588327 0.129538
3 -0.126049 0.082714 -0.000536 0.513339
4 -0.025417 0.451445 0.629116 0.772384
5 0.011039 1.621303 0.234002 0.945682
6 -0.823008 0.771252 -0.295291 0.638264
7 -2.560043 -1.152196 -0.410878 -0.967670
8 -0.474092 0.549937 1.142994 -0.170030
9 1.174293 -0.158292 0.823994 0.285797
In [83]:
# Cut the frame into three consecutive row chunks: rows 0-2, rows 3-6 and
# rows 7 onward.
cut_points = [(None, 3), (3, 7), (7, None)]
pieces = [df[start:stop] for start, stop in cut_points]
In [84]:
# Stack the chunks back on top of each other, row-wise.
pd.concat(pieces, axis=0)
Out[84]:
0 1 2 3
0 0.034300 0.562577 -0.793628 0.266447
1 1.164551 -0.889588 0.512698 -0.374190
2 -1.815902 0.887592 -1.588327 0.129538
3 -0.126049 0.082714 -0.000536 0.513339
4 -0.025417 0.451445 0.629116 0.772384
5 0.011039 1.621303 0.234002 0.945682
6 -0.823008 0.771252 -0.295291 0.638264
7 -2.560043 -1.152196 -0.410878 -0.967670
8 -0.474092 0.549937 1.142994 -0.170030
9 1.174293 -0.158292 0.823994 0.285797

Comments

Popular posts from this blog

Playing with Tableau

Visualizing with Tableau-1