6  Pandas

import pandas as pd
import numpy as np
Xdf=pd.DataFrame(np.arange(16).reshape(4,4))
Xdf
0 1 2 3
0 0 1 2 3
1 4 5 6 7
2 8 9 10 11
3 12 13 14 15
Xdf[0]
0     0
1     4
2     8
3    12
Name: 0, dtype: int64
Xdf[0:3]
0 1 2 3
0 0 1 2 3
1 4 5 6 7
2 8 9 10 11
Xdf[Xdf[1]<10]
0 1 2 3
0 0 1 2 3
1 4 5 6 7
2 8 9 10 11
Xdf[Xdf<10]
0 1 2 3
0 0.0 1.0 2.0 3.0
1 4.0 5.0 6.0 7.0
2 8.0 9.0 NaN NaN
3 NaN NaN NaN NaN
Xdf[[0,3,2]]
0 3 2
0 0 3 2
1 4 7 6
2 8 11 10
3 12 15 14
Xdf.loc[0:1]
0 1 2 3
0 0 1 2 3
1 4 5 6 7
Xdf.iloc[0:1]
0 1 2 3
0 0 1 2 3
Xdf[0:1]
0 1 2 3
0 0 1 2 3
# Exercise 2
DF=pd.DataFrame(dict(X1=np.random.uniform(0,1,15),
                X2=np.random.standard_normal(15),
                X3=1))
DF
X1 X2 X3
0 0.002925 -0.693743 1
1 0.033349 0.774359 1
2 0.879925 -0.317643 1
3 0.810953 0.087859 1
4 0.655664 0.336143 1
5 0.748835 -0.021620 1
6 0.971919 -0.463584 1
7 0.933795 0.763103 1
8 0.898300 -0.311474 1
9 0.382806 -0.697514 1
10 0.621569 1.838894 1
11 0.693038 0.949865 1
12 0.629064 -0.204435 1
13 0.961133 2.522102 1
14 0.370875 1.190071 1
DF[(DF['X1']>0.5)&(DF['X1']<=0.8)].index
Index([4, 5, 10, 11, 12], dtype='int64')
DF.loc[(DF['X1']>0.5)&(DF['X1']<=0.8)].index
Index([4, 5, 10, 11, 12], dtype='int64')
DF.loc[(DF['X1']>0.5)&(DF['X1']<=0.8),:].index
Index([4, 5, 10, 11, 12], dtype='int64')
DF.iloc[np.array((DF['X1']>0.5)&(DF['X1']<=0.8))].index
Index([4, 5, 10, 11, 12], dtype='int64')
DF[['X3','X2','X1']]
X3 X2 X1
0 1 -0.693743 0.002925
1 1 0.774359 0.033349
2 1 -0.317643 0.879925
3 1 0.087859 0.810953
4 1 0.336143 0.655664
5 1 -0.021620 0.748835
6 1 -0.463584 0.971919
7 1 0.763103 0.933795
8 1 -0.311474 0.898300
9 1 -0.697514 0.382806
10 1 1.838894 0.621569
11 1 0.949865 0.693038
12 1 -0.204435 0.629064
13 1 2.522102 0.961133
14 1 1.190071 0.370875
DF.loc[DF['X2']<0,['X1']]
X1
0 0.002925
2 0.879925
5 0.748835
6 0.971919
8 0.898300
9 0.382806
12 0.629064

6.0.0.1 Input / Output functions in Pandas

# read_csv, read_excel, read_table, read_fwf ...
import os
os.getcwd()
'/Users/hwan/Desktop/Homepage/study_24spring'
pd.read_csv('pandas.csv')
S.No Name Age City Salary
0 1 Tom 28 Toronto 20000
1 2 Lee 32 HongKong 3000
2 3 Steven 43 Bay Area 8300
3 4 Ram 38 Hyderabad 3900
pd.read_csv('pandas.csv',sep=',')
S.No Name Age City Salary
0 1 Tom 28 Toronto 20000
1 2 Lee 32 HongKong 3000
2 3 Steven 43 Bay Area 8300
3 4 Ram 38 Hyderabad 3900
pd.read_csv('pandas.csv',index_col=0)
Name Age City Salary
S.No
1 Tom 28 Toronto 20000
2 Lee 32 HongKong 3000
3 Steven 43 Bay Area 8300
4 Ram 38 Hyderabad 3900
pd.read_csv('pandas.csv',index_col='Name')
S.No Age City Salary
Name
Tom 1 28 Toronto 20000
Lee 2 32 HongKong 3000
Steven 3 43 Bay Area 8300
Ram 4 38 Hyderabad 3900
pd.read_csv('pandas.csv',header=None)
0 1 2 3 4
0 S.No Name Age City Salary
1 1 Tom 28 Toronto 20000
2 2 Lee 32 HongKong 3000
3 3 Steven 43 Bay Area 8300
4 4 Ram 38 Hyderabad 3900
pd.read_csv('pandas.csv',names=[0,1,2,3,4])
0 1 2 3 4
0 S.No Name Age City Salary
1 1 Tom 28 Toronto 20000
2 2 Lee 32 HongKong 3000
3 3 Steven 43 Bay Area 8300
4 4 Ram 38 Hyderabad 3900
pd.read_csv('pandas.csv',names=[1,2,3,4])
1 2 3 4
S.No Name Age City Salary
1 Tom 28 Toronto 20000
2 Lee 32 HongKong 3000
3 Steven 43 Bay Area 8300
4 Ram 38 Hyderabad 3900
pd.read_csv('pandas.csv',dtype={'Salary':np.float64})
S.No Name Age City Salary
0 1 Tom 28 Toronto 20000.0
1 2 Lee 32 HongKong 3000.0
2 3 Steven 43 Bay Area 8300.0
3 4 Ram 38 Hyderabad 3900.0
pd.read_csv('pandas.csv',names=['a','b','c','d','e'])
a b c d e
0 S.No Name Age City Salary
1 1 Tom 28 Toronto 20000
2 2 Lee 32 HongKong 3000
3 3 Steven 43 Bay Area 8300
4 4 Ram 38 Hyderabad 3900
pd.read_csv('pandas.csv',names=['a','b','c','d'],header=0,index_col=0)
a b c d
1 Tom 28 Toronto 20000
2 Lee 32 HongKong 3000
3 Steven 43 Bay Area 8300
4 Ram 38 Hyderabad 3900
pd.read_csv('pandas.csv',names=['a','b','c','d'],skiprows=1,index_col=0)
a b c d
1 Tom 28 Toronto 20000
2 Lee 32 HongKong 3000
3 Steven 43 Bay Area 8300
4 Ram 38 Hyderabad 3900
pd.read_csv('pandas.csv',na_values=['Tom'])
S.No Name Age City Salary
0 1 NaN 28 Toronto 20000
1 2 Lee 32 HongKong 3000
2 3 Steven 43 Bay Area 8300
3 4 Ram 38 Hyderabad 3900

6.0.0.2 Binary Operations

A = pd.DataFrame(np.random.randint(0, 20, (2, 2)), columns=list('AB'))
B = pd.DataFrame(np.random.randint(0, 10, (3, 3)), columns=list('BAC'))
print( A, B, sep="\n\n")
    A  B
0   0  1
1  18  1

   B  A  C
0  1  9  9
1  4  6  2
2  4  1  0
A+B
A B C
0 9.0 2.0 NaN
1 24.0 5.0 NaN
2 NaN NaN NaN
A.add(B,fill_value=0)
A B C
0 9.0 2.0 9.0
1 24.0 5.0 2.0
2 1.0 4.0 0.0
B-B.loc[0,:]
B A C
0 0 0 0
1 3 -3 -7
2 3 -8 -9
B-B.loc[0:0,:]
B A C
0 0.0 0.0 0.0
1 NaN NaN NaN
2 NaN NaN NaN
B.sub(B.loc[0,:])
B A C
0 0 0 0
1 3 -3 -7
2 3 -8 -9
B.sub(B['A'],axis=0)
B A C
0 -8 0 0
1 -2 0 -4
2 3 0 -1

6.0.0.3 Pandas DataFrame Manipulation

import numpy as np
import pandas as pd
df = pd.DataFrame(np.random.rand(7, 5))
df
0 1 2 3 4
0 0.254541 0.022793 0.292160 0.256052 0.742068
1 0.959650 0.210256 0.937732 0.523141 0.660096
2 0.694372 0.674311 0.502167 0.164087 0.527422
3 0.591313 0.952835 0.800817 0.357809 0.029770
4 0.004058 0.930224 0.908414 0.104988 0.372187
5 0.062482 0.480021 0.220693 0.820707 0.578013
6 0.678025 0.481827 0.727163 0.119174 0.252283
df[[0,1]]
0 1
0 0.254541 0.022793
1 0.959650 0.210256
2 0.694372 0.674311
3 0.591313 0.952835
4 0.004058 0.930224
5 0.062482 0.480021
6 0.678025 0.481827