6 Pandas

import pandas as pd
import numpy as np

Xdf=pd.DataFrame(np.arange(16).reshape(4,4))
Xdf

	0	1	2	3
0	0	1	2	3
1	4	5	6	7
2	8	9	10	11
3	12	13	14	15

Xdf[0]

0     0
1     4
2     8
3    12
Name: 0, dtype: int64

Xdf[0:3]

	0	1	2	3
0	0	1	2	3
1	4	5	6	7
2	8	9	10	11

Xdf[Xdf[1]<10]

	0	1	2	3
0	0	1	2	3
1	4	5	6	7
2	8	9	10	11

Xdf[Xdf<10]

	0	1	2	3
0	0.0	1.0	2.0	3.0
1	4.0	5.0	6.0	7.0
2	8.0	9.0	NaN	NaN
3	NaN	NaN	NaN	NaN

Xdf[[0,3,2]]

	0	3	2
0	0	3	2
1	4	7	6
2	8	11	10
3	12	15	14

Xdf.loc[0:1]

	0	1	2	3
0	0	1	2	3
1	4	5	6	7

Xdf.iloc[0:1]

	0	1	2	3
0	0	1	2	3

Xdf[0:1]

	0	1	2	3
0	0	1	2	3

# Exercise 2
DF=pd.DataFrame(dict(X1=np.random.uniform(0,1,15),
                X2=np.random.standard_normal(15),
                X3=1))
DF

	X1	X2	X3
0	0.002925	-0.693743	1
1	0.033349	0.774359	1
2	0.879925	-0.317643	1
3	0.810953	0.087859	1
4	0.655664	0.336143	1
5	0.748835	-0.021620	1
6	0.971919	-0.463584	1
7	0.933795	0.763103	1
8	0.898300	-0.311474	1
9	0.382806	-0.697514	1
10	0.621569	1.838894	1
11	0.693038	0.949865	1
12	0.629064	-0.204435	1
13	0.961133	2.522102	1
14	0.370875	1.190071	1

DF[(DF['X1']>0.5)&(DF['X1']<=0.8)].index

Index([4, 5, 10, 11, 12], dtype='int64')

DF.loc[(DF['X1']>0.5)&(DF['X1']<=0.8)].index

Index([4, 5, 10, 11, 12], dtype='int64')

DF.loc[(DF['X1']>0.5)&(DF['X1']<=0.8),:].index

Index([4, 5, 10, 11, 12], dtype='int64')

DF.iloc[np.array((DF['X1']>0.5)&(DF['X1']<=0.8))].index

Index([4, 5, 10, 11, 12], dtype='int64')

DF[['X3','X2','X1']]

	X3	X2	X1
0	1	-0.693743	0.002925
1	1	0.774359	0.033349
2	1	-0.317643	0.879925
3	1	0.087859	0.810953
4	1	0.336143	0.655664
5	1	-0.021620	0.748835
6	1	-0.463584	0.971919
7	1	0.763103	0.933795
8	1	-0.311474	0.898300
9	1	-0.697514	0.382806
10	1	1.838894	0.621569
11	1	0.949865	0.693038
12	1	-0.204435	0.629064
13	1	2.522102	0.961133
14	1	1.190071	0.370875

DF.loc[DF['X2']<0,['X1']]

	X1
0	0.002925
2	0.879925
5	0.748835
6	0.971919
8	0.898300
9	0.382806
12	0.629064

6.0.0.1 Input / Output functions in Pandas

# read_csv, read_excel, read_table, read_fwf ...
import os
os.getcwd()

'/Users/hwan/Desktop/Homepage/study_24spring'

pd.read_csv('pandas.csv')

	S.No	Name	Age	City	Salary
0	1	Tom	28	Toronto	20000
1	2	Lee	32	HongKong	3000
2	3	Steven	43	Bay Area	8300
3	4	Ram	38	Hyderabad	3900

pd.read_csv('pandas.csv',sep=',')

	S.No	Name	Age	City	Salary
0	1	Tom	28	Toronto	20000
1	2	Lee	32	HongKong	3000
2	3	Steven	43	Bay Area	8300
3	4	Ram	38	Hyderabad	3900

pd.read_csv('pandas.csv',index_col=0)

	Name	Age	City	Salary
S.No
1	Tom	28	Toronto	20000
2	Lee	32	HongKong	3000
3	Steven	43	Bay Area	8300
4	Ram	38	Hyderabad	3900

pd.read_csv('pandas.csv',index_col='Name')

	S.No	Age	City	Salary
Name
Tom	1	28	Toronto	20000
Lee	2	32	HongKong	3000
Steven	3	43	Bay Area	8300
Ram	4	38	Hyderabad	3900

pd.read_csv('pandas.csv',header=None)

	0	1	2	3	4
0	S.No	Name	Age	City	Salary
1	1	Tom	28	Toronto	20000
2	2	Lee	32	HongKong	3000
3	3	Steven	43	Bay Area	8300
4	4	Ram	38	Hyderabad	3900

pd.read_csv('pandas.csv',names=[0,1,2,3,4])

	0	1	2	3	4
0	S.No	Name	Age	City	Salary
1	1	Tom	28	Toronto	20000
2	2	Lee	32	HongKong	3000
3	3	Steven	43	Bay Area	8300
4	4	Ram	38	Hyderabad	3900

pd.read_csv('pandas.csv',names=[1,2,3,4])

	1	2	3	4
S.No	Name	Age	City	Salary
1	Tom	28	Toronto	20000
2	Lee	32	HongKong	3000
3	Steven	43	Bay Area	8300
4	Ram	38	Hyderabad	3900

pd.read_csv('pandas.csv',dtype={'Salary':np.float64})

	S.No	Name	Age	City	Salary
0	1	Tom	28	Toronto	20000.0
1	2	Lee	32	HongKong	3000.0
2	3	Steven	43	Bay Area	8300.0
3	4	Ram	38	Hyderabad	3900.0

pd.read_csv('pandas.csv',names=['a','b','c','d','e'])

	a	b	c	d	e
0	S.No	Name	Age	City	Salary
1	1	Tom	28	Toronto	20000
2	2	Lee	32	HongKong	3000
3	3	Steven	43	Bay Area	8300
4	4	Ram	38	Hyderabad	3900

pd.read_csv('pandas.csv',names=['a','b','c','d'],header=0,index_col=0)

	a	b	c	d
1	Tom	28	Toronto	20000
2	Lee	32	HongKong	3000
3	Steven	43	Bay Area	8300
4	Ram	38	Hyderabad	3900

pd.read_csv('pandas.csv',names=['a','b','c','d'],skiprows=1,index_col=0)

	a	b	c	d
1	Tom	28	Toronto	20000
2	Lee	32	HongKong	3000
3	Steven	43	Bay Area	8300
4	Ram	38	Hyderabad	3900

pd.read_csv('pandas.csv',na_values=['Tom'])

	S.No	Name	Age	City	Salary
0	1	NaN	28	Toronto	20000
1	2	Lee	32	HongKong	3000
2	3	Steven	43	Bay Area	8300
3	4	Ram	38	Hyderabad	3900

6.0.0.2 Binary Operations

A = pd.DataFrame(np.random.randint(0, 20, (2, 2)), columns=list('AB'))
B = pd.DataFrame(np.random.randint(0, 10, (3, 3)), columns=list('BAC'))
print( A, B, sep="\n\n")

A+B

	A	B	C
0	9.0	2.0	NaN
1	24.0	5.0	NaN
2	NaN	NaN	NaN

A.add(B,fill_value=0)

	A	B	C
0	9.0	2.0	9.0
1	24.0	5.0	2.0
2	1.0	4.0	0.0

B-B.loc[0,:]

	B	A	C
0	0	0	0
1	3	-3	-7
2	3	-8	-9

B-B.loc[0:0,:]

	B	A	C
0	0.0	0.0	0.0
1	NaN	NaN	NaN
2	NaN	NaN	NaN

B.sub(B.loc[0,:])

	B	A	C
0	0	0	0
1	3	-3	-7
2	3	-8	-9

B.sub(B['A'],axis=0)

	B	C
0	-8	0
1	-2	-4
2	3	-1

6.0.0.3 Pandas DataFrame Manipulation

import numpy as np
import pandas as pd
df = pd.DataFrame(np.random.rand(7, 5))
df

	0	1	2	3	4
0	0.254541	0.022793	0.292160	0.256052	0.742068
1	0.959650	0.210256	0.937732	0.523141	0.660096
2	0.694372	0.674311	0.502167	0.164087	0.527422
3	0.591313	0.952835	0.800817	0.357809	0.029770
4	0.004058	0.930224	0.908414	0.104988	0.372187
5	0.062482	0.480021	0.220693	0.820707	0.578013
6	0.678025	0.481827	0.727163	0.119174	0.252283

df[[0,1]]

	0	1
0	0.254541	0.022793
1	0.959650	0.210256
2	0.694372	0.674311
3	0.591313	0.952835
4	0.004058	0.930224
5	0.062482	0.480021
6	0.678025	0.481827