1  빅데이터와 금융자료 분석 CH1

import numpy as np
import pandas as pd
# Toy dataset for the missing-value examples: 'f1' is a complete integer
# column, 'f2' is a float column with two gaps, 'f3' is a group label.
missdict = {
    'f1': list(range(1, 11)),
    'f2': [10.0, None, 20.0, 30.0, None, 50.0, 60.0, 70.0, 80.0, 90.0],
    'f3': list('AAAABBBBCC'),
}
missdata = pd.DataFrame(missdict)
# Print dtypes and non-null counts to see which columns contain gaps.
missdata.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   f1      10 non-null     int64  
 1   f2      8 non-null      float64
 2   f3      10 non-null     object 
dtypes: float64(1), int64(1), object(1)
memory usage: 368.0+ bytes
# Share of missing entries per column (mean of the boolean "is missing" mask).
missdata.isnull().mean()
f1    0.0
f2    0.2
f3    0.0
dtype: float64
# Listwise deletion: keep only the rows with no missing value in any column.
tmpdata1 = missdata.dropna(axis=0, how='any')
tmpdata1
f1 f2 f3
0 1 10.0 A
2 3 20.0 A
3 4 30.0 A
5 6 50.0 B
6 7 60.0 B
7 8 70.0 B
8 9 80.0 C
9 10 90.0 C
# Drop a row only when 'f3' is missing; gaps in the other columns are kept.
tmpdata2 = missdata.loc[missdata['f3'].notna()]
tmpdata2
f1 f2 f3
0 1 10.0 A
1 2 NaN A
2 3 20.0 A
3 4 30.0 A
4 5 NaN B
5 6 50.0 B
6 7 60.0 B
7 8 70.0 B
8 9 80.0 C
9 10 90.0 C
# Restrict to the int64/float64 columns, then replace every missing value
# with the sentinel -999.
numdata = missdata.select_dtypes(include=['int64', 'float64'])
tmpdata3 = numdata.fillna(value=-999)
# Summary statistics of the sentinel-filled copy.
tmpdata3.describe()
f1 f2
count 10.00000 10.000000
mean 5.50000 -158.800000
std 3.02765 443.562297
min 1.00000 -999.000000
25% 3.25000 12.500000
50% 5.50000 40.000000
75% 7.75000 67.500000
max 10.00000 90.000000
# Per-column means of the numeric data.
numdata.mean(axis=0)
f1     5.50
f2    51.25
dtype: float64
# Mean imputation: each missing value is replaced by the mean of its column.
tmpdata4 = numdata.fillna(value=numdata.mean())
tmpdata4
f1 f2
0 1 10.00
1 2 51.25
2 3 20.00
3 4 30.00
4 5 51.25
5 6 50.00
6 7 60.00
7 8 70.00
8 9 80.00
9 10 90.00
# Mean of 'f2' within each 'f3' group (one value per group).
missdata.groupby('f3')['f2'].agg('mean')
f3
A    20.0
B    60.0
C    85.0
Name: f2, dtype: float64
# Same group means, but broadcast back to one value per original row.
missdata['f2'].groupby(missdata['f3']).transform('mean')
0    20.0
1    20.0
2    20.0
3    20.0
4    60.0
5    60.0
6    60.0
7    60.0
8    85.0
9    85.0
Name: f2, dtype: float64
# Group-mean imputation: fill each missing 'f2' with the mean of 'f2' inside
# the row's 'f3' group.
tmpdata5 = numdata.copy()
# Assign the filled column back instead of calling fillna(inplace=True) on
# tmpdata5['f2']: the chained in-place form operates on an intermediate object,
# raises a FutureWarning, and stops updating tmpdata5 from pandas 3.0.
tmpdata5['f2'] = tmpdata5['f2'].fillna(
    missdata.groupby('f3')['f2'].transform('mean')
)
tmpdata5
/var/folders/n2/jbh_0_091bx8qgz7j87t2qwc0000gp/T/ipykernel_25894/622840210.py:2: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  tmpdata5['f2'].fillna( missdata.groupby('f3')['f2'].transform('mean'),inplace=True)
f1 f2
0 1 10.0
1 2 20.0
2 3 20.0
3 4 30.0
4 5 60.0
5 6 50.0
6 7 60.0
7 8 70.0
8 9 80.0
9 10 90.0
# Regression imputation: learn f2 as a linear function of f1 on the complete
# rows, then predict f2 for the rows where it is missing.
missdata_tr = missdata.dropna()
x_tr = missdata_tr[['f1']]
y_tr = missdata_tr['f2']

from sklearn.linear_model import LinearRegression
model = LinearRegression()
model.fit( x_tr, y_tr )

# Select the rows to predict by the target column itself. Selecting rows with a
# gap in *any* column would yield more predictions than there are missing 'f2'
# cells as soon as another column has a gap, and the assignment below would
# fail on the length mismatch.
missdata_ts = missdata[ missdata['f2'].isnull() ]
x_ts = missdata_ts[['f1']]

predicted_values = model.predict( x_ts )
tmpdata6 = missdata.copy()
# Same 'f2 is missing' condition as above, so predictions and target cells
# line up one-to-one in row order.
tmpdata6.loc[ tmpdata6['f2'].isnull(), 'f2'] = predicted_values
tmpdata6
f1 f2 f3
0 1 10.000000 A
1 2 14.191176 A
2 3 20.000000 A
3 4 30.000000 A
4 5 41.985294 B
5 6 50.000000 B
6 7 60.000000 B
7 8 70.000000 B
8 9 80.000000 C
9 10 90.000000 C
# Numeric copy of the data: the 'f3' labels A/B/C are encoded as 1/2/3 so that
# every column is numeric.
missdata_num = missdata.assign(f3=missdata['f3'].map({'A': 1, 'B': 2, 'C': 3}))
missdata_num
f1 f2 f3
0 1 10.0 1
1 2 NaN 1
2 3 20.0 1
3 4 30.0 1
4 5 NaN 2
5 6 50.0 2
6 7 60.0 2
7 8 70.0 2
8 9 80.0 3
9 10 90.0 3
from sklearn.impute import KNNImputer
# KNN imputation with n_neighbors=2 on the all-numeric frame.
imputer = KNNImputer(n_neighbors=2)
tmpdata7 = imputer.fit(missdata_num).transform(missdata_num)
# Wrap the imputed values in a DataFrame for display; no column names are
# passed, so the columns get default integer labels.
pd.DataFrame(tmpdata7)
0 1 2
0 1.0 10.0 1.0
1 2.0 15.0 1.0
2 3.0 20.0 1.0
3 4.0 30.0 1.0
4 5.0 40.0 2.0
5 6.0 50.0 2.0
6 7.0 60.0 2.0
7 8.0 70.0 2.0
8 9.0 80.0 3.0
9 10.0 90.0 3.0
# Toy dataset for the outlier examples: two columns with a few extreme values.
outdict = {
    'A': [10, 0.02, 0.3, 40, 50, 60, 712, 80, 90, 1003],
    'B': [0.05, 0.00015, 25, 35, 45, 205, 65, 75, 85, 3905],
}
outdata = pd.DataFrame(outdict)

# IQR rule, computed per column: anything further than 1.5 * IQR outside the
# first/third quartile is flagged.
Q1, Q3 = (outdata.quantile(q) for q in (0.25, 0.75))
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Cell-wise flags: True where a value lies outside its column's fences.
outdata.lt(lower_bound) | outdata.gt(upper_bound)
A B
0 False False
1 False False
2 False False
3 False False
4 False False
5 False True
6 True False
7 False False
8 False False
9 True True
# Row-level flag: a row is an outlier if at least one of its columns falls
# outside that column's IQR fences.
outliers = (outdata.lt(lower_bound) | outdata.gt(upper_bound)).any(axis=1)
outliersdata = outdata.loc[outliers]
outliersdata
A B
5 60.0 205.0
6 712.0 65.0
9 1003.0 3905.0
# Z-score each column: subtract the column mean, divide by the column std.
standardizeddata = outdata.sub(outdata.mean()).div(outdata.std())
standardizeddata
A B
0 -0.552206 -0.364647
1 -0.580536 -0.364688
2 -0.579741 -0.344154
3 -0.467047 -0.335940
4 -0.438661 -0.327727
5 -0.410274 -0.196309
6 1.440519 -0.311300
7 -0.353501 -0.303086
8 -0.325115 -0.294872
9 2.266563 2.842723
# 3-sigma rule: flag a row when any of its z-scores is beyond +/-3.
outliers2 = (standardizeddata.abs() > 3).any(axis=1)
outliersdata2 = outdata.loc[outliers2]
outliersdata2
A B
import matplotlib.pyplot as plt
# Synthetic 2-D data: two tight Gaussian clusters centred at (2, 2) and
# (-2, -2), plus 20 points scattered uniformly over [-4, 4] x [-4, 4].
np.random.seed(42)
X_inliers = 0.3 * np.random.standard_normal(size=(100, 2))
X_outliers = np.random.uniform(-4, 4, size=(20, 2))
# Stack row-wise: shifted cluster, mirrored cluster, then the uniform points.
X = np.vstack((X_inliers + 2, X_inliers - 2, X_outliers))

# Plot the raw point cloud before any detector is applied.
plt.figure(figsize=(5, 4))
plt.scatter(*X.T, s=20, color='k')

from sklearn.neighbors import LocalOutlierFactor
# Local Outlier Factor with 20 neighbours and contamination set to 0.1.
clf = LocalOutlierFactor(contamination=0.1, n_neighbors=20)
y_pred = clf.fit_predict(X)  # 1: inlier, -1: outlier
outlier_mask = (y_pred == -1)

# All points are drawn in blue first; the flagged ones are then drawn on top
# in larger red markers.
flagged = X[outlier_mask]
plt.figure(figsize=(5, 4))
plt.scatter(X[:, 0], X[:, 1], s=20, color='b', label='Inliers')
plt.scatter(flagged[:, 0], flagged[:, 1], s=50, color='r', label='Outliers')
plt.xlabel("Feature 1")
plt.ylabel("Feature 2")
plt.legend()

from sklearn.ensemble import IsolationForest
# contamination : proportion of outliers assumed in the data
# n_estimators  : number of trees (default 100)
# max_features  : number (int) or fraction (float) of features drawn for each
#                 tree (default 1.0, i.e. all features)
clf2 = IsolationForest(contamination=0.1)
y_pred2 = clf2.fit(X).predict(X)  # 1: inlier, -1: outlier
outlier_mask2 = (y_pred2 == -1)

# All points are drawn in blue first; the flagged ones are then drawn on top
# in larger red markers.
flagged2 = X[outlier_mask2]
plt.figure(figsize=(5, 4))
plt.scatter(X[:, 0], X[:, 1], s=20, color='b', label='Inliers')
plt.scatter(flagged2[:, 0], flagged2[:, 1], s=50, color='r', label='Outliers')
plt.xlabel("Feature 1")
plt.ylabel("Feature 2")
plt.legend()

# Per-sample anomaly scores from the fitted forest; per the scikit-learn
# documentation, the lower the score, the more abnormal the sample.
clf2.score_samples(X)
array([-0.39567479, -0.43973362, -0.38576701, -0.4625295 , -0.39169087,
       -0.39353101, -0.44632054, -0.4539587 , -0.39410011, -0.42823486,
       -0.43802322, -0.4231585 , -0.38600184, -0.40324911, -0.38778406,
       -0.46849881, -0.4075734 , -0.43420431, -0.45140279, -0.4113938 ,
       -0.40036102, -0.38335266, -0.42666089, -0.41104579, -0.44476313,
       -0.38744281, -0.39942914, -0.44257912, -0.38938554, -0.40470075,
       -0.39006555, -0.42881353, -0.44275864, -0.40154039, -0.39729665,
       -0.42434279, -0.42743206, -0.57699503, -0.38628797, -0.45454533,
       -0.3864026 , -0.44513991, -0.39598029, -0.41231495, -0.39270763,
       -0.40568073, -0.39005843, -0.43210962, -0.38601781, -0.38485125,
       -0.41991455, -0.40181793, -0.38788591, -0.48400217, -0.38538727,
       -0.47693547, -0.50815903, -0.39115267, -0.40423505, -0.43216634,
       -0.4254748 , -0.48475901, -0.4890737 , -0.40357175, -0.39439224,
       -0.42406868, -0.40171635, -0.44870204, -0.38761922, -0.43170548,
       -0.42364173, -0.43593615, -0.39560581, -0.4391784 , -0.39543452,
       -0.38550106, -0.38910014, -0.39558501, -0.48693408, -0.41865632,
       -0.40964165, -0.43730378, -0.41716448, -0.47893783, -0.39791987,
       -0.40492088, -0.38431516, -0.39544321, -0.42208103, -0.53522817,
       -0.41839783, -0.40171635, -0.39362168, -0.39333773, -0.44128807,
       -0.40208128, -0.40907841, -0.39096216, -0.38929625, -0.40645206,
       -0.38953796, -0.43147334, -0.38691831, -0.45292849, -0.39365031,
       -0.40064735, -0.46513506, -0.48209563, -0.39635657, -0.44569517,
       -0.4496465 , -0.43243444, -0.39406682, -0.40625773, -0.39826724,
       -0.46462545, -0.40782229, -0.42572527, -0.46599338, -0.42852596,
       -0.40082304, -0.38971614, -0.4628486 , -0.4219109 , -0.46365237,
       -0.39237271, -0.40320941, -0.42432754, -0.40309339, -0.40204058,
       -0.39044555, -0.43501511, -0.4324525 , -0.40503508, -0.40259674,
       -0.42956023, -0.42799201, -0.56257644, -0.38875652, -0.47585014,
       -0.3835507 , -0.4556408 , -0.406217  , -0.40385118, -0.39458774,
       -0.40122018, -0.40401747, -0.4443972 , -0.38320086, -0.3834212 ,
       -0.44945534, -0.4060406 , -0.38476589, -0.46817324, -0.38466101,
       -0.47610349, -0.49275615, -0.38344594, -0.40974845, -0.42395885,
       -0.42189161, -0.49261883, -0.48518689, -0.40901991, -0.39452123,
       -0.44785724, -0.40375287, -0.45749968, -0.40496115, -0.4260838 ,
       -0.41810669, -0.44560803, -0.38806155, -0.45523273, -0.38817405,
       -0.38177641, -0.39724261, -0.40279204, -0.46677259, -0.42162856,
       -0.4086809 , -0.43961327, -0.40404401, -0.45862325, -0.40473907,
       -0.41509113, -0.38081908, -0.39036169, -0.42441162, -0.52130968,
       -0.41557892, -0.40347146, -0.39318444, -0.38921027, -0.46342999,
       -0.39756969, -0.41357841, -0.38429305, -0.40297782, -0.40881293,
       -0.60259641, -0.44411547, -0.57052254, -0.50247903, -0.73212838,
       -0.64786135, -0.55623141, -0.45375541, -0.68754218, -0.67833274,
       -0.70698798, -0.63422967, -0.64031155, -0.78183101, -0.65477657,
       -0.67178443, -0.67084008, -0.68580391, -0.71446705, -0.64861428])