import numpy as np
import pandas as pd
1 빅데이터와 금융자료 분석 CH1
# Toy dataset with missing values: 'f2' has two gaps (rows 1 and 4) and
# 'f3' is a categorical label.
missdict = {
    'f1': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
    'f2': [10., None, 20., 30., None, 50., 60., 70., 80., 90.],
    'f3': ['A', 'A', 'A', 'A', 'B', 'B', 'B', 'B', 'C', 'C'],
}
missdata = pd.DataFrame(missdict)
# Print the dtype and non-null count of every column.
missdata.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 3 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 f1 10 non-null int64
1 f2 8 non-null float64
2 f3 10 non-null object
dtypes: float64(1), int64(1), object(1)
memory usage: 368.0+ bytes
# Share of missing values in each column (mean of the boolean NaN mask).
missdata.isna().mean()
f1 0.0
f2 0.2
f3 0.0
dtype: float64
# Listwise deletion: drop every row that contains at least one missing value.
tmpdata1 = missdata.dropna()
tmpdata1
f1 | f2 | f3 | |
---|---|---|---|
0 | 1 | 10.0 | A |
2 | 3 | 20.0 | A |
3 | 4 | 30.0 | A |
5 | 6 | 50.0 | B |
6 | 7 | 60.0 | B |
7 | 8 | 70.0 | B |
8 | 9 | 80.0 | C |
9 | 10 | 90.0 | C |
# Drop a row only when 'f3' is missing; NaNs in other columns are kept.
tmpdata2 = missdata.dropna(subset=['f3'])
tmpdata2
f1 | f2 | f3 | |
---|---|---|---|
0 | 1 | 10.0 | A |
1 | 2 | NaN | A |
2 | 3 | 20.0 | A |
3 | 4 | 30.0 | A |
4 | 5 | NaN | B |
5 | 6 | 50.0 | B |
6 | 7 | 60.0 | B |
7 | 8 | 70.0 | B |
8 | 9 | 80.0 | C |
9 | 10 | 90.0 | C |
# Keep only the numeric columns, then fill the gaps with a sentinel value.
numdata = missdata.select_dtypes(include=['int64', 'float64'])
# inplace=False returns a filled copy; numdata itself still holds the NaNs.
tmpdata3 = numdata.fillna(-999, inplace=False)
# describe() treats the -999 sentinel as real data, so the summary
# statistics of 'f2' are distorted by it.
tmpdata3.describe()
f1 | f2 | |
---|---|---|
count | 10.00000 | 10.000000 |
mean | 5.50000 | -158.800000 |
std | 3.02765 | 443.562297 |
min | 1.00000 | -999.000000 |
25% | 3.25000 | 12.500000 |
50% | 5.50000 | 40.000000 |
75% | 7.75000 | 67.500000 |
max | 10.00000 | 90.000000 |
# Column means; NaN entries are skipped, so 'f2' averages its observed values only.
numdata.mean()
f1 5.50
f2 51.25
dtype: float64
# Mean imputation: every NaN is replaced by the mean of its own column.
tmpdata4 = numdata.fillna(numdata.mean(), inplace=False)
tmpdata4
f1 | f2 | |
---|---|---|
0 | 1 | 10.00 |
1 | 2 | 51.25 |
2 | 3 | 20.00 |
3 | 4 | 30.00 |
4 | 5 | 51.25 |
5 | 6 | 50.00 |
6 | 7 | 60.00 |
7 | 8 | 70.00 |
8 | 9 | 80.00 |
9 | 10 | 90.00 |
# Mean of 'f2' within each 'f3' group (one value per group).
missdata.groupby('f3')['f2'].mean()
f3
A 20.0
B 60.0
C 85.0
Name: f2, dtype: float64
# Same group means, broadcast back to the original row index so that each
# row carries the mean of its own 'f3' group.
missdata.groupby('f3')['f2'].transform('mean')
0 20.0
1 20.0
2 20.0
3 20.0
4 60.0
5 60.0
6 60.0
7 60.0
8 85.0
9 85.0
Name: f2, dtype: float64
# Group-mean imputation: fill each missing 'f2' with the mean 'f2' of the
# row's 'f3' group.
tmpdata5 = numdata.copy()
# Assign the filled column back instead of calling
# tmpdata5['f2'].fillna(..., inplace=True): that form is chained assignment
# on an intermediate object, which pandas warns about and which no longer
# updates the parent frame from pandas 3.0 onward.
tmpdata5['f2'] = tmpdata5['f2'].fillna(
    missdata.groupby('f3')['f2'].transform('mean')
)
tmpdata5
/var/folders/n2/jbh_0_091bx8qgz7j87t2qwc0000gp/T/ipykernel_25894/622840210.py:2: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.
For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.
tmpdata5['f2'].fillna( missdata.groupby('f3')['f2'].transform('mean'),inplace=True)
f1 | f2 | |
---|---|---|
0 | 1 | 10.0 |
1 | 2 | 20.0 |
2 | 3 | 20.0 |
3 | 4 | 30.0 |
4 | 5 | 60.0 |
5 | 6 | 50.0 |
6 | 7 | 60.0 |
7 | 8 | 70.0 |
8 | 9 | 80.0 |
9 | 10 | 90.0 |
# Training set for regression imputation: only the fully observed rows.
missdata_tr = missdata.dropna()
# Predictor is kept two-dimensional (DataFrame); the target is a Series.
x_tr = missdata_tr[['f1']]
y_tr = missdata_tr['f2']
from sklearn.linear_model import LinearRegression

# Fit f2 ~ f1 on the complete rows.
model = LinearRegression()
model.fit(x_tr, y_tr)
# Rows that contain at least one missing value are the ones to be predicted.
missdata_ts = missdata[missdata.isnull().any(axis=1)]
x_ts = missdata_ts[['f1']]

predicted_values = model.predict(x_ts)

# Write the predictions into the missing 'f2' cells of a copy.
# NOTE(review): this relies on 'f2' being the only column with NaNs, so the
# rows selected above line up one-to-one with the 'f2' null mask below.
tmpdata6 = missdata.copy()
tmpdata6.loc[tmpdata6['f2'].isnull(), 'f2'] = predicted_values
tmpdata6
f1 | f2 | f3 | |
---|---|---|---|
0 | 1 | 10.000000 | A |
1 | 2 | 14.191176 | A |
2 | 3 | 20.000000 | A |
3 | 4 | 30.000000 | A |
4 | 5 | 41.985294 | B |
5 | 6 | 50.000000 | B |
6 | 7 | 60.000000 | B |
7 | 8 | 70.000000 | B |
8 | 9 | 80.000000 | C |
9 | 10 | 90.000000 | C |
# Numeric copy of the data: encode the 'f3' labels as integers so that every
# column can be passed to a numeric imputer.
missdata_num = missdata.copy()
missdata_num['f3'] = missdata_num['f3'].map({'A': 1, 'B': 2, 'C': 3})
missdata_num
f1 | f2 | f3 | |
---|---|---|---|
0 | 1 | 10.0 | 1 |
1 | 2 | NaN | 1 |
2 | 3 | 20.0 | 1 |
3 | 4 | 30.0 | 1 |
4 | 5 | NaN | 2 |
5 | 6 | 50.0 | 2 |
6 | 7 | 60.0 | 2 |
7 | 8 | 70.0 | 2 |
8 | 9 | 80.0 | 3 |
9 | 10 | 90.0 | 3 |
from sklearn.impute import KNNImputer

# k-nearest-neighbour imputation using the 2 closest rows.
imputer = KNNImputer(n_neighbors=2)
# fit_transform returns a plain NumPy array, so the column names are lost
# when it is wrapped in a DataFrame for display.
tmpdata7 = imputer.fit_transform(missdata_num)
pd.DataFrame(tmpdata7)
0 | 1 | 2 | |
---|---|---|---|
0 | 1.0 | 10.0 | 1.0 |
1 | 2.0 | 15.0 | 1.0 |
2 | 3.0 | 20.0 | 1.0 |
3 | 4.0 | 30.0 | 1.0 |
4 | 5.0 | 40.0 | 2.0 |
5 | 6.0 | 50.0 | 2.0 |
6 | 7.0 | 60.0 | 2.0 |
7 | 8.0 | 70.0 | 2.0 |
8 | 9.0 | 80.0 | 3.0 |
9 | 10.0 | 90.0 | 3.0 |
# Toy dataset for outlier detection; each column contains a few values far
# from the bulk of the data (e.g. 712 and 1003 in 'A', 3905 in 'B').
outdict = {
    'A': [10, 0.02, 0.3, 40, 50, 60, 712, 80, 90, 1003],
    'B': [0.05, 0.00015, 25, 35, 45, 205, 65, 75, 85, 3905],
}
outdata = pd.DataFrame(outdict)
# IQR rule, computed per column: a value is flagged when it lies more than
# 1.5 * IQR below the first quartile or above the third quartile.
Q1 = outdata.quantile(0.25)
Q3 = outdata.quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Cell-wise boolean mask of flagged values.
((outdata < lower_bound) | (outdata > upper_bound))
A | B | |
---|---|---|
0 | False | False |
1 | False | False |
2 | False | False |
3 | False | False |
4 | False | False |
5 | False | True |
6 | True | False |
7 | False | False |
8 | False | False |
9 | True | True |
# A row counts as an outlier when any of its columns violates the IQR bounds.
outliers = ((outdata < lower_bound) | (outdata > upper_bound)).any(axis=1)
outliersdata = outdata[outliers]
outliersdata
A | B | |
---|---|---|
5 | 60.0 | 205.0 |
6 | 712.0 | 65.0 |
9 | 1003.0 | 3905.0 |
# Column-wise z-scores: subtract the column mean, divide by the column std.
standardizeddata = (outdata - outdata.mean()) / outdata.std()
standardizeddata
A | B | |
---|---|---|
0 | -0.552206 | -0.364647 |
1 | -0.580536 | -0.364688 |
2 | -0.579741 | -0.344154 |
3 | -0.467047 | -0.335940 |
4 | -0.438661 | -0.327727 |
5 | -0.410274 | -0.196309 |
6 | 1.440519 | -0.311300 |
7 | -0.353501 | -0.303086 |
8 | -0.325115 | -0.294872 |
9 | 2.266563 | 2.842723 |
# Z-score rule: flag a row when any column has |z| > 3.
outliers2 = ((standardizeddata < -3) | (standardizeddata > 3)).any(axis=1)
outliersdata2 = outdata[outliers2]
outliersdata2
A | B |
---|
import matplotlib.pyplot as plt

# Synthetic 2-D data: two tight Gaussian clusters (the same noise shifted by
# +2 and by -2) plus 20 uniformly scattered points on [-4, 4) x [-4, 4).
np.random.seed(42)
X_inliers = 0.3 * np.random.randn(100, 2)
X_outliers = np.random.uniform(low=-4, high=4, size=(20, 2))
X = np.r_[X_inliers + 2, X_inliers - 2, X_outliers]

plt.figure(figsize=(5, 4))
plt.scatter(X[:, 0], X[:, 1], color='k', s=20)
from sklearn.neighbors import LocalOutlierFactor

# Local Outlier Factor with 20 neighbours; contamination=0.1 sets the
# expected share of outliers.
clf = LocalOutlierFactor(n_neighbors=20, contamination=0.1)
y_pred = clf.fit_predict(X)  # 1: inlier, -1: outlier
outlier_mask = y_pred == -1

# Plot all points in blue, then overdraw the flagged points in red.
plt.figure(figsize=(5, 4))
plt.scatter(X[:, 0], X[:, 1], color='b', s=20, label='Inliers')
plt.scatter(X[outlier_mask, 0], X[outlier_mask, 1], color='r', s=50,
            label='Outliers')
plt.xlabel("Feature 1")
plt.ylabel("Feature 2")
plt.legend()
from sklearn.ensemble import IsolationForest

clf2 = IsolationForest(contamination=0.1)
# contamination : proportion of outliers
# n_estimators  : number of trees (default 100)
# max_features  : number of features drawn for each tree (default 1.0)
clf2.fit(X)
y_pred2 = clf2.predict(X)  # 1: inlier, -1: outlier
outlier_mask2 = y_pred2 == -1

# Plot all points in blue, then overdraw the flagged points in red.
plt.figure(figsize=(5, 4))
plt.scatter(X[:, 0], X[:, 1], color='b', s=20, label='Inliers')
plt.scatter(X[outlier_mask2, 0], X[outlier_mask2, 1], color='r', s=50,
            label='Outliers')
plt.xlabel("Feature 1")
plt.ylabel("Feature 2")
plt.legend()
# Per-sample anomaly score from the fitted forest, one value per row of X.
# NOTE(review): per scikit-learn, lower (more negative) scores are more
# abnormal — confirm against the installed version's documentation.
clf2.score_samples(X)
array([-0.39567479, -0.43973362, -0.38576701, -0.4625295 , -0.39169087,
-0.39353101, -0.44632054, -0.4539587 , -0.39410011, -0.42823486,
-0.43802322, -0.4231585 , -0.38600184, -0.40324911, -0.38778406,
-0.46849881, -0.4075734 , -0.43420431, -0.45140279, -0.4113938 ,
-0.40036102, -0.38335266, -0.42666089, -0.41104579, -0.44476313,
-0.38744281, -0.39942914, -0.44257912, -0.38938554, -0.40470075,
-0.39006555, -0.42881353, -0.44275864, -0.40154039, -0.39729665,
-0.42434279, -0.42743206, -0.57699503, -0.38628797, -0.45454533,
-0.3864026 , -0.44513991, -0.39598029, -0.41231495, -0.39270763,
-0.40568073, -0.39005843, -0.43210962, -0.38601781, -0.38485125,
-0.41991455, -0.40181793, -0.38788591, -0.48400217, -0.38538727,
-0.47693547, -0.50815903, -0.39115267, -0.40423505, -0.43216634,
-0.4254748 , -0.48475901, -0.4890737 , -0.40357175, -0.39439224,
-0.42406868, -0.40171635, -0.44870204, -0.38761922, -0.43170548,
-0.42364173, -0.43593615, -0.39560581, -0.4391784 , -0.39543452,
-0.38550106, -0.38910014, -0.39558501, -0.48693408, -0.41865632,
-0.40964165, -0.43730378, -0.41716448, -0.47893783, -0.39791987,
-0.40492088, -0.38431516, -0.39544321, -0.42208103, -0.53522817,
-0.41839783, -0.40171635, -0.39362168, -0.39333773, -0.44128807,
-0.40208128, -0.40907841, -0.39096216, -0.38929625, -0.40645206,
-0.38953796, -0.43147334, -0.38691831, -0.45292849, -0.39365031,
-0.40064735, -0.46513506, -0.48209563, -0.39635657, -0.44569517,
-0.4496465 , -0.43243444, -0.39406682, -0.40625773, -0.39826724,
-0.46462545, -0.40782229, -0.42572527, -0.46599338, -0.42852596,
-0.40082304, -0.38971614, -0.4628486 , -0.4219109 , -0.46365237,
-0.39237271, -0.40320941, -0.42432754, -0.40309339, -0.40204058,
-0.39044555, -0.43501511, -0.4324525 , -0.40503508, -0.40259674,
-0.42956023, -0.42799201, -0.56257644, -0.38875652, -0.47585014,
-0.3835507 , -0.4556408 , -0.406217 , -0.40385118, -0.39458774,
-0.40122018, -0.40401747, -0.4443972 , -0.38320086, -0.3834212 ,
-0.44945534, -0.4060406 , -0.38476589, -0.46817324, -0.38466101,
-0.47610349, -0.49275615, -0.38344594, -0.40974845, -0.42395885,
-0.42189161, -0.49261883, -0.48518689, -0.40901991, -0.39452123,
-0.44785724, -0.40375287, -0.45749968, -0.40496115, -0.4260838 ,
-0.41810669, -0.44560803, -0.38806155, -0.45523273, -0.38817405,
-0.38177641, -0.39724261, -0.40279204, -0.46677259, -0.42162856,
-0.4086809 , -0.43961327, -0.40404401, -0.45862325, -0.40473907,
-0.41509113, -0.38081908, -0.39036169, -0.42441162, -0.52130968,
-0.41557892, -0.40347146, -0.39318444, -0.38921027, -0.46342999,
-0.39756969, -0.41357841, -0.38429305, -0.40297782, -0.40881293,
-0.60259641, -0.44411547, -0.57052254, -0.50247903, -0.73212838,
-0.64786135, -0.55623141, -0.45375541, -0.68754218, -0.67833274,
-0.70698798, -0.63422967, -0.64031155, -0.78183101, -0.65477657,
-0.67178443, -0.67084008, -0.68580391, -0.71446705, -0.64861428])