[Notes] scikit-learn - GLM

1 min read

All examples are based on the official scikit-learn documentation.

OLS LM

# import
from sklearn import datasets
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import train_test_split
from sklearn import linear_model
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

# load the example dataset
diabetes = datasets.load_diabetes()
X = pd.DataFrame(diabetes.data[:, 2])  # keep a single feature column only
y = pd.Series(diabetes.target)

# split the dataset
X['y'] = y
train, valid = train_test_split(X, train_size=0.8)
train.shape, valid.shape  # check the shapes

x_train = train.drop(columns=['y'])
y_train = train.y
x_val = valid.drop(columns=['y'])
y_val = valid.y

# create the model and fit it to the training data
lm = linear_model.LinearRegression()
lm.fit(x_train, y_train)

# predictions on the validation set
predicted = lm.predict(x_val)

# scoring, used for model selection
print(mean_squared_error(y_val, predicted))
print(r2_score(y_val, predicted))

# visualization
plt.scatter(x_val, y_val, color="black")
plt.show()
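The scatter plot above only shows the raw validation points. To also overlay the model's predictions (as the official scikit-learn example does with the fitted line), a short addition like the following could be used; it reuses the x_val, y_val, and predicted objects defined above.

# overlay the predictions on the validation scatter plot
plt.scatter(x_val, y_val, color="black", label="observed")
plt.scatter(x_val, predicted, color="blue", label="predicted")
plt.xlabel("feature")
plt.ylabel("target")
plt.legend()
plt.show()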

Ridge Regression

Ridge regression adds a kind of penalty term on the coefficients of the OLS linear model.

The solution is still found in the same least-squares fashion as OLS.

from sklearn import linear_model
reg = linear_model.Ridge(alpha=0.5) # penalty term
reg.fit([[0, 0], [0, 0], [1, 1]], [0, .1, 1])
reg.coef_
# array([ 0.34545455,  0.34545455])
reg.intercept_
# 0.13636...
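Because the penalized objective still has a closed-form, least-squares-style solution, the coefficients above can be reproduced by hand from the penalized normal equations. A minimal numpy sketch, assuming the same toy data and alpha as in the example, and centering X and y first as scikit-learn does when fit_intercept=True:

import numpy as np

X = np.array([[0, 0], [0, 0], [1, 1]], dtype=float)
y = np.array([0, .1, 1])
alpha = 0.5

# center the data so the intercept can be handled separately
Xc = X - X.mean(axis=0)
yc = y - y.mean()

# penalized normal equations: (Xc'Xc + alpha * I) w = Xc' yc
w = np.linalg.solve(Xc.T @ Xc + alpha * np.eye(X.shape[1]), Xc.T @ yc)
intercept = y.mean() - X.mean(axis=0) @ w

print(w)          # ~[0.34545455, 0.34545455], matching reg.coef_
print(intercept)  # ~0.13636, matching reg.intercept_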

RidgeCV

With RidgeCV, a suitable alpha value can be found by cross-validating over every candidate. The metric used to select the model can be passed through the scoring parameter.

from sklearn import linear_model
# 0.05, 0.1, 0.15, ...
reg = linear_model.RidgeCV(
    alphas=[i*0.05 for i in range(1,21)],
    scoring="r2"
)
reg.fit([[0, 0], [0, 0], [1, 1]], [0, .1, 1])
reg.alpha_  # the selected alpha
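What RidgeCV automates can also be written out by hand, for example by cross-validating a plain Ridge model over the same alpha grid with cross_val_score. A rough sketch on the diabetes training data from the OLS section (this uses 5-fold CV, so the chosen alpha need not match RidgeCV's efficient leave-one-out procedure exactly):

from sklearn.model_selection import cross_val_score

alphas = [i * 0.05 for i in range(1, 21)]
scores = []
for a in alphas:
    ridge = linear_model.Ridge(alpha=a)
    # mean r2 over 5 folds for this alpha
    scores.append(cross_val_score(ridge, x_train, y_train, scoring="r2", cv=5).mean())

best_alpha = alphas[int(np.argmax(scores))]
print(best_alpha)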

Feature Importance

import numpy as np
import matplotlib.pyplot as plt

from sklearn.datasets import make_classification
from sklearn.ensemble import ExtraTreesClassifier

# Build a classification task using 3 informative features
X, y = make_classification(n_samples=1000,
                           n_features=10,
                           n_informative=3,
                           n_redundant=0,
                           n_repeated=0,
                           n_classes=2,
                           random_state=0,
                           shuffle=False)

# Build a forest and compute the feature importances
forest = ExtraTreesClassifier(n_estimators=250,
                              random_state=0)

forest.fit(X, y)
importances = forest.feature_importances_
std = np.std([tree.feature_importances_ for tree in forest.estimators_],
             axis=0)
indices = np.argsort(importances)[::-1]

# Print the feature ranking
print("Feature ranking:")

for f in range(X.shape[1]):
    print("%d. feature %d (%f)" % (f + 1, indices[f], importances[indices[f]]))

# Plot the feature importances of the forest
plt.figure()
plt.title("Feature importances")
plt.bar(range(X.shape[1]), importances[indices],
        color="r", yerr=std[indices], align="center")
plt.xticks(range(X.shape[1]), indices)
plt.xlim([-1, X.shape[1]])
plt.show()

Model Selection

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier

# reuse the X, y created with make_classification in the feature-importance example above
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

logreg = LogisticRegression()
logreg.fit(X_train, y_train)
Y_pred = logreg.predict(X_test)
acc_log = round(logreg.score(X_test, y_test) * 100, 2)
print(acc_log)

linear_svc = LinearSVC()
linear_svc.fit(X_train, y_train)
acc_linear_svc = round(linear_svc.score(X_test, y_test) * 100, 2)
print(acc_linear_svc)

bdt = AdaBoostClassifier(DecisionTreeClassifier(max_depth=1),
                         algorithm="SAMME",
                         n_estimators=200)
bdt.fit(X_train, y_train)
acc_bdt = round(bdt.score(X_test, y_test) * 100, 2)
print(acc_bdt)

clf_gb = GradientBoostingClassifier(n_estimators=100, 
                                    max_depth=1, 
                                    random_state=10)
clf_gb.fit(X_train, y_train)
acc_clf_gb = round(clf_gb.score(X_test, y_test) * 100, 2)
print(acc_clf_gb)
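To make the comparison easier to read, the accuracies could be collected into one table, for example with a small pandas DataFrame (pd is already imported at the top; the variable names are the ones defined in this section):

# summarize the test-set accuracies of the candidate models
results = pd.DataFrame({
    "model": ["LogisticRegression", "LinearSVC", "AdaBoost", "GradientBoosting"],
    "accuracy": [acc_log, acc_linear_svc, acc_bdt, acc_clf_gb],
})
print(results.sort_values("accuracy", ascending=False))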
