[Notes] scikit-learn - GLM
All examples are adapted from the official scikit-learn documentation.
OLS LM
# import
from sklearn import datasets
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import train_test_split
from sklearn import linear_model
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
# load the example dataset
diabetes = datasets.load_diabetes()
X = pd.DataFrame(diabetes.data[:, 2])  # keep only one column (a single feature)
y = pd.DataFrame(diabetes.target)
# split the dataset
X['y'] = y
train, valid = train_test_split(X, train_size=0.8)
train.shape, valid.shape  # check the shapes
x_train = train.drop(columns=['y'])
y_train = train.y
x_val = valid.drop(columns=['y'])
y_val = valid.y
# create and fit the model
lm = linear_model.LinearRegression()
lm.fit(x_train, y_train)
# predictions on the validation set
predicted = lm.predict(x_val)
# scoring, model selection
mean_squared_error(y_val, predicted)
r2_score(y_val, predicted)
# visualize the fit
plt.scatter(x_val, y_val, color="black")
plt.plot(x_val, predicted, color="blue", linewidth=3)
plt.show()
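cross_val_predict is imported above but never used; as a small sketch (the cv=10 choice is mine), it produces out-of-fold predictions without the manual split:
# sketch: 10-fold cross-validated predictions instead of a single train/valid split
features = X.drop(columns=['y'])
target = X['y']
predicted_cv = cross_val_predict(linear_model.LinearRegression(), features, target, cv=10)
r2_score(target, predicted_cv)  # R^2 computed on the out-of-fold predictions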
Ridge Regression
Ridge regression adds an L2 penalty on the coefficients to the OLS objective: min_w ||Xw - y||^2 + alpha * ||w||^2.
As with OLS, the solution is still found by (penalized) least squares and has a closed form.
from sklearn import linear_model
reg = linear_model.Ridge(alpha=0.5)  # alpha: strength of the penalty term
reg.fit([[0, 0], [0, 0], [1, 1]], [0, .1, 1])
reg.coef_
# array([0.34545455, 0.34545455])
reg.intercept_
# 0.13636...
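As a sketch of the "still solved like OLS" point, the same numbers can be recovered from the ridge closed-form solution, centering the data by hand because the intercept is not penalized (roughly what Ridge does internally with fit_intercept=True):
# sketch: closed-form ridge solution w = (X'X + alpha*I)^-1 X'y on centered data
import numpy as np
X_r = np.array([[0., 0.], [0., 0.], [1., 1.]])
y_r = np.array([0., .1, 1.])
Xc = X_r - X_r.mean(axis=0)           # center the features
yc = y_r - y_r.mean()
w = np.linalg.solve(Xc.T @ Xc + 0.5 * np.eye(2), Xc.T @ yc)
w                                     # ~[0.34545455, 0.34545455], matches reg.coef_
y_r.mean() - X_r.mean(axis=0) @ w     # ~0.13636, matches reg.intercept_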
RidgeCV
RidgeCV cross-validates over every candidate alpha and picks the best-fitting value. The scoring
parameter specifies the metric used when selecting the model.
from sklearn import linear_model
# candidate alphas: 0.05, 0.1, 0.15, ..., 1.0
reg = linear_model.RidgeCV(
    alphas=[i * 0.05 for i in range(1, 21)],
    scoring="r2"
)
reg.fit([[0, 0], [0, 0], [1, 1]], [0, .1, 1])
reg.alpha_
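As a sketch, the same alpha search can be run on the diabetes split from the OLS LM section above (x_train/y_train/x_val/y_val reuse the variables defined there):
# sketch: choose alpha by CV on the training data, then score on the validation split
reg_cv = linear_model.RidgeCV(alphas=[i * 0.05 for i in range(1, 21)], scoring="r2")
reg_cv.fit(x_train, y_train)
reg_cv.alpha_               # selected penalty strength
reg_cv.score(x_val, y_val)  # R^2 on the held-out data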
Feature Importance
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_classification
from sklearn.ensemble import ExtraTreesClassifier
# Build a classification task using 3 informative features
X, y = make_classification(n_samples=1000,
                           n_features=10,
                           n_informative=3,
                           n_redundant=0,
                           n_repeated=0,
                           n_classes=2,
                           random_state=0,
                           shuffle=False)
# Build a forest and compute the feature importances
forest = ExtraTreesClassifier(n_estimators=250,
                              random_state=0)
forest.fit(X, y)
importances = forest.feature_importances_
std = np.std([tree.feature_importances_ for tree in forest.estimators_],
             axis=0)
indices = np.argsort(importances)[::-1]
# Print the feature ranking
print("Feature ranking:")
for f in range(X.shape[1]):
    print("%d. feature %d (%f)" % (f + 1, indices[f], importances[indices[f]]))
# Plot the feature importances of the forest
plt.figure()
plt.title("Feature importances")
plt.bar(range(X.shape[1]), importances[indices],
        color="r", yerr=std[indices], align="center")
plt.xticks(range(X.shape[1]), indices)
plt.xlim([-1, X.shape[1]])
plt.show()
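The printed ranking can also be read as a table; a small sketch using pandas with the arrays already computed above:
# sketch: ranked importances with the per-tree standard deviation used for the error bars
import pandas as pd
ranking = pd.DataFrame({'feature': indices,
                        'importance': importances[indices],
                        'std': std[indices]})
print(ranking)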
Model Selection
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
# X_train, y_train, X_test, y_test are assumed to be an already prepared train/test split
logreg = LogisticRegression()
logreg.fit(X_train, y_train)
Y_pred = logreg.predict(X_test)
acc_log = round(logreg.score(X_test, y_test) * 100, 2)
print(acc_log)
linear_svc = LinearSVC()
linear_svc.fit(X_train, y_train)
acc_linear_svc = round(linear_svc.score(X_test, y_test) * 100, 2)
print(acc_linear_svc)
bdt = AdaBoostClassifier(DecisionTreeClassifier(max_depth=1),
                         algorithm="SAMME",
                         n_estimators=200)
bdt.fit(X_train, y_train)
acc_bdt = round(bdt.score(X_test, y_test) * 100, 2)
print(acc_bdt)
clf_gb = GradientBoostingClassifier(n_estimators=100,
                                    max_depth=1,
                                    random_state=10)
clf_gb.fit(X_train, y_train)
acc_clf_gb = round(clf_gb.score(X_test, y_test) * 100, 2)
print(acc_clf_gb)
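To actually pick a model from the four accuracies above, they can be collected into a table and sorted; a small sketch (the model names are just labels I chose):
# sketch: compare the classifiers by test accuracy
import pandas as pd
scores = pd.DataFrame({
    'model': ['LogisticRegression', 'LinearSVC', 'AdaBoost (SAMME)', 'GradientBoosting'],
    'accuracy': [acc_log, acc_linear_svc, acc_bdt, acc_clf_gb]
})
print(scores.sort_values(by='accuracy', ascending=False))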