"""Degree-2 polynomial-feature linear regression on the perch dataset.

Loads three perch features (length, height, width) from a CSV, expands
them into polynomial features, fits a linear model, and prints the
test-set R^2 score.
"""
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression

# Bug fix: the URL was wrapped in '<...>' (markdown paste artifact),
# which made pd.read_csv fail to resolve it.
df = pd.read_csv('http://bit.ly/perch_csv_data')
perch_full = df.to_numpy()
# print(perch_full)

# The target data is short, so it is kept inline as-is.
perch_weight = np.array([5.9, 32.0, 40.0, 51.5, 70.0, 100.0, 78.0, 80.0, 85.0, 85.0, 110.0,
                         115.0, 125.0, 130.0, 120.0, 120.0, 130.0, 135.0, 110.0, 130.0,
                         150.0, 145.0, 150.0, 170.0, 225.0, 145.0, 188.0, 180.0, 197.0,
                         218.0, 300.0, 260.0, 265.0, 250.0, 250.0, 300.0, 320.0, 514.0,
                         556.0, 840.0, 685.0, 700.0, 700.0, 690.0, 900.0, 650.0, 820.0,
                         850.0, 900.0, 1015.0, 820.0, 1100.0, 1000.0, 1100.0, 1000.0,
                         1000.0])

train_input, test_input, train_target, test_target = train_test_split(
    perch_full, perch_weight, random_state=42)

# Default degree=2; include_bias=False drops the constant column because
# LinearRegression fits its own intercept term.
poly = PolynomialFeatures(include_bias=False)
poly.fit(train_input)
train_poly = poly.transform(train_input)
# print(train_poly.shape)
# -> (42, 9)
# Shows which combination of inputs produced each feature
# (NOTE(review): deprecated in newer sklearn — use get_feature_names_out()):
# print(poly.get_feature_names())

# Transform the test set with the transformer fitted on the training set.
test_poly = poly.transform(test_input)

lr = LinearRegression()
lr.fit(train_poly, train_target)
# print(lr.score(train_poly, train_target))
print(lr.score(test_poly, test_target))
# -> train: 0.9903183436982125
# -> test:  0.971455991159415
# Training accuracy is higher than test accuracy and no underfitting occurs.
# (Higher degrees add cubic, quartic, ... terms — see the next section.)
"""Degree-5 polynomial-feature linear regression: an overfitting demo.

Same pipeline as the degree-2 version, but with degree=5 the feature
count explodes and the linear model overfits the training set.
"""
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression

# Bug fix: the URL was wrapped in '<...>' (markdown paste artifact),
# which made pd.read_csv fail to resolve it.
df = pd.read_csv('http://bit.ly/perch_csv_data')
perch_full = df.to_numpy()
# print(perch_full)

# The target data is short, so it is kept inline as-is.
perch_weight = np.array([5.9, 32.0, 40.0, 51.5, 70.0, 100.0, 78.0, 80.0, 85.0, 85.0, 110.0,
                         115.0, 125.0, 130.0, 120.0, 120.0, 130.0, 135.0, 110.0, 130.0,
                         150.0, 145.0, 150.0, 170.0, 225.0, 145.0, 188.0, 180.0, 197.0,
                         218.0, 300.0, 260.0, 265.0, 250.0, 250.0, 300.0, 320.0, 514.0,
                         556.0, 840.0, 685.0, 700.0, 700.0, 690.0, 900.0, 650.0, 820.0,
                         850.0, 900.0, 1015.0, 820.0, 1100.0, 1000.0, 1100.0, 1000.0,
                         1000.0])

train_input, test_input, train_target, test_target = train_test_split(
    perch_full, perch_weight, random_state=42)

# degree=5 generates far more feature combinations than degree=2.
poly = PolynomialFeatures(degree=5, include_bias=False)
poly.fit(train_input)
train_poly = poly.transform(train_input)
test_poly = poly.transform(test_input)

lr = LinearRegression()
lr.fit(train_poly, train_target)
# print(lr.score(train_poly, train_target))
# -> 0.9999999999938116
# print(lr.score(test_poly, test_target))
# Greatly increasing the number of features makes the linear model extremely
# powerful — it learns the training set almost perfectly — but it overfits
# the training set, so it produces a terrible score on the test set.
# -> To fix this problem, reduce the number of features (or regularize).