"""Degree-2 polynomial-feature linear regression on the perch dataset.

Loads three perch features (length, height, width) from a CSV, expands
them into polynomial features, fits a linear model, and prints the
test-set R^2 score.
"""
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression

# Bug fix: the URL was wrapped in '<...>' (markdown paste artifact),
# which made pd.read_csv fail to resolve it.
df = pd.read_csv('http://bit.ly/perch_csv_data')
perch_full = df.to_numpy()
# print(perch_full)

# The target data is short, so it is kept inline as-is.
perch_weight = np.array([5.9, 32.0, 40.0, 51.5, 70.0, 100.0, 78.0, 80.0, 85.0, 85.0, 110.0,
                         115.0, 125.0, 130.0, 120.0, 120.0, 130.0, 135.0, 110.0, 130.0,
                         150.0, 145.0, 150.0, 170.0, 225.0, 145.0, 188.0, 180.0, 197.0,
                         218.0, 300.0, 260.0, 265.0, 250.0, 250.0, 300.0, 320.0, 514.0,
                         556.0, 840.0, 685.0, 700.0, 700.0, 690.0, 900.0, 650.0, 820.0,
                         850.0, 900.0, 1015.0, 820.0, 1100.0, 1000.0, 1100.0, 1000.0,
                         1000.0])

train_input, test_input, train_target, test_target = train_test_split(
    perch_full, perch_weight, random_state=42)

# Default degree=2; include_bias=False drops the constant column because
# LinearRegression fits its own intercept term.
poly = PolynomialFeatures(include_bias=False)
poly.fit(train_input)
train_poly = poly.transform(train_input)
# print(train_poly.shape)
# -> (42, 9)
# Shows which combination of inputs produced each feature
# (NOTE(review): deprecated in newer sklearn — use get_feature_names_out()):
# print(poly.get_feature_names())

# Transform the test set with the transformer fitted on the training set.
test_poly = poly.transform(test_input)

lr = LinearRegression()
lr.fit(train_poly, train_target)
# print(lr.score(train_poly, train_target))
print(lr.score(test_poly, test_target))
# -> train: 0.9903183436982125
# -> test:  0.971455991159415
# Training accuracy is higher than test accuracy and no underfitting occurs.
# (Higher degrees add cubic, quartic, ... terms — see the next section.)
"""Degree-5 polynomial-feature linear regression: an overfitting demo.

Same pipeline as the degree-2 version, but with degree=5 the feature
count explodes and the linear model overfits the training set.
"""
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression

# Bug fix: the URL was wrapped in '<...>' (markdown paste artifact),
# which made pd.read_csv fail to resolve it.
df = pd.read_csv('http://bit.ly/perch_csv_data')
perch_full = df.to_numpy()
# print(perch_full)

# The target data is short, so it is kept inline as-is.
perch_weight = np.array([5.9, 32.0, 40.0, 51.5, 70.0, 100.0, 78.0, 80.0, 85.0, 85.0, 110.0,
                         115.0, 125.0, 130.0, 120.0, 120.0, 130.0, 135.0, 110.0, 130.0,
                         150.0, 145.0, 150.0, 170.0, 225.0, 145.0, 188.0, 180.0, 197.0,
                         218.0, 300.0, 260.0, 265.0, 250.0, 250.0, 300.0, 320.0, 514.0,
                         556.0, 840.0, 685.0, 700.0, 700.0, 690.0, 900.0, 650.0, 820.0,
                         850.0, 900.0, 1015.0, 820.0, 1100.0, 1000.0, 1100.0, 1000.0,
                         1000.0])

train_input, test_input, train_target, test_target = train_test_split(
    perch_full, perch_weight, random_state=42)

# degree=5 generates far more feature combinations than degree=2.
poly = PolynomialFeatures(degree=5, include_bias=False)
poly.fit(train_input)
train_poly = poly.transform(train_input)
test_poly = poly.transform(test_input)

lr = LinearRegression()
lr.fit(train_poly, train_target)
# print(lr.score(train_poly, train_target))
# -> 0.9999999999938116
# print(lr.score(test_poly, test_target))
# Greatly increasing the number of features makes the linear model extremely
# powerful — it learns the training set almost perfectly — but it overfits
# the training set, so it produces a terrible score on the test set.
# -> To fix this problem, reduce the number of features (or regularize).