따릉이 | Notion

https://github.com/BambooStreet/MachineLearning/blob/main/따릉이.ipynb
↑↑↑깃헙이 더 보기 편해요
## 라이브러리 로딩

import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestRegressor 
from sklearn.model_selection import GridSearchCV
from sklearn import metrics

## 데이터 로딩 및 확인

# Load the training set, the test set, and the submission template
# from the current working directory.
train, test, submission = (
    pd.read_csv(csv_name)
    for csv_name in ('train.csv', 'test.csv', 'submission.csv')
)

# Column dtypes and non-null counts of the training set.
train.info()

# Same summary for the test set.
test.info()

# First rows of the training set.
train.head()

# First rows of the test set.
test.head()

## 결측치 확인 및 처리

# Number of missing values per column in the training set.
train.isnull().sum()

# Number of missing values per column in the test set.
test.isnull().sum()

# Summary statistics of the training columns.
train.describe()

결측치 처리를 위해 데이터를 합쳐준다.

# Stack train on top of test so missing values can be handled on both at once.
data = pd.concat(objs=[train, test], axis=0)
data

합쳐진 데이터의 결측치를 평균값으로 채워준다.  
이진 분류된 피처는 제외한다.

def fill_nan(df, column):
    """Fill missing values of ``df[column]`` with that column's mean.

    The frame is modified in place (the column is reassigned); nothing is
    returned.
    """
    series = df[column]
    column_mean = series.mean()
    df[column] = series.fillna(column_mean)

# Mean-fill every continuous column; the binary precipitation flag is
# deliberately not in this list.
for mean_filled_column in (
    'hour_bef_temperature',
    'hour_bef_windspeed',
    'hour_bef_humidity',
    'hour_bef_visibility',
    'hour_bef_ozone',
    'hour_bef_pm10',
    'hour_bef_pm2.5',
):
    fill_nan(data, mean_filled_column)

이진 분류된 피처인 hour_bef_precipitation은 평균값으로 결측치를 채우면 안된다.  
따라서 상태를 살펴보자

# Show the rows whose precipitation flag is missing.
data[data['hour_bef_precipitation'].isna()]

습도가 52%이고, 비 정보는 대부분 비가 오지 않는 경우이다.  
hour_bef_precipitation 결측치는 0으로 처리해도 무방할 것 같다.

# Treat an unknown precipitation flag as 0.
data['hour_bef_precipitation'] = data['hour_bef_precipitation'].fillna(0)

# Re-count the missing values per column after filling.
data.isna().sum()

결측치 처리가 완료되었다. 학습을 위해 다시 데이터를 분리해준다.

# Rows that carry a target value are training rows; rows without one are test rows.
train = data[data['count'].notna()]
test = data[data['count'].isna()]

## 상관관계 분석

import matplotlib.pyplot as plt
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Pairwise correlation matrix of the training columns.
train.corr()

# Draw the same matrix as an annotated heatmap on a 12x12-inch figure.
plt.figure(figsize = (12, 12))
sns.heatmap(train.corr(), annot = True)

상관계수의 절댓값이 1에 가까울수록 관계가 깊고, 0에 가까울수록 관계가 적다.  
상관성이 있다고 생각되는 데이터를 중점으로 살펴보았다.

# Flag column pairs whose correlation magnitude is at least 0.3. The absolute
# value is compared so that strong negative correlations are flagged as well
# instead of being reported as False.
train.corr().abs() >= 0.3

hour_bef_pm10, hour_bef_pm2.5(1시간 전 미세먼지 데이터)는 자전거 대여량과 큰 상관관계를 보이지 않는 것 같다.  
hour_bef_visibility(1시간 전 시정, 시계) 또한 0.3으로 상관관계가 약한 것으로 파악된다.  
추후 학습 데이터에서 제거한다.

## 분포도

수치형 데이터의 집계 값을 나타낸다.

# Set the default font size to 15, then plot the distribution of the target.
plt.rc('font',size=15)
sns.displot(train['count'])

타겟 값의 분포가 0 근처에 몰려있고 분포가 왼쪽으로 많이 편향되어 있다.  
회귀모델이 좋은 성능을 내려면 정규분포를 따라야 하는데, 현재 타깃값 count는 정규분포를 따르지 않는다.   
로그변환을 통해 왼쪽으로 쏠린 데이터를 바꿔준다.

# Distribution of the log-transformed target.
# NOTE(review): np.log gives -inf for a zero count; confirm count is always positive.
sns.displot(np.log(train['count']))

추후에 지수변환을 통해 타깃값인 count로 복원해야 한다.

### 막대그래프

# Distribution of the precipitation flag.
sns.displot(train['hour_bef_precipitation'])

# Training rows where the precipitation flag equals 1.
rain = train[train['hour_bef_precipitation']==1]

# Summary statistics of count for those rows...
rain.describe()['count']

# ...next to the same statistics over all training rows.
train.describe()['count']

## 회귀선을 포함한 산점도 그래프

수치형 데이터가 대부분이므로 산점도 그래프로 시각화 해보았다.

# Prepare a 4x2 grid of subplots on a 10x20-inch figure.
plt.rc('font', size=9)
figure, axes = plt.subplots(nrows=4, ncols=2)
figure.set_size_inches(10, 20)

# One scatter plot with a fitted regression line per numeric feature against
# count: temperature, wind speed, humidity, visibility, ozone, PM10, PM2.5.
# axes.flat walks the grid row by row, so the panel order is
# [0,0], [0,1], [1,0], [1,1], [2,0], [2,1], [3,0].
regplot_features = [
    'hour_bef_temperature',
    'hour_bef_windspeed',
    'hour_bef_humidity',
    'hour_bef_visibility',
    'hour_bef_ozone',
    'hour_bef_pm10',
    'hour_bef_pm2.5',
]
for regplot_feature, regplot_ax in zip(regplot_features, axes.flat):
    sns.regplot(x=regplot_feature, y='count', data=train, ax=regplot_ax,
                scatter_kws={'alpha': 0.3}, line_kws={'color': 'g'})

# Seven features on eight axes: hide the panel that stays empty.
for unused_ax in axes.flat[len(regplot_features):]:
    unused_ax.set_visible(False)

# Adjust spacing only after the figure has been sized and drawn, so the
# layout accounts for the real axis labels.
figure.tight_layout()

결측치가 많이 존재한 이유도 있겠지만 미세먼지의 회귀 그래프는 상관관계가 적어 보인다. 추후 제거할 것을 생각해보자.

## 정리

1. 타깃값 변환
* 분포도 확인 결과 타깃값 count가 0근처로 치우쳐 있으므로 로그변환해 정규분포에 가깝게 만들어야 한다. 타깃값을 count가 아닌 log(count)로 변환해 사용할 것이므로 마지막에 다시 지수변환해 count로 복원해야 한다.  

2. 피처 제거
* id 피처는 학습에 관계 없으므로 제거한다.
* 미세먼지 피처와 시야 피처는 타겟값과 큰 상관관계가 없고 결측치 비율도 높으므로 제거한다.

## 모델 정의 및 학습

* 모델 : RandomForestRegressor 
* 성능 개선
 * 피처 엔지니어링 : 앞의 분석 수준에서 모든 모델에서 동일하게 수행
 * 하이퍼파라미터 최적화 : grid search
* 기타: 타깃값이 count가 아닌 log(count)임

#### 데이터 합치기

# Recombine the cleaned train and test rows so features are dropped from both together.
data = pd.concat(objs=[train, test], axis=0)
data

#### 필요없는 피처 제거

# Columns excluded from training: the row id, both fine-dust readings, and visibility.
# (An earlier variant kept visibility: ['id','hour_bef_pm10','hour_bef_pm2.5'].)
drop_features = ['id', 'hour_bef_pm10', 'hour_bef_pm2.5', 'hour_bef_visibility']

data = data.drop(columns=drop_features)

data

#### 데이터 나누기

# Split back into train/test rows by whether the target is present.
has_target = data['count'].notna()

# Target values, read from the same rows that form X_train so features and
# target cannot drift apart if another frame is stale or reordered.
y = data.loc[has_target, 'count']

# Feature matrices: the same rows without the target column.
X_train = data[has_target].drop(columns=['count'])
X_test = data[~has_target].drop(columns=['count'])

#### 데이터 스케일링

from sklearn.preprocessing import MinMaxScaler

# Columns rescaled with min-max scaling. The list is no longer called `ord`,
# which shadowed the builtin of the same name.
scale_columns = ['hour', 'hour_bef_temperature', 'hour_bef_precipitation',
                 'hour_bef_windspeed', 'hour_bef_humidity', 'hour_bef_ozone']

scaler = MinMaxScaler()
# Learn the min/max from the training rows only, then apply the same
# transformation to the test rows.
X_train[scale_columns] = scaler.fit_transform(X_train[scale_columns])
X_test[scale_columns] = scaler.transform(X_test[scale_columns])

# First rows of the scaled training features.
X_train.head()

# First rows of the scaled test features.
X_test.head()

## 모델 훈련 및 저장

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn import metrics

# Base model; its hyperparameters come from the grid below.
randomforest_model = RandomForestRegressor()

# Grid-search candidates. random_state has a single value, so it only pins
# the seed and does not add combinations.
rf_params = {'random_state': [42],
             'n_estimators': [100, 120],
             'max_depth': [32, 64],
             'min_samples_leaf': [1, 6],
             'min_samples_split': [2, 8]}
gridsearch_random_forest_model = GridSearchCV(estimator=randomforest_model,
                                              param_grid=rf_params,
                                              cv=5)

# Run the search against the log-transformed target.
# NOTE(review): np.log requires count > 0; confirm there are no zero counts.
log_y = np.log(y)
gridsearch_random_forest_model.fit(X_train, log_y)
print('최적의 하이퍼 파라미터', gridsearch_random_forest_model.best_params_)
# No `scoring` is given, so best_score_ is the regressor's default score (R^2)
# averaged over the folds. It is not an accuracy, so the label says R2.
print('GridSearchCV 최고 R2 점수:{0:.4f}'.format(gridsearch_random_forest_model.best_score_))

# Predict log(count) for the test rows with the best estimator found by the search.
best_forest = gridsearch_random_forest_model.best_estimator_
randomforest_preds = best_forest.predict(X_test)
# Undo the log transform to get back to the count scale.
submission['count'] = np.exp(randomforest_preds)
submission

# Write the submission file without the index column.
submission.to_csv(path_or_buf='0926.csv', index=False)
# 46.453 (presumably the score obtained with this submission; not verifiable from this file)