[Kaggle][문제]강아지 품종 분류(dog-breed-identification)

[Kaggle][문제]강아지 품종 분류(dog-breed-identification) (tistory.com)

위 게시글을 보고 공부했습니다.

1. 데이터 읽어오기

import pandas as pd

# Load the label table and the sample submission.
# NOTE(review): both calls use the same 'File Path' placeholder; presumably
# they should point at two different CSV files -- confirm before running.
label = pd.read_csv('File Path')
sample_submission = pd.read_csv('File Path')

# Inspect the 'breed' column: how many distinct breeds there are, and which.
breeds = label['breed'].unique()
print(len(breeds))
breeds

# Store the image path that corresponds to each id in the 'imgpath' column.
import os

filePath = 'File Path'


def f(x):
    """Return the path of the '.jpg' image whose id is *x* under ``filePath``."""
    # os.path.join adds the separator when filePath has no trailing one;
    # plain concatenation would silently yield '<dir><id>.jpg' in that case.
    return os.path.join(filePath, x + '.jpg')


label['imgpath'] = label['id'].apply(f)
label.head()

# Read each image listed in 'imgpath' (load_img), convert it to an array
# (img_to_array) and keep the result in the 'imgArray' column.
from keras.preprocessing.image import img_to_array, load_img, ImageDataGenerator


def ff(x):
    """Load the image at path *x* with target_size (100, 100) as an array."""
    resized = load_img(x, target_size=(100, 100))
    return img_to_array(resized)


# Sanity check on the first image: print the shape of the converted array.
img = ff(label['imgpath'][0])
print(img.shape)

# Convert every image; each cell of 'imgArray' holds one array.
label['imgArray'] = label['imgpath'].apply(ff)
label['imgArray'][0].shape

label.head()

# Only a random subset of the data is used.

print(label.shape)
import numpy as np
import pandas as pd

# Number of rows to keep. It is capped at the table length below so that a
# smaller dataset does not make sample() raise.
SAMPLE_SIZE = 3000

# DataFrame.sample draws rows at random without replacement and keeps the
# column names, so the frame does not have to be rebuilt from a NumPy object
# array with a hard-coded column list. reset_index restores a 0..n-1 index,
# which the positional look-ups further down rely on.
# NOTE(review): no random_state is set, so the subset differs between runs
# (the same was true of the previous np.random.permutation call).
label = label.sample(n=min(SAMPLE_SIZE, len(label))).reset_index(drop=True)
label.head()

2. 데이터 전처리

#2. Data preprocessing
# X: divide every image array by 255 (presumably scaling 8-bit pixel values
# into [0, 1] -- confirm against the loader). The result is kept as a Series
# of per-image arrays because the split step below slices it as a Series.
X_train = label['imgArray'] / 255

# Y: one-hot encode the target label ('breed').
# to_categorical is imported directly from keras.utils: the np_utils
# submodule is not available in newer Keras releases, while this import path
# works in both old and new ones.
from keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
breed_codes = le.fit_transform(label['breed'])
# num_classes is stated explicitly so the one-hot width is tied to the
# encoder's class list rather than inferred from the largest code.
Y_train = to_categorical(breed_codes, num_classes=len(le.classes_))
Y_train[0]

# Split into a training part and a validation part.
# The first TRAIN_SIZE rows are used for training, the remainder for validation.
TRAIN_SIZE = 2500
x_train, y_train = X_train[:TRAIN_SIZE], Y_train[:TRAIN_SIZE]
x_valid, y_valid = X_train[TRAIN_SIZE:], Y_train[TRAIN_SIZE:]

# Each Series element is one image array; np.stack joins them along a new
# leading axis into a single (n_images, ...) array. This avoids converting
# every pixel to a Python float via .tolist() and keeps the arrays' own dtype
# instead of upcasting to float64.
# np.stack needs at least one image per split and raises ValueError if the
# image shapes differ.
x_train = np.stack(x_train.tolist())
x_valid = np.stack(x_valid.tolist())

3. 모델 설계&모델 컴파일

# Convolutional neural network definition.
from keras.models import *
from keras.layers import *

# The width of the softmax layer follows the one-hot labels instead of a
# hard-coded 120: the labels are built from a random subset, so the number of
# encoded classes is whatever that subset contains, and the model must match
# it for categorical_crossentropy to work.
num_classes = y_train.shape[1]

model = Sequential()

# Three Conv -> MaxPool -> Dropout stages with 32, 64 and 128 filters.
# Only the first layer declares the input shape (100x100 images, 3 channels).
model.add(Conv2D(32, kernel_size=(3, 3), input_shape=(100, 100, 3), activation='relu'))
model.add(MaxPooling2D(pool_size=2))
model.add(Dropout(0.3))
for filters in (64, 128):
    model.add(Conv2D(filters, kernel_size=(3, 3), activation='relu'))
    model.add(MaxPooling2D(pool_size=2))
    model.add(Dropout(0.3))

# Classifier head: flatten, one hidden dense layer, softmax over the classes.
model.add(Flatten())
model.add(Dense(256, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(num_classes, activation='softmax'))

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

4. 학습

import os
from keras.callbacks import *

# Checkpoint settings.
MODEL_DIR = './model/'
# makedirs(exist_ok=True) replaces the exists()/mkdir() pair: it does not fail
# when the directory already exists and also creates missing parents.
os.makedirs(MODEL_DIR, exist_ok=True)

# Checkpoint file name pattern: epoch number and validation loss.
modelpath = os.path.join(MODEL_DIR, "{epoch:02d}-{val_loss:.4f}.hdf5")
# Save the model whenever val_loss improves.
checkpointer = ModelCheckpoint(filepath=modelpath,
                               monitor='val_loss', verbose=1,
                               save_best_only=True)
# Stop when val_loss has not improved for 10 consecutive epochs.
# NOTE(review): with epochs=8 below this patience can never be reached;
# raise epochs or lower patience if early stopping is actually wanted.
early_stopping_callback = EarlyStopping(monitor='val_loss', patience=10)

# Train the model. The callbacks were previously created but never handed to
# fit(), so no checkpoint was written and early stopping was inactive.
history = model.fit(x_train, y_train, validation_data=(x_valid, y_valid),
                    epochs=8,
                    callbacks=[early_stopping_callback, checkpointer])

5. 검증

import matplotlib.pyplot as plt

# Print the accuracy measured on the held-out (x_valid, y_valid) split.
# evaluate() returns the loss followed by the compiled metrics; index 1 is
# the accuracy.
scores = model.evaluate(x_valid, y_valid)
print("\n Test Accuracy: %.4f" % (scores[1]))

# Per-epoch losses recorded by fit(): held-out split and training split.
y_vloss = history.history['val_loss']
y_loss = history.history['loss']

# One x position per recorded epoch.
x_len = np.arange(len(y_loss))

# Draw both curves: held-out loss in red, training loss in blue.
curves = (
    (y_vloss, "red", 'Testset_loss'),
    (y_loss, "blue", 'Trainset_loss'),
)
for values, colour, caption in curves:
    plt.plot(x_len, values, marker='.', c=colour, label=caption)

# Add the legend, a grid and the axis labels, then show the figure.
plt.legend(loc='lower left')
plt.grid()
plt.xlabel('epoch')
plt.ylabel('loss')
plt.show()