이승제

박서진

home-credit-modeling (1).ipynb

#모델 훈련
from torch.utils.data import DataLoader, TensorDataset
# Seed torch's global RNG first so the runs below are repeatable.
torch.manual_seed(42)

# Training hyper-parameters, plus the list that collects one loss per epoch.
num_epochs = 10
batch_size = 64
loss_history = []

# Label column, and every remaining column except the identifier/date ones
# dropped here, as the feature frame.
y = data["target"]
X = data.drop(columns=["target", "case_id", "MONTH"])

# float32 tensor copies of the pandas data.
X_train_tensor = torch.tensor(X.values, dtype=torch.float32)
y_train_tensor = torch.tensor(y.values, dtype=torch.float32)

# Pair features with labels and iterate them in shuffled mini-batches.
train_ds = TensorDataset(X_train_tensor, y_train_tensor)
train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True)

# Train `model` for `num_epochs` epochs over `train_dl`.
#
# Every batch carries WEEK_NUM as one of its feature columns. It is split off
# before the forward pass (the model is fed the remaining columns) and kept so
# that a WEEK_NUM / score / target table can be handed to `gini_stability`.
#
# Per epoch this records the mean loss in `loss_history`, prints accuracy and
# loss, and leaves the epoch's stability metric in `score`.
#
# Assumptions to confirm against the model / loss definitions:
#   - `model` returns one probability per row, shaped (N,) or (N, 1);
#   - `loss_fn` uses mean reduction (the torch default).

# Position of WEEK_NUM among the columns of X (and therefore of x_batch).
# NOTE(review): hard-coded; verify it still matches X.columns if features change.
week_col = 3

for epoch in range(num_epochs):
    epoch_loss = 0.0
    accuracy_hist_train = 0.0

    # Rows collected over the whole epoch for the stability metric. The metric
    # is evaluated once per epoch instead of on each 64-row batch, whose
    # result was discarded anyway.
    epoch_weeks, epoch_scores, epoch_targets = [], [], []

    for x_batch, y_batch in train_dl:
        # x_batch contains WEEK_NUM; x_batch_not_week is the same batch
        # without that column.
        week = x_batch[:, week_col]
        x_batch_not_week = torch.cat(
            (x_batch[:, :week_col], x_batch[:, week_col + 1:]), dim=1
        )
        y_batch = y_batch.float()

        # Flatten so predictions line up element-wise with the 1-D targets.
        pred = model(x_batch_not_week).reshape(-1)

        loss = loss_fn(pred, y_batch)
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        # Weight the batch loss by its row count: the running sum is divided
        # by the dataset size below, not by the number of batches.
        n_rows = y_batch.size(0)
        epoch_loss += loss.item() * n_rows

        # Binary accuracy: round the probability to 0/1 and compare with the
        # label (argmax over dim 1 only makes sense for multi-class outputs).
        scores = pred.detach()
        is_correct = (scores.round() == y_batch).float()
        accuracy_hist_train += is_correct.sum().item()

        # Keep the un-rounded score; presumably gini_stability ranks scores
        # (AUC-style), which hard 0/1 labels would flatten - TODO confirm.
        epoch_weeks.append(week)
        epoch_scores.append(scores)
        epoch_targets.append(y_batch)

    epoch_loss /= len(train_dl.dataset)
    loss_history.append(epoch_loss)
    accuracy_hist_train /= len(train_dl.dataset)

    # One row per training sample, with the three columns passed to
    # gini_stability.
    df_base = pd.DataFrame({
        'WEEK_NUM': torch.cat(epoch_weeks).cpu().numpy(),
        'score': torch.cat(epoch_scores).cpu().numpy(),
        'target': torch.cat(epoch_targets).cpu().numpy(),
    })
    # gini score for this epoch
    score = gini_stability(df_base)

    print(f'epoch {epoch}  accuracy {accuracy_hist_train:.4f}  loss {epoch_loss:.4f}')
# Build folds and validate: 5-fold StratifiedGroupKFold with LightGBM.
#
# Folds are stratified on `y` and grouped by `weeks`. Each fold fits a fresh
# LGBMClassifier, scores the validation rows and appends the fold's
# gini_stability value to `score_folds`.
#
# This runs at top level, after the training loop, rather than inside the
# epoch loop: nested there it would repeat the whole CV every epoch and
# replace `model` with an LGBMClassifier before the next epoch's forward pass.
# NOTE(review): `model` is still rebound here; anything that needs the network
# trained above must use it before this section runs.
cv = StratifiedGroupKFold(n_splits=5)
for train_idx, valid_idx in cv.split(X, y, groups=weeks):

    X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
    X_valid, y_valid = X.iloc[valid_idx], y.iloc[valid_idx]

    # WEEK_NUM values of the validation rows.
    week = weeks.iloc[valid_idx]

    # Create and fit the model for this fold.
    model = lgb.LGBMClassifier()
    model.fit(X_train, y_train)

    # Probability of the positive class; used as the score so the metric sees
    # a continuous ranking rather than hard 0/1 labels.
    pred = model.predict_proba(X_valid)[:, 1]

    # Three-column frame for gini_stability. Built explicitly instead of
    # concatenating onto X_valid, so no feature column can collide with
    # WEEK_NUM / score / target.
    df_base = pd.DataFrame(
        {
            'WEEK_NUM': week.to_numpy(),
            'score': pred,
            'target': y_valid.to_numpy(),
        },
        index=X_valid.index,
    )
    score = gini_stability(df_base)

    # Store the fold result.
    score_folds.append(score)
        
        
from catboost import Pool

# Cross-validate a CatBoost classifier over the folds produced by `cv`.
#
# For each fold: cast the CATS columns to "category", fit a
# CatBoostClassifier with the validation pool as eval set, then append the
# fold's gini_stability value to `gini_score` and its ROC AUC to
# `cv_scores_xgb`.
for idx_train, idx_valid in cv.split(X, y, groups=weeks):

    # Copy the fold slices so the dtype conversion below writes to fold-local
    # frames instead of to slices of X.
    X_train, y_train = X.iloc[idx_train].copy(), y.iloc[idx_train]
    X_valid, y_valid = X.iloc[idx_valid].copy(), y.iloc[idx_valid]

    X_train[CATS] = X_train[CATS].astype("category")
    X_valid[CATS] = X_valid[CATS].astype("category")

    # Pools hold every feature column, with CATS flagged as categorical, so
    # the model is trained on the same columns it is later asked to score.
    # NOTE(review): previously only the CATS columns were placed in the pools
    # while prediction received all columns - confirm all features are wanted.
    pool_train = Pool(X_train, y_train, cat_features=CATS)
    pool_valid = Pool(X_valid, y_valid, cat_features=CATS)

    # CatBoost model
    model_1 = cat.CatBoostClassifier(eval_metric='AUC')
    model_1.fit(pool_train, eval_set=pool_valid, verbose=300)

    # Positive-class probability, predicted from the same pool used for eval.
    y_pred_valid = model_1.predict_proba(pool_valid)[:, 1]

    # Three-column frame for gini_stability; each column appears exactly once
    # and the feature frame is left unmodified.
    base = pd.DataFrame(
        {
            'WEEK_NUM': weeks.iloc[idx_valid].to_numpy(),
            'score': y_pred_valid,
            'target': y_valid.to_numpy(),
        },
        index=X_valid.index,
    )

    score = gini_stability(base)
    gini_score.append(score)

    auc_score = roc_auc_score(y_valid, y_pred_valid)
    cv_scores_xgb.append(auc_score)