
[Machine Learning] 2. Dementia Prediction AI (Full Code Preview)

살랑춤춰요 2024. 8. 1. 09:31

- Since I double-majored in electronic engineering and artificial intelligence, I mainly handled the signal processing.

(Note: under data protection law, nothing that could be used to infer the data may be attached, so the EDA part can only be explained up to the graphs.)

 

1. Extracting outliers from the data

- Extracting outliers from the data turned up far too many of them.

- Checking the outliers against the labels, most of them occurred in the normal (CN) group, the mild cognitive impairment (MCI) group had only a few, and the Alzheimer's (AD) group had none.

-> Judgment: a signal that contains outlier values is likely to belong to the normal group, and the fewer outliers a signal has, the more likely the subject has Alzheimer's. (A minimal sketch of this per-group check follows below.)
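Below is a minimal sketch of how such a per-label outlier count can be reproduced (assuming a long-format DataFrame like the df_98 built later in this post, with the channel columns in columns[2:-1] and a 'Category' label; the 1.5*IQR rule here is my assumption, since the actual EDA criterion and plots cannot be shown):

import pandas as pd

def count_outliers_by_label(df, value_cols, label_col='Category'):
    rows = []
    for label, grp in df.groupby(label_col):
        n_out = 0
        for col in value_cols:
            q1, q3 = grp[col].quantile([0.25, 0.75])
            iqr = q3 - q1
            lo, hi = q1 - 1.5 * iqr, q3 + 1.5 * iqr
            n_out += int(((grp[col] < lo) | (grp[col] > hi)).sum())
        rows.append({label_col: label, 'n_outliers': n_out})
    return pd.DataFrame(rows)

# usage, once df_98 exists:
# print(count_outliers_by_label(df_98, df_98.columns[2:-1].tolist()))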

 

2. The analysis and the processing choices will be covered from part 3 onward.

- The code on this page is the baseline for the signal processing and the prediction.

import csv
import os
import math
import pywt
from glob import glob

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm.notebook import tqdm

from scipy.signal import butter, filtfilt
from skimage.restoration import denoise_wavelet
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, \
    QuantileTransformer, PowerTransformer, Normalizer

import warnings
warnings.filterwarnings('ignore')
plt.rcParams["font.family"] = 'NanumGothic'
plt.rcParams['axes.unicode_minus'] = False
 
# Grab every csv file from the directory where the data is located
train_test_set = sorted(glob('GNB/100_new/*.csv'))
validation_set = sorted(glob('GNB/35_new/*.csv'))
 
df_98_ = pd.DataFrame()
category = 0
for i, file in enumerate(train_test_set) :

    user_df = pd.read_csv(file)
    user_df = user_df[user_df.columns[:22]][:]

    lst_730   = user_df.columns[1:8].tolist()
    lst_850   = user_df.columns[8:15].tolist()
    lst_back  = user_df.columns[15:22].tolist()

    for idx in range(7) :

#         user_df[lst_730[idx]] = user_df[lst_730[idx]] - np.mean(user_df[lst_730[idx]]) #  remove DC offset
#         user_df[lst_850[idx]] = user_df[lst_850[idx]] - np.mean(user_df[lst_850[idx]]) #  remove DC offset

        user_df[lst_730[idx]] = user_df[lst_730[idx]]  # + user_df[lst_back[idx]] #+ 300
        user_df[lst_850[idx]] = user_df[lst_850[idx]]  # + user_df[lst_back[idx]] #+ 300

    user_df = user_df[user_df.columns[:15]][:]
    uid  = int(file.split('/')[-1].split('\\')[1].split(".")[0]) # path formats differ between environments, so adjust this parsing to match your own paths.
    user_df.insert(0,'uid', uid)
    if i < 55:
        category = 'CN'
    elif  55<=i<85:
        category = 'MCI'
    elif  85<=i<99:
        category = "AD"
    user_df['Category'] = category

    df_98_ = pd.concat([df_98_,user_df])

uid_lst = list(set(df_98_.uid.unique().tolist()))

df_98 = pd.DataFrame()

for uid in uid_lst :

    df   = df_98_[df_98_.uid==uid]
    df_98 = pd.concat([df_98,df])

df_98 = df_98.reset_index(drop = True)

del df_98_
 
df_35_ = pd.DataFrame() # DataFrame for the test data

for i, file in enumerate(validation_set) :

    user_df = pd.read_csv(file)
    user_df = user_df[user_df.columns[:22]][:]

    lst_730   = user_df.columns[1:8].tolist()
    lst_850   = user_df.columns[8:15].tolist()
    lst_back  = user_df.columns[15:22].tolist()

    for idx in range(7) :

#         user_df[lst_730[idx]] = user_df[lst_730[idx]] - np.mean(user_df[lst_730[idx]]) #  remove DC offset
#         user_df[lst_850[idx]] = user_df[lst_850[idx]] - np.mean(user_df[lst_850[idx]]) #  remove DC offset

        user_df[lst_730[idx]] = user_df[lst_730[idx]]  #+ user_df[lst_back[idx]]# + 250
        user_df[lst_850[idx]] = user_df[lst_850[idx]]  #+ user_df[lst_back[idx]]# + 250
    user_df = user_df[user_df.columns[:15]][:]
    uid  = int(file.split('/')[-1].split('\\')[1].split(".")[0]) # path formats differ between environments, so adjust this parsing to match your own paths.
    user_df.insert(0,'uid', uid)
    if i <= 5:
        category = 'AD'
        user_df['Category'] = category
        df_35_ = pd.concat([df_35_,user_df])
    elif  6<=i<21:
        category = 'CN'
        user_df['Category'] = category
        df_35_ = pd.concat([df_35_,user_df])
    elif  22<=i<=36:
        category = "MCI"
        user_df['Category'] = category
        df_35_ = pd.concat([df_35_,user_df])

#     user_df['Category'] = category
#     df_35_ = pd.concat([df_35_,user_df])

uid_lst = list(set(df_35_.uid.unique().tolist()))

df_35 = pd.DataFrame()

for uid in uid_lst :

    df   = df_35_[df_35_.uid==uid]
    df_35 = pd.concat([df_35,df])

df_35 = df_35.reset_index(drop = True)

del df_35_

def all_section_slice_data(df, ss_1,es_1,ss_2,es_2,ss_3,es_3,ss_4,es_4,ss_5,es_5) :

    user_lst = df.uid.unique()
    new_df   = pd.DataFrame()
    for i in user_lst :
        slice_df = pd.concat([df[df.uid==i][ss_1:es_1],df[df.uid==i][ss_2:es_2],df[df.uid==i][ss_3:es_3]
                              ,df[df.uid==i][ss_4:es_4],df[df.uid==i][ss_5:es_5]])
        new_df   = pd.concat([new_df,slice_df])
    return new_df

ss_1,es_1,ss_2,es_2,ss_3,es_3,ss_4,es_4,ss_5,es_5 = 0,1000,1400,2000,2400,3000,3400,4000,4400,5000

df_35 = all_section_slice_data(df_35,
                               ss_1,es_1,ss_2,es_2,ss_3,es_3,ss_4,es_4,ss_5,es_5).reset_index(drop=True)
 
def lowpassfilter(signal, thresh = 0, wavelet="db8"):
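    # Wavelet shrinkage denoising: decompose with pywt.wavedec, soft-threshold the detail
    # coefficients, and reconstruct; it smooths the signal even though the name says "lowpass".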
    thresh = thresh*np.nanmax(signal)
    coeff = pywt.wavedec(signal, wavelet, mode="per" )
    coeff[1:] = (pywt.threshold(i, value=thresh, mode="soft" ) for i in coeff[1:])
    reconstructed_signal = pywt.waverec(coeff, wavelet, mode="per" )
    if len(signal) != len(reconstructed_signal) :
        reconstructed_signal = reconstructed_signal[1:]

    return reconstructed_signal[:]

wav_df_98 = pd.DataFrame()

for idx in tqdm(df_98.uid.unique()) :

    uid_1 = df_98[df_98.uid==idx]
    col_lst = uid_1.columns[2:-1].tolist()
    print("columns lst : {}".format(col_lst), end='\r')
    for j in col_lst :
        uid_1[j] = lowpassfilter(uid_1[j].values, 0.27)
   
    wav_df_98 = pd.concat([wav_df_98,uid_1])
wav_df_98 = wav_df_98.reset_index(drop=True)
 
# Apply the filter and the wavelet transform
'''
<Why apply the wavelet transform>
The point is not to read off the raw signal values: the EDA showed that the more high-amplitude segments a signal contains, the more likely the subject belongs to the normal (CN) group.
We therefore need to interpret how the signal changes along the time axis, which is why the wavelet transform is applied.
'''
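# (Illustrative only, not part of the pipeline) a quick look at what lowpassfilter() above
# does, on a synthetic signal; the 0.27 threshold is the same value used in this post.
_t     = np.linspace(0, 10, 2000)
_clean = np.sin(2 * np.pi * 0.5 * _t)
_noisy = _clean + np.random.normal(0, 0.4, _t.shape)
plt.figure(figsize=(12, 4))
plt.plot(_noisy, alpha=0.4, label='noisy')
plt.plot(lowpassfilter(_noisy, 0.27), label='wavelet-denoised')
plt.legend()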
wav_df_35 = pd.DataFrame()

for idx in tqdm(df_35.uid.unique()) :

    uid_1 = df_35[df_35.uid==idx]
    col_lst = uid_1.columns[2:-1].tolist()
    print("columns lst : {}".format(col_lst), end='\r')
    for j in col_lst :
        uid_1[j] = lowpassfilter(uid_1[j].values, 0.27)

    wav_df_35 = pd.concat([wav_df_35,uid_1])
wav_df_35.head()
wav_df_35 = wav_df_35.reset_index(drop=True)
 
# Optical density conversion
'''
<Why apply the optical density conversion (optical filter)>
Alzheimer's patients commonly show reduced olfactory function, and this olfactory function is gauged from the hemoglobin concentration in the brain signal.
The concentrations of oxy- and deoxy-hemoglobin can be obtained from how much light each absorbs and reflects.
To extract these light-related quantities from the signal, the optical density conversion is applied.
'''
od_df_98 = pd.DataFrame()

for idx in tqdm(wav_df_98.uid.unique()) :

    raw = wav_df_98[wav_df_98.uid==idx].reset_index(drop=True)

    lst_730 = raw.columns[2:9].tolist()   # 730 nm, channels 1-7
    lst_850 = raw.columns[9:16].tolist()  # 850 nm, channels 1-7

    for jdx in lst_730 :

        ch         = jdx.split("_")[-1]
        ch730_name = "A_730_{}".format(ch)
        ch850_name = "A_850_{}".format(ch)
        print('process columns : {}, {}'.format(ch730_name,ch850_name), end='\r')
        data730 = raw[ch730_name]
        data850 = raw[ch850_name]

        base730 = data730.copy()
        base850 = data850.copy()

        # Right after measurement starts, the signal carries heavy noise for a while;
        # only after that initial stretch does normal measurement begin.
        # Use the first 100 samples to build the baseline for handling this.
        base730[base730.index > 100] = np.mean(base730[:100])
        for ix in base730.index[:100].tolist() :
            base730[ix] = float(np.mean(data730[0:ix+1]))

        base850[base850.index > 100] = np.mean(base850[:100])
        for ix in base850.index[:100].tolist() :
            base850[ix] = float(np.mean(data850[0:ix+1]))

        # Optical density conversion using the Beer-Lambert law.
        raw[ch730_name] = np.log10(base730/data730)
        raw[ch850_name] = np.log10(base850/data850)


    od_df_98 = pd.concat([od_df_98,raw])

od_df_98 = od_df_98.reset_index(drop=True)

uids = 14

plt.title("98 OD Data - User name : {} -  730nm,850nm - 4Channel".format(uids),fontsize=20)
od_df_98[od_df_98.uid==uids].A_730_3.plot(figsize=(12,5))
od_df_98[od_df_98.uid==uids].A_850_3.plot(figsize=(12,5))
 
od_df_35 = pd.DataFrame()

for idx in tqdm(wav_df_35.uid.unique()) :

    raw = wav_df_35[wav_df_35.uid==idx].reset_index(drop=True)

    lst_730 = raw.columns[2:9].tolist()   # 730 nm, channels 1-7
    lst_850 = raw.columns[9:16].tolist()  # 850 nm, channels 1-7

    for jdx in lst_730 :

        ch         = jdx.split("_")[-1]
        ch730_name = "A_730_{}".format(ch)
        ch850_name = "A_850_{}".format(ch)
        print('process columns : {}, {}'.format(ch730_name,ch850_name), end='\r')
        data730 = raw[ch730_name]
        data850 = raw[ch850_name]

        base730 = data730.copy()
        base850 = data850.copy()

        base730[base730.index > 100] = np.mean(base730[:100])
        for ix in base730.index[:100].tolist() :
            base730[ix] = float(np.mean(data730[0:ix+1]))

        base850[base850.index > 100] = np.mean(base850[:100])
        for ix in base850.index[:100].tolist() :
            base850[ix] = float(np.mean(data850[0:ix+1]))

        raw[ch730_name] = np.log10(base730/data730) #/(separation_730[int(ch)])
        raw[ch850_name] = np.log10(base850/data850) #/(separation_850[int(ch)])

    od_df_35 = pd.concat([od_df_35,raw])

od_df_35 = od_df_35.reset_index(drop=True)


uids = 13

plt.title("35 OD Data - User name : {} -  730nm,850nm - 4Channel".format(uids),fontsize=20)
od_df_35[od_df_35.uid==uids].A_730_3.plot(figsize=(12,5))
od_df_35[od_df_35.uid==uids].A_850_3.plot(figsize=(12,5))
 
# Molar extinction coefficients of the measurement device
e730_oxy = 2.1838
e730_deoxy = 9.5048
e850_oxy = 10.2470
e850_deoxy = 9.2819

e_c = (e730_oxy*e850_deoxy-e850_oxy*e730_deoxy)

separation_730 = np.array([4, 3.5, 3, 3.5, 4, 3, 1])
separation_850 = np.array([4, 3.5, 3, 3.5, 4, 3, 1])
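# For reference (illustrative only, not part of the pipeline): the hbo/hbr formulas below are
# the closed-form solution of the 2x2 modified Beer-Lambert system
#   OD_730 = (e730_oxy*HbO + e730_deoxy*HbR) * L
#   OD_850 = (e850_oxy*HbO + e850_deoxy*HbR) * L
# with L the source-detector separation; the same result can be obtained with np.linalg.solve:
_E  = np.array([[e730_oxy, e730_deoxy],
                [e850_oxy, e850_deoxy]])
_od = np.array([0.01, 0.02])              # an arbitrary example OD pair (730 nm, 850 nm)
_hbo, _hbr = np.linalg.solve(_E, _od) / separation_850[0]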

hb_df_98 = pd.DataFrame()

for idx in od_df_98.uid.unique() :

    od = od_df_98[od_df_98.uid==idx].reset_index(drop=True)

    lst_730 = od.columns[2:9].tolist()   # 730 nm, channels 1-7
    lst_850 = od.columns[9:16].tolist()  # 850 nm, channels 1-7

    for jdx in lst_730 :

        ch         = jdx.split("_")[-1]

        ch730_name = "A_730_{}".format(ch)
        ch850_name = "A_850_{}".format(ch)

        hbo_name = "hbo_{}".format(str(int(ch)+1))
        hbr_name = "hbr_{}".format(str(int(ch)+1))

        OD730 = od[ch730_name]
        OD850 = od[ch850_name]

        hbo = (OD730*e850_deoxy-OD850*e730_deoxy)/(e_c * separation_850[int(ch)])
        hbr = (OD850*e730_oxy-OD730*e850_oxy)/(e_c * separation_850[int(ch)])
        od[ch730_name] = hbo
        od[ch850_name] = hbr

        od.rename(columns = {ch730_name:hbo_name, ch850_name:hbr_name}, inplace = True )
    hb_df_98 = pd.concat([hb_df_98,od])
hb_df_98 = hb_df_98.reset_index(drop=True)

uids = 14

plt.title("98 Hb Data - User name : {} -  hbo,hbr - 4Channel".format(uids),fontsize=20)
hb_df_98[hb_df_98.uid==uids].hbo_4.plot(figsize=(12,5))
hb_df_98[hb_df_98.uid==uids].hbr_4.plot(figsize=(12,5))
 
hb_df_35 = pd.DataFrame()

for idx in od_df_35.uid.unique() :

    od = od_df_35[od_df_35.uid==idx].reset_index(drop=True)

    lst_730 = od.columns[2:9].tolist()   # 730 nm, channels 1-7
    lst_850 = od.columns[9:16].tolist()  # 850 nm, channels 1-7

    for jdx in lst_730 :

        ch         = jdx.split("_")[-1]

        ch730_name = "A_730_{}".format(ch)
        ch850_name = "A_850_{}".format(ch)

        hbo_name = "hbo_{}".format(str(int(ch)+1))
        hbr_name = "hbr_{}".format(str(int(ch)+1))

        OD730 = od[ch730_name]
        OD850 = od[ch850_name]

        hbo = (OD730*e850_deoxy-OD850*e730_deoxy)/(e_c * separation_850[int(ch)])
        hbr = (OD850*e730_oxy-OD730*e850_oxy)/(e_c * separation_850[int(ch)])

        od[ch730_name] = hbo
        od[ch850_name] = hbr

        od.rename(columns = {ch730_name:hbo_name, ch850_name:hbr_name}, inplace = True )
    hb_df_35 = pd.concat([hb_df_35,od])
hb_df_35 = hb_df_35.reset_index(drop=True)

uids = 25

plt.title("35 Hb Data - User name : {} -  hbo,hbr - 4Channel".format(uids),fontsize=20)
hb_df_35[hb_df_35.uid==uids].hbo_4.plot(figsize=(12,5))
hb_df_35[hb_df_35.uid==uids].hbr_4.plot(figsize=(12,5))
 
chb_df_98 = pd.DataFrame()

for idx in hb_df_98.uid.unique() :

    chb_df = pd.DataFrame()

    uid1  = hb_df_98[hb_df_98.uid==idx]

    hbo_lst  = uid1[uid1.columns[2:8]].columns
    hbr_lst  = uid1[uid1.columns[9:15]].columns

    for jdx in range(len(hbo_lst)) :

        # least-squares scaling of the reference (shallow) channel hbo_7 / hbr_7 within each channel
        cNIRS_HbO_constant = np.dot(uid1[hbo_lst[jdx]], uid1.hbo_7) / np.dot(uid1.hbo_7, uid1.hbo_7)
        cNIRS_Hb_constant  = np.dot(uid1[hbr_lst[jdx]], uid1.hbr_7) / np.dot(uid1.hbr_7, uid1.hbr_7)

        if cNIRS_HbO_constant>0:
            cHbO = uid1[hbo_lst[jdx]] - uid1.hbo_7 * float(cNIRS_HbO_constant)
        else :
            cHbO = uid1[hbo_lst[jdx]]
        if cNIRS_Hb_constant >0:
            cHb = uid1[hbr_lst[jdx]] - uid1.hbr_7 * float(cNIRS_Hb_constant)
        else :
            cHb = uid1[hbr_lst[jdx]]

        chb_df['c' + hbo_lst[jdx]] = cHbO[:3400]
        chb_df['c' + hbr_lst[jdx]] = cHb[:3400]

    chb_df.insert(0,'uid', idx)
    chb_df = chb_df[['uid','chbo_1','chbo_2','chbo_3','chbo_4','chbo_5','chbo_6'
                     ,'chbr_1','chbr_2','chbr_3','chbr_4','chbr_5','chbr_6']]
    chb_df['Category'] = uid1.Category.unique()[0]
    chb_df_98 = pd.concat([chb_df_98, chb_df])

chb_df_98 = chb_df_98.reset_index(drop=True)

pd1= chb_df_98.chbo_1 - chb_df_98.chbr_1
pd2= chb_df_98.chbo_2 - chb_df_98.chbr_2
pd3= chb_df_98.chbo_3 - chb_df_98.chbr_3
pd4= chb_df_98.chbo_4 - chb_df_98.chbr_4
pd5= chb_df_98.chbo_5 - chb_df_98.chbr_5
pd6= chb_df_98.chbo_6 - chb_df_98.chbr_6

lr01 = (pd1 + pd2 + pd3) - (pd4 + pd5 + pd6)

chb_df_98.insert(13, 'lr01', lr01)
chb_df_98.insert(14, 'dc_lr01', lr01 - np.mean(lr01))  # remove DC OFFSET
chb_df_98.insert(15, 'mean_chb', (pd1 + pd2 + pd3 +pd4 + pd5 + pd6) / 6)

uids = 14

plt.title("cHb Data - User name : {} -  cHbo,cHbr - 4Channel".format(uids),fontsize=20)
chb_df_98[chb_df_98.uid==uids].chbo_4.plot(figsize=(12,5))
chb_df_98[chb_df_98.uid==uids].chbr_4.plot(figsize=(12,5))
 
 
chb_df_35 = pd.DataFrame()

for idx in hb_df_35.uid.unique() :

    chb_df = pd.DataFrame()

    uid1  = hb_df_35[hb_df_35.uid==idx]

    hbo_lst  = uid1[uid1.columns[2:8]].columns
    hbr_lst  = uid1[uid1.columns[9:15]].columns

    for jdx in range(len(hbo_lst)) :

        # least-squares scaling of the reference (shallow) channel hbo_7 / hbr_7 within each channel
        cNIRS_HbO_constant = np.dot(uid1[hbo_lst[jdx]], uid1.hbo_7) / np.dot(uid1.hbo_7, uid1.hbo_7)
        cNIRS_Hb_constant  = np.dot(uid1[hbr_lst[jdx]], uid1.hbr_7) / np.dot(uid1.hbr_7, uid1.hbr_7)

        if cNIRS_HbO_constant>0:
            cHbO = uid1[hbo_lst[jdx]] - uid1.hbo_7 * float(cNIRS_HbO_constant)
        else :
            cHbO = uid1[hbo_lst[jdx]]
        if cNIRS_Hb_constant >0:
            cHb = uid1[hbr_lst[jdx]] - uid1.hbr_7 * float(cNIRS_Hb_constant)
        else :
            cHb = uid1[hbr_lst[jdx]]

        chb_df['c' + hbo_lst[jdx]] = cHbO[:3400]
        chb_df['c' + hbr_lst[jdx]] = cHb[:3400]

    chb_df.insert(0,'uid', idx)
    chb_df = chb_df[['uid','chbo_1','chbo_2','chbo_3','chbo_4','chbo_5','chbo_6'
                     ,'chbr_1','chbr_2','chbr_3','chbr_4','chbr_5','chbr_6']]
    chb_df['Category'] = uid1.Category.unique()[0]
    chb_df_35 = pd.concat([chb_df_35, chb_df])

chb_df_35 = chb_df_35.reset_index(drop=True)

pd1= chb_df_35.chbo_1 - chb_df_35.chbr_1
pd2= chb_df_35.chbo_2 - chb_df_35.chbr_2
pd3= chb_df_35.chbo_3 - chb_df_35.chbr_3
pd4= chb_df_35.chbo_4 - chb_df_35.chbr_4
pd5= chb_df_35.chbo_5 - chb_df_35.chbr_5
pd6= chb_df_35.chbo_6 - chb_df_35.chbr_6

lr01 = (pd1 + pd2 + pd3) - (pd4 + pd5 + pd6)

chb_df_35.insert(13, 'lr01', lr01)
chb_df_35.insert(14, 'dc_lr01', lr01 - np.mean(lr01))  # remove DC OFFSET
chb_df_35.insert(15, 'mean_chb', (pd1 + pd2 + pd3 +pd4 + pd5 + pd6) / 6)

uids = 14

plt.title("cHb Data - User name : {} -  730nm,850nm - 4Channel".format(uids),fontsize=20)
chb_df_35[chb_df_35.uid==uids].chbo_4.plot(figsize=(12,5))
chb_df_35[chb_df_35.uid==uids].chbr_4.plot(figsize=(12,5))
 
# Check for users with NaN values and drop them

null_uid = chb_df_98[chb_df_98.lr01.isna()].uid.unique()
print('98 - Nan user Index : {}'.format(null_uid))
for i in null_uid :
      chb_df_98 = chb_df_98[chb_df_98.uid != i]

null_uid = chb_df_35[chb_df_35.lr01.isna()].uid.unique()
print('35 - Nan user Index : {}'.format(null_uid))
for i in null_uid :
      chb_df_35 = chb_df_35[chb_df_35.uid != i]
 
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, \
QuantileTransformer, PowerTransformer, Normalizer
from sklearn.metrics import precision_score , recall_score ,roc_auc_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier, ExtraTreesClassifier
 
print("사용할 수 있는 피처 목록 : \n{}".format(chb_df_98.columns[1:-1].tolist()))
 
train = chb_df_98.copy()
test  = chb_df_35.copy()

feature  = 'lr01'

time_seq = 3400 # samples per user, e.g. len(chb_df_98[chb_df_98.uid == chb_df_98.uid.unique()[0]])

train_user_num = len(train.uid.unique())  # number of training users
test_user_num = len(test.uid.unique())    # number of test users

x_train_hbo = np.array(train[feature].values).reshape(train_user_num, time_seq) # to numpy, one row per user
y_train = np.array(train.drop_duplicates(['uid']).Category.values)              # one label per user

x_test_hbo = np.array(test[feature].values).reshape(test_user_num, time_seq)    # to numpy, one row per user
y_test = np.array(test.drop_duplicates(['uid']).Category.values)                # one label per user


########## Data scaling - uncomment the scaler you want to use

# scaler = PowerTransformer()  # transform the features toward a Gaussian distribution
# scaler = StandardScaler()    # shift/scale to mean 0, variance 1 (good for classification) - sensitive to outliers
scaler = Normalizer()        # rescale each sample (row) to unit length
# scaler = MinMaxScaler()      # squeeze every value into the range [0, 1]                   - sensitive to outliers
# scaler = RobustScaler()      # scale with the median and the interquartile range (robust to outliers)
# scaler = QuantileTransformer(output_distribution='normal')
# maps the data through 1000 quantiles; behaves like RobustScaler while also compressing the value range

x_train_hbo = scaler.fit_transform(x_train_hbo)   # fit the scaler on the training data and transform it
x_test_hbo  = scaler.transform(x_test_hbo)        # apply the same fitted transform to the test data

x_train_hbo.shape, y_train.shape, x_test_hbo.shape, y_test.shape
 
rf = RandomForestClassifier(random_state=42)
rf.fit(x_train_hbo, y_train)

print("랜덤 포레스트 \n")

print("훈련 세트 ")
print(" - 정확도(acc)         : {:.3f}".format(rf.score(x_train_hbo, y_train)))
print(" - 정밀도(precision)   : {:.3f}".format( precision_score(y_train, rf.predict(x_train_hbo),average= "macro") ))
print(" - 재현율(recall)      : {:.3f}".format( recall_score(y_train, rf.predict(x_train_hbo),average= "macro") ))
print(" - AUC 점수            : {:.3f}".format( roc_auc_score(y_train, rf.predict_proba(x_train_hbo), multi_class='ovr') ) , end='\n')


label = ['AD', 'CN', 'MCI'] # class labels
plot = ConfusionMatrixDisplay.from_estimator(rf,   # fitted classifier
                             x_train_hbo, y_train, # data and the true labels (y_true)
                             display_labels=label, # labels to display on the matrix
                             cmap=plt.cm.Blues,    # colormap (plt.cm.Reds, plt.cm.rainbow, etc. also work)
                             normalize=None)       # one of 'true', 'pred', 'all'; default=None
plot.ax_.set_title('Confusion Matrix - Train')



print("테스트 세트 ")
print(" - 정확도(acc)         : {:.3f}".format(rf.score(x_test_hbo, y_test)))
print(" - 정밀도(precision)   : {:.3f}".format( precision_score(y_test, rf.predict(x_test_hbo),average= "macro") ))
print(" - 재현율(recall)      : {:.3f}".format( recall_score(y_test, rf.predict(x_test_hbo),average= "macro") ))
print(" - AUC 점수            : {:.3f}".format( roc_auc_score(y_test, rf.predict_proba(x_test_hbo), multi_class='ovr') ) , end='\n')


label = ['AD', 'CN', 'MCI'] # class labels
plot = ConfusionMatrixDisplay.from_estimator(rf,   # fitted classifier
                             x_test_hbo, y_test,   # data and the true labels (y_true)
                             display_labels=label, # labels to display on the matrix
                             cmap=plt.cm.Blues,    # colormap (plt.cm.Reds, plt.cm.rainbow, etc. also work)
                             normalize=None)       # one of 'true', 'pred', 'all'; default=None
plot.ax_.set_title('Confusion Matrix - Test')

def plot_feature_importances(model):
    n_features = x_train_hbo.shape[1]
    plt.barh(range(n_features), model.feature_importances_, align='center')
    plt.yticks(np.arange(n_features))
    plt.xlabel("Feature importance")
    plt.ylabel("Feature")