[Machine Learning] 2. Dementia-Prediction AI (Full Code Preview)
살랑춤춰요
2024. 8. 1. 09:31
- Since I double-majored in electronic engineering and artificial intelligence, I mainly handled the signal processing.
(Note: under the data protection law, nothing that could be used to infer the underlying data may be attached, so the EDA part can only be described up to the level of the graphs.)
1. Extracting outliers from the data
- When the outliers in the data were extracted, there turned out to be a great many of them.
- Checking the outliers against the label information, most of them occurred in the cognitively normal (CN) group, the mild cognitive impairment (MCI) group showed only a few, and the Alzheimer's (AD) group showed none.
-> Judgment: a signal that contains outliers tends to belong to the CN group, and the fewer outliers a signal has, the more likely it is Alzheimer's.
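The EDA code itself can't be shared, but as a rough illustration of the idea, a minimal IQR-based sketch of a per-label outlier count might look like the following (chb_df_98, lr01, and the Category column come from the pipeline below; the 1.5 x IQR rule is my assumption, not necessarily the rule actually used):
def count_iqr_outliers(df, value_col='lr01', label_col='Category'):
    # count 1.5*IQR outliers of one signal column within each diagnosis group
    for label, group in df.groupby(label_col):
        q1, q3 = group[value_col].quantile([0.25, 0.75])
        iqr = q3 - q1
        mask = (group[value_col] < q1 - 1.5 * iqr) | (group[value_col] > q3 + 1.5 * iqr)
        print("{}: {} outliers in {} samples".format(label, mask.sum(), len(group)))
# count_iqr_outliers(chb_df_98)  # expected trend from the EDA: CN >> MCI > AD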
2. The analysis and processing methods will be covered starting from part 3.
- The code on this page is the baseline for the signal processing and the prediction.
from glob import glob
import csv, os, math
import pywt
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm.notebook import tqdm
from scipy.signal import butter, filtfilt
from skimage.restoration import denoise_wavelet
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, \
    QuantileTransformer, PowerTransformer, Normalizer
import warnings
warnings.filterwarnings('ignore')
plt.rcParams["font.family"] = 'NanumGothic'
plt.rcParams['axes.unicode_minus'] = False
# collect every csv file from the data directories
train_test_set = sorted(glob('GNB/100_new/*.csv'))
validation_set = sorted(glob('GNB/35_new/*.csv'))
df_98_ = pd.DataFrame()
category = 0
for i, file in enumerate(train_test_set):
    user_df = pd.read_csv(file)
    user_df = user_df[user_df.columns[:22]]
    lst_730 = user_df.columns[1:8].tolist()
    lst_850 = user_df.columns[8:15].tolist()
    lst_back = user_df.columns[15:22].tolist()
    for idx in range(7):
        # user_df[lst_730[idx]] = user_df[lst_730[idx]] - np.mean(user_df[lst_730[idx]])  # remove DC offset
        # user_df[lst_850[idx]] = user_df[lst_850[idx]] - np.mean(user_df[lst_850[idx]])  # remove DC offset
        user_df[lst_730[idx]] = user_df[lst_730[idx]]  # + user_df[lst_back[idx]] + 300
        user_df[lst_850[idx]] = user_df[lst_850[idx]]  # + user_df[lst_back[idx]] + 300
    user_df = user_df[user_df.columns[:15]]
    uid = int(file.split('/')[-1].split('\\')[1].split(".")[0])  # path formats differ per machine, so adjust this to your own path layout
    user_df.insert(0, 'uid', uid)
    if i < 55:
        category = 'CN'
    elif 55 <= i < 85:
        category = 'MCI'
    elif 85 <= i < 99:
        category = 'AD'
    user_df['Category'] = category
    df_98_ = pd.concat([df_98_, user_df])
uid_lst = df_98_.uid.unique().tolist()  # unique() already de-duplicates; keep the file order
df_98 = pd.DataFrame()
for uid in uid_lst:
    df = df_98_[df_98_.uid == uid]
    df_98 = pd.concat([df_98, df])
df_98 = df_98.reset_index(drop=True)
del df_98_
df_35_ = pd.DataFrame()  # dataframe for the test data
for i, file in enumerate(validation_set):
    user_df = pd.read_csv(file)
    user_df = user_df[user_df.columns[:22]]
    lst_730 = user_df.columns[1:8].tolist()
    lst_850 = user_df.columns[8:15].tolist()
    lst_back = user_df.columns[15:22].tolist()
    for idx in range(7):
        # user_df[lst_730[idx]] = user_df[lst_730[idx]] - np.mean(user_df[lst_730[idx]])  # remove DC offset
        # user_df[lst_850[idx]] = user_df[lst_850[idx]] - np.mean(user_df[lst_850[idx]])  # remove DC offset
        user_df[lst_730[idx]] = user_df[lst_730[idx]]  # + user_df[lst_back[idx]] + 250
        user_df[lst_850[idx]] = user_df[lst_850[idx]]  # + user_df[lst_back[idx]] + 250
    user_df = user_df[user_df.columns[:15]]
    uid = int(file.split('/')[-1].split('\\')[1].split(".")[0])  # path formats differ per machine, so adjust this to your own path layout
    user_df.insert(0, 'uid', uid)
    # note: i == 21 matches none of the ranges below, so that file gets no label and is skipped
    if i <= 5:
        category = 'AD'
        user_df['Category'] = category
        df_35_ = pd.concat([df_35_, user_df])
    elif 6 <= i < 21:
        category = 'CN'
        user_df['Category'] = category
        df_35_ = pd.concat([df_35_, user_df])
    elif 22 <= i <= 36:
        category = 'MCI'
        user_df['Category'] = category
        df_35_ = pd.concat([df_35_, user_df])
uid_lst = df_35_.uid.unique().tolist()
df_35 = pd.DataFrame()
for uid in uid_lst:
    df = df_35_[df_35_.uid == uid]
    df_35 = pd.concat([df_35, df])
df_35 = df_35.reset_index(drop=True)
del df_35_
def all_section_slice_data(df, ss_1, es_1, ss_2, es_2, ss_3, es_3, ss_4, es_4, ss_5, es_5):
    # keep five [start:end) row sections per user and stitch them back together
    user_lst = df.uid.unique()
    new_df = pd.DataFrame()
    for i in user_lst:
        user = df[df.uid == i]
        slice_df = pd.concat([user[ss_1:es_1], user[ss_2:es_2], user[ss_3:es_3],
                              user[ss_4:es_4], user[ss_5:es_5]])
        new_df = pd.concat([new_df, slice_df])
    return new_df
ss_1,es_1,ss_2,es_2,ss_3,es_3,ss_4,es_4,ss_5,es_5 = 0,1000,1400,2000,2400,3000,3400,4000,4400,5000
df_35 = all_section_slice_data(df_35,
ss_1,es_1,ss_2,es_2,ss_3,es_3,ss_4,es_4,ss_5,es_5).reset_index(drop=True)
def lowpassfilter(signal, thresh=0, wavelet="db8"):
    # wavelet denoising: soft-threshold the detail coefficients, then reconstruct
    thresh = thresh * np.nanmax(signal)
    coeff = pywt.wavedec(signal, wavelet, mode="per")
    coeff[1:] = (pywt.threshold(i, value=thresh, mode="soft") for i in coeff[1:])
    reconstructed_signal = pywt.waverec(coeff, wavelet, mode="per")
    if len(signal) != len(reconstructed_signal):
        reconstructed_signal = reconstructed_signal[1:]
    return reconstructed_signal
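As a quick standalone smoke test (my addition, not part of the pipeline), the denoiser can be exercised on a synthetic noisy sine; the 0.27 threshold matches the value used below, everything else here is made up:
t = np.linspace(0, 10, 5000)
noisy = np.sin(2 * np.pi * 0.5 * t) + 0.3 * np.random.randn(len(t))  # synthetic test signal
denoised = lowpassfilter(noisy, 0.27)  # same threshold as the pipeline below
plt.figure(figsize=(12, 4))
plt.plot(t, noisy, alpha=0.4, label='noisy')
plt.plot(t, denoised, label='denoised')
plt.legend()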
wav_df_98 = pd.DataFrame()
for idx in tqdm(df_98.uid.unique()):
    uid_1 = df_98[df_98.uid == idx]
    col_lst = uid_1.columns[2:-1].tolist()
    print("columns lst : {}".format(col_lst), end='\r')
    for j in col_lst:
        uid_1[j] = lowpassfilter(uid_1[j].values, 0.27)
    wav_df_98 = pd.concat([wav_df_98, uid_1])
wav_df_98 = wav_df_98.reset_index(drop=True)
# apply the filter / wavelet transform
'''
<Why the wavelet transform is applied>
The point is not to read off individual signal values: the EDA showed that the more high signal values a recording contains, the more likely it belongs to the cognitively normal group.
The shape of the signal as it changes along the time axis therefore has to be interpreted, which is why the wavelet transform is applied.
'''
wav_df_35 = pd.DataFrame()
for idx in tqdm(df_35.uid.unique()):
    uid_1 = df_35[df_35.uid == idx]
    col_lst = uid_1.columns[2:-1].tolist()
    print("columns lst : {}".format(col_lst), end='\r')
    for j in col_lst:
        uid_1[j] = lowpassfilter(uid_1[j].values, 0.27)
    wav_df_35 = pd.concat([wav_df_35, uid_1])
wav_df_35.head()
wav_df_35 = wav_df_35.reset_index(drop=True)
# optical density conversion
'''
<Why the optical density conversion (optical filtering) is applied>
Alzheimer's patients commonly show reduced olfactory function, and in the brain signal that olfactory function is gauged by the hemoglobin concentration.
The concentrations of oxy- and deoxy-hemoglobin can each be obtained by computing how strongly light is absorbed and reflected by them.
To extract these light-related quantities from the signal, the optical density conversion is performed.
'''
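In formula terms this is the Beer-Lambert relation OD = log10(I0 / I), where the baseline intensity I0 is estimated from the first 100 samples. A minimal standalone sketch of that step (my condensed rewrite of the loop below; the function name and baseline_len parameter are mine):
def optical_density(intensity, baseline_len=100):
    # OD = log10(I0 / I); I0 is the running mean over the first baseline_len
    # samples, and the plain mean of that window afterwards
    intensity = pd.Series(intensity).reset_index(drop=True)
    baseline = pd.Series(intensity[:baseline_len].mean(), index=intensity.index)
    baseline[:baseline_len] = intensity.expanding().mean()[:baseline_len]
    return np.log10(baseline / intensity)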
od_df_98 = pd.DataFrame()
for idx in tqdm(wav_df_98.uid.unique()):
    raw = wav_df_98[wav_df_98.uid == idx].reset_index(drop=True)
    lst_730 = raw.columns[2:9].tolist()   # 730nm, channels 1-7
    lst_850 = raw.columns[9:16].tolist()  # 850nm, channels 1-7
    for jdx in lst_730:
        ch = jdx.split("_")[-1]
        ch730_name = "A_730_{}".format(ch)
        ch850_name = "A_850_{}".format(ch)
        print('process columns : {}, {}'.format(ch730_name, ch850_name), end='\r')
        data730 = raw[ch730_name]
        data850 = raw[ch850_name]
        base730 = data730.copy()
        base850 = data850.copy()
        # right after measurement starts, the signal carries heavy noise for a while;
        # only after that initial stretch does normal measurement begin,
        # so the first 100 samples are treated as the baseline window
        base730[base730.index > 100] = np.mean(base730[:100])
        for ix in base730.index[:100].tolist():
            base730[ix] = float(np.mean(data730[0:ix + 1]))
        base850[base850.index > 100] = np.mean(base850[:100])
        for ix in base850.index[:100].tolist():
            base850[ix] = float(np.mean(data850[0:ix + 1]))
        # optical density conversion via the Beer-Lambert law
        raw[ch730_name] = np.log10(base730 / data730)
        raw[ch850_name] = np.log10(base850 / data850)
    od_df_98 = pd.concat([od_df_98, raw])
od_df_98 = od_df_98.reset_index(drop=True)
uids = 14
plt.title("98 OD Data - User name : {} - 730nm,850nm - 4Channel".format(uids),fontsize=20)
od_df_98[od_df_98.uid==uids].A_730_3.plot(figsize=(12,5))
od_df_98[od_df_98.uid==uids].A_850_3.plot(figsize=(12,5))
od_df_35 = pd.DataFrame()
for idx in tqdm(wav_df_35.uid.unique()):
    raw = wav_df_35[wav_df_35.uid == idx].reset_index(drop=True)
    lst_730 = raw.columns[2:9].tolist()   # 730nm, channels 1-7
    lst_850 = raw.columns[9:16].tolist()  # 850nm, channels 1-7
    for jdx in lst_730:
        ch = jdx.split("_")[-1]
        ch730_name = "A_730_{}".format(ch)
        ch850_name = "A_850_{}".format(ch)
        print('process columns : {}, {}'.format(ch730_name, ch850_name), end='\r')
        data730 = raw[ch730_name]
        data850 = raw[ch850_name]
        base730 = data730.copy()
        base850 = data850.copy()
        base730[base730.index > 100] = np.mean(base730[:100])
        for ix in base730.index[:100].tolist():
            base730[ix] = float(np.mean(data730[0:ix + 1]))
        base850[base850.index > 100] = np.mean(base850[:100])
        for ix in base850.index[:100].tolist():
            base850[ix] = float(np.mean(data850[0:ix + 1]))
        raw[ch730_name] = np.log10(base730 / data730)  # /(separation_730[int(ch)])
        raw[ch850_name] = np.log10(base850 / data850)  # /(separation_850[int(ch)])
    od_df_35 = pd.concat([od_df_35, raw])
od_df_35 = od_df_35.reset_index(drop=True)
uids = 13
plt.title("35 OD Data - User name : {} - 730nm,850nm - 4Channel".format(uids),fontsize=20)
od_df_35[od_df_35.uid==uids].A_730_3.plot(figsize=(12,5))
od_df_35[od_df_35.uid==uids].A_850_3.plot(figsize=(12,5))
# molar extinction coefficients of the measurement device
e730_oxy = 2.1838
e730_deoxy = 9.5048
e850_oxy = 10.2470
e850_deoxy = 9.2819
e_c = (e730_oxy*e850_deoxy-e850_oxy*e730_deoxy)
separation_730 = np.array([4, 3.5, 3, 3.5, 4, 3, 1])
separation_850 = np.array([4, 3.5, 3, 3.5, 4, 3, 1])
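These coefficients define the two-wavelength modified Beer-Lambert system that the loops below invert in closed form. As a cross-check (my addition, not part of the pipeline), the same numbers can be recovered by solving the 2x2 linear system directly:
# [OD730]   [e730_oxy  e730_deoxy]           [HbO]
# [OD850] = [e850_oxy  e850_deoxy] * L_sep * [HbR]
E = np.array([[e730_oxy, e730_deoxy],
              [e850_oxy, e850_deoxy]])
od_pair = np.array([0.012, 0.015])  # made-up OD values, purely for illustration
L_sep = separation_850[3]           # channel 4 source-detector separation
hbo_chk, hbr_chk = np.linalg.solve(E * L_sep, od_pair)
print(hbo_chk, hbr_chk)  # should match the closed-form hbo/hbr expressions below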
hb_df_98 = pd.DataFrame()
for idx in od_df_98.uid.unique():
    od = od_df_98[od_df_98.uid == idx].reset_index(drop=True)
    lst_730 = od.columns[2:9].tolist()   # 730nm, channels 1-7
    lst_850 = od.columns[9:16].tolist()  # 850nm, channels 1-7
    for jdx in lst_730:
        ch = jdx.split("_")[-1]
        ch730_name = "A_730_{}".format(ch)
        ch850_name = "A_850_{}".format(ch)
        hbo_name = "hbo_{}".format(str(int(ch) + 1))
        hbr_name = "hbr_{}".format(str(int(ch) + 1))
        OD730 = od[ch730_name]
        OD850 = od[ch850_name]
        # closed-form inversion of the two-wavelength Beer-Lambert system
        hbo = (OD730 * e850_deoxy - OD850 * e730_deoxy) / (e_c * separation_850[int(ch)])
        hbr = (OD850 * e730_oxy - OD730 * e850_oxy) / (e_c * separation_850[int(ch)])
        od[ch730_name] = hbo
        od[ch850_name] = hbr
        od.rename(columns={ch730_name: hbo_name, ch850_name: hbr_name}, inplace=True)
    hb_df_98 = pd.concat([hb_df_98, od])
hb_df_98 = hb_df_98.reset_index(drop=True)
uids = 14
plt.title("98 Hb Data - User name : {} - hbo,hbr - 4Channel".format(uids),fontsize=20)
hb_df_98[hb_df_98.uid==uids].hbo_4.plot(figsize=(12,5))
hb_df_98[hb_df_98.uid==uids].hbr_4.plot(figsize=(12,5))
hb_df_35 = pd.DataFrame()
for idx in od_df_35.uid.unique():
    od = od_df_35[od_df_35.uid == idx].reset_index(drop=True)
    lst_730 = od.columns[2:9].tolist()   # 730nm, channels 1-7
    lst_850 = od.columns[9:16].tolist()  # 850nm, channels 1-7
    for jdx in lst_730:
        ch = jdx.split("_")[-1]
        ch730_name = "A_730_{}".format(ch)
        ch850_name = "A_850_{}".format(ch)
        hbo_name = "hbo_{}".format(str(int(ch) + 1))
        hbr_name = "hbr_{}".format(str(int(ch) + 1))
        OD730 = od[ch730_name]
        OD850 = od[ch850_name]
        hbo = (OD730 * e850_deoxy - OD850 * e730_deoxy) / (e_c * separation_850[int(ch)])
        hbr = (OD850 * e730_oxy - OD730 * e850_oxy) / (e_c * separation_850[int(ch)])
        od[ch730_name] = hbo
        od[ch850_name] = hbr
        od.rename(columns={ch730_name: hbo_name, ch850_name: hbr_name}, inplace=True)
    hb_df_35 = pd.concat([hb_df_35, od])
hb_df_35 = hb_df_35.reset_index(drop=True)
uids = 25
plt.title("35 Hb Data - User name : {} - hbo,hbr - 4Channel".format(uids),fontsize=20)
hb_df_35[hb_df_35.uid==uids].hbo_4.plot(figsize=(12,5))
hb_df_35[hb_df_35.uid==uids].hbr_4.plot(figsize=(12,5))
chb_df_98 = pd.DataFrame()
for idx in hb_df_98.uid.unique():
    chb_df = pd.DataFrame()
    uid1 = hb_df_98[hb_df_98.uid == idx]
    hbo_lst = uid1[uid1.columns[2:8]].columns
    hbr_lst = uid1[uid1.columns[9:15]].columns
    for jdx in range(len(hbo_lst)):
        # short-separation regression: channel 7 (1cm separation) mostly sees
        # superficial blood flow, so its scaled contribution is subtracted from
        # each long channel; the coefficient is <long, short> / <short, short>
        cNIRS_HbO_constant = np.dot(uid1[hbo_lst[jdx]], uid1.hbo_7) / np.dot(uid1.hbo_7, uid1.hbo_7)
        cNIRS_Hb_constant = np.dot(uid1[hbr_lst[jdx]], uid1.hbr_7) / np.dot(uid1.hbr_7, uid1.hbr_7)
        if cNIRS_HbO_constant > 0:
            cHbO = uid1[hbo_lst[jdx]] - uid1.hbo_7 * float(cNIRS_HbO_constant)
        else:
            cHbO = uid1[hbo_lst[jdx]]
        if cNIRS_Hb_constant > 0:
            cHb = uid1[hbr_lst[jdx]] - uid1.hbr_7 * float(cNIRS_Hb_constant)
        else:
            cHb = uid1[hbr_lst[jdx]]
        chb_df['c' + hbo_lst[jdx]] = cHbO[:3400]
        chb_df['c' + hbr_lst[jdx]] = cHb[:3400]
    chb_df.insert(0, 'uid', idx)
    chb_df = chb_df[['uid', 'chbo_1', 'chbo_2', 'chbo_3', 'chbo_4', 'chbo_5', 'chbo_6',
                     'chbr_1', 'chbr_2', 'chbr_3', 'chbr_4', 'chbr_5', 'chbr_6']]
    chb_df['Category'] = uid1.Category.unique()[0]
    chb_df_98 = pd.concat([chb_df_98, chb_df])
chb_df_98 = chb_df_98.reset_index(drop=True)
# per-channel oxy-deoxy differences; lr01 contrasts channels 1-3 against 4-6
# (presumably the left vs. right probe groups, going by the name)
pd1 = chb_df_98.chbo_1 - chb_df_98.chbr_1
pd2 = chb_df_98.chbo_2 - chb_df_98.chbr_2
pd3 = chb_df_98.chbo_3 - chb_df_98.chbr_3
pd4 = chb_df_98.chbo_4 - chb_df_98.chbr_4
pd5 = chb_df_98.chbo_5 - chb_df_98.chbr_5
pd6 = chb_df_98.chbo_6 - chb_df_98.chbr_6
lr01 = (pd1 + pd2 + pd3) - (pd4 + pd5 + pd6)
chb_df_98.insert(13, 'lr01', lr01)
chb_df_98.insert(14, 'dc_lr01', lr01 - np.mean(lr01))  # remove DC offset
chb_df_98.insert(15, 'mean_chb', (pd1 + pd2 + pd3 + pd4 + pd5 + pd6) / 6)
uids = 14
plt.title("cHb Data - User name : {} - cHbo,cHbr - 4Channel".format(uids),fontsize=20)
chb_df_98[chb_df_98.uid==uids].chbo_4.plot(figsize=(12,5))
chb_df_98[chb_df_98.uid==uids].chbr_4.plot(figsize=(12,5))
chb_df_35 = pd.DataFrame()
for idx in hb_df_35.uid.unique():
    chb_df = pd.DataFrame()
    uid1 = hb_df_35[hb_df_35.uid == idx]
    hbo_lst = uid1[uid1.columns[2:8]].columns
    hbr_lst = uid1[uid1.columns[9:15]].columns
    for jdx in range(len(hbo_lst)):
        # short-separation regression against channel 7, as for the 98-user set
        cNIRS_HbO_constant = np.dot(uid1[hbo_lst[jdx]], uid1.hbo_7) / np.dot(uid1.hbo_7, uid1.hbo_7)
        cNIRS_Hb_constant = np.dot(uid1[hbr_lst[jdx]], uid1.hbr_7) / np.dot(uid1.hbr_7, uid1.hbr_7)
        if cNIRS_HbO_constant > 0:
            cHbO = uid1[hbo_lst[jdx]] - uid1.hbo_7 * float(cNIRS_HbO_constant)
        else:
            cHbO = uid1[hbo_lst[jdx]]
        if cNIRS_Hb_constant > 0:
            cHb = uid1[hbr_lst[jdx]] - uid1.hbr_7 * float(cNIRS_Hb_constant)
        else:
            cHb = uid1[hbr_lst[jdx]]
        chb_df['c' + hbo_lst[jdx]] = cHbO[:3400]
        chb_df['c' + hbr_lst[jdx]] = cHb[:3400]
    chb_df.insert(0, 'uid', idx)
    chb_df = chb_df[['uid', 'chbo_1', 'chbo_2', 'chbo_3', 'chbo_4', 'chbo_5', 'chbo_6',
                     'chbr_1', 'chbr_2', 'chbr_3', 'chbr_4', 'chbr_5', 'chbr_6']]
    chb_df['Category'] = uid1.Category.unique()[0]
    chb_df_35 = pd.concat([chb_df_35, chb_df])
chb_df_35 = chb_df_35.reset_index(drop=True)
pd1 = chb_df_35.chbo_1 - chb_df_35.chbr_1
pd2 = chb_df_35.chbo_2 - chb_df_35.chbr_2
pd3 = chb_df_35.chbo_3 - chb_df_35.chbr_3
pd4 = chb_df_35.chbo_4 - chb_df_35.chbr_4
pd5 = chb_df_35.chbo_5 - chb_df_35.chbr_5
pd6 = chb_df_35.chbo_6 - chb_df_35.chbr_6
lr01 = (pd1 + pd2 + pd3) - (pd4 + pd5 + pd6)
chb_df_35.insert(13, 'lr01', lr01)
chb_df_35.insert(14, 'dc_lr01', lr01 - np.mean(lr01))  # remove DC offset
chb_df_35.insert(15, 'mean_chb', (pd1 + pd2 + pd3 + pd4 + pd5 + pd6) / 6)
uids = 14
plt.title("cHb Data - User name : {} - 730nm,850nm - 4Channel".format(uids),fontsize=20)
chb_df_35[chb_df_35.uid==uids].chbo_4.plot(figsize=(12,5))
chb_df_35[chb_df_35.uid==uids].chbr_4.plot(figsize=(12,5))
# find users with NaN values, then drop them
null_uid = chb_df_98[chb_df_98.lr01.isna()].uid.unique()
print('98 - NaN user index : {}'.format(null_uid))
for i in null_uid:
    chb_df_98 = chb_df_98[chb_df_98.uid != i]
null_uid = chb_df_35[chb_df_35.lr01.isna()].uid.unique()
print('35 - NaN user index : {}'.format(null_uid))
for i in null_uid:
    chb_df_35 = chb_df_35[chb_df_35.uid != i]
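Before modeling, it can help to eyeball whether lr01 actually separates the three groups. A small sketch (my addition), assuming each remaining user's rows are stored in time order:
tmp = chb_df_98.copy()
tmp['t'] = tmp.groupby('uid').cumcount()  # timestep within each user
tmp.groupby(['Category', 't']).lr01.mean().unstack(0).plot(
    figsize=(12, 5), title='Mean lr01 trace per group (sketch)')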
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, ExtraTreesClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, precision_score, recall_score, roc_auc_score
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay  # plot_confusion_matrix was removed in scikit-learn 1.2
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, \
    QuantileTransformer, PowerTransformer, Normalizer
print("사용할 수 있는 피처 목록 : \n{}".format(chb_df_98.columns[1:-1].tolist()))
train = chb_df_98.copy()
test = chb_df_35.copy()
feature = 'lr01'
time_seq = 3400  # or len(chb_df_98[chb_df_98.uid == chb_df_98.uid.unique()[0]])
train_user_num = len(train.uid.unique())  # number of training users
test_user_num = len(test.uid.unique())    # number of test users
x_train_hbo = np.array(train[feature].values).reshape(train_user_num, time_seq)  # to numpy, one row per user
y_train = np.array(train.drop_duplicates(['uid']).Category.values)  # one label per user
x_test_hbo = np.array(test[feature].values).reshape(test_user_num, time_seq)  # to numpy, one row per user
y_test = np.array(test.drop_duplicates(['uid']).Category.values)  # one label per user
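The two reshape calls silently assume every remaining user contributes exactly time_seq rows; a quick sanity check (my addition) makes that assumption explicit:
counts = train.groupby('uid').size()
assert (counts == time_seq).all(), \
    "users with unexpected row counts: {}".format(counts[counts != time_seq].index.tolist())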
########## uncomment the scaler you want to use
# scaler = PowerTransformer()   # map features toward a normal distribution
# scaler = StandardScaler()     # zero mean, unit variance (suits classification) - sensitive to outliers
scaler = Normalizer()           # scale every sample to unit length
# scaler = MinMaxScaler()       # squeeze all values into [0, 1] - sensitive to outliers
# scaler = RobustScaler()       # scale by median and quartiles (robust to outliers)
# scaler = QuantileTransformer(output_distribution='normal')
#   maps the data onto 1000 quantiles; similar to RobustScaler while compressing values into [0, 1]
x_train_hbo = scaler.fit_transform(x_train_hbo)  # fit the scaler on the training data
x_test_hbo = scaler.transform(x_test_hbo)        # apply the same transform to the test data
x_train_hbo.shape, y_train.shape, x_test_hbo.shape, y_test.shape
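Which scaler works best is an empirical question. A small sweep (my addition) can compare the candidates by macro F1 on the 35-user set; note it peeks at the hold-out data, so treat it as orientation only, not model selection:
raw_train = np.array(train[feature].values).reshape(train_user_num, time_seq)
raw_test = np.array(test[feature].values).reshape(test_user_num, time_seq)
for cand in [Normalizer(), StandardScaler(), MinMaxScaler(), RobustScaler(), PowerTransformer()]:
    clf = RandomForestClassifier(random_state=42)
    clf.fit(cand.fit_transform(raw_train), y_train)          # fit scaler + forest on train
    pred = clf.predict(cand.transform(raw_test))             # score on the hold-out set
    print(type(cand).__name__, round(f1_score(y_test, pred, average='macro'), 3))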
rf = RandomForestClassifier(random_state=42)
rf.fit(x_train_hbo, y_train)
print("Random Forest \n")
print("Training set")
print(" - accuracy : {:.3f}".format(rf.score(x_train_hbo, y_train)))
print(" - precision : {:.3f}".format(precision_score(y_train, rf.predict(x_train_hbo), average="macro")))
print(" - recall : {:.3f}".format(recall_score(y_train, rf.predict(x_train_hbo), average="macro")))
print(" - AUC : {:.3f}".format(roc_auc_score(y_train, rf.predict_proba(x_train_hbo), multi_class='ovr')))
label = ['AD', 'CN', 'MCI']  # label order shown on the plot
plot = ConfusionMatrixDisplay.from_estimator(
    rf,                     # fitted classifier
    x_train_hbo, y_train,   # data and true labels (y_true)
    display_labels=label,   # labels displayed on the axes
    cmap=plt.cm.Blues,      # colormap (plt.cm.Reds, plt.cm.rainbow, ...)
    normalize=None)         # one of 'true', 'pred', 'all'; default=None
plot.ax_.set_title('Confusion Matrix - Train')
print("테스트 세트 ")
print(" - 정확도(acc) : {:.3f}".format(rf.score(x_test_hbo, y_test)))
print(" - 정밀도(precision) : {:.3f}".format( precision_score(y_test, rf.predict(x_test_hbo),average= "macro") ))
print(" - 재현율(recall) : {:.3f}".format( recall_score(y_test, rf.predict(x_test_hbo),average= "macro") ))
print(" - AUC 점수 : {:.3f}".format( roc_auc_score(y_test, rf.predict_proba(x_test_hbo), multi_class='ovr') ) , end='\n')
label = ['AD', 'CN', 'MCI']  # label order shown on the plot
plot = ConfusionMatrixDisplay.from_estimator(
    rf,                     # fitted classifier
    x_test_hbo, y_test,     # data and true labels (y_true)
    display_labels=label,   # labels displayed on the axes
    cmap=plt.cm.Blues,      # colormap (plt.cm.Reds, plt.cm.rainbow, ...)
    normalize=None)         # one of 'true', 'pred', 'all'; default=None
plot.ax_.set_title('Confusion Matrix - Test')
def plot_feature_importances(model):
    # horizontal bar chart of per-timestep feature importances
    n_features = x_train_hbo.shape[1]
    plt.barh(range(n_features), model.feature_importances_, align='center')
    plt.xlabel("Feature importance")
    plt.ylabel("Feature")