파이썬 랜덤 포레스트 요인 중요도 분석 및 그래프 그리기

랜덤 포레스트로 분류(예측)하는 것이 목적이 아니라, 모델(아래 코드에서는 회귀 모델)만 학습한 뒤 그 모델에서 각 요인이 얼마나 기여하는지를 보기 위한 분석입니다.

필요한 라이브러리 불러오기

from sklearn.ensemble import RandomForestRegressor # random-forest regression model
from sklearn.model_selection import train_test_split # train/test split
from sklearn.metrics import mean_squared_error # mean squared error
from sklearn.model_selection import cross_val_score # cross-validation helper
import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV # hyperparameter tuning
import matplotlib.pyplot as plt
import seaborn as sns
# IPython/Jupyter magic (not plain Python): render figures inline in the notebook.
%matplotlib inline

데이터 파일 불러오기

# Mount Google Drive into the Colab runtime so the CSV stored there is readable.
from google.colab import drive

drive.mount('/content/drive')

# '{파일명}' ("file name") is a template placeholder for the real CSV name.
path = '/content/drive/MyDrive/data/{파일명}.csv'
data = pd.read_csv(path)

데이터 전처리

# Feature matrix: every column of `data` except the ones listed below.
# The '{...}' names are template placeholders for real column names; the
# target column '{결과값}' ("result value") is dropped too, so it is not
# used as a feature.
data1 = data.drop(['{열1}', '{열2}', '{열3}', '{열4}', '{결과값}'], axis='columns')

# Target vector. Bracket indexing replaces the original `data.{결과값}`, which
# is not valid Python syntax; it also uses the same string key as the drop
# list above and works for column names that are not valid identifiers.
y1 = data['{결과값}']

하이퍼 파라미터 최적화의 경우

# Hold out 20% of the rows for validation (fixed seed for reproducibility).
# The features are `data1` from the preprocessing step; the original passed
# an undefined name `x` here, which raised NameError.
x_train, x_valid, y_train, y_valid = train_test_split(
    data1, y1, test_size=0.2, random_state=0
)

# Manually chosen candidate values; GridSearchCV tries every combination.
params = {
    'n_estimators': (100, 200),
    'max_depth': (8, 10),
    'min_samples_leaf': (8, 10),
    'min_samples_split': (8, 16),
}

regressor = RandomForestRegressor(random_state=0, n_jobs=-1)
grid_cv = GridSearchCV(regressor, param_grid=params, cv=2, n_jobs=-1)
grid_cv.fit(x_train, y_train)

print('최적 하이퍼 파라미터:', grid_cv.best_params_)
# NOTE: no `scoring` argument is given, so best_score_ is the estimator's
# default score (R^2 for a regressor), not a classification accuracy.
print('최적 예측 정확도: {0:.4f}'.format(grid_cv.best_score_))

하이퍼 파라미터 적용 후 학습 진행(모델 생성)

# Train the model with fixed hyperparameters.
# NOTE(review): these values are hard-coded; presumably they were copied from
# the grid-search output printed earlier -- confirm after re-running the search.
regressor = RandomForestRegressor(
    n_estimators=200,
    max_depth=8,
    min_samples_leaf=8,
    min_samples_split=8,
    random_state=0,
)
regressor.fit(x_train, y_train)

하이퍼파라미터 최적화 없이 모델 생성

# Alternative path: train with default hyperparameters on the full data set.
regressor = RandomForestRegressor()

# NOTE(review): `rf_run` is constructed but never fitted or used in the
# visible code; kept in case other cells reference it.
rf_run = RandomForestRegressor(random_state=0, n_jobs=-1)

# Fit on the feature frame built during preprocessing. The original passed an
# undefined name `x1`, which raised NameError.
regressor.fit(data1, y1)

그래프 그리기

# Per-feature importance scores from the fitted forest.
ftr_importances_values = regressor.feature_importances_

# Label each score with its column name. `data1` is the feature frame the
# model inputs were taken from; the original indexed by `x_train2.columns`,
# a name that is never defined.
ftr_importances = pd.Series(ftr_importances_values, index=data1.columns)

# Keep the 20 most important features, highest first (bar lengths / labels).
ftr_top = ftr_importances.sort_values(ascending=False).head(20)

# Grey colour palette for the bars.
sns.set_palette('Greys', n_colors=7)

# Create the figure and axes in one call so the requested size applies to the
# figure that is actually drawn on. The original called plt.figure(figsize=...)
# and then plt.subplots(), which opened a second, default-sized figure and
# left the first one empty.
fig, ax = plt.subplots(figsize=(8, 6))

# Horizontal bars: importance on x, feature name on y, drawn on `ax` explicitly.
splot = sns.barplot(x=ftr_top, y=ftr_top.index, ax=ax)

# Write each bar's value (3 decimals) next to it. Bars shorter than 0.5 get
# the label at the bar's end; longer bars get it at the fixed position x=0.55.
for p in splot.patches:
    bar_len = p.get_width()
    anchor_x = bar_len if bar_len < 0.5 else 0.55
    ax.annotate(
        "%.3f" % bar_len,
        xy=(anchor_x, p.get_y() + p.get_height() / 2),
        xytext=(5, 0),
        textcoords='offset points',
        ha="left",
        va="center",
        size=8,
    )

plt.show()

그래프 왼쪽에 인덱스(요인 이름)도 나오는데, 마스킹을 안 해서 이미지에서 잘라냈습니다.

은하수가 보이는 장소

파이썬 랜덤 포레스트 요인 중요도 분석 및 그래프 그리기

티스토리툴바