오답노트
[ML] 전통적 시계열 모델링 - ARIMA 본문
전처리
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats
import statsmodels.api as sm
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from sklearn.metrics import *
# Residual diagnostics: a well-specified model should leave residuals that are
# normally distributed, stationary white noise with no autocorrelation.
def residual_diag(residuals, lags=20):
    """Print normality/stationarity test p-values and plot ACF/PACF.

    Parameters
    ----------
    residuals : array-like
        Residual series from a fitted time-series model.
    lags : int, default 20
        Number of lags shown in the ACF/PACF plots.
    """
    # Shapiro-Wilk normality test: p > 0.05 -> residuals look Gaussian.
    print('* 정규성 검정(> 0.05) : ', round(stats.shapiro(residuals)[1], 5))
    # ADF stationarity test: p < 0.05 -> residuals are stationary.
    print('* 정상성 검정(< 0.05) : ', round(sm.tsa.stattools.adfuller(residuals)[1], 5))
    print('* 자기상관성 확인(ACF, PACF)')
    # Side-by-side ACF / PACF to spot leftover autocorrelation structure.
    _, ax = plt.subplots(1, 2, figsize=(15, 5))
    plot_acf(residuals, lags=lags, ax=ax[0])
    plot_pacf(residuals, lags=lags, ax=ax[1])
    plt.show()
# Load the retail demand series, keeping only the columns we need.
path = 'retail_demand2.csv'
data = pd.read_csv(path, usecols=['date', 'sales', 'tot_sales', 'comp_sales'])

# Keep records up to (and including) the cutoff date.
cutoff = data['date'] <= '2015-10-31'
data = data[cutoff].reset_index(drop=True)

# Copy the date into DT and promote it to the index so the date column survives.
data['DT'] = data['date']
data = data.set_index('DT')

# Declare a daily frequency on the index.
df = data.asfreq('D')

# Target: next day's sales (shift -1 aligns tomorrow's value with today's row).
df['y'] = df['sales'].shift(-1)
# The shift leaves a trailing NaN row; drop incomplete rows.
df = df.dropna(axis=0)
데이터 분할
# Separate features from the target; 'date' is redundant with the index.
target = 'y'
x = df.drop(columns=[target, 'date'])
y = df[target]
from sklearn.model_selection import TimeSeriesSplit

# Validation horizon and fold count for walk-forward evaluation.
val_size = 30
nfold = 3

# Expanding-window splitter: each of the nfold folds validates
# on the next val_size observations after its training window.
tscv = TimeSeriesSplit(n_splits=nfold, test_size=val_size)
ARIMA
# Hold out the last val_size days for validation (consistent with the tscv
# setup above, instead of a hard-coded 30).
train = y[:-val_size]
val = y[-val_size:]

# Fit two candidates: ARMA(1,1) on levels vs ARIMA(1,1,1) on first differences.
model1_1 = sm.tsa.SARIMAX(train, order=(1, 0, 1)).fit()  # ARMA
model1_2 = sm.tsa.SARIMAX(train, order=(1, 1, 1)).fit()  # ARIMA

# Residual diagnostics for the differenced model.
residuals = model1_2.resid
residual_diag(residuals)

# Compare information criteria (lower is better).
print('model1 AIC :', model1_1.aic)
print('model2 AIC :', model1_2.aic)

# Validation error over the held-out horizon; printed so the value is
# actually reported when run as a script (a bare expression is discarded).
pred = model1_2.forecast(val_size)
print('model2 MAE :', mean_absolute_error(val, pred))
hyper parameter tuning
from itertools import product

# Candidate orders: every (p, d, q) combination in the grid.
# Named param_grid rather than `iter`, which shadows the builtin.
p = [0, 1, 2, 3, 4]
q = [0, 1, 2, 3, 4]
d = [0, 1]
param_grid = list(product(p, d, q))

# Grid search: fit each order, score by validation MAE and AIC.
mae, aic = [], []
for order in param_grid:
    model_fit = sm.tsa.SARIMAX(train, order=order).fit()
    # Forecast the same horizon as the validation set (not a hard-coded 30).
    pred = model_fit.forecast(val_size)
    mae.append(mean_absolute_error(val, pred))
    aic.append(model_fit.aic)
    print(order)  # progress indicator

result = pd.DataFrame({'params(p,d,q)': param_grid, 'mae': mae, 'aic': aic})
# Refit the two best orders found by the grid search.
model2_1 = sm.tsa.SARIMAX(train, order=(3, 1, 3)).fit()
model2_2 = sm.tsa.SARIMAX(train, order=(4, 1, 4)).fit()

# Residual diagnostics for the (4,1,4) model.
residuals = model2_2.resid
residual_diag(residuals)

# Information criterion (lower is better).
print('model2 AIC :', model2_2.aic)

# Validation error over the held-out horizon (val_size, not a hard-coded 30);
# printed so the value is reported when run as a script.
pred = model2_2.forecast(val_size)
print('model2 MAE :', mean_absolute_error(val, pred))
Cross Validation
# Walk-forward cross-validation of the chosen ARIMA(4,1,4) specification.
rmse, mae, mape, aic = [], [], [], []
residuals = []
preds = []
p, d, q = 4, 1, 4
for train_index, val_index in tscv.split(x):
    # TimeSeriesSplit yields positional indices; use .iloc rather than
    # label-based [] lookup on the date-indexed Series.
    train = y.iloc[train_index]
    val = y.iloc[val_index]
    # Fit on this fold's history.
    model = sm.tsa.SARIMAX(train, order=(p, d, q)).fit()
    # Forecast the validation horizon.
    pred = model.forecast(val_size)
    preds += list(pred)
    # Keep residuals for later diagnostics.
    residuals += list(model.resid)
    # Per-fold metrics.
    rmse.append(mean_squared_error(val, pred, squared=False))
    mae.append(mean_absolute_error(val, pred))
    mape.append(mean_absolute_percentage_error(val, pred))
    aic.append(model.aic)
'Python > ML' 카테고리의 다른 글

| 글 | 댓글 | 작성일 |
|---|---|---|
| [ML] 모델에 대한 설명 - Permutation Feature Importance | (0) | 2022.09.05 |
| [ML] 모델에 대한 설명 | (0) | 2022.09.05 |
| [ML] 시계열 데이터 - 전통적 시계열 모델링 | (0) | 2022.08.29 |
| [ML] 시계열 데이터 | (0) | 2022.08.29 |
| [ML] 비지도 학습 - DBSCAN | (0) | 2022.08.29 |