Install dependencies

In [ ]:
from google.colab import drive
drive.mount('/content/drive')
Mounted at /content/drive
In [ ]:
%cd '/content/drive/MyDrive/CS460 ML Project /CODES/EXPERIMENTS/ARIMA'
/content/drive/MyDrive/CS460 ML Project /CODES/EXPERIMENTS/ARIMA
In [ ]:
!pip install pmdarima
Requirement already satisfied: pmdarima in /usr/local/lib/python3.7/dist-packages (1.8.3)
Requirement already satisfied: numpy>=1.19.3 in /usr/local/lib/python3.7/dist-packages (from pmdarima) (1.19.5)
Requirement already satisfied: statsmodels!=0.12.0,>=0.11 in /usr/local/lib/python3.7/dist-packages (from pmdarima) (0.13.0)
Requirement already satisfied: scipy>=1.3.2 in /usr/local/lib/python3.7/dist-packages (from pmdarima) (1.4.1)
Requirement already satisfied: Cython!=0.29.18,>=0.29 in /usr/local/lib/python3.7/dist-packages (from pmdarima) (0.29.24)
Requirement already satisfied: setuptools!=50.0.0,>=38.6.0 in /usr/local/lib/python3.7/dist-packages (from pmdarima) (57.4.0)
Requirement already satisfied: pandas>=0.19 in /usr/local/lib/python3.7/dist-packages (from pmdarima) (1.1.5)
Requirement already satisfied: urllib3 in /usr/local/lib/python3.7/dist-packages (from pmdarima) (1.24.3)
Requirement already satisfied: scikit-learn>=0.22 in /usr/local/lib/python3.7/dist-packages (from pmdarima) (0.22.2.post1)
Requirement already satisfied: joblib>=0.11 in /usr/local/lib/python3.7/dist-packages (from pmdarima) (1.0.1)
Requirement already satisfied: python-dateutil>=2.7.3 in /usr/local/lib/python3.7/dist-packages (from pandas>=0.19->pmdarima) (2.8.2)
Requirement already satisfied: pytz>=2017.2 in /usr/local/lib/python3.7/dist-packages (from pandas>=0.19->pmdarima) (2018.9)
Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.7/dist-packages (from python-dateutil>=2.7.3->pandas>=0.19->pmdarima) (1.15.0)
Requirement already satisfied: patsy>=0.5.2 in /usr/local/lib/python3.7/dist-packages (from statsmodels!=0.12.0,>=0.11->pmdarima) (0.5.2)

Import libraries

In [ ]:
import matplotlib.pyplot as plt
import csv
import pandas as pd
from pmdarima import auto_arima
from statsmodels.tsa.arima.model import ARIMA

Read data

In [ ]:
df = pd.read_csv('Maharashtra.csv',index_col='DATE',parse_dates=True)
df=df.dropna()
df.shape
df.head()
Out[ ]:
ACTIVE RECOVERED DEATH TESTED
DATE
2020-03-09 2 0 0 0
2020-03-10 5 0 0 0
2020-03-11 11 0 0 0
2020-03-12 14 0 0 0
2020-03-13 17 0 0 0

Stepwise fit by auto_arima

In [ ]:
stepwise_fit = auto_arima(df['ACTIVE'],trace=True)
stepwise_fit.summary()
Performing stepwise search to minimize aic
 ARIMA(2,1,2)(0,0,0)[0] intercept   : AIC=11564.202, Time=1.15 sec
 ARIMA(0,1,0)(0,0,0)[0] intercept   : AIC=12118.083, Time=0.03 sec
 ARIMA(1,1,0)(0,0,0)[0] intercept   : AIC=11752.160, Time=0.05 sec
 ARIMA(0,1,1)(0,0,0)[0] intercept   : AIC=11929.621, Time=0.28 sec
 ARIMA(0,1,0)(0,0,0)[0]             : AIC=12116.113, Time=0.02 sec
 ARIMA(1,1,2)(0,0,0)[0] intercept   : AIC=11577.615, Time=0.25 sec
 ARIMA(2,1,1)(0,0,0)[0] intercept   : AIC=11578.291, Time=0.30 sec
 ARIMA(3,1,2)(0,0,0)[0] intercept   : AIC=11572.432, Time=0.76 sec
 ARIMA(2,1,3)(0,0,0)[0] intercept   : AIC=11573.650, Time=0.65 sec
 ARIMA(1,1,1)(0,0,0)[0] intercept   : AIC=11579.352, Time=0.32 sec
 ARIMA(1,1,3)(0,0,0)[0] intercept   : AIC=11579.474, Time=0.38 sec
 ARIMA(3,1,1)(0,0,0)[0] intercept   : AIC=11579.938, Time=0.35 sec
 ARIMA(3,1,3)(0,0,0)[0] intercept   : AIC=11544.602, Time=1.69 sec
 ARIMA(4,1,3)(0,0,0)[0] intercept   : AIC=11549.299, Time=1.44 sec
 ARIMA(3,1,4)(0,0,0)[0] intercept   : AIC=11550.571, Time=2.17 sec
 ARIMA(2,1,4)(0,0,0)[0] intercept   : AIC=11575.652, Time=0.70 sec
 ARIMA(4,1,2)(0,0,0)[0] intercept   : AIC=11567.082, Time=1.52 sec
 ARIMA(4,1,4)(0,0,0)[0] intercept   : AIC=11551.048, Time=2.24 sec
 ARIMA(3,1,3)(0,0,0)[0]             : AIC=11542.610, Time=1.41 sec
 ARIMA(2,1,3)(0,0,0)[0]             : AIC=11571.650, Time=0.53 sec
 ARIMA(3,1,2)(0,0,0)[0]             : AIC=11570.432, Time=0.59 sec
 ARIMA(4,1,3)(0,0,0)[0]             : AIC=11547.300, Time=1.24 sec
 ARIMA(3,1,4)(0,0,0)[0]             : AIC=11548.580, Time=1.82 sec
 ARIMA(2,1,2)(0,0,0)[0]             : AIC=11562.047, Time=0.90 sec
 ARIMA(2,1,4)(0,0,0)[0]             : AIC=11573.652, Time=0.60 sec
 ARIMA(4,1,2)(0,0,0)[0]             : AIC=11565.070, Time=1.32 sec
 ARIMA(4,1,4)(0,0,0)[0]             : AIC=11548.925, Time=2.16 sec

Best model:  ARIMA(3,1,3)(0,0,0)[0]          
Total fit time: 24.915 seconds
Out[ ]:
SARIMAX Results
Dep. Variable: y No. Observations: 593
Model: SARIMAX(3, 1, 3) Log Likelihood -5764.305
Date: Fri, 29 Oct 2021 AIC 11542.610
Time: 15:03:33 BIC 11573.295
Sample: 0 HQIC 11554.562
- 593
Covariance Type: opg
coef std err z P>|z| [0.025 0.975]
ar.L1 -0.8129 0.014 -59.913 0.000 -0.839 -0.786
ar.L2 0.7589 0.013 60.239 0.000 0.734 0.784
ar.L3 0.9490 0.015 63.639 0.000 0.920 0.978
ma.L1 1.0776 0.028 38.969 0.000 1.023 1.132
ma.L2 -0.2356 0.035 -6.806 0.000 -0.304 -0.168
ma.L3 -0.6110 0.023 -26.051 0.000 -0.657 -0.565
sigma2 1.664e+07 1.16e-09 1.43e+16 0.000 1.66e+07 1.66e+07
Ljung-Box (L1) (Q): 0.25 Jarque-Bera (JB): 1905.84
Prob(Q): 0.62 Prob(JB): 0.00
Heteroskedasticity (H): 5.42 Skew: -1.21
Prob(H) (two-sided): 0.00 Kurtosis: 11.45


Warnings:
[1] Covariance matrix calculated using the outer product of gradients (complex-step).
[2] Covariance matrix is singular or near-singular, with condition number 5.31e+31. Standard errors may be unstable.

Train

In [ ]:
train = df.iloc[:-15]
test = df.iloc[-15:]


model=ARIMA(train['ACTIVE'],order=(3,1,3))
model=model.fit()
model.summary()
/usr/local/lib/python3.7/dist-packages/statsmodels/tsa/base/tsa_model.py:539: ValueWarning: No frequency information was provided, so inferred frequency D will be used.
  % freq, ValueWarning)
/usr/local/lib/python3.7/dist-packages/statsmodels/tsa/base/tsa_model.py:539: ValueWarning: No frequency information was provided, so inferred frequency D will be used.
  % freq, ValueWarning)
/usr/local/lib/python3.7/dist-packages/statsmodels/tsa/base/tsa_model.py:539: ValueWarning: No frequency information was provided, so inferred frequency D will be used.
  % freq, ValueWarning)
/usr/local/lib/python3.7/dist-packages/statsmodels/tsa/statespace/sarimax.py:966: UserWarning: Non-stationary starting autoregressive parameters found. Using zeros as starting parameters.
  warn('Non-stationary starting autoregressive parameters'
/usr/local/lib/python3.7/dist-packages/statsmodels/tsa/statespace/sarimax.py:978: UserWarning: Non-invertible starting MA parameters found. Using zeros as starting parameters.
  warn('Non-invertible starting MA parameters found.'
/usr/local/lib/python3.7/dist-packages/statsmodels/base/model.py:606: ConvergenceWarning: Maximum Likelihood optimization failed to converge. Check mle_retvals
  ConvergenceWarning)
Out[ ]:
SARIMAX Results
Dep. Variable: ACTIVE No. Observations: 578
Model: ARIMA(3, 1, 3) Log Likelihood -5625.580
Date: Fri, 29 Oct 2021 AIC 11265.160
Time: 15:04:09 BIC 11295.664
Sample: 03-09-2020 HQIC 11277.055
- 10-07-2021
Covariance Type: opg
coef std err z P>|z| [0.025 0.975]
ar.L1 -0.8216 0.013 -62.033 0.000 -0.848 -0.796
ar.L2 0.7475 0.013 56.601 0.000 0.722 0.773
ar.L3 0.9470 0.015 64.889 0.000 0.918 0.976
ma.L1 1.0919 0.028 39.629 0.000 1.038 1.146
ma.L2 -0.2157 0.036 -5.957 0.000 -0.287 -0.145
ma.L3 -0.6053 0.024 -25.500 0.000 -0.652 -0.559
sigma2 1.707e+07 1.06e-09 1.6e+16 0.000 1.71e+07 1.71e+07
Ljung-Box (L1) (Q): 0.14 Jarque-Bera (JB): 1712.40
Prob(Q): 0.71 Prob(JB): 0.00
Heteroskedasticity (H): 8.51 Skew: -1.17
Prob(H) (two-sided): 0.00 Kurtosis: 11.11


Warnings:
[1] Covariance matrix calculated using the outer product of gradients (complex-step).
[2] Covariance matrix is singular or near-singular, with condition number 1.83e+31. Standard errors may be unstable.

Predict

In [ ]:
start=len(train)
end=len(train)+len(test)-1
pred=model.predict(start=start,end=end,typ='levels')
pred.index=df.index[start:end+1]
pred
Out[ ]:
DATE
2021-10-08    36574.415094
2021-10-09    36332.761060
2021-10-10    36203.189624
2021-10-11    35739.246233
2021-10-12    35794.723360
2021-10-13    35279.629252
2021-10-14    35304.964687
2021-10-15    34951.630151
2021-10-16    34773.095618
2021-10-17    34679.638040
2021-10-18    34288.368591
2021-10-19    34370.905217
2021-10-20    33922.104093
2021-10-21    33982.020700
2021-10-22    33675.455313
Name: predicted_mean, dtype: float64

Check

In [ ]:
pred.plot(legend=True)
test['ACTIVE'].plot(legend=True)
Out[ ]:
<matplotlib.axes._subplots.AxesSubplot at 0x7fa6404a30d0>
In [ ]:
from sklearn import metrics
In [ ]:
metrics.mean_absolute_error(test['ACTIVE'], pred)
Out[ ]:
2385.227698585097
In [ ]:
metrics