importing the libraries

In [1]:

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.tsa.seasonal import seasonal_decompose

importing the data set

In [2]:

df = pd.read_csv("Master_Product_Sales_Data.csv")

In [3]:

df

Out[3]:

Product_CodeWarehouseProduct_CategoryDateOrder_DemandTotal Sales
0Product_0993Whse_JCategory_0282012/7/2710013400
1Product_0979Whse_JCategory_0282012/1/1950052500
2Product_0979Whse_JCategory_0282012/2/350052500
3Product_0979Whse_JCategory_0282012/2/950052500
4Product_0979Whse_JCategory_0282012/3/250052500
1037331Product_1791Whse_JCategory_0062016/4/271000108000
1037332Product_1974Whse_JCategory_0062016/4/27161
1037333Product_1787Whse_JCategory_0062016/4/282500160000
1037334Product_0901Whse_JCategory_0232016/10/7506750
1037335Product_0704Whse_JCategory_0012016/6/274100

1037336 rows × 6 columns

Cleaning the dataset

In [4]:

df.nunique()

Out[4]:

Product_Code         2160
Warehouse               4
Product_Category       33
Date                 1729
Order_Demand         3309
Total Sales         17412
dtype: int64

In [5]:

df = df[['Date', 'Order_Demand']]

In [6]:

df['Date'] = pd.to_datetime(df['Date'], format = '%Y/%m/%d')
<ipython-input-6-d1cefc9bc903>:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Date'] = pd.to_datetime(df['Date'], format = '%Y/%m/%d')

In [7]:

df = df.set_index('Date')

In [8]:

dataset = df[['Order_Demand']]

In [9]:

dataset = dataset.resample('M').sum()

In [10]:

# removing the outliers
dataset = dataset.iloc[12:-1,:]

ETS Decompostion

In [11]:

ets_result = seasonal_decompose(dataset, model = 'additive')

In [12]:

ets_result.plot()

Out[12]:

In [13]:

pip install pmdarima

In [14]:

# Import the library
from pmdarima import auto_arima

Parameter analysis for ARIMA model

In [15]:

import warnings

In [16]:

warnings.filterwarnings("ignore")

In [17]:

stepwise_fit = auto_arima(dataset['Order_Demand'], start_p = 1, start_q = 1, max_p=3, max_q = 3, m =12, seasonal = True, d=None, D=1, trace= True, error_action = 'ignore', surpress_warnings = True, stepwise = True)
Performing stepwise search to minimize aic
 ARIMA(1,1,1)(1,1,1)[12]             : AIC=inf, Time=0.13 sec
 ARIMA(0,1,0)(0,1,0)[12]             : AIC=1656.011, Time=0.01 sec
 ARIMA(1,1,0)(1,1,0)[12]             : AIC=1657.591, Time=0.03 sec
 ARIMA(0,1,1)(0,1,1)[12]             : AIC=1654.560, Time=0.05 sec
 ARIMA(0,1,1)(0,1,0)[12]             : AIC=1657.739, Time=0.02 sec
 ARIMA(0,1,1)(1,1,1)[12]             : AIC=inf, Time=0.10 sec
 ARIMA(0,1,1)(0,1,2)[12]             : AIC=1651.070, Time=0.10 sec
 ARIMA(0,1,1)(1,1,2)[12]             : AIC=1653.068, Time=0.19 sec
 ARIMA(0,1,0)(0,1,2)[12]             : AIC=1650.632, Time=0.06 sec
 ARIMA(0,1,0)(0,1,1)[12]             : AIC=1653.385, Time=0.02 sec
 ARIMA(0,1,0)(1,1,2)[12]             : AIC=1652.620, Time=0.14 sec
 ARIMA(0,1,0)(1,1,1)[12]             : AIC=1651.554, Time=0.06 sec
 ARIMA(1,1,0)(0,1,2)[12]             : AIC=1651.488, Time=0.09 sec
 ARIMA(1,1,1)(0,1,2)[12]             : AIC=1652.010, Time=0.21 sec
 ARIMA(0,1,0)(0,1,2)[12] intercept   : AIC=1652.762, Time=0.08 sec

Best model:  ARIMA(0,1,0)(0,1,2)[12]          
Total fit time: 1.276 seconds

In [18]:

stepwise_fit.summary()

Out[18]:

Dep. Variable:yNo. Observations:60
Model:SARIMAX(0, 1, 0)x(0, 1, [1, 2], 12)Log Likelihood-822.316
Date:Thu, 04 Nov 2021AIC1650.632
Time:14:47:39BIC1656.182
Sample:0HQIC1652.720
– 60
Covariance Type:opg
coefstd errzP>|z|[0.0250.975]
ma.S.L12-0.34820.276-1.2630.207-0.8890.192
ma.S.L24-0.28900.223-1.2970.195-0.7260.148
sigma21.043e+143.45e-163.03e+290.0001.04e+141.04e+14
Ljung-Box (L1) (Q):4.49Jarque-Bera (JB):1.10
Prob(Q):0.03Prob(JB):0.58
Heteroskedasticity (H):1.01Skew:-0.30
Prob(H) (two-sided):0.99Kurtosis:2.54

Warnings:
[1] Covariance matrix calculated using the outer product of gradients (complex-step).
[2] Covariance matrix is singular or near-singular, with condition number 2.78e+45. Standard errors may be unstable.

split into train test sets

In [19]:

train = dataset.iloc[:len(dataset)-12]
test = dataset.iloc[len(dataset)-12:]

In [20]:

# Fit ARIMA model

In [21]:

from statsmodels.tsa.statespace.sarimax import SARIMAX

In [22]:

model = SARIMAX(train['Order_Demand'],
               order=(0,1,0),
               seasonal_order=(0,1,[1,2],12))

In [23]:

result = model.fit()

In [24]:

result.summary()

Out[24]:

Dep. Variable:Order_DemandNo. Observations:48
Model:SARIMAX(0, 1, 0)x(0, 1, [1, 2], 12)Log Likelihood-611.617
Date:Thu, 04 Nov 2021AIC1229.234
Time:14:47:39BIC1233.900
Sample:01-31-2012HQIC1230.845
– 12-31-2015
Covariance Type:opg
coefstd errzP>|z|[0.0250.975]
ma.S.L12-0.31170.305-1.0230.306-0.9090.285
ma.S.L24-0.23820.206-1.1550.248-0.6420.166
sigma29.636e+13nannannannannan
Ljung-Box (L1) (Q):1.93Jarque-Bera (JB):0.63
Prob(Q):0.17Prob(JB):0.73
Heteroskedasticity (H):0.53Skew:-0.05
Prob(H) (two-sided):0.29Kurtosis:2.35

Warnings:
[1] Covariance matrix calculated using the outer product of gradients (complex-step).
[2] Covariance matrix is singular or near-singular, with condition number 2.47e+45. Standard errors may be unstable.In [25]:

# Pred against test set

In [26]:

start = len(train)
end = len(train) + len(test) - 1

In [27]:

# Predictions for one-year against the test set
predictions = result.predict(start, end,
                             typ = 'levels').rename("Predictions")

In [28]:

predictions.plot(legend = True, color = "#951f53")
test['Order_Demand'].plot(legend = True)
train['Order_Demand'].plot(legend = True)

Out[28]:

<AxesSubplot:xlabel='Date'>

In [29]:

#notes: https://www.geeksforgeeks.org/python-arima-model-for-time-series-forecasting/

Evaluare the model using MSE and RMSE

In [30]:

from sklearn.metrics import mean_squared_error
from statsmodels.tools.eval_measures import rmse
from sklearn.metrics import accuracy_score 

In [31]:

#calculating the root mean squared error
rmse(test["Order_Demand"], predictions)

Out[31]:

12228976.658876069

In [32]:

#calculating the mean squared error
mean_squared_error(test["Order_Demand"], predictions)

Out[32]:

149547870123335.72

In [33]:

y_test=test["Order_Demand"].values

In [34]:

y_pred = predictions.values

In [35]:

comapre = pd.DataFrame(zip(y_test,y_pred), columns = ['y_test','y_pred'])

In [36]:

comapre['diff'] = comapre['y_pred']/(comapre['y_test']-1)

In [37]:

comapre

Out[37]:

y_testy_preddiff
0786276191.016541e+081.292855
1740650419.009375e+071.216414
2933039101.021972e+081.095316
3795033649.487303e+071.193321
4802995938.634700e+071.075311
5845530119.454459e+071.118169
6884399361.023227e+081.156974
7804717728.605685e+071.069404
8776988968.893183e+071.144570
9840007579.335134e+071.111316
10901285688.732978e+070.968947
11804979329.100245e+071.130494

Futuer Pred on All Data

In [38]:

model = SARIMAX(dataset['Order_Demand'],
               order=(0,1,0),
               seasonal_order=(0,1,[1,2],12))

In [39]:

result = model.fit()

In [40]:

start_futuer = len(dataset)
end_futuer = len(dataset) + 12

In [41]:

# Predictions for one-year against the test set
predictions = result.predict(start_futuer, end_futuer,
                             typ = 'levels').rename("Predictions")

In [42]:

predictions.plot(legend = True, color = "#951f53")
dataset['Order_Demand'].plot(legend = True)

Out[42]:

<AxesSubplot:xlabel='Date'>

In [43]:

output = dataset.reset_index()

In [44]:

predictions

Out[44]:

2017-01-31    7.959537e+07
2017-02-28    7.678775e+07
2017-03-31    9.213604e+07
2017-04-30    8.136279e+07
2017-05-31    8.199487e+07
2017-06-30    8.513333e+07
2017-07-31    8.836628e+07
2017-08-31    7.844622e+07
2017-09-30    7.932424e+07
2017-10-31    8.721598e+07
2017-11-30    8.960154e+07
2017-12-31    8.270573e+07
2018-01-31    8.841014e+07
Freq: M, Name: Predictions, dtype: float64

In [45]:

pred = pd.DataFrame({'Date':predictions.index, "Order_Demand":predictions.values})

In [46]:

pred

Out[46]:

DateOrder_Demand
02017-01-317.959537e+07
12017-02-287.678775e+07
22017-03-319.213604e+07
32017-04-308.136279e+07
42017-05-318.199487e+07
52017-06-308.513333e+07
62017-07-318.836628e+07
72017-08-317.844622e+07
82017-09-307.932424e+07
92017-10-318.721598e+07
102017-11-308.960154e+07
112017-12-318.270573e+07
122018-01-318.841014e+07

In [47]:

full_pred_dataset = pd.concat([output,pred])