importing the libraries
In [1]:
import numpy as np import matplotlib.pyplot as plt import pandas as pd from statsmodels.tsa.arima.model import ARIMA from statsmodels.tsa.seasonal import seasonal_decompose
importing the data set
In [2]:
df = pd.read_csv("Master_Product_Sales_Data.csv")
In [3]:
df
Out[3]:
Product_Code | Warehouse | Product_Category | Date | Order_Demand | Total Sales | |
---|---|---|---|---|---|---|
0 | Product_0993 | Whse_J | Category_028 | 2012/7/27 | 100 | 13400 |
1 | Product_0979 | Whse_J | Category_028 | 2012/1/19 | 500 | 52500 |
2 | Product_0979 | Whse_J | Category_028 | 2012/2/3 | 500 | 52500 |
3 | Product_0979 | Whse_J | Category_028 | 2012/2/9 | 500 | 52500 |
4 | Product_0979 | Whse_J | Category_028 | 2012/3/2 | 500 | 52500 |
… | … | … | … | … | … | … |
1037331 | Product_1791 | Whse_J | Category_006 | 2016/4/27 | 1000 | 108000 |
1037332 | Product_1974 | Whse_J | Category_006 | 2016/4/27 | 1 | 61 |
1037333 | Product_1787 | Whse_J | Category_006 | 2016/4/28 | 2500 | 160000 |
1037334 | Product_0901 | Whse_J | Category_023 | 2016/10/7 | 50 | 6750 |
1037335 | Product_0704 | Whse_J | Category_001 | 2016/6/27 | 4 | 100 |
1037336 rows × 6 columns
Cleaning the dataset
In [4]:
df.nunique()
Out[4]:
Product_Code 2160 Warehouse 4 Product_Category 33 Date 1729 Order_Demand 3309 Total Sales 17412 dtype: int64
In [5]:
df = df[['Date', 'Order_Demand']]
In [6]:
df['Date'] = pd.to_datetime(df['Date'], format = '%Y/%m/%d')
<ipython-input-6-d1cefc9bc903>:1: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy df['Date'] = pd.to_datetime(df['Date'], format = '%Y/%m/%d')
In [7]:
df = df.set_index('Date')
In [8]:
dataset = df[['Order_Demand']]
In [9]:
dataset = dataset.resample('M').sum()
In [10]:
# removing the outliers dataset = dataset.iloc[12:-1,:]
ETS Decompostion
In [11]:
ets_result = seasonal_decompose(dataset, model = 'additive')
In [12]:
ets_result.plot()
Out[12]:
In [13]:
pip install pmdarima
In [14]:
# Import the library from pmdarima import auto_arima
Parameter analysis for ARIMA model
In [15]:
import warnings
In [16]:
warnings.filterwarnings("ignore")
In [17]:
stepwise_fit = auto_arima(dataset['Order_Demand'], start_p = 1, start_q = 1, max_p=3, max_q = 3, m =12, seasonal = True, d=None, D=1, trace= True, error_action = 'ignore', surpress_warnings = True, stepwise = True)
Performing stepwise search to minimize aic ARIMA(1,1,1)(1,1,1)[12] : AIC=inf, Time=0.13 sec ARIMA(0,1,0)(0,1,0)[12] : AIC=1656.011, Time=0.01 sec ARIMA(1,1,0)(1,1,0)[12] : AIC=1657.591, Time=0.03 sec ARIMA(0,1,1)(0,1,1)[12] : AIC=1654.560, Time=0.05 sec ARIMA(0,1,1)(0,1,0)[12] : AIC=1657.739, Time=0.02 sec ARIMA(0,1,1)(1,1,1)[12] : AIC=inf, Time=0.10 sec ARIMA(0,1,1)(0,1,2)[12] : AIC=1651.070, Time=0.10 sec ARIMA(0,1,1)(1,1,2)[12] : AIC=1653.068, Time=0.19 sec ARIMA(0,1,0)(0,1,2)[12] : AIC=1650.632, Time=0.06 sec ARIMA(0,1,0)(0,1,1)[12] : AIC=1653.385, Time=0.02 sec ARIMA(0,1,0)(1,1,2)[12] : AIC=1652.620, Time=0.14 sec ARIMA(0,1,0)(1,1,1)[12] : AIC=1651.554, Time=0.06 sec ARIMA(1,1,0)(0,1,2)[12] : AIC=1651.488, Time=0.09 sec ARIMA(1,1,1)(0,1,2)[12] : AIC=1652.010, Time=0.21 sec ARIMA(0,1,0)(0,1,2)[12] intercept : AIC=1652.762, Time=0.08 sec Best model: ARIMA(0,1,0)(0,1,2)[12] Total fit time: 1.276 seconds
In [18]:
stepwise_fit.summary()
Out[18]:
Dep. Variable: | y | No. Observations: | 60 |
---|---|---|---|
Model: | SARIMAX(0, 1, 0)x(0, 1, [1, 2], 12) | Log Likelihood | -822.316 |
Date: | Thu, 04 Nov 2021 | AIC | 1650.632 |
Time: | 14:47:39 | BIC | 1656.182 |
Sample: | 0 | HQIC | 1652.720 |
– 60 | |||
Covariance Type: | opg |
coef | std err | z | P>|z| | [0.025 | 0.975] | |
---|---|---|---|---|---|---|
ma.S.L12 | -0.3482 | 0.276 | -1.263 | 0.207 | -0.889 | 0.192 |
ma.S.L24 | -0.2890 | 0.223 | -1.297 | 0.195 | -0.726 | 0.148 |
sigma2 | 1.043e+14 | 3.45e-16 | 3.03e+29 | 0.000 | 1.04e+14 | 1.04e+14 |
Ljung-Box (L1) (Q): | 4.49 | Jarque-Bera (JB): | 1.10 |
---|---|---|---|
Prob(Q): | 0.03 | Prob(JB): | 0.58 |
Heteroskedasticity (H): | 1.01 | Skew: | -0.30 |
Prob(H) (two-sided): | 0.99 | Kurtosis: | 2.54 |
Warnings:
[1] Covariance matrix calculated using the outer product of gradients (complex-step).
[2] Covariance matrix is singular or near-singular, with condition number 2.78e+45. Standard errors may be unstable.
split into train test sets
In [19]:
train = dataset.iloc[:len(dataset)-12] test = dataset.iloc[len(dataset)-12:]
In [20]:
# Fit ARIMA model
In [21]:
from statsmodels.tsa.statespace.sarimax import SARIMAX
In [22]:
model = SARIMAX(train['Order_Demand'], order=(0,1,0), seasonal_order=(0,1,[1,2],12))
In [23]:
result = model.fit()
In [24]:
result.summary()
Out[24]:
Dep. Variable: | Order_Demand | No. Observations: | 48 |
---|---|---|---|
Model: | SARIMAX(0, 1, 0)x(0, 1, [1, 2], 12) | Log Likelihood | -611.617 |
Date: | Thu, 04 Nov 2021 | AIC | 1229.234 |
Time: | 14:47:39 | BIC | 1233.900 |
Sample: | 01-31-2012 | HQIC | 1230.845 |
– 12-31-2015 | |||
Covariance Type: | opg |
coef | std err | z | P>|z| | [0.025 | 0.975] | |
---|---|---|---|---|---|---|
ma.S.L12 | -0.3117 | 0.305 | -1.023 | 0.306 | -0.909 | 0.285 |
ma.S.L24 | -0.2382 | 0.206 | -1.155 | 0.248 | -0.642 | 0.166 |
sigma2 | 9.636e+13 | nan | nan | nan | nan | nan |
Ljung-Box (L1) (Q): | 1.93 | Jarque-Bera (JB): | 0.63 |
---|---|---|---|
Prob(Q): | 0.17 | Prob(JB): | 0.73 |
Heteroskedasticity (H): | 0.53 | Skew: | -0.05 |
Prob(H) (two-sided): | 0.29 | Kurtosis: | 2.35 |
Warnings:
[1] Covariance matrix calculated using the outer product of gradients (complex-step).
[2] Covariance matrix is singular or near-singular, with condition number 2.47e+45. Standard errors may be unstable.In [25]:
# Pred against test set
In [26]:
start = len(train) end = len(train) + len(test) - 1
In [27]:
# Predictions for one-year against the test set predictions = result.predict(start, end, typ = 'levels').rename("Predictions")
In [28]:
predictions.plot(legend = True, color = "#951f53") test['Order_Demand'].plot(legend = True) train['Order_Demand'].plot(legend = True)
Out[28]:
<AxesSubplot:xlabel='Date'>
In [29]:
#notes: https://www.geeksforgeeks.org/python-arima-model-for-time-series-forecasting/
Evaluare the model using MSE and RMSE
In [30]:
from sklearn.metrics import mean_squared_error from statsmodels.tools.eval_measures import rmse from sklearn.metrics import accuracy_score
In [31]:
#calculating the root mean squared error rmse(test["Order_Demand"], predictions)
Out[31]:
12228976.658876069
In [32]:
#calculating the mean squared error mean_squared_error(test["Order_Demand"], predictions)
Out[32]:
149547870123335.72
In [33]:
y_test=test["Order_Demand"].values
In [34]:
y_pred = predictions.values
In [35]:
comapre = pd.DataFrame(zip(y_test,y_pred), columns = ['y_test','y_pred'])
In [36]:
comapre['diff'] = comapre['y_pred']/(comapre['y_test']-1)
In [37]:
comapre
Out[37]:
y_test | y_pred | diff | |
---|---|---|---|
0 | 78627619 | 1.016541e+08 | 1.292855 |
1 | 74065041 | 9.009375e+07 | 1.216414 |
2 | 93303910 | 1.021972e+08 | 1.095316 |
3 | 79503364 | 9.487303e+07 | 1.193321 |
4 | 80299593 | 8.634700e+07 | 1.075311 |
5 | 84553011 | 9.454459e+07 | 1.118169 |
6 | 88439936 | 1.023227e+08 | 1.156974 |
7 | 80471772 | 8.605685e+07 | 1.069404 |
8 | 77698896 | 8.893183e+07 | 1.144570 |
9 | 84000757 | 9.335134e+07 | 1.111316 |
10 | 90128568 | 8.732978e+07 | 0.968947 |
11 | 80497932 | 9.100245e+07 | 1.130494 |
Futuer Pred on All Data
In [38]:
model = SARIMAX(dataset['Order_Demand'], order=(0,1,0), seasonal_order=(0,1,[1,2],12))
In [39]:
result = model.fit()
In [40]:
start_futuer = len(dataset) end_futuer = len(dataset) + 12
In [41]:
# Predictions for one-year against the test set predictions = result.predict(start_futuer, end_futuer, typ = 'levels').rename("Predictions")
In [42]:
predictions.plot(legend = True, color = "#951f53") dataset['Order_Demand'].plot(legend = True)
Out[42]:
<AxesSubplot:xlabel='Date'>
In [43]:
output = dataset.reset_index()
In [44]:
predictions
Out[44]:
2017-01-31 7.959537e+07 2017-02-28 7.678775e+07 2017-03-31 9.213604e+07 2017-04-30 8.136279e+07 2017-05-31 8.199487e+07 2017-06-30 8.513333e+07 2017-07-31 8.836628e+07 2017-08-31 7.844622e+07 2017-09-30 7.932424e+07 2017-10-31 8.721598e+07 2017-11-30 8.960154e+07 2017-12-31 8.270573e+07 2018-01-31 8.841014e+07 Freq: M, Name: Predictions, dtype: float64
In [45]:
pred = pd.DataFrame({'Date':predictions.index, "Order_Demand":predictions.values})
In [46]:
pred
Out[46]:
Date | Order_Demand | |
---|---|---|
0 | 2017-01-31 | 7.959537e+07 |
1 | 2017-02-28 | 7.678775e+07 |
2 | 2017-03-31 | 9.213604e+07 |
3 | 2017-04-30 | 8.136279e+07 |
4 | 2017-05-31 | 8.199487e+07 |
5 | 2017-06-30 | 8.513333e+07 |
6 | 2017-07-31 | 8.836628e+07 |
7 | 2017-08-31 | 7.844622e+07 |
8 | 2017-09-30 | 7.932424e+07 |
9 | 2017-10-31 | 8.721598e+07 |
10 | 2017-11-30 | 8.960154e+07 |
11 | 2017-12-31 | 8.270573e+07 |
12 | 2018-01-31 | 8.841014e+07 |
In [47]:
full_pred_dataset = pd.concat([output,pred])