importing the libraries

In [1]:

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import random

importing the data set

In [2]:

df = pd.read_csv("Master_Product_Sales_Data.csv")

In [3]:

df.shape

Out[3]:

(1037336, 6)

In [4]:

df.head()

Out[4]:

	Product_Code	Warehouse	Product_Category	Date	Order_Demand	Total Sales
0	Product_0993	Whse_J	Category_028	2012/7/27	100	13400
1	Product_0979	Whse_J	Category_028	2012/1/19	500	52500
2	Product_0979	Whse_J	Category_028	2012/2/3	500	52500
3	Product_0979	Whse_J	Category_028	2012/2/9	500	52500
4	Product_0979	Whse_J	Category_028	2012/3/2	500	52500

In [5]:

df.nunique()

Out[5]:

Product_Code         2160
Warehouse               4
Product_Category       33
Date                 1729
Order_Demand         3309
Total Sales         17412
dtype: int64

Cleaning the Dataset

In [6]:

import re

In [7]:

df['Order_Demand'] = df['Order_Demand'].apply(lambda x: re.sub("[^0-9]", "", x))

In [8]:

df['Order_Demand'] = df['Order_Demand'].astype(int)

In [10]:

data_code = df[['Product_Code','Order_Demand', 'Total Sales']].groupby(by=['Product_Code']).sum().sort_values(['Order_Demand'])

In [12]:

data_code = data_code.reset_index()

In [13]:

data = data_code[['Order_Demand', 'Total Sales']]

defining the data set

In [14]:

X = data.iloc[:,:].values

Using the elbow method to find the optimal number of clusters

In [15]:

from sklearn.cluster import KMeans
wcss = []
for i in range(1, 11):
    kmeans = KMeans(n_clusters = i, init = 'k-means++', random_state = 42)
    kmeans.fit(X)
    wcss.append(kmeans.inertia_)
plt.plot(range(1, 11), wcss)
plt.title('The Elbow Method')
plt.xlabel('Number of clusters')
plt.ylabel('WCSS')
plt.show()

Training the K-Means model on the dataset

In [16]:

kmeans = KMeans(n_clusters = 3, init = 'k-means++', random_state = 42)
y_kmeans = kmeans.fit_predict(X)

In [17]:

plt.scatter(X[y_kmeans == 0, 0], X[y_kmeans == 0, 1], s = 100, cmap = 'tab10', label = 'Cluster 1', color ='#951f53' )
plt.scatter(X[y_kmeans == 1, 0], X[y_kmeans == 1, 1], s = 100, cmap = 'tab10', label = 'Cluster 2', color = '#e4795a')
plt.scatter(X[y_kmeans == 2, 0], X[y_kmeans == 2, 1], s = 100, cmap = 'tab10', label = 'Cluster 3', color = '#b8afac')

plt.scatter(kmeans.cluster_centers_[:, 0], kmeans.cluster_centers_[:, 1], s = 300, c = 'yellow', label = 'Centroids')

plt.title('Clusters of Products')
plt.xlabel('Order Demand')
plt.ylabel('Total Sales')
plt.legend()
plt.show()

In [18]:

cluster_featuers = np.insert(X, 2, y_kmeans, axis = 1)

In [19]:

cluster_featuers = pd.DataFrame(cluster_featuers, columns = ['Order_Demand','Sales','Cluster'])

In [20]:

product_code = data_code['Product_Code']

In [21]:

cluster_featuers['Product_Code'] = data_code['Product_Code']

In [22]:

cluster_featuers

Out[22]:

	Order_Demand	Sales	Cluster	Product_Code
0	2	82	0	Product_1703
1	2	276	0	Product_1698
2	3	66	0	Product_0465
3	20	1480	0	Product_0853
4	22	2464	0	Product_0638
…	…	…	…	…
2155	123303000	8384604000	2	Product_1295
2156	169777000	14431045000	2	Product_1341
2157	210651000	16641429000	1	Product_0083
2158	289117000	26020530000	1	Product_1248
2159	472474000	30710810000	1	Product_1359

2160 rows × 4 columnsIn [23]:

cluster_1 = cluster_featuers.loc[cluster_featuers['Cluster']==0]
cluster_2 = cluster_featuers.loc[cluster_featuers['Cluster']==1]
cluster_3 = cluster_featuers.loc[cluster_featuers['Cluster']==2]

In [24]:

cluster_2

Out[24]:

	Order_Demand	Sales	Cluster	Product_Code
2157	210651000	16641429000	1	Product_0083
2158	289117000	26020530000	1	Product_1248
2159	472474000	30710810000	1	Product_1359

In [25]:

full_df = pd.merge(df, cluster_featuers, on = 'Product_Code', how='left')

In [26]:

full_df

Out[26]:

	Product_Code	Warehouse	Product_Category	Date	Order_Demand_x	Total Sales	Order_Demand_y	Sales	Cluster
0	Product_0993	Whse_J	Category_028	2012/7/27	100	13400	59700	7999800	0
1	Product_0979	Whse_J	Category_028	2012/1/19	500	52500	7332500	769912500	0
2	Product_0979	Whse_J	Category_028	2012/2/3	500	52500	7332500	769912500	0
3	Product_0979	Whse_J	Category_028	2012/2/9	500	52500	7332500	769912500	0
4	Product_0979	Whse_J	Category_028	2012/3/2	500	52500	7332500	769912500	0
…	…	…	…	…	…	…	…	…	…
1037331	Product_1791	Whse_J	Category_006	2016/4/27	1000	108000	56000	6048000	0
1037332	Product_1974	Whse_J	Category_006	2016/4/27	1	61	153	9333	0
1037333	Product_1787	Whse_J	Category_006	2016/4/28	2500	160000	5892500	377120000	0
1037334	Product_0901	Whse_J	Category_023	2016/10/7	50	6750	104950	14168250	0
1037335	Product_0704	Whse_J	Category_001	2016/6/27	4	100	1915	47875	0

1037336 rows × 9 columnsIn [27]:

# cluster_1 = full_df.loc[full_df['cluster']==0]
# cluster_2 = full_df.loc[full_df['cluster']==1]
# cluster_3 = full_df.loc[full_df['cluster']==2]

Cluster 1

In [28]:

cluster_1

Out[28]:

	Order_Demand	Sales	Cluster	Product_Code
0	2	82	0	Product_1703
1	2	276	0	Product_1698
2	3	66	0	Product_0465
3	20	1480	0	Product_0853
4	22	2464	0	Product_0638
…	…	…	…	…
2139	61751000	1852530000	0	Product_1342
2141	73779700	2065831600	0	Product_1016
2142	76703000	1994278000	0	Product_1350
2146	81690200	1388733400	0	Product_1382
2149	91856000	1745264000	0	Product_1451

2134 rows × 4 columnsIn [29]:

cluster_1['Product_Code'].unique()

Out[29]:

array(['Product_1703', 'Product_1698', 'Product_0465', ...,
       'Product_1350', 'Product_1382', 'Product_1451'], dtype=object)

In [30]:

cluster_1['Order_Demand'].unique()

Out[30]:

array([       2,        3,       20, ..., 76703000, 81690200, 91856000],
      dtype=int64)

Cluster 2

In [31]:

cluster_2['Product_Code'].unique()

Out[31]:

array(['Product_0083', 'Product_1248', 'Product_1359'], dtype=object)

In [32]:

cluster_2['Order_Demand'].unique()

Out[32]:

array([210651000, 289117000, 472474000], dtype=int64)

Cluster 3

In [33]:

cluster_3['Product_Code'].unique()

Out[33]:

array(['Product_1361', 'Product_1502', 'Product_1567', 'Product_1403',
       'Product_1263', 'Product_1152', 'Product_1393', 'Product_1570',
       'Product_1377', 'Product_1360', 'Product_1453', 'Product_1480',
       'Product_1294', 'Product_1264', 'Product_1378', 'Product_1287',
       'Product_1274', 'Product_1432', 'Product_1286', 'Product_1245',
       'Product_1241', 'Product_1295', 'Product_1341'], dtype=object)

In [34]:

cluster_3['Order_Demand'].unique()

Out[34]:

array([ 28190000,  32672250,  35361000,  37471000,  41760000,  44790000,
        47357000,  53776000,  56322600,  61618000,  70504500,  77835000,
        78714000,  80187000,  83632700,  88715600,  92831000,  97207000,
       101566400, 103537000, 117741000, 123303000, 169777000], dtype=int64)