importing the libraries

In [1]:

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import random

importing the data set

In [2]:

df = pd.read_csv("Master_Product_Sales_Data.csv")

In [3]:

df.shape

Out[3]:

(1037336, 6)

In [4]:

df.head()

Out[4]:

Product_CodeWarehouseProduct_CategoryDateOrder_DemandTotal Sales
0Product_0993Whse_JCategory_0282012/7/2710013400
1Product_0979Whse_JCategory_0282012/1/1950052500
2Product_0979Whse_JCategory_0282012/2/350052500
3Product_0979Whse_JCategory_0282012/2/950052500
4Product_0979Whse_JCategory_0282012/3/250052500

In [5]:

df.nunique()

Out[5]:

Product_Code         2160
Warehouse               4
Product_Category       33
Date                 1729
Order_Demand         3309
Total Sales         17412
dtype: int64

Cleaning the Dataset

In [6]:

import re

In [7]:

df['Order_Demand'] = df['Order_Demand'].apply(lambda x: re.sub("[^0-9]", "", x))

In [8]:

df['Order_Demand'] = df['Order_Demand'].astype(int)

In [10]:

data_code = df[['Product_Code','Order_Demand', 'Total Sales']].groupby(by=['Product_Code']).sum().sort_values(['Order_Demand'])

In [12]:

data_code = data_code.reset_index()

In [13]:

data = data_code[['Order_Demand', 'Total Sales']]

defining the data set

In [14]:

X = data.iloc[:,:].values

Using the elbow method to find the optimal number of clusters

In [15]:

from sklearn.cluster import KMeans
wcss = []
for i in range(1, 11):
    kmeans = KMeans(n_clusters = i, init = 'k-means++', random_state = 42)
    kmeans.fit(X)
    wcss.append(kmeans.inertia_)
plt.plot(range(1, 11), wcss)
plt.title('The Elbow Method')
plt.xlabel('Number of clusters')
plt.ylabel('WCSS')
plt.show()

Training the K-Means model on the dataset

In [16]:

kmeans = KMeans(n_clusters = 3, init = 'k-means++', random_state = 42)
y_kmeans = kmeans.fit_predict(X)

In [17]:

plt.scatter(X[y_kmeans == 0, 0], X[y_kmeans == 0, 1], s = 100, cmap = 'tab10', label = 'Cluster 1', color ='#951f53' )
plt.scatter(X[y_kmeans == 1, 0], X[y_kmeans == 1, 1], s = 100, cmap = 'tab10', label = 'Cluster 2', color = '#e4795a')
plt.scatter(X[y_kmeans == 2, 0], X[y_kmeans == 2, 1], s = 100, cmap = 'tab10', label = 'Cluster 3', color = '#b8afac')

plt.scatter(kmeans.cluster_centers_[:, 0], kmeans.cluster_centers_[:, 1], s = 300, c = 'yellow', label = 'Centroids')

plt.title('Clusters of Products')
plt.xlabel('Order Demand')
plt.ylabel('Total Sales')
plt.legend()
plt.show()

In [18]:

cluster_featuers = np.insert(X, 2, y_kmeans, axis = 1)

In [19]:

cluster_featuers = pd.DataFrame(cluster_featuers, columns = ['Order_Demand','Sales','Cluster'])

In [20]:

product_code = data_code['Product_Code']

In [21]:

cluster_featuers['Product_Code'] = data_code['Product_Code']

In [22]:

cluster_featuers

Out[22]:

Order_DemandSalesClusterProduct_Code
02820Product_1703
122760Product_1698
23660Product_0465
32014800Product_0853
42224640Product_0638
215512330300083846040002Product_1295
2156169777000144310450002Product_1341
2157210651000166414290001Product_0083
2158289117000260205300001Product_1248
2159472474000307108100001Product_1359

2160 rows × 4 columnsIn [23]:

cluster_1 = cluster_featuers.loc[cluster_featuers['Cluster']==0]
cluster_2 = cluster_featuers.loc[cluster_featuers['Cluster']==1]
cluster_3 = cluster_featuers.loc[cluster_featuers['Cluster']==2]

In [24]:

cluster_2

Out[24]:

Order_DemandSalesClusterProduct_Code
2157210651000166414290001Product_0083
2158289117000260205300001Product_1248
2159472474000307108100001Product_1359

In [25]:

full_df = pd.merge(df, cluster_featuers, on = 'Product_Code', how='left')

In [26]:

full_df

Out[26]:

Product_CodeWarehouseProduct_CategoryDateOrder_Demand_xTotal SalesOrder_Demand_ySalesCluster
0Product_0993Whse_JCategory_0282012/7/27100134005970079998000
1Product_0979Whse_JCategory_0282012/1/195005250073325007699125000
2Product_0979Whse_JCategory_0282012/2/35005250073325007699125000
3Product_0979Whse_JCategory_0282012/2/95005250073325007699125000
4Product_0979Whse_JCategory_0282012/3/25005250073325007699125000
1037331Product_1791Whse_JCategory_0062016/4/2710001080005600060480000
1037332Product_1974Whse_JCategory_0062016/4/2716115393330
1037333Product_1787Whse_JCategory_0062016/4/28250016000058925003771200000
1037334Product_0901Whse_JCategory_0232016/10/7506750104950141682500
1037335Product_0704Whse_JCategory_0012016/6/2741001915478750

1037336 rows × 9 columnsIn [27]:

# cluster_1 = full_df.loc[full_df['cluster']==0]
# cluster_2 = full_df.loc[full_df['cluster']==1]
# cluster_3 = full_df.loc[full_df['cluster']==2]

Cluster 1

In [28]:

cluster_1

Out[28]:

Order_DemandSalesClusterProduct_Code
02820Product_1703
122760Product_1698
23660Product_0465
32014800Product_0853
42224640Product_0638
21396175100018525300000Product_1342
21417377970020658316000Product_1016
21427670300019942780000Product_1350
21468169020013887334000Product_1382
21499185600017452640000Product_1451

2134 rows × 4 columnsIn [29]:

cluster_1['Product_Code'].unique()

Out[29]:

array(['Product_1703', 'Product_1698', 'Product_0465', ...,
       'Product_1350', 'Product_1382', 'Product_1451'], dtype=object)

In [30]:

cluster_1['Order_Demand'].unique()

Out[30]:

array([       2,        3,       20, ..., 76703000, 81690200, 91856000],
      dtype=int64)

Cluster 2

In [31]:

cluster_2['Product_Code'].unique()

Out[31]:

array(['Product_0083', 'Product_1248', 'Product_1359'], dtype=object)

In [32]:

cluster_2['Order_Demand'].unique()

Out[32]:

array([210651000, 289117000, 472474000], dtype=int64)

Cluster 3

In [33]:

cluster_3['Product_Code'].unique()

Out[33]:

array(['Product_1361', 'Product_1502', 'Product_1567', 'Product_1403',
       'Product_1263', 'Product_1152', 'Product_1393', 'Product_1570',
       'Product_1377', 'Product_1360', 'Product_1453', 'Product_1480',
       'Product_1294', 'Product_1264', 'Product_1378', 'Product_1287',
       'Product_1274', 'Product_1432', 'Product_1286', 'Product_1245',
       'Product_1241', 'Product_1295', 'Product_1341'], dtype=object)

In [34]:

cluster_3['Order_Demand'].unique()

Out[34]:

array([ 28190000,  32672250,  35361000,  37471000,  41760000,  44790000,
        47357000,  53776000,  56322600,  61618000,  70504500,  77835000,
        78714000,  80187000,  83632700,  88715600,  92831000,  97207000,
       101566400, 103537000, 117741000, 123303000, 169777000], dtype=int64)