import pandas as pd
import numpy as np
import math
from time import gmtime, strftime #DateTime Formatting
import matplotlib.pyplot as plt #Basic Plotting
import seaborn as sns #Advanced Plotting
import datetime
import warnings

from sklearn.cluster import KMeans
from sklearn import metrics
from scipy.spatial.distance import cdist
import numpy as np
import matplotlib.pyplot as plt

#pre-processing
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

# Session-wide settings: silence all warnings and make pandas print every
# row/column with floats rounded to three decimals.
warnings.filterwarnings(action='ignore')
for option_name in ('display.max_columns', 'display.max_rows'):
    pd.set_option(option_name, None)
pd.set_option('display.float_format', '{:.3f}'.format)


import os

# Remember the directory the session started in, captured before chdir.
orig_dir = os.getcwd()

# Directory holding the input CSV. The original author's location is kept as
# the default so existing runs behave the same; set WINE_DATA_DIR to run this
# on another machine without editing the script.
data_dir = os.environ.get("WINE_DATA_DIR", "C:\\Users\\gusahil\\Downloads")
os.chdir(data_dir)
os.getcwd()

'C:\\Users\\gusahil\\Downloads'


# Load the training set from the working directory, normalise the row index
# and discard the 'ID' column.
wine = pd.read_csv('wine_data_train.csv', sep=",")
wine = wine.reset_index(drop=True).drop(columns='ID')
wine.head(3)


# Per-column summary statistics, one feature per row.
wine.describe().transpose()


# Frequency of each distinct 'quality' value.
np.unique(wine['quality'].to_numpy(), return_counts=True)

(array([-1,  3,  4,  5,  6,  7,  8,  9], dtype=int64),
 array([ 436,   26,  178, 1825, 2454,  916,  158,    4], dtype=int64))


# Frequency of each distinct 'color' label.
np.unique(wine['color'].to_numpy(), return_counts=True)

(array(['R', 'W'], dtype=object), array([1458, 4539], dtype=int64))


#wine.isnull()


# Feature PreProcessing


# Rows whose quality is the sentinel -1 receive the median of all the
# other quality values.
quality_is_sentinel = wine['quality'] == -1
wine.loc[quality_is_sentinel, 'quality'] = np.median(
    wine.loc[~quality_is_sentinel, 'quality']
)


# Re-check the distribution after the replacement.
np.unique(wine['quality'].to_numpy(), return_counts=True)

(array([3., 4., 5., 6., 7., 8., 9.]),
 array([  26,  178, 1825, 2890,  916,  158,    4], dtype=int64))


# One-hot encode 'color', append the indicator columns, then drop the source
# column together with 'color_R' so that only 'color_W' remains.
color_dummies = pd.get_dummies(wine['color'], prefix='color')
wine = pd.concat([wine, color_dummies], axis=1).drop(columns=['color', 'color_R'])
wine.head()


# Standardise every column of `wine` with a StandardScaler fitted on the
# same data.
scaler = StandardScaler()
wine_normalize = scaler.fit(wine).transform(wine)

# Wrap the scaled array in a DataFrame so the column names are kept.
wine_normalize_df = pd.DataFrame(data=wine_normalize, columns=wine.columns)

wine_normalize_df.head()


# Five-cluster K-means on the standardised features, with a fixed seed.
kmeans = KMeans(n_clusters=5, random_state=0)
kmeans.fit(wine_normalize_df)


# Number of samples assigned to each cluster label.
np.unique(kmeans.labels_, return_counts=True)

(array([0, 1, 2, 3, 4]), array([1572,  882, 1513,  567, 1463], dtype=int64))


# Show the fitted centroids. The model was fit on wine_normalize_df (the
# StandardScaler output), so these coordinates are in standardised units
# rather than the raw feature units.
kmeans.cluster_centers_

array([[-2.12365378e-01, -3.82860071e-01,  1.85145569e-05,
        -3.43165071e-01, -1.98305146e-01, -1.55977855e-01,
         2.18141023e-01, -3.45851269e-01, -9.20564940e-02,
        -2.98818894e-01, -2.44048260e-01, -4.32977724e-01,
         5.62307678e-01],
       [ 1.26301966e-01,  1.67003687e+00, -1.21319547e+00,
        -6.33714276e-01,  7.28073009e-01, -7.93156267e-01,
        -1.18934934e+00,  5.32953899e-01,  9.88569665e-01,
         4.74683880e-01, -2.13600393e-01, -4.59961888e-01,
        -1.74587394e+00],
       [-4.43835014e-01, -3.88821859e-01,  4.25562715e-02,
        -3.92072979e-01, -5.69245624e-01,  5.09618999e-02,
        -2.37258214e-02, -1.16143157e+00, -4.60975437e-02,
        -2.75051035e-01,  1.14766456e+00,  8.93839357e-01,
         5.46742611e-01],
       [ 2.01609498e+00,  4.26959534e-01,  9.91087786e-01,
        -5.90955413e-01,  1.23390936e+00, -9.10575725e-01,
        -1.33492171e+00,  9.50530406e-01, -4.50467505e-02,
         1.44545797e+00,  1.23698281e-01,  1.39520994e-01,
        -1.75622345e+00],
       [-1.72735172e-01, -3.57090194e-01,  3.00221314e-01,
         1.38525838e+00, -1.15805480e-01,  9.45987694e-01,
         1.02489352e+00,  8.83040675e-01, -4.30551791e-01,
        -2.42184312e-01, -8.45236024e-01, -2.37652608e-01,
         5.63572476e-01]])


# Elbow analysis: fit K-means for k = 1..9 on the standardised features and
# record two goodness-of-fit measures per k.
#   distortions / mapping1: mean Euclidean distance from each sample to its
#                           nearest cluster centre
#   inertias / mapping2:    the model's inertia_ value
distortions = []
inertias = []
mapping1 = {}
mapping2 = {}
K = range(1, 10)

X = wine_normalize_df

for k in K:
    # Fit once per k. The fixed seed (the same one used for the 5-cluster
    # model fitted earlier in this file) makes the curve reproducible.
    kmeanModel = KMeans(n_clusters=k, random_state=0).fit(X)

    # Distance from every sample to every centre, reduced to the nearest
    # centre and averaged over samples. Computed once and reused below.
    distortion = np.min(
        cdist(X, kmeanModel.cluster_centers_, 'euclidean'), axis=1
    ).mean()

    distortions.append(distortion)
    inertias.append(kmeanModel.inertia_)

    mapping1[k] = distortion
    mapping2[k] = kmeanModel.inertia_


# Distortion, as computed above, is the average Euclidean distance (not squared) from each sample to its nearest cluster center.
# Inertia, by contrast, is the sum of the squared distances of the samples to their nearest cluster center.

# Draw the distortion-vs-k curve (blue crosses joined by a line) and label it.
plt.plot(K, distortions, 'bx-')
plt.gca().set(
    xlabel='Values of K',
    ylabel='Distortion',
    title='The Elbow Method using Distortion',
)
plt.show()


# Print one "k : distortion" line per cluster count.
for n_clusters, avg_distance in mapping1.items():
    print(n_clusters, avg_distance, sep=' : ')

1 : 3.398663442230733
2 : 2.917028762948958
3 : 2.6162602589705894
4 : 2.5006083589797243
5 : 2.3995061784912033
6 : 2.37215372581441
7 : 2.3174643113438664
8 : 2.2818834977325357
9 : 2.236834289199079


# Tabulate the 5-cluster centroids under the feature names. The values are in
# the standardised space the model was fitted in.
cluster_means = pd.DataFrame(data=kmeans.cluster_centers_, columns=wine.columns)
cluster_means

	fixed acidity	volatile acidity	citric acid	residual sugar	chlorides	free sulfur dioxide	total sulfur dioxide	density	pH	sulphates	alcohol	quality	color
0	7.400	0.700	0.000	1.900	0.076	11.000	34.000	0.998	3.510	0.560	9.400	5	R
1	7.800	0.880	0.000	2.600	0.098	25.000	67.000	0.997	3.200	0.680	9.800	5	R
2	7.800	0.760	0.040	2.300	0.092	15.000	54.000	0.997	3.260	0.650	9.800	5	R

	count	mean	std	min	25%	50%	75%	max
fixed acidity	5997.000	7.212	1.301	3.800	6.400	7.000	7.700	15.900
volatile acidity	5997.000	0.339	0.164	0.080	0.230	0.290	0.400	1.580
citric acid	5997.000	0.319	0.145	0.000	0.250	0.310	0.390	1.660
residual sugar	5997.000	5.445	4.757	0.600	1.800	3.000	8.100	65.800
chlorides	5997.000	0.056	0.034	0.009	0.038	0.047	0.064	0.610
free sulfur dioxide	5997.000	30.695	17.913	1.000	17.000	29.000	42.000	289.000
total sulfur dioxide	5997.000	116.313	56.619	6.000	78.000	119.000	156.000	440.000
density	5997.000	0.995	0.003	0.987	0.992	0.995	0.997	1.039
pH	5997.000	3.219	0.160	2.720	3.110	3.210	3.320	4.010
sulphates	5997.000	0.532	0.150	0.220	0.430	0.510	0.600	2.000
alcohol	5997.000	10.491	1.199	0.099	9.500	10.300	11.300	14.900
quality	5997.000	5.322	1.957	-1.000	5.000	6.000	6.000	9.000

	fixed acidity	volatile acidity	citric acid	residual sugar	chlorides	free sulfur dioxide	total sulfur dioxide	density	pH	sulphates	alcohol	quality
0	7.400	0.700	0.000	1.900	0.076	11.000	34.000	0.998	3.510	0.560	9.400	5.000
1	7.800	0.880	0.000	2.600	0.098	25.000	67.000	0.997	3.200	0.680	9.800	5.000
2	7.800	0.760	0.040	2.300	0.092	15.000	54.000	0.997	3.260	0.650	9.800	5.000
3	11.200	0.280	0.560	1.900	0.075	17.000	60.000	0.998	3.160	0.580	9.800	6.000
4	7.400	0.700	0.000	1.900	0.076	11.000	34.000	0.998	3.510	0.560	9.400	5.000

	fixed acidity	volatile acidity	citric acid	residual sugar	chlorides	free sulfur dioxide	total sulfur dioxide	density	pH	sulphates	alcohol	quality	color_W
0	0.144	2.199	-2.193	-0.745	0.592	-1.100	-1.454	1.035	1.819	0.190	-0.910	-0.995	-1.764
1	0.452	3.295	-2.193	-0.598	1.236	-0.318	-0.871	0.703	-0.117	0.992	-0.576	-0.995	-1.764
2	0.452	2.565	-1.918	-0.661	1.060	-0.876	-1.101	0.769	0.257	0.792	-0.576	-0.995	-1.764
3	3.065	-0.359	1.659	-0.745	0.563	-0.765	-0.995	1.102	-0.367	0.324	-0.576	0.203	-1.764
4	0.144	2.199	-2.193	-0.745	0.592	-1.100	-1.454	1.035	1.819	0.190	-0.910	-0.995	-1.764

	fixed acidity	volatile acidity	citric acid	residual sugar	chlorides	free sulfur dioxide	total sulfur dioxide	density	pH	sulphates	alcohol	quality	color_W
0	-0.212	-0.383	0.000	-0.343	-0.198	-0.156	0.218	-0.346	-0.092	-0.299	-0.244	-0.433	0.562
1	0.126	1.670	-1.213	-0.634	0.728	-0.793	-1.189	0.533	0.989	0.475	-0.214	-0.460	-1.746
2	-0.444	-0.389	0.043	-0.392	-0.569	0.051	-0.024	-1.161	-0.046	-0.275	1.148	0.894	0.547
3	2.016	0.427	0.991	-0.591	1.234	-0.911	-1.335	0.951	-0.045	1.445	0.124	0.140	-1.756
4	-0.173	-0.357	0.300	1.385	-0.116	0.946	1.025	0.883	-0.431	-0.242	-0.845	-0.238	0.564