#
import numpy as np
import scipy as sp
import pandas as pd
import matplotlib as mp
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
import laUtilities as ut
import slideUtilities as sl
import demoUtilities as dm
from matplotlib import animation
from importlib import reload
from datetime import datetime
from IPython.display import Image, display_html, display, Math, HTML;
qr_setting = None
mp.rcParams['animation.html'] = 'jshtml';
HW 2 is due Friday (2/10) at 8pm
Upcoming office hours:
Read Tan, Steinbach, Karpatne, Kumar Chapter 7.3
#
display(Image("images/08-gradescope_grading.png", width=600))
To measure how close together or far apart a collection of vectors is, let's start by defining the centroid of a dataset.
Given $n$ vectors $\mathbf{x}_1, \ldots, \mathbf{x}_n$, each of the same dimension $d$, the centroid is the componentwise average of the vectors:

$$\bar{\mathbf{x}} = \frac{1}{n}\sum_i \mathbf{x}_i.$$

In other words: the centroid is the "center of mass" of the dataset.
Next, we define the sample variance of a dataset $\{\mathbf{x}_1, \ldots, \mathbf{x}_n\}$ as

$$\operatorname{Var}(X) = \frac{1}{n}\sum_j \|\mathbf{x}_j - \bar{\mathbf{x}}\|^2.$$

In other words, the sample variance of the set of points is the average squared distance from each point to the centroid.
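As a quick check of these definitions, here is a minimal NumPy sketch (on a small made-up dataset) that computes both quantities:
# small made-up dataset: three points in 2 dimensions
pts = np.array([[1.0, 2.0],
                [3.0, 4.0],
                [5.0, 0.0]])

centroid = pts.mean(axis=0)                                 # componentwise average
sample_var = np.mean(np.sum((pts - centroid)**2, axis=1))   # mean squared distance to the centroid

print(centroid, sample_var)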
k-means Problem:
Find $k$ points $\mathbf{c}_1, \ldots, \mathbf{c}_k$ (called centers, centroids, or means), so that the cost

$$\sum_{i=1}^{n} \min_j \|\mathbf{x}_i - \mathbf{c}_j\|^2$$

is minimized.
The (approximate) k-means algorithm:
1. Pick $k$ starting points to serve as the initial centers $\mathbf{c}_1, \ldots, \mathbf{c}_k$ (for example, chosen at random from the data).
2. Assign each point to the cluster $X_j$ of its closest center $\mathbf{c}_j$.
(Nearer to $\mathbf{c}_j$ than to any other center.)
3. For each $j$, redefine $\mathbf{c}_j$ to be the center of mass of cluster $X_j$.
(In other words, $\mathbf{c}_j$ is the mean of the vectors in $X_j$.)
4. Repeat (i.e., go to Step 2) until convergence.
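Here is a minimal NumPy sketch of this loop (Lloyd's algorithm), for illustration only; real implementations such as scikit-learn's add smarter initialization, multiple restarts, and handling of empty clusters.
def lloyd_kmeans(X, k, n_iter=100, rng=None):
    # Bare-bones k-means for illustration (empty clusters are not handled).
    rng = np.random.default_rng(rng)
    # Step 1: pick k starting centers (here: k distinct data points at random)
    centers = X[rng.choice(len(X), size=k, replace=False)]
    for _ in range(n_iter):
        # Step 2: assign each point to its closest center
        dists = np.linalg.norm(X[:, None, :] - centers[None, :, :], axis=2)
        labels = dists.argmin(axis=1)
        # Step 3: move each center to the mean of its cluster
        new_centers = np.array([X[labels == j].mean(axis=0) for j in range(k)])
        # Step 4: stop once the centers no longer move
        if np.allclose(new_centers, centers):
            break
        centers = new_centers
    return centers, labels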
#
display(Image("images/20-kmeans-example.png", width=600))
As you can see, k-means can work very well.
However, we don't have any guarantees on the performance of k-means.
In particular, there are various settings in which k-means can fail to do a good job.
k-means tries to find spherical clusters.
Because each point is assigned to its closest center, the points in a cluster are implicitly assumed to be arranged in a sphere around the center.
#
display(Image("images/20-kmeans-nonspherical-clusters.png", width=600))
k-means tries to find equal-sized clusters.
For the same reason, the sizes of clusters are implicitly assumed to be approximately equal.
#
display(Image("images/20-kmeans-cluster-size.png", width=600))
k-means is sensitive to the starting cluster centers.
If the initial guess (Step 1) is a bad one, k-means may get "stuck" in a bad solution.
#
display(Image("images/20-kmeans-bad-initialization.png", width=600))
How can we avoid the kind of bad initialization we just saw?
A good strategy is to pick starting centers that are distant from one another. This strategy is called "k-means++".
It works very well in practice, and the scikit-learn implementation of k-means uses it by default.
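For intuition, here is a minimal sketch of the k-means++ seeding idea (not scikit-learn's actual implementation): the first center is chosen uniformly at random, and each subsequent center is chosen with probability proportional to its squared distance from the nearest center picked so far.
def kmeanspp_init(X, k, rng=None):
    # Illustrative k-means++ seeding: spread the initial centers out.
    rng = np.random.default_rng(rng)
    centers = [X[rng.integers(len(X))]]     # first center: uniformly at random
    for _ in range(k - 1):
        # squared distance from each point to its nearest chosen center
        d2 = np.min([np.sum((X - c)**2, axis=1) for c in centers], axis=0)
        # next center: chosen with probability proportional to d2
        centers.append(X[rng.choice(len(X), p=d2 / d2.sum())])
    return np.array(centers)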
Given the tendency of k-means to look for spherical clusters, we should consider the scales of the various features.
In fact, in general when constructing or selecting a distance metric, one needs to think carefully about the scale of the features being used.
Example. Consider the case where we are clustering people based on their age, income, and whether they have children.
We might use age in years, income in dollars, and a 0/1 indicator for whether the person has children.
Thus, a record for a 27-year-old with an income of \$75,000 who has a child, and one for a 45-year-old with an income of \$42,000 and no children, would be encoded in feature space as:

$$\begin{bmatrix}27\\75000\\1\end{bmatrix},\quad\begin{bmatrix}45\\42000\\0\end{bmatrix}$$

What would happen if we used Euclidean distance as our dissimilarity metric in this feature space? (This is what k-means uses.)
Clearly, income would dominate the other two features: a difference in parenthood counts for about as much as a difference of one dollar in yearly income.
We are unlikely to expose parenting-based differences if we cluster using this representation.
The most common way to handle this is feature scaling.
The basic idea is to rescale each feature separately, so that its range of values is about the same as all other features.
For example, one may choose to rescale each feature to the range $[0, 1]$ (min-max scaling), or to standardize each feature to have zero mean and unit variance ($z$-scores).
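As a sketch of what this looks like in code (using scikit-learn's preprocessing utilities; the third record below is made up so the scalers have something nontrivial to work with):
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# records: [age, income, has_children]
people = np.array([[27, 75000, 1],
                   [45, 42000, 0],
                   [33, 58000, 1]], dtype=float)

# standardize: each column gets zero mean and unit variance
print(StandardScaler().fit_transform(people))

# min-max: each column is rescaled to the range [0, 1]
print(MinMaxScaler().fit_transform(people))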
Generally, we would say that, given some k, the k-means algorithm "learns" the cluster centers -- that is, the parameters of the model.
But we have not yet considered how to choose the right number of clusters.
That's typically not something one knows in advance.
Our basic strategy will be to cluster the data for a range of values of k, and then evaluate how good each resulting clustering is.
How do we know whether the clusters we get represent "real" structure in our data?
Consider this dataset, which we have clustered using k-means:
from sklearn.cluster import KMeans

# 100 points with uniformly random coordinates in the unit square
unif_X = np.random.default_rng().uniform(0, 1, 100)
unif_Y = np.random.default_rng().uniform(0, 1, 100)
df = pd.DataFrame(np.column_stack([unif_X, unif_Y]), columns = ['X', 'Y'])

# cluster them with k-means anyway
kmeans = KMeans(init = 'k-means++', n_clusters = 3, n_init = 100)
df['label'] = kmeans.fit_predict(df[['X', 'Y']])

df.plot('X', 'Y', kind = 'scatter', c = 'label', colormap='viridis', colorbar = False)
plt.axis('equal')
plt.axis('off');
In fact, this dataset was generated using uniform random numbers for the coordinates.
So ... it truly has no clusters.
The point is: any clustering algorithm will always output some "clustering" of the data.
The question is, does the clustering reflect real structure?
# From xkcd.com
display(Image("images/08-kmeans-xkcd.png", width=400))
Generally we encounter two problems:
1. Do the clusters found reflect real structure in the data, or are they an artifact of the algorithm?
2. If there is real structure, how many clusters are present?
There is often no definitive answer to either of these questions. You will need to use your judgment in answering them.
That said, we can define some metrics that will help to evaluate the extent to which clustering appears useful.
One tool we may be able to use in some settings is external information about the data.
In particular, we may have knowledge from some other source about the nature of the clusters in the data.
In that case, what we need is a way to compare a proposed clustering with some externally-known, "ground truth" clustering.
The Rand Index is a similarity measure for clusterings. We can use it to compare two clusterings.
Or, if we are testing an algorithm on data for which we know ground truth, we can use it to assess the algorithm's accuracy.
Defining the Rand Index.
Each item in our dataset is assumed to have two labelings, one for each clustering.
For example, we could have a ground truth label assignment T and a comparison clustering C.
import sklearn.datasets as sk_data

# three synthetic blobs serve as the "ground truth" clustering T
X_rand, y_rand = sk_data.make_blobs(n_samples=[100, 250, 150], centers = [[1, 2],[1.5, 3], [2, 4]], n_features = 2,
center_box = (-10.0, 10.0), cluster_std = [.2, .3, .2], random_state = 0)
df_rand_gt = pd.DataFrame(np.column_stack([X_rand[:, 0], X_rand[:, 1], y_rand]), columns = ['X', 'Y', 'label'])
df_rand_clust = df_rand_gt.copy()
kmeans = KMeans(init = 'k-means++', n_clusters = 3, n_init = 100)
df_rand_clust['label'] = kmeans.fit_predict(df_rand_gt[['X', 'Y']])
figs, axs = plt.subplots(1, 2, figsize = (12, 5))
df_rand_gt.plot('X', 'Y', kind = 'scatter', c = 'label', colormap='viridis', ax = axs[0],
colorbar = False)
axs[0].set_title('Ground Truth (T)')
axs[0].set_axis_off()
df_rand_clust.plot('X', 'Y', kind = 'scatter', c = 'label', colormap='viridis', ax = axs[1],
colorbar = False)
axs[1].set_title('Clustering (C)')
axs[1].set_axis_off();
Intuitively, the idea behind Rand Index is to consider pairs of points, and ask whether pairs that fall into the same cluster in T also fall into the same cluster in C.
Specifically:
Let $a$ be the number of pairs that have the same label in T and the same label in C.
Let $b$ be the number of pairs that have different labels in T and different labels in C.
Then the Rand Index is:

$$\text{RI}(T,C) = \frac{a+b}{\binom{n}{2}}$$
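Here is a sketch of this computation on tiny made-up labelings, checked against scikit-learn's metrics.rand_score:
from itertools import combinations
from sklearn.metrics import rand_score

# tiny made-up labelings, for illustration only
T = [0, 0, 1, 1, 2]   # "ground truth" labels
C = [1, 1, 0, 0, 0]   # a comparison clustering

# a: pairs with the same label in both T and C; b: pairs with different labels in both
a = sum(1 for i, j in combinations(range(len(T)), 2) if T[i] == T[j] and C[i] == C[j])
b = sum(1 for i, j in combinations(range(len(T)), 2) if T[i] != T[j] and C[i] != C[j])

n_pairs = len(T) * (len(T) - 1) // 2
print((a + b) / n_pairs)     # Rand Index from the definition
print(rand_score(T, C))      # the library function agrees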
How do we know whether a particular Rand Index (RI) score is significant?
We might compare it to the RI for a random assignment of points to labels.
This leads to the Adjusted Rand Index.
Definition of Adjusted Rand Index.
To "calibrate" the Rand Index this way, we use the expected Rand Index of random labelings, denoted E[\text{RI}].
The Expected Rand Index considers C to be a clustering that has the same cluster sizes as T, but labels are assigned at random.
Using that, we define the adjusted Rand index as a simple rescaling of RI:
$$\text{ARI} = \frac{\text{RI} - E[\text{RI}]}{\max(\text{RI}) - E[\text{RI}]}$$

The computation of $E[\text{RI}]$ and $\max(\text{RI})$ is simple combinatorics (we'll omit the derivation).
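As a quick illustration (tiny made-up labels, using scikit-learn's adjusted_rand_score): relabeling the clusters does not change the ARI, while an unrelated labeling scores near or below zero.
from sklearn.metrics import adjusted_rand_score

truth = [0, 0, 1, 1, 2, 2]
print(adjusted_rand_score(truth, [2, 2, 0, 0, 1, 1]))   # same grouping, different labels -> 1.0
print(adjusted_rand_score(truth, [0, 1, 2, 0, 1, 2]))   # unrelated grouping -> near or below 0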
Example.
Let's consider again our 3-cluster dataset with known labels $y$.
# rX is the raw data matrix of this dataset, defined earlier in the notebook
sns.heatmap(rX, xticklabels = False, yticklabels = False, linewidths = 0);
Here is the adjusted Rand Index, when using k-means to cluster this dataset for 1 to 10 clusters:
import sklearn.metrics as metrics

def ri_evaluate_clusters(X, max_clusters, ground_truth):
    # adjusted Rand index of k-means clusterings for k = 1, ..., max_clusters
    ri = np.zeros(max_clusters + 1)
    for k in range(1, max_clusters + 1):
        kmeans = KMeans(init='k-means++', n_clusters=k, n_init=10)
        kmeans.fit_predict(X)
        ri[k] = metrics.adjusted_rand_score(kmeans.labels_, ground_truth)
    return ri
ri = ri_evaluate_clusters(X, 10, y)
plt.plot(range(1,len(ri)), ri[1:], 'o-')
plt.xlabel('Number of clusters')
plt.title('$k$-means Clustering Compared to Known Labels')
plt.ylabel('Adjusted Rand Index');
An important question we face in evaluating a clustering is how many clusters are present.
In practice, to use k-means or most other clustering methods, one must choose k, the number of clusters, via some process.
The first thing you might do is to look at the k-means objective function and see if it levels off after a certain point.
Recall that the k-means objective can be considered the clustering "error".
If the error stops going down, that would suggest that the clustering is not improving as the number of clusters is increased.
error = np.zeros(11)
for k in range(1, 11):
    kmeans = KMeans(init='k-means++', n_clusters = k, n_init = 10)
    kmeans.fit_predict(X)
    error[k] = kmeans.inertia_   # inertia_ is the value of the k-means objective
Recall our synthetic data from earlier.
#
import sklearn.manifold
import sklearn.metrics as metrics
euclidean_dists = metrics.euclidean_distances(X)
mds = sklearn.manifold.MDS(n_components = 2, max_iter = 3000, eps = 1e-9, random_state = 0,
dissimilarity = "precomputed", n_jobs = 1)
fit = mds.fit(euclidean_dists)
pos = fit.embedding_
plt.axis('equal')
plt.scatter(pos[:, 0], pos[:, 1], s = 8);
For this synthetic dataset, here is the k-means objective, as a function of k:
plt.plot(range(1, len(error)), error[1:], 'o-')
plt.xlabel('Number of clusters')
plt.title(r'$k$-means clustering performance of synthetic data')
plt.ylabel('Error');
Warning: This synthetic data is not at all typical. You will almost never see such a sharp change in the error function as we see here.
Let's create a function for later use.
def evaluate_clusters(X, max_clusters):
    # k-means objective (inertia) for k = 1, ..., max_clusters
    error = np.zeros(max_clusters + 1)
    for k in range(1, max_clusters + 1):
        kmeans = KMeans(init='k-means++', n_clusters=k, n_init=10)
        kmeans.fit_predict(X)
        error[k] = kmeans.inertia_
    return error
Of course, normally, the ground truth labels are not known.
In that case, evaluation must be performed using the model itself.
Recall our definition of clustering:
a grouping of data objects, such that the objects within a group are similar (or near) to one another and dissimilar (or far) from the objects in other groups.
This suggests a metric that could evaluate a clustering: comparing the distances between points within a cluster, to the distances between points in different clusters.
The Silhouette Coefficient is an example of such an evaluation, where a higher Silhouette Coefficient score relates to a model with "better defined" clusters.
(sklearn.metrics.silhouette_score)
Let a be the mean distance between a data point and all other points in the same cluster.
Let b be the mean distance between a data point and all other points in the next nearest cluster.
Then the Silhouette Coefficient for that point is

$$s = \frac{b-a}{\max(a, b)},$$

and the Silhouette Coefficient of a clustering is the mean of $s$ over all points.
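As a sketch: scikit-learn's metrics.silhouette_samples returns this per-point value, and metrics.silhouette_score is its mean (tiny made-up data for illustration).
from sklearn.metrics import silhouette_samples, silhouette_score

pts = np.array([[0.0, 0.0], [0.0, 1.0], [5.0, 5.0], [5.0, 6.0]])
labs = np.array([0, 0, 1, 1])

per_point = silhouette_samples(pts, labs)    # s = (b - a) / max(a, b) for each point
print(per_point, per_point.mean(), silhouette_score(pts, labs))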
Let's apply the Silhouette Coefficient to our synthetic dataset.
# 'labels' is the cluster assignment for X computed earlier
sc = metrics.silhouette_score(X, labels, metric='euclidean')
print(sc)
0.8319348841402534
#
def sc_evaluate_clusters(X, max_clusters, n_init, seed):
    # silhouette score of k-means clusterings for k = 2, ..., max_clusters
    # (the silhouette score is not defined for a single cluster)
    s = np.zeros(max_clusters + 1)
    for k in range(2, max_clusters + 1):
        kmeans = KMeans(init='k-means++', n_clusters = k, n_init = n_init, random_state = seed)
        kmeans.fit_predict(X)
        s[k] = metrics.silhouette_score(X, kmeans.labels_, metric = 'euclidean')
    return s
s = sc_evaluate_clusters(X, 10, 10, 1)
plt.plot(range(2, len(s)), s[2:], 'o-')
plt.xlabel('Number of Clusters')
plt.title('$k$-means clustering performance on synthetic data')
plt.ylabel('Silhouette Score');
Again, these results are cleaner than what you will typically see.
But the general idea is to look for a local maximum of the Silhouette Coefficient as a guide to the number of clusters.