In [1]:
#
import numpy as np
import scipy as sp
import pandas as pd
import matplotlib as mp
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
import sklearn.datasets as sk_data
import laUtilities as ut
import slideUtilities as sl
import demoUtilities as dm
from matplotlib import animation
from importlib import reload
from datetime import datetime
from IPython.display import Image, display_html, display, Math, HTML;
qr_setting = None

mp.rcParams['animation.html'] = 'jshtml';

Announcements¶

  • HW 2 is due Friday (2/10) at 8pm

  • Upcoming office hours:

    • Today: Prof McDonald from 4:30-6pm in CCDS 1341
    • Tomorrow: Peer tutor Rohan Anand from 1:30-3pm in CCDS 16th floor
  • Read Tan, Steinbach, Karpatne, Kumar Chapter 7.3

Recap of Last Lecture¶

Clustering is a very important way of discovering structure in data. It represents an example of unsupervised learning.

  • Supervised methods: Data items have labels, and we want to learn a function that correctly assigns labels to new data items.

  • Unsupervised methods: Data items do not have labels, and we want to learn a function that extracts important patterns from the data.

We can often simplify or compress our data if we recognize the existence of clusters.

Informally, a clustering is:

a grouping of data objects, such that the objects within a group are similar (or near) to one another and dissimilar (or far) from the objects in other groups.

$k$-means Problem: Find $k$ points $c_1, \dots, c_k$ (called centers, centroids, or means), so that the sum of squared distances from every point to its closest center,

$$\sum_{i=1}^{n} \min_{j} \Vert x_i - c_j \Vert^2,$$

is minimized.

The (approximate) $k$-means algorithm:

  1. Pick $k$ cluster centers $\{c_1, \dots, c_k\}$. These can be chosen randomly, or by some other method.
  2. For each $j$, define the cluster $X_j$ as the set of points in $X$ that are closest to center $c_j$ (nearer to $c_j$ than to any other center).
  3. For each $j$, redefine $c_j$ to be the center of mass of cluster $X_j$ (in other words, $c_j$ is the mean of the vectors in $X_j$).
  4. Repeat (i.e., go to Step 2) until convergence.
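
To make the steps concrete, here is a minimal numpy sketch of this loop (in practice you would use sklearn.cluster.KMeans; the function name kmeans_sketch and the random initialization below are illustrative, and the sketch assumes no cluster ever becomes empty):

In [ ]:
import numpy as np

def kmeans_sketch(X, k, n_iter=100, seed=0):
    rng = np.random.default_rng(seed)
    # Step 1: pick k initial centers (here: k distinct random data points)
    centers = X[rng.choice(len(X), size=k, replace=False)]
    for _ in range(n_iter):
        # Step 2: assign each point to its closest center
        dists = np.linalg.norm(X[:, None, :] - centers[None, :, :], axis=2)
        labels = dists.argmin(axis=1)
        # Step 3: move each center to the mean of its cluster
        new_centers = np.array([X[labels == j].mean(axis=0) for j in range(k)])
        # Step 4: stop when the centers no longer move
        if np.allclose(new_centers, centers):
            break
        centers = new_centers
    return labels, centers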

In [8]:
#
display(Image("images/20-kmeans-example.png", width=600))

We discussed two metrics for testing how well $k$-means clustering has done... and, as a result, for finding the best $k$.

  • If ground truth is known, we can use the Rand Index: a similarity measure that allows us to compare the ground truth $T$ versus the result of $k$-means clustering, $C$.

    If $a$ denotes the number of pairs of points that have the same label in both $T$ and $C$, and $b$ denotes the number of pairs that have different labels in both $T$ and $C$, then the Rand Index is:

$$RI(T, C) = \frac{a + b}{\binom{n}{2}}$$
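
Here is a sketch that computes the Rand Index directly by counting pairs; T and C below are illustrative label arrays, and the function name rand_index is our own (sklearn.metrics.rand_score computes the same quantity):

In [ ]:
from itertools import combinations
import numpy as np

def rand_index(T, C):
    a = b = 0
    for i, j in combinations(range(len(T)), 2):
        same_T = T[i] == T[j]
        same_C = C[i] == C[j]
        a += int(same_T and same_C)              # pair grouped together in both
        b += int((not same_T) and (not same_C))  # pair separated in both
    n_pairs = len(T) * (len(T) - 1) // 2         # this is (n choose 2)
    return (a + b) / n_pairs

T = np.array([0, 0, 1, 1, 2])
C = np.array([0, 0, 1, 2, 2])
print(rand_index(T, C))  # matches sklearn.metrics.rand_score(T, C)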
In [14]:
#
figs, axs = plt.subplots(1, 2, figsize = (12, 5))
df_rand_gt.plot('X', 'Y', kind = 'scatter', c = 'label', colormap='viridis', ax = axs[0],
                   colorbar = False)
axs[0].set_title('Ground Truth (T)')
axs[0].set_axis_off()
df_rand_clust.plot('X', 'Y', kind = 'scatter', c = 'label', colormap='viridis', ax = axs[1],
                  colorbar = False)
axs[1].set_title('Clustering (C)')
axs[1].set_axis_off();
  • If ground truth is not known, we can use the Silhouette Coefficient: an attempt to measure when a model produces "better defined" clusters, in the sense that points in the same cluster are close to each other and far from the other clusters.

    If $a$ equals the mean distance between a data point and all other points in the same cluster, and $b$ equals the mean distance between that data point and all points in the next nearest cluster, then the Silhouette Coefficient for that point is:

$$s = \frac{b - a}{\max(a, b)}$$

    (The Silhouette Coefficient for a whole clustering is the mean of $s$ over all points.)
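
In practice, sklearn computes the mean Silhouette Coefficient over all points directly. A quick illustrative sketch (the blobs below are made-up data, not the lecture's dataset):

In [ ]:
import sklearn.datasets as sk_data
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

X_demo, _ = sk_data.make_blobs(n_samples=200, centers=3, random_state=0)
for k in [2, 3, 4]:
    labels = KMeans(n_clusters=k, n_init=10, random_state=0).fit_predict(X_demo)
    # higher is better; we expect a peak at k = 3 for 3 well-separated blobs
    print(k, silhouette_score(X_demo, labels))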

9. Hierarchical Clustering¶

9.1 Limits of Partitional Clustering¶

Today, we will look at a fairly different approach to clustering.

So far, we have been thinking of clustering as finding a partition of our dataset.

That is, a set of nonoverlapping clusters, in which each data item is in one cluster.

However, in many cases, the notion of a strict partition is not as useful.

Question. How many clusters would you say there are here? That is: what is the best choice of $k$ for $k$-means clustering?

In [2]:
#
X_rand, y_rand = sk_data.make_blobs(n_samples=[100, 100, 250, 70, 75, 80], centers = [[1, 2], [1.5, 1], [3, 2], [1.75, 3.25], [2, 4], [2.25, 3.25]], n_features = 2,
                          center_box = (-10.0, 10.0), cluster_std = [.2, .2, .3, .1, .15, .15], random_state = 0)
df_rand = pd.DataFrame(np.column_stack([X_rand[:, 0], X_rand[:, 1], y_rand]), columns = ['X', 'Y', 'label'])
df_rand = df_rand.astype({'label': 'int'})
df_rand['label2'] = [{0: 0, 1: 1, 2: 2, 3: 3, 4: 3, 5: 3}[x] for x in df_rand['label']]
df_rand['label3'] = [{0: 0, 1: 0, 2: 1, 3: 2, 4: 2, 5: 2}[x] for x in df_rand['label']]
# kmeans = KMeans(init = 'k-means++', n_clusters = 3, n_init = 100)
# df_rand['label'] = kmeans.fit_predict(df_rand[['X', 'Y']])
In [3]:
#
df_rand.plot('X', 'Y', kind = 'scatter', colormap='viridis', 
                   colorbar = False, figsize = (6, 6))
plt.axis('square')
plt.axis('off');

Three clusters?

In [4]:
#
df_rand.plot('X', 'Y', kind = 'scatter', c = 'label3', colormap='viridis', 
                   colorbar = False, figsize = (6, 6))
plt.axis('square')
plt.axis('off');

Four clusters?

In [5]:
#
df_rand.plot('X', 'Y', kind = 'scatter', c = 'label2', colormap='viridis', 
                   colorbar = False, figsize = (6, 6))
plt.axis('square')
plt.axis('off');

Six clusters?

In [6]:
#
df_rand.plot('X', 'Y', kind = 'scatter', c = 'label', colormap='viridis', 
                   colorbar = False, figsize = (6, 6))
plt.axis('square')
plt.axis('off');

This dataset shows clustering on multiple scales.

To fully capture the structure in this dataset, two things are needed:

  1. Capturing the differing clusters depending on the scale
  2. Capturing the containment relations -- which clusters lie within other clusters

These observations motivate the notion of hierarchical clustering.

In hierarchical clustering, we move away from the partition notion of $k$-means, and instead capture a more complex arrangement that includes containment of one cluster within another.

9.2 Hierarchical Clustering¶

A hierarchical clustering produces a set of nested clusters organized into a tree.

A hierarchical clustering is visualized using a dendrogram

  • A tree-like diagram that records the containment relations among clusters.
In [5]:
#
display(Image("images/21-dendrogram.png", width=600))

Strengths of Hierarchical Clustering¶

Hierarchical clustering has a number of advantages:

First, a hierarchical clustering encodes many different clusterings. That is, it does not itself decide on the correct number of clusters.

A clustering is obtained by "cutting" the dendrogram at some level.

This means that you can make this crucial decision yourself, by inspecting the dendrogram.

Put another way, you can obtain any desired number of clusters.

In [6]:
#
display(Image("images/21-dendrogram-cut.png", width=600))
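
As a sketch of what this cutting looks like in code (assuming Z is a linkage matrix of the kind we will compute with hierarchy.linkage later in this lecture; the height threshold 1.5 is arbitrary):

In [ ]:
import scipy.cluster.hierarchy as hierarchy

# cut so that exactly 4 clusters remain
labels_by_k = hierarchy.fcluster(Z, 4, criterion='maxclust')

# or cut the dendrogram at a fixed height (distance) t
labels_by_height = hierarchy.fcluster(Z, t=1.5, criterion='distance')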

The second advantage is that the dendrogram may itself correspond to a meaningful structure, for example, a taxonomy.

In [7]:
#
display(Image("images/21-animal-taxonomy.jpg", width=600))

The third advantage is that many hierarchical clustering methods can be performed using either similarity (proximity) or dissimilarity (distance) metrics.

This can be very helpful!

(Note that techniques like $k$-means cannot be used with unmodified similarity metrics.)

Comparison to kk-means¶

Another aspect of hierarchical clustering is that it can handle certain cases better than $k$-means.

Because of the nature of the $k$-means algorithm, $k$-means tends to produce:

  • Roughly spherical clusters
  • Clusters of approximately equal size
  • Non-overlapping clusters

In many real-world situations, clusters may not be round, they may be of unequal size, and they may overlap.

Hence we would like clustering algorithms that can work in those cases also.

9.3 Hierarchical Clustering Algorithms¶

There are two main approaches to hierarchical clustering: "bottom-up" and "top-down." The two approaches start from opposite extremes: $n$ clusters (one per data point) or a single all-inclusive cluster.

Agglomerative Clustering ("bottom-up"):

  • Start by defining each point as its own cluster
  • At each successive step, merge the two clusters that are closest to each other (according to some rule to be determined)
  • Repeat until only one cluster is left.

Divisive Clustering ("top-down"):

  • Start with one, all-inclusive cluster
  • At each step, find the cluster split that creates the largest distance between resulting clusters
  • Repeat until each point is in its own cluster.

Agglomerative techniques are by far the more common.
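
To make the agglomerative loop concrete, here is a minimal (and deliberately inefficient) sketch that merges the closest pair of clusters at each step, using the "closest pair of points" rule defined below as single-linkage; scipy's hierarchy.linkage does this properly and efficiently. The function name agglomerate_sketch is our own, and the sketch stops at a desired number of clusters rather than running all the way down to one:

In [ ]:
import numpy as np
from scipy.spatial.distance import cdist

def agglomerate_sketch(X, n_clusters):
    clusters = [[i] for i in range(len(X))]  # start: every point is its own cluster
    while len(clusters) > n_clusters:
        # find the pair of clusters whose closest points are nearest each other
        best = (np.inf, None, None)
        for a in range(len(clusters)):
            for b in range(a + 1, len(clusters)):
                d = cdist(X[clusters[a]], X[clusters[b]]).min()
                if d < best[0]:
                    best = (d, a, b)
        _, a, b = best
        clusters[a] = clusters[a] + clusters[b]  # merge the closest pair
        del clusters[b]                          # (b > a, so index a is unaffected)
    return clusters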

The key to both of these methods is defining the distance between two clusters.

Different definitions for the inter-cluster distance yield different clusterings.

To illustrate the impact of the choice of cluster distances, we'll focus on agglomerative clustering.

Defining Cluster Proximity¶

Given two clusters, how do we define the distance between them?

Here are three natural ways to do it:

In [9]:
# Single, Complete, and Average Linkage
display(Image("images/21-hierarchical-criteria.png", width=1000))
  • Single-Linkage: the distance between two clusters is the distance between the closest two points that are in different clusters.

$$D_{\mathrm{single}}(i, j) = \min_{x, y} \{ d(x, y) \mid x \in C_i, y \in C_j \}$$

  • Complete-Linkage: the distance between two clusters is the distance between the farthest two points that are in different clusters.

$$D_{\mathrm{complete}}(i, j) = \max_{x, y} \{ d(x, y) \mid x \in C_i, y \in C_j \}$$

  • Average-Linkage: the distance between two clusters is the average distance between all pairs of points from different clusters.

$$D_{\mathrm{average}}(i, j) = \frac{1}{|C_i| \cdot |C_j|} \sum_{x \in C_i, y \in C_j} d(x, y)$$

Notice that it is easy to express the definitions above in terms of similarity instead of distance.
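
All three definitions can be computed directly from the matrix of pairwise distances. Here is a small sketch with two illustrative clusters (Ci and Cj are made-up points):

In [ ]:
import numpy as np
from scipy.spatial.distance import cdist

Ci = np.array([[0.0, 0.0], [0.0, 1.0]])
Cj = np.array([[3.0, 0.0], [4.0, 1.0]])
pairwise = cdist(Ci, Cj)   # all pairwise distances d(x, y), x in Ci, y in Cj
print(pairwise.min())      # single linkage
print(pairwise.max())      # complete linkage
print(pairwise.mean())     # average linkage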

Here is a set of 6 points that we will cluster to show the differences between these linkage criteria.

In [44]:
import matplotlib.pyplot as plt

pt_x = [0.4, 0.22, 0.35, 0.26, 0.08, 0.45]
pt_y = [0.53, 0.38, 0.32, 0.19, 0.41, 0.30]
plt.plot(pt_x, pt_y, 'o', markersize = 10, color = 'k')
plt.ylim([.15, .60])
plt.xlim([0.05, 0.70])
for i in range(6):
    plt.annotate(f'{i}', (pt_x[i]+0.02, pt_y[i]-0.01), fontsize = 12)
plt.axis('off');

And here is the matrix showing the distance between each pair of points.

In [46]:
X = np.array([pt_x, pt_y]).T
from scipy.spatial import distance_matrix
labels = ['p0', 'p1', 'p2', 'p3', 'p4', 'p5']
D = pd.DataFrame(distance_matrix(X, X), index = labels, columns = labels)
D.style.format('{:.2f}') # round to 2 digits after the decimal
Out[46]:
  p0 p1 p2 p3 p4 p5
p0 0.00 0.23 0.22 0.37 0.34 0.24
p1 0.23 0.00 0.14 0.19 0.14 0.24
p2 0.22 0.14 0.00 0.16 0.28 0.10
p3 0.37 0.19 0.16 0.00 0.28 0.22
p4 0.34 0.14 0.28 0.28 0.00 0.39
p5 0.24 0.24 0.10 0.22 0.39 0.00

Single-Linkage Clustering¶

In [10]:
#
display(Image("images/21-singlelink-pointset.png", width=500))
In [9]:
import scipy.cluster
import scipy.cluster.hierarchy as hierarchy
Z = hierarchy.linkage(X, method='single')
hierarchy.dendrogram(Z);

Advantages:

  • Single-linkage clustering can handle non-elliptical shapes.

    In fact it can produce long, elongated clusters:

In [10]:
#
X_moon_05, y_moon_05 = sk_data.make_moons(random_state = 0, noise = 0.05)
Z = hierarchy.linkage(X_moon_05, method='single')
labels = hierarchy.fcluster(Z, 2, criterion = 'maxclust')
plt.scatter(X_moon_05[:,0], X_moon_05[:,1], c = [['b','g'][i-1] for i in labels])
plt.title('Single-Linkage Can Find Irregularly Shaped Clusters')
plt.axis('off');
  • Single-linkage clustering can find clusters of different sizes.
In [11]:
#
X_rand_lo, y_rand_lo = sk_data.make_blobs(n_samples=[20, 200], centers = [[1, 1], [3, 1]], n_features = 2,
                          center_box = (-10.0, 10.0), cluster_std = [.1, .5], random_state = 0)
Z = hierarchy.linkage(X_rand_lo, method='single')
labels = hierarchy.fcluster(Z, 2, criterion = 'maxclust')
plt.scatter(X_rand_lo[:,0], X_rand_lo[:,1], c = [['b','g'][i-1] for i in labels])
plt.title('Single-Linkage Can Find Different-Sized Clusters')
plt.axis('off');

Disadvantages:

  • Single-linkage clustering can be sensitive to noise and outliers.
In [12]:
#
X_moon_10, y_moon_10 = sk_data.make_moons(random_state = 0, noise = 0.1)
Z = hierarchy.linkage(X_moon_10, method='single')
labels = hierarchy.fcluster(Z, 2, criterion = 'maxclust')
plt.scatter(X_moon_10[:,0], X_moon_10[:,1], c = [['b','g'][i-1] for i in labels])
plt.title('Single-Linkage Clustering Changes Drastically on Slightly More Noisy Data')
plt.axis('off');
In [13]:
#
X_rand_hi, y_rand_hi = sk_data.make_blobs(n_samples=[20, 200], centers = [[1, 1], [3, 1]], n_features = 2,
                          center_box = (-10.0, 10.0), cluster_std = [.15, .6], random_state = 0)
Z = hierarchy.linkage(X_rand_hi, method='single')
labels = hierarchy.fcluster(Z, 2, criterion = 'maxclust')
plt.title('Single-Linkage Clustering Changes Drastically on Slightly More Noisy Data')
plt.scatter(X_rand_hi[:,0], X_rand_hi[:,1], c = [['b','g'][i-1] for i in labels])
plt.axis('off');

Complete-Linkage Clustering¶

In [11]:
#
display(Image("images/21-completelink-pointset.png", width=500))
In [14]:
Z = hierarchy.linkage(X, method='complete')
hierarchy.dendrogram(Z);

Properties:

  • Produces more-balanced clusters -- more-equal diameters
In [15]:
#
X_moon_05, y_moon_05 = sk_data.make_moons(random_state = 0, noise = 0.05)
Z = hierarchy.linkage(X_moon_05, method='complete')
labels = hierarchy.fcluster(Z, 2, criterion = 'maxclust')
plt.scatter(X_moon_05[:,0], X_moon_05[:,1], c = [['b','g'][i-1] for i in labels])
plt.title('Complete-Linkage Seeks Globular Clusters of Similar Size')
plt.axis('off');
  • Less susceptible to noise
In [16]:
#
Z = hierarchy.linkage(X_rand_hi, method='complete')
labels = hierarchy.fcluster(Z, 2, criterion = 'maxclust')
plt.scatter(X_rand_hi[:,0], X_rand_hi[:,1], c = [['b','g'][i-1] for i in labels])
plt.title('Complete-Linkage Seeks Globular Clusters of Similar Size')
plt.axis('off');
In [17]:
#
Z = hierarchy.linkage(X_moon_10, method='complete')
labels = hierarchy.fcluster(Z, 2, criterion = 'maxclust')
plt.scatter(X_moon_10[:,0], X_moon_10[:,1], c = [['b','g'][i-1] for i in labels])
plt.title('Complete-Linkage Clustering of Noisy Data similar to Less Noisy')
plt.axis('off');

Average-Linkage Clustering¶

In [12]:
#
display(Image("images/21-averagelink-pointset.png", width=500))
In [18]:
Z = hierarchy.linkage(X, method='average')
hierarchy.dendrogram(Z);

Average-linkage clustering is in some sense a compromise between single-linkage and complete-linkage clustering.

Strengths:

  • Less susceptible to noise and outliers

Limitations:

  • Biased toward elliptical clusters
In [19]:
#
Z = hierarchy.linkage(X_moon_10, method='average')
labels = hierarchy.fcluster(Z, 2, criterion = 'maxclust')
plt.scatter(X_moon_10[:,0], X_moon_10[:,1], c = [['b','g'][i-1] for i in labels])
plt.title('Average-Linkage Similar to Complete - Globular Clusters')
plt.axis('off');
In [20]:
#
Z = hierarchy.linkage(X_rand_hi, method='average')
labels = hierarchy.fcluster(Z, 2, criterion = 'maxclust')
plt.scatter(X_rand_hi[:,0], X_rand_hi[:,1], c = [['b','g'][i-1] for i in labels])
plt.title('Average-Linkage More resistant to noise than Single-Linkage')
plt.axis('off');

Ward's Distance¶

Finally, we consider one more cluster distance.

Ward's distance asks

What if we combined these two clusters -- how would clustering improve?

To define "how would clustering improve?" we appeal to the kk-means criterion.

So:

Ward's Distance between clusters $C_i$ and $C_j$ is the increase in the total within-cluster sum of squares when the two clusters are merged into a new cluster $C_{i+j}$, compared to keeping them separate:

$$D_{\mathrm{Ward}}(i, j) = \sum_{x \in C_{i+j}} \Vert x - r_{i+j} \Vert^2 \; - \; \sum_{x \in C_i} \Vert x - r_i \Vert^2 \; - \; \sum_{x \in C_j} \Vert x - r_j \Vert^2$$

where $r_i, r_j, r_{i+j}$ are the corresponding cluster centroids.
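
Here is a small sketch computing Ward's distance from this definition (Ci and Cj are made-up clusters; scipy applies the same idea internally when you pass method='ward'):

In [ ]:
import numpy as np

def sse(C):
    # within-cluster sum of squares around the centroid
    return ((C - C.mean(axis=0)) ** 2).sum()

Ci = np.array([[0.0, 0.0], [0.0, 1.0]])
Cj = np.array([[3.0, 0.0], [4.0, 1.0]])
merged = np.vstack([Ci, Cj])
print(sse(merged) - sse(Ci) - sse(Cj))  # the increase in SSE from merging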

In a sense, this cluster distance results in a hierarchical analog of $k$-means.

As a result, it has properties similar to $k$-means:

  • Less susceptible to noise and outliers
  • Biased toward elliptical clusters

Hence it tends to behave more like group-average (average-linkage) hierarchical clustering.

9.4 Hierarchical Clustering on Synthetic Data¶

Now we'll look at doing hierarchical clustering in practice, using Python.

We'll use the same synthetic data as we did in the k-means case -- i.e., three "blobs" living in 30 dimensions.

In [21]:
X, y = sk_data.make_blobs(n_samples=100, centers=3, n_features=30,
                          center_box=(-10.0, 10.0),random_state=0)

As a reminder, here is the visualization of the raw data: first as a heatmap, then as an embedding into 2-D (using MDS).

In [22]:
#
sns.heatmap(X, xticklabels=False, yticklabels=False, linewidths=0,cbar=False);
In [23]:
#
import sklearn.manifold
import sklearn.metrics as metrics
euclidean_dists = metrics.euclidean_distances(X)
mds = sklearn.manifold.MDS(n_components = 2, max_iter = 3000, eps = 1e-9, random_state = 0,
                   dissimilarity = "precomputed", n_jobs = 1)
fit = mds.fit(euclidean_dists)
pos = fit.embedding_
plt.axis('equal')
plt.scatter(pos[:, 0], pos[:, 1], s = 8);

Hierarchical clustering is available in sklearn, but the scipy package has a much more fully developed set of tools, so that is the one we will use.

In [24]:
import scipy.cluster
import scipy.cluster.hierarchy as hierarchy
import scipy.spatial.distance

# linkages = ['single','complete','average','weighted','ward']
Z = hierarchy.linkage(X, method = 'single')
In [25]:
R = hierarchy.dendrogram(Z)

9.5 Hierarchical Clustering on Real Data¶

Let's use the "20 Newsgroups" data provided as example data in sklearn.

(http://scikit-learn.org/stable/datasets/twenty_newsgroups.html).

In [26]:
from sklearn.datasets import fetch_20newsgroups
categories = ['comp.os.ms-windows.misc', 'sci.space','rec.sport.baseball']
news_data = fetch_20newsgroups(subset = 'train', categories = categories)
In [27]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(stop_words='english', min_df = 4, max_df = 0.8)
data = vectorizer.fit_transform(news_data.data).todense()
data.shape
Out[27]:
(1781, 9409)
In [28]:
# metric can be 'braycurtis', 'canberra', 'chebyshev', 'cityblock', 'correlation', 'cosine',
# 'dice', 'euclidean', 'hamming', 'jaccard', 'kulsinski', 'mahalanobis', 'matching',
# 'minkowski', 'rogerstanimoto', 'russellrao', 'seuclidean', 'sokalmichener', 'sokalsneath',
# 'sqeuclidean', 'yule'. (Note: method='ward' requires metric='euclidean'.)
Z_20ng = hierarchy.linkage(data, method = 'ward', metric = 'euclidean')
plt.figure(figsize=(14,4))
R_20ng = hierarchy.dendrogram(Z_20ng, p=4, truncate_mode = 'level', show_leaf_counts=True)

Selecting the Number of Clusters¶

In [29]:
clusters = hierarchy.fcluster(Z_20ng, 3, criterion = 'maxclust')
print(clusters.shape)
clusters
(1781,)
Out[29]:
array([3, 3, 3, ..., 1, 3, 1], dtype=int32)
In [30]:
max_clusters = 20
s = np.zeros(max_clusters+1)
for k in range(2, max_clusters+1):
    clusters = hierarchy.fcluster(Z_20ng, k, criterion = 'maxclust')
    s[k] = metrics.silhouette_score(data, clusters, metric = 'euclidean')
plt.plot(range(2, len(s)), s[2:], '.-')
plt.xlabel('Number of clusters')
plt.ylabel('Silhouette Score');
In [31]:
print('Top Terms Per Cluster:')
k = 5
clusters = hierarchy.fcluster(Z_20ng, k, criterion = 'maxclust')
for i in range(1,k+1):
    items = np.array([item for item,clust in zip(data, clusters) if clust == i])
    centroids = np.squeeze(items).mean(axis = 0)
    order_centroids = centroids.argsort()[::-1]  # term indices, largest mean tf-idf first
    terms = vectorizer.get_feature_names()
    print(f'Cluster {i}:')
    for ind in order_centroids[:10]:
        print(f' {terms[ind]}')
    print('')
Top Terms Per Cluster:
Cluster 1:
 space
 nasa
 edu
 henry
 gov
 alaska
 access
 com
 moon
 digex

Cluster 2:
 ax
 max
 b8f
 g9v
 a86
 145
 1d9
 pl
 2di
 0t

Cluster 3:
 edu
 com
 year
 baseball
 article
 writes
 cs
 team
 game
 university

Cluster 4:
 risc
 instruction
 ghhwang
 csie
 set
 nctu
 cisc
 tw
 reduced
 mq

Cluster 5:
 windows
 edu
 file
 dos
 com
 files
 card
 drivers
 driver
 use
