#
import numpy as np
import scipy as sp
import pandas as pd
import matplotlib as mp
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
import laUtilities as ut
import slideUtilities as sl
import demoUtilities as dm
from matplotlib import animation
from importlib import reload
from datetime import datetime
from IPython.display import Image, display_html, display, Math, HTML;
qr_setting = None
mp.rcParams['animation.html'] = 'jshtml';
HW 1 is due today at 8pm.
HW 2 is out.
Upcoming office hours:
Read Boyd-Vandenberghe, Sections 3.1-3.2 and Chapter 4
The inner product $\mathbf{u}^T\mathbf{v}$ is the sum of the componentwise products of $\mathbf{u}$ and $\mathbf{v}$.
If $\mathbf{u} = \begin{bmatrix}u_1\\u_2\\\vdots\\u_n\end{bmatrix}$ and $\mathbf{v} = \begin{bmatrix}v_1\\v_2\\\vdots\\v_n\end{bmatrix},$ then the inner product of $\mathbf{u}$ and $\mathbf{v}$ is:
$$\mathbf{u}^T\mathbf{v} = u_1v_1 + u_2v_2 + \cdots + u_nv_n.$$
Definition. The norm of $\mathbf{v}$ is the nonnegative scalar $\Vert\mathbf{v}\Vert$ defined by
$$\Vert\mathbf{v}\Vert = \sqrt{\mathbf{v}^T\mathbf{v}} = \sqrt{v_1^2 + v_2^2 + \cdots + v_n^2}.$$
We can use the inner product to find the distance between two vectors:
$$\mathrm{dist}(\mathbf{u},\mathbf{v}) = \Vert\mathbf{u}-\mathbf{v}\Vert.$$
And because the Pythagorean Theorem says that $\mathbf{u}$ and $\mathbf{v}$ are perpendicular if and only if:
$$\Vert\mathbf{u}-\mathbf{v}\Vert^2 = \Vert\mathbf{u}\Vert^2 + \Vert\mathbf{v}\Vert^2,$$
we can say that two vectors $\mathbf{u}$ and $\mathbf{v}$ in $\mathbb{R}^n$ are orthogonal to each other if
$$\mathbf{u}^T\mathbf{v} = 0.$$
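To make these definitions concrete, here is a small numpy check of the inner product, norm, distance, and orthogonality test above (a minimal sketch; the vectors are chosen purely for illustration).
import numpy as np
u = np.array([2., 1., -2.])
v = np.array([1., 2., 2.])
# inner product: the sum of the componentwise products
inner = u @ v                      # same as np.dot(u, v)
# norm: the square root of the inner product of a vector with itself
norm_u = np.sqrt(u @ u)            # same as np.linalg.norm(u)
# distance between u and v: the norm of their difference
dist = np.linalg.norm(u - v)
# orthogonality test: u and v are orthogonal iff their inner product is zero
print(inner, norm_u, dist, np.isclose(inner, 0.0))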
There is an important connection between the inner product of two vectors and the angle between them. This connection is very useful (e.g., in visualizing data mining operations).
We start from the law of cosines:
$$c^2 = a^2 + b^2 - 2ab\cos\theta,$$
where $a$, $b$, and $c$ are the side lengths of a triangle and $\theta$ is the angle between the sides of length $a$ and $b$.
#
ax = ut.plotSetup(-6,6,-1,4,(12,6))
u = np.array([4.,1])
v = np.array([-4.,3])
pt = u + v
plt.plot([u[0],v[0]],[u[1],v[1]],'b-',lw=2)
plt.plot([0,u[0]],[0,u[1]],'b-',lw=2)
plt.plot([0,v[0]],[0,v[1]],'b-',lw=2)
m = (u-v)/2.0
mm = v + m
ax.text(mm[0]+0.25,mm[1]+0.25,r'${\bf c}$',size=16)
ax.text(2,0.15,r'${\bf a}$',size=16)
ax.text(-3,1,r'${\bf b}$',size=16)
ax.annotate('', xy=(0.2*v[0], 0.2*v[1]), xycoords='data',
xytext=(0.3*u[0], 0.3*u[1]), textcoords='data',
size=15,
#bbox=dict(boxstyle="round", fc="0.8"),
arrowprops={'arrowstyle': 'simple',
'fc': '0',
'ec': 'none',
'connectionstyle' : 'arc3,rad=0.5'},
)
ax.text(0.9,0.8,r'$\theta$',size=20)
plt.axis('off');
Now let's interpret this law in terms of vectors $\mathbf{u}$ and $\mathbf{v}$.
Once again, it is the angle that these vectors make at the origin that we are concerned with.
#
ax = ut.plotSetup(-6,6,-1,4,(12,6))
ut.centerAxes(ax)
u = np.array([4.,1])
v = np.array([-4.,3])
pt = u + v
plt.plot([u[0],v[0]],[u[1],v[1]],'b-',lw=2)
plt.plot([0,u[0]],[0,u[1]],'b-',lw=2)
plt.plot([0,v[0]],[0,v[1]],'b-',lw=2)
ut.plotVec(ax,u)
ut.plotVec(ax,v)
m = (u-v)/2.0
mm = v + m
ax.text(mm[0]+0.25,mm[1]+0.25,r'$\Vert{\bf u-v}\Vert$',size=16)
ax.text(2,0.15,r'$\Vert{\bf u}\Vert$',size=16)
ax.text(-3,1,r'$\Vert{\bf v}\Vert$',size=16)
ax.annotate('', xy=(0.2*v[0], 0.2*v[1]), xycoords='data',
xytext=(0.3*u[0], 0.3*u[1]), textcoords='data',
size=15,
#bbox=dict(boxstyle="round", fc="0.8"),
arrowprops={'arrowstyle': 'simple',
'fc': '0',
'ec': 'none',
'connectionstyle' : 'arc3,rad=0.5'},
)
ax.text(4.1, 0.8, r'$\bf u$', size = 14)
ax.text(-4.3, 2.9, r'$\bf v$', size = 14)
ax.text(0.9,0.8,r'$\theta$',size=20);
Applying the law of cosines we get:
$$\Vert\mathbf{u}-\mathbf{v}\Vert^2 = \Vert\mathbf{u}\Vert^2 + \Vert\mathbf{v}\Vert^2 - 2\,\Vert\mathbf{u}\Vert\,\Vert\mathbf{v}\Vert\cos\theta.$$
Now, previously we calculated that:
$$\Vert\mathbf{u}-\mathbf{v}\Vert^2 = \Vert\mathbf{u}\Vert^2 + \Vert\mathbf{v}\Vert^2 - 2\,\mathbf{u}^T\mathbf{v}.$$
Which means that
$$\mathbf{u}^T\mathbf{v} = \Vert\mathbf{u}\Vert\,\Vert\mathbf{v}\Vert\cos\theta.$$
So
$$\cos\theta = \frac{\mathbf{u}^T\mathbf{v}}{\Vert\mathbf{u}\Vert\,\Vert\mathbf{v}\Vert}.$$
This is a very important connection between the notion of inner product and trigonometry.
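As a quick numerical check (a sketch reusing the $\mathbf{u} = (4,1)$ and $\mathbf{v} = (-4,3)$ from the figure above, which are otherwise incidental), the formula gives the same angle as elementary 2-D geometry:
import numpy as np
u = np.array([4., 1.])
v = np.array([-4., 3.])
# angle from the inner-product formula: cos(theta) = u.v / (||u|| ||v||)
theta_formula = np.degrees(np.arccos(u @ v / (np.linalg.norm(u) * np.linalg.norm(v))))
# the same angle as the difference of the two directions (works in 2-D only)
theta_geometry = np.degrees(np.arctan2(v[1], v[0]) - np.arctan2(u[1], u[0]))
print(theta_formula, theta_geometry)   # both are about 129.1 degrees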
As a quick check, note that if $\mathbf{u}$ and $\mathbf{v}$ are nonzero, and $\mathbf{u}^T\mathbf{v} = 0$, then
$$\cos\theta = \frac{\mathbf{u}^T\mathbf{v}}{\Vert\mathbf{u}\Vert\,\Vert\mathbf{v}\Vert} = 0.$$
In other words, the angle between $\mathbf{u}$ and $\mathbf{v}$ is 90 degrees (or 270 degrees). So this agrees with our definition of orthogonality.
One implication in particular concerns unit vectors.
So
$$\cos\theta = \frac{\mathbf{u}^T\mathbf{v}}{\Vert\mathbf{u}\Vert\,\Vert\mathbf{v}\Vert} = \left(\frac{\mathbf{u}}{\Vert\mathbf{u}\Vert}\right)^T\left(\frac{\mathbf{v}}{\Vert\mathbf{v}\Vert}\right).$$
Note that $\frac{\mathbf{u}}{\Vert\mathbf{u}\Vert}$ and $\frac{\mathbf{v}}{\Vert\mathbf{v}\Vert}$ are unit vectors.
So we have the very simple rule that, for two unit vectors, their inner product is the cosine of the angle between them!
Example. Consider $\mathbf{u} = \begin{bmatrix}0.95\\0.31\end{bmatrix}$ and $\mathbf{v} = \begin{bmatrix}0.20\\0.98\end{bmatrix},$ which are both (approximately) unit vectors.
So
$$\mathbf{u}^T\mathbf{v} = (0.95)(0.20) + (0.31)(0.98) \approx 0.5.$$
So
$$\cos\theta \approx 0.5.$$
So $\theta \approx 60$ degrees.
#
u = np.array([0.95, np.sin(np.arccos(0.95))])
theta = (np.pi/3)+np.arccos(0.95)
v = np.array([np.cos(theta), np.sin(theta)])
ax = ut.plotSetup(-1.3,1.3,-1.3,1.3,(12,6))
ut.centerAxes(ax)
plt.axis('equal')
angles = 2.0*np.pi * np.array(range(101))/100.0
plt.plot(np.cos(angles),np.sin(angles),'r-')
ax.arrow(0,0,u[0],u[1],head_width=0.07, head_length=0.1,length_includes_head = True)
ax.arrow(0,0,v[0],v[1],head_width=0.07, head_length=0.1,length_includes_head = True)
ax.annotate('', xy=(0.2*v[0], 0.2*v[1]), xycoords='data',
xytext=(0.3*u[0], 0.3*u[1]), textcoords='data',
size=15,
#bbox=dict(boxstyle="round", fc="0.8"),
arrowprops={'arrowstyle': 'simple',
'fc': '0',
'ec': 'none',
'connectionstyle' : 'arc3,rad=0.5'})
mid = 0.4*(u+v)/2.0
ax.text(mid[0],mid[1],r'$\theta$',size=20)
ax.text(u[0]+.1,u[1]+.1,r'$(%0.2f,%0.2f)$'% (u[0],u[1]),size=16)
ax.text(v[0]+.1,v[1]+.1,r'$(%0.2f,%0.2f)$'% (v[0],v[1]),size=16);
Example. Find the angle formed by the vectors:
$$\mathbf{u} = \begin{bmatrix}1\\3\\-7\\-2\end{bmatrix}, \qquad \mathbf{v} = \begin{bmatrix}8\\-2\\4\\6\end{bmatrix}.$$
Solution. First normalize the vectors:
$$\Vert\mathbf{u}\Vert = \sqrt{1+9+49+4} = \sqrt{63}, \qquad \Vert\mathbf{v}\Vert = \sqrt{64+4+16+36} = \sqrt{120}.$$
So
$$\frac{\mathbf{u}}{\Vert\mathbf{u}\Vert} = \frac{1}{\sqrt{63}}\begin{bmatrix}1\\3\\-7\\-2\end{bmatrix}, \qquad \frac{\mathbf{v}}{\Vert\mathbf{v}\Vert} = \frac{1}{\sqrt{120}}\begin{bmatrix}8\\-2\\4\\6\end{bmatrix}.$$
Then calculate the cosine of the angle between them:
$$\cos\theta = \left(\frac{\mathbf{u}}{\Vert\mathbf{u}\Vert}\right)^T\left(\frac{\mathbf{v}}{\Vert\mathbf{v}\Vert}\right) = \frac{(1)(8)+(3)(-2)+(-7)(4)+(-2)(6)}{\sqrt{63}\,\sqrt{120}} = \frac{-38}{\sqrt{7560}} \approx -0.437.$$
Then:
$$\theta = \cos^{-1}(-0.437) \approx 115.9°.$$
We can also compute this angle using Python.
u = np.array([1.,3,-7,-2])
v = np.array([8.,-2,4,6])
# normalize to unit vectors
uNormalized = u/np.sqrt(u.T.dot(u)) # computing the norm manually, as long as the vector is nonzero
vNormalized = v/np.linalg.norm(v) # an easier way
print("Normalized u =", uNormalized)
print("Normalized v =", vNormalized)
Normalized u = [ 0.12598816 0.37796447 -0.8819171 -0.25197632] Normalized v = [ 0.73029674 -0.18257419 0.36514837 0.54772256]
# compute inner product
innerProduct = (vNormalized).dot(uNormalized)
print("Inner product =", innerProduct)
Inner product = -0.4370415209168243
# find angle, convert from radians to degrees
print("Angle theta =", np.arccos(innerProduct)*180/np.pi)
Angle theta = 115.91527033906208
Let's consider a data science application of this technique. Suppose we are given two documents $d_1$ and $d_2$.
For a goal such as information retrieval, we'd like to know whether the two documents are "similar".
One common way to formalize similarity is to say that documents are similar if they contain similar words. Here, we will view each document as a "bag of words," and ignore where each word occurs in the document.
We can measure this as follows: construct a vector $\mathbf{x}_1$ that counts the frequency of occurrence of certain words in $d_1$.
Do the same thing for $d_2$, yielding $\mathbf{x}_2$.
What we have done is taken individual documents and represented them as vectors $\mathbf{x}_1$ and $\mathbf{x}_2$ in a very high dimensional space, $\mathbb{R}^n$.
Here $n$ could be 10,000 or more.
Now, to ask whether the two documents are similar, we simply ask "do their vectors point in the same general direction?"
And, despite the fact that we are in a very high dimensional space which we cannot visualize, we can nonetheless answer the question easily.
The angle between the two document-vectors is simply:
$$\theta = \cos^{-1}\left(\frac{\mathbf{x}_1^T\mathbf{x}_2}{\Vert\mathbf{x}_1\Vert\,\Vert\mathbf{x}_2\Vert}\right).$$
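Here is a minimal sketch of this idea in numpy; the tiny vocabulary and "documents" are made up purely for illustration. The cosine of the angle between the bag-of-words vectors is a direct measure of how similar the documents are.
import numpy as np
# a tiny made-up vocabulary and two tiny made-up documents
vocabulary = ['data', 'science', 'linear', 'algebra', 'pizza']
d1 = 'data science uses linear algebra and more linear algebra'.split()
d2 = 'linear algebra is the science of vectors'.split()
# bag-of-words vectors: count how often each vocabulary word occurs in each document
x1 = np.array([d1.count(w) for w in vocabulary], dtype=float)
x2 = np.array([d2.count(w) for w in vocabulary], dtype=float)
# angle between the two document-vectors
cos_theta = x1 @ x2 / (np.linalg.norm(x1) * np.linalg.norm(x2))
print('cosine similarity:', cos_theta)
print('angle in degrees:', np.degrees(np.arccos(cos_theta)))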
But what if rather than having two documents, we have a million documents? Then it's less clear what to make of the similarity or dissimilarity of any pair of documents. We want a more systematic way to understand the patterns in the overall dataset.
When we do clustering, what problem are we trying to solve? We will answer this question informally at first. (But soon we will look at formal criteria!)
Informally, a clustering is:
a grouping of data objects, such that the objects within a group are similar (or near) to one another and dissimilar (or far) from the objects in other groups.
#
display(Image("images/20-clustering-1.png", width=350))
So we want our clustering algorithm to place similar (nearby) objects in the same group, and dissimilar (far apart) objects in different groups.
Here are the basic questions we need to ask about clustering:
Now note that even with our more-formal discussion, the criteria for deciding on a "best" clustering can still be ambiguous.
#
display(Image("images/20-clustering-3.png", width=650))
#
display(Image("images/20-clustering-2.png", width=650))
To accommodate the ambiguity here, one approach is to seek a hierarchical clustering: that is, a set of nested clusters organized in a tree.
We'll discuss hierarchical clustering next time.
For today, we'll focus on partitional clustering:
in a partitional clustering, the points are divided into a set of non-overlapping groups.
#
display(Image("images/20-partitional-clustering.png", width=600))
In a partitional clustering, each point belongs to exactly one cluster.
We are going to assume for now that the number of clusters is given in advance.
We will denote the number of clusters as $k$.
To measure how close together or spread out a collection of vectors is, let's first define the idea of a dataset centroid.
Given $m$ different vectors $\mathbf{x}_1, \dots, \mathbf{x}_m$, each with the same dimension $n$, the centroid is the average of the vectors, taken componentwise:
$$\boldsymbol{\mu} = \frac{1}{m}\sum_{i=1}^m \mathbf{x}_i.$$
In other words: the centroid is the "center of mass" of the dataset.
Next, we define the sample variance of a dataset as:
$$\sigma^2 = \frac{1}{m}\sum_{i=1}^m \Vert\mathbf{x}_i - \boldsymbol{\mu}\Vert^2.$$
In other words, the sample variance of the set of points is the average squared distance from each point to the centroid.
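Here is what these two definitions look like in numpy (a minimal sketch; the small random dataset is just for illustration).
import numpy as np
rng = np.random.default_rng(0)
X = rng.normal(size=(100, 3))    # m = 100 points, each of dimension n = 3
# centroid: the componentwise average of the vectors (the "center of mass")
centroid = X.mean(axis=0)
# sample variance: the average squared distance from each point to the centroid
sample_variance = np.mean(np.sum((X - centroid)**2, axis=1))
print(centroid, sample_variance)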
Now, we are ready to state our first formalization of the clustering problem.
We will assume that we are given $m$ data points $\mathbf{x}_1, \dots, \mathbf{x}_m \in \mathbb{R}^n$, and that the number of clusters $k$ is given in advance.
$k$-means Problem:
Find $k$ points $\mathbf{c}_1, \dots, \mathbf{c}_k$ (called centers, centroids, or means), so that the cost
$$\sum_{i=1}^m \min_j \Vert\mathbf{x}_i - \mathbf{c}_j\Vert^2$$
is minimized.
Equivalently: we can think in terms of the partition itself.
Consider the set $X = \{\mathbf{x}_1, \dots, \mathbf{x}_m\},$ where each $\mathbf{x}_i \in \mathbb{R}^n$.
Find $k$ points $\mathbf{c}_1, \dots, \mathbf{c}_k$
and partition $X$ into $k$ different subsets $C_1, \dots, C_k$ by assigning each point to its nearest cluster center,
so that the cost
$$\sum_{j=1}^k \sum_{\mathbf{x}_i \in C_j} \Vert\mathbf{x}_i - \mathbf{c}_j\Vert^2$$
is minimized.
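To make the objective concrete, here is a short numpy sketch of a function that evaluates this cost for a given set of points and candidate centers, assigning each point to its nearest center. The tiny dataset is made up for illustration.
import numpy as np
def kmeans_cost(X, centers):
    """Sum over all points of the squared distance to the nearest center."""
    # dists[i, j] = squared distance from point i to center j
    dists = ((X[:, np.newaxis, :] - centers[np.newaxis, :, :])**2).sum(axis=2)
    return dists.min(axis=1).sum()
# a tiny made-up example: 4 points in R^2 and k = 2 candidate centers
X = np.array([[0., 0.], [0., 1.], [5., 5.], [5., 6.]])
centers = np.array([[0., 0.5], [5., 5.5]])
print(kmeans_cost(X, centers))   # each point is at squared distance 0.25 from its center, so the cost is 1.0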
We now have a formal definition of a clustering.
This is not the only definition possible, but it is an intuitive and simple one.
How hard is it to solve this problem? Unfortunately, finding the exact minimum of this cost is NP-hard in general.
Nonetheless, there is a simple algorithm that works quite well in practice!
$k$-means Problem: Find $k$ points $\mathbf{c}_1, \dots, \mathbf{c}_k$ (called centers, centroids, or means), so that the sum of squared distances from every point to its closest center,
$$\sum_{i=1}^m \min_j \Vert\mathbf{x}_i - \mathbf{c}_j\Vert^2,$$
is minimized.
#
display(Image("images/08-rosa-gold.png", width=400))
#
display(Image("images/08-rosa-gold-2.png", width=400))
[Source: Wikipedia]
There is a "classic" algorithm for this problem, called the "$k$-means algorithm."
It was voted among the top-10 algorithms in data mining!
It is such a good idea that it has been independently discovered multiple times.
It was first discovered by Lloyd in 1957, so it is often called Lloyd's algorithm.
#
display(Image("images/20-top-ten-algorithms-cover.png", width=300))
The (approximate) $k$-means algorithm:
1. Pick $k$ starting cluster centers $\mathbf{c}_1, \dots, \mathbf{c}_k$ (for example, $k$ randomly chosen data points).
2. For each $j$, define the cluster $C_j$ to be the set of points that are closest to center $\mathbf{c}_j$.
(Nearer to $\mathbf{c}_j$ than to any other center.)
3. For each $j$, redefine $\mathbf{c}_j$ to be the center of mass of cluster $C_j$.
(In other words, $\mathbf{c}_j$ is the mean of the vectors in $C_j$.)
4. Repeat (i.e., go to Step 2) until convergence.
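Before looking at pictures and library code, here is a compact numpy sketch of Lloyd's algorithm following the steps above. It is a simplified illustration (random data points as the initial centers, stopping when the assignments stop changing), not the implementation we will use below.
import numpy as np
def lloyd_kmeans(X, k, n_iter=100, seed=0):
    rng = np.random.default_rng(seed)
    # Step 1: pick k starting centers (here: k distinct data points chosen at random)
    centers = X[rng.choice(len(X), size=k, replace=False)].astype(float)
    labels = None
    for _ in range(n_iter):
        # Step 2: assign each point to the cluster of its nearest center
        dists = ((X[:, None, :] - centers[None, :, :])**2).sum(axis=2)
        new_labels = dists.argmin(axis=1)
        # Step 4: stop when the assignments no longer change (convergence)
        if labels is not None and np.array_equal(new_labels, labels):
            break
        labels = new_labels
        # Step 3: redefine each center as the mean of the vectors assigned to it
        for j in range(k):
            if np.any(labels == j):
                centers[j] = X[labels == j].mean(axis=0)
    return centers, labels
For example, lloyd_kmeans(X, 3) would return three centers and a cluster label for each row of X.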
Let's walk through a picture showing how this algorithm works:
#
display(Image("images/20-kmeans-example.png", width=600))
Let's do an extended example showing $k$-means clustering in practice, in the context of the Python library scikit-learn.
scikit-learn is a useful Python library for machine learning functions.
Our goals are to learn:
Generally, when learning about or developing a new unsupervised method, it's a good idea to try it out on a dataset in which you already know the "right" answer.
One way to do that is to generate synthetic data that has some known properties.
Among other things, scikit-learn contains tools for generating synthetic data for testing.
import sklearn.datasets as sk_data
# generate 100 synthetic points in 30 dimensions, drawn from 3 Gaussian blobs
X, y = sk_data.make_blobs(n_samples=100, centers=3, n_features=30,
                          center_box=(-10.0, 10.0), random_state=0)
To get a sense of the raw data we can inspect it.
For statistical visualization, a good library is seaborn, imported as sns.
import seaborn as sns
sns.heatmap(X, xticklabels = False, yticklabels = False, linewidths = 0, cbar = False);
That plot shows all the data.
As usual, each row is a data item and the columns correspond to features (which are simply coordinates here).
Geometrically, these points live in a 30 dimensional space, so we cannot directly visualize their geometry.
So let's compute the pairwise distances for visualization purposes.
We can compute all pairwise distances in a single step using a scikit-learn function:
import sklearn.metrics as metrics
euclidean_dists = metrics.euclidean_distances(X)
euclidean_dists
array([[ 0. , 47.73797008, 45.18787978, ..., 47.87535624, 49.64694402, 45.58307694], [47.73797008, 0. , 43.66760596, ..., 7.3768511 , 7.36794305, 43.51069074], [45.18787978, 43.66760596, 0. , ..., 42.55609472, 43.80829605, 9.31642449], ..., [47.87535624, 7.3768511 , 42.55609472, ..., 0. , 8.19377462, 41.81523421], [49.64694402, 7.36794305, 43.80829605, ..., 8.19377462, 0. , 43.41205895], [45.58307694, 43.51069074, 9.31642449, ..., 41.81523421, 43.41205895, 0. ]])
And we can then visualize the data using a heatmap of the pairwise distances.
sns.heatmap(euclidean_dists, xticklabels=False, yticklabels=False, linewidths=0,
square=True);
The idea behind Multidimensional Scaling (MDS) is: given only the pairwise distances between a set of points, find coordinates for the points in a low-dimensional space whose pairwise Euclidean distances match the given distances as closely as possible.
Usually we are looking for points in 2D or maybe 3D for visualization purposes.
The algorithm works by starting with random positions, and then moving points in a way that reduces the disparity between true distance and euclidean distance.
Note that there are many ways that this can fail!
import sklearn.manifold
# find a 2-D embedding whose pairwise distances approximate the precomputed distance matrix
mds = sklearn.manifold.MDS(n_components=2, max_iter=3000, eps=1e-9, random_state=0,
                           dissimilarity="precomputed", n_jobs=1)
fit = mds.fit(euclidean_dists)
pos = fit.embedding_   # the 2-D coordinates assigned to each point
plt.scatter(pos[:, 0], pos[:, 1], s=8)
plt.axis('square');
So we can see that, although the data lives in 30 dimensions, we can get a sense of how the points are clustered by approximately placing the points into two dimensions.
The Python package scikit-learn has a huge set of tools for unsupervised learning generally, and clustering specifically.
These are in sklearn.cluster.
There are three methods common to all the clustering classes: fit(), predict(), and fit_predict().
fit() builds the model from the training data (e.g., for $k$-means, it finds the centroids),
predict() assigns labels to the data after building the model, and
fit_predict() does both in a single step.
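As a quick illustration of using the first two separately (a hypothetical sketch, independent of the clustering we run on X below), fit() can learn centroids from one set of points and predict() can then label new points using those centroids.
import numpy as np
from sklearn.cluster import KMeans
rng = np.random.default_rng(0)
X_train = rng.normal(size=(50, 2))     # made-up training data
X_new = rng.normal(size=(5, 2))        # made-up new points to label
km = KMeans(n_clusters=3, n_init=10, random_state=0)
km.fit(X_train)                        # learn the centroids from the training data
print(km.predict(X_new))               # assign each new point to its nearest centroid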
from sklearn.cluster import KMeans
# use k-means++ initialization and keep the best of 100 runs (n_init = 100)
kmeans = KMeans(init = 'k-means++', n_clusters = 3, n_init = 100)
kmeans.fit_predict(X)
array([1, 2, 0, 0, 2, 1, 2, 2, 1, 2, 0, 1, 0, 1, 2, 0, 0, 1, 0, 2, 0, 2, 0, 1, 2, 2, 1, 0, 0, 0, 0, 1, 0, 1, 0, 2, 0, 2, 0, 2, 2, 2, 1, 0, 1, 0, 1, 2, 1, 0, 0, 1, 1, 1, 1, 0, 1, 2, 2, 0, 1, 0, 2, 1, 1, 2, 0, 1, 1, 2, 2, 2, 1, 1, 0, 1, 2, 1, 2, 1, 2, 2, 2, 1, 0, 0, 2, 1, 1, 0, 1, 2, 2, 0, 1, 0, 0, 2, 2, 0], dtype=int32)
All the tools in scikit-learn are implemented as Python objects. Thus, the general sequence for using a tool from scikit-learn is:
1. create the object, setting any parameters (e.g., for KMeans, the number of clusters),
2. run the method on the data, generally by calling the fit() function, and
3. examine the results, which are stored as attributes of the fitted object.
centroids = kmeans.cluster_centers_
labels = kmeans.labels_
error = kmeans.inertia_
print(f'The total error of the clustering is: {error:0.1f}.')
print('\nCluster labels:')
print(labels)
print('\nCluster centroids:')
print(centroids)
The total error of the clustering is: 2733.8. Cluster labels: [1 2 0 0 2 1 2 2 1 2 0 1 0 1 2 0 0 1 0 2 0 2 0 1 2 2 1 0 0 0 0 1 0 1 0 2 0 2 0 2 2 2 1 0 1 0 1 2 1 0 0 1 1 1 1 0 1 2 2 0 1 0 2 1 1 2 0 1 1 2 2 2 1 1 0 1 2 1 2 1 2 2 2 1 0 0 2 1 1 0 1 2 2 0 1 0 0 2 2 0] Cluster centroids: [[-4.7833887 5.32946939 -0.87141823 1.38900567 -9.59956915 2.35207348 2.22988468 2.03394692 8.9797878 3.67857655 -2.67618716 -1.17595897 3.76433199 -8.46317271 3.28114395 3.73803392 -5.73436869 -7.0844462 -3.75643598 -3.07904369 1.36974653 -0.95918462 9.91135428 -8.17722281 -5.8656831 -6.76869078 3.12196673 -4.85745245 -0.70449349 -4.94582258] [ 0.88697885 4.29142902 1.93200132 1.10877989 -1.55994342 2.80616392 -1.11495818 7.74595341 8.92512875 -2.29656298 6.09588722 0.47062896 1.36408008 8.63168509 -8.54512921 -8.59161818 -9.64308952 6.92270491 5.65321496 7.29061444 9.58822315 5.79602014 -0.84970449 5.46127493 -7.77730238 2.75092191 -7.17026663 9.07475984 0.04245798 -1.98719465] [-7.0489904 -7.92501873 2.89710462 -7.17088692 -6.01151677 -2.66405834 6.43970052 -8.20341647 6.54146052 -7.92978843 9.56983319 -0.86327902 9.25897119 1.73061823 4.84528928 -9.26418246 -4.54021612 -7.47784575 -4.15060719 -7.85665458 -3.76688414 -1.6692291 -8.78048843 3.78904162 1.24247168 -4.73618733 0.27327032 -7.93180624 1.59974866 8.78601576]]
Let's visualize the results. We'll do that by reordering the data items according to their cluster.
idx = np.argsort(labels)
rX = X[idx, :]
sns.heatmap(rX, xticklabels = False, yticklabels = False, linewidths = 0);
rearranged_dists = euclidean_dists[idx,:][:,idx]
sns.heatmap(rearranged_dists, xticklabels = False, yticklabels = False, linewidths = 0, square = True);