#
import numpy as np
import scipy as sp
import pandas as pd
import matplotlib as mp
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
import laUtilities as ut
import slideUtilities as sl
import demoUtilities as dm
from matplotlib import animation
from importlib import reload
from datetime import datetime
from IPython.display import Image, display_html, display, Math, HTML;
qr_setting = None
mp.rcParams['animation.html'] = 'jshtml';
HW 1 is due today at 8pm.
HW 2 is out.
Upcoming office hours:
Read Boyd-Vandenberghe, Sections 3.1-3.2 and Chapter 4
The inner product $\mathbf{u}^T\mathbf{v}$ is the sum of the componentwise products of $\mathbf{u}$ and $\mathbf{v}$.
If $\mathbf{u} = \begin{bmatrix}u_1\\u_2\\\vdots\\u_n\end{bmatrix}$ and $\mathbf{v} = \begin{bmatrix}v_1\\v_2\\\vdots\\v_n\end{bmatrix},$ then the inner product of $\mathbf{u}$ and $\mathbf{v}$ is:
$$\mathbf{u}^T\mathbf{v} = u_1v_1 + u_2v_2 + \cdots + u_nv_n.$$
Definition. The norm of $\mathbf{v}$ is the nonnegative scalar $\Vert\mathbf{v}\Vert$ defined by
$$\Vert\mathbf{v}\Vert = \sqrt{\mathbf{v}^T\mathbf{v}} = \sqrt{v_1^2 + v_2^2 + \cdots + v_n^2}.$$
We can use the inner product to find the distance between two vectors:
$$\mathrm{dist}(\mathbf{u},\mathbf{v}) = \Vert\mathbf{u}-\mathbf{v}\Vert.$$
And because the Pythagorean Theorem says that $\mathbf{u}$ and $\mathbf{v}$ are perpendicular if and only if:
$$\Vert\mathbf{u}-\mathbf{v}\Vert^2 = \Vert\mathbf{u}\Vert^2 + \Vert\mathbf{v}\Vert^2,$$
we can say that two vectors $\mathbf{u}$ and $\mathbf{v}$ in $\mathbb{R}^n$ are orthogonal to each other if
$$\mathbf{u}^T\mathbf{v} = 0.$$
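To make these definitions concrete, here is a small numpy check of the inner product, norm, distance, and orthogonality test above (a minimal sketch; the vectors are chosen purely for illustration).
import numpy as np
u = np.array([2., 1., -2.])
v = np.array([1., 2., 2.])
# inner product: the sum of the componentwise products
inner = u @ v                      # same as np.dot(u, v)
# norm: the square root of the inner product of a vector with itself
norm_u = np.sqrt(u @ u)            # same as np.linalg.norm(u)
# distance between u and v: the norm of their difference
dist = np.linalg.norm(u - v)
# orthogonality test: u and v are orthogonal iff their inner product is zero
print(inner, norm_u, dist, np.isclose(inner, 0.0))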
There is an important connection between the inner product of two vectors and the angle between them. This connection is very useful (e.g., in visualizing data mining operations).
We start from the law of cosines:
$$c^2 = a^2 + b^2 - 2ab\cos\theta,$$
where $a$, $b$, and $c$ are the side lengths of a triangle and $\theta$ is the angle between the sides of length $a$ and $b$.
#
ax = ut.plotSetup(-6,6,-1,4,(12,6))
u = np.array([4.,1])
v = np.array([-4.,3])
pt = u + v
plt.plot([u[0],v[0]],[u[1],v[1]],'b-',lw=2)
plt.plot([0,u[0]],[0,u[1]],'b-',lw=2)
plt.plot([0,v[0]],[0,v[1]],'b-',lw=2)
m = (u-v)/2.0
mm = v + m
ax.text(mm[0]+0.25,mm[1]+0.25,r'${\bf c}$',size=16)
ax.text(2,0.15,r'${\bf a}$',size=16)
ax.text(-3,1,r'${\bf b}$',size=16)
ax.annotate('', xy=(0.2*v[0], 0.2*v[1]), xycoords='data',
xytext=(0.3*u[0], 0.3*u[1]), textcoords='data',
size=15,
#bbox=dict(boxstyle="round", fc="0.8"),
arrowprops={'arrowstyle': 'simple',
'fc': '0',
'ec': 'none',
'connectionstyle' : 'arc3,rad=0.5'},
)
ax.text(0.9,0.8,r'$\theta$',size=20)
plt.axis('off');
Now let's interpret this law in terms of vectors $\mathbf{u}$ and $\mathbf{v}$.
Once again, it is the angle that these vectors make at the origin that we are concerned with.
#
ax = ut.plotSetup(-6,6,-1,4,(12,6))
ut.centerAxes(ax)
u = np.array([4.,1])
v = np.array([-4.,3])
pt = u + v
plt.plot([u[0],v[0]],[u[1],v[1]],'b-',lw=2)
plt.plot([0,u[0]],[0,u[1]],'b-',lw=2)
plt.plot([0,v[0]],[0,v[1]],'b-',lw=2)
ut.plotVec(ax,u)
ut.plotVec(ax,v)
m = (u-v)/2.0
mm = v + m
ax.text(mm[0]+0.25,mm[1]+0.25,r'$\Vert{\bf u-v}\Vert$',size=16)
ax.text(2,0.15,r'$\Vert{\bf u}\Vert$',size=16)
ax.text(-3,1,r'$\Vert{\bf v}\Vert$',size=16)
ax.annotate('', xy=(0.2*v[0], 0.2*v[1]), xycoords='data',
xytext=(0.3*u[0], 0.3*u[1]), textcoords='data',
size=15,
#bbox=dict(boxstyle="round", fc="0.8"),
arrowprops={'arrowstyle': 'simple',
'fc': '0',
'ec': 'none',
'connectionstyle' : 'arc3,rad=0.5'},
)
ax.text(4.1, 0.8, r'$\bf u$', size = 14)
ax.text(-4.3, 2.9, r'$\bf v$', size = 14)
ax.text(0.9,0.8,r'$\theta$',size=20);
Applying the law of cosines we get:
$$\Vert\mathbf{u}-\mathbf{v}\Vert^2 = \Vert\mathbf{u}\Vert^2 + \Vert\mathbf{v}\Vert^2 - 2\,\Vert\mathbf{u}\Vert\,\Vert\mathbf{v}\Vert\cos\theta.$$
Now, previously we calculated that:
$$\Vert\mathbf{u}-\mathbf{v}\Vert^2 = \Vert\mathbf{u}\Vert^2 + \Vert\mathbf{v}\Vert^2 - 2\,\mathbf{u}^T\mathbf{v}.$$
Which means that
$$\mathbf{u}^T\mathbf{v} = \Vert\mathbf{u}\Vert\,\Vert\mathbf{v}\Vert\cos\theta.$$
So
$$\cos\theta = \frac{\mathbf{u}^T\mathbf{v}}{\Vert\mathbf{u}\Vert\,\Vert\mathbf{v}\Vert}.$$
This is a very important connection between the notion of inner product and trigonometry.
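As a quick numerical check (a sketch reusing the $\mathbf{u} = (4,1)$ and $\mathbf{v} = (-4,3)$ from the figure above, which are otherwise incidental), the formula gives the same angle as elementary 2-D geometry:
import numpy as np
u = np.array([4., 1.])
v = np.array([-4., 3.])
# angle from the inner-product formula: cos(theta) = u.v / (||u|| ||v||)
theta_formula = np.degrees(np.arccos(u @ v / (np.linalg.norm(u) * np.linalg.norm(v))))
# the same angle as the difference of the two directions (works in 2-D only)
theta_geometry = np.degrees(np.arctan2(v[1], v[0]) - np.arctan2(u[1], u[0]))
print(theta_formula, theta_geometry)   # both are about 129.1 degrees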
As a quick check, note that if $\mathbf{u}$ and $\mathbf{v}$ are nonzero, and $\mathbf{u}^T\mathbf{v} = 0$, then
$$\cos\theta = \frac{\mathbf{u}^T\mathbf{v}}{\Vert\mathbf{u}\Vert\,\Vert\mathbf{v}\Vert} = 0.$$
In other words, the angle between $\mathbf{u}$ and $\mathbf{v}$ is 90 degrees (or 270 degrees). So this agrees with our definition of orthogonality.
One implication in particular concerns unit vectors.
So
$$\cos\theta = \frac{\mathbf{u}^T\mathbf{v}}{\Vert\mathbf{u}\Vert\,\Vert\mathbf{v}\Vert} = \left(\frac{\mathbf{u}}{\Vert\mathbf{u}\Vert}\right)^T\left(\frac{\mathbf{v}}{\Vert\mathbf{v}\Vert}\right).$$
Note that $\frac{\mathbf{u}}{\Vert\mathbf{u}\Vert}$ and $\frac{\mathbf{v}}{\Vert\mathbf{v}\Vert}$ are unit vectors.
So we have the very simple rule that, for two unit vectors, their inner product is the cosine of the angle between them!
Example. Consider $\mathbf{u} = \begin{bmatrix}0.95\\0.31\end{bmatrix}$ and $\mathbf{v} = \begin{bmatrix}0.20\\0.98\end{bmatrix},$ which are both (approximately) unit vectors.
So
$$\mathbf{u}^T\mathbf{v} = (0.95)(0.20) + (0.31)(0.98) \approx 0.5.$$
So
$$\cos\theta \approx 0.5.$$
So $\theta \approx 60$ degrees.
#
u = np.array([0.95, np.sin(np.arccos(0.95))])
theta = (np.pi/3)+np.arccos(0.95)
v = np.array([np.cos(theta), np.sin(theta)])
ax = ut.plotSetup(-1.3,1.3,-1.3,1.3,(12,6))
ut.centerAxes(ax)
plt.axis('equal')
angles = 2.0*np.pi * np.array(range(101))/100.0
plt.plot(np.cos(angles),np.sin(angles),'r-')
ax.arrow(0,0,u[0],u[1],head_width=0.07, head_length=0.1,length_includes_head = True)
ax.arrow(0,0,v[0],v[1],head_width=0.07, head_length=0.1,length_includes_head = True)
ax.annotate('', xy=(0.2*v[0], 0.2*v[1]), xycoords='data',
xytext=(0.3*u[0], 0.3*u[1]), textcoords='data',
size=15,
#bbox=dict(boxstyle="round", fc="0.8"),
arrowprops={'arrowstyle': 'simple',
'fc': '0',
'ec': 'none',
'connectionstyle' : 'arc3,rad=0.5'})
mid = 0.4*(u+v)/2.0
ax.text(mid[0],mid[1],r'$\theta$',size=20)
ax.text(u[0]+.1,u[1]+.1,r'$(%0.2f,%0.2f)$'% (u[0],u[1]),size=16)
ax.text(v[0]+.1,v[1]+.1,r'$(%0.2f,%0.2f)$'% (v[0],v[1]),size=16);
Example. Find the angle formed by the vectors:
$$\mathbf{u} = \begin{bmatrix}1\\3\\-7\\-2\end{bmatrix}, \qquad \mathbf{v} = \begin{bmatrix}8\\-2\\4\\6\end{bmatrix}.$$
Solution. First normalize the vectors:
$$\Vert\mathbf{u}\Vert = \sqrt{1+9+49+4} = \sqrt{63}, \qquad \Vert\mathbf{v}\Vert = \sqrt{64+4+16+36} = \sqrt{120}.$$
So
$$\frac{\mathbf{u}}{\Vert\mathbf{u}\Vert} = \frac{1}{\sqrt{63}}\begin{bmatrix}1\\3\\-7\\-2\end{bmatrix}, \qquad \frac{\mathbf{v}}{\Vert\mathbf{v}\Vert} = \frac{1}{\sqrt{120}}\begin{bmatrix}8\\-2\\4\\6\end{bmatrix}.$$
Then calculate the cosine of the angle between them:
$$\cos\theta = \left(\frac{\mathbf{u}}{\Vert\mathbf{u}\Vert}\right)^T\left(\frac{\mathbf{v}}{\Vert\mathbf{v}\Vert}\right) = \frac{(1)(8)+(3)(-2)+(-7)(4)+(-2)(6)}{\sqrt{63}\,\sqrt{120}} = \frac{-38}{\sqrt{7560}} \approx -0.437.$$
Then:
$$\theta = \cos^{-1}(-0.437) \approx 115.9°.$$
We can also compute this angle using Python.
u = np.array([1.,3,-7,-2])
v = np.array([8.,-2,4,6])
# normalize to unit vectors
uNormalized = u/np.sqrt(u.T.dot(u)) # computing the norm manually, as long as the vector is nonzero
vNormalized = v/np.linalg.norm(v) # an easier way
print("Normalized u =", uNormalized)
print("Normalized v =", vNormalized)
Normalized u = [ 0.12598816 0.37796447 -0.8819171 -0.25197632] Normalized v = [ 0.73029674 -0.18257419 0.36514837 0.54772256]
# compute inner product
innerProduct = (vNormalized).dot(uNormalized)
print("Inner product =", innerProduct)
Inner product = -0.4370415209168243
# find angle, convert from radians to degrees
print("Angle theta =", np.arccos(innerProduct)*180/np.pi)
Angle theta = 115.91527033906208
Let's consider a data science application of this technique. Suppose we are given two documents $d_1$ and $d_2$.
For a goal such as information retrieval, we'd like to know whether the two documents are "similar".
One common way to formalize similarity is to say that documents are similar if they contain similar words. Here, we will view each document as a "bag of words," and ignore where each word occurs in the document.
We can measure this as follows: construct a vector $\mathbf{x}_1$ that counts the frequency of occurrence of certain words in $d_1$.
Do the same thing for $d_2$, yielding $\mathbf{x}_2$.
What we have done is taken individual documents and represented them as vectors $\mathbf{x}_1$ and $\mathbf{x}_2$ in a very high dimensional space, $\mathbb{R}^n$.
Here $n$ could be 10,000 or more.
Now, to ask whether the two documents are similar, we simply ask "do their vectors point in the same general direction?"
And, despite the fact that we are in a very high dimensional space which we cannot visualize, we can nonetheless answer the question easily.
The angle between the two document-vectors is simply:
$$\theta = \cos^{-1}\left(\frac{\mathbf{x}_1^T\mathbf{x}_2}{\Vert\mathbf{x}_1\Vert\,\Vert\mathbf{x}_2\Vert}\right).$$
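Here is a minimal sketch of this idea in numpy; the tiny vocabulary and "documents" are made up purely for illustration. The cosine of the angle between the bag-of-words vectors is a direct measure of how similar the documents are.
import numpy as np
# a tiny made-up vocabulary and two tiny made-up documents
vocabulary = ['data', 'science', 'linear', 'algebra', 'pizza']
d1 = 'data science uses linear algebra and more linear algebra'.split()
d2 = 'linear algebra is the science of vectors'.split()
# bag-of-words vectors: count how often each vocabulary word occurs in each document
x1 = np.array([d1.count(w) for w in vocabulary], dtype=float)
x2 = np.array([d2.count(w) for w in vocabulary], dtype=float)
# angle between the two document-vectors
cos_theta = x1 @ x2 / (np.linalg.norm(x1) * np.linalg.norm(x2))
print('cosine similarity:', cos_theta)
print('angle in degrees:', np.degrees(np.arccos(cos_theta)))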
But what if rather than having two documents, we have a million documents? Then it's less clear what to make of the similarity or dissimilarity of any pair of documents. We want a more systematic way to understand the patterns in the overall dataset.
When we do clustering, what problem are we trying to solve? We will answer this question informally at first. (But soon we will look at formal criteria!)
Informally, a clustering is:
a grouping of data objects, such that the objects within a group are similar (or near) to one another and dissimilar (or far) from the objects in other groups.
#
display(Image("images/20-clustering-1.png", width=350))
So we want our clustering algorithm to place similar (nearby) objects in the same group, and dissimilar (far apart) objects in different groups.
Here are the basic questions we need to ask about clustering:
Now note that even with our more-formal discussion, the criteria for deciding on a "best" clustering can still be ambiguous.
#
display(Image("images/20-clustering-3.png", width=650))
#
display(Image("images/20-clustering-2.png", width=650))
To accommodate the ambiguity here, one approach is to seek a hierarchical clustering: that is, a set of nested clusters organized in a tree.
We'll discuss hierarchical clustering next time.
For today, we'll focus on partitional clustering:
in a partitional clustering, the points are divided into a set of non-overlapping groups.
#
display(Image("images/20-partitional-clustering.png", width=600))
In a partitional clustering, each point belongs to exactly one cluster.
We are going to assume for now that the number of clusters is given in advance.
We will denote the number of clusters as $k$.
To measure how close together or spread out a collection of vectors is, let's first define the idea of a dataset centroid.
Given $m$ different vectors $\mathbf{x}_1, \dots, \mathbf{x}_m$, each with the same dimension $n$, the centroid is the average of the vectors, taken componentwise:
$$\boldsymbol{\mu} = \frac{1}{m}\sum_{i=1}^m \mathbf{x}_i.$$
In other words: the centroid is the "center of mass" of the dataset.
Next, we define the sample variance of a dataset as:
$$\sigma^2 = \frac{1}{m}\sum_{i=1}^m \Vert\mathbf{x}_i - \boldsymbol{\mu}\Vert^2.$$
In other words, the sample variance of the set of points is the average squared distance from each point to the centroid.
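Here is what these two definitions look like in numpy (a minimal sketch; the small random dataset is just for illustration).
import numpy as np
rng = np.random.default_rng(0)
X = rng.normal(size=(100, 3))    # m = 100 points, each of dimension n = 3
# centroid: the componentwise average of the vectors (the "center of mass")
centroid = X.mean(axis=0)
# sample variance: the average squared distance from each point to the centroid
sample_variance = np.mean(np.sum((X - centroid)**2, axis=1))
print(centroid, sample_variance)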
Now, we are ready to state our first formalization of the clustering problem.
We will assume that we are given $m$ data points $\mathbf{x}_1, \dots, \mathbf{x}_m \in \mathbb{R}^n$, and that the number of clusters $k$ is given in advance.
$k$-means Problem:
Find $k$ points $\mathbf{c}_1, \dots, \mathbf{c}_k$ (called centers, centroids, or means), so that the cost
$$\sum_{i=1}^m \min_j \Vert\mathbf{x}_i - \mathbf{c}_j\Vert^2$$
is minimized.
Equivalently: we can think in terms of the partition itself.
Consider the set $X = \{\mathbf{x}_1, \dots, \mathbf{x}_m\},$ where each $\mathbf{x}_i \in \mathbb{R}^n$.
Find $k$ points $\mathbf{c}_1, \dots, \mathbf{c}_k$
and partition $X$ into $k$ different subsets $C_1, \dots, C_k$ by assigning each point to its nearest cluster center,
so that the cost
$$\sum_{j=1}^k \sum_{\mathbf{x}_i \in C_j} \Vert\mathbf{x}_i - \mathbf{c}_j\Vert^2$$
is minimized.
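To make the objective concrete, here is a short numpy sketch of a function that evaluates this cost for a given set of points and candidate centers, assigning each point to its nearest center. The tiny dataset is made up for illustration.
import numpy as np
def kmeans_cost(X, centers):
    """Sum over all points of the squared distance to the nearest center."""
    # dists[i, j] = squared distance from point i to center j
    dists = ((X[:, np.newaxis, :] - centers[np.newaxis, :, :])**2).sum(axis=2)
    return dists.min(axis=1).sum()
# a tiny made-up example: 4 points in R^2 and k = 2 candidate centers
X = np.array([[0., 0.], [0., 1.], [5., 5.], [5., 6.]])
centers = np.array([[0., 0.5], [5., 5.5]])
print(kmeans_cost(X, centers))   # each point is at squared distance 0.25 from its center, so the cost is 1.0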
We now have a formal definition of a clustering.
This is not the only definition possible, but it is an intuitive and simple one.
How hard is it to solve this problem? Unfortunately, finding the exact minimum of this cost is NP-hard in general.
Nonetheless, there is a simple algorithm that works quite well in practice!
$k$-means Problem: Find $k$ points $\mathbf{c}_1, \dots, \mathbf{c}_k$ (called centers, centroids, or means), so that the sum of squared distances from every point to its closest center,
$$\sum_{i=1}^m \min_j \Vert\mathbf{x}_i - \mathbf{c}_j\Vert^2,$$
is minimized.
#
display(Image("images/08-rosa-gold.png", width=400))
#
display(Image("images/08-rosa-gold-2.png", width=400))
[Source: Wikipedia]
There is a "classic" algorithm for this problem, called the "$k$-means algorithm."
It was voted among the top-10 algorithms in data mining!
It is such a good idea that it has been independently discovered multiple times.
It was first discovered by Lloyd in 1957, so it is often called Lloyd's algorithm.
#
display(Image("images/20-top-ten-algorithms-cover.png", width=300))
The (approximate) $k$-means algorithm:
1. Pick $k$ starting cluster centers $\mathbf{c}_1, \dots, \mathbf{c}_k$ (for example, $k$ randomly chosen data points).
2. For each $j$, define the cluster $C_j$ to be the set of points that are closest to center $\mathbf{c}_j$.
(Nearer to $\mathbf{c}_j$ than to any other center.)
3. For each $j$, redefine $\mathbf{c}_j$ to be the center of mass of cluster $C_j$.
(In other words, $\mathbf{c}_j$ is the mean of the vectors in $C_j$.)
4. Repeat (i.e., go to Step 2) until convergence.
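Before looking at pictures and library code, here is a compact numpy sketch of Lloyd's algorithm following the steps above. It is a simplified illustration (random data points as the initial centers, stopping when the assignments stop changing), not the implementation we will use below.
import numpy as np
def lloyd_kmeans(X, k, n_iter=100, seed=0):
    rng = np.random.default_rng(seed)
    # Step 1: pick k starting centers (here: k distinct data points chosen at random)
    centers = X[rng.choice(len(X), size=k, replace=False)].astype(float)
    labels = None
    for _ in range(n_iter):
        # Step 2: assign each point to the cluster of its nearest center
        dists = ((X[:, None, :] - centers[None, :, :])**2).sum(axis=2)
        new_labels = dists.argmin(axis=1)
        # Step 4: stop when the assignments no longer change (convergence)
        if labels is not None and np.array_equal(new_labels, labels):
            break
        labels = new_labels
        # Step 3: redefine each center as the mean of the vectors assigned to it
        for j in range(k):
            if np.any(labels == j):
                centers[j] = X[labels == j].mean(axis=0)
    return centers, labels
For example, lloyd_kmeans(X, 3) would return three centers and a cluster label for each row of X.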
Let's walk through a picture showing how this algorithm works:
#
display(Image("images/20-kmeans-example.png", width=600))
Let's do an extended example showing $k$-means clustering in practice, in the context of the Python library scikit-learn.
scikit-learn is a useful Python library for machine learning functions.
Our goals are to learn:
Generally, when learning about or developing a new unsupervised method, it's a good idea to try it out on a dataset in which you already know the "right" answer.
One way to do that is to generate synthetic data that has some known properties.
Among other things, scikit-learn contains tools for generating synthetic data for testing.
import sklearn.datasets as sk_data
# generate 100 synthetic points in 30 dimensions, drawn from 3 Gaussian blobs
X, y = sk_data.make_blobs(n_samples=100, centers=3, n_features=30,
                          center_box=(-10.0, 10.0), random_state=0)
To get a sense of the raw data we can inspect it.
For statistical visualization, a good library is seaborn, imported as sns.
import seaborn as sns
sns.heatmap(X, xticklabels = False, yticklabels = False, linewidths = 0, cbar = False);
That plot shows all the data.
As usual, each row is a data item and the columns correspond to features (which are simply coordinates here).
Geometrically, these points live in a 30 dimensional space, so we cannot directly visualize their geometry.
So let's compute the pairwise distances for visualization purposes.
We can compute all pairwise distances in a single step using a scikit-learn function:
import sklearn.metrics as metrics
euclidean_dists = metrics.euclidean_distances(X)
euclidean_dists
array([[ 0. , 47.73797008, 45.18787978, ..., 47.87535624, 49.64694402, 45.58307694], [47.73797008, 0. , 43.66760596, ..., 7.3768511 , 7.36794305, 43.51069074], [45.18787978, 43.66760596, 0. , ..., 42.55609472, 43.80829605, 9.31642449], ..., [47.87535624, 7.3768511 , 42.55609472, ..., 0. , 8.19377462, 41.81523421], [49.64694402, 7.36794305, 43.80829605, ..., 8.19377462, 0. , 43.41205895], [45.58307694, 43.51069074, 9.31642449, ..., 41.81523421, 43.41205895, 0. ]])
And we can then visualize the data using a heatmap of the pairwise distances.
sns.heatmap(euclidean_dists, xticklabels=False, yticklabels=False, linewidths=0,
square=True);
The idea behind Multidimensional Scaling (MDS) is: given only the pairwise distances between a set of points, find coordinates for the points in a low-dimensional space whose pairwise Euclidean distances match the given distances as closely as possible.
Usually we are looking for points in 2D or maybe 3D for visualization purposes.
The algorithm works by starting with random positions, and then moving points in a way that reduces the disparity between true distance and euclidean distance.
Note that there are many ways that this can fail!
import sklearn.manifold
# find a 2-D embedding whose pairwise distances approximate the precomputed distance matrix
mds = sklearn.manifold.MDS(n_components=2, max_iter=3000, eps=1e-9, random_state=0,
                           dissimilarity="precomputed", n_jobs=1)
fit = mds.fit(euclidean_dists)
pos = fit.embedding_   # the 2-D coordinates assigned to each point
plt.scatter(pos[:, 0], pos[:, 1], s=8)
plt.axis('square');
So we can see that, although the data lives in 30 dimensions, we can get a sense of how the points are clustered by approximately placing the points into two dimensions.
The Python package scikit-learn has a huge set of tools for unsupervised learning generally, and clustering specifically.
These are in sklearn.cluster.
There are three methods common to all the clustering classes: fit(), predict(), and fit_predict().
fit() builds the model from the training data (e.g., for $k$-means, it finds the centroids),
predict() assigns labels to the data after building the model, and
fit_predict() does both in a single step.
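As a quick illustration of using the first two separately (a hypothetical sketch, independent of the clustering we run on X below), fit() can learn centroids from one set of points and predict() can then label new points using those centroids.
import numpy as np
from sklearn.cluster import KMeans
rng = np.random.default_rng(0)
X_train = rng.normal(size=(50, 2))     # made-up training data
X_new = rng.normal(size=(5, 2))        # made-up new points to label
km = KMeans(n_clusters=3, n_init=10, random_state=0)
km.fit(X_train)                        # learn the centroids from the training data
print(km.predict(X_new))               # assign each new point to its nearest centroid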
from sklearn.cluster import KMeans
# use k-means++ initialization and keep the best of 100 runs (n_init = 100)
kmeans = KMeans(init = 'k-means++', n_clusters = 3, n_init = 100)
kmeans.fit_predict(X)
array([1, 2, 0, 0, 2, 1, 2, 2, 1, 2, 0, 1, 0, 1, 2, 0, 0, 1, 0, 2, 0, 2, 0, 1, 2, 2, 1, 0, 0, 0, 0, 1, 0, 1, 0, 2, 0, 2, 0, 2, 2, 2, 1, 0, 1, 0, 1, 2, 1, 0, 0, 1, 1, 1, 1, 0, 1, 2, 2, 0, 1, 0, 2, 1, 1, 2, 0, 1, 1, 2, 2, 2, 1, 1, 0, 1, 2, 1, 2, 1, 2, 2, 2, 1, 0, 0, 2, 1, 1, 0, 1, 2, 2, 0, 1, 0, 0, 2, 2, 0], dtype=int32)
All the tools in scikit-learn are implemented as Python objects. Thus, the general sequence for using a tool from scikit-learn is:
1. create the object, setting any parameters (e.g., for KMeans, the number of clusters),
2. run the method on the data, generally by calling the fit() function, and
3. examine the results, which are stored as attributes of the fitted object.
centroids = kmeans.cluster_centers_
labels = kmeans.labels_
error = kmeans.inertia_
print(f'The total error of the clustering is: {error:0.1f}.')
print('\nCluster labels:')
print(labels)
print('\nCluster centroids:')
print(centroids)
The total error of the clustering is: 2733.8. Cluster labels: [1 2 0 0 2 1 2 2 1 2 0 1 0 1 2 0 0 1 0 2 0 2 0 1 2 2 1 0 0 0 0 1 0 1 0 2 0 2 0 2 2 2 1 0 1 0 1 2 1 0 0 1 1 1 1 0 1 2 2 0 1 0 2 1 1 2 0 1 1 2 2 2 1 1 0 1 2 1 2 1 2 2 2 1 0 0 2 1 1 0 1 2 2 0 1 0 0 2 2 0] Cluster centroids: [[-4.7833887 5.32946939 -0.87141823 1.38900567 -9.59956915 2.35207348 2.22988468 2.03394692 8.9797878 3.67857655 -2.67618716 -1.17595897 3.76433199 -8.46317271 3.28114395 3.73803392 -5.73436869 -7.0844462 -3.75643598 -3.07904369 1.36974653 -0.95918462 9.91135428 -8.17722281 -5.8656831 -6.76869078 3.12196673 -4.85745245 -0.70449349 -4.94582258] [ 0.88697885 4.29142902 1.93200132 1.10877989 -1.55994342 2.80616392 -1.11495818 7.74595341 8.92512875 -2.29656298 6.09588722 0.47062896 1.36408008 8.63168509 -8.54512921 -8.59161818 -9.64308952 6.92270491 5.65321496 7.29061444 9.58822315 5.79602014 -0.84970449 5.46127493 -7.77730238 2.75092191 -7.17026663 9.07475984 0.04245798 -1.98719465] [-7.0489904 -7.92501873 2.89710462 -7.17088692 -6.01151677 -2.66405834 6.43970052 -8.20341647 6.54146052 -7.92978843 9.56983319 -0.86327902 9.25897119 1.73061823 4.84528928 -9.26418246 -4.54021612 -7.47784575 -4.15060719 -7.85665458 -3.76688414 -1.6692291 -8.78048843 3.78904162 1.24247168 -4.73618733 0.27327032 -7.93180624 1.59974866 8.78601576]]
Let's visualize the results. We'll do that by reordering the data items according to their cluster.
idx = np.argsort(labels)
rX = X[idx, :]
sns.heatmap(rX, xticklabels = False, yticklabels = False, linewidths = 0);
rearranged_dists = euclidean_dists[idx,:][:,idx]
sns.heatmap(rearranged_dists, xticklabels = False, yticklabels = False, linewidths = 0, square = True);