#
import numpy as np
import scipy as sp
import pandas as pd
import matplotlib as mp
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
import laUtilities as ut
import slideUtilities as sl
import demoUtilities as dm
from matplotlib import animation
from importlib import reload
from datetime import datetime
from IPython.display import Image, display_html, display, Math, HTML;
qr_setting = None
mp.rcParams['animation.html'] = 'jshtml';
#
def centerAxes(ax):
    # draw the x- and y-axes through the origin
    ax.spines['left'].set_position('zero')
    ax.spines['right'].set_color('none')
    ax.spines['bottom'].set_position('zero')
    ax.spines['top'].set_color('none')
    ax.xaxis.set_ticks_position('bottom')
    ax.yaxis.set_ticks_position('left')
    # plot the corner points (invisibly) so that the current axis limits are preserved
    bounds = np.array([ax.axes.get_xlim(), ax.axes.get_ylim()])
    ax.plot(bounds[0][0], bounds[1][0], '')
    ax.plot(bounds[0][1], bounds[1][1], '')
Every matrix has a singular value decomposition. We can take the SVD of a rank-$r$ matrix $A$, $A = U\Sigma V^T$, and create a rank-$k$ approximation $A^{(k)}$ by modifying the SVD so that only the $k$ largest singular values are retained:

$$A^{(k)} = U\Sigma^{(k)}V^T,$$

where $\Sigma^{(k)}$ is $\Sigma$ with all but its $k$ largest diagonal entries set to zero.
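To make this concrete, here is a minimal sketch (using a small made-up matrix) of forming a rank-$k$ approximation in numpy by zeroing out all but the $k$ largest singular values:

import numpy as np

rng = np.random.default_rng(0)
A = rng.standard_normal((8, 5))              # a small example matrix
k = 2

U, S, Vt = np.linalg.svd(A, full_matrices=False)
Sk = S.copy()
Sk[k:] = 0.                                  # keep only the k largest singular values
A_k = U @ np.diag(Sk) @ Vt                   # the rank-k approximation of A

print(np.linalg.matrix_rank(A_k))            # 2
# the approximation error equals the norm of the discarded singular values
print(np.isclose(np.linalg.norm(A - A_k), np.linalg.norm(S[k:])))   # True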
#
with open('data/net-traffic/AbileneFlows/odnames','r') as f:
    odnames = [line.strip() for line in f]
dates = pd.date_range('9/1/2003', freq='10min', periods=1008)
Atraf = pd.read_table('data/net-traffic/AbileneFlows/X', sep=' ', header=None, names=odnames, engine='python')
Atraf.index = dates
plt.figure(figsize=(10,8))
for i in range(1,13):
    ax = plt.subplot(4,3,i)
    Atraf.iloc[:,i-1].plot()
    plt.title(odnames[i-1])
plt.subplots_adjust(hspace=1)
plt.suptitle('Twelve Example Traffic Traces', size=20);
#
display(Image("images/19-netflix.png", width=500))
Source: Koren et al, IEEE Computer, 2009
[This lecture is based on Prof. Crovella's CS 132 and CS 506 lecture notes, and Lior Pachter's blog.]
Consider the case where we are working with the reduced SVD of $A$:

$$A = U\Sigma V^T,$$

where $U$ and $V$ each have $r = \operatorname{rank}(A)$ columns.
In the reduced SVD, $\Sigma$ is invertible (it is a diagonal matrix with all positive entries on the diagonal).
Using this decomposition we can define an important matrix corresponding to $A$:

$$A^+ = V\Sigma^{-1}U^T.$$

This matrix $A^+$ is called the pseudoinverse of $A$.
(Sometimes called the Moore-Penrose pseudoinverse.)
Obviously, $A$ cannot have an inverse, because it is not even square (let alone invertible) in general.
So why is $A^+$ called the pseudoinverse?
Let's go back to our favorite equation, $A\mathbf{x} = \mathbf{b}$, specifically in the case where there are no solutions.
In that case, we can find least-squares solutions by finding $\hat{\mathbf{x}}$ such that $A\hat{\mathbf{x}}$ is the projection of $\mathbf{b}$ onto $\operatorname{Col}A$.
And, if $A^TA$ is invertible, that is given by

$$\hat{\mathbf{x}} = (A^TA)^{-1}A^T\mathbf{b}.$$

But what if $A^TA$ is not invertible?
There are still least-squares solutions, but now there are an infinite number of them.
What if we just want to find one of them?
Let's use the pseudoinverse:

$$\hat{\mathbf{x}} = A^+\mathbf{b} = V\Sigma^{-1}U^T\mathbf{b}.$$

Then:

$$A\hat{\mathbf{x}} = U\Sigma V^T V\Sigma^{-1}U^T\mathbf{b} = UU^T\mathbf{b}.$$

Now, the columns of $U$ form an orthonormal basis for $\operatorname{Col}A$.
And $U^T\mathbf{b}$ gives the coefficients of the projection of $\mathbf{b}$ onto each column of $U$, since the columns are unit length.
So $UU^T\mathbf{b}$ is the projection of $\mathbf{b}$ onto $\operatorname{Col}A$.
So $\hat{\mathbf{x}} = A^+\mathbf{b}$ is a least-squares solution of $A\mathbf{x} = \mathbf{b}$,
even when $A^TA$ is not invertible,
i.e., this formula works for any $A$.
Remember, any matrix $A$ has an SVD, and so any matrix $A$ has a pseudoinverse!
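Here is a quick numerical sanity check (using a made-up, deliberately rank-deficient matrix) that the pseudoinverse built from the reduced SVD really does produce a least-squares solution:

import numpy as np

rng = np.random.default_rng(1)
A = rng.standard_normal((10, 3)) @ rng.standard_normal((3, 4))   # 10 x 4, rank 3
b = rng.standard_normal(10)

U, S, Vt = np.linalg.svd(A, full_matrices=False)
r = np.sum(S > S.max() * max(A.shape) * np.finfo(float).eps)     # numerical rank
A_pinv = Vt[:r].T @ np.diag(1.0 / S[:r]) @ U[:, :r].T            # A+ = V Sigma^-1 U^T

x_hat = A_pinv @ b
# the residual b - A x_hat is orthogonal to Col A, so x_hat is a least-squares solution
print(np.allclose(A.T @ (b - A @ x_hat), 0))                     # True
# and it agrees with numpy's own (minimum-norm) least-squares solver
print(np.allclose(x_hat, np.linalg.lstsq(A, b, rcond=None)[0]))  # True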
In today's lecture we will explore a new way to use the SVD: as a way to transform our data objects.
As a reminder, here is what the SVD looks like:

$$A = U\Sigma V^T.$$

Notice that $U$ contains a row for each object (each row of $A$).
If we keep only the first $k$ columns of $U$ (and the corresponding singular values), then in a sense we have transformed the objects from a $d$-dimensional space to a $k$-dimensional space, where $k$ is (probably much) smaller than $d$.
This is an example of dimensionality reduction.
When we take our data to be the rows of $U_k\Sigma_k$ instead of the rows of $A$, we are reducing the dimension of our data from $d$ dimensions to $k$ dimensions (here $U_k$ denotes the first $k$ columns of $U$, and $\Sigma_k$ the top-left $k \times k$ block of $\Sigma$).
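Here is a small shape check (with made-up data) of what this transformation looks like in numpy:

import numpy as np

rng = np.random.default_rng(2)
A = rng.standard_normal((100, 20))           # 100 objects, each with d = 20 features
k = 3

U, S, Vt = np.linalg.svd(A, full_matrices=False)
A_reduced = U[:, :k] @ np.diag(S[:k])        # one k-dimensional row per object

print(A.shape, A_reduced.shape)              # (100, 20) (100, 3)
# multiplying back by the top k right singular vectors gives the best rank-k approximation of A
rel_err = np.linalg.norm(A - A_reduced @ Vt[:k]) / np.linalg.norm(A)
print(rel_err)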
This suggests a question:
Is there an optimal transformation of the data into $k$ dimensions?
What would that mean?
Here is a natural criterion for the "best" $k$-dimensional transformation:
Find the $k$-dimensional hyperplane that is "closest" to the points.
More precisely:
Given $n$ points in $\mathbb{R}^d$, find the hyperplane (affine space) of dimension $k$ with the property that the squared distance of the points to their orthogonal projections onto the hyperplane is minimized.
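Written as an optimization problem (the notation here, $\boldsymbol{\mu}$ for the offset of the affine space and $V_k$ for an orthonormal basis of its direction, is introduced just for this statement):

$$\min_{\boldsymbol{\mu},\; V_k:\, V_k^TV_k = I}\; \sum_{i=1}^{n} \left\Vert \mathbf{x}_i - \boldsymbol{\mu} - V_kV_k^T(\mathbf{x}_i - \boldsymbol{\mu}) \right\Vert^2.$$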
# Source: https://liorpachter.wordpress.com/2014/05/26/what-is-principal-component-analysis/
display(Image("images/19-pca_figure1.jpeg", width=500))
This sounds like an appealing criterion.
But it also turns out to have a strong statistical guarantee.
In fact, the transformation defined by this criterion captures the maximum variance in the data.
That is, the resulting $k$-dimensional dataset is the one with maximum variance.
Let's see why this is the case.
First, let's recall the idea of a dataset centroid.
Given a data matrix $X$ with the $n$ observations $\mathbf{x}_1, \dots, \mathbf{x}_n$ on the rows (as always), the centroid is

$$\boldsymbol{\mu} = \frac{1}{n}\sum_{i=1}^{n} \mathbf{x}_i.$$

In words: the centroid -- i.e., the mean vector -- is the average over the rows.
It is the "center of mass" of the dataset.
Next, we define the sample variance of a dataset as

$$\operatorname{Var}(X) = \frac{1}{n}\sum_{i=1}^{n} \Vert \mathbf{x}_i - \boldsymbol{\mu} \Vert^2,$$

where $\mathbf{x}_i$ is row $i$ of $X$.
In other words, the sample variance of the set of points is the average squared distance from each point to the centroid.
Consider what happens if we move the points (translate each point by some constant amount). Clearly, the sample variance does not change.
So, let's move the points to be centered on the origin.
The sample variance of the new points $\tilde{\mathbf{x}}_i = \mathbf{x}_i - \boldsymbol{\mu}$ is the same as that of the old points $\mathbf{x}_i$, but the centroid of the new point set is the origin.
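A quick numerical check (with synthetic data) that translation leaves the sample variance unchanged while moving the centroid to the origin:

import numpy as np

rng = np.random.default_rng(3)
X = rng.standard_normal((500, 2)) * [2.0, 0.5] + [-6, 3]    # synthetic data

mu = X.mean(axis=0)                                   # centroid (mean vector)
var = np.mean(np.sum((X - mu) ** 2, axis=1))          # average squared distance to the centroid

Xc = X - mu                                           # translate: centroid moves to the origin
print(np.allclose(Xc.mean(axis=0), 0))                # True
print(np.isclose(np.mean(np.sum(Xc ** 2, axis=1)), var))   # True: variance unchanged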
Now that the mean of the points is the zero vector, we can reason geometrically.
Here is a picture to show why the distance-minimizing subspace is variance-maximizing.
In this figure:
# Source: https://liorpachter.wordpress.com/2014/05/26/what-is-principal-component-analysis/
display(Image("images/19-pca_figure2.jpeg", width=500))
The length of the black line is fixed -- it is the distance of the original point from the origin.
So the squared length of the black line is this point's contribution to the sample variance.
Now, regardless of how we shift the green line, a right triangle is formed because the projection is orthogonal.
So -- by virtue of the Pythagorean Theorem, the blue line squared plus the red line squared equals the black line squared.
Which means that when we shift the subspace (green line) so as to minimize the squared distances to all the example points (blue lines) we automatically maximize the squared distance of all the resulting blue points to the origin (red lines).
And the squared distance of the blue point from the origin (red dashed line) is its contribution to the new $k$-dimensional sample variance.
In other words, the distance-minimizing projection is the variance-maximizing projection!
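To make the Pythagorean argument concrete, here is a small numerical check (with synthetic, mean-centered data and an arbitrary unit direction playing the role of the green line):

import numpy as np

rng = np.random.default_rng(4)
Xc = rng.standard_normal((200, 2)) @ np.array([[2.0, 0.3], [0.3, 0.5]])
Xc = Xc - Xc.mean(axis=0)                    # center the points on the origin

v = np.array([1.0, 2.0])
v = v / np.linalg.norm(v)                    # unit vector spanning a 1-D subspace

proj_sq = (Xc @ v) ** 2                                        # squared projection lengths (red lines)
resid_sq = np.sum((Xc - np.outer(Xc @ v, v)) ** 2, axis=1)     # squared distances to the line (blue lines)
orig_sq = np.sum(Xc ** 2, axis=1)                              # squared distances to the origin (black lines)

print(np.allclose(proj_sq + resid_sq, orig_sq))                # True, for every point and any v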
# Source: https://builtin.com/data-science/step-step-explanation-principal-component-analysis
display(Image("figures/pca.gif", width=800))
Example. Here is a dataset in $\mathbb{R}^2$.
#
n_samples = 500
C = np.array([[0.1, 0.6], [2., .6]])
X = np.random.randn(n_samples, 2) @ C + np.array([-6, 3])
ax = plt.figure(figsize = (7, 7)).add_subplot()
plt.xlim([-12, 12])
plt.ylim([-7, 7])
centerAxes(ax)
plt.axis('equal')
plt.scatter(X[:, 0], X[:, 1], s=10, alpha=1);
Next, we compute the mean-centered data:

$$\tilde{X} = X - \boldsymbol{\mu}.$$

This translates each point so that the new mean is the origin.
#
Xc = X - np.mean(X,axis=0)
ax = plt.figure(figsize = (7, 7)).add_subplot()
plt.xlim([-12, 12])
plt.ylim([-7, 7])
centerAxes(ax)
plt.axis('equal')
plt.scatter(X[:, 0], X[:, 1], s=10, alpha=0.8)
plt.scatter(Xc[:, 0], Xc[:, 1], s=10, alpha=0.8, color='r');
Now, the last step is to find the $k$-dimensional subspace (here $k = 1$) that minimizes the sum of squared Euclidean distances between the data (red points) and their projections onto the subspace.
In other words, what rank-$k$ matrix $X^{(k)}$ is closest to $\tilde{X}$? We seek

$$\min_{\operatorname{rank}(X^{(k)}) = k} \Vert \tilde{X} - X^{(k)} \Vert_F.$$

We know how to find this matrix -- via the SVD!
So for this case, let's construct the best 1-D approximation of the mean-centered data:
Xc = X - np.mean(X, axis = 0)
u, s, vt = np.linalg.svd(Xc, full_matrices=False)
print(s)
[44.63537799 11.91004671]
scopy = s.copy()
scopy[1] = 0.
reducedX = u @ np.diag(scopy) @ vt
#
ax = plt.figure(figsize = (7, 7)).add_subplot()
centerAxes(ax)
plt.axis('equal')
plt.scatter(Xc[:,0],Xc[:,1], color='r')
plt.scatter(reducedX[:,0], reducedX[:,1])
endpoints = np.array([[-10],[10]]) @ vt[[0],:]
plt.plot(endpoints[:,0], endpoints[:,1], 'g-');
This method is called Principal Component Analysis.
In summary, PCA consists of:
1. Mean-center the data.
2. Compute the SVD of the mean-centered data.
3. Keep only the first $k$ singular values and vectors (the best rank-$k$ approximation).
This is equivalent to projecting the data onto the hyperplane that captures the maximum variance in the data.
It winds up constructing the best low dimensional linear approximation of the data.
What are "principal components"?
These are nothing more than the columns of (or the rows of ). Because they capture the direction of maximum variation, they are called "principal" components.
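As a sanity check, scikit-learn's PCA class carries out exactly these steps; here is a minimal comparison against the SVD computation above (the principal direction may differ by a sign flip, so we compare absolute values):

from sklearn.decomposition import PCA

pca = PCA(n_components=1)
scores = pca.fit_transform(X)        # PCA mean-centers internally, then projects onto PC 1

# the principal direction agrees with the first right singular vector (up to sign)
print(np.allclose(np.abs(pca.components_[0]), np.abs(vt[0])))
# the rank-1 reconstruction agrees with the SVD-based one computed above
print(np.allclose(pca.inverse_transform(scores) - X.mean(axis=0), reducedX))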
There are many uses of PCA (and SVD).
We won't have time to cover anomaly detection today. The basic idea is that rather than trying to find anomalous points in a high-dimensional dataset, we can use PCA to project the points into a lower, $k$-dimensional space and look for anomalies there.
We will study both visualization and denoising in the context of text processing.
As we have seen, a common way to work with documents is to build a matrix whose rows contain all documents and whose columns contain all terms.
You could imagine encoding a dataset as such:
#
display(Image("images/09-document-term.png", width=1000))
Notice that this representation does not take into account word order or sentence structure. It's an example of a bag of words approach.
The bag of words encoding for a document is to treat the document as a multiset of words. That is, we simply count how many times each word occurs. It is a "bag" because the order of the words in the document is lost.
Surprisingly, we can still tell a lot about the document even without knowing its word ordering.
Counting the number of times each word occurs in a document yields a vector of term frequencies.
However, simply using the "bag of words" directly has a number of drawbacks. We will adjust our vectors to account for two of them.
If a single document contains many words, then we expect all term frequencies to be high, even for relatively unimportant words. To adjust for this, it often makes sense to normalize the frequency vectors to have unit length.
If a single word occurs in many documents, then it's probably not that useful, so again let's try to reduce this word's weight. The idea here is that rare words are more informative than common words.
This is described directly in the scikit-learn documentation:
In a large text corpus, some words will be very [frequent] (e.g. “the”, “a”, “is” in English) hence carrying very little meaningful information about the actual contents of the document.
If we were to feed the direct count data directly to a classifier those very frequent terms would shadow the frequencies of rarer yet more interesting terms.
In order to re-weight the count features into floating point values suitable for usage by a classifier it is very common to use the tf–idf transform.
Tf means term-frequency while tf–idf means term-frequency times inverse document-frequency.
This was originally a term weighting scheme developed for information retrieval (as a ranking function for search engine results) that has also found good use in document classification and clustering.
Hence, the definition of tf-idf is as follows.
First, the term frequency is simply the count:

$$\mathrm{tf}(t, d) = \text{the number of times term } t \text{ appears in document } d.$$

Next, if $N$ is the total number of documents in the corpus, then the inverse document frequency is

$$\mathrm{idf}(t) = \log \frac{N}{\vert \{ d : t \in d \} \vert},$$

where the denominator is the number of documents in which the term $t$ appears.
And finally:

$$\text{tf-idf}(t, d) = \mathrm{tf}(t, d) \cdot \mathrm{idf}(t).$$

Matrices that represent text documents will commonly contain tf-idf scores.
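To make the definition concrete, here is a toy tf-idf computation on a made-up three-document corpus (note that scikit-learn's TfidfVectorizer uses a smoothed idf and normalizes each row, so its numbers differ slightly):

import numpy as np

docs = [["the", "cat", "sat"],
        ["the", "dog", "sat"],
        ["the", "dog", "barked"]]
vocab = sorted({w for d in docs for w in d})
N = len(docs)

tf = np.array([[d.count(w) for w in vocab] for d in docs], dtype=float)
df = np.count_nonzero(tf > 0, axis=0)        # number of documents containing each term
idf = np.log(N / df)
tfidf = tf * idf                             # "the" gets weight 0; rarer terms get more weight

for w, weight in zip(vocab, tfidf[0]):       # scores for the first document
    print(f'{w:8s} {weight:0.3f}')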
Often, terms are correlated -- they appear together in combinations that suggest a certain "concept".
That is, term-document matrices often show low effective rank -- many columns can be approximated as combinations of other columns.
When PCA is used for dimensionality reduction of documents, it tends to extract these "concept" vectors.
The application of PCA to term-document matrices is called Latent Semantic Analysis (LSA).
Among other benefits, LSA can improve the performance of clustering of documents.
This happens because the important concepts are captured in the most significant principal components.
Once again, let's use as an example the text corpus of 18,000 forum posts in 20 newsgroups. We will take a subset of just 3 newsgroups for this example.
from sklearn.datasets import fetch_20newsgroups
categories = ['comp.os.ms-windows.misc', 'sci.space','rec.sport.baseball']
news_data = fetch_20newsgroups(subset='train', categories=categories)
print(news_data.target_names)
print(news_data.target)
['comp.os.ms-windows.misc', 'rec.sport.baseball', 'sci.space'] [2 0 0 ... 2 1 2]
Let's read a few of the forum posts within this dataset.
print(news_data.data[-2])
Organization: University of Notre Dame - Office of Univ. Computing From: <RVESTERM@vma.cc.nd.edu> Subject: Re: MLB = NBA? Lines: 15 In article <1993Apr17.052025.10610@news.yale.edu>, (Sean Garrison) says: > >I think that >players' salaries are getting way out of hand to the point that they're on >a pace to become severely detrimental to baseball's future. > so you want to decrease players' salaries? so you want to increase owners' salaries? the two are equivalent. bob vesterman.
As a start, let's use $k$-means clustering on the tf-idf scores.
We can use the tokenizer within `sklearn` to compute the $n$-grams of the corpus (an $n$-gram is a sequence of $n$ consecutive terms).
We'll compute a document-term matrix `dtm`.
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(stop_words='english', min_df=4,max_df=0.8)
dtm = vectorizer.fit_transform(news_data.data)
print(type(dtm), dtm.shape)
terms = vectorizer.get_feature_names_out()
<class 'scipy.sparse.csr.csr_matrix'> (1781, 9409)
Let's first cluster the documents using the raw tf-idf scores. This is without any use of PCA, and so includes lots of noisy or meaningless terms.
from sklearn.cluster import KMeans
k = 3
kmeans = KMeans(n_clusters=k, init='k-means++', max_iter=100, n_init=10, random_state = 0)
kmeans.fit_predict(dtm)
centroids = kmeans.cluster_centers_
labels = kmeans.labels_
error = kmeans.inertia_
Let's evaluate the clusters. We'll assume that the newsgroup each article came from is the 'ground truth.' Remember that we studied two metrics to evaluate how good our clustering is: the Adjusted Rand Index, which compares the clustering to the ground-truth labels, and the Silhouette Score, which measures how well separated the clusters are without using labels.
import sklearn.metrics as metrics
ri = metrics.adjusted_rand_score(labels,news_data.target)
ss = metrics.silhouette_score(dtm,kmeans.labels_,metric='euclidean')
print('Rand Index is {}'.format(ri))
print('Silhouette Score is {}'.format(ss))
Rand Index is 0.8162962282193574 Silhouette Score is 0.009438086458856245
One source of noise that we can eliminate (before we use LSA) comes from word endings.
For example: a Google search on 'run' will return web pages on 'running.'
This is useful, because the difference between 'run' and 'running' in practice is not enough to matter.
The usual solution taken is to simply 'chop off' the part of the word that indicates a variation from the base word. This process is called 'stemming.'
A very good stemmer is the "Snowball" stemmer.
You can read more at http://www.nltk.org and http://www.nltk.org/howto/stem.html.
Installation Note: From a cell you need to call `nltk.download()` and select the appropriate packages from the interface that appears. In particular you need to download `stopwords` (from corpora), as well as `punkt` and `snowball_data` (from models).
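Before stemming the whole corpus, here is a quick illustration of what the stemmer does to individual words (this assumes nltk and the data packages above are installed):

from nltk.stem.snowball import SnowballStemmer

stemmer = SnowballStemmer("english", ignore_stopwords=True)
print([stemmer.stem(w) for w in ['run', 'running', 'runs', 'player', 'players']])
# ['run', 'run', 'run', 'player', 'player']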
Let's stem the data using the Snowball stemmer:
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import word_tokenize, sent_tokenize
stemmed_data = [" ".join(SnowballStemmer("english", ignore_stopwords=True).stem(word)
                         for sent in sent_tokenize(message)
                         for word in word_tokenize(sent))
                for message in news_data.data]
dtm = vectorizer.fit_transform(stemmed_data)
terms = vectorizer.get_feature_names_out()
And now let's see how well we can cluster on the stemmed data.
from sklearn.cluster import KMeans
k = 3
kmeans = KMeans(n_clusters=k, init='k-means++', max_iter=100, n_init=10,random_state=0)
kmeans.fit_predict(dtm)
centroids = kmeans.cluster_centers_
labels = kmeans.labels_
error = kmeans.inertia_
import sklearn.metrics as metrics
ri = metrics.adjusted_rand_score(labels,news_data.target)
ss = metrics.silhouette_score(dtm,kmeans.labels_,metric='euclidean')
print('Rand Index is {}'.format(ri))
print('Silhouette Score is {}'.format(ss))
Rand Index is 0.844864300823809 Silhouette Score is 0.010807547412327282
So the Rand Index went from 0.816 to 0.845 as a result of stemming.
OK. Now, let's apply PCA.
Our data matrix `dtm` is in sparse form.
First, we mean center the data. Note that `dtm` is a sparse matrix, but mean centering destroys the sparsity (nearly every entry becomes nonzero), so we first convert it to a dense matrix.
Then we use the SVD to reduce the dimension of the mean-centered data.
dtm_dense = dtm.todense()
centered_dtm = dtm_dense - np.mean(dtm_dense, axis=0)
u, s, vt = np.linalg.svd(centered_dtm)
Note that if you have sparse data, you may want to use `scipy.sparse.linalg.svds()`, and for large data it may be advantageous to use `sklearn.decomposition.TruncatedSVD()`.
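For example, here is a minimal sketch using TruncatedSVD directly on the sparse matrix (note that it does not mean-center the data, so it computes LSA on the raw tf-idf matrix rather than exact PCA; we will not use it below):

from sklearn.decomposition import TruncatedSVD

svd = TruncatedSVD(n_components=50, random_state=0)
docs_reduced = svd.fit_transform(dtm)        # shape: (number of documents, 50)
print(docs_reduced.shape)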
The principal components (the rows of $V^T$) encode the extracted concepts in terms of the vocabulary terms.
Each LSA concept is a linear combination of words.
pd.DataFrame(vt, columns=vectorizer.get_feature_names_out())
00 | 000 | 0005 | 0062 | 0096b0f0 | 00bjgood | 00mbstultz | 01 | 0114 | 01wb | ... | zri | zrlk | zs | zt | zu | zv | zw | zx | zy | zz | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.007831 | 0.012323 | 0.000581 | 0.005558 | 0.001032 | 0.002075 | 0.002008 | 0.005575 | 0.001247 | 0.000813 | ... | -0.000028 | -0.000025 | -0.000200 | -0.000025 | -0.000128 | -0.000207 | -0.000087 | -0.000150 | -0.000113 | 0.000534 |
1 | -0.005990 | 0.009540 | 0.002089 | -0.010679 | -0.001646 | -0.003477 | -0.002687 | 0.002143 | -0.003394 | 0.002458 | ... | -0.000015 | -0.000013 | -0.000054 | -0.000013 | -0.000042 | -0.000100 | -0.000026 | -0.000064 | -0.000040 | -0.001041 |
2 | -0.012630 | -0.011904 | -0.002443 | 0.001438 | 0.000439 | 0.000044 | 0.000349 | -0.006817 | 0.000692 | -0.001124 | ... | -0.000095 | -0.000086 | -0.000289 | -0.000087 | -0.000252 | -0.000576 | -0.000134 | -0.000293 | -0.000204 | -0.000013 |
3 | 0.013576 | 0.017639 | 0.003552 | 0.001148 | 0.003354 | -0.000410 | 0.000622 | 0.011649 | 0.002237 | 0.001969 | ... | 0.000205 | 0.000186 | 0.000486 | 0.000172 | 0.000464 | 0.001142 | 0.000220 | 0.000508 | 0.000352 | 0.000200 |
4 | -0.002254 | -0.004619 | -0.005458 | -0.001938 | -0.000251 | 0.000689 | 0.000043 | -0.002620 | -0.000533 | 0.001434 | ... | -0.000310 | -0.000283 | -0.000775 | -0.000252 | -0.000698 | -0.001714 | -0.000331 | -0.000728 | -0.000529 | -0.000961 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
8048 | 0.000020 | -0.000030 | -0.000339 | 0.000039 | -0.001961 | -0.000478 | -0.000579 | 0.000596 | -0.000454 | -0.000181 | ... | -0.001449 | -0.002008 | 0.000133 | -0.000930 | 0.000645 | 0.976198 | -0.000030 | 0.000288 | -0.000039 | 0.000653 |
8049 | 0.000135 | 0.000259 | -0.000068 | -0.000152 | 0.001303 | -0.000076 | -0.000020 | -0.000700 | -0.000037 | -0.000339 | ... | 0.000114 | 0.000102 | -0.000522 | -0.000170 | -0.001085 | -0.000133 | 0.999364 | -0.000520 | -0.000746 | -0.000892 |
8050 | -0.000197 | 0.001024 | -0.000109 | -0.000631 | -0.000236 | -0.000363 | -0.000234 | 0.000086 | 0.000131 | 0.000369 | ... | 0.000246 | 0.000210 | -0.001592 | -0.000180 | -0.000885 | 0.000152 | -0.000515 | 0.996860 | -0.001114 | 0.000814 |
8051 | 0.000012 | 0.000626 | -0.000220 | -0.000342 | 0.000552 | -0.000039 | -0.000120 | -0.000675 | -0.000036 | 0.000133 | ... | 0.000202 | 0.000184 | -0.000606 | -0.000289 | -0.001269 | -0.000159 | -0.000768 | -0.001135 | 0.998707 | -0.000869 |
8052 | 0.000102 | 0.000328 | 0.000567 | -0.000249 | -0.003388 | 0.002159 | 0.001577 | -0.001129 | 0.000365 | 0.000210 | ... | -0.000003 | 0.000018 | 0.000072 | -0.001076 | -0.002250 | 0.000697 | -0.000813 | 0.000834 | -0.000757 | 0.978107 |
8053 rows × 8053 columns
names = np.array(vectorizer.get_feature_names_out())
for cl in range(3):
    print(f'\nPrincipal Component {cl}:')
    idx = np.array(np.argsort(vt[cl]))[0][-10:]
    for i in idx[::-1]:
        print(f'{names[i]:12s} {vt[cl, i]:0.3f}')
Principal Component 0:
year 0.140, game 0.111, henri 0.108, team 0.107, space 0.106, nasa 0.091, toronto 0.086, alaska 0.083, player 0.079, hit 0.077

Principal Component 1:
space 0.260, nasa 0.218, henri 0.184, gov 0.135, orbit 0.134, alaska 0.129, access 0.129, toronto 0.118, launch 0.109, digex 0.102

Principal Component 2:
henri 0.458, toronto 0.364, zoo 0.228, edu 0.201, spencer 0.194, zoolog 0.184, alaska 0.123, work 0.112, umd 0.096, utzoo 0.092
The rows of $U$ correspond to documents, which are expressed as linear combinations of concepts.
In order to improve our clustering accuracy, we will exclude the less significant concepts from the documents' feature vectors.
That is, we will use only the leftmost $k$ columns of $U$ (and the topmost $k$ rows of $V^T$).
The reduced set of columns of $U$, scaled by the corresponding singular values, gives our new document encodings, and it is those that we will cluster.
plt.xlim([0,50])
plt.xlabel('Number of Principal Components (Rank $k$)')
plt.ylabel('Singular Values')
plt.plot(range(1,len(s)+1), s);
It looks like the first 2 principal components have larger singular values than the rest.
Now let's measure how clustering would work with the first few principal components. Remember the ground truth: the data really come from one of 3 different forums.
news_data.target_names
['comp.os.ms-windows.misc', 'rec.sport.baseball', 'sci.space']
#
ri = []
ss = []
max = len(u)
for k in range(1,50):
    vectorsk = u[:,:k] @ np.diag(s[:k])
    kmeans = KMeans(n_clusters=3, init='k-means++', max_iter=100, n_init=10, random_state=0)
    kmeans.fit_predict(vectorsk)
    labelsk = kmeans.labels_
    ri.append(metrics.adjusted_rand_score(labelsk,news_data.target))
    ss.append(metrics.silhouette_score(vectorsk,kmeans.labels_,metric='euclidean'))
plt.plot(range(1,50),ri)
plt.ylabel('Rand Score',size=20)
plt.xlabel('No of Prin Comps',size=20);
plt.plot(range(1,50),ss)
plt.ylabel('Silhouette Score', size=20)
plt.xlabel('No of Prin Comps', size=20);
Note that we can get good accuracy and coherent clusters with just two principal components.
That's a good thing, because it means that we can also visualize the data well with the help of PCA.
The challenge of visualization is that the data live in a high dimensional space.
We humans can only look at 2 (or maybe 3) dimensions at a time... but it's often not clear which dimensions to look at.
The idea behind using PCA for visualization is that since low-numbered principal components capture most of the variance in the data, these are the "directions" from which it is most useful to inspect the data.
We saw that the first two principal components were particularly large -- let's start by using them for visualization.
#
import seaborn as sns
Xk = u @ np.diag(s)
with sns.axes_style("white"):
    fig, ax = plt.subplots(1,1,figsize=(7,7))
    cmap = sns.hls_palette(n_colors=3, h=0.35, l=0.4, s=0.9)
    for i, label in enumerate(set(news_data.target)):
        point_indices = np.where(news_data.target == label)[0]
        point_indices = point_indices.tolist()
        plt.scatter(np.ravel(Xk[point_indices, 0]), np.ravel(Xk[point_indices, 1]), s=20, alpha=0.5,
                    color=cmap[i], marker='D', label=news_data.target_names[i])
    plt.legend(loc='best')
    sns.despine()
    plt.title('Ground Truth Labels', size=20);
Points in this plot have been labelled with their "true" (aka "ground truth") cluster labels.
Notice how clearly the clusters separate and how coherently they present themselves. This is obviously an excellent visualization provided by PCA.
Since this visualization is so clear, we can use it to examine the results of our various clustering methods and get some insight into how they differ.
#
k = 3
kmeans = KMeans(n_clusters=k, init='k-means++', max_iter=100, n_init=10,random_state=0)
kmeans.fit_predict(dtm)
centroids = kmeans.cluster_centers_
labels = kmeans.labels_
error = kmeans.inertia_
with sns.axes_style("white"):
    fig, ax = plt.subplots(1,1,figsize=(7,7))
    cmap = sns.hls_palette(n_colors=3, h=0.35, l=0.4, s=0.9)
    for i in range(k):
        point_indices = np.where(labels == i)[0]
        point_indices = point_indices.tolist()
        plt.scatter(np.ravel(Xk[point_indices, 0]), np.ravel(Xk[point_indices, 1]), s=20, alpha=0.5,
                    color=cmap[i], marker='D', label=news_data.target_names[i])
    sns.despine()
    plt.title('Clusters On Full Dataset, Dimension = {}\nRand Score = {:0.3f}'.format(dtm.shape[1],
              metrics.adjusted_rand_score(labels,news_data.target)),
              size=20);
#
k = 3
kmeans = KMeans(n_clusters=k, init='k-means++', max_iter=100, n_init=10,random_state=0)
kmeans.fit_predict(Xk[:,:2])
centroids = kmeans.cluster_centers_
Xklabels = kmeans.labels_
error = kmeans.inertia_
with sns.axes_style("white"):
    fig, ax = plt.subplots(1,1,figsize=(7,7))
    cmap = sns.hls_palette(n_colors=3, h=0.35, l=0.4, s=0.9)
    for i, label in enumerate(set(news_data.target)):
        point_indices = np.where(Xklabels == label)[0]
        point_indices = point_indices.tolist()
        plt.scatter(np.ravel(Xk[point_indices,0]), np.ravel(Xk[point_indices,1]), s=20, alpha=0.5,
                    color=cmap[i], marker='D')
    sns.despine()
    plt.title('Clusters On PCA-reduced Dataset, Dimension = 2\nRand Score = {:0.3f}'.format(
              metrics.adjusted_rand_score(Xklabels,news_data.target)),
              size=20);
What happens if we misjudge the number of clusters? Let's try forming 6 clusters instead.
#
k = 6
kmeans = KMeans(n_clusters=k, init='k-means++', max_iter=100, n_init=10,random_state=0)
kmeans.fit_predict(dtm)
centroids = kmeans.cluster_centers_
labels = kmeans.labels_
error = kmeans.inertia_
with sns.axes_style("white"):
    fig, ax = plt.subplots(1,1,figsize=(10,10))
    cmap = sns.hls_palette(n_colors=k, h=0.35, l=0.4, s=0.9)
    for i in range(k):
        point_indices = np.where(labels == i)[0]
        point_indices = point_indices.tolist()
        plt.scatter(np.ravel(Xk[point_indices,0]), np.ravel(Xk[point_indices,1]), s=20, alpha=0.5,
                    color=cmap[i], marker='D')
    sns.despine()
    plt.title('Clusters On Full Dataset, Dimension = {}'.format(dtm.shape[1]), size=20)
    plt.title(f'K means with six clusters on full dataset; Rand Score {metrics.adjusted_rand_score(labels,news_data.target):0.2f}');
#
k = 6
kmeans = KMeans(n_clusters=k, init='k-means++', max_iter=100, n_init=10,random_state=0)
kmeans.fit_predict(Xk[:,:6])
centroids = kmeans.cluster_centers_
labels = kmeans.labels_
error = kmeans.inertia_
with sns.axes_style("white"):
    fig, ax = plt.subplots(1,1,figsize=(10,10))
    cmap = sns.hls_palette(n_colors=k, h=0.35, l=0.4, s=0.9)
    for i in range(k):
        point_indices = np.where(labels == i)[0]
        point_indices = point_indices.tolist()
        plt.scatter(np.ravel(Xk[point_indices,0]), np.ravel(Xk[point_indices,1]), s=20, alpha=0.5,
                    color=cmap[i], marker='D')
    sns.despine()
    plt.title(f'K means with six clusters; Rand Score {metrics.adjusted_rand_score(labels,news_data.target):0.2f}');
What about the other principal components? Are they useful for visualization?
A common approach is to look at all pairs of (low-numbered) principal components, to look for additional structure in the data.
Let's form 2-dimensional plots using any two of the first five principal components. The colors in the figures below correspond to the ground truth of which newsgroup each post belongs to.
#
k = 5
Xk = u[:,:k] @ np.diag(s[:k])
X_df = pd.DataFrame(Xk)
g = sns.PairGrid(X_df)
def pltColor(x, y, color):
    cmap = sns.hls_palette(n_colors=3, h=0.35, l=0.4, s=0.9)
    for i in range(3):
        point_indices = np.where(news_data.target == i)[0]
        point_indices = point_indices.tolist()
        plt.scatter(x[point_indices], y[point_indices], color=cmap[i], s=3)
    sns.despine()
g.map(pltColor);
We see that the first two principal components do indeed do the best job of separating the clusters.