In [2]:
#
import numpy as np
import scipy as sp
import pandas as pd
import matplotlib as mp
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
import laUtilities as ut
import slideUtilities as sl
import demoUtilities as dm
from matplotlib import animation
from importlib import reload
from datetime import datetime
from IPython.display import Image, display_html, display, Math, HTML;
qr_setting = None

mp.rcParams['animation.html'] = 'jshtml';
In [2]:
#
def centerAxes(ax):
    ax.spines['left'].set_position('zero')
    ax.spines['right'].set_color('none')
    ax.spines['bottom'].set_position('zero')
    ax.spines['top'].set_color('none')
    ax.xaxis.set_ticks_position('bottom')
    ax.yaxis.set_ticks_position('left')
    bounds = np.array([ax.axes.get_xlim(), ax.axes.get_ylim()])
    ax.plot(bounds[0][0],bounds[1][0],'')
    ax.plot(bounds[0][1],bounds[1][1],'')

Announcements¶

  • Homework:
    • Optional bonus homework out, due Sunday
      • Will add up to 5% to your final grade
  • Final exam: Monday, May 8 from 12-2pm
  • Upcoming office hours:
    • Today: Peer tutor Daniel Cho from 12:30-3:30pm on the CCDS 16th floor
    • Today: Abhishek Tiwari from 4-5pm on the CCDS 13th floor
  • Reading
    • Aggarwal Sections 8.1-8.2

Recap¶

Every matrix $A$ has a singular value decomposition. We can take the SVD of a rank-$r$ matrix $A$,

$$A = U\Sigma V^T,$$

and create a rank-$k$ approximation by truncating the SVD so that

  1. $U$ is $m \times k$
  2. $V$ is $n \times k$
  3. The matrix $\Sigma$ is a $k \times k$ diagonal matrix, whose diagonal values are $\sigma_1 \geq \sigma_2 \geq \cdots \geq \sigma_k > 0$.

(A minimal numpy sketch of this truncation is shown below.)
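This sketch is not part of the lecture code; the matrix and the choice of $k$ are arbitrary illustrations.

In [ ]:
import numpy as np

rng = np.random.default_rng(0)
A = rng.standard_normal((8, 5))            # hypothetical m x n matrix
k = 2

U, s, Vt = np.linalg.svd(A, full_matrices=False)
A_k = U[:, :k] @ np.diag(s[:k]) @ Vt[:k]   # rank-k approximation U_k Sigma_k V_k^T

print(np.linalg.matrix_rank(A_k))          # 2
print(np.linalg.norm(A - A_k))             # Frobenius-norm approximation error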

Low-effective-rank is common¶

There are two helpful interpretations:

  1. Common Patterns
  2. Latent Factors
  1. Finding common patterns in your data
In [13]:
#
with open('data/net-traffic/AbileneFlows/odnames','r') as f:
    odnames = [line.strip() for line in f]
dates = pd.date_range('9/1/2003',freq='10min',periods=1008)
Atraf = pd.read_table('data/net-traffic/AbileneFlows/X',sep='  ',header=None,names=odnames,engine='python')
Atraf.index = dates
plt.figure(figsize=(10,8))
for i in range(1,13):
    ax = plt.subplot(4,3,i)
    Atraf.iloc[:,i-1].plot()
    plt.title(odnames[i-1])
plt.subplots_adjust(hspace=1)
plt.suptitle('Twelve Example Traffic Traces', size=20);
  2. Identifying potential latent factors
In [20]:
#
display(Image("images/19-netflix.png", width=500))

Source: Koren et al, IEEE Computer, 2009

Lecture 34: Principal Component Analysis¶

[This lecture is based on Prof. Crovella's CS 132 and CS 506 lecture notes, and Lior Pachter's blog.]

34.1 The Pseudoinverse¶

Consider the case where we are working with the reduced SVD of $A$:

$$A = U\Sigma V^T.$$

In the reduced SVD, $\Sigma$ is invertible (it is a diagonal matrix with all positive entries on the diagonal).

Using this decomposition we can define an important matrix corresponding to $A$:

$$A^+ = V\Sigma^{-1}U^T$$

This matrix $A^+$ is called the pseudoinverse of $A$.

(It is sometimes called the Moore-Penrose pseudoinverse.)

In general, $A$ cannot have an inverse, because it is not even square (let alone invertible).

So why is $A^+$ called the pseudoinverse?

Let's go back to our favorite equation, $Ax = b$, specifically in the case where there are no solutions.

In that case, we can find least-squares solutions by finding $\hat{x}$ such that $A\hat{x}$ is the projection of $b$ onto $\operatorname{Col} A$.

And, if $A^TA$ is invertible, that $\hat{x}$ is given by $\hat{x} = (A^TA)^{-1}A^Tb$.

But what if $A^TA$ is not invertible?

There are still least-squares solutions, but now there are infinitely many.

What if we just want to find one of them?

Let's use the pseudoinverse:

$$\hat{x} = A^+b$$

Then:

$$A\hat{x} = AA^+b$$
$$= (U\Sigma V^T)(V\Sigma^{-1}U^T)b$$
$$= U\Sigma\Sigma^{-1}U^Tb$$
$$= UU^Tb$$

Now, the columns of $U$ form an orthonormal basis for $\operatorname{Col} A$.

And $U^Tb$ gives the coefficients of the projection of $b$ onto each column of $U$, since the columns are unit length.

So $UU^Tb$ is the projection of $b$ onto $\operatorname{Col} A$.

So $\hat{x} = A^+b$ is a least-squares solution of $Ax = b$,

even when $A^TA$ is not invertible,

i.e., this formula works for any $A$.

Remember, any $A$ has an SVD, and so any $A$ has a pseudoinverse!
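To make this concrete, here is a minimal sketch (not part of the lecture code) that builds $A^+$ from the reduced SVD of a hypothetical rank-deficient matrix and checks that $\hat{x} = A^+b$ satisfies the normal equations, i.e., is a least-squares solution:

In [ ]:
import numpy as np

rng = np.random.default_rng(0)
A = rng.standard_normal((6, 3)) @ rng.standard_normal((3, 4))    # 6 x 4 matrix of rank 3
b = rng.standard_normal(6)

# Reduced SVD: keep only the nonzero singular values, so Sigma is invertible
U, sigma, Vt = np.linalg.svd(A, full_matrices=False)
r = np.sum(sigma > 1e-10)
A_pinv = Vt[:r].T @ np.diag(1 / sigma[:r]) @ U[:, :r].T          # A+ = V Sigma^{-1} U^T

x_hat = A_pinv @ b
print(np.allclose(A_pinv, np.linalg.pinv(A)))                    # matches numpy's pseudoinverse
print(np.allclose(A.T @ (A @ x_hat - b), np.zeros(4)))           # normal equations hold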

34.2 Dimensionality Reduction and PCA¶

In today's lecture we will explore a new way to use SVD: as a way to transform our data objects.

As a reminder, here is what the SVD looks like:

$$
\text{objects} \left\{ \overbrace{\begin{bmatrix} \vdots & \vdots & & \vdots \\ a_1 & a_2 & \cdots & a_n \\ \vdots & \vdots & & \vdots \end{bmatrix}}^{\text{features}} \right.
=
\overbrace{\begin{bmatrix} \vdots & & \vdots \\ \sigma_1 u_1 & \cdots & \sigma_k u_k \\ \vdots & & \vdots \end{bmatrix}}^{k}
\times
\begin{bmatrix} \cdots & v_1 & \cdots \\ & \vdots & \\ \cdots & v_k & \cdots \end{bmatrix}
\qquad A = U \Sigma V^T
$$

Notice that $U$ contains a row for each object.

In a sense we have transformed objects from an $n$-dimensional space to a $k$-dimensional space, where $k$ is (probably much) smaller than $n$.

This is an example of dimensionality reduction.

When we take our data to be the rows of $U\Sigma$ instead of the rows of $A$, we are reducing the dimension of our data from $n$ dimensions to $k$ dimensions.
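As a quick sanity check on this interpretation (not part of the lecture code, using a hypothetical random matrix): the rows of $U_k\Sigma_k$ are exactly the coordinates of the rows of $A$ with respect to the basis $v_1, \dots, v_k$, since $AV_k = U_k\Sigma_k$.

In [ ]:
import numpy as np

rng = np.random.default_rng(1)
A = rng.standard_normal((100, 20))     # 100 objects with 20 features (illustrative data)
k = 3

U, s, Vt = np.linalg.svd(A, full_matrices=False)
reduced = U[:, :k] * s[:k]             # rows of U_k Sigma_k: the k-dimensional representation

# Each reduced row is the corresponding row of A expressed in the basis v_1, ..., v_k
print(np.allclose(A @ Vt[:k].T, reduced))   # True
print(reduced.shape)                        # (100, 3)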

This suggests a question:

Is there an optimal transformation of the data into $k$ dimensions?

What would that mean?

Here is a natural criterion for the "best" $k$-dimensional transformation:

Find the $k$-dimensional hyperplane that is "closest" to the points.

More precisely:

Given a set of points in $\mathbb{R}^n$, find the hyperplane (affine space) of dimension $k$ with the property that the sum of squared distances of the points to their orthogonal projections onto the hyperplane is minimized.

In [23]:
# Source: https://liorpachter.wordpress.com/2014/05/26/what-is-principal-component-analysis/
display(Image("images/19-pca_figure1.jpeg", width=500))

This sounds like an appealing criterion.

But it also turns out to have a strong statistical guarantee.

In fact, this criterion yields the transformation that captures the maximum variance in the data.

That is, the resulting $k$-dimensional dataset is the one with maximum variance.

Let's see why this is the case.

Centroid and Variance¶

First, let's recall the idea of a dataset centroid.

Given an $n \times d$ data matrix $X$ with observations on the rows (as always), the centroid is

$$\bar{x}^T = \frac{1}{n}\mathbf{1}^T X.$$

In words: the centroid -- i.e., the mean vector -- is the average over the rows.

It is the "center of mass" of the dataset.

Next, we define the sample variance of a dataset as:

$$\operatorname{Var}(X) = \frac{1}{n}\sum_j \left\| x_j^T - \bar{x}^T \right\|^2$$

where $x_j^T$ is row $j$ of $X$.

In other words, the sample variance of the set of points is the average squared distance from each point to the centroid.

Notice that if we translate the points (shift every point by the same constant vector), the sample variance clearly does not change.

So, let's move the points to be centered on the origin.

$$\tilde{X} = X - \mathbf{1}\bar{x}^T$$

The sample variance of the new points $\tilde{X}$ is the same as that of the old points $X$, but the centroid of the new point set is the origin.
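Here is a minimal numeric check of that claim (not part of the lecture code; the data are an arbitrary illustration):

In [ ]:
import numpy as np

rng = np.random.default_rng(2)
X = rng.standard_normal((500, 2)) + np.array([3.0, -1.0])    # data with a nonzero centroid

def sample_variance(X):
    xbar = X.mean(axis=0)
    return np.mean(np.sum((X - xbar)**2, axis=1))    # average squared distance to the centroid

Xtilde = X - X.mean(axis=0)                          # translate so the centroid is the origin
print(np.allclose(Xtilde.mean(axis=0), 0))                       # new centroid is (essentially) zero
print(np.isclose(sample_variance(X), sample_variance(Xtilde)))   # variance unchanged by translation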

Now that the mean of the points is the zero vector, we can reason geometrically.

Here is a picture to show why the distance-minimizing subspace is variance-maximizing.

In this figure,

  • the red point is one example point from $\tilde{X}$,
  • the green point is the origin / centroid, and
  • the blue point is the $k$-dimensional projection of the red point.
In [24]:
# Source: https://liorpachter.wordpress.com/2014/05/26/what-is-principal-component-analysis/
display(Image("images/19-pca_figure2.jpeg", width=500))

The length of the black line is fixed -- it is the distance of the original point from the origin.

So the squared length of the black line is this point's contribution to the sample variance.

Now, regardless of how we shift the green line, a right triangle is formed because the projection is orthogonal.

So -- by virtue of the Pythagorean Theorem, the blue line squared plus the red line squared equals the black line squared.

Which means that when we shift the subspace (green line) so as to minimize the squared distances to all the example points $\tilde{X}$ (blue lines), we automatically maximize the squared distance of all the resulting blue points to the origin (red lines).

And the squared distance of the blue point from the origin (red dashed line) is its contribution to the new $k$-dimensional sample variance.

In other words, the distance-minimizing projection is the variance-maximizing projection!

In [4]:
# Source: https://builtin.com/data-science/step-step-explanation-principal-component-analysis
display(Image("figures/pca.gif", width=800))

Example. Here is a dataset $X$ in $\mathbb{R}^2$.

In [28]:
#
n_samples = 500
C = np.array([[0.1, 0.6], [2., .6]])
X = np.random.randn(n_samples, 2) @ C + np.array([-6, 3])
ax = plt.figure(figsize = (7, 7)).add_subplot()
plt.xlim([-12, 12])
plt.ylim([-7, 7])
centerAxes(ax)
plt.axis('equal')
plt.scatter(X[:, 0], X[:, 1], s=10, alpha=1);

Next, we compute

$$\tilde{X} = X - \mathbf{1}\bar{x}^T.$$

This translates each point so that the new mean is the origin.

In [29]:
#
Xc = X - np.mean(X,axis=0)
ax = plt.figure(figsize = (7, 7)).add_subplot()
plt.xlim([-12, 12])
plt.ylim([-7, 7])
centerAxes(ax)
plt.axis('equal')
plt.scatter(X[:, 0], X[:, 1], s=10, alpha=0.8)
plt.scatter(Xc[:, 0], Xc[:, 1], s=10, alpha=0.8, color='r');

Now, the last step is to find the $k$-dimensional subspace that minimizes the Euclidean distance between the data (red points) and their projections onto the subspace.

In other words, what rank-$k$ matrix $X^{(k)} \in \mathbb{R}^{n \times d}$ is closest to $\tilde{X}$? We seek:

$$X^{(k)} = \arg\min_{\{B \,\mid\, \operatorname{Rank} B = k\}} \left\| \tilde{X} - B \right\|_F.$$

We know how to find this matrix -- via the SVD!

So for this case, let's construct the best 1-D approximation of the mean-centered data:

In [32]:
Xc = X - np.mean(X, axis = 0)
u, s, vt = np.linalg.svd(Xc, full_matrices=False)
print(s)
[44.63537799 11.91004671]
In [33]:
scopy = s.copy()
scopy[1] = 0.
reducedX = u @ np.diag(scopy) @ vt
In [31]:
#
ax = plt.figure(figsize = (7, 7)).add_subplot()
centerAxes(ax)
plt.axis('equal')
plt.scatter(Xc[:,0],Xc[:,1], color='r')
plt.scatter(reducedX[:,0], reducedX[:,1])
endpoints = np.array([[-10],[10]]) @ vt[[0],:]
plt.plot(endpoints[:,0], endpoints[:,1], 'g-');

This method is called Principal Component Analysis.

In summary, PCA consists of:

  1. Mean center the data, and
  2. Reduce the dimension of the mean-centered data via SVD.

This is equivalent to projecting the data onto the hyperplane that captures the maximum variance in the data.

It winds up constructing the best low dimensional linear approximation of the data.

What are "principal components"?

These are nothing more than the columns of $U$ (or the rows of $V^T$). Because they capture the direction of maximum variation, they are called "principal" components.
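As a cross-check (not part of the lecture code), scikit-learn's PCA performs exactly these two steps. Assuming X and vt from the 2-D example above are still defined, its components_ should match the rows of vt up to sign:

In [ ]:
import numpy as np
from sklearn.decomposition import PCA

pca = PCA(n_components=2)
scores = pca.fit_transform(X)    # PCA mean-centers internally, then projects; scores are the 2-D coordinates

# components_ rows are the principal directions; they match rows of vt up to sign
for i in range(2):
    same = np.allclose(pca.components_[i], vt[i]) or np.allclose(pca.components_[i], -vt[i])
    print(f'component {i} matches row {i} of vt (up to sign): {same}')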

There are many uses of PCA (and SVD).

  • Data compression (discussed last time)
  • Visualization
  • Denoising
  • Anomaly Detection

We won't have time to cover anomaly detection today. The basic idea is that rather than trying to find anomalous points in a high-dimensional dataset, we can use PCA to project the points into a lower, $k$-dimensional space and look for anomalies there (a minimal sketch of the idea follows).
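Here is one minimal sketch of that idea, not from the lecture: project into PCA space and score points by a deliberately simple distance from the centroid of the projected data (the data, $k$, and the cutoff are all arbitrary illustrations).

In [ ]:
import numpy as np

def pca_project(X, k):
    """Project the mean-centered rows of X onto the top-k principal components."""
    Xc = X - X.mean(axis=0)
    _, _, Vt = np.linalg.svd(Xc, full_matrices=False)
    return Xc @ Vt[:k].T                     # n x k coordinates in PCA space

rng = np.random.default_rng(3)
X_demo = rng.standard_normal((1000, 50))     # hypothetical high-dimensional data
Z = pca_project(X_demo, k=3)
scores = np.linalg.norm(Z - Z.mean(axis=0), axis=1)
suspects = np.argsort(scores)[-10:]          # the 10 most extreme points (cutoff is arbitrary)
print(suspects)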

34.3 Using PCA for Visualization and Denoising¶

We will study both visualization and denoising in the context of text processing.

As we have seen, a common way to work with documents is to build a matrix whose rows contain all documents and whose columns contain all terms.

You could imagine encoding a dataset as follows:

In [3]:
#
display(Image("images/09-document-term.png", width=1000))

Notice that this representation does not take into account word order or sentence structure. It's an example of a bag of words approach.

The bag of words encoding for a document is to treat the document as a multiset of words. That is, we simply count how many times each word occurs. It is a "bag" because the order of the words in the document is lost.

Surprisingly, we can still tell a lot about the document even without knowing its word ordering.

Counting the number of times each word occurs in a document yields a vector of term frequencies.
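For example, here is a tiny sketch (with a toy corpus, not from the lecture) of building term-frequency vectors with scikit-learn's CountVectorizer:

In [ ]:
from sklearn.feature_extraction.text import CountVectorizer

docs = ["the cat sat on the mat", "the dog sat on the log"]   # toy corpus
cv = CountVectorizer()
counts = cv.fit_transform(docs)

print(cv.get_feature_names_out())
print(counts.toarray())    # one row of term frequencies per document; word order is discarded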

However, simply using the "bag of words" directly has a number of drawbacks. We will adjust our vectors to account for two of them.

  1. If a single document contains many words, then we expect all term frequencies to be high, even for relatively unimportant words. To adjust for this, it often makes sense to normalize the frequency vectors to have unit length.

  2. If a single word occurs in many documents, then it's probably not that useful, so again let's try to reduce this word's weight. The idea here is that rare words are more informative than common words.

This is described directly in the scikit-learn documentation:

In a large text corpus, some words will be very [frequent] (e.g. “the”, “a”, “is” in English) hence carrying very little meaningful information about the actual contents of the document.

If we were to feed the direct count data directly to a classifier those very frequent terms would shadow the frequencies of rarer yet more interesting terms.

In order to re-weight the count features into floating point values suitable for usage by a classifier it is very common to use the tf–idf transform.

Tf means term-frequency while tf–idf means term-frequency times inverse document-frequency.

This was originally a term weighting scheme developed for information retrieval (as a ranking function for search engine results) that has also found good use in document classification and clustering.

Hence, the definition of tf-idf is as follows.

First:

$$\operatorname{tf}(t,d) = \text{the number of times term } t \text{ occurs in document } d$$

Next, if $N$ is the total number of documents in the corpus $D$, then:

$$\operatorname{idf}(t,D) = \frac{N}{\left|\{d \in D : t \in d\}\right|}$$

where the denominator is the number of documents in which the term $t$ appears.

And finally:

$$\text{tf-idf}(t,d) = \operatorname{tf}(t,d) \times \operatorname{idf}(t,D)$$

Matrices that represent text documents will commonly contain TF-IDF scores.
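To make the definition concrete, here is a minimal hand computation on a toy corpus (not from the lecture), following the formulas above literally. Note that scikit-learn's TfidfVectorizer uses a smoothed, logarithmic idf by default, so its numbers will differ.

In [ ]:
corpus = {
    'd1': "the cat sat on the mat".split(),
    'd2': "the dog chased the cat".split(),
    'd3': "dogs and cats".split(),
}
N = len(corpus)

def tf(t, d):
    return corpus[d].count(t)

def idf(t):
    df = sum(1 for words in corpus.values() if t in words)   # documents containing t
    return N / df

def tf_idf(t, d):
    return tf(t, d) * idf(t)

print(tf_idf('cat', 'd1'))   # 1 occurrence, appears in 2 of 3 documents -> 1 * 3/2 = 1.5
print(tf_idf('the', 'd1'))   # 2 occurrences, appears in 2 of 3 documents -> 2 * 3/2 = 3.0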

Often, terms are correlated -- they appear together in combinations that suggest a certain "concept".

That is, term-document matrices often show low effective rank -- many columns can be approximated as combinations of other columns.

When PCA is used for dimensionality reduction of documents, it tends to extract these "concept" vectors.

The application of PCA to term-document matrices is called Latent Semantic Analysis (LSA).

Among other benefits, LSA can improve the performance of clustering of documents.

This happens because the important concepts are captured in the most significant principal components.

Our Dataset: 20 Newsgroups¶

Once again, let's use as an example the text corpus of 18,000 forum posts in 20 newsgroups. We will take a subset of just 3 newsgroups for this example.

In [13]:
from sklearn.datasets import fetch_20newsgroups

categories = ['comp.os.ms-windows.misc', 'sci.space','rec.sport.baseball']
news_data = fetch_20newsgroups(subset='train', categories=categories)
In [14]:
print(news_data.target_names)
print(news_data.target)
['comp.os.ms-windows.misc', 'rec.sport.baseball', 'sci.space']
[2 0 0 ... 2 1 2]

Let's read a few of the forum posts within this dataset.

In [15]:
print(news_data.data[-2])
Organization: University of Notre Dame - Office of Univ. Computing
From: <RVESTERM@vma.cc.nd.edu>
Subject: Re: MLB = NBA?
Lines: 15

In article <1993Apr17.052025.10610@news.yale.edu>, (Sean Garrison) says:
>
>I think that
>players' salaries are getting way out of hand to the point that they're on
>a pace to become severely detrimental to baseball's future.
>

so you want to decrease players' salaries?

so you want to increase owners' salaries?

the two are equivalent.

bob vesterman.


Basic Clustering¶

As a start, let's use $k$-means clustering using tf-idf scores.

We can use the tokenizer within sklearn; it can also compute $n$-grams for $n = 1$ and $2$ if desired. (An $n$-gram is a sequence of $n$ consecutive terms; a short illustration follows.)
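For instance, here is a tiny sketch of unigram-plus-bigram tokenization (toy sentence, not from the lecture; the cell below uses the vectorizer's default of single terms):

In [ ]:
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(ngram_range=(1, 2))     # unigrams and bigrams
cv.fit(["the cat sat on the mat"])
print(cv.get_feature_names_out())            # unigrams plus bigrams such as 'the cat', 'cat sat', ...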

We'll compute a document-term matrix dtm.

In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(stop_words='english', min_df=4,max_df=0.8)
dtm = vectorizer.fit_transform(news_data.data)
In [17]:
print(type(dtm), dtm.shape)
terms = vectorizer.get_feature_names_out()
<class 'scipy.sparse.csr.csr_matrix'> (1781, 9409)

Let's first cluster the documents using the raw tf-idf scores. This is without any use of PCA, and so includes lots of noisy or meaningless terms.

In [18]:
from sklearn.cluster import KMeans
k = 3
kmeans = KMeans(n_clusters=k, init='k-means++', max_iter=100, n_init=10, random_state = 0)
kmeans.fit_predict(dtm)
centroids = kmeans.cluster_centers_
labels = kmeans.labels_
error = kmeans.inertia_

Let's evaluate the clusters. We'll assume that the newsgroup the article came from is the 'ground truth.' Remember that we studied two metrics to evaluate how good our clustering is:

  • Rand Index: If we are testing an algorithm on data for which we know ground truth, the Rand Index allows us to assess the algorithm’s accuracy.
  • Silhouette Score: Even without knowing the ground truth, the Silhouette Score compares the distances between points within a cluster, to the distances between points in different clusters. A higher Silhouette Score relates to a model with “better defined” clusters.
In [19]:
import sklearn.metrics as metrics
ri = metrics.adjusted_rand_score(labels,news_data.target)
ss = metrics.silhouette_score(dtm,kmeans.labels_,metric='euclidean')
print('Rand Index is {}'.format(ri))
print('Silhouette Score is {}'.format(ss))
Rand Index is 0.8162962282193574
Silhouette Score is 0.009438086458856245

Improvement: Stemming¶

One source of noise that we can eliminate (before we use LSA) comes from word endings.

For example: a Google search on 'run' will return web pages on 'running.'

This is useful, because the difference between 'run' and 'running' in practice is not enough to matter.

The usual solution taken is to simply 'chop off' the part of the word that indicates a variation from the base word. This process is called 'stemming.'

A very good stemmer is the "Snowball" stemmer.

You can read more at http://www.nltk.org and http://www.nltk.org/howto/stem.html.

Installation Note: From a cell you need to call nltk.download() and select the appropriate packages from the interface that appears. In particular, you need to download stopwords (from corpora), and punkt and snowball_data (from models).

Let's stem the data using the Snowball stemmer:

In [20]:
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import word_tokenize, sent_tokenize


stemmed_data = [" ".join(SnowballStemmer("english", ignore_stopwords=True).stem(word)  
         for sent in sent_tokenize(message)
        for word in word_tokenize(sent))
        for message in news_data.data]

dtm = vectorizer.fit_transform(stemmed_data)
terms = vectorizer.get_feature_names_out()

And now let's see how well we can cluster on the stemmed data.

In [21]:
from sklearn.cluster import KMeans
k = 3
kmeans = KMeans(n_clusters=k, init='k-means++', max_iter=100, n_init=10,random_state=0)
kmeans.fit_predict(dtm)
centroids = kmeans.cluster_centers_
labels = kmeans.labels_
error = kmeans.inertia_
In [22]:
import sklearn.metrics as metrics
ri = metrics.adjusted_rand_score(labels,news_data.target)
ss = metrics.silhouette_score(dtm,kmeans.labels_,metric='euclidean')
print('Rand Index is {}'.format(ri))
print('Silhouette Score is {}'.format(ss))
Rand Index is 0.844864300823809
Silhouette Score is 0.010807547412327282

So the Rand Index went from 0.816 to 0.845 as a result of stemming.

Demonstrating PCA¶

OK. Now, let's apply PCA.

Our data matrix is in sparse form.

First, we mean center the data. Note that dtm is a sparse matrix, but mean centering destroys the sparsity, so below we convert it to a dense matrix first.

Then we use PCA to reduce the dimension of the mean-centered data.

In [50]:
dtm_dense = dtm.todense()
centered_dtm = dtm_dense - np.mean(dtm_dense, axis=0)
u, s, vt = np.linalg.svd(centered_dtm)

Note that if you have sparse data, you may want to use scipy.sparse.linalg.svds() and for large data it may be advantageous to use sklearn.decomposition.TruncatedSVD().
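For reference, here is a minimal sketch of those sparse alternatives (not run in this lecture). Note that scipy's svds returns singular values in ascending order, and TruncatedSVD does not mean-center the data, so it computes LSA rather than PCA in the strict sense.

In [ ]:
from scipy.sparse.linalg import svds
from sklearn.decomposition import TruncatedSVD

# scipy: top-50 singular triplets of the sparse matrix directly (no densification);
# flip the results since svds returns singular values in ascending order
u50, s50, vt50 = svds(dtm, k=50)
u50, s50, vt50 = u50[:, ::-1], s50[::-1], vt50[::-1]

# sklearn: TruncatedSVD also works directly on the sparse matrix (no mean centering)
svd = TruncatedSVD(n_components=50, random_state=0)
doc_coords = svd.fit_transform(dtm)        # analogous to U_k Sigma_k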

$$
\text{documents} \left\{ \overbrace{\begin{bmatrix} \vdots & \vdots & & \vdots \\ a_1 & a_2 & \cdots & a_n \\ \vdots & \vdots & & \vdots \end{bmatrix}}^{\text{terms}} \right.
=
\overbrace{\begin{bmatrix} \vdots & & \vdots \\ \sigma_1 u_1 & \cdots & \sigma_k u_k \\ \vdots & & \vdots \end{bmatrix}}^{k}
\times
\begin{bmatrix} \cdots & v_1 & \cdots \\ & \vdots & \\ \cdots & v_k & \cdots \end{bmatrix}
\qquad A = U \Sigma V^T
$$

The principal components (the rows of $V^T$) encode the extracted concepts as weights over terms.

Each LSA concept is a linear combination of words.

In [52]:
pd.DataFrame(vt, columns=vectorizer.get_feature_names_out())
Out[52]:
00 000 0005 0062 0096b0f0 00bjgood 00mbstultz 01 0114 01wb ... zri zrlk zs zt zu zv zw zx zy zz
0 0.007831 0.012323 0.000581 0.005558 0.001032 0.002075 0.002008 0.005575 0.001247 0.000813 ... -0.000028 -0.000025 -0.000200 -0.000025 -0.000128 -0.000207 -0.000087 -0.000150 -0.000113 0.000534
1 -0.005990 0.009540 0.002089 -0.010679 -0.001646 -0.003477 -0.002687 0.002143 -0.003394 0.002458 ... -0.000015 -0.000013 -0.000054 -0.000013 -0.000042 -0.000100 -0.000026 -0.000064 -0.000040 -0.001041
2 -0.012630 -0.011904 -0.002443 0.001438 0.000439 0.000044 0.000349 -0.006817 0.000692 -0.001124 ... -0.000095 -0.000086 -0.000289 -0.000087 -0.000252 -0.000576 -0.000134 -0.000293 -0.000204 -0.000013
3 0.013576 0.017639 0.003552 0.001148 0.003354 -0.000410 0.000622 0.011649 0.002237 0.001969 ... 0.000205 0.000186 0.000486 0.000172 0.000464 0.001142 0.000220 0.000508 0.000352 0.000200
4 -0.002254 -0.004619 -0.005458 -0.001938 -0.000251 0.000689 0.000043 -0.002620 -0.000533 0.001434 ... -0.000310 -0.000283 -0.000775 -0.000252 -0.000698 -0.001714 -0.000331 -0.000728 -0.000529 -0.000961
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
8048 0.000020 -0.000030 -0.000339 0.000039 -0.001961 -0.000478 -0.000579 0.000596 -0.000454 -0.000181 ... -0.001449 -0.002008 0.000133 -0.000930 0.000645 0.976198 -0.000030 0.000288 -0.000039 0.000653
8049 0.000135 0.000259 -0.000068 -0.000152 0.001303 -0.000076 -0.000020 -0.000700 -0.000037 -0.000339 ... 0.000114 0.000102 -0.000522 -0.000170 -0.001085 -0.000133 0.999364 -0.000520 -0.000746 -0.000892
8050 -0.000197 0.001024 -0.000109 -0.000631 -0.000236 -0.000363 -0.000234 0.000086 0.000131 0.000369 ... 0.000246 0.000210 -0.001592 -0.000180 -0.000885 0.000152 -0.000515 0.996860 -0.001114 0.000814
8051 0.000012 0.000626 -0.000220 -0.000342 0.000552 -0.000039 -0.000120 -0.000675 -0.000036 0.000133 ... 0.000202 0.000184 -0.000606 -0.000289 -0.001269 -0.000159 -0.000768 -0.001135 0.998707 -0.000869
8052 0.000102 0.000328 0.000567 -0.000249 -0.003388 0.002159 0.001577 -0.001129 0.000365 0.000210 ... -0.000003 0.000018 0.000072 -0.001076 -0.002250 0.000697 -0.000813 0.000834 -0.000757 0.978107

8053 rows × 8053 columns

In [54]:
names = np.array(vectorizer.get_feature_names_out())
for cl in range(3):
    print(f'\nPrincipal Component {cl}:')
    idx = np.array(np.argsort(vt[cl]))[0][-10:]
    for i in idx[::-1]:
        print(f'{names[i]:12s} {vt[cl, i]:0.3f}')
Principal Component 0:
year         0.140
game         0.111
henri        0.108
team         0.107
space        0.106
nasa         0.091
toronto      0.086
alaska       0.083
player       0.079
hit          0.077

Principal Component 1:
space        0.260
nasa         0.218
henri        0.184
gov          0.135
orbit        0.134
alaska       0.129
access       0.129
toronto      0.118
launch       0.109
digex        0.102

Principal Component 2:
henri        0.458
toronto      0.364
zoo          0.228
edu          0.201
spencer      0.194
zoolog       0.184
alaska       0.123
work         0.112
umd          0.096
utzoo        0.092

The rows of $U$ correspond to documents, which are linear combinations of concepts.

Denoising¶

In order to improve our clustering accuracy, we will exclude the less significant concepts from the documents' feature vectors.

That is, we will choose the leftmost $k$ columns of $U$ and the topmost $k$ rows of $V^T$.

The reduced set of columns of $U$ are our new document encodings, and it is those that we will cluster.

In [19]:
plt.xlim([0,50])
plt.xlabel('Number of Principal Components (Rank $k$)')
plt.ylabel('Singular Values')
plt.plot(range(1,len(s)+1), s);

It looks like the first 2 principal components have larger singular values than the rest.

Now let's measure how clustering would work with the first few principal components. Remember the ground truth: the data really come from one of 3 different forums.

In [22]:
news_data.target_names
Out[22]:
['comp.os.ms-windows.misc', 'rec.sport.baseball', 'sci.space']
In [20]:
#
ri = []
ss = []
for k in range(1,50):
    vectorsk = u[:,:k] @ np.diag(s[:k])
    kmeans = KMeans(n_clusters=3, init='k-means++', max_iter=100, n_init=10, random_state=0)
    kmeans.fit_predict(vectorsk)
    labelsk = kmeans.labels_
    ri.append(metrics.adjusted_rand_score(labelsk,news_data.target))
    ss.append(metrics.silhouette_score(vectorsk,kmeans.labels_,metric='euclidean'))
In [21]:
plt.plot(range(1,50),ri)
plt.ylabel('Rand Score',size=20)
plt.xlabel('No of Prin Comps',size=20);
In [23]:
plt.plot(range(1,50),ss)
plt.ylabel('Silhouette Score', size=20)
plt.xlabel('No of Prin Comps', size=20);

Note that we can get good accuracy and coherent clusters with just two principal components.

Visualization¶

That's a good thing, because it means that we can also visualize the data well with the help of PCA.

The challenge of visualization is that the data live in a high dimensional space.

We humans can only look at 2 (or maybe 3) dimensions at a time... but it's often not clear which dimensions to look at.

The idea behind using PCA for visualization is that since low-numbered principal components capture most of the variance in the data, these are the "directions" from which it is most useful to inspect the data.

We saw that the first two principal components were particularly large -- let's start by using them for visualization.

In [24]:
#
import seaborn as sns
Xk = u @ np.diag(s)
with sns.axes_style("white"):
    fig, ax = plt.subplots(1,1,figsize=(7,7))
    cmap = sns.hls_palette(n_colors=3, h=0.35, l=0.4, s=0.9)
    for i, label in enumerate(set(news_data.target)):
        point_indices = np.where(news_data.target == label)[0]
        point_indices = point_indices.tolist()
        plt.scatter(np.ravel(Xk[point_indices, 0]), np.ravel(Xk[point_indices, 1]), s=20, alpha=0.5,
                    color=cmap[i], marker='D', label=news_data.target_names[i])
        plt.legend(loc = 'best')
    sns.despine()
plt.title('Ground Truth Labels', size=20);

Points in this plot have been labelled with their "true" (aka "ground truth") cluster labels.

Notice how clearly the clusters separate and how coherently they present themselves. This is obviously an excellent visualization provided by PCA.

Since this visualization is so clear, we can use it to examine the results of our various clustering methods and get some insight into how they differ.

In [25]:
#
k = 3
kmeans = KMeans(n_clusters=k, init='k-means++', max_iter=100, n_init=10,random_state=0)
kmeans.fit_predict(dtm)
centroids = kmeans.cluster_centers_
labels = kmeans.labels_
error = kmeans.inertia_

with sns.axes_style("white"):
    fig, ax = plt.subplots(1,1,figsize=(7,7))
    cmap = sns.hls_palette(n_colors=3, h=0.35, l=0.4, s=0.9)
    for i in range(k):
        point_indices = np.where(labels == i)[0]
        point_indices = point_indices.tolist()
        plt.scatter(np.ravel(Xk[point_indices, 0]), np.ravel(Xk[point_indices, 1]), s=20, alpha=0.5,
                    color=cmap[i], marker='D', label=news_data.target_names[i])
    sns.despine()
plt.title('Clusters On Full Dataset, Dimension = {}\nRand Score = {:0.3f}'.format(dtm.shape[1],
                                                                             metrics.adjusted_rand_score(labels,news_data.target)),
          size=20);
In [26]:
#
k = 3
kmeans = KMeans(n_clusters=k, init='k-means++', max_iter=100, n_init=10,random_state=0)
kmeans.fit_predict(Xk[:,:2])
centroids = kmeans.cluster_centers_
Xklabels = kmeans.labels_
error = kmeans.inertia_

with sns.axes_style("white"):
    fig, ax = plt.subplots(1,1,figsize=(7,7))
    cmap = sns.hls_palette(n_colors=3, h=0.35, l=0.4, s=0.9)
    for i in range(k):
        point_indices = np.where(Xklabels == i)[0]
        point_indices = point_indices.tolist()
        plt.scatter(np.ravel(Xk[point_indices,0]), np.ravel(Xk[point_indices,1]), s=20, alpha=0.5, color=cmap[i], marker='D')
    sns.despine()
plt.title('Clusters On PCA-reduced Dataset, Dimension = 2\nRand Score = {:0.3f}'.format(
                                                                                 metrics.adjusted_rand_score(Xklabels,news_data.target)),
          size=20);

What happens if we misjudge the number of clusters? Let's try forming 6 clusters instead.

In [29]:
#
k = 6
kmeans = KMeans(n_clusters=k, init='k-means++', max_iter=100, n_init=10,random_state=0)
kmeans.fit_predict(dtm)
centroids = kmeans.cluster_centers_
labels = kmeans.labels_
error = kmeans.inertia_

with sns.axes_style("white"):
    fig, ax = plt.subplots(1,1,figsize=(10,10))
    cmap = sns.hls_palette(n_colors=k, h=0.35, l=0.4, s=0.9)
    for i in range(k):
        point_indices = np.where(labels == i)[0]
        point_indices = point_indices.tolist()
        plt.scatter(np.ravel(Xk[point_indices,0]), np.ravel(Xk[point_indices,1]), s=20, alpha=0.5, color=cmap[i], marker='D')
    sns.despine()
plt.title(f'K means with six clusters on full dataset; Rand Score {metrics.adjusted_rand_score(labels,news_data.target):0.2f}');
In [28]:
#
k = 6
kmeans = KMeans(n_clusters=k, init='k-means++', max_iter=100, n_init=10,random_state=0)
kmeans.fit_predict(Xk[:,:6])
centroids = kmeans.cluster_centers_
labels = kmeans.labels_
error = kmeans.inertia_

with sns.axes_style("white"):
    fig, ax = plt.subplots(1,1,figsize=(10,10))
    cmap = sns.hls_palette(n_colors=k, h=0.35, l=0.4, s=0.9)
    for i in range(k):
        point_indices = np.where(labels == i)[0]
        point_indices = point_indices.tolist()
        plt.scatter(np.ravel(Xk[point_indices,0]), np.ravel(Xk[point_indices,1]), s=20, alpha=0.5, color=cmap[i], marker='D')
    sns.despine()

plt.title(f'K means with six clusters; Rand Score {metrics.adjusted_rand_score(labels,news_data.target):0.2f}');

What about the other principal components? Are they useful for visualization?

A common approach is to look at all pairs of (low-numbered) principal components, to look for additional structure in the data.

Let's form 2-dimensional plots using any two of the first five principal components. The colors in the figures below correspond to the ground truth of which newsgroup each post belongs to.

In [30]:
#
k = 5
Xk = u[:,:k] @ np.diag(s[:k])
X_df = pd.DataFrame(Xk)
g = sns.PairGrid(X_df)
def pltColor(x,y,color):
    cmap = sns.hls_palette(n_colors=3, h=0.35, l=0.4, s=0.9)
    for i in range(3):
        point_indices = np.where(news_data.target == i)[0]
        point_indices = point_indices.tolist()
        plt.scatter(x[point_indices], y[point_indices], color=cmap[i], s = 3)
    sns.despine()
g.map(pltColor);

We see that the first two principal components do indeed do the best job of separating the clusters.
