In [2]:
#
import numpy as np
import scipy as sp
import pandas as pd
import matplotlib as mp
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
import laUtilities as ut
import slideUtilities as sl
import demoUtilities as dm
from matplotlib import animation
from importlib import reload
from datetime import datetime
from IPython.display import Image, display_html, display, Math, HTML;
qr_setting = None

mp.rcParams['animation.html'] = 'jshtml';

Announcements¶

  • Homework:
    • Homework 9 out, due today
  • Final exam: Monday, May 8 from 12-2pm
    • Practice exam will be out today
  • Wednesday's class is review
  • Upcoming office hours:
    • Tomorrow: Peer tutor Rohan Anand from 1:30-3pm on the CCDS 16th floor
    • Tomorrow: Abhishek Tiwari from 3:30-4:30pm on the CCDS 13th floor

Lecture 38: Faster PageRank¶

[This lecture is based on Prof. Crovella's CS 132 lecture notes.]

The PageRank Insight¶

Links are endorsements. When a first page contains a link to a second page, that is an indication that the author of the first page thinks the second page is worth looking at. If the first and second pages both contain the same query terms, it is likely that the second page is an important page with respect to that query term.

In [12]:
#
display(Image("images/17-pagerank-quote.png", width=600))

What Page and Brin are implying is that a random surfer should visit important pages more often and unimportant pages less often.

The way to interpret this precisely is:

  1. Form the graph that encodes the connections between Web pages that are retrieved for a particular query.
  2. Construct a Markov chain that corresponds to a random walk on this graph.
  3. Rank-order the pages according to their probability in the Markov chain's steady state.

So let's try to make this work and see what happens.

Example. Assume a set of Web pages have been selected based on a text query, e.g., pages related to "personal 737 jets."

These pages have various links between them, as represented by this graph:

In [13]:
#
display(Image("images/17-deeper-pagerank-fig.jpg", width=250))

Construct the unique steady-state distribution for a random walk on this graph, if it exists. That is, construct the PageRank for this set of Web pages.

Solution.

The key question we must ask is whether a unique steady state exists.

Step 1.¶

Assume there are $n$ pages to be ranked. Construct an $n \times n$ transition matrix for the Markov chain.

Set the Markov chain transitions so that each outgoing link from a node has equal probability of being taken.

We have already seen the transition matrix for this graph:

$$P = \begin{bmatrix} 0 & 0 & 1/3 & 0 & 0 & 0 \\ 1/2 & 0 & 1/3 & 0 & 0 & 0 \\ 1/2 & 0 & 0 & 0 & 0 & 0 \\ 0 & 0 & 0 & 0 & 1/2 & 1 \\ 0 & 0 & 1/3 & 1/2 & 0 & 0 \\ 0 & 0 & 0 & 1/2 & 1/2 & 0 \end{bmatrix}$$

We have observed that this transition matrix is not regular, because for any power $P^k$ with $k > 0$, the second column will be zero.

To address this, let's ask why it happens.

The reason that column 2 of $P$ is zero is that the Web page corresponding to node 2 has no links embedded in it, so there is nowhere to go from this page. Of course this will happen a lot in an arbitrary collection of Web pages.

Note that Page and Brin say that the random surfer will occasionally "start on another random page." In other words, it seems reasonable that when reaching a page with no embedded links, the surfer chooses another page at random.

So this motivates the first adjustment to $P$:

Step 2.¶

Form the matrix $P'$ as follows: for each column in $P$ that is entirely zeros, replace it with a column in which each entry is $1/n$.

In our example:

$$P = \begin{bmatrix} 0 & 0 & 1/3 & 0 & 0 & 0 \\ 1/2 & 0 & 1/3 & 0 & 0 & 0 \\ 1/2 & 0 & 0 & 0 & 0 & 0 \\ 0 & 0 & 0 & 0 & 1/2 & 1 \\ 0 & 0 & 1/3 & 1/2 & 0 & 0 \\ 0 & 0 & 0 & 1/2 & 1/2 & 0 \end{bmatrix} \rightarrow P' = \begin{bmatrix} 0 & 1/n & 1/3 & 0 & 0 & 0 \\ 1/2 & 1/n & 1/3 & 0 & 0 & 0 \\ 1/2 & 1/n & 0 & 0 & 0 & 0 \\ 0 & 1/n & 0 & 0 & 1/2 & 1 \\ 0 & 1/n & 1/3 & 1/2 & 0 & 0 \\ 0 & 1/n & 0 & 1/2 & 1/2 & 0 \end{bmatrix} = \begin{bmatrix} 0 & 1/6 & 1/3 & 0 & 0 & 0 \\ 1/2 & 1/6 & 1/3 & 0 & 0 & 0 \\ 1/2 & 1/6 & 0 & 0 & 0 & 0 \\ 0 & 1/6 & 0 & 0 & 1/2 & 1 \\ 0 & 1/6 & 1/3 & 1/2 & 0 & 0 \\ 0 & 1/6 & 0 & 1/2 & 1/2 & 0 \end{bmatrix}$$
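
As a concrete illustration (not from the original lecture), here is a minimal numpy sketch of this adjustment, using the $6 \times 6$ transition matrix $P$ shown above:

In [ ]:
# a minimal sketch: replace each all-zero column of P with a column of 1/n
P = np.array([
    [0,   0, 1/3, 0,   0,   0],
    [1/2, 0, 1/3, 0,   0,   0],
    [1/2, 0, 0,   0,   0,   0],
    [0,   0, 0,   0,   1/2, 1],
    [0,   0, 1/3, 1/2, 0,   0],
    [0,   0, 0,   1/2, 1/2, 0]
])
n = P.shape[0]
P_prime = P.copy()
zero_cols = np.isclose(P.sum(axis=0), 0)   # columns that sum to zero (dangling pages)
P_prime[:, zero_cols] = 1/n                # replace every entry in those columns with 1/n
print(P_prime)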

Nonetheless, even after this change, $P'$ can fail to be regular.

In other words, for an arbitrary set of web pages, there is no guarantee that their transition matrix will be regular.

Once again, let's read the words of Page and Brin closely: the surfer "eventually gets bored and starts on another random page."

Step 3.¶

In practice this means that there is a small probability that the surfer will jump from any page to any other page at random.

Let's call this small probability $\alpha$.

We can't just add $\alpha$ to every entry in $P'$, because then the columns of the new matrix would not sum to 1.

Instead we scale each entry in $P'$ by a factor of $(1-\alpha)$, and then add $\alpha/n$ to it.

So we compute the final transition matrix $P''$ as:

$$P''_{ij} = (1-\alpha)P'_{ij} + \frac{\alpha}{n}.$$

We can write this as a matrix equation:

$$P'' = (1-\alpha)P' + \frac{\alpha}{n}\mathbf{1}$$

where $\mathbf{1}$ is an $n \times n$ matrix of 1's.

In our example, let's say that $\alpha = 1/10$ (in reality it would be smaller). So $\alpha/n = 1/60$.

Then:

$$P' = \begin{bmatrix} 0 & 1/6 & 1/3 & 0 & 0 & 0 \\ 1/2 & 1/6 & 1/3 & 0 & 0 & 0 \\ 1/2 & 1/6 & 0 & 0 & 0 & 0 \\ 0 & 1/6 & 0 & 0 & 1/2 & 1 \\ 0 & 1/6 & 1/3 & 1/2 & 0 & 0 \\ 0 & 1/6 & 0 & 1/2 & 1/2 & 0 \end{bmatrix} \rightarrow P'' = \begin{bmatrix} 1/60 & 1/6 & 19/60 & 1/60 & 1/60 & 1/60 \\ 7/15 & 1/6 & 19/60 & 1/60 & 1/60 & 1/60 \\ 7/15 & 1/6 & 1/60 & 1/60 & 1/60 & 1/60 \\ 1/60 & 1/6 & 1/60 & 1/60 & 7/15 & 11/12 \\ 1/60 & 1/6 & 19/60 & 7/15 & 1/60 & 1/60 \\ 1/60 & 1/6 & 1/60 & 7/15 & 7/15 & 1/60 \end{bmatrix}$$
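
Continuing the numpy sketch from Step 2 (again, an illustration rather than the lecture's own code; it assumes the P_prime and n defined in that cell), the same formula gives $P''$ directly:

In [ ]:
# a minimal sketch: P'' = (1 - alpha) P' + (alpha / n) * (matrix of all ones)
alpha = 1/10
P_doubleprime = (1 - alpha) * P_prime + (alpha / n) * np.ones((n, n))
print(P_doubleprime)
print(P_doubleprime.sum(axis=0))   # each column should still sum to 1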

Obviously, $P''$ is regular, because all its entries are positive (they are at least $\alpha/n$).

$P''$ is the Markov chain that Brin and Page defined, and which is used by PageRank to rank pages in response to a Google search.

Step 4.¶

Compute the steady state of $P''$, and rank pages according to their magnitude in the resulting vector.

We can do this by solving $P''x = x$, or we can compute the eigenvectors of $P''$ and use the eigenvector that corresponds to $\lambda = 1$.

For the example $P''$, we find that the steady-state vector is:

$$x = \begin{bmatrix} 0.037 \\ 0.054 \\ 0.041 \\ 0.375 \\ 0.206 \\ 0.286 \end{bmatrix}$$

So the final ranking of pages is: 4, 6, 5, 2, 3, 1.

This is the order that PageRank would display its results, with page 4 at the top of the list.

Let's see how to do Step 4 in Python:

In [15]:
# Here is the P'' matrix as computed in steps 1 through 3.
P = np.array([
[1./60, 1./6, 19./60, 1./60, 1./60,  1./60],
[7./15, 1./6, 19./60, 1./60, 1./60,  1./60],
[7./15, 1./6,  1./60, 1./60, 1./60,  1./60],
[1./60, 1./6,  1./60, 1./60, 7./15, 11./12],
[1./60, 1./6, 19./60, 7./15, 1./60,  1./60],
[1./60, 1./6,  1./60, 7./15, 7./15,  1./60]
])
eigenvalues, eigenvectors = np.linalg.eig(P)
print(np.real(eigenvalues))
[ 1.          0.61008601 -0.08958752 -0.37049849 -0.45       -0.45      ]
In [16]:
# find the location of the largest eigenvalue (1), 
# by computing the indices that would sort the eigenvalues
# from smallest to largest
indices = np.argsort(eigenvalues)
# and take the index of the largest eigenvalue
principal = indices[-1]
print(principal)
0
In [17]:
# using the index of the largest eigenvalue, extract
# the corresponding eigenvector (the steady state vector)
steadyState = np.real(eigenvectors[:,principal])
steadyState = steadyState/np.sum(steadyState)
print(steadyState)
[0.03721197 0.05395735 0.04150565 0.37508082 0.20599833 0.28624589]
In [18]:
# find the order of the pages in the steady state vector
# this function sorts from smallest to largest (reverse of what we want)
reverseOrder = np.argsort(steadyState)
print(reverseOrder)
[0 2 1 4 5 3]
In [19]:
# reverse the order to get the most important page first
# and add one to convert from zero indexing to indexing of example
order = 1 + reverseOrder[::-1]
print('final order = {}'.format(order))
print('importance = {}'.format(steadyState[order-1]))
final order = [4 6 5 2 3 1]
importance = [0.37508082 0.28624589 0.20599833 0.05395735 0.04150565 0.03721197]
In [14]:
#
display(Image("images/17-deeper-pagerank-fig.jpg", width=250))

38.1 Computing PageRank: the Power Method¶

From a mathematical standpoint, we are done!

However, from a Computing & Data Sciences standpoint, there are still some issues.

The most significant issue is simply this: PageRank results must be provided very quickly. Search engines are in competition and speed is a competitive advantage.

Here is an example Google search:

In [6]:
#
display(Image("images/17-sample-google-search.jpg", width=800))

Notice that the search returned about 400,000 results!

Recall that using Gaussian elimination to solve $Ax = b$ takes about $\frac{2}{3}n^3$ operations.

In this case, apparently $n = 400{,}000$.

So computing the PageRank in the straightforward way we've described would take about 42,667,000,000,000,000 operations.

Assuming a 2GHz CPU, that's on the order of eight months.

In [22]:
# months required: total flops / (flops per second * seconds per month)
((2./3)*(400000**3))/((2*10**9)*(3600*24*30))
Out[22]:
8.23045267489712

We need a faster way to compute the PageRank!

Here is an important point: we only need the principal eigenvector (the one corresponding to $\lambda = 1$).

Let's review how a Markov chain gets to steady state. As we discussed at the beginning of the lecture, the state of the chain at any step $k$ is given by

$$x_k = c_1 v_1 \lambda_1^k + c_2 v_2 \lambda_2^k + \cdots + c_n v_n \lambda_n^k.$$

Let's assume that $\lambda_1$ is the eigenvalue 1. If the chain converges to steady state, then we know that all eigenvalues other than $\lambda_1$ are less than 1 in magnitude.

Of course, if $|\lambda_i| < 1$,

$$\lim_{k\to\infty} \lambda_i^k = 0.$$

So:

$$\lim_{k\to\infty} x_k = c_1 v_1.$$

Note that $c_1$ is just a constant that doesn't affect the relative sizes of the components of $x_k$ in the limit of large $k$.

This is another way of stating that the Markov chain goes to steady state no matter what the starting state is.

This observation suggests another way to compute the steady state of the chain:

  1. Start from a random state $x_0$.
  2. Compute $x_{k+1} = Ax_k$ for $k = 0, 1, 2, 3, \ldots$

How do we know when to stop in Step 2?

Since we are looking for a steady state, we can stop when the difference between $x_{k+1}$ and $x_k$ is small.

This is called the power method.

Why is this a better method?

Keep in mind that the number of flops in matrix-vector multiplication is about $2n^2$.

This is compared to $\frac{2}{3}n^3$ for solving a system (finding the eigenvector directly).

Let's say that after computing

$$x_1 = Ax_0,\quad x_2 = Ax_1,\quad x_3 = Ax_2,\quad \ldots,\quad x_{10} = Ax_9$$

we find that $x_{10}$ is sufficiently close to $x_9$.

How much work did we do?

We did 10 matrix-vector multiplications, or $20n^2$ flops.

So the power method is

$$\frac{\frac{2}{3}n^3}{20n^2} = \frac{n}{30}$$

times faster than the direct method.

For our example, $n/30 = 13{,}333$. So this trick reduces the running time from 8 months down to 27 minutes.

In [23]:
# minutes required for 10 matrix-vector multiplies (20 n^2 flops) at 2 GHz
20*400000.**2/((2*10**9)*(60))
Out[23]:
26.666666666666668
In [24]:
# Given time, talk about sparse matrix-vector multiply.
# This is linear in n, with a constant equal to average degree (say 10)
# and gets the computation down to 40 milliseconds
# but that requires explaining why P is not really dense
# (it can be expressed as a sparse matrix plus a constant matrix)
20*10*400000./((2*10**9))
Out[24]:
0.04

This is an example of an iterative method. Iterative methods are often the preferred approach for solving linear algebra problems in the real world.

One final thing: how exactly do we decide when to stop iterating in the power method?

One simple way is to add up the absolute differences of the components of $x_{k+1} - x_k$:

$$s = \sum_{i=1}^{n} |x_{k+1,i} - x_{k,i}|$$

and compare it to the sum of the absolute values of the components of $x_k$:

$$d = \sum_{i=1}^{n} |x_{k,i}|$$

If $s/d$ is small (say, less than 0.001), then we can conclude that $x_{k+1}$ is close enough to $x_k$ for us to stop iterating.
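
Here is a minimal sketch of the power method using this stopping rule, written as a hypothetical helper function and applied to the matrix P (that is, $P''$) defined in the code cell above; the tolerance of 0.001 and the uniform starting vector are illustrative choices:

In [ ]:
# a minimal power-method sketch using the s/d stopping rule described above
def power_method(A, tol=0.001, max_iter=1000):
    n = A.shape[0]
    x = np.ones(n) / n                      # start from the uniform distribution
    for _ in range(max_iter):
        x_new = A @ x
        s = np.sum(np.abs(x_new - x))       # total change between iterates
        d = np.sum(np.abs(x))               # size of the current iterate
        x = x_new
        if s / d < tol:
            break
    return x / np.sum(x)                    # normalize so the entries sum to 1

print(power_method(P))   # should be close to the steady state computed earlier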

So the power method is fast, making it the algorithm of choice for a company like Google. It is also easy to implement, and easy to parallelize across multiple machines.

38.2 Course Evaluation¶

BU requires that we devote some class time to course evaluations. Your input helps CDS grow!

https://bu.campuslabs.com/courseeval/

In [5]:
#
display(Image("images/ds121s23-course-evals.png", width=250))

Exam review¶

Linear independence¶

A set of vectors $\{v_1, \ldots, v_p\}$, all of which are in $\mathbb{R}^n$, is said to be linearly dependent if there exist weights $\{c_1, \ldots, c_p\}$, not all zero, such that

$$c_1 v_1 + \cdots + c_p v_p = 0.$$

A set $S = \{v_1, \ldots, v_p\}$ of two or more vectors is linearly dependent if and only if at least one of the vectors in $S$ is a linear combination of the others.
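
A quick way to check this numerically (an illustration with made-up vectors, not from the lecture) is to compare the rank of the matrix whose columns are the vectors to the number of vectors:

In [ ]:
# a quick check for linear dependence: rank < number of vectors
# (made-up example where v3 = v1 + v2)
V = np.column_stack(([1, 0, 1], [2, 1, 0], [3, 1, 1]))
print(np.linalg.matrix_rank(V))   # rank 2 with 3 vectors, so the set is linearly dependent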

Gaussian Elimination¶

Definition. A matrix is in echelon form (or row echelon form) if it has the following three properties:

  1. All nonzero rows are above any rows of all zeros.
  2. Each leading entry of a row is in a column to the right of the leading entry of the row above it.
  3. All entries in a column below a leading entry are zeros.

For example:

$$\begin{bmatrix} 0 & \blacksquare & * & * & * & * & * & * & * & * \\ 0 & 0 & 0 & \blacksquare & * & * & * & * & * & * \\ 0 & 0 & 0 & 0 & \blacksquare & * & * & * & * & * \\ 0 & 0 & 0 & 0 & 0 & \blacksquare & * & * & * & * \\ 0 & 0 & 0 & 0 & 0 & 0 & 0 & 0 & \blacksquare & * \\ 0 & 0 & 0 & 0 & 0 & 0 & 0 & 0 & 0 & 0 \end{bmatrix}$$

In this diagram, the leading entries ($\blacksquare$) are nonzero, and the $*$ symbols can be any value.

This definition is a special case of an upper triangular matrix.

The goal of the first step of Gaussian elimination is to convert the augmented matrix into echelon form.

Definition: A matrix is in reduced echelon form (or reduced row echelon form) if it is in echelon form, and additionally:

  1. The leading entry in each nonzero row is 1.
  2. Each leading 1 is the only nonzero entry in its column.

For example:

$$\begin{bmatrix} 0 & 1 & * & 0 & 0 & 0 & * & * & 0 & * \\ 0 & 0 & 0 & 1 & 0 & 0 & * & * & 0 & * \\ 0 & 0 & 0 & 0 & 1 & 0 & * & * & 0 & * \\ 0 & 0 & 0 & 0 & 0 & 1 & * & * & 0 & * \\ 0 & 0 & 0 & 0 & 0 & 0 & 0 & 0 & 1 & * \\ 0 & 0 & 0 & 0 & 0 & 0 & 0 & 0 & 0 & 0 \end{bmatrix}$$

The goal of the second step of Gaussian elimination is to convert the matrix into reduced echelon form.

Example 1¶

In our first example, let the input matrix $A$ be

$$\begin{bmatrix} 0 & 3 & 4 & -5 \\ 3 & -7 & 8 & 9 \\ 3 & -9 & 6 & 15 \end{bmatrix}$$

Stage 1 (Elimination)

Start with the first row ($i=1$). The leftmost nonzero entry in row 1 or below is in column 1. But since it is not in row 1, we need to swap. We'll swap rows 1 and 3 (we could have swapped rows 1 and 2).

$$\begin{bmatrix} 3 & -9 & 6 & 15 \\ 3 & -7 & 8 & 9 \\ 0 & 3 & 4 & -5 \end{bmatrix}$$

The pivot is the 3 in row 1, column 1. Use row reduction operations to create zeros below the pivot. In this case, that means subtracting row 1 from row 2.

$$\begin{bmatrix} 3 & -9 & 6 & 15 \\ 0 & 2 & 2 & -6 \\ 0 & 3 & 4 & -5 \end{bmatrix}$$

Now $i=2$. The pivot is the 2 in row 2, column 2 (no swap is needed). Use row reduction to create zeros below the pivot. To do so, we subtract $3/2$ times row 2 from row 3.

$$\begin{bmatrix} 3 & -9 & 6 & 15 \\ 0 & 2 & 2 & -6 \\ 0 & 0 & 1 & 4 \end{bmatrix}$$

Now $i=3$. Since it is the last row, we are done with Stage 1. The pivots are the leading entries 3, 2, and 1:

$$\begin{bmatrix} 3 & -9 & 6 & 15 \\ 0 & 2 & 2 & -6 \\ 0 & 0 & 1 & 4 \end{bmatrix}$$

Stage 2 (Backsubstitution)

Start again with the first row ($i=1$) and divide row 1 by its pivot.

$$\begin{bmatrix} 1 & -3 & 2 & 5 \\ 0 & 2 & 2 & -6 \\ 0 & 0 & 1 & 4 \end{bmatrix}$$

Move to the next row ($i=2$) and divide row 2 by its pivot.

$$\begin{bmatrix} 1 & -3 & 2 & 5 \\ 0 & 1 & 1 & -3 \\ 0 & 0 & 1 & 4 \end{bmatrix}$$

And use row reduction operations to create zeros in all elements above the pivot. In this case, that means adding 3 times row 2 to row 1.

$$\begin{bmatrix} 1 & 0 & 5 & -4 \\ 0 & 1 & 1 & -3 \\ 0 & 0 & 1 & 4 \end{bmatrix}$$

Move to the next row ($i=3$). The pivot is already 1, so we subtract row 3 from row 2, and subtract 5 times row 3 from row 1.

$$\begin{bmatrix} 1 & 0 & 0 & -24 \\ 0 & 1 & 0 & -7 \\ 0 & 0 & 1 & 4 \end{bmatrix}$$

This matrix is in reduced row echelon form, so we are done with GE. The solution is:

$$x_1 = -24,\quad x_2 = -7,\quad x_3 = 4$$
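
As a quick check (not part of the original worked example), numpy's solver on the coefficient matrix and right-hand side taken from the augmented matrix above gives the same answer:

In [ ]:
# verify Example 1 numerically
A = np.array([[0, 3, 4],
              [3, -7, 8],
              [3, -9, 6]])
b = np.array([-5, 9, 15])
print(np.linalg.solve(A, b))   # expect approximately [-24, -7, 4]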

Example 2¶

Let's assume that the augmented matrix of a system has been transformed into the equivalent reduced echelon form:

$$\begin{bmatrix} 1 & 0 & -5 & 1 \\ 0 & 1 & 1 & 4 \\ 0 & 0 & 0 & 0 \end{bmatrix}$$

This system is consistent. Is the solution unique?

The associated system of equations is

$$\begin{aligned} x_1 - 5x_3 &= 1 \\ x_2 + x_3 &= 4 \\ 0 &= 0 \end{aligned}$$

Variables $x_1$ and $x_2$ correspond to pivot columns. They are called basic variables. The other variable, $x_3$, is a free variable.

Whenever a system is consistent, the solution set can be described explicitly by solving the reduced system of equations for the basic variables in terms of the free variables.

This operation is possible because the reduced echelon form places each basic variable in one and only one equation.

In the example, solve the first and second equations for $x_1$ and $x_2$. Ignore the third equation; it offers no restriction on the variables.

So the solution set is:

$$\begin{aligned} x_1 &= 1 + 5x_3 \\ x_2 &= 4 - x_3 \\ x_3 &\text{ is free} \end{aligned}$$

"x3x3 is free" means you can choose any value for x3x3.

In other words, there is an infinite set of solutions to this linear system. Each solution corresponds to one particular value of $x_3$.

For instance,

  • when $x_3 = 0$, the solution is $(1, 4, 0)$;
  • when $x_3 = 1$, the solution is $(6, 3, 1)$.

How many solutions will a system have?¶

$$\begin{bmatrix} 1 & 0 & -5 & 1 \\ 0 & 1 & 1 & 4 \\ 0 & 0 & 0 & 1 \end{bmatrix}$$
$$\begin{bmatrix} 1 & 0 & -2 & 3 & 0 & -24 \\ 0 & 1 & -2 & 2 & 0 & -7 \\ 0 & 0 & 0 & 0 & 1 & 4 \end{bmatrix}$$
$$\begin{bmatrix} 1 & 0 & 0 & 1 \\ 0 & 1 & 0 & -2 \\ 0 & 0 & 1 & 0 \end{bmatrix}$$
$$\begin{bmatrix} 2 & -3 & 1 \\ 3 & -2 & 4 \\ 1 & -1 & 1 \end{bmatrix}$$

Given a system of $m$ equations in $n$ unknowns, let $A$ be the $m \times (n+1)$ augmented matrix. Let $r$ be the number of pivot positions in the REF of $A$.

  • If $r = n$, there is a unique solution (no parameters in the solution).
  • If $r > n$ (so $r = n+1$), the system is inconsistent (no solution).
  • If $r < n$, either the system is inconsistent (no solution) or it has a solution with $n - r$ parameters.

Vector geometry¶

Length is such a fundamental concept that we introduce a special notation and name for it.

Definition. The norm of $v$ is the nonnegative scalar $\|v\|$ defined by

$$\|v\| = \sqrt{v^T v} = \sqrt{\sum_{i=1}^n v_i^2}.$$

Definition. For $u$ and $v$ in $\mathbb{R}^n$, the distance between $u$ and $v$, written as $\text{dist}(u,v)$, is the length of the vector $u - v$. That is,

$$\text{dist}(u,v) = \|u - v\|.$$

This definition agrees with the usual formulas for the Euclidean distance between two points. The usual formula is

$$\text{dist}(u,v) = \sqrt{(v_1-u_1)^2 + (v_2-u_2)^2 + \cdots + (v_n-u_n)^2}.$$

Which you can see is equal to

$$\|u-v\| = \sqrt{(u-v)^T(u-v)} = \sqrt{\begin{bmatrix} u_1-v_1 & u_2-v_2 & \ldots & u_n-v_n \end{bmatrix} \begin{bmatrix} u_1-v_1 \\ u_2-v_2 \\ \vdots \\ u_n-v_n \end{bmatrix}}$$

Definition. Two vectors $u$ and $v$ in $\mathbb{R}^n$ are orthogonal to each other if $u^T v = 0$.
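
Here is a small numpy illustration of these three definitions, with made-up vectors:

In [ ]:
# norm, distance, and orthogonality on made-up vectors
u = np.array([3.0, 4.0])
v = np.array([1.0, 1.0])
print(np.linalg.norm(u))        # ||u|| = 5.0
print(np.linalg.norm(u - v))    # dist(u, v) = ||u - v||
w = np.array([-4.0, 3.0])
print(u @ w)                    # u^T w = 0, so u and w are orthogonal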

Clustering¶

Clustering is a very important way of discovering structure in data. It represents an example of unsupervised learning.

  • Supervised methods: Data items have labels, and we want to learn a function that correctly assigns labels to new data items.

  • Unsupervised methods: Data items do not have labels, and we want to learn a function that extracts important patterns from the data.

The high-level goal of our clustering algorithm is to:

  • minimize intra-cluster distances
  • maximize inter-cluster distances

Centroid and Variance¶

To measure how close or different many vectors are in space, let's define the idea of a dataset centroid.

Given $n$ different vectors $x_1, \ldots, x_n$, each with the same dimension $d$, the centroid is the average of the vectors taken componentwise.

$$\bar{x} = \frac{1}{n}\sum_i x_i.$$

In other words: the centroid is the "center of mass" of the dataset.

Next, we define the sample variance of a dataset $\{x_1, \ldots, x_n\}$ as:

$$\operatorname{Var}(X) = \frac{1}{n}\sum_j \|x_j - \bar{x}\|^2.$$

In other words, the sample variance of the set of points is the average squared distance from each point to the centroid.
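
A quick numpy illustration of the centroid and sample variance, using a small made-up dataset whose rows are the points:

In [ ]:
# centroid and sample variance of a small made-up dataset (one point per row)
X = np.array([[1.0, 2.0],
              [3.0, 4.0],
              [5.0, 0.0]])
centroid = X.mean(axis=0)                               # componentwise average
variance = np.mean(np.sum((X - centroid)**2, axis=1))   # mean squared distance to the centroid
print(centroid, variance)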

$k$-means Problem:¶

Find $k$ points $c_1, \ldots, c_k$ (called centers, centroids, or means), so that the cost

$$\sum_{i=1}^n \min_j \|x_i - c_j\|^2$$

is minimized.

Equivalently: we can think in terms of the partition itself.

Consider the set $X = \{x_1, \ldots, x_n\}$ where $x_i \in \mathbb{R}^d$.

Find $k$ points $c_1, \ldots, c_k$

and partition the set $X$ into $k$ different subsets $\{X_1, \ldots, X_k\}$ by assigning each point $x_i$ to its nearest cluster center,

so that the cost

$$\sum_{i=1}^n \min_j \|x_i - c_j\|^2 = \sum_{j=1}^k \sum_{x \in X_j} \|x - c_j\|^2$$

is minimized.

The (approximate) $k$-means algorithm:

  1. Pick $k$ cluster centers $\{c_1, \ldots, c_k\}$. These can be chosen randomly, or by some other method.
  2. For each $j$, define the cluster $X_j$ as the set of points in $X$ that are closest to center $c_j$ (nearer to $c_j$ than to any other center).
  3. For each $j$, redefine $c_j$ to be the center of mass of cluster $X_j$ (in other words, $c_j$ is the mean of the vectors in $X_j$).
  4. Repeat (i.e., go to Step 2) until convergence.
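
For reference, here is a minimal sketch of running $k$-means with scikit-learn; the synthetic blob data and the parameter choices are illustrative, not from the lecture:

In [ ]:
# a minimal k-means sketch with scikit-learn (synthetic data for illustration)
from sklearn.datasets import make_blobs
from sklearn.cluster import KMeans

X_demo, y_demo = make_blobs(n_samples=150, centers=3, random_state=0)
km = KMeans(n_clusters=3, n_init=10, random_state=0).fit(X_demo)
print(km.cluster_centers_)   # the k centers c_1, ..., c_k
print(km.inertia_)           # the k-means cost: sum of squared distances to nearest center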

In [8]:
#
display(Image("images/20-kmeans-example.png", width=600))

Limitations of $k$-means¶

As you can see, $k$-means can work very well.

However, we don't have any guarantees on the performance of $k$-means.

In particular, there are various settings in which $k$-means can fail to do a good job.

  1. $k$-means tries to find spherical clusters.

    Because each point is assigned to its closest center, the points in a cluster are implicitly assumed to be arranged in a sphere around the center.

In [15]:
#
display(Image("images/20-kmeans-nonspherical-clusters.png", width=600))
  2. $k$-means tries to find equal-sized clusters.

    For the same reason, the sizes of clusters are implicitly assumed to be approximately equal.

In [16]:
#
display(Image("images/20-kmeans-cluster-size.png", width=600))
  3. $k$-means is sensitive to the starting cluster centers.

    If the initial guess (Step 1) is a bad one, $k$-means may get "stuck" in a bad solution.

In [17]:
#
display(Image("images/20-kmeans-bad-initialization.png", width=600))

Evaluating $k$-means¶

We discussed two metrics to test how well $k$-means clustering has done... and as a result, to find the best $k$.

  • If ground truth is known, we can use the Rand Index: a similarity measure that allows us to compare the ground truth $T$ versus the result of $k$-means clustering $C$.

    If $a$ denotes the number of pairs of points that have the same label in both $T$ and $C$, and $b$ denotes the number of pairs that have different labels in both $T$ and $C$, then the Rand Index is:

$$RI(T,C) = \frac{a + b}{\binom{n}{2}}$$
In [14]:
#
figs, axs = plt.subplots(1, 2, figsize = (12, 5))
df_rand_gt.plot('X', 'Y', kind = 'scatter', c = 'label', colormap='viridis', ax = axs[0],
                   colorbar = False)
axs[0].set_title('Ground Truth (T)')
axs[0].set_axis_off()
df_rand_clust.plot('X', 'Y', kind = 'scatter', c = 'label', colormap='viridis', ax = axs[1],
                  colorbar = False)
axs[1].set_title('Clustering (C)')
axs[1].set_axis_off();
  • If ground truth is not known, we can use the Silhouette Coefficient: an attempt to measure when a model produces "better defined" clusters, in the sense that points in the same cluster are close to each other and far from the other clusters.

    If $a$ equals the mean distance between a data point and all other points in the same cluster, and $b$ equals the mean distance between a data point and all other points in the next nearest cluster, then the Silhouette Coefficient for a clustering is:

$$s = \frac{b - a}{\max(a, b)}$$
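
Both measures are available in scikit-learn. Here is a minimal sketch on synthetic data (the data and model fit are illustrative; rand_score requires scikit-learn 0.24 or later):

In [ ]:
# Rand Index and Silhouette Coefficient with scikit-learn (synthetic data)
from sklearn.datasets import make_blobs
from sklearn.cluster import KMeans
from sklearn.metrics import rand_score, silhouette_score

X_demo, y_true = make_blobs(n_samples=150, centers=3, random_state=0)
labels = KMeans(n_clusters=3, n_init=10, random_state=0).fit_predict(X_demo)

print(rand_score(y_true, labels))         # compares the clustering C to ground truth T
print(silhouette_score(X_demo, labels))   # needs no ground truth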

Hierarchical Clustering¶

A hierarchical clustering produces a set of nested clusters organized into a tree.

A hierarchical clustering is visualized using a dendrogram

  • A tree-like diagram that records the containment relations among clusters.
In [5]:
#
display(Image("images/21-dendrogram.png", width=600))

Strengths of Hierarchical Clustering¶

Hierarchical clustering has a number of advantages:

First, a hierarchical clustering encodes many different clusterings. That is, it does not itself decide on the correct number of clusters.

A clustering is obtained by "cutting" the dendrogram at some level.

This means that you can make this crucial decision yourself, by inspecting the dendrogram.

Put another way, you can obtain any desired number of clusters.

In [6]:
#
display(Image("images/21-dendrogram-cut.png", width=600))

The second advantage is that the dendrogram may itself correspond to a meaningful structure, for example, a taxonomy.

In [7]:
#
display(Image("images/21-animal-taxonomy.jpg", width=600))

Building a dendrogram¶

Agglomerative Clustering ("bottom-up"):

  • Start by defining each point as its own cluster
  • At each successive step, merge the two clusters that are closest to each other (according to some rule to be determined)
  • Repeat until only one cluster is left.

Divisive Clustering ("top-down"):

  • Start with one, all-inclusive cluster
  • At each step, find the cluster split that creates the largest distance between resulting clusters
  • Repeat until each point is in its own cluster.
In [9]:
# Single, Complete, and Average Linkage
display(Image("images/21-hierarchical-criteria.png", width=1000))
  • Single-Linkage: the distance between two clusters is the distance between the closest two points that are in different clusters.
    $$D_{\text{single}}(i,j) = \min_{x,y}\{d(x,y) \mid x \in C_i, y \in C_j\}$$
  • Complete-Linkage: the distance between two clusters is the distance between the farthest two points that are in different clusters.
    $$D_{\text{complete}}(i,j) = \max_{x,y}\{d(x,y) \mid x \in C_i, y \in C_j\}$$
  • Average-Linkage: the distance between two clusters is the average distance between all pairs of points from different clusters.
    $$D_{\text{average}}(i,j) = \frac{1}{|C_i| \cdot |C_j|}\sum_{x \in C_i,\, y \in C_j} d(x,y)$$
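
These linkage rules are implemented in scipy's hierarchical clustering module; here is a minimal sketch on a small made-up dataset (the points and the choice of average linkage are illustrative):

In [ ]:
# a minimal agglomerative clustering sketch with scipy (made-up 2-D points)
from scipy.cluster.hierarchy import linkage, dendrogram, fcluster

pts = np.array([[0.0, 0.0], [0.1, 0.2], [0.2, 0.1],
                [5.0, 5.0], [5.1, 5.2], [5.2, 4.9]])
Z = linkage(pts, method='average')              # 'single' and 'complete' also work
dendrogram(Z)                                   # draw the dendrogram
print(fcluster(Z, t=2, criterion='maxclust'))   # cut the tree into 2 clusters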

Supervised Learning¶

The supervised learning problem in general is:

  • You are given some example data, which we'll think of abstractly as tuples $\{(x_i, y_i) \mid i = 1, \ldots, N\}$.

  • Your goal is to learn a rule that allows you to predict $y_j$ for some $x_j$ that is not in the example data you were given.

When the values we are predicting are continuous, we call this a regression problem.

The class of models we looked at is polynomials. They are of the form:

$$y(x, \mathbf{w}) = w_0 + w_1 x + w_2 x^2 + \cdots + w_k x^k = \sum_{j=0}^k w_j x^j$$

where $k$ is the order of the polynomial.

Choosing $k$ is called model selection, and $w_0, w_1, \ldots, w_k$ are called the parameters of the model.
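
The helper functions design_matrix and fit_poly used in the cells below are not shown in this excerpt. Here is one possible implementation, a least-squares fit against the polynomial design matrix (an assumption about how they work, not the lecture's own code):

In [ ]:
# possible implementations of the helpers used below (assumed, not the original code)
def design_matrix(x, k):
    """One row per data point; columns are x**0, x**1, ..., x**k."""
    return np.array([[xi**j for j in range(k + 1)] for xi in x])

def fit_poly(x, y, k):
    """Least-squares fit of a degree-k polynomial to (x, y); returns the weight vector w."""
    X = design_matrix(x, k)
    w, *_ = np.linalg.lstsq(X, y, rcond=None)
    return w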

In [96]:
#
fig, axs = plt.subplots(1, 3, sharey = True, figsize = (12, 5))
#
cy = 1000 * [w_hat_0]
pred_y = N * [w_hat_0]
axs[0].plot(cx, cy, lw = 2, label = r'$k$ = 0')
axs[0].plot(x, y, 'ro', markersize = 8, fillstyle = 'none')
axs[0].set_xlabel('x', size = 16)
axs[0].set_ylabel('y', size = 16)
axs[0].set_title(r'$k$ = 0, constant' + '\n' + r'$E(\mathbf{w})$ =' + ' {:0.2f}'.format(np.linalg.norm(y - pred_y)))
#axs[0].legend(loc = 'best', fontsize = 16)
#
cy = design_matrix(cx, 1) @ w_hat_1
pred_y = design_matrix(x, 1) @ w_hat_1
axs[1].plot(cx, cy, lw = 2, label = r'$k$ = 1')
axs[1].plot(x, y, 'ro', markersize = 8, fillstyle = 'none')
axs[1].set_xlabel('x', size = 16)
axs[1].set_title(r'$k$ = 1, linear' + '\n' + r'$E(\mathbf{w})$ =' + ' {:0.2f}'.format(np.linalg.norm(y - pred_y)))
#axs[1].legend(loc = 'best', fontsize = 16)
#
cy = design_matrix(cx, 3) @ w_hat_3
pred_y = design_matrix(x, 3) @ w_hat_3
axs[2].plot(cx, cy, lw = 2, label = r'$k$ = 3')
axs[2].plot(x, y, 'ro', markersize = 8, fillstyle = 'none')
axs[2].set_xlabel('x', size = 16)
axs[2].set_title('$k$ = 3, cubic' + '\n' + r'$E(\mathbf{w})$ =' + ' {:0.2f}'.format(np.linalg.norm(y - pred_y)))
#axs[2].legend(loc = 'best', fontsize = 16)
#
fig.tight_layout();

If we overfit our model to our training data, this leads to poor generalizability. To avoid overfitting, we might:

  1. Increase the amount of training data.

  2. Limit the complexity of the model.

In [97]:
#
w_hat_9 = fit_poly(x, y, 9)
cy = design_matrix(cx, 9) @ w_hat_9
plt.plot(cx, cy, lw = 2, label = r'$k$ = 9')
plt.plot(x, y, 'ro', markersize = 8, fillstyle = 'none')
plt.xlabel('x', size = 16)
plt.ylabel('y', size = 16)
plt.title(r'$k$ = 9' + '\n' + r'$E(\mathbf{w})$ =' + ' {:0.2f}'.format(0));

We can hold out data to test our model selection. For example, we might do the following:

  • For each possible value of the hyperparameter $k$:
    • randomly split the data 5 times
    • compute the mean testing and training error over the 5 random splits
In [98]:
#
test_y = np.sin(2 * np.pi * x) + default_rng(8).normal(size = N, scale = 0.20)
max_k = N
train_err = [np.linalg.norm(y - N * [w_hat_0])]
test_err = [np.linalg.norm(test_y - N * [w_hat_0])]
for k in range(1, max_k):
    w_hat = fit_poly(x, y, k)
    pred_y = design_matrix(x, k) @ w_hat
    train_err.append(np.linalg.norm(y - pred_y))
    test_err.append(np.linalg.norm(test_y - pred_y))
plt.plot(range(max_k), test_err, 'ro-', label = 'Testing Error')
plt.plot(range(max_k), train_err, 'bo-', label = 'Training Error')
plt.xlabel(r'$k$', size = 16)
plt.ylabel(r'$E(\mathbf{w}^*)$')
plt.legend(loc = 'best');

In cross-validation, the data is partitioned once, and then each partition is used as the test data once.

This ensures that all the data gets equal weight in the training and in the testing.

We divide the data into $k$ "folds".

In [4]:
#
display(Image("images/22-k-fold.png", width=500))

Decision trees¶

In [7]:
#
display(Image("images/22-DT-Example-2.png", width=800))

Hunt's Algorithm¶

Hunt's Algorithm builds the tree node by node, starting from the root.

As we build the tree, we divide the training data up.

Let $D_t$ be the set of training records that reach node $t$.

  • If $D_t$ contains records that all belong to a single class $y_t$, then $t$ is a leaf node labeled as $y_t$.
  • If $D_t$ is an empty set, then $t$ is a leaf node labeled by the default class $y_d$.
  • If $D_t$ contains records that belong to more than one class, use an attribute to split $D_t$ into smaller subsets, and assign that splitting rule to node $t$.

Recursively apply the above procedure until a stopping criterion is met.
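
Standard decision-tree learners such as scikit-learn's DecisionTreeClassifier are built on this same recursive-splitting idea. Here is a minimal sketch on the iris dataset (the dataset and parameters are illustrative choices, not from the lecture):

In [ ]:
# a minimal decision-tree sketch with scikit-learn (illustrative data and parameters)
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier

iris = load_iris()
tree = DecisionTreeClassifier(max_depth=3, random_state=0)
tree.fit(iris.data, iris.target)
print(tree.predict(iris.data[:5]))   # predicted classes for the first 5 records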

$k$-Nearest Neighbors¶

In [4]:
#
plt.scatter(demo_X[:,0], demo_X[:,1], c=demo_y, cmap=cmap_bold)
plt.plot(test_X[0], test_X[1], 'ok')
plt.annotate('Test Point', test_X, [75, 25], 
             textcoords = 'offset points', fontsize = 14, 
             arrowprops = {'arrowstyle': '->'})
plt.axis('equal')
plt.axis('off')
plt.title('Training Points: 2 Classes');
In [5]:
#
plt.scatter(demo_X[:,0], demo_X[:,1], c=demo_y, cmap=cmap_bold)
plt.plot(test_X[0], test_X[1], 'ok')
ax=plt.gcf().gca()
circle = mp.patches.Circle(test_X, 0.5, facecolor = 'red', alpha = 0.2)
plt.axis('equal')
plt.axis('off')
ax.add_artist(circle)
plt.title('1-Nearest-Neighbor: Classification: Red');
In [6]:
#
plt.scatter(demo_X[:,0], demo_X[:,1], c=demo_y, cmap=cmap_bold)
test_X = [-0.3, 0.7]
plt.plot(test_X[0], test_X[1], 'ok')
ax=plt.gcf().gca()
circle = mp.patches.Circle(test_X, 0.9, facecolor = 'gray', alpha = 0.3)
plt.axis('equal')
plt.axis('off')
ax.add_artist(circle)
plt.title('2-Nearest-Neighbor');
In [7]:
#
plt.figure()
ax=plt.gcf().gca()
circle = mp.patches.Circle(test_X, 1.4, facecolor = 'blue', alpha = 0.2)
ax.add_artist(circle)
plt.scatter(demo_X[:,0], demo_X[:,1], c=demo_y, cmap=cmap_bold)
test_X = [-0.3, 0.7]
plt.plot(test_X[0], test_X[1], 'ok')
plt.axis('equal')
plt.axis('off')
plt.title('3-Nearest-Neighbor: Classification: Blue');

Challenges for $k$-NN¶

Working with a $k$-NN classifier can involve some challenges.

  1. First, the computational cost of classification grows with the size of the training data. Training is trivial but classification can be prohibitively expensive.
  2. Second, since Euclidean distance is the most common distance function used, data scaling is important.
  3. Third, we need to deal with the curse of dimensionality -- problems arise when we use geometric algorithms in high-dimensional space.

Evaluating Classification Methods¶

Next we'll look at two classification methods in practice:

  • Decision Trees, and
  • k-Nearest Neighbors.

To compare these methods, the question arises:

How do we evaluate a classifier?

Using the confusion matrix we can define some useful measures (a short scikit-learn sketch follows the figure below):

  • Recall - defined as the fraction of actual positives that are correctly classified:
    • TP/(TP + FN)
  • Precision - defined as the fraction of predicted positives that are actually positive:
    • TP/(TP + FP)
In [11]:
#
display(Image("images/23-confusion-matrix.png", width=300))
In [ ]: