In [2]:
#
import numpy as np
import scipy as sp
import pandas as pd
import matplotlib as mp
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
import laUtilities as ut
import slideUtilities as sl
import demoUtilities as dm
from matplotlib import animation
from importlib import reload
from datetime import datetime
from IPython.display import Image, display_html, display, Math, HTML;
qr_setting = None

mp.rcParams['animation.html'] = 'jshtml';

Announcements¶

  • HW 2 is due today at 8pm

  • HW 3 out, due next Friday

  • Upcoming office hours:

    • Today: Peer tutor Daniel Cho from 12:30-3:30pm on the CCDS 16th floor
    • Today: Abhishek Tiwari from 4-5pm on the CCDS 13th floor
  • Read Tan, Steinbach, Karpatne, Kumar Chapter 7.3

Lecture 10: Supervised Learning¶

[This lecture is based on Prof. Crovella's CS 506 lecture notes, Pattern Recognition and Machine Learning by Christopher Bishop, and Introduction to Data Mining by Tan, Steinbach, and Kumar.]

10.1 The Supervised Learning Problem¶

Recall that there are, broadly speaking, two kinds of learning algorithms.

  • Supervised learning: Data items have labels, and we want to learn a function that correctly assigns labels to new data items.

  • Unsupervised learning: Data items do not have labels, and we want to learn a function that extracts important patterns from the data.

We have seen examples of unsupervised learning ($k$-means clustering and hierarchical clustering). Today we'll start talking about supervised learning.

Let's explore in more detail the data science aspects of supervised learning.

The supervised learning problem in general is:

  • You are given some example data, which we'll think of abstractly as tuples $\{(x_i, y_i) \mid i = 1, \ldots, N\}$.

  • Your goal is to learn a rule that allows you to predict $y_j$ for some $x_j$ that is not in the example data you were given.

Typically $x$ is a vector.

We use the term "features" to refer to the components of $x$.

The collection $\{(x_i, y_i) \mid i = 1, \ldots, N\}$ is called the training data.

  • If $y$ is a continuous (numeric) value, then the problem is called regression.

    For example, in predicting housing prices, the features $x$ could be a vector containing lot size, square footage, number of bathrooms, etc., and $y$ could be the sale price of the house.

    In the regression case, you will usually be satisfied if your prediction is close to the true $y$ (it doesn't have to be exact to be useful).

  • If $y$ is a discrete value (a label, for example) then the problem is called classification.

    For example, in image recognition, the features $x$ could be a vector that represents the pixels of the image, and $y$ could be a label such as "tiger," "tree," etc.
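To make the notation concrete, here is a tiny, made-up training set for the housing example above. All numbers are invented purely for illustration; the point is just that each $x_i$ is a feature vector and each $y_i$ is a numeric target, so this is a regression problem.

import numpy as np

# each row of X_houses is one feature vector x_i: [lot size (acres), square feet, bathrooms]
X_houses = np.array([[0.25, 1800, 2],
                     [0.40, 2400, 3],
                     [0.10,  950, 1]])

# each y_i is the sale price in $1000s -- a continuous target
y_prices = np.array([350, 520, 210])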

What do we have to assume to make this problem tractable?

We assume two things:

  1. There is a set of functions ("rules") that could be used to predict $y_i$ from $x_i$. This allows us to turn the learning problem into one that searches through this set for the "right" function. However, this set is probably very large!
  2. The rule for predicting $y_i$ from $x_i$ is the same as the rule for predicting $y_j$ from the new item $x_j$. Speaking probabilistically, we say that $(x_i, y_i)$ and $(x_j, y_j)$ are drawn from the same distribution.

10.2 A Toy Example of Regression¶

The following is based on Pattern Recognition and Machine Learning, Christopher Bishop (2006), Section 1.1.

In order to explore these ideas a bit, we'll use a toy example of regression.

This is a very artificial example, but it will expose some important wrinkles in the supervised learning problem.

We will consider polynomial curve fitting.

Suppose we are given a training set comprising $N$ observations of a scalar value $x_i$, which we'll collect into the vector $\mathbf{x}$.

For each $x_i$ we have a corresponding numeric value $y_i$, and these form $\mathbf{y}$.

Here is a plot of the 10 training points:

In [93]:
N = 10
x = np.linspace(0, 1, N)
from numpy.random import default_rng
y = np.sin(2 * np.pi * x) + default_rng(2).normal(size = N, scale = 0.20)
# plt.figure(figsize = (3, 2))
plt.plot(x, y, 'ro', markersize = 8, fillstyle = 'none')
plt.xlabel('x', size = 16)
plt.ylabel('y', size = 16);

The way we generated these points was to take the $x_i$ as equally spaced points on the range 0 to 1,

and for each $x_i$, we take $y_i = \sin(2\pi x_i)$ plus a sample of a Gaussian random variable.

In [101]:
# regenerate the data with N = 20 points; we'll need the larger sample later,
# when we hold out half the data for testing
N = 20
x = np.linspace(0, 1, N)
y = np.sin(2 * np.pi * x) + default_rng(2).normal(size = N, scale = 0.20)
In [94]:
#
cx = np.linspace(0, 1, 1000)
cy = np.sin(2 * np.pi * cx)
plt.plot(cx, cy, lw = 2)
plt.plot(x, y, 'ro', markersize = 8, fillstyle = 'none')
plt.xlabel('x', size = 16)
plt.ylabel('y', size = 16);

Many data sets are like this!

In many cases, there is some component of $y$ that depends on $x$, and some component that we treat as random, called "noise."

The "noise" component is typically not really random, but rather depends on features that we cannot see.

(Remember, probability is useful for exactly this case.)

Now for this toy example, we "happen" to know that the correct rule to use for prediction is:

$$y = \sin(2\pi x)$$

and the Gaussian random addition does not depend on $x$, so we cannot hope to predict it.

Let's learn from this data.

We will consider a simple approach based on curve fitting.

The class of models we will consider is polynomials. They are of the form:

$$y(x, \mathbf{w}) = w_0 + w_1 x + w_2 x^2 + \cdots + w_k x^k = \sum_{j=0}^{k} w_j x^j$$

where $k$ is the order of the polynomial.

If we are given some $k$, then what we want to learn are the $w_i$s, that is, $\mathbf{w}$.

The $w_i$s are the parameters of the model.

Recall that this function $y(x, \mathbf{w})$ is a nonlinear function of $x$ ... but it is linear in $\mathbf{w}$. That is, all the $w_i$ terms appear only raised to the first power. For this reason, we can apply a linear model: namely linear regression.

Model Fitting¶

How will we fit our model, that is, learn the best parameters $\mathbf{w}$?

We use an objective function to guide our search through the space of model parameters. For linear regression, we want to choose the parameters to minimize the least squares criterion:

$$E(\mathbf{w}) = \sum_{n=1}^{N} \left[ y(x_n, \mathbf{w}) - y_n \right]^2.$$

This is a nonnegative function which is zero if the polynomial passes through every point exactly.

In [10]:
# image credit: Lay, LAA, 4th edition
display(Image("images/Lay-fig-6-6-1.jpg", width=550))

We often write $\hat{y}_n$ for $y(x_n, \mathbf{w})$.

Then:

$$E(\mathbf{w}) = \|\hat{\mathbf{y}} - \mathbf{y}\|^2.$$

In other words, the error function $E(\cdot)$ measures the distance or dissimilarity between the data and the predictions.

Finding a $\mathbf{w}$ that minimizes $E(\mathbf{w})$ is a least-squares problem, and we know how to solve for it.

The resulting solution $\mathbf{w}^*$ is the set of parameters that minimizes the error on the training data.
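As a concrete sketch (not the lecture's own code, which appears below): for a fixed order $k$ we can build the design matrix whose columns are $1, x, x^2, \ldots, x^k$ and let NumPy solve the least-squares problem directly. The helper name fit_poly_lstsq is hypothetical; the notebook's fit_poly() helper, defined in the next section, solves the same problem via the normal equations.

import numpy as np

def fit_poly_lstsq(x, y, k):
    # design matrix with columns 1, x, x^2, ..., x^k
    A = np.vander(x, k + 1, increasing = True)
    # least-squares solution minimizing ||A w - y||^2
    w_star, *_ = np.linalg.lstsq(A, y, rcond = None)
    return w_star

# example use (assuming x, y are the training arrays above): w_star = fit_poly_lstsq(x, y, 3)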

Model Selection¶

So we are done, correct?

Wait ... what about choosing $k$, the order of the polynomial?

The problem of choosing $k$ is called model selection.

That is, a polynomial of order 3 (a cubic) is a different model from a polynomial of order 2 (a quadratic).

Let's look at constant (order 0), linear (order 1), and cubic (order 3) models.

We will fit each one using the least squares criterion:

In [95]:
# Least-squares polynomial fit: y = Aw, where A is the design matrix whose columns
# are [1, x, x^2, ..., x^k]; the normal-equations solution is w_hat = (A^T A)^-1 A^T y
def design_matrix(x, k):
    N = len(x)
    A = np.ones(N)
    for i in range(1, k+1):
        A = np.column_stack([A, (x.T)**i])
    return A

def fit_poly(x, y, k):
    A = design_matrix(x, k)
    w_hat = np.linalg.inv(A.T @ A) @ A.T @ y
    return w_hat

w_hat_0 = 1/N * np.sum(y)
w_hat_1 = fit_poly(x, y, 1)
w_hat_3 = fit_poly(x, y, 3)
In [96]:
#
fig, axs = plt.subplots(1, 3, sharey = True, figsize = (12, 5))
#
cy = 1000 * [w_hat_0]
pred_y = N * [w_hat_0]
axs[0].plot(cx, cy, lw = 2, label = r'$k$ = 0')
axs[0].plot(x, y, 'ro', markersize = 8, fillstyle = 'none')
axs[0].set_xlabel('x', size = 16)
axs[0].set_ylabel('y', size = 16)
axs[0].set_title(r'$k$ = 0, constant' + '\n' + r'$E(\mathbf{w})$ =' + ' {:0.2f}'.format(np.linalg.norm(y - pred_y)))
#axs[0].legend(loc = 'best', fontsize = 16)
#
cy = design_matrix(cx, 1) @ w_hat_1
pred_y = design_matrix(x, 1) @ w_hat_1
axs[1].plot(cx, cy, lw = 2, label = r'$k$ = 1')
axs[1].plot(x, y, 'ro', markersize = 8, fillstyle = 'none')
axs[1].set_xlabel('x', size = 16)
axs[1].set_title(r'$k$ = 1, linear' + '\n' + r'$E(\mathbf{w})$ =' + ' {:0.2f}'.format(np.linalg.norm(y - pred_y)))
#axs[1].legend(loc = 'best', fontsize = 16)
#
cy = design_matrix(cx, 3) @ w_hat_3
pred_y = design_matrix(x, 3) @ w_hat_3
axs[2].plot(cx, cy, lw = 2, label = r'$k$ = 3')
axs[2].plot(x, y, 'ro', markersize = 8, fillstyle = 'none')
axs[2].set_xlabel('x', size = 16)
axs[2].set_title('$k$ = 3, cubic' + '\n' + r'$E(\mathbf{w})$ =' + ' {:0.2f}'.format(np.linalg.norm(y - pred_y)))
#axs[2].legend(loc = 'best', fontsize = 16)
#
fig.tight_layout();

So it looks like a third-order polynomial ($k$ = 3) is a good fit!

How do we know it's good? Well, the training error $E(\mathbf{w})$ is small.

But ... can we make the training error smaller?

Yes, we can, if we increase the order of the polynomial.

In fact, we can reduce the error to zero!

By setting $k = 9$, we get the following polynomial fit to the data:

In [97]:
#
w_hat_9 = fit_poly(x, y, 9)
cy = design_matrix(cx, 9) @ w_hat_9
plt.plot(cx, cy, lw = 2, label = r'$k$ = 9')
plt.plot(x, y, 'ro', markersize = 8, fillstyle = 'none')
plt.xlabel('x', size = 16)
plt.ylabel('y', size = 16)
plt.title(r'$k$ = 9' + '\n' + r'$E(\mathbf{w})$ =' + ' {:0.2f}'.format(0));

So ... is the 9th order polynomial a "better" model for this dataset?

Absolutely not!

Why?

Informally, the model is very "wiggly". It seems unlikely that the real data generation process is governed by this curve.

In other words, we don't expect that, if we had more data from the same source, that this model would do a good job of fitting the additional data.

We want the model to do a good job of predicting on future data. This is called the model's generalization ability.

The 9th degree polynomial would seem to have poor generalization ability.

Let's assess generalization error. For each polynomial (value of $k$) we will use new data, called test data. This is data that was not used to train the model, but comes from the same source.

In our case, we know how the data is generated -- $\sin(2\pi x)$ plus noise -- so we can easily generate more.

In [98]:
#
test_y = np.sin(2 * np.pi * x) + default_rng(8).normal(size = N, scale = 0.20)
max_k = N
train_err = [np.linalg.norm(y - N * [w_hat_0])]
test_err = [np.linalg.norm(test_y - N * [w_hat_0])]
for k in range(1, max_k):
    w_hat = fit_poly(x, y, k)
    pred_y = design_matrix(x, k) @ w_hat
    train_err.append(np.linalg.norm(y - pred_y))
    test_err.append(np.linalg.norm(test_y - pred_y))
plt.plot(range(max_k), test_err, 'ro-', label = 'Testing Error')
plt.plot(range(max_k), train_err, 'bo-', label = 'Training Error')
plt.xlabel(r'$k$', size = 16)
plt.ylabel(r'$E(\mathbf{w}^*)$')
plt.legend(loc = 'best');

Notice that as we increase the order of the polynomial, the training error always declines.

Eventually, the training error reaches zero.

However, the test error does not -- it reaches its smallest value at $k = 3$, a cubic polynomial.

The phenomenon in which training error declines, but testing error does not, is called overfitting.

In a sense we are fitting the training data "too well".

There are two ways to think about overfitting:

  1. The number of parameters in the model is too large, compared to the size of the training data. We can see this in the fact that we have only 10 training points, and the 9th order polynomial has 10 coefficients.

  2. The model is more complex than the actual phenomenon being modeled. As a result, the model is not just fitting the underlying phenomenon, but also the noise in the data.

These suggest techniques we may use to avoid overfitting:

  1. Increase the amount of training data. All else being equal, more training data is always better.

  2. Limit the complexity of the model. Model complexity is often controlled via hyperparameters.

More Training Data¶

It's not necessarily true that an order-3 polynomial is best for this problem.

After all, we are fitting to a sine function, whose Taylor series includes polynomials of all orders.

But the higher the order of polynomial we want to fit, the more data we need to avoid overfitting.

Here we use an order-9 polynomial for increasing amounts of training data (N = 15, 50, 200):

In [99]:
#
Ns = [15, 50, 200]
xs = {Nval: np.linspace(0, 1, Nval) for Nval in Ns}
ys = {Nval: np.sin(2 * np.pi * xs[Nval]) + default_rng(3).normal(size = Nval, scale = 0.20) for Nval in Ns}
In [100]:
#
fig, axs = plt.subplots(1, 3, sharey = True, figsize = (12, 5))
#
cx = np.linspace(0, 1, 1000)
for i, Nval in enumerate(Ns):
    w_star = fit_poly(xs[Nval], ys[Nval], 9)
    cy = design_matrix(cx, 9) @ w_star
    pred_y = design_matrix(xs[Nval], 9) @ w_star
    axs[i].plot(xs[Nval], ys[Nval], 'ro', markersize = 9, fillstyle = 'none', alpha = 0.5)
    axs[i].plot(cx, cy, lw = 2, label = r'$N$ = {}'.format(Nval))
    axs[i].set_xlabel('x', size = 16)
    if i == 0:
        axs[i].set_ylabel('y', size = 16)
    axs[i].set_title(r'$k$ = 9, N = {}'.format(Nval))
#
fig.tight_layout();

We see that with enough training data, the high order polynomial begins to capture the sine wave well.

Parameters and Hyperparameters¶

Many times however, we cannot simply get more training data, or enough training data, to solve the overfitting problem.

In that case, we need to control the complexity of the model.

Notice that the model selection problem required us to choose a value $k$ that specifies the order of the polynomial model.

As already mentioned, the values $w_0, w_1, \ldots, w_k$ are called the parameters of the model.

In contrast, $k$ is called a hyperparameter.

A hyperparameter is a parameter that must be set first, before the (regular) parameters can be learned.

Hyperparameters are often used to control model complexity.

Here, the hyperparameter $k$ controls the complexity (the order) of the polynomial model.

So, to avoid overfitting, we need to choose the proper value for the hyperparameter $k$.

We do that by holding out data.

10.3 Holding Out Data¶

The idea behind holding out data is simple.

We want to avoid overfitting, which occurs when a model fails to generalize -- that is, when it has high error on data that it was not trained on.

So: we will hold some data aside, and not use it for training the model, but instead use it for testing generalization ability.

Let's assume that we have 20 data points to work with, stored in arrays x and y.

scikit-learn has some functions that can be helpful.

We will use train_test_split():

In [102]:
import sklearn.model_selection as model_selection

x_train, x_test, y_train, y_test = model_selection.train_test_split(
        x, y, test_size = 0.5, random_state = 0)

print(f'Number of items in training set: {x_train.shape[0]}, in testing set: {x_test.shape[0]}')
Number of items in training set: 10, in testing set: 10

Notice that train_test_split() splits the data randomly.

This will be important.

In [103]:
#
fig, axs = plt.subplots(1, 2, sharey = True, figsize = (8, 5))
#
axs[0].plot(x_train, y_train, 'ro', markersize = 8, fillstyle = 'none')
axs[0].set_xlabel('x', size = 16)
axs[0].set_ylabel('y', size = 16)
axs[0].set_title('Training Set', size = 16)
#
axs[1].plot(x_test, y_test, 'ro', markersize = 8, fillstyle = 'none')
axs[1].set_xlabel('x', size = 16)
axs[1].set_title('Testing Set', size = 16)
#
fig.tight_layout();

Our strategy will be:

  • For each possible value of the hyperparameter $k$:
    • randomly split the data 5 times
    • compute the mean testing and training error over the 5 random splits

What are good possible values for the hyperparameter? It can depend on the problem, and may involve trial and error.

This strategy of trying all possible values of the hyperparameter is called a grid search.

In [104]:
def model_error(x_train, y_train, x_test, y_test, k):
    '''
    This function fits a polynomial of degree k to the training data
    and returns the error on both the training and test data.
    '''
    w_star = fit_poly(x_train, y_train, k)
    pred_test_y = design_matrix(x_test, k) @ w_star
    pred_train_y = design_matrix(x_train, k) @ w_star
    return (np.linalg.norm(y_train - pred_train_y), np.linalg.norm(y_test - pred_test_y))

np.random.seed(7)
In [105]:
# fraction of data used for testing
#
split_frac = 0.5
#
# maximum polynomial degree to consider
#
max_k = 10
#
n_splits = 5
#
# grid search over k, using the model_error() function
# defined above, which reports the training and
# testing error for a single random split
#
#
err = []
for k in range(1, max_k):
    for s in range(n_splits):
        x_train, x_test, y_train, y_test = model_selection.train_test_split(
            x, y, test_size = split_frac)
        split_train_err, split_test_err = model_error(x_train, y_train, x_test, y_test, k)
        err.append([k, s, split_train_err, split_test_err])
#
# put the results in a DataFame for easy manipulation
#
df = pd.DataFrame(err, columns = ['k', 'split', 'Training Error', 'Testing Error'])
df.head(10)
Out[105]:
k split Training Error Testing Error
0 1 0 1.147829 2.799292
1 1 1 1.700576 1.712716
2 1 2 1.387723 2.133381
3 1 3 1.696808 1.695534
4 1 4 1.571746 1.989020
5 2 0 1.358846 2.603490
6 2 1 1.332199 2.082828
7 2 2 0.747769 5.436927
8 2 3 1.559011 2.697498
9 2 4 1.677090 1.691438

Let's plot the mean for each value of k and its standard error ($\sigma/\sqrt{n}$):

In [106]:
df.groupby('k').mean()[['Training Error', 'Testing Error']].plot(
    yerr = df.groupby('k').std()/np.sqrt(n_splits))
plt.ylabel('Error')
plt.ylim([0, 5]);

From this plot we can conclude that, for this dataset, a polynomial of degree $k = 3$ shows the best generalization ability.

Hold Out Strategies¶

Deciding how much, and which, data to hold out depends on a number of factors.

In general we'd like to give the training stage as much data as possible to work with.

However, the more data we use for training, the less we have for testing -- which can decrease the accuracy of the testing stage.

Furthermore, any single partition of the data can introduce dependencies -- any class that is overrepresented in the training data will be underrepresented in the test data.

There are two ways to address these problems:

  • Random subsampling
  • Cross-validation

In random subsampling one partitions the data randomly between train and test sets. This is what the function train_test_split() does.

This ensures there is no dependence between the test and train sets.

One needs to perform a reasonable number of random splits -- usually at least five.

In cross-validation, the data is partitioned once, and then each partition is used as the test data once.

This ensures that all the data gets equal weight in the training and in the testing.

We divide the data into $k$ "folds".

In [4]:
#
display(Image("images/22-k-fold.png", width=500))

The value of $k$ can vary up to the size of the dataset.

The larger $k$ we use, the more data is used for training, but the more folds must be evaluated, which increases the time required.

In the extreme case where $k$ is equal to the data size, each data item is held out by itself; this is called "leave-one-out".
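As a rough sketch of how $k$-fold cross-validation could be applied to the polynomial-fitting example above, reusing the model_error() helper defined earlier (the choice of 5 folds and degree 3 here is just for illustration, not part of the lecture code):

import numpy as np
from sklearn.model_selection import KFold

kf = KFold(n_splits = 5, shuffle = True, random_state = 0)
fold_test_err = []
for train_idx, test_idx in kf.split(x):
    # each fold is used exactly once as the test set
    _, te = model_error(x[train_idx], y[train_idx], x[test_idx], y[test_idx], 3)
    fold_test_err.append(te)
print('Mean test error over folds:', np.mean(fold_test_err))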

Conclusions¶

We have seen strategies that allow us to learn from data.

The strategies include:

  • Define a set of possible models
  • Define an error function that tells us when the model is predicting well
  • Use the error function to search through the space of models and find the best performer

We've also seen that there are some subtleties to this approach that must be dealt with to avoid problems:

  • Simply using the model that has lowest error on the training data will overfit.
  • We need to hold out data to assess the generalization ability of each trained model.
  • We control model complexity using hyperparameters.
  • We choose the best hyperparameters based on performance on held out data.

10.4 $k$-means Clustering with Real Data¶

Suppose we have a collection of documents (say, books) and we want to build a model to understand which documents are more (or less) related to each other.

How do we encode categorical or text data in a form usable by linear algebra algorithms? All of the math we've done so far expects inputs to be numbers, not strings.

Here's one idea:

In [4]:
#
display(Image("images/09-document-term.png", width=1000))

As a "real world" example of kk-means clustering, we'll use the "20 Newsgroup" data provided as example data in sklearn.

(http://scikit-learn.org/stable/datasets/twenty_newsgroups.html).

In [37]:
from sklearn.datasets import fetch_20newsgroups

"""
categories = [
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'talk.religion.misc',
 'comp.graphics',
 'sci.space',
 'rec.autos',
 'rec.sport.baseball'
]
"""
categories = ['comp.os.ms-windows.misc', 'rec.sport.baseball', 'sci.space']
news_data = fetch_20newsgroups(subset = 'train', categories = categories)

print("Categories:     ", news_data.target_names)
print("# of documents: ", len(news_data.target))
print("Ground truth:   ", news_data.target) # this vector is of length 1781
Categories:      ['comp.os.ms-windows.misc', 'rec.sport.baseball', 'sci.space']
# of documents:  1781
Ground truth:    [2 0 0 ... 2 1 2]
In [48]:
# Run the lines below to look at some of the data

print(news_data.data[0])
From: aws@iti.org (Allen W. Sherzer)
Subject: Re: DC-X update???
Organization: Evil Geniuses for a Better Tomorrow
Lines: 122

In article <ugo62B8w165w@angus.mi.org> dragon@angus.mi.org writes:

>Exactly when will the hover test be done, 

Early to mid June.

>and will any of the TV
>networks carry it.  I really want to see that...

If they think the public wants to see it they will carry it. Why not
write them and ask? You can reach them at:


                          F: NATIONAL NEWS MEDIA


ABC "World News Tonight"                 "Face the Nation"
7 West 66th Street                       CBS News
New York, NY 10023                       2020 M Street, NW
212/887-4040                             Washington, DC 20036
                                         202/457-4321

Associated Press                         "Good Morning America"
50 Rockefeller Plaza                     ABC News
New York, NY 10020                       1965 Broadway
National Desk (212/621-1600)             New York, NY 10023
Foreign Desk (212/621-1663)              212/496-4800
Washington Bureau (202/828-6400)
                                         Larry King Live TV
"CBS Evening News"                       CNN
524 W. 57th Street                       111 Massachusetts Avenue, NW
New York, NY 10019                       Washington, DC 20001
212/975-3693                             202/898-7900

"CBS This Morning"                       Larry King Show--Radio
524 W. 57th Street                       Mutual Broadcasting
New York, NY 10019                       1755 So. Jefferson Davis Highway
212/975-2824                             Arlington, VA 22202
                                         703/685-2175
"Christian Science Monitor"
CSM Publishing Society                   "Los Angeles Times"
One Norway Street                        Times-Mirror Square
Boston, MA 02115                         Los Angeles, CA 90053
800/225-7090                             800/528-4637

CNN                                      "MacNeil/Lehrer NewsHour"
One CNN Center                           P.O. Box 2626
Box 105366                               Washington, DC 20013
Atlanta, GA 30348                        703/998-2870
404/827-1500
                                         "MacNeil/Lehrer NewsHour"
CNN                                      WNET-TV
Washington Bureau                        356 W. 58th Street
111 Massachusetts Avenue, NW             New York, NY 10019
Washington, DC 20001                     212/560-3113
202/898-7900

"Crossfire"                              NBC News
CNN                                      4001 Nebraska Avenue, NW
111 Massachusetts Avenue, NW             Washington, DC 20036
Washington, DC 20001                     202/885-4200
202/898-7951                             202/362-2009 (fax)

"Morning Edition/All Things Considered"  
National Public Radio                    
2025 M Street, NW                        
Washington, DC 20036                     
202/822-2000                             

United Press International
1400 Eye Street, NW
Washington, DC 20006
202/898-8000

"New York Times"                         "U.S. News & World Report"
229 W. 43rd Street                       2400 N Street, NW
New York, NY 10036                       Washington, DC 20037
212/556-1234                             202/955-2000
212/556-7415

"New York Times"                         "USA Today"
Washington Bureau                        1000 Wilson Boulevard
1627 Eye Street, NW, 7th Floor           Arlington, VA 22229
Washington, DC 20006                     703/276-3400
202/862-0300

"Newsweek"                               "Wall Street Journal"
444 Madison Avenue                       200 Liberty Street
New York, NY 10022                       New York, NY 10281
212/350-4000                             212/416-2000

"Nightline"                              "Washington Post"
ABC News                                 1150 15th Street, NW
47 W. 66th Street                        Washington, DC 20071
New York, NY 10023                       202/344-6000
212/887-4995

"Nightline"                              "Washington Week In Review"
Ted Koppel                               WETA-TV
ABC News                                 P.O. Box 2626
1717 DeSales, NW                         Washington, DC 20013
Washington, DC 20036                     703/998-2626
202/887-7364

"This Week With David Brinkley"
ABC News
1717 DeSales, NW
Washington, DC 20036
202/887-7777

"Time" magazine
Time Warner, Inc.
Time & Life Building
Rockefeller Center
New York, NY 10020
212/522-1212

-- 
+---------------------------------------------------------------------------+
| Lady Astor:   "Sir, if you were my husband I would poison your coffee!"   |
| W. Churchill: "Madam, if you were my wife, I would drink it."             |
+----------------------57 DAYS TO FIRST FLIGHT OF DCX-----------------------+

In [49]:
print(news_data.data[1])
From: phoenix.Princeton.EDU!carlosn (Carlos G. Niederstrasser)
Subject: Reboot when I start windows.
Originator: news@nimaster
Nntp-Posting-Host: week.princeton.edu
Organization: Princeton University
Lines: 21

Recently the following problem has arrisen.  The first time I turn on my  
computer when windows starts (from my autoexec) after the win31 title screen  
the computer reboots on its own.  Usually the second time (after reboot) or  
from the DOS prompt everything works fine.

 s far as I remember I have not changed my config.sys or autoxec.bat or  
win.ini.  I can't remember whether this problem occured before I  
optimized/defragmented my disk and created a larger swap file (Thank you  
MathCAD 4 :(  )

System 386sx, 4MB, stacker 2.0, win31, DOS 5

---
---------------------------------------------------------------------
| Carlos G. Niederstrasser        |  Only two things are infinite,  |
| Princeton Planetary Society     |      the universe and human     |
|                                 |   stupidity, and I'm not sure   |
|                                 |   about the former. - Einstein  |
| carlosn@phoenix.princeton.edu   |---------------------------------|
| space@phoenix.princeton.edu     |    Ad Astra per Ardua Nostra    |
---------------------------------------------------------------------

In [50]:
print(news_data.data[-2])
Organization: University of Notre Dame - Office of Univ. Computing
From: <RVESTERM@vma.cc.nd.edu>
Subject: Re: MLB = NBA?
Lines: 15

In article <1993Apr17.052025.10610@news.yale.edu>, (Sean Garrison) says:
>
>I think that
>players' salaries are getting way out of hand to the point that they're on
>a pace to become severely detrimental to baseball's future.
>

so you want to decrease players' salaries?

so you want to increase owners' salaries?

the two are equivalent.

bob vesterman.


Getting to know the Data¶

Let's ask sklearn to calculate the TF-IDF score for each document in this dataset.

We'll talk about the particulars later in the course. For now, consider TF-IDF as a way to identify important words: a word's score increases with its frequency within a document, but is offset by the number of documents the word appears in.

In [40]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(stop_words = 'english', min_df = 4, max_df = 0.8)
data = vectorizer.fit_transform(news_data.data)

print("# of documents:", data.shape[0])
print("# of words:    ", data.shape[1])
# of documents: 1781
# of words:     9409

Next, let's look at a picture showing each of the vectors. Dark cells denote document-word pairs with scores near zero (the word rarely or never occurs in that document), whereas brighter cells denote document-word pairs with high TF-IDF scores.

We find from this picture that our data is sparse: most words don't occur often in most documents. (Note that we have already removed words like "the" that occur frequently in all documents, using the stop_words argument above.)

In [26]:
fig, ax1 = plt.subplots(1,1,figsize=(15,10))
dum = sns.heatmap(data[1:100,1:200].todense(), xticklabels=False, yticklabels=False, 
            linewidths=0, cbar=False, ax=ax1)
In [27]:
print(news_data.target)
print(news_data.target_names)
[2 0 0 ... 2 1 2]
['comp.os.ms-windows.misc', 'rec.sport.baseball', 'sci.space']

Selecting the Number of Clusters¶

Let's try several methods to predict the appropriate number of clusters. In the next few diagrams, we will run $k$-means clustering for $k$ between 1 and 10.

Let's first measure the total error of the clustering algorithm. Remember that this is defined as the sum of squared distances between each point in the dataset and the nearest center.

This error will decrease as $k$ increases, because adding more centers means that each point's nearest center can only get closer.
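The evaluate_clusters() helper used below is not shown in this excerpt. A plausible sketch of what it does, under the assumption that it fits $k$-means for each $k$ and records the within-cluster sum of squared distances (the inertia), is:

import numpy as np
from sklearn.cluster import KMeans

def evaluate_clusters(X, max_clusters):
    # error[k] = sum of squared distances from each point to its nearest center
    error = np.zeros(max_clusters + 1)
    for k in range(1, max_clusters + 1):
        kmeans = KMeans(init = 'k-means++', n_clusters = k, n_init = 10)
        kmeans.fit(X)
        error[k] = kmeans.inertia_
    return error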

In [28]:
#
error = evaluate_clusters(data, 10)
plt.plot(range(1, len(error)), error[1:])
plt.title('$k$-means Clustering Performance on Newsgroup Articles')
plt.xlabel('Number of clusters')
plt.ylabel('Error');

Next, let's use the ground truth labels and measure the Rand Index. Recall that it measures how often the clustering correctly groups the same points together, and correctly keeps apart points with different ground truth labels.

The graph below shows the Adjusted Rand Index, which essentially discounts the number of "correct" clustering decisions that could happen through pure chance alone.
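Again, ri_evaluate_clusters() comes from a cell not shown here; a plausible sketch, assuming it compares each $k$-means clustering to the ground-truth labels with scikit-learn's adjusted_rand_score, is:

import numpy as np
from sklearn import metrics
from sklearn.cluster import KMeans

def ri_evaluate_clusters(X, max_clusters, ground_truth):
    ri = np.zeros(max_clusters + 1)
    for k in range(1, max_clusters + 1):
        kmeans = KMeans(init = 'k-means++', n_clusters = k, n_init = 10)
        kmeans.fit(X)
        ri[k] = metrics.adjusted_rand_score(kmeans.labels_, ground_truth)
    return ri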

In [29]:
#
ri = ri_evaluate_clusters(data, 10, news_data.target)
plt.plot(range(1, len(ri)), ri[1:], 'o-')
plt.xlabel('Number of clusters')
plt.title('$k$-means Clustering Compared to Known Labels\nNewsgroup Articles')
plt.ylabel('Adjusted Rand Index');

Finally, let's measure the Silhouette Coefficient. This metric does not rely on the ground truth labels. Instead, it evaluates how compact and well-separated the clusters are.
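The sc_evaluate_clusters() helper is likewise not shown; a plausible sketch is below. In the call that follows, the third and fourth arguments are taken to be the number of $k$-means restarts and a random seed -- that reading is an assumption. Note the silhouette score is only defined for $k \geq 2$.

import numpy as np
from sklearn import metrics
from sklearn.cluster import KMeans

def sc_evaluate_clusters(X, max_clusters, n_init, seed):
    s = np.zeros(max_clusters + 1)
    for k in range(2, max_clusters + 1):
        kmeans = KMeans(init = 'k-means++', n_clusters = k, n_init = n_init, random_state = seed)
        kmeans.fit(X)
        s[k] = metrics.silhouette_score(X, kmeans.labels_, metric = 'euclidean')
    return s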

In [30]:
#
s = sc_evaluate_clusters(data, 10, 100, 3)
plt.plot(range(2, len(s)), s[2:], 'o-')
plt.xlabel('Number of Clusters')
plt.title('$k$-means clustering performance on Newsgroup Articles')
plt.ylabel('Silhouette Score');

Even though we know the data comes from 3 distinct forums, the Silhouette Coefficient happens to prefer larger choices of $k$. Let's explore the data more closely.

Looking into the clusters¶

In [31]:
from sklearn.cluster import KMeans

k = 4
kmeans = KMeans(n_clusters = k, init = 'k-means++', max_iter = 100, n_init = 25, random_state = 3)
kmeans.fit_predict(data)
Out[31]:
array([1, 1, 1, ..., 3, 0, 2], dtype=int32)
In [32]:
print('Top terms per cluster:')
asc_order_centroids = kmeans.cluster_centers_.argsort()
order_centroids = asc_order_centroids[:, ::-1]
# on older scikit-learn versions this method is called get_feature_names()
terms = vectorizer.get_feature_names_out()
for i in range(k):
    print(f'Cluster {i}:')
    for ind in order_centroids[i, :10]:
        print(f' {terms[ind]}')
    print('')
Top terms per cluster:
Cluster 0:
 edu
 baseball
 year
 team
 game
 com
 article
 players
 writes
 games

Cluster 1:
 windows
 edu
 com
 file
 dos
 university
 thanks
 ca
 files
 use

Cluster 2:
 space
 nasa
 access
 gov
 edu
 alaska
 digex
 com
 pat
 moon

Cluster 3:
 henry
 toronto
 zoo
 spencer
 zoology
 edu
 work
 utzoo
 kipling
 umd

Next, let's look at the distance between each pair of documents. The following image is a picture whose pixels correspond to the distance between document $i$ and document $j$, after sorting the documents based on their $k$-means clustering.

You'll see that there are four distinct groups of documents that are slightly closer to each other than to the other documents. Also, the fourth cluster is much smaller than the others.

In [33]:
#
from sklearn import metrics

euclidean_dists = metrics.euclidean_distances(data)
labels = kmeans.labels_
idx = np.argsort(labels)
clustered_dists = euclidean_dists[idx][:,idx]
fig, ax1 = plt.subplots(1,1,figsize=(15,15))
dum = sns.heatmap(clustered_dists, xticklabels=False, yticklabels=False, linewidths=0, square=True,cbar=False, ax=ax1)

Let's visualize with Multidimensional Scaling (MDS), which builds a 2-dimensional plot where the distances between points are "close" to their distances in the larger 9409-dimensional space.

Note that MDS is a slow algorithm and we can't do all 1700+ data points quickly, so we will take a random sample.

In [34]:
import random
from sklearn.manifold import MDS

n_items = euclidean_dists.shape[0]
subset = random.sample(range(n_items), 500)

# mds was not defined in this excerpt; presumably a 2-D embedding of the
# precomputed distance matrix, roughly as below
mds = MDS(n_components = 2, dissimilarity = 'precomputed', random_state = 0)
fit = mds.fit(euclidean_dists[subset][:, subset])
pos = fit.embedding_
In [35]:
labels
Out[35]:
array([1, 1, 1, ..., 3, 0, 2], dtype=int32)
In [36]:
cols = [['y', 'b', 'g', 'r', 'c'][l] for l in labels[subset]]
plt.scatter(pos[:, 0], pos[:, 1], s = 12, c = cols)
plt.title('MDS Embedding of Newsgroup Articles');
In [ ]: