#
import numpy as np
import scipy as sp
import pandas as pd
import matplotlib as mp
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
import laUtilities as ut
import slideUtilities as sl
import demoUtilities as dm
from matplotlib import animation
from importlib import reload
from datetime import datetime
from IPython.display import Image, display_html, display, Math, HTML;
qr_setting = None
mp.rcParams['animation.html'] = 'jshtml';
HW 2 is due today at 8pm
HW 3 out, due next Friday
Upcoming office hours:
Read Tan, Steinbach, Karpatne, Kumar Chapter 7.3
[This lecture is based on Prof. Crovella's CS 506 lecture notes, Pattern Recognition and Machine Learning by Christopher Bishop, and Introduction to Data Mining by Tan, Steinbach, and Kumar.]
Recall that there are, broadly speaking, two kinds of learning algorithms.
Supervised learning: Data items have labels, and we want to learn a function that correctly assigns labels to new data items.
Unsupervised learning: Data items do not have labels, and we want to learn a function that extracts important patterns from the data.
We have seen examples of unsupervised learning ($k$-means clustering and hierarchical clustering). Today we'll start talking about supervised learning.
Let's explore in more detail the data science aspects of supervised learning.
The supervised learning problem in general is:
You are given some example data, which we'll think of abstractly as tuples $(\mathbf{x}, y)$.
Your goal is to learn a rule that allows you to predict $y$ for some $\mathbf{x}$ that is not in the example data you were given.
Typically $\mathbf{x}$ is a vector.
We use the term "features" to refer to the components of $\mathbf{x}$.
The collection of tuples $\{(\mathbf{x}_i, y_i)\}$ is called the training data.
If $y$ is a continuous (numeric) value, then the problem is called regression.
For example, in predicting housing prices, the features $\mathbf{x}$ could be a vector containing lot size, square footage, number of bathrooms, etc., and $y$ could be the sale price of the house.
In the regression case, you will usually be satisfied if your prediction is close to the true $y$ (it doesn't have to be exact to be useful).
If $y$ is a discrete value (a label, for example) then the problem is called classification.
For example, in image recognition, the features $\mathbf{x}$ could be a vector that represents the pixels of the image, and $y$ could be a label such as "tiger," "tree," etc.
What do we have to assume to make this problem tractable?
We assume two things:
First, that there is some systematic relationship between $\mathbf{x}$ and $y$ -- perhaps obscured by noise -- that a function can capture.
Second, that the data we will be asked to predict on comes from the same source (the same distribution) as the training data.
The following is based on _Pattern Recognition and Machine Learning,_ Christopher Bishop (2006), Section 1.1.
In order to explore these ideas a bit, we'll use a toy example of regression.
This is a very artificial example, but it will expose some important wrinkles in the supervised learning problem.
We will consider polynomial curve fitting.
Suppose we are given a training set comprising $N$ observations of a scalar value $x$, which we'll collect into the vector $\mathbf{x} = (x_1, \dots, x_N)$.
For each $x_i$ we have a corresponding numeric value $y_i$, and these form the vector $\mathbf{y} = (y_1, \dots, y_N)$.
Here is a plot of the 10 training points:
N = 10
x = np.linspace(0, 1, N)
from numpy.random import default_rng
y = np.sin(2 * np.pi * x) + default_rng(2).normal(size = N, scale = 0.20)
# plt.figure(figsize = (3, 2))
plt.plot(x, y, 'ro', markersize = 8, fillstyle = 'none')
plt.xlabel('x', size = 16)
plt.ylabel('y', size = 16);
The way we generated these points was to take $\mathbf{x}$ as $N$ equally spaced points on the range 0 to 1,
and for each $x_i$, we take $\sin(2\pi x_i)$ plus a sample of a Gaussian random variable.
N = 20
x = np.linspace(0, 1, N)
y = np.sin(2 * np.pi * x) + default_rng(2).normal(size = N, scale = 0.20)
#
cx = np.linspace(0, 1, 1000)
cy = np.sin(2 * np.pi * cx)
plt.plot(cx, cy, lw = 2)
plt.plot(x, y, 'ro', markersize = 8, fillstyle = 'none')
plt.xlabel('x', size = 16)
plt.ylabel('y', size = 16);
Many data sets are like this!
In many cases, there is some component of $y$ that depends on $\mathbf{x}$, and some component that we treat as random, called "noise."
The "noise" component is typically not really random, but rather depends on features that we cannot see.
(Remember, probability is useful for exactly this case.)
Now for this toy example, we "happen" to know that the correct rule to use for prediction is:

$$ y = \sin(2\pi x) $$

and the Gaussian random addition does not depend on $x$, so we cannot hope to predict it.
Let's learn from this data.
We will consider a simple approach based on curve fitting.
The class of models we will consider is polynomials. They are of the form:

$$ \hat{y}(x, \mathbf{w}) = w_0 + w_1 x + w_2 x^2 + \dots + w_k x^k = \sum_{j=0}^{k} w_j x^j $$

where $k$ is the order of the polynomial.
If we are given some $x$, then what we want to learn are the $w_j$s, that is, $\mathbf{w} = (w_0, w_1, \dots, w_k)$.
The $w_j$s are the parameters of the model.
Recall that this function is a nonlinear function in $x$ ... but it is linear in $\mathbf{w}$. That is, all the $w_j$ terms appear only raised to the first power. For this reason, we can apply a linear model: namely linear regression.
How will we fit our model, that is, learn the best parameters $\mathbf{w}$?
We use an objective function to guide our search through the space of model parameters. For linear regression, we want to choose the parameters to minimize the least squares criterion:

$$ E(\mathbf{w}) = \sum_{i=1}^{N} \left( y_i - \hat{y}(x_i, \mathbf{w}) \right)^2 $$
This is a nonnegative function which is zero if the polynomial passes through every point exactly.
# image credit: Lay, LAA, 4th edition
display(Image("images/Lay-fig-6-6-1.jpg", width=550))
We often write $A\mathbf{w}$ for the vector of predictions $(\hat{y}(x_1, \mathbf{w}), \dots, \hat{y}(x_N, \mathbf{w}))$, where $A$ is the design matrix whose $i$th row is $(1, x_i, x_i^2, \dots, x_i^k)$.
Then:

$$ E(\mathbf{w}) = \Vert \mathbf{y} - A\mathbf{w} \Vert^2 $$

In other words, the error function measures the distance or dissimilarity between the data and the predictions.
Finding a $\mathbf{w}$ that minimizes $E(\mathbf{w})$ is a least-squares problem, and we know how to solve for it.
The resulting solution $\mathbf{w}^*$ is the set of parameters that minimizes the error on the training data.
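As a quick illustration (not the code we'll write below), NumPy's np.linalg.lstsq solves exactly this kind of minimization directly. A minimal sketch on made-up data:

import numpy as np

# toy data: fit a line y ≈ w0 + w1*x by least squares
x_toy = np.array([0.0, 1.0, 2.0, 3.0])
y_toy = np.array([0.1, 0.9, 2.2, 2.8])

A = np.column_stack([np.ones_like(x_toy), x_toy])   # design matrix [1, x]
w_star, residual, rank, sv = np.linalg.lstsq(A, y_toy, rcond=None)
print(w_star)   # the parameters w minimizing ||y - A w||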
So we are done, correct?
Wait ... what about choosing $k$, the order of the polynomial?
The problem of choosing is called model selection.
That is, a polynomial of order 3 (a cubic) is a different model from a polynomial of order 2 (a quadratic).
Let's look at constant (order 0), linear (order 1), and cubic (order 3) models.
We will fit each one using the least squares criterion:
# y ≈ A w, where A is the design matrix with columns [1, x, x^2, ..., x^k],
# and w_hat = (A^T A)^{-1} A^T y  (the normal equations)
def design_matrix(x, k):
    # build the N x (k+1) design matrix
    N = len(x)
    A = np.ones(N)
    for i in range(1, k+1):
        A = np.column_stack([A, x**i])
    return A

def fit_poly(x, y, k):
    # solve the normal equations for the least-squares fit of degree k
    A = design_matrix(x, k)
    w_hat = np.linalg.inv(A.T @ A) @ A.T @ y
    return w_hat
w_hat_0 = 1/N * np.sum(y)
w_hat_1 = fit_poly(x, y, 1)
w_hat_3 = fit_poly(x, y, 3)
#
fig, axs = plt.subplots(1, 3, sharey = True, figsize = (12, 5))
#
cy = 1000 * [w_hat_0]
pred_y = N * [w_hat_0]
axs[0].plot(cx, cy, lw = 2, label = r'$k$ = 0')
axs[0].plot(x, y, 'ro', markersize = 8, fillstyle = 'none')
axs[0].set_xlabel('x', size = 16)
axs[0].set_ylabel('y', size = 16)
axs[0].set_title(r'$k$ = 0, constant' + '\n' + r'$E(\mathbf{w})$ =' + ' {:0.2f}'.format(np.linalg.norm(y - pred_y)))
#axs[0].legend(loc = 'best', fontsize = 16)
#
cy = design_matrix(cx, 1) @ w_hat_1
pred_y = design_matrix(x, 1) @ w_hat_1
axs[1].plot(cx, cy, lw = 2, label = r'$k$ = 1')
axs[1].plot(x, y, 'ro', markersize = 8, fillstyle = 'none')
axs[1].set_xlabel('x', size = 16)
axs[1].set_title(r'$k$ = 1, linear' + '\n' + r'$E(\mathbf{w})$ =' + ' {:0.2f}'.format(np.linalg.norm(y - pred_y)))
#axs[1].legend(loc = 'best', fontsize = 16)
#
cy = design_matrix(cx, 3) @ w_hat_3
pred_y = design_matrix(x, 3) @ w_hat_3
axs[2].plot(cx, cy, lw = 2, label = r'$k$ = 3')
axs[2].plot(x, y, 'ro', markersize = 8, fillstyle = 'none')
axs[2].set_xlabel('x', size = 16)
axs[2].set_title('$k$ = 3, cubic' + '\n' + r'$E(\mathbf{w})$ =' + ' {:0.2f}'.format(np.linalg.norm(y - pred_y)))
#axs[2].legend(loc = 'best', fontsize = 16)
#
fig.tight_layout();
So it looks like a third-order polynomial ($k$ = 3) is a good fit!
How do we know it's good? Well, the training error is small.
But ... can we make the training error smaller?
Yes, we can, if we increase the order of the polynomial.
In fact, we can reduce the error to zero!
By setting $k = 9$, we get the following polynomial fit to the data:
#
w_hat_9 = fit_poly(x, y, 9)
cy = design_matrix(cx, 9) @ w_hat_9
plt.plot(cx, cy, lw = 2, label = r'$k$ = 9')
plt.plot(x, y, 'ro', markersize = 8, fillstyle = 'none')
plt.xlabel('x', size = 16)
plt.ylabel('y', size = 16)
plt.title(r'$k$ = 9' + '\n' + r'$E(\mathbf{w})$ =' + ' {:0.2f}'.format(0));
So ... is the 9th order polynomial a "better" model for this dataset?
Absolutely not!
Why?
Informally, the model is very "wiggly". It seems unlikely that the real data generation process is governed by this curve.
In other words, we don't expect that, if we had more data from the same source, this model would do a good job of fitting the additional data.
We want the model to do a good job of predicting on future data. This is called the model's generalization ability.
The 9th degree polynomial would seem to have poor generalization ability.
Let's assess generalization error. For each polynomial (each value of $k$) we will use new data, called test data. This is data that was not used to train the model, but comes from the same source.
In our case, we know how the data is generated -- $\sin(2\pi x)$ plus noise -- so we can easily generate more.
#
test_y = np.sin(2 * np.pi * x) + default_rng(8).normal(size = N, scale = 0.20)
max_k = N
train_err = [np.linalg.norm(y - N * [w_hat_0])]
test_err = [np.linalg.norm(test_y - N * [w_hat_0])]
for k in range(1, max_k):
    w_hat = fit_poly(x, y, k)
    pred_y = design_matrix(x, k) @ w_hat
    train_err.append(np.linalg.norm(y - pred_y))
    test_err.append(np.linalg.norm(test_y - pred_y))
plt.plot(range(max_k), test_err, 'ro-', label = 'Testing Error')
plt.plot(range(max_k), train_err, 'bo-', label = 'Training Error')
plt.xlabel(r'$k$', size = 16)
plt.ylabel(r'$E(\mathbf{w}^*)$')
plt.legend(loc = 'best');
Notice that as we increase the order of the polynomial, the training error always declines.
Eventually, the training error reaches zero.
However, the test error does not -- it reaches its smallest value at $k = 3$, a cubic polynomial.
The phenomenon in which training error declines, but testing error does not, is called overfitting.
In a sense we are fitting the training data "too well".
There are two ways to think about overfitting:
The number of parameters in the model is too large, compared to the size of the training data. We can see this in the fact that we have only 10 training points, and the 9th order polynomial has 10 coefficients.
The model is more complex than the actual phenomenon being modeled. As a result, the model is not just fitting the underlying phenomenon, but also the noise in the data.
These suggest techniques we may use to avoid overfitting:
Increase the amount of training data. All else being equal, more training data is always better.
Limit the complexity of the model. Model complexity is often controlled via hyperparameters.
It's not necessarily true that an order-3 polynomial is best for this problem.
After all, we are fitting to a sine function, whose Taylor series includes polynomials of all orders.
But the higher the order of polynomial we want to fit, the more data we need to avoid overfitting.
Here we use an order-9 polynomial for increasing amounts of training data (N = 15, 50, 200):
#
Ns = [15, 50, 200]
xs = {Nval: np.linspace(0, 1, Nval) for Nval in Ns}
ys = {Nval: np.sin(2 * np.pi * xs[Nval]) + default_rng(3).normal(size = Nval, scale = 0.20) for Nval in Ns}
#
fig, axs = plt.subplots(1, 3, sharey = True, figsize = (12, 5))
#
cx = np.linspace(0, 1, 1000)
for i, Nval in enumerate(Ns):
    w_star = fit_poly(xs[Nval], ys[Nval], 9)
    cy = design_matrix(cx, 9) @ w_star
    pred_y = design_matrix(xs[Nval], 9) @ w_star
    axs[i].plot(xs[Nval], ys[Nval], 'ro', markersize = 9, fillstyle = 'none', alpha = 0.5)
    axs[i].plot(cx, cy, lw = 2, label = r'$N$ = {}'.format(Nval))
    axs[i].set_xlabel('x', size = 16)
    if i == 0:
        axs[i].set_ylabel('y', size = 16)
    axs[i].set_title(r'$k$ = 9, N = {}'.format(Nval))
#
fig.tight_layout();
We see that with enough training data, the high order polynomial begins to capture the sine wave well.
Many times however, we cannot simply get more training data, or enough training data, to solve the overfitting problem.
In that case, we need to control the complexity of the model.
Notice that the model selection problem required us to choose a value $k$ that specifies the order of the polynomial model.
As already mentioned, the $w_j$ values are called the parameters of the model.
In contrast, $k$ is called a hyperparameter.
A hyperparameter is a parameter that must be set first, before the (regular) parameters can be learned.
Hyperparameters are often used to control model complexity.
Here, the hyperparameter $k$ controls the complexity (the order) of the polynomial model.
So, to avoid overfitting, we need to choose the proper value for the hyperparameter $k$.
We do that by holding out data.
The idea behind holding out data is simple.
We want to avoid overfitting, which occurs when a model fails to generalize -- that is, when it has high error on data that it was not trained on.
So: we will hold some data aside, and not use it for training the model, but instead use it for testing generalization ability.
Let's assume that we have 20 data points to work with, stored in arrays x and y.
scikit-learn has some functions that can be helpful. We will use train_test_split():
import sklearn.model_selection as model_selection
x_train, x_test, y_train, y_test = model_selection.train_test_split(
x, y, test_size = 0.5, random_state = 0)
print(f'Number of items in training set: {x_train.shape[0]}, in testing set: {x_test.shape[0]}')
Number of items in training set: 10, in testing set: 10
Notice that train_test_split() splits the data randomly. This will be important.
#
fig, axs = plt.subplots(1, 2, sharey = True, figsize = (8, 5))
#
axs[0].plot(x_train, y_train, 'ro', markersize = 8, fillstyle = 'none')
axs[0].set_xlabel('x', size = 16)
axs[0].set_ylabel('y', size = 16)
axs[0].set_title('Training Set', size = 16)
#
axs[1].plot(x_test, y_test, 'ro', markersize = 8, fillstyle = 'none')
axs[1].set_xlabel('x', size = 16)
axs[1].set_title('Testing Set', size = 16)
#
fig.tight_layout();
Our strategy will be: for each candidate value of the hyperparameter $k$, fit the model on the training set, measure its error on the held-out test set, and then choose the value of $k$ with the lowest test error.
What are good possible values for the hyperparameter? It can depend on the problem, and may involve trial and error.
This strategy of trying all possible values of the hyperparameter is called a grid search.
def model_error(x_train, y_train, x_test, y_test, k):
    '''
    This function fits a polynomial of degree k to the training data
    and returns the error on both the training and test data.
    '''
    w_star = fit_poly(x_train, y_train, k)
    pred_test_y = design_matrix(x_test, k) @ w_star
    pred_train_y = design_matrix(x_train, k) @ w_star
    return (np.linalg.norm(y_train - pred_train_y), np.linalg.norm(y_test - pred_test_y))
np.random.seed(7)
# fraction of data used for testing
#
split_frac = 0.5
#
# maximum polynomial degree to consider
#
max_k = 10
#
n_splits = 5
#
# grid search over k
# we use the model_error() function defined above, which reports the
# training and testing error for each split
#
err = []
for k in range(1, max_k):
    for s in range(n_splits):
        x_train, x_test, y_train, y_test = model_selection.train_test_split(
            x, y, test_size = split_frac)
        split_train_err, split_test_err = model_error(x_train, y_train, x_test, y_test, k)
        err.append([k, s, split_train_err, split_test_err])
#
# put the results in a DataFame for easy manipulation
#
df = pd.DataFrame(err, columns = ['k', 'split', 'Training Error', 'Testing Error'])
df.head(10)
|   | k | split | Training Error | Testing Error |
|---|---|-------|----------------|---------------|
| 0 | 1 | 0 | 1.147829 | 2.799292 |
| 1 | 1 | 1 | 1.700576 | 1.712716 |
| 2 | 1 | 2 | 1.387723 | 2.133381 |
| 3 | 1 | 3 | 1.696808 | 1.695534 |
| 4 | 1 | 4 | 1.571746 | 1.989020 |
| 5 | 2 | 0 | 1.358846 | 2.603490 |
| 6 | 2 | 1 | 1.332199 | 2.082828 |
| 7 | 2 | 2 | 0.747769 | 5.436927 |
| 8 | 2 | 3 | 1.559011 | 2.697498 |
| 9 | 2 | 4 | 1.677090 | 1.691438 |
Let's plot the mean error for each value of k, along with its standard error (the standard deviation across splits divided by $\sqrt{n_{\mathrm{splits}}}$):
df.groupby('k').mean()[['Training Error', 'Testing Error']].plot(
yerr = df.groupby('k').std()/np.sqrt(n_splits))
plt.ylabel('Error')
plt.ylim([0, 5]);
From this plot we can conclude that, for this dataset, a polynomial of degree $k = 3$ shows the best generalization ability.
Deciding how much, and which, data to hold out depends on a number of factors.
In general we'd like to give the training stage as much data as possible to work with.
However, the more data we use for training, the less we have for testing -- which can decrease the accuracy of the testing stage.
Furthermore, any single partition of the data can introduce dependencies -- any class that is overrepresented in the training data will be underrepresented in the test data.
There are two ways to address these problems:
In random subsampling one partitions the data randomly between train and test sets. This is what the function train_test_split() does.
This ensures there is no dependence between the test and train sets.
One needs to perform a reasonable number of random splits -- usually at least five.
In cross-validation, the data is partitioned once, and then each partition is used as the test data once.
This ensures that all the data gets equal weight in the training and in the testing.
We divide the data into $k$ "folds".
#
display(Image("images/22-k-fold.png", width=500))
The value of $k$ can vary up to the size of the dataset.
The larger the $k$ we use, the more data is used for training, but the more folds must be evaluated, which increases the time required.
In the extreme case where $k$ is equal to the data size, each data item is held out by itself; this is called "leave-one-out".
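For example, scikit-learn's KFold can drive a cross-validated version of our grid search over the polynomial degree. This is only a sketch, assuming the fit_poly() and design_matrix() helpers defined earlier and 5 folds:

from sklearn.model_selection import KFold

kf = KFold(n_splits = 5, shuffle = True, random_state = 0)
cv_err = []
for k in range(1, 10):
    fold_err = []
    # each fold serves as the test set exactly once
    for train_idx, test_idx in kf.split(x):
        w_star = fit_poly(x[train_idx], y[train_idx], k)
        pred = design_matrix(x[test_idx], k) @ w_star
        fold_err.append(np.linalg.norm(y[test_idx] - pred))
    cv_err.append(np.mean(fold_err))
# cv_err[k-1] is the average held-out error for a degree-k polynomial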
We have seen strategies that allow us to learn from data.
The strategies include: fitting model parameters by minimizing an error function (here, least squares) on training data, and choosing hyperparameters by grid search.
We've also seen that there are some subtleties to this approach that must be dealt with to avoid problems: overfitting, and the need to hold out data -- by random subsampling or cross-validation -- in order to estimate generalization ability.
Suppose we have a collection of documents (say, books) and we want to build a model to understand which documents are more (or less) related to each other.
How do we encode categorical or text data in a form usable by linear algebra algorithms? All of the math we've done so far expects inputs to be numbers, not strings.
Here's one idea:
#
display(Image("images/09-document-term.png", width=1000))
As a "real world" example of -means clustering, we'll use the "20 Newsgroup" data provided as example data in sklearn
.
(http://scikit-learn.org/stable/datasets/twenty_newsgroups.html).
from sklearn.datasets import fetch_20newsgroups
"""
categories = [
'comp.windows.x',
'misc.forsale',
'rec.autos',
'rec.motorcycles',
'rec.sport.baseball',
'rec.sport.hockey',
'talk.religion.misc',
'comp.graphics',
'sci.space',
'rec.autos',
'rec.sport.baseball'
]
"""
categories = ['comp.os.ms-windows.misc', 'rec.sport.baseball', 'sci.space']
news_data = fetch_20newsgroups(subset = 'train', categories = categories)
print("Categories: ", news_data.target_names)
print("# of documents: ", len(news_data.target))
print("Ground truth: ", news_data.target) # this vector is of length 1781
Categories:  ['comp.os.ms-windows.misc', 'rec.sport.baseball', 'sci.space']
# of documents:  1781
Ground truth:  [2 0 0 ... 2 1 2]
# Run the lines below to look at some of the data
print(news_data.data[0])
From: aws@iti.org (Allen W. Sherzer) Subject: Re: DC-X update??? Organization: Evil Geniuses for a Better Tomorrow Lines: 122 In article <ugo62B8w165w@angus.mi.org> dragon@angus.mi.org writes: >Exactly when will the hover test be done, Early to mid June. >and will any of the TV >networks carry it. I really want to see that... If they think the public wants to see it they will carry it. Why not write them and ask? You can reach them at: F: NATIONAL NEWS MEDIA ABC "World News Tonight" "Face the Nation" 7 West 66th Street CBS News New York, NY 10023 2020 M Street, NW 212/887-4040 Washington, DC 20036 202/457-4321 Associated Press "Good Morning America" 50 Rockefeller Plaza ABC News New York, NY 10020 1965 Broadway National Desk (212/621-1600) New York, NY 10023 Foreign Desk (212/621-1663) 212/496-4800 Washington Bureau (202/828-6400) Larry King Live TV "CBS Evening News" CNN 524 W. 57th Street 111 Massachusetts Avenue, NW New York, NY 10019 Washington, DC 20001 212/975-3693 202/898-7900 "CBS This Morning" Larry King Show--Radio 524 W. 57th Street Mutual Broadcasting New York, NY 10019 1755 So. Jefferson Davis Highway 212/975-2824 Arlington, VA 22202 703/685-2175 "Christian Science Monitor" CSM Publishing Society "Los Angeles Times" One Norway Street Times-Mirror Square Boston, MA 02115 Los Angeles, CA 90053 800/225-7090 800/528-4637 CNN "MacNeil/Lehrer NewsHour" One CNN Center P.O. Box 2626 Box 105366 Washington, DC 20013 Atlanta, GA 30348 703/998-2870 404/827-1500 "MacNeil/Lehrer NewsHour" CNN WNET-TV Washington Bureau 356 W. 58th Street 111 Massachusetts Avenue, NW New York, NY 10019 Washington, DC 20001 212/560-3113 202/898-7900 "Crossfire" NBC News CNN 4001 Nebraska Avenue, NW 111 Massachusetts Avenue, NW Washington, DC 20036 Washington, DC 20001 202/885-4200 202/898-7951 202/362-2009 (fax) "Morning Edition/All Things Considered" National Public Radio 2025 M Street, NW Washington, DC 20036 202/822-2000 United Press International 1400 Eye Street, NW Washington, DC 20006 202/898-8000 "New York Times" "U.S. News & World Report" 229 W. 43rd Street 2400 N Street, NW New York, NY 10036 Washington, DC 20037 212/556-1234 202/955-2000 212/556-7415 "New York Times" "USA Today" Washington Bureau 1000 Wilson Boulevard 1627 Eye Street, NW, 7th Floor Arlington, VA 22229 Washington, DC 20006 703/276-3400 202/862-0300 "Newsweek" "Wall Street Journal" 444 Madison Avenue 200 Liberty Street New York, NY 10022 New York, NY 10281 212/350-4000 212/416-2000 "Nightline" "Washington Post" ABC News 1150 15th Street, NW 47 W. 66th Street Washington, DC 20071 New York, NY 10023 202/344-6000 212/887-4995 "Nightline" "Washington Week In Review" Ted Koppel WETA-TV ABC News P.O. Box 2626 1717 DeSales, NW Washington, DC 20013 Washington, DC 20036 703/998-2626 202/887-7364 "This Week With David Brinkley" ABC News 1717 DeSales, NW Washington, DC 20036 202/887-7777 "Time" magazine Time Warner, Inc. Time & Life Building Rockefeller Center New York, NY 10020 212/522-1212 -- +---------------------------------------------------------------------------+ | Lady Astor: "Sir, if you were my husband I would poison your coffee!" | | W. Churchill: "Madam, if you were my wife, I would drink it." | +----------------------57 DAYS TO FIRST FLIGHT OF DCX-----------------------+
print(news_data.data[1])
From: phoenix.Princeton.EDU!carlosn (Carlos G. Niederstrasser) Subject: Reboot when I start windows. Originator: news@nimaster Nntp-Posting-Host: week.princeton.edu Organization: Princeton University Lines: 21 Recently the following problem has arrisen. The first time I turn on my computer when windows starts (from my autoexec) after the win31 title screen the computer reboots on its own. Usually the second time (after reboot) or from the DOS prompt everything works fine. s far as I remember I have not changed my config.sys or autoxec.bat or win.ini. I can't remember whether this problem occured before I optimized/defragmented my disk and created a larger swap file (Thank you MathCAD 4 :( ) System 386sx, 4MB, stacker 2.0, win31, DOS 5 --- --------------------------------------------------------------------- | Carlos G. Niederstrasser | Only two things are infinite, | | Princeton Planetary Society | the universe and human | | | stupidity, and I'm not sure | | | about the former. - Einstein | | carlosn@phoenix.princeton.edu |---------------------------------| | space@phoenix.princeton.edu | Ad Astra per Ardua Nostra | ---------------------------------------------------------------------
print(news_data.data[-2])
Organization: University of Notre Dame - Office of Univ. Computing From: <RVESTERM@vma.cc.nd.edu> Subject: Re: MLB = NBA? Lines: 15 In article <1993Apr17.052025.10610@news.yale.edu>, (Sean Garrison) says: > >I think that >players' salaries are getting way out of hand to the point that they're on >a pace to become severely detrimental to baseball's future. > so you want to decrease players' salaries? so you want to increase owners' salaries? the two are equivalent. bob vesterman.
Let's ask sklearn to calculate the TF-IDF score for each document in this dataset.
We'll talk about the particulars later in the course. For now, consider TF-IDF as a way to identify important words: it increases with the frequency of a word, but is offset by the number of documents a word appears in.
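In its most common form (scikit-learn's implementation adds smoothing and normalization on top of this), the score for a term $t$ in a document $d$ is:

$$ \text{tf-idf}(t, d) = \text{tf}(t, d) \times \log \frac{N}{\text{df}(t)} $$

where $\text{tf}(t, d)$ is how often $t$ occurs in $d$, $N$ is the number of documents, and $\text{df}(t)$ is the number of documents that contain $t$.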
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(stop_words = 'english', min_df = 4, max_df = 0.8)
data = vectorizer.fit_transform(news_data.data)
print("# of documents:", data.shape[0])
print("# of words: ", data.shape[1])
# of documents: 1781
# of words:  9409
Next, let's look at a picture showing each of the vectors. Black dots denote document-word pairs that are rare (occur near-0 times) whereas brighter colors denote document-word pairs that occur frequently.
We find from this picture that our data is sparse: most words don't occur often in most documents. (Note that we have already removed words like "the" that occur frequently in all documents, using the stop_words argument above.)
fig, ax1 = plt.subplots(1,1,figsize=(15,10))
dum = sns.heatmap(data[1:100,1:200].todense(), xticklabels=False, yticklabels=False,
linewidths=0, cbar=False, ax=ax1)
print(news_data.target)
print(news_data.target_names)
[2 0 0 ... 2 1 2]
['comp.os.ms-windows.misc', 'rec.sport.baseball', 'sci.space']
Let's try several methods to predict the appropriate number of clusters. In the next few diagrams, we will run $k$-means clustering for $k$ between 1 and 10.
Let's first measure the total error of the clustering algorithm. Remember that this is defined as the sum of squared-distances between each point in the dataset and the nearest center.
This error will decrease as $k$ increases, because adding more centers means that the nearest one is even closer than before.
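The evaluate_clusters() helper used below is defined in a cell not shown here. A minimal sketch of what it plausibly does -- recording the $k$-means inertia (this sum of squared distances) for each number of clusters:

from sklearn.cluster import KMeans

def evaluate_clusters(X, max_clusters):
    # error[k] holds the k-means objective (sum of squared distances
    # from each point to its nearest center) for a k-cluster solution
    error = np.zeros(max_clusters)
    for k in range(1, max_clusters):
        kmeans = KMeans(n_clusters = k, n_init = 10, random_state = 0)
        kmeans.fit(X)
        error[k] = kmeans.inertia_
    return error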
#
error = evaluate_clusters(data, 10)
plt.plot(range(1, len(error)), error[1:])
plt.title('$k$-means Clustering Performance on Newsgroup Articles')
plt.xlabel('Number of clusters')
plt.ylabel('Error');
Next, let's use the ground truth labels and measure the Rand Index. Recall that it measures how often the clustering correctly groups the same points together, and correctly keeps apart points with different ground truth labels.
The graph below shows the Adjusted Rand Index, which essentially discounts the number of "correct" clustering decisions that could happen through pure chance alone.
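Again, ri_evaluate_clusters() is defined in a hidden cell; a plausible sketch, using scikit-learn's adjusted_rand_score:

from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_rand_score

def ri_evaluate_clusters(X, max_clusters, ground_truth):
    # ri[k] holds the Adjusted Rand Index comparing the k-cluster
    # assignment against the ground-truth labels
    ri = np.zeros(max_clusters)
    for k in range(1, max_clusters):
        labels = KMeans(n_clusters = k, n_init = 10, random_state = 0).fit_predict(X)
        ri[k] = adjusted_rand_score(ground_truth, labels)
    return ri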
#
ri = ri_evaluate_clusters(data, 10, news_data.target)
plt.plot(range(1, len(ri)), ri[1:], 'o-')
plt.xlabel('Number of clusters')
plt.title('$k$-means Clustering Compared to Known Labels\nNewsgroup Articles')
plt.ylabel('Adjusted Rand Index');
Finally, let's measure the Silhouette Coefficient. This metric does not rely on the ground truth labels. Instead it evaluates how compact the clusters are.
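The sc_evaluate_clusters() helper is likewise defined elsewhere, and its exact signature here is an assumption. A sketch in the same spirit, using scikit-learn's silhouette_score on a random sample of the data for speed:

from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

def sc_evaluate_clusters(X, max_clusters, sample_size, random_state):
    # s[k] holds the mean silhouette coefficient of a k-cluster solution,
    # estimated on a random sample of sample_size points
    s = np.zeros(max_clusters)
    for k in range(2, max_clusters):
        labels = KMeans(n_clusters = k, n_init = 10, random_state = random_state).fit_predict(X)
        s[k] = silhouette_score(X, labels, sample_size = sample_size,
                                random_state = random_state)
    return s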
#
s = sc_evaluate_clusters(data, 10, 100, 3)
plt.plot(range(2, len(s)), s[2:], 'o-')
plt.xlabel('Number of Clusters')
plt.title('$k$-means clustering performance on Newsgroup Articles')
plt.ylabel('Silhouette Score');
Even though we know the data comes from 3 distinct forums, the Silhouette Coefficient happens to prefer larger choices of $k$. Let's explore the data more closely.
from sklearn.cluster import KMeans

k = 4
kmeans = KMeans(n_clusters = k, init = 'k-means++', max_iter = 100, n_init = 25, random_state = 3)
kmeans.fit_predict(data)
array([1, 1, 1, ..., 3, 0, 2], dtype=int32)
print('Top terms per cluster:')
asc_order_centroids = kmeans.cluster_centers_.argsort()#[:, ::-1]
order_centroids = asc_order_centroids[:,::-1]
terms = vectorizer.get_feature_names()   # in newer scikit-learn versions: get_feature_names_out()
for i in range(k):
    print(f'Cluster {i}:')
    for ind in order_centroids[i, :10]:
        print(f' {terms[ind]}')
    print('')
Top terms per cluster:
Cluster 0: edu baseball year team game com article players writes games
Cluster 1: windows edu com file dos university thanks ca files use
Cluster 2: space nasa access gov edu alaska digex com pat moon
Cluster 3: henry toronto zoo spencer zoology edu work utzoo kipling umd
Next, let's look at the distance between each pair of documents. The following image is a picture whose pixel $(i, j)$ corresponds to the distance between document $i$ and document $j$, after sorting the documents based on their $k$-means clustering.
You'll see that there are four distinct groups of documents that are slightly closer to each other than to the other documents. Also, the fourth cluster is much smaller than the others.
#
from sklearn import metrics

euclidean_dists = metrics.euclidean_distances(data)
labels = kmeans.labels_
idx = np.argsort(labels)
clustered_dists = euclidean_dists[idx][:,idx]
fig, ax1 = plt.subplots(1,1,figsize=(15,15))
dum = sns.heatmap(clustered_dists, xticklabels=False, yticklabels=False, linewidths=0, square=True,cbar=False, ax=ax1)
Let's visualize with Multidimensional Scaling (MDS), which builds a 2-dimensional plot where the distances between points are "close" to their distances in the larger 9409-dimensional space.
Note that MDS is a slow algorithm and we can't do all 1700+ data points quickly, so we will take a random sample.
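The mds object used below comes from a cell not shown; a plausible configuration (an assumption), using scikit-learn's MDS on the precomputed distance matrix:

from sklearn.manifold import MDS

# dissimilarity='precomputed' tells MDS we will pass in a distance matrix
# rather than raw feature vectors
mds = MDS(n_components = 2, dissimilarity = 'precomputed', random_state = 0)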
import random
n_items = euclidean_dists.shape[0]
subset = random.sample(range(n_items),500)
fit = mds.fit(euclidean_dists[subset][:,subset])
pos = fit.embedding_
labels
array([1, 1, 1, ..., 3, 0, 2], dtype=int32)
cols = [['y', 'b', 'g', 'r', 'c'][l] for l in labels[subset]]
plt.scatter(pos[:, 0], pos[:, 1], s = 12, c = cols)
plt.title('MDS Embedding of Newsgroup Articles');