In [1]:
#
import numpy as np
import scipy as sp
import pandas as pd
import matplotlib as mp
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
import laUtilities as ut
import slideUtilities as sl
import demoUtilities as dm
from matplotlib import animation
from importlib import reload
from datetime import datetime
from IPython.display import Image, display_html, display, Math, HTML;
qr_setting = None

mp.rcParams['animation.html'] = 'jshtml';
In [2]:
#
def centerAxes(ax):
    # draw the x- and y-axes through the origin, hiding the top and right spines
    ax.spines['left'].set_position('zero')
    ax.spines['right'].set_color('none')
    ax.spines['bottom'].set_position('zero')
    ax.spines['top'].set_color('none')
    ax.xaxis.set_ticks_position('bottom')
    ax.yaxis.set_ticks_position('left')
    # plot the corners of the current view (with an empty style) so the limits are retained
    bounds = np.array([ax.axes.get_xlim(), ax.axes.get_ylim()])
    ax.plot(bounds[0][0], bounds[1][0], '')
    ax.plot(bounds[0][1], bounds[1][1], '')
In [3]:
#
%matplotlib inline
%config InlineBackend.figure_format='retina'
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import Image, display_html, display, Math, HTML;

Announcements¶

  • HW 3 is due today (2/17) at 8pm

  • Test 1 is in-class next Friday (2/24)

  • Upcoming office hours:

    • Today: Peer tutor Daniel Cho from 3-6pm in CCDS 16th floor

  • Read Tan, Steinbach, Karpatne, Kumar Chapter 3.3

13. k-Nearest Neighbors in action¶

Recap of last lecture¶

k-nearest neighbors is a supervised learning algorithm for classification.
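
As a quick refresher on the rule itself, here is a minimal from-scratch sketch (an illustration, not the implementation used in this lecture): to classify a query point, find the k closest training points under Euclidean distance and take a majority vote of their labels.

In [ ]:
import numpy as np

def knn_predict(X_train, y_train, query, k=5):
    # Euclidean distance from the query point to every training point
    dists = np.linalg.norm(X_train - query, axis=1)
    # indices of the k closest training points
    nearest = np.argsort(dists)[:k]
    # majority vote among the neighbors' labels
    labels, counts = np.unique(y_train[nearest], return_counts=True)
    return labels[np.argmax(counts)]

# tiny made-up example with two classes in 2-D
X_demo = np.array([[0.0, 0.0], [0.1, 0.2], [1.0, 1.0], [0.9, 1.1]])
y_demo = np.array([0, 0, 1, 1])
print(knn_predict(X_demo, y_demo, np.array([0.95, 1.0]), k=3))   # prints 1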

In [4]:
#
plt.scatter(demo_X[:,0], demo_X[:,1], c=demo_y, cmap=cmap_bold)
plt.plot(test_X[0], test_X[1], 'ok')
plt.annotate('Test Point', test_X, [75, 25], 
             textcoords = 'offset points', fontsize = 14, 
             arrowprops = {'arrowstyle': '->'})
plt.axis('equal')
plt.axis('off')
plt.title('Training Points: 2 Classes');
In [5]:
#
plt.scatter(demo_X[:,0], demo_X[:,1], c=demo_y, cmap=cmap_bold)
plt.plot(test_X[0], test_X[1], 'ok')
ax=plt.gcf().gca()
circle = mp.patches.Circle(test_X, 0.5, facecolor = 'red', alpha = 0.2)
plt.axis('equal')
plt.axis('off')
ax.add_artist(circle)
plt.title('1-Nearest-Neighbor: Classification: Red');
In [6]:
#
plt.scatter(demo_X[:,0], demo_X[:,1], c=demo_y, cmap=cmap_bold)
test_X = [-0.3, 0.7]
plt.plot(test_X[0], test_X[1], 'ok')
ax=plt.gcf().gca()
circle = mp.patches.Circle(test_X, 0.9, facecolor = 'gray', alpha = 0.3)
plt.axis('equal')
plt.axis('off')
ax.add_artist(circle)
plt.title('2-Nearest-Neighbor');
In [7]:
#
plt.figure()
ax = plt.gcf().gca()
test_X = [-0.3, 0.7]
circle = mp.patches.Circle(test_X, 1.4, facecolor = 'blue', alpha = 0.2)
ax.add_artist(circle)
plt.scatter(demo_X[:,0], demo_X[:,1], c=demo_y, cmap=cmap_bold)
plt.plot(test_X[0], test_X[1], 'ok')
plt.axis('equal')
plt.axis('off')
plt.title('3-Nearest-Neighbor: Classification: Blue');

Challenges for k-NN¶

Working with a k-NN classifier can involve some challenges.

  1. First, the computational cost of classification grows with the size of the training data. Training is trivial, but classifying a point requires comparing it against the stored training data, which can be prohibitively expensive.
  2. Second, since Euclidean distance is the most common distance function used, data scaling is important (see the sketch just after this list).
  3. Third, we need to deal with the curse of dimensionality -- problems arise when we use geometric algorithms in high-dimensional space.
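
Since Euclidean distance treats all features alike, features measured on large scales will dominate the distance computation. A common remedy is to standardize the features before fitting k-NN. Here is a minimal sketch using scikit-learn's StandardScaler on made-up data (a general illustration, not part of this lecture's datasets):

In [ ]:
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# hypothetical data just to illustrate the pipeline
X_demo, y_demo = make_classification(n_samples = 200, n_features = 5, random_state = 0)
X_tr, X_te, y_tr, y_te = train_test_split(X_demo, y_demo, random_state = 0)

# rescale each feature to zero mean and unit variance, then classify
scaled_knn = make_pipeline(StandardScaler(), KNeighborsClassifier(n_neighbors = 5))
scaled_knn.fit(X_tr, y_tr)
print('accuracy with scaling:', scaled_knn.score(X_te, y_te))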

13.1 Comparing Classification Methods on Synthetic Data¶

Next we'll look at two classification methods in practice:

  • Decision Trees, and
  • k-Nearest Neighbors.

To compare these methods, the question arises:

How do we evaluate a classifier?

Using the confusion matrix we can define some useful measures:

  • Recall - the fraction of actual positives that are correctly classified:
    • TP/(TP + FN)
  • Precision - the fraction of predicted positives that are actually positive:
    • TP/(TP + FP)
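
For example, given hypothetical confusion-matrix counts, these measures are simple ratios:

In [ ]:
# made-up counts: 8 true positives, 2 false positives, 4 false negatives
TP, FP, FN = 8, 2, 4

recall = TP / (TP + FN)      # fraction of actual positives we found
precision = TP / (TP + FP)   # fraction of predicted positives that are correct

print(f'recall = {recall:.2f}, precision = {precision:.2f}')   # recall = 0.67, precision = 0.80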
In [11]:
#
display(Image("images/23-confusion-matrix.png", width=300))

Evaluating k-Nearest Neighbors¶

First we'll generate some synthetic data to work with.

In [17]:
import sklearn.datasets as datasets

X, y = datasets.make_circles(noise=.1, factor=.5, random_state=1)
print('Shape of data: {}'.format(X.shape))
print('Unique labels: {}'.format(np.unique(y)))
Shape of data: (100, 2)
Unique labels: [0 1]

Here is what the data looks like:

In [18]:
#
plt.figure(figsize = (6,6))
plt.prism()  # this sets a nice color map
plt.scatter(X[:, 0], X[:, 1], c=y, s = 80)
plt.axis('off')
plt.axis('equal');

Recall that we always want to test on data separate from our training data.

For now, we will do something very simple: take the first 50 examples for training and hold out the rest for testing. (Later we will do this a better way.)

In [19]:
X_train = X[:50]
y_train = y[:50]
X_test = X[50:]
y_test = y[50:]
In [21]:
#
fig_size = (12, 5)

plt.figure(figsize = fig_size)
plt.subplot(1, 2, 1)
plt.scatter(X_train[:, 0], X_train[:, 1], c = y_train, s = 80)
plt.axis('equal')
plt.axis('off')
plt.title('Training Data')

plt.subplot(1, 2, 2)
plt.scatter(X_test[:, 0], X_test[:, 1], c = y_test, s = 80)
plt.title('Test Data')
plt.axis('equal')
plt.axis('off');

For our first example, we will classify the points (in the two classes) using a k-NN classifier.

We will specify k = 5, i.e., we will classify based on the majority vote of the 5 nearest neighbors.

In [22]:
from sklearn.neighbors import KNeighborsClassifier

k = 5
knn5 = KNeighborsClassifier(n_neighbors = k)

In the context of supervised learning, the scikit-learn fit() function corresponds to training and the predict() function corresponds to testing.

In [23]:
knn5.fit(X_train,y_train)
print(f'Accuracy on test data: {knn5.score(X_test, y_test)}')
Accuracy on test data: 0.72

Accuracy of 72% sounds good -- but let's dig deeper.

We'll call the red points the Positive class and the green points the Negative class.

Here is the confusion matrix:

In [24]:
import sklearn.metrics as metrics

y_pred_test = knn5.predict(X_test)
pd.DataFrame(metrics.confusion_matrix(y_test, y_pred_test), 
             columns = ['Predicted +', 'Predicted -'], 
             index = ['Actual +', 'Actual -'])
Out[24]:
           Predicted +  Predicted -
Actual +            14           14
Actual -             0           22

Looks like the classifier is getting all of the Negative class correct, but only achieving accuracy of 50% on the Positive class.

That is, its precision is 100%, but its recall is only 50%.
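
We can confirm these numbers with scikit-learn's metric functions. (A quick check; it assumes label 0 corresponds to the Positive (red) class, consistent with the row ordering of the confusion matrix above.)

In [ ]:
# precision and recall for the Positive class, taken here to be label 0
print('precision:', metrics.precision_score(y_test, y_pred_test, pos_label = 0))
print('recall:   ', metrics.recall_score(y_test, y_pred_test, pos_label = 0))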

Let's visualize the results.

In [25]:
#
k = 5

plt.figure(figsize = fig_size)
plt.subplot(1, 2, 1)

plt.scatter(X_train[:, 0], X_train[:, 1], c = y_train, s = 80)
plt.axis('equal')
plt.title('Training')
plt.axis('off')

plt.subplot(1, 2, 2)
plt.scatter(X_test[:, 0], X_test[:, 1], c = y_pred_test, s = 80)
plt.title(f'Testing $k$={k}\nAccuracy: {knn5.score(X_test, y_test)}')
plt.axis('off')
plt.axis('equal');

Indeed, the Positive (red) points in the upper half of the test data are all classified incorrectly.

Let's look at one of the points that the classifier got wrong:

In [26]:
#
k=5 
test_point = np.argmax(X_test[:,1])
neighbors = knn5.kneighbors([X_test[test_point]])[1]

plt.figure(figsize = fig_size)
plt.subplot(1, 2, 1)
plt.scatter(X_train[:, 0], X_train[:, 1], c = y_train, s = 80)
plt.scatter(X_train[neighbors,0], X_train[neighbors,1],
            c = y_train[neighbors], marker='o', 
            facecolors='none', edgecolors='b', s = 80)
radius = np.max(metrics.euclidean_distances(X_test[test_point].reshape(1, -1), X_train[neighbors][0]))
ax = plt.gcf().gca()
circle = mp.patches.Circle(X_test[test_point], radius, facecolor = 'blue', alpha = 0.2)
ax.add_artist(circle)
plt.axis('equal')
plt.axis('off')
plt.title(r'Training')

plt.subplot(1, 2, 2)
plt.scatter(X_test[:, 0], X_test[:, 1], c = y_pred_test, s = 80)
plt.scatter(X_test[test_point,0], X_test[test_point,1], marker='o', 
            facecolors='none', edgecolors='b', s = 80)
plt.title('Testing $k$={}\nAccuracy: {}'.format(k,knn5.score(X_test, y_test)))
plt.axis('equal')
plt.axis('off');

For comparison purposes, let's try k = 3.

In [27]:
#
k = 3
knn3 = KNeighborsClassifier(n_neighbors=k)    
knn3.fit(X_train,y_train)
y_pred_test = knn3.predict(X_test)

plt.figure(figsize = fig_size)
plt.subplot(1, 2, 1)
plt.scatter(X_train[:, 0], X_train[:, 1], c=y_train, s = 80)
plt.axis('equal')
plt.axis('off')
plt.title(r'Training')

plt.subplot(1, 2, 2)
plt.scatter(X_test[:, 0], X_test[:, 1], c=y_pred_test, s = 80)
plt.title(f'Testing $k$={k}\nAccuracy: {knn3.score(X_test, y_test)}')
plt.axis('off')
plt.axis('equal');

And let's look at the same individual point as before:

In [28]:
#
k = 3
test_point = np.argmax(X_test[:,1])
X_test[test_point]
neighbors = knn3.kneighbors([X_test[test_point]])[1]

plt.figure(figsize = fig_size)
plt.subplot(1, 2, 1)
plt.scatter(X_train[:, 0], X_train[:, 1], c=y_train, s = 80)
plt.scatter(X_train[neighbors, 0], X_train[neighbors, 1], marker = 'o', 
            facecolors = 'none', edgecolors = 'b', s = 80)
radius = np.max(metrics.euclidean_distances(X_test[test_point].reshape(1, -1), 
                                            X_train[neighbors][0]))
ax = plt.gcf().gca()
circle = mp.patches.Circle(X_test[test_point], radius, facecolor = 'blue', alpha = 0.2)
ax.add_artist(circle)
plt.axis('equal')
plt.axis('off')
plt.title(r'Training')
plt.subplot(1, 2, 2)
plt.scatter(X_test[:, 0], X_test[:, 1], c = y_pred_test, s = 80)
plt.scatter(X_test[test_point,0], X_test[test_point,1], marker = 'o', 
            facecolors = 'none', edgecolors = 'b', s = 80)
plt.title(f'Testing $k$={k}\nAccuracy: {knn3.score(X_test, y_test)}')
plt.axis('off')
plt.axis('equal');

So how confident can we be that the test accuracy is 92% in general?

What we really need to do is consider many different train/test splits.

Thus, the proper way to evaluate generalization ability (accuracy on the test data) is:

  1. Form a random train/test split
  2. Train the classifier on the training split
  3. Test the classifier on the testing split
  4. Accumulate statistics
  5. Repeat from 1. until enough statistics have been collected.
In [29]:
import sklearn.model_selection as model_selection

nreps = 50
kvals = range(1, 10)
acc = []
np.random.seed(4)
for k in kvals:
    test_rep = []
    train_rep = []
    for i in range(nreps):
        X_train, X_test, y_train, y_test = model_selection.train_test_split(X, 
                                                                            y, 
                                                                            test_size = 0.5)
        knn = KNeighborsClassifier(n_neighbors = k)    
        knn.fit(X_train, y_train)
        train_rep.append(knn.score(X_train, y_train))
        test_rep.append(knn.score(X_test, y_test))
    acc.append([np.mean(np.array(test_rep)), np.mean(np.array(train_rep))])
accy = np.array(acc)
In [30]:
plt.plot(kvals, accy[:, 0], '.-', label = 'Accuracy on Test Data')
plt.plot(kvals, accy[:, 1], '.-', label = 'Accuracy on Training Data')
plt.xlabel(r'$k$')
plt.ylabel('Accuracy')
plt.title('Train/Test Comparison of $k$-NN')
plt.legend(loc = 'best');

Based on the generalization error -- i.e., accuracy on test (held-out) data -- it looks like k = 2 is the best choice.

Here is the decision boundary for k-NN with k = 2.

In [31]:
#
x_min, x_max = X[:, 0].min() - .1, X[:, 0].max() + .1
y_min, y_max = X[:, 1].min() - .1, X[:, 1].max() + .1
plot_step = 0.02
xx, yy = np.meshgrid(np.arange(x_min, x_max, plot_step),
                     np.arange(y_min, y_max, plot_step))
In [32]:
#
np.random.seed(1)
X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, test_size = 0.5)

k = 2
knn = KNeighborsClassifier(n_neighbors = k)  
knn.fit(X_train, y_train)
y_pred_train = knn.predict(X_train)
y_pred_test = knn.predict(X_test)

Z = knn.predict(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)

plt.figure(figsize = fig_size)
plt.subplot(1, 2, 1)
cs = plt.contourf(xx, yy, Z, cmap=plt.cm.Paired, alpha=0.3)
plt.scatter(X_train[:, 0], X_train[:, 1], c=y_train, s=30)
plt.axis('equal')
plt.axis('off')
plt.xlim((x_min, x_max))
plt.ylim((y_min, y_max))
plt.title(f'{k}-NN - Training Data\nAccuracy: {knn.score(X_train, y_train)}');

plt.subplot(1, 2, 2)
cs = plt.contourf(xx, yy, Z, cmap=plt.cm.Paired, alpha=0.3)
plt.scatter(X_test[:, 0], X_test[:, 1], c=y_test, s=30)
plt.axis('equal')
plt.axis('off')
plt.xlim((x_min, x_max))
plt.ylim((y_min, y_max))
plt.title(f'{k}-NN - Test Data\nAccuracy: {knn.score(X_test, y_test)}');

Evaluating a Decision Tree¶

Next, we'll use a decision tree on the same data set.

In [33]:
import sklearn.tree as tree
dtc = tree.DecisionTreeClassifier(max_leaf_nodes = 5)

dtc.fit(X_train,y_train)
y_pred_test = dtc.predict(X_test)
print('DT accuracy on test data: ', dtc.score(X_test, y_test))
y_pred_train = dtc.predict(X_train)
print('DT accuracy on training data: ', dtc.score(X_train, y_train))
DT accuracy on test data:  0.94
DT accuracy on training data:  0.98
In [34]:
#
plt.figure(figsize = (6, 6))
plt.scatter(X_test[:, 0], X_test[:, 1], c=y_pred_test, marker='^', s=80)
plt.scatter(X_train[:, 0], X_train[:, 1], c=y_train, s=80)
plt.axis('equal')
plt.axis('off')
plt.title(F'Decision Tree\n Triangles: Test Data, Circles: Training Data\nAccuracy: {dtc.score(X_test, y_test)}');

Let's visualize the decision boundary of the Decision Tree.

In [35]:
#
Z = dtc.predict(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)

plt.figure(figsize = (12, 6))
plt.subplot(1, 2, 1)
cs = plt.contourf(xx, yy, Z, cmap=plt.cm.Paired, alpha=0.3)
plt.scatter(X_train[:, 0], X_train[:, 1], c = y_train, s = 80)
plt.axis('equal')
plt.axis('off')
plt.xlim((x_min, x_max))
plt.ylim((y_min, y_max))
plt.title(f'Decision Tree - Training Data\nAccuracy: {dtc.score(X_train, y_train)}');

plt.subplot(1, 2, 2)
cs = plt.contourf(xx, yy, Z, cmap=plt.cm.Paired, alpha=0.3)
plt.scatter(X_test[:, 0], X_test[:, 1], c = y_pred_test, marker = '^', s = 80)
plt.axis('equal')
plt.axis('off')
plt.xlim((x_min, x_max))
plt.ylim((y_min, y_max))
plt.title(f'Decision Tree - Test Data\nAccuracy: {dtc.score(X_test, y_test)}');

Comparing k-NN and Decision Trees¶

It appears that k-NN and a Decision Tree have roughly comparable performance on this dataset.

However, there is a difference in interpretability.

Interpretability is a big deal! It means the ability to explain why the classifier made the decision it did.

It can be relatively difficult to explain why k-NN makes a specific prediction: the answer depends entirely on the training data that happens to lie in the neighborhood of the test point.

On the other hand, the Decision Tree can be easily understood.

We sometimes use the terms "black box" for an uninterpretable classifier like k-NN, and "white box" for an interpretable classifier like a Decision Tree.
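
Before drawing the tree as a diagram, a quick way to see this interpretability is scikit-learn's text export, which prints the fitted tree's decision rules as nested thresholds (a small sketch using the dtc classifier fit above):

In [ ]:
# print the learned splits as if/else rules on the two features
print(tree.export_text(dtc, feature_names = ['X', 'Y']))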

Let's see an example of the interpretability of the Decision Tree:

In [36]:
dot_data = tree.export_graphviz(dtc, out_file=None,
                         feature_names=['X','Y'],
                         class_names=['Red','Green'],
                         filled=True, rounded=True,  
                         special_characters=True) 
import pydotplus
graph = pydotplus.graph_from_dot_data(dot_data) 
# graph.write_pdf("dt.pdf") 
Image(graph.create_png())
Out[36]:

13.2 Comparing on Real Data¶

To explore a few more issues, we'll now turn to some famous datasets that have been extensively studied in the past.

The Iris Dataset¶

The Iris dataset is a famous dataset used by Ronald Fisher in a classic 1936 paper on classification.

Quoting from Wikipedia:

The data set consists of 50 samples from each of three species of Iris (Iris setosa, Iris virginica and Iris versicolor). Four features were measured from each sample: the length and the width of the sepals and petals, in centimetres. Based on the combination of these four features, Fisher developed a linear discriminant model to distinguish the species from each other.

In [13]:
#
display(Image("images/01-iris.png", width=600))
In [38]:
iris = datasets.load_iris()

X = iris.data
y = iris.target
ynames = iris.target_names
print(X.shape, y.shape)
print(X[1,:])
print(iris.target_names)
print(y)
(150, 4) (150,)
[4.9 3.  1.4 0.2]
['setosa' 'versicolor' 'virginica']
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2]

First, we'll explore setting hyperparameters.

We start with k-NN.

To set the hyperparameter k, we evaluate error on the test set for many train/test splits:

In [39]:
kvals = range(2, 20)
nreps = 50

acc = []
std = []
np.random.seed(0)
for k in kvals:
    test_rep = []
    train_rep = []
    for i in range(nreps):
        X_train, X_test, y_train, y_test = model_selection.train_test_split(
            X, y, test_size = 0.33)
        knn = KNeighborsClassifier(n_neighbors = k)    
        knn.fit(X_train, y_train)
        train_rep.append(knn.score(X_train, y_train))
        test_rep.append(knn.score(X_test, y_test))
    acc.append([np.mean(np.array(test_rep)), np.mean(np.array(train_rep))])
    std.append([np.std(np.array(test_rep)), np.std(np.array(train_rep))])
In [40]:
#
accy = np.array(acc)
stds = np.array(std)/np.sqrt(nreps)
plt.plot(kvals, accy[:, 0], '.-', label = 'Accuracy on Test Data')
plt.plot(kvals, accy[:, 1], '.-', label = 'Accuracy on Training Data')
plt.xlabel('k')
plt.ylabel('Accuracy')
plt.xticks(kvals)
plt.legend(loc = 'best');
print(f'Max Test Accuracy at k = {kvals[np.argmax(accy[:, 0])]} with accuracy {np.max(accy[:, 0]):.03f}')
Max Test Accuracy at k = 13 with accuracy 0.969

Now, it looks like k = 13 is the best-performing value of the hyperparameter.

Can we be sure?

Be careful! Each point in the above plot is the mean of 50 random train/test splits!

If we are going to be sure that k = 13 is best, then it should be statistically distinguishable from the other values.

To make this call, let's plot ±1σ confidence intervals on the mean values.
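
Each error bar shows the standard error of the mean accuracy over the nreps = 50 splits,

SE = σ / √nreps,

which is exactly what the line stds = np.array(std)/np.sqrt(nreps) computed in the cell above.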

In [41]:
#
plt.errorbar(kvals, accy[:, 0], stds[:, 0], label = 'Accuracy on Test Data')
plt.xlabel('k')
plt.ylabel('Accuracy')
plt.legend(loc = 'lower center')
plt.xticks(kvals)
plt.title(r'Test Accuracy with $\pm 1\sigma$ Errorbars');

It looks like k = 13 is a reasonable value, although a case can be made that k = 9 and k = 11 are not statistically distinguishable from k = 13.

To gain insight into the complexity of the model at k = 13, let's look at the decision boundary.

Note that we will re-run the classifier using only the first two (of the four) features, so we can visualize the boundary in two dimensions.

In [42]:
# Create color maps
from matplotlib.colors import ListedColormap
cmap_light = ListedColormap(['#FFAAAA', '#AAFFAA', '#AAAAFF'])
cmap_bold = ListedColormap(['#FF0000', '#00FF00', '#0000FF'])

# we will use only the first two (of four) features, so we can visualize
X = X_train[:, :2] 
h = .02  # step size in the mesh
k = 13
knn = KNeighborsClassifier(n_neighbors=k)
knn.fit(X, y_train)
# Plot the decision boundary. For that, we will assign a color to each
# point in the mesh [x_min, x_max]x[y_min, y_max].
x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                      np.arange(y_min, y_max, h))
Z = knn.predict(np.c_[xx.ravel(), yy.ravel()])

# Put the result into a color plot
Z = Z.reshape(xx.shape)
plt.figure()
plt.pcolormesh(xx, yy, Z, cmap=cmap_light, shading='auto')

# Plot also the training points
plt.scatter(X[:, 0], X[:, 1], c=y_train, cmap=cmap_bold)
plt.xlim(xx.min(), xx.max())
plt.ylim(yy.min(), yy.max())
plt.title(f"3-Class $k$-NN classification ($k$ = {k})");

There are a few artifacts, but overall this looks like a reasonably smooth set of decision boundaries.

Now we'll compare to a decision tree.

How do we control the complexity of a Decision Tree?

There are a variety of ways (see the sklearn documentation) but the simplest one is to control the number of leaf nodes in the tree.

A small number of leaf nodes is a low-complexity model, and a large number of nodes is a high-complexity model.

In [43]:
X = iris.data
y = iris.target
In [44]:
leaf_vals = range(3, 20)
nreps = 50

acc = []
std = []
np.random.seed(0)
for leaf_count in leaf_vals:
    test_rep = []
    train_rep = []
    for i in range(nreps):
        X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, test_size = 0.10)
        dtc = tree.DecisionTreeClassifier(max_leaf_nodes = leaf_count)   
        dtc.fit(X_train, y_train)
        train_rep.append(dtc.score(X_train, y_train))
        test_rep.append(dtc.score(X_test, y_test))
    acc.append([np.mean(np.array(test_rep)), np.mean(np.array(train_rep))])
    std.append([np.std(np.array(test_rep)), np.std(np.array(train_rep))])
accy = np.array(acc)
stds = np.array(std)/np.sqrt(nreps)
In [45]:
#
plt.plot(leaf_vals, accy[:, 0], '.-', label = 'Accuracy on Test Data')
plt.plot(leaf_vals, accy[:, 1], '.-', label = 'Accuracy on Training Data')
plt.xlabel('Max Leaf Nodes')
plt.ylabel('Accuracy')
plt.legend(loc = 'best')
plt.xticks(leaf_vals)
best_leaf = leaf_vals[np.argmax(accy[:, 0])]
plt.title(f'Test/Train Error for Decision Tree\nMax Test Accuracy at {best_leaf} leaf nodes with accuracy {np.max(accy[:, 0]):.03f}');
In [46]:
#
plt.errorbar(leaf_vals, accy[:, 0], stds[:, 0], label = 'Accuracy on Test Data')
plt.xlabel('Max Leaf Nodes')
plt.ylabel('Accuracy')
plt.legend(loc = 'lower center')
plt.xticks(leaf_vals)
plt.title(r'Test Accuracy with $\pm 1\sigma$ Errorbars');

It looks like 9 leaf nodes is appropriate, but we would be justified in choosing 4 or 13 as well.

And now let's visualize the decision boundary for the DT:

In [47]:
# we will use only the first two (of four) features, so we can visualize
X = X_train[:, :2] 
h = .02  # step size in the mesh
dtc = tree.DecisionTreeClassifier(max_leaf_nodes = best_leaf) 
dtc.fit(X, y_train)
# Plot the decision boundary. For that, we will assign a color to each
# point in the mesh [x_min, x_max]x[y_min, y_max].
x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                      np.arange(y_min, y_max, h))
Z = dtc.predict(np.c_[xx.ravel(), yy.ravel()])

# Put the result into a color plot
Z = Z.reshape(xx.shape)
plt.figure()
plt.pcolormesh(xx, yy, Z, cmap=cmap_light, shading='auto')

# Plot also the training points
plt.scatter(X[:, 0], X[:, 1], c=y_train, cmap=cmap_bold)
plt.xlim(xx.min(), xx.max())
plt.ylim(yy.min(), yy.max())
plt.title(f"3-Class DT Classification\n{best_leaf} leaf nodes");

MNIST dataset¶

NIST, the US National Institute of Standards and Technology, is the agency that brings you the reference meter, the reference kilogram, and so on.

In [3]:
#
display(Image("images/13-reference-pb.jpg", width=600))

NIST constructed datasets for machine learning of handwritten digits. These were collected from Census Bureau employees and also from high-school students.

These data have been used repeatedly for many years to evaluate classifiers.

In [4]:
import sklearn.utils as utils
import sklearn.datasets as datasets

digits = datasets.load_digits()
X, y = utils.shuffle(digits.data, digits.target, random_state = 1)

print ('Data shape: {}'.format(X.shape))
print ('Data labels: {}'.format(y))
print ('Unique labels: {}'.format(digits.target_names))
Data shape: (1797, 64)
Data labels: [1 5 0 ... 9 1 5]
Unique labels: [0 1 2 3 4 5 6 7 8 9]

An individual item is an 8×8 image, encoded as a matrix:

In [49]:
digits.images[3]
Out[49]:
array([[ 0.,  0.,  7., 15., 13.,  1.,  0.,  0.],
       [ 0.,  8., 13.,  6., 15.,  4.,  0.,  0.],
       [ 0.,  2.,  1., 13., 13.,  0.,  0.,  0.],
       [ 0.,  0.,  2., 15., 11.,  1.,  0.,  0.],
       [ 0.,  0.,  0.,  1., 12., 12.,  1.,  0.],
       [ 0.,  0.,  0.,  0.,  1., 10.,  8.,  0.],
       [ 0.,  0.,  8.,  4.,  5., 14.,  9.,  0.],
       [ 0.,  0.,  7., 13., 13.,  9.,  0.,  0.]])

Let's show the matrix as an image:

In [50]:
plt.gray() 
# plt.rc('axes', grid = False);
plt.matshow(digits.images[3], cmap = plt.cm.gray_r);
<Figure size 432x288 with 0 Axes>

It is easier to visualize if we blur the pixels a little bit.

In [51]:
plt.rc('image', cmap = 'binary', interpolation = 'bilinear')
plt.figure(figsize = (4, 4))
plt.axis('off')
plt.imshow(digits.images[3]);

Here are some more samples from the dataset:

In [52]:
for t in range(4):
    plt.figure(figsize = (8, 2))
    for j in range(4):
        plt.subplot(1, 4, 1 + j)
        plt.imshow(X[4*t + j].reshape(8, 8))
        plt.axis('off');

Although this is an 8 × 8 image, we can just treat it as a vector of length 64.
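
As a quick check (a small sketch using the digits object loaded above), the length-64 feature vector stored in digits.data is just the 8 × 8 image flattened row by row:

In [ ]:
# each row of digits.data is the corresponding 8 x 8 image, flattened
print(np.array_equal(digits.images[3].ravel(), digits.data[3]))   # True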

To do model selection, we will again average over many train/test splits.

However, recall one issue of k-NN: it can be slow on a large training set (why?).

We may thus decide to limit the size of our testing set to speed up testing.

How does the train/test split affect results?

Let's consider two cases:

  1. Train: 90% of data, Test: 10% of data
  2. Train: 67% of data, Test: 33% of data
In [5]:
import time
import sklearn.model_selection as model_selection
from sklearn.neighbors import KNeighborsClassifier
def test_knn(kvals, test_fraction, nreps):
    acc = []
    std = []
    np.random.seed(0)
    #
    for k in kvals:
        test_rep = []
        train_rep = []
        for i in range(nreps):
            X_train, X_test, y_train, y_test = model_selection.train_test_split(
                X, y, test_size = test_fraction)
            knn = KNeighborsClassifier(n_neighbors = k)    
            knn.fit(X_train, y_train)
            test_rep.append(knn.score(X_test, y_test))
        acc.append(np.mean(np.array(test_rep)))
        std.append(np.std(np.array(test_rep)))
    return(np.array(acc), np.array(std)/np.sqrt(nreps))

test_fraction1 = 0.33
start = time.time()
accy1, stds1 = test_knn(range(2, 20), test_fraction1, 50)
end = time.time()
print("time to train/test on 66% of the dataset: ", end-start)

test_fraction2 = 0.10
start = time.time()
accy2, stds2 = test_knn(range(2, 20), test_fraction2, 50)
end = time.time()
print("time to train/test on 90% of the dataset: ", end-start)
time to train/test on 67% of the dataset:  8.791742086410522
time to train/test on 90% of the dataset:  4.361397981643677
In [54]:
#
plt.figure(figsize = (7, 5))
plt.errorbar(kvals, accy1, stds1, 
             label = f'{test_fraction1:.0%} Used for Testing; accuracy {np.max(accy1):.03f}')
plt.errorbar(kvals, accy2, stds2, 
             label = f'{test_fraction2:.0%} Used for Testing; accuracy {np.max(accy2):.03f}')
plt.xlabel('k')
plt.ylabel('Accuracy')
plt.legend(loc = 'best')
plt.xticks(kvals)
best_k = kvals[np.argmax(accy1)]
plt.title(r'Test Accuracy with $\pm 1\sigma$ Error Bars');

These plots illustrate important principles:

  • The more data used for training, the better the classifier will tend to perform
  • The less data used for testing, the more variable the test results will be (but the faster testing will go)

Note that the key decision here is what value to choose for k. So it makes sense to use the 33% test split, because the smaller error bars give us better confidence in our decision.

We can get a sense of why k-NN can succeed at this task by looking at the nearest neighbors of some points:

In [55]:
knn = KNeighborsClassifier(n_neighbors = 3)    
knn.fit(X, y)
neighbors = knn.kneighbors(X[:3,:], n_neighbors=3, return_distance=False)
In [56]:
#
plt.rc("image", cmap="binary")  # this sets a black on white colormap
# plot X_digits_valid[0]
for t in range(3):
    plt.figure(figsize=(8,2))
    plt.subplot(1, 4, 1)
    plt.imshow(X[t].reshape(8, 8))
    plt.axis('off')
    plt.title("Query")
    # plot three nearest neighbors from the training set
    for i in [0, 1, 2]:
        plt.subplot(1, 4, 2 + i)
        plt.title("neighbor {}".format(i))
        plt.imshow(X[neighbors[t, i]].reshape(8, 8))
        plt.axis('off')