#
import numpy as np
import scipy as sp
import pandas as pd
import matplotlib as mp
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
import laUtilities as ut
import slideUtilities as sl
import demoUtilities as dm
from matplotlib import animation
from importlib import reload
from datetime import datetime
from IPython.display import Image, display_html, display, Math, HTML;
qr_setting = None
mp.rcParams['animation.html'] = 'jshtml';
To describe the orbit of Ceres, Gauss had to construct the equation of its ellipse, which is a conic of the form

$$ax^2 + bxy + cy^2 + dx + ey + f = 0.$$

He had many measurements of $(x, y)$ pairs and had to find the coefficients $a, b, c, d, e, f$. Since each measurement $(x_i, y_i)$ gives one equation that is linear in the unknown coefficients, this is actually a linear system.
According to Newton, this is a consistent linear system. But there are two unappealing options for solving it.
Take only six independent measurements and try to solve the resulting square system exactly. We will likely get a unique solution, but that solution will likely overfit the measurement errors in those six samples.
Take a larger number of measurements and construct a system using all of them. But due to measurement errors, it is unlikely that a single ellipse will pass through all of them. We now have an inconsistent system with no solutions.
The key idea is: the fact that our measurements include small, unbiased errors does not make them worthless! We simply need a principled approach to doing the best job we can given the errors in our measurements.
If a system of equations $A\mathbf{x} = \mathbf{b}$ has no solutions, that is because $\mathbf{b}$ is not in the column space of $A$.
So let's find the closest vector $\hat{\mathbf{b}}$ that is in the column space of $A$, and find the solution to the equation $A\mathbf{x} = \hat{\mathbf{b}}$.
(Note that there is a unique closest vector $\hat{\mathbf{b}}$, although there might be multiple solutions $\hat{\mathbf{x}}$ such that $A\hat{\mathbf{x}} = \hat{\mathbf{b}}$.)
#
fig = ut.three_d_figure((24, 3), 'Least Squares looks for the Closest Point in Col A',
-15, 15, -15, 15, -15, 15, (8, 8), qr = qr_setting)
a2 = np.array([5.0,-13.0,-3.0])
a1 = np.array([1,-2.0,3])
v = -3*a1 + a2
y = np.array([12.0, 8.0, -5.0])
A = np.array([a1, a2]).T
#
# plot Col A, the span of a1 and a2
fig.plotSpan(a1, a2, 'Green')
#
yhat = A @ np.linalg.inv(A.T @ A) @ A.T @ y
fig.plotPoint(yhat[0], yhat[1], yhat[2], 'b')
fig.text(yhat[0]+0.5, yhat[1]+0.5, yhat[2]+0.5, r'$\mathbf{A\hat{x}}$', 'Ax-hat', size=18)
#
fig.plotPoint(y[0], y[1], y[2],'b')
fig.text(y[0]+0.5, y[1]+0.5, y[2]+0.5, r'$\bf b$', 'b', size=18)
#
# origin seems unhelpful
#fig.plotPoint(0, 0, 0, 'b')
#fig.text(0-1.5, 0-1.5, 0-1.5, r'$\bf 0$', '0', size=18)
#
x1 = y + np.array([8, 8, 8])
x1hat = A @ np.linalg.inv(A.T @ A) @ A.T @ x1
fig.plotPoint(x1hat[0], x1hat[1], x1hat[2], 'r')
#
x2 = y + np.array([-4, -6, 6])
x2hat = A @ np.linalg.inv(A.T @ A) @ A.T @ x2
fig.plotPoint(x2hat[0], x2hat[1], x2hat[2], 'r')
#
wpos = -4.5*a1
fig.text(wpos[0], wpos[1], wpos[2], r'Col $A$', 'Col A', size=22)
#
# lines
fig.plotLine([y, yhat], 'b', '-')
fig.plotLine([y, x1hat], 'r', '--')
fig.plotLine([y, x2hat], 'r', '--')
#ut.plotPoint3d(ax,0,0,0,'b')
fig.ax.view_init(azim=100,elev=-20.0)
fig.hideAxes()
fig.set_title(r'Least Squares looks for the Closest Point in Col $A$', 'Least Squares looks for the Closest Point in Col A', size = 16)
fig.save();
If $A$ is $m \times n$ and $\mathbf{b}$ is in $\mathbb{R}^m$, then the general least-squares problem is to find an $\hat{\mathbf{x}}$ that makes $\Vert \mathbf{b} - A\hat{\mathbf{x}} \Vert$ as small as possible. Equivalently, our goal is to minimize the sum of squared error:

$$\Vert \mathbf{b} - A\mathbf{x} \Vert^2 = \sum_{i=1}^m \left( b_i - (A\mathbf{x})_i \right)^2,$$

where $A\mathbf{x}$ contains the estimated values and $\mathbf{b}$ contains the measured values.
The best estimate $\hat{\mathbf{b}} = A\hat{\mathbf{x}}$ must be the orthogonal projection of $\mathbf{b}$ onto the subspace $\operatorname{Col} A$. That is the closest vector in $\operatorname{Col} A$ to the measured value $\mathbf{b}$.
Any other vector $\mathbf{v}$ in $\operatorname{Col} A$ will be farther away, because its distance to $\mathbf{b}$ is the hypotenuse of a right triangle with legs $\Vert \mathbf{b} - \hat{\mathbf{b}} \Vert$ and $\Vert \hat{\mathbf{b}} - \mathbf{v} \Vert$.
#
fig = ut.three_d_figure((24, 5), 'Comparison of v to the projection of b',
-15, 15, -15, 15, -15, 15, (9,9), qr = qr_setting)
a2 = np.array([5.0,-13.0,-3.0])
a1 = np.array([1,-2.0,3])
v = -3*a1 + a2
y = np.array([12.0, 8.0, -5.0])
A = np.array([a1, a2]).T
#
# plot Col A, the span of a1 and a2
fig.plotSpan(a1, a2, 'Green')
#
yhat = A.dot(np.linalg.inv(A.T.dot(A))).dot(A.T).dot(y)
fig.plotPoint(yhat[0], yhat[1], yhat[2], 'b')
fig.text(yhat[0]+0.5, yhat[1]+0.5, yhat[2]+0.5, r'$\mathbf{\hat{b}}$', 'y-hat', size=18)
#
fig.plotPoint(y[0], y[1], y[2],'b')
fig.text(y[0]+0.5, y[1]+0.5, y[2]+0.5, r'$\bf b$', 'y', size=18)
#
fig.plotPoint(v[0], v[1], v[2], 'b')
fig.text(v[0]-1.5, v[1]-1.5, v[2]-1.5, r'$\bf v$', 'v', size=18)
#
m1 = (y + v) / 2
m1 = m1 + [2,-6,0]
fig.text(m1[0], m1[1], m1[2], r'$\Vert \bf b - \bf v\Vert$', '||b - v||', size=18)
m2 = (y + yhat) / 2
m2 = m2 + [0,1,0]
fig.text(m2[0], m2[1], m2[2], r'$\Vert \bf b - \hat{\bf b}\Vert$', '||b - b-hat||', size=18)
m3 = (yhat + v) / 2
m3 = m3 + [-1,0,-4]
fig.text(m3[0], m3[1], m3[2], r'$\Vert \hat{\bf b} - \bf v\Vert$', '||b-hat - v||', size=18)
#
wpos = -4.5*a1
fig.text(wpos[0], wpos[1], wpos[2], r'$W$', 'W', size=22)
#
# lines
fig.plotLine([y, yhat], 'b')
fig.plotLine([v, yhat], 'b')
fig.plotLine([y, v], 'b')
#ut.plotPoint3d(ax,0,0,0,'b')
fig.ax.view_init(azim=57.0,elev=-69.0)
fig.hideAxes()
fig.set_title(r'Relationship of $\mathbf{b}$ to $\hat{\mathbf{b}} = \operatorname{Proj}_W \mathbf{b}$',
'Relationship of b Proj_W b', size = 16)
qrcode = fig.save();
So: how are we going to find this projection $\hat{\mathbf{b}}$?
Here is the key idea:
We know that the projection $\hat{\mathbf{b}}$ has the property that $\mathbf{b} - \hat{\mathbf{b}}$ is orthogonal to $\operatorname{Col} A$.
Suppose $A$ is $m \times n$ and that $\hat{\mathbf{x}}$ satisfies $A\hat{\mathbf{x}} = \hat{\mathbf{b}}$.
So $\mathbf{b} - A\hat{\mathbf{x}}$ is orthogonal to each column of $A$.
If $\mathbf{a}_j$ is any column of $A$, then

$$\mathbf{a}_j^T(\mathbf{b} - A\hat{\mathbf{x}}) = 0.$$

Now, each $\mathbf{a}_j^T$ is a row of $A^T$.
We can collect all of the equations for all the $\mathbf{a}_j$ as:

$$A^T(\mathbf{b} - A\hat{\mathbf{x}}) = \mathbf{0}.$$

So

$$A^T\mathbf{b} - A^TA\hat{\mathbf{x}} = \mathbf{0}.$$

So

$$A^TA\hat{\mathbf{x}} = A^T\mathbf{b}.$$

Looking at this, we see that $A^T\mathbf{b}$ is a vector, and $A^TA$ is a matrix, so this is a standard linear system.
This linear system is called the normal equations for $A\mathbf{x} = \mathbf{b}$.
Its solution is usually denoted $\hat{\mathbf{x}}$.
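As a quick illustration (not part of the lecture's example), here is a minimal numpy sketch that forms and solves the normal equations for a small made-up $A$ and $\mathbf{b}$, and checks the answer against numpy's built-in least-squares solver:
#
import numpy as np
# a small made-up example: A is 3 x 2 with independent columns, b is in R^3
A = np.array([[1.0, 1.0],
              [1.0, 2.0],
              [1.0, 3.0]])
b = np.array([1.0, 2.0, 2.0])
# solve the normal equations  A^T A x_hat = A^T b
x_hat = np.linalg.solve(A.T @ A, A.T @ b)
# numpy's built-in least-squares solver should agree
x_lstsq, *_ = np.linalg.lstsq(A, b, rcond=None)
print(x_hat)                         # the least-squares solution
print(np.allclose(x_hat, x_lstsq))   # True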
Theorem. The set of least-squares solutions of $A\mathbf{x} = \mathbf{b}$ is equal to the (nonempty) set of solutions of the normal equations $A^TA\hat{\mathbf{x}} = A^T\mathbf{b}$.
Proof.
(1) The set of solutions of the normal equations is nonempty: the matrix $A^TA$ on the left has the same column space as $A^T$, and the vector $A^T\mathbf{b}$ on the right is a vector in the column space of $A^T$.
And, by the arguments above, any least-squares solution of $A\mathbf{x} = \mathbf{b}$ must satisfy the normal equations $A^TA\hat{\mathbf{x}} = A^T\mathbf{b}$.
(2) Now let's show that any solution of $A^TA\hat{\mathbf{x}} = A^T\mathbf{b}$ is a least-squares solution of $A\mathbf{x} = \mathbf{b}$.
If $\hat{\mathbf{x}}$ satisfies $A^TA\hat{\mathbf{x}} = A^T\mathbf{b}$, then

$$A^T(\mathbf{b} - A\hat{\mathbf{x}}) = \mathbf{0},$$

which shows that $\mathbf{b} - A\hat{\mathbf{x}}$ is orthogonal to the rows of $A^T$, and so is orthogonal to the columns of $A$.
So the vector $\mathbf{b} - A\hat{\mathbf{x}}$ is orthogonal to $\operatorname{Col} A$.
So the equation

$$\mathbf{b} = A\hat{\mathbf{x}} + (\mathbf{b} - A\hat{\mathbf{x}})$$

is a decomposition of $\mathbf{b}$ into the sum of a vector in $\operatorname{Col} A$ and a vector orthogonal to $\operatorname{Col} A$.
Since the orthogonal decomposition is unique, $A\hat{\mathbf{x}}$ must be the orthogonal projection of $\mathbf{b}$ onto the column space of $A$.
So $A\hat{\mathbf{x}} = \hat{\mathbf{b}}$, and $\hat{\mathbf{x}}$ is a least-squares solution.
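The orthogonality property at the heart of this proof is easy to check numerically. The sketch below uses randomly generated (made-up) $A$ and $\mathbf{b}$, not data from the lecture:
#
import numpy as np
# the residual of the least-squares solution should be orthogonal
# to every column of A
rng = np.random.default_rng(0)
A = rng.normal(size=(6, 3))
b = rng.normal(size=6)
x_hat, *_ = np.linalg.lstsq(A, b, rcond=None)
residual = b - A @ x_hat
# A^T (b - A x_hat) should be (numerically) zero
print(A.T @ residual)
print(np.allclose(A.T @ residual, 0))   # True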
Example. Find the least-squares solution of the inconsistent system $A\mathbf{x} = \mathbf{b}$ for the given $A$ and $\mathbf{b}$.
Solution.
We will use the normal equations $A^TA\hat{\mathbf{x}} = A^T\mathbf{b}$.
So the normal equations are formed by computing $A^TA$ and $A^T\mathbf{b}$.
We can solve the resulting system using row operations, or by inverting $A^TA$ (when it is invertible).
And we can then solve for the solution as $\hat{\mathbf{x}} = (A^TA)^{-1}A^T\mathbf{b}$.
So we conclude that $\hat{\mathbf{x}}$ is the vector that minimizes $\Vert \mathbf{b} - A\mathbf{x} \Vert$.
More formally,

$$\hat{\mathbf{x}} = \arg\min_{\mathbf{x}} \Vert \mathbf{b} - A\mathbf{x} \Vert.$$

That is, $\hat{\mathbf{x}}$ is the least-squares solution of $A\mathbf{x} = \mathbf{b}$.
We have seen that the normal equations always have a solution.
Is there always a unique solution?
No, there can be multiple solutions $\hat{\mathbf{x}}$ that all minimize $\Vert \mathbf{b} - A\mathbf{x} \Vert$.
Let's remind ourselves of what is going on when a linear system has multiple solutions.
We know that a linear system $A\mathbf{x} = \mathbf{b}$ has multiple solutions when there are columns of $A$ that are not pivot columns.
Equivalently, when $A\mathbf{x} = \mathbf{b}$ has multiple solutions, the columns of $A$ are linearly dependent.
Here is a picture of what is going on. In this case, $A$ is $3 \times 3$.
But note that $\operatorname{Col} A$ is only two-dimensional, because the three columns are linearly dependent.
#
fig = ut.three_d_figure((24, 9), 'Multiple Solutions to the Normal Equations',
-15, 15, -15, 15, -15, 15, (7,7), qr = qr_setting)
a2 = np.array([5.0,-13.0,-3.0])
a1 = np.array([1,-2.0,3])
a3 = -2*a1 + a2
b = np.array([6.0, 8.0, -5.0])
A = np.array([a1, a2]).T
bhat = A.dot(np.linalg.inv(A.T.dot(A))).dot(A.T).dot(b)
fig.text(a1[0], a1[1], a1[2], r'$\bf a_1$', 'a_1', size=20)
fig.text(a2[0], a2[1], a2[2], r'$\bf a_2$', 'a_2', size=20)
fig.text(a3[0], a3[1], a3[2], r'$\bf a_3$','a_3', size=20)
fig.text(b[0], b[1], b[2], r'$\bf b$', 'b', size=20)
fig.text(bhat[0], bhat[1], bhat[2], r'$A\mathbf{\hat{x}} = \mathbf{\hat{b}}$', 'b-hat', size=20)
#ax.text(1,-4,-10,r'Span{$\bf a,b$}',size=16)
#ax.text(0.2,0.2,-4,r'$\bf 0$',size=20)
# plot Col A, the span of a1 and a2
fig.plotSpan(a1, a2, 'Green')
fig.plotPoint(a1[0], a1[1], a1[2], 'r')
fig.plotPoint(a2[0], a2[1], a2[2], 'r')
fig.plotPoint(a3[0], a3[1], a3[2], 'r')
fig.plotPoint(b[0], b[1], b[2], 'b')
fig.plotPoint(bhat[0], bhat[1], bhat[2], 'b')
fig.plotLine([b, bhat], 'b', '--')
#ut.plotPoint3d(ax,0,0,0,'b')
fig.set_title(r'When there are Multiple Solutions to the Normal Equations',
'When there are Multiple Solutions to the Normal Equations', size=20)
fig.ax.view_init(azim=26.0,elev=-77.0)
fig.save();
Example.
Find a least-squares solution of $A\mathbf{x} = \mathbf{b}$ for the given $A$ and $\mathbf{b}$.
Solution. Compute $A^TA$ and $A^T\mathbf{b}$.
To solve $A^TA\hat{\mathbf{x}} = A^T\mathbf{b}$ we'll use row reduction. The augmented matrix is $\left[\, A^TA \;\; A^T\mathbf{b} \,\right]$.
In the reduced echelon form, the coefficient part has four columns, but only the first three are pivot columns; the fourth can be written as a linear combination of the first three.
Hence, the columns of $A^TA$ are linearly dependent. This happens because the columns of $A$ are linearly dependent.
You can see this as follows: if $A$ has a nontrivial null space, then $A^TA$ also has a nontrivial null space.
So there is a free variable.
The general solution expresses $x_1$, $x_2$, and $x_3$ in terms of $x_4$, with $x_4$ free.
So the general least-squares solution of $A\mathbf{x} = \mathbf{b}$ has the form of one particular solution plus an arbitrary multiple of a vector in the null space of $A$.
Keep in mind that the orthogonal projection $\hat{\mathbf{b}}$ is always unique.
The reason that there are multiple solutions to this least-squares problem is that there are multiple ways to construct $\hat{\mathbf{b}}$.
The reason that there are multiple ways to construct $\hat{\mathbf{b}}$ is that the columns of $A$ are linearly dependent, so any vector in the column space of $A$ can be written as a combination of the columns in multiple ways.
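Here is a small sketch of this situation, using a made-up matrix whose third column is the sum of the first two (so $A^TA$ is not invertible). `np.linalg.lstsq` still returns one least-squares solution, and adding any null-space vector of $A$ gives another solution with the same projection $\hat{\mathbf{b}}$:
#
import numpy as np
# made-up example: the third column equals col1 + col2, so the columns
# of A are linearly dependent and A^T A is singular
A = np.array([[1.0, 0.0, 1.0],
              [1.0, 1.0, 2.0],
              [1.0, 2.0, 3.0],
              [1.0, 3.0, 4.0]])
b = np.array([1.0, 0.0, 2.0, 1.0])
print(np.linalg.matrix_rank(A))        # 2, not 3
print(np.linalg.det(A.T @ A))          # (numerically) 0 -- cannot invert A^T A
# lstsq still works: it returns one particular least-squares solution
x_hat, *_ = np.linalg.lstsq(A, b, rcond=None)
# adding any vector in the null space of A gives another least-squares solution
null_vec = np.array([1.0, 1.0, -1.0])
print(np.allclose(A @ null_vec, 0))    # True: null_vec is in Nul A
x_other = x_hat + 2 * null_vec
print(np.allclose(A @ x_hat, A @ x_other))   # True: same projection b_hat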
Here is a theorem that allows us to identify when there are multiple least-squares solutions.
Theorem. Let $A$ be an $m \times n$ matrix. The following statements are equivalent:
1. The equation $A\mathbf{x} = \mathbf{b}$ has a unique least-squares solution for each $\mathbf{b}$ in $\mathbb{R}^m$.
2. The columns of $A$ are linearly independent.
3. The matrix $A^TA$ is invertible.
When these statements are true, the least-squares solution is given by:

$$\hat{\mathbf{x}} = (A^TA)^{-1}A^T\mathbf{b}.$$

Finding $\hat{\mathbf{b}}$ directly.
When $A^TA$ is invertible, and so $\hat{\mathbf{x}}$ is unique, we can put together the two equations

$$\hat{\mathbf{b}} = A\hat{\mathbf{x}}$$

and

$$\hat{\mathbf{x}} = (A^TA)^{-1}A^T\mathbf{b}$$

to get:

$$\hat{\mathbf{b}} = A(A^TA)^{-1}A^T\mathbf{b}.$$

Let's stop and look at this another way. Up until now we have seen how to project a point onto a line, or onto a subspace with an orthogonal basis.
Now here is a formula for projection onto a subspace (namely $\operatorname{Col} A$) given an arbitrary basis for it (the columns of $A$). This is a general formula that can be very useful.
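As a sanity check on this formula, here is a short sketch with a made-up $A$ (not from the lecture); it builds $P = A(A^TA)^{-1}A^T$ and verifies the properties we expect of an orthogonal projection onto $\operatorname{Col} A$:
#
import numpy as np
# made-up A with linearly independent columns
A = np.array([[1.0, 0.0],
              [1.0, 1.0],
              [1.0, 2.0]])
# projection matrix onto Col A
P = A @ np.linalg.inv(A.T @ A) @ A.T
print(np.allclose(P @ P, P))        # True: projecting twice changes nothing
print(np.allclose(P, P.T))          # True: orthogonal projections are symmetric
# a vector already in Col A is unchanged; a general vector is moved into Col A
v_in = A @ np.array([2.0, -1.0])
b = np.array([1.0, 0.0, 2.0])
print(np.allclose(P @ v_in, v_in))  # True
print(P @ b)                        # b_hat, the closest point in Col A to b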
[This lecture is based on Prof. Crovella's CS 132 lecture notes.]
Here is a graph showing the number of BU computer science majors over many years, from 2004 to 2017. We'd like to have some estimate of where the numbers might be going.
#
cohortMatrix = np.array([
[56, 42, 49, 39, 40, 31, 33, 46, 55, 91, 83, 129, 153, 178],
[43, 36, 27, 34, 24, 29, 39, 56, 74, 69, 111, 136, 168, 171],
[32, 24, 22, 21, 26, 25, 44, 64, 52, 77, 105, 130, 139, 175],
[25, 16, 19, 28, 24, 30, 37, 40, 49, 56, 79, 93, 90, 126]])
nyears = np.shape(cohortMatrix)[1]
# index rows by time, and columns by cohort
Year = pd.DateOffset(years=1)
# need to fliplr because spreadsheet rows are in decreasing cohort order
datestrs=['09-2004','09-2005','09-2006','09-2007','09-2008','09-2009','09-2010','09-2011','09-2012','09-2013','09-2014','09-2015','09-2016','09-2017']
dates = [datetime.strptime(x, '%m-%Y') for x in datestrs[:nyears]]
cohorts = pd.DataFrame(np.fliplr(cohortMatrix.T),index=dates,columns=pd.Index(['U1','U2','U3','U4']))
# learning the model
b = np.array(cohorts[1:])
a = np.array(cohorts[:-1])
x, resid, rank, s = np.linalg.lstsq(a,b,rcond=None)
A = x.T
#
cohorts.sum(axis=1).plot()
plt.ylim(ymin=0)
# plt.legend(['Model','Actual'],loc='best')
plt.title('Number of BU CS Majors',size=20);
Modeling the number of students in the CS major is challenging because students enter and leave the major at various points during their undergraduate degree.
Here is the raw data of the number of declared CS majors, broken down into 1st, 2nd, 3rd, and 4th year students.
cohortMatrix = np.array([
[56, 42, 49, 39, 40, 31, 33, 46, 55, 91, 83, 129, 153, 178],
[43, 36, 27, 34, 24, 29, 39, 56, 74, 69, 111, 136, 168, 171],
[32, 24, 22, 21, 26, 25, 44, 64, 52, 77, 105, 130, 139, 175],
[25, 16, 19, 28, 24, 30, 37, 40, 49, 56, 79, 93, 90, 126]])
We will use a simple linear model that we will train on historical data.
Our state variable is

$$\mathbf{x}_t = \begin{bmatrix} x_t^{(1)} \\ x_t^{(2)} \\ x_t^{(3)} \\ x_t^{(4)} \end{bmatrix},$$

where $x_t^{(i)}$ is the number of students who are declared CS majors and in their $i$th year at BU, in the fall of year $t$. Our model is a linear dynamical system (or linear recurrence):

$$\mathbf{x}_{t+1} = A\mathbf{x}_t.$$

Given historical data, which are measurements of our state variable from past years, we now know how to estimate $A$ using least squares!
# learning the model
cohort = pd.DataFrame(np.fliplr(cohortMatrix.T))
b = np.array(cohort[1:])
a = np.array(cohort[:-1])
print(a)
[[ 25  32  43  56]
 [ 16  24  36  42]
 [ 19  22  27  49]
 [ 28  21  34  39]
 [ 24  26  24  40]
 [ 30  25  29  31]
 [ 37  44  39  33]
 [ 40  64  56  46]
 [ 49  52  74  55]
 [ 56  77  69  91]
 [ 79 105 111  83]
 [ 93 130 136 129]
 [ 90 139 168 153]]
print(b)
[[ 16  24  36  42]
 [ 19  22  27  49]
 [ 28  21  34  39]
 [ 24  26  24  40]
 [ 30  25  29  31]
 [ 37  44  39  33]
 [ 40  64  56  46]
 [ 49  52  74  55]
 [ 56  77  69  91]
 [ 79 105 111  83]
 [ 93 130 136 129]
 [ 90 139 168 153]
 [126 175 171 178]]
x, resid, rank, s = np.linalg.lstsq(a,b,rcond=None)
A = x.T
print(A)
[[ 0.62110633  0.28792787  0.0204491   0.10003897]
 [ 0.65239203  0.55574243  0.30323349 -0.16349735]
 [ 0.33101614  1.24636712 -0.26153545  0.07684781]
 [ 0.49319575 -0.30684656  1.00419585  0.07532737]]
Now, using $A$, we can predict growth into the future via $\mathbf{x}_{t+1} = A\mathbf{x}_t$:
OneYearAhead = cohorts.dot(A.T).shift(1,freq=Year)
OneYearAhead.columns=pd.Index(['U1','U2','U3','U4'])
OneYearAhead.sum(axis=1).plot()
cohorts.sum(axis=1).plot()
plt.ylim(ymin=0)
plt.legend(['Model','Actual'],loc='best');
Now, the matrix $A$ captures the complex way that one year's student population relates to the next year's population.
It can be hard to understand what is going on just by looking at the values of $A$.
But, when we look at the plot above, it appears that there is a relatively simple kind of exponential growth going on.
One of the most fundamental kinds of machine learning is the construction of a model that can be used to summarize a set of data.
A model is a concise description of a dataset or a real-world phenomenon.
For example, an equation can be a model if we use the equation to describe something in the real world.
The most common form of modeling is regression, which means constructing an equation that describes the relationships among variables.
For example, we may look at these points and observe that they approximately lie on a line.
So we could decide to model this data using a line.
#
ax = ut.plotSetup(-10, 10, -10, 10, size = (7, 7))
ut.centerAxes(ax)
line = np.array([1, 0.5])
xlin = -10.0 + 20.0 * np.random.random(100)
ylin = line[0] + (line[1] * xlin) + np.random.normal(scale = 1.5, size = 100)
ax.plot(xlin, ylin, 'ro', markersize=6);
We may look at these points and decide to model them using a quadratic function.
#
ax = ut.plotSetup(-10, 10, -10, 20, size = (7, 7))
ut.centerAxes(ax)
quad = np.array([1, 3, 0.5])
xquad = -10.0 + 20.0 * np.random.random(100)
yquad = quad[0] + (quad[1] * xquad) + (quad[2] * xquad * xquad) + np.random.normal(scale = 1.5, size = 100)
ax.plot(xquad, yquad, 'ro', markersize=6);
And we may look at these points and decide to model them using a logarithmic function.
#
ax = ut.plotSetup(-10, 10, -10, 15, size = (7, 7))
ut.centerAxes(ax)
log = np.array([1, 4])
xlog = 10.0 * np.random.random(100)
ylog = log[0] + log[1] * np.log(xlog) + np.random.normal(scale = 1.5, size = 100)
ax.plot(xlog, ylog, 'ro', markersize=6);
Clearly, none of these datasets agrees perfectly with the proposed model. So the question arises:
How do we find the best linear function (or quadratic function, or logarithmic function) given the data?
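As a preview of the answer developed below, here is a minimal sketch that fits the quadratic model by least squares, regenerating data the same way as the quadratic cell above (true parameters 1, 3, 0.5); the design-matrix construction it uses is explained in the next section:
#
import numpy as np
# regenerate quadratic data as in the cell above
quad = np.array([1, 3, 0.5])
xquad = -10.0 + 20.0 * np.random.random(100)
yquad = quad[0] + quad[1] * xquad + quad[2] * xquad**2 + np.random.normal(scale=1.5, size=100)
# design matrix with columns 1, x, x^2, then solve by least squares
X = np.column_stack([np.ones_like(xquad), xquad, xquad**2])
coeffs, *_ = np.linalg.lstsq(X, yquad, rcond=None)
print(coeffs)   # should be close to the true parameters [1, 3, 0.5]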
The regression problem has been studied extensively in the field of statistics and machine learning.
Certain terminology is used: the quantities we measure as inputs are called the independent variables, and the quantities we are trying to predict are called the dependent variables.
The basic regression task is: given a set of observations of the independent and dependent variables, estimate the parameters of a model (a line, a quadratic, a logarithmic function, etc.) that relates them.
The independent variables are collected into a matrix $X$, which is called the design matrix.
The dependent variables are collected into an observation vector $\mathbf{y}$.
The parameters of the model (for any kind of model) are collected into a parameter vector $\beta$.
#
ax = ut.plotSetup(-10, 10, -10, 10, size = (7, 7))
ut.centerAxes(ax)
line = np.array([1, 0.5])
xlin = -10.0 + 20.0 * np.random.random(100)
ylin = line[0] + (line[1] * xlin) + np.random.normal(scale = 1.5, size = 100)
ax.plot(xlin, ylin, 'ro', markersize = 6)
ax.plot(xlin, line[0] + line[1] *xlin, 'b-')
plt.text(-9, 3, r'$y = \beta_0 + \beta_1x$', size=20);
The first kind of model we'll study is a linear equation:

$$y = \beta_0 + \beta_1 x.$$
This is the most commonly used type of model, particularly in fields like economics, psychology, biology, etc.
The reason it is so commonly used is that experimental data often produce points that seem to lie close to a line.
The question we must confront is: given a set of data, how should we "fit" the equation of the line to the data?
Our intuition is this: we want to determine the parameters that define a line that is as "close" to the points as possible.
Let's develop some terminology for evaluating a model.
Suppose we have a line $y = \beta_0 + \beta_1 x$. For each data point $(x_j, y_j)$, there is a point $(x_j, \beta_0 + \beta_1 x_j)$ that is the point on the line with the same $x$-coordinate.
# image credit: Lay, LAA, 4th edition
display(Image("images/Lay-fig-6-6-1.jpg", width=550))
We call $y_j$ the observed value of $y$,
and we call $\beta_0 + \beta_1 x_j$ the predicted $y$-value.
The difference between an observed $y$-value and a predicted $y$-value is called a residual.
There are several ways to measure how "close" the line is to the data.
The usual choice is to sum the squares of the residuals.
(Note that the residuals themselves may be positive or negative -- by squaring them, we ensure that our error measures don't cancel out.)
The least-squares line is the line that minimizes the sum of squares of the residuals.
The coefficients of the line are called regression coefficients.
Let's imagine for a moment that the data fit a line perfectly.
Then, if each of the data points happened to fall exactly on the line, the parameters $\beta_0$ and $\beta_1$ would satisfy the equations

$$\begin{array}{c} \beta_0 + \beta_1 x_1 = y_1 \\ \beta_0 + \beta_1 x_2 = y_2 \\ \vdots \\ \beta_0 + \beta_1 x_n = y_n \end{array}$$

We can write this system as

$$X\beta = \mathbf{y},$$

where

$$X = \begin{bmatrix} 1 & x_1 \\ 1 & x_2 \\ \vdots & \vdots \\ 1 & x_n \end{bmatrix}, \qquad \beta = \begin{bmatrix} \beta_0 \\ \beta_1 \end{bmatrix}, \qquad \mathbf{y} = \begin{bmatrix} y_1 \\ y_2 \\ \vdots \\ y_n \end{bmatrix}.$$

If the data points don't actually lie exactly on a line,
... then there are no parameters $\beta_0, \beta_1$ for which the predicted $y$-values in $X\beta$ equal the observed $y$-values in $\mathbf{y}$,
... and $X\beta = \mathbf{y}$ has no solution.
Now, since the data doesn't fall exactly on a line, we have decided to seek the $\beta$ that minimizes the sum of squared residuals, i.e.,

$$\sum_j \left( y_j - (\beta_0 + \beta_1 x_j) \right)^2 = \Vert \mathbf{y} - X\beta \Vert^2.$$

This is key: the sum of squares of the residuals is exactly the square of the distance between the vectors $X\beta$ and $\mathbf{y}$.
This is a least-squares problem, $X\beta = \mathbf{y}$, just with different notation.
Computing the least-squares solution of $X\beta = \mathbf{y}$ is equivalent to finding the $\beta$ that determines the least-squares line.
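Here is a minimal numpy sketch of exactly this, regenerating the same kind of line data as the cells above (true parameters $\beta_0 = 1$, $\beta_1 = 0.5$):
#
import numpy as np
# regenerate the line data as in the cells above
line = np.array([1, 0.5])
xlin = -10.0 + 20.0 * np.random.random(100)
ylin = line[0] + line[1] * xlin + np.random.normal(scale=1.5, size=100)
# design matrix: a column of ones (for beta_0) and the x values (for beta_1)
X = np.column_stack([np.ones_like(xlin), xlin])
# least-squares solution of X beta = y
beta, *_ = np.linalg.lstsq(X, ylin, rcond=None)
print(beta)   # should be close to [1, 0.5]
# equivalently, via the normal equations
beta_ne = np.linalg.solve(X.T @ X, X.T @ ylin)
print(np.allclose(beta, beta_ne))   # True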
#
ax = ut.plotSetup(-10, 10, -10, 10, size = (7, 7))
ut.centerAxes(ax)
line = np.array([1, 0.5])
xlin = -10.0 + 20.0 * np.random.random(100)
ylin = line[0]+ (line[1] * xlin) + np.random.normal(scale = 1.5, size = 100)
ax.plot(xlin, ylin, 'ro', markersize=6)
ax.plot(xlin, line[0] + line[1] * xlin, 'b-')
plt.text(-9,3,r'$y = \beta_0 + \beta_1x$',size=20);
Example. Find the equation $y = \beta_0 + \beta_1 x$ of the least-squares line that best fits the data points:
| x | y |
|---|---|
| 2 | 1 |
| 5 | 2 |
| 7 | 3 |
| 8 | 3 |
#
ax = ut.plotSetup(0, 10, 0, 4, size = (7, 7))
ut.centerAxes(ax)
pts = np.array([[2,1], [5,2], [7,3], [8,3]]).T
ax.plot(pts[0], pts[1], 'ro', markersize = 10);
plt.xlabel('x', size = 20)
plt.ylabel('y', size = 20);
Solution. Use the $x$-coordinates of the data to build the design matrix $X$, and the $y$-coordinates to build the observation vector $\mathbf{y}$:

$$X = \begin{bmatrix} 1 & 2 \\ 1 & 5 \\ 1 & 7 \\ 1 & 8 \end{bmatrix}, \qquad \mathbf{y} = \begin{bmatrix} 1 \\ 2 \\ 3 \\ 3 \end{bmatrix}.$$

Now, to obtain the least-squares line, find the least-squares solution to $X\beta = \mathbf{y}$.
We do this by solving the normal equations that we learned last lecture (just with new notation): $X^TX\beta = X^T\mathbf{y}$.
So, we compute:

$$X^TX = \begin{bmatrix} 4 & 22 \\ 22 & 142 \end{bmatrix}, \qquad X^T\mathbf{y} = \begin{bmatrix} 9 \\ 57 \end{bmatrix}.$$

So the normal equations are:

$$\begin{bmatrix} 4 & 22 \\ 22 & 142 \end{bmatrix} \begin{bmatrix} \beta_0 \\ \beta_1 \end{bmatrix} = \begin{bmatrix} 9 \\ 57 \end{bmatrix}.$$

Solving, we get:

$$\begin{bmatrix} \beta_0 \\ \beta_1 \end{bmatrix} = \begin{bmatrix} 4 & 22 \\ 22 & 142 \end{bmatrix}^{-1} \begin{bmatrix} 9 \\ 57 \end{bmatrix} = \begin{bmatrix} 2/7 \\ 5/14 \end{bmatrix}.$$

So the least-squares line has the equation

$$y = \frac{2}{7} + \frac{5}{14}x.$$
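We can check this result numerically using the data from the table:
#
import numpy as np
# data from the table above
x = np.array([2.0, 5.0, 7.0, 8.0])
y = np.array([1.0, 2.0, 3.0, 3.0])
# design matrix and least-squares solution
X = np.column_stack([np.ones_like(x), x])
beta, *_ = np.linalg.lstsq(X, y, rcond=None)
print(beta)                              # approximately [0.2857, 0.3571]
print(np.allclose(beta, [2/7, 5/14]))    # True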
#
ax = ut.plotSetup(0, 10, 0, 4, size = (7, 7))
ut.centerAxes(ax)
pts = np.array([[2,1], [5,2], [7,3], [8,3]]).T
ax.plot(pts[0], pts[1], 'ro', markersize = 10)
ut.plotLinEqn(-5./14, 1, 2./7, color='b')
plt.text(6,1.5, r'$y = \frac{2}{7} + \frac{5}{14}x$', size=24);