#
import numpy as np
import scipy as sp
import pandas as pd
import matplotlib as mp
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
import laUtilities as ut
import slideUtilities as sl
import demoUtilities as dm
from matplotlib import animation
from importlib import reload
from datetime import datetime
from IPython.display import Image, display_html, display, Math, HTML;
qr_setting = None
mp.rcParams['animation.html'] = 'jshtml';
Homework 6 is due Friday, 3/24 at 8pm
Class cancelled on Monday, 3/27.
Weekly reading and viewing assignments
For any basis $\mathcal{B}$ for $\mathbb{R}^n$, each vector in $\mathbb{R}^n$ can be written in only one way as a linear combination of the basis vectors.
Example. In $\mathbb{R}^2$, let's look at the point $\mathbf{x} = (1, 6)$. We will plot this point in the standard basis and also relative to a new basis $\mathcal{B} = \{\mathbf{b}_1, \mathbf{b}_2\}$, where $\mathbf{b}_1 = (1, 0)$ and $\mathbf{b}_2 = (1, 2)$:
# standard basis
xmin = -6.0
xmax = 6.0
ymin = -2.0
ymax = 8.0
b0 = [1, 0]
b1 = [1, 2]
fig = ut.two_d_figure('Dummylabel', xmin = xmin, xmax = xmax, ymin = ymin, ymax = ymax, size=(6,5))
for x in np.linspace(ymin, ymax, int(ymax-ymin+1)):
    fig.plotLinEqn(0., 1., x, alpha=0.3)
for y in np.linspace(xmin, xmax, int(xmax-xmin+1)):
    fig.plotLinEqn(1., 0., y, alpha=0.3)
fig.plotLinEqn(1., 0, 0, color = 'k')
fig.plotLinEqn(0, 1, 0, color = 'k')
fig.plotPoint(0, 0, 'k')
fig.ax.text(0+.1, 0-.1, r'$\bf 0$', size = 12, horizontalalignment='left', verticalalignment='top')
fig.plotPoint(1, 0, 'r')
fig.ax.text(1+.1, 0-.1, r'${\bf e}_1$', size = 12, horizontalalignment='left', verticalalignment='top')
fig.plotPoint(0, 1, 'r')
fig.ax.text(0+.1, 1-.1, r'${\bf e}_2$', size = 12, horizontalalignment='left', verticalalignment='top')
fig.plotPoint(b1[0], b1[1], 'g')
fig.ax.text(b1[0]+.1, b1[1]-.1, r'${\bf b}_2$', size = 12, horizontalalignment='left', verticalalignment='top')
fig.plotPoint(1, 6, 'b')
fig.ax.text(1+.1, 6-.1, r'${\bf x}$', size = 12, horizontalalignment='left', verticalalignment='top')
fig.ax.axis('off')
fig.ax.set_title(r'Standard Basis. $\mathbf{x}$ = (1, 6)', size = 16);
# B-basis
fig = ut.two_d_figure('Dummylabel', xmin = xmin, xmax = xmax, ymin = ymin, ymax = ymax, size=(6,5))
m = b1[1]/b1[0]
upper_intercept = ymax - m * xmin
upper_intercept = b1[1] * np.ceil(upper_intercept / b1[1])
lower_intercept = ymin - m * xmax
lower_intercept = b1[1] * np.floor(lower_intercept / b1[1])
for yint in np.linspace(lower_intercept, upper_intercept, int((upper_intercept-lower_intercept)/b1[1])+1):
    fig.plotLinEqn(-b1[1], b1[0], yint, color = 'g', alpha=0.3)
for y in np.linspace(ymin, ymax, int(((ymax-ymin)/b1[1])+1)):
    fig.plotLinEqn(0., 1., y, color = 'g', alpha=0.3)
fig.plotLinEqn(b1[1], -b1[0], 0, color = 'k')
fig.plotLinEqn(b0[1], -b0[0], 0, color = 'k')
fig.plotPoint(0, 0, 'k')
fig.ax.text(0+.1, 0-.1, r'$\bf 0$', size = 12, horizontalalignment='left', verticalalignment='top')
fig.plotPoint(1, 0, 'g')
fig.ax.text(1+.1, 0-.1, r'${\bf b}_1$', size = 12, horizontalalignment='left', verticalalignment='top')
fig.plotPoint(b1[0], b1[1], 'g')
fig.ax.text(b1[0]+.1, b1[1]-.1, r'${\bf b}_2$', size = 12, horizontalalignment='left', verticalalignment='top')
fig.plotPoint(1, 6, 'b')
fig.ax.text(1+.1, 6-.1, r'${\bf x}$', size = 12, horizontalalignment='left', verticalalignment='top')
fig.ax.axis('off')
fig.ax.set_title(r'B-Basis. $[\mathbf{x}]_\mathcal{B}$ = (-2, 3)', size = 16);
Notice that the location of $\mathbf{x}$ relative to the origin does not change.
However, using the $\mathcal{B}$-basis, it has different coordinates. The new coordinates are $[\mathbf{x}]_\mathcal{B} = (-2, 3)$.
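We can confirm these coordinates numerically by solving the small linear system $B\,[\mathbf{x}]_\mathcal{B} = \mathbf{x}$ (a minimal check; the matrix B below simply collects the basis vectors used in the plots as its columns):
B = np.array([[1, 1],
              [0, 2]])         # columns are b1 = (1, 0) and b2 = (1, 2)
x = np.array([1, 6])
x_B = np.linalg.solve(B, x)    # coordinates of x relative to the basis B
print(x_B)                     # [-2.  3.]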
Definition. A set of vectors $\{\mathbf{u}_1, \dots, \mathbf{u}_p\}$ in $\mathbb{R}^n$ is said to be an orthogonal set if each pair of distinct vectors from the set is orthogonal, i.e., if $\mathbf{u}_i^T\mathbf{u}_j = 0$ whenever $i \neq j$.
#
fig = ut.three_d_figure((23, 2), fig_desc = 'An orthogonal set of vectors',
xmin = -3, xmax = 3, ymin = -3, ymax = 3, zmin = -3, zmax = 3,
figsize = (12, 8), qr = qr_setting)
u1 = np.array([3, 1, 1])
u2 = np.array([-1, 2, 1])
u3 = np.array([-1/2, -2, 7/2])
origin = np.array([0, 0, 0])
fig.plotLine([origin, u1], 'r', '--')
fig.plotPoint(u1[0], u1[1], u1[2], 'r')
fig.text(u1[0]+.1, u1[1]+.1, u1[2]+.1, r'$\bf u_1$', 'u1', size=16, color='k')
fig.plotLine([origin, u2], 'r', '--')
fig.plotPoint(u2[0], u2[1], u2[2], 'r')
fig.text(u2[0]+.1, u2[1]+.1, u2[2]+.1, r'$\bf u_2$', 'u2', size=16, color='k')
fig.plotLine([origin, u3], 'r', '--')
fig.plotPoint(u3[0], u3[1], u3[2], 'r')
fig.text(u3[0]+.1, u3[1]+.1, u3[2]+.1, r'$\bf u_3$', 'u3', size=16, color = 'k')
fig.text(origin[0]-.45, origin[1]-.45, origin[2]-.45, r'$\bf 0$', 0, size = 16)
fig.plotPerpSym(origin, u1, u2, 0.5)
fig.plotPerpSym(origin, u3, u2, 0.5)
fig.plotPerpSym(origin, u3, u1, 0.5)
fig.set_title(r'An orthogonal set of vectors in $\mathbb{R}^3$', 'An orthogonal set of vectors in R3', size = 16)
fig.save();
[This lecture is based on Prof. Crovella's CS 132 lecture notes and Fast.ai Lecture 8.]
Definition. An orthogonal basis for a subspace $W$ of $\mathbb{R}^n$ is a basis for $W$ that is also an orthogonal set.
For example, consider
$$\mathbf{u}_1 = \frac{1}{3}\begin{bmatrix}8\\1\\2\end{bmatrix} \quad\text{and}\quad \mathbf{u}_2 = \frac{1}{2}\begin{bmatrix}-1\\4\\2\end{bmatrix}.$$
Note that $\mathbf{u}_1^T\mathbf{u}_2 = -\frac{8}{6} + \frac{4}{6} + \frac{4}{6} = 0.$ Hence they form an orthogonal basis for their span.
Here is the subspace $H = \operatorname{Span}\{\mathbf{u}_1, \mathbf{u}_2\}$:
#
fig = ut.three_d_figure((23, 3), fig_desc = 'Orthogonal Basis on the subspace H', figsize = (8, 8),
xmin = -2, xmax = 10, ymin = -1, ymax = 10, zmin = -1, zmax = 10, qr = qr_setting)
v = 1/2 * np.array([-1, 4, 2])
u = 1/3 * np.array([8, 1, 2])
vpos = v + 0.4 * v - 0.5 * u
upos = u - 0.5 * v + 0.15 * u
fig.text(vpos[0], vpos[1], vpos[2], r'$\bf u_2$', 'v', size=16)
fig.text(upos[0], upos[1], upos[2], r'$\bf u_1$', 'u', size=16)
# fig.text(3*u[0]+2*v[0], 3*u[1]+2*v[1], 3*u[2]+2*v[2]+1, r'$\bf 2v_1+3v_2$', '2 v1 + 3 v2', size=16)
# plotting the span of v
fig.plotSpan(u, v, 'Green')
# blue grid lines
fig.plotPoint(0, 0, 0, 'y')
fig.plotPoint(u[0], u[1], u[2], 'b')
fig.plotPoint(2*u[0], 2*u[1], 2*u[2],'b')
fig.plotPoint(3*u[0], 3*u[1], 3*u[2], 'b')
fig.plotLine([[0, 0, 0], 4*u], color='b')
fig.plotLine([v, v+4*u], color='b')
fig.plotLine([2*v, 2*v+3*u], color='b')
fig.plotLine([3*v, 3*v+2.5*u], color='b')
# red grid lines
fig.plotPoint(v[0], v[1], v[2], 'r')
fig.plotPoint(2*v[0], 2*v[1], 2*v[2], 'r')
fig.plotPoint(3*v[0], 3*v[1], 3*v[2], 'r')
fig.plotLine([[0, 0, 0], 3.5*v], color='r')
fig.plotLine([u, u+3.5*v], color='r')
fig.plotLine([2*u, 2*u+3.5*v], color='r')
fig.plotLine([3*u, 3*u+2*v], color='r')
#
# fig.plotPoint(3*u[0]+2*v[0], 3*u[1]+2*v[1], 3*u[2]+2*v[2], color='m')
# plotting the axes
#fig.plotIntersection([0,0,1,0], [0,1,0,0], color='Black')
#fig.plotIntersection([0,0,1,0], [1,0,0,0], color='Black')
#fig.plotIntersection([0,1,0,0], [1,0,0,0], color='Black')
#
fig.plotPerpSym(np.array([0, 0, 0]), v, u, 1)
fig.plotPerpSym(u, v+u, u+u, 1)
fig.plotPerpSym(2*u, v+2*u, 3*u, 1)
#
fig.plotPerpSym(np.array([0, 0, 0])+v, 2*v, v+u, 1)
fig.plotPerpSym(u+v, 2*v+u, v+2*u, 1)
#
fig.set_title(r'Orthogonal Basis on the subspace $H$', 'Orthogonal Basis on the subspace H', size=16)
fig.save()
We have seen that for any subspace $W$, there may be many different sets of vectors that can serve as a basis for $W$.
For example, let's say we have a basis $\mathcal{B} = \{\mathbf{b}_1, \dots, \mathbf{b}_p\}$ and a vector $\mathbf{x}$ whose coordinates we want.
We know that to compute the coordinates of $\mathbf{x}$ in this basis, denoted $[\mathbf{x}]_\mathcal{B}$, we need to solve the linear system
$$B\,[\mathbf{x}]_\mathcal{B} = \mathbf{x},$$
or
$$[\mathbf{x}]_\mathcal{B} = B^{-1}\mathbf{x},$$
where $B$ is the matrix whose columns are the basis vectors.
In general, we'd need to perform Gaussian elimination, or matrix inversion, or some other computationally expensive method to do this.
However, an orthogonal basis is a particularly nice basis, because the coordinates of any point can be computed easily and simply. Let's see how.
Theorem. Let $\{\mathbf{u}_1, \dots, \mathbf{u}_p\}$ be an orthogonal basis for a subspace $W$ of $\mathbb{R}^n$. For each $\mathbf{y}$ in $W$, the weights of the linear combination
$$\mathbf{y} = c_1\mathbf{u}_1 + \cdots + c_p\mathbf{u}_p$$
are given by
$$c_j = \frac{\mathbf{y}^T\mathbf{u}_j}{\mathbf{u}_j^T\mathbf{u}_j}, \qquad j = 1, \dots, p.$$
Proof. Let's consider the inner product of $\mathbf{y}$ and one of the vectors --- say, $\mathbf{u}_1$.
As we saw in the last proof, the orthogonality of $\{\mathbf{u}_1, \dots, \mathbf{u}_p\}$ means that
$$\mathbf{y}^T\mathbf{u}_1 = (c_1\mathbf{u}_1 + c_2\mathbf{u}_2 + \cdots + c_p\mathbf{u}_p)^T\mathbf{u}_1 = c_1(\mathbf{u}_1^T\mathbf{u}_1).$$
Since $\mathbf{u}_1^T\mathbf{u}_1$ is not zero (why?), the equation above can be solved for $c_1$.
To find any other $c_j$, compute $\mathbf{y}^T\mathbf{u}_j$ and solve for $c_j$.
Example. The set $S = \{\mathbf{u}_1, \mathbf{u}_2, \mathbf{u}_3\}$ that we saw earlier, i.e.,
$$\mathbf{u}_1 = \begin{bmatrix}3\\1\\1\end{bmatrix}, \quad \mathbf{u}_2 = \begin{bmatrix}-1\\2\\1\end{bmatrix}, \quad \mathbf{u}_3 = \begin{bmatrix}-1/2\\-2\\7/2\end{bmatrix},$$
is an orthogonal basis for $\mathbb{R}^3$.
Let's express the vector $\mathbf{y} = \begin{bmatrix}6\\1\\-8\end{bmatrix}$ as a linear combination of the vectors in $S$.
That is, let's find $\mathbf{y}$'s coordinates in the basis $S$ --- i.e., in the coordinate system defined by $S$.
#
fig = ut.three_d_figure((23, 4), fig_desc = 'y in an orthogonal basis',
xmin = -3, xmax = 7, ymin = -5, ymax = 5, zmin = -8, zmax = 4,
figsize = (12, 8), qr = qr_setting, equalAxes = False)
u1 = np.array([3, 1, 1])
u2 = np.array([-1, 2, 1])
u3 = np.array([-1/2, -2, 7/2])
origin = np.array([0, 0, 0])
#
fig.plotLine([origin, u1], 'r', '--')
fig.plotPoint(u1[0], u1[1], u1[2], 'r')
fig.text(u1[0]+.1, u1[1]+.1, u1[2]+.1, r'$\bf u_1$', 'u1', size=16, color='k')
#
fig.plotLine([origin, u2], 'r', '--')
fig.plotPoint(u2[0], u2[1], u2[2], 'r')
fig.text(u2[0]+.1, u2[1]+.1, u2[2]+.1, r'$\bf u_2$', 'u2', size=16, color='k')
#
fig.plotLine([origin, u3], 'r', '--')
fig.plotPoint(u3[0], u3[1], u3[2], 'r')
fig.text(u3[0]+.1, u3[1]+.1, u3[2]+.1, r'$\bf u_3$', 'u3', size=16, color = 'k')
#
fig.text(origin[0]-.45, origin[1]-.45, origin[2]-.45, r'$\bf 0$', 0, size = 16)
#
fig.plotPerpSym(origin, u1, u2, 0.5)
fig.plotPerpSym(origin, u3, u2, 0.5)
fig.plotPerpSym(origin, u3, u1, 0.5)
#
y = u1 - 2 * u2 - 2 * u3
# print(y)
fig.plotPoint(y[0], y[1], y[2], 'b')
fig.text(y[0]-2, y[1]+.1, y[2]+.1, r'$\bf y$ = (6, 1, -8)', 'y = (6, 1, -8)', size=16, color = 'b')
fig.text(y[0]-2, y[1]+.1, y[2]-2.5, r'${\bf y} = 1{\bf u}_1 -2 {\bf u}_2 -2 {\bf u}_3$', 'y = (6, 1, -8)', size=16, color = 'b')
#
fig.set_title(r'${\bf y}$ in an Orthogonal Basis', 'y in an Orthogonal Basis', size = 16)
fig.save();
Solution. Compute
$$\mathbf{y}^T\mathbf{u}_1 = 11, \quad \mathbf{y}^T\mathbf{u}_2 = -12, \quad \mathbf{y}^T\mathbf{u}_3 = -33,$$
$$\mathbf{u}_1^T\mathbf{u}_1 = 11, \quad \mathbf{u}_2^T\mathbf{u}_2 = 6, \quad \mathbf{u}_3^T\mathbf{u}_3 = 33/2.$$
So
$$\mathbf{y} = \frac{\mathbf{y}^T\mathbf{u}_1}{\mathbf{u}_1^T\mathbf{u}_1}\mathbf{u}_1 + \frac{\mathbf{y}^T\mathbf{u}_2}{\mathbf{u}_2^T\mathbf{u}_2}\mathbf{u}_2 + \frac{\mathbf{y}^T\mathbf{u}_3}{\mathbf{u}_3^T\mathbf{u}_3}\mathbf{u}_3 = \frac{11}{11}\mathbf{u}_1 - \frac{12}{6}\mathbf{u}_2 - \frac{33}{33/2}\mathbf{u}_3 = \mathbf{u}_1 - 2\mathbf{u}_2 - 2\mathbf{u}_3.$$
As a result, $[\mathbf{y}]_S = (1, -2, -2)$.
Note how much simpler it is to find the coordinates of $\mathbf{y}$ in an orthogonal basis: each weight requires only two inner products, rather than solving a linear system.
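As a quick numerical check (a short sketch; the vectors are the same ones defined in the plotting code above):
u1 = np.array([3, 1, 1])
u2 = np.array([-1, 2, 1])
u3 = np.array([-1/2, -2, 7/2])
y = np.array([6, 1, -8])
coords = [(y @ u) / (u @ u) for u in (u1, u2, u3)]   # each coordinate is a ratio of two inner products
print(np.array(coords))                              # [ 1. -2. -2.]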
Now let's turn to the notion of projection. In general, a projection happens when we decompose a vector into the sum of other vectors.
Here is the central idea. Suppose we start with two nonzero vectors $\mathbf{y}$ and $\mathbf{u}$ in $\mathbb{R}^n$.
#
ax = ut.plotSetup(-1,10,-1,4,(9,6))
ut.centerAxes(ax)
pt = [4., 3.]
plt.plot([0,pt[0]],[0,pt[1]],'b-',lw=2)
plt.plot([0,2*pt[0]],[0,0],'r-',lw=3)
ut.plotVec(ax,pt,'b')
u = np.array([pt[0],0])
ut.plotVec(ax,2*u)
ax.text(2*pt[0],-0.75,r'$\mathbf{u}$',size=20)
ax.text(pt[0]+0.1,pt[1]+0.2,r'$\mathbf{y}$',size=20);
Our goal is to "decompose" $\mathbf{y}$ into the sum of two vectors $\hat{\mathbf{y}}$ and $\mathbf{z}$ such that:
1. $\hat{\mathbf{y}}$ is a multiple of $\mathbf{u}$; that is, $\hat{\mathbf{y}} = \alpha\mathbf{u}$ for some scalar $\alpha$. We call $\hat{\mathbf{y}}$ the orthogonal projection of $\mathbf{y}$ onto $\mathbf{u}$.
2. $\mathbf{z}$ is orthogonal to $\mathbf{u}$. We call $\mathbf{z}$ the component of $\mathbf{y}$ orthogonal to $\mathbf{u}$.
In other words, we want to write
$$\mathbf{y} = \hat{\mathbf{y}} + \mathbf{z},$$
where $\hat{\mathbf{y}} = \alpha\mathbf{u}$ and $\mathbf{z}$ is some vector orthogonal to $\mathbf{u}$.
#
ax = ut.plotSetup(-1,10,-1,4,(9,6))
ut.centerAxes(ax)
pt = [4., 3.]
plt.plot([0,pt[0]],[0,pt[1]],'b-',lw=2)
plt.plot([pt[0],pt[0]],[0,pt[1]],'b--',lw=2)
plt.plot([0,pt[0]],[0,0],'r-',lw=3)
plt.plot([0,0],[0,pt[1]],'g-',lw=3)
ut.plotVec(ax,pt,'b')
u = np.array([pt[0],0])
v = [0,pt[1]]
ut.plotVec(ax,u)
ut.plotVec(ax,2*u)
ut.plotVec(ax,v,'g')
ax.text(pt[0],-0.75,r'${\bf \hat{y}}=\alpha{\bf u}$',size=20)
ax.text(2*pt[0],-0.75,r'$\mathbf{u}$',size=20)
ax.text(pt[0]+0.1,pt[1]+0.2,r'$\mathbf{y}$',size=20)
ax.text(0+0.1,pt[1]+0.2,r'$\mathbf{z}$',size=20)
ax.text(0+0.1,pt[1]+0.8,r'Component of $\mathbf{y}$ orthogonal to $\mathbf{u}$',size=16)
ax.text(pt[0],-1.25,r'Orthogonal projection of $\mathbf{y}$ onto to $\mathbf{u}$',size=16);
perpline1, perpline2 = ut.perp_sym(np.array([0,0]), u, v, 0.5)
plt.plot(perpline1[0], perpline1[1], 'k', lw = 1)
plt.plot(perpline2[0], perpline2[1], 'k', lw = 1);
# ax.text(0+0.1,pt[1]+0.2,r'$\mathbf{z = y -\hat{y}}$',size=20);
To solve this, assume that we have some $\alpha$, and with it we compute $\mathbf{z} = \mathbf{y} - \alpha\mathbf{u}.$ We want $\mathbf{z}$ to be orthogonal to $\mathbf{u}.$
Now, $\mathbf{z} = \mathbf{y} - \alpha\mathbf{u}$ is orthogonal to $\mathbf{u}$ if and only if
$$0 = (\mathbf{y} - \alpha\mathbf{u})^T\mathbf{u} = \mathbf{y}^T\mathbf{u} - \alpha\,\mathbf{u}^T\mathbf{u}.$$
Hence, the solution in which $\mathbf{z}$ is orthogonal to $\mathbf{u}$ happens if and only if
$$\alpha = \frac{\mathbf{y}^T\mathbf{u}}{\mathbf{u}^T\mathbf{u}},$$
and since $\hat{\mathbf{y}} = \alpha\mathbf{u}$,
$$\hat{\mathbf{y}} = \frac{\mathbf{y}^T\mathbf{u}}{\mathbf{u}^T\mathbf{u}}\,\mathbf{u}.$$
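In code, this formula is a single line of NumPy. Here is a small illustrative helper (written for this demo, not one of the course utilities), applied to the vectors in the figure above:
def project_onto(y, u):
    # orthogonal projection of y onto the line spanned by u
    return (y @ u) / (u @ u) * u

y = np.array([4., 3.])
u = np.array([1., 0.])
y_hat = project_onto(y, u)
print(y_hat, y - y_hat)    # [4. 0.] [0. 3.]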
#
ax = ut.plotSetup(-1,10,-1,4,(9,6))
ut.centerAxes(ax)
pt = [4., 3.]
plt.plot([0,pt[0]],[0,pt[1]],'b-',lw=2)
plt.plot([pt[0],pt[0]],[0,pt[1]],'b--',lw=2)
plt.plot([0,pt[0]],[0,0],'r-',lw=3)
plt.plot([0,0],[0,pt[1]],'g-',lw=3)
ut.plotVec(ax,pt,'b')
u = np.array([pt[0],0])
v = [0,pt[1]]
ut.plotVec(ax,u)
ut.plotVec(ax,2*u)
ut.plotVec(ax,v,'g')
ax.text(pt[0],-0.75,r'${\bf \hat{y}}=\alpha{\bf u}$',size=20)
ax.text(2*pt[0],-0.75,r'$\mathbf{u}$',size=20)
ax.text(pt[0]+0.1,pt[1]+0.2,r'$\mathbf{y}$',size=20)
ax.text(0+0.1,pt[1]+0.2,r'$\mathbf{z}$',size=20)
ax.text(0+0.1,pt[1]+0.8,r'Component of $\mathbf{y}$ orthogonal to $\mathbf{u}$',size=16)
ax.text(pt[0],-1.25,r'Orthogonal projection of $\mathbf{y}$ onto to $\mathbf{u}$',size=16);
perpline1, perpline2 = ut.perp_sym(np.array([0,0]), u, v, 0.5)
plt.plot(perpline1[0], perpline1[1], 'k', lw = 1)
plt.plot(perpline2[0], perpline2[1], 'k', lw = 1);
Now, note that if we had scaled $\mathbf{u}$ by any amount (i.e., moved it to the right or the left along the same line), we would not have changed the location of $\hat{\mathbf{y}}$.
This can be seen as well by replacing $\mathbf{u}$ with $c\mathbf{u}$ and recomputing $\hat{\mathbf{y}}$:
$$\hat{\mathbf{y}} = \frac{\mathbf{y}^T(c\mathbf{u})}{(c\mathbf{u})^T(c\mathbf{u})}\,c\mathbf{u} = \frac{c^2\,\mathbf{y}^T\mathbf{u}}{c^2\,\mathbf{u}^T\mathbf{u}}\,\mathbf{u} = \frac{\mathbf{y}^T\mathbf{u}}{\mathbf{u}^T\mathbf{u}}\,\mathbf{u}.$$
Thus, the projection of $\mathbf{y}$ is determined by the subspace $L$ that is spanned by $\mathbf{u}$ -- in other words, the line through $\mathbf{u}$ and the origin.
Hence $\hat{\mathbf{y}}$ is sometimes denoted by $\operatorname{proj}_L\mathbf{y}$ and is called the orthogonal projection of $\mathbf{y}$ onto $L$.
Specifically:
$$\hat{\mathbf{y}} = \operatorname{proj}_L\mathbf{y} = \frac{\mathbf{y}^T\mathbf{u}}{\mathbf{u}^T\mathbf{u}}\,\mathbf{u}.$$
Example. Let $\mathbf{y} = \begin{bmatrix}7\\6\end{bmatrix}$ and $\mathbf{u} = \begin{bmatrix}4\\2\end{bmatrix}.$
Find the orthogonal projection of $\mathbf{y}$ onto $\mathbf{u}.$ Then write $\mathbf{y}$ as the sum of two orthogonal vectors, one in $\operatorname{Span}\{\mathbf{u}\}$ and one orthogonal to $\mathbf{u}.$
#
ax = ut.plotSetup(-3,11,-1,7,(8,6))
ut.centerAxes(ax)
plt.axis('equal')
u = np.array([4.,2])
y = np.array([7.,6])
yhat = (y.T.dot(u)/u.T.dot(u))*u
z = y-yhat
ut.plotLinEqn(1.,-2.,0.)
ut.plotVec(ax,u)
ut.plotVec(ax,z)
ut.plotVec(ax,y)
ut.plotVec(ax,yhat)
ax.text(u[0]+0.3,u[1]-0.5,r'$\mathbf{u}$',size=20)
ax.text(yhat[0]+0.3,yhat[1]-0.5,r'$\mathbf{\hat{y}}$',size=20)
ax.text(y[0],y[1]+0.8,r'$\mathbf{y}$',size=20)
ax.text(z[0]-2,z[1],r'$\mathbf{y - \hat{y}}$',size=20)
ax.text(10,4.5,r'$L = $Span$\{\mathbf{u}\}$',size=20)
perpline1, perpline2 = ut.perp_sym(yhat, y, np.array([0,0]), 0.75)
plt.plot(perpline1[0], perpline1[1], 'k', lw = 1)
plt.plot(perpline2[0], perpline2[1], 'k', lw = 1)
ax.plot([y[0],yhat[0]],[y[1],yhat[1]],'b--')
ax.plot([0,y[0]],[0,y[1]],'b-')
ax.plot([0,z[0]],[0,z[1]],'b-');
Solution. Compute
$$\mathbf{y}^T\mathbf{u} = 7\cdot 4 + 6\cdot 2 = 40 \quad\text{and}\quad \mathbf{u}^T\mathbf{u} = 4^2 + 2^2 = 20.$$
The orthogonal projection of $\mathbf{y}$ onto $\mathbf{u}$ is
$$\hat{\mathbf{y}} = \frac{\mathbf{y}^T\mathbf{u}}{\mathbf{u}^T\mathbf{u}}\,\mathbf{u} = \frac{40}{20}\begin{bmatrix}4\\2\end{bmatrix} = \begin{bmatrix}8\\4\end{bmatrix},$$
and the component of $\mathbf{y}$ orthogonal to $\mathbf{u}$ is
$$\mathbf{y} - \hat{\mathbf{y}} = \begin{bmatrix}7\\6\end{bmatrix} - \begin{bmatrix}8\\4\end{bmatrix} = \begin{bmatrix}-1\\2\end{bmatrix}.$$
So
$$\mathbf{y} = \hat{\mathbf{y}} + (\mathbf{y} - \hat{\mathbf{y}}) = \begin{bmatrix}8\\4\end{bmatrix} + \begin{bmatrix}-1\\2\end{bmatrix}.$$
The closest point.
Recall from geometry that given a line $L$ and a point $\mathbf{y}$ not on $L$, the closest point on the line to $\mathbf{y}$ is the foot of the perpendicular dropped from $\mathbf{y}$ to the line.
So this gives an important interpretation of $\hat{\mathbf{y}}$: it is the closest point to $\mathbf{y}$ in the subspace $L$.
#
ax = ut.plotSetup(-3,11,-1,7,(8,6))
ut.centerAxes(ax)
plt.axis('equal')
u = np.array([4.,2])
y = np.array([7.,6])
yhat = (y.T.dot(u)/u.T.dot(u))*u
z = y-yhat
ut.plotLinEqn(1.,-2.,0.)
ut.plotVec(ax,u)
ut.plotVec(ax,z)
ut.plotVec(ax,y)
ut.plotVec(ax,yhat)
ax.text(u[0]+0.3,u[1]-0.5,r'$\mathbf{u}$',size=20)
ax.text(yhat[0]+0.3,yhat[1]-0.5,r'$\mathbf{\hat{y}}$',size=20)
ax.text(y[0],y[1]+0.8,r'$\mathbf{y}$',size=20)
ax.text(z[0]-2,z[1],r'$\mathbf{y - \hat{y}}$',size=20)
ax.text(10,4.5,r'$L = $Span$\{\mathbf{u}\}$',size=20)
perpline1, perpline2 = ut.perp_sym(yhat, y, np.array([0,0]), 0.75)
plt.plot(perpline1[0], perpline1[1], 'k', lw = 1)
plt.plot(perpline2[0], perpline2[1], 'k', lw = 1)
ax.plot([y[0],yhat[0]],[y[1],yhat[1]],'b--')
ax.plot([0,y[0]],[0,y[1]],'b-')
ax.plot([0,z[0]],[0,z[1]],'b-');
The distance from $\mathbf{y}$ to $L$.
The distance from $\mathbf{y}$ to $L$ is the length of the perpendicular segment from $\mathbf{y}$ to its orthogonal projection on $L$, namely $\hat{\mathbf{y}}$.
This distance equals the length of $\mathbf{y} - \hat{\mathbf{y}}$, or in other words the norm of $\mathbf{z}$.
In the previous example, the distance is
$$\|\mathbf{y} - \hat{\mathbf{y}}\| = \sqrt{(-1)^2 + 2^2} = \sqrt{5}.$$
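We can confirm this with NumPy, reusing the vectors from the example (a quick check, not course code):
u = np.array([4., 2.])
y = np.array([7., 6.])
y_hat = (y @ u) / (u @ u) * u       # orthogonal projection of y onto u
dist = np.linalg.norm(y - y_hat)    # distance from y to the line L = Span{u}
print(y_hat, dist)                  # [8. 4.] 2.236... (= sqrt(5))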
Let's combine two observations that we have made today.
To decompose a vector $\mathbf{y}$ into a linear combination of vectors in an orthogonal basis, we compute
$$c_j = \frac{\mathbf{y}^T\mathbf{u}_j}{\mathbf{u}_j^T\mathbf{u}_j}.$$
The projection of $\mathbf{y}$ onto the subspace spanned by any $\mathbf{u}_j$ is
$$\operatorname{proj}_{\mathbf{u}_j}\mathbf{y} = \frac{\mathbf{y}^T\mathbf{u}_j}{\mathbf{u}_j^T\mathbf{u}_j}\,\mathbf{u}_j.$$
So a decomposition like $\mathbf{y} = c_1\mathbf{u}_1 + \cdots + c_p\mathbf{u}_p$ is really decomposing $\mathbf{y}$ into a sum of orthogonal projections onto one-dimensional subspaces.
#
ax = ut.plotSetup(-1,10,-1,4,(9,6))
ut.centerAxes(ax)
pt = [4., 3.]
plt.plot([0,pt[0]],[0,pt[1]],'b-',lw=2)
plt.plot([pt[0],pt[0]],[0,pt[1]],'b--',lw=2)
plt.plot([0,pt[0]],[0,0],'r-',lw=3)
plt.plot([0,0],[0,pt[1]],'g-',lw=3)
ut.plotVec(ax,pt,'b')
u = np.array([pt[0],0])
v = [0,pt[1]]
ut.plotVec(ax,u)
ut.plotVec(ax,2*u)
ut.plotVec(ax,v,'g')
ax.text(pt[0],-0.75,r'${\bf \hat{y}}=\alpha{\bf u}$',size=20)
ax.text(2*pt[0],-0.75,r'$\mathbf{u}$',size=20)
ax.text(pt[0]+0.1,pt[1]+0.2,r'$\mathbf{y}$',size=20)
ax.text(0+0.1,pt[1]+0.2,r'$\mathbf{z}$',size=20)
ax.text(0+0.1,pt[1]+0.8,r'Component of $\mathbf{y}$ orthogonal to $\mathbf{u}$',size=16)
ax.text(pt[0],-1.25,r'Orthogonal projection of $\mathbf{y}$ onto to $\mathbf{u}$',size=16);
perpline1, perpline2 = ut.perp_sym(np.array([0,0]), u, v, 0.5)
plt.plot(perpline1[0], perpline1[1], 'k', lw = 1)
plt.plot(perpline2[0], perpline2[1], 'k', lw = 1);
For example, let's take the case where $\mathbf{y} \in \mathbb{R}^2$.
Let's say we are given $\mathbf{u}_1$ and $\mathbf{u}_2$ such that $\mathbf{u}_1$ is orthogonal to $\mathbf{u}_2$, and so together they span $\mathbb{R}^2$.
Then $\mathbf{y}$ can be written in the form
$$\mathbf{y} = \frac{\mathbf{y}^T\mathbf{u}_1}{\mathbf{u}_1^T\mathbf{u}_1}\,\mathbf{u}_1 + \frac{\mathbf{y}^T\mathbf{u}_2}{\mathbf{u}_2^T\mathbf{u}_2}\,\mathbf{u}_2.$$
So this equation expresses $\mathbf{y}$ as the sum of its projections onto the (orthogonal) axes determined by $\mathbf{u}_1$ and $\mathbf{u}_2$.
This is a useful way of thinking about coordinates in an orthogonal basis:
coordinates are projections onto the axes!
(Even if they are not the standard axes from the standard basis vectors.)
# Source: Lay 4th Edition, Chapter 6.2, Figure 4
display(Image("images/10-proj.jpg", width=600))
Given a subspace $W$, remember that there are many possible basis sets that can be used to represent $W$. In this lecture, we have seen that an orthogonal basis is particularly nice to work with.
So, let's use the concept of projection to transform any basis into an orthogonal one.
The Gram-Schmidt process is an algorithm that transforms a basis $\{\mathbf{x}_1, \dots, \mathbf{x}_p\}$ for a subspace $W$ into an orthogonal basis $\{\mathbf{v}_1, \dots, \mathbf{v}_p\}$ for the same subspace $W$.
It operates as follows:
$$\begin{aligned}
\mathbf{v}_1 &= \mathbf{x}_1 \\
\mathbf{v}_2 &= \mathbf{x}_2 - \frac{\mathbf{x}_2^T\mathbf{v}_1}{\mathbf{v}_1^T\mathbf{v}_1}\mathbf{v}_1 \\
\mathbf{v}_3 &= \mathbf{x}_3 - \frac{\mathbf{x}_3^T\mathbf{v}_1}{\mathbf{v}_1^T\mathbf{v}_1}\mathbf{v}_1 - \frac{\mathbf{x}_3^T\mathbf{v}_2}{\mathbf{v}_2^T\mathbf{v}_2}\mathbf{v}_2 \\
&\;\;\vdots \\
\mathbf{v}_p &= \mathbf{x}_p - \sum_{i=1}^{p-1}\frac{\mathbf{x}_p^T\mathbf{v}_i}{\mathbf{v}_i^T\mathbf{v}_i}\mathbf{v}_i.
\end{aligned}$$
In other words, we can find an orthogonal basis by iteratively removing the projections of all previous vectors, in order to keep only the component of each vector that is orthogonal to all vectors before it.
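As a small numerical illustration, we can carry out these steps directly (the two vectors here are chosen for the demo, not taken from the lecture):
x1 = np.array([3., 6., 0.])
x2 = np.array([1., 2., 2.])
v1 = x1
v2 = x2 - (x2 @ v1) / (v1 @ v1) * v1   # remove the projection of x2 onto v1
print(v2, v1 @ v2)                     # [0. 0. 2.] 0.0 -- v1 and v2 are orthogonal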
Here is another way to think about the Gram-Schmidt process.
Theorem. If we apply Gram-Schmidt starting from a basis $\{\mathbf{x}_1, \dots, \mathbf{x}_p\}$ for subspace $W$, then the resulting set $\{\mathbf{v}_1, \dots, \mathbf{v}_p\}$ satisfies the following properties:
1. $\{\mathbf{v}_1, \dots, \mathbf{v}_p\}$ is an orthogonal basis for $W$.
2. For each $k \leq p$, $\operatorname{Span}\{\mathbf{v}_1, \dots, \mathbf{v}_k\} = \operatorname{Span}\{\mathbf{x}_1, \dots, \mathbf{x}_k\}$.
Proof. The proof of orthogonality involves applying the inner product to the equations for the vectors above; I recommend that you try writing it out yourself. Let's focus on the second property, and we will prove it by induction.
First, it is clear that $\operatorname{Span}\{\mathbf{v}_1\} = \operatorname{Span}\{\mathbf{x}_1\}$, since $\mathbf{v}_1 = \mathbf{x}_1$.
Next, let's consider $\mathbf{v}_2$, and how it relates to the smaller subspace $\operatorname{Span}\{\mathbf{x}_1, \mathbf{x}_2\}$.
$\mathbf{v}_2 \in \operatorname{Span}\{\mathbf{x}_1, \mathbf{x}_2\}$ because we wrote $\mathbf{v}_2$ as a linear combination of the vectors $\mathbf{x}_2$ and $\mathbf{v}_1 = \mathbf{x}_1$.
Moreover, $\mathbf{v}_2$ is not contained in the span of $\mathbf{v}_1$ (indeed, we removed that very component of $\mathbf{x}_2$), so $\mathbf{v}_1$ and $\mathbf{v}_2$ are linearly independent.
Hence, $\{\mathbf{v}_1, \mathbf{v}_2\}$ is a minimal spanning set for $\operatorname{Span}\{\mathbf{x}_1, \mathbf{x}_2\}$, and therefore is a basis for it.
Both of these statements generalize to hold for each vector $\mathbf{v}_k$: the set $\{\mathbf{v}_1, \dots, \mathbf{v}_k\}$ is an orthogonal basis for $\operatorname{Span}\{\mathbf{x}_1, \dots, \mathbf{x}_k\}$.
This procedure leads to a powerful new form of matrix factorization!
Let's first recall what matrix factorization (or decomposition) is. The objective is to write a single matrix as the product of two or more "simpler" matrices.
So far we have seen one kind of matrix factorization: the LU decomposition, which factors an $m \times n$ matrix $A$ as $A = LU$, where:
- $L$ is an $m \times m$ lower triangular matrix with 1's on the diagonal, and
- $U$ is an $m \times n$ matrix in echelon form.
Question. How did we compute the LU factorization?
Answer. Perform Gaussian elimination to convert $A$ to $U$. Keep track of the elementary row operations performed along the way, and use them to construct $L$.
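For reference, SciPy can compute an LU factorization directly. Note that scipy.linalg.lu uses partial pivoting, so it also returns a permutation matrix P, with A = P @ L @ U (shown here on the same matrix we will factor with Gram-Schmidt below):
import scipy.linalg
A = np.array([[ 3., -7., -2.,  2.],
              [-3.,  5.,  1.,  0.],
              [ 6., -4.,  0., -5.],
              [-9.,  5., -5., 12.]])
P, L, U = scipy.linalg.lu(A)
print(np.allclose(A, P @ L @ U))   # True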
In a similar fashion, it turns out that if we apply Gram-Schmidt to the column vectors of a matrix $A$ and keep track of the operations performed along the way, we get a new method to factor matrices!
Definition. Let $A$ be an $m \times n$ matrix (not necessarily square) with linearly independent columns. Then the QR decomposition of $A$ is a factorization $A = QR$, where:
- $Q$ is an $m \times n$ matrix whose columns form an orthonormal basis for the column space of $A$, and
- $R$ is an $n \times n$ upper triangular, invertible matrix.
Rather than showing the procedure for "keeping track of the operations performed along the way," let me just skip ahead to show you some code that computes the QR decomposition of a matrix based on the Gram-Schmidt algorithm.
# source: Fast.ai lecture 8 from https://nbviewer.org/github/fastai/numerical-linear-algebra/blob/master/nbs/8.%20Implementing%20QR%20Factorization.ipynb
def GramSchmidt(A):
    m, n = A.shape
    Q = np.zeros([m,n], dtype=np.float64)
    R = np.zeros([n,n], dtype=np.float64)
    for j in range(n):
        v = A[:,j]                          # take each column vector in A
        for i in range(j):
            R[i,j] = np.dot(Q[:,i], A[:,j])
            v = v - (R[i,j] * Q[:,i])       # subtract the projection onto previous vectors
        R[j,j] = np.linalg.norm(v)
        Q[:, j] = v / R[j,j]                # normalize and insert as a column of Q
    return Q, R
import numpy as np
A = np.matrix([[0, 1, 1],
[1, 1, 1],
[1, 2, 1]])
np.linalg.qr(A)
(matrix([[ 0.        ,  0.81649658, -0.57735027],
         [-0.70710678, -0.40824829, -0.57735027],
         [-0.70710678,  0.40824829,  0.57735027]]),
 matrix([[-1.41421356, -2.12132034, -1.41421356],
         [ 0.        ,  1.22474487,  0.81649658],
         [ 0.        ,  0.        , -0.57735027]]))
Let's use this code to factor the matrix
$$A = \begin{bmatrix} 3 & -7 & -2 & 2 \\ -3 & 5 & 1 & 0 \\ 6 & -4 & 0 & -5 \\ -9 & 5 & -5 & 12 \end{bmatrix},$$
for which we computed the LU factorization last week.
A = np.array([[3,-7,-2,2],
[-3,5,1,0],
[6,-4,0,-5],
[-9,5,-5,12]])
Q, R = GramSchmidt(A)
print("Q =",Q)
print("R =",R)
Q = [[ 0.25819889 -0.80829038  0.11547005  0.51639778]
 [-0.25819889  0.46188022  0.01649572  0.84836778]
 [ 0.51639778  0.11547005 -0.84128182  0.11065667]
 [-0.77459667 -0.34641016 -0.5278631  -0.03688556]]
R = [[ 11.61895004  -9.03696114   3.09838668 -11.36075115]
 [  0.           5.77350269   3.81051178  -6.35085296]
 [  0.           0.           2.42487113  -1.89700803]
 [  0.           0.           0.           0.03688556]]
QR matrix factorization is a valuable tool when performing data science.
"One algorithm in numerical linear algebra is more important than all the others: QR factorization." -Trefethen, page 48
For this reason, the scipy package includes a built-in command to calculate it. (Note that Gram-Schmidt is just one way to compute a QR decomposition; there are others. Also, some QR decompositions factor $A$ into an $m \times m$ matrix $Q$ followed by an $m \times n$ matrix $R$ instead.)
import scipy
Qnew, Rnew = scipy.linalg.qr(A)
print(Qnew)
print(Rnew)
[[-0.25819889  0.80829038  0.11547005  0.51639778]
 [ 0.25819889 -0.46188022  0.01649572  0.84836778]
 [-0.51639778 -0.11547005 -0.84128182  0.11065667]
 [ 0.77459667  0.34641016 -0.5278631  -0.03688556]]
[[-11.61895004   9.03696114  -3.09838668  11.36075115]
 [  0.          -5.77350269  -3.81051178   6.35085296]
 [  0.           0.           2.42487113  -1.89700803]
 [  0.           0.           0.           0.03688556]]
We can use NumPy to check that the columns of $Q$ are orthogonal.
Also, both of these code samples actually perform one final step beyond the basic Gram-Schmidt iteration: after calculating each vector $\mathbf{v}_j$, the code normalizes it to length 1 before inserting it as a column of $Q$.
for i in range(Q.shape[1]):
    for j in range(i,Q.shape[1]):
        print("the inner product between columns", i, "and", j,"is", Q[:,i].dot(Q[:,j]))
# remember that Python's rows and columns are 0-indexed!
the inner product between columns 0 and 0 is 1.0
the inner product between columns 0 and 1 is 4.440892098500626e-16
the inner product between columns 0 and 2 is -9.43689570931383e-16
the inner product between columns 0 and 3 is 6.752931547282515e-14
the inner product between columns 1 and 1 is 1.0
the inner product between columns 1 and 2 is -6.106226635438361e-16
the inner product between columns 1 and 3 is 9.314771176605063e-14
the inner product between columns 2 and 2 is 0.9999999999999999
the inner product between columns 2 and 3 is -4.0610570462007445e-13
the inner product between columns 3 and 3 is 1.0000000000000002
Another way to perform this same check is to compute $Q^TQ$! (For easier reading, this time let's also round to the nearest integer.)
print((Q.T @ Q).round(0))
[[ 1.  0. -0.  0.]
 [ 0.  1. -0.  0.]
 [-0. -0.  1. -0.]
 [ 0.  0. -0.  1.]]
This is a very cool property! It means we can take the inverse of $Q$ very efficiently, without using the typical (slow) matrix inverse algorithm. Instead, we only need to take the matrix transpose. Let's make this idea more formal.
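For instance, to solve $Q\mathbf{x} = \mathbf{b}$ we can simply multiply by $Q^T$ instead of inverting anything. Here is a quick sketch using the Q computed above:
b = np.array([1., 2., 3., 4.])
x1 = np.linalg.solve(Q, b)    # general-purpose solver
x2 = Q.T @ b                  # uses Q^{-1} = Q^T, since Q is square with orthonormal columns
print(np.allclose(x1, x2))    # True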
Orthogonal sets are therefore very useful. However, they become even more useful if we normalize the vectors in the set.
Definition. A set $\{\mathbf{u}_1, \dots, \mathbf{u}_p\}$ is an orthonormal set if it is an orthogonal set of unit vectors (that is, vectors of norm 1).
If $W$ is the subspace spanned by such a set, then $\{\mathbf{u}_1, \dots, \mathbf{u}_p\}$ is an orthonormal basis for $W$, since the set is automatically linearly independent.
Example. The simplest example of an orthonormal set is the standard basis $\{\mathbf{e}_1, \dots, \mathbf{e}_n\}$ for $\mathbb{R}^n$.
Keep the terms clear in your head:
- orthogonal means that each pair of distinct vectors in the set is orthogonal;
- orthonormal means orthogonal and, in addition, each vector has been normalized to unit length.
(You can see the word "normalized" inside "orthonormal".)
Matrices with orthonormal columns are particularly important.
Theorem. An $m \times n$ matrix $U$ has orthonormal columns if and only if $U^TU = I$ (where $I$ denotes the $n \times n$ identity matrix).
Proof. Let $\mathbf{u}_1, \dots, \mathbf{u}_n$ denote the column vectors of $U$. Then the $(i, j)$ entry of $U^TU$ is $\mathbf{u}_i^T\mathbf{u}_j$:
$$U^TU = \begin{bmatrix} \mathbf{u}_1^T \\ \vdots \\ \mathbf{u}_n^T \end{bmatrix} \begin{bmatrix} \mathbf{u}_1 & \cdots & \mathbf{u}_n \end{bmatrix} = \begin{bmatrix} \mathbf{u}_1^T\mathbf{u}_1 & \cdots & \mathbf{u}_1^T\mathbf{u}_n \\ \vdots & \ddots & \vdots \\ \mathbf{u}_n^T\mathbf{u}_1 & \cdots & \mathbf{u}_n^T\mathbf{u}_n \end{bmatrix}.$$
Every diagonal entry of this matrix equals 1, because each $\mathbf{u}_i$ is a unit vector, so $\mathbf{u}_i^T\mathbf{u}_i = 1$.
Every off-diagonal entry equals 0, because each pair of distinct vectors $\mathbf{u}_i$ and $\mathbf{u}_j$ is orthogonal, so $\mathbf{u}_i^T\mathbf{u}_j = 0$.
So $U^TU = I$.
Theorem. Let $U$ be an $m \times n$ matrix with orthonormal columns, and let $\mathbf{x}$ and $\mathbf{y}$ be in $\mathbb{R}^n$. Then:
1. $\|U\mathbf{x}\| = \|\mathbf{x}\|$
2. $(U\mathbf{x})^T(U\mathbf{y}) = \mathbf{x}^T\mathbf{y}$
3. $(U\mathbf{x})^T(U\mathbf{y}) = 0$ if and only if $\mathbf{x}^T\mathbf{y} = 0$
We can combine Properties 1 and 3 to make an important statement.
For orthonormal $U$, the linear transformation $\mathbf{x} \mapsto U\mathbf{x}$ preserves lengths and orthogonality.
So, viewed as a linear operator, a matrix $U$ with orthonormal columns is very special: the lengths of vectors, and therefore the distances between points, are not changed by the action of $U$.
Notice as well that $U$ is $m \times n$ -- it may not be square.
So it may map vectors from one vector space to an entirely different vector space -- but the distances between points will not be changed!
#
display(Image("images/10-ortho1.png", width=600))
... and the orthogonality of vectors will not be changed!
#
display(Image("images/10-ortho2.png", width=600))
Note however that we cannot in general construct an orthonormal map from a higher dimension to a lower one.
For example, three orthogonal (nonzero) vectors in $\mathbb{R}^3$ cannot be mapped to three orthogonal (nonzero) vectors in $\mathbb{R}^2$. Can you see why this is impossible? What is it about the definition of an orthonormal set that prevents this?
Example. Let $U$ be a matrix with orthonormal columns and let $\mathbf{x}$ be any vector. Notice that, by the theorem above, $U^TU = I$.
Let's verify that $\|U\mathbf{x}\| = \|\mathbf{x}\|$.
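Here is a small numerical check, using an illustrative $U$ with orthonormal columns (this particular matrix and vector are chosen for the demo, not necessarily the ones used in lecture):
U = np.array([[1/np.sqrt(2),  2/3],
              [1/np.sqrt(2), -2/3],
              [0.,            1/3]])
x = np.array([np.sqrt(2), 3.])
print((U.T @ U).round(10))                         # the 2x2 identity: the columns are orthonormal
print(np.linalg.norm(U @ x), np.linalg.norm(x))    # the two norms agree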
One of the most useful transformation matrices is obtained when the columns of the matrix are orthonormal...
... and the matrix is square.
These matrices map vectors in $\mathbb{R}^n$ to new locations in the same space, i.e., $\mathbb{R}^n$ ...
... in a way that preserves lengths, distances and orthogonality.
Now, consider the case when is square, and has orthonormal columns.
Then the fact that $U^TU = I$ implies that $U^{-1} = U^T$.
In this case, $U$ is called an orthogonal matrix.
A good example of an orthogonal matrix is a rotation matrix:
$$R_\theta = \begin{bmatrix} \cos\theta & -\sin\theta \\ \sin\theta & \cos\theta \end{bmatrix}.$$
Using trigonometric identities, you should be able to convince yourself that $R_\theta^T R_\theta = I$,
and hopefully you can visualize how $R_\theta$ preserves lengths and orthogonality.
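A quick numerical sanity check of both claims (a sketch; any angle works):
theta = 0.7    # an arbitrary angle, in radians
R = np.array([[np.cos(theta), -np.sin(theta)],
              [np.sin(theta),  np.cos(theta)]])
v = np.array([3., 4.])
print((R.T @ R).round(10))                         # the 2x2 identity
print(np.linalg.norm(R @ v), np.linalg.norm(v))    # both 5.0: rotation preserves length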