#
import numpy as np
import scipy as sp
import pandas as pd
import matplotlib as mp
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
import laUtilities as ut
import slideUtilities as sl
import demoUtilities as dm
from matplotlib import animation
from importlib import reload
from datetime import datetime
from IPython.display import Image, display_html, display, Math, HTML;
qr_setting = None
mp.rcParams['animation.html'] = 'jshtml';
Last time, we formed a theory of diagonalizing a square matrix $A$ into $A = PDP^{-1}$, where $D$ is diagonal and $P$ is invertible.
Diagonalization is not always possible.
We also proved the following theorem.
Theorem. (The Diagonalization Theorem)
An $n \times n$ matrix $A$ is diagonalizable if and only if $A$ has $n$ linearly independent eigenvectors.
We can now turn to a geometric understanding of how diagonalization informs us about the properties of $A$.
Let's interpret the diagonalization $A = PDP^{-1}$ in terms of how $A$ acts as a linear operator.
When thinking of $A$ as a linear operator, diagonalization has a specific interpretation:
Diagonalization separates the influence of each vector component from the others.
To see what this means, let's first consider an easier case: a diagonal matrix. When we multiply a vector $\mathbf{x}$ by a diagonal matrix $D$, the change to each component of $\mathbf{x}$ depends only on that component.
That is, multiplying by a diagonal matrix simply scales the components of the vector.
Example. Let $D = \begin{bmatrix} 2 & 0 \\ 0 & 3 \end{bmatrix}$. Then $D\begin{bmatrix} x_1 \\ x_2 \end{bmatrix} = \begin{bmatrix} 2x_1 \\ 3x_2 \end{bmatrix}$.
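Here is a minimal numerical sketch of this scaling behavior, assuming the diagonal entries 2 and 3 from the example above (the vector $\mathbf{x}$ is arbitrary):

```python
import numpy as np

# Multiplying by a diagonal matrix scales each component of x independently.
D = np.diag([2.0, 3.0])      # diagonal entries from the example above
x = np.array([5.0, 4.0])     # an arbitrary vector

print(D @ x)                 # [10. 12.] -- component 1 doubled, component 2 tripled
```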
On the other hand, when we multiply $\mathbf{x}$ by a matrix that has off-diagonal entries, the components of $\mathbf{x}$ affect each other.
So diagonalizing a matrix allows us to bring intuition to its behavior as a linear operator.
When we compute $P\mathbf{x}$, we are taking a weighted sum of the columns of $P$: $P\mathbf{x} = x_1\mathbf{p}_1 + x_2\mathbf{p}_2 + \cdots + x_n\mathbf{p}_n$.
Now $P$ is square and invertible, so its columns are a basis for $\mathbb{R}^n$. Let's call that basis $\mathcal{B} = \{\mathbf{p}_1, \dots, \mathbf{p}_n\}$.
So, we can think of $P\mathbf{x}$ as "the point that has coordinates $\mathbf{x}$ in the basis $\mathcal{B}$."
On the other hand, what if we wanted to find the coordinates of a vector in the basis $\mathcal{B}$?
Let's say we have some $\mathbf{y}$ written in the standard basis, and we want to find its coordinates in the basis $\mathcal{B}$.
So $\mathbf{y} = P\mathbf{x}$ for some coordinate vector $\mathbf{x}$.
Then, since $P$ is invertible, $\mathbf{x} = P^{-1}\mathbf{y}$.
Thus, $P^{-1}\mathbf{y}$ is "the coordinates of $\mathbf{y}$ in the basis $\mathcal{B}$."
So we can interpret $A\mathbf{x} = PDP^{-1}\mathbf{x}$ as a three-step process:
1. Compute the coordinates of $\mathbf{x}$ in the basis $\mathcal{B}$. This is $P^{-1}\mathbf{x}$.
2. Scale those coordinates according to the diagonal matrix $D$. This is $DP^{-1}\mathbf{x}$.
3. Find the point that has those scaled coordinates in the basis $\mathcal{B}$. This is $PDP^{-1}\mathbf{x}$.
# A is diagonalizable as A = P D P^{-1}: the columns of P are (unit) eigenvectors
# of A, and the diagonal entries of D are the corresponding eigenvalues.
A = np.array([[ 1.86363636, 0.68181819],
              [-0.22727273, 3.13636364]])
P = np.array([[0.98058068, 0.51449576],
              [0.19611614, 0.85749293]])
D = np.array([[2, 0],
              [0, 3]])
np.allclose(A, P @ D @ np.linalg.inv(P))
True
Example. Let's visualize diagonalization geometrically.
Consider a fixed-but-unwritten matrix $A$. Here's a picture showing how it transforms a point $\mathbf{x}$.
#
ax = ut.plotSetup(-1,6,-1,6,size=(12,8))
ut.centerAxes(ax)
v1 = np.array([5.0,1.0])
v1 = v1 / np.sqrt(np.sum(v1*v1))
v2 = np.array([3.0,5.0])
v2 = v2 / np.sqrt(np.sum(v2*v2))
p1 = 2*v1+v2
p2 = 4*v1+3*v2
ut.plotVec(ax, p1,'k')
ut.plotVec(ax, p2,'k')
ax.annotate('', xy=(p2[0], p2[1]), xycoords='data',
xytext=(p1[0], p1[1]), textcoords='data',
size=15,
arrowprops={'arrowstyle': 'simple',
'fc': '0.7',
'ec': 'none',
'connectionstyle' : 'arc3,rad=-0.3'},
)
ax.text(2.5,0.75,r'${\bf x}$',size=20)
ax.text(5.2,2.75,r'$A{\bf x}$',size=20)
ax.plot(0,0,'');
So far, we cannot say much about what the linear transformation $\mathbf{x} \mapsto A\mathbf{x}$ does in general.
Now, let's compute $P^{-1}\mathbf{x}$.
Remember that the columns of $P$ are the eigenvectors of $A$.
So $P^{-1}\mathbf{x}$ gives the coordinates of the point $\mathbf{x}$ in the eigenvector basis:
#
ax = ut.plotSetup(-1,6,-1,6,size=(12,8))
ut.centerAxes(ax)
v1 = np.array([5.0,1.0])
v1 = v1 / np.sqrt(np.sum(v1*v1))
v2 = np.array([3.0,5.0])
v2 = v2 / np.sqrt(np.sum(v2*v2))
ut.plotVec(ax,v1,'b')
ut.plotVec(ax,v2)
ut.plotLinEqn(-v1[1],v1[0],0,color='b')
ut.plotLinEqn(-v2[1],v2[0],0,color='r')
for i in range(-4,8):
    ut.plotLinEqn(-v1[1],v1[0],i*(v1[0]*v2[1]-v1[1]*v2[0]),format=':',color='b')
    ut.plotLinEqn(-v2[1],v2[0],i*(v2[0]*v1[1]-v2[1]*v1[0]),format=':',color='r')
p1 = 2*v1+v2
p2 = 4*v1+3*v2
ut.plotVec(ax, p1,'k')
ax.text(2.5,0.75,r'${\bf x}$',size=20)
ax.text(v2[0]-0.15,v2[1]+0.5,r'${\bf p_2}$',size=20)
ax.text(v1[0]-0.15,v1[1]+0.35,r'${\bf p_1}$',size=20)
ax.plot(0,0,'');
The coordinates of $\mathbf{x}$ in this basis are $(2, 1)$.
In other words, $\mathbf{x} = 2\mathbf{p}_1 + \mathbf{p}_2$, so $P^{-1}\mathbf{x} = \begin{bmatrix} 2 \\ 1 \end{bmatrix}$.
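As a quick numerical sketch, we can check this by reusing the eigenvector definitions from the plotting code above (the point labeled $\mathbf{x}$ in the figures is constructed there as `2*v1 + v2`):

```python
import numpy as np

# Columns of P are the unit eigenvectors p1, p2 from the plotting code above.
v1 = np.array([5.0, 1.0]); v1 = v1 / np.linalg.norm(v1)
v2 = np.array([3.0, 5.0]); v2 = v2 / np.linalg.norm(v2)
P = np.column_stack([v1, v2])

x = 2*v1 + v2                    # the point x shown in the figures
print(np.linalg.solve(P, x))     # [2. 1.] -- coordinates of x in the eigenvector basis
```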
Now we compute $DP^{-1}\mathbf{x}$. Since $D$ is diagonal, this just scales each of the $\mathcal{B}$-coordinates.
In this example, the eigenvalue corresponding to $\mathbf{p}_1$ is 2, and the eigenvalue corresponding to $\mathbf{p}_2$ is 3.
#
ax = ut.plotSetup(-1,6,-1,6,size=(12,8))
ut.centerAxes(ax)
v1 = np.array([5.0,1.0])
v1 = v1 / np.sqrt(np.sum(v1*v1))
v2 = np.array([3.0,5.0])
v2 = v2 / np.sqrt(np.sum(v2*v2))
#ut.plotVec(ax,v1,'b')
#ut.plotVec(ax,v2)
ut.plotLinEqn(-v1[1],v1[0],0,color='b')
ut.plotLinEqn(-v2[1],v2[0],0,color='r')
for i in range(-4,8):
    ut.plotLinEqn(-v1[1],v1[0],i*(v1[0]*v2[1]-v1[1]*v2[0]),format=':',color='b')
    ut.plotLinEqn(-v2[1],v2[0],i*(v2[0]*v1[1]-v2[1]*v1[0]),format=':',color='r')
p1 = 2*v1+v2
p2 = 4*v1+3*v2
ut.plotVec(ax, p1,'k')
ut.plotVec(ax, p2,'k')
ax.annotate('', xy=(p2[0], p2[1]), xycoords='data',
xytext=(p1[0], p1[1]), textcoords='data',
size=15,
#bbox=dict(boxstyle="round", fc="0.8"),
arrowprops={'arrowstyle': 'simple',
'fc': '0.7',
'ec': 'none',
'connectionstyle' : 'arc3,rad=-0.3'},
)
ax.text(2.5,0.75,r'${\bf x}$',size=20)
ax.text(5.2,2.75,r'$A{\bf x}$',size=20)
ax.plot(0,0,'');
So the coordinates of $A\mathbf{x}$ in the basis $\mathcal{B}$ are $(4, 3)$.
Now we convert back to the standard basis -- that is, we ask which point has coordinates $(4, 3)$ in the basis $\mathcal{B}$.
We rely on the fact that if $\mathbf{y}$ has coordinates $\mathbf{c}$ in the basis $\mathcal{B}$, then $\mathbf{y} = P\mathbf{c}$.
So $A\mathbf{x} = P\begin{bmatrix} 4 \\ 3 \end{bmatrix} = 4\mathbf{p}_1 + 3\mathbf{p}_2$.
#
ax = ut.plotSetup(-1,6,-1,6,size=(12,8))
ut.centerAxes(ax)
v1 = np.array([5.0,1.0])
v1 = v1 / np.sqrt(np.sum(v1*v1))
v2 = np.array([3.0,5.0])
v2 = v2 / np.sqrt(np.sum(v2*v2))
#ut.plotVec(ax,v1,'b')
#ut.plotVec(ax,v2)
#ut.plotLinEqn(-v1[1],v1[0],0,color='b')
#ut.plotLinEqn(-v2[1],v2[0],0,color='r')
#for i in range(-3,8):
# ut.plotLinEqn(-v1[1],v1[0],i*(v1[0]*v2[1]-v1[1]*v2[0]),format=':',color='b')
# ut.plotLinEqn(-v2[1],v2[0],i*(v2[0]*v1[1]-v2[1]*v1[0]),format=':',color='r')
p1 = 2*v1+v2
p2 = 4*v1+3*v2
#ut.plotVec(ax, p1,'k')
ut.plotVec(ax, p2,'k')
#ax.annotate('', xy=(p2[0], p2[1]), xycoords='data',
# xytext=(p1[0], p1[1]), textcoords='data',
# size=15,
# #bbox=dict(boxstyle="round", fc="0.8"),
# arrowprops={'arrowstyle': 'simple',
# 'fc': '0.7',
# 'ec': 'none',
# 'connectionstyle' : 'arc3,rad=-0.3'},
# )
#ax.text(2.5,0.75,r'${\bf x}$',size=16)
ax.text(5.2,2.75,r'$A{\bf x}$',size=20)
ax.plot(0,0,'');
We find that $A\mathbf{x} = 4\mathbf{p}_1 + 3\mathbf{p}_2$.
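Here is a short numerical walk-through of the three steps, reusing the matrices $A$, $P$, and $D$ defined in the code cell earlier in this lecture (a sketch; the point $\mathbf{x}$ is the one from the figures, with coordinates $(2, 1)$ in the eigenvector basis):

```python
import numpy as np

A = np.array([[ 1.86363636, 0.68181819],
              [-0.22727273, 3.13636364]])
P = np.array([[0.98058068, 0.51449576],
              [0.19611614, 0.85749293]])
D = np.diag([2.0, 3.0])

x = P @ np.array([2.0, 1.0])        # the point with coordinates (2, 1) in the eigenvector basis

coords = np.linalg.solve(P, x)      # step 1: P^{-1} x   -> [2. 1.]
scaled = D @ coords                 # step 2: D P^{-1} x -> [4. 3.]
result = P @ scaled                 # step 3: convert back to the standard basis

print(coords, scaled)
print(np.allclose(result, A @ x))   # True: the three steps reproduce A x
```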
In conclusion: notice that the transformation $\mathbf{x} \mapsto A\mathbf{x}$ may be a complicated one in which each component of $\mathbf{x}$ affects each component of $A\mathbf{x}$.
However, by changing to the basis defined by the eigenspaces of $A$, the action of $A$ becomes simple to understand.
Diagonalization of $A$ changes to a basis in which the action of $A$ is particularly easy to understand and compute with.
[This lecture is based on Prof. Crovella's CS 132 lecture notes.]
#
fig = ut.three_d_figure((15, 1),
'Intersection of the positive definite quadratic form z = 3 x1^2 + 7 x2 ^2 with the constraint ||x|| = 1',
-2, 2, -2, 2, 0, 8,
equalAxes = False, figsize = (7, 7), qr = qr_setting)
qf = np.array([[3., 0.],[0., 7.]])
for angle in np.linspace(0, 2*np.pi, 200):
    x = np.array([np.cos(angle), np.sin(angle)])
    z = x.T @ qf @ x
    fig.plotPoint(x[0], x[1], z, 'b')
    fig.plotPoint(x[0], x[1], 0, 'g')
fig.plotQF(qf, alpha=0.5)
fig.ax.set_zlabel('z')
fig.desc['zlabel'] = 'z'
# do not call fig.save here
Next week, we will develop a generalization of this technique that applies to all matrices. It is called the singular value decomposition, and it is the main mathematical tool we have been building up to in this course. We will then explore its data science applications.
But we need one more mathematical tool in our toolbelt first.
Today we'll study a very important class of matrices: symmetric matrices.
Definition. A symmetric matrix is a matrix $A$ such that $A^T = A$.
Example. Any diagonal matrix is symmetric, as is any matrix whose off-diagonal entries mirror each other across the diagonal, such as $\begin{bmatrix} 1 & 2 \\ 2 & 3 \end{bmatrix}$.
We have already seen symmetric matrices within the normal equations: $A^TA$ is always symmetric, because $(A^TA)^T = A^T(A^T)^T = A^TA$.
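As a quick sanity check (a sketch using a randomly generated matrix, which is not part of the lecture's running example):

```python
import numpy as np

# A^T A is symmetric for any matrix A.
rng = np.random.default_rng(0)
A = rng.standard_normal((5, 3))     # an arbitrary 5x3 matrix
M = A.T @ A

print(np.allclose(M, M.T))          # True
```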
We'll see that symmetric matrices have properties that relate to both eigendecomposition and orthogonality.
Furthermore, symmetric matrices open up a broad class of problems we haven't yet touched on: constrained optimization.
As a result, symmetric matrices arise very often in applications.
At first glance, a symmetric matrix may not seem that special!
But in fact symmetric matrices have a number of interesting properties.
First, we'll look at a remarkable fact:
the eigenvectors of a symmetric matrix are orthogonal
Example. Diagonalize the following symmetric matrix:
Solution. Remember that diagonalization is a four-step process: (1) find the eigenvalues of $A$, (2) find the eigenvectors of $A$, (3) construct $P$ from the eigenvectors, and (4) construct $D$ from the corresponding eigenvalues.
The characteristic equation of $A$ is $\det(A - \lambda I) = 0$.
So the eigenvalues are 8, 6, and 3.
We construct a basis for each eigenspace (using our standard method of finding the nullspace of $A - \lambda I$):
These three eigenvectors $\mathbf{v}_1, \mathbf{v}_2, \mathbf{v}_3$ form a basis for $\mathbb{R}^3$.
But here is the remarkable thing: these three vectors form an orthogonal set.
That is, any two eigenvectors are orthogonal.
For example, $\mathbf{v}_1 \cdot \mathbf{v}_2 = 0$.
As a result, $\{\mathbf{v}_1, \mathbf{v}_2, \mathbf{v}_3\}$ forms an orthogonal basis for $\mathbb{R}^3$.
Let's normalize these vectors so they each have length 1; call the results $\mathbf{u}_1, \mathbf{u}_2, \mathbf{u}_3$.
The orthogonality of the eigenvectors of a symmetric matrix is quite important.
To see this, let's write the diagonalization of $A$ in terms of these eigenvectors and eigenvalues: $P = \begin{bmatrix} \mathbf{u}_1 & \mathbf{u}_2 & \mathbf{u}_3 \end{bmatrix}$ and $D = \begin{bmatrix} 8 & 0 & 0 \\ 0 & 6 & 0 \\ 0 & 0 & 3 \end{bmatrix}$.
Then $A = PDP^{-1}$, as usual.
But, here is the interesting thing:
$P$ is square and has orthonormal columns.
So $P$ is an orthogonal matrix (also known as an orthonormal matrix).
So, that means that $P^{-1} = P^T$.
So, $A = PDP^{-1} = PDP^T$.
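We can check this numerically. The matrix below is an assumed reconstruction of the example above (the original display did not survive); it is at least consistent, since its eigenvalues are indeed 8, 6, and 3:

```python
import numpy as np

# Assumed example matrix (its eigenvalues are 8, 6, and 3).
A = np.array([[ 6., -2., -1.],
              [-2.,  6., -1.],
              [-1., -1.,  5.]])

lam, P = np.linalg.eigh(A)    # eigh returns orthonormal eigenvectors for a symmetric matrix
print(lam)                                        # [3. 6. 8.]
print(np.allclose(P.T @ P, np.eye(3)))            # True: the eigenvectors are orthonormal
print(np.allclose(A, P @ np.diag(lam) @ P.T))     # True: A = P D P^T
```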
Here is a theorem that shows that we always get orthogonal eigenvectors when we diagonalize a symmetric matrix:
Theorem. If $A$ is symmetric, then any two eigenvectors of $A$ from different eigenspaces are orthogonal.
Proof.
Let $\mathbf{v}_1$ and $\mathbf{v}_2$ be eigenvectors that correspond to distinct eigenvalues, say, $\lambda_1$ and $\lambda_2$.
To show that $\mathbf{v}_1 \cdot \mathbf{v}_2 = 0$, compute
$$\lambda_1 (\mathbf{v}_1 \cdot \mathbf{v}_2) = (\lambda_1\mathbf{v}_1)^T\mathbf{v}_2 = (A\mathbf{v}_1)^T\mathbf{v}_2 = \mathbf{v}_1^TA^T\mathbf{v}_2 = \mathbf{v}_1^T(A\mathbf{v}_2) = \lambda_2 (\mathbf{v}_1 \cdot \mathbf{v}_2).$$
So we conclude that $(\lambda_1 - \lambda_2)(\mathbf{v}_1 \cdot \mathbf{v}_2) = 0$.
But $\lambda_1 \neq \lambda_2$, so this can only happen if $\mathbf{v}_1 \cdot \mathbf{v}_2 = 0$.
So $\mathbf{v}_1$ is orthogonal to $\mathbf{v}_2$.
The same argument applies to any other pair of eigenvectors with distinct eigenvalues.
So any two eigenvectors of from different eigenspaces are orthogonal.
We can now introduce a special kind of diagonalizability:
An $n \times n$ matrix $A$ is said to be orthogonally diagonalizable if there are an orthogonal matrix $P$ (with $P^{-1} = P^T$) and a diagonal matrix $D$ such that $A = PDP^T = PDP^{-1}$.
Such a diagonalization requires $n$ linearly independent and orthonormal eigenvectors.
When is this possible?
If $A$ is orthogonally diagonalizable, then $A^T = (PDP^T)^T = (P^T)^TD^TP^T = PDP^T = A$.
So $A$ is symmetric!
It turns out the converse is true (though we won't prove it).
Hence we obtain the following theorem:
Theorem. An $n \times n$ matrix $A$ is orthogonally diagonalizable if and only if it is a symmetric matrix.
Recall from last time: we found that it was a difficult process, in general, to determine if an arbitrary matrix was diagonalizable.
But here, we have a very nice rule: every symmetric matrix is (orthogonally) diagonalizable.
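In NumPy, `np.linalg.eigh` computes this orthogonal diagonalization for a symmetric matrix. A minimal sketch, using a random symmetric matrix (chosen only for illustration):

```python
import numpy as np

rng = np.random.default_rng(1)
B = rng.standard_normal((4, 4))
S = (B + B.T) / 2                       # symmetrize an arbitrary matrix

lam, Q = np.linalg.eigh(S)              # eigenvalues (ascending) and orthonormal eigenvectors
print(np.allclose(Q @ Q.T, np.eye(4)))            # True: Q is orthogonal
print(np.allclose(S, Q @ np.diag(lam) @ Q.T))     # True: S = Q D Q^T
```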
An important role that symmetric matrices play has to do with quadratic expressions.
Up until now, we have mainly focused on linear equations -- equations in which the terms occur only to the first power.
Actually, though, we have looked at some quadratic expressions when we considered least-squares problems.
For example, we looked at expressions such as $\|A\mathbf{x} - \mathbf{b}\|^2$, which is $(A\mathbf{x} - \mathbf{b})^T(A\mathbf{x} - \mathbf{b})$.
We'll now look at quadratic expressions generally. We'll see that there is a natural and useful connection to symmetric matrices.
Definition. A quadratic form is a function of $n$ variables, e.g., $Q(x_1, \dots, x_n)$, in which every term has degree two.
Examples:
$4x_1^2 + 2x_1x_2 + 3x_2^2$ is a quadratic form.
$4x_1^2 + 2x_1$ is not a quadratic form (the term $2x_1$ has degree one).
Quadratic forms arise in many settings, including signal processing, graph analysis, physics, economics, and statistics.
Within data science, quadratic forms arise in many optimization problems, among other areas.
Essentially, what an expression like $ax^2$ is to a scalar $x$, a quadratic form is to a vector $\mathbf{x}$.
Fact. Every quadratic form can be expressed as $\mathbf{x}^TA\mathbf{x}$, where $A$ is a symmetric matrix.
There is a simple way to go from a quadratic form to a symmetric matrix, and vice versa.
To see this, let's look at some examples.
Example. Let $\mathbf{x} = \begin{bmatrix} x_1 \\ x_2 \end{bmatrix}$, and compute $\mathbf{x}^TA\mathbf{x}$ for a diagonal matrix $A = \begin{bmatrix} a & 0 \\ 0 & b \end{bmatrix}$.
Solution. $\mathbf{x}^TA\mathbf{x} = ax_1^2 + bx_2^2$.
Example. Compute $\mathbf{x}^TA\mathbf{x}$ for a matrix with off-diagonal entries, $A = \begin{bmatrix} a & c \\ c & b \end{bmatrix}$.
Solution. $\mathbf{x}^TA\mathbf{x} = ax_1^2 + 2cx_1x_2 + bx_2^2$.
Notice the simple structure: the diagonal entries of $A$ multiply the squared terms $x_i^2$, while each off-diagonal entry contributes to a cross term $x_ix_j$.
We can write this concisely: for a symmetric $A$, $\mathbf{x}^TA\mathbf{x} = \sum_{i} a_{ii}x_i^2 + 2\sum_{i<j} a_{ij}x_ix_j$.
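A small numerical sketch of this identity, using a random symmetric matrix (chosen only for illustration):

```python
import numpy as np

rng = np.random.default_rng(2)
B = rng.standard_normal((3, 3))
A = (B + B.T) / 2                       # a random symmetric matrix
x = rng.standard_normal(3)

lhs = x @ A @ x
rhs = np.sum(np.diag(A) * x**2) + 2 * sum(A[i, j] * x[i] * x[j]
                                          for i in range(3) for j in range(i + 1, 3))
print(np.isclose(lhs, rhs))             # True
```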
Example. For $\mathbf{x}$ in $\mathbb{R}^3$, let $Q(\mathbf{x})$ be a quadratic form containing both squared terms and cross terms.
Write this quadratic form as $\mathbf{x}^TA\mathbf{x}$.
Solution.
The coefficients of $x_1^2, x_2^2, \dots$ go on the diagonal of $A$.
Based on the previous example, we can see that the coefficient of each cross term $x_ix_j$ is the sum of two entries in symmetric positions on opposite sides of the diagonal of $A$.
So to make $A$ symmetric, the coefficient of $x_ix_j$ for $i \neq j$ must be split evenly between the $(i,j)$- and $(j,i)$-entries of $A$.
You can check that the resulting symmetric matrix $A$ reproduces $Q(\mathbf{x})$; a numerical sketch follows below.
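Here is a sketch of the recipe, using a hypothetical quadratic form chosen for illustration (it is not necessarily the form in the original example):

```python
import numpy as np

# Hypothetical form: Q(x) = 5*x1^2 + 3*x2^2 + 2*x3^2 - x1*x2 + 8*x2*x3.
# Squared-term coefficients go on the diagonal; each cross-term coefficient is split in half.
A = np.array([[ 5. , -0.5, 0. ],
              [-0.5,  3. , 4. ],
              [ 0. ,  4. , 2. ]])

def Q(x):
    x1, x2, x3 = x
    return 5*x1**2 + 3*x2**2 + 2*x3**2 - x1*x2 + 8*x2*x3

x = np.array([1.0, -2.0, 3.0])
print(Q(x), x @ A @ x)                  # both evaluate to the same value (-11.0)
```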
Notice that $\mathbf{x}^TA\mathbf{x}$ is a scalar.
In other words, when $A$ is an $n \times n$ matrix, the quadratic form $Q(\mathbf{x}) = \mathbf{x}^TA\mathbf{x}$ is a function that maps vectors in the domain $\mathbb{R}^n$ to real numbers in the codomain $\mathbb{R}$.
Here are four quadratic forms with domain $\mathbb{R}^2$.
Notice that, except at $\mathbf{x} = \mathbf{0}$, the values of $Q(\mathbf{x})$ are all positive in the first case, and all negative in the last case.
#
fig = ut.three_d_figure((29, 1), 'z = 3 x1^2 + 7 x2 ^2',
-2, 2, -2, 2, -1, 3,
figsize = (7, 7), qr = qr_setting)
qf = np.array([[3., 0.],[0., 7.]])
fig.plotQF(qf, alpha = 0.7)
fig.ax.set_zlabel('z')
fig.desc['zlabel'] = 'z'
fig.set_title(r'$z = 3x_1^2 + 7x_2^2$', 'z = 3x_1^2 + 7x_2^2', size = 18)
fig.save();
#
fig = ut.three_d_figure((29, 2), 'z = 3 x1^2',
-2, 2, -2, 2, -1, 3,
figsize = (7, 7), qr = qr_setting)
qf = np.array([[3., 0.],[0., 0.]])
fig.plotQF(qf, alpha = 0.7)
fig.ax.set_zlabel('z')
fig.desc['zlabel'] = 'z'
fig.set_title(r'$z = 3x_1^2$', 'z = 3x_1^2', size = 18)
fig.save();
#
fig = ut.three_d_figure((29, 3), 'z = 3 x1^2 - 7 x2 ^2',
-2, 2, -2, 2, -1, 3,
figsize = (7, 7), qr = qr_setting)
qf = np.array([[3., 0.],[0., -7.]])
fig.plotQF(qf, alpha = 0.7)
fig.ax.set_zlabel('z')
fig.desc['zlabel'] = 'z'
fig.set_title(r'$z = 3x_1^2 - 7x_2^2$', 'z = 3x_1^2 - 7x_2^2', size = 18)
fig.save();
#
fig = ut.three_d_figure((29, 4), 'z = -3 x1^2 - 7 x2 ^2',
-2, 2, -2, 2, -3, 1,
figsize = (7, 7), qr = qr_setting)
qf = np.array([[-3., 0.],[0., -7.]])
fig.plotQF(qf, alpha = 0.7)
fig.ax.set_zlabel('z')
fig.desc['zlabel'] = 'z'
fig.set_title(r'$z = -3x_1^2 - 7x_2^2$', 'z = -3x_1^2 - 7x_2^2', size = 18)
fig.save();
The differences between these surfaces are important for problems such as optimization.
In an optimization problem, one seeks the minimum or maximum value of a function (perhaps over a subset of its domain).
Definition. A quadratic form $Q(\mathbf{x})$ is:
- positive definite if $Q(\mathbf{x}) > 0$ for all $\mathbf{x} \neq \mathbf{0}$,
- negative definite if $Q(\mathbf{x}) < 0$ for all $\mathbf{x} \neq \mathbf{0}$,
- positive semidefinite if $Q(\mathbf{x}) \geq 0$ for all $\mathbf{x}$,
- negative semidefinite if $Q(\mathbf{x}) \leq 0$ for all $\mathbf{x}$,
- indefinite if $Q(\mathbf{x})$ takes on both positive and negative values.
If $Q(\mathbf{x}) = \mathbf{x}^TA\mathbf{x}$, then we will refer to $A$ as a positive (semi)definite or negative (semi)definite matrix accordingly.
Knowing which kind of quadratic form one is dealing with is important for optimization. Consider these two quadratic forms: $z = \mathbf{x}^TA\mathbf{x}$ with $A = \begin{bmatrix} 3 & 2 \\ 2 & 1 \end{bmatrix}$, and $z = \mathbf{x}^TB\mathbf{x}$ with $B = \begin{bmatrix} 3 & 2 \\ 2 & 3 \end{bmatrix}$.
Notice that the two matrices differ in only one position.
Let's say we would like to find $\min_{\mathbf{x}} \mathbf{x}^TA\mathbf{x}$ and $\min_{\mathbf{x}} \mathbf{x}^TB\mathbf{x}$.
In each case, we need to ask: is it possible? Does a minimum value even exist?
#
fig = ut.three_d_figure((29, 5), 'P(x)',
-5, 5, -5, 5, -5, 5,
figsize = (7, 7), qr = qr_setting)
qf = np.array([[3., 2],[2, 1]])
fig.plotQF(qf, alpha = 0.7)
fig.ax.set_zlabel('z')
fig.desc['zlabel'] = 'z'
fig.set_title(r'$z = \mathbf{x}^TA\mathbf{x}$', 'z = x^TAx', size = 18)
fig.ax.view_init(azim = 30)
fig.save();
#
fig = ut.three_d_figure((29, 6), 'Q(x)',
-5, 5, -5, 5, -5, 5,
figsize = (7, 7), qr = qr_setting)
qf = np.array([[3., 2],[2, 3]])
fig.plotQF(qf, alpha = 0.7)
fig.ax.set_zlabel('z')
fig.desc['zlabel'] = 'z'
fig.set_title(r'$z = \mathbf{x}^TB\mathbf{x}$', 'z = x^TBx', size = 18)
fig.save();
Clearly, we cannot minimize $\mathbf{x}^TA\mathbf{x}$ over all $\mathbf{x} \in \mathbb{R}^2$,
... but we can minimize $\mathbf{x}^TB\mathbf{x}$ over all $\mathbf{x} \in \mathbb{R}^2$.
How can we distinguish between these two cases in general?
That is, in high dimension, without the ability to visualize?
There is a remarkably simple way to determine, for any quadratic form, which class it falls into.
Theorem. Let $A$ be an $n \times n$ symmetric matrix.
Then the quadratic form $\mathbf{x}^TA\mathbf{x}$ is:
- positive definite if and only if the eigenvalues of $A$ are all positive,
- negative definite if and only if the eigenvalues of $A$ are all negative,
- positive semidefinite if and only if the eigenvalues of $A$ are all nonnegative,
- negative semidefinite if and only if the eigenvalues of $A$ are all nonpositive,
- indefinite if and only if $A$ has both positive and negative eigenvalues.
Let's see why this theorem holds for the positive semidefinite case. (The others are similar.)
Let's consider $\mathbf{u}$, a unit eigenvector of $A$ with eigenvalue $\lambda$. Then $\mathbf{u}^TA\mathbf{u} = \lambda\,\mathbf{u}^T\mathbf{u} = \lambda$.
If all eigenvalues are positive or zero, then all such terms are positive or zero.
Since $A$ is symmetric, it is orthogonally diagonalizable, and so its eigenvectors span $\mathbb{R}^n$.
So any $\mathbf{x}$ can be expressed as a weighted sum of $A$'s orthonormal eigenvectors: $\mathbf{x} = c_1\mathbf{u}_1 + \cdots + c_n\mathbf{u}_n$.
Writing out the expansion of $\mathbf{x}^TA\mathbf{x}$ in terms of $A$'s eigenvectors, the cross terms vanish by orthogonality, leaving $\mathbf{x}^TA\mathbf{x} = \lambda_1c_1^2 + \cdots + \lambda_nc_n^2$ -- a sum of only positive or zero terms.
Proof (semidefinite case). Let $\lambda$ be an eigenvalue of $A$ with corresponding unit eigenvector $\mathbf{u}$. Then $\mathbf{u}^TA\mathbf{u} = \lambda\,\mathbf{u}^T\mathbf{u} = \lambda$.
If $\lambda < 0$, then $\mathbf{u}^TA\mathbf{u}$ is negative, so $\mathbf{x}^TA\mathbf{x}$ cannot be positive semidefinite.
Or, to state the contrapositive of this statement: if $\mathbf{x}^TA\mathbf{x}$ is positive semidefinite, then $A$ can only have positive or zero eigenvalues.
Example. Let's look at the four quadratic forms above. Their associated matrices are $\begin{bmatrix} 3 & 0 \\ 0 & 7 \end{bmatrix}$, $\begin{bmatrix} 3 & 0 \\ 0 & 0 \end{bmatrix}$, $\begin{bmatrix} 3 & 0 \\ 0 & -7 \end{bmatrix}$, and $\begin{bmatrix} -3 & 0 \\ 0 & -7 \end{bmatrix}$.
Remember that for a diagonal matrix, its eigenvalues equal its diagonal entries. So these four forms are, respectively, positive definite, positive semidefinite, indefinite, and negative definite.
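Based on the theorem above, here is a small helper sketch that classifies a form from the signs of the eigenvalues of its matrix; the four diagonal matrices are the ones from the plots:

```python
import numpy as np

def classify(M, tol=1e-12):
    """Classify the quadratic form x^T M x by the signs of M's eigenvalues."""
    lam = np.linalg.eigvalsh(M)
    if np.all(lam > tol):   return 'positive definite'
    if np.all(lam < -tol):  return 'negative definite'
    if np.all(lam >= -tol): return 'positive semidefinite'
    if np.all(lam <= tol):  return 'negative semidefinite'
    return 'indefinite'

for M in [np.diag([3., 7.]), np.diag([3., 0.]), np.diag([3., -7.]), np.diag([-3., -7.])]:
    print(np.diag(M), classify(M))
# [3. 7.] positive definite      [3. 0.] positive semidefinite
# [ 3. -7.] indefinite           [-3. -7.] negative definite
```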
#
fig = ut.three_d_figure((29, 1), 'z = 3 x1^2 + 7 x2 ^2',
-2, 2, -2, 2, -1, 3,
figsize = (7, 7), qr = qr_setting)
qf = np.array([[3., 0.],[0., 7.]])
fig.plotQF(qf, alpha = 0.7)
fig.ax.set_zlabel('z')
fig.desc['zlabel'] = 'z'
fig.set_title(r'$z = 3x_1^2 + 7x_2^2$', 'z = 3x_1^2 + 7x_2^2', size = 18)
fig.save();
#
fig = ut.three_d_figure((29, 2), 'z = 3 x1^2',
-2, 2, -2, 2, -1, 3,
figsize = (7, 7), qr = qr_setting)
qf = np.array([[3., 0.],[0., 0.]])
fig.plotQF(qf, alpha = 0.7)
fig.ax.set_zlabel('z')
fig.desc['zlabel'] = 'z'
fig.set_title(r'$z = 3x_1^2$', 'z = 3x_1^2', size = 18)
fig.save();
#
fig = ut.three_d_figure((29, 3), 'z = 3 x1^2 - 7 x2 ^2',
-2, 2, -2, 2, -1, 3,
figsize = (7, 7), qr = qr_setting)
qf = np.array([[3., 0.],[0., -7.]])
fig.plotQF(qf, alpha = 0.7)
fig.ax.set_zlabel('z')
fig.desc['zlabel'] = 'z'
fig.set_title(r'$z = 3x_1^2 - 7x_2^2$', 'z = 3x_1^2 - 7x_2^2', size = 18)
fig.save();
#
fig = ut.three_d_figure((29, 4), 'z = -3 x1^2 - 7 x2 ^2',
-2, 2, -2, 2, -3, 1,
figsize = (7, 7), qr = qr_setting)
qf = np.array([[-3., 0.],[0., -7.]])
fig.plotQF(qf, alpha = 0.7)
fig.ax.set_zlabel('z')
fig.desc['zlabel'] = 'z'
fig.set_title(r'$z = -3x_1^2 - 7x_2^2$', 'z = -3x_1^2 - 7x_2^2', size = 18)
fig.save();
Example. Is the following quadratic form positive definite?
Solution. Because of all the plus signs, this form "looks" positive definite. But the symmetric matrix of the form has eigenvalues that turn out to be 5, 2, and -1.
So this is an indefinite quadratic form.
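A numerical check of this conclusion. The matrix below is an assumed reconstruction of the form's matrix (it is at least consistent with the stated eigenvalues, but treat it as an illustration rather than the original example):

```python
import numpy as np

# Assumed matrix of the form; its eigenvalues should be 5, 2, and -1.
A = np.array([[3., 2., 0.],
              [2., 2., 2.],
              [0., 2., 1.]])

print(np.linalg.eigvalsh(A))    # approximately [-1.  2.  5.] -- mixed signs, so indefinite
```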
Next lecture we'll talk about constrained optimization problems and how to solve them using eigenvalues.