# for QR codes use inline
%matplotlib inline
qr_setting = 'url'
#
# for lecture use notebook
# %matplotlib notebook
# qr_setting = None
#
%config InlineBackend.figure_format='retina'
# import libraries
import numpy as np
import matplotlib as mp
import pandas as pd
import matplotlib.pyplot as plt
import laUtilities as ut
import slideUtilities as sl
import demoUtilities as dm
from importlib import reload
from datetime import datetime
from matplotlib import animation
from IPython.display import Image
from IPython.display import display_html
from IPython.display import display
from IPython.display import Math
from IPython.display import Latex
from IPython.display import HTML;
from collections import Counter
import string
import random
import itertools
Of all of the linear transformations associated with a square matrix, scaling is special because:
If a matrix $A$ scales $\mathbf{x}$, then that transformation could also have been expressed without a matrix-vector multiplication, i.e., as $\lambda\mathbf{x}$ for some scalar value $\lambda$.
An eigenvector of a matrix $A$ is a special vector that does not change its direction when multiplied by $A$.
The eigenvalues of a matrix $A$ are all of the scalars $\lambda$ that an eigenvector of $A$ can be scaled by.
#
ax = ut.plotSetup(size=(12,8))
ut.centerAxes(ax)
A = np.array([[3,-2],[1,0]])
u = np.array([-1,1])
v = np.array([2,1])
#
ut.plotArrowVec(ax, v, [0,0], color='Red')
ut.plotArrowVec(ax, A.dot(v), [0,0], color='Red')
ax.text(v[0],v[1]+0.2,r'${\bf v}$',size=20)
ax.text(A.dot(v)[0],A.dot(v)[1]+0.2,r'$A{\bf v}$',size=20)
#
ut.plotArrowVec(ax, u, [0,0], color='Blue')
ut.plotArrowVec(ax, A.dot(u), [0,0], color='Blue')
ax.text(u[0]-0.5,u[1]+0.1,r'${\bf u}$',size=20)
ax.text(A.dot(u)[0]-0.7,A.dot(u)[1]+0.3,r'$A{\bf u}$',size=20);
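The matrix and vectors below are the same ones used in the plot above; this quick numerical check (a sketch, not part of the original lecture code) confirms that $\mathbf{v}$ is an eigenvector of $A$ while $\mathbf{u}$ is not:
#
A = np.array([[3, -2], [1, 0]])
u = np.array([-1, 1])
v = np.array([2, 1])
print(A @ v, 2 * v)          # [4 2] [4 2] -- Av points in the same direction as v, scaled by 2
print(A @ u)                 # [-5 -1] -- Au is not a scalar multiple of u
print(np.linalg.eig(A)[0])   # the eigenvalues of A are 2 and 1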
To find an eigenvalue: we saw that $\lambda$ is an eigenvalue of an $n \times n$ matrix $A$ if and only if the equation
$$(A - \lambda I)\mathbf{x} = \mathbf{0}$$
has a nontrivial solution.
Some special cases: the equation $(A - \lambda I)\mathbf{x} = \mathbf{0}$ has a nontrivial solution exactly when $A - \lambda I$ is not invertible, and a square matrix is not invertible exactly when its determinant is zero.
Combining these properties leads to the characteristic equation: a scalar $\lambda$ is an eigenvalue of an $n \times n$ matrix $A$ if and only if $\lambda$ satisfies
$$\det(A - \lambda I) = 0.$$
This equation yields a polynomial in $\lambda$ of degree $n$, meaning there are at most $n$ solutions.
To find all eigenvectors corresponding to an eigenvalue $\lambda$: compute the null space of the matrix $A - \lambda I$.
(Remember that this forms a subspace, which we call the eigenspace of $A$ corresponding to $\lambda$.)
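Here is a minimal sketch of this recipe, applied to the $2 \times 2$ matrix from the plot above (the SVD is used as a convenient numerical way to get a basis for a 1-dimensional null space):
#
A = np.array([[3, -2], [1, 0]])
# characteristic polynomial: det(A - lambda I) = lambda^2 - 3 lambda + 2
lam = np.roots([1, -3, 2])
print(lam)                           # eigenvalues: 2 and 1
for l in lam:
    M = A - l * np.eye(2)
    _, s, Vt = np.linalg.svd(M)
    print(l, Vt[-1])                 # last right-singular vector spans the null space of M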
A difference equation (also known as a dynamical system or recurrence relation) contains two parts: a rule that maps the current state to the next state, $\mathbf{x}_{k+1} = A\mathbf{x}_k$, and an initial condition $\mathbf{x}_0$.
A Markov chain is a dynamical system whose state is a probability vector and whose state transitions are determined by a stochastic matrix $P$, meaning that
$$\mathbf{x}_{k+1} = P\mathbf{x}_k, \qquad k = 0, 1, 2, \dots$$
Here: each state $\mathbf{x}_k$ is a probability vector (its entries are nonnegative and sum to 1), and each column of $P$ is itself a probability vector (so each column of $P$ sums to 1).
An important question about a Markov Chain is: what will happen in the distant future?
If $P$ is a stochastic matrix, then a steady-state vector (or equilibrium vector) for $P$ is a probability vector $\mathbf{q}$ such that:
$$P\mathbf{q} = \mathbf{q}.$$
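The cell below plots the three components of the state vector over 15 steps for a small example chain, but the cell that builds the array xs is not shown here. The following is a minimal sketch that produces such an array; the 3-state stochastic matrix P_example and the initial vector are illustrative assumptions, not the values used in the original lecture.
#
# hypothetical 3-state stochastic matrix (each column sums to 1) -- an assumption for illustration
P_example = np.array([[0.5, 0.2, 0.3],
                      [0.3, 0.8, 0.3],
                      [0.2, 0.0, 0.4]])
# iterate x_{k+1} = P x_k for 15 steps, storing each state as a row of xs
xs = np.zeros((15, 3))
xs[0] = np.array([1.0, 0.0, 0.0])    # assumed initial probability vector
for k in range(1, 15):
    xs[k] = P_example @ xs[k-1]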
ax = plt.subplot(131)
plt.plot(range(15),xs.T[0],'o-')
ax.set_ylim([0,1])
plt.title(r'$x_1$',size=24)
ax = plt.subplot(132)
plt.plot(range(15),xs.T[1],'o-')
ax.set_ylim([0,1])
plt.title(r'$x_2$',size=24)
ax = plt.subplot(133)
plt.plot(range(15),xs.T[2],'o-')
ax.set_ylim([0,1])
plt.title(r'$x_3$',size=24)
plt.tight_layout()
We can find a steady state vector by solving the equation $P\mathbf{q} = \mathbf{q}$, or equivalently
$$(P - I)\mathbf{q} = \mathbf{0},$$
and restricting to vectors that are probability vectors.
To solve a Markov Chain for its steady state: find the null space of $P - I$, then rescale a basis vector of that null space so that its entries sum to 1 (a numerical sketch of this recipe is shown below).
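Here is a minimal numerical sketch of that recipe, applied to the 2-by-2 population-movement matrix used later in this lecture (np.linalg.eig is used as a shortcut for finding a vector in the null space of $P - I$):
#
P = np.array([[0.95, 0.03],
              [0.05, 0.97]])
eigvals, eigvecs = np.linalg.eig(P)
q = eigvecs[:, np.argmin(np.abs(eigvals - 1.0))]   # eigenvector for the eigenvalue 1
q = q / q.sum()                                    # rescale so the entries sum to 1
print(q)                                           # [0.375 0.625]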
We can place the theory of Markov Chains into the broader context of eigenvalues and eigenvectors.
Theorem. The largest eigenvalue of a Markov Chain is 1.
Proof. First of all, 1 is an eigenvalue of a Markov chain, and its corresponding eigenvector is the steady state vector $\mathbf{q}$. This is because
$$P\mathbf{q} = \mathbf{q} = 1 \cdot \mathbf{q}.$$
To prove that 1 is the largest eigenvalue, recall that each column of a Markov Chain's matrix $P$ sums to 1.
Then, consider the sum of the entries in the vector $P\mathbf{x}$, for a probability vector $\mathbf{x}$.
Let's just sum the first terms in each component of $P\mathbf{x}$: these are $P_{11}x_1, P_{21}x_1, \dots, P_{n1}x_1$, and they add up to $x_1$ because the first column of $P$ sums to 1. Likewise the second terms add up to $x_2$, and so on.
So we can see that the sum of all terms in $P\mathbf{x}$ is equal to $x_1 + x_2 + \cdots + x_n$ -- i.e., the sum of all terms in $\mathbf{x}$.
So there can be no $\lambda > 1$ and probability vector $\mathbf{x}$ such that
$$P\mathbf{x} = \lambda\mathbf{x},$$
since the entries of $P\mathbf{x}$ must sum to the same value as the entries of $\mathbf{x}$, while the entries of $\lambda\mathbf{x}$ would sum to $\lambda > 1$.
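A quick numerical illustration of this column-sum argument (a sketch, using the population-movement matrix that appears in the example below):
#
P = np.array([[0.95, 0.03],
              [0.05, 0.97]])
x = np.array([0.6, 0.4])         # a probability vector
print(x.sum(), (P @ x).sum())    # multiplying by P preserves the sum of the entries (both are 1.0)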
Previously, we were only able to ask about the "eventual" steady state of a Markov Chain.
But a crucial question is: how long does it take for a particular Markov Chain to reach steady state from some initial starting condition?
Example. Let's use our running example of population movement, defined by
$$\mathbf{x}_{k+1} = A\mathbf{x}_k, \qquad A = \begin{bmatrix} 0.95 & 0.03 \\ 0.05 & 0.97 \end{bmatrix}.$$
Let's ask how long until it reaches steady state, from the starting point defined as
$$\mathbf{x}_0 = \begin{bmatrix} 0.6 \\ 0.4 \end{bmatrix}.$$
Remember that $\mathbf{x}_0$ is a probability vector -- its entries are nonnegative and sum to 1.
#
ax = ut.plotSetup(-0.1,1.7,-0.1,1.2)
ut.centerAxes(ax)
A = np.array([[0.95,0.03],[0.05,0.97]])
v1 = np.array([0.375,0.625])
v2 = np.array([0.225,-0.225])
x0 = v1 + v2
#
ax.plot([1,0],[0,1],'b--')
ax.plot(x0[0],x0[1],'bo')
ax.text(x0[0]+0.02,x0[1]+0.02,r'${\bf x_0}$',size=16)
#ax.text(A.dot(x0)[0]+0.2,A.dot(x0)[1]+0.2,r'$A{\bf x_0}$',size=16)
# ax.plot([-10,10],[5*10/6.0,-5*10/6.0],'b-')
#
ax.annotate('Initial State', xy=(x0[0], x0[1]), xycoords='data',
xytext=(0.4, 0.8), textcoords='data',
size=15,
#bbox=dict(boxstyle="round", fc="0.8"),
arrowprops={'arrowstyle': 'simple',
'fc': '0.5',
'ec': 'none',
'connectionstyle' : 'arc3,rad=-0.3'},
);
The matrix $A$ has the characteristic equation:
$$\det(A - \lambda I) = (0.95 - \lambda)(0.97 - \lambda) - (0.03)(0.05) = \lambda^2 - 1.92\lambda + 0.92 = 0.$$
Using the quadratic formula, we find the roots of this equation to be 1 and 0.92. (Note that, as expected, 1 is the largest eigenvalue.)
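A one-line numerical check of those roots:
print(np.roots([1, -1.92, 0.92]))   # the roots of lambda^2 - 1.92 lambda + 0.92 are 1 and 0.92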
Next, we can find a basis for each eigenspace of $A$ (each null space of $A - \lambda I$).
For $\lambda_1 = 1$, the null space of $A - I$ is spanned by $\begin{bmatrix} 3 \\ 5 \end{bmatrix}$; scaling this to a probability vector, we take $\mathbf{v}_1 = \begin{bmatrix} 0.375 \\ 0.625 \end{bmatrix}$. For $\lambda_2 = 0.92$, the null space of $A - 0.92I$ is spanned by $\mathbf{v}_2 = \begin{bmatrix} 1 \\ -1 \end{bmatrix}$.
Next, we write the initial state $\mathbf{x}_0$ as a linear combination of $\mathbf{v}_1$ and $\mathbf{v}_2$. This can be done because $\{\mathbf{v}_1, \mathbf{v}_2\}$ is obviously a basis for $\mathbb{R}^2$.
To write $\mathbf{x}_0$ this way, we want to solve the vector equation
$$c_1\mathbf{v}_1 + c_2\mathbf{v}_2 = \mathbf{x}_0.$$
In other words:
$$\begin{bmatrix} \mathbf{v}_1 & \mathbf{v}_2 \end{bmatrix} \begin{bmatrix} c_1 \\ c_2 \end{bmatrix} = \mathbf{x}_0.$$
The matrix $\begin{bmatrix} \mathbf{v}_1 & \mathbf{v}_2 \end{bmatrix}$ is invertible, so
$$\begin{bmatrix} c_1 \\ c_2 \end{bmatrix} = \begin{bmatrix} \mathbf{v}_1 & \mathbf{v}_2 \end{bmatrix}^{-1} \mathbf{x}_0 = \begin{bmatrix} 0.375 & 1 \\ 0.625 & -1 \end{bmatrix}^{-1} \begin{bmatrix} 0.6 \\ 0.4 \end{bmatrix} = \begin{bmatrix} 1 \\ 0.225 \end{bmatrix}.$$
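A minimal check of this computation in numpy:
#
v1 = np.array([0.375, 0.625])
v2 = np.array([1, -1])
x0 = np.array([0.6, 0.4])
c = np.linalg.solve(np.column_stack([v1, v2]), x0)   # solve [v1 v2] c = x0
print(c)                                             # [1.    0.225]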
So, now we can put it all together.
We know that
$$\mathbf{x}_0 = c_1\mathbf{v}_1 + c_2\mathbf{v}_2,$$
and we know the values of $c_1$ and $c_2$ that make this true.
And then we can use the eigendecomposition to compute subsequent terms $\mathbf{x}_1, \mathbf{x}_2, \dots$:
$$\mathbf{x}_1 = A\mathbf{x}_0 = c_1 A\mathbf{v}_1 + c_2 A\mathbf{v}_2 = c_1\lambda_1\mathbf{v}_1 + c_2\lambda_2\mathbf{v}_2.$$
Now note the power of the eigenvalue approach: each further multiplication by $A$ simply multiplies each coefficient by the corresponding eigenvalue -- no matrix-vector multiplication is needed.
And so in general:
$$\mathbf{x}_k = c_1\lambda_1^k\mathbf{v}_1 + c_2\lambda_2^k\mathbf{v}_2, \qquad k = 0, 1, 2, \dots$$
And using the $\lambda$s, $\mathbf{v}$s, and $c$s we computed above:
$$\mathbf{x}_k = \begin{bmatrix} 0.375 \\ 0.625 \end{bmatrix} + 0.225\,(0.92)^k \begin{bmatrix} 1 \\ -1 \end{bmatrix}, \qquad k = 0, 1, 2, \dots$$
This explicit formula for $\mathbf{x}_k$ gives the solution of the Markov Chain starting from the initial state $\mathbf{x}_0$.
In other words: we can jump directly to any future state, without stepping through the states in between.
As $k \to \infty$, $(0.92)^k \to 0$. Thus the Markov chain converges to the steady state
$$\mathbf{x}_k \to \mathbf{v}_1 = \begin{bmatrix} 0.375 \\ 0.625 \end{bmatrix}.$$
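As a sanity check, we can compare the explicit formula with straightforward repeated multiplication by $A$ (this sketch uses $k = 10$):
#
A = np.array([[0.95, 0.03],
              [0.05, 0.97]])
x0 = np.array([0.6, 0.4])
k = 10
via_iteration = np.linalg.matrix_power(A, k) @ x0
via_formula = np.array([0.375, 0.625]) + 0.225 * (0.92 ** k) * np.array([1, -1])
print(via_iteration, via_formula)    # the two computations agree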
#
ax = ut.plotSetup(-0.1,1.7,-0.1,1.2)
ut.centerAxes(ax)
A = np.array([[0.95,0.03],[0.05,0.97]])
v1 = np.array([0.375,0.625])
v2 = np.array([0.225,-0.225])
x0 = v1 + v2
#
ax.plot([1,0],[0,1],'b--')
ax.text(v1[0]+0.02,v1[1]+0.02,r'${\bf v_1}$',size=16)
ax.plot(x0[0],x0[1],'bo')
v = np.zeros((40,2))
for i in range(40):
v[i] = v1+(0.92**i)*v2
ax.plot(v[i,0],v[i,1],'o')
ax.text(v[4][0]+0.02,v[4][1]+0.02,r'${\bf x_4}$',size=12)
ax.text(v[10][0]+0.02,v[10][1]+0.02,r'${\bf x_{10}}$',size=12)
ax.text(x0[0]+0.02,x0[1]+0.02,r'${\bf x_0}$',size=16)
ax.plot(v1[0],v1[1],'ro')
#ax.text(A.dot(x0)[0]+0.2,A.dot(x0)[1]+0.2,r'$A{\bf x_0}$',size=16)
# ax.plot([-10,10],[5*10/6.0,-5*10/6.0],'b-')
#
ax.annotate('Steady State', xy=(v1[0], v1[1]), xycoords='data',
xytext=(0.1, 0.2), textcoords='data',
size=15,
#bbox=dict(boxstyle="round", fc="0.8"),
arrowprops={'arrowstyle': 'simple',
'fc': '0.5',
'ec': 'none',
'connectionstyle' : 'arc3,rad=-0.3'},
)
ax.annotate('Initial State', xy=(v[0,0], v[0,1]), xycoords='data',
xytext=(0.4, 0.8), textcoords='data',
size=15,
#bbox=dict(boxstyle="round", fc="0.8"),
arrowprops={'arrowstyle': 'simple',
'fc': '0.5',
'ec': 'none',
'connectionstyle' : 'arc3,rad=-0.3'},
)
print('')
Markov Chains have been deployed in many different applications. The earliest domain in which they proved useful was the analysis of written language.
We'll walk through some of the early experiments in this domain.
Our goal: Make our computer write us a sentence.
Generate text by randomly selecting letters of the alphabet. Letters are equiprobable.
alphabet = list(string.ascii_lowercase + " ")
sentence0 = ''
for _ in range(100):
sentence0 += random.choice(alphabet)
print(sentence0)
ael piqomlhtlkyashc sgbahadtremjvahuxuqilvtudxgglotfoyqekpepktmaknvyowdjhdoavwmlpllzlfxeqvkgqxmrgdz
This looks more like a password generator than a text generator.
Fortunately, we know some things about English! First, not every letter occurs with the same frequency -- 's' and 'e' will be more common than 'q' and 'z'.
# From https://en.wikipedia.org/wiki/Letter_frequency
eng_freq = {'e': 0.09992, 't': 0.07424, 'a': 0.06432, 'o': 0.06112, 'i': 0.06056, 'n': 0.05784, 's': 0.05208, 'r': 0.05024, 'h': 0.04040, 'l': 0.03256, 'd': 0.03056, 'c': 0.02672, 'u': 0.02184, 'm': 0.02008, 'f': 0.0192, 'p': 0.01712, 'g': 0.01496, 'w': 0.01344, 'y': 0.01328, 'b': 0.01184, 'v': 0.0084, 'k': 0.00432, 'x': 0.00184, 'j': 0.00128, 'q': 0.00096, 'z': 0.00072, ' ': 0.2}
fig = plt.figure()
ax = fig.add_subplot(111)
i = np.arange(27)
ax.bar(i, [eng_freq[l] * 100 for l in sorted(eng_freq.keys())], width=0.8,
color='b', alpha=0.5, align='center')
ax.set_xticks(range(len(alphabet)))
ax.set_xticklabels(sorted(alphabet))
ax.set_xlabel("Letters")
ax.set_ylabel("Frequency (%)")
plt.show()
Generate text that reflects the relative frequencies of each letter in English.
sentence = random.choices(population=list(eng_freq.keys()), weights=eng_freq.values(), k=100)
print(''.join(sentence))
eloetmn enutle onsrl ulnbh nc h stnree roanevaiiee ansuamleand afio uatrsitr ynlkhfde oienmaadodt
This is slightly more comprehensible. But we can do better.
Letter frequency analysis is what Andrey Markov conducted in his initial paper on Markov Chains. He took Alexander Pushkin's Eugene Onegin and sampled, by hand, the first 20,000 letters of the novel.
He counted the number of vowels, the number of consonants, and the number of vowel-vowel, consonant-consonant, and vowel-consonant pairs.
He found that the first 20,000 letters were 8,638 (43%) vowels and 11,362 (57%) consonants.
If each letter were independent, we would expect the chance of two sequential vowels to be $0.43 \times 0.43 \approx 0.18$. This equates to 3,698 vowel-vowel pairs in 20,000 characters.
But Markov observed 1,104 -- nearly 1/3 the expected number!
Thus, we can conclude what we know intuitively -- that there is a dependence relationship between letters.
Thirty years later, another mathematician began investigating this dependence relationship between letters.
Claude Shannon was a mathematician and early computer scientist. He is considered by many to be the father of information theory, and had a significant impact on the direction of modern computing (including that we store information in bits!). During WWII, Shannon worked as a codebreaker, which led to his later work on English text analysis, information entropy, and redundancy.
Shannon observed that "anyone speaking a language possesses, implicitly, an enormous knowledge of the statistics of the language." He devised multiple experiments to demonstrate this.
For example, imagine the following game: take a sentence of English text, hide it from a friend, and reveal it one letter at a time, asking the friend to guess each next letter before it is revealed. People guess correctly far more often than the 1-in-27 rate that random chance would give.
One observation from this: even beyond the likelihood of moving between consonants and vowels, there are many common combinations of letters. And the preceding letters tell us a lot about which letter is most likely to come next!
Let's analyze some real text and find out which combinations of letters are most likely. In other words, let's estimate the probability of each letter transitioning to another.
# Check out Project Gutenberg for texts in the public domain
# https://www.gutenberg.org/
with open('data/moby.txt', 'r') as file:
text1 = file.read().rstrip()
print(f'Moby Dick has {len(text1)} characters')
with open('data/wonderland.txt', 'r') as file:
text2 = file.read().rstrip()
print(f'Alice in Wonderland has {len(text2)} characters')
with open('data/pride.txt', 'r') as file:
text3 = file.read().rstrip()
print(f'Pride & Prejudice has {len(text3)} characters')
text = text1 + text2 + text3
counts = Counter(text) # Counter({'l': 2, 'H': 1, 'e': 1, 'o': 1})
#print(counts)
letter_prob = {}
for i in counts.most_common():
letter_prob[i[0]] = round(i[1]/len(text),4)
print(i)
Moby Dick has 1147741 characters
Alice in Wonderland has 134814 characters
Pride & Prejudice has 664175 characters
(' ', 362026) ('e', 198344) ('t', 144438) ('a', 127336) ('o', 116660) ('i', 110303) ('n', 109561) ('h', 103390) ('s', 102978) ('r', 89242) ('l', 68835) ('d', 64969) ('u', 44900) ('m', 39815) ('c', 38116) ('w', 36802) ('f', 34533) ('g', 33177) ('y', 31703) ('b', 27233) ('p', 26799) ('v', 15056) ('k', 12315) ('q', 2385) ('j', 2079) ('x', 2076) ('z', 1659)
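Before building the full transition matrix, we can repeat Markov's vowel-counting experiment on our own corpus. This is a small sketch using the text and counts computed above; the exact numbers will depend on the texts loaded.
#
vowels = set('aeiou')
p_vowel = sum(counts[c] for c in vowels) / len(text)
vv_pairs = sum(1 for a, b in zip(text, text[1:]) if a in vowels and b in vowels)
print(f'P(vowel) = {p_vowel:.2f}')
print(f'observed vowel-vowel pairs: {vv_pairs}')
print(f'expected if letters were independent: {p_vowel**2 * (len(text) - 1):.0f}')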
stochastic_matrix = np.zeros(shape=(27,27))
# gram maps each character to the list of characters that immediately follow it in the text
gram = {}
for i in range(len(text)-1):
    seq = text[i:i+1]
    if seq not in gram:
        gram[seq] = []
    gram[seq].append(text[i+1])
# stochastic_matrix[i][j] = fraction of the time that alphabet[j] follows alphabet[i]
# (built this way, each row of the matrix sums to 1)
for i in range(27):
    for j in range(27):
        if alphabet[j] in gram[alphabet[i]]:
            letter_count = Counter(gram[alphabet[i]])
            stochastic_matrix[i][j] = letter_count[alphabet[j]]/sum(letter_count.values())
print(np.round(stochastic_matrix[0:10,0:10],2))
[[0.   0.03 0.03 0.05 0.   0.01 0.02 0.01 0.04 0.  ]
 [0.05 0.02 0.   0.   0.32 0.   0.   0.   0.04 0.01]
 [0.13 0.   0.02 0.   0.17 0.   0.   0.17 0.04 0.  ]
 [0.03 0.   0.   0.01 0.12 0.   0.   0.   0.07 0.  ]
 [0.05 0.   0.02 0.08 0.04 0.01 0.01 0.   0.01 0.  ]
 [0.06 0.   0.   0.   0.08 0.04 0.   0.   0.08 0.  ]
 [0.06 0.   0.   0.   0.12 0.   0.01 0.14 0.04 0.  ]
 [0.18 0.   0.   0.   0.43 0.   0.   0.   0.14 0.  ]
 [0.02 0.01 0.04 0.04 0.03 0.02 0.03 0.   0.   0.  ]
 [0.25 0.   0.   0.   0.24 0.   0.   0.   0.01 0.  ]]
Let's write a function that takes our training text, the number of letters to "look back," and the number of characters in our new sentence.
def ngram(text, n, k):
    # build a dictionary mapping each n-character sequence to the list of characters that follow it
    gram = {}
    for i in range(len(text)-n):
        seq = text[i:i+n]
        if seq not in gram:
            gram[seq] = []
        gram[seq].append(text[i+n])
    # start from a random n-gram, then repeatedly sample a next character given the last n characters
    sentence = random.choice(list(gram.keys()))
    for i in range(k):
        sentence += random.choice(gram[sentence[i:i+n]])
    return sentence
# One preceding letter (e.g. 't' --> 'e').
# This is a first-order Markov Chain
print(ngram(text,1,100))
veyonit e s ly whend mof uotaturof thif way ari liveyindeer os s f ats hokecheand ncar otaraithinggan
# Two preceding letters (e.g. 'th' --> 'e')
# This is a second-order Markov Chain
print(ngram(text,2,100))
eing but the he boady mr alat peribly re the of itteeptand trantty fough wougholl ritin boust aft land
# Three preceding letters (e.g. 'the' --> 'r')
# This is a third-order Markov Chain
print(ngram(text,3,100))
kness darchings prove body chapper reforgot to whilation or stubb wantages de boats sting alica feedily
# and so on...
print(ngram(text,4,1000))
uccordaged share of it know do compactolus fancy i supportablissed whatever have as into tell be thats supperson his shoe captain you seems may boy visit in his cetold on bonely could varies art then say but is comrades to the try of sing i see now him so drink when witness so should and white angely sat downwards impetuously replied overtebr and while and and the to became transportion aged thin cupied inted to though forms one and arms an endlingley were sailors and dear one of the cast but ever forming how home to that he reless of radney and go found and she air father whale wrong thing or with the personable and elizarrol gouge gryphony by the turtle lowly because in have no broken into ther engration a sorts starbuckling out that in queequences we fly in tell your owned out one or throughout one degreenlandles some reached with that man floor lover manufactual the cannot a place and whale suppressioner and be remained and more from mortance her father complimentation me is took that
At this point, as we are essentially memorizing whole words, it might be more expedient to switch to a word-based model.
ttext = text.split()
wcounts = Counter(ttext) # Counter({'l': 2, 'H': 1, 'e': 1, 'o': 1})
print(wcounts.most_common(5))
word_prob = {}
for i in wcounts.most_common():
word_prob[i[0]] = round(i[1]/len(ttext),4)
[('the', 20182), ('and', 10790), ('of', 10614), ('to', 9449), ('a', 7222)]
Randomly select words from our corpus.
sentence = ''
for _ in range(100):
sentence += random.choice(ttext) + ' '
print(sentence)
minded the her i the but lydia life from folded we a failed triangles by when so of him having boys by whalemen suggesting did aye opening elizabeth looks at weeks at morcar with favour apprehension my a first this my she the nothing board been and might suspected you breakfast ordered eye seaport the could i embark proved by my the were deck same ball talk how of and are atmosphere end wholly while a in keeps the glimpse out should that a previous nearer he on that my go with times is which meant you to him his
Choose the next word based on the preceding word.
def ngram_words(text, n, k):
    ttext = text.split()
    # build a dictionary mapping each n-word sequence to the list of words that follow it
    gram = {}
    for i in range(len(ttext)-n):
        seq = ' '.join(ttext[i:i+n])
        if seq not in gram:
            gram[seq] = []
        gram[seq].append(ttext[i+n])
    # start from a random n-word sequence, then repeatedly sample a next word given the last n words
    sentence = list(random.choice(list(gram.keys())).split())
    for i in range(k):
        sentence.append(random.choice(gram[' '.join(sentence[i:i+n])]))
    return ' '.join(sentence)
print(ngram_words(text,1,100))
fangs of kings in talking over it was palpable wonders and thin parallel and unentered forests whose eagerness for the wharf towards the clear of which deferred miss bingleys continued in such is all the kelson so honestly blind wall leaving queequeg salamed before i am not even tide of the world of himself always good bye dont fancy long to the like yonder roof bear a laughs the ship is in overcoming it is nothing seemed to produce one all out to a settled they mostly the table but a mildly employed in this motion and wickham and an activity
We're getting close to readable text, but you can imagine there is still a long way to go before we reach grammatically correct or even sensible text.
Modern text generation has moved to more complex models, but still operates on the same intuition -- based on the surrounding text, make a good prediction for what letter or word should come next.
(If you want to try some of this yourself, and see examples of Markov Chains deployed for amusement in the wild, check out https://github.com/jsvine/markovify).
Markov chains and their variants are still used in many real applications today.