In [1]:
#
import numpy as np
import scipy as sp
import pandas as pd
import matplotlib as mp
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
import laUtilities as ut
import slideUtilities as sl
import demoUtilities as dm
from matplotlib import animation
from importlib import reload
from datetime import datetime
from IPython.display import Image, display_html, display, Math, HTML;
qr_setting = None

mp.rcParams['animation.html'] = 'jshtml';

Announcements¶

  • In-class test will be on Friday, March 31
    • No calculators or reference material
  • A sample test has been posted online in the Piazza resources tab
  • Upcoming office hours:
    • Today: Prof McDonald from 4:30-6pm in CCDS 1341
    • Tomorrow: Peer tutor Rohan Anand from 1:30-3pm in CCDS 16th floor
    • Tomorrow: Peer tutor Daniel Cho from 2-4pm in CCDS

Recap from last lecture¶

We introduced regression, which is the problem of constructing an equation that describes the relationship among variables.

For example, we may look at these points and observe that they approximately lie on a line.

So we could decide to model this data using a line.

In [6]:
#
ax = ut.plotSetup(-10, 10, -10, 10, size = (7, 7))
ut.centerAxes(ax)
line = np.array([1, 0.5])
xlin = -10.0 + 20.0 * np.random.random(100)
ylin = line[0] + (line[1] * xlin) + np.random.normal(scale = 1.5, size = 100)
ax.plot(xlin, ylin, 'ro', markersize=6);

We may look at these points and decide to model them using a quadratic function.

In [7]:
#
ax = ut.plotSetup(-10, 10, -10, 20, size = (7, 7))
ut.centerAxes(ax)
quad = np.array([1, 3, 0.5])
xquad = -10.0 + 20.0 * np.random.random(100)
yquad = quad[0] + (quad[1] * xquad) + (quad[2] * xquad * xquad) + np.random.normal(scale = 1.5, size = 100)
ax.plot(xquad, yquad, 'ro', markersize=6); 

And we may look at these points and decide to model them using a logarithmic function.

In [8]:
#
ax = ut.plotSetup(-10, 10, -10, 15, size = (7, 7))
ut.centerAxes(ax)
log = np.array([1, 4])
xlog = 10.0 * np.random.random(100)
ylog = log[0] + log[1] * np.log(xlog) + np.random.normal(scale = 1.5, size = 100)
ax.plot(xlog, ylog, 'ro', markersize=6);

The Framework of Linear Models¶

The regression problem has been studied extensively in the field of statistics and machine learning. We are working with:

  • Values that are referred to as "independent," and
  • Values that are referred to as "dependent."

The basic regression task is:

  • given a set of independent variables
  • and the associated dependent variables,
  • estimate the parameters of a model (such as a line, parabola, etc) that describes how the dependent variables are related to the independent variables.

The independent variables are collected into a matrix $X$, which is called the design matrix.

The dependent variables are collected into an observation vector $y$.

The parameters of the model (for any kind of model) are collected into a parameter vector $\beta$.

In [9]:
#
ax = ut.plotSetup(-10, 10, -10, 10, size = (7, 7))
ut.centerAxes(ax)
line = np.array([1, 0.5])
xlin = -10.0 + 20.0 * np.random.random(100)
ylin = line[0] + (line[1] * xlin) + np.random.normal(scale = 1.5, size = 100)
ax.plot(xlin, ylin, 'ro', markersize = 6)
ax.plot(xlin, line[0] + line[1] *xlin, 'b-')
plt.text(-9, 3, r'$y = \beta_0 + \beta_1x$', size=20); 

We call $y_j$ the observed value of $y$

and we call $\beta_0 + \beta_1 x_j$ the predicted $y$-value.

The difference between an observed $y$-value and a predicted $y$-value is called a residual.

In [10]:
# image credit: Lay, LAA, 4th edition
display(Image("images/Lay-fig-6-6-1.jpg", width=550))

Regression is a Least-Squares Problem¶

Let's imagine for a moment that the data fit a line perfectly.

Then, if each of the data points happened to fall exactly on the line, the parameters $\beta_0$ and $\beta_1$ would satisfy the equations

$$
\begin{aligned}
\beta_0 + \beta_1 x_1 &= y_1 \\
\beta_0 + \beta_1 x_2 &= y_2 \\
\beta_0 + \beta_1 x_3 &= y_3 \\
&\;\;\vdots \\
\beta_0 + \beta_1 x_n &= y_n
\end{aligned}
$$

We can write this system as

$$X\beta = y$$

where

$$
X = \begin{bmatrix} 1 & x_1 \\ 1 & x_2 \\ \vdots & \vdots \\ 1 & x_n \end{bmatrix}, \qquad
\beta = \begin{bmatrix} \beta_0 \\ \beta_1 \end{bmatrix}, \qquad
y = \begin{bmatrix} y_1 \\ y_2 \\ \vdots \\ y_n \end{bmatrix}
$$

If the data points don't actually lie exactly on a line,

... then there are no parameters $\beta_0, \beta_1$ for which the predicted $y$-values in $X\beta$ equal the observed $y$-values in $y$,

... and $X\beta = y$ has no solution.

Now, to obtain the least-squares line, find the least-squares solution to $X\beta = y$.

We do this by solving the normal equation that we learned last lecture (just with new notation):

$$X^TX\beta = X^Ty$$
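
To make this concrete, here is a short sketch (not part of the original lecture code) that fits the least-squares line to the simulated (xlin, ylin) data plotted above by solving the normal equations directly; it assumes those arrays are still in scope.

# sketch: fit the least-squares line to the (xlin, ylin) data generated above
X_line = np.column_stack([np.ones(len(xlin)), xlin])   # design matrix: columns 1 and x
beta_line = np.linalg.solve(X_line.T @ X_line, X_line.T @ ylin)   # solve X^T X beta = X^T y
print(beta_line)                        # should be close to the true parameters [1, 0.5]
residuals = ylin - X_line @ beta_line   # observed minus predicted y-values
print(np.linalg.norm(residuals))        # the quantity that least squares minimizes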

Lecture 26: Multiple Regression¶

26.1 The General Linear Model¶

Another way that the inconsistent linear system is often written is to collect all the residuals into a residual vector $\epsilon$.

Then an exact equation is

$$y = X\beta + \epsilon$$

Any equation of this form is referred to as a linear model.

In this formulation, the goal is to minimize the length of $\epsilon$, i.e., $\|\epsilon\|$.

In some cases, one would like to fit data points with something other than a straight line.

For example, think of Gauss trying to find the equation for the orbit of Ceres.

In cases like this, the matrix equation is still $X\beta = y$, but the specific form of $X$ changes from one problem to the next.

The least-squares solution $\hat{\beta}$ is a solution of the normal equations

$$X^TX\beta = X^Ty.$$

Least-Squares Fitting of Other Models¶

Most models have parameters, and the object of model fitting is to determine those parameters.

In model fitting, the parameters are the unknowns. A central question for us is whether the model is linear in its parameters.

For example, the model $y = \beta_0 \cdot e^{-\beta_1 x}$ is not linear in its parameters, since $y$ does not depend linearly on $\beta_1$.

The model $y = \beta_0 \cdot e^{-2x}$ is linear in its parameters.

For a model that is linear in its parameters, an observation is a linear combination of (arbitrary) known functions.

In other words, a model that is linear in its parameters is

$$y = \beta_0 f_0(x) + \beta_1 f_1(x) + \dots + \beta_n f_n(x)$$

where $f_0, \dots, f_n$ are known functions and $\beta_0, \dots, \beta_n$ are parameters.
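
As a sketch of how such a model looks in code (the design_matrix helper here is hypothetical, not part of the lecture utilities): each column of the design matrix is one known function evaluated at every data point.

# hypothetical helper: column j of the result is f_j applied elementwise to x
def design_matrix(x, basis_funcs):
    return np.column_stack([f(x) for f in basis_funcs])

# e.g., for the model y = beta_0 + beta_1*x + beta_2*x^2 the known functions are 1, x, x^2
X_example = design_matrix(np.linspace(-1, 1, 20), [np.ones_like, lambda t: t, lambda t: t**2])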

Example. Suppose data points $(x_1, y_1), \dots, (x_n, y_n)$ appear to lie along some sort of parabola instead of a straight line. Suppose we wish to approximate the data by an equation of the form

$$y = \beta_0 + \beta_1 x + \beta_2 x^2.$$

Describe the linear model that produces a "least squares fit" of the data by the equation.

Solution. The ideal relationship is $y = \beta_0 + \beta_1 x + \beta_2 x^2$.

Suppose the actual values of the parameters are $\beta_0, \beta_1, \beta_2$. Then the coordinates of the first data point satisfy the equation

$$y_1 = \beta_0 + \beta_1 x_1 + \beta_2 x_1^2 + \epsilon_1$$

where $\epsilon_1$ is the residual error between the observed value $y_1$ and the predicted $y$-value.

Each data point determines a similar equation:

$$
\begin{aligned}
y_1 &= \beta_0 + \beta_1 x_1 + \beta_2 x_1^2 + \epsilon_1 \\
y_2 &= \beta_0 + \beta_1 x_2 + \beta_2 x_2^2 + \epsilon_2 \\
&\;\;\vdots \\
y_n &= \beta_0 + \beta_1 x_n + \beta_2 x_n^2 + \epsilon_n
\end{aligned}
$$

Clearly, this system can be written as $y = X\beta + \epsilon$:

$$
\begin{bmatrix} y_1 \\ y_2 \\ \vdots \\ y_n \end{bmatrix}
=
\begin{bmatrix} 1 & x_1 & x_1^2 \\ 1 & x_2 & x_2^2 \\ \vdots & \vdots & \vdots \\ 1 & x_n & x_n^2 \end{bmatrix}
\begin{bmatrix} \beta_0 \\ \beta_1 \\ \beta_2 \end{bmatrix}
+
\begin{bmatrix} \epsilon_1 \\ \epsilon_2 \\ \vdots \\ \epsilon_n \end{bmatrix}
$$
In [14]:
#
# build the design matrix with columns 1, x, x^2
m = np.shape(xquad)[0]
X = np.array([np.ones(m), xquad, xquad**2]).T
# solve the normal equations for the least-squares parameters
beta = np.linalg.inv(X.T @ X) @ X.T @ yquad
#
ax = ut.plotSetup(-10, 10, -10, 20, size = (7, 7))
ut.centerAxes(ax)
xplot = np.linspace(-10, 10, 50)
yestplot = beta[0] + beta[1]  * xplot + beta[2] * xplot**2
ax.plot(xplot, yestplot, 'b-', lw=2)
ax.plot(xquad, yquad, 'ro', markersize = 8);
In [15]:
#
# build the design matrix with columns 1, log(x)
m = np.shape(xlog)[0]
X = np.array([np.ones(m), np.log(xlog)]).T
# least-squares parameters via the normal equations
beta = np.linalg.inv(X.T @ X) @ X.T @ ylog
# 
ax = ut.plotSetup(-10, 10, -10, 15, size = (7, 7))
ut.centerAxes(ax)
xplot = np.linspace(0.1, 10, 50)
yestplot = beta[0] + beta[1] * np.log(xplot)
ax.plot(xplot, yestplot, 'b-', lw=2)
ax.plot(xlog, ylog, 'ro', markersize = 8);

26.2 Multiple Regression¶

Suppose an experiment involves two independent variables -- say, $u$ and $v$ -- and one dependent variable, $y$.

A linear equation for predicting $y$ from $u$ and $v$ has the form

$$y = \beta_0 + \beta_1 u + \beta_2 v$$

Since there is more than one independent variable, this is called multiple regression.

A more general prediction equation might have the form

$$y = \beta_0 + \beta_1 u + \beta_2 v + \beta_3 u^2 + \beta_4 uv + \beta_5 v^2$$

A least squares fit to equations like this is called a trend surface.

In general, a linear model will arise whenever $y$ is to be predicted by an equation of the form

$$y = \beta_0 f_0(u,v) + \beta_1 f_1(u,v) + \dots + \beta_k f_k(u,v)$$

with $f_0, \dots, f_k$ any sort of known functions and $\beta_0, \dots, \beta_k$ unknown weights.
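
As an illustration, here is a sketch of the design matrix for the six-term trend-surface model above; the u_obs, v_obs, y_obs arrays are made-up data for the sketch, not anything from the lecture.

# made-up observations of two independent variables and one dependent variable
rng = np.random.default_rng(0)
u_obs = rng.uniform(-5, 5, 50)
v_obs = rng.uniform(-5, 5, 50)
y_obs = 2 + 0.5 * u_obs - 0.3 * v_obs + 0.1 * u_obs * v_obs + rng.normal(scale=0.5, size=50)
# design matrix whose columns are the terms 1, u, v, u^2, uv, v^2
X_surface = np.column_stack([np.ones_like(u_obs), u_obs, v_obs,
                             u_obs**2, u_obs * v_obs, v_obs**2])
# least-squares coefficients for the trend surface
beta_surface = np.linalg.lstsq(X_surface, y_obs, rcond=None)[0]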

Example. In geography, local models of terrain are constructed from data $(u_1, v_1, y_1), \dots, (u_n, v_n, y_n)$, where $u_j$, $v_j$, and $y_j$ are latitude, longitude, and altitude, respectively.

Let's work through a specific example. Here is a set of points in $\mathbb{R}^3$:

In [26]:
#
fig = ut.three_d_figure((14, 1), 'multivariate regression example', 
                       -7, 7, -7, 7, -8, 8,
                        equalAxes = False, figsize = (7, 7), qr = qr_setting)
np.random.seed(6)
v = [4.0,  4.0, 2.0]
u = [-4.0, 3.0, 1.0]
npts = 70
# set locations of points that fall within x,y
xc = -7.0 + 14.0 * np.random.random(npts)
yc = -7.0 + 14.0 * np.random.random(npts)
A = np.array([u,v]).T
# project these points onto the plane
P = A @ np.linalg.inv(A.T @ A) @ A.T
coords = P @ np.array([xc,yc,np.zeros(npts)])
coords[2] += np.random.normal(scale = 1, size = npts)
for i in range(coords.shape[-1]):
    fig.plotPoint(coords[0,i],coords[1,i], coords[2,i], 'r')
fig.set_title('Terrain Data for Multiple Regression', 'Terrain Data for Multiple Regression', size = 20)
fig.ax.set_zlabel('y')
fig.ax.set_xlabel('u')
fig.ax.set_ylabel('v')
fig.desc['xlabel'] = 'u'
fig.desc['ylabel'] = 'v'
fig.desc['zlabel'] = 'y'
fig.ax.view_init(azim=30, elev = 15)
fig.save();

Let's describe the linear model that gives a least-squares fit to such data. The solution is called the least-squares plane.

Solution. We expect the data to satisfy these equations:

$$
\begin{aligned}
y_1 &= \beta_0 + \beta_1 u_1 + \beta_2 v_1 + \epsilon_1 \\
y_2 &= \beta_0 + \beta_1 u_2 + \beta_2 v_2 + \epsilon_2 \\
&\;\;\vdots \\
y_n &= \beta_0 + \beta_1 u_n + \beta_2 v_n + \epsilon_n
\end{aligned}
$$

This system has the matrix form $y = X\beta + \epsilon$, where

$$
y = \begin{bmatrix} y_1 \\ y_2 \\ \vdots \\ y_n \end{bmatrix}, \qquad
X = \begin{bmatrix} 1 & u_1 & v_1 \\ 1 & u_2 & v_2 \\ \vdots & \vdots & \vdots \\ 1 & u_n & v_n \end{bmatrix}, \qquad
\beta = \begin{bmatrix} \beta_0 \\ \beta_1 \\ \beta_2 \end{bmatrix}, \qquad
\epsilon = \begin{bmatrix} \epsilon_1 \\ \epsilon_2 \\ \vdots \\ \epsilon_n \end{bmatrix}
$$
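
As a sketch (this computation is not in the original notebook), we can solve for the fitted plane using the coords array generated in the plotting cell above, where coords[0], coords[1], and coords[2] hold the u, v, and y values respectively:

# sketch: uses the coords array from the cell above (rows 0, 1, 2 hold u, v, y)
u_data, v_data, y_data = coords[0], coords[1], coords[2]
# design matrix with columns 1, u, v
X_plane = np.column_stack([np.ones_like(u_data), u_data, v_data])
# least-squares plane: solve X^T X beta = X^T y for (beta_0, beta_1, beta_2)
beta_plane = np.linalg.solve(X_plane.T @ X_plane, X_plane.T @ y_data)
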
In [27]:
#
fig = ut.three_d_figure((14, 2), 'multivariate regression example with fitted plane', 
                       -7, 7, -7, 7, -8, 8,
                        equalAxes = False, figsize = (7, 7), qr = qr_setting)
np.random.seed(6)
v = [4.0,  4.0, 2.0]
u = [-4.0, 3.0, 1.0]
npts = 70
# set locations of points that fall within x,y
xc = -7.0 + 14.0 * np.random.random(npts)
yc = -7.0 + 14.0 * np.random.random(npts)
A = np.array([u,v]).T
# project these points onto the plane
P = A @ np.linalg.inv(A.T @ A) @ A.T
coords = P @ np.array([xc,yc,np.zeros(npts)])
coords[2] += np.random.normal(scale = 1, size = npts)
for i in range(coords.shape[-1]):
    fig.plotPoint(coords[0,i],coords[1,i], coords[2,i], 'r')
fig.set_title('Multiple Regression Fit to Data', 'Multiple Regression Fit to Data', size = 20)
fig.ax.set_zlabel('y')
fig.ax.set_xlabel('u')
fig.ax.set_ylabel('v')
fig.desc['xlabel'] = 'u'
fig.desc['ylabel'] = 'v'
fig.desc['zlabel'] = 'y'
fig.plotSpan(u, v, 'Green')
fig.ax.view_init(azim=30, elev = 30)
fig.save();

This example shows that the linear model for multiple regression has the same abstract form as the model for simple regression in the earlier examples.

We can see that the general principle is the same across all the different kinds of linear models.

Once $X$ is defined properly, the normal equations for $\beta$ have the same matrix form, no matter how many variables are involved.

Thus, for any linear model where $X^TX$ is invertible, the least-squares $\hat{\beta}$ is given by $(X^TX)^{-1}X^Ty$.

Theorem. (Reminder from last time) Let $X$ be an $m \times n$ matrix. The following statements are equivalent:

  1. The equation $X\beta = y$ has a unique least-squares solution for each $y$ in $\mathbb{R}^m$.
  2. The columns of $X$ are linearly independent.
  3. The matrix $X^TX$ is invertible.
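
In code, condition 2 (and hence the invertibility of $X^TX$) can be checked with a quick rank computation before solving; here is a sketch using the line data from earlier cells (the variable names are just for illustration):

# sketch: re-uses the (xlin, ylin) data from earlier cells
X_check = np.column_stack([np.ones(len(xlin)), xlin])
if np.linalg.matrix_rank(X_check) == X_check.shape[1]:
    # columns are linearly independent, so X^T X is invertible and the solution is unique
    beta_check = np.linalg.solve(X_check.T @ X_check, X_check.T @ ylin)
else:
    # columns are linearly dependent: fall back to a pseudoinverse-based least-squares solution
    beta_check = np.linalg.lstsq(X_check, ylin, rcond=None)[0]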

Multiple Regression in Practice¶

Let's see how powerful multiple regression can be on a real-world example.

A typical application of linear models is predicting house prices. Linear models have been used for this problem for decades, and when a municipality does a value assessment on your house, they typically use a linear model.

We can consider various measurable attributes of a house (its "features") as the independent variables, and the most recent sale price of the house as the dependent variable.

For our case study, we will use the features:

  • Lot Area (sq ft),
  • Gross Living Area (sq ft),
  • Number of Fireplaces,
  • Number of Full Baths,
  • Number of Half Baths,
  • Garage Area (sq ft),
  • Basement Area (sq ft)

So our design matrix will have 8 columns (including the constant for the intercept):

$$X\beta = y$$

and it will have one row for each house in the data set, with yy the sale price of the house.

We will use data from housing sales in Ames, Iowa from 2006 to 2009:

In [3]:
# read and parse the raw data
df = pd.read_csv('data/ames-housing-data/train.csv')

# split into the independent and dependent variables
X_no_intercept = df[['LotArea', 'GrLivArea', 'Fireplaces', 'FullBath', 'HalfBath', 'GarageArea', 'TotalBsmtSF']].values
y = df['SalePrice'].values
In [4]:
# run this line to visualize the data
df[['LotArea', 'GrLivArea', 'Fireplaces', 'FullBath', 'HalfBath', 'GarageArea', 'TotalBsmtSF', 'SalePrice']].head()
Out[4]:
LotArea GrLivArea Fireplaces FullBath HalfBath GarageArea TotalBsmtSF SalePrice
0 8450 1710 0 2 1 548 856 208500
1 9600 1262 1 2 0 460 1262 181500
2 11250 1786 1 2 1 608 920 223500
3 9550 1717 1 1 0 642 756 140000
4 14260 2198 1 2 1 836 1145 250000

Next we add a column of 1s to the design matrix, which adds a constant intercept to the model:

In [6]:
X = np.column_stack([np.ones(X_no_intercept.shape[0], dtype = 'int'), X_no_intercept])
In [7]:
# run this line to view the X matrix
X
Out[7]:
array([[    1,  8450,  1710, ...,     1,   548,   856],
       [    1,  9600,  1262, ...,     0,   460,  1262],
       [    1, 11250,  1786, ...,     1,   608,   920],
       ...,
       [    1,  9042,  2340, ...,     0,   252,  1152],
       [    1,  9717,  1078, ...,     0,   240,  1078],
       [    1,  9937,  1256, ...,     1,   276,  1256]])

Now let's perform the least-squares regression:

$$\hat{\beta} = (X^TX)^{-1}X^Ty$$
In [8]:
beta_hat = np.linalg.inv(X.T @ X) @ X.T @ y
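
(A side note, not from the original notebook: forming the explicit inverse works fine here, but an equivalent and generally more numerically stable way to compute the same $\hat{\beta}$ is to let a solver handle the normal equations or the least-squares problem directly.)

# sketch: equivalent alternatives that avoid forming (X^T X)^{-1} explicitly
beta_hat_solve = np.linalg.solve(X.T @ X, X.T @ y)
beta_hat_lstsq = np.linalg.lstsq(X, y, rcond=None)[0]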

What does our model tell us?

In [9]:
 # run this line to view the model parameters
beta_hat
Out[9]:
array([-2.92338280e+04,  1.87444579e-01,  3.94185205e+01,  1.45698657e+04,
        2.29695596e+04,  1.62834807e+04,  9.14770980e+01,  5.11282216e+01])

We see that we have:

  • $\beta_0$: Intercept of -$29,233
  • $\beta_1$: Marginal value of one square foot of Lot Area: about $0.19
  • $\beta_2$: Marginal value of one square foot of Gross Living Area: $39
  • $\beta_3$: Marginal value of one additional fireplace: $14,570
  • $\beta_4$: Marginal value of one additional full bath: $22,970
  • $\beta_5$: Marginal value of one additional half bath: $16,283
  • $\beta_6$: Marginal value of one square foot of Garage Area: $91
  • $\beta_7$: Marginal value of one square foot of Basement Area: $51

Is our model doing a good job?

There are many statistics for testing this question, but we'll just look at the predictions versus the ground truth.

For each house we compute its predicted sale value according to our model:

$$\hat{y} = X\hat{\beta}$$
In [21]:
y_hat = X @ beta_hat

And for each house, we'll plot its predicted versus actual sale value:

In [26]:
# plot of (X, Y) samples and the linear regression based on y_hat
plt.figure()
plt.plot(y, y_hat, '.')
plt.xlabel('Actual Sale Price')
plt.ylabel('Predicted Sale Price')
plt.title('Linear Model Predictions of House Prices')
plt.plot([0, 500000], [0, 500000], '-', label='yhat = y')
plt.legend(loc = 'best');

We see that the model does a reasonable job for house values less than about $250,000.
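
To attach a number to this impression, here is a small sketch (not in the original notebook) computing two common summaries: the coefficient of determination $R^2$ and the root-mean-square error of the predictions.

# sketch: uses the y and y_hat arrays computed above
# R^2: fraction of the variance in sale prices explained by the model
ss_res = np.sum((y - y_hat)**2)
ss_tot = np.sum((y - np.mean(y))**2)
r_squared = 1 - ss_res / ss_tot
# RMSE: typical size of the prediction error, in dollars
rmse = np.sqrt(np.mean((y - y_hat)**2))
print(r_squared, rmse)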

For a better model, we'd want to consider more features of each house, and perhaps some additional functions such as polynomials as components of our model.
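
For example, one could augment the design matrix with a quadratic term; here is a sketch (the choice of squaring GrLivArea is purely illustrative, not a claim about which feature helps most):

# sketch: add a squared living-area column to the existing design matrix
grlivarea = df['GrLivArea'].values
X_poly = np.column_stack([X, grlivarea**2])
beta_poly = np.linalg.lstsq(X_poly, y, rcond=None)[0]
y_hat_poly = X_poly @ beta_poly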
