In [2]:
# This is a sketch of the SVD material presented in class on Sept 18, translated into python.
# This sketch is limited to the use of the SVD in dimensionality reduction.
# We start with some toy data for 4 real persons using 3 attributes per person.
# We use the SVD to discover that each real person can be represented as a linear combination of 2 "canonical persons".
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd # pandas is used just to make the matrix output look nice.
from pdb import set_trace,pm
import scipy as sc
import scipy.linalg as lg
import scipy.sparse as sp
In [18]:
A = np.array([[1,2,3],[4,2,6],[7,8,9],[10,8,12]]) # The Data
cols=['age','wgt','hgt'] # The attributes
rows=['tom','dick','harry','janet'] # The Real Persons
def LLL(A,rows=rows,cols=cols): return pd.DataFrame(A,index=rows,columns=cols) # Used just to annotate the printouts
print(LLL(A))
       age  wgt  hgt
tom      1    2    3
dick     4    2    6
harry    7    8    9
janet   10    8   12
In [21]:
# center data by subtracting the means
print('means =',np.mean(A,axis=0))
print('outer product =\n',LLL(np.outer([1,1,1,1],np.mean(A,axis=0))))
AA=A-np.outer([1,1,1,1],np.mean(A,axis=0))
print('centered data =\n',LLL(AA))
means = [5.5 5.  7.5]
outer product =
        age  wgt  hgt
tom     5.5  5.0  7.5
dick    5.5  5.0  7.5
harry   5.5  5.0  7.5
janet   5.5  5.0  7.5
centered data =
        age  wgt  hgt
tom    -4.5 -3.0 -4.5
dick   -1.5 -3.0 -1.5
harry   1.5  3.0  1.5
janet   4.5  3.0  4.5
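In [ ]:
# Quick sanity check (a small addition to the class sketch): after centering, every attribute
# column of AA should average to zero (up to floating-point roundoff).
print('column means of centered data =', np.mean(AA, axis=0))
assert np.allclose(np.mean(AA, axis=0), 0.0)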
In [23]:
ScatterMatrix = AA.T @ AA # '@' denotes matrix-matrix product. ".T" denotes transpose.
print('Scatter Matrix = \n',LLL(ScatterMatrix,rows=cols))
CovarianceMatrix = ScatterMatrix/(len(AA)-1) # divide the scatter matrix by n-1 samples (here 4-1 = 3)
print('Covariance Matrix = \n',LLL(CovarianceMatrix,rows=cols))
Scatter Matrix = 
       age   wgt   hgt
age   45.0  36.0  45.0
wgt   36.0  36.0  36.0
hgt   45.0  36.0  45.0
Covariance Matrix = 
       age   wgt   hgt
age   15.0  12.0  15.0
wgt   12.0  12.0  12.0
hgt   15.0  12.0  15.0
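In [ ]:
# Cross-check (an added sketch, not from the original notes): numpy's np.cov with rowvar=False
# treats each row of A as one observation, centers internally, and divides by n-1, so it should
# reproduce the covariance matrix computed above.
print(LLL(np.cov(A, rowvar=False), rows=cols))
assert np.allclose(np.cov(A, rowvar=False), CovarianceMatrix)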
In [33]:
# Singular Value Decomposition of the centered data yields a representation of each person as a linear combination of
# 2 canonical persons (usually called principal components):
U,sigma,VT = lg.svd(AA)
print(' U sigma V.T =')
print(U)
print('sigma = ',sigma)
print(LLL(VT,rows=['PC1','PC2','PC3']))
 U sigma V.T =
[[-0.63731763  0.30631069  0.59313071  0.38496229]
 [-0.30631069 -0.63731763 -0.38496229  0.59313071]
 [ 0.30631069  0.63731763 -0.38496229  0.59313071]
 [ 0.63731763 -0.30631069  0.59313071  0.38496229]]
sigma =  [1.09830833e+01 2.31773205e+00 4.72717559e-17]
          age           wgt       hgt
PC1  0.605913  5.154991e-01  0.605913
PC2 -0.364513  8.568901e-01 -0.364513
PC3 -0.707107  1.000653e-16  0.707107
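In [ ]:
# Check (an added sketch): the full SVD should reconstruct the centered data exactly.
# lg.svd returns U as 4x4 here, so only its first 3 columns pair with the 3 singular values.
Sigma = np.diag(sigma)                      # 3x3 diagonal matrix of the singular values
print(LLL(U[:, :3] @ Sigma @ VT))           # should reproduce the centered data AA
assert np.allclose(U[:, :3] @ Sigma @ VT, AA)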
In [38]:
# Because there are only two nonzero sigma entries, we need only the first 2 cols of U & the first 2 rows of VT:
U1=U[:,:2]
sigma1=sigma[:2]
VT1=VT[:2,:]
PCS=VT1
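In [ ]:
# Check (an added sketch): since the third singular value is numerically zero, the rank-2 pieces
# U1, sigma1, VT1 already reconstruct the centered data to machine precision.
print(LLL(U1 @ np.diag(sigma1) @ VT1))      # should match the centered data AA up to roundoff
assert np.allclose(U1 @ np.diag(sigma1) @ VT1, AA)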
In [39]:
# Each person can be represented by two coefficients in terms of the canonical persons:
SCORES = AA @ VT1.T
print('SCORES = ')
print(LLL(SCORES,cols=['PC1','PC2']))
print('PCS = \n',LLL(PCS,rows=['PC1','PC2']))
SCORES = 
            PC1       PC2
tom   -6.999713  0.709946
dick  -3.364236 -1.477131
harry  3.364236  1.477131
janet  6.999713 -0.709946
PCS = 
          age       wgt       hgt
PC1  0.605913  0.515499  0.605913
PC2 -0.364513  0.856890 -0.364513
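In [ ]:
# Check (an added sketch): projecting the centered data onto the principal components is the same
# as scaling the columns of U1 by the singular values, i.e. SCORES = U1 @ diag(sigma1).
print(LLL(U1 @ np.diag(sigma1), cols=['PC1','PC2']))
assert np.allclose(SCORES, U1 @ np.diag(sigma1))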
In [40]:
# The rows of PCS are the Principal Components (canonical persons).
# The rows of SCORES are the amounts of each canonical person present in the corresponding real person.
# To verify that the real persons can be completely characterized by the canonical persons (PCs), we reconstruct the centered data.
print('reconstructed centered data =')
print(LLL(SCORES @ PCS))
reconstructed centered data =
        age  wgt  hgt
tom    -4.5 -3.0 -4.5
dick   -1.5 -3.0 -1.5
harry   1.5  3.0  1.5
janet   4.5  3.0  4.5
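In [ ]:
# Final step (an added sketch): adding the column means back to the reconstruction recovers the
# original, uncentered data A.
means = np.mean(A, axis=0)
print(LLL(SCORES @ PCS + np.outer(np.ones(len(A)), means)))
assert np.allclose(SCORES @ PCS + np.outer(np.ones(len(A)), means), A)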
In [56]:
# The matrix VT can be obtained directly from the covariance matrix (at the cost of some accuracy):
(lamda,V) = lg.eigh(-CovarianceMatrix) # eigh returns eigenvalues in ascending order, so negate the matrix ...
lamda = -lamda                         # ... and then negate the eigenvalues to get them in descending order
print('eigenvalues ',lamda,'\neigenvectors =\n',V)
print('Compare eigenvectors to transpose of VT = Principal Components: \n',VT.T)
eigenvalues  [4.02093727e+01 1.79062729e+00 2.92448971e-16] 
eigenvectors =
 [[ 6.05912800e-01  3.64512933e-01  7.07106781e-01]
  [ 5.15499134e-01 -8.56890100e-01  1.11022302e-16]
  [ 6.05912800e-01  3.64512933e-01 -7.07106781e-01]]
Compare eigenvectors to transpose of VT = Principal Components: 
 [[ 6.05912800e-01 -3.64512933e-01 -7.07106781e-01]
  [ 5.15499134e-01  8.56890100e-01  1.00065300e-16]
  [ 6.05912800e-01 -3.64512933e-01  7.07106781e-01]]
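In [ ]:
# Check (an added sketch): the eigenvalues of the covariance matrix should equal the squared
# singular values of the centered data divided by n-1 (here n = 4 persons).
print('sigma**2/(n-1) =', sigma**2/(len(AA)-1))   # compare with the eigenvalues printed above
assert np.allclose(sigma**2/(len(AA)-1), lamda)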
In [ ]: