# source:branches/sandbox/la1.0/NIST.tcl@1123

Last change on this file since 1123 was 931, checked in by toby, 12 years ago

include rest of files

File size: 8.0 KB
Line
1# A Principal Components Analysis example
2#
3# walk through the NIST/Sematech PCA example data section 6.5
4# http://www.itl.nist.gov/div898/handbook/
5#
6# This file is part of the Hume La Package
7# (C)Copyright 2001, Hume Integration Services
8#
9# These calculations use the Hume Linear Algebra package, La
10#
11package require La
12catch { namespace import La::*}
13
14
15# Look at the bottom of this file to see the console output of this procedure.
16
17proc nist_pca {} {
18    puts {Matrix of observation data X[10,3]}
19    set X {2 10 3 7 4 3 4 1 8 6 3 5 8 6 1 8 5 7 7 2 9 5 3 3 9 5 8 7 4 5 8 2 2}
20    puts [show \$X]
21    puts {Compute column norms}
22    mnorms_br X Xmeans Xsigmas
23    puts "Means=[show \$Xmeans]"
24    puts "Sigmas=[show \$Xsigmas %.6f]"
25    puts {Normalize observation data creating Z[10,3]}
26    puts [show [set Z [mnormalize \$X \$Xmeans \$Xsigmas]] %10.6f]\n
27    puts {Naive computation method that is sensitive to round-off error}
28    puts {ZZ is (Ztranspose)(Z) }
29    puts "ZZ=\n[show [set ZZ [mmult [transpose \$Z] \$Z]] %10.6f]"
30    puts {the correlation matrix, R = (Zt)(Z)/(n-1)}
31    puts "R=\n[show [set R [mscale \$ZZ [expr 1/9.0]]] %8.4f]"
32    puts {Continuing the naive example use R for the eigensolution}
33    set V \$R
34    mevsvd_br V evals
35    puts "eigenvalues=[show \$evals %8.4f]"
36    puts "eigenvectors:\n[show \$V %8.4f]\n"
37    puts "Good computation method- perform SVD of Z"
38    set U \$Z
39    msvd_br U S V
40    puts "Singular values, S=[show \$S]"
41    puts "square S and divide by (number of observations)-1 to get the eigenvalues"
42    set SS [mprod \$S \$S]
43    set evals [mscale \$SS [expr 1.0/9.0]]
44    puts "eigenvalues=[show \$evals %8.4f]"
45    puts "eigenvectors:\n[show \$V %8.4f]"
46    puts "(the 3rd vector/column has inverted signs, thats ok the math still works)"
47    puts "\nverify V x transpose = I"
48    puts "As computed\n[show [mmult \$V [transpose \$V]] %20g]\n"
49    puts "Rounded off\n[show [mround [mmult \$V [transpose \$V]]]]"
50    #puts "Diagonal matrix of eigenvalues, L"
51    #set L [mdiag \$evals]
52    #
53
54    puts "\nsquare root of eigenvalues as a diagonal matrix"
55    proc sqrt x { return [expr sqrt(\$x)] }
56    set evals_sr [mat_unary_op \$evals sqrt]
57    set Lhalf [mdiag \$evals_sr]
58    puts [show \$Lhalf %8.4f]
59
60    puts "Factor Structure Matrix S"
61    set S [mmult \$V \$Lhalf]
62    puts [show \$S %8.4f]\n
63    set S2 [mrange \$S 0 1]
64    puts "Use first two eigenvalues\n[show \$S2 %8.4f]"
65    # Nash discusses better ways to compute this than multiplying SSt
66    # the NIST method is sensitive to round-off error
67    set S2S2 [mmult \$S2 [transpose \$S2]]
68    puts "SSt for first two components, communality is the diagonal"
69    puts [show \$S2S2 %8.4f]\n
70
71
72    # define reciprocal square root function
73    proc recipsqrt x { return [expr 1.0/sqrt(\$x)] }
74    # use it for Lrhalf calculation
75    set Lrhalf [mdiag [mat_unary_op \$evals recipsqrt]]
76
77    set B [mmult \$V \$Lrhalf]
78    puts "Factor score coefficients B Matrix:\n[show \$B %12.4f]"
79
80    puts "NIST shows B with a mistake, -.18 for -1.2 (-1.18 with a typo)"
81
82    puts "\nWork with first normalized observation row"
83    set z1 {2 3 0 0.065621795889 0.316227766017 -0.7481995314}
84    puts [show \$z1 %.4f]
85
86    puts "compute the \"factor scores\" from multiplying by B"
87    set t [mmult [transpose \$z1] \$B]
88    puts [show \$t %.4f]
89    puts "NIST implies you might chart these"
90
91    #set T2 [dotprod \$t \$t]
92    #puts "Hotelling T2=[format %.4f \$T2]"
93
94    puts "Compute the scores using V, these are more familiar"
95    set t [mmult [transpose \$z1] \$V]
96    puts [show \$t %.4f]
97    puts "Calculate T2 from the scores, sum ti*ti/evi"
98
99    set T2 [msum [transpose [mdiv [mprod \$t \$t] [transpose \$evals]]]]
100    puts "Hotelling T2=[format %.4f \$T2]"
101
102    puts "Fit z1 using first two principal components"
103    set p [mrange \$V 0 1]
104    puts p=\n[show \$p %10.4f]
105    set t [mmult [transpose \$z1] \$p]
106    set zhat [mmult \$t [transpose \$p]]
107    puts "  z1=[show \$z1 %10.6f]"
108    puts "zhat=[show \$zhat %10.6f]"
109    set diffs [msub [transpose \$z1] \$zhat]
110    puts "diff=[show \$diffs %10.6f]"
111    puts "Q statistic - distance from the model measurement for the first observation"
112    set Q [dotprod \$diffs \$diffs]
113    puts Q=[format %.4f \$Q]
114    puts "Some experts would compute a corrected Q that is larger since the
115observation was used in building the model.  The Q calculation just used
116properly applies to the analysis of new observations."
117    set corr [expr 10.0/(10.0 - 2 -1)]  ;# N/(N - Ncomp - 1)
118    puts "Corrected Q=[format %.4f [expr \$Q * \$corr]] (factor=[format %.4f \$corr])"
119    }
120
121return
122##########################################################################
123Console Printout
124##########################################################################
125Matrix of observation data X[10,3]
1267 4 3
1274 1 8
1286 3 5
1298 6 1
1308 5 7
1317 2 9
1325 3 3
1339 5 8
1347 4 5
1358 2 2
136Compute column norms
137Means=6.9 3.5 5.1
138Sigmas=1.523884 1.581139 2.806738
139Normalize observation data creating Z[10,3]
140  0.065622   0.316228  -0.748200
141 -1.903032  -1.581139   1.033228
142 -0.590596  -0.316228  -0.035629
143  0.721840   1.581139  -1.460771
144  0.721840   0.948683   0.676942
145  0.065622  -0.948683   1.389513
146 -1.246814  -0.316228  -0.748200
147  1.378058   0.948683   1.033228
148  0.065622   0.316228  -0.035629
149  0.721840  -0.948683  -1.104485
150
151Naive computation method that is sensitive to round-off error
152ZZ is (Ztranspose)(Z)
153ZZ=
154  9.000000   6.017916  -0.911824
155  6.017916   9.000000  -2.591349
156 -0.911824  -2.591349   9.000000
157the correlation matrix, R = (Zt)(Z)/(n-1)
158R=
159  1.0000   0.6687  -0.1013
160  0.6687   1.0000  -0.2879
161 -0.1013  -0.2879   1.0000
162Continuing the naive example use R for the eigensolution
163eigenvalues=  1.7688   0.9271   0.3041
164eigenvectors:
165  0.6420   0.3847  -0.6632
166  0.6864   0.0971   0.7207
167 -0.3417   0.9179   0.2017
168
169Good computation method- perform SVD of Z
170Singular values, S=3.98985804571 2.88854344819 1.65449373616
171square S and divide by (number of observations)-1 to get the eigenvalues
172eigenvalues=  1.7688   0.9271   0.3041
173eigenvectors:
174  0.6420   0.3847   0.6632
175  0.6864   0.0971  -0.7207
176 -0.3417   0.9179  -0.2017
177(the 3rd vector/column has inverted signs, thats ok the math still works)
178
179verify V x transpose = I
180As computed
181                   1        -5.55112e-017         1.38778e-016
182       -5.55112e-017                    1         5.55112e-017
183        1.38778e-016         5.55112e-017                    1
184
185Rounded off
1861.0 0.0 0.0
1870.0 1.0 0.0
1880.0 0.0 1.0
189
190square root of eigenvalues as a diagonal matrix
191  1.3300   0.0000   0.0000
192  0.0000   0.9628   0.0000
193  0.0000   0.0000   0.5515
194Factor Structure Matrix S
195  0.8538   0.3704   0.3658
196  0.9128   0.0935  -0.3975
197 -0.4544   0.8838  -0.1112
198
199Use first two eigenvalues
200  0.8538   0.3704
201  0.9128   0.0935
202 -0.4544   0.8838
203SSt for first two components, communality is the diagonal
204  0.8662   0.8140  -0.0606
205  0.8140   0.8420  -0.3321
206 -0.0606  -0.3321   0.9876
207
208Factor score coefficients B Matrix:
209      0.4827       0.3995       1.2026
210      0.5161       0.1009      -1.3069
211     -0.2569       0.9533      -0.3657
212NIST shows B with a mistake, -.18 for -1.2 (-1.18 with a typo)
213
214Work with first normalized observation row
2150.0656 0.3162 -0.7482
216compute the "factor scores" from multiplying by B
2170.3871 -0.6552 -0.0608
218NIST implies you might chart these
219Compute the scores using V, these are more familiar
2200.5148 -0.6308 -0.0335
221Calculate T2 from the scores, sum ti*ti/evi
222Hotelling T2=0.5828
223Fit z1 using first two principal components
224p=
225    0.6420     0.3847
226    0.6864     0.0971
227   -0.3417     0.9179
228  z1=  0.065622   0.316228  -0.748200
229zhat=  0.087847   0.292075  -0.754958
230diff= -0.022225   0.024153   0.006758
231Q statistic - distance from the model measurement for the first observation
232Q=0.0011
233Some experts would compute a corrected Q that is larger since the
234observation was used in building the model.  The Q calculation just used
235properly applies to the analysis of new observations.
236Corrected Q=0.0016 (factor=1.4286)
Note: See TracBrowser for help on using the repository browser.