1 | <html> |
---|
2 | <head> |
---|
3 | <title>NIST PCA Example</title> |
---|
4 | </head> |
---|
5 | <P> |
---|
6 | <H2>NIST PCA Example</H2> |
---|
7 | <pre> |
---|
8 | |
---|
9 | # A Principal Components Analysis example |
---|
10 | # |
---|
11 | # walk through the NIST/Sematech PCA example data section 6.5 |
---|
12 | # http://www.itl.nist.gov/div898/handbook/ |
---|
13 | # |
---|
14 | # This file is part of the Hume La Package |
---|
15 | # (C)Copyright 2001, Hume Integration Services |
---|
16 | # |
---|
17 | # These calculations use the Hume Linear Algebra package, La |
---|
18 | # |
---|
19 | package require La |
---|
20 | catch { namespace import La::*} |
---|
21 | |
---|
22 | |
---|
23 | # Look at the bottom of this file to see the console output of this procedure. |
---|
24 | |
---|
25 | proc nist_pca {} { |
---|
26 | puts {Matrix of observation data X[10,3]} |
---|
27 | set X {2 10 3 7 4 3 4 1 8 6 3 5 8 6 1 8 5 7 7 2 9 5 3 3 9 5 8 7 4 5 8 2 2} |
---|
28 | puts [show $X] |
---|
29 | puts {Compute column norms} |
---|
30 | mnorms_br X Xmeans Xsigmas |
---|
31 | puts "Means=[show $Xmeans]" |
---|
32 | puts "Sigmas=[show $Xsigmas %.6f]" |
---|
33 | puts {Normalize observation data creating Z[10,3]} |
---|
34 | puts [show [set Z [mnormalize $X $Xmeans $Xsigmas]] %10.6f]\n |
---|
35 | puts {Naive computation method that is sensitive to round-off error} |
---|
36 | puts {ZZ is (Ztranspose)(Z) } |
---|
37 | puts "ZZ=\n[show [set ZZ [mmult [transpose $Z] $Z]] %10.6f]" |
---|
38 | puts {the correlation matrix, R = (Zt)(Z)/(n-1)} |
---|
39 | puts "R=\n[show [set R [mscale $ZZ [expr 1/9.0]]] %8.4f]" |
---|
40 | puts {Continuing the naive example use R for the eigensolution} |
---|
41 | set V $R |
---|
42 | mevsvd_br V evals |
---|
43 | puts "eigenvalues=[show $evals %8.4f]" |
---|
44 | puts "eigenvectors:\n[show $V %8.4f]\n" |
---|
45 | puts "Good computation method- perform SVD of Z" |
---|
46 | set U $Z |
---|
47 | msvd_br U S V |
---|
48 | puts "Singular values, S=[show $S]" |
---|
49 | puts "square S and divide by (number of observations)-1 to get the eigenvalues" |
---|
50 | set SS [mprod $S $S] |
---|
51 | set evals [mscale $SS [expr 1.0/9.0]] |
---|
52 | puts "eigenvalues=[show $evals %8.4f]" |
---|
53 | puts "eigenvectors:\n[show $V %8.4f]" |
---|
54 | puts "(the 3rd vector/column has inverted signs, thats ok the math still works)" |
---|
55 | puts "\nverify V x transpose = I" |
---|
56 | puts "As computed\n[show [mmult $V [transpose $V]] %20g]\n" |
---|
57 | puts "Rounded off\n[show [mround [mmult $V [transpose $V]]]]" |
---|
58 | #puts "Diagonal matrix of eigenvalues, L" |
---|
59 | #set L [mdiag $evals] |
---|
60 | # |
---|
61 | |
---|
62 | puts "\nsquare root of eigenvalues as a diagonal matrix" |
---|
63 | proc sqrt x { return [expr sqrt($x)] } |
---|
64 | set evals_sr [mat_unary_op $evals sqrt] |
---|
65 | set Lhalf [mdiag $evals_sr] |
---|
66 | puts [show $Lhalf %8.4f] |
---|
67 | |
---|
68 | puts "Factor Structure Matrix S" |
---|
69 | set S [mmult $V $Lhalf] |
---|
70 | puts [show $S %8.4f]\n |
---|
71 | set S2 [mrange $S 0 1] |
---|
72 | puts "Use first two eigenvalues\n[show $S2 %8.4f]" |
---|
73 | # Nash discusses better ways to compute this than multiplying SSt |
---|
74 | # the NIST method is sensitive to round-off error |
---|
75 | set S2S2 [mmult $S2 [transpose $S2]] |
---|
76 | puts "SSt for first two components, communality is the diagonal" |
---|
77 | puts [show $S2S2 %8.4f]\n |
---|
78 | |
---|
79 | |
---|
80 | # define reciprocal square root function |
---|
81 | proc recipsqrt x { return [expr 1.0/sqrt($x)] } |
---|
82 | # use it for Lrhalf calculation |
---|
83 | set Lrhalf [mdiag [mat_unary_op $evals recipsqrt]] |
---|
84 | |
---|
85 | set B [mmult $V $Lrhalf] |
---|
86 | puts "Factor score coefficients B Matrix:\n[show $B %12.4f]" |
---|
87 | |
---|
88 | puts "NIST shows B with a mistake, -.18 for -1.2 (-1.18 with a typo)" |
---|
89 | |
---|
90 | puts "\nWork with first normalized observation row" |
---|
91 | set z1 {2 3 0 0.065621795889 0.316227766017 -0.7481995314} |
---|
92 | puts [show $z1 %.4f] |
---|
93 | |
---|
94 | puts "compute the \"factor scores\" from multiplying by B" |
---|
95 | set t [mmult [transpose $z1] $B] |
---|
96 | puts [show $t %.4f] |
---|
97 | puts "NIST implies you might chart these" |
---|
98 | |
---|
99 | #set T2 [dotprod $t $t] |
---|
100 | #puts "Hotelling T2=[format %.4f $T2]" |
---|
101 | |
---|
102 | puts "Compute the scores using V, these are more familiar" |
---|
103 | set t [mmult [transpose $z1] $V] |
---|
104 | puts [show $t %.4f] |
---|
105 | puts "Calculate T2 from the scores, sum ti*ti/evi" |
---|
106 | |
---|
107 | set T2 [msum [transpose [mdiv [mprod $t $t] [transpose $evals]]]] |
---|
108 | puts "Hotelling T2=[format %.4f $T2]" |
---|
109 | |
---|
110 | puts "Fit z1 using first two principal components" |
---|
111 | set p [mrange $V 0 1] |
---|
112 | puts p=\n[show $p %10.4f] |
---|
113 | set t [mmult [transpose $z1] $p] |
---|
114 | set zhat [mmult $t [transpose $p]] |
---|
115 | puts " z1=[show $z1 %10.6f]" |
---|
116 | puts "zhat=[show $zhat %10.6f]" |
---|
117 | set diffs [msub [transpose $z1] $zhat] |
---|
118 | puts "diff=[show $diffs %10.6f]" |
---|
119 | puts "Q statistic - distance from the model measurement for the first observation" |
---|
120 | set Q [dotprod $diffs $diffs] |
---|
121 | puts Q=[format %.4f $Q] |
---|
122 | puts "Some experts would compute a corrected Q that is larger since the |
---|
123 | observation was used in building the model. The Q calculation just used |
---|
124 | properly applies to the analysis of new observations." |
---|
125 | set corr [expr 10.0/(10.0 - 2 -1)] ;# N/(N - Ncomp - 1) |
---|
126 | puts "Corrected Q=[format %.4f [expr $Q * $corr]] (factor=[format %.4f $corr])" |
---|
127 | } |
---|
128 | |
---|
129 | return |
---|
130 | ########################################################################## |
---|
131 | Console Printout |
---|
132 | ########################################################################## |
---|
133 | Matrix of observation data X[10,3] |
---|
134 | 7 4 3 |
---|
135 | 4 1 8 |
---|
136 | 6 3 5 |
---|
137 | 8 6 1 |
---|
138 | 8 5 7 |
---|
139 | 7 2 9 |
---|
140 | 5 3 3 |
---|
141 | 9 5 8 |
---|
142 | 7 4 5 |
---|
143 | 8 2 2 |
---|
144 | Compute column norms |
---|
145 | Means=6.9 3.5 5.1 |
---|
146 | Sigmas=1.523884 1.581139 2.806738 |
---|
147 | Normalize observation data creating Z[10,3] |
---|
148 | 0.065622 0.316228 -0.748200 |
---|
149 | -1.903032 -1.581139 1.033228 |
---|
150 | -0.590596 -0.316228 -0.035629 |
---|
151 | 0.721840 1.581139 -1.460771 |
---|
152 | 0.721840 0.948683 0.676942 |
---|
153 | 0.065622 -0.948683 1.389513 |
---|
154 | -1.246814 -0.316228 -0.748200 |
---|
155 | 1.378058 0.948683 1.033228 |
---|
156 | 0.065622 0.316228 -0.035629 |
---|
157 | 0.721840 -0.948683 -1.104485 |
---|
158 | |
---|
159 | Naive computation method that is sensitive to round-off error |
---|
160 | ZZ is (Ztranspose)(Z) |
---|
161 | ZZ= |
---|
162 | 9.000000 6.017916 -0.911824 |
---|
163 | 6.017916 9.000000 -2.591349 |
---|
164 | -0.911824 -2.591349 9.000000 |
---|
165 | the correlation matrix, R = (Zt)(Z)/(n-1) |
---|
166 | R= |
---|
167 | 1.0000 0.6687 -0.1013 |
---|
168 | 0.6687 1.0000 -0.2879 |
---|
169 | -0.1013 -0.2879 1.0000 |
---|
170 | Continuing the naive example use R for the eigensolution |
---|
171 | eigenvalues= 1.7688 0.9271 0.3041 |
---|
172 | eigenvectors: |
---|
173 | 0.6420 0.3847 -0.6632 |
---|
174 | 0.6864 0.0971 0.7207 |
---|
175 | -0.3417 0.9179 0.2017 |
---|
176 | |
---|
177 | Good computation method- perform SVD of Z |
---|
178 | Singular values, S=3.98985804571 2.88854344819 1.65449373616 |
---|
179 | square S and divide by (number of observations)-1 to get the eigenvalues |
---|
180 | eigenvalues= 1.7688 0.9271 0.3041 |
---|
181 | eigenvectors: |
---|
182 | 0.6420 0.3847 0.6632 |
---|
183 | 0.6864 0.0971 -0.7207 |
---|
184 | -0.3417 0.9179 -0.2017 |
---|
185 | (the 3rd vector/column has inverted signs, thats ok the math still works) |
---|
186 | |
---|
187 | verify V x transpose = I |
---|
188 | As computed |
---|
189 | 1 -5.55112e-017 1.38778e-016 |
---|
190 | -5.55112e-017 1 5.55112e-017 |
---|
191 | 1.38778e-016 5.55112e-017 1 |
---|
192 | |
---|
193 | Rounded off |
---|
194 | 1.0 0.0 0.0 |
---|
195 | 0.0 1.0 0.0 |
---|
196 | 0.0 0.0 1.0 |
---|
197 | |
---|
198 | square root of eigenvalues as a diagonal matrix |
---|
199 | 1.3300 0.0000 0.0000 |
---|
200 | 0.0000 0.9628 0.0000 |
---|
201 | 0.0000 0.0000 0.5515 |
---|
202 | Factor Structure Matrix S |
---|
203 | 0.8538 0.3704 0.3658 |
---|
204 | 0.9128 0.0935 -0.3975 |
---|
205 | -0.4544 0.8838 -0.1112 |
---|
206 | |
---|
207 | Use first two eigenvalues |
---|
208 | 0.8538 0.3704 |
---|
209 | 0.9128 0.0935 |
---|
210 | -0.4544 0.8838 |
---|
211 | SSt for first two components, communality is the diagonal |
---|
212 | 0.8662 0.8140 -0.0606 |
---|
213 | 0.8140 0.8420 -0.3321 |
---|
214 | -0.0606 -0.3321 0.9876 |
---|
215 | |
---|
216 | Factor score coefficients B Matrix: |
---|
217 | 0.4827 0.3995 1.2026 |
---|
218 | 0.5161 0.1009 -1.3069 |
---|
219 | -0.2569 0.9533 -0.3657 |
---|
220 | NIST shows B with a mistake, -.18 for -1.2 (-1.18 with a typo) |
---|
221 | |
---|
222 | Work with first normalized observation row |
---|
223 | 0.0656 0.3162 -0.7482 |
---|
224 | compute the "factor scores" from multiplying by B |
---|
225 | 0.3871 -0.6552 -0.0608 |
---|
226 | NIST implies you might chart these |
---|
227 | Compute the scores using V, these are more familiar |
---|
228 | 0.5148 -0.6308 -0.0335 |
---|
229 | Calculate T2 from the scores, sum ti*ti/evi |
---|
230 | Hotelling T2=0.5828 |
---|
231 | Fit z1 using first two principal components |
---|
232 | p= |
---|
233 | 0.6420 0.3847 |
---|
234 | 0.6864 0.0971 |
---|
235 | -0.3417 0.9179 |
---|
236 | z1= 0.065622 0.316228 -0.748200 |
---|
237 | zhat= 0.087847 0.292075 -0.754958 |
---|
238 | diff= -0.022225 0.024153 0.006758 |
---|
239 | Q statistic - distance from the model measurement for the first observation |
---|
240 | Q=0.0011 |
---|
241 | Some experts would compute a corrected Q that is larger since the |
---|
242 | observation was used in building the model. The Q calculation just used |
---|
243 | properly applies to the analysis of new observations. |
---|
244 | Corrected Q=0.0016 (factor=1.4286) |
---|
245 | </PRE> |
---|
246 | </html> |
---|