# Principal Component Analysis (PCA) library(readr) library(plot3D) library(scatterplot3d) allSpec <- read_csv("allSpec.csv") load("co_stds.RData") load("cr_stds.RData") load("cu_stds.RData") load("ni_stds.RData") # Overview of PCA # generate some random xy data, where y is a f(x), with noise x1 = seq(-10,10,1) y1 = x1*2 + 8 + rnorm(21,0,7) # center and scale values help(scale) x1 = scale(x1) y1 = scale(y1) # plot the data and add x and y axes centered at (0,0) plot(x1,y1,asp = 1, pch = 19, col = "blue", xlab = "first variable", ylab = "second variable") abline(v = 0, col = "black", lty = 2) abline(h = 0, col = "black", lty = 2) # fit linear model to data; this is the first principal component xy1.lm = lm(y1 ~ x1) abline(xy1.lm, lwd = 2, col = "red") # plot vertical projection of points onto pc axis # distance from (0,0) of projections are called the scores a1 = xy1.lm\$coefficients[2] b1 = xy1.lm\$coefficients[1] c1 = y1 + x1/a1 dx1 = (c1 - b1)/(a1 + 1/a1) dy1 = a1 * dx1 + b1 points(dx1,dy1,pch = 19, col = "red") for (i in 1:21){ lines(x = c(x1[i], dx1[i]), y = c(y1[i], dy1[i]), lty = 3, lwd = 2, col = "red") } # draw new axis that is prepindicular to the first pc axis abline(a = 0, b = -1/a1,lwd = 2, col = "forestgreen") # plot vertical projection of points onto second pc axis # distance from (0,0) of projections are called the scores a2 = -1/a1 b2 = 0 c2 = y1 + x1/a2 dx2 = (c2 - b2)/(a2 + 1/a2) dy2 = a2 * dx2 + b2 points(dx2,dy2,pch = 19, col = "forestgreen") for (i in 1:21){ lines(x = c(x1[i], dx2[i]), y = c(y1[i], dy2[i]), lty = 3, lwd = 2, col = "forestgreen") } # PCA Worked Example # plot overlay of spectra for metals wavelengths = as.numeric(colnames(allSpec[8:642])) plot(x = wavelengths, y = allSpec[1, 8:642], type = "l", xlab = "wavelengths (nm)", ylab = "absorbance", col = "blue", lwd = 2) lines(x = wavelengths, y = allSpec[6, 8:642], col = "red", lwd = 2) lines(x = wavelengths, y = allSpec[11, 8:642], col = "orange", lwd = 2) lines(x = wavelengths, y = allSpec[16, 8:642], col = "forestgreen", lwd = 2) legend(x = 820, y = 0.4, legend = c("Cu", "Co", "Cr", "Ni"), lwd = 2, col = c("blue", "red", "orange", "forestgreen"), bty = "n") # select 16 wavelengths to use (wavelength_ids are column ids) wavelength_ids = seq(8,642,40) abline(v = as.numeric(colnames(allSpec[wavelength_ids])), lty = 3, lwd = 1) # subset data to use for this example (24 samples total) ex_one = allSpec[c(1, 6, 11, 21:25, 38:53), ] # complete pca and look at results help(prcomp) ex_one.pca = prcomp(ex_one[ , wavelength_ids], center = TRUE, scale = TRUE) summary(ex_one.pca) plot(ex_one.pca) # plot scores plot(ex_one.pca\$x, cex = 2, pch = 19, asp = 1) # plot scores using dimensions to color points plot(ex_one.pca\$x, cex = 2, pch = 19, asp = 1, col = factor(ex_one\$dimensions)) legend("topleft", legend = c("single component", "binary mixture", "ternary mixture"), col = c("black", "red", "green"), pch = 19, bty = "n") # plot loadings and scores as biplot help(biplot) biplot(ex_one.pca, cex = c(2, 0.6), xlabs = rep("•",24)) # plot scores with colors by concentration of copper cuPal = colorRampPalette(c("white", "blue")) cuColor = cuPal(50)[as.numeric(cut(ex_one\$concCu,breaks = 50))] plot(ex_one.pca\$x, pch = 21, bg = cuColor, cex = 2, asp = 1) # plot grid of pca results by concentration of each analyte # and a 3D scatterplot of the scores on all three pc axes cuPal = colorRampPalette(c("white", "blue")) cuColor = cuPal(50)[as.numeric(cut(ex_one\$concCu, breaks = 50))] crPal = colorRampPalette(c("white", "orange")) crColor = crPal(50)[as.numeric(cut(ex_one\$concCr, breaks = 50))] coPal = colorRampPalette(c("white", "red")) coColor = coPal(50)[as.numeric(cut(ex_one\$concCo, breaks = 50))] old.par = par(mfrow = c(2,2)) # plot(ex_one.pca\$x, pch = 19, cex = 1.6, asp = 1) plot(ex_one.pca\$x, pch = 21, bg = cuColor, cex = 1.6, asp = 1) plot(ex_one.pca\$x, pch = 21, bg = crColor, cex = 1.6, asp = 1) plot(ex_one.pca\$x, pch = 21, bg = coColor, cex = 1.6, asp = 1) colv = factor(ex_one\$dimensions) scatterplot3d(x = ex_one.pca\$x[,1], y = ex_one.pca\$x[,2], z = ex_one.pca\$x[,3], pch = 19, color = colv, cex.symbols = 1.6, type = "h", xlab = "PC1", ylab = "PC2", zlab = "PC3") par(old.par)