# Cluster Analysis (CA) library(readr) library(plot3D) library(scatterplot3d) allSpec <- read_csv("allSpec.csv") load("co_stds.RData") load("cr_stds.RData") load("cu_stds.RData") load("ni_stds.RData") # Overview of CA # plot some sample data (six data points, each with x and y values) a = c(1.70, -1.30, -0.90, -0.78, 3.20, -1.90) b = c(2.00, -0.51, 0.12, -0.11, -1.40, -0.17) plot(a, b, pch = 19, col = "blue") text(a + 0.1, b + 0.1, seq(1:6)) # first cluster: points 3 & 4 replaced with average segments(a[3], b[3], a[4], b[4], col = "red") a[7] = (a[3] + a[4])/2 b[7] = (b[3] + b[4])/2 points(x = a[7], y = b[7], pch = 19, col = "red") text(a[7] + 0.1, b[7] + 0.1, 7, col = "red") # second cluster: points 2 & 6 replaced with average segments(a[2], b[2], a[6], b[6], col = "red") a[8] = (a[2] + a[6])/2 b[8] = (b[2] + b[6])/2 points(x = a[8], y = b[8], pch = 19, col = "red") text(a[8] + 0.1, b[8] + 0.1, 8, col = "red") # remaining clusters segments(a[7], b[7], a[8], b[8], col = "red") a[9] = (a[7] + a[8])/2 b[9] = (b[7] + b[8])/2 points(x = a[9], y = b[9], pch = 19, col = "red") text(a[9] + 0.1, b[9] + 0.1, 9, col = "red") segments(a[1], b[1], a[9], b[9], col = "red") a[10] = (a[1] + a[9])/2 b[10] = (b[1] + b[9])/2 points(x = a[10], y = b[10], pch = 19, col = "red") text(a[10] + 0.1, b[10] + 0.1, 10, col = "red") segments(a[5], b[5], a[10], b[10], col = "red") a[11] = (a[5] + a[10])/2 b[11] = (b[5] + b[10])/2 points(x = a[11], y = b[11], pch = 19, col = "red") text(a[11] + 0.1, b[11] + 0.1, 11, col = "red") # calculate and draw dendrogram ab = data.frame(a[1:6], b[1:6]) help(dist) ab.dist = dist(ab) ab.dist help(hclust) ab.cluster = hclust(ab.dist, method = "average") help("plot.dendrogram") plot(ab.cluster, hang = -1, main = "", sub = "", xlab = "") # Cluster Analysis Worked Example # use same data from PCA worked example wavelength_ids = seq(8,642,40) abline(v = as.numeric(colnames(allSpec[wavelength_ids])), lty = 3, lwd = 1) ex_one = allSpec[c(1, 6, 11, 21:25, 38:53), ] # calculate and examine dist matrix ex_one.dist = dist(ex_one[ , wavelength_ids]) ex_one.dist # calculate and examine cluster analysis ex_one.cluster = hclust(ex_one.dist, method = "ward.D") plot(ex_one.cluster, hang = -1, main = "", sub = "", xlab = "", cex = 0.75) # highlight clusters in terms of fraction copper stock in sample ex_one$perCu = ex_one$concCu/ex_one$concCu[1] plot(ex_one.cluster, hang = -1, labels = ex_one$perCu, main = "copper", sub = "", xlab = "fraction of stock in sample", ylab = "", cex = 0.75) help("rect.hclust") rect.hclust(ex_one.cluster, k = 3, which = 1, border = "blue") # plot grid of cluster diagrams old.par = par(mfrow = c(2,2)) ex_one$perCo = ex_one$concCo/ex_one$concCo[2] ex_one$perCr = ex_one$concCr/ex_one$concCr[3] plot(ex_one.cluster, hang = -1, labels = ex_one$perCu, main = "copper", sub = "", xlab = "fraction of stock in sample", ylab = "", cex = 0.75) rect.hclust(ex_one.cluster, k = 3, which = 1, border = "blue") plot(ex_one.cluster, hang = -1, labels = ex_one$perCo, main = "cobalt", sub = "", xlab = "fraction of stock in sample", ylab = "", cex = 0.75) rect.hclust(ex_one.cluster, k = 3, which = 3, border = "red") plot(ex_one.cluster, hang = -1, labels = ex_one$perCr, main = "chromium", sub = "", xlab = "fraction of stock in sample", ylab = "", cex = 0.75) rect.hclust(ex_one.cluster, k = 3, which = 2, border = "orange") wave.dist = dist(t(ex_one[ , wavelength_ids])) wave.clust = hclust(wave.dist, method = "ward.D") plot(wave.clust, hang = -1, main = "clustering of wavelengths", sub = "", xlab = "wavelength", ylab = "") par(old.par)