drive <- "E:" code.dir <- paste(drive, "DataMining/Code", sep = "/") data.dir <- paste(drive, "DataMining/Data", sep = "/") # source(paste(code.dir, "creategrid.r", sep = "/")) source(paste(code.dir, "array2arraydist.r", sep = "/")) source(paste(code.dir, "vq_helpers.r", sep = "/")) source(paste(code.dir, "creategaussians.r", sep = "/")) source(paste(code.dir, "voronoi.r", sep = "/")) source(paste(code.dir, "readfleas.r", sep = "/")) library(stats) library(deldir) # Do a Voronoi (Dirichlet) tesselation. # i.e. Spliting the plane into nearest neighbours. x <- c(2.3,3.0,7.0,1.0,3.0,8.0) y <- c(2.3,3.0,2.0,5.0,8.0,9.0) # Put dummy points at the corners of the rectangular # window, i.e. at (0,0), (10,0), (10,10), and (0,10) try <- deldir(x, y, list(ndx = 2, ndy = 2), c(0, 10, 0, 10)) plot(try, wlines = "tess", col = "red") points(x, y, pch = 17) #===== = = ========================= # Vector Quantization - kmeans", # Linde-Buzo-Gray, Generalized Linde Algorithm # A "V" with random initialization. Col <- rainbow(20) # Set colours # Set up the data and codebook vectors n.x <- 1000 n.y <- n.x n.cb <- 7 x <- runif(n.x)*2 - 1 y <- runif(n.y)/2 + abs(x) Data <- cbind(x,y) # For replication # save(Data, file = paste(data.dir, "VQV.Rdata", sep = "/")) load(paste(data.dir, "VQV.Rdata", sep = "/")) cb <- Data[sample(n.x, n.cb),] graphics.off() loop <- T while (loop == T) { # Find the distance from the codebook vectors to data points Dist <- array2arraydist(cb, Data) # min.index gives the number of the nearest codebook vector to the data points min.dist <- apply(Dist, 2, min) min.index <- apply(Dist, 2, order)[1,] # Draw the Voronoi sets Voronoi(cb, 1:n.cb, c(200,200), range(Data[,1]), range(Data[,2])) # points(Data, col = Col[min.index], pch = "+") points(cb, col = "red", pch = 16, cex = 2) # Move the codebook vectors to the middle of the group mid.point <- matrix(0, n.cb, 2) for (c in 1:n.cb) { mid.point[c,] <- apply(Data[(min.index == c),], 2, mean) } points(mid.point, col = "green", pch = 17, cex = 2) for (i in 1:500000) {junk <- exp(100)} # delay max(mid.point - cb) if (max(mid.point - cb) <.001) { loop <- F } cb <- mid.point } # kmeans(Data[1:100,], 3, 10) #============ Coin flip ============= # Random initialization can lead to different # final centriods. xx<- matrix(c(-5,0,-4,0,-3,0,-2,0,-1,0,0,0,1,0,2,0,3,0,4,0,5,0,0,1,0,-1), nrow = 13,byrow = T) loop <- T while (loop) { oldpar <- par(mfrow = c(2,2)) for (i in 1:4) { xx.km <- kmeans(xx, 2, 200) a <- xx.km$centers if (xx.km$cluster[1] != 1) xx.km$cluster <- abs(xx.km$cluster-2)+1 plot(xx, col = xx.km$cluster*2, pch = 16, cex = 1.5) points(xx.km$centers, pch = 17, col = "black", cex = 2) } par(oldpar) res <- readline("Finished Y/N ...") if ((res == "Y") | (res == "y")) loop = F } #================================ # A "V" with specific initialization at side. cb <- matrix(c(1,1,1,1.05,1,1.1,1,1.2,1,1.3,1,1.35,1,1.4),7,2, byrow = T) # GOOD if (dev.cur() [[1]]!=1) dev.off(which=dev.cur()) plot(Data, col = Col[min.index], pch = "+") points(cb, col = "red", pch = 16, cex = 2) loop <- T while (loop == T) { # Find the distance from the codebook vectors to data points Dist <-array2arraydist(cb, Data) # min.index gives the number of the nearest codebook vector to the data points min.dist <- apply(Dist, 2, min) min.index <- apply(Dist, 2, order)[1,] # Draw the Voronoi sets Voronoi(cb, 1:n.cb, c(200,200), range(Data[,1]), range(Data[,2])) points(Data, col = Col[min.index], pch = "+") points(cb, col = "red", pch = 16, cex = 2) # Move the codebook vectors to the middle of the group mid.point <- matrix(0, n.cb, 2) for (c in 1:n.cb) { mid.point[c,] <- apply(Data[(min.index == c),], 2, mean) } points(mid.point, col = "green", pch = 17, cex = 2) max(mid.point - cb) if (max(mid.point - cb) < 0.001) { loop <- F } cb <- mid.point } #================================ #A "V" with specific initialization at apex. cb <- matrix(c(0,0.34, 0,.1, 0,.2, 0,.25, 0,.28, 0,.32,0,.33),7,2,byrow = T) plot(Data, col = Col[min.index], pch = "+") points(cb, col = "red", pch = 16, cex = 2) # loop <- T while (loop == T) { # Find the distance from the codebook vectors to data points Dist <- array2arraydist(cb, Data) # min.index gives the number of the nearest codebook vector to the data points min.dist <- apply(Dist, 2, min) min.index <- apply(Dist, 2, order)[1,] # Draw the Voronoi sets Voronoi(cb, 1:n.cb, c(200,200), range(Data[,1]), range(Data[,2])) points(Data, col = Col[min.index], pch = "+") points(cb, col = "red", pch = 16, cex = 2) # Move the codebook vectors to the middle of the group mid.point <- matrix(0, n.cb, 2) for (c in 1:n.cb) { mid.point[c,] <- apply(Data[(min.index == c),], 2, mean) } points(mid.point, col = "green", pch = 17, cex = 2) max(mid.point - cb) if (max(mid.point - cb) <.001) { loop <- F } cb <- mid.point } #================================ numb.centres <- 8 numbpoints <- 100 # Set up the centers for 8 clusters (centres <- cbind(c(1,3,6,2,5,8,4,3), c(7,1,3,8,6,2,2,5))) base.colors <- c("#FF0000","#00FF00","#0000FF","#FF00FF","#00FFFF","#FFFF00", "#800000","#008000","#000080","#FF0080","#408080","#804000", "#004080", "#FF8000") colors <- rep(base.colors, ceiling(numb.centres/length(base.colors))) # set.seed(123456) clust <- {} for (i in (1:numb.centres)){ x <- rnorm(100, centres[i,1],.3) y <- rnorm(100, centres[i,2],.3) clust <- c(clust, list(cbind(x,y))) } clust # For replication # save(clust, file = paste(data.dir, "VQ8clust.Rdata", sep = "/")) load(paste(data.dir, "VQ8clust.Rdata", sep = "/")) clust[[3]][1:5, ] # X.syn <- clust[[1]] for (i in (2:numb.centres)){ X.syn <- rbind(X.syn, clust[[i]]) } # oldpar <- par(mfrow = c(4, 4)) par(mar = c(2, 1, 2, 1)) errs <- rep(0, 10) DBI <- rep(0, 10) # for (i in 2:14) { KM <- kmeans(X.syn, i, 15) plot(X.syn, col = colors[KM$cluster], pch = KM$cluster, main = paste(i," clusters")) errs[i-1] <- sum(KM$withinss) DBI[i-1] <- Davies.Bouldin(KM$centers, KM$withinss, KM$size) print(KM$centers) } # plot(2:14, errs, main = "SS") lines(2:14, errs) # plot(2:14, DBI, main = "Davies-Bouldin") lines(2:14, DBI) # plot(0, 0, xlim = c(0,10), ylim = c(0,10), type = "n", main = "Correct") for (i in (1:numb.centres)){ for (j in 1:numbpoints) { points (clust[[i]][j,1], clust[[i]][j,2], col = colors[i]) } } par(oldpar) #================================ # An example of kmeans (Vector Quantization) # Image compression library(pixmap) d.file <- paste(data.dir, "lena_bw.pgm", sep = "/") # x <- read.pnm(d.file) graphics.off() plot(x) x@size #=========================== pix <- x@grey pix[1:4, 1:5] #============ 1/4 of data =============== z.mean.4 <- x blocked.4 <- deconstruct.4(pix) # Rows <- dim(pix)[1] Cols <- dim(pix)[2] new.bytes.mean.4 <- blocked.4 for (i in 1:((Rows/2)*(Cols/2))) { tmp <- mean(new.bytes.mean.4[i,]) for (j in 1:4) { new.bytes.mean.4[i,j] <- tmp } } blocked.4[1:5,] new.bytes.mean.4[1:5,] # This uses 1/4 of the info #============ 1/4 of data =============== z.mean.4@grey <- reconstruct.4(new.bytes.mean.4, Rows, Cols) # Cluster the blocked information into 100 clusters of 4 dimensions. numb.centers <- 100 classes.100.4 <- {} while (length(classes.100.4) == 0) { res <- try(kmeans(blocked.4, numb.centers, 50)) if (length(res) > 1) classes.100.4 <- res } # Get integer values for the centroids. # Each of the blocks has an associated centroid so we have # 16384 indices to the 100 4-D codebook vectors new.bytes.4 <- floor(classes.100.4$centers[classes.100.4$cluster,]*256)/256 new.bytes.4[1:5, ] # z.4 <- x z.4@grey <- reconstruct.4(new.bytes.4, Rows, Cols) #============ Plot =============== graphics.off() x11(width = 8,height = 6.5) layout(matrix(c(1,2,1,2,3,4), 3, 2, byrow = T)) par(mar = c(1,1,2,1)) plot(z.mean.4, main = "Mean of 4") plot(z.4, main = "Centroid of 4") #============ Detail ============== par(mar = c(1,1,1,1)) plot(z.mean.4[125:150,125:175]) plot(z.4[125:150,125:175]) #=========== 1/16 of data ============== z.mean.16 <- x blocked.16 <- deconstruct.16(pix) # Rows <- dim(pix)[1] Cols <- dim(pix)[2] new.bytes.mean.16 <- blocked.16 for (i in 1:((Rows/4)*(Cols/4))) { tmp <- mean(new.bytes.mean.16[i,]) for (j in 1:16) { new.bytes.mean.16[i,j] <- tmp } } # This uses 1/16 of the info z.mean.16@grey <- reconstruct.16(new.bytes.mean.16, Rows, Cols) # Cluster the blocked information into 100 clusters of 16 dimensions. numb.centers <- 100 classes.100.16 <- {} while (length(classes.100.16) == 0) { res <- try(kmeans(blocked.16, numb.centers, 50)) if (length(res) > 1) classes.100.16 <- res } # Get integer values for the centroids. # Each of the blocks has an associated centroid so we have # 4096 indices to the 100 16-D codebook vectors new.bytes.16 <- floor(classes.100.16$centers[classes.100.16$cluster,]*256)/256 z.16 <- x z.16@grey <- reconstruct.16(new.bytes.16, Rows, Cols) #============ Plot =============== graphics.off() x11(width = 8,height = 6.5) layout(matrix(c(1,2,1,2,3,4), 3, 2, byrow = T)) par(mar = c(1,1,2,1)) plot(z.mean.16, main = "Mean of 16") plot(z.16, main = "Centroid of 16") #============ Detail ============== par(mar = c(1,1,1,1)) plot(z.mean.16[125:150,125:175]) plot(z.16[125:150,125:175]) # Differences range(pix) z.diff.mean.4 <- x tmp <- pix - z.mean.4@grey z.diff.mean.4@grey <- abs(tmp) z.diff.4 <- x tmp <- pix - z.4@grey z.diff.4@grey <- abs(tmp) # z.diff.mean.16 <- x tmp <- pix - z.mean.16@grey z.diff.mean.16@grey <- abs(tmp) z.diff.16 <- x tmp <- pix-z.16@grey z.diff.16@grey <- abs(tmp) x11(width = 10, height = 10) oldpar <- par(mfrow = c(2, 2), mar = c(2, 1, 2, 1)) plot(z.diff.mean.4, main = "Error Mean of 4") mtext(paste("Err = ",floor(sum((pix - z.mean.4@grey)^2)*100)/100), 1) plot(z.diff.4, main = "Error Centroid of 4") mtext(paste("Err = ",floor(sum((pix - z.4@grey)^2)*100)/100), 1) plot(z.diff.mean.16, main = "Error Mean of 16") mtext(paste("Err = ",floor(sum((pix - z.mean.16@grey)^2)*100)/100), 1) plot(z.diff.16, main = "Error Centroid of 16") mtext(paste("Err = ",floor(sum((pix - z.16@grey)^2)*100)/100), 1) par(oldpar) #=========== VQ - split ================== # A "V" starting with 2 centroids and splitting. # LBG VQ design algorithm k <- 2 # Dimension of space ep <- 0.01 # Tolerance for change in cb position # N <- 1 # Put the codebook vector at the mean cb <- rbind(apply(Data, 2, mean)) graphics.off() while (N < 16) { for (i in 1:N) { cb <- rbind(cb, (1-ep)*cb[i,]) cb[i,] <- (1+ep)*cb[i,] } N <- 2*N # Repeat this step until little change in cb pos shift <- 100 j <- 2 while (shift > .003) { n.cb <- nrow(cb) Dist <- array2arraydist(cb, Data) # min.index gives the number of the nearest codebook vector # to the data points min.dist <- apply(Dist, 2, min) if (n.cb == 1) { min.index <- apply(Dist, 2, order) } else { min.index <- apply(Dist, 2, order)[1,] } j <- j + 1 Voronoi(cb, 1:n.cb, c(200,200), c(-1.2,1.2), c(-0.2,2)) points(Data, col = Col[min.index], pch = "+") points(cb, col = "red", pch = 16, cex = 2) # Move the codebook vectors to the middle of the group mid.point <- matrix(0, n.cb, 2) for (c in 1:n.cb) { mid.point[c,] <- apply(Data[(min.index ==c),], 2, mean) } points(mid.point, col = "green", pch = 17, cex = 2) for (i in 1:500000) {junk <- exp(100)} # delay shift <- max(mid.point - cb) cb <- mid.point } withinSS <- rep(0, n.cb) size <- rep(0, n.cb) for (k in 1:n.cb) { Data.in.cluster <- Data[min.index == k,] withinSS[k] <- sum(apply((t(t(Data.in.cluster) - cb[k,]))^2, 1, sum)) size[k] <- nrow(Data.in.cluster) } cat("N = ", N, " DBI = ", Davies.Bouldin(cb, withinSS, size), "\n") } # ========= VQ - split worst =============== #A modification of above. The "worst" cluster is split. k <- 2 # Dimension of space ep <- 0.01 N <- 1 # Put the vector at the mean to start cb <- rbind(apply(Data, 2, mean)) # graphics.off() plot(Data, pch = "+") points(cb[1], cb[2], col = "red", pch = 16, cex = 1) plot(Data, pch = "+") # # Then split cb <- rbind(cb, (1 + ep)*cb[1,]) cb[1,] <- (1-ep)*cb[1,] N <- 2 plot(Data, pch = "+") points(cb[,1],cb[,2], col = "red", pch = 16, cex = 1) # ======== Do the loop ================ for (loop in (1:5)) { res <- move.centroids(cb, Data) cb <- res[[1]] belongs.to <- res[[2]] n.cb <- nrow(cb) withinSS <- rep(0, n.cb) size <- rep(0, n.cb) for (k in 1:n.cb) { Data.in.cluster <- Data[belongs.to == k, ] withinSS[k] <- sum(apply((t(t(Data.in.cluster) - cb[k,]))^2, 1, sum)) size[k] <- nrow(Data.in.cluster) } # Find the largest worst.SS <- order(-withinSS)[1] cat("N = ", N, " DBI = ", Davies.Bouldin(cb, withinSS, size), "\n") readline("Press return...") cb <- rbind(cb, (1+ep)*cb[worst.SS,]) cb[worst.SS,] <- (1-ep)*cb[worst.SS,] N <- N+1 } # ========== Do on Gaussians ============== # See how it works on some Gaussians numb.clust <- 7 Data.GC <- create.gaussians(numb.clust, varx = 1, vary = 2) # save(Data.GC, file = paste(data.dir, "VQGauss.Rdata", sep = "/")) load(paste(data.dir, "VQGauss.Rdata", sep = "/")) Data.G <- Data.GC[ , 1:2] graphics.off() n.x <- dim(Data.G)[1] ep <- 0.01 N <- 1 # Put the vector at the mean cb <- rbind(apply(Data.G, 2, mean)) # plot(Data.G, pch = Data.GC[ , 3]) points(cb[1],cb[2], col = "red", pch = 16, cex = 1) cb <- rbind(cb, (1+ep)*cb[1,]) cb[1,] <- (1-ep)*cb[1,] # N <- 2 for (loop in (1:(numb.clust+3))) { res <- move.centroids(cb, Data.G, pch = Data.GC[,3]) cb <- res[[1]] belongs.to <- res[[2]] n.cb <- nrow(cb) withinSS <- rep(0, n.cb) size <- rep(0, n.cb) for (k in 1:n.cb) { Data.in.cluster <- Data.G[belongs.to == k,] withinSS[k] <- sum(apply((t(t(Data.in.cluster) - cb[k,]))^2, 1, sum)) size[k] <- nrow(Data.in.cluster) } # Find the largest worst.SS <- order(-withinSS)[1] cat("N = ", N, " DBI = ", Davies.Bouldin(cb, withinSS, size), "\n") readline("Press return...") cb <- rbind(cb, (1+ep)*cb[worst.SS,]) cb[worst.SS,] <- (1-ep)*cb[worst.SS,] N <- N+1 } plot(Data.G, col = Data.GC[ , 3]) #====== Flea library(mda) (k2.m.flea <- kmeans(d.flea, 2, 10)) library(stats) confusion(k2.m.flea$cluster, flea.species) confusion.expand <- function (pred.c, class) { temp <-confusion(pred.c,class) row.sum <- apply(temp,1,sum) col.sum <- apply(temp,2,sum) t.sum <- sum(col.sum) tmp <- rbind(temp, rep("----", dim(temp)[2]), col.sum) tmp <- noquote(cbind(tmp, rep("|",dim(tmp)[1]), c(row.sum, "----", t.sum))) dimnames(tmp)<-list(object = c(dimnames(temp)[[1]],"-------","Col Sum"), true = c(dimnames(temp)[[2]],"|","Row Sum")) attr(tmp, "error") <- attr(temp, "error") attr(tmp, "mismatch") <- attr(temp, "mismatch") return(tmp) } confusion.expand(k2.m.flea$cluster, flea.species) #====== All those in cluster 1 k2.m.flea$cluster == 1 #====== Indices of all those in cluster 1 which(k2.m.flea$cluster == 1) #====== All those in cluster 1 in.1 <- d.flea[which(k2.m.flea$cluster == 1),] #====== All those in cluster 1 dev.1 <- t(t(in.1)-k2.m.flea$center[1,]) sum(apply(dev.1^2,1,sum)) graphics.off() plot(d.flea[,6],d.flea[,1],xlab = "AEDE3", ylab = "TARS1", type = "n", main = "k-means - 2 clusters", cex.main = 1.5) text(d.flea[,6], d.flea[,1], flea.species, col = k2.m.flea$cluster*2) # Three clusters k3.m.flea <- kmeans(d.flea, 3, 10) confusion.expand(k3.m.flea$cluster, flea.species) # graphics.off() plot(d.flea[,6], d.flea[,1], xlab = "AEDE3", ylab = "TARS1", type = "n", main = "k-means - 3 clusters", cex.main = 1.5, asp = 1) text(d.flea[,6],d.flea[,1],flea.species, col = k3.m.flea$cluster+1) mtext(paste("DaviesBouldin Index =", (floor(1000*Davies.Bouldin(k3.m.flea$centers, k3.m.flea$withinss, k3.m.flea$size) + 0.5))/1000),3,0,cex = 1.3) #========== Look at data graphics.off() pairs(d.flea, col = species + 1, pch = 19, cex = 1.1) #=========Look at most promising graphics.off() plot(d.flea[,"aede3"], d.flea[,"tars1"], xlab = "AEDE3", ylab = "TARS1", col = species+1, pch = 19, cex = 1) # Dimension reduction k3.m.flea <- kmeans(d.flea[,c("tars1","aede3")], 3, 10) confusion.expand(k3.m.flea$cluster, flea.species) species k3.m.flea$cluster best.match <- function(table, candidates) { tmp <- {} for (i in candidates) { # Count the number that match the current candidate tmp <- c(tmp, length(which(table == i))) } candidates[which(tmp == max(tmp))] } (cand <- unique(k3.m.flea$cluster)) # all possible cluster numbers (error.pos <- c(which(k3.m.flea$cluster[1:21] != best.match(k3.m.flea$cluster[1:21], cand)), which(k3.m.flea$cluster[22:43] != best.match(k3.m.flea$cluster[22:43], cand))+22, which(k3.m.flea$cluster[44:74] != best.match(k3.m.flea$cluster[44:74], cand))+43)) # graphics.off() plot(d.flea[,6], d.flea[,1], xlab = "AEDE3", ylab = "TARS1", type = "n", main = "3 clusters using 2 variables", asp = 1) text(d.flea[,6], d.flea[,1], flea.species, col = (k3.m.flea$cluster+1)) # for (i in 1:length(error.pos)) { arrows(80, 160, d.flea[error.pos[i],6],d.flea[error.pos[i],1]) } text(80,158, "errors") # points(k3.m.flea$center[,2:1], pch = 17, col = 2:4, cex = 2) # Repeat to get different one k3.m.flea <- kmeans(d.flea[,c("tars1","aede3")], 3, 10) confusion.expand(k3.m.flea$cluster, flea.species) (cand <- unique(k3.m.flea$cluster)) # all possible cluster numbers (error.pos <- c(which(k3.m.flea$cluster[1:21] != best.match(k3.m.flea$cluster[1:21], cand)), which(k3.m.flea$cluster[22:43] != best.match(k3.m.flea$cluster[22:43], cand))+22, which(k3.m.flea$cluster[44:74] != best.match(k3.m.flea$cluster[44:74], cand))+43)) # graphics.off() plot(d.flea[,6],d.flea[,1], xlab = "AEDE3", ylab = "TARS1", type = "n", main = "3 clusters using 2 variables", asp = 1) text(d.flea[,6],d.flea[,1], flea.species, colk3.m.flea$cluster + 1)) # for (i in 1:length(error.pos)) { arrows(80, 160, d.flea[error.pos[i],6],d.flea[error.pos[i],1]) } text(80,158, "errors") # points(k3.m.flea$center[,2:1], pch = 17, col = 2:4, cex = 2) # Start of kmeans k3.m.flea <- kmeans(d.flea[,c("tars1","aede3")], 3, 1) graphics.off() plot(d.flea[,6], d.flea[,1], xlab = "AEDE3", ylab = "TARS1", type = "n", main = "First Stage of kmeans", asp = 1) text(d.flea[,6], d.flea[,1], flea.species, col = (k3.m.flea$cluster+1)) points(k3.m.flea$center[,2:1], pch = 17, col = 2:4, cex = 2) # End of kmeans k3.m.flea <- kmeans(d.flea[,c("tars1","aede3")], 3, 10) graphics.off() plot(d.flea[,6],d.flea[,1], xlab = "AEDE3", ylab = "TARS1",type = "n", main = "Final Stage of kmeans", asp = 1) text(d.flea[,6],d.flea[,1], flea.species, col = (k3.m.flea$cluster+1)) points(k3.m.flea$center[,2:1], pch = 17, col = 2:4, cex = 2) #=== 4 clusters k4.m.flea <- kmeans(d.flea, 4, 10) confusion.expand(k4.m.flea$cluster, flea.species) plot(d.flea[,6], d.flea[,1], xlab = "AEDE3", ylab = "TARS1", type = "n", main = "k-means - 4 clusters", cex.main = 1.5) text(d.flea[,6], d.flea[,1], flea.species, col = k4.m.flea$cluster+1) mtext(paste("DaviesBouldin Index =", (floor(1000*Davies.Bouldin(k4.m.flea$centers, k4.m.flea$withinss, k4.m.flea$size) + 0.5))/1000), 3, 0, cex = 1.3) # Hierarchical clustering library(stats) #source(paste(code.dir, "EJN_hierclust.r", sep = "/")) # Create some synthetic data x <- c(0.27525282, 0.78004913, 0.62244529, 0.29586737, 0.72816080, 0.12258387, 0.54350974, 0.43725947, 0.07487282, 0.84475204) y <- c(0.602621331, 0.750814938, 0.818124069, 0.325633894, 0.401875981, 0.942624914, 0.638426402, 0.002538590, 0.001100677, 0.616799933) d.synth <- cbind(x,y) plot(x,y, "n") text(x,y,format(1:10)) # (s.dist <- dist(d.synth)) # show what it is my.dist <- matrix(0, 10, 9) for (j in 1:9) { for (i in (j+1):10) { my.dist[i,j] <- ((x[i]-x[j])^2+(y[i]-y[j])^2)^(.5) } } my.dist[2:10,] x11(width = 8, height = 4) oldpar <- par(mfrow = c(1, 3)) s.hclust <- hclust(s.dist, method = "single" ) plot(s.hclust, main = "Single Linkage") # Complete s.hclust<-hclust(s.dist, method = "complete" ) plot(s.hclust, main = "Complete Linkage") # Average s.hclust<-hclust(s.dist, method = "ave" ) plot(s.hclust, main = "Average Linkage") par(oldpar) # Flea Beetles c.dist <- dist(d.flea) c.hclust <- hclust(c.dist, method = "single" ) plot(c.hclust, main = "Single Linkage") # Not very informative c.hclust <- hclust(c.dist, method = "single" ) c.hclust$labels <- flea.species plot(c.hclust, main = "Single Linkage") # c.hclust <- hclust(c.dist, method = "complete" ) c.hclust$labels <- flea.species plot(c.hclust, main = "Complete Linkage") # c.hclust<-hclust(c.dist, method = "ave" ) c.hclust$labels <- flea.species plot(c.hclust, main = "Average Linkage") # We can show what graphically cases are in a given number of clusters rect.hclust(c.hclust, k = 3, border = "red") # plot(c.hclust, main = "Average Linkage") rect.hclust(c.hclust, k = 4, border = "blue") # Create a list of the cluster elements (in.clust <- rect.hclust(c.hclust, k = 4, border = "blue")) k3.m.flea <- kmeans(d.flea, 3, 10) (kmeans.clust <- k3.m.flea$cluster) H.clust <- rep(0,74) H.clust[in.clust[[1]]] <- 1 H.clust[in.clust[[2]]] <- 2 H.clust[in.clust[[3]]] <- 3 H.clust[in.clust[[4]]] <- 2 H.clust # or at a given height plot(c.hclust, main = "Average Linkage") rect.hclust(c.hclust, h = 30, border = "green") (in.clust<-rect.hclust(c.hclust, h = 30, border = "green")) #==================== End ==================