drive <- "E:"
code.dir <- paste(drive, "DataMining/Code", sep = "/")
data.dir <- paste(drive, "DataMining/Data", sep = "/")
   #
source(paste(code.dir, "creategrid.r", sep = "/"))
source(paste(code.dir, "array2arraydist.r", sep = "/"))
source(paste(code.dir, "vq_helpers.r", sep = "/"))
source(paste(code.dir, "creategaussians.r", sep = "/"))
source(paste(code.dir, "voronoi.r", sep = "/"))
source(paste(code.dir, "readfleas.r", sep = "/"))
library(stats)
library(deldir)
   # Do a Voronoi (Dirichlet) tesselation.
   # i.e. Spliting the plane into nearest neighbours.
x   <- c(2.3,3.0,7.0,1.0,3.0,8.0)
y   <- c(2.3,3.0,2.0,5.0,8.0,9.0)
   # Put dummy points at the corners of the rectangular
   # window, i.e. at (0,0), (10,0), (10,10), and (0,10)
try <- deldir(x, y, list(ndx = 2, ndy = 2), c(0, 10, 0, 10))
plot(try, wlines = "tess", col = "red")
points(x, y, pch = 17)

   #===== =  = =========================
   # Vector Quantization - kmeans",
     # Linde-Buzo-Gray, Generalized Linde Algorithm
     # A "V" with random initialization.
Col <- rainbow(20)        # Set colours
   # Set up the data and codebook vectors
n.x <- 1000
n.y <- n.x
n.cb <- 7
x <- runif(n.x)*2 - 1
y <- runif(n.y)/2 + abs(x)
Data <- cbind(x,y)
# For replication
# save(Data, file = paste(data.dir, "VQV.Rdata", sep = "/"))
load(paste(data.dir, "VQV.Rdata", sep = "/"))

cb <- Data[sample(n.x, n.cb),]
graphics.off()
loop <- T
while (loop == T) {
      # Find the distance from the codebook vectors to data points
   Dist <- array2arraydist(cb, Data)
      # min.index gives the number of the nearest codebook vector to the data points
   min.dist  <- apply(Dist, 2, min)
   min.index <- apply(Dist, 2, order)[1,]
      # Draw the Voronoi sets
   Voronoi(cb, 1:n.cb, c(200,200), range(Data[,1]), range(Data[,2]))
      #
   points(Data, col = Col[min.index], pch = "+")
   points(cb, col = "red", pch = 16, cex = 2)
      #  Move the codebook vectors to the middle of the group
   mid.point <- matrix(0, n.cb, 2)
   for (c in 1:n.cb) {
      mid.point[c,] <- apply(Data[(min.index == c),], 2, mean)
   }
   points(mid.point, col = "green", pch = 17, cex = 2)
   for (i in 1:500000) {junk <- exp(100)}  # delay
   max(mid.point - cb)
   if (max(mid.point - cb) <.001) {
      loop <- F
   }
   cb <- mid.point
}

   #
kmeans(Data[1:100,], 3, 10)


   #============ Coin flip =============
   # Random initialization can lead to different
   # final centriods.
xx<- matrix(c(-5,0,-4,0,-3,0,-2,0,-1,0,0,0,1,0,2,0,3,0,4,0,5,0,0,1,0,-1),
            nrow = 13,byrow = T)
loop <- T
while (loop) {
   oldpar <- par(mfrow = c(2,2))
   for (i in 1:4) {
     xx.km <- kmeans(xx, 2, 200)
     a <- xx.km$centers
     if (xx.km$cluster[1] != 1) xx.km$cluster <- abs(xx.km$cluster-2)+1
     plot(xx, col = xx.km$cluster*2, pch = 16, cex = 1.5)
     points(xx.km$centers, pch = 17, col = "black", cex = 2)
   }
   par(oldpar)
   res <- readline("Finished Y/N ...")
   if ((res == "Y") | (res == "y")) loop = F
}


   #================================
   # A "V" with specific initialization at side.
cb <- matrix(c(1,1,1,1.05,1,1.1,1,1.2,1,1.3,1,1.35,1,1.4),7,2, byrow = T)   # GOOD
if (dev.cur() [[1]]!=1) dev.off(which=dev.cur())
plot(Data, col = Col[min.index], pch = "+")
points(cb, col = "red", pch = 16, cex = 2)

loop <- T
while (loop == T) {
      # Find the distance from the codebook vectors to data points
   Dist <-array2arraydist(cb, Data)
      # min.index gives the number of the nearest codebook vector to the data points
   min.dist  <- apply(Dist, 2, min)
   min.index <- apply(Dist, 2, order)[1,]
      # Draw the Voronoi sets
   Voronoi(cb, 1:n.cb, c(200,200), range(Data[,1]), range(Data[,2]))
   points(Data, col = Col[min.index], pch = "+")
   points(cb, col = "red", pch = 16, cex = 2)
      #  Move the codebook vectors to the middle of the group
   mid.point <- matrix(0, n.cb, 2)
   for (c in 1:n.cb) {
      mid.point[c,] <- apply(Data[(min.index == c),], 2, mean)
   }
   points(mid.point, col = "green", pch = 17, cex = 2)
   max(mid.point - cb)
   if (max(mid.point - cb) < 0.001) {
      loop <- F
   }
   cb <- mid.point
}


   #================================
   #A "V" with specific initialization at apex.
cb <- matrix(c(0,0.34, 0,.1, 0,.2, 0,.25, 0,.28, 0,.32,0,.33),7,2,byrow = T)
plot(Data, col = Col[min.index], pch = "+")
points(cb, col = "red", pch = 16, cex = 2)
   #
loop <- T
while (loop == T) {
      # Find the distance from the codebook vectors to data points
   Dist <- array2arraydist(cb, Data)
      # min.index gives the number of the nearest codebook vector to the data points
   min.dist  <- apply(Dist, 2, min)
   min.index <- apply(Dist, 2, order)[1,]
      # Draw the Voronoi sets
   Voronoi(cb, 1:n.cb, c(200,200), range(Data[,1]), range(Data[,2]))
   points(Data, col = Col[min.index], pch = "+")
   points(cb, col = "red", pch = 16, cex = 2)
      #  Move the codebook vectors to the middle of the group
   mid.point <- matrix(0, n.cb, 2)
   for (c in 1:n.cb) {
      mid.point[c,] <- apply(Data[(min.index == c),], 2, mean)
   }
   points(mid.point, col = "green", pch = 17, cex = 2)
   max(mid.point - cb)
   if (max(mid.point - cb) <.001) {
      loop <- F
   }
   cb <- mid.point
}


   #================================
numb.centres <- 8
numbpoints <- 100
   # Set up the centers for 8 clusters
(centres <- cbind(c(1,3,6,2,5,8,4,3), c(7,1,3,8,6,2,2,5)))
base.colors <- c("#FF0000","#00FF00","#0000FF","#FF00FF","#00FFFF","#FFFF00",
                 "#800000","#008000","#000080","#FF0080","#408080","#804000",
                 "#004080", "#FF8000")
colors <- rep(base.colors, ceiling(numb.centres/length(base.colors)))
   #
set.seed(123456)
clust <- {}
for (i in (1:numb.centres)){
   x <- rnorm(100, centres[i,1],.3)
   y <- rnorm(100, centres[i,2],.3)
   clust <- c(clust, list(cbind(x,y)))
}
clust
   # For replication
# save(clust, file = paste(data.dir, "VQ8clust.Rdata", sep = "/"))
load(paste(data.dir, "VQ8clust.Rdata", sep = "/"))

clust[[3]][1:5, ]
   #
X.syn <- clust[[1]]
for (i in (2:numb.centres)){
   X.syn <- rbind(X.syn, clust[[i]])
}

   #
oldpar <- par(mfrow = c(4, 4))
par(mar = c(2, 1, 2, 1))
errs <- rep(0, 10)
DBI <- rep(0, 10)
   #
for (i in 2:14) {
   KM <- kmeans(X.syn, i, 15)
   plot(X.syn, col = colors[KM$cluster], pch = KM$cluster,
        main = paste(i," clusters"))
   errs[i-1] <- sum(KM$withinss)
   DBI[i-1] <- Davies.Bouldin(KM$centers, KM$withinss, KM$size)
   print(KM$centers)
}
   #
plot(2:14, errs, main = "SS")
lines(2:14, errs)
   #
plot(2:14, DBI, main = "Davies-Bouldin")
lines(2:14, DBI)
   #
plot(0, 0, xlim = c(0,10), ylim = c(0,10), type = "n", main = "Correct")
for (i in (1:numb.centres)){
    for (j in 1:numbpoints) {
       points (clust[[i]][j,1], clust[[i]][j,2], col = colors[i])
    }
}
par(oldpar)

   #================================
   # An example of kmeans (Vector Quantization)
   #     Image compression
library(pixmap)
d.file <- paste(data.dir, "lena_bw.pgm", sep = "/")
   #
x <- read.pnm(d.file)
graphics.off()
plot(x)
x@size

   #===========================
pix <- x@grey
pix[1:4, 1:5]

   #============ 1/4 of data ===============
z.mean.4 <- x
blocked.4 <- deconstruct.4(pix)
   #
Rows <- dim(pix)[1]
Cols <- dim(pix)[2]
new.bytes.mean.4 <- blocked.4
for (i in 1:((Rows/2)*(Cols/2))) {
   tmp <- mean(new.bytes.mean.4[i,])
   for (j in 1:4) {
      new.bytes.mean.4[i,j] <- tmp
   }
}
blocked.4[1:5,]
new.bytes.mean.4[1:5,]
   # This uses 1/4 of the info


   #============ 1/4 of data ===============
z.mean.4@grey <- reconstruct.4(new.bytes.mean.4, Rows, Cols)

   # Cluster the blocked information into 100 clusters of 4 dimensions.
numb.centers <- 100
classes.100.4 <- {}
while (length(classes.100.4) == 0) {
   res <- try(kmeans(blocked.4, numb.centers, 50))
   if (length(res) > 1) classes.100.4 <- res
}
   # Get integer values for the centroids.
   # Each of the blocks has an associated centroid so we have
   # 16384 indices to the 100 4-D codebook vectors
new.bytes.4 <- floor(classes.100.4$centers[classes.100.4$cluster,]*256)/256
new.bytes.4[1:5, ]
   #
z.4 <- x
z.4@grey <- reconstruct.4(new.bytes.4, Rows, Cols)


   #============ Plot ===============
graphics.off()
x11(width = 8,height = 6.5)
layout(matrix(c(1,2,1,2,3,4), 3, 2, byrow = T))
par(mar = c(1,1,2,1))
plot(z.mean.4, main = "Mean of 4")
plot(z.4, main = "Centroid of 4")
   #============ Detail ==============
par(mar = c(1,1,1,1))
plot(z.mean.4[125:150,125:175])
plot(z.4[125:150,125:175])

   #=========== 1/16 of data ==============
z.mean.16 <- x
blocked.16 <- deconstruct.16(pix)
   #
Rows <- dim(pix)[1]
Cols <- dim(pix)[2]
new.bytes.mean.16 <- blocked.16
for (i in 1:((Rows/4)*(Cols/4))) {
   tmp <- mean(new.bytes.mean.16[i,])
   for (j in 1:16) {
      new.bytes.mean.16[i,j] <- tmp
   }
}
   # This uses 1/16 of the info

z.mean.16@grey <- reconstruct.16(new.bytes.mean.16, Rows, Cols)
   # Cluster the blocked information into 100 clusters of 16 dimensions.
numb.centers <- 100
classes.100.16 <- {}
while (length(classes.100.16) == 0) {
   res <- try(kmeans(blocked.16, numb.centers, 50))
   if (length(res) > 1) classes.100.16 <- res
}
   # Get integer values for the centroids.
   # Each of the blocks has an associated centroid so we have
   # 4096 indices to the 100 16-D codebook vectors
new.bytes.16 <- floor(classes.100.16$centers[classes.100.16$cluster,]*256)/256

z.16 <- x
z.16@grey <- reconstruct.16(new.bytes.16, Rows, Cols)

   #============ Plot ===============
graphics.off()
x11(width = 8,height = 6.5)
layout(matrix(c(1,2,1,2,3,4), 3, 2, byrow = T))
par(mar = c(1,1,2,1))
plot(z.mean.16, main = "Mean of 16")
plot(z.16, main = "Centroid of 16")
   #============ Detail ==============
par(mar = c(1,1,1,1))
plot(z.mean.16[125:150,125:175])
plot(z.16[125:150,125:175])

   # Differences
range(pix)

z.diff.mean.4 <- x
tmp <- pix - z.mean.4@grey
z.diff.mean.4@grey <- abs(tmp)
z.diff.4 <- x
tmp <- pix - z.4@grey
z.diff.4@grey <- abs(tmp)
   #
z.diff.mean.16 <- x
tmp <- pix - z.mean.16@grey
z.diff.mean.16@grey <- abs(tmp)
z.diff.16 <- x
tmp <- pix-z.16@grey
z.diff.16@grey <- abs(tmp)
x11(width = 10, height = 10)
oldpar <- par(mfrow = c(2, 2), mar = c(2, 1, 2, 1))
plot(z.diff.mean.4, main = "Error Mean of 4")
mtext(paste("Err = ",floor(sum((pix - z.mean.4@grey)^2)*100)/100), 1)
plot(z.diff.4, main = "Error Centroid of 4")
mtext(paste("Err = ",floor(sum((pix - z.4@grey)^2)*100)/100), 1)
plot(z.diff.mean.16, main = "Error Mean of 16")
mtext(paste("Err = ",floor(sum((pix - z.mean.16@grey)^2)*100)/100), 1)
plot(z.diff.16, main = "Error Centroid of 16")
mtext(paste("Err = ",floor(sum((pix - z.16@grey)^2)*100)/100), 1)
par(oldpar)


   #=========== VQ - split ==================
   # A "V" starting with 2 centroids and splitting.
   #        LBG VQ design algorithm
k <- 2         # Dimension of space
ep <- 0.01     # Tolerance for change in cb position
   #
N <- 1
   # Put the codebook vector at the mean
cb <- rbind(apply(Data, 2, mean))
graphics.off()
while (N < 16) {
   for (i in 1:N) {
      cb <- rbind(cb, (1-ep)*cb[i,])
      cb[i,] <- (1+ep)*cb[i,]
   }
   N <- 2*N
      #  Repeat this step until little change in cb pos
   shift <- 100
   j <- 2
   while (shift > .003) {
      n.cb <- nrow(cb)
      Dist <- array2arraydist(cb, Data)
         # min.index gives the number of the nearest codebook vector
         # to the data points
      min.dist  <- apply(Dist, 2, min)
      if (n.cb == 1) {
         min.index <- apply(Dist, 2, order)
      } else {
         min.index <- apply(Dist, 2, order)[1,]
      }
      j <- j + 1
      Voronoi(cb, 1:n.cb, c(200,200), c(-1.2,1.2), c(-0.2,2))
      points(Data, col = Col[min.index], pch = "+")
      points(cb, col = "red", pch = 16, cex = 2)
         #  Move the codebook vectors to the middle of the group
      mid.point <- matrix(0, n.cb, 2)
      for (c in 1:n.cb) {
         mid.point[c,] <- apply(Data[(min.index ==c),], 2, mean)
      }
      points(mid.point, col = "green", pch = 17, cex = 2)
      for (i in 1:500000) {junk <- exp(100)}  # delay
      shift <- max(mid.point - cb)
      cb <- mid.point
   }
   withinSS <- rep(0, n.cb)
   size <- rep(0, n.cb)
   for (k in 1:n.cb) {
      Data.in.cluster <- Data[min.index == k,]
      withinSS[k] <-  sum(apply((t(t(Data.in.cluster) - cb[k,]))^2, 1, sum))
      size[k] <- nrow(Data.in.cluster)
   }
   cat("N = ", N, "  DBI = ", Davies.Bouldin(cb, withinSS, size), "\n")
}


   # ========= VQ - split worst ===============
   #A modification of above. The "worst" cluster is split.
k <- 2         # Dimension of space
ep <- 0.01
N <- 1
   # Put the vector at the mean to start
cb <- rbind(apply(Data, 2, mean))
   #
graphics.off()
plot(Data, pch = "+")
points(cb[1], cb[2], col = "red", pch = 16, cex = 1)
plot(Data, pch = "+")
   #

   # Then split
cb <- rbind(cb, (1 + ep)*cb[1,])
cb[1,] <- (1-ep)*cb[1,]
N <- 2
plot(Data, pch = "+")
points(cb[,1],cb[,2], col = "red", pch = 16, cex = 1)

   # ======== Do the loop ================
for (loop in (1:5)) {
   res <- move.centroids(cb, Data)
   cb <- res[[1]]
   belongs.to <- res[[2]]
   n.cb <- nrow(cb)
   withinSS <- rep(0, n.cb)
   size <- rep(0, n.cb)
   for (k in 1:n.cb) {
      Data.in.cluster <- Data[belongs.to == k, ]
      withinSS[k] <-  sum(apply((t(t(Data.in.cluster) - cb[k,]))^2, 1, sum))
      size[k] <- nrow(Data.in.cluster)
   }
      # Find the largest
   worst.SS <- order(-withinSS)[1]

   cat("N = ", N, "  DBI = ", Davies.Bouldin(cb, withinSS, size), "\n")

   readline("Press return...")
   cb <- rbind(cb, (1+ep)*cb[worst.SS,])
   cb[worst.SS,] <- (1-ep)*cb[worst.SS,]
   N <- N+1
}




   # ========== Do on Gaussians ==============
   # See how it works on some Gaussians
numb.clust <- 7
Data.GC <- create.gaussians(numb.clust, varx = 1, vary = 2)
# save(Data.GC, file = paste(data.dir, "VQGauss.Rdata", sep = "/"))
load(paste(data.dir, "VQGauss.Rdata", sep = "/"))

Data.G <- Data.GC[ , 1:2]
graphics.off()

n.x <- dim(Data.G)[1]
ep <- 0.01
N <- 1
   # Put the vector at the mean
cb <- rbind(apply(Data.G, 2, mean))
   #
plot(Data.G, pch = Data.GC[ , 3])
points(cb[1],cb[2], col = "red", pch = 16, cex = 1)

cb <- rbind(cb, (1+ep)*cb[1,])
cb[1,] <- (1-ep)*cb[1,]
   #
N <- 2
for (loop in (1:(numb.clust+3))) {
   res <- move.centroids(cb, Data.G, pch = Data.GC[,3])
   cb <- res[[1]]
   belongs.to <- res[[2]]
   n.cb <- nrow(cb)
   withinSS <- rep(0, n.cb)
   size <- rep(0, n.cb)
   for (k in 1:n.cb) {
      Data.in.cluster <- Data.G[belongs.to == k,]
      withinSS[k] <-  sum(apply((t(t(Data.in.cluster) - cb[k,]))^2, 1, sum))
      size[k] <- nrow(Data.in.cluster)
   }
      # Find the largest
   worst.SS <- order(-withinSS)[1]
   cat("N = ", N, "  DBI = ", Davies.Bouldin(cb, withinSS, size), "\n")
   readline("Press return...")
   cb <- rbind(cb, (1+ep)*cb[worst.SS,])
   cb[worst.SS,] <- (1-ep)*cb[worst.SS,]
   N <- N+1
}
plot(Data.G, col = Data.GC[ , 3])

   #======  Flea
library(mda)
(k2.m.flea <- kmeans(d.flea, 2, 10))

library(stats)
confusion(k2.m.flea$cluster, flea.species)

confusion.expand <- function (pred.c, class) {
   temp <-confusion(pred.c,class)
   row.sum <- apply(temp,1,sum)
   col.sum <- apply(temp,2,sum)
   t.sum <- sum(col.sum)
   tmp <- rbind(temp, rep("----", dim(temp)[2]), col.sum)
   tmp <- noquote(cbind(tmp, rep("|",dim(tmp)[1]), c(row.sum, "----", t.sum)))
   dimnames(tmp)<-list(object = c(dimnames(temp)[[1]],"-------","Col Sum"),
                 true = c(dimnames(temp)[[2]],"|","Row Sum"))
   attr(tmp, "error") <- attr(temp, "error")
   attr(tmp, "mismatch") <- attr(temp, "mismatch")
   return(tmp)
}

confusion.expand(k2.m.flea$cluster, flea.species)

   #====== All those in cluster 1
k2.m.flea$cluster == 1
   #====== Indices of all those in cluster 1
which(k2.m.flea$cluster == 1)
   #====== All those in cluster 1
in.1 <- d.flea[which(k2.m.flea$cluster == 1),]
   #====== All those in cluster 1
dev.1 <- t(t(in.1)-k2.m.flea$center[1,])
sum(apply(dev.1^2,1,sum))

graphics.off()
plot(d.flea[,6],d.flea[,1],xlab = "AEDE3", ylab = "TARS1", type = "n",
     main = "k-means - 2 clusters", cex.main = 1.5)
text(d.flea[,6], d.flea[,1], flea.species, col = k2.m.flea$cluster*2)

   # Three clusters
k3.m.flea <- kmeans(d.flea, 3, 10)
confusion.expand(k3.m.flea$cluster, flea.species)

   #
graphics.off()
plot(d.flea[,6], d.flea[,1], xlab = "AEDE3", ylab = "TARS1", type = "n",
     main = "k-means - 3 clusters", cex.main = 1.5, asp = 1)
text(d.flea[,6],d.flea[,1],flea.species, col = k3.m.flea$cluster+1)
mtext(paste("DaviesBouldin Index =",
            (floor(1000*Davies.Bouldin(k3.m.flea$centers, k3.m.flea$withinss, k3.m.flea$size) + 0.5))/1000),3,0,cex = 1.3)

   #========== Look at data
graphics.off()
pairs(d.flea, col = species + 1, pch = 19, cex = 1.1)

   #=========Look at most promising
graphics.off()
plot(d.flea[,"aede3"], d.flea[,"tars1"], xlab = "AEDE3", ylab = "TARS1",
     col = species+1, pch = 19, cex = 1)

   # Dimension reduction
k3.m.flea <- kmeans(d.flea[,c("tars1","aede3")], 3, 10)
confusion.expand(k3.m.flea$cluster, flea.species)

species
k3.m.flea$cluster

best.match <- function(table, candidates) {
   tmp <- {}
   for (i in candidates) {
         # Count the number that match the current candidate
      tmp <- c(tmp, length(which(table == i)))
   }
   candidates[which(tmp == max(tmp))]
}

(cand <- unique(k3.m.flea$cluster))    # all possible cluster numbers

(error.pos <- c(which(k3.m.flea$cluster[1:21] != best.match(k3.m.flea$cluster[1:21], cand)),
               which(k3.m.flea$cluster[22:43] != best.match(k3.m.flea$cluster[22:43], cand))+22,
               which(k3.m.flea$cluster[44:74] != best.match(k3.m.flea$cluster[44:74], cand))+43))


   #
graphics.off()
plot(d.flea[,6], d.flea[,1], xlab = "AEDE3", ylab = "TARS1", type = "n", main = "3 clusters using 2 variables", asp = 1)
text(d.flea[,6], d.flea[,1], flea.species, col = (k3.m.flea$cluster+1))
   #
for (i in 1:length(error.pos)) {
   arrows(80, 160, d.flea[error.pos[i],6],d.flea[error.pos[i],1])
}
text(80,158, "errors")
   #
points(k3.m.flea$center[,2:1], pch = 17, col = 2:4, cex = 2)

   # Repeat to get different one
k3.m.flea <- kmeans(d.flea[,c("tars1","aede3")], 3, 10)
confusion.expand(k3.m.flea$cluster, flea.species)
(cand <- unique(k3.m.flea$cluster))    # all possible cluster numbers
(error.pos <- c(which(k3.m.flea$cluster[1:21] != best.match(k3.m.flea$cluster[1:21], cand)),
               which(k3.m.flea$cluster[22:43] != best.match(k3.m.flea$cluster[22:43], cand))+22,
               which(k3.m.flea$cluster[44:74] != best.match(k3.m.flea$cluster[44:74], cand))+43))
   #
graphics.off()
plot(d.flea[,6],d.flea[,1], xlab = "AEDE3", ylab = "TARS1", type = "n", main = "3 clusters using 2 variables", asp = 1)
text(d.flea[,6],d.flea[,1], flea.species, colk3.m.flea$cluster + 1))
   #
for (i in 1:length(error.pos)) {
   arrows(80, 160, d.flea[error.pos[i],6],d.flea[error.pos[i],1])
}
text(80,158, "errors")
   #
points(k3.m.flea$center[,2:1], pch = 17, col = 2:4, cex = 2)

   # Start of kmeans
k3.m.flea <- kmeans(d.flea[,c("tars1","aede3")], 3, 1)
graphics.off()
plot(d.flea[,6], d.flea[,1], xlab = "AEDE3", ylab = "TARS1", type = "n",
      main = "First Stage of kmeans", asp = 1)
text(d.flea[,6], d.flea[,1], flea.species, col = (k3.m.flea$cluster+1))
points(k3.m.flea$center[,2:1], pch = 17, col = 2:4, cex = 2)


   # End of kmeans
k3.m.flea <- kmeans(d.flea[,c("tars1","aede3")], 3, 10)
graphics.off()
plot(d.flea[,6],d.flea[,1], xlab = "AEDE3", ylab = "TARS1",type = "n",
      main = "Final Stage of kmeans", asp = 1)
text(d.flea[,6],d.flea[,1], flea.species, col = (k3.m.flea$cluster+1))
points(k3.m.flea$center[,2:1], pch = 17, col = 2:4, cex = 2)

   #=== 4 clusters
k4.m.flea <- kmeans(d.flea, 4, 10)
confusion.expand(k4.m.flea$cluster, flea.species)

plot(d.flea[,6], d.flea[,1], xlab = "AEDE3", ylab = "TARS1", type = "n",
     main = "k-means - 4 clusters", cex.main = 1.5)
text(d.flea[,6], d.flea[,1], flea.species, col = k4.m.flea$cluster+1)
mtext(paste("DaviesBouldin Index =",
            (floor(1000*Davies.Bouldin(k4.m.flea$centers, k4.m.flea$withinss, k4.m.flea$size) + 0.5))/1000), 3, 0, cex = 1.3)



   # Hierarchical clustering
library(stats)
#source(paste(code.dir, "EJN_hierclust.r", sep = "/"))
   # Create some synthetic data
x <- c(0.27525282, 0.78004913, 0.62244529, 0.29586737, 0.72816080, 0.12258387, 0.54350974, 0.43725947, 0.07487282, 0.84475204)
y <- c(0.602621331, 0.750814938, 0.818124069, 0.325633894, 0.401875981, 0.942624914, 0.638426402, 0.002538590, 0.001100677, 0.616799933)

d.synth <- cbind(x,y)
plot(x,y, "n")
text(x,y,format(1:10))

   #
(s.dist <- dist(d.synth))

   # show what it is
my.dist <- matrix(0, 10, 9)
for (j in 1:9) {
   for (i in (j+1):10) {
      my.dist[i,j] <- ((x[i]-x[j])^2+(y[i]-y[j])^2)^(.5)
   }
}
my.dist[2:10,]

x11(width = 8, height = 4)
oldpar <- par(mfrow = c(1, 3))
s.hclust <- hclust(s.dist, method = "single" )
plot(s.hclust, main  = "Single Linkage")
   # Complete
s.hclust<-hclust(s.dist, method = "complete" )
plot(s.hclust, main  = "Complete Linkage")
   # Average
s.hclust<-hclust(s.dist, method = "ave" )
plot(s.hclust, main  = "Average Linkage")
par(oldpar)


   # Flea Beetles
c.dist <- dist(d.flea)
c.hclust <- hclust(c.dist, method = "single" )
plot(c.hclust, main = "Single Linkage")
   # Not very informative

c.hclust <- hclust(c.dist, method = "single" )
c.hclust$labels <- flea.species
plot(c.hclust, main = "Single Linkage")

   #
c.hclust <- hclust(c.dist, method = "complete" )
c.hclust$labels <- flea.species
plot(c.hclust, main = "Complete Linkage")

   #
c.hclust<-hclust(c.dist, method = "ave" )
c.hclust$labels <- flea.species
plot(c.hclust, main = "Average Linkage")

   # We can show what graphically cases are in a given number of clusters
rect.hclust(c.hclust, k = 3, border = "red")

   #
plot(c.hclust, main = "Average Linkage")
rect.hclust(c.hclust, k = 4, border = "blue")

   # Create a list of the cluster elements
(in.clust <- rect.hclust(c.hclust, k = 4, border = "blue"))

k3.m.flea <- kmeans(d.flea, 3, 10)
(kmeans.clust <- k3.m.flea$cluster)

H.clust <- rep(0,74)
H.clust[in.clust[[1]]] <- 1
H.clust[in.clust[[2]]] <- 2
H.clust[in.clust[[3]]] <- 3
H.clust[in.clust[[4]]] <- 2
H.clust

   # or at a given height
plot(c.hclust, main = "Average Linkage")
rect.hclust(c.hclust, h = 30, border = "green")

(in.clust<-rect.hclust(c.hclust, h = 30, border = "green"))

   #==================== End ==================