drive <- "K:"
code.dir <- paste(drive, "STAT5703/Code", sep = "/")
data.dir <- paste(drive, "STAT5703/Data", sep = "/")

source(paste(code.dir, "Conv_run.r", sep = "/"))
source(paste(code.dir, "Newton_Interp.r", sep = "/"))

   # Create a test function.
set.seed(1234, "default")          # Seed for random numbers
   # Compute sine at 0,0.1,0.2,... for plotting a noisy sine
x.rough <- seq(0, 6, by = 0.1)
numb.rough <- length(x.rough)
y.rough <- sin(x.rough) + runif(numb.rough)-0.5  # Add uniform noise
   # Points at which to interpolate 
x.in <- seq(0, 6, by = 0.01)
noisy.sine <- list(x.rough, y.rough, x.in)
#save(noisy.sine, file = paste(data.dir, "noisySine.Rdata", sep="/"))
load(paste(data.dir, "noisySine.Rdata", sep = "/"))
x.rough <- noisy.sine[[1]]
y.rough <- noisy.sine[[2]]
x.in <- noisy.sine[[3]]
plot(x.rough, y.rough, col = "red", pch = 16)      # Plot the points
curve(sin, 0, 6, add=T, col = "blue")    # Add in the sine
   # Use Newton Interpolation to get the coefficients
   # and Horners method to compute the polynomial.
y.rough.in <- HornerN(InterpNewton(x.rough, y.rough), x.rough, x.in)
lines(x.in, y.rough.in, col = "green")

  # Try an interpolation of points on a line
x <- -5:5
y <- -5:5
x.by.1 <- seq(-6, 6, by = 0.1)
y.by.1 <- seq(-5, 5, by = 0.1)
y.in <- HornerN(InterpNewton(x, y), x, x.by.1)
plot(x, y, col = "red", pch = 16, xlim = c(-6,6), ylim = c(-6,7))
lines(x.by.1, y.in, col = "green")
lines(c(-5, 5), c(-5, 5),col = "blue")

x.blip <- -5:5
y.blip <- -5:5
x.blip.by.1 <- seq(-5.5, 5.5, by = 0.1)
y.blip.by.1 <- seq(-5.5, 5.5, by = 0.1)
y.blip[8] <- 2.5        # Move 1 point
y.blip

y.blip.in <- HornerN(InterpNewton(x.blip, y.blip), x.blip, x.blip.by.1)
plot(x.blip, y.blip, col = "red", pch = 16, xlim = c(-6, 6), ylim = c(-6, 7))
lines(x.blip.by.1, y.blip.in, col = "green")
lines(c(-5, 5), c(-5, 5), col = "blue")

   ### Spline
   # Compute and plot spline (spline returns the interpolated values).
plot(x.blip, y.blip, col = "red", pch = 16, xlim = c(-6, 6), ylim = c(-6, 7))
lines(spline(x.blip, y.blip, n = 201), col = "black")

   # Use Runge's function as an example.
runge <- function (x){
   1/(1+x^2)
}
   # Plot it at integers from -5 to 5
x.5.5 <- (-5:5)
y.5.5 <- sapply (x.5.5, runge)
x.5.5by.1 <- seq(-5, 5, by = 0.1)
plot(x.5.5, y.5.5, col = "green", pch = 20, ylim = c(-0.5, 2), cex = 1.3)

   # Get the interpolated values at -5,-4.9,.4.8...
y.in <- HornerN(InterpNewton(x.5.5, y.5.5), x.5.5, x.5.5by.1)
lines(x.5.5by.1, y.in, col = "red")
lines(spline(x.5.5, y.5.5, n = 201), col = "blue")

   #===================================================
   # Put some noise on it and compute spline
   #===================================================
y.noise <- sapply (x.5.5by.1, runge) + rnorm(101, 0, 0.1)
plot(x.5.5by.1, y.noise, col = "green", pch = 20, cex = 1.3)
lines(spline(x.5.5by.1, y.noise, n = 201), col = "red")
curve(runge, -5, 5, add = T, col = "blue")

   #===================================================
   # Illustrate convolution of vectors
   #===================================================
bandwidth <- 5
(x <- 1:30)
(y <- rep(1, bandwidth))

   #===================================================
   # pad with leading and trailing zeros
   #===================================================
(x.0 <- c(rep(0, bandwidth - 1), x, rep(0, bandwidth - 1)))
(x.1 <- c(rep(0, bandwidth - 1), rep(1, length(x)), rep(0, bandwidth - 1)))

   #===================================================
   # Compute a running mean with a specified bandwidth
   #===================================================
z <- rep(0, length(x))
for (i in 1:(length(x) + bandwidth - 1)) {
   t <- y*x.0[(1:bandwidth) + i - 1]
   n <- sum(y*x.1[(1:bandwidth) + i - 1])
   z[i] <- sum(t)/n
   cat("t = ", t, " n = ", n, " z = ", z[i], "\n")
}

   #===================================================
   # This does a running smooth
   #===================================================
f.conv <- function(x, bandwidth) {
   y <- rep(1, bandwidth)
   	# pad x with leading and trailing zeros
   x.0 <- c(rep(0, bandwidth - 1), x, rep(0, bandwidth - 1))
   x.1 <- c(rep(0, bandwidth -1), rep(1, length(x)), rep(0, bandwidth - 1))
   z <- rep(0, length(x))
   for (i in 1:(length(x) + bandwidth - 1)) {
      t <- y*x.0[(1:bandwidth) + i - 1]
      n <- sum(y*x.1[(1:bandwidth) + i - 1])
      z[i] <- sum(t)/n
   }
   return(z)
}

f.run.mean <- function(x, bandwidth) {
   bandwidth <- floor(bandwidth/2)*2 + 1   # Use odd width
   f.conv(x, bandwidth)[(bandwidth/2 + 1):((bandwidth/2)  + length(x))]
}

x <- 1:30
f.conv(x, 5)
f.run.mean(x, 5)

   # We are going to change some plotting parameters
   # so we save the old value to restore later
   
oldpar <- par(mfrow = c(3, 1), mar = c(4, 2, 2, 1))   # Split the plot into a stack of 3 plots
for (i in c(5, 15, 25)) {   
      # Plot the points
   plot(x.rough, y.rough, col = "red", pch = 20,
         main = paste(i, "point running mean"))    
   curve(sin, 0, 6, add=T, col = "blue")    # Add in the sine
   YS <- f.run.mean(y.rough, i)
   lines(x.rough, YS, col = "green")
}   
par(oldpar)
   

   #======= On Runge
oldpar <- par(mfrow = c(3, 1), mar = c(4, 2, 2, 1))   # Split the plot into a stack of 3 plots
for (i in c(5, 15, 25)) {   
      # Plot the points
   plot(x.5.5by.1, y.noise, col="red", pch=20, 
         main = paste(i, "point running mean"))    
   curve(runge, -5, 5, add=T, col="blue")    # Add in the curve
   YS <- f.run.mean(y.noise, i)
   lines(x.5.5by.1, YS, col="green")
}   
par(oldpar)

   #===================================================
   # Regression
   #===================================================
oldpar <- par(mfrow=c(2,1), mar = c(4, 2, 2, 1))
mu <- mean(y.rough)
plot(x.rough, y.rough, col="red", pch=20, main="Approximation by the mean")
curve(sin, 0, 6, add=T, col="blue") 
lines(c(0,6), c(mu, mu))

res.mean <- y.rough - mu
plot(x.rough, res.mean, pch=20, main="Residuals for approximation by the mean")
sum(res.mean*res.mean)
par(oldpar)

   #============ Regression line ============
lm.1 <- lm(y.rough ~ x.rough)

oldpar <- par(mfrow=c(2,1), mar = c(4, 2, 2, 1))
plot(x.rough, y.rough, col="red", pch=20, main="Approximation by regression line")
curve(sin, 0, 6, add=T, col="blue") 
abline(lm.1$coefficients[1],lm.1$coefficients[2], col="green")

res.line <- y.rough - (lm.1$coefficients[1] + x.rough*lm.1$coefficients[2])
plot(x.rough, res.line, pch=20, main="Residuals for approximation by regression line")
par(oldpar)

cat("The sum of squares for error:\n For mean = ", sum(res.mean*res.mean), "\n",
    "For regression line =", sum(res.line*res.line), " (better)\n")

   #============= 2 means ==============
   # Might try splitting the interval
   #  (we will see this strategy later).
mid.pt <- min(x.rough) + sum(range(x.rough))/2
oldpar <- par(mfrow=c(2,1), mar = c(4, 2, 2, 1))
plot(x.rough, y.rough, col="red", pch=20, main="Approximation by two means")
   # Get means on both sides
l.index <- (x.rough < mid.pt)*(1:numb.rough)
r.index <- (x.rough >= mid.pt)*(1:numb.rough)
mu.2.l <- mean(y.rough[l.index])
mu.2.r <- mean(y.rough[r.index])
lines(c(0,3,3,6), c(mu.2.l, mu.2.l, mu.2.r, mu.2.r))
curve(sin, 0, 6, add=T, col="blue")
res.2 <- c(y.rough[l.index]-mu.2.l,  y.rough[r.index]-mu.2.r)
plot(x.rough, res.2, pch=20, main="Residuals for approximation by two means")
par(oldpar)

cat("The sum of squares for error:\n For mean = ", 
      sum(res.mean*res.mean), "\n",
    "For regression line =", sum(res.line*res.line), 
    "\n For split is = ", sum(res.2*res.2),
    " an improvement\n")

   # Try splitting the interval for three regressions.
quart.pt <- min(x.rough) + sum(range(x.rough))/4
three.quart.pt <- min(x.rough) + sum(range(x.rough))*3/4
a.index <- (x.rough < quart.pt)*(1:numb.rough)
b.index <- ((x.rough >= quart.pt)&(x.rough < three.quart.pt))*(1:numb.rough)
c.index <- (x.rough >= three.quart.pt)*(1:numb.rough)
oldpar <- par(mfrow=c(2,1), mar = c(4, 2, 2, 1))
plot(x.rough, y.rough, col="red", pch=20, main="Approximation by three regression lines")
curve(sin, 0, 6, add=T, col="blue")
   # Get regression in the intervals.
lm.3.a <- lm(y.rough[a.index] ~ x.rough[a.index])
m <- lm.3.a$coefficients[2]
int <- lm.3.a$coefficients[1]
lines(c(0, quart.pt),c(int, m*quart.pt))

lm.3.b <- lm(y.rough[b.index] ~ x.rough[b.index])
m <- lm.3.b$coefficients[2]
int <- lm.3.b$coefficients[1]
lines(c(quart.pt, three.quart.pt), c(int+m*quart.pt, int+m*(three.quart.pt)))

lm.3.c <- lm(y.rough[c.index] ~ x.rough[c.index])
m <- lm.3.c$coefficients[2]
int <- lm.3.c$coefficients[1]
lines(c(three.quart.pt, 6), c(int+m*three.quart.pt, int+m*6))
   # Residuals
res.3 <- c(lm.3.a$residuals, lm.3.b$residuals, lm.3.c$residuals)
plot(x.rough, res.3, pch=20, main="Residuals for three regression lines")
par(oldpar)

cat("The sum of squares for error:\n For mean = ", 
      sum(res.mean*res.mean), "\n",
    "For regression line =", sum(res.line*res.line), 
    "\n For split = ", sum(res.2*res.2),
    "\n For 3 lines = ", sum(res.3*res.3),
    "  a further improvement\n")

   #============ Quadratic ============
oldpar <- par(mfrow=c(2,1), mar = c(4, 2, 2, 1))
plot(x.rough, y.rough, col="red", pch=20, main="Approximation by quadratic regression")
X.rough.q <- cbind(x.rough, x.rough^2)
lm.q <- lm(y.rough ~ X.rough.q)
cat("The coefficients are ", lm.q$coefficients, "\n")
T.q <- lm.q$coefficients[1] + x.in*lm.q$coefficients[2] + (x.in)^2*lm.q$coefficients[3]
lines(x.in, T.q, col="green")
curve(sin, 0, 6, add=T, col="blue")
plot(x.rough, lm.q$residuals, pch=20, main="Residuals for approximation by quadratic regression")
par(oldpar)

cat("The sum of squares for error:\n For mean = ", 
      sum(res.mean*res.mean), "\n",
    "For regression line =", sum(res.line*res.line), 
    "\n For split = ", sum(res.2*res.2),
    "\n For 3 lines = ", sum(res.3*res.3),
    "\n For quadratic = ", sum(lm.q$residuals*lm.q$residuals),
    "  slightly better than a straight line\n")

   #============= Cubic  ==================
oldpar <- par(mfrow=c(2,1), mar = c(4, 2, 2, 1))
plot(x.rough, y.rough, col = "red", pch = 20, main = "Approximation by cubic regression")
X.rough.cubic <- cbind(x.rough, x.rough^2, x.rough^3)
lm.cubic <- lm(y.rough ~ X.rough.cubic)
cat("The coefficients are ", lm.q$coefficients, "\n")
T.cubic <- lm.cubic$coefficients[1] + x.in*lm.cubic$coefficients[2]+
          (x.in)^2*lm.cubic$coefficients[3]+ (x.in)^3*lm.cubic$coefficients[4]
lines(x.in, T.cubic, col = "green")
curve(sin, 0, 6, add=T, col = "blue")
plot(x.rough, lm.cubic$residuals, pch=20, main="Residuals for approximation by cubic regression line")
par(oldpar)

cat("The sum of squares for error:\n For mean = ", 
      sum(res.mean*res.mean), "\n",
    "For regression line =", sum(res.line*res.line), 
    "\n For split = ", sum(res.2*res.2),
    "\n For 3 lines = ", sum(res.3*res.3),
    "\n For quadratic = ", sum(lm.q$residuals*lm.q$residuals),
    "\n For cubic = ", sum(lm.cubic$residuals*lm.cubic$residuals),
    "  similar to the 3 linear regressions\n")

   # Higher dimension regression
numb <- 1000
max <- 6
set.seed(1234)
x <- sort(runif(numb)*max)
y <- runif(numb)*max
z <- sin(x)*y + runif(numb)
lm.1 <- lm(z~x+y)
lm.1$coefficients

X.g <- seq(0, max, by=.1)
Y.g <- seq(0, max, by=.1)
g <- expand.grid(X.g, Y.g)

lm.g <- lm.1$coefficients[1] + g[,1]*lm.1$coefficients[2] + g[,2]*lm.1$coefficients[3]
lm.g.m <- matrix(lm.g, nrow=length(X.g))

library(rgl)
plot3d(x, y, z, col="red", type="s", size=0.3)
surface3d(X.g, Y.g, lm.g.m, alpha=0.5)

   # Residuals
plot3d(x, y, zlim=c(-6,6), type="s", size=0.3, lm.1$residuals)
sum(lm.1$residuals*lm.1$residuals)

   #=============== Cubic ===============
X <- cbind(x, x^2, x^3)
lm.2 <- lm(z~X+y)
lm.2$coefficients

lm.2.g <- (lm.2$coefficients[1] + g[,1]*lm.2$coefficients[2]+ (g[,1])^2*lm.2$coefficients[3]+ (g[,1])^3*lm.2$coefficients[4] 
         + g[,2]*lm.2$coefficients[5])
lm.2.g.m <- matrix(lm.2.g, nrow=length(X.g))

plot3d(x, y, z, col="red", type="s", size=0.3)
surface3d(X.g, Y.g, lm.2.g.m, alpha=0.5)

   # Residuals
plot3d(x, y, zlim = c(-6,6), type = "s", size = 0.3, lm.2$residuals)
sum(lm.2$residuals*lm.2$residuals)
   #=================

X <- cbind(y*x, y*x^2, y*x^3)
lm.3 <- lm(z~X)
lm.3$coefficients

lm.3.g <- (lm.3$coefficients[1] + g[,1]*g[,2]*lm.3$coefficients[2]+ (g[,1])^2*g[,2]*lm.3$coefficients[3]+ (g[,1])^3*g[,2]*lm.3$coefficients[4])
lm.3.g.m <- matrix(lm.3.g, nrow=length(X.g))

plot3d(x, y, z, col="red", type="s", size=0.3)
surface3d(X.g, Y.g, lm.3.g.m, alpha=0.5)
   # Residuals
plot3d(x, y, zlim=c(-6,6), type="s", size=0.3, lm.3$residuals)
sum(lm.3$residuals*lm.3$residuals)

   # =================  All subsets
   # Example from Neter, Kutner, Nactsheim, Wasseman 8.3
d.file <- paste(data.dir, "ch08ta01.dat", sep = "/")
d.temp <- matrix(scan(d.file), ncol = 6, byrow = T)
d.data <- d.temp[,c(1:4,6)]
names <- c("BloodClot", "Prog.Ind", "EnzyneFun", "LiverFun", "logSurvival")
dimnames(d.data) <- list(1:54, names)
df.data <- as.data.frame(d.data)


   # The following recursive routine creates all possible combinations
   # of size numb
next.combo <- function (combo, numb, all.current, prev, ind) {
   i <- prev + 1
   while (i <= numb) { 
      combo[ind, c(all.current, i)] <- c(all.current, i)
      ind <- ind + 1
      res <- next.combo (combo, numb, c(all.current, i), i, ind) 
      combo <- res[[1]]
      ind <- res[[2]]
      i <- i + 1
   }
   list(combo, ind) 
}

   # Determine the number of possible combinations of size 4
numb <- 4
n.combo <- 0
for (j in 1:numb) {
   n.combo <- n.combo + choose(numb,j)
}
combo <- matrix(0, n.combo, numb)

(res <- next.combo (combo, numb, {}, 0, 1))

   # Puts them in a better order
(combos <- res[[1]][order(apply(res[[1]]!=0, 1, sum)),])

   # Create regression models for all possible subsets and plot the SSE
xx <- 1:numb
plot(-1, 0, xlim = c(0,numb), ylim = c(0, 5), xlab = "", ylab = "")

lpm.all <- {}             # A null list
for (i in 1:n.combo) {
   comb <- d.data[,combos[i,]]  # Simplify names
   lpm <- lm(df.data[,5]~comb)
      # Put model in list
   lpm.all <- c(lpm.all, list(lpm))       
      # Plot the results
   points(sum(combos[i,] > 0), sum(lpm$residuals^2), pch = 19)
}
lpm.all

table <- matrix(0, n.combo + 1, 4)
TSS <- sum((df.data[ ,5] - mean(df.data[,5]))^2)
table[1,1] <- 53
table[1,2] <- floor(100000*TSS)/100000           # 5 digits
table[1,3] <- 0
table[1,4] <- floor(100000*(table[1, 2]/(54 - 2)))/100000
row.names <- "None"

for (i in 1:n.combo) {
   temp1  <- format(combos[i, ])
   temp2 <-{}
   for (j in 1:length(temp1)) {
      if (temp1[j] != "0")
         temp2 <- paste(temp2, "X", temp1[j], sep="")
   }
   row.names <-c(row.names,temp2)
   table[i+1,1] <- floor(100000*lpm.all[[i]]$df.residual)/100000
   table[i+1,2] <- floor(100000*sum(lpm.all[[i]]$residual^2))/100000
   table[i+1,3] <- floor(100000*(1 - table[i+1,2]/TSS))/100000
   table[i+1,4] <- floor(100000*table[i+1,2])/(54-2)/100000
}
dimnames(table) <- list(row.names, c("df","SSE","R^2","MSE"))
table

   #
mins <- matrix(10^(30), numb+1, 2)
for (i in 1:15) {
   temp <- sum(lpm.all[[i]]$residuals^2)
   if (temp < mins[lpm.all[[i]]$rank, 1]) {
      mins[lpm.all[[i]]$rank, 1] <- temp
      mins[lpm.all[[i]]$rank, 2] <- i
   }
}
mins[1,1] <- TSS
mins[1,2] <- 0
lines (0:4, mins[1:5, 1])
points (0:4, mins[1:5, 1], col = "red", pch = 19)


   #===================================================
   # Singular value decomposition
   #===================================================
(A <- cbind(c(1,0,1), c(1,1,0)))
nrow <- dim(A)[1]
ncol <- dim(A)[2]
(ATA <- t(A)%*%A)

   # Get the eigenvalues and eigenvectors
(ATA.eig <- eigen(ATA))

   # Set 'small' eigenvalues to 0 and remove them
   # Which are smaller that 10^(-15)
(abs(ATA.eig$values) > 10^(-15))

   # Create indices for the non-zero eigenvalues
(abs(ATA.eig$values) > 10^(-15))*(1:length(ATA.eig$values))

# List of the non-zero eigenvalues
(eigs.a <- ATA.eig$values[(abs(ATA.eig$values) > 10^(-15))*(1:length(ATA.eig$values))])

   # Put eigenvalues in descending order with order(-eigs.a)
(the.eigs <- eigs.a[order(-eigs.a)])
(r <- length(the.eigs))  # Number of non-zero eigenvalues

   # We order the eigenvectors in descending order of the eigenvectors
if (r < ncol) {
   V <- as.matrix(ATA.eig$vectors[,c(order(-eigs.a),(r+1):ncol)])
} else {
   V <- as.matrix(ATA.eig$vectors[,order(-eigs.a)])
}
V
t(V)%*%V

   # Create the singular value matrix using the 
   # square root  of the eigenvalues as 'diagonal' elements
   # This is of the same shape as A
(Sig <- diag((the.eigs)^(.5), nrow, ncol))

(Sig.Inv <- diag(1/(the.eigs)^(.5), nrow, ncol))


U <- matrix(0, nrow, nrow)
for (i in 1:r) {
   U[,i] <- A%*%V[,i]/sqrt(the.eigs[i])
}
U

(I <- diag(1, nrow, nrow))
if (r < nrow) {
   for (i in (r+1):nrow) {
      # Gram-Schmidt for next
      u <- I[,1]
      for (j in 1:(i-1)) {
         u <- u - t(I[,1])%*%U[,j]*U[,j]
      }
      U[,i] <- u/sqrt(sum(u*u))
   }
}
U

U%*%t(U)

U%*%Sig%*%t(V)
A


Y <- matrix(y.rough)
X <- cbind(rep(1,length(Y)),x.rough)
(s <- svd(t(X)%*%X))

(l <- s$v%*%diag(1/s$d)%*%t(s$u)%*%t(X)%*%Y)

   #====================== 
   # Standardize data
   #====================== 
f.data.std <- function(data) {
   data <- as.matrix(data)
   bar <- apply(data, 2, mean)
   s   <- apply(data, 2, sd)
   t((t(data) - bar)/s)
}

#============================================ 
# Convert standardized coeff to original
#============================================ 
std.to.orig <- function (std.coef, mean.X, mean.Y, s.X, s.Y) {
   sz <- length(std.coef)
   B.i <- matrix(0, sz, 1)
   for (i in 2:sz) {
      B.i[i,1] <- s.Y/s.X[i-1]*std.coef[i]
      std.coef[i] <- B.i[i,1]
      B.i[1,1] <- B.i[1,1] + B.i[i,1]*mean.X[i-1] 
   }
   std.coef[1] <- mean.Y - B.i[1,1]
   std.coef
}


   #==============
d.file <- paste(data.dir, "prostate.dat", sep = "/")
d.temp <- matrix(scan(d.file), ncol=10, byrow=T)
data.orig <- d.temp[,2:10]
names <- c("lcavol","lweight","age","lbph","svi","lcp","gleason","pgg45","lpsa")
dimnames(data.orig) <- list(1:97, names)
pred  <- 1:8
resp <- 9
p <- 8


   #============== Yhat
f.Yhat <- function(coef, X) {
   X <- as.matrix(X)
   cbind(rep(1,dim(X)[1]), X)%*%as.matrix(coef)
}
   

source(paste(code.dir, "pairs_ext.r", sep="/"))
pairs(data.orig, upper.panel=panel.cor, diag.panel=panel.hist)
cor(data.orig)


   # Try a least squares on the full data set - original and standardized
lm(data.orig[,resp]~data.orig[,pred])

   # Standardize the data
data.std <- f.data.std(data.orig)
(lm.std <- lm(data.std[,resp]~data.std[,pred]))

   # Convert back
std.to.orig(lm.std$coefficients, apply(data.orig[,pred], 2, mean), 
            mean(data.orig[,resp]), apply(data.orig[,pred],2,sd),
            sd(data.orig[,resp]))


   #============================================ 
   # A function to produce those indices NOT in a set.
   # Use this to get the test sample.
   #============================================ 
"%w/o%" <- function(x,y) x[!x %in% y]

   #============================================ 
   # Set the indices for the training/test sets
   #============================================ 
get.train <- function (data.sz, train.sz) {
      # Take subsets of data for training/test samples
      # Return the indices
   train.ind <- sample(data.sz, train.sz)
   test.ind <- (1:data.sz) %w/o% train.ind
   list(train=train.ind, test=test.ind)
}

   #============================================ 
   # Start the train/test split
   # Set the size of the training sample
   #============================================ 
Train.sz <- 67
   # Get the indices for the training and test samples
(tt.ind <- get.train(dim(data.orig)[1], Train.sz))


   #
train.X.orig <- data.orig[tt.ind$train, pred]
train.Y.orig <- data.orig[tt.ind$train, resp]
train.X.std <- f.data.std(train.X.orig)
train.Y.std <- f.data.std(train.Y.orig)

   # Do a least squares on the standardized data and convert
lm.trn <- lm(train.Y.std ~ train.X.std)
std.to.orig(lm.trn$coefficients, apply(train.X.orig, 2, mean), 
        mean(train.Y.orig), apply(train.X.orig,2,sd), sd(train.Y.orig))

   #
lm(train.Y.orig ~ train.X.orig)   # OK

   # Look at the effect of different training sets
for (i in 1:5) {
   tt.ind <- get.train(dim(data.std)[1], Train.sz)
   train.X.orig <- data.orig[tt.ind$train, pred]
   train.Y.orig <- data.orig[tt.ind$train, resp]
   train.X.std <- f.data.std(train.X.orig)
   train.Y.std <- f.data.std(train.Y.orig)
   lm.trn <- lm(train.Y.std~train.X.std)
   print(lm(train.Y.orig~train.X.orig))
   print(std.to.orig(lm.trn$coefficients, apply(train.X.orig,2,mean), 
             mean(train.Y.orig), apply(train.X.orig,2,sd), sd(train.Y.orig)))
   print("")
}

#============================================ 
#=========== Cross Validation ============
#=======================================================
# This section does a cross validation on the full LS model
# Set up a cross validation indices 
# Create a train/test split and then create 10 subsets
# cv.sets$cv.train    10 cross validation training sets from the training data
# cv.sets$cv.test     Corresponding test sets from the training data
# cv.sets$test        Test data
# For the sake of reproducibility, the training and test samples are fixed
#=======================================================

set.up.cv.ind <- function (data.sz, Train.sz) {
   tt.ind <- get.train(data.sz, Train.sz)
      # We ignore the result and fix the samples.
   tt.ind <- 
   list(train <- c(64, 62, 75,  6, 94, 56, 52, 80, 33, 55, 87, 32, 
                   23,  9, 59,  2, 96, 66, 90, 53, 12, 16, 11, 47, 
                   21, 54, 91, 24, 81, 86, 34, 51,  8, 17, 46, 45, 
                   93, 57,  4, 50, 83, 74, 60, 14, 48, 41, 35, 70, 
                   68, 72, 61, 28, 85, 13, 43, 29, 82, 77, 92, 22, 
                    3, 15, 78, 44, 20, 84, 76), 
         test <- c( 1,  5,  7, 10, 18, 19, 25, 26, 27, 30, 31, 36, 
                   37, 38, 39, 40, 42, 49, 58, 63, 65, 67, 69, 71, 
                   73, 79, 88, 89, 95, 97))

   n.in.set <- ceiling(Train.sz*.1)
   # Put all the indices in the set column wise into matrix 
   #   & pad with 0 to avoid repetition.
   cv.sets <- matrix(c(tt.ind[[1]], rep(0, n.in.set*10-Train.sz)), 
                         ncol=n.in.set,byrow=F)
   cv.train <- {}
   cv.test <- {}
   for (i in 1:10) {
      # Select the 1/10th for testing and 9/10ths for training
      cv.test  <- c(cv.test, list(cv.sets[i,]))   # The indices are from the full set
      cv.train <- c(cv.train, list(as.vector(cv.sets[1:10%w/o% i,])))
   }
   list(cv.train=cv.train, cv.test=cv.test, test=tt.ind[[2]])
}

   #===========================
cv.sets <- set.up.cv.ind(dim(data.std)[1], Train.sz)
cv.sets$cv.train   # 10 cross validation training sets from the training data

cv.sets$cv.test    # Corresponding test sets from the training data

#
cv.sets$test       # Test data

   #============== Best subset
   #========================================================
   # Recursive function to compute all possible subsets
   #========================================================
next.combo <- function (combo, numb, all.current, prev, ind) {
   i <- prev + 1
   while (i <= numb) { 
      combo[ind, c(all.current, i)] <- c(all.current, i)
      ind <- ind + 1
      res <- next.combo (combo, numb, c(all.current, i), i, ind) 
      combo <- res[[1]]
      ind <- res[[2]]
      i <- i + 1
   }
   list(combo, ind) 
}

   #========================================================
number.of.combos <- function (numb) {
   n.combo <- 0
   for (j in 1:numb) {
      n.combo <- n.combo + choose(numb,j)
   }
   n.combo
}

   #========================================================
create.combos <- function (numb) {
   n.combo <- number.of.combos(numb)
   combo <- matrix(0, n.combo, numb)
   temp <- next.combo (combo, numb, {}, 0, 1)[[1]] 
   temp[order(apply(temp!=0,1,sum)),]
}

   #========================================================
combos <- create.combos(8)
rownames(combos) <- c(1:255)
combos[c(1,2,3,11,12,13,253,254,255),]

   #========================================================
   # Create regression models for all possible subsets and plot the SSE.
   # We use the train and test stuff from above.
   # Because we want to use the full training sample for this part
   # we combine the cv.train & cv.test for the training sample
   #========================================================
train.X.orig <- data.orig[c(cv.sets$cv.train[[1]], cv.sets$cv.test[[1]]), pred]
train.Y.orig <- data.orig[c(cv.sets$cv.train[[1]], cv.sets$cv.test[[1]]), resp]
test.X.orig <- data.orig[cv.sets$test, pred]
test.Y.orig <- data.orig[cv.sets$test, resp]
train.X.std <- f.data.std(train.X.orig)
train.Y.std <- f.data.std(train.Y.orig)

   # The returned coef are for the original data
n.data <- length(train.Y.std)
lpm.all <- {}
for (i in 1:dim(combos)[1]) {
   X <- train.X.std[,combos[i,]]
   lpm <- lm(train.Y.std ~ X)
   lpm$coefficients <- std.to.orig(lpm$coefficients, apply(train.X.orig, 2, mean),
          mean(train.Y.orig), apply(train.X.orig,2,sd), sd(train.Y.orig))
   lpm.all <- c(lpm.all, list(lpm))
}
lpm.all[[1]]
lpm.all[[9]]


   # Plot
plot(-1, 0, xlim = c(0, 8), ylim = c(0, 100), xlab = "k", ylab = "SSE")
for (i in 1:dim(combos)[1]) {
   points(sum(combos[i,] > 0), sum(lpm.all[[i]]$residuals^2), pch = 19)
}

   #===========
trunc <- function (numb, dig) {
   mask <- 10^dig
   floor(mask*numb)/mask
}

   #===========
create.table <- function (combos, Y) {
   n.data <- length(Y)
   n.combo <- dim(combos)[1]
   table <- matrix(0, n.combo+1, 4)
   TSS <- sum((Y - mean(Y))^2)
   table[1,1] <- 66
   table[1,2] <- trunc(TSS, 5)
   table[1,3] <- 0
   table[1,4] <- trunc(table[1,2]/(n.data-2), 5)
   row.names <- "None"

   for (i in 1:n.combo) {
      temp1  <- format(combos[i,])
      temp2 <-{}
      for (j in 1:length(temp1)) {
         if (temp1[j] != "0")
            temp2 <- paste(temp2, "X", temp1[j], sep="")
      }
      row.names <-c(row.names,temp2)
      table[i+1,1] <- trunc(lpm.all[[i]]$df.residual,5)
      table[i+1,2] <- trunc(sum(lpm.all[[i]]$residual^2),5)
      table[i+1,3] <- trunc(1 - table[i+1,2]/TSS, 5)
      table[i+1,4] <- trunc(table[i+1,2]/(n.data-2), 5)
   }
   dimnames(table) <- list(row.names, c("df","SSE","R^2","MSE"))
   table
}

table <- create.table(combos, train.Y.std)
table[1:15,]

   #==========================
create.mins <- function (lpm.all, n.combo, numb) {
   mins <- matrix(10^(30), numb+1, 2)
   # Look at all the combos and find the minimum for each category (rank) 
   for (i in 1:n.combo) {
      temp <- sum(lpm.all[[i]]$residuals^2)
      if (temp < mins[lpm.all[[i]]$rank, 2]) {
         mins[lpm.all[[i]]$rank, 1] <- i
         mins[lpm.all[[i]]$rank, 2] <- temp
      }
   }
   mins[1,1] <- 0
   mins[1,2] <- table[1,2]
   mins
}

   #========================
(mins <- create.mins(lpm.all, dim(combos)[1], 8))
lines (0:p, mins[1:(p+1), 2])
points (0:p, mins[1:(p+1), 2], col = "red", pch = 19)

   #
for (i in 0:p) {
   cat( dimnames(table)[[1]][mins[i+1, 1]+1], table[mins[i+1, 1]+1,2], "\n") 
}

   # Display all the best
for (i in 1:(p-1)) {
   print(lpm.all[[mins[i+1,1] ]])
}


   #=================== Ridge
   # Example from Neter, Kutner, Nactsheim, Wasseman 7.1
d.file <- paste(data.dir, "ch07ta01.dat", sep = "/")
d.data <- matrix(scan(d.file), ncol=4, byrow=T)
names <- c("Triceps","Thigh","MidArm","BodyFat")
dimnames(d.data) <- list(1:20, names)
df.data <- as.data.frame(d.data)

   # Get the mean and standard deviations
(bar <- apply(d.data, 2, mean))
(s   <- apply(d.data, 2, sd))

d.data.t <- t((t(d.data) - bar)/s)/sqrt(dim(d.data)[1]-1)


(r.xx <- cor(d.data.t[,1:3]))
(r.xy <- cor(d.data.t[,1:3],d.data.t[,4]))

   #
   

I <- diag(1, 3, 3)
Ridge.coef <- matrix(0,29,4)
i <- 1
for (c in c(seq(0,.01,by=.001),seq(.02,.1,by=.01),seq(.2,1.0, by=.1))) {
   Ridge.coef[i,1] <- c
   Ridge.coef[i,2:4] <- solve(r.xx+c*I, r.xy)
   i <- i + 1
}
dimnames(Ridge.coef) <- list(NULL, c("c","b1","b2","b3"))
Ridge.coef

   #
vif <- matrix(0,29,5)
i <- 1
for (c in c(seq(0,.01,by=.001),seq(.02,.1,by=.01),seq(.2,1.0, by=.1))) {
   vif[i,1] <- c
   a.inv <- solve(r.xx+c*I,I)
   vif[i,2:4] <- diag(a.inv%*%r.xx%*%a.inv)
   vif[i,5] <- 1-sum( (Ridge.coef[i,2:4]%*%t(d.data.t[,1:3])-d.data.t[,4])^2)
   i <- i + 1
}
dimnames(vif) <- list(NULL, c("c","VIF1","VIF2","VIF3","R^2"))
vif


   # Plot
plot(log(Ridge.coef[2:29,1]),Ridge.coef[2:29,2],ylim=c(-2,3),"l",
          xlab="log(c)", ylab="b", col="red")
lines(log(Ridge.coef[2:29,1]),Ridge.coef[2:29,3], col="blue")
lines(log(Ridge.coef[2:29,1]),Ridge.coef[2:29,4], col="green")
legend(-2,3,legend=c("b1", "b2", "b3"), col=c("red","blue","green"),lty=1)

   #==========================
ridge <- function (data, p.cols, r.cols, lambda, use.c = T) {
   # Get the mean and standard deviations
   # p.cols are the columns for the perdictors
   # r.cols are the columns for the response

   data.std <- f.data.std(data)

   r.xx <- t(data.std[,p.cols])%*%data.std[,p.cols]
   r.xx
   r.xy <- t(data.std[,p.cols])%*%data.std[,r.cols]
   r.xy

   n.cols <- length(r.xy)
   I <- diag(1, n.cols, n.cols)
   
   Ridge.coef <- matrix(0, length(lambda), n.cols+2)

   i <- 1
   for (c in lambda) {
      if (use.c == F) {
         Ridge.coef[i,1] <- sum(d/(d+L[i]))
      } else {
         Ridge.coef[i,1] <- c
      }
      r.xx.c <- r.xx+c*I

      Ridge.coef[i, 2:(n.cols+2)] <- std.to.orig(c(0,lm(r.xy~r.xx.c-1)$coefficients),
                apply(data.orig[,pred], 2, mean),  mean(data.orig[,resp]),
                 apply(data.orig[,pred],2,sd), sd(data.orig[,resp]))
      i <- i + 1
   }

   vif <- matrix(0, length(lambda), n.cols+2)
   i <- 1
   for (c in lambda) {
      if (use.c == F) {
         vif[i,1] <- sum(d/(d+L[i]))
      } else {
         vif[i,1] <- c
      }
      a.inv <- solve(r.xx+c*I,I)
      vif[i,(1:n.cols)+1] <- diag(a.inv%*%r.xx%*%a.inv)
      vif[i,n.cols+2] <- 1-sum( (Ridge.coef[i,2:4]%*%t(data.std[,1:3])-data.std[,4])^2)
      i <- i + 1
   }
   list (coef=Ridge.coef, vif=vif)
}

   #========== Run ridge on full data
(d <- svd(data.orig[,pred])$d^2)
# or
(d <- eigen(t(data.orig[,pred])%*%data.orig[,pred])$values)

   # Set up points to give a good df(L) spread
L <- c(seq(0,7,by=7/10),seq(8,20,by=12/10),seq(22,50,by=28/10),
       seq(55,120,by=65/10),seq(130,400,by=270/10),seq(440,3500,by=3160/10),
       seq(3700,200000,by=196300/10),seq(250000,5000000,by=475000/10))
ridge.res <- ridge (data.orig, pred, resp, L, F)
Ridge.coef <- ridge.res$coef
Ridge.coef[1:5,]

   #
plot(Ridge.coef[1:dim(Ridge.coef)[1],1],Ridge.coef[1:dim(Ridge.coef)[1],3],
     ylim=c(-.2,1),"l", xlab="df(c)", ylab="b", col=1)
for (i in 2:p) {
   lines(Ridge.coef[1:dim(Ridge.coef)[1],1],
         Ridge.coef[1:dim(Ridge.coef)[1],i+1], col=i)
}
legend(0,1,legend=paste("b",1:p, sep=""), col=1:p,lty=1)

   # Smoothing revisited
   # A more sophisticated smoother - Clevland's loess
f.weight <- function (x){
   (1 - abs(x)^3)^3
}

   # Local regression
reg.value <- function (d.subset, pt, do.print=F) {
      # Get the distance from pt to every point in the interval.
   d.dist <- abs(d.subset[,1] - pt)    
   scaled.dist <- d.dist/max(d.dist)
   W <- f.weight(scaled.dist)
   n <- length(W)
      # Do a least squares fit (lm for linear model)
      #    lm(y ~ x, data, subset, weights)
   lw <- lm(d.subset[,2]~d.subset[,1],as.data.frame(d.subset), 1:n, W)
   if (do.print) {     
      print(pt)
      print(cbind(d.subset[,1],d.subset[,2],d.dist,scaled.dist,W))
      print(lw$coefficients)
      print(lw$coefficients%*%c(1,pt))
      print("************************************")
   }
   lw$coefficients%*%c(1,pt)
}

   # Simple lowess
f.lowess <- function(d.data, band, do.print=F) {
      # d.data has the x and y values of the data
      # band is the bandwidth
      # Initialize
   v <- rep(0, dim(d.lowess)[1])  # Holds the result
   i <- 1
   first <- 1
   last <- first + band - 1
   mid <- floor((first+last)/2)
   ind.pt <- 1               # Points to position of pt in the band
      # Part 1 - move the pt at which we want the smooth
      #          until we reach midpoint of band
   d.subset <- d.data[first:last,]
   while (ind.pt <= mid) {
      pt <- d.subset[ind.pt,1]
      v[i] <- reg.value (d.subset, pt, do.print)
      i <- i + 1
      ind.pt <- ind.pt + 1
   }
      # Part 2 - move the pt with the band
   ind.pt <- ind.pt - 1           # In middle of band
   repeat {
      first <- first + 1
      last  <- last + 1
      if (last > dim(d.data)[1]) break
      d.subset <- d.data[first:last,]
      pt <- d.subset[ind.pt,1]
      v[i] <- reg.value (d.subset, pt, do.print)
      i <- i + 1
   }
      # Part 3 - move the midpoint until we reach the end
   ind.pt <- ind.pt + 1
   while (ind.pt <= band) {
      pt <- d.subset[ind.pt,1]
      v[i] <- reg.value (d.subset, pt, do.print)
      i <- i + 1
      ind.pt <- ind.pt + 1
   }
   v
}
load(paste(data.dir, "noisySine.Rdata", sep="/"))
x.rough <- noisy.sine[[1]]
y.rough <- noisy.sine[[2]]
x.in <- noisy.sine[[3]]

      # Lowess on noisy sine
library(stats)
d.lowess <- cbind(x.rough, y.rough)
oldpar <- par(mfrow = c(3, 1), mar = c(4, 2, 2, 1))
for (b in c(9, 13, 21)) {
   v <- f.lowess(d.lowess, b, T)
   plot(d.lowess[,1], d.lowess[,2], col="red", pch=20, 
        main=paste("Lowess with bandwidth =", b))
   curve(sin, 0, 6, add=T, col="blue")
   lines(d.lowess[,1], v, col="black")
   lines(lowess(d.lowess,f=b/61), col="green")
}
par(oldpar)

   # Loess on Runge
oldpar <- par(mfrow = c(3, 1), mar = c(4, 2, 2, 1))
for (s in c(0.25, 0.5, 0.75)) {
      # Plot the noisy points
   plot (x.5.5by.1, y.noise, col = "red", pch = 20,
            main = paste("Span = ", s))
      # Plot thge Runge curve
   curve(runge, -5, 5, add = T, col = "blue")
   lines(x.5.5by.1, loess(y.noise ~ x.5.5by.1, span = s)$fitted, col="green")
}
par(oldpar)


   #========== Super smoother - Variable span - fixed bass ===============
   # Another sophisticated smoother - Friedman's super smoother
   
   # Variable span - fixed bass = 0
s <- c(0.05, 0.2, 0.5)
lab <- c("- tweeter", "- midrange", "- woofer")
oldpar <- par(mfrow = c(3, 1), mar = c(4, 2, 2, 1))
for (i in 1:3) {
   plot (x.5.5by.1, y.noise, col = "red", pch = 20,
         main = paste("Super smoother - span = ", s[i],lab[i]))
   ss <- supsmu(x.5.5by.1, y.noise, span = s[i])
   curve(runge, -5, 5, add=T, col = "blue")
   lines(ss$x, ss$y, col = "green")
}
par(oldpar)

   #========== Span chosen by cv - variable bass =============
b <- c(2, 6, 10)
oldpar <- par(mfrow = c(3, 1), mar = c(4, 2, 2, 1))
for (i in 1:3) {
   plot (x.5.5by.1, y.noise, col = "red", pch = 20,
         main = paste("Super smoother - span = cv, bass =", b[i]))
   ss <- supsmu(x.5.5by.1, y.noise, span = "cv", bass = b[i])
   curve(runge, -5, 5, add = T, col = "blue")
   lines(ss$x, ss$y, col = "green")
}
par(oldpar)

   # Friedman's example
n <- 200
x.f <- runif(n)
y.f <- sin(2*pi*(1-x.f)^2) + x.f*rnorm(n)
plot(x.f, y.f)

s <- c(0.05, 0.2, 0.5)
lab <- c("tweeter", "midrange", "woofer")
plot (x.f, y.f, col="red", pch=20)
ss.t <- supsmu(x.f, y.f, span = s[1])
ss.m <- supsmu(x.f, y.f, span = s[2])
ss.w <- supsmu(x.f, y.f, span = s[3])
lines(ss.t$x, ss.t$y, col="black")
lines(ss.m$x, ss.m$y, col="green")
lines(ss.w$x, ss.w$y, col="blue")

legend(0, 2, lab, col = c("black", "green", "blue"),
       lty = c(1, 1, 1), pch = c(-1, -1, -1))

b <- 0:10
plot (x.f, y.f, col="red", pch=20)
for (i in b) {
   ss.cv <- supsmu(x.f, y.f, span = "cv", bass = i)
   lines(ss.cv$x, ss.cv$y, col=i+1)
}
legend(0, 2, 0:10, col = 1:11, lty = c(1, 1, 1), pch = c(-1, -1, -1))

   #======== End ==========