drive <- "K:" code.dir <- paste(drive, "STAT5703/Code", sep = "/") data.dir <- paste(drive, "STAT5703/Data", sep = "/") source(paste(code.dir, "Conv_run.r", sep = "/")) source(paste(code.dir, "Newton_Interp.r", sep = "/")) # Create a test function. set.seed(1234, "default") # Seed for random numbers # Compute sine at 0,0.1,0.2,... for plotting a noisy sine x.rough <- seq(0, 6, by = 0.1) numb.rough <- length(x.rough) y.rough <- sin(x.rough) + runif(numb.rough)-0.5 # Add uniform noise # Points at which to interpolate x.in <- seq(0, 6, by = 0.01) noisy.sine <- list(x.rough, y.rough, x.in) #save(noisy.sine, file = paste(data.dir, "noisySine.Rdata", sep="/")) load(paste(data.dir, "noisySine.Rdata", sep = "/")) x.rough <- noisy.sine[[1]] y.rough <- noisy.sine[[2]] x.in <- noisy.sine[[3]] plot(x.rough, y.rough, col = "red", pch = 16) # Plot the points curve(sin, 0, 6, add=T, col = "blue") # Add in the sine # Use Newton Interpolation to get the coefficients # and Horners method to compute the polynomial. y.rough.in <- HornerN(InterpNewton(x.rough, y.rough), x.rough, x.in) lines(x.in, y.rough.in, col = "green") # Try an interpolation of points on a line x <- -5:5 y <- -5:5 x.by.1 <- seq(-6, 6, by = 0.1) y.by.1 <- seq(-5, 5, by = 0.1) y.in <- HornerN(InterpNewton(x, y), x, x.by.1) plot(x, y, col = "red", pch = 16, xlim = c(-6,6), ylim = c(-6,7)) lines(x.by.1, y.in, col = "green") lines(c(-5, 5), c(-5, 5),col = "blue") x.blip <- -5:5 y.blip <- -5:5 x.blip.by.1 <- seq(-5.5, 5.5, by = 0.1) y.blip.by.1 <- seq(-5.5, 5.5, by = 0.1) y.blip[8] <- 2.5 # Move 1 point y.blip y.blip.in <- HornerN(InterpNewton(x.blip, y.blip), x.blip, x.blip.by.1) plot(x.blip, y.blip, col = "red", pch = 16, xlim = c(-6, 6), ylim = c(-6, 7)) lines(x.blip.by.1, y.blip.in, col = "green") lines(c(-5, 5), c(-5, 5), col = "blue") ### Spline # Compute and plot spline (spline returns the interpolated values). plot(x.blip, y.blip, col = "red", pch = 16, xlim = c(-6, 6), ylim = c(-6, 7)) lines(spline(x.blip, y.blip, n = 201), col = "black") # Use Runge's function as an example. runge <- function (x){ 1/(1+x^2) } # Plot it at integers from -5 to 5 x.5.5 <- (-5:5) y.5.5 <- sapply (x.5.5, runge) x.5.5by.1 <- seq(-5, 5, by = 0.1) plot(x.5.5, y.5.5, col = "green", pch = 20, ylim = c(-0.5, 2), cex = 1.3) # Get the interpolated values at -5,-4.9,.4.8... y.in <- HornerN(InterpNewton(x.5.5, y.5.5), x.5.5, x.5.5by.1) lines(x.5.5by.1, y.in, col = "red") lines(spline(x.5.5, y.5.5, n = 201), col = "blue") #=================================================== # Put some noise on it and compute spline #=================================================== y.noise <- sapply (x.5.5by.1, runge) + rnorm(101, 0, 0.1) plot(x.5.5by.1, y.noise, col = "green", pch = 20, cex = 1.3) lines(spline(x.5.5by.1, y.noise, n = 201), col = "red") curve(runge, -5, 5, add = T, col = "blue") #=================================================== # Illustrate convolution of vectors #=================================================== bandwidth <- 5 (x <- 1:30) (y <- rep(1, bandwidth)) #=================================================== # pad with leading and trailing zeros #=================================================== (x.0 <- c(rep(0, bandwidth - 1), x, rep(0, bandwidth - 1))) (x.1 <- c(rep(0, bandwidth - 1), rep(1, length(x)), rep(0, bandwidth - 1))) #=================================================== # Compute a running mean with a specified bandwidth #=================================================== z <- rep(0, length(x)) for (i in 1:(length(x) + bandwidth - 1)) { t <- y*x.0[(1:bandwidth) + i - 1] n <- sum(y*x.1[(1:bandwidth) + i - 1]) z[i] <- sum(t)/n cat("t = ", t, " n = ", n, " z = ", z[i], "\n") } #=================================================== # This does a running smooth #=================================================== f.conv <- function(x, bandwidth) { y <- rep(1, bandwidth) # pad x with leading and trailing zeros x.0 <- c(rep(0, bandwidth - 1), x, rep(0, bandwidth - 1)) x.1 <- c(rep(0, bandwidth -1), rep(1, length(x)), rep(0, bandwidth - 1)) z <- rep(0, length(x)) for (i in 1:(length(x) + bandwidth - 1)) { t <- y*x.0[(1:bandwidth) + i - 1] n <- sum(y*x.1[(1:bandwidth) + i - 1]) z[i] <- sum(t)/n } return(z) } f.run.mean <- function(x, bandwidth) { bandwidth <- floor(bandwidth/2)*2 + 1 # Use odd width f.conv(x, bandwidth)[(bandwidth/2 + 1):((bandwidth/2) + length(x))] } x <- 1:30 f.conv(x, 5) f.run.mean(x, 5) # We are going to change some plotting parameters # so we save the old value to restore later oldpar <- par(mfrow = c(3, 1), mar = c(4, 2, 2, 1)) # Split the plot into a stack of 3 plots for (i in c(5, 15, 25)) { # Plot the points plot(x.rough, y.rough, col = "red", pch = 20, main = paste(i, "point running mean")) curve(sin, 0, 6, add=T, col = "blue") # Add in the sine YS <- f.run.mean(y.rough, i) lines(x.rough, YS, col = "green") } par(oldpar) #======= On Runge oldpar <- par(mfrow = c(3, 1), mar = c(4, 2, 2, 1)) # Split the plot into a stack of 3 plots for (i in c(5, 15, 25)) { # Plot the points plot(x.5.5by.1, y.noise, col="red", pch=20, main = paste(i, "point running mean")) curve(runge, -5, 5, add=T, col="blue") # Add in the curve YS <- f.run.mean(y.noise, i) lines(x.5.5by.1, YS, col="green") } par(oldpar) #=================================================== # Regression #=================================================== oldpar <- par(mfrow=c(2,1), mar = c(4, 2, 2, 1)) mu <- mean(y.rough) plot(x.rough, y.rough, col="red", pch=20, main="Approximation by the mean") curve(sin, 0, 6, add=T, col="blue") lines(c(0,6), c(mu, mu)) res.mean <- y.rough - mu plot(x.rough, res.mean, pch=20, main="Residuals for approximation by the mean") sum(res.mean*res.mean) par(oldpar) #============ Regression line ============ lm.1 <- lm(y.rough ~ x.rough) oldpar <- par(mfrow=c(2,1), mar = c(4, 2, 2, 1)) plot(x.rough, y.rough, col="red", pch=20, main="Approximation by regression line") curve(sin, 0, 6, add=T, col="blue") abline(lm.1$coefficients[1],lm.1$coefficients[2], col="green") res.line <- y.rough - (lm.1$coefficients[1] + x.rough*lm.1$coefficients[2]) plot(x.rough, res.line, pch=20, main="Residuals for approximation by regression line") par(oldpar) cat("The sum of squares for error:\n For mean = ", sum(res.mean*res.mean), "\n", "For regression line =", sum(res.line*res.line), " (better)\n") #============= 2 means ============== # Might try splitting the interval # (we will see this strategy later). mid.pt <- min(x.rough) + sum(range(x.rough))/2 oldpar <- par(mfrow=c(2,1), mar = c(4, 2, 2, 1)) plot(x.rough, y.rough, col="red", pch=20, main="Approximation by two means") # Get means on both sides l.index <- (x.rough < mid.pt)*(1:numb.rough) r.index <- (x.rough >= mid.pt)*(1:numb.rough) mu.2.l <- mean(y.rough[l.index]) mu.2.r <- mean(y.rough[r.index]) lines(c(0,3,3,6), c(mu.2.l, mu.2.l, mu.2.r, mu.2.r)) curve(sin, 0, 6, add=T, col="blue") res.2 <- c(y.rough[l.index]-mu.2.l, y.rough[r.index]-mu.2.r) plot(x.rough, res.2, pch=20, main="Residuals for approximation by two means") par(oldpar) cat("The sum of squares for error:\n For mean = ", sum(res.mean*res.mean), "\n", "For regression line =", sum(res.line*res.line), "\n For split is = ", sum(res.2*res.2), " an improvement\n") # Try splitting the interval for three regressions. quart.pt <- min(x.rough) + sum(range(x.rough))/4 three.quart.pt <- min(x.rough) + sum(range(x.rough))*3/4 a.index <- (x.rough < quart.pt)*(1:numb.rough) b.index <- ((x.rough >= quart.pt)&(x.rough < three.quart.pt))*(1:numb.rough) c.index <- (x.rough >= three.quart.pt)*(1:numb.rough) oldpar <- par(mfrow=c(2,1), mar = c(4, 2, 2, 1)) plot(x.rough, y.rough, col="red", pch=20, main="Approximation by three regression lines") curve(sin, 0, 6, add=T, col="blue") # Get regression in the intervals. lm.3.a <- lm(y.rough[a.index] ~ x.rough[a.index]) m <- lm.3.a$coefficients[2] int <- lm.3.a$coefficients[1] lines(c(0, quart.pt),c(int, m*quart.pt)) lm.3.b <- lm(y.rough[b.index] ~ x.rough[b.index]) m <- lm.3.b$coefficients[2] int <- lm.3.b$coefficients[1] lines(c(quart.pt, three.quart.pt), c(int+m*quart.pt, int+m*(three.quart.pt))) lm.3.c <- lm(y.rough[c.index] ~ x.rough[c.index]) m <- lm.3.c$coefficients[2] int <- lm.3.c$coefficients[1] lines(c(three.quart.pt, 6), c(int+m*three.quart.pt, int+m*6)) # Residuals res.3 <- c(lm.3.a$residuals, lm.3.b$residuals, lm.3.c$residuals) plot(x.rough, res.3, pch=20, main="Residuals for three regression lines") par(oldpar) cat("The sum of squares for error:\n For mean = ", sum(res.mean*res.mean), "\n", "For regression line =", sum(res.line*res.line), "\n For split = ", sum(res.2*res.2), "\n For 3 lines = ", sum(res.3*res.3), " a further improvement\n") #============ Quadratic ============ oldpar <- par(mfrow=c(2,1), mar = c(4, 2, 2, 1)) plot(x.rough, y.rough, col="red", pch=20, main="Approximation by quadratic regression") X.rough.q <- cbind(x.rough, x.rough^2) lm.q <- lm(y.rough ~ X.rough.q) cat("The coefficients are ", lm.q$coefficients, "\n") T.q <- lm.q$coefficients[1] + x.in*lm.q$coefficients[2] + (x.in)^2*lm.q$coefficients[3] lines(x.in, T.q, col="green") curve(sin, 0, 6, add=T, col="blue") plot(x.rough, lm.q$residuals, pch=20, main="Residuals for approximation by quadratic regression") par(oldpar) cat("The sum of squares for error:\n For mean = ", sum(res.mean*res.mean), "\n", "For regression line =", sum(res.line*res.line), "\n For split = ", sum(res.2*res.2), "\n For 3 lines = ", sum(res.3*res.3), "\n For quadratic = ", sum(lm.q$residuals*lm.q$residuals), " slightly better than a straight line\n") #============= Cubic ================== oldpar <- par(mfrow=c(2,1), mar = c(4, 2, 2, 1)) plot(x.rough, y.rough, col = "red", pch = 20, main = "Approximation by cubic regression") X.rough.cubic <- cbind(x.rough, x.rough^2, x.rough^3) lm.cubic <- lm(y.rough ~ X.rough.cubic) cat("The coefficients are ", lm.q$coefficients, "\n") T.cubic <- lm.cubic$coefficients[1] + x.in*lm.cubic$coefficients[2]+ (x.in)^2*lm.cubic$coefficients[3]+ (x.in)^3*lm.cubic$coefficients[4] lines(x.in, T.cubic, col = "green") curve(sin, 0, 6, add=T, col = "blue") plot(x.rough, lm.cubic$residuals, pch=20, main="Residuals for approximation by cubic regression line") par(oldpar) cat("The sum of squares for error:\n For mean = ", sum(res.mean*res.mean), "\n", "For regression line =", sum(res.line*res.line), "\n For split = ", sum(res.2*res.2), "\n For 3 lines = ", sum(res.3*res.3), "\n For quadratic = ", sum(lm.q$residuals*lm.q$residuals), "\n For cubic = ", sum(lm.cubic$residuals*lm.cubic$residuals), " similar to the 3 linear regressions\n") # Higher dimension regression numb <- 1000 max <- 6 set.seed(1234) x <- sort(runif(numb)*max) y <- runif(numb)*max z <- sin(x)*y + runif(numb) lm.1 <- lm(z~x+y) lm.1$coefficients X.g <- seq(0, max, by=.1) Y.g <- seq(0, max, by=.1) g <- expand.grid(X.g, Y.g) lm.g <- lm.1$coefficients[1] + g[,1]*lm.1$coefficients[2] + g[,2]*lm.1$coefficients[3] lm.g.m <- matrix(lm.g, nrow=length(X.g)) library(rgl) plot3d(x, y, z, col="red", type="s", size=0.3) surface3d(X.g, Y.g, lm.g.m, alpha=0.5) # Residuals plot3d(x, y, zlim=c(-6,6), type="s", size=0.3, lm.1$residuals) sum(lm.1$residuals*lm.1$residuals) #=============== Cubic =============== X <- cbind(x, x^2, x^3) lm.2 <- lm(z~X+y) lm.2$coefficients lm.2.g <- (lm.2$coefficients[1] + g[,1]*lm.2$coefficients[2]+ (g[,1])^2*lm.2$coefficients[3]+ (g[,1])^3*lm.2$coefficients[4] + g[,2]*lm.2$coefficients[5]) lm.2.g.m <- matrix(lm.2.g, nrow=length(X.g)) plot3d(x, y, z, col="red", type="s", size=0.3) surface3d(X.g, Y.g, lm.2.g.m, alpha=0.5) # Residuals plot3d(x, y, zlim = c(-6,6), type = "s", size = 0.3, lm.2$residuals) sum(lm.2$residuals*lm.2$residuals) #================= X <- cbind(y*x, y*x^2, y*x^3) lm.3 <- lm(z~X) lm.3$coefficients lm.3.g <- (lm.3$coefficients[1] + g[,1]*g[,2]*lm.3$coefficients[2]+ (g[,1])^2*g[,2]*lm.3$coefficients[3]+ (g[,1])^3*g[,2]*lm.3$coefficients[4]) lm.3.g.m <- matrix(lm.3.g, nrow=length(X.g)) plot3d(x, y, z, col="red", type="s", size=0.3) surface3d(X.g, Y.g, lm.3.g.m, alpha=0.5) # Residuals plot3d(x, y, zlim=c(-6,6), type="s", size=0.3, lm.3$residuals) sum(lm.3$residuals*lm.3$residuals) # ================= All subsets # Example from Neter, Kutner, Nactsheim, Wasseman 8.3 d.file <- paste(data.dir, "ch08ta01.dat", sep = "/") d.temp <- matrix(scan(d.file), ncol = 6, byrow = T) d.data <- d.temp[,c(1:4,6)] names <- c("BloodClot", "Prog.Ind", "EnzyneFun", "LiverFun", "logSurvival") dimnames(d.data) <- list(1:54, names) df.data <- as.data.frame(d.data) # The following recursive routine creates all possible combinations # of size numb next.combo <- function (combo, numb, all.current, prev, ind) { i <- prev + 1 while (i <= numb) { combo[ind, c(all.current, i)] <- c(all.current, i) ind <- ind + 1 res <- next.combo (combo, numb, c(all.current, i), i, ind) combo <- res[[1]] ind <- res[[2]] i <- i + 1 } list(combo, ind) } # Determine the number of possible combinations of size 4 numb <- 4 n.combo <- 0 for (j in 1:numb) { n.combo <- n.combo + choose(numb,j) } combo <- matrix(0, n.combo, numb) (res <- next.combo (combo, numb, {}, 0, 1)) # Puts them in a better order (combos <- res[[1]][order(apply(res[[1]]!=0, 1, sum)),]) # Create regression models for all possible subsets and plot the SSE xx <- 1:numb plot(-1, 0, xlim = c(0,numb), ylim = c(0, 5), xlab = "", ylab = "") lpm.all <- {} # A null list for (i in 1:n.combo) { comb <- d.data[,combos[i,]] # Simplify names lpm <- lm(df.data[,5]~comb) # Put model in list lpm.all <- c(lpm.all, list(lpm)) # Plot the results points(sum(combos[i,] > 0), sum(lpm$residuals^2), pch = 19) } lpm.all table <- matrix(0, n.combo + 1, 4) TSS <- sum((df.data[ ,5] - mean(df.data[,5]))^2) table[1,1] <- 53 table[1,2] <- floor(100000*TSS)/100000 # 5 digits table[1,3] <- 0 table[1,4] <- floor(100000*(table[1, 2]/(54 - 2)))/100000 row.names <- "None" for (i in 1:n.combo) { temp1 <- format(combos[i, ]) temp2 <-{} for (j in 1:length(temp1)) { if (temp1[j] != "0") temp2 <- paste(temp2, "X", temp1[j], sep="") } row.names <-c(row.names,temp2) table[i+1,1] <- floor(100000*lpm.all[[i]]$df.residual)/100000 table[i+1,2] <- floor(100000*sum(lpm.all[[i]]$residual^2))/100000 table[i+1,3] <- floor(100000*(1 - table[i+1,2]/TSS))/100000 table[i+1,4] <- floor(100000*table[i+1,2])/(54-2)/100000 } dimnames(table) <- list(row.names, c("df","SSE","R^2","MSE")) table # mins <- matrix(10^(30), numb+1, 2) for (i in 1:15) { temp <- sum(lpm.all[[i]]$residuals^2) if (temp < mins[lpm.all[[i]]$rank, 1]) { mins[lpm.all[[i]]$rank, 1] <- temp mins[lpm.all[[i]]$rank, 2] <- i } } mins[1,1] <- TSS mins[1,2] <- 0 lines (0:4, mins[1:5, 1]) points (0:4, mins[1:5, 1], col = "red", pch = 19) #=================================================== # Singular value decomposition #=================================================== (A <- cbind(c(1,0,1), c(1,1,0))) nrow <- dim(A)[1] ncol <- dim(A)[2] (ATA <- t(A)%*%A) # Get the eigenvalues and eigenvectors (ATA.eig <- eigen(ATA)) # Set 'small' eigenvalues to 0 and remove them # Which are smaller that 10^(-15) (abs(ATA.eig$values) > 10^(-15)) # Create indices for the non-zero eigenvalues (abs(ATA.eig$values) > 10^(-15))*(1:length(ATA.eig$values)) # List of the non-zero eigenvalues (eigs.a <- ATA.eig$values[(abs(ATA.eig$values) > 10^(-15))*(1:length(ATA.eig$values))]) # Put eigenvalues in descending order with order(-eigs.a) (the.eigs <- eigs.a[order(-eigs.a)]) (r <- length(the.eigs)) # Number of non-zero eigenvalues # We order the eigenvectors in descending order of the eigenvectors if (r < ncol) { V <- as.matrix(ATA.eig$vectors[,c(order(-eigs.a),(r+1):ncol)]) } else { V <- as.matrix(ATA.eig$vectors[,order(-eigs.a)]) } V t(V)%*%V # Create the singular value matrix using the # square root of the eigenvalues as 'diagonal' elements # This is of the same shape as A (Sig <- diag((the.eigs)^(.5), nrow, ncol)) (Sig.Inv <- diag(1/(the.eigs)^(.5), nrow, ncol)) U <- matrix(0, nrow, nrow) for (i in 1:r) { U[,i] <- A%*%V[,i]/sqrt(the.eigs[i]) } U (I <- diag(1, nrow, nrow)) if (r < nrow) { for (i in (r+1):nrow) { # Gram-Schmidt for next u <- I[,1] for (j in 1:(i-1)) { u <- u - t(I[,1])%*%U[,j]*U[,j] } U[,i] <- u/sqrt(sum(u*u)) } } U U%*%t(U) U%*%Sig%*%t(V) A Y <- matrix(y.rough) X <- cbind(rep(1,length(Y)),x.rough) (s <- svd(t(X)%*%X)) (l <- s$v%*%diag(1/s$d)%*%t(s$u)%*%t(X)%*%Y) #====================== # Standardize data #====================== f.data.std <- function(data) { data <- as.matrix(data) bar <- apply(data, 2, mean) s <- apply(data, 2, sd) t((t(data) - bar)/s) } #============================================ # Convert standardized coeff to original #============================================ std.to.orig <- function (std.coef, mean.X, mean.Y, s.X, s.Y) { sz <- length(std.coef) B.i <- matrix(0, sz, 1) for (i in 2:sz) { B.i[i,1] <- s.Y/s.X[i-1]*std.coef[i] std.coef[i] <- B.i[i,1] B.i[1,1] <- B.i[1,1] + B.i[i,1]*mean.X[i-1] } std.coef[1] <- mean.Y - B.i[1,1] std.coef } #============== d.file <- paste(data.dir, "prostate.dat", sep = "/") d.temp <- matrix(scan(d.file), ncol=10, byrow=T) data.orig <- d.temp[,2:10] names <- c("lcavol","lweight","age","lbph","svi","lcp","gleason","pgg45","lpsa") dimnames(data.orig) <- list(1:97, names) pred <- 1:8 resp <- 9 p <- 8 #============== Yhat f.Yhat <- function(coef, X) { X <- as.matrix(X) cbind(rep(1,dim(X)[1]), X)%*%as.matrix(coef) } source(paste(code.dir, "pairs_ext.r", sep="/")) pairs(data.orig, upper.panel=panel.cor, diag.panel=panel.hist) cor(data.orig) # Try a least squares on the full data set - original and standardized lm(data.orig[,resp]~data.orig[,pred]) # Standardize the data data.std <- f.data.std(data.orig) (lm.std <- lm(data.std[,resp]~data.std[,pred])) # Convert back std.to.orig(lm.std$coefficients, apply(data.orig[,pred], 2, mean), mean(data.orig[,resp]), apply(data.orig[,pred],2,sd), sd(data.orig[,resp])) #============================================ # A function to produce those indices NOT in a set. # Use this to get the test sample. #============================================ "%w/o%" <- function(x,y) x[!x %in% y] #============================================ # Set the indices for the training/test sets #============================================ get.train <- function (data.sz, train.sz) { # Take subsets of data for training/test samples # Return the indices train.ind <- sample(data.sz, train.sz) test.ind <- (1:data.sz) %w/o% train.ind list(train=train.ind, test=test.ind) } #============================================ # Start the train/test split # Set the size of the training sample #============================================ Train.sz <- 67 # Get the indices for the training and test samples (tt.ind <- get.train(dim(data.orig)[1], Train.sz)) # train.X.orig <- data.orig[tt.ind$train, pred] train.Y.orig <- data.orig[tt.ind$train, resp] train.X.std <- f.data.std(train.X.orig) train.Y.std <- f.data.std(train.Y.orig) # Do a least squares on the standardized data and convert lm.trn <- lm(train.Y.std ~ train.X.std) std.to.orig(lm.trn$coefficients, apply(train.X.orig, 2, mean), mean(train.Y.orig), apply(train.X.orig,2,sd), sd(train.Y.orig)) # lm(train.Y.orig ~ train.X.orig) # OK # Look at the effect of different training sets for (i in 1:5) { tt.ind <- get.train(dim(data.std)[1], Train.sz) train.X.orig <- data.orig[tt.ind$train, pred] train.Y.orig <- data.orig[tt.ind$train, resp] train.X.std <- f.data.std(train.X.orig) train.Y.std <- f.data.std(train.Y.orig) lm.trn <- lm(train.Y.std~train.X.std) print(lm(train.Y.orig~train.X.orig)) print(std.to.orig(lm.trn$coefficients, apply(train.X.orig,2,mean), mean(train.Y.orig), apply(train.X.orig,2,sd), sd(train.Y.orig))) print("") } #============================================ #=========== Cross Validation ============ #======================================================= # This section does a cross validation on the full LS model # Set up a cross validation indices # Create a train/test split and then create 10 subsets # cv.sets$cv.train 10 cross validation training sets from the training data # cv.sets$cv.test Corresponding test sets from the training data # cv.sets$test Test data # For the sake of reproducibility, the training and test samples are fixed #======================================================= set.up.cv.ind <- function (data.sz, Train.sz) { tt.ind <- get.train(data.sz, Train.sz) # We ignore the result and fix the samples. tt.ind <- list(train <- c(64, 62, 75, 6, 94, 56, 52, 80, 33, 55, 87, 32, 23, 9, 59, 2, 96, 66, 90, 53, 12, 16, 11, 47, 21, 54, 91, 24, 81, 86, 34, 51, 8, 17, 46, 45, 93, 57, 4, 50, 83, 74, 60, 14, 48, 41, 35, 70, 68, 72, 61, 28, 85, 13, 43, 29, 82, 77, 92, 22, 3, 15, 78, 44, 20, 84, 76), test <- c( 1, 5, 7, 10, 18, 19, 25, 26, 27, 30, 31, 36, 37, 38, 39, 40, 42, 49, 58, 63, 65, 67, 69, 71, 73, 79, 88, 89, 95, 97)) n.in.set <- ceiling(Train.sz*.1) # Put all the indices in the set column wise into matrix # & pad with 0 to avoid repetition. cv.sets <- matrix(c(tt.ind[[1]], rep(0, n.in.set*10-Train.sz)), ncol=n.in.set,byrow=F) cv.train <- {} cv.test <- {} for (i in 1:10) { # Select the 1/10th for testing and 9/10ths for training cv.test <- c(cv.test, list(cv.sets[i,])) # The indices are from the full set cv.train <- c(cv.train, list(as.vector(cv.sets[1:10%w/o% i,]))) } list(cv.train=cv.train, cv.test=cv.test, test=tt.ind[[2]]) } #=========================== cv.sets <- set.up.cv.ind(dim(data.std)[1], Train.sz) cv.sets$cv.train # 10 cross validation training sets from the training data cv.sets$cv.test # Corresponding test sets from the training data # cv.sets$test # Test data #============== Best subset #======================================================== # Recursive function to compute all possible subsets #======================================================== next.combo <- function (combo, numb, all.current, prev, ind) { i <- prev + 1 while (i <= numb) { combo[ind, c(all.current, i)] <- c(all.current, i) ind <- ind + 1 res <- next.combo (combo, numb, c(all.current, i), i, ind) combo <- res[[1]] ind <- res[[2]] i <- i + 1 } list(combo, ind) } #======================================================== number.of.combos <- function (numb) { n.combo <- 0 for (j in 1:numb) { n.combo <- n.combo + choose(numb,j) } n.combo } #======================================================== create.combos <- function (numb) { n.combo <- number.of.combos(numb) combo <- matrix(0, n.combo, numb) temp <- next.combo (combo, numb, {}, 0, 1)[[1]] temp[order(apply(temp!=0,1,sum)),] } #======================================================== combos <- create.combos(8) rownames(combos) <- c(1:255) combos[c(1,2,3,11,12,13,253,254,255),] #======================================================== # Create regression models for all possible subsets and plot the SSE. # We use the train and test stuff from above. # Because we want to use the full training sample for this part # we combine the cv.train & cv.test for the training sample #======================================================== train.X.orig <- data.orig[c(cv.sets$cv.train[[1]], cv.sets$cv.test[[1]]), pred] train.Y.orig <- data.orig[c(cv.sets$cv.train[[1]], cv.sets$cv.test[[1]]), resp] test.X.orig <- data.orig[cv.sets$test, pred] test.Y.orig <- data.orig[cv.sets$test, resp] train.X.std <- f.data.std(train.X.orig) train.Y.std <- f.data.std(train.Y.orig) # The returned coef are for the original data n.data <- length(train.Y.std) lpm.all <- {} for (i in 1:dim(combos)[1]) { X <- train.X.std[,combos[i,]] lpm <- lm(train.Y.std ~ X) lpm$coefficients <- std.to.orig(lpm$coefficients, apply(train.X.orig, 2, mean), mean(train.Y.orig), apply(train.X.orig,2,sd), sd(train.Y.orig)) lpm.all <- c(lpm.all, list(lpm)) } lpm.all[[1]] lpm.all[[9]] # Plot plot(-1, 0, xlim = c(0, 8), ylim = c(0, 100), xlab = "k", ylab = "SSE") for (i in 1:dim(combos)[1]) { points(sum(combos[i,] > 0), sum(lpm.all[[i]]$residuals^2), pch = 19) } #=========== trunc <- function (numb, dig) { mask <- 10^dig floor(mask*numb)/mask } #=========== create.table <- function (combos, Y) { n.data <- length(Y) n.combo <- dim(combos)[1] table <- matrix(0, n.combo+1, 4) TSS <- sum((Y - mean(Y))^2) table[1,1] <- 66 table[1,2] <- trunc(TSS, 5) table[1,3] <- 0 table[1,4] <- trunc(table[1,2]/(n.data-2), 5) row.names <- "None" for (i in 1:n.combo) { temp1 <- format(combos[i,]) temp2 <-{} for (j in 1:length(temp1)) { if (temp1[j] != "0") temp2 <- paste(temp2, "X", temp1[j], sep="") } row.names <-c(row.names,temp2) table[i+1,1] <- trunc(lpm.all[[i]]$df.residual,5) table[i+1,2] <- trunc(sum(lpm.all[[i]]$residual^2),5) table[i+1,3] <- trunc(1 - table[i+1,2]/TSS, 5) table[i+1,4] <- trunc(table[i+1,2]/(n.data-2), 5) } dimnames(table) <- list(row.names, c("df","SSE","R^2","MSE")) table } table <- create.table(combos, train.Y.std) table[1:15,] #========================== create.mins <- function (lpm.all, n.combo, numb) { mins <- matrix(10^(30), numb+1, 2) # Look at all the combos and find the minimum for each category (rank) for (i in 1:n.combo) { temp <- sum(lpm.all[[i]]$residuals^2) if (temp < mins[lpm.all[[i]]$rank, 2]) { mins[lpm.all[[i]]$rank, 1] <- i mins[lpm.all[[i]]$rank, 2] <- temp } } mins[1,1] <- 0 mins[1,2] <- table[1,2] mins } #======================== (mins <- create.mins(lpm.all, dim(combos)[1], 8)) lines (0:p, mins[1:(p+1), 2]) points (0:p, mins[1:(p+1), 2], col = "red", pch = 19) # for (i in 0:p) { cat( dimnames(table)[[1]][mins[i+1, 1]+1], table[mins[i+1, 1]+1,2], "\n") } # Display all the best for (i in 1:(p-1)) { print(lpm.all[[mins[i+1,1] ]]) } #=================== Ridge # Example from Neter, Kutner, Nactsheim, Wasseman 7.1 d.file <- paste(data.dir, "ch07ta01.dat", sep = "/") d.data <- matrix(scan(d.file), ncol=4, byrow=T) names <- c("Triceps","Thigh","MidArm","BodyFat") dimnames(d.data) <- list(1:20, names) df.data <- as.data.frame(d.data) # Get the mean and standard deviations (bar <- apply(d.data, 2, mean)) (s <- apply(d.data, 2, sd)) d.data.t <- t((t(d.data) - bar)/s)/sqrt(dim(d.data)[1]-1) (r.xx <- cor(d.data.t[,1:3])) (r.xy <- cor(d.data.t[,1:3],d.data.t[,4])) # I <- diag(1, 3, 3) Ridge.coef <- matrix(0,29,4) i <- 1 for (c in c(seq(0,.01,by=.001),seq(.02,.1,by=.01),seq(.2,1.0, by=.1))) { Ridge.coef[i,1] <- c Ridge.coef[i,2:4] <- solve(r.xx+c*I, r.xy) i <- i + 1 } dimnames(Ridge.coef) <- list(NULL, c("c","b1","b2","b3")) Ridge.coef # vif <- matrix(0,29,5) i <- 1 for (c in c(seq(0,.01,by=.001),seq(.02,.1,by=.01),seq(.2,1.0, by=.1))) { vif[i,1] <- c a.inv <- solve(r.xx+c*I,I) vif[i,2:4] <- diag(a.inv%*%r.xx%*%a.inv) vif[i,5] <- 1-sum( (Ridge.coef[i,2:4]%*%t(d.data.t[,1:3])-d.data.t[,4])^2) i <- i + 1 } dimnames(vif) <- list(NULL, c("c","VIF1","VIF2","VIF3","R^2")) vif # Plot plot(log(Ridge.coef[2:29,1]),Ridge.coef[2:29,2],ylim=c(-2,3),"l", xlab="log(c)", ylab="b", col="red") lines(log(Ridge.coef[2:29,1]),Ridge.coef[2:29,3], col="blue") lines(log(Ridge.coef[2:29,1]),Ridge.coef[2:29,4], col="green") legend(-2,3,legend=c("b1", "b2", "b3"), col=c("red","blue","green"),lty=1) #========================== ridge <- function (data, p.cols, r.cols, lambda, use.c = T) { # Get the mean and standard deviations # p.cols are the columns for the perdictors # r.cols are the columns for the response data.std <- f.data.std(data) r.xx <- t(data.std[,p.cols])%*%data.std[,p.cols] r.xx r.xy <- t(data.std[,p.cols])%*%data.std[,r.cols] r.xy n.cols <- length(r.xy) I <- diag(1, n.cols, n.cols) Ridge.coef <- matrix(0, length(lambda), n.cols+2) i <- 1 for (c in lambda) { if (use.c == F) { Ridge.coef[i,1] <- sum(d/(d+L[i])) } else { Ridge.coef[i,1] <- c } r.xx.c <- r.xx+c*I Ridge.coef[i, 2:(n.cols+2)] <- std.to.orig(c(0,lm(r.xy~r.xx.c-1)$coefficients), apply(data.orig[,pred], 2, mean), mean(data.orig[,resp]), apply(data.orig[,pred],2,sd), sd(data.orig[,resp])) i <- i + 1 } vif <- matrix(0, length(lambda), n.cols+2) i <- 1 for (c in lambda) { if (use.c == F) { vif[i,1] <- sum(d/(d+L[i])) } else { vif[i,1] <- c } a.inv <- solve(r.xx+c*I,I) vif[i,(1:n.cols)+1] <- diag(a.inv%*%r.xx%*%a.inv) vif[i,n.cols+2] <- 1-sum( (Ridge.coef[i,2:4]%*%t(data.std[,1:3])-data.std[,4])^2) i <- i + 1 } list (coef=Ridge.coef, vif=vif) } #========== Run ridge on full data (d <- svd(data.orig[,pred])$d^2) # or (d <- eigen(t(data.orig[,pred])%*%data.orig[,pred])$values) # Set up points to give a good df(L) spread L <- c(seq(0,7,by=7/10),seq(8,20,by=12/10),seq(22,50,by=28/10), seq(55,120,by=65/10),seq(130,400,by=270/10),seq(440,3500,by=3160/10), seq(3700,200000,by=196300/10),seq(250000,5000000,by=475000/10)) ridge.res <- ridge (data.orig, pred, resp, L, F) Ridge.coef <- ridge.res$coef Ridge.coef[1:5,] # plot(Ridge.coef[1:dim(Ridge.coef)[1],1],Ridge.coef[1:dim(Ridge.coef)[1],3], ylim=c(-.2,1),"l", xlab="df(c)", ylab="b", col=1) for (i in 2:p) { lines(Ridge.coef[1:dim(Ridge.coef)[1],1], Ridge.coef[1:dim(Ridge.coef)[1],i+1], col=i) } legend(0,1,legend=paste("b",1:p, sep=""), col=1:p,lty=1) # Smoothing revisited # A more sophisticated smoother - Clevland's loess f.weight <- function (x){ (1 - abs(x)^3)^3 } # Local regression reg.value <- function (d.subset, pt, do.print=F) { # Get the distance from pt to every point in the interval. d.dist <- abs(d.subset[,1] - pt) scaled.dist <- d.dist/max(d.dist) W <- f.weight(scaled.dist) n <- length(W) # Do a least squares fit (lm for linear model) # lm(y ~ x, data, subset, weights) lw <- lm(d.subset[,2]~d.subset[,1],as.data.frame(d.subset), 1:n, W) if (do.print) { print(pt) print(cbind(d.subset[,1],d.subset[,2],d.dist,scaled.dist,W)) print(lw$coefficients) print(lw$coefficients%*%c(1,pt)) print("************************************") } lw$coefficients%*%c(1,pt) } # Simple lowess f.lowess <- function(d.data, band, do.print=F) { # d.data has the x and y values of the data # band is the bandwidth # Initialize v <- rep(0, dim(d.lowess)[1]) # Holds the result i <- 1 first <- 1 last <- first + band - 1 mid <- floor((first+last)/2) ind.pt <- 1 # Points to position of pt in the band # Part 1 - move the pt at which we want the smooth # until we reach midpoint of band d.subset <- d.data[first:last,] while (ind.pt <= mid) { pt <- d.subset[ind.pt,1] v[i] <- reg.value (d.subset, pt, do.print) i <- i + 1 ind.pt <- ind.pt + 1 } # Part 2 - move the pt with the band ind.pt <- ind.pt - 1 # In middle of band repeat { first <- first + 1 last <- last + 1 if (last > dim(d.data)[1]) break d.subset <- d.data[first:last,] pt <- d.subset[ind.pt,1] v[i] <- reg.value (d.subset, pt, do.print) i <- i + 1 } # Part 3 - move the midpoint until we reach the end ind.pt <- ind.pt + 1 while (ind.pt <= band) { pt <- d.subset[ind.pt,1] v[i] <- reg.value (d.subset, pt, do.print) i <- i + 1 ind.pt <- ind.pt + 1 } v } load(paste(data.dir, "noisySine.Rdata", sep="/")) x.rough <- noisy.sine[[1]] y.rough <- noisy.sine[[2]] x.in <- noisy.sine[[3]] # Lowess on noisy sine library(stats) d.lowess <- cbind(x.rough, y.rough) oldpar <- par(mfrow = c(3, 1), mar = c(4, 2, 2, 1)) for (b in c(9, 13, 21)) { v <- f.lowess(d.lowess, b, T) plot(d.lowess[,1], d.lowess[,2], col="red", pch=20, main=paste("Lowess with bandwidth =", b)) curve(sin, 0, 6, add=T, col="blue") lines(d.lowess[,1], v, col="black") lines(lowess(d.lowess,f=b/61), col="green") } par(oldpar) # Loess on Runge oldpar <- par(mfrow = c(3, 1), mar = c(4, 2, 2, 1)) for (s in c(0.25, 0.5, 0.75)) { # Plot the noisy points plot (x.5.5by.1, y.noise, col = "red", pch = 20, main = paste("Span = ", s)) # Plot thge Runge curve curve(runge, -5, 5, add = T, col = "blue") lines(x.5.5by.1, loess(y.noise ~ x.5.5by.1, span = s)$fitted, col="green") } par(oldpar) #========== Super smoother - Variable span - fixed bass =============== # Another sophisticated smoother - Friedman's super smoother # Variable span - fixed bass = 0 s <- c(0.05, 0.2, 0.5) lab <- c("- tweeter", "- midrange", "- woofer") oldpar <- par(mfrow = c(3, 1), mar = c(4, 2, 2, 1)) for (i in 1:3) { plot (x.5.5by.1, y.noise, col = "red", pch = 20, main = paste("Super smoother - span = ", s[i],lab[i])) ss <- supsmu(x.5.5by.1, y.noise, span = s[i]) curve(runge, -5, 5, add=T, col = "blue") lines(ss$x, ss$y, col = "green") } par(oldpar) #========== Span chosen by cv - variable bass ============= b <- c(2, 6, 10) oldpar <- par(mfrow = c(3, 1), mar = c(4, 2, 2, 1)) for (i in 1:3) { plot (x.5.5by.1, y.noise, col = "red", pch = 20, main = paste("Super smoother - span = cv, bass =", b[i])) ss <- supsmu(x.5.5by.1, y.noise, span = "cv", bass = b[i]) curve(runge, -5, 5, add = T, col = "blue") lines(ss$x, ss$y, col = "green") } par(oldpar) # Friedman's example n <- 200 x.f <- runif(n) y.f <- sin(2*pi*(1-x.f)^2) + x.f*rnorm(n) plot(x.f, y.f) s <- c(0.05, 0.2, 0.5) lab <- c("tweeter", "midrange", "woofer") plot (x.f, y.f, col="red", pch=20) ss.t <- supsmu(x.f, y.f, span = s[1]) ss.m <- supsmu(x.f, y.f, span = s[2]) ss.w <- supsmu(x.f, y.f, span = s[3]) lines(ss.t$x, ss.t$y, col="black") lines(ss.m$x, ss.m$y, col="green") lines(ss.w$x, ss.w$y, col="blue") legend(0, 2, lab, col = c("black", "green", "blue"), lty = c(1, 1, 1), pch = c(-1, -1, -1)) b <- 0:10 plot (x.f, y.f, col="red", pch=20) for (i in b) { ss.cv <- supsmu(x.f, y.f, span = "cv", bass = i) lines(ss.cv$x, ss.cv$y, col=i+1) } legend(0, 2, 0:10, col = 1:11, lty = c(1, 1, 1), pch = c(-1, -1, -1)) #======== End ==========