##################################################################### # Simulate the population data for the sine curve and add random noise to y axis ##################################################################### x1<-seq(-3*pi,3*pi,by=0.001) y1<-sin(x1) data1<-as.data.frame(cbind(y1,poly(x1,degree = 15,raw=TRUE))) colnames(data1)[2:16] <- paste("x", colnames(data1)[2:16], sep = "") set.seed(42) data1$y<-data1$y+rnorm(nrow(data1),0,0.2) MSE<-mean((data1$y-data1$y1)^2) # Take a sample of 30 observations from the population set.seed(100) nobs1<-30 data<-data1[sample(nrow(data1),nobs1),2:17] nobs2<-20 data2<-data1[sample(nrow(data1),nobs2),2:17] ##################################################################### # Plot the data including training and test sample ##################################################################### # Plot the curve estimated by model 1 for the sampled data plot(data1$x1,data1$y1,type = "l",col="snow3",xlab = "x", ylab="y",lwd=2,cex.axis=1.4,cex.lab=1.8,ylim = c(-2,2)) points(data$x1,data$y,col="dodgerblue2",pch=20,cex=1.5,xlab="x",ylab="y") points(data2$x1,data2$y,col="indianred",pch=20,cex=1.5,xlab="x",ylab="y") par(xpd=TRUE) legend(5.5, -2.6, legend=c("Population", "Train Sample","Test Sample"), col=c("snow3", "dodgerblue2","indianred"), lty=c(1,0,0), lwd=5,pch=c(20,20),cex=0.7) par(xpd=FALSE) ##################################################################### # Model 1: fit a step-wise regression model to the sampled data ##################################################################### fit<-lm(y~x1+x2+x3+x4+x5+x6+x7+x8+x9+x10+x11+x12+x13+x14+x15,data=data) fit<-step(fit) summary(fit) # Plot the curve estimated by model 1 for the sampled data plot(data1$x1,data1$y1,type = "l",col="snow3",xlab = "x", ylab="y",lwd=2,cex.axis=1.4,cex.lab=1.8,ylim = c(-2,2)) with(data1,lines(x = data1$x1, y = predict(fit,data1), col="orange2",lwd=3)) points(data$x1,data$y,col="dodgerblue2",pch=20,cex=1.5,xlab="x",ylab="y") points(data2$x1,data2$y,col="indianred",pch=20,cex=1.5,xlab="x",ylab="y") par(xpd=TRUE) legend(5.5, -2.6, legend=c("Population", "Sample fit","Train Sample","Test Sample"), col=c("snow3", "orange2","dodgerblue2","indianred"), lty=c(1,1,0,0), lwd=5,pch=c(20,20),cex=0.7) par(xpd=FALSE) # find the MSE for model 1 MSE1<-mean((data1$y-predict(fit,data1))^2) text(1,2,labels=paste("MSE Train=",round(mean((data$y-predict(fit,data))^2),2))) text(1,1.7,labels=paste("MSE Test=",round(mean((data2$y-predict(fit,data2))^2),2))) text(1,1.4,labels=paste("MSE Population=",round(MSE1,2),2)) ##################################################################### # Model 2: Regularized regression with glmnet package ##################################################################### require(glmnet) # glmnet takes matrices as inputs. Hence, create predictor and reponse matrices x <- as.matrix(data[,-16]) # Predictor Variables (x1,..,x15) y <- as.double(as.matrix(data[, 16])) # Response Variable (y) # Make a semi-naive model based on intuition for alpha and lambda glmnet_fit<-glmnet(x, y, family="gaussian",alpha=1, standardize=TRUE,lambda = 0.0001) #Plot the estimated curve by model-2 and find MSE plot(data1$x1,data1$y1,type = "l",col="snow3",xlab = "x", ylab="y",lwd=2,cex.axis=1.4,cex.lab=1.8,ylim = c(-2.1,2.1)) with(data1,lines(x = data1$x1, y = predict(glmnet_fit,newx = as.matrix(data1[,2:16])), col="orange2",lwd=3)) points(data$x1,data$y,col="dodgerblue2",pch=20,cex=1.5,xlab="x",ylab="y") points(data2$x1,data2$y,col="indianred",pch=20,cex=1.5,xlab="x",ylab="y") par(xpd=TRUE) legend(5.5, -2.6, legend=c("Population", "Sample fit","Train Sample","Test Sample"), col=c("snow3", "orange2","dodgerblue2","indianred"), lty=c(1,1,0,0), lwd=5,pch=c(20,20),cex=0.7) par(xpd=FALSE) MSE2<-mean((data1$y-predict(glmnet_fit,newx = as.matrix(data1[,2:16])))^2) text(1,2, labels=paste("MSE Train=",round(mean((data$y-predict(glmnet_fit,newx = as.matrix(data[,2:16])))^2),2))) text(1,1.7, labels=paste("MSE Test=",round(mean((data2$y-predict(glmnet_fit,newx = as.matrix(data2[,2:16])))^2),2))) text(1,1.4,labels=paste("MSE Population=",round(MSE2,2))) ##################################################################### # Model 3: Fitting the model using cross-validation (lasso) ##################################################################### set.seed(2) x <- as.matrix(data[,c(-16)]) # Removes class y <- as.double(as.matrix(data[, 16])) # Only class cv.lasso <- cv.glmnet(x, y, family="gaussian", alpha=1, standardize=TRUE, type.measure='mse', nfolds = 5,lambda = 10^seq(0,-6,length=1000) ,maxit=10^6) # Results for lasso fit #plot(cv.lasso) cv.lasso$lambda.min coef(cv.lasso, s=cv.lasso$lambda.min) #Plot the estimated curve by model-3 and find MSE plot(data1$x1,data1$y1,type = "l",col="snow3",xlab = "x", ylab="y",lwd=2,cex.axis=1.4,cex.lab=1.8,ylim = c(-2.1,2.1)) with(data1,lines(x = data1$x1, y = predict(cv.lasso,newx = as.matrix(data1[,2:16])), col="orange2",lwd=3)) points(data$x1,data$y,col="dodgerblue2",pch=20,cex=1.5,xlab="x",ylab="y") points(data2$x1,data2$y,col="indianred",pch=20,cex=1.5,xlab="x",ylab="y") par(xpd=TRUE) legend(5.5, -2.6, legend=c("Population", "Sample fit","Train Sample","Test Sample"), col=c("snow3", "orange2","dodgerblue2","indianred"), lty=c(1,1,0,0), lwd=5,pch=c(20,20),cex=0.7) par(xpd=FALSE) MSE3<-mean((data1$y-predict(cv.lasso,newx = as.matrix(data1[,2:16])))^2) text(1,2, labels=paste("MSE Train=",round(mean((data$y-predict(cv.lasso,newx = as.matrix(data[,2:16])))^2),2))) text(1,1.7, labels=paste("MSE Test=",round(mean((data2$y-predict(cv.lasso,newx = as.matrix(data2[,2:16])))^2),2))) text(1,1.4,labels=paste("MSE Population=",round(MSE3,2))) #### To be continued in the next part ########################