#####################################################################
# Simulate the population data for the sine curve and add random noise to y axis
#####################################################################
x1<-seq(-3*pi,3*pi,by=0.001)
y1<-sin(x1)
data1<-as.data.frame(cbind(y1,poly(x1,degree = 15,raw=TRUE)))
colnames(data1)[2:16] <- paste("x", colnames(data1)[2:16], sep = "")
set.seed(42)
data1$y<-data1$y+rnorm(nrow(data1),0,0.2)
MSE<-mean((data1$y-data1$y1)^2)

# Take a sample of 30 observations from the population
set.seed(100)
nobs1<-30
data<-data1[sample(nrow(data1),nobs1),2:17]

nobs2<-20
data2<-data1[sample(nrow(data1),nobs2),2:17]

#####################################################################
# Plot the data including training and test sample
#####################################################################

# Plot the curve estimated by model 1 for the sampled data
plot(data1$x1,data1$y1,type = "l",col="snow3",xlab = "x",
     ylab="y",lwd=2,cex.axis=1.4,cex.lab=1.8,ylim = c(-2,2))

points(data$x1,data$y,col="dodgerblue2",pch=20,cex=1.5,xlab="x",ylab="y")
points(data2$x1,data2$y,col="indianred",pch=20,cex=1.5,xlab="x",ylab="y")
par(xpd=TRUE)
legend(5.5, -2.6, legend=c("Population", "Train Sample","Test Sample"),
       col=c("snow3", "dodgerblue2","indianred"), 
       lty=c(1,0,0), lwd=5,pch=c(20,20),cex=0.7)
par(xpd=FALSE)


#####################################################################
# Model 1: fit a step-wise regression model to the sampled data
#####################################################################
fit<-lm(y~x1+x2+x3+x4+x5+x6+x7+x8+x9+x10+x11+x12+x13+x14+x15,data=data)
fit<-step(fit)
summary(fit)

# Plot the curve estimated by model 1 for the sampled data
plot(data1$x1,data1$y1,type = "l",col="snow3",xlab = "x",
     ylab="y",lwd=2,cex.axis=1.4,cex.lab=1.8,ylim = c(-2,2))
with(data1,lines(x = data1$x1, 
                 y = predict(fit,data1),
                 col="orange2",lwd=3))
points(data$x1,data$y,col="dodgerblue2",pch=20,cex=1.5,xlab="x",ylab="y")
points(data2$x1,data2$y,col="indianred",pch=20,cex=1.5,xlab="x",ylab="y")
par(xpd=TRUE)
legend(5.5, -2.6, legend=c("Population", "Sample fit","Train Sample","Test Sample"),
       col=c("snow3", "orange2","dodgerblue2","indianred"), 
       lty=c(1,1,0,0), lwd=5,pch=c(20,20),cex=0.7)
par(xpd=FALSE)

# find the MSE for model 1
MSE1<-mean((data1$y-predict(fit,data1))^2)
text(1,2,labels=paste("MSE Train=",round(mean((data$y-predict(fit,data))^2),2)))
text(1,1.7,labels=paste("MSE Test=",round(mean((data2$y-predict(fit,data2))^2),2)))
text(1,1.4,labels=paste("MSE Population=",round(MSE1,2),2))

#####################################################################
# Model 2: Regularized regression with glmnet package
#####################################################################
require(glmnet)
# glmnet takes matrices as inputs. Hence, create predictor and reponse matrices
x <- as.matrix(data[,-16]) # Predictor Variables (x1,..,x15)
y <- as.double(as.matrix(data[, 16])) # Response Variable (y)

# Make a semi-naive model based on intuition for alpha and lambda
glmnet_fit<-glmnet(x, y, family="gaussian",alpha=1, 
                   standardize=TRUE,lambda = 0.0001)

#Plot the estimated curve by model-2 and find MSE
plot(data1$x1,data1$y1,type = "l",col="snow3",xlab = "x",
     ylab="y",lwd=2,cex.axis=1.4,cex.lab=1.8,ylim = c(-2.1,2.1))
with(data1,lines(x = data1$x1, 
                 y = predict(glmnet_fit,newx = as.matrix(data1[,2:16])),
                 col="orange2",lwd=3))
points(data$x1,data$y,col="dodgerblue2",pch=20,cex=1.5,xlab="x",ylab="y")
points(data2$x1,data2$y,col="indianred",pch=20,cex=1.5,xlab="x",ylab="y")
par(xpd=TRUE)
legend(5.5, -2.6, legend=c("Population", "Sample fit","Train Sample","Test Sample"),
       col=c("snow3", "orange2","dodgerblue2","indianred"), 
       lty=c(1,1,0,0), lwd=5,pch=c(20,20),cex=0.7)

par(xpd=FALSE)

MSE2<-mean((data1$y-predict(glmnet_fit,newx = as.matrix(data1[,2:16])))^2)
text(1,2,
     labels=paste("MSE Train=",round(mean((data$y-predict(glmnet_fit,newx = as.matrix(data[,2:16])))^2),2)))
text(1,1.7,
     labels=paste("MSE Test=",round(mean((data2$y-predict(glmnet_fit,newx = as.matrix(data2[,2:16])))^2),2)))
text(1,1.4,labels=paste("MSE Population=",round(MSE2,2)))

#####################################################################
# Model 3: Fitting the model using cross-validation (lasso)
#####################################################################
set.seed(2)
x <- as.matrix(data[,c(-16)]) # Removes class
y <- as.double(as.matrix(data[, 16])) # Only class

cv.lasso <- cv.glmnet(x, y, family="gaussian", alpha=1,
                      standardize=TRUE, type.measure='mse',
                      nfolds = 5,lambda = 10^seq(0,-6,length=1000)
                      ,maxit=10^6)

# Results for lasso fit
#plot(cv.lasso)
cv.lasso$lambda.min
coef(cv.lasso, s=cv.lasso$lambda.min)

#Plot the estimated curve by model-3 and find MSE
plot(data1$x1,data1$y1,type = "l",col="snow3",xlab = "x",
     ylab="y",lwd=2,cex.axis=1.4,cex.lab=1.8,ylim = c(-2.1,2.1))
with(data1,lines(x = data1$x1, 
                 y = predict(cv.lasso,newx = as.matrix(data1[,2:16])),
                 col="orange2",lwd=3))
points(data$x1,data$y,col="dodgerblue2",pch=20,cex=1.5,xlab="x",ylab="y")
points(data2$x1,data2$y,col="indianred",pch=20,cex=1.5,xlab="x",ylab="y")
par(xpd=TRUE)
legend(5.5, -2.6, legend=c("Population", "Sample fit","Train Sample","Test Sample"),
       col=c("snow3", "orange2","dodgerblue2","indianred"), 
       lty=c(1,1,0,0), lwd=5,pch=c(20,20),cex=0.7)

par(xpd=FALSE)

MSE3<-mean((data1$y-predict(cv.lasso,newx = as.matrix(data1[,2:16])))^2)
text(1,2,
     labels=paste("MSE Train=",round(mean((data$y-predict(cv.lasso,newx = as.matrix(data[,2:16])))^2),2)))
text(1,1.7,
     labels=paste("MSE Test=",round(mean((data2$y-predict(cv.lasso,newx = as.matrix(data2[,2:16])))^2),2)))
text(1,1.4,labels=paste("MSE Population=",round(MSE3,2)))



#### To be continued in the next part ########################