# Compare three linear regression models for House_Price on a held-out
# test split:
#   1. all numeric and categorical predictors,
#   2. categorical predictors plus principal components of the numeric ones,
#   3. a reduced set of numeric predictors plus the categorical ones.
# Each section prints the squared correlation between predicted and observed
# House_Price on the test rows.

# Load dependencies up front: library() fails immediately if the package is
# missing, whereas require() would only return FALSE and fail later at PCA().
library(FactoMineR)

# Extract data for regression models
Clean_Data <- read.csv(
  "http://ucanalytics.com/blogs/wp-content/uploads/2016/09/Regression-Clean-Data.csv"
)
numeric <- c(
  "Dist_Taxi", "Dist_Market", "Dist_Hospital", "Carpet", "Builtup", "Rainfall"
)
categoric <- c("Parking", "City_Category")
Target <- c("House_Price")

# Prepare train and test data for regression models
# Fixed seed so the 70/30 row split is reproducible.
set.seed(42)
train <- sample(nrow(Clean_Data), 0.7 * nrow(Clean_Data))
test <- setdiff(seq_len(nrow(Clean_Data)), train)

# Observed target values on the test rows, kept as a one-column data frame.
# Shared by all three model evaluations below.
Observed <- Clean_Data[test, Target, drop = FALSE]

# 1st regression model with all the variables
Org_Reg <- lm(
  House_Price ~ .,
  data = Clean_Data[train, c(Target, numeric, categoric)]
)
summary(Org_Reg)
Estimate <- predict(
  Org_Reg,
  type = "response",
  newdata = Clean_Data[test, c(numeric, categoric, Target)]
)
format(cor(Estimate, Observed$House_Price)^2, digits = 4)

# 2nd regression model with principal components
# NOTE(review): the PCA is fitted on ALL rows (train and test), so the
# component scores used for the test predictions are derived from data that
# includes the test rows. Left unchanged to preserve the reported results;
# consider fitting on the training rows only and projecting the test rows.
Data_for_PCA <- Clean_Data[, numeric]
pca1 <- PCA(Data_for_PCA)
PCA_data <- as.data.frame(
  cbind(Clean_Data[train, c(Target, categoric)], pca1$ind$coord[train, ])
)
# Start from the full model and let step() select the terms to keep.
Step_PCA_Reg <- step(lm(House_Price ~ ., data = PCA_data))
summary(Step_PCA_Reg)
PCA_Estimate <- predict(
  Step_PCA_Reg,
  type = "response",
  newdata = cbind(
    Clean_Data[test, c(Target, categoric)],
    pca1$ind$coord[test, ]
  )
)
format(cor(PCA_Estimate, Observed$House_Price)^2, digits = 4)

# 3rd regression model with dominant variables
numeric_new <- c("Dist_Hospital", "Carpet")
New_Reg <- lm(
  House_Price ~ .,
  data = Clean_Data[train, c(Target, numeric_new, categoric)]
)
# Avoid scientific notation in the printed output from here on.
# This is a session-wide setting and is not restored afterwards.
options(scipen = 999)
summary(New_Reg)
# newdata carries every numeric column; predict() only uses the columns that
# appear in the model, so the extra ones are ignored.
New_Estimate <- predict(
  New_Reg,
  type = "response",
  newdata = Clean_Data[test, c(numeric, categoric, Target)]
)
format(cor(New_Estimate, Observed$House_Price)^2, digits = 4)