HYPERPARAMETER TUNING
Huizi Yu 9/24/2019
From the previous section, we found that the largest R-squared, 51.167%, is achieved when we remove 2450 data points using the SOD outlier removal method (five-fold cross-validation). Removing more data points leads to fluctuation around the 50% mark. In this section, we first remove these 2450 outliers and then attempt to tune the hyperparameters of the random forest. More specifically, we tune the following four parameters: the number of trees, mtry (the number of variables available for splitting at each tree node), the minimum node size (ranger's min.node.size), and the sample fraction used to grow each tree (the remaining observations are used to calculate the out-of-bag error).
Loading the concrete data into the workspace
setwd("~/Hyperparameter")
library(glmnet)
## Loading required package: Matrix
## Loading required package: foreach
## Loaded glmnet 2.0-16
library(randomForest)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:randomForest':
##
## margin
library(abodOutlier)
## Loading required package: cluster
library(standardize)
library(OutliersO3)
library(OutlierDetection)
library(neuralnet)
library(HighDimOut)
library(caret)
library(tree)
library(gbm)
## Loaded gbm 2.1.5
library(xgboost)
library(ranger)
##
## Attaching package: 'ranger'
## The following object is masked from 'package:randomForest':
##
## importance
# Main data set; later code uses its first eight columns and its `overdesign`
# column. Read relative to the working directory set above.
concrete <- read.csv(file = "Clean_data.csv")
# Values used further down to rank rows for outlier removal.
# NOTE(review): presumably one SOD outlier score per row of `concrete` --
# confirm the layout of SOD.csv.
SOD <- read.csv(file = "SOD.csv")
Removing the outliers
# Standardise the first eight columns of `concrete` (scale() centres and
# scales each column) and append the unscaled `overdesign` response.
input <- concrete[, 1:8]
input2 <- scale(input)
complete <- cbind(input2, concrete$overdesign)
concrete2 <- as.data.frame(complete)
names(concrete2) <- c(
  "coarse_agg_weight", "fine_agg_weight", "current_weight",
  "fly_ash_weight", "AEA_dose", "type_awra_dose",
  "weight_ratio", "target", "overdesign"
)

# Label the 2450 rows ranked highest by `SOD` as outliers and every other row
# as an inlier.
# NOTE(review): order() is applied to the whole `SOD` data frame as returned
# by read.csv(); confirm SOD.csv holds a single score column.
outlier_rows <- order(SOD, decreasing = TRUE)[seq_len(2450)]
concrete2[outlier_rows, "Ind"] <- "Outlier"
concrete2[is.na(concrete2$Ind), "Ind"] <- "Inlier"

# Only the inliers are kept for modelling.
train_sov <- concrete2[concrete2$Ind == "Inlier", ]
Setting the benchmark after outlier removal but before hyperparameter tuning (we again use five-fold cross-validation, implemented here as five random 80/20 train/test splits)
# Benchmark: an untuned random forest (500 trees) fitted on the
# outlier-filtered data and scored by out-of-sample R-squared over five
# seeded random 80/20 train/test splits.
Rsquared_rf_avg <- rep(NA, 5)
for (j in seq_len(5)) {
  # A different but reproducible seed for every repetition.
  set.seed(1234567 + j * 1000)
  samp <- sample(
    seq_len(nrow(train_sov)),
    size = nrow(train_sov) * 0.8,
    replace = FALSE
  )
  train <- train_sov[samp, ]
  test <- train_sov[-samp, ]
  mu <- mean(test$overdesign)

  # Columns 1:7 are the predictors; `target` (column 8) is not used.
  tree_abod <- randomForest(
    x = train[, 1:7],
    y = train$overdesign,
    ntree = 500,
    importance = TRUE
  )
  rf.pred_abod <- predict(tree_abod, newdata = as.matrix(test[, 1:7]))

  # R-squared = 1 - SS_residual / SS_total, with SS_total taken around the
  # mean of the test-set response.
  ss_res <- sum((test$overdesign - rf.pred_abod)^2)
  ss_tot <- sum((test$overdesign - mu)^2)
  Rsquared_abod_1 <- 1 - ss_res / ss_tot
  Rsquared_rf_avg[j] <- Rsquared_abod_1
}
mean(Rsquared_rf_avg)
## [1] 0.5116779
Hyperparameter tuning using the “ranger” function (chosen for its fast computational speed)
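(a) mtry: the number of variables tried at each split
(b) num_trees: the number of trees in each random forest
(c) node_size: the minimum node size (passed to ranger as min.node.size)
(d) sample_size: the fraction of observations sampled to build each tree (stored in the grid column sampe_size and passed to ranger as sample.fraction)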
# Full-factorial tuning grid: 4 x 7 x 4 x 4 = 448 candidate settings.
# OOB_RMSE is a placeholder column that the tuning loop fills in.
# The column name `sampe_size` (sic) is kept as-is because the tuning loop
# reads it by that name.
hyper_grid <- expand.grid(
  num_trees = c(300, 500, 800, 1000),
  mtry = c(1, 2, 3, 4, 5, 6, 7),
  node_size = c(3, 5, 7, 9),
  sampe_size = c(0.55, 0.632, 0.70, 0.80),
  OOB_RMSE = 0
)
# Grid search: fit one ranger forest per row of `hyper_grid` on the
# outlier-filtered data and record its out-of-bag error.
for (i in seq_len(nrow(hyper_grid))) {
  # Hyperparameter combination for this iteration.
  params <- hyper_grid[i, ]

  # The seed is fixed so every combination is fitted reproducibly.
  model <- ranger(
    formula = overdesign ~ coarse_agg_weight + fine_agg_weight +
      current_weight + fly_ash_weight + AEA_dose + type_awra_dose +
      weight_ratio,
    data = train_sov,
    num.trees = params$num_trees,
    mtry = params$mtry,
    min.node.size = params$node_size,
    sample.fraction = params$sampe_size,
    seed = 1234567
  )

  # ranger documents `prediction.error` as the out-of-bag mean squared error
  # for regression forests; the square root is stored as the RMSE.
  hyper_grid$OOB_RMSE[i] <- sqrt(model$prediction.error)
}

# Sort the grid by ascending OOB RMSE; the first row is the best combination.
ordered <- hyper_grid[order(hyper_grid$OOB_RMSE), ]
ordered[1, ]
## num_trees mtry node_size sampe_size OOB_RMSE
## 152 1000 3 5 0.632 0.1481207
Testing the tuned random forest on held-out test data using five-fold cross-validation
# Evaluate the tuned forest on five seeded random 80/20 train/test splits
# (same seeds as the benchmark) and report the mean out-of-sample R-squared.
#
# The hyperparameters come from the best row of the grid search
# (`ordered[1, ]`) instead of being copied by hand from the printed output,
# so this evaluation stays in sync if the grid or the data change.
best_params <- ordered[1, ]

Rsquared_rf_avg_tuned <- rep(NA_real_, 5)
for (j in seq_along(Rsquared_rf_avg_tuned)) {
  # A different but reproducible seed for every repetition.
  set.seed(1234567 + j * 1000)
  # floor() makes explicit the truncation that sample() applies to a
  # non-integer size.
  samp <- sample(
    seq_len(nrow(train_sov)),
    size = floor(nrow(train_sov) * 0.8),
    replace = FALSE
  )
  train <- train_sov[samp, ]
  test <- train_sov[-samp, ]
  test_x <- test[, 1:7]
  mu <- mean(test$overdesign)

  tree_abod <- ranger(
    formula = overdesign ~ coarse_agg_weight + fine_agg_weight +
      current_weight + fly_ash_weight + AEA_dose + type_awra_dose +
      weight_ratio,
    data = train,
    num.trees = best_params$num_trees,
    mtry = best_params$mtry,
    min.node.size = best_params$node_size,
    sample.fraction = best_params$sampe_size,
    seed = 1234567
  )
  # predict() on a ranger model returns an object; the numeric predictions
  # are in its `predictions` element.
  rf.pred_abod <- predict(tree_abod, data = test_x)

  # R-squared = 1 - SS_residual / SS_total, with SS_total taken around the
  # mean of the test-set response.
  ss_res <- sum((test$overdesign - rf.pred_abod$predictions)^2)
  ss_tot <- sum((test$overdesign - mu)^2)
  Rsquared_abod_1 <- 1 - ss_res / ss_tot
  Rsquared_rf_avg_tuned[j] <- Rsquared_abod_1
}
mean(Rsquared_rf_avg_tuned)
## [1] 0.5128897