OUTLIER REMOVAL

Huizi Yu 9/1/2019

GOAL for outlier detection:

STEPS:

Seperate the data into 80% training and 20% testing
Standarize the variables
Set up benchmark prior to removing outliers by calculating Rsquared using
- Linear Regression
- LASSO
- Random Forest
- NeuralNetwork
- XGboost
Try the different outlier detection methods in the basic outlier detection package in R
- density-based (den)
- knn (nn)
- ensemble (OutlierDetection)
- cooks distance

Loading Concrete Data into workplace

setwd("~/Concrete_overdesign_PARIS")
library(glmnet)

## Loading required package: Matrix

## Loading required package: foreach

## Loaded glmnet 2.0-16

library(randomForest)

## randomForest 4.6-14

## Type rfNews() to see new features/changes/bug fixes.

library(caret)

## Loading required package: lattice

## Loading required package: ggplot2

## 
## Attaching package: 'ggplot2'

## The following object is masked from 'package:randomForest':
## 
##     margin

library(abodOutlier)

## Loading required package: cluster

library(standardize)
library(OutliersO3)
library(OutlierDetection)
library(neuralnet)
library(HighDimOut)
library(caret)
library(tree)
library(gbm)

## Loaded gbm 2.1.5

library(xgboost)
library(knitr)
concrete <- read.csv("Clean_data.csv")

Randomly selecting one training set (80 %) and one testing set (20 %)

set.seed(1234567+5*1000)
input <- concrete[,1:8]
input2 <- scale(input)
complete <- cbind(input2, concrete$overdesign)
concrete2 <- as.data.frame(complete) 
colnames(concrete2) <- c("coarse_agg_weight", "fine_agg_weight", "current_weight", "fly_ash_weight", "AEA_dose", "type_awra_dose", "weight_ratio", "target", "overdesign")
samp<-sample(1:nrow(concrete2),nrow(concrete2)*0.8,replace = F)
train <-concrete2[samp,]
test <- concrete2[-samp,]
mu <- mean(test$overdesign)

Removing non-important variables?

tree_1 <- randomForest(y = concrete2$overdesign , x = concrete2[,1:8])
importance(tree_1, type = 2) 

##                   IncNodePurity
## coarse_agg_weight      47.92385
## fine_agg_weight        53.48026
## current_weight         49.65667
## fly_ash_weight         45.05715
## AEA_dose               53.77255
## type_awra_dose         71.22769
## weight_ratio           51.19938
## target                 40.40843

Random Forest picked coarse_agg_weight, fine_agg_weight, current_weight, fly_ash_weight, AEA_dose, type_awra_dose, W..C.P. all proved to be above 45, shows that we don’t need to remove any variables.

Setting the benchmark before removing any outliers

Methods
- Linear Regression
- LASSO
- Random Forest
- NeuralNetwork
- XGboost

Linear Regression

y = train$overdesign ~ train$coarse_agg_weight + train$fine_agg_weight + train$current_weight + train$fly_ash_weight + train$AEA_dose + train$type_awra_dose + train$weight_ratio + train$target
linear <- lm(y)
Rsquared_linear <- summary(linear)$r.squared
if (Rsquared_linear == 0) {Rsquared_linear = 0}
Rsquared_linear

## [1] 0.2407882

LASSO

cv.mod = glmnet (as.matrix(train[,1:8]),train$overdesign,alpha =1)
lasso.pred=predict (cv.mod,newx=as.matrix(test[,1:8]))
Rsquared_LASSO <- 1 - (sum((test$overdesign - lasso.pred)^2)/sum((test$overdesign - mu)^2))
if (Rsquared_LASSO <= 0) {Rsquared_LASSO = 0}
Rsquared_LASSO

## [1] 0

Random Forest

tree <- randomForest(y = train$overdesign , x = train[,1:8], ntree = 500, importance = TRUE)
rf.pred <- predict(tree, newdata =as.matrix(test[,1:8]))
Rsquared_RF <- 1 - (sum((test$overdesign - rf.pred)^2)/sum((test$overdesign - mu)^2))
Rsquared_RF

## [1] 0.4578791

NeuralNet

z = overdesign ~ coarse_agg_weight + fine_agg_weight + current_weight + fly_ash_weight + AEA_dose + type_awra_dose + weight_ratio + target
n <- neuralnet(z, data = train, hidden = 2, err.fct = 'sse', threshold = 0.1/0.5)
plot(n)
nn.pred <- compute(n, test[,1:8])
Rsquared_NN <- 1 - (sum((test$overdesign - nn.pred$net.result)^2)/sum((test$overdesign - mu)^2))
Rsquared_NN

## [1] 0.3500305

XGBoost

train_values <- train[,1:8]
train_result <- train$overdesign
test_values <- test[,1:8]
test_result <- test$overdesign
dtrain = xgb.DMatrix(data =  as.matrix(train_values), label = train_result)
dtest = xgb.DMatrix(data =  as.matrix(test_values), label = test_result)
watchlist = list(train=dtrain, test=dtest)
xgb_train <- xgb.train(data = dtrain, 
                       max.depth = 8, 
                       eta = 0.3, 
                       nthread = 2, 
                       nround = 10000, 
                       watchlist = watchlist, 
                       objective = "reg:linear", 
                       early_stopping_rounds = 50,
                       print_every_n = 500)

## [1]  train-rmse:0.603671 test-rmse:0.601318 
## Multiple eval metrics are present. Will use test_rmse for early stopping.
## Will train until test_rmse hasn't improved in 50 rounds.
## 
## Stopping. Best iteration:
## [20] train-rmse:0.123723 test-rmse:0.165273

pred_val_xgboost <- predict(xgb_train, as.matrix(test[,1:8]))
Rsquared_XG <- 1 - (sum((test$overdesign - pred_val_xgboost)^2)/sum((test$overdesign - mu)^2))
Rsquared_XG

## [1] 0.4189204

The Benchmark table

Benchmark <- as.data.frame(cbind(c("Linear Regression", "LASSO", "Random Forest", "Neural Network", "XGBoost"), c(Rsquared_linear,Rsquared_LASSO, Rsquared_RF, Rsquared_NN, Rsquared_XG)))
kable(Benchmark, format = "markdown", col.names = c("Method", "R-squared"))

Method	R-squared
Linear Regression	0.240788237889636
LASSO	0
Random Forest	0.457879118012789
Neural Network	0.35003045563753
XGBoost	0.418920351934429

Basic Outlier Detection

Density based outlier function

a <-dens(train)
removed_1 <- train[-a$`Location of Outlier`,]
newtrain <- removed_1
tree <- randomForest(y = newtrain$overdesign , x = newtrain[,1:8])
rf.pred <- predict(tree, newdata =as.matrix(test[,1:8]))
Rsquared_RF_2 <- 1 - (sum((test$overdesign - rf.pred)^2)/sum((test$overdesign - mu)^2))
Rsquared_RF_2

## [1] 0.4513968

Knn outlier detection

b <- nn(train)
removed_2 <- train[-b$`Location of Outlier`,]
newtrain <- removed_2
## checking Rsquared with Random Forest
tree <- randomForest(y = newtrain$overdesign , x = newtrain[,1:8])
rf.pred <- predict(tree, newdata =as.matrix(test[,1:8]))
Rsquared_RF_3 <- 1 - (sum((test$overdesign - rf.pred)^2)/sum((test$overdesign - mu)^2))
Rsquared_RF_3

## [1] 0.4603604

Ensemble outlier detection

c <- OutlierDetection(train)
removed_3 <- train[-c$`Location of Outlier`,]
newtrain <- removed_3
tree <- randomForest(y = newtrain$overdesign , x = newtrain[,1:8])
rf.pred <- predict(tree, newdata =as.matrix(test[,1:8]))
Rsquared_RF_4 <- 1 - (sum((test$overdesign - rf.pred)^2)/sum((test$overdesign - mu)^2))
Rsquared_RF_4

## [1] 0.4210016

Cook’s distance

cooksd <- cooks.distance(linear)
influential <- as.numeric(names(cooksd)[(cooksd > (4/nrow(train)))])
newtrain <- train[-influential,]

y = newtrain$overdesign ~ newtrain$coarse_agg_weight + newtrain$fine_agg_weight + newtrain$current_weight + newtrain$fly_ash_weight + newtrain$AEA_dose + newtrain$type_awra_dose + newtrain$weight_ratio + newtrain$target
linear2 <- lm(y)
Rsquared_linear_rm <- summary(linear2)$r.squared
if (Rsquared_linear_rm <= 0) {Rsquared_linear_rm = 0}
Rsquared_linear_rm

## [1] 0.2730367

tree <- randomForest(y = newtrain$overdesign , x = newtrain[,1:8])
rf.pred <- predict(tree, newdata =as.matrix(test[,1:8]))
Rsquared_RF_5 <- 1 - (sum((test$overdesign - rf.pred)^2)/sum((test$overdesign - mu)^2))
Rsquared_RF_5

## [1] 0.4105054

We increase Rsquared for Linear Regression by 3% but no significant improvement in Random Forest

The result table

result <- as.data.frame(cbind(c("dens", "nn", "OutlierDetection", "Cook distance"), c(Rsquared_RF_2, Rsquared_RF_3, Rsquared_RF_4, Rsquared_RF_5)))
kable(result, format = "markdown", col.names = c("Method", "R-squared"))

Method	R-squared
dens	0.451396782797922
nn	0.460360367685005
OutlierDetection	0.42100160116157
Cook distance	0.410505358884566

Conclusion

So far, only KNN outlier detection method has marginally increased the Rsquared from 45% to 46%. For the next step, we attempt high dimension outlier removal method in the package “HighDimOut”