# Random forest / Install caret and randomForest --------------------------------------------------------

# Random Forest (use data set from logistic regression)
# https://stats.stackexchange.com/questions/98953/why-doesnt-random-forest-handle-missing-values-in-predictors

# Importing data from excel / Install readxl package first
library("readxl")
# Read the "data" sheet of the analysis workbook. The location is a
# hard-coded absolute path; adjust it when running on another machine.
importdata <- read_excel(
  "D:/Journal Club/Journal Club as Presenter October 2019/R code/ANALYSIS.xlsx",
  sheet = "data"
)

# Keep the outcome (DN) and the candidate predictors only.
# HN is not needed for the analysis, so it is not selected.
selectdata <- importdata[, c("DN", "Gender", "BUN", "SCr", "HDL", "TG", "BMI", "YEAR")]

# Store the outcome as a factor.
selectdata[["DN"]] <- as.factor(selectdata[["DN"]])
# NOTE(review): if Gender is read in as text it may also need to be a factor
# before it is handed to randomForest() -- confirm against the str() output.

# Quick inspection: first rows, column types, per-column summaries.
head(selectdata)
str(selectdata)
summary(selectdata)

library(mlbench)
## Proportion of rows with no missing value in any column
## (a result of 1 means there is nothing missing)
mean(complete.cases(selectdata))
## randomForest() refuses NA values by default (na.action = na.fail), so the
## incomplete rows are dropped here; the alternative would be to impute them
df2 <- na.omit(selectdata)
## No id column has to be removed: selectdata was built from an explicit list
## of columns that contains no "Id", so every remaining column is either the
## outcome or a predictor

# Split data: 70% of the rows go to training, the remainder to testing.
set.seed(1)
# sample.int() draws row numbers directly and avoids the 1:nrow() pitfall on
# an empty data frame; floor() makes the truncation of the 70% size explicit.
id <- sample.int(nrow(df2), size = floor(0.7 * nrow(df2)))
train_df2 <- df2[id, ]
# Select the complement by position rather than with df2[-id, ]: negative
# indexing with an empty `id` would return zero rows instead of all rows.
test_df2 <- df2[setdiff(seq_len(nrow(df2)), id), ]

library(caret)
library(randomForest)
# Fix the RNG state so the bootstrap samples and variable draws are repeatable
set.seed(51)

# Training
# Grow 300 trees predicting DN from every other column of the training set.
# importance = TRUE makes the fit keep the permutation-based importance
# measures that importance() and varImpPlot() report below.
# (`method = "rf"` is an argument of caret::train(), not of randomForest();
# it was silently ignored here and has been dropped.)
RF <- randomForest(DN ~ ., data = train_df2, ntree = 300, importance = TRUE)
# Fit summary: out-of-bag error estimate and OOB confusion matrix
print(RF)
# Names of the components stored in the fitted object
attributes(RF)
# Evaluate variable importance (table, then dot chart)
importance(RF)
varImpPlot(RF)


# Prediction & confusion matrix - training data
# caret is already attached earlier in the script; attaching it again is harmless.
library(caret)
# Predicting the rows the forest was grown on gives in-sample results, which
# are typically more optimistic than the OOB estimate shown by print(RF).
p1rf <- predict(RF, newdata = train_df2)
confusionMatrix(data = p1rf, reference = train_df2$DN)

# Prediction & confusion matrix - held-out test data
p2rf <- predict(RF, newdata = test_df2)
confusionMatrix(data = p2rf, reference = test_df2$DN)

# Error rates of the forest plotted against the number of trees grown
plot(RF)

# Extract a single tree (the first one) as a table of split nodes;
# labelVar = TRUE shows variable names and class labels instead of indices
getTree(rfobj = RF, k = 1, labelVar = TRUE)
