# Machine learning algorithms can achieve better performance ------------------------------------------------------------
# Importing data from excel / Install readxl package first
library(readxl)
importdata <- read_excel(
  path = "D:/Journal Club/Journal Club as Presenter October 2019/R code/ANALYSIS.xlsx",
  sheet = "data"
)

# Keep DN plus the seven columns used alongside it; HN is not needed for the
# analysis, so it is left out.
selectdata <- importdata[, c("DN", "Gender", "BUN", "SCr", "HDL", "TG", "BMI", "YEAR")]
# Store DN as a factor.
selectdata[["DN"]] <- as.factor(selectdata[["DN"]])

# Quick look at the selected data: first rows, structure, per-column summary.
head(selectdata)
str(selectdata)
summary(selectdata)

# Splitting data: 30% as test / 70% as training
# Fix the RNG seed so the split is reproducible.
set.seed(123)
# seq_len() stays empty for a zero-row table, unlike 1:nrow(), which would
# give c(1, 0).
data_index <- seq_len(nrow(selectdata))
# Draw 30% of the row numbers (rounded down) for the test set. sample.int()
# avoids sample()'s special case for a single number and, for a given seed,
# picks the same rows as sample(data_index, ...) would.
testindex <- sample.int(
  length(data_index),
  trunc(length(data_index) * 30 / 100)
)
datatest <- selectdata[testindex, ]
# Select the training rows by positive index: with an empty testindex,
# selectdata[-testindex, ] would return zero rows instead of all of them.
datatrain <- selectdata[setdiff(data_index, testindex), ]

# Per-column summaries of the two subsets.
summary(datatest)
summary(datatrain)

# Decision tree / Install rpart.plot
# This algorithm handles missing values well, so we can leave them as they are.
library(rpart)
library(rpart.plot)

# Fit a classification tree for DN on all other columns of the training set,
# without passing any control settings, then draw the fitted tree.
DT <- rpart(DN ~ ., data = datatrain, method = "class")
rpart.plot(DT)

# Performance on the test set: cross-tabulate the observed DN (rows) against
# the predicted class (columns).
predict_DT <- predict(object = DT, newdata = datatest, type = "class")
table_mat <- table(datatest$DN, predict_DT)
table_mat

# Accuracy: diagonal of the table (matching observed/predicted) over the total.
accuracy_Test <- sum(diag(table_mat)) / sum(table_mat)
print(paste("Accuracy for Decision Tree: ", accuracy_Test))

# Plot ROC Curve
library(pROC)
# prediction() and performance() are ROCR functions, so ROCR has to be
# attached before they are called.
library(ROCR)

# Class probabilities on the test set; the second column is used as the score.
probd <- predict(DT, datatest, type = "prob")
predd <- prediction(probd[, 2], datatest$DN)
perfd <- performance(predd, measure = "tpr", x.measure = "fpr")
# The x measure is the false positive rate, i.e. 1 - specificity.
plot(perfd, col = rainbow(7), main = "ROC curve DN (Default DT model)",
     xlab = "1 - Specificity", ylab = "Sensitivity")
abline(0, 1) # add a 45 degree line

# Report AUC
auc1 <- performance(predd, measure = "auc")
unlist(slot(auc1, "y.values"))

# Tuning
# Accuracy of a fitted classifier on a labelled data set.
#
# Arguments:
#   DT        - fitted model; predict(DT, <data>, type = "class") is called
#               on it.
#   test_data - data to score; its DN column holds the observed class.
#               Defaults to the script-level `datatest`, so existing
#               one-argument calls behave as before.
# Returns: the share of rows whose predicted class equals DN, computed as
#   the diagonal of the observed-by-predicted table over its total.
accuracy_tune <- function(DT, test_data = datatest) {
  predicted <- predict(DT, test_data, type = "class")
  confusion <- table(test_data$DN, predicted)
  sum(diag(confusion)) / sum(confusion)
}
# Control settings for the tuned tree; round(5 / 3) evaluates to 2.
control <- rpart.control(
  minsplit = 3,
  minbucket = round(5 / 3),
  maxdepth = 7,
  cp = 0
)
tune_DT <- rpart(DN ~ ., data = datatrain, method = "class", control = control)
accuracy_tune(tune_DT)
rpart.plot(tune_DT)

# Performance: observed DN (rows) against the tuned tree's predicted class
# (columns) on the test set.
predict_DT2 <- predict(object = tune_DT, newdata = datatest, type = "class")
table_mat2 <- table(datatest$DN, predict_DT2)
table_mat2

library(pROC)
# prediction() and performance() are ROCR functions, so ROCR is attached
# before their first use rather than after the plot.
library(ROCR)

# Class probabilities from the tuned tree; the second column is the score.
probd2 <- predict(tune_DT, datatest, type = "prob")
preddt <- prediction(probd2[, 2], datatest$DN)
perfdt <- performance(preddt, measure = "tpr", x.measure = "fpr")
# This curve belongs to the tuned model; the x measure is the false positive
# rate, i.e. 1 - specificity.
plot(perfdt, col = rainbow(7), main = "ROC curve DN (Tuned DT model)",
     xlab = "1 - Specificity", ylab = "Sensitivity")
abline(0, 1) # add a 45 degree line

# Report AUC
auc2 <- performance(preddt, measure = "auc")
unlist(slot(auc2, "y.values"))

# The result can easily be applied in real practice
