# Logistic regression / Install mlbench ---------------------------------------------------------------------
# Uses the data in ANALYSIS.xlsx because the outcome has to be in 0/1 format
library("readxl")
# Read the "data" worksheet of the analysis workbook.
importdata <- read_excel(
  path = "D:/Journal Club/Journal Club as Presenter October 2019/R code/ANALYSIS.xlsx",
  sheet = "data"
)

# Keep the outcome (DN) and the candidate predictors. HN is not needed for the
# analysis, so it is left out of the selection.
selectdata2 <- importdata[, c("DN", "Gender", "BUN", "SCr", "HDL", "TG", "BMI", "YEAR")]

# Quick look at the selected columns: first rows, structure, and summaries.
head(selectdata2)
str(selectdata2)
summary(selectdata2)

library(mlbench)
## Share of rows without any missing value (1 means the data are complete)
mean(complete.cases(selectdata2))
## Drop every row containing an NA; the alternative would be to impute
df <- na.omit(selectdata2)
## Drop the Id column if one is present. None of the columns selected into
## selectdata2 is named Id, so this looks like a no-op -- TODO confirm.
df[["Id"]] <- NULL

# 26985 records remain
# Split data: 70% of the rows go to training, the rest to testing.
# The seed is fixed so the split is reproducible between runs.
set.seed(1)
# seq_len() stays empty when df has no rows (1:nrow(df) would give c(1, 0)),
# and floor() makes the rounding of the fractional sample size explicit
# instead of relying on implicit truncation inside sample().
id <- sample(seq_len(nrow(df)), size = floor(0.7 * nrow(df)))
train_df <- df[id, ]
test_df <- df[-id, ]

## Train logistic regression: DN is the outcome, every other column of
## train_df is used as a predictor.
log_model <- glm(DN ~ ., data = train_df, family = "binomial")

# Cross-tabulate predicted vs. observed classes on a fixed 2 x 2 grid.
# Both factors carry both levels explicitly, so the table stays square even
# when the model predicts a single class for every row; without that,
# diag() on a 1 x 2 table would pick the wrong cell and the accuracy
# computed from it would be meaningless.
#   predicted_positive: logical vector, TRUE where the predicted class is 1.
#   observed: outcome vector coded 0/1.
# Returns a 2 x 2 table (rows = predicted, columns = observed).
confusion_2x2 <- function(predicted_positive, observed) {
  table(
    predicted = factor(predicted_positive, levels = c(FALSE, TRUE)),
    actual = factor(observed == 1, levels = c(FALSE, TRUE), labels = c("0", "1"))
  )
}

## Predict and evaluate train dataset
# Without newdata, predict() returns the fitted probabilities of the training
# rows; a probability of at least 0.5 is classified as the positive class.
p1 <- predict(log_model, type = "response") >= 0.5
train_result <- confusion_2x2(p1, train_df$DN)
print(paste0("Train Accuracy: ", round(sum(diag(train_result)) / nrow(train_df), 4)))

## Predict and evaluate test dataset
p2 <- predict(log_model, newdata = test_df, type = "response") >= 0.5
test_result <- confusion_2x2(p2, test_df$DN)
print(paste0("Test Accuracy: ", round(sum(diag(test_result)) / nrow(test_df), 4)))

# Coefficient table and fit statistics of the model
summary(log_model)

# Plot ROC Curve
library(pROC)
library(ROCR)
# Predicted probabilities of the positive class for the held-out rows
problr <- predict(log_model, newdata = test_df, type = "response")
# ROCR prediction object pairing the scores with the observed outcome
predlr <- prediction(problr, test_df$DN)
# True positive rate (y) against false positive rate (x), i.e. the ROC curve
perflr <- performance(predlr, measure = "tpr", x.measure = "fpr")
# The x axis is the false positive rate, which is 1 - specificity (not
# specificity itself), so the axis is labelled accordingly.
plot(
  perflr,
  col = rainbow(7),
  main = "ROC curve DN",
  xlab = "1 - Specificity",
  ylab = "Sensitivity"
)
abline(0, 1) # add a 45 degree reference line

# Report AUC
auc0 <- performance(predlr, measure = "auc")
unlist(slot(auc0, "y.values"))