library("MASS") library("lme4") library("Matrix") library("glmmLasso") library("dplyr") library("caret") library("openxlsx") library("ROCR") library("grid") library("gridExtra") library("scales") #making dataset for regression Regression.data <- function(data,segment,reponse,predictors,cluster){ #segment is "Tumor" or "Immune" segment.data <- dplyr::filter(data, Segment_type == segment) regress.data <- log2(segment.data[,names(segment.data) %in% predictors]) regress.data$response <- segment.data[,names(segment.data)==response] regress.data$cluster <- factor(segment.data[,names(segment.data)==cluster]) regress.data <- regress.data[complete.cases(regress.data),] training.samples <- createDataPartition(factor(regress.data$response), p = 0.8, list = FALSE) train.data <- regress.data[training.samples, ] test.data <- regress.data[-training.samples, ] return(list(regress.data=regress.data,train.data=train.data,test.data=test.data)) } lasso.form <- function(predictors){ fixeffect <- as.formula(paste("response~", paste(predictors,collapse="+"))) return(fixeffect) } #lambda selection based on BIC bic.glmmLasso <- function(dat, predictors, lambda.range, family=stats::gaussian(link = "identity")){ BIC_vec<-rep(Inf,length(lambda.range)) fix.effect <- lasso.form(predictors) for(j in 2:length(lambda.range)){ suppressMessages({ suppressWarnings({ glm1 <- try(glmmLasso(fix = fix.effect, rnd = list(cluster =~1), family = family, data = dat, lambda=lambda.range[j],switch.NR=T,final.re=TRUE), silent=TRUE) }) }) if(class(glm1)!="try-error") { BIC_vec[j]<-glm1$bic } } opt<-which.min(BIC_vec) suppressWarnings({ glm1_final <- glmmLasso(fix=fix.effect, rnd = list( cluster =~1), family = family, data = dat, lambda=lambda.range[opt],switch.NR=F,final.re=TRUE) glm1_final }) return(list(fit.opt=glm1_final,BIC_path=BIC_vec,opt.lambda=lambda.range[opt]))} [ROCInfo] : # Pass in the data that already consists the predicted score and actual outcome. # to obtain the ROC curve # @data : your data.table or data.frame type data that consists the column # of the predicted score and actual outcome # @predict : predicted score's column name # @actual : actual results' column name # @cost.fp : associated cost for a false positive # @cost.fn : associated cost for a false negative # return : a list containing # 1. plot : a side by side roc and cost plot, title showing optimal cutoff value # title showing optimal cutoff, total cost, and area under the curve (auc) # 2. cutoff : optimal cutoff value according to the specified fp/fn cost # 3. totalcost : total cost according to the specified fp/fn cost # 4. auc : area under the curve # 5. sensitivity : TP / (TP + FN) # 6. 
# [ROCInfo] :
# Pass in data that already contains the predicted score and the actual outcome
# to obtain the ROC curve.
# @data    : your data.table or data.frame type data that contains the columns
#            of the predicted score and the actual outcome
# @predict : predicted score's column name
# @actual  : actual results' column name
# @cost.fp : associated cost for a false positive
# @cost.fn : associated cost for a false negative
# return   : a list containing
#            1. plot        : a side-by-side ROC and cost plot, with a title showing the
#                             optimal cutoff, total cost, and area under the curve (AUC)
#            2. cutoff      : optimal cutoff value according to the specified fp/fn cost
#            3. totalcost   : total cost according to the specified fp/fn cost
#            4. auc         : area under the curve
#            5. sensitivity : TP / (TP + FN)
#            6. specificity : TN / (FP + TN)
ROCInfo <- function(data, predict, actual, cost.fp, cost.fn){
  # calculate the values using the ROCR library:
  # true positive rate, false positive rate
  pred <- prediction(data[, predict], data[, actual])
  perf <- performance(pred, "tpr", "fpr")
  roc_dt <- data.frame(fpr = perf@x.values[[1]], tpr = perf@y.values[[1]])

  # cost with the specified false positive and false negative cost:
  # false positive rate * number of negative instances * false positive cost +
  # false negative rate * number of positive instances * false negative cost
  cost <- perf@x.values[[1]] * cost.fp * sum(data[, actual] == 0) +
          (1 - perf@y.values[[1]]) * cost.fn * sum(data[, actual] == 1)
  cost_dt <- data.frame(cutoff = pred@cutoffs[[1]], cost = cost)

  # optimal cutoff value and the corresponding true positive and false positive rates
  best_index  <- which.min(cost)
  best_cost   <- cost_dt[best_index, "cost"]
  best_tpr    <- roc_dt[best_index, "tpr"]
  best_fpr    <- roc_dt[best_index, "fpr"]
  best_cutoff <- pred@cutoffs[[1]][best_index]

  # area under the curve
  auc <- performance(pred, "auc")@y.values[[1]]

  # normalize the cost to [0, 1] to assign colors
  normalize <- function(v) (v - min(v)) / diff(range(v))

  # create colors from a palette for the 100 generated thresholds between 0 and 1,
  # then normalize each cost and assign a color to it: the higher the cost, the darker
  # (don't multiply by 100, or there will be 0s in the index vector)
  col_ramp <- colorRampPalette(c("green", "orange", "red", "black"))(100)
  col_by_cost <- col_ramp[ceiling(normalize(cost) * 99) + 1]

  roc_plot <- ggplot(roc_dt, aes(fpr, tpr)) +
    geom_line(color = rgb(0, 0, 1, alpha = 0.3)) +
    geom_point(color = col_by_cost, size = 4, alpha = 0.2) +
    geom_segment(aes(x = 0, y = 0, xend = 1, yend = 1), alpha = 0.8, color = "royalblue") +
    labs(title = "ROC", x = "False Positive Rate", y = "True Positive Rate") +
    geom_hline(yintercept = best_tpr, alpha = 0.8, linetype = "dashed", color = "steelblue4") +
    geom_vline(xintercept = best_fpr, alpha = 0.8, linetype = "dashed", color = "steelblue4")

  cost_plot <- ggplot(cost_dt, aes(cutoff, cost)) +
    geom_line(color = "blue", alpha = 0.5) +
    geom_point(color = col_by_cost, size = 4, alpha = 0.5) +
    ggtitle("Cost") +
    scale_y_continuous(labels = comma) +
    geom_vline(xintercept = best_cutoff, alpha = 0.8, linetype = "dashed", color = "steelblue4")

  # the main title for the two arranged plots
  sub_title <- sprintf("Cutoff at %.2f - Total Cost = %g, AUC = %.3f", best_cutoff, best_cost, auc)

  # arrange into a side-by-side plot
  plot <- arrangeGrob(roc_plot, cost_plot, ncol = 2,
                      top = textGrob(sub_title, gp = gpar(fontsize = 16, fontface = "bold")))

  return(list(plot = plot,
              cutoff = best_cutoff,
              totalcost = best_cost,
              auc = auc,
              sensitivity = best_tpr,
              specificity = 1 - best_fpr))
}

# Using ROCInfo to choose the optimal cutoff value and output confusion matrices
# for the train and test data
evaluation <- function(fit.train, actual.train, fit.test, actual.test, cost.fp, cost.fn){
  data.train <- cbind(fit.train, actual.train)
  select.roc <- ROCInfo(data = data.train, predict = "fit.train", actual = "actual.train",
                        cost.fp = cost.fp, cost.fn = cost.fn)
  cutoff <- select.roc$cutoff
  grid.draw(select.roc$plot)
  confusion.matrix.train <- confusionMatrix(as.factor(ifelse(fit.train