library("MASS") library("lme4") library("Matrix") library("glmmLasso") library("dplyr") library("caret") library("openxlsx") library("ROCR") library("grid") library("gridExtra") library("scales") #making dataset for regression Regression.data <- function(data,segment,reponse,predictors,cluster){ #segment is "Tumor" or "Immune" segment.data <- dplyr::filter(data, Segment_type == segment) regress.data <- log2(segment.data[,names(segment.data) %in% predictors]) regress.data$response <- segment.data[,names(segment.data)==response] regress.data$cluster <- factor(segment.data[,names(segment.data)==cluster]) regress.data <- regress.data[complete.cases(regress.data),] training.samples <- createDataPartition(factor(regress.data$response), p = 0.8, list = FALSE) train.data <- regress.data[training.samples, ] test.data <- regress.data[-training.samples, ] return(list(regress.data=regress.data,train.data=train.data,test.data=test.data)) } lasso.form <- function(predictors){ fixeffect <- as.formula(paste("response~", paste(predictors,collapse="+"))) return(fixeffect) } #lambda selection based on BIC bic.glmmLasso <- function(dat, predictors, lambda.range, family=stats::gaussian(link = "identity")){ BIC_vec<-rep(Inf,length(lambda.range)) fix.effect <- lasso.form(predictors) for(j in 2:length(lambda.range)){ suppressMessages({ suppressWarnings({ glm1 <- try(glmmLasso(fix = fix.effect, rnd = list(cluster =~1), family = family, data = dat, lambda=lambda.range[j],switch.NR=T,final.re=TRUE), silent=TRUE) }) }) if(class(glm1)!="try-error") { BIC_vec[j]<-glm1$bic } } opt<-which.min(BIC_vec) suppressWarnings({ glm1_final <- glmmLasso(fix=fix.effect, rnd = list( cluster =~1), family = family, data = dat, lambda=lambda.range[opt],switch.NR=F,final.re=TRUE) glm1_final }) return(list(fit.opt=glm1_final,BIC_path=BIC_vec,opt.lambda=lambda.range[opt]))} [ROCInfo] : # Pass in the data that already consists the predicted score and actual outcome. # to obtain the ROC curve # @data : your data.table or data.frame type data that consists the column # of the predicted score and actual outcome # @predict : predicted score's column name # @actual : actual results' column name # @cost.fp : associated cost for a false positive # @cost.fn : associated cost for a false negative # return : a list containing # 1. plot : a side by side roc and cost plot, title showing optimal cutoff value # title showing optimal cutoff, total cost, and area under the curve (auc) # 2. cutoff : optimal cutoff value according to the specified fp/fn cost # 3. totalcost : total cost according to the specified fp/fn cost # 4. auc : area under the curve # 5. sensitivity : TP / (TP + FN) # 6. 
# [ROCInfo] :
# Pass in data that already contains the predicted score and the actual outcome
# to obtain the ROC curve.
# @data    : your data.table or data.frame type data that contains the columns
#            of the predicted score and the actual outcome
# @predict : predicted score's column name
# @actual  : actual results' column name
# @cost.fp : associated cost for a false positive
# @cost.fn : associated cost for a false negative
# return   : a list containing
#            1. plot        : a side-by-side ROC and cost plot, with a title showing the
#                             optimal cutoff, total cost, and area under the curve (AUC)
#            2. cutoff      : optimal cutoff value according to the specified fp/fn cost
#            3. totalcost   : total cost according to the specified fp/fn cost
#            4. auc         : area under the curve
#            5. sensitivity : TP / (TP + FN)
#            6. specificity : TN / (FP + TN)
ROCInfo <- function(data, predict, actual, cost.fp, cost.fn){
  # calculate the values using the ROCR library:
  # true positive rate, false positive rate
  pred <- prediction(data[, predict], data[, actual])
  perf <- performance(pred, "tpr", "fpr")
  roc_dt <- data.frame(fpr = perf@x.values[[1]], tpr = perf@y.values[[1]])

  # cost with the specified false positive and false negative cost:
  # false positive rate * number of negative instances * false positive cost +
  # false negative rate * number of positive instances * false negative cost
  cost <- perf@x.values[[1]] * cost.fp * sum(data[, actual] == 0) +
          (1 - perf@y.values[[1]]) * cost.fn * sum(data[, actual] == 1)
  cost_dt <- data.frame(cutoff = pred@cutoffs[[1]], cost = cost)

  # optimal cutoff value and the corresponding true positive and false positive rates
  best_index  <- which.min(cost)
  best_cost   <- cost_dt[best_index, "cost"]
  best_tpr    <- roc_dt[best_index, "tpr"]
  best_fpr    <- roc_dt[best_index, "fpr"]
  best_cutoff <- pred@cutoffs[[1]][best_index]

  # area under the curve
  auc <- performance(pred, "auc")@y.values[[1]]

  # normalize the cost to [0, 1] to assign colors
  normalize <- function(v) (v - min(v)) / diff(range(v))

  # create colors from a palette for the 100 generated thresholds between 0 and 1,
  # then normalize each cost and assign a color to it: the higher the cost, the darker
  # (don't multiply by 100, or there will be 0s in the index vector)
  col_ramp <- colorRampPalette(c("green", "orange", "red", "black"))(100)
  col_by_cost <- col_ramp[ceiling(normalize(cost) * 99) + 1]

  roc_plot <- ggplot(roc_dt, aes(fpr, tpr)) +
    geom_line(color = rgb(0, 0, 1, alpha = 0.3)) +
    geom_point(color = col_by_cost, size = 4, alpha = 0.2) +
    geom_segment(aes(x = 0, y = 0, xend = 1, yend = 1), alpha = 0.8, color = "royalblue") +
    labs(title = "ROC", x = "False Positive Rate", y = "True Positive Rate") +
    geom_hline(yintercept = best_tpr, alpha = 0.8, linetype = "dashed", color = "steelblue4") +
    geom_vline(xintercept = best_fpr, alpha = 0.8, linetype = "dashed", color = "steelblue4")

  cost_plot <- ggplot(cost_dt, aes(cutoff, cost)) +
    geom_line(color = "blue", alpha = 0.5) +
    geom_point(color = col_by_cost, size = 4, alpha = 0.5) +
    ggtitle("Cost") +
    scale_y_continuous(labels = comma) +
    geom_vline(xintercept = best_cutoff, alpha = 0.8, linetype = "dashed", color = "steelblue4")

  # the main title for the two arranged plots
  sub_title <- sprintf("Cutoff at %.2f - Total Cost = %g, AUC = %.3f", best_cutoff, best_cost, auc)

  # arrange into a side-by-side plot
  plot <- arrangeGrob(roc_plot, cost_plot, ncol = 2,
                      top = textGrob(sub_title, gp = gpar(fontsize = 16, fontface = "bold")))

  return(list(plot = plot,
              cutoff = best_cutoff,
              totalcost = best_cost,
              auc = auc,
              sensitivity = best_tpr,
              specificity = 1 - best_fpr))
}

# Using ROCInfo to choose the optimal cutoff value and output confusion matrices
# for the train and test data
evaluation <- function(fit.train, actual.train, fit.test, actual.test, cost.fp, cost.fn){
  data.train <- cbind(fit.train, actual.train)
  select.roc <- ROCInfo(data = data.train, predict = "fit.train", actual = "actual.train",
                        cost.fp = cost.fp, cost.fn = cost.fn)
  cutoff <- select.roc$cutoff
  grid.draw(select.roc$plot)
  confusion.matrix.train <- confusionMatrix(as.factor(ifelse(fit.train