# ---- Legacy experiments on the training set ----
# trainingF2 is df_recat as factor, with the S_15 and S_30 series replaced by
# binary versions; S_06, S_28 and S_19 are likewise duplicated as factors.
# The "open" columns are removed because the tree does not acknowledge them.
# Age is removed as well; only aldr_kat (age category) is kept.
# S_09 is additionally merged into one binary variable (digital or not), on
# top of the existing S_09 series being factors.
# Further consideration: change all 2-level factors to 3 levels so that the
# NAs become a level of their own.

# Drop identifier and weight columns so they cannot be used as predictors.
id_cols <- c("FODT", "kommnr", "vekt")
trainingF2 <- trainingF2[setdiff(names(trainingF2), id_cols)]
trainingF3 <- trainingF3[setdiff(names(trainingF3), id_cols)]

tree7 <- ctree(
  S_15Dcombined ~ .,
  data = trainingF2,
  controls = ctree_control(mincriterion = 0.99, minsplit = 100)
)
plot(tree7)

tree8 <- ctree(
  S_15Dcombined ~ .,
  data = trainingF3,
  controls = ctree_control(mincriterion = 0.95, minsplit = 100)
)
plot(tree8)

tree9 <- ctree(
  S_15Dcombined ~ .,
  data = trainingF3,
  controls = ctree_control(mincriterion = 0.99, minsplit = 100)
)
plot(tree9)

# node_barplot() only builds a panel function; it has to be handed to plot()
# to have a visible effect.
plot(tree9, terminal_panel = node_barplot, tp_args = list(id = TRUE))

# Class distributions for the first rows and for every terminal node.
treeresponse(tree9, newdata = trainingF3[1:98, ])
nodes(tree9, unique(where(tree9)))
str(tree9)

# Confusion matrix on the training data. The accuracy is computed directly
# instead of adding the cells by hand. Then do the same on the test data and
# compare the two numbers.
tab_tree9 <- table(predict(tree9), trainingF3$S_15Dcombined)
tab_tree9
sum(diag(tab_tree9)) / sum(tab_tree9)

# Anyway, everything needs to live in one tree data frame so that we can run
# 10-fold cross-validation instead of a single (poor) train/test split.

# Below is copy-paste from training; use it to create the same thing with a
# tree data frame instead.

# Making the training data factors. Assigning into `trainingF[]` keeps the
# data.frame; chaining a second `<-` in front of it would replace trainingF
# with the plain list returned by lapply().
trainingF[] <- lapply(trainingF, factor)

trainingF2 <- trainingF
trainingF2 <- trainingF2[setdiff(
  names(trainingF2),
  c("S_30E", "S_30EBinary", "S_06", "S_15A", "S_15B", "S_15C")
)]

training <- training %>%
  mutate(
    S_15EBinary = ifelse(S_15E == 5 | S_15E == 4, "Availability Good", "Bad")
  )
# The validation set is recoded from itself; deriving it from `training`
# would overwrite it with a copy of the training data.
# NOTE(review): assumes `validation` already exists at this point - confirm.
validation <- validation %>%
  mutate(
    S_15EBinary = ifelse(S_15E == 5 | S_15E == 4, "Availability Good", "Bad")
  )

training$BRANSJE <- as.factor(training$BRANSJE)
training$aldr_kat <- as.factor(training$aldr_kat)
training$S_16Binary <- as.factor(training$S_16Binary)
training <- training[setdiff(
  names(training),
  c("s_20_open", "Bransje_open", "s_09r_open", "S_15D")
)]

# |||||||||||||||||||||||||||| DATA PREPARATION ||||||||||||||||||||||||||||
df <- read_xlsx(file.path(here(), "Digitalt_stress_varen2017_numerisk.xlsx"))
df_tree <- df

# Three-level response built from S_15D.
df_tree$S_15DBinary <- cut(
  df_tree$S_15D,
  breaks = c(0, 3, 5, 6),
  labels = c("Not Stressed", "stressed", "None")
)
summary(df_tree$S_15DBinary)

# To make a three-level dependent variable on df_tree2 as well. This has to
# run after df_tree exists.
# NOTE(review): df_tree2 is not created in this part of the script - confirm
# it exists and has the same rows as df_tree.
df_tree2$S_15DBinary <- df_tree$S_15DBinary

# One binary flag for the whole S_09 series. cut() cannot be applied to the
# logical result of `S_09A & S_09B & ...`, so every item is compared
# explicitly.
# NOTE(review): assumes "Digital" means every S_09 item is <= 4 - confirm.
s09_cols <- paste0("S_09", LETTERS[1:17]) # S_09A ... S_09Q
s09_all_low <- Reduce(`&`, lapply(df_tree[s09_cols], function(x) x <= 4))
df_tree$S_09Binary <- factor(ifelse(s09_all_low, "Digital", "Not Digital"))

# Below is a sample of an earlier mutation, just an example.
df_recat <- df_recat %>%
  mutate(
    S_06Binary = ifelse(
      S_06 == 1 | S_06 == 2 | S_06 == 3,
      "Leadership",
      "No Leader"
    )
  )
df_recat$S_06Binary <- as.factor(df_recat$S_06Binary)
# Carry the leadership flag over from df_recat, then inspect it.
df_tree$S_06Binary <- df_recat$S_06Binary
str(df_recat$S_06Binary)
str(df_tree$S_06Binary)

# One binary flag for the whole S_09 series. In `S_09A & ... & S_09Q <= 4`
# the comparison binds tighter than `&`, so only S_09Q would be compared to 4
# and the other columns would merely be tested for being non-zero. Each item
# is therefore compared on its own before the results are combined.
# NOTE(review): assumes "Digital" means every S_09 item is <= 4 - confirm.
s09_cols <- paste0("S_09", LETTERS[1:17]) # S_09A ... S_09Q
s09_all_low <- Reduce(`&`, lapply(df_tree[s09_cols], function(x) x <= 4))
df_tree$S_09Binary <- as.factor(
  if_else(s09_all_low, "Digital", "Not Digital")
)
summary(df_tree$S_09Binary)

# Recode the Likert items into labelled categories.
df_tree$S_15ABinary <- cut(
  df_tree$S_15A, c(0, 3, 5, 6),
  labels = c("Digital Work-Relief", "Digital Workload", "None")
)
df_tree$S_15BBinary <- cut(
  df_tree$S_15B, c(0, 3, 5, 6),
  labels = c("Digital Relief", "Digital Demand", "None")
)
df_tree$S_15CBinary <- cut(
  df_tree$S_15C, c(0, 3, 5, 6),
  labels = c("Digital Time-Relief", "Time-Pressure", "None")
)
df_tree$S_15EBinary <- cut(
  df_tree$S_15E, c(0, 3, 5, 6),
  labels = c("Availability Positive", "Availability Negative", "None")
)
df_tree$S_16Binary <- cut(
  df_tree$S_16, c(0, 3, 4, 5),
  labels = c("Available", "Unavailable", "None")
)
df_tree$S_19Binary <- cut(
  df_tree$S_19, c(0, 4, 5, 6),
  labels = c("Involved", "Rare-Never", "None")
)
summary(df_tree$S_19Binary)
df_tree$S_23Binary <- cut(
  df_tree$S_23, c(0, 3, 5, 6, 7),
  labels = c("Supported", "Rare-Never", "No need", "None")
)
summary(df_tree$S_23Binary)

# Free-text columns and the raw response must not enter the tree.
df_tree <- df_tree[setdiff(
  names(df_tree),
  c("s_20_open", "Bransje_open", "s_09r_open", "S_15D")
)]

PreviewTree <- ctree(
  S_15DBinary ~ .,
  data = df_tree,
  controls = ctree_control(mincriterion = 0.99, minsplit = 100)
)
plot(PreviewTree)
# The NAs in the response are making the plot poorer; consider removing them
# all.

df_tree$S_28Binary <- cut(
  df_tree$S_28, c(0, 3, 5, 6),
  labels = c("Digital Training", "Rare-None", "None")
)
summary(df_tree$S_28Binary)
df_tree$S_29Binary <- cut(
  df_tree$S_29, c(0, 3, 5, 6),
  labels = c("Digital Support", "Not Supported", "None")
)
df_tree$S_30ABinary <- cut(
  df_tree$S_30A, c(0, 2, 6, 7),
  labels = c("Workload ok", "Stressed", "None")
)
df_tree$S_30BBinary <- cut(
  df_tree$S_30B, c(0, 2, 6, 7),
  labels = c("Time ok", "Time-Stress", "None")
)
df_tree$S_30CBinary <- cut(
  df_tree$S_30C, c(0, 2, 6, 7),
  labels = c("Mistakes ok", "Mistake-Stress", "None")
)
df_tree$S_30DBinary <- cut(
  df_tree$S_30D, c(0, 2, 6, 7),
  labels = c("Wage ok", "Wage-Stress", "None")
)
df_tree$S_30EBinary <- cut(
  df_tree$S_30E, c(0, 2, 6, 7),
  labels = c("Techno ok", "Technostress", "None")
)
df_tree$S_30FBinary <- cut(
  df_tree$S_30F, c(0, 2, 6, 7),
  labels = c("Available ok", "Available-Stress", "None")
)

# Testing the tree again before cutting duplicates.
PreviewTree2 <- ctree(
  S_15DBinary ~ .,
  data = df_tree,
  controls = ctree_control(mincriterion = 0.99, minsplit = 100)
)
plot(PreviewTree2)

# Duplicates ruin the tree: drop every raw column that now has a recoded twin.
raw_cols <- c(
  "S_15A", "S_15B", "S_15C", "S_15E", "S_16", "S_19", "S_23", "S_28", "S_29",
  "S_30A", "S_30B", "S_30C", "S_30D", "S_30E", "S_30F"
)
df_tree <- df_tree[setdiff(names(df_tree), raw_cols)]

# Tree purged from duplicates, trying again.
PreviewTree2 <- ctree(
  S_15DBinary ~ .,
  data = df_tree,
  controls = ctree_control(mincriterion = 0.99, minsplit = 100)
)
plot(PreviewTree2)

# Consider making a tree just to show how things would be without the "None"
# (I don't know) level. Here I do, and additionally I remove S_30E, the
# question that asks to what extent they are technostressed.
df_treeTest <- df_tree
summary(df_factor$S_15Dcombined)
str(df_factor$S_15Dcombined)
df_treeTest$S_15DBinary <- df_factor$S_15Dcombined
df_treeTest$S_14A <- cut(
  df_treeTest$S_14A, c(0, 3, 5, 6),
  labels = c("Unreliable Tech", "Reliable Tech", "None")
)
df_treeTest$S_14B <- cut(
  df_treeTest$S_14B, c(0, 3, 5, 6),
  labels = c("Not User-Friendly", "User-Friendly", "None")
)
df_treeTest$S_14C <- cut(
  df_treeTest$S_14C, c(0, 3, 5, 6),
  labels = c("Slow Tech", "Fast Tech", "None")
)
# S_29Binary is deliberately not recomputed here: the raw S_29 column was
# dropped above, so cut() would receive NULL and fail. The S_29Binary column
# created earlier is still in place.

# This tree was made still with S_30E, which did nothing logical.
PreviewTree3 <- ctree(
  S_15DBinary ~ . - S_30EBinary,
  data = df_treeTest,
  controls = ctree_control(mincriterion = 0.95, minsplit = 50)
)
plot(PreviewTree3)
# Raw response for reference, read from the original import.
str(df$S_15D)

# A tree without S_30EBinary.
# NOTE(review): the original remark stopped mid-sentence ("because its the
# same question as"); presumably it refers to the technostress response -
# confirm.
df_treeTest$S_30EBinary <- NULL
PreviewTree4 <- ctree(
  S_15DBinary ~ .,
  data = df_treeTest,
  controls = ctree_control(mincriterion = 0.95, minsplit = 50)
)
plot(PreviewTree4)
PreviewTree4

# predict() takes `newdata`; a `data =` argument is silently ignored.
pred <- predict(PreviewTree4, newdata = df_treeTest)
table(df_treeTest$S_15DBinary, pred)
confusionMatrix(pred, df_treeTest$S_15DBinary)

# Let's make cross-validation samples. The resampling scheme only takes
# effect when it is handed to train() through `trControl`.
set.seed(333)
RandomCV <- trainControl(
  method = "repeatedcv",
  number = 5,
  repeats = 3,
  search = "random",
  savePredictions = TRUE
)
CVModel <- train(
  S_15DBinary ~ .,
  data = na.exclude(df_treeTest),
  method = "ctree",
  trControl = RandomCV,
  controls = ctree_control(mincriterion = 0.95, minsplit = 50)
)
CVModel$bestTune

# Variable importance and resampled confusion matrix of the caret model.
# party::varimp() is not used here: it was called on an undefined object and
# does not accept a caret `train` object.
ctreeVarImp <- varImp(CVModel)
ctreeVarImp
confusionMatrix(CVModel)

# Split into 80% training and 20% validation rows.
set.seed(5555)
validPD <- sample(2, nrow(df_treeTest), replace = TRUE, prob = c(0.8, 0.2))
Treetrain <- df_treeTest[validPD == 1, ]
TreeValid <- df_treeTest[validPD == 2, ]

# And five additional random validation sets of roughly 20% each.
# NOTE(review): these are drawn from all rows of df_treeTest, so they overlap
# with Treetrain; scores on them are not independent of the training data.
XvalidPD <- sample(5, nrow(df_treeTest), replace = TRUE, prob = rep(0.2, 5))
Xvalid1 <- df_treeTest[XvalidPD == 1, ]
Xvalid2 <- df_treeTest[XvalidPD == 2, ]
Xvalid3 <- df_treeTest[XvalidPD == 3, ]
Xvalid4 <- df_treeTest[XvalidPD == 4, ]
Xvalid5 <- df_treeTest[XvalidPD == 5, ]
# What I did here: I used the standard data partitioning to generate 5 random
# validation sets, while my training set is the standard 0.8 share.
# The aim is to limit overfitting and bias, as recommended by several authors.
# Here are the formulas to get the confusion matrices.
# With them I will visualise and calculate misclassification errors.
# I believe we must plot the training set and make it an independent tree
# first!
TrainedTree1 <- ctree(
  S_15DBinary ~ .,
  data = Treetrain,
  controls = ctree_control(mincriterion = 0.95, minsplit = 50)
)
plot(TrainedTree1)

# Confusion matrix and misclassification rate on the training set.
tab <- table(predict(TrainedTree1), Treetrain$S_15DBinary)
print(tab)
1 - sum(diag(tab)) / sum(tab)
# Matrix for the training set
#                not stressed stressed
# not stressed           1131      223
# stressed                158      359
# > 1 - sum(diag(tab)) / sum(tab)
# [1] 0.2036344
# 20.36% misclassification

# Predict classes for the validation set.
pred <- predict(TrainedTree1, newdata = TreeValid)
table(TreeValid$S_15DBinary, pred)
#                pred
#                not stressed stressed
# not stressed            297       42
# stressed                 79      104

# Evaluating the model predictions.
confusionMatrix(pred, TreeValid$S_15DBinary)
# Confusion Matrix and Statistics
#
#               Reference
# Prediction     not stressed stressed
#   not stressed          297       79
#   stressed               42      104
#
#                Accuracy : 0.7682
#                  95% CI : (0.7296, 0.8038)
#     No Information Rate : 0.6494
#     P-Value [Acc > NIR] : 2.834e-09
#
#                   Kappa : 0.4661
#
#  Mcnemar's Test P-Value : 0.001065
#
#             Sensitivity : 0.8761
#             Specificity : 0.5683
#          Pos Pred Value : 0.7899
#          Neg Pred Value : 0.7123
#              Prevalence : 0.6494
#          Detection Rate : 0.5690
#    Detection Prevalence : 0.7203
#       Balanced Accuracy : 0.7222
#
#        'Positive' Class : not stressed
# ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||

# Trying the same with a fast method for the 10-fold cross-validation.
library(caret)
TenFoldValid <- train(
  S_15DBinary ~ .,
  data = df_treeTest,
  method = "ctree",
  trControl = trainControl(method = "cv", number = 10),
  na.action = na.exclude,
  tuneLength = 5
)
# Print results.
TenFoldValid
# This prints the final model.
TenFoldValid$finalModel
# Model performance.
confusionMatrix(TenFoldValid)

Bootstrap <- train(
  S_15DBinary ~ .,
  data = Treetrain,
  method = "ctree",
  na.action = na.exclude,
  tuneLength = 5
)
Bootstrap
plot(Bootstrap)

# Good, but NAs are excluded during training, so the validation rows with
# NAs are dropped too. na.exclude has to be *called* on the data;
# `TreeValid <- na.exclude` would overwrite the data with the function
# itself.
TreeValid <- na.exclude(TreeValid)

# Let's also evaluate on the validation set.
Evaluation <- predict(Bootstrap, newdata = TreeValid)
confusionMatrix(Evaluation, TreeValid$S_15DBinary)

# An unassigned `TreeValid[complete.cases(TreeValid), ]` only prints the
# filtered rows, which is why it appeared to do nothing; the assignment
# above already keeps complete cases only.

# Will run a total clean-up of all NAs the hard way.
df_treeTest <- df_treeTest[setdiff(
  names(df_treeTest),
  c("kommnr", "kommstr", "FODT", "alder", "fylke")
)]
# The above had massive ranges and the formula could not take them; consider
# making all other numeric columns factors so they do not disturb the model.
# Remember that numeric was chosen to "increase" the background, meaning more
# education and age is more important. Here we have all doors open!
# MUST BE FACTOR. Numeric as factor is my suggestion.
str(df_treeTest)
summary(df_treeTest)
# NAs in S_10A, B and C; S_17a, b, c, d; S_20_1, 2, 3, 4, 5; S_21 and S_26.
# Will impute them as the "I don't know" alternative, which is 19 for the 10
# series and 5 for the rest.
# Preview of the imputation. The replacement goes in `replace`; a `value =`
# argument is not a parameter of replace_na() and leaves the NAs untouched.
table(replace_na(df_treeTest$S_10A, replace = 19), useNA = "always")

# Impute the "I don't know" code: 19 for the S_10 series, 5 for the rest.
dont_know_19 <- c("S_10A", "S_10B", "S_10C")
dont_know_5 <- c(
  "S_17a", "S_17b", "S_17c", "S_17d",
  "S_20_1", "S_20_2", "S_20_3", "S_20_4", "S_20_5",
  "aldr_kat", "S_21", "S_26"
)
df_treeTest[dont_know_19] <- lapply(
  df_treeTest[dont_know_19], replace_na,
  replace = 19
)
df_treeTest[dont_know_5] <- lapply(
  df_treeTest[dont_know_5], replace_na,
  replace = 5
)

# Age category as a five-level factor.
df_treeTest$aldr_kat <- cut(df_treeTest$aldr_kat, breaks = 0:5, labels = 1:5)
summary(df_treeTest$aldr_kat)

df_treeTest$S_12A <- df_tree$S_12A
# The 12 series was recorded as percentages with more than 50 different
# scores, as the respondent could answer at will. Anything from 50% upwards
# is categorised as "often", anything below as "rare".
# Note: the "I don't know" answers (999) are merged into zero first, so none
# are 999 any more, only 0, as we do not have good data for them.
# Note that in its original form this variable had no relevance in the
# preview and experimental models.
s12_often <- c(
  S_12A = "Core Often",
  S_12B = "Report Often",
  S_12C = "Coord Often",
  S_12D = "Other Often"
)
df_treeTest[names(s12_often)] <- Map(
  function(share, often_label) {
    share[share %in% 999] <- 0 # "I don't know" counts as 0%
    as.factor(ifelse(share >= 50, often_label, "Rare"))
  },
  df_treeTest[names(s12_often)],
  s12_often
)
summary(df_treeTest$S_12A)
str(df_treeTest$S_12A)

str(df_treeTest$aldr_kat)
# aldr_kat still has one NA in the entire data set. Need to purge.
df_treeTest$vekt <- NULL

# Moment of truth: I believe we will require a factor version to get the
# optimal version of the data set for the models.
df_treeTest$kommstr <- df_tree$kommstr
# The command above removed the faulty values that got created because of a
# typo lacking the dollar sign.
str(df_treeTest$kommstr)

# The factor copy is created after kommstr has been restored, so the column
# comes along with it. Writing to df_treeFactor$kommstr before df_treeFactor
# exists would fail.
df_treeFactor <- df_treeTest
# Making df_treeFactor into factors for all the variables. Grand!
df_treeFactor <- data.frame(lapply(df_treeFactor, as.factor))
summary(df_treeTest)
str(df_treeFactor)

PreviewTree5 <- ctree(
  S_15DBinary ~ .,
  data = df_treeFactor,
  controls = ctree_control(mincriterion = 0.95, minsplit = 100)
)
plot(PreviewTree5)
# Looks great. The only issue is that the 10 series might be too long: up to
# 19 levels may not work. Excluding them is an option, or even converting
# back to numeric. Or reduce them to the popular categories, such as
# 1 = PC, 2 = Handy, 3 = eCard, 4 = Other. Like this:
device_labels <- c("PC", "Handy", "eCard", "Other")
df_treeFactor$S_10A <- cut(
  df_treeTest$S_10A, c(0, 1, 2, 3, 19),
  labels = device_labels
)
df_treeFactor$S_10B <- cut(
  df_treeTest$S_10B, c(0, 1, 2, 3, 19),
  labels = device_labels
)
df_treeFactor$S_10C <- cut(
  df_treeTest$S_10C, c(0, 1, 2, 3, 19),
  labels = device_labels
)
# cut() needs numeric input; df_treeFactor$kommstr is already a factor at
# this point, so the numeric column from df_treeTest is recoded instead (the
# same source as for the S_10 series above).
df_treeFactor$kommstr <- cut(
  df_treeTest$kommstr, c(0, 1, 2, 3),
  labels = c("small", "Medium", "Large")
)
str(df_treeFactor$kommstr)

PreviewTree7 <- ctree(
  S_15DBinary ~ .,
  data = df_treeFactor,
  controls = ctree_control(mincriterion = 0.95, minsplit = 200)
)
plot(PreviewTree7)
PreviewTree7
pred <- predict(PreviewTree7, newdata = df_treeFactor)
table(df_treeFactor$S_15DBinary, pred)
confusionMatrix(pred, df_treeFactor$S_15DBinary)

# OK, let's split the data as the manual says: a stratified 80/20 partition.
# Both parts are taken from df_treeFactor so that they keep the recoded S_10
# and kommstr columns and share identical factor levels. Re-factoring each
# part separately can give the test set levels the model has never seen.
set.seed(5555)
tFPartition <- createDataPartition(
  df_treeFactor$S_15DBinary,
  p = 0.80,
  list = FALSE
)
tFtrain <- df_treeFactor[tFPartition, ]
tFtest <- df_treeFactor[-tFPartition, ]
summary(tFtrain)

# Let's try to cross-validate using the ones we created.
fit4 <- train(
  S_15DBinary ~ .,
  data = df_treeFactor,
  method = "ctree",
  trControl = trainControl(method = "cv", number = 10),
  tuneLength = 5
)
fit4
# Note: fit4 uses df_treeFactor as a whole, not the training set.
fit4$finalModel # Prints nicely
confusionMatrix(fit4) # Prints nicely; the performance is at about 77%

PreviewTree6 <- ctree(
  S_15DBinary ~ .,
  data = tFtrain,
  controls = ctree_control(mincriterion = 0.95, minsplit = 50)
)
plot(PreviewTree6)

# Let's train.
fit5 <- train(
  S_15DBinary ~ .,
  data = tFtrain,
  method = "ctree",
  trControl = trainControl(method = "cv", number = 10),
  tuneLength = 5
)
fit5
plot(fit5)
fit5$finalModel

# Training data: simple matrix.
table(predict(fit5), tFtrain$S_15DBinary)
pred <- predict(fit5, newdata = tFtrain)
confusionMatrix(pred, tFtrain$S_15DBinary)

PreviewTree8 <- ctree(
  S_15DBinary ~ .,
  data = tFtrain,
  controls = ctree_control(mincriterion = 0.99, minsplit = 50)
)
plot(PreviewTree8)

# Let's stress-test the training, evaluating the model with the test part
# tFtest.
pred2 <- predict(fit5, newdata = tFtest)
confusionMatrix(pred2, tFtest$S_15DBinary)
# Predict probabilities.
predict(fit5, tFtest, type = "prob")
# So far everything is OK. A good walk-through.

# Suggest comparing this to the rpart tree as well, doing the same procedure.
library(rpart)
Tree_rpart <- rpart(S_15DBinary ~ ., tFtrain)
library(rpart.plot)
# The bare `1` after `extra = 4` was matched positionally to `type`; it is
# now named.
rpart.plot(Tree_rpart, type = 1, extra = 4)
# Prediction.
predict(Tree_rpart, tFtest)

# Misclassification error for the training set.
Training_tab <- table(predict(fit5), tFtrain$S_15DBinary)
1 - sum(diag(Training_tab)) / sum(Training_tab)

# REMEMBER: just follow the online tutorial, don't mix in the video.
# http://www.just.edu.jo/~haalshraideh/Courses/IE759/DT3.html
library(caret)
# The below is the same procedure for rpart.
# |||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
# Define training control.
train_control <- trainControl(method = "cv", number = 10)
# Train the model.
model_rpart <- train(
  S_15DBinary ~ .,
  data = tFtrain,
  trControl = train_control,
  method = "rpart"
)
# Make predictions.
predictions_rpart <- predict(model_rpart, tFtrain)
# Append predictions.
MyData_rpart <- cbind(tFtrain, predictions_rpart)
# Summarise results. The result gets its own name so that it does not shadow
# caret's confusionMatrix() function.
cm_rpart <- confusionMatrix(predictions_rpart, tFtrain$S_15DBinary)
predictions_rpart
cm_rpart

# Tuned rpart fit; it is named fitRpart, so the method must be "rpart".
fitRpart <- train(
  S_15DBinary ~ .,
  data = tFtrain,
  method = "rpart",
  trControl = trainControl(method = "cv", number = 10),
  tuneLength = 5
)
fitRpart
# |||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
# Then build a performance distribution for discussion.

# First try on the variable importance. The resampling scheme only takes
# effect when it is handed to train() through `trControl`.
RandomCV <- trainControl(
  method = "repeatedcv",
  number = 5,
  repeats = 3,
  search = "random",
  savePredictions = TRUE
)
CVModel <- train(
  S_15DBinary ~ .,
  data = df_treeFactor,
  method = "ctree",
  trControl = RandomCV,
  controls = ctree_control(mincriterion = 0.99, minsplit = 50)
)
CVModel$bestTune
# party::varimp() does not work on a caret `train` object, and the original
# call also passed the undefined symbol `conditional`. caret's varImp()
# accepts the train object directly.
varImp(CVModel)
# Also trying variable importance after a random forest, running the whole
# process instead.
library(randomForest)

# Predictor tables (response removed) and response vectors for both parts.
rf.train <- tFtrain
rf.train$S_15DBinary <- NULL
rf.test <- tFtest
rf.test$S_15DBinary <- NULL
rf.label <- as.factor(tFtrain$S_15DBinary)
rf.label2 <- as.factor(tFtest$S_15DBinary)

# Baseline forest through the formula interface.
rf.1 <- randomForest(S_15DBinary ~ ., data = tFtrain, ntree = 1000)
rf.1
varImpPlot(rf.1)
# Lame scores: 22.32% OOB error rate, "stressed" wrongly predicted in 47.5%
# of the cases in the matrix.

# Error rate plot.
plot(rf.1)
# After 200 trees there is little improvement.

# Let's tune mtry. The argument is spelled `stepFactor`; the misspelt
# `sstepFactor` was silently swallowed by `...`, so the default step was used.
# NOTE(review): with stepFactor < 1 the "left"/"right" searches inflate and
# deflate mtry in the opposite sense to the default of 2 - confirm that 0.2
# is the intended step.
TunedRF <- tuneRF(
  x = rf.train,
  y = rf.label,
  stepFactor = 0.2,
  plot = TRUE,
  ntreeTry = 500,
  trace = TRUE,
  improve = 0.02
)
# Use the mtry with the lowest OOB error found by the search rather than a
# hand-copied constant (18 in the earlier run).
best_mtry <- TunedRF[which.min(TunedRF[, "OOBError"]), "mtry"]

# So let's make a new rf model that performs slightly better.
rf.2 <- randomForest(
  x = rf.train,
  y = rf.label,
  ntree = 1000,
  mtry = best_mtry,
  importance = TRUE,
  proximity = TRUE
)
print(rf.1)
print(rf.2)
plot(rf.1)
plot(rf.2)
varImpPlot(rf.2)

# Prediction & confusion matrix - training.
library(caret)
predict1 <- predict(rf.2, rf.train)
confusionMatrix(predict1, rf.label)

# Prediction & confusion matrix - test. Check whether the one above even
# applies, because we already have a matrix from rf.2 based on the training
# data.
predict2 <- predict(rf.2, rf.test)
confusionMatrix(predict2, rf.label2)

library(party)
# Number of nodes for the trees.
hist(treesize(rf.2), main = "No. of Nodes for all Trees", col = "grey")
varImpPlot(rf.2)
varImpPlot(
  rf.2,
  sort = TRUE,
  n.var = 10,
  main = "Top 10 Variable Importance"
)
# The same with S_30E instead of S_15D, because that variable also ranks the
# stress caused by digital tools. Let's see if we can make hypotheses based
# on that.
# Just flip the variables, put S_15E in the same order as the others, and
# build S_15Combined as one variable of all the technostress-related issues.
df_treeFactor2 <- df_treeFactor
df2 <- df

# Earlier draft for building S_15Combined, kept for reference only.
# NOTE(review): do not re-enable as is. `as.numeric()` on the cut() result
# returns the level codes 1..6 again, so S_15E is not reversed, and in
# `S_15A & ... & S_15E <= 4` only S_15E is compared to 4.
# df2$S_15E <- cut(df2$S_15E,c(0,1,2,3,4,5,6),labels=c("6","5","4","3","2","1"))
# str(df2$S_15E)
# df2$S_15E <- as.numeric(df2$S_15E)
# df_recat <- df2 %>% mutate(S_15Combined = if_else(S_15A & S_15B & S_15C & S_15D & S_15E <= 4, "Not Stressed","Stressed"))
# df_treeFactor2$S_15Combined <- as.factor(df_recat$S_15Combined)
# str(df_treeFactor2$S_15Combined)

df_treeFactor2 <- df_treeFactor2[setdiff(
  names(df_treeFactor2),
  c("S_15ABinary", "S_15BBinary", "S_15CBinary", "S_15EBinary")
)]

# Recode a factor-coded Likert item into labelled bins. The factor is turned
# back into its original numbers through its labels; as.numeric() on a factor
# returns the internal level codes, which differ from the answers whenever a
# value is missing from the data.
recode_item <- function(item, breaks, labels) {
  cut(as.numeric(as.character(item)), breaks = breaks, labels = labels)
}
df_treeFactor2$S_24B <- recode_item(
  df_treeFactor2$S_24B, c(0, 3, 5, 6),
  labels = c("Digital Fast", "Slow", "None")
)
df_treeFactor2$S_24A <- recode_item(
  df_treeFactor2$S_24A, c(0, 3, 5, 6),
  labels = c("Work Quality", "Work Poor", "None")
)
df_treeFactor2$S_27A <- recode_item(
  df_treeFactor2$S_27A, c(0, 2, 3, 5, 6),
  labels = c("Utilize Tools", "Neither", "Not Utilize", "None")
)
#####################################
summary(df_treeFactor2$S_30EBinary)
summary(df_tree$S_30E)

# Everything below models S_15Combined, which is only created by the
# commented-out draft above. Fail early with a clear message instead of a
# formula error inside ctree()/train().
if (!"S_15Combined" %in% names(df_treeFactor2)) {
  stop(
    "df_treeFactor2 has no S_15Combined column; create it before fitting.",
    call. = FALSE
  )
}
# NOTE(review): df_treeFactor2 still contains S_15DBinary, one of the items
# S_15Combined is meant to summarise - confirm it should remain a predictor.

PreviewTree9 <- ctree(
  S_15Combined ~ .,
  data = df_treeFactor2,
  controls = ctree_control(mincriterion = 0.95, minsplit = 50)
)
plot(PreviewTree9)

# Overview OK, new findings; let's train. The split must use the partition
# drawn here (tFPartition2). tFPartition belongs to the earlier model and
# holds row indices, not 1/2 group labels.
set.seed(6666)
tFPartition2 <- sample(
  2, nrow(df_treeFactor2),
  replace = TRUE, prob = c(0.8, 0.2)
)
tFtrain2 <- df_treeFactor2[tFPartition2 == 1, ]
tFtest2 <- df_treeFactor2[tFPartition2 == 2, ]

# Let's train.
fit6 <- train(
  S_15Combined ~ .,
  data = tFtrain2,
  method = "ctree",
  trControl = trainControl(method = "cv", number = 10),
  tuneLength = 5
)
fit6
plot(fit6)
fit6$finalModel

# The evaluation below was pasted from the fit5 section and still pointed at
# fit5 / tFtrain / S_15DBinary; it now evaluates the model trained above.

# Training data: simple matrix.
table(predict(fit6), tFtrain2$S_15Combined)
pred <- predict(fit6, newdata = tFtrain2)
confusionMatrix(pred, tFtrain2$S_15Combined)

PreviewTree10 <- ctree(
  S_15Combined ~ .,
  data = tFtrain2,
  controls = ctree_control(mincriterion = 0.99, minsplit = 50)
)
plot(PreviewTree10)

# Let's stress-test the training, evaluating the model with the test part
# tFtest2.
pred2 <- predict(fit6, newdata = tFtest2)
confusionMatrix(pred2, tFtest2$S_15Combined)
# Predict probabilities.
predict(fit6, tFtest2, type = "prob")