# trainingF2 is df_recat converted to factors, with the S_15 and S_30 series replaced by binary versions; S_06, S_28 and S_19 are likewise duplicated as factor versions.

# Also the "open" are removed as the tree does not acknowledge them. Likewise with Age is removed, only aldr_kat or age category is kept instead.

# The others are S_09 merged together into one binary variable representing either digital or not in addition to the existing S_09 series being factors. 

# Further considerations are: Changing all 2 level factors to 3 levels, so that we have the NA's as its own factor instead. 
# Remove the FODT, kommnr and vekt columns from both training frames so that
# the `S_15Dcombined ~ .` formulas below do not pick them up as predictors.
trainingF2$FODT <- NULL
trainingF2$kommnr <- NULL
trainingF2$vekt <- NULL

trainingF3$FODT <- NULL
trainingF3$kommnr <- NULL
trainingF3$vekt <- NULL

# tree7: trainingF2, mincriterion 0.99.
tree7 <- ctree(
  S_15Dcombined ~ .,
  data = trainingF2,
  controls = ctree_control(mincriterion = 0.99, minsplit = 100)
)
plot(tree7)

# tree8: trainingF3, looser mincriterion of 0.95.
tree8 <- ctree(
  S_15Dcombined ~ .,
  data = trainingF3,
  controls = ctree_control(mincriterion = 0.95, minsplit = 100)
)
plot(tree8)

# tree9: trainingF3 again, mincriterion back at 0.99.
tree9 <- ctree(
  S_15Dcombined ~ .,
  data = trainingF3,
  controls = ctree_control(mincriterion = 0.99, minsplit = 100)
)
plot(tree9)

# Interactive help lookups used while exploring; kept as comments so that
# sourcing the script does not open help pages.
# ??nodeapply
# ?ctree
# ?treeresponse

# node_barplot() only builds a panel function, so calling it on its own draws
# nothing. Hand it to plot() as the terminal panel instead.
plot(tree9,
     terminal_panel = node_barplot,
     tp_args = list(col = "black", id = TRUE))

# Per-observation response distributions for the first 98 training rows.
# (The former bare `treeresponse()` call had no object argument and errored.)
treeresponse(tree9, newdata = trainingF3[1:98, ])

# Terminal nodes that are actually reached by the training observations.
nodes(tree9, unique(where(tree9)))

str(tree9)

# Training confusion matrix. predict() needs the fitted tree, not the data
# frame (`predict(trainingF3)` was the earlier mistake).
train_tab <- table(predict(tree9), trainingF3$S_15Dcombined)
print(train_tab)

# Misclassification rate computed directly instead of adding the cells by
# hand; repeat on the test data and compare the two rates.
1 - sum(diag(train_tab)) / sum(train_tab)


# Anyway, all the stuff needs to be in a TreeData df so that we can make 10-fold cross-validation instead of only train and test that is poor. 

# Below is copy-pasted from the training preparation; use it to create the same columns, just with a tree-data df instead.

# Making the training data as factor.. 

# Convert every column of trainingF to a factor in place. Assigning through
# `trainingF[]` keeps the data-frame structure. The former chained form
# `trainingF <- trainingF[] <- lapply(...)` ended by assigning the bare list
# returned by lapply() to trainingF, so it stopped being a data frame.
trainingF[] <- lapply(trainingF, factor)
trainingF2 <- trainingF

# Drop the S_30E items and the raw S_06 / S_15A-C columns from the copy.
trainingF2$S_30E <- NULL
trainingF2$S_30EBinary <- NULL

trainingF2$S_06 <- NULL
trainingF2$S_15A <- NULL
trainingF2$S_15B <- NULL
trainingF2$S_15C <- NULL


# Binary availability flag from S_15E: answers 4 or 5 become
# "Availability Good", every other answer "Bad" (NA stays NA via ifelse()).
training <- training %>%
  mutate(S_15EBinary = ifelse(S_15E == 5 | S_15E == 4, "Availability Good", "Bad"))

# Fix: derive the validation flag from `validation` itself. The earlier line
# piped `training` into mutate(), which overwrote the validation set with a
# copy of the training data.
# NOTE(review): assumes `validation` already exists at this point - confirm.
validation <- validation %>%
  mutate(S_15EBinary = ifelse(S_15E == 5 | S_15E == 4, "Availability Good", "Bad"))

training$BRANSJE <- as.factor(training$BRANSJE)
training$aldr_kat <- as.factor(training$aldr_kat)
training$S_16Binary <- as.factor(training$S_16Binary)

# Remove the "open" columns and the raw S_15D item from the training frame.
training$s_20_open <- NULL
training$Bransje_open <- NULL
training$s_09r_open <- NULL
training$S_15D <- NULL


# Build a three-level dependent variable from S_15D with cut():
# (0,3] -> "Not Stressed", (3,5] -> "stressed", (5,6] -> "None".
# NOTE(review): the result is written to df_tree2 but read from df_tree, and
# df_tree is only created further down in the data-preparation section -
# confirm which data frame was intended.
df_tree2$S_15DBinary <- cut(df_tree$S_15D,c(0,3,5,6),labels=c("Not Stressed","stressed", "None")) 

#||||||||||||||||||||||||||||||||||||||||||||||||DATA PREPARATION|||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||

# Read the numeric survey export; here() resolves the file relative to the
# project root, so no manual "/" pasting is needed.
df <- read_xlsx(here("Digitalt_stress_varen2017_numerisk.xlsx"))

# Working copy used for all tree-related recoding.
df_tree <- df

# Three-level response from S_15D:
# (0,3] -> "Not Stressed", (3,5] -> "stressed", (5,6] -> "None".
df_tree$S_15DBinary <- cut(
  df_tree$S_15D,
  breaks = c(0, 3, 5, 6),
  labels = c("Not Stressed", "stressed", "None")
)
summary(df_tree$S_15DBinary)

# The former cut() over `S_09A & S_09B & ... & S_09Q` was removed: `&` yields a
# logical vector and cut() only accepts numeric input, so the call failed.
# S_09Binary is created later in this script with if_else().

# Below is a sample of an earlier mutation, just an example
# Leadership flag: S_06 answers 1-3 become "Leadership", the rest "No Leader".
df_recat <- df_recat %>%
  mutate(S_06Binary = ifelse(S_06 == 1 | S_06 == 2 | S_06 == 3, "Leadership", "No Leader"))
df_recat$S_06Binary <- as.factor(df_recat$S_06Binary)

str(df_recat$S_06Binary)
# Copy the flag over by position.
# NOTE(review): assumes df_recat and df_tree hold the same rows in the same
# order - confirm.
df_tree$S_06Binary <- df_recat$S_06Binary
str(df_tree$S_06Binary)

# "Digital" when every S_09 item (S_09A ... S_09Q) is <= 4. The earlier
# expression `S_09A & ... & S_09Q <= 4` compared only S_09Q with 4, because
# `<=` binds tighter than `&`; the other items were merely tested for being
# non-zero.
# NOTE(review): "all items <= 4" is the literal reading of the original `&`
# chain - confirm this is the intended definition of "Digital".
s09_items <- paste0("S_09", LETTERS[1:17])
s09_all_low <- Reduce(`&`, lapply(df_tree[s09_items], function(item) item <= 4))
df_tree$S_09Binary <- as.factor(if_else(s09_all_low, "Digital", "Not Digital"))
summary(df_tree$S_09Binary)

# Recode the S_15 items; each cut() uses right-closed intervals.
df_tree$S_15ABinary <- cut(df_tree$S_15A, c(0, 3, 5, 6),
                           labels = c("Digital Work-Relief", "Digital Workload", "None"))
df_tree$S_15BBinary <- cut(df_tree$S_15B, c(0, 3, 5, 6),
                           labels = c("Digital Relief", "Digital Demand", "None"))
df_tree$S_15CBinary <- cut(df_tree$S_15C, c(0, 3, 5, 6),
                           labels = c("Digital Time-Relief", "Time-Pressure", "None"))
df_tree$S_15EBinary <- cut(df_tree$S_15E, c(0, 3, 5, 6),
                           labels = c("Availability Positive", "Availability Negative", "None"))

# S_16, S_19 and S_23 use their own break points.
df_tree$S_16Binary <- cut(df_tree$S_16, c(0, 3, 4, 5),
                          labels = c("Available", "Unavailable", "None"))
df_tree$S_19Binary <- cut(df_tree$S_19, c(0, 4, 5, 6),
                          labels = c("Involved", "Rare-Never", "None"))
summary(df_tree$S_19Binary)
df_tree$S_23Binary <- cut(df_tree$S_23, c(0, 3, 5, 6, 7),
                          labels = c("Supported", "Rare-Never", "No need", "None"))
summary(df_tree$S_23Binary)

# Remove the three "open" columns and the raw S_15D item (its recoded version
# S_15DBinary is the response below).
for (unused_col in c("s_20_open", "Bransje_open", "s_09r_open", "S_15D")) {
  df_tree[[unused_col]] <- NULL
}

# First preview tree on the full recoded data.
PreviewTree <- ctree(
  S_15DBinary ~ .,
  data = df_tree,
  controls = ctree_control(mincriterion = 0.99, minsplit = 100)
)
plot(PreviewTree)

# The NAs in the response make the plot poorer; consider removing them all.

# S_28 and S_29 share the break points (0,3], (3,5], (5,6].
df_tree$S_28Binary <- cut(
  df_tree$S_28,
  breaks = c(0, 3, 5, 6),
  labels = c("Digital Training", "Rare-None", "None")
)
summary(df_tree$S_28Binary)

df_tree$S_29Binary <- cut(
  df_tree$S_29,
  breaks = c(0, 3, 5, 6),
  labels = c("Digital Support", "Not Supported", "None")
)

# The S_30 series all use the break points (0,2], (2,6], (6,7]; only the
# labels differ per item, so they are looked up by item suffix.
s30_labels <- list(
  A = c("Workload ok", "Stressed", "None"),
  B = c("Time ok", "Time-Stress", "None"),
  C = c("Mistakes ok", "Mistake-Stress", "None"),
  D = c("Wage ok", "Wage-Stress", "None"),
  E = c("Techno ok", "Technostress", "None"),
  F = c("Available ok", "Available-Stress", "None")
)
for (s30_suffix in names(s30_labels)) {
  df_tree[[paste0("S_30", s30_suffix, "Binary")]] <- cut(
    df_tree[[paste0("S_30", s30_suffix)]],
    breaks = c(0, 2, 6, 7),
    labels = s30_labels[[s30_suffix]]
  )
}

# Test the tree again before the raw duplicates are removed.
PreviewTree2 <- ctree(
  S_15DBinary ~ .,
  data = df_tree,
  controls = ctree_control(mincriterion = 0.99, minsplit = 100)
)
plot(PreviewTree2)

# Duplicates ruin the tree, cleaning df

# Raw items whose recoded *Binary versions now exist; keeping both copies
# duplicates the information in the tree.
raw_duplicates <- c(
  "S_15A", "S_15B", "S_15C", "S_15E",
  "S_16", "S_19", "S_23", "S_28", "S_29",
  "S_30A", "S_30B", "S_30C", "S_30D", "S_30E", "S_30F"
)
for (raw_col in raw_duplicates) {
  df_tree[[raw_col]] <- NULL
}

# Refit on the data frame without the duplicates.
PreviewTree2 <- ctree(
  S_15DBinary ~ .,
  data = df_tree,
  controls = ctree_control(mincriterion = 0.99, minsplit = 100)
)
plot(PreviewTree2)

# Consider making a tree just to show how things would be without the "None" or idk variable. 

# Here I do, and additionally I remove the S_30E, the question that asks about to what extent they are technostressed
# Experimental copy of the tree data.
df_treeTest <- df_tree
summary(df_factor$S_15Dcombined)

str(df_factor$S_15Dcombined)

# Swap the three-level response for the version stored in df_factor.
# NOTE(review): assumes df_factor has the same row order as df_treeTest -
# confirm.
df_treeTest$S_15DBinary <- df_factor$S_15Dcombined

# Recode the S_14 technology items in place: (0,3], (3,5], (5,6].
df_treeTest$S_14A <- cut(df_treeTest$S_14A, c(0, 3, 5, 6),
                         labels = c("Unreliable Tech", "Reliable Tech", "None"))
df_treeTest$S_14B <- cut(df_treeTest$S_14B, c(0, 3, 5, 6),
                         labels = c("Not User-Friendly", "User-Friendly", "None"))
df_treeTest$S_14C <- cut(df_treeTest$S_14C, c(0, 3, 5, 6),
                         labels = c("Slow Tech", "Fast Tech", "None"))

# The repeated `df_tree$S_29Binary <- cut(df_tree$S_29, ...)` was removed: the
# raw S_29 column has already been deleted from df_tree, so cut() had no
# numeric input and failed. S_29Binary itself is already present.

# Tree fitted while S_30EBinary is still in the data but excluded through the
# formula.
PreviewTree3 <- ctree(
  S_15DBinary ~ . - S_30EBinary,
  data = df_treeTest,
  controls = ctree_control(mincriterion = 0.95, minsplit = 50)
)
plot(PreviewTree3)

# Inspect the raw S_15D item. `df` is used here because df2 is only created
# later in the script as a copy of `df` (with just S_15E recoded), so the
# S_15D column is the same.
str(df$S_15D)

# A tree without S_30EBinary, this time by dropping the column itself.
df_treeTest$S_30EBinary <- NULL

PreviewTree4 <- ctree(
  S_15DBinary ~ .,
  data = df_treeTest,
  controls = ctree_control(mincriterion = 0.95, minsplit = 50)
)
plot(PreviewTree4)

# Fitted classes for the data the tree was grown on. The earlier `data =`
# argument is not a predict() parameter (it is `newdata`) and was silently
# ignored, so it is dropped here without changing the result.
pred <- predict(PreviewTree4)
PreviewTree4
table(df_treeTest$S_15DBinary, pred)

confusionMatrix(pred, df_treeTest$S_15DBinary)


# Lets make cross-validation samples

set.seed(333)
# Repeated 5-fold cross-validation (3 repeats) with random tuning search.
RandomCV <- trainControl(
  method = "repeatedcv",
  number = 5,
  repeats = 3,
  search = "random",
  savePredictions = TRUE
)

# Fix: the control object is now actually passed to train(). Before, RandomCV
# was created but never used, so train() fell back to its default resampling.
CVModel <- train(
  S_15DBinary ~ .,
  data = na.exclude(df_treeTest),
  method = "ctree",
  trControl = RandomCV,
  controls = ctree_control(mincriterion = 0.95, minsplit = 50)
)
CVModel$bestTune

# Variable importance through caret. The former party::varimp() call referred
# to objects that do not exist in this script (`treevaluesNEW`, `conditional`)
# and could not run.
ctreeVarImp <- varImp(CVModel)
ctreeVarImp

# Resampled confusion matrix of the tuned model (replaces the failing
# `confusionMatrix(table())`, which had nothing to tabulate).
confusionMatrix(CVModel)

# Lets try to split into 10 and create a great X-Validation! 

set.seed(5555)
# Random 80/20 assignment of every row to training (1) or validation (2).
validPD <- sample(2, nrow(df_treeTest), replace = TRUE, prob = c(0.8, 0.2))
Treetrain <- df_treeTest[validPD == 1, ]
TreeValid <- df_treeTest[validPD == 2, ]

# Five further validation subsets, each row drawn into one of five groups
# with equal probability.
XvalidPD <- sample(5, nrow(df_treeTest), replace = TRUE, prob = rep(0.2, 5))
Xvalid1 <- df_treeTest[XvalidPD == 1, ]
Xvalid2 <- df_treeTest[XvalidPD == 2, ]
Xvalid3 <- df_treeTest[XvalidPD == 3, ]
Xvalid4 <- df_treeTest[XvalidPD == 4, ]
Xvalid5 <- df_treeTest[XvalidPD == 5, ]

# The standard data partitioning is used to generate five random validation
# sets, while training uses the standard 0.8 share.
# NOTE(review): the five sets are drawn from all of df_treeTest, so they
# overlap with Treetrain - confirm this is acceptable before relying on them
# as independent validation data.

# The confusion matrices below are used to visualise and calculate the
# misclassification errors. The training set gets its own tree first.
TrainedTree1 <- ctree(
  S_15DBinary ~ .,
  data = Treetrain,
  controls = ctree_control(mincriterion = 0.95, minsplit = 50)
)
plot(TrainedTree1)

# Training confusion matrix and misclassification rate.
tab <- table(predict(TrainedTree1), Treetrain$S_15DBinary)
print(tab)
1 - sum(diag(tab)) / sum(tab)

# Predict classes for the validation set 


# Matrix for the training set

#               not stressed stressed
#not stressed         1131      223
#stressed              158      359
#> 1-sum(diag(tab))/sum(tab)
#[1] 0.2036344 
#20,36% Missclassification


#
# Predicted classes for the validation rows.
pred <- predict(TrainedTree1, newdata = TreeValid)

# Observed class in rows, predicted class in columns.
table(TreeValid$S_15DBinary, pred)

# Recorded output of the table above:
#                           pred
#                 not stressed stressed
# not stressed         297       42
# stressed              79      104

# Evaluate the model predictions with caret.
confusionMatrix(pred, TreeValid$S_15DBinary)

#Confusion Matrix and Statistics
#
#Reference
#Prediction     not stressed stressed
#not stressed          297       79
#stressed               42      104
#
#Accuracy : 0.7682          
#95% CI : (0.7296, 0.8038)
#No Information Rate : 0.6494          
#P-Value [Acc > NIR] : 2.834e-09       
#
#Kappa : 0.4661          
#
#Mcnemar's Test P-Value : 0.001065        
#                                          
#            Sensitivity : 0.8761          
#            Specificity : 0.5683          
#         Pos Pred Value : 0.7899          
#         Neg Pred Value : 0.7123          
#             Prevalence : 0.6494          
#         Detection Rate : 0.5690          
#   Detection Prevalence : 0.7203          
#      Balanced Accuracy : 0.7222          
#                                          
#       'Positive' Class : not stressed    
                                          
#||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
# Trying the same just with a fast method for the 10-fold x-validation

library(caret)

# 10-fold cross-validated ctree on the complete data; rows with NA are
# excluded through na.action.
TenFoldValid <- train(
  S_15DBinary ~ .,
  data = df_treeTest,
  method = "ctree",
  trControl = trainControl(method = "cv", number = 10),
  na.action = na.exclude,
  tuneLength = 5
)
# Print results
TenFoldValid

# This prints the final model
TenFoldValid$finalModel

# Model performance
confusionMatrix(TenFoldValid)

# Same model family on the training split only; no trControl is given, so
# train() uses its default resampling.
Bootstrap <- train(
  S_15DBinary ~ .,
  data = Treetrain,
  method = "ctree",
  na.action = na.exclude,
  tuneLength = 5
)
Bootstrap

plot(Bootstrap)

# Good, but NAs are excluded during training, so the validation rows need the
# same treatment. Fix: `TreeValid <- na.exclude` stored the na.exclude
# function itself in TreeValid, and the later complete.cases() subset was
# never assigned, which is why it "did not do anything".
TreeValid <- TreeValid[complete.cases(TreeValid), ]

# Evaluate on the cleaned validation set. confusionMatrix() has no na.action
# argument, so it is no longer passed.
Evaluation <- predict(Bootstrap, newdata = TreeValid)
confusionMatrix(Evaluation, TreeValid$S_15DBinary)

# Will run a total clean up of all NA's the hard way. 

# Drop the columns with very wide value ranges.
for (wide_col in c("kommnr", "kommstr", "FODT", "alder", "fylke")) {
  df_treeTest[[wide_col]] <- NULL
}

# The above had massive ranges that the formula could not take. Consider
# turning all other numeric columns into factors so they do not disturb the
# model. Numeric coding was chosen to let "more" education or age count as
# more important; for the tree all doors should be open, so they MUST BE
# FACTOR. Numeric as factor is the suggestion.

str(df_treeTest)
summary(df_treeTest)
# NAs in 10A, B and C; 17A, B, C, D; S_20_1-5; S_21, S_26. They are imputed as
# the "I don't know" alternative, which is 19 for the 10 series and 5 for the
# rest.

# Preview of the imputation. Fix: the replacement is the second positional
# argument of replace_na(); the earlier `value = 19` named an argument the
# function does not have, so no NA was replaced.
table(replace_na(df_treeTest$S_10A, 19), useNA = "always")

# 10 series: NA -> 19.
for (item in c("S_10A", "S_10B", "S_10C")) {
  df_treeTest[[item]] <- replace_na(df_treeTest[[item]], 19)
}

# 17 series, 20 series, S_21 and S_26: NA -> 5.
dont_know_5 <- c(
  "S_17a", "S_17b", "S_17c", "S_17d",
  "S_20_1", "S_20_2", "S_20_3", "S_20_4", "S_20_5",
  "S_21", "S_26"
)
for (item in dont_know_5) {
  df_treeTest[[item]] <- replace_na(df_treeTest[[item]], 5)
}

# Age category: NA -> 5, then turned into a factor with levels "1".."5".
# Values outside (0,5] become NA in cut().
df_treeTest$aldr_kat <- replace_na(df_treeTest$aldr_kat, 5)
df_treeTest$aldr_kat <- cut(df_treeTest$aldr_kat, breaks = 0:5, labels = 1:5)

summary(df_treeTest$aldr_kat)

# Take S_12A from df_tree again before recoding.
df_treeTest$S_12A <- df_tree$S_12A

# The 12 series was answered as free percentages (more than 50 distinct
# scores). 50% and above is categorised as "often", anything below as "Rare".
# The "I don't know" code 999 is merged into 0 first, so no 999 remains.
# Note that in its original form this variable had no relevance in the
# preview and experimental models.
often_labels <- c(
  S_12A = "Core Often",
  S_12B = "Report Often",
  S_12C = "Coord Often",
  S_12D = "Other Often"
)
for (item in names(often_labels)) {
  share <- df_treeTest[[item]]
  share <- replace(share, share == 999, 0)
  df_treeTest[[item]] <- as.factor(
    ifelse(share >= 50, often_labels[[item]], "Rare")
  )
}

# Inspect the recoded column. The earlier checks here called summary.matrix()
# on a plain vector and read df_treeFactor before this script creates it;
# both were removed.
summary(df_treeTest$S_12A)
str(df_treeTest$S_12A)

str(df_treeTest$aldr_kat)

# Still aldr_kat has one NA in the entire data set. Need to purge. 


df_treeTest$vekt <- NULL

# Moment of truth: a factor version is expected to give the optimal data set
# for the models. kommstr is restored from df_tree first.
df_treeTest$kommstr <- df_tree$kommstr

# The command above also removed the faulty values that an earlier typo (a
# missing dollar sign) had created.
str(df_treeTest$kommstr)

# Factor copy of the prepared data. The former
# `df_treeFactor$kommstr <- df_treeTest$kommstr` line was removed: it ran
# before df_treeFactor was created here, and the copy below overwrites
# df_treeFactor (including kommstr) anyway.
df_treeFactor <- df_treeTest

# Turn every variable of df_treeFactor into a factor.
df_treeFactor <- data.frame(lapply(df_treeFactor, as.factor))

summary(df_treeTest)
str(df_treeFactor)
PreviewTree5 <- ctree(
  S_15DBinary ~ .,
  data = df_treeFactor,
  controls = ctree_control(mincriterion = 0.95, minsplit = 100)
)
plot(PreviewTree5)

# Looks great. The only issue is that the 10-series might be too long: having up to 19 factor levels may not work. Options are to exclude them or even to convert them back to numeric.
# Or make them the popular factors such as 1 = PC, 2 = Handy, 3 = eCard, 4 = Other
# Like this: 

# Collapse the 10 series into four groups; the numeric originals in
# df_treeTest are used because cut() needs numeric input.
df_treeFactor$S_10A <- cut(df_treeTest$S_10A, c(0, 1, 2, 3, 19),
                           labels = c("PC", "Handy", "eCard", "Other"))
df_treeFactor$S_10B <- cut(df_treeTest$S_10B, c(0, 1, 2, 3, 19),
                           labels = c("PC", "Handy", "eCard", "Other"))
df_treeFactor$S_10C <- cut(df_treeTest$S_10C, c(0, 1, 2, 3, 19),
                           labels = c("PC", "Handy", "eCard", "Other"))

# Fix: read kommstr from df_treeTest as well. In df_treeFactor the column is
# already a factor, and cut() stops with "'x' must be numeric" on factors.
df_treeFactor$kommstr <- cut(df_treeTest$kommstr, c(0, 1, 2, 3),
                             labels = c("small", "Medium", "Large"))

str(df_treeFactor$kommstr)

PreviewTree7 <- ctree(
  S_15DBinary ~ .,
  data = df_treeFactor,
  controls = ctree_control(mincriterion = 0.95, minsplit = 200)
)
plot(PreviewTree7)

# Fitted classes on the data the tree was grown on. The ignored `data =`
# argument was dropped, and the tree printed is now PreviewTree7 (the earlier
# line printed PreviewTree4, a copy-paste leftover).
pred <- predict(PreviewTree7)
PreviewTree7
table(df_treeFactor$S_15DBinary, pred)

confusionMatrix(pred, df_treeFactor$S_15DBinary)

# Ok lets split the data as the manual says

set.seed(5555)
# Stratified 80/20 split on the response. The earlier sample()-based
# partition was removed because createDataPartition() overwrote it
# immediately.
tFPartition <- createDataPartition(df_treeFactor$S_15DBinary, p = .80, list = FALSE)

# Fix: subset df_treeFactor, not df_treeTest. Splitting df_treeTest and
# converting each part to factors separately dropped the regrouped S_10A-C and
# kommstr columns and could give the two parts different factor levels.
tFtrain <- df_treeFactor[tFPartition, ]
tFtest <- df_treeFactor[-tFPartition, ]

summary(tFtrain)

# Every column of df_treeFactor is already a factor, so these conversions
# leave the values unchanged; they are kept so that both parts remain plain
# data frames of factors, as before.
tFtrain <- data.frame(lapply(tFtrain, as.factor))
tFtest <- data.frame(lapply(tFtest, as.factor))


# Lets try to cross-validate using the ones we created
# 10-fold cross-validation settings shared by the two caret fits below.
cv10 <- trainControl(method = "cv", number = 10)

# Cross-validated ctree on the complete factor data set.
fit4 <- train(
  S_15DBinary ~ .,
  data = df_treeFactor,
  method = "ctree",
  trControl = cv10,
  tuneLength = 5
)

print(fit4)
# The above works when partitioned with c(0.8, 0.2). Note: it uses
# df_treeFactor as a whole, not the training set.

fit4$finalModel

# Prints nicely

confusionMatrix(fit4)

# Prints the performance nicely, at roughly 77%

# Plain ctree on the training part for a visual check.
PreviewTree6 <- ctree(
  S_15DBinary ~ .,
  data = tFtrain,
  controls = ctree_control(mincriterion = 0.95, minsplit = 50)
)
plot(PreviewTree6)

# Train on the training part only.
fit5 <- train(
  S_15DBinary ~ .,
  data = tFtrain,
  method = "ctree",
  trControl = cv10,
  tuneLength = 5
)

print(fit5)
plot(fit5)
fit5$finalModel

# Simple confusion table on the training data.
table(predict(fit5), tFtrain$S_15DBinary)
pred <- predict(fit5, newdata = tFtrain)

confusionMatrix(pred, tFtrain$S_15DBinary)

# Stricter tree (mincriterion 0.99) on the same training data.
PreviewTree8 <- ctree(
  S_15DBinary ~ .,
  data = tFtrain,
  controls = ctree_control(mincriterion = 0.99, minsplit = 50)
)
plot(PreviewTree8)

# Stress-test the trained model on the held-out part tFtest.
pred2 <- predict(fit5, newdata = tFtest)
confusionMatrix(pred2, tFtest$S_15DBinary)

# Predicted class probabilities for the test rows.
predict(fit5, tFtest, type = "prob")

# So far everything is ok. a good walk through 
# Suggest comparing this to the rpart tree also, doing the same procedure

library(rpart)
library(rpart.plot)

# rpart tree on the same training data, for comparison with ctree.
Tree_rpart <- rpart(S_15DBinary ~ ., tFtrain)
# The unnamed `1` of the earlier call filled the `type` argument.
rpart.plot(Tree_rpart, type = 1, extra = 4)

# Predictions for the test rows.
predict(Tree_rpart, tFtest)

# Confusion table of fit5 on the training set, the basis for the
# misclassification error.
Training_tab <- table(predict(fit5), tFtrain$S_15DBinary)




# REMEMBER, just follow the online tutorial dont mix the vid.. 


# Tutorial followed below (commented out: a bare URL is not valid R and
# stopped the whole file from parsing):
# http://www.just.edu.jo/~haalshraideh/Courses/IE759/DT3.html

library(caret)
#The below is for rpart similar
#|||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
# define training control
# Training control: 10-fold cross-validation.
train_control <- trainControl(method = "cv", number = 10)

# Train the rpart model.
model_rpart <- train(
  S_15DBinary ~ .,
  data = tFtrain,
  trControl = train_control,
  method = "rpart"
)

# Predictions on the training rows.
predictions_rpart <- predict(model_rpart, tFtrain)

# Append the predictions to the training data.
MyData_rpart <- cbind(tFtrain, predictions_rpart)

# Summarise the results. The result is stored as `confusion_rpart` so that
# the variable no longer carries the same name as the confusionMatrix()
# function.
confusion_rpart <- confusionMatrix(predictions_rpart, tFtrain$S_15DBinary)
predictions_rpart
confusion_rpart

# Tuned rpart fit. Fix: this model was previously trained with
# method = "ctree", which only repeated fit5 under an rpart name.
# NOTE(review): confirm that rpart was the intended method here.
fitRpart <- train(
  S_15DBinary ~ .,
  data = tFtrain,
  method = "rpart",
  trControl = train_control,
  tuneLength = 5
)
fitRpart
#|||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
                                  
                                  
# Then build a performance distribution for discussion 




# First try on the varimp 

# Repeated 5-fold cross-validation (3 repeats) with random tuning search.
RandomCV <- trainControl(
  method = "repeatedcv",
  number = 5,
  repeats = 3,
  search = "random",
  savePredictions = TRUE
)

# Fix: RandomCV is now passed as trControl; before, it was created but never
# used by train().
CVModel <- train(
  S_15DBinary ~ .,
  data = df_treeFactor,
  method = "ctree",
  trControl = RandomCV,
  controls = ctree_control(mincriterion = 0.99, minsplit = 50)
)
CVModel$bestTune

# Variable importance for the caret model. The earlier varimp() call was
# given the caret `train` object and an undefined `conditional` variable and
# could not run; caret's own varImp() accepts the train object directly.
varImp(CVModel)

# could be that the "train" is supposed to be a df or defined trained tree in the CVModel formula otherwise figure out what an S4 object is

# Trying to do varimp after random forest running the whole process instead. 

library(randomForest)



# Predictor-only copies of the two parts, plus the response as factor labels.
rf.train <- tFtrain[setdiff(names(tFtrain), "S_15DBinary")]
rf.test <- tFtest[setdiff(names(tFtest), "S_15DBinary")]
rf.label <- as.factor(tFtrain$S_15DBinary)
rf.label2 <- as.factor(tFtest$S_15DBinary)

# First random forest through the formula interface, 1000 trees.
rf.1 <- randomForest(S_15DBinary ~ ., data = tFtrain, ntree = 1000)
rf.1
varImpPlot(rf.1)
# Poor scores: 22.32% OOB error rate, 47.5% of "stressed" wrongly predicted
# in the matrix.

str(df_r)

# Error-rate plot: after 200 trees there is little improvement.
plot(rf.1)
# After 200 trees little improvement 

# Lets tune
# Tune mtry. Fix: the argument is `stepFactor`; the misspelt `sstepFactor`
# was not matched, so the step factor stayed at its default.
# NOTE(review): randomForest's default stepFactor is 2 - confirm 0.2 is the
# intended value, and re-check the hard-coded mtry below against the new
# tuning result.
TunedRF <- tuneRF(
  x = rf.train,
  y = rf.label,
  stepFactor = 0.2,
  plot = TRUE,
  ntreeTry = 500,
  trace = TRUE,
  improve = 0.02
)

# New random forest with the chosen mtry, keeping importance and proximity
# measures.
rf.2 <- randomForest(
  x = rf.train,
  y = rf.label,
  ntree = 1000,
  mtry = 18,
  importance = TRUE,
  proximity = TRUE
)

# Compare the two forests.
print(rf.1)
print(rf.2)
plot(rf.1)
plot(rf.2)
varImpPlot(rf.2)
# Prediction & Confusion Matrix - Training

library(caret)
# Prediction and confusion matrix on the training predictors.
predict1 <- predict(rf.2, rf.train)
confusionMatrix(predict1, rf.label)

# Prediction and confusion matrix on the test predictors. Check whether the
# training matrix above is even needed, since rf.2 already reports a matrix
# based on the training data.
predict2 <- predict(rf.2, rf.test)
confusionMatrix(predict2, rf.label2)
library(party)

# Number of nodes across all trees of the forest.
hist(
  treesize(rf.2),
  main = "No. of Nodes for all Trees",
  col = "grey"
)

# Variable importance: full plot first, then the ten most important
# variables.
varImpPlot(rf.2)
varImpPlot(
  rf.2,
  sort = TRUE,
  n.var = 10,
  main = "Top 10 Variable Importance"
)

# The same with S_30E instead of S_15D, because that variable ranks the stress by digital tools also. Lets see if we can make hypos based off that. 

# Just flip the variables, put together S_15E in the same order as the others, and combine S_15combined as a variable of all the technostress-related issues

# Recover the original numbers stored in a factor's labels. A plain
# as.numeric() on a factor returns the internal level codes instead.
factor_values <- function(f) as.numeric(as.character(f))

df_treeFactor2 <- df_treeFactor

df2 <- df

# Reverse-code S_15E (1 <-> 6, 2 <-> 5, 3 <-> 4) so that it runs in the same
# direction as the other S_15 items. Fix: the labels are converted via
# as.character(); the earlier as.numeric() on the cut() factor returned the
# level codes 1..6 and so undid the reversal.
df2$S_15E <- cut(df2$S_15E, c(0, 1, 2, 3, 4, 5, 6),
                 labels = c("6", "5", "4", "3", "2", "1"))
str(df2$S_15E)
df2$S_15E <- factor_values(df2$S_15E)

# "Not Stressed" when every S_15 item is <= 4. Fix: the earlier
# `S_15A & ... & S_15E <= 4` compared only S_15E with 4, because `<=` binds
# tighter than `&`.
# NOTE(review): "all items <= 4" is the literal reading of the original `&`
# chain - confirm the intended rule and threshold.
s15_items <- c("S_15A", "S_15B", "S_15C", "S_15D", "S_15E")
s15_all_low <- Reduce(`&`, lapply(df2[s15_items], function(item) item <= 4))
df_recat <- df2 %>%
  mutate(S_15Combined = if_else(s15_all_low, "Not Stressed", "Stressed"))

# NOTE(review): assumes df_recat and df_treeFactor2 share the same row order -
# confirm.
df_treeFactor2$S_15Combined <- as.factor(df_recat$S_15Combined)
str(df_treeFactor2$S_15Combined)

# Remove the binaries of the items that now form the combined response.
# NOTE(review): S_15DBinary is still in df_treeFactor2 although S_15D is part
# of S_15Combined - confirm whether it should be dropped as well.
df_treeFactor2$S_15ABinary <- NULL
df_treeFactor2$S_15BBinary <- NULL
df_treeFactor2$S_15CBinary <- NULL
df_treeFactor2$S_15EBinary <- NULL

# Regroup S_24B, S_24A and S_27A. The columns are factors at this point, so
# the numbers are recovered from the labels before cut() (same level-code fix
# as for S_15E). cut() already returns a factor.
df_treeFactor2$S_24B <- cut(factor_values(df_treeFactor2$S_24B), c(0, 3, 5, 6),
                            labels = c("Digital Fast", "Slow", "None"))

df_treeFactor2$S_24A <- cut(factor_values(df_treeFactor2$S_24A), c(0, 3, 5, 6),
                            labels = c("Work Quality", "Work Poor", "None"))

df_treeFactor2$S_27A <- cut(factor_values(df_treeFactor2$S_27A), c(0, 2, 3, 5, 6),
                            labels = c("Utilize Tools", "Neither", "Not Utilize", "None"))

#####################################


summary(df_treeFactor2$S_30EBinary)
summary(df_tree$S_30E)

# Preview tree for the combined response.
PreviewTree9 <- ctree(
  S_15Combined ~ .,
  data = df_treeFactor2,
  controls = ctree_control(mincriterion = 0.95, minsplit = 50)
)
plot(PreviewTree9)
# Overview ok, new findings, let's train.

set.seed(6666)
tFPartition2 <- sample(2, nrow(df_treeFactor2), replace = TRUE, prob = c(0.8, 0.2))
# Fix: subset with tFPartition2. The earlier code compared `tFPartition`
# (the row indices returned by createDataPartition()) with 1 and 2, so the new
# 80/20 assignment was never used.
tFtrain2 <- df_treeFactor2[tFPartition2 == 1, ]
tFtest2 <- df_treeFactor2[tFPartition2 == 2, ]

# Lets train 

# Cross-validated ctree for the combined response on the new training part.
fit6 <- train(
  S_15Combined ~ .,
  data = tFtrain2,
  method = "ctree",
  trControl = trainControl(method = "cv", number = 10),
  tuneLength = 5
)

fit6
plot(fit6)
# Fix: everything below evaluates fit6 on tFtrain2 / tFtest2 with the
# S_15Combined response. The earlier copy-pasted lines re-reported fit5 on
# tFtrain / tFtest with S_15DBinary, so fit6 was never evaluated.
fit6$finalModel

# Simple confusion table on the training data.
table(predict(fit6), tFtrain2$S_15Combined)
pred <- predict(fit6, newdata = tFtrain2)

confusionMatrix(pred, tFtrain2$S_15Combined)

# Stricter tree (mincriterion 0.99) on the same training data; stored under
# its own name so that PreviewTree8 is not overwritten.
PreviewTree10 <- ctree(
  S_15Combined ~ .,
  data = tFtrain2,
  controls = ctree_control(mincriterion = 0.99, minsplit = 50)
)
plot(PreviewTree10)

# Stress-test the trained model on the held-out part tFtest2.
pred2 <- predict(fit6, newdata = tFtest2)
confusionMatrix(pred2, tFtest2$S_15Combined)

# Predicted class probabilities for the test rows.
predict(fit6, tFtest2, type = "prob")