# R code

#install necessary packages
library(leaps)
library(gvlma)
library(ggplot2)
library(magrittr)
library(dplyr)
library(car)

# Set seed for reproducibility
set.seed(1)

# Number of data sets
num_datasets <- 1000

# Function to create a data set
generate_dataset <- function(seed) {
  # Number of observations per data set
  n <- 1000
  # Number of independent variables
  p <- 20
  set.seed(seed) 
  explanatoryvariables <- matrix(rnorm(n * p), nrow = n)
  true_coefficients <- rnorm(p)
  y <- explanatoryvariables %*% true_coefficients + rnorm(n)
  if (runif(1) < 0.5) {
    y <- y + explanatoryvariables[, 1]^2
  }
  dataset <- data.frame(Y = y, explanatoryvariables)
  names(dataset) <- c("y", paste0("x", 1:p))
  return(dataset)
}

# Function to calculate predicted R2
calculate_pred_r2 <- function(actual_values, predicted_values) {
  ss_res <- sum((actual_values-predicted_values)^2)
  ss_tot <- sum((actual_values -mean(actual_values))^2)
  pred_r2 <- 1-(ss_res/ss_tot)
  return(pred_r2)
}

# Initialize an empty list to store future results
results_list <- list()

# Loop that creates each data set, divides it into a training set and 
# a validation set and performs best subset selection
for (i in 1:num_datasets) {
  dataset <- generate_dataset(i)
  
  # Split the data set into a training set and a validation set
  set.seed(i)  # The seed used for the split is the same as the seed used when creating the data set
  chosenrows <- sample.int(n, size = 500)
  training <- dataset[chosenrows, ]
  validation <- dataset[-chosenrows, ]
  
  # Perform best subset selection with all variables included
  subset <- regsubsets(y ~ ., data = training, nvmax = 20)
  summary_subset <- summary(subset)
  
  # Create lists for values
  f_test_results <- character(length=20)
  multicollinearity_flags <- character(length=20)
  assumption_violation_flags <- character(length=20)
  leverage_point_flags <- character(length=20)
  outlier_flags <- character(length=20)
  BIC_values <- numeric(length = 20)
  Radj_values <- numeric(length = 20)
  R2_values <- numeric(length = 20)
  Cp_values <- numeric(length = 20)
  AIC_values <- numeric(length = 20)
  predicted_r2_values <- numeric(length = 20)
  number_of_sig_variables <- numeric(length = 20)
  num_vars <- numeric(length = 20)
  
  # Loop that saves the models for each data set and extracts or calculates 
  # different values for each model
  for (j in 1:20) {
    # Create the 20 models
    model_vars <- names(coef(subset, id = j))[-1]  # Exclude intercept
    model_formula <- as.formula(paste("y ~", paste(model_vars, collapse = " + ")))
    model <- lm(model_formula, data = training)
    
    #Check values for each model
    
    # Extract the p-value from the F-test for each model
    f_stat <- summary(model)$fstatistic
    p_value <- pf(f_stat[1], f_stat[2], f_stat[3], lower.tail = FALSE)
    f_test_results [j] <- if(p_value < 0.05) "Yes" else "No"
    
    # Extract the p-values for each coefficients to count the number of significant coefficients in each model
    p_values <- summary(model)$coefficients[, "Pr(>|t|)"]
    p_values_corrected <- p.adjust(p_values[-1], method="holm")
    num_significant <- sum(p_values_corrected < 0.05)
    number_of_sig_variables[j] <- num_significant
    
    # Check VIF-values and flag for multicollinearity (VIF > 5)
    if(length(model_vars) > 1) {
      vif_values <- car::vif(model)
      multicollinearity_flags[j] <- ifelse(any(vif_values > 5), "Yes", "No")
    } else {
      multicollinearity_flags[j] <- NA #One explanatory variable
    }
    
    # Check model assumptions using the gvlma command
    gvlma_result <- gvlma(model)
    pvals <- summary(gvlma_result)$`p-value`
    assumption_violation_flags[j] <- ifelse(any(pvals < 0.05), "Yes", "No")
    
    # Check for leverage points
    n <- nrow(training)
    leverage_values <- hatvalues(model)
    cutoff <- 4/n
    leverage_point_flags[j] <- ifelse(any(leverage_values > cutoff), "Yes", "No")
    
    # Check for outliers (residuals bigger than 4)
    residuals <- residuals(model)
    outlier_flags[j] <- if(any(abs(residuals) > 4)) "Yes" else "No"
    
    # Extract BIC, adjusted R², R² and Mallow's Cp for each model
    BIC_values[j] <- BIC(model)
    Radj_values[j] <- summary_subset$adjr2[j]
    R2_values[j] <- summary_subset$rsq[j]
    Cp_values[j] <- summary_subset$cp[j]
    AIC_values[j] <- AIC(model)
    
    # Calculate predicted R² for each model
    predicted_values <- predict(model, newdata = validation)
    actual_values <- validation$y
    pred_r2 <- calculate_pred_r2(actual_values, predicted_values)
    predicted_r2_values[j] <- pred_r2
    
    # Variable that counts the number of explanatory variables in the model
    num_vars[j] <-length(coef(model)) -1
    
    # Combine all results so far into a final results data frame
    dataset_results <- data.frame(
      Dataset_number = i,
      Model_number = 1:20,
      Number_of_variables = num_vars,
      Ftest = f_test_results,
      Number_of_significant_variables = number_of_sig_variables,
      AIC = AIC_values,
      BIC = BIC_values,
      Adjusted_R2 = Radj_values,
      R2 = R2_values,
      Predicted_R2 = predicted_r2_values,
      Mallows_Cp = Cp_values,
      Multicollinearity = multicollinearity_flags,
      Assumption_violation = assumption_violation_flags,
      Leverage_points = leverage_point_flags,
      Outliers = outlier_flags
    )
  }
  # Exclude models that violate the model assumptions according to gvlma
  dataset_results_filtered <- subset(dataset_results, Assumption_violation == "No")
  
  # Store the results in the list
  results_list[[i]] <- dataset_results_filtered
}

# Combine all results into one data frame
all_models <- do.call(rbind, results_list)

# View the results
View(all_models)

############################################################

# 2 Create plots for the indcluded models

# Plot for the relationship between number of variables and 
# the mean number of statistically significant variables

# Calculate mean and standard deviation for each level of p
mean_sd_values <- all_models %>%
  group_by(Number_of_variables) %>%
  summarise(mean_value = mean(Number_of_significant_variables),
            sd_value = sd(Number_of_significant_variables))

# Create the plot
ggplot(data = mean_sd_values, aes(x = Number_of_variables, y = mean_value)) +
  geom_vline(xintercept = mean_sd_values$Number_of_variables, linetype = "dotted", color = "grey80") +
  geom_errorbar(aes(ymin = mean_value - sd_value, ymax = mean_value + sd_value), width = 0.2, color = "skyblue2") +
  geom_text(aes(label = round(mean_value, 2)), vjust = -2, size = 2.5) +
  geom_point(size = 2, color = "black") + 
  labs(
    title = "Relationship between number of variables\nand mean number of statistically significant variables",
    x = "\nNumber of explanatory variables",
    y = "Mean number of statistically significant variables"
  ) +
  theme_light() +
  theme(plot.title = element_text(), panel.grid.major.x = element_blank(), 
        panel.grid.minor.x = element_blank(), panel.grid.major.y = element_blank(),  
        panel.grid.minor.y = element_blank())


# Plot for the relationship between number of explanatory 
# variables and percentage of models with only statistically 
# significant explanatory variables

# Calculate the percentage for each group within p 
# that meets the criteria p = number_of_sig
percentage_df <- all_models %>%
  group_by(Number_of_variables) %>%
  summarise(
    total_count = n(),
    match_count = sum(Number_of_variables == Number_of_significant_variables),
    percentage = (match_count / total_count) * 100
  )

# Create the plot
ggplot(data = percentage_df, aes(x = Number_of_variables, y = percentage)) +
  geom_text(aes(label = round(percentage, 2)), vjust = -1, size = 3) +
  geom_vline(xintercept = percentage_df$Number_of_variables, linetype = "dotted", color = "grey80") +
  geom_point(size = 2, color = "black") +
  labs(
    title = "Relationship between number of variables and percentage of\nmodels with only statistically significant variables",
    x = "\nNumber of explanatory variables",
    y = "Percentage with only statistically significant variables"
  ) +
  theme_light() +
  theme(plot.title = element_text(margin=margin(t=5, r=10, b=10, l=0)), panel.grid.major.x = element_blank(), 
        panel.grid.minor.x = element_blank(), panel.grid.major.y = element_blank(),  
        panel.grid.minor.y = element_blank())


# Plot for the relationship between number of explanatory 
# variables and AIC

# Calculate mean AIC values for each group of models (with 
# 1-20 variables)
aic_values <- all_models %>%
  group_by(Number_of_variables) %>%
  summarize(mean_AIC = mean(AIC))

# Create the plot
ggplot(data=all_models, aes(x=Number_of_variables, y=AIC)) + geom_point(size=0.7, color="gray70") +
  labs(x="\nNumber of explanatory variables",y="AIC") + theme_light() + 
  geom_point(data=aic_values, aes(x=Number_of_variables, y=mean_AIC), color="red", size=2) +
  geom_line(data=aic_values, aes(x=Number_of_variables, y=mean_AIC), color="red", size=0.3, linetype="dashed") +
  ggtitle("Relationship between\nnumber of explanatory variables and AIC") + 
  theme(plot.title = element_text(lineheight=1, size=14)) +
  theme(axis.title.y = element_text(size = 12, angle = 90)) + 
  theme(axis.title.x = element_text(size = 12))


# Plot for the relationship between number of explanatory 
# variables and BIC

# Calculate mean BIC values for each group of models (with 
# 1-20 variables)
mean_bic <- all_models %>%
  group_by(Number_of_variables) %>%
  summarize(mean_BIC = mean(BIC))

# Create the plot
ggplot(data=all_models, aes(x=Number_of_variables, y=BIC)) + geom_point(size=0.7, color="gray70") +
  labs(x="\nNumber of explanatory variables",y="BIC") + theme_light() +
  geom_point(data=mean_bic, aes(x=Number_of_variables, y=mean_BIC), color="red", size=2) +
  geom_line(data=mean_bic, aes(x=Number_of_variables, y=mean_BIC), color="red", size=0.3, linetype="dashed") +
  ggtitle("Relationship between\nnumber of explanatory variables and BIC") + 
  theme(plot.title = element_text(lineheight=1, size=14)) +
  theme(axis.title.y = element_text(size = 12, angle = 90)) + 
  theme(axis.title.x = element_text(size = 12))


# Plot for the relationship between number of explanatory 
# variables and adjusted R²

# Calculate mean adjusted R² values for each group of models 
# (1-20 variables)
mean_adjusted_r2 <- all_models %>%
  group_by(Number_of_variables) %>%
  summarize(mean_Adjusted_R2 = mean(Adjusted_R2))

# Create the plot
ggplot(data=all_models, aes(x=Number_of_variables, y=Adjusted_R2)) + geom_point(size=0.7, color="gray70") +
  labs(x = "\nNumber of explanatory variables", y = expression(paste("Adjusted ", R^{2}))) + theme_light() + 
  geom_point(data=mean_adjusted_r2, aes(x=Number_of_variables, y=mean_Adjusted_R2), color="red", size=2) +
  geom_line(data=mean_adjusted_r2, aes(x=Number_of_variables, y=mean_Adjusted_R2), color="red", size=0.2, linetype="dashed") +
  ggtitle(expression(paste("Relationship between\nnumber of explanatory variables and adjusted ", R^{2}))) + 
  theme(plot.title = element_text(lineheight=1, size=14, margin=margin(t=15, r=10, b=5, l=0))) +
  theme(axis.title.y = element_text(size = 12, angle = 90)) + 
  theme(axis.title.x = element_text(size = 12))


# Plot for the relationship between number of explanatory 
# variables and predicted R²

# Calculate mean predicted R² values for each group of models 
# (1-20 variables)
mean_predicted_r2 <- all_models %>%
  group_by(Number_of_variables) %>%
  summarize(mean_Predicted_R2 = mean(Predicted_R2))

# Create the plot
ggplot(data=all_models, aes(x=Number_of_variables, y=Predicted_R2)) + geom_point(size=0.7, color="gray70") +
  labs(x = "\nNumber of explanatory variables", y = expression(paste("Predicted ", R^{2}))) + theme_light() +
  geom_point(data=mean_predicted_r2, aes(x=Number_of_variables, y=mean_Predicted_R2), color="red", size=2) +
  geom_line(data=mean_predicted_r2, aes(x=Number_of_variables, y=mean_Predicted_R2), color="red", size=0.2, linetype="dashed") +
  ggtitle(expression(paste("Relationship between\nnumber of explanatory variables and predicted ", R^{2}))) + 
  theme(plot.title = element_text(lineheight=1, size=14, margin=margin(t=15, r=10, b=5, l=0))) +
  theme(axis.title.y = element_text(size = 12, angle = 90)) + 
  theme(axis.title.x = element_text(size = 12))

# Plot for the relationship between number of explanatory 
# variables and Mallow's Cp

# Calculate mean Mallow's Cp values for each group of models 
# (1-20 variables)
mean_mallows_cp <- all_models %>%
  group_by(Number_of_variables) %>%
  summarize(mean_Mallows_Cp = mean(Mallows_Cp))

# Create the plot
ggplot(data=all_models, aes(x=Number_of_variables, y=Mallows_Cp)) + geom_point(size=0.7, color="gray70") +
  labs(x="\nNumber of explanatory variables",y = expression(paste("Mallow's ", C[p]))) + theme_light() +
  geom_line(data=mean_mallows_cp, aes(x=Number_of_variables, y=mean_Mallows_Cp), color="red", size=0.3, linetype="dashed") +
  geom_point(data=mean_mallows_cp, aes(x=Number_of_variables, y=mean_Mallows_Cp), color="red", size=2) +
  ggtitle(expression(paste("Relationship between\nnumber of explanatory variables and Mallow's ", C[p]))) + 
  theme(plot.title = element_text(lineheight=1, size=14, margin=margin(t=15, r=10, b=5, l=0))) +
  theme(axis.title.y = element_text(size = 12, angle = 90)) + 
  theme(axis.title.x = element_text(size = 12))

# Plot for the relationship between number of explanatory
# variables and the models' final rank

# Rank the models according to AIC, BIC, Mallow's Cp,
# adjusted R² and predicted R²
all_models_ranked <- all_models %>%
  mutate(
    rank_AIC = rank(AIC),
    rank_BIC = rank(BIC),
    rank_Mallows = rank(Mallows_Cp),
    rank_adjusted_R2 = rank(desc(Adjusted_R2)),
    rank_predicted_R2 = rank(desc(Predicted_R2))
  )

# Sum the ranks for each model to give them a final position
all_models_ranked <- all_models_ranked %>%
  mutate(final_rank = rank_AIC + rank_BIC + rank_adjusted_R2 + rank_predicted_R2 + rank_Mallows)

# Calculate the mean position for models with different
# numbers of explanatory variables
mean_position <- all_models_ranked %>%
  group_by(Number_of_variables) %>%
  summarize(mean_final_rank = mean(final_rank))

# Create the plot
ggplot(data=mean_position, aes(x=Number_of_variables, y=mean_final_rank)) +
  geom_line(size=0.5, color="gray", linetype="dashed") +
  geom_point(size=2, color="black") +
  labs(x="\nNumber of explanatory variables", y="Mean final rank") +
  theme_light() +
  ggtitle("Relationship between mean final rank\nand number of explanatory variables") +
  theme(plot.title = element_text(lineheight=1, size=14, margin=margin(t=5, r=10, b=5, l=0))) +
  theme(axis.title.y = element_text(size=12, angle=90)) + 
  theme(axis.title.x = element_text(size=12))

############################################################

# 3 Print model diagnostics

# Print a table that counts the number of models with multicollinearity (including NA values)
invisible(multicollinearity <- table(all_models$Multicollinearity, useNA = "ifany"))
print(multicollinearity)

# Print the number of models with leverage points
table(all_models$Leverage_points)

# Print the number of models with outliers
table(all_models$Outliers)

# Print the number of significant models
table(all_models$Ftest)

# Create a plot showing the number of models of each size (with each number of significant variables)
table_modelsize <- table(all_models$Number_of_variables)
table_modelsize_df <- as.data.frame(table_modelsize)
ggplot(table_modelsize_df, aes(x = Var1, y = Freq)) +
  geom_bar(stat = "identity", fill = "gray") +
  labs(x = "Number of explanatory variables", y = "Number of included models", title = "Number of models of each size") +
  theme_minimal()