# R code #install necessary packages library(leaps) library(gvlma) library(ggplot2) library(magrittr) library(dplyr) library(car) # Set seed for reproducibility set.seed(1) # Number of data sets num_datasets <- 1000 # Function to create a data set generate_dataset <- function(seed) { # Number of observations per data set n <- 1000 # Number of independent variables p <- 20 set.seed(seed) explanatoryvariables <- matrix(rnorm(n * p), nrow = n) true_coefficients <- rnorm(p) y <- explanatoryvariables %*% true_coefficients + rnorm(n) if (runif(1) < 0.5) { y <- y + explanatoryvariables[, 1]^2 } dataset <- data.frame(Y = y, explanatoryvariables) names(dataset) <- c("y", paste0("x", 1:p)) return(dataset) } # Function to calculate predicted R2 calculate_pred_r2 <- function(actual_values, predicted_values) { ss_res <- sum((actual_values-predicted_values)^2) ss_tot <- sum((actual_values -mean(actual_values))^2) pred_r2 <- 1-(ss_res/ss_tot) return(pred_r2) } # Initialize an empty list to store future results results_list <- list() # Loop that creates each data set, divides it into a training set and # a validation set and performs best subset selection for (i in 1:num_datasets) { dataset <- generate_dataset(i) # Split the data set into a training set and a validation set set.seed(i) # The seed used for the split is the same as the seed used when creating the data set chosenrows <- sample.int(n, size = 500) training <- dataset[chosenrows, ] validation <- dataset[-chosenrows, ] # Perform best subset selection with all variables included subset <- regsubsets(y ~ ., data = training, nvmax = 20) summary_subset <- summary(subset) # Create lists for values f_test_results <- character(length=20) multicollinearity_flags <- character(length=20) assumption_violation_flags <- character(length=20) leverage_point_flags <- character(length=20) outlier_flags <- character(length=20) BIC_values <- numeric(length = 20) Radj_values <- numeric(length = 20) R2_values <- numeric(length = 20) Cp_values <- numeric(length = 20) AIC_values <- numeric(length = 20) predicted_r2_values <- numeric(length = 20) number_of_sig_variables <- numeric(length = 20) num_vars <- numeric(length = 20) # Loop that saves the models for each data set and extracts or calculates # different values for each model for (j in 1:20) { # Create the 20 models model_vars <- names(coef(subset, id = j))[-1] # Exclude intercept model_formula <- as.formula(paste("y ~", paste(model_vars, collapse = " + "))) model <- lm(model_formula, data = training) #Check values for each model # Extract the p-value from the F-test for each model f_stat <- summary(model)$fstatistic p_value <- pf(f_stat[1], f_stat[2], f_stat[3], lower.tail = FALSE) f_test_results [j] <- if(p_value < 0.05) "Yes" else "No" # Extract the p-values for each coefficients to count the number of significant coefficients in each model p_values <- summary(model)$coefficients[, "Pr(>|t|)"] p_values_corrected <- p.adjust(p_values[-1], method="holm") num_significant <- sum(p_values_corrected < 0.05) number_of_sig_variables[j] <- num_significant # Check VIF-values and flag for multicollinearity (VIF > 5) if(length(model_vars) > 1) { vif_values <- car::vif(model) multicollinearity_flags[j] <- ifelse(any(vif_values > 5), "Yes", "No") } else { multicollinearity_flags[j] <- NA #One explanatory variable } # Check model assumptions using the gvlma command gvlma_result <- gvlma(model) pvals <- summary(gvlma_result)$`p-value` assumption_violation_flags[j] <- ifelse(any(pvals < 0.05), "Yes", "No") # Check for leverage points n <- nrow(training) leverage_values <- hatvalues(model) cutoff <- 4/n leverage_point_flags[j] <- ifelse(any(leverage_values > cutoff), "Yes", "No") # Check for outliers (residuals bigger than 4) residuals <- residuals(model) outlier_flags[j] <- if(any(abs(residuals) > 4)) "Yes" else "No" # Extract BIC, adjusted R², R² and Mallow's Cp for each model BIC_values[j] <- BIC(model) Radj_values[j] <- summary_subset$adjr2[j] R2_values[j] <- summary_subset$rsq[j] Cp_values[j] <- summary_subset$cp[j] AIC_values[j] <- AIC(model) # Calculate predicted R² for each model predicted_values <- predict(model, newdata = validation) actual_values <- validation$y pred_r2 <- calculate_pred_r2(actual_values, predicted_values) predicted_r2_values[j] <- pred_r2 # Variable that counts the number of explanatory variables in the model num_vars[j] <-length(coef(model)) -1 # Combine all results so far into a final results data frame dataset_results <- data.frame( Dataset_number = i, Model_number = 1:20, Number_of_variables = num_vars, Ftest = f_test_results, Number_of_significant_variables = number_of_sig_variables, AIC = AIC_values, BIC = BIC_values, Adjusted_R2 = Radj_values, R2 = R2_values, Predicted_R2 = predicted_r2_values, Mallows_Cp = Cp_values, Multicollinearity = multicollinearity_flags, Assumption_violation = assumption_violation_flags, Leverage_points = leverage_point_flags, Outliers = outlier_flags ) } # Exclude models that violate the model assumptions according to gvlma dataset_results_filtered <- subset(dataset_results, Assumption_violation == "No") # Store the results in the list results_list[[i]] <- dataset_results_filtered } # Combine all results into one data frame all_models <- do.call(rbind, results_list) # View the results View(all_models) ############################################################ # 2 Create plots for the indcluded models # Plot for the relationship between number of variables and # the mean number of statistically significant variables # Calculate mean and standard deviation for each level of p mean_sd_values <- all_models %>% group_by(Number_of_variables) %>% summarise(mean_value = mean(Number_of_significant_variables), sd_value = sd(Number_of_significant_variables)) # Create the plot ggplot(data = mean_sd_values, aes(x = Number_of_variables, y = mean_value)) + geom_vline(xintercept = mean_sd_values$Number_of_variables, linetype = "dotted", color = "grey80") + geom_errorbar(aes(ymin = mean_value - sd_value, ymax = mean_value + sd_value), width = 0.2, color = "skyblue2") + geom_text(aes(label = round(mean_value, 2)), vjust = -2, size = 2.5) + geom_point(size = 2, color = "black") + labs( title = "Relationship between number of variables\nand mean number of statistically significant variables", x = "\nNumber of explanatory variables", y = "Mean number of statistically significant variables" ) + theme_light() + theme(plot.title = element_text(), panel.grid.major.x = element_blank(), panel.grid.minor.x = element_blank(), panel.grid.major.y = element_blank(), panel.grid.minor.y = element_blank()) # Plot for the relationship between number of explanatory # variables and percentage of models with only statistically # significant explanatory variables # Calculate the percentage for each group within p # that meets the criteria p = number_of_sig percentage_df <- all_models %>% group_by(Number_of_variables) %>% summarise( total_count = n(), match_count = sum(Number_of_variables == Number_of_significant_variables), percentage = (match_count / total_count) * 100 ) # Create the plot ggplot(data = percentage_df, aes(x = Number_of_variables, y = percentage)) + geom_text(aes(label = round(percentage, 2)), vjust = -1, size = 3) + geom_vline(xintercept = percentage_df$Number_of_variables, linetype = "dotted", color = "grey80") + geom_point(size = 2, color = "black") + labs( title = "Relationship between number of variables and percentage of\nmodels with only statistically significant variables", x = "\nNumber of explanatory variables", y = "Percentage with only statistically significant variables" ) + theme_light() + theme(plot.title = element_text(margin=margin(t=5, r=10, b=10, l=0)), panel.grid.major.x = element_blank(), panel.grid.minor.x = element_blank(), panel.grid.major.y = element_blank(), panel.grid.minor.y = element_blank()) # Plot for the relationship between number of explanatory # variables and AIC # Calculate mean AIC values for each group of models (with # 1-20 variables) aic_values <- all_models %>% group_by(Number_of_variables) %>% summarize(mean_AIC = mean(AIC)) # Create the plot ggplot(data=all_models, aes(x=Number_of_variables, y=AIC)) + geom_point(size=0.7, color="gray70") + labs(x="\nNumber of explanatory variables",y="AIC") + theme_light() + geom_point(data=aic_values, aes(x=Number_of_variables, y=mean_AIC), color="red", size=2) + geom_line(data=aic_values, aes(x=Number_of_variables, y=mean_AIC), color="red", size=0.3, linetype="dashed") + ggtitle("Relationship between\nnumber of explanatory variables and AIC") + theme(plot.title = element_text(lineheight=1, size=14)) + theme(axis.title.y = element_text(size = 12, angle = 90)) + theme(axis.title.x = element_text(size = 12)) # Plot for the relationship between number of explanatory # variables and BIC # Calculate mean BIC values for each group of models (with # 1-20 variables) mean_bic <- all_models %>% group_by(Number_of_variables) %>% summarize(mean_BIC = mean(BIC)) # Create the plot ggplot(data=all_models, aes(x=Number_of_variables, y=BIC)) + geom_point(size=0.7, color="gray70") + labs(x="\nNumber of explanatory variables",y="BIC") + theme_light() + geom_point(data=mean_bic, aes(x=Number_of_variables, y=mean_BIC), color="red", size=2) + geom_line(data=mean_bic, aes(x=Number_of_variables, y=mean_BIC), color="red", size=0.3, linetype="dashed") + ggtitle("Relationship between\nnumber of explanatory variables and BIC") + theme(plot.title = element_text(lineheight=1, size=14)) + theme(axis.title.y = element_text(size = 12, angle = 90)) + theme(axis.title.x = element_text(size = 12)) # Plot for the relationship between number of explanatory # variables and adjusted R² # Calculate mean adjusted R² values for each group of models # (1-20 variables) mean_adjusted_r2 <- all_models %>% group_by(Number_of_variables) %>% summarize(mean_Adjusted_R2 = mean(Adjusted_R2)) # Create the plot ggplot(data=all_models, aes(x=Number_of_variables, y=Adjusted_R2)) + geom_point(size=0.7, color="gray70") + labs(x = "\nNumber of explanatory variables", y = expression(paste("Adjusted ", R^{2}))) + theme_light() + geom_point(data=mean_adjusted_r2, aes(x=Number_of_variables, y=mean_Adjusted_R2), color="red", size=2) + geom_line(data=mean_adjusted_r2, aes(x=Number_of_variables, y=mean_Adjusted_R2), color="red", size=0.2, linetype="dashed") + ggtitle(expression(paste("Relationship between\nnumber of explanatory variables and adjusted ", R^{2}))) + theme(plot.title = element_text(lineheight=1, size=14, margin=margin(t=15, r=10, b=5, l=0))) + theme(axis.title.y = element_text(size = 12, angle = 90)) + theme(axis.title.x = element_text(size = 12)) # Plot for the relationship between number of explanatory # variables and predicted R² # Calculate mean predicted R² values for each group of models # (1-20 variables) mean_predicted_r2 <- all_models %>% group_by(Number_of_variables) %>% summarize(mean_Predicted_R2 = mean(Predicted_R2)) # Create the plot ggplot(data=all_models, aes(x=Number_of_variables, y=Predicted_R2)) + geom_point(size=0.7, color="gray70") + labs(x = "\nNumber of explanatory variables", y = expression(paste("Predicted ", R^{2}))) + theme_light() + geom_point(data=mean_predicted_r2, aes(x=Number_of_variables, y=mean_Predicted_R2), color="red", size=2) + geom_line(data=mean_predicted_r2, aes(x=Number_of_variables, y=mean_Predicted_R2), color="red", size=0.2, linetype="dashed") + ggtitle(expression(paste("Relationship between\nnumber of explanatory variables and predicted ", R^{2}))) + theme(plot.title = element_text(lineheight=1, size=14, margin=margin(t=15, r=10, b=5, l=0))) + theme(axis.title.y = element_text(size = 12, angle = 90)) + theme(axis.title.x = element_text(size = 12)) # Plot for the relationship between number of explanatory # variables and Mallow's Cp # Calculate mean Mallow's Cp values for each group of models # (1-20 variables) mean_mallows_cp <- all_models %>% group_by(Number_of_variables) %>% summarize(mean_Mallows_Cp = mean(Mallows_Cp)) # Create the plot ggplot(data=all_models, aes(x=Number_of_variables, y=Mallows_Cp)) + geom_point(size=0.7, color="gray70") + labs(x="\nNumber of explanatory variables",y = expression(paste("Mallow's ", C[p]))) + theme_light() + geom_line(data=mean_mallows_cp, aes(x=Number_of_variables, y=mean_Mallows_Cp), color="red", size=0.3, linetype="dashed") + geom_point(data=mean_mallows_cp, aes(x=Number_of_variables, y=mean_Mallows_Cp), color="red", size=2) + ggtitle(expression(paste("Relationship between\nnumber of explanatory variables and Mallow's ", C[p]))) + theme(plot.title = element_text(lineheight=1, size=14, margin=margin(t=15, r=10, b=5, l=0))) + theme(axis.title.y = element_text(size = 12, angle = 90)) + theme(axis.title.x = element_text(size = 12)) # Plot for the relationship between number of explanatory # variables and the models' final rank # Rank the models according to AIC, BIC, Mallow's Cp, # adjusted R² and predicted R² all_models_ranked <- all_models %>% mutate( rank_AIC = rank(AIC), rank_BIC = rank(BIC), rank_Mallows = rank(Mallows_Cp), rank_adjusted_R2 = rank(desc(Adjusted_R2)), rank_predicted_R2 = rank(desc(Predicted_R2)) ) # Sum the ranks for each model to give them a final position all_models_ranked <- all_models_ranked %>% mutate(final_rank = rank_AIC + rank_BIC + rank_adjusted_R2 + rank_predicted_R2 + rank_Mallows) # Calculate the mean position for models with different # numbers of explanatory variables mean_position <- all_models_ranked %>% group_by(Number_of_variables) %>% summarize(mean_final_rank = mean(final_rank)) # Create the plot ggplot(data=mean_position, aes(x=Number_of_variables, y=mean_final_rank)) + geom_line(size=0.5, color="gray", linetype="dashed") + geom_point(size=2, color="black") + labs(x="\nNumber of explanatory variables", y="Mean final rank") + theme_light() + ggtitle("Relationship between mean final rank\nand number of explanatory variables") + theme(plot.title = element_text(lineheight=1, size=14, margin=margin(t=5, r=10, b=5, l=0))) + theme(axis.title.y = element_text(size=12, angle=90)) + theme(axis.title.x = element_text(size=12)) ############################################################ # 3 Print model diagnostics # Print a table that counts the number of models with multicollinearity (including NA values) invisible(multicollinearity <- table(all_models$Multicollinearity, useNA = "ifany")) print(multicollinearity) # Print the number of models with leverage points table(all_models$Leverage_points) # Print the number of models with outliers table(all_models$Outliers) # Print the number of significant models table(all_models$Ftest) # Create a plot showing the number of models of each size (with each number of significant variables) table_modelsize <- table(all_models$Number_of_variables) table_modelsize_df <- as.data.frame(table_modelsize) ggplot(table_modelsize_df, aes(x = Var1, y = Freq)) + geom_bar(stat = "identity", fill = "gray") + labs(x = "Number of explanatory variables", y = "Number of included models", title = "Number of models of each size") + theme_minimal()