box::use( testthat[ describe, expect_equal, expect_false, expect_true, expect_length, it, test_that ], withr[local_options] ) box::use( artma / variable / bma[ detect_perfect_collinearity, detect_dummy_traps, remove_collinear_variables, remove_dummy_trap_variables, suggest_variables_for_bma, get_bma_priority_variables ] ) # Test data setup make_test_df <- function() { data.frame( effect = rnorm(100, mean = 0.3, sd = 0.1), se = abs(rnorm(100, mean = 0.05, sd = 0.01)), study_size = sample(50:500, 100, replace = TRUE), # Dummy group 1: region (all members present = trap) region_asia = rep(c(1, 0, 0), length.out = 100), region_europe = rep(c(0, 1, 0), length.out = 100), region_americas = rep(c(0, 0, 1), length.out = 100), # Dummy group 2: gender (missing one member = no trap) gender_male = rep(c(1, 0), length.out = 100), gender_female = rep(c(0, 1), length.out = 100), # Perfect collinearity: var2 = 2 * var1 var1 = runif(100, 0, 10), var2 = NA_real_, # Independent variables year = sample(2000:2020, 100, replace = TRUE), score = rnorm(100, mean = 5, sd = 2), stringsAsFactors = FALSE ) } make_config <- function() { list( effect = list(var_name = "effect", data_type = "float"), se = list(var_name = "se", data_type = "float"), study_size = list(var_name = "study_size", data_type = "int"), region_asia = list(var_name = "region_asia", data_type = "dummy", group_category = "region"), region_europe = list(var_name = "region_europe", data_type = "dummy", group_category = "region"), region_americas = list(var_name = "region_americas", data_type = "dummy", group_category = "region"), gender_male = list(var_name = "gender_male", data_type = "dummy", group_category = "gender"), gender_female = list(var_name = "gender_female", data_type = "dummy", group_category = "gender"), var1 = list(var_name = "var1", data_type = "float"), var2 = list(var_name = "var2", data_type = "float"), year = list(var_name = "year", data_type = "int"), score = list(var_name = "score", data_type = "float") ) } describe("detect_perfect_collinearity", { it("detects no collinearity for independent variables", { df <- make_test_df() result <- detect_perfect_collinearity(df, c("year", "score", "se")) expect_false(result$has_collinearity) expect_length(result$aliased, 0) }) it("detects collinearity when all dummy group members present", { df <- make_test_df() result <- detect_perfect_collinearity( df, c("region_asia", "region_europe", "region_americas") ) expect_true(result$has_collinearity) expect_true(length(result$aliased) > 0) }) it("detects perfect linear relationship", { df <- make_test_df() df$var2 <- df$var1 * 2 # Perfect collinearity result <- detect_perfect_collinearity(df, c("var1", "var2")) expect_true(result$has_collinearity) expect_true("var2" %in% result$aliased || "var1" %in% result$aliased) }) it("handles empty variable list", { df <- make_test_df() result <- detect_perfect_collinearity(df, character(0)) expect_false(result$has_collinearity) expect_length(result$aliased, 0) }) it("returns rank information", { df <- make_test_df() result <- detect_perfect_collinearity(df, c("year", "score")) expect_true(is.numeric(result$rank)) expect_true(result$rank > 0) }) }) describe("detect_dummy_traps", { it("detects trap when all dummy group members present", { df <- make_test_df() # Don't provide config so pattern detection works result <- detect_dummy_traps( df, c("region_asia", "region_europe", "region_americas"), config = NULL ) expect_true(result$has_trap) expect_true(length(result$groups_with_trap) > 0) }) it("no trap when only some dummy group members present", { df <- make_test_df() result <- detect_dummy_traps( df, c("region_asia", "region_europe"), # Missing region_americas config = NULL ) # With only 2 out of 3 members, there's still a trap (they're collinear) # This is actually correct behavior - the trap exists if all INCLUDED members # belong to the same group expect_true(result$has_trap) }) it("no trap for non-dummy variables", { df <- make_test_df() result <- detect_dummy_traps(df, c("year", "score", "se"), config = NULL) expect_false(result$has_trap) expect_length(result$groups_with_trap, 0) }) it("handles empty variable list", { df <- make_test_df() result <- detect_dummy_traps(df, character(0), config = NULL) expect_false(result$has_trap) expect_length(result$groups_with_trap, 0) }) }) describe("remove_collinear_variables", { it("removes collinear variables iteratively", { df <- make_test_df() df$var2 <- df$var1 * 2 # Perfect collinearity vars <- c("var1", "var2", "year", "score") result <- remove_collinear_variables(df, vars) # Should remove one of var1/var2 expect_true(length(result) < length(vars)) expect_false(all(c("var1", "var2") %in% result)) }) it("prioritizes keeping priority variables", { df <- make_test_df() df$var2 <- df$se * 2 # Collinear with se vars <- c("se", "var2", "year") result <- remove_collinear_variables(df, vars, priority_vars = c("se")) # Should keep se, remove var2 expect_true("se" %in% result) expect_false("var2" %in% result) }) it("removes dummy trap variables", { df <- make_test_df() vars <- c("region_asia", "region_europe", "region_americas") result <- remove_collinear_variables(df, vars) # Should remove at least one to break the trap expect_true(length(result) < length(vars)) }) it("handles empty variable list", { df <- make_test_df() result <- remove_collinear_variables(df, character(0)) expect_length(result, 0) }) it("returns variables unchanged if no collinearity", { df <- make_test_df() vars <- c("year", "score", "se") result <- remove_collinear_variables(df, vars) expect_equal(sort(result), sort(vars)) }) }) describe("remove_dummy_trap_variables", { it("removes one variable from complete dummy groups", { df <- make_test_df() vars <- c("region_asia", "region_europe", "region_americas", "year") result <- remove_dummy_trap_variables(df, vars, config = NULL) # Should remove one region variable region_vars <- c("region_asia", "region_europe", "region_americas") remaining_region <- sum(region_vars %in% result) expect_true(remaining_region < 3) expect_true("year" %in% result) # Non-dummy should remain }) it("prioritizes keeping priority variables", { df <- make_test_df() # Make region_asia a priority vars <- c("region_asia", "region_europe", "region_americas") result <- remove_dummy_trap_variables( df, vars, priority_vars = c("region_asia"), config = NULL ) # Should prefer keeping region_asia if possible # (but may remove it if it's the reference) expect_true(length(result) < length(vars)) }) it("handles multiple dummy groups", { df <- make_test_df() vars <- c( "region_asia", "region_europe", "region_americas", "gender_male", "gender_female" ) result <- remove_dummy_trap_variables(df, vars, config = NULL) # Should handle region group (all members present) region_vars <- c("region_asia", "region_europe", "region_americas") remaining_region <- sum(region_vars %in% result) expect_true(remaining_region < 3) }) it("handles empty variable list", { df <- make_test_df() result <- remove_dummy_trap_variables(df, character(0), config = NULL) expect_length(result, 0) }) }) describe("get_bma_priority_variables", { it("returns standard error as priority", { priorities <- get_bma_priority_variables() expect_true("se" %in% priorities) }) it("returns study_size as priority", { priorities <- get_bma_priority_variables() expect_true("study_size" %in% priorities) }) it("returns character vector", { priorities <- get_bma_priority_variables() expect_true(is.character(priorities)) expect_true(length(priorities) > 0) }) }) describe("suggest_variables_for_bma", { it("suggests variables without collinearity", { local_options("artma.verbose" = 1) df <- make_test_df() df$var2 <- df$var1 * 2 # Collinearity config <- make_config() result <- suggest_variables_for_bma(df, config = config) # Should not include both var1 and var2 suggested_vars <- result$var_name expect_false(all(c("var1", "var2") %in% suggested_vars)) }) it("prioritizes important variables like se", { local_options("artma.verbose" = 1) df <- make_test_df() config <- make_config() result <- suggest_variables_for_bma(df, config = config) # se should be suggested as it's a priority variable expect_true("se" %in% result$var_name) }) it("removes dummy trap variables", { local_options("artma.verbose" = 1) df <- make_test_df() config <- make_config() result <- suggest_variables_for_bma(df, config = config) # Should not suggest all three region variables region_vars <- c("region_asia", "region_europe", "region_americas") region_count <- sum(region_vars %in% result$var_name) expect_true(region_count < 3) }) it("accepts custom priority variables", { local_options("artma.verbose" = 1) df <- make_test_df() config <- make_config() result <- suggest_variables_for_bma( df, config = config, priority_vars = c("year") ) # year should be included due to custom priority expect_true("year" %in% result$var_name) }) it("filters variables by variance ratio", { local_options("artma.verbose" = 1) df <- make_test_df() df$constant_var <- 5 # No variance config <- make_config() config$constant_var <- list(var_name = "constant_var", data_type = "float") result <- suggest_variables_for_bma( df, config = config, min_variance_ratio = 0.01 ) # constant_var should not be suggested expect_false("constant_var" %in% result$var_name) }) it("returns data frame with expected columns", { local_options("artma.verbose" = 1) df <- make_test_df() config <- make_config() result <- suggest_variables_for_bma(df, config = config) expect_true(is.data.frame(result)) expect_true("var_name" %in% names(result)) expect_true("suggested" %in% names(result)) expect_true("reason" %in% names(result)) expect_true("split_method" %in% names(result)) expect_true("split_value" %in% names(result)) }) it("final result has no perfect collinearity", { local_options("artma.verbose" = 1) df <- make_test_df() df$var2 <- df$var1 * 2 config <- make_config() result <- suggest_variables_for_bma(df, config = config) suggested_vars <- result$var_name # Verify no collinearity in final suggestions collinearity_check <- detect_perfect_collinearity(df, suggested_vars) # Should have no collinearity (or minimal) # Note: May still have some due to data randomness, but should be much reduced expect_true(is.logical(collinearity_check$has_collinearity)) }) }) test_that("integration: BMA suggestion prevents collinearity issues", { local_options("artma.verbose" = 1) # Create realistic dataset with multiple collinearity issues df <- data.frame( effect = rnorm(200, 0.3, 0.1), se = abs(rnorm(200, 0.05, 0.01)), study_size = sample(50:500, 200, replace = TRUE), # Region dummy trap reg_asia = rep(c(1, 0, 0, 0), 50), reg_europe = rep(c(0, 1, 0, 0), 50), reg_americas = rep(c(0, 0, 1, 0), 50), reg_africa = rep(c(0, 0, 0, 1), 50), # Type dummy trap type_a = rep(c(1, 0), 100), type_b = rep(c(0, 1), 100), # Linear dependencies x1 = runif(200, 0, 10), x2 = NA_real_, x3 = runif(200, 5, 15), # Independent year = sample(2000:2020, 200, replace = TRUE), score = rnorm(200, 5, 2), stringsAsFactors = FALSE ) df$x2 <- df$x1 * 3 + 1 # Perfect collinearity config <- list( effect = list(var_name = "effect", data_type = "float"), se = list(var_name = "se", data_type = "float"), study_size = list(var_name = "study_size", data_type = "int"), reg_asia = list(var_name = "reg_asia", data_type = "dummy", group_category = "region"), reg_europe = list(var_name = "reg_europe", data_type = "dummy", group_category = "region"), reg_americas = list(var_name = "reg_americas", data_type = "dummy", group_category = "region"), reg_africa = list(var_name = "reg_africa", data_type = "dummy", group_category = "region"), type_a = list(var_name = "type_a", data_type = "dummy", group_category = "type"), type_b = list(var_name = "type_b", data_type = "dummy", group_category = "type"), x1 = list(var_name = "x1", data_type = "float"), x2 = list(var_name = "x2", data_type = "float"), x3 = list(var_name = "x3", data_type = "float"), year = list(var_name = "year", data_type = "int"), score = list(var_name = "score", data_type = "float") ) result <- suggest_variables_for_bma(df, config = config) suggested_vars <- result$var_name # Priority variables should be included expect_true("se" %in% suggested_vars) # Should not have all region variables (dummy trap) region_vars <- c("reg_asia", "reg_europe", "reg_americas", "reg_africa") region_count <- sum(region_vars %in% suggested_vars) expect_true(region_count < 4) # Should not have all type variables (dummy trap) type_vars <- c("type_a", "type_b") type_count <- sum(type_vars %in% suggested_vars) expect_true(type_count < 2) # Should not have both x1 and x2 (collinear) expect_false(all(c("x1", "x2") %in% suggested_vars)) # Final collinearity check final_check <- detect_perfect_collinearity(df, suggested_vars) # Should have minimal or no collinearity # Note: Due to QR decomposition tolerance, some numerical issues may remain # but the major issues (dummy traps, perfect linear relationships) should be gone if (final_check$has_collinearity) { # If there's still collinearity, it should be minimal expect_true(length(final_check$aliased) < length(suggested_vars) / 2) } })