# ============================================================================== # Additional tests for matching preprocessing functions to increase coverage # ============================================================================== # ------------------------------------------------------------------------------ # check_variable_health tests # ------------------------------------------------------------------------------ test_that("check_variable_health errors with no variables", { left <- data.frame(x = 1:5) right <- data.frame(x = 1:5) expect_error( couplr:::check_variable_health(left, right, vars = character(0)), "No variables provided" ) }) test_that("check_variable_health detects all-NA variable", { left <- data.frame(x = c(NA, NA, NA), y = 1:3) right <- data.frame(x = c(NA, NA, NA), y = 4:6) result <- couplr:::check_variable_health(left, right, vars = c("x", "y")) expect_true("x" %in% result$exclude_vars) expect_true(any(sapply(result$issues, function(i) i$type == "all_na"))) expect_true(any(grepl("all values are NA", result$warnings))) }) test_that("check_variable_health detects constant variable", { left <- data.frame(x = rep(5, 5), y = 1:5) right <- data.frame(x = rep(5, 5), y = 6:10) result <- couplr:::check_variable_health(left, right, vars = c("x", "y")) expect_true("x" %in% result$exclude_vars) expect_true(any(sapply(result$issues, function(i) i$type == "constant"))) }) test_that("check_variable_health detects low variance", { left <- data.frame(x = c(1, 1, 1, 1.00000001), y = 1:4) right <- data.frame(x = c(1, 1, 1, 1), y = 5:8) result <- couplr:::check_variable_health(left, right, vars = c("x", "y"), low_variance_threshold = 1e-6) # x should trigger low variance warning issues <- result$issues low_var_issues <- Filter(function(i) i$type == "low_variance", issues) expect_true(length(low_var_issues) > 0 || "x" %in% result$exclude_vars) }) test_that("check_variable_health detects high missingness", { left <- data.frame(x = c(NA, NA, NA, NA, 5), y = 1:5) right <- data.frame(x = c(NA, NA, NA, NA, 5), y = 6:10) result <- couplr:::check_variable_health(left, right, vars = c("x", "y"), high_missingness_threshold = 0.5) issues <- result$issues miss_issues <- Filter(function(i) i$type == "high_missingness", issues) expect_true(length(miss_issues) > 0) }) test_that("check_variable_health detects extreme skewness", { # Create highly skewed data left <- data.frame(x = c(1, 1, 1, 1, 1, 1, 1, 1, 1, 100), y = 1:10) right <- data.frame(x = c(1, 1, 1, 1, 1, 1, 1, 1, 1, 100), y = 11:20) result <- couplr:::check_variable_health(left, right, vars = c("x", "y")) issues <- result$issues skew_issues <- Filter(function(i) i$type == "skewed", issues) expect_true(length(skew_issues) > 0) }) test_that("check_variable_health returns proper structure", { left <- data.frame(x = 1:5, y = 6:10) right <- data.frame(x = 11:15, y = 16:20) result <- couplr:::check_variable_health(left, right, vars = c("x", "y")) expect_s3_class(result, "variable_health") expect_true("summary" %in% names(result)) expect_true("issues" %in% names(result)) expect_true("exclude_vars" %in% names(result)) expect_true("warnings" %in% names(result)) expect_true("variable" %in% names(result$summary)) expect_true("mean" %in% names(result$summary)) expect_true("sd" %in% names(result$summary)) }) test_that("check_variable_health handles multiple issues on same variable", { # Variable with both high missingness and skewness set.seed(123) vals <- c(rep(NA, 6), rep(1, 3), 1000) left <- data.frame(x = vals[1:5], y = 1:5) right <- data.frame(x = vals[6:10], y = 6:10) result <- couplr:::check_variable_health(left, right, vars = c("x", "y"), high_missingness_threshold = 0.3) # Should have multiple issues expect_true(length(result$issues) >= 1) }) # ------------------------------------------------------------------------------ # suggest_scaling tests # ------------------------------------------------------------------------------ test_that("suggest_scaling returns 'none' for empty vars", { left <- data.frame(x = 1:5) right <- data.frame(x = 1:5) result <- couplr:::suggest_scaling(left, right, vars = character(0)) expect_equal(result, "none") }) test_that("suggest_scaling suggests robust for skewed data", { # Create highly skewed data left <- data.frame(x = c(1, 1, 1, 1, 1, 100)) right <- data.frame(x = c(1, 1, 1, 1, 1, 100)) result <- couplr:::suggest_scaling(left, right, vars = "x") expect_true(result %in% c("robust", "standardize", "range", "none")) }) test_that("suggest_scaling suggests standardize for normal data", { set.seed(123) left <- data.frame(x = rnorm(100), y = rnorm(100)) right <- data.frame(x = rnorm(100), y = rnorm(100)) result <- couplr:::suggest_scaling(left, right, vars = c("x", "y")) expect_true(result %in% c("standardize", "range", "robust", "none")) }) # ------------------------------------------------------------------------------ # preprocess_matching_vars tests # ------------------------------------------------------------------------------ test_that("preprocess_matching_vars works with default settings", { left <- data.frame(x = 1:5, y = 6:10) right <- data.frame(x = 11:15, y = 16:20) result <- preprocess_matching_vars(left, right, vars = c("x", "y")) expect_true("vars" %in% names(result)) expect_true("scaling_method" %in% names(result)) }) test_that("preprocess_matching_vars removes problematic vars", { left <- data.frame(const = rep(5, 5), good = 1:5) right <- data.frame(const = rep(5, 5), good = 6:10) result <- preprocess_matching_vars(left, right, vars = c("const", "good"), check_health = TRUE, remove_problematic = TRUE, verbose = FALSE) expect_true("good" %in% result$vars) expect_false("const" %in% result$vars) }) test_that("preprocess_matching_vars keeps problematic vars when requested", { left <- data.frame(const = rep(5, 5), good = 1:5) right <- data.frame(const = rep(5, 5), good = 6:10) result <- preprocess_matching_vars(left, right, vars = c("const", "good"), check_health = TRUE, remove_problematic = FALSE, verbose = FALSE) expect_true("const" %in% result$vars) expect_true("good" %in% result$vars) }) test_that("preprocess_matching_vars auto-selects scaling", { set.seed(123) left <- data.frame(x = rnorm(50), y = rnorm(50) + 100) right <- data.frame(x = rnorm(50), y = rnorm(50) + 100) result <- preprocess_matching_vars(left, right, vars = c("x", "y"), auto_scale = TRUE, scale_method = "auto", verbose = FALSE) expect_true(result$scaling_method %in% c("standardize", "range", "robust", "none")) }) test_that("preprocess_matching_vars respects explicit scale_method", { left <- data.frame(x = 1:5) right <- data.frame(x = 6:10) result <- preprocess_matching_vars(left, right, vars = "x", auto_scale = FALSE, scale_method = "robust", verbose = FALSE) expect_equal(result$scaling_method, "robust") }) test_that("preprocess_matching_vars returns health info", { left <- data.frame(x = 1:5, y = 6:10) right <- data.frame(x = 11:15, y = 16:20) result <- preprocess_matching_vars(left, right, vars = c("x", "y"), check_health = TRUE, verbose = FALSE) expect_true("health" %in% names(result)) }) test_that("preprocess_matching_vars handles verbose output", { left <- data.frame(x = 1:5) right <- data.frame(x = 6:10) output <- capture.output({ result <- preprocess_matching_vars(left, right, vars = "x", check_health = TRUE, auto_scale = TRUE, verbose = TRUE) }) expect_true(length(output) > 0 || is.list(result)) }) # ------------------------------------------------------------------------------ # print.variable_health tests # ------------------------------------------------------------------------------ test_that("print.variable_health works", { left <- data.frame(x = 1:5, y = c(NA, NA, 3, 4, 5)) right <- data.frame(x = 6:10, y = c(6, NA, 8, 9, 10)) health <- couplr:::check_variable_health(left, right, vars = c("x", "y")) output <- capture.output(print(health)) expect_true(any(grepl("Variable Health", output))) }) # ------------------------------------------------------------------------------ # print.preprocessing_result tests # ------------------------------------------------------------------------------ test_that("print.preprocessing_result works", { left <- data.frame(x = 1:5) right <- data.frame(x = 6:10) result <- preprocess_matching_vars(left, right, vars = "x", verbose = FALSE) output <- capture.output(print(result)) expect_true(length(output) >= 1) }) # ------------------------------------------------------------------------------ # Edge cases # ------------------------------------------------------------------------------ test_that("check_variable_health handles single row data", { left <- data.frame(x = 1) right <- data.frame(x = 2) result <- couplr:::check_variable_health(left, right, vars = "x") expect_s3_class(result, "variable_health") }) test_that("check_variable_health handles many variables", { n <- 20 left <- as.data.frame(matrix(rnorm(n * 10), nrow = 10)) right <- as.data.frame(matrix(rnorm(n * 10), nrow = 10)) result <- couplr:::check_variable_health(left, right, vars = names(left)) expect_equal(nrow(result$summary), n) }) test_that("preprocess_matching_vars errors when all vars are problematic", { left <- data.frame(x = rep(1, 5), y = rep(2, 5)) right <- data.frame(x = rep(1, 5), y = rep(2, 5)) # When all variables are excluded due to health issues, an error is thrown expect_error( preprocess_matching_vars(left, right, vars = c("x", "y"), check_health = TRUE, remove_problematic = TRUE, verbose = FALSE), "All variables were excluded" ) })