# ============================================================================== # Additional tests for matching_preprocessing.R to reach 90%+ coverage # ============================================================================== # Create test datasets set.seed(456) left_df <- data.frame( id = 1:50, normal_var = rnorm(50, 100, 15), income = rnorm(50, 50000, 15000), stringsAsFactors = FALSE ) right_df <- data.frame( id = 51:100, normal_var = rnorm(50, 105, 15), income = rnorm(50, 52000, 15000), stringsAsFactors = FALSE ) # ------------------------------------------------------------------------------ # check_variable_health tests # ------------------------------------------------------------------------------ test_that("check_variable_health works with normal variables", { health <- couplr:::check_variable_health( left_df, right_df, vars = c("normal_var", "income") ) expect_s3_class(health, "variable_health") expect_true(is.data.frame(health$summary)) expect_equal(nrow(health$summary), 2) }) test_that("check_variable_health detects constant variables", { left_const <- left_df left_const$const_var <- 1 # Constant right_const <- right_df right_const$const_var <- 1 health <- couplr:::check_variable_health( left_const, right_const, vars = c("normal_var", "const_var") ) expect_true("const_var" %in% health$exclude_vars) expect_true(any(sapply(health$issues, function(x) x$type == "constant"))) }) test_that("check_variable_health detects all-NA variables", { left_na <- left_df left_na$na_var <- NA_real_ right_na <- right_df right_na$na_var <- NA_real_ health <- couplr:::check_variable_health( left_na, right_na, vars = c("normal_var", "na_var") ) expect_true("na_var" %in% health$exclude_vars) expect_true(any(sapply(health$issues, function(x) x$type == "all_na"))) }) test_that("check_variable_health detects high missingness", { left_miss <- left_df left_miss$miss_var <- c(1:10, rep(NA, 40)) # 80% missing right_miss <- right_df right_miss$miss_var <- c(1:10, rep(NA, 40)) health <- couplr:::check_variable_health( left_miss, right_miss, vars = c("normal_var", "miss_var"), high_missingness_threshold = 0.5 ) expect_true(any(sapply(health$issues, function(x) x$type == "high_missingness"))) }) test_that("check_variable_health detects low variance", { left_lowvar <- left_df left_lowvar$lowvar <- rep(c(1, 1.0000001), 25) # Very low variance right_lowvar <- right_df right_lowvar$lowvar <- rep(c(1, 1.0000001), 25) health <- couplr:::check_variable_health( left_lowvar, right_lowvar, vars = c("lowvar"), low_variance_threshold = 0.001 ) expect_true(any(sapply(health$issues, function(x) x$type == "low_variance"))) }) test_that("check_variable_health detects skewness", { left_skew <- left_df left_skew$skew_var <- exp(rnorm(50)) # Highly skewed right_skew <- right_df right_skew$skew_var <- exp(rnorm(50)) health <- couplr:::check_variable_health( left_skew, right_skew, vars = c("skew_var") ) # May or may not detect skewness depending on random values expect_s3_class(health, "variable_health") }) test_that("check_variable_health errors with no variables", { expect_error( couplr:::check_variable_health(left_df, right_df, vars = character(0)), "No variables provided" ) }) # ------------------------------------------------------------------------------ # suggest_scaling tests # ------------------------------------------------------------------------------ test_that("suggest_scaling returns 'none' for empty vars", { result <- couplr:::suggest_scaling(left_df, right_df, vars = character(0)) expect_equal(result, "none") }) test_that("suggest_scaling suggests 'robust' for outliers", { left_outlier <- left_df left_outlier$outlier_var <- c(rnorm(45, 10, 1), 100, 200, 300, 400, 500) right_outlier <- right_df right_outlier$outlier_var <- c(rnorm(45, 10, 1), 100, 200, 300, 400, 500) result <- couplr:::suggest_scaling( left_outlier, right_outlier, vars = "outlier_var" ) # May suggest robust due to outliers expect_true(result %in% c("robust", "standardize", "none")) }) test_that("suggest_scaling suggests 'standardize' for different scales", { left_scale <- data.frame( small = rnorm(50, 0, 1), large = rnorm(50, 0, 100) ) right_scale <- data.frame( small = rnorm(50, 0, 1), large = rnorm(50, 0, 100) ) result <- couplr:::suggest_scaling( left_scale, right_scale, vars = c("small", "large") ) expect_equal(result, "standardize") }) test_that("suggest_scaling handles NA values", { left_with_na <- left_df left_with_na$with_na <- c(rnorm(40), rep(NA, 10)) right_with_na <- right_df right_with_na$with_na <- c(rnorm(40), rep(NA, 10)) result <- couplr:::suggest_scaling( left_with_na, right_with_na, vars = "with_na" ) expect_true(result %in% c("robust", "standardize", "none")) }) # ------------------------------------------------------------------------------ # auto_encode_categorical tests # ------------------------------------------------------------------------------ test_that("auto_encode_categorical handles numeric variables", { left_num <- data.frame(x = 1:10) right_num <- data.frame(x = 11:20) result <- couplr:::auto_encode_categorical(left_num, right_num, "x") expect_equal(result$method, "none") expect_equal(result$left, 1:10) expect_equal(result$right, 11:20) }) test_that("auto_encode_categorical encodes binary variables", { left_bin <- data.frame(gender = c("M", "F", "M", "F", "M")) right_bin <- data.frame(gender = c("F", "M", "F", "M", "F")) result <- couplr:::auto_encode_categorical(left_bin, right_bin, "gender") expect_equal(result$method, "binary") expect_true(all(result$left %in% c(0, 1))) expect_true(all(result$right %in% c(0, 1))) }) test_that("auto_encode_categorical encodes ordered factors", { left_ord <- data.frame( edu = factor(c("low", "med", "high", "low", "med"), levels = c("low", "med", "high"), ordered = TRUE) ) right_ord <- data.frame( edu = factor(c("med", "high", "low", "med", "high"), levels = c("low", "med", "high"), ordered = TRUE) ) result <- couplr:::auto_encode_categorical(left_ord, right_ord, "edu") expect_equal(result$method, "ordered") expect_true(all(result$left %in% c(1, 2, 3))) }) test_that("auto_encode_categorical errors on unordered multi-level factor", { left_cat <- data.frame(color = c("red", "blue", "green", "red")) right_cat <- data.frame(color = c("blue", "green", "red", "blue")) expect_error( couplr:::auto_encode_categorical(left_cat, right_cat, "color"), "categorical but not binary or ordered" ) }) # ------------------------------------------------------------------------------ # preprocess_matching_vars tests # ------------------------------------------------------------------------------ test_that("preprocess_matching_vars works with auto_scale", { result <- preprocess_matching_vars( left_df, right_df, vars = c("normal_var", "income"), auto_scale = TRUE, scale_method = "auto" ) expect_s3_class(result, "preprocessing_result") expect_equal(length(result$vars), 2) }) test_that("preprocess_matching_vars removes problematic variables", { left_prob <- left_df left_prob$const <- 1 right_prob <- right_df right_prob$const <- 1 result <- suppressWarnings( preprocess_matching_vars( left_prob, right_prob, vars = c("normal_var", "const"), remove_problematic = TRUE, verbose = FALSE ) ) expect_false("const" %in% result$vars) expect_true("const" %in% result$excluded_vars) }) test_that("preprocess_matching_vars errors when all vars excluded", { left_bad <- data.frame(const = rep(1, 10)) right_bad <- data.frame(const = rep(1, 10)) expect_error( suppressWarnings( preprocess_matching_vars( left_bad, right_bad, vars = "const", remove_problematic = TRUE ) ), "All variables were excluded" ) }) test_that("preprocess_matching_vars errors with no variables", { expect_error( preprocess_matching_vars(left_df, right_df, vars = character(0)), "No variables provided" ) }) test_that("preprocess_matching_vars errors on missing variables", { expect_error( preprocess_matching_vars(left_df, right_df, vars = c("nonexistent")), "Variables not found" ) }) test_that("preprocess_matching_vars with scale_method=FALSE", { result <- preprocess_matching_vars( left_df, right_df, vars = c("normal_var"), auto_scale = FALSE, scale_method = FALSE ) expect_equal(result$scaling_method, "none") }) test_that("preprocess_matching_vars with verbose=TRUE issues messages", { left_outlier <- left_df left_outlier$outlier <- c(rnorm(45), rep(1000, 5)) right_outlier <- right_df right_outlier$outlier <- c(rnorm(45), rep(1000, 5)) expect_message( preprocess_matching_vars( left_outlier, right_outlier, vars = c("normal_var", "outlier"), auto_scale = TRUE, scale_method = "auto", verbose = TRUE ), "Auto-selected scaling method" ) }) test_that("preprocess_matching_vars with check_health=FALSE skips health check", { left_const <- left_df left_const$const <- 1 right_const <- right_df right_const$const <- 1 result <- preprocess_matching_vars( left_const, right_const, vars = c("normal_var", "const"), check_health = FALSE, auto_scale = FALSE ) # const is NOT excluded because health check is skipped expect_true("const" %in% result$vars) }) # ------------------------------------------------------------------------------ # Print methods # ------------------------------------------------------------------------------ test_that("print.variable_health works", { health <- couplr:::check_variable_health( left_df, right_df, vars = c("normal_var") ) expect_output(print(health), "Variable Health") }) test_that("print.variable_health shows excluded vars", { left_const <- left_df left_const$const <- 1 right_const <- right_df right_const$const <- 1 health <- couplr:::check_variable_health( left_const, right_const, vars = c("const", "normal_var") ) expect_output(print(health), "exclude") }) test_that("print.preprocessing_result works", { result <- preprocess_matching_vars( left_df, right_df, vars = c("normal_var"), auto_scale = FALSE ) expect_output(print(result), "Preprocessing Result") }) test_that("print.preprocessing_result shows excluded vars", { left_const <- left_df left_const$const <- 1 right_const <- right_df right_const$const <- 1 result <- suppressWarnings( preprocess_matching_vars( left_const, right_const, vars = c("const", "normal_var"), verbose = FALSE ) ) expect_output(print(result), "Excluded") })