# Tests for ROBOS_RM_SMOTE()
# Run with: devtools::test() or
#   testthat::test_file("tests/testthat/test-rm_smote.R")

# Helper: create a small imbalanced dataset
#
# Builds a data frame with two numeric features (x1, x2) and a factor column
# `class` with levels "negative" and "positive":
#   * n_maj "negative" rows drawn from N(0, 1) on both features,
#   * n_min "positive" rows drawn from N(3, 1) on both features,
#   * n_out "positive" rows drawn from N(15, 0.1) on both features (skipped
#     entirely when n_out is not greater than 0).
# The rows are shuffled before being returned.
#
# Arguments:
#   n_maj  Number of majority ("negative") rows.
#   n_min  Number of regular minority ("positive") rows.
#   n_out  Number of far-away minority rows; 0 disables them.
#   seed   Passed to set.seed() before any random draw.
#
# Returns: a data frame with columns x1, x2, class.
#
# Side effect: calls set.seed(seed), so the global RNG state is reset on every
# call. Code that runs after this helper therefore starts from a fixed RNG
# state.
make_imbalanced <- function(n_maj = 100, n_min = 20, n_out = 3, seed = 42) {
  set.seed(seed)
  class_levels <- c("negative", "positive")

  # One group of points: x1 is drawn before x2, both from N(mu, sigma).
  make_group <- function(n, mu, sigma, label) {
    data.frame(
      x1 = rnorm(n, mean = mu, sd = sigma),
      x2 = rnorm(n, mean = mu, sd = sigma),
      class = factor(label, levels = class_levels)
    )
  }

  # Draw order (majority, minority, outliers) is kept fixed so that a given
  # seed always yields the same dataset.
  majority <- make_group(n_maj, 0, 1, "negative")
  minority <- make_group(n_min, 3, 1, "positive")

  if (n_out > 0) {
    outliers <- make_group(n_out, 15, 0.1, "positive")
    df <- rbind(majority, minority, outliers)
  } else {
    df <- rbind(majority, minority)
  }

  # Shuffle rows so classes are not stored in contiguous runs.
  df[sample(nrow(df)), ]
}

# ── Output structure ───────────────────────────────────────────────────────

test_that("ROBOS_RM_SMOTE returns a data frame", {
  dt <- make_imbalanced()
  result <- ROBOS_RM_SMOTE(dt, target = "positive", eIR = 1)
  expect_s3_class(result, "data.frame")
})

test_that("output has the same columns as input", {
  dt <- make_imbalanced()
  result <- ROBOS_RM_SMOTE(dt, target = "positive", eIR = 1)
  expect_equal(colnames(result), colnames(dt))
})

test_that("output has more rows than input", {
  dt <- make_imbalanced()
  result <- ROBOS_RM_SMOTE(dt, target = "positive", eIR = 1)
  expect_gt(nrow(result), nrow(dt))
})

# ── Class balance ──────────────────────────────────────────────────────────

test_that("eIR = 1 produces a balanced dataset", {
  dt <- make_imbalanced(n_maj = 100, n_min = 20, n_out = 0)
  result <- ROBOS_RM_SMOTE(dt, target = "positive", eIR = 1)
  n_neg <- sum(result$class == "negative")
  n_pos <- sum(result$class == "positive")
  # After balancing: positive count should equal majority count
  expect_equal(n_neg, n_pos)
})

test_that("minority class count increases after ROBOS_RM_SMOTE", {
  dt <- make_imbalanced()
  n_min_before <- sum(dt$class == "positive")
  result <- ROBOS_RM_SMOTE(dt, target = "positive", eIR = 1)
  n_min_after <- sum(result$class == "positive")
  expect_gt(n_min_after, n_min_before)
})

test_that("majority class count is unchanged", {
  dt <- make_imbalanced()
  n_maj_before <- sum(dt$class == "negative")
  result <- ROBOS_RM_SMOTE(dt, target = "positive", eIR = 1)
  n_maj_after <- sum(result$class == "negative")
  expect_equal(n_maj_after, n_maj_before)
})

# ── eIR parameter ─────────────────────────────────────────────────────────

test_that("eIR > 1 results in partial balancing", {
  dt <- make_imbalanced(n_maj = 100, n_min = 20, n_out = 0)
  result_eir1 <- ROBOS_RM_SMOTE(dt, target = "positive", eIR = 1)
  result_eir2 <- ROBOS_RM_SMOTE(dt, target = "positive", eIR = 2)
  # eIR=2 should add fewer synthetics than eIR=1
  expect_lt(nrow(result_eir2), nrow(result_eir1))
})

# ── dup_size parameter ─────────────────────────────────────────────────────

test_that("dup_size controls number of synthetic samples", {
  dt <- make_imbalanced(n_maj = 100, n_min = 20, n_out = 0)
  result <- ROBOS_RM_SMOTE(dt, target = "positive", dup_size = 2)
  n_synthetic <- nrow(result) - nrow(dt)
  # dup_size = 2 → 2 * 20 = 40 synthetic samples
  expect_equal(n_synthetic, 40)
})

# ── Covariance methods ─────────────────────────────────────────────────────

test_that("all cov_method options run without error", {
  dt <- make_imbalanced(n_maj = 80, n_min = 20, n_out = 0)
  methods <- c("mcd", "mve", "mest", "mmest", "sde", "sest", "ogk")
  for (m in methods) {
    expect_no_error(
      ROBOS_RM_SMOTE(dt, target = "positive", eIR = 1, cov_method = m)
    )
  }
})

# ── Weight functions ───────────────────────────────────────────────────────

test_that("all weight_func options run without error", {
  dt <- make_imbalanced()
  for (wf in 1:3) {
    expect_no_error(
      ROBOS_RM_SMOTE(dt, target = "positive", eIR = 1, weight_func = wf)
    )
  }
})

# ── Outlier robustness ─────────────────────────────────────────────────────

test_that("ROBOS_RM_SMOTE with outliers produces fewer outlier-derived synthetics (weight_func=1)", {
  # With weight_func=1, outliers get weight 0 → cannot be selected as parents
  # All synthetic samples should lie within the normal minority region
  set.seed(42)
  class_levels <- c("negative", "positive")
  normal_min <- data.frame(
    x1 = rnorm(20, 3, 1),
    x2 = rnorm(20, 3, 1),
    class = factor("positive", levels = class_levels)
  )
  outlier_min <- data.frame(
    x1 = c(20, 21),
    x2 = c(20, 21),
    class = factor("positive", levels = class_levels)
  )
  majority <- data.frame(
    x1 = rnorm(80),
    x2 = rnorm(80),
    class = factor("negative", levels = class_levels)
  )
  dt <- rbind(majority, normal_min, outlier_min)

  result <- ROBOS_RM_SMOTE(
    dt,
    target = "positive",
    eIR = 1,
    weight_func = 1,
    cov_method = "mcd"
  )

  # Synthetic observations are the new rows.
  # NOTE(review): this assumes the function appends synthetic rows after the
  # original ones — confirm against the implementation.
  synthetic <- tail(result, nrow(result) - nrow(dt))

  # all() on an empty vector is TRUE, so make sure there is something to
  # check; otherwise the expectation below would pass vacuously.
  expect_gt(nrow(synthetic), 0)

  # No synthetic x1 value should be close to the outlier region (> 15)
  expect_true(
    all(synthetic$x1 < 15),
    info = "Synthetics should not be generated near outlier region"
  )
})

# ── Input validation ───────────────────────────────────────────────────────

test_that("missing 'class' column raises error", {
  dt <- data.frame(x1 = rnorm(50), x2 = rnorm(50))
  expect_error(ROBOS_RM_SMOTE(dt), regexp = "class")
})

test_that("unknown target raises error", {
  dt <- make_imbalanced()
  expect_error(ROBOS_RM_SMOTE(dt, target = "unknown"), regexp = "not found")
})

test_that("non-data-frame input raises error", {
  expect_error(ROBOS_RM_SMOTE(matrix(1:20, 10, 2)), regexp = "data frame")
})

test_that("eIR >= IR raises error", {
  dt <- make_imbalanced(n_maj = 100, n_min = 20, n_out = 0)
  # IR = 5, so eIR = 6 should fail
  expect_error(
    ROBOS_RM_SMOTE(dt, target = "positive", eIR = 6),
    regexp = "imbalance ratio"
  )
})

test_that("negative k raises error", {
  dt <- make_imbalanced()
  expect_error(
    ROBOS_RM_SMOTE(dt, target = "positive", k = -1),
    regexp = "positive"
  )
})

# ── Reproducibility ────────────────────────────────────────────────────────

test_that("same seed produces identical results", {
  dt <- make_imbalanced()
  set.seed(99)
  r1 <- ROBOS_RM_SMOTE(dt, target = "positive", eIR = 1)
  set.seed(99)
  r2 <- ROBOS_RM_SMOTE(dt, target = "positive", eIR = 1)
  expect_equal(r1, r2)
})