# ============================================================================= # test-utils.R — Unit tests for GeneSelectR utility functions # ============================================================================= # --------------------------------------------------------------------------- # percentile01 # --------------------------------------------------------------------------- test_that("percentile01 normalises to (0,1]", { x <- c(0, 1, 2, 3, 4, 5) out <- percentile01(x) expect_length(out, length(x)) expect_true(all(out >= 0 & out <= 1)) expect_equal(out[1], 0) expect_true(all(diff(out[-1]) >= 0)) }) test_that("percentile01 handles all-zero input", { expect_equal(percentile01(rep(0, 5)), rep(0, 5)) }) test_that("percentile01 handles single unique value", { expect_equal(percentile01(rep(3, 4)), rep(0, 4)) }) test_that("percentile01 handles NA values gracefully", { x <- c(1, 2, NA, 4) out <- percentile01(x) expect_length(out, 4) expect_equal(out[3], 0) }) test_that("percentile01 handles ties correctly", { x <- c(0, 2, 2, 5) out <- percentile01(x) expect_equal(out[2], out[3]) }) # --------------------------------------------------------------------------- # validate_inputs # --------------------------------------------------------------------------- test_that("validate_inputs passes with valid data", { X <- matrix(rnorm(200), nrow = 20, ncol = 10) colnames(X) <- paste0("Gene", 1:10) y <- factor(rep(c("A", "B"), each = 10)) expect_true(validate_inputs(X, y, colnames(X), n_folds = 5)) }) test_that("validate_inputs rejects non-matrix X", { expect_error( validate_inputs(data.frame(a = 1:10), factor(rep("A", 10))), "X must be a matrix" ) }) test_that("validate_inputs rejects non-factor y", { X <- matrix(1, 10, 2, dimnames = list(NULL, c("A", "B"))) expect_error(validate_inputs(X, c(0, 1, 0, 1, 0, 1, 0, 1, 0, 1)), "y must be a factor") }) test_that("validate_inputs rejects multi-class y", { X <- matrix(1, 15, 2, dimnames = list(NULL, c("A", "B"))) y <- factor(rep(c("A", "B", "C"), each = 5)) expect_error(validate_inputs(X, y), "exactly 2 levels") }) test_that("validate_inputs rejects dimension mismatch", { X <- matrix(1, 10, 2, dimnames = list(NULL, c("A", "B"))) y <- factor(rep(c("A", "B"), each = 6)) expect_error(validate_inputs(X, y), "mismatch") }) test_that("validate_inputs rejects duplicate gene names", { X <- matrix(1, 10, 3, dimnames = list(NULL, c("G1", "G1", "G2"))) y <- factor(rep(c("A", "B"), each = 5)) expect_error(validate_inputs(X, y), "Duplicate") }) test_that("validate_inputs rejects missing colnames", { X <- matrix(1, 10, 2) y <- factor(rep(c("A", "B"), each = 5)) expect_error(validate_inputs(X, y), "column names") }) test_that("validate_inputs warns about zero-variance genes", { X <- matrix(rnorm(200), 20, 10) X[, 3] <- 5 colnames(X) <- paste0("G", 1:10) y <- factor(rep(c("A", "B"), each = 10)) expect_warning(validate_inputs(X, y), "zero variance") }) test_that("validate_inputs warns about NAs", { X <- matrix(rnorm(200), 20, 10) X[1, 1] <- NA colnames(X) <- paste0("G", 1:10) y <- factor(rep(c("A", "B"), each = 10)) expect_warning(validate_inputs(X, y), "NA") }) test_that("validate_inputs rejects infinite values", { X <- matrix(rnorm(200), 20, 10) X[1, 1] <- Inf colnames(X) <- paste0("G", 1:10) y <- factor(rep(c("A", "B"), each = 10)) expect_error(validate_inputs(X, y), "infinite") }) test_that("validate_inputs rejects too-few samples for CV", { X <- matrix(rnorm(16), 4, 4, dimnames = list(NULL, paste0("G", 1:4))) y <- factor(c("A", "A", "B", "B")) expect_error(validate_inputs(X, y, n_folds = 5), "Insufficient") }) # --------------------------------------------------------------------------- # adaptive_bin_count # --------------------------------------------------------------------------- test_that("adaptive_bin_count returns reasonable values", { expect_equal(adaptive_bin_count(100, 5), 5) expect_equal(adaptive_bin_count(9, 5), 3) expect_gte(adaptive_bin_count(50, 10), 3) expect_lte(adaptive_bin_count(50, 10), 10) }) # --------------------------------------------------------------------------- # discretize_continuous # --------------------------------------------------------------------------- test_that("discretize_continuous returns integer bins", { x <- rnorm(100) bins <- discretize_continuous(x, n_bins = 5) expect_type(bins, "integer") expect_length(bins, 100) expect_true(all(bins >= 0)) }) test_that("discretize_continuous handles constant input", { x <- rep(3, 50) bins <- discretize_continuous(x, n_bins = 5) expect_true(all(bins == 0)) }) # --------------------------------------------------------------------------- # compute_mutual_information # --------------------------------------------------------------------------- test_that("MI is non-negative", { set.seed(1) x <- rnorm(100) y <- factor(rep(c(0, 1), each = 50)) mi <- compute_mutual_information(x, y) expect_gte(mi, 0) }) test_that("MI is higher for informative features", { set.seed(42) n <- 200 y <- factor(rep(c(0, 1), each = n / 2)) x_informative <- c(rnorm(n / 2, mean = 0), rnorm(n / 2, mean = 3)) x_noise <- rnorm(n) mi_signal <- compute_mutual_information(x_informative, y) mi_noise <- compute_mutual_information(x_noise, y) expect_gt(mi_signal, mi_noise) }) test_that("MI is zero for constant feature", { x <- rep(1, 100) y <- factor(rep(c(0, 1), each = 50)) mi <- compute_mutual_information(x, y) expect_equal(mi, 0) }) test_that("compute_mi_vectorized returns correct length", { set.seed(1) X <- matrix(rnorm(500), 50, 10) y <- factor(rep(c(0, 1), each = 25)) mi <- compute_mi_vectorized(X, y) expect_length(mi, 10) expect_true(all(mi >= 0)) }) # --------------------------------------------------------------------------- # create_cv_folds # --------------------------------------------------------------------------- test_that("create_cv_folds creates correct number of folds", { y <- factor(rep(c("A", "B"), each = 50)) folds <- create_cv_folds(y, K = 5, R = 3, random_seed = 1) expect_length(folds, 15) }) test_that("create_cv_folds produces non-overlapping train/test", { y <- factor(rep(c("A", "B"), each = 50)) folds <- create_cv_folds(y, K = 5, R = 1, random_seed = 1) for (fold in folds) { expect_equal(length(intersect(fold$train, fold$test)), 0) expect_equal(unname(sort(c(fold$train, fold$test))), 1:100) } }) test_that("create_cv_folds is stratified", { y <- factor(rep(c("A", "B"), each = 50)) folds <- create_cv_folds(y, K = 5, R = 1, random_seed = 1) for (fold in folds) { test_classes <- table(y[fold$test]) expect_true(all(test_classes >= 8 & test_classes <= 12)) } }) test_that("create_cv_folds is reproducible", { y <- factor(rep(c("A", "B"), each = 50)) folds1 <- create_cv_folds(y, K = 5, R = 2, random_seed = 42) folds2 <- create_cv_folds(y, K = 5, R = 2, random_seed = 42) expect_identical(folds1, folds2) }) # --------------------------------------------------------------------------- # combine_scores # --------------------------------------------------------------------------- test_that("arithmetic combination is a weighted mean", { pi <- c(0.8, 0.4, 0.1) u <- c(0.6, 0.5, 0.3) b <- c(0.9, 0.7, 0.2) w <- c(1, 1, 1) out <- combine_scores(pi, u, b, formula = "arithmetic", weights = w) expected <- (pi + u + b) / 3 expect_equal(out, expected, tolerance = 1e-10) }) test_that("geometric combination respects rank order", { pi <- c(0.9, 0.5, 0.1) u <- c(0.9, 0.5, 0.1) b <- c(0.9, 0.5, 0.1) out <- combine_scores(pi, u, b, formula = "geometric") expect_true(out[1] > out[2]) expect_true(out[2] > out[3]) }) test_that("harmonic combination penalises low components", { pi <- c(0.9, 0.9) u <- c(0.9, 0.1) b <- c(0.9, 0.9) out <- combine_scores(pi, u, b, formula = "harmonic") expect_gt(out[1], out[2]) }) test_that("minimum combination returns element-wise min", { pi <- c(0.8, 0.3) u <- c(0.5, 0.9) b <- c(0.6, 0.4) out <- combine_scores(pi, u, b, formula = "minimum") expect_equal(out, pmin(pi, u, b)) }) test_that("unknown formula raises error", { expect_error(combine_scores(1, 1, 1, formula = "unknown"), "Unknown formula") }) test_that("custom weights change the ranking", { pi <- c(0.9, 0.2) u <- c(0.1, 0.8) b <- c(0.5, 0.5) out_pi_heavy <- combine_scores(pi, u, b, "arithmetic", weights = c(5, 1, 1)) out_u_heavy <- combine_scores(pi, u, b, "arithmetic", weights = c(1, 5, 1)) expect_gt(out_pi_heavy[1], out_pi_heavy[2]) expect_gt(out_u_heavy[2], out_u_heavy[1]) }) # --------------------------------------------------------------------------- # aggregate_cv_results # --------------------------------------------------------------------------- test_that("aggregate_cv_results computes correct selection frequency", { n_genes <- 5 cv_results <- list( list(selected = c(1, 3), coefficients = c(0.5, 0.3), mi_scores = c(0.1, 0.2, 0.3, 0.05, 0.15), auc = 0.8, fold_num = 1, repeat_num = 1), list(selected = c(1, 2, 3), coefficients = c(0.4, 0.2, 0.6), mi_scores = c(0.12, 0.22, 0.28, 0.06, 0.14), auc = 0.85, fold_num = 2, repeat_num = 1), list(selected = c(1), coefficients = c(0.7), mi_scores = c(0.11, 0.19, 0.31, 0.04, 0.16), auc = 0.75, fold_num = 3, repeat_num = 1), list(selected = c(1, 3, 5), coefficients = c(0.3, 0.5, 0.1), mi_scores = c(0.09, 0.21, 0.29, 0.07, 0.13), auc = 0.82, fold_num = 4, repeat_num = 1) ) agg <- aggregate_cv_results(cv_results, n_genes) expect_equal(agg$pi_exact[1], 1.0) expect_equal(agg$pi_exact[4], 0.0) expect_equal(agg$pi_exact[3], 0.75) expect_length(agg$u_coef, n_genes) expect_length(agg$u_mi, n_genes) }) test_that("aggregate_cv_results handles empty selections", { cv_results <- list( list(selected = integer(0), coefficients = numeric(0), mi_scores = c(0.1, 0.2), auc = 0.5, fold_num = 1, repeat_num = 1), list(selected = integer(0), coefficients = numeric(0), mi_scores = c(0.15, 0.25), auc = 0.5, fold_num = 2, repeat_num = 1) ) agg <- aggregate_cv_results(cv_results, 2) expect_equal(agg$pi_exact, c(0, 0)) }) # --------------------------------------------------------------------------- # compute_auc # --------------------------------------------------------------------------- test_that("compute_auc returns 1 for perfect predictions", { skip_if_not_installed("pROC") y_true <- factor(c(rep("A", 50), rep("B", 50))) y_pred <- c(rep(0, 50), rep(1, 50)) expect_equal(compute_auc(y_true, y_pred), 1.0) }) test_that("compute_auc returns ~0.5 for random predictions", { skip_if_not_installed("pROC") set.seed(7) y_true <- factor(rep(c("A", "B"), each = 500)) y_pred <- runif(1000) auc <- compute_auc(y_true, y_pred) expect_true(abs(auc - 0.5) < 0.1) }) # --------------------------------------------------------------------------- # auto_generate_groups # --------------------------------------------------------------------------- test_that("auto_generate_groups returns valid group assignments", { set.seed(1) X <- matrix(rnorm(1000), 50, 20) groups <- auto_generate_groups(X) expect_length(groups, 20) expect_type(groups, "integer") expect_true(all(groups >= 1)) expect_gt(length(unique(groups)), 1) }) # --------------------------------------------------------------------------- # fit_regularized_model # --------------------------------------------------------------------------- test_that("fit_regularized_model returns expected structure", { skip_if_not_installed("glmnet") set.seed(1) n <- 100; p <- 20 X <- matrix(rnorm(n * p), n, p) colnames(X) <- paste0("G", 1:p) y <- factor(rep(c("A", "B"), each = n / 2)) X[y == "B", 1] <- X[y == "B", 1] + 2 X[y == "B", 2] <- X[y == "B", 2] - 1.5 result <- fit_regularized_model(X, y, method = "elastic_net", alpha = 0.5) expect_type(result, "list") expect_true(all(c("selected", "coefficients", "predict_fn", "lambda", "n_selected", "cv_fit") %in% names(result))) expect_true(is.function(result$predict_fn)) preds <- result$predict_fn(X) expect_length(preds, n) expect_true(all(preds >= 0 & preds <= 1)) }) test_that("lasso selects a subset of features", { skip_if_not_installed("glmnet") set.seed(2) n <- 100; p <- 50 X <- matrix(rnorm(n * p), n, p) colnames(X) <- paste0("G", 1:p) y <- factor(rep(c("A", "B"), each = n / 2)) X[y == "B", 1:3] <- X[y == "B", 1:3] + 2 result <- fit_regularized_model(X, y, method = "lasso") expect_lt(result$n_selected, p) }) test_that("unknown regularization method raises error", { X <- matrix(1, 10, 2, dimnames = list(NULL, c("A", "B"))) y <- factor(rep(c("A", "B"), each = 5)) expect_error(fit_regularized_model(X, y, method = "ridge"), "Unknown regularization") })