# W1 Numerical Parity — Python vs R behavioral equivalence # Max 20 tests. Python is the source of truth. library(ml) # ── Shared data (same for all tests) ────────────────────────────────────────── # Use deterministic pre-seeded data to avoid RNG differences between languages. # 150-row iris (identical in both R and Python via datasets::iris). # ── SPLIT PARITY ───────────────────────────────────────────────────────────── test_that("W1-01: split 60/20/20 ratio produces correct approximate sizes", { s <- ml_split(iris, "Species", seed = 42L) n <- nrow(iris) # 150 expect_true(abs(nrow(s$train) / n - 0.60) < 0.05) expect_true(abs(nrow(s$valid) / n - 0.20) < 0.05) expect_true(abs(nrow(s$test) / n - 0.20) < 0.05) expect_equal(nrow(s$train) + nrow(s$valid) + nrow(s$test), n) }) test_that("W1-02: split small-partition warning threshold matches Python (< 30 rows)", { # Python: warns when partition has < 30 rows # R was using < 10 — must be updated to match Python # Use mtcars (32 rows): 60/20/20 → valid ~= 6, test ~= 7 → both < 30 expect_warning( ml_split(mtcars, "mpg", seed = 42L), regexp = "rows" ) }) test_that("W1-03: split 2-tuple ratio is NOT supported in R (3-tuple required)", { # Python: accepts 2-tuple (train, test) and sets valid to empty # R spec: requires length-3 ratio. Document the difference. expect_error( ml_split(iris, "Species", seed = 42L, ratio = c(0.8, 0.2)), regexp = "length 3" ) }) test_that("W1-04: stratified split preserves class proportions within 10pp", { s <- ml_split(iris, "Species", seed = 42L) orig_freq <- table(iris$Species) / nrow(iris) train_freq <- table(s$train$Species) / nrow(s$train) for (cls in names(orig_freq)) { expect_true(abs(train_freq[[cls]] - orig_freq[[cls]]) < 0.10, info = paste("Class", cls, "proportion diverges")) } }) test_that("W1-05: split $dev = rbind($train, $valid) exactly", { s <- ml_split(iris, "Species", seed = 42L) expect_equal(nrow(s$dev), nrow(s$train) + nrow(s$valid)) # Column names must match expect_equal(names(s$dev), names(s$train)) }) # ── DRIFT SEVERITY THRESHOLDS ──────────────────────────────────────────────── test_that("W1-06: drift severity thresholds match Python (0.1 / 0.3 breakpoints)", { skip_if(getRversion() < "4.4", "KS test behavior differs in R < 4.4") # Python thresholds: # frac < 0.1 → "low" # frac < 0.3 → "medium" # else → "high" # Use fully deterministic data (no rnorm) to avoid R version RNG differences. n <- 150L ref <- data.frame( v1 = seq(1, n), v2 = seq(n, 1), v3 = rep(1:10, n/10), v4 = rep(1:5, n/5), v5 = seq(0, 1, length.out = n), v6 = seq(1, n) * 0.1, v7 = rep(c(-1, 1), n/2), v8 = seq(1, n) %% 7, v9 = seq(1, n) %% 13, v10 = seq(1, n) %% 3 ) # No perturbation → no drift → severity "none" res0 <- ml_drift(reference = ref, new = ref) expect_equal(res0$severity, "none") # Perturb 1/10 features — completely different distribution new1 <- ref new1$v1 <- rev(ref$v1) + 1000 # massive shift, guaranteed KS p < 0.05 res1 <- ml_drift(reference = ref, new = new1) # 1/10 = 0.10, which is NOT < 0.1 → "medium" (frac < 0.3) expect_true(res1$severity %in% c("low", "medium"), info = paste("1-feature drift got severity:", res1$severity)) }) test_that("W1-07: drift severity 'high' when frac >= 0.3 (matches Python)", { skip_if(getRversion() < "4.4", "KS test behavior differs in R < 4.4") # Use fully deterministic data (no rnorm) to avoid R version RNG differences. n <- 150L ref <- data.frame( v1 = seq(1, n), v2 = seq(n, 1), v3 = rep(1:10, n/10), v4 = rep(1:5, n/5), v5 = seq(0, 1, length.out = n) ) # Perturb 3/5 = 0.6 of features → should be "high" (frac >= 0.3) new_high <- ref new_high$v1 <- ref$v1 + 1000 new_high$v2 <- ref$v2 + 1000 new_high$v3 <- ref$v3 + 1000 res <- ml_drift(reference = ref, new = new_high) expect_equal(res$severity, "high") }) test_that("W1-08: drift features_shifted is sorted alphabetically (matches Python sorted())", { # Python: features_shifted = sorted(features_shifted) # R must also sort alphabetically set.seed(1L) n <- 100L ref <- data.frame( z_col = seq(1, n), a_col = seq(n, 1), m_col = rep(1:10, n/10) ) new_df <- ref new_df$z_col <- ref$z_col + 1000 new_df$a_col <- ref$a_col + 1000 res <- ml_drift(reference = ref, new = new_df) if (length(res$features_shifted) >= 2L) { expect_equal(res$features_shifted, sort(res$features_shifted)) } }) # ── SHELF PARITY ───────────────────────────────────────────────────────────── test_that("W1-09: shelf() requires CV model (scores_ non-NULL)", { # Python: holdout model → shelf returns fresh=TRUE with no-baseline message # R: holdout model → raises model_error (currently) # This is a DOCUMENTED divergence: R requires CV model for shelf() s <- ml_split(iris, "Species", seed = 42L) holdout_model <- ml_fit(s$train, "Species", algorithm = "logistic", seed = 42L) # R deliberately requires scores_ to be non-NULL expect_error( ml_shelf(holdout_model, new = s$test, target = "Species"), regexp = "scores_|cross-validated|cv" ) }) test_that("W1-10: shelf() with CV model returns ShelfResult with correct fields", { skip_if_not_installed("e1071") # svm might not be installed cv <- ml_split(iris, "Species", seed = 42L, folds = 3L) model <- ml_fit(cv, "Species", algorithm = "logistic", seed = 42L) expect_true(!is.null(model$scores_)) # must have CV scores new_batch <- iris[sample(nrow(iris), 40L), ] result <- ml_shelf(model, new = new_batch, target = "Species") expect_true(inherits(result, "ml_shelf_result")) expect_true(is.logical(result$fresh)) expect_true(is.list(result$metrics_then)) expect_true(is.list(result$metrics_now)) expect_true(is.list(result$degradation)) expect_true(is.character(result$recommendation)) expect_true(is.numeric(result$n_new)) }) test_that("W1-11: shelf() stale property = !fresh (matches Python ShelfResult.stale)", { cv <- ml_split(iris, "Species", seed = 42L, folds = 3L) model <- ml_fit(cv, "Species", algorithm = "logistic", seed = 42L) new_batch <- iris[sample(nrow(iris), 40L), ] result <- ml_shelf(model, new = new_batch, target = "Species") expect_equal(result$stale, !result$fresh) }) # ── EVALUATE METRIC NAMES ──────────────────────────────────────────────────── test_that("W1-12: evaluate() returns named numeric for binary clf (accuracy, f1, etc.)", { s <- ml_split(iris[iris$Species != "virginica", ], "Species", seed = 42L) model <- ml_fit(s$train, "Species", algorithm = "logistic", seed = 42L) m <- ml_evaluate(model, s$valid) expect_true(is.numeric(m)) # Python binary: accuracy, f1, precision, recall, roc_auc for (metric in c("accuracy", "f1", "precision", "recall", "roc_auc")) { expect_true(metric %in% names(m), info = paste("Missing metric:", metric)) } }) test_that("W1-13: evaluate() returns multiclass metrics (f1_weighted not f1)", { # Python multiclass: accuracy, f1_weighted, f1_macro, precision_weighted, recall_weighted, roc_auc_ovr # R must match these names s <- ml_split(iris, "Species", seed = 42L) model <- ml_fit(s$train, "Species", algorithm = "logistic", seed = 42L) m <- ml_evaluate(model, s$valid) expect_true("accuracy" %in% names(m)) expect_true("f1_weighted" %in% names(m), info = paste("Missing f1_weighted. Got:", paste(names(m), collapse=", "))) }) test_that("W1-14: evaluate() returns regression metrics (rmse, mae, r2)", { # Python regression: rmse, mae, r2 s <- ml_split(mtcars, "mpg", seed = 42L) model <- ml_fit(s$train, "mpg", algorithm = "random_forest", seed = 42L) m <- ml_evaluate(model, s$valid) for (metric in c("rmse", "mae", "r2")) { expect_true(metric %in% names(m), info = paste("Missing regression metric:", metric)) } }) test_that("W1-15: evaluate() metrics are rounded to 4 decimal places (matches Python)", { # Python: return Metrics({k: round(v, 4) for k, v in result.items()}) s <- ml_split(iris, "Species", seed = 42L) model <- ml_fit(s$train, "Species", algorithm = "logistic", seed = 42L) m <- ml_evaluate(model, s$valid) for (nm in names(m)) { v <- m[[nm]] if (is.numeric(v) && !is.na(v)) { expect_equal(round(v, 4L), v, info = paste("Metric", nm, "not rounded to 4dp:", v)) } } }) # ── DRIFT NUMERICAL VALUES ──────────────────────────────────────────────────── test_that("W1-16: drift() p-values are in [0, 1] range (numerical sanity)", { s <- ml_split(iris, "Species", seed = 42L) new_data <- s$test new_data$Sepal.Length <- new_data$Sepal.Length + 5.0 # obvious drift res <- ml_drift(reference = s$train, new = new_data, target = "Species") for (nm in names(res$features)) { p <- res$features[[nm]] expect_true(p >= 0 && p <= 1, info = paste("p-value out of range for feature", nm, ":", p)) } }) test_that("W1-17: drift() Sepal.Length+5 is detected as shifted", { # Massively shift one numeric feature — must be detected s <- ml_split(iris, "Species", seed = 42L) new_data <- s$test new_data$Sepal.Length <- new_data$Sepal.Length + 5.0 res <- ml_drift(reference = s$train, new = new_data, target = "Species") expect_true(res$shifted, info = "Sepal.Length +5 should be detected as drift") expect_true("Sepal.Length" %in% res$features_shifted) }) test_that("W1-18: drift() identical datasets have shifted=FALSE (no false positives)", { s <- ml_split(iris, "Species", seed = 42L) res <- ml_drift(reference = s$train, new = s$train, target = "Species") # Identical data — no real drift; KS test p should be 1.0 for all expect_false(res$shifted, info = "Identical reference/new should not be drifted") }) # ── PROFILE NUMERICAL PARITY ───────────────────────────────────────────────── test_that("W1-19: profile() returns list with expected keys matching Python dict", { # Python returns dict with keys: n_rows, n_cols, n_numeric, n_categorical, task, target, ... p <- ml_profile(iris, "Species") expect_true(is.list(p)) for (key in c("n_rows", "n_cols", "task", "target")) { expect_true(key %in% names(p), info = paste("Missing profile key:", key)) } expect_equal(p[["n_rows"]], nrow(iris)) expect_equal(p[["n_cols"]], ncol(iris)) }) test_that("W1-20: evaluate() accuracy is in [0, 1] range (numerical sanity)", { s <- ml_split(iris, "Species", seed = 42L) model <- ml_fit(s$train, "Species", algorithm = "logistic", seed = 42L) m <- ml_evaluate(model, s$valid) expect_true(m[["accuracy"]] >= 0 && m[["accuracy"]] <= 1) # roc_auc should also be in [0, 1] if ("roc_auc_ovr" %in% names(m)) { expect_true(m[["roc_auc_ovr"]] >= 0 && m[["roc_auc_ovr"]] <= 1) } })