test_that("harness scenario manifest is stable", { m <- harness_scenario_manifest() expect_true(all(c("scenario_id", "scenario_type", "tier_min") %in% names(m))) expect_true("mlm_power_ri" %in% m$scenario_id) expect_true("kfas_local_level" %in% m$scenario_id) }) test_that("harness_expand_runs respects tier and backends", { m <- harness_scenario_manifest() g <- harness_expand_runs(m, "smoke", c("lme4", "kfas")) expect_true(all(g$backend %in% c("lme4", "kfas"))) expect_true(any(g$scenario_id == "mlm_power_ri" & g$backend == "lme4")) expect_true(any(g$scenario_id == "mlm_lag_x" & g$backend == "lme4")) expect_true(any(g$scenario_id == "kfas_local_level" & g$backend == "kfas")) g2 <- harness_expand_runs(m, "smoke", "lme4") expect_false(any(g2$backend == "kfas")) g3 <- harness_expand_runs(m, "nightly", c("ctsem")) expect_true(any(g3$scenario_id == "ctsem_univariate")) }) test_that("harness_parse_cli_args parses flags", { a <- harness_parse_cli_args(c("--tier", "nightly", "--n-sim", "5", "--seed", "1")) expect_equal(a$tier, "nightly") expect_equal(a$n_sim, 5L) expect_equal(a$seed, 1L) }) test_that("summarize_validation_metrics and thresholds are deterministic", { est <- c(0.3, 0.4, 0.36) truth <- 0.35 bias <- est - truth raw <- tibble::tibble( run_id = "r1", git_sha = "abc", tier = "smoke", scenario_id = "mlm_power_ri", backend = "lme4", replicate_id = 1:3, truth = truth, estimate = est, bias = bias, squared_error = bias^2, std_error = c(0.1, 0.1, 0.1), ci_low = est - 0.2, ci_high = est + 0.2, covered = c(TRUE, TRUE, TRUE), nominal_level = 0.95, elapsed_sec = c(1, 1, 1), converged = TRUE, singular = FALSE, fit_error = NA_character_, n_guardrails = 0L, guardrail_rule_ids = "", skipped_reason = "" ) s <- summarize_validation_metrics(raw) expect_equal(nrow(s), 1L) expect_true(is.finite(s$convergence_rate)) expect_true(s$mean_coverage <= 1) thr <- list( defaults = list(min_coverage = 0.5, max_convergence_failure_rate = 0.5, min_n_converged = 1L), by_backend = list(), by_scenario = list() ) chk <- harness_evaluate_thresholds(s, thr) expect_true(all(chk$status %in% c("pass", "warn", "fail"))) expect_equal(harness_threshold_exit_status(chk), 0L) }) test_that("harness_evaluate_thresholds fails on bad convergence", { s <- tibble::tibble( tier = "smoke", scenario_id = "mlm_power_ri", backend = "lme4", n_replicates = 5L, n_skipped = 0L, n_install_skipped = 0L, n_ran = 5L, n_converged = 0L, convergence_rate = 0, mean_bias = NA_real_, rmse = NA_real_, mean_coverage = NA_real_, n_coverage_defined = 0L, calibration_gap = NA_real_, mean_elapsed_sec = 1, mean_n_guardrails = 0 ) thr <- list( defaults = list(min_coverage = 0.5, max_convergence_failure_rate = 0.1, min_n_converged = 1L), by_backend = list(), by_scenario = list() ) chk <- harness_evaluate_thresholds(s, thr) expect_true(any(chk$status == "fail")) expect_equal(harness_threshold_exit_status(chk), 1L) }) test_that("harness smoke benchmark runs for lme4 when available", { res <- harness_run_benchmark( tier = "smoke", backends = "lme4", n_sim = 2L, seed = 4242L, run_id = "test-smoke", git_sha = "testsha" ) expect_true(nrow(res$raw) >= 2L) expect_true(all(c("estimate", "converged", "scenario_id", "backend") %in% names(res$raw))) expect_true(all(res$raw$backend == "lme4")) }) test_that("harness_read_thresholds_json reads package thresholds", { skip_if_not_installed("jsonlite") p <- system.file("benchmarks", "thresholds-smoke.json", package = "tidyILD") skip_if(!nzchar(p), "installed package has no inst/benchmarks (use load_all)") thr <- 
harness_read_thresholds_json(p) expect_type(thr, "list") expect_true(!is.null(thr$defaults)) })