# Self-consistency and sanity benchmarks. Catches drift between releases # and verifies that the three tests produce qualitatively consistent # answers on controlled DGPs. These tests are not replication of # published-paper numbers; see test-known-values.R for that. test_that("iv_check verdict flips at the alpha boundary", { # A test's p-value is data+seed-determined; alpha only moves the # verdict threshold. Construct a case where p sits between 0.03 and # 0.07; verdict should be 'reject' at alpha = 0.10 and 'pass' at # alpha = 0.01. set.seed(1) n <- 2000 z <- sample(0:1, n, replace = TRUE) d <- rbinom(n, 1, 0.3 + 0.4 * z) # Induce a mild exclusion violation so p-value sits in the middle. y <- rnorm(n, mean = d + 0.25 * d * z) set.seed(123) r_low <- iv_kitagawa(y, d, z, alpha = 0.01, n_boot = 200, parallel = FALSE) set.seed(123) r_high <- iv_kitagawa(y, d, z, alpha = 0.10, n_boot = 200, parallel = FALSE) expect_equal(r_low$statistic, r_high$statistic) expect_equal(r_low$p_value, r_high$p_value) expect_equal(r_low$alpha, 0.01) expect_equal(r_high$alpha, 0.10) }) test_that("bootstrap p-value is roughly uniform under a deep H0", { # With a valid IV DGP, bootstrap p-values should be approximately # Uniform(0,1) under repeated resampling (actually slightly # stochastically-larger given the positive-part truncation). Test a # weak condition: mean p across 30 reps should be > 0.3. skip_on_cran() set.seed(1) n <- 500 reps <- 30 ps <- numeric(reps) for (s in seq_len(reps)) { set.seed(2000 + s) z <- sample(0:1, n, replace = TRUE) d <- rbinom(n, 1, 0.3 + 0.4 * z) y <- rnorm(n, mean = d) ps[s] <- iv_kitagawa(y, d, z, n_boot = 60, parallel = FALSE)$p_value } # Under H0 with positive-part statistic, E[p] is typically >= 0.4. expect_gt(mean(ps), 0.3) }) test_that("iv_power regression: power at delta = 1.5 is >= 0.8", { # Power-curve regression: the Kitagawa test should maintain strong # power at a moderate exclusion violation. This test catches any # future code change that silently loses power. skip_on_cran() set.seed(1) n <- 500 z <- sample(0:1, n, replace = TRUE) d <- rbinom(n, 1, 0.3 + 0.4 * z) y <- rnorm(n, mean = d) pc <- iv_power(y, d, z, method = "kitagawa", n_sims = 40, delta_grid = c(1.5), n_boot = 80, parallel = FALSE) expect_gte(pc$power[1], 0.8) }) test_that("sanity benchmark: all three tests pass on a valid-IV DGP", { # Integration-style test: one valid-IV DGP, every test should return # a p-value > 0.05. Catches accidental sign flips, off-by-one errors # in inequality direction, or regressions in any of the three cores. skip_on_cran() set.seed(42) # Judge-IV design with 10 judges for iv_testjfe; binary Z derived # from whether judge index is above the median for iv_kitagawa / iv_mw. n <- 2000 K <- 10 judge <- sample.int(K, n, replace = TRUE) p_by_j <- seq(0.2, 0.8, length.out = K) d <- rbinom(n, 1, p_by_j[judge]) y <- rnorm(n, mean = d) z_binary <- as.integer(judge > K / 2) set.seed(1) r_k <- iv_kitagawa(y, d, z_binary, n_boot = 200, parallel = FALSE) set.seed(1) r_mw <- iv_mw(y, d, z_binary, n_boot = 200, parallel = FALSE) set.seed(1) r_jfe <- iv_testjfe(y, d, judge, n_boot = 50, parallel = FALSE) expect_gt(r_k$p_value, 0.05) expect_gt(r_mw$p_value, 0.05) expect_gt(r_jfe$p_value, 0.05) }) test_that("violation benchmark: all three tests reject under a clear exclusion failure", { # Inject a D-specific direct effect of Z (clean exclusion violation). # All three tests should reject at reasonable n. skip_on_cran() set.seed(42) n <- 2000 K <- 10 judge <- sample.int(K, n, replace = TRUE) p_by_j <- seq(0.2, 0.8, length.out = K) d <- rbinom(n, 1, p_by_j[judge]) z_binary <- as.integer(judge > K / 2) # Large, unambiguous violation: Y gets a direct effect from Z for D=1. y <- rnorm(n, mean = d + 1.5 * d * z_binary) set.seed(1) r_k <- iv_kitagawa(y, d, z_binary, n_boot = 200, parallel = FALSE) set.seed(1) r_mw <- iv_mw(y, d, z_binary, n_boot = 200, parallel = FALSE) # For iv_testjfe, the violation pattern depends on judge index; the # FLL linearity test catches it if there is any non-linear deviation # from mu_j = alpha + beta * p_j. set.seed(1) r_jfe <- iv_testjfe(y, d, judge, n_boot = 50, parallel = FALSE) expect_lt(r_k$p_value, 0.05) expect_lt(r_mw$p_value, 0.05) expect_lt(r_jfe$p_value, 0.10) # linearity test is less directly powered }) test_that("factor Z handling does not change the statistic", { # Coercing a factor Z should give the same result as the numeric Z, # up to label-ordering. The order of factor levels should not affect # the final statistic (iv_kitagawa re-orders by first-stage E[D|Z]). set.seed(1) n <- 500 z_num <- sample(0:1, n, replace = TRUE) d <- rbinom(n, 1, 0.3 + 0.4 * z_num) y <- rnorm(n, mean = d) z_fac_AB <- factor(ifelse(z_num == 0, "A", "B"), levels = c("A", "B")) z_fac_BA <- factor(ifelse(z_num == 0, "A", "B"), levels = c("B", "A")) set.seed(7); r_num <- iv_kitagawa(y, d, z_num, n_boot = 50, parallel = FALSE) set.seed(7); r_AB <- iv_kitagawa(y, d, z_fac_AB, n_boot = 50, parallel = FALSE) set.seed(7); r_BA <- iv_kitagawa(y, d, z_fac_BA, n_boot = 50, parallel = FALSE) expect_equal(r_num$statistic, r_AB$statistic, tolerance = 1e-8) expect_equal(r_num$statistic, r_BA$statistic, tolerance = 1e-8) })