# Level-2 known-value tests. Each test exercises a case where the
# analytical answer is derivable by hand or by construction. These
# tests catch off-by-one, wrong-sign, or discretisation-choice bugs
# that structural and invariant tests can miss.

test_that("iv_kitagawa: statistic is small when Z is independent of (D, Y)", {
  # Under Z independent of (D, Y), F(y, d | Z = z1) == F(y, d | Z = z2)
  # for all y, d. The empirical positive-part KS statistic converges
  # to 0 at rate 1/sqrt(n). With n = 2000 we expect stat < ~4 typically.
  skip_on_cran()
  set.seed(101)
  n <- 2000
  z <- sample(0:1, n, replace = TRUE)
  d <- rbinom(n, 1, 0.5)  # Z-independent first stage
  y <- rnorm(n, mean = d)  # Y depends on D only
  out <- iv_kitagawa(y, d, z, n_boot = 100, parallel = FALSE)
  # Statistic should be bounded; p-value should not reject
  expect_lt(out$statistic, 5)
  expect_gt(out$p_value, 0.1)
})

test_that("iv_kitagawa: deterministic Y perfectly separated by D gives stat = 0", {
  # If Y = 1{D=1} exactly, then F(y, 1 | z) = 0 for y < 1 and
  # = P(D=1 | z) for y >= 1, with no variation across z except via
  # the marginals. The inequality F(y, 1 | z_low) - F(y, 1 | z_high)
  # is strictly <= 0 everywhere. Empirical stat is exactly 0.
  set.seed(1)
  n <- 500
  z <- c(rep(0, n/2), rep(1, n/2))
  d <- c(rep(0, n/4), rep(1, n/4), rep(0, n/4), rep(1, 3*n/4 - n/2))
  # Simpler: deterministic
  y <- as.numeric(d)
  out <- iv_kitagawa(y, d, z, n_boot = 20, parallel = FALSE)
  expect_equal(out$statistic, 0, tolerance = 1e-10)
})

test_that("iv_mw: without x, matches iv_kitagawa statistic exactly on same seed", {
  # Invariant test: iv_mw unconditional path delegates to the same
  # Kitagawa core. On identical seeds, the two statistics are equal.
  set.seed(42)
  n <- 300
  z <- sample(0:1, n, replace = TRUE)
  d <- rbinom(n, 1, 0.3 + 0.4 * z)
  y <- rnorm(n, mean = d)
  set.seed(1); r_k <- iv_kitagawa(y, d, z, n_boot = 50, parallel = FALSE)
  set.seed(1); r_mw <- iv_mw(y, d, z, n_boot = 50, parallel = FALSE)
  expect_equal(r_k$statistic, r_mw$statistic, tolerance = 1e-12)
  expect_equal(r_k$p_value, r_mw$p_value, tolerance = 1e-12)
})

test_that("iv_testjfe: T_n = 0 when mu_j is exactly linear in p_j", {
  # Construct data where mu_j = alpha + beta * p_j holds deterministically.
  # In the limit of many observations per judge, the residuals from the
  # weighted-LS fit are exactly zero and T_n collapses to 0.
  set.seed(1)
  K <- 10
  n_per <- 2000  # large per-judge sample -> tiny noise
  p_j <- seq(0.2, 0.7, length.out = K)
  alpha <- 1
  beta <- 2
  judge <- rep(seq_len(K), each = n_per)
  # Generate D from p_j[judge], Y = alpha + beta * d_deterministic + tiny noise
  # But we want mu_j = alpha + beta * p_j, so Y must have mean alpha + beta * D
  # with per-observation noise that averages out.
  d <- stats::rbinom(length(judge), 1, p_j[judge])
  y <- alpha + beta * d + stats::rnorm(length(judge), 0, 0.01)
  out <- iv_testjfe(y, d, judge, n_boot = 20, parallel = FALSE)
  # T_n should be tiny: the only source of non-zero residual is the
  # finite-sample noise in p_j_hat and mu_j_hat.
  expect_lt(out$statistic, qchisq(0.99, K - 2))
  expect_gt(out$p_value, 0.01)
})

test_that("iv_testjfe: slope in weighted-LS recovers beta when mu_j is linear in p_j", {
  # Level-2 analytical check on the OLS coefficients themselves.
  set.seed(1)
  K <- 10
  n_per <- 1000
  p_j <- seq(0.2, 0.7, length.out = K)
  alpha <- 1
  beta <- 2
  judge <- rep(seq_len(K), each = n_per)
  d <- stats::rbinom(length(judge), 1, p_j[judge])
  y <- alpha + beta * d + stats::rnorm(length(judge), 0, 0.5)
  out <- iv_testjfe(y, d, judge, n_boot = 10, parallel = FALSE)
  # Slope should recover beta. Tolerance ~ sd(mu_j_hat) / range(p_j)
  expect_equal(out$coef["slope"], c(slope = beta), tolerance = 0.1)
  expect_equal(out$coef["intercept"], c(intercept = alpha), tolerance = 0.1)
})

test_that("iv_check: overall verdict is 'cannot reject' when all p-values are large", {
  set.seed(1)
  n <- 500
  z <- sample(0:1, n, replace = TRUE)
  d <- rbinom(n, 1, 0.3 + 0.4 * z)
  y <- rnorm(n, mean = d)
  # Raw-vector path: call via default
  chk <- structure(
    list(
      table = data.frame(test = "X", statistic = 0.5, p_value = 0.9,
                         verdict = "pass"),
      alpha = 0.05,
      overall = ivcheck:::overall_verdict(0.9, 0.05)
    ),
    class = "iv_check"
  )
  expect_match(chk$overall, "cannot reject")
})

test_that("iv_power: delta = 0 empirical size is at most ~alpha + MC noise", {
  skip_on_cran()
  set.seed(1)
  n <- 300
  z <- sample(0:1, n, replace = TRUE)
  d <- rbinom(n, 1, 0.3 + 0.4 * z)
  y <- rnorm(n, mean = d)
  out <- iv_power(y, d, z, method = "kitagawa", n_sims = 50,
                  delta_grid = c(0), n_boot = 80, parallel = FALSE)
  # Under strict null, Kitagawa test is conservative; empirical size
  # should be <= alpha + a few SE. With n_sims = 50 and alpha = 0.05,
  # SE(power) ~= sqrt(0.05 * 0.95 / 50) ~ 0.031. Allow 3 SE + alpha
  expect_lt(out$power[1], 0.15)
})

test_that("iv_kitagawa reproduces the Card (1995) binary-college rejection", {
  # Published-number reproduction. Kitagawa (2015) Table 2 and the
  # Mourifie-Wan (2017) replication both reject the simple college/
  # near_college specification at 5%. The exact statistic depends on
  # the bootstrap seed, but rejection is robust.
  skip_on_cran()
  data(card1995)
  set.seed(40000)
  r <- iv_kitagawa(card1995$lwage, card1995$college,
                   card1995$near_college, n_boot = 300, parallel = FALSE)
  expect_lt(r$p_value, 0.05)
  expect_gt(r$statistic, 1.5)
})

test_that("iv_testjfe null size holds for multivalued D with x (small MC)", {
  # Regression test for the FWL identity in the structural-residual
  # construction. With M = 2 and x varying within judges, a buggy
  # sigma^2 inflates and produces conservative (<5%) rejection under H0.
  # With the FWL-correct structural residual, empirical size should
  # sit near nominal alpha.
  skip_on_cran()
  set.seed(2026)
  K <- 12
  n_per_judge <- 200
  judge_vec <- rep(seq_len(K), each = n_per_judge)
  n <- length(judge_vec)
  rej <- 0
  reps <- 80
  for (s in seq_len(reps)) {
    set.seed(70000 + s)
    x <- rnorm(n)
    # Valid judge IV: D is a function of judge + x. Three levels of D.
    p_by_j <- c(0.2, 0.4, 0.6)  # P(D = 1 | J), P(D = 2 | J) will be
    lin <- 0.1 * judge_vec + 0.3 * x
    d_raw <- rbinom(n, 2, plogis(lin) * 0.9)   # values in {0, 1, 2}
    y <- d_raw + 0.4 * x + rnorm(n, 0, 0.5)    # valid exclusion
    out <- iv_testjfe(y, d_raw, judge_vec, x = x, n_boot = 10,
                      method = "asymptotic", parallel = FALSE)
    if (isTRUE(out$p_value < 0.05)) rej <- rej + 1
  }
  # Empirical size should be within roughly 3 SE of alpha = 0.05.
  # With 80 reps, SE ~= 0.024 so <= 0.13 is a very conservative pass.
  expect_lte(rej / reps, 0.15)
})