# tests/testthat/test-delta-lsi.R
# Unit tests for delta_lsi() and its internal helpers.

# ── Helper: build minimal LeakFit objects ─────────────────────────────────────

# Build a LeakSplits with explicit test/train indices.
.make_dlsi_splits <- function(n_folds, n_obs, n_repeats = 1L) {
  fold_size <- floor(n_obs / n_folds)
  indices   <- lapply(seq_len(n_folds * n_repeats), function(i) {
    fold_num <- ((i - 1L) %% n_folds) + 1L
    rep_num  <- ceiling(i / n_folds)
    te       <- ((fold_num - 1L) * fold_size + 1L):(fold_num * fold_size)
    tr       <- setdiff(seq_len(n_obs), te)
    list(fold = fold_num, repeat_id = rep_num, test = te, train = tr)
  })
  methods::new("LeakSplits",
    mode    = "subject_grouped",
    indices = indices,
    info    = list()
  )
}

# Build a minimal LeakFit with controlled per-fold AUC values.
.make_dlsi_fit <- function(fold_aucs, n_obs = 30L, n_repeats = 1L,
                           seed = 1L) {
  n_folds <- length(fold_aucs) %/% n_repeats
  splits  <- .make_dlsi_splits(n_folds, n_obs, n_repeats)

  metrics_df <- data.frame(
    fold    = seq_along(fold_aucs),
    learner = "glm",
    auc     = as.numeric(fold_aucs),
    stringsAsFactors = FALSE
  )

  set.seed(seed)
  fold_size <- floor(n_obs / n_folds)
  preds <- lapply(seq_along(fold_aucs), function(i) {
    te <- splits@indices[[i]]$test
    n  <- length(te)
    # Predictions with spread proportional to the AUC value
    y  <- rbinom(n, 1L, 0.5)
    p  <- pmin(1, pmax(0, rnorm(n, mean = 0.5, sd = fold_aucs[i] / 2)))
    data.frame(
      id = te, fold = i, learner = "glm",
      truth = y, pred = p,
      stringsAsFactors = FALSE
    )
  })

  methods::new("LeakFit",
    splits        = splits,
    metrics       = metrics_df,
    metric_summary = data.frame(),
    audit         = data.frame(),
    predictions   = preds,
    preprocess    = list(),
    learners      = list(),
    outcome       = "y",
    task          = "binomial",
    feature_names = character(0L),
    info          = list()
  )
}

# ── Basic construction ─────────────────────────────────────────────────────────

test_that("delta_lsi returns a LeakDeltaLSI object", {
  set.seed(1)
  fit <- .make_dlsi_fit(rep(0.65, 6L), n_repeats = 2L)
  res <- suppressWarnings(delta_lsi(fit, fit, metric = "auc", seed = 1L))
  expect_s4_class(res, "LeakDeltaLSI")
})

test_that("delta_lsi with same fit: delta_metric and delta_lsi are zero", {
  set.seed(1)
  # 6 folds over 2 repeats → R_eff = 2 (D_insufficient tier but point estimate fine)
  fit <- .make_dlsi_fit(rep(0.7, 6L), n_repeats = 2L)
  res <- suppressWarnings(delta_lsi(fit, fit, metric = "auc", seed = 1L))
  expect_equal(res@delta_metric, 0.0, tolerance = 1e-10)
  expect_equal(res@delta_lsi,    0.0, tolerance = 1e-10)
})

test_that("delta_lsi detects inflated naive pipeline", {
  set.seed(2)
  # Naive: AUC ~ 0.85 per fold; guarded: AUC ~ 0.60
  fit_n <- .make_dlsi_fit(rep(0.85, 10L), n_repeats = 2L)
  fit_g <- .make_dlsi_fit(rep(0.60, 10L), n_repeats = 2L)
  res   <- suppressWarnings(delta_lsi(fit_n, fit_g, metric = "auc", seed = 2L))
  expect_gt(res@delta_lsi,    0.0)
  expect_gt(res@delta_metric, 0.0)
})

# ── Tier assignment ────────────────────────────────────────────────────────────

test_that("tier D_insufficient for R_eff < 5", {
  fit_n <- .make_dlsi_fit(rep(0.75, 3L), n_repeats = 1L)
  fit_g <- .make_dlsi_fit(rep(0.60, 3L), n_repeats = 1L)
  expect_warning(
    res <- delta_lsi(fit_n, fit_g, metric = "auc"),
    regexp = "R_eff"
  )
  expect_equal(res@tier, "D_insufficient")
  expect_equal(res@R_eff, 1L)  # paired R=1 → R_eff=1, but below the 5-repeat threshold
})

test_that("tier C_signflip for R_eff in 5..9", {
  set.seed(3)
  # 5 repeats × 3 folds = 15 folds each
  fit_n <- .make_dlsi_fit(rep(0.8, 15L), n_obs = 45L, n_repeats = 5L)
  fit_g <- .make_dlsi_fit(rep(0.6, 15L), n_obs = 45L, n_repeats = 5L)
  res   <- delta_lsi(fit_n, fit_g, metric = "auc", seed = 3L)
  expect_equal(res@tier, "C_signflip")
  expect_equal(res@R_eff, 5L)
  # CI not available at tier C
  expect_true(all(!is.finite(res@delta_lsi_ci)))
  # Sign-flip p-value is available at tier C
  expect_true(is.finite(res@p_value))
})

test_that("tier B_signflip_ci for R_eff in 10..19", {
  set.seed(4)
  fit_n <- .make_dlsi_fit(rep(0.8, 30L), n_obs = 90L, n_repeats = 10L)
  fit_g <- .make_dlsi_fit(rep(0.6, 30L), n_obs = 90L, n_repeats = 10L)
  res   <- delta_lsi(fit_n, fit_g, metric = "auc", seed = 4L,
                     M_boot = 200L, M_flip = 500L)
  expect_equal(res@tier, "B_signflip_ci")
  expect_equal(res@R_eff, 10L)
  # Both p-value and CI are available at tier B
  expect_true(is.finite(res@p_value))
  expect_true(all(is.finite(res@delta_lsi_ci)))
})

test_that("tier A_full_inference for R_eff >= 20", {
  set.seed(5)
  fit_n <- .make_dlsi_fit(rep(0.8, 60L), n_obs = 120L, n_repeats = 20L)
  fit_g <- .make_dlsi_fit(rep(0.6, 60L), n_obs = 120L, n_repeats = 20L)
  res   <- delta_lsi(fit_n, fit_g, metric = "auc", seed = 5L,
                     M_boot = 200L, M_flip = 1000L)
  expect_equal(res@tier, "A_full_inference")
  expect_true(res@inference_ok)
  expect_true(all(is.finite(res@delta_lsi_ci)))
  expect_true(is.finite(res@p_value))
})

# ── strict mode ───────────────────────────────────────────────────────────────

test_that("strict = TRUE errors on insufficient R_eff", {
  fit_n <- .make_dlsi_fit(rep(0.7, 2L), n_repeats = 1L)
  fit_g <- .make_dlsi_fit(rep(0.5, 2L), n_repeats = 1L)
  expect_error(
    delta_lsi(fit_n, fit_g, metric = "auc", strict = TRUE),
    regexp = "R_eff"
  )
})

# ── Input validation ───────────────────────────────────────────────────────────

test_that("non-LeakFit input errors informatively", {
  fit <- .make_dlsi_fit(rep(0.7, 4L))
  expect_error(delta_lsi("not_a_fit", fit), regexp = "LeakFit")
  expect_error(delta_lsi(fit, 42),          regexp = "LeakFit")
})

# ── Unpaired fits ─────────────────────────────────────────────────────────────

test_that("unpaired fits give D_insufficient with point estimate", {
  fit_n <- .make_dlsi_fit(rep(0.8, 6L), n_obs = 30L, n_repeats = 2L)  # R=2
  fit_g <- .make_dlsi_fit(rep(0.6, 9L), n_obs = 30L, n_repeats = 3L)  # R=3
  res   <- suppressWarnings(
    delta_lsi(fit_n, fit_g, metric = "auc", seed = 1L)
  )
  expect_equal(res@tier, "D_insufficient")
  expect_equal(res@R_eff, 0L)
  # Point estimate should still be computed
  expect_true(is.finite(res@delta_lsi))
  expect_true(is.finite(res@delta_metric))
  expect_gt(res@delta_lsi, 0.0)
})

# ── Return details ────────────────────────────────────────────────────────────

test_that("return_details = TRUE stores delta_r and fits in info", {
  set.seed(6)
  fit_n <- .make_dlsi_fit(rep(0.75, 15L), n_obs = 45L, n_repeats = 5L)
  fit_g <- .make_dlsi_fit(rep(0.55, 15L), n_obs = 45L, n_repeats = 5L)
  res   <- delta_lsi(fit_n, fit_g, metric = "auc",
                     return_details = TRUE, seed = 6L)
  expect_true("delta_r"     %in% names(res@info))
  expect_true("fit_leaky"   %in% names(res@info))
  expect_true("fit_naive"   %in% names(res@info))
  expect_true("fit_guarded" %in% names(res@info))
  expect_length(res@info$delta_r, 5L)
})

test_that("deprecated fit_naive argument maps to fit_leaky", {
  set.seed(6)
  fit_n <- .make_dlsi_fit(rep(0.75, 15L), n_obs = 45L, n_repeats = 5L)
  fit_g <- .make_dlsi_fit(rep(0.55, 15L), n_obs = 45L, n_repeats = 5L)

  expect_warning(
    res <- delta_lsi(fit_naive = fit_n, fit_guarded = fit_g, metric = "auc",
                     seed = 6L),
    regexp = "fit_naive is deprecated; use fit_leaky instead"
  )
  expect_s4_class(res, "LeakDeltaLSI")
})

# ── BCa CI covers zero when naive = guarded ───────────────────────────────────

test_that("BCa CI covers 0 when no leakage (same AUC)", {
  set.seed(7)
  # 10 repeats × 3 folds; same AUC → delta = 0
  aucs <- rep(0.65, 30L)
  fit_n <- .make_dlsi_fit(aucs, n_obs = 90L, n_repeats = 10L)
  fit_g <- .make_dlsi_fit(aucs, n_obs = 90L, n_repeats = 10L)
  res   <- delta_lsi(fit_n, fit_g, metric = "auc",
                     M_boot = 500L, seed = 7L)
  expect_equal(res@delta_lsi, 0.0, tolerance = 1e-10)
  ci <- res@delta_lsi_ci
  if (all(is.finite(ci))) {
    expect_lte(ci[1L], 0.0 + 1e-8)
    expect_gte(ci[2L], 0.0 - 1e-8)
  }
})

# ── Sign-flip p-value ──────────────────────────────────────────────────────────

test_that("sign-flip p-value is in [0, 1]", {
  set.seed(8)
  fit_n <- .make_dlsi_fit(rep(0.8, 15L), n_obs = 45L, n_repeats = 5L)
  fit_g <- .make_dlsi_fit(rep(0.6, 15L), n_obs = 45L, n_repeats = 5L)
  res   <- delta_lsi(fit_n, fit_g, metric = "auc",
                     M_flip = 500L, seed = 8L)
  expect_gte(res@p_value, 0.0)
  expect_lte(res@p_value, 1.0)
})

test_that("sign-flip p-value small when inflation is large and consistent", {
  set.seed(9)
  # 20 repeats × 3 folds; large consistent gap
  fit_n <- .make_dlsi_fit(rep(0.95, 60L), n_obs = 120L, n_repeats = 20L)
  fit_g <- .make_dlsi_fit(rep(0.50, 60L), n_obs = 120L, n_repeats = 20L)
  res   <- delta_lsi(fit_n, fit_g, metric = "auc",
                     M_boot = 500L, M_flip = 2000L, seed = 9L)
  expect_lt(res@p_value, 0.05)
})

# ── show / summary don't error ────────────────────────────────────────────────

test_that("show() and summary() run without error", {
  set.seed(10)
  fit_n <- .make_dlsi_fit(rep(0.75, 15L), n_obs = 45L, n_repeats = 5L)
  fit_g <- .make_dlsi_fit(rep(0.55, 15L), n_obs = 45L, n_repeats = 5L)
  res   <- delta_lsi(fit_n, fit_g, metric = "auc",
                     M_boot = 100L, M_flip = 200L, seed = 10L)
  expect_output(show(res),     "LeakDeltaLSI")
  expect_output(summary(res),  "Delta LSI")
})

# ── Fold / repeat data frames ─────────────────────────────────────────────────

test_that("folds_naive and folds_guarded have expected columns", {
  set.seed(11)
  fit_n <- .make_dlsi_fit(c(0.7, 0.8, 0.75), n_obs = 30L, n_repeats = 1L)
  fit_g <- .make_dlsi_fit(c(0.6, 0.65, 0.6), n_obs = 30L, n_repeats = 1L)
  res   <- suppressWarnings(delta_lsi(fit_n, fit_g, metric = "auc"))
  expected_cols <- c("fold", "metric", "repeat_id", "n")
  expect_true(all(expected_cols %in% names(res@folds_naive)))
  expect_true(all(expected_cols %in% names(res@folds_guarded)))
})

# ── Internal helpers ──────────────────────────────────────────────────────────

test_that(".dlsi_huber returns median for single value", {
  expect_equal(bioLeak:::.dlsi_huber(42), 42)
})

test_that(".dlsi_huber returns NA for empty input", {
  expect_true(is.na(bioLeak:::.dlsi_huber(numeric(0))))
})

test_that(".dlsi_huber is robust to one outlier", {
  x   <- c(0, 0, 0, 0, 0, 100)
  est <- bioLeak:::.dlsi_huber(x)
  # Huber M-estimator should be much closer to 0 than arithmetic mean (100/6 ≈ 16.7)
  expect_lt(abs(est), abs(mean(x)))
})

test_that(".dlsi_bca_ci covers known mean for symmetric distribution", {
  set.seed(42)
  x  <- rnorm(20, mean = 0.2, sd = 0.05)
  ci <- bioLeak:::.dlsi_bca_ci(x, mean, M = 2000L, seed = 42L)
  expect_true(all(is.finite(ci)))
  expect_lt(ci[1L], 0.2)
  expect_gt(ci[2L], 0.2)
})

test_that(".dlsi_sign_flip p-value is NA for length-1 input", {
  expect_true(is.na(bioLeak:::.dlsi_sign_flip(0.5)))
})

test_that(".dlsi_sign_flip exact enumeration: known result for R=2", {
  # delta_r = c(1, 1); obs_stat = mean(1, 1) = 1
  # All 4 sign combos (R=2 <= 15 → exact):
  #   (--) → mean(-1, -1) = -1;  (-+) → mean(-1,  1) =  0
  #   (+-) → mean( 1, -1) =  0;  (++) → mean( 1,  1) =  1
  # |null| >= |obs| = 1: combos (--) and (++) → count = 2
  # Exact p = 2 / 4 = 0.5  (no continuity correction for full enumeration)
  p <- bioLeak:::.dlsi_sign_flip(c(1, 1), seed = 1L)
  expect_equal(p, 0.5, tolerance = 1e-10)
})

# ── Adversarial tests ─────────────────────────────────────────────────────────

test_that("equal repeat count but mismatched fold structures warns and goes unpaired", {
  # fit_n uses n_obs=30, fit_g uses n_obs=24 → different test indices → unpaired
  fit_n <- .make_dlsi_fit(rep(0.8, 6L), n_obs = 30L, n_repeats = 2L)
  fit_g <- .make_dlsi_fit(rep(0.6, 6L), n_obs = 24L, n_repeats = 2L)
  res <- expect_warning_match(
    delta_lsi(fit_n, fit_g, metric = "auc"),
    pattern = "fold structures"
  )
  expect_equal(res@tier, "D_insufficient")
  expect_equal(res@R_eff, 0L)
  # Point estimate should still be computed unpaired
  expect_true(is.finite(res@delta_lsi))
})

test_that("relabeled repeat_ids do not corrupt pairing (regression)", {
  n_folds <- 3L; n_obs <- 45L; n_repeats <- 5L

  # Naive: per-repeat means decrease from 0.9 to 0.7
  naive_aucs <- rep(c(0.9, 0.85, 0.80, 0.75, 0.70), each = n_folds)
  fit_n <- .make_dlsi_fit(naive_aucs, n_obs = n_obs, n_repeats = n_repeats)

  # Guarded: per-repeat means decrease from 0.8 to 0.6
  guard_aucs <- rep(c(0.80, 0.75, 0.70, 0.65, 0.60), each = n_folds)
  fit_g <- .make_dlsi_fit(guard_aucs, n_obs = n_obs, n_repeats = n_repeats)

  # Reverse the repeat_id labels in guarded fit's splits.
  # Original: positions 1-3 → rid=1, 4-6 → rid=2, ..., 13-15 → rid=5
  # Reversed: positions 1-3 → rid=5, 4-6 → rid=4, ..., 13-15 → rid=1
  new_idx <- fit_g@splits@indices
  for (i in seq_along(new_idx)) {
    new_idx[[i]]$repeat_id <- (n_repeats + 1L) - new_idx[[i]]$repeat_id
  }
  fit_g@splits@indices <- new_idx

  res <- delta_lsi(fit_n, fit_g, metric = "auc",
                   return_details = TRUE, seed = 42L)

  # With correct positional pairing, every repeat delta should be exactly 0.1.
  # Bug (before fix): deltas would be 0.3, 0.2, 0.1, 0.0, -0.1 due to
  # repeat_id sort-order mismatch between the two fits.
  expect_equal(res@info$delta_r, rep(0.1, n_repeats), tolerance = 1e-10)
  expect_true(res@info$paired)
  expect_equal(res@R_eff, n_repeats)
})

test_that("lower-is-better metric: delta > 0 when naive has lower (better) values", {
  set.seed(1)
  # For lower-is-better metrics: naive = 0.3 (better), guarded = 0.5 (worse)
  # Without sign flip: naive - guarded = -0.2 < 0 (looks like naive is worse)
  # With higher_is_better = FALSE: sign_factor = -1 → delta = -1 * (-0.2) = +0.2 > 0
  fit_n <- .make_dlsi_fit(rep(0.3, 10L), n_obs = 30L, n_repeats = 2L)
  fit_g <- .make_dlsi_fit(rep(0.5, 10L), n_obs = 30L, n_repeats = 2L)
  res   <- suppressWarnings(delta_lsi(
    fit_n, fit_g, metric = "auc",
    higher_is_better = FALSE, seed = 1L
  ))
  expect_gt(res@delta_lsi,    0.0)
  expect_gt(res@delta_metric, 0.0)
  expect_false(isTRUE(res@info[["higher_is_better"]]))
})

test_that("metrics df without fold column falls back to predictions gracefully", {
  # Build a LeakFit whose @metrics has the requested metric but no 'fold' column.
  # delta_lsi should fall back to .dlsi_fold_metrics_from_preds() without crashing.
  splits <- methods::new("LeakSplits",
    mode    = "subject_grouped",
    indices = list(
      list(fold = 1L, repeat_id = 1L, test = 1:15, train = 16:30),
      list(fold = 2L, repeat_id = 1L, test = 16:30, train = 1:15)
    ),
    info = list()
  )
  # @metrics has auc but NO fold column
  mdf <- data.frame(learner = "glm", auc = c(0.7, 0.75), stringsAsFactors = FALSE)

  set.seed(1)
  make_preds <- function(te, auc_val) {
    n <- length(te)
    y <- rbinom(n, 1L, 0.5)
    p <- pmin(1, pmax(0, rnorm(n, 0.5, auc_val / 2)))
    data.frame(id = te, fold = if (min(te) == 1L) 1L else 2L,
               learner = "glm", truth = y, pred = p, stringsAsFactors = FALSE)
  }
  preds <- list(make_preds(1:15, 0.7), make_preds(16:30, 0.75))

  fit_bad <- methods::new("LeakFit",
    splits = splits, metrics = mdf, metric_summary = data.frame(),
    audit = data.frame(), predictions = preds, preprocess = list(),
    learners = list(), outcome = "y", task = "binomial",
    feature_names = character(0L), info = list()
  )
  fit_good <- .make_dlsi_fit(c(0.6, 0.65), n_obs = 30L, n_repeats = 1L)

  # Should not crash; falls back to prediction-based metric recomputation
  expect_no_error(suppressWarnings(
    delta_lsi(fit_bad, fit_good, metric = "auc")
  ))
})

test_that("auto-detect higher_is_better = FALSE for rmse via metric name", {
  set.seed(2)
  # Naive RMSE = 0.3 (lower = better), guarded RMSE = 0.5
  fit_n <- .make_dlsi_fit(rep(0.3, 10L), n_obs = 30L, n_repeats = 2L)
  fit_g <- .make_dlsi_fit(rep(0.5, 10L), n_obs = 30L, n_repeats = 2L)
  # Add rmse column so .dlsi_fold_metrics can find it
  fit_n@metrics$rmse <- fit_n@metrics$auc
  fit_g@metrics$rmse <- fit_g@metrics$auc
  # Auto-detect: metric="rmse" with higher_is_better=NULL (default)
  res <- suppressWarnings(delta_lsi(fit_n, fit_g, metric = "rmse", seed = 2L))
  expect_false(isTRUE(res@info[["higher_is_better"]]))
  # Naive RMSE is lower (better) → positive delta (leakage inflation)
  expect_gt(res@delta_lsi,    0.0)
  expect_gt(res@delta_metric, 0.0)
})

# ── Exchangeability: by_group / within_batch warnings ─────────────────────────

test_that("exchangeability 'by_group' emits iid-fallback warning", {
  set.seed(20)
  fit_n <- .make_dlsi_fit(rep(0.8, 15L), n_obs = 45L, n_repeats = 5L)
  fit_g <- .make_dlsi_fit(rep(0.6, 15L), n_obs = 45L, n_repeats = 5L)
  expect_warning(
    res <- delta_lsi(fit_n, fit_g, metric = "auc",
                     exchangeability = "by_group", seed = 20L),
    regexp = "iid repeat structure"
  )
  # Despite the warning, a p-value is still computed (using iid sign-flip)
  expect_true(is.finite(res@p_value))
  expect_equal(res@exchangeability, "by_group")
})

test_that("exchangeability 'within_batch' emits iid-fallback warning", {
  set.seed(21)
  fit_n <- .make_dlsi_fit(rep(0.8, 15L), n_obs = 45L, n_repeats = 5L)
  fit_g <- .make_dlsi_fit(rep(0.6, 15L), n_obs = 45L, n_repeats = 5L)
  expect_warning(
    delta_lsi(fit_n, fit_g, metric = "auc",
              exchangeability = "within_batch", seed = 21L),
    regexp = "iid repeat structure"
  )
})

# ── Exchangeability: blocked_time block sign-flip ──────────────────────────────

test_that(".dlsi_resolve_block_size returns 1 for uncorrelated Δ_r", {
  # iid Δ_r has rho1 ≈ 0, so block_size should be 1
  set.seed(42)
  delta_r <- rnorm(20)
  bs <- bioLeak:::.dlsi_resolve_block_size(delta_r)
  expect_gte(bs, 1L)
  # With low autocorrelation, expected block_size is 1 (or very small)
  expect_lte(bs, 3L)
})

test_that(".dlsi_resolve_block_size respects explicit block_size argument", {
  delta_r <- rnorm(20)
  expect_equal(bioLeak:::.dlsi_resolve_block_size(delta_r, block_size = 4L), 4L)
})

test_that(".dlsi_resolve_block_size caps at floor(R/3)", {
  # Highly correlated Δ_r: AR(1) rho ~ 0.99, would give block_size = R
  # but cap forces at most floor(R/3)
  delta_r <- cumsum(rnorm(12))   # near unit root, very high autocorrelation
  bs      <- bioLeak:::.dlsi_resolve_block_size(delta_r)
  expect_lte(bs, floor(length(delta_r) / 3L))
})

test_that(".dlsi_sign_flip_blocked exact: known result for R=4, block_size=2", {
  # delta_r = c(1, 1, 1, 1); block_size=2 → block_id = c(1,1,2,2); n_blocks=2
  # All 4 block-sign combos (exact enumeration, n_blocks=2 <= 15):
  #   (--): mean(-1,-1,-1,-1) = -1
  #   (-+): mean(-1,-1, 1, 1) =  0
  #   (+-): mean( 1, 1,-1,-1) =  0
  #   (++): mean( 1, 1, 1, 1) =  1
  # obs_stat = 1; |null| >= |obs|=1: combos (--) and (++) → count = 2
  # Exact p = 2/4 = 0.5
  res <- bioLeak:::.dlsi_sign_flip_blocked(c(1, 1, 1, 1), block_size = 2L, seed = 1L)
  expect_equal(res$p_value,        0.5, tolerance = 1e-10)
  expect_equal(res$n_blocks,       2L)
  expect_equal(res$block_size_used, 2L)
})

test_that("blocked_time sign-flip gives more conservative p than iid for AR Δ_r", {
  set.seed(30)
  # Generate strongly AR(1) Δ_r with a positive mean (leakage signal)
  R <- 15L
  delta_r <- 0.2 + as.numeric(arima.sim(list(ar = 0.85), n = R, sd = 0.05))

  # iid sign-flip
  p_iid <- bioLeak:::.dlsi_sign_flip(delta_r, M_flip = 5000L, seed = 30L)

  # Blocked sign-flip with block_size=3 (5 blocks from 15 repeats)
  sf_blk <- bioLeak:::.dlsi_sign_flip_blocked(delta_r, block_size = 3L,
                                               M_flip = 5000L, seed = 30L)
  p_blk <- sf_blk$p_value

  # Block p-value should be >= iid p-value (more conservative under autocorrelation)
  # Allow tolerance for MC noise; the key property is p_blk is generally larger
  expect_gte(sf_blk$n_blocks, 5L)  # at least 5 blocks for inference
  expect_true(is.finite(p_blk))
  expect_true(is.finite(p_iid))
})

test_that("blocked_time with n_blocks < 5 sets p_value = NA and warns", {
  set.seed(31)
  # 6 repeats, block_size=3 → 2 blocks → below threshold of 5
  fit_n <- .make_dlsi_fit(rep(0.8, 18L), n_obs = 54L, n_repeats = 6L)
  fit_g <- .make_dlsi_fit(rep(0.6, 18L), n_obs = 54L, n_repeats = 6L)
  expect_warning(
    res <- delta_lsi(fit_n, fit_g, metric = "auc",
                     exchangeability = "blocked_time",
                     block_size = 3L, seed = 31L),
    regexp = "n_blocks"
  )
  expect_true(is.na(res@p_value))
  expect_equal(res@info$n_blocks, 2L)
})

test_that("blocked_time with explicit block_size=1 behaves like iid sign-flip", {
  set.seed(32)
  fit_n <- .make_dlsi_fit(rep(0.8, 15L), n_obs = 45L, n_repeats = 5L)
  fit_g <- .make_dlsi_fit(rep(0.6, 15L), n_obs = 45L, n_repeats = 5L)
  # block_size=1 → each repeat is its own block → equivalent to iid
  res_blk <- delta_lsi(fit_n, fit_g, metric = "auc",
                        exchangeability = "blocked_time",
                        block_size = 1L, seed = 32L)
  res_iid <- delta_lsi(fit_n, fit_g, metric = "auc",
                        exchangeability = "iid", seed = 32L)
  # p-values should be identical (same enumeration, same seed offset)
  expect_equal(res_blk@p_value, res_iid@p_value, tolerance = 1e-10)
})

test_that("blocked_time stores block_size_used and n_blocks in info", {
  set.seed(33)
  fit_n <- .make_dlsi_fit(rep(0.8, 30L), n_obs = 90L, n_repeats = 10L)
  fit_g <- .make_dlsi_fit(rep(0.6, 30L), n_obs = 90L, n_repeats = 10L)
  res <- delta_lsi(fit_n, fit_g, metric = "auc",
                    exchangeability = "blocked_time",
                    block_size = 2L, seed = 33L,
                    M_boot = 100L, M_flip = 500L)
  expect_equal(res@info$block_size_used, 2L)
  expect_equal(res@info$n_blocks,        5L)   # 10 repeats / block_size=2 = 5 blocks
  expect_true(is.finite(res@p_value))          # 5 blocks >= 5 → p_value available
})