test_that("reference_case_benchmark returns package-native benchmark bundle", {
  bench <- suppressWarnings(reference_case_benchmark(
    cases = c("synthetic_truth", "synthetic_bias_contract", "study1_itercal_pair"),
    method = "MML",
    quad_points = 5,
    maxit = 30
  ))

  expect_s3_class(bench, "mfrm_reference_benchmark")
  expect_true(all(c(
    "overview", "summary", "table", "fit_runs", "case_summary",
    "design_checks", "recovery_checks", "bias_checks", "pair_checks",
    "linking_checks", "source_profile", "settings", "notes"
  ) %in% names(bench)))

  expect_true(is.data.frame(bench$overview))
  expect_true(is.data.frame(bench$fit_runs))
  expect_true(is.data.frame(bench$case_summary))
  expect_true(is.data.frame(bench$design_checks))
  expect_true(is.data.frame(bench$recovery_checks))
  expect_true(is.data.frame(bench$bias_checks))
  expect_true(is.data.frame(bench$pair_checks))
  expect_true(is.data.frame(bench$linking_checks))
  expect_true(is.data.frame(bench$source_profile))
  expect_identical(bench$settings$intended_use, "internal_benchmark")
  expect_false(isTRUE(bench$settings$external_validation))

  expect_equal(nrow(bench$case_summary), 3)
  expect_true(all(c("synthetic_truth", "synthetic_bias_contract", "study1_itercal_pair") %in% bench$case_summary$Case))
  expect_true(all(bench$fit_runs$PrecisionTier %in% c("model_based", "hybrid", "exploratory")))
})

test_that("reference_case_benchmark recovers synthetic truth under MML", {
  bench <- suppressWarnings(reference_case_benchmark(
    cases = "synthetic_truth",
    method = "MML",
    quad_points = 7,
    maxit = 40
  ))

  expect_true(all(bench$recovery_checks$Status %in% c("Pass", "Warn", "Fail")))
  expect_true(min(bench$recovery_checks$Correlation, na.rm = TRUE) > 0.95)
  expect_true(max(bench$recovery_checks$MeanAbsoluteDeviation, na.rm = TRUE) < 0.30)
  expect_true(all(bench$fit_runs$SupportsFormalInference))
})

test_that("reference_case_benchmark captures pair stability and summary output", {
  bench <- suppressWarnings(reference_case_benchmark(
    cases = "study1_itercal_pair",
    method = "MML",
    quad_points = 5,
    maxit = 30
  ))

  expect_true(all(c("Criterion", "Rater", "OverallFit") %in% bench$pair_checks$Facet))
  expect_true(all(c("Criterion", "Rater") %in% bench$linking_checks$Facet))
  expect_identical(
    as.character(bench$linking_checks$Status[bench$linking_checks$Facet == "Rater"][1]),
    "Pass"
  )
  expect_identical(
    as.character(bench$linking_checks$Status[bench$linking_checks$Facet == "Criterion"][1]),
    "Warn"
  )
  criterion_row <- bench$pair_checks[bench$pair_checks$Facet == "Criterion", , drop = FALSE]
  expect_true(criterion_row$Pearson[1] > 0.95)
  expect_true(criterion_row$MeanAbsoluteDifference[1] < 0.10)

  s <- summary(bench)
  expect_s3_class(s, "summary.mfrm_bundle")
  expect_true(is.data.frame(s$overview))
  expect_identical(as.character(s$overview$Class[1]), "mfrm_reference_benchmark")

  printed <- paste(capture.output(print(s)), collapse = "\n")
  expect_match(printed, "mfrmr Internal Benchmark Summary", fixed = TRUE)
})

test_that("reference_case_benchmark verifies bias-contract identities", {
  bench <- suppressWarnings(reference_case_benchmark(
    cases = "synthetic_bias_contract",
    method = "MML",
    quad_points = 7,
    maxit = 40
  ))

  expect_true(nrow(bench$bias_checks) >= 4)
  stable_rows <- bench$bias_checks[
    bench$bias_checks$Metric %in% c("BiasDFIdentity", "LocalMeasureIdentity", "PairContrastIdentity"),
    ,
    drop = FALSE
  ]
  expect_true(nrow(stable_rows) == 3)
  expect_true(all(stable_rows$Status == "Pass"))
  expect_true(max(stable_rows$MaxError, na.rm = TRUE) < 1e-8)
  expect_true(any(bench$source_profile$RuleID == "bias_pairwise_welch"))
})

test_that("reference_case_benchmark reports exploratory precision under JML", {
  bench <- suppressWarnings(reference_case_benchmark(
    cases = "synthetic_truth",
    method = "JML",
    maxit = 30
  ))

  expect_true(all(bench$fit_runs$PrecisionTier == "exploratory"))
  expect_true(all(!bench$fit_runs$SupportsFormalInference))
})

test_that("reference_case_benchmark handles single-case runs without warnings", {
  expect_no_warning(
    reference_case_benchmark(
      cases = "synthetic_truth",
      method = "MML",
      quad_points = 5,
      maxit = 20
    )
  )
})

test_that("reference_case_benchmark does not pass bias-contract cases when checks are missing", {
  local_mocked_bindings(
    estimate_bias = function(...) {
      structure(list(table = data.frame()), class = "mfrm_bias")
    },
    .package = "mfrmr"
  )

  bench <- suppressWarnings(reference_case_benchmark(
    cases = "synthetic_bias_contract",
    method = "MML",
    quad_points = 5,
    maxit = 20
  ))

  row <- bench$case_summary[bench$case_summary$Case == "synthetic_bias_contract", , drop = FALSE]
  expect_identical(as.character(row$Status[1]), "Warn")
  expect_match(row$KeySignal[1], "No bias-contract checks were produced", fixed = TRUE)
})