# test-criterion_fn.R
# TDD tests for custom criterion callback feature (Objective 27)
#
# Covers custom criterion_fn parameter in summary.irt_results() that allows
# users to compute item-level custom performance criteria, augmenting the
# built-in Morris et al. (2019) criteria.

# =============================================================================
# Helper: Build a minimal irt_results fixture for fast tests
# =============================================================================

make_fixture_results <- function(
  n_items = 3,
  sample_sizes = c(100, 200),
  iterations = 5,
  model = "1PL",
  seed = 42
) {
  # Build item_results with deterministic estimates
  rows <- list()
  set.seed(seed)
  for (ss in sample_sizes) {
    for (item in seq_len(n_items)) {
      for (param in "b") {
        tv <- seq(-2, 2, length.out = n_items)[item]
        ests <- tv + rnorm(iterations, mean = 0.01, sd = 0.1)
        rows[[length(rows) + 1]] <- data.frame(
          iteration   = seq_len(iterations),
          sample_size = rep(ss, iterations),
          item        = rep(item, iterations),
          param       = rep(param, iterations),
          true_value  = rep(tv, iterations),
          estimate    = ests,
          se          = rep(0.1, iterations),
          ci_lower    = ests - 0.2,
          ci_upper    = ests + 0.2,
          converged   = rep(TRUE, iterations),
          stringsAsFactors = FALSE
        )
      }
    }
  }
  item_results <- do.call(rbind, rows)
  rownames(item_results) <- NULL

  # Build theta_results
  theta_results <- data.frame(
    iteration   = rep(seq_len(iterations), each = length(sample_sizes)),
    sample_size = rep(sample_sizes, times = iterations),
    theta_cor   = runif(iterations * length(sample_sizes), 0.8, 0.99),
    theta_rmse  = runif(iterations * length(sample_sizes), 0.2, 0.5),
    converged   = TRUE,
    stringsAsFactors = FALSE
  )

  # Build design and study stubs
  design_stub <- list(
    model = model,
    n_items = n_items,
    item_params = list(b = seq(-2, 2, length.out = n_items))
  )
  class(design_stub) <- "irt_design"

  study_stub <- list(
    design = design_stub,
    sample_sizes = sample_sizes,
    missing_type = "none"
  )
  class(study_stub) <- "irt_study"

  structure(
    list(
      item_results  = item_results,
      theta_results = theta_results,
      study         = study_stub,
      iterations    = iterations,
      seed          = seed,
      elapsed       = 0.1
    ),
    class = "irt_results"
  )
}


# =============================================================================
# Test 1: Happy path — single named scalar
# =============================================================================

test_that("criterion_fn returning single named scalar creates correct column", {
  results <- make_fixture_results()

  # Simple callback: return constant value
  my_fn <- function(estimates, true_value, ci_lower, ci_upper, converged, ...) {
    c(my_stat = 0.5)
  }

  s <- summary(results, criterion_fn = my_fn)

  # Should have my_stat column
  expect_true("my_stat" %in% names(s$item_summary))

  # All values should be 0.5 (since callback always returns 0.5)
  expect_true(all(s$item_summary$my_stat == 0.5, na.rm = TRUE))

  # Standard columns should still exist
  expect_true("bias" %in% names(s$item_summary))
  expect_true("sample_size" %in% names(s$item_summary))
})


# =============================================================================
# Test 2: Happy path — multiple named scalars
# =============================================================================

test_that("criterion_fn returning multiple named scalars creates all columns", {
  results <- make_fixture_results()

  my_fn <- function(estimates, true_value, ci_lower, ci_upper, converged, ...) {
    c(stat_a = 1.0, stat_b = 2.5)
  }

  s <- summary(results, criterion_fn = my_fn)

  # Both columns should exist
  expect_true("stat_a" %in% names(s$item_summary))
  expect_true("stat_b" %in% names(s$item_summary))

  # Check values
  expect_true(all(s$item_summary$stat_a == 1.0, na.rm = TRUE))
  expect_true(all(s$item_summary$stat_b == 2.5, na.rm = TRUE))
})


# =============================================================================
# Test 3: Backward compatibility — no criterion_fn produces identical output
# =============================================================================

test_that("summary with no criterion_fn identical to criterion_fn = NULL", {
  results <- make_fixture_results()

  s1 <- summary(results)
  s2 <- summary(results, criterion_fn = NULL)

  # Same columns
  expect_equal(names(s1$item_summary), names(s2$item_summary))

  # Same data
  expect_equal(s1$item_summary, s2$item_summary)
})

test_that("backward compat: no custom columns without criterion_fn", {
  results <- make_fixture_results()

  s <- summary(results)

  # Should have only built-in columns (no custom columns)
  expected_cols <- c(
    "sample_size", "item", "param", "true_value",
    "bias", "empirical_se", "mse", "rmse", "coverage",
    "mcse_bias", "mcse_mse", "n_converged"
  )
  expect_equal(sort(names(s$item_summary)), sort(expected_cols))
})


# =============================================================================
# Test 4: Callback receives correct data per cell
# =============================================================================

test_that("callback receives correct estimates, true_value, CIs, converged", {
  results <- make_fixture_results(n_items = 1, sample_sizes = 100, iterations = 3)

  # Capture inputs in a parent-scope list
  captured <- list()

  my_fn <- function(estimates, true_value, ci_lower, ci_upper, converged, ...) {
    # Verify structure
    expect_true(is.numeric(estimates))
    expect_true(is.numeric(true_value))
    expect_length(true_value, 1)
    expect_true(is.numeric(ci_lower))
    expect_true(is.numeric(ci_upper))
    expect_true(is.logical(converged))

    # Verify lengths match
    expect_length(estimates, length(converged))
    expect_length(ci_lower, length(converged))
    expect_length(ci_upper, length(converged))

    c(test_stat = 0)
  }

  s <- summary(results, criterion_fn = my_fn)

  # If we got here, all expect_true calls inside the callback passed
  expect_true(TRUE)
})


# =============================================================================
# Test 5: Error — non-function criterion_fn
# =============================================================================

test_that("criterion_fn = string raises informative error", {
  results <- make_fixture_results()

  expect_error(
    summary(results, criterion_fn = "not_a_function"),
    "criterion_fn.*function"
  )
})

test_that("criterion_fn = list raises informative error", {
  results <- make_fixture_results()

  expect_error(
    summary(results, criterion_fn = list(a = 1)),
    "criterion_fn.*function"
  )
})


# =============================================================================
# Test 6: Error — callback returns non-numeric
# =============================================================================

test_that("callback returning character vector raises error with cell context", {
  results <- make_fixture_results(n_items = 1, sample_sizes = 100)

  bad_fn <- function(estimates, true_value, ci_lower, ci_upper, converged, ...) {
    c(x = "hello")
  }

  expect_error(
    summary(results, criterion_fn = bad_fn),
    "numeric.*sample_size.*item.*param"
  )
})

test_that("callback returning list raises error", {
  results <- make_fixture_results(n_items = 1, sample_sizes = 100)

  bad_fn <- function(estimates, true_value, ci_lower, ci_upper, converged, ...) {
    list(x = 1)
  }

  expect_error(
    summary(results, criterion_fn = bad_fn),
    "numeric"
  )
})


# =============================================================================
# Test 7: Error — callback returns unnamed vector
# =============================================================================

test_that("callback returning unnamed numeric vector raises error", {
  results <- make_fixture_results(n_items = 1, sample_sizes = 100)

  bad_fn <- function(estimates, true_value, ci_lower, ci_upper, converged, ...) {
    c(1, 2, 3)
  }

  expect_error(
    summary(results, criterion_fn = bad_fn),
    "named"
  )
})

test_that("callback with some empty names raises error", {
  results <- make_fixture_results(n_items = 1, sample_sizes = 100)

  bad_fn <- function(estimates, true_value, ci_lower, ci_upper, converged, ...) {
    c(a = 1, 2)
  }

  expect_error(
    summary(results, criterion_fn = bad_fn),
    "named"
  )
})


# =============================================================================
# Test 8: Error — inconsistent names across cells
# =============================================================================

test_that("callback returns inconsistent names across cells raises error", {
  results <- make_fixture_results(n_items = 2, sample_sizes = c(100, 200))

  call_count <- 0
  bad_fn <- function(estimates, true_value, ci_lower, ci_upper, converged, ...) {
    call_count <<- call_count + 1
    # First few calls return 'a', later calls return 'b'
    if (call_count <= 3) {
      c(a = 1)
    } else {
      c(b = 2)
    }
  }

  expect_error(
    summary(results, criterion_fn = bad_fn),
    "consistent names|reference"
  )
})


# =============================================================================
# Test 9: Error — callback throws
# =============================================================================

test_that("callback throwing error wrapped with cell context", {
  results <- make_fixture_results(n_items = 1, sample_sizes = 100)

  bad_fn <- function(estimates, true_value, ci_lower, ci_upper, converged, ...) {
    stop("oh no", call. = FALSE)
  }

  expect_error(
    summary(results, criterion_fn = bad_fn),
    "oh no"
  )
  # Error should also include cell identifier
  expect_error(
    summary(results, criterion_fn = bad_fn),
    "sample_size.*item.*param"
  )
})


# =============================================================================
# Test 10: Compatibility with criterion filter
# =============================================================================

test_that("criterion filter keeps custom columns even when filtering built-ins", {
  results <- make_fixture_results()

  my_fn <- function(estimates, true_value, ci_lower, ci_upper, converged, ...) {
    c(my_custom = 99.9)
  }

  s <- summary(results, criterion = "bias", criterion_fn = my_fn)

  # Built-in columns should be filtered
  expect_true("bias" %in% names(s$item_summary))
  expect_false("mse" %in% names(s$item_summary))

  # But custom column should still be present
  expect_true("my_custom" %in% names(s$item_summary))
})


# =============================================================================
# Test 11: Compatibility with param filter
# =============================================================================

test_that("param filter works with criterion_fn", {
  results <- make_fixture_results(n_items = 3)

  my_fn <- function(estimates, true_value, ci_lower, ci_upper, converged, ...) {
    c(custom_stat = 42)
  }

  # Only summarize "b" parameter (which is all we have in this fixture)
  s <- summary(results, param = "b", criterion_fn = my_fn)

  # All rows should be param "b"
  expect_true(all(s$item_summary$param == "b"))

  # Custom column should still exist
  expect_true("custom_stat" %in% names(s$item_summary))
})


# =============================================================================
# Test 12: Compatibility with model misspecification (cross-fit)
# =============================================================================

test_that("criterion_fn works with cross-fit (gen=1PL, est=2PL)", {
  # Build a 1PL design and 2PL estimation model
  design <- irt_design(
    model = "1PL",
    n_items = 3,
    item_params = list(b = c(-1, 0, 1))
  )

  study <- irt_study(
    design,
    sample_sizes = 100,
    estimation_model = "2PL"
  )

  results <- irt_simulate(study, iterations = 3, seed = 42, progress = FALSE)

  my_fn <- function(estimates, true_value, ci_lower, ci_upper, converged, ...) {
    c(cross_fit_stat = mean(estimates, na.rm = TRUE))
  }

  # Should not error and should produce custom column
  s <- summary(results, criterion_fn = my_fn)

  expect_true("cross_fit_stat" %in% names(s$item_summary))
  # Estimation model is 2PL, so extract_params() returns both a and b for each
  # of the 3 items — 6 rows per sample_size. Obj 26's
  # build_true_params_for_estimation() adds a=1 rows (Rasch constraint) so true
  # values exist for all estimated params.
  expect_equal(nrow(s$item_summary), 6)  # 3 items × 2 params (a + b)
})


# =============================================================================
# Test 13: theta_summary unchanged regardless of criterion_fn
# =============================================================================

test_that("theta_summary unaffected by criterion_fn", {
  results <- make_fixture_results()

  my_fn <- function(estimates, true_value, ci_lower, ci_upper, converged, ...) {
    c(custom = 1.5)
  }

  s1 <- summary(results)
  s2 <- summary(results, criterion_fn = my_fn)

  # theta_summary should be identical
  expect_equal(s1$theta_summary, s2$theta_summary)
})


# =============================================================================
# Test 14: Callback can use ... to accept future parameters
# =============================================================================

test_that("callback with ... accepts named args without error", {
  results <- make_fixture_results()

  # Function with ... for future-proofing
  flexible_fn <- function(estimates, true_value, ...) {
    c(flexible_stat = 3.14)
  }

  expect_no_error(
    summary(results, criterion_fn = flexible_fn)
  )
})


# =============================================================================
# Test 15: Custom columns positioned correctly in output
# =============================================================================

test_that("custom columns appended after n_converged", {
  results <- make_fixture_results()

  my_fn <- function(estimates, true_value, ci_lower, ci_upper, converged, ...) {
    c(custom_a = 1, custom_b = 2)
  }

  s <- summary(results, criterion_fn = my_fn)

  col_names <- names(s$item_summary)

  # Find positions
  n_converged_pos <- match("n_converged", col_names)
  custom_a_pos <- match("custom_a", col_names)
  custom_b_pos <- match("custom_b", col_names)

  # Custom columns should come after n_converged
  expect_true(custom_a_pos > n_converged_pos)
  expect_true(custom_b_pos > n_converged_pos)
})