# test-irt_simulate.R
# TDD tests for irt_simulate() — Objective 8
# These tests define the expected behavior BEFORE implementation exists.
#
# irt_simulate() is the core simulation engine. It takes an irt_study object
# and runs a Monte Carlo simulation loop:
#   for each iteration:
#     for each sample_size:
#       generate data → apply missing → fit model → extract parameters
#
# Design decision (Approach A): irt_simulate() stores RAW per-iteration
# parameter estimates (point estimates, SEs, CIs, convergence). Criterion
# computation (MSE, bias, RMSE, coverage, etc.) happens at summary() time,
# not inside irt_simulate(). This follows the simhelpers/SimDesign/Morris
# et al. (2019) convention.
#
# Per-iteration storage:
#   iteration | sample_size | item | param | true_value | estimate | se |
#     ci_lower | ci_upper | converged
#
# For theta-level summaries (correlation with true theta, RMSE), a separate
# theta_results data.frame stores per-iteration summary stats:
#   iteration | sample_size | theta_cor | theta_rmse | converged


# --- Helper: reusable designs and studies ------------------------------------

make_1pl_study <- function(n_items = 10,
                           sample_sizes = c(100, 250),
                           missing = "none") {
  design <- irt_design(
    model = "1PL",
    n_items = n_items,
    item_params = list(b = seq(-2, 2, length.out = n_items))
  )
  irt_study(design, sample_sizes = sample_sizes, missing = missing)
}

make_2pl_study <- function(n_items = 10,
                           sample_sizes = c(100, 250),
                           missing = "none") {
  design <- irt_design(
    model = "2PL",
    n_items = n_items,
    item_params = list(
      a = rep(1.2, n_items),
      b = seq(-2, 2, length.out = n_items)
    )
  )
  irt_study(design, sample_sizes = sample_sizes, missing = missing)
}

make_grm_study <- function(n_items = 8,
                           n_categories = 4,
                           sample_sizes = c(200, 500)) {
  n_thresh <- n_categories - 1L
  b_mat <- matrix(
    seq(-2, 2, length.out = n_items * n_thresh),
    nrow = n_items, ncol = n_thresh
  )
  if (n_thresh > 1L) {
    b_mat <- t(apply(b_mat, 1, sort))
  }
  design <- irt_design(
    model = "GRM",
    n_items = n_items,
    item_params = list(a = rep(1.0, n_items), b = b_mat)
  )
  irt_study(design, sample_sizes = sample_sizes)
}


# =============================================================================
# 1. Return Type and S3 Class
# =============================================================================

test_that("irt_simulate returns an irt_results S3 object", {
  study <- make_1pl_study()
  res <- irt_simulate(study, iterations = 3, seed = 42)

  expect_s3_class(res, "irt_results")
})

test_that("irt_results contains required top-level elements", {
  study <- make_1pl_study()
  res <- irt_simulate(study, iterations = 3, seed = 42)

  # Must contain: item_results, theta_results, study, iterations, seed, timings

  expect_true("item_results" %in% names(res))
  expect_true("theta_results" %in% names(res))
  expect_true("study" %in% names(res))
  expect_true("iterations" %in% names(res))
  expect_true("seed" %in% names(res))
})

test_that("irt_results$study is the original irt_study object", {
  study <- make_1pl_study()
  res <- irt_simulate(study, iterations = 3, seed = 42)

  expect_identical(res$study, study)
})


# =============================================================================
# 2. item_results Structure
# =============================================================================

test_that("item_results is a data.frame with expected columns", {
  study <- make_1pl_study(n_items = 10, sample_sizes = c(100, 250))
  res <- irt_simulate(study, iterations = 3, seed = 42)

  ir <- res$item_results
  expect_s3_class(ir, "data.frame")

  expected_cols <- c(
    "iteration", "sample_size", "item", "param",
    "true_value", "estimate", "se", "ci_lower", "ci_upper", "converged"
  )
  for (col in expected_cols) {
    expect_true(col %in% names(ir), info = paste("Missing column:", col))
  }
})

test_that("item_results has correct number of rows for 1PL", {
  n_items <- 10
  sample_sizes <- c(100, 250)
  n_ss <- length(sample_sizes)
  iterations <- 3

  study <- make_1pl_study(n_items = n_items, sample_sizes = sample_sizes)
  res <- irt_simulate(study, iterations = iterations, seed = 42)

  ir <- res$item_results

  # 1PL: one parameter per item (b/difficulty)
  # Rows = iterations × sample_sizes × items × params_per_item
  # 1PL has 1 param (b) per item
  n_params_per_item <- 1L
  expected_rows <- iterations * n_ss * n_items * n_params_per_item
  expect_equal(nrow(ir), expected_rows)
})

test_that("item_results has correct number of rows for 2PL", {
  n_items <- 10
  sample_sizes <- c(100, 250)
  n_ss <- length(sample_sizes)
  iterations <- 3

  study <- make_2pl_study(n_items = n_items, sample_sizes = sample_sizes)
  res <- irt_simulate(study, iterations = iterations, seed = 42)

  ir <- res$item_results

  # 2PL: two parameters per item (a, b)
  n_params_per_item <- 2L
  expected_rows <- iterations * n_ss * n_items * n_params_per_item
  expect_equal(nrow(ir), expected_rows)
})

test_that("item_results iteration column contains 1:iterations", {
  study <- make_1pl_study()
  res <- irt_simulate(study, iterations = 5, seed = 42)

  expect_equal(sort(unique(res$item_results$iteration)), 1:5)
})

test_that("item_results sample_size column matches study sample_sizes", {
  study <- make_1pl_study(sample_sizes = c(100, 250))
  res <- irt_simulate(study, iterations = 3, seed = 42)

  expect_equal(sort(unique(res$item_results$sample_size)), c(100L, 250L))
})

test_that("item_results item column indexes all items", {
  n_items <- 10
  study <- make_1pl_study(n_items = n_items)
  res <- irt_simulate(study, iterations = 3, seed = 42)

  expect_equal(sort(unique(res$item_results$item)), 1:n_items)
})

test_that("item_results param column contains 'b' for 1PL", {
  study <- make_1pl_study()
  res <- irt_simulate(study, iterations = 3, seed = 42)

  expect_equal(unique(res$item_results$param), "b")
})

test_that("item_results param column contains 'a' and 'b' for 2PL", {
  study <- make_2pl_study()
  res <- irt_simulate(study, iterations = 3, seed = 42)

  expect_true(setequal(unique(res$item_results$param), c("a", "b")))
})

test_that("item_results true_value matches design parameters", {
  n_items <- 10
  b_true <- seq(-2, 2, length.out = n_items)

  study <- make_1pl_study(n_items = n_items)
  res <- irt_simulate(study, iterations = 3, seed = 42)

  ir <- res$item_results
  # For each item, the true b value should match the design
  b_rows <- ir[ir$param == "b", ]
  for (i in seq_len(n_items)) {
    item_true <- unique(b_rows$true_value[b_rows$item == i])
    expect_equal(item_true, b_true[i], tolerance = 1e-10,
                 info = paste("Item", i, "true_value mismatch"))
  }
})

test_that("item_results estimate values are numeric and finite (converged)", {

  study <- make_1pl_study(sample_sizes = c(250))
  res <- irt_simulate(study, iterations = 5, seed = 42)

  ir <- res$item_results
  converged_rows <- ir[ir$converged, ]

  # Estimates should be numeric and finite for converged replications
  expect_true(is.numeric(converged_rows$estimate))
  expect_true(all(is.finite(converged_rows$estimate)))
})

test_that("item_results se values are non-negative for converged iterations", {
  study <- make_1pl_study(sample_sizes = c(250))
  res <- irt_simulate(study, iterations = 5, seed = 42)

  ir <- res$item_results
  converged_rows <- ir[ir$converged, ]

  expect_true(all(converged_rows$se >= 0))
})

test_that("item_results ci_lower <= estimate <= ci_upper for converged rows", {
  study <- make_2pl_study(sample_sizes = c(250))
  res <- irt_simulate(study, iterations = 5, seed = 42)

  ir <- res$item_results
  # Filter to converged rows that have non-NA CIs (SEM SE computation

  # can fail for individual parameters even when the model converges)
  cr <- ir[ir$converged & !is.na(ir$ci_lower) & !is.na(ir$ci_upper), ]

  expect_true(nrow(cr) > 0)  # at least some rows have CIs
  expect_true(all(cr$ci_lower <= cr$estimate))
  expect_true(all(cr$estimate <= cr$ci_upper))
})


# =============================================================================
# 3. theta_results Structure
# =============================================================================

test_that("theta_results is a data.frame with expected columns", {
  study <- make_1pl_study()
  res <- irt_simulate(study, iterations = 3, seed = 42)

  tr <- res$theta_results
  expect_s3_class(tr, "data.frame")

  expected_cols <- c(
    "iteration", "sample_size", "theta_cor", "theta_rmse", "converged"
  )
  for (col in expected_cols) {
    expect_true(col %in% names(tr), info = paste("Missing column:", col))
  }
})

test_that("theta_results has one row per iteration × sample_size", {
  sample_sizes <- c(100, 250)
  iterations <- 5

  study <- make_1pl_study(sample_sizes = sample_sizes)
  res <- irt_simulate(study, iterations = iterations, seed = 42)

  tr <- res$theta_results
  expected_rows <- iterations * length(sample_sizes)
  expect_equal(nrow(tr), expected_rows)
})

test_that("theta_cor is between -1 and 1 for converged iterations", {
  study <- make_1pl_study(sample_sizes = c(500))
  res <- irt_simulate(study, iterations = 5, seed = 42)

  tr <- res$theta_results
  cr <- tr[tr$converged, ]

  expect_true(all(cr$theta_cor >= -1 & cr$theta_cor <= 1))
})

test_that("theta_rmse is non-negative for converged iterations", {
  study <- make_1pl_study(sample_sizes = c(500))
  res <- irt_simulate(study, iterations = 5, seed = 42)

  tr <- res$theta_results
  cr <- tr[tr$converged, ]

  expect_true(all(cr$theta_rmse >= 0))
})


# =============================================================================
# 4. Seed Reproducibility
# =============================================================================

test_that("irt_simulate is reproducible with the same seed", {
  study <- make_1pl_study(sample_sizes = c(100))
  res1 <- irt_simulate(study, iterations = 3, seed = 42)
  res2 <- irt_simulate(study, iterations = 3, seed = 42)

  expect_identical(res1$item_results, res2$item_results)
  expect_identical(res1$theta_results, res2$theta_results)
})

test_that("irt_simulate produces different results with different seeds", {
  study <- make_1pl_study(sample_sizes = c(100))
  res1 <- irt_simulate(study, iterations = 3, seed = 1)
  res2 <- irt_simulate(study, iterations = 3, seed = 2)

  expect_false(identical(res1$item_results, res2$item_results))
})


# =============================================================================
# 5. Non-Convergence Handling
# =============================================================================

test_that("non-convergence is tracked in item_results$converged", {
  study <- make_1pl_study()
  res <- irt_simulate(study, iterations = 5, seed = 42)

  expect_true(is.logical(res$item_results$converged))
})

test_that("non-convergence is tracked in theta_results$converged", {
  study <- make_1pl_study()
  res <- irt_simulate(study, iterations = 5, seed = 42)

  expect_true(is.logical(res$theta_results$converged))
})

test_that("non-converged iterations have NA estimates", {
  # We can't force non-convergence easily, but we can test the structural
  # contract: if converged == FALSE, estimate/se/ci should be NA
  study <- make_1pl_study()
  res <- irt_simulate(study, iterations = 5, seed = 42)

  ir <- res$item_results
  non_conv <- ir[!ir$converged, ]

  if (nrow(non_conv) > 0) {
    expect_true(all(is.na(non_conv$estimate)))
    expect_true(all(is.na(non_conv$se)))
    expect_true(all(is.na(non_conv$ci_lower)))
    expect_true(all(is.na(non_conv$ci_upper)))
  } else {
    # All converged — that's fine for a well-specified simple model
    expect_true(TRUE)
  }
})

test_that("non-converged iterations are never silently dropped", {
  # Total row count must equal iterations × sample_sizes × items × params
  # regardless of convergence
  n_items <- 10
  sample_sizes <- c(100, 250)
  iterations <- 5

  study <- make_1pl_study(n_items = n_items, sample_sizes = sample_sizes)
  res <- irt_simulate(study, iterations = iterations, seed = 42)

  n_params_per_item <- 1L  # 1PL
  expected_rows <- iterations * length(sample_sizes) * n_items * n_params_per_item
  expect_equal(nrow(res$item_results), expected_rows)

  expected_theta_rows <- iterations * length(sample_sizes)
  expect_equal(nrow(res$theta_results), expected_theta_rows)
})


# =============================================================================
# 6. Input Validation
# =============================================================================

test_that("irt_simulate errors for non-irt_study input", {
  expect_error(
    irt_simulate("not a study", iterations = 3, seed = 42),
    "irt_study"
  )
})

test_that("irt_simulate errors for non-positive iterations", {
  study <- make_1pl_study()

  expect_error(irt_simulate(study, iterations = 0, seed = 42), "iterations")
  expect_error(irt_simulate(study, iterations = -5, seed = 42), "iterations")
})

test_that("irt_simulate errors for non-integer iterations", {
  study <- make_1pl_study()

  expect_error(irt_simulate(study, iterations = 3.5, seed = 42), "iterations")
})

test_that("irt_simulate errors when seed is missing", {
  study <- make_1pl_study()

  # seed is required for reproducibility
  expect_error(irt_simulate(study, iterations = 3), "seed")
})


# =============================================================================
# 7. Model-Specific Behavior
# =============================================================================

test_that("irt_simulate works end-to-end for 1PL", {
  study <- make_1pl_study(n_items = 10, sample_sizes = c(200))
  res <- irt_simulate(study, iterations = 3, seed = 42)

  expect_s3_class(res, "irt_results")
  expect_equal(unique(res$item_results$param), "b")
  expect_true(nrow(res$item_results) > 0)
})

test_that("irt_simulate works end-to-end for 2PL", {
  study <- make_2pl_study(n_items = 10, sample_sizes = c(200))
  res <- irt_simulate(study, iterations = 3, seed = 42)

  expect_s3_class(res, "irt_results")
  expect_true(setequal(unique(res$item_results$param), c("a", "b")))
  expect_true(nrow(res$item_results) > 0)
})

test_that("irt_simulate works end-to-end for GRM", {
  study <- make_grm_study(n_items = 8, n_categories = 4, sample_sizes = c(300))
  res <- irt_simulate(study, iterations = 3, seed = 42)

  expect_s3_class(res, "irt_results")
  # GRM has a and indexed threshold parameters (b1, b2, b3, ...)
  expect_true("a" %in% unique(res$item_results$param))
  b_params <- grep("^b\\d+$", unique(res$item_results$param), value = TRUE)
  expect_true(length(b_params) >= 1)
  expect_true(nrow(res$item_results) > 0)
})


# =============================================================================
# 8. Missing Data Integration
# =============================================================================

test_that("irt_simulate works with MCAR missing data", {
  design <- irt_design(
    model = "1PL",
    n_items = 10,
    item_params = list(b = seq(-2, 2, length.out = 10))
  )
  study <- irt_study(design, sample_sizes = c(200),
                     missing = "mcar", missing_rate = 0.1)

  res <- irt_simulate(study, iterations = 3, seed = 42)
  expect_s3_class(res, "irt_results")
  expect_true(nrow(res$item_results) > 0)
})


# =============================================================================
# 9. Parameter Recovery Sanity Checks
# =============================================================================
# These are NOT precision tests — they're sanity checks that the simulation
# loop is wired correctly (generating, fitting, extracting in the right order).
# Actual criterion precision testing belongs in objectives 10-11.

test_that("1PL b estimates are in a reasonable range", {
  # True b in [-2, 2], estimates should be in roughly [-4, 4]
  study <- make_1pl_study(n_items = 10, sample_sizes = c(500))
  res <- irt_simulate(study, iterations = 5, seed = 42)

  ir <- res$item_results
  cr <- ir[ir$converged & ir$param == "b", ]

  expect_true(all(cr$estimate > -6))
  expect_true(all(cr$estimate < 6))
})

test_that("2PL a estimates are positive", {
  study <- make_2pl_study(n_items = 10, sample_sizes = c(500))
  res <- irt_simulate(study, iterations = 5, seed = 42)

  ir <- res$item_results
  cr <- ir[ir$converged & ir$param == "a", ]

  # Discrimination estimates should be positive
  expect_true(all(cr$estimate > 0))
})

test_that("mean b estimates are close to true values with large N", {
  # With N = 1000, mean estimates across iterations should be near truth
  n_items <- 10
  b_true <- seq(-2, 2, length.out = n_items)

  study <- make_1pl_study(n_items = n_items, sample_sizes = c(1000))
  res <- irt_simulate(study, iterations = 10, seed = 42)

  ir <- res$item_results
  cr <- ir[ir$converged & ir$param == "b", ]

  # Check mean estimate per item — should be within 0.5 of truth
  for (i in seq_len(n_items)) {
    item_ests <- cr$estimate[cr$item == i]
    if (length(item_ests) > 0) {
      mean_est <- mean(item_ests)
      expect_lt(abs(mean_est - b_true[i]), 0.5,
                label = paste("Item", i, ": |mean_est - true| =",
                              round(abs(mean_est - b_true[i]), 3)))
    }
  }
})


# =============================================================================
# 10. Print Method
# =============================================================================

test_that("print.irt_results produces output without error", {
  study <- make_1pl_study()
  res <- irt_simulate(study, iterations = 3, seed = 42)

  expect_output(print(res))
})

test_that("print.irt_results shows iterations and convergence rate", {
  study <- make_1pl_study()
  res <- irt_simulate(study, iterations = 3, seed = 42)

  output <- capture.output(print(res))
  combined <- paste(output, collapse = "\n")

  # Should mention iterations count
  expect_match(combined, "3", fixed = TRUE)
  # Should mention convergence
  expect_match(combined, "[Cc]onverg")
})


# =============================================================================
# 11. Timing Information
# =============================================================================

test_that("irt_results contains elapsed time information", {
  study <- make_1pl_study(sample_sizes = c(100))
  res <- irt_simulate(study, iterations = 3, seed = 42)

  expect_true("elapsed" %in% names(res))
  expect_true(is.numeric(res$elapsed))
  expect_true(res$elapsed >= 0)
})