# Backend equivalence battery: compare statuser outputs to underlying engines.

#t.test2_027
test_that("backend: t.test2 matches stats::t.test across core scenarios", {
  set.seed(101)
  
  x <- rnorm(60, mean = 0.2)
  y <- rnorm(55, mean = 0.5)
  
  scenarios <- list(
    list(label = "two_sample_welch", ref = list(x = x, y = y), args = list()),
    list(label = "two_sample_student", ref = list(x = x, y = y), args = list(var.equal = TRUE)),
    list(label = "paired", ref = list(x = x[1:50], y = y[1:50]), args = list(paired = TRUE)),
    list(label = "one_sample", ref = list(x = x, y = NULL), args = list(mu = 0.1)),
    list(label = "alt_less", ref = list(x = x, y = y), args = list(alternative = "less")),
    list(label = "conf_99", ref = list(x = x, y = y), args = list(conf.level = 0.99))
  )
  
  for (sc in scenarios) {
    if (is.null(sc$ref$y)) {
      tt_ref <- do.call(stats::t.test, c(list(x = sc$ref$x), sc$args))
      tt2 <- do.call(statuser::t.test2, c(list(sc$ref$x), sc$args))
    } else {
      tt_ref <- do.call(stats::t.test, c(list(x = sc$ref$x, y = sc$ref$y), sc$args))
      tt2 <- do.call(statuser::t.test2, c(list(sc$ref$x, sc$ref$y), sc$args))
    }
    
    be_expect_equal_num(tt2$p.value, tt_ref$p.value, tol = 1e-10)
    be_expect_equal_num(tt2$t, as.numeric(tt_ref$statistic), tol = 1e-10)
    be_expect_equal_num(tt2$df, as.numeric(tt_ref$parameter), tol = 1e-10)
    be_expect_equal_num(tt2$ci.L, tt_ref$conf.int[1], tol = 1e-10)
    be_expect_equal_num(tt2$ci.H, tt_ref$conf.int[2], tol = 1e-10)
  }
})

#t.test2_028
test_that("backend: t.test2 formula interface matches stats::t.test", {
  set.seed(102)
  
  n <- 120
  g <- factor(rep(c("A", "B"), each = n / 2), levels = c("A", "B"))
  y <- rnorm(n, mean = ifelse(g == "B", 0.3, 0))
  df <- data.frame(y = y, g = g)
  
  tt_ref <- stats::t.test(y ~ g, data = df)
  tt2 <- statuser::t.test2(y ~ g, data = df)
  
  be_expect_equal_num(tt2$p.value, tt_ref$p.value, tol = 1e-10)
  be_expect_equal_num(tt2$t, as.numeric(tt_ref$statistic), tol = 1e-10)
  be_expect_equal_num(tt2$df, as.numeric(tt_ref$parameter), tol = 1e-10)
  be_expect_equal_num(tt2$ci.L, tt_ref$conf.int[1], tol = 1e-10)
  be_expect_equal_num(tt2$ci.H, tt_ref$conf.int[2], tol = 1e-10)
})

#lm2_051
test_that("backend: lm2 matches estimatr::lm_robust coefficients and inference", {
  testthat::skip_if_not_installed("estimatr")
  
  set.seed(201)
  n <- 300
  x1 <- rnorm(n)
  x2 <- rnorm(n)
  y <- 1 + 0.5 * x1 - 0.2 * x2 + rnorm(n)
  df <- data.frame(y = y, x1 = x1, x2 = x2)
  
  m2 <- statuser::lm2(y ~ x1 + x2, data = df, notes = FALSE)
  ref <- estimatr::lm_robust(y ~ x1 + x2, data = df, se_type = "HC3")
  
  be_expect_equal_num(unname(m2$coefficients), unname(ref$coefficients), tol = 1e-10)
  be_expect_equal_num(unname(m2$std.error), unname(ref$std.error), tol = 1e-10)
  be_expect_equal_num(unname(m2$statistic), unname(ref$statistic), tol = 1e-10)
  be_expect_equal_num(unname(m2$p.value), unname(ref$p.value), tol = 1e-10)
  be_expect_equal_num(unname(m2$df), unname(ref$df), tol = 1e-10)
})

#interprobe_012
test_that("backend: interprobe JN output matches marginaleffects slopes and regions of significance", {
  testthat::skip_if_not_installed("marginaleffects")
  testthat::skip_if_not_installed("mgcv")
  
  set.seed(301)
  n <- 500
  x <- rnorm(n)
  z <- rnorm(n)
  y <- x * z + rnorm(n)
  df <- data.frame(y = y, x = x, z = z)
  
  m <- stats::lm(y ~ x * z, data = df)
  
  spotlights <- c(-1, 0, 1)
  probe_bins <- 40
  zs <- seq(min(df$z), max(df$z), length.out = probe_bins)
  
  grDevices::pdf(file = tempfile(fileext = ".pdf"), width = 7, height = 7)
  on.exit(grDevices::dev.off(), add = TRUE)
  
  res <- statuser::interprobe(
    model = m,
    x = "x",
    z = "z",
    draw = "jn",
    histogram = FALSE,
    quiet = TRUE,
    spotlights = spotlights,
    probe.bins = probe_bins
  )
  
  jn_ref_list <- lapply(spotlights, function(xj) {
    nd <- data.frame(x = xj, z = zs)
    marginaleffects::slopes(m, newdata = nd, var = "x")
  })
  jn_ref <- do.call(rbind, jn_ref_list)
  jn_ref$x <- rep(spotlights, each = probe_bins)
  
  jn_stat <- res$johnson.neyman
  
  jn_ref <- jn_ref[order(jn_ref$x, jn_ref$z), ]
  jn_stat <- jn_stat[order(jn_stat$x, jn_stat$z), ]
  
  testthat::expect_equal(nrow(jn_ref), nrow(jn_stat))
  
  be_expect_equal_num(jn_stat$marginal.effect, jn_ref$estimate, tol = 1e-8)
  be_expect_equal_num(jn_stat$conf.low, jn_ref$conf.low, tol = 1e-8)
  be_expect_equal_num(jn_stat$conf.high, jn_ref$conf.high, tol = 1e-8)
  
  sig_ref <- be_sig_from_ci(jn_ref$conf.low, jn_ref$conf.high)
  sig_stat <- be_sig_from_ci(jn_stat$conf.low, jn_stat$conf.high)
  testthat::expect_identical(sig_stat, sig_ref)
})

#interprobe_013
test_that("backend: interprobe internal interaction test p-values match returned testing models (binary x)", {
  testthat::skip_if_not_installed("marginaleffects")
  testthat::skip_if_not_installed("mgcv")
  
  set.seed(302)
  n <- 240
  z <- rnorm(n)
  x <- rep(c(0, 1), each = n / 2)
  y <- 0.2 + 0.5 * x + 0.8 * z + 0.9 * x * z + rnorm(n)
  
  grDevices::pdf(file = tempfile(fileext = ".pdf"), width = 7, height = 7)
  on.exit(grDevices::dev.off(), add = TRUE)
  
  out <- capture.output(
    res <- statuser::interprobe(
      x, z, y,
      quiet = FALSE,
      draw = "jn",
      histogram = FALSE,
      spotlights = c(-1, 0, 1),
      probe.bins = 30
    )
  )
  
  testthat::expect_true(is.list(res))
  testthat::expect_true("gam_results_testing" %in% names(res))
  
  gam_p <- NA_real_
  sm <- summary(res$gam_results_testing)
  st <- sm$s.table
  rn <- rownames(st)
  if (!is.null(rn)) {
    hits <- which(
      grepl("ti(", rn, fixed = TRUE) &
        grepl("x", rn, fixed = TRUE) &
        grepl("z", rn, fixed = TRUE)
    )
    if (length(hits) > 0) {
      gam_p <- as.numeric(st[hits[1], "p-value"])
    }
  }
  
  gam_line <- out[grepl("^\\s*GAM:", out)]
  gam_line <- if (length(gam_line) > 0) gam_line[1] else NA_character_
  gam_p_reported <- be_parse_p_from_apa_line(gam_line)
  
  # If output was of the form p < .001, we parse as 0.001 and only check the direction.
  if (is.finite(gam_p_reported) && identical(gam_p_reported, 0.001)) {
    testthat::expect_true(is.finite(gam_p))
    testthat::expect_true(gam_p < 0.001)
  } else {
    be_expect_equal_num(gam_p_reported, gam_p, tol = 1e-10)
  }
  
  # Linear interaction p-value: use lm2_results if available, otherwise compute from lm on the same data.
  lin_line <- out[grepl("^\\s*linear model:", out)]
  lin_line <- if (length(lin_line) > 0) lin_line[1] else NA_character_
  lin_p_reported <- be_parse_p_from_apa_line(lin_line)
  
  lin_p_ref <- NA_real_
  if ("lm2_results" %in% names(res) && !is.null(res$lm2_results)) {
    tbl <- attr(res$lm2_results, "statuser_table")
    if (!is.null(tbl) && "term" %in% names(tbl) && "p.value" %in% names(tbl)) {
      idx2 <- match(c("x:z", "z:x"), tbl$term)
      idx2 <- idx2[!is.na(idx2)]
      if (length(idx2) > 0) lin_p_ref <- as.numeric(tbl$p.value[idx2[1]])
    }
  }
  if (!is.finite(lin_p_ref)) {
    df <- data.frame(y = y, x = x, z = z)
    m <- stats::lm(y ~ x * z, data = df)
    lin_p_ref <- be_extract_linear_interaction_p(m, "x", "z")
  }
  
  if (is.finite(lin_p_reported) && identical(lin_p_reported, 0.001)) {
    testthat::expect_true(is.finite(lin_p_ref))
    testthat::expect_true(lin_p_ref < 0.001)
  } else {
    be_expect_equal_num(lin_p_reported, lin_p_ref, tol = 1e-10)
  }
})