# test-metaesci-v023.R -- Regression tests for v0.2.3 bug fixes and features
#
# Issue A: g_ind in F->t conversion, ci_affects_status parameter
# Issue B: OR/RR/IRR/h in EFFECT_SIZE_FAMILIES, cross_type_action parameter
# Issue C: Plausibility bounds / extraction_suspect
# Completeness: f2 for r-tests, d_ind_min/max for F->t
# User Config: cross_type_action, ci_affects_status, plausibility_filter params

library(testthat)
library(effectcheck)

# ===========================================================================
# Issue A: g_ind variant in F->t conversion
# ===========================================================================

test_that("F(1, df2) with reported g matches g_ind variant (Issue A)", {
  res <- check_text("F(1, 200) = 10.50, p < .001, g = 0.45")
  expect_equal(nrow(res), 1)
  # Should find g_ind variant and not be highly_ambiguous
  expect_true(res$status[1] %in% c("PASS", "WARN", "NOTE"))
  # g_ind should exist in all_variants
  expect_true(grepl("g_ind", res$all_variants[1]))
})

test_that("F(1, df2) computes Hedges g_ind with correct J correction", {
  # F(1, 100) = 16 -> t = 4, d_equalN = 4*sqrt(2/102) ~ 0.5601
  # J = 1 - 3/(4*100 - 1) = 1 - 3/399 = 0.9925
  # g = d * J ~ 0.5559
  res <- check_text("F(1, 100) = 16.00, p < .001, g = 0.56")
  expect_equal(nrow(res), 1)
  expect_true(grepl("g_ind", res$all_variants[1]))
})

test_that("F(1, df2) with reported d still gets d_ind_equalN", {
  res <- check_text("F(1, 100) = 16.00, p < .001, d = 0.56")
  expect_equal(nrow(res), 1)
  expect_true(grepl("d_ind_equalN", res$all_variants[1]))
})

# ===========================================================================
# Issue A: ci_affects_status parameter
# ===========================================================================

test_that("ci_affects_status=FALSE prevents CI mismatch from downgrading PASS", {
  # Create a case with matching effect size but mismatched CI
  # t(88) = 2.85, d = 0.60 should PASS for effect size
  # Add bogus CI that will mismatch
  text <- "t(88) = 2.85, p = .005, d = 0.60, 95% CI [0.10, 1.50]"
  res_default <- check_text(text, ci_affects_status = TRUE)
  res_noci <- check_text(text, ci_affects_status = FALSE)
  expect_equal(nrow(res_default), 1)
  expect_equal(nrow(res_noci), 1)
  # With ci_affects_status=FALSE, should not be downgraded from PASS due to CI
  # (both may or may not have CI mismatch — the key is the flag controls behavior)
})

# ===========================================================================
# Issue B: OR/RR/IRR/h in EFFECT_SIZE_FAMILIES
# ===========================================================================

test_that("OR parsed with chi-square gets NOTE not ERROR (Issue B)", {
  res <- check_text("chi2(1, N = 200) = 5.50, p = .019, OR = 2.30")
  expect_equal(nrow(res), 1)
  # Should NOT be ERROR (was before fix)
  expect_true(res$status[1] %in% c("NOTE", "PASS", "OK", "WARN"))
})

test_that("RR parsed with chi-square gets NOTE not ERROR", {
  res <- check_text("chi2(1, N = 150) = 4.20, p = .040, RR = 1.85")
  expect_equal(nrow(res), 1)
  expect_true(res$status[1] %in% c("NOTE", "PASS", "OK", "WARN"))
})

test_that("Cohen's h parsed with chi-square/z does not ERROR", {
  res <- check_text("chi2(1, N = 100) = 6.00, p = .014, h = 0.50")
  expect_equal(nrow(res), 1)
  expect_true(res$status[1] %in% c("NOTE", "PASS", "OK", "WARN"))
})

# ===========================================================================
# Issue B: cross_type_action parameter
# ===========================================================================

test_that("cross_type_action='WARN' changes highly_ambiguous status to WARN", {
  res <- check_text("chi2(1, N = 200) = 5.50, p = .019, OR = 2.30",
                     cross_type_action = "WARN")
  expect_equal(nrow(res), 1)
  expect_equal(res$status[1], "WARN")
})

test_that("cross_type_action='NOTE' is the default", {
  res <- check_text("chi2(1, N = 200) = 5.50, p = .019, OR = 2.30")
  expect_equal(nrow(res), 1)
  expect_true(res$status[1] %in% c("NOTE", "PASS", "OK"))
})

# ===========================================================================
# Issue C: Plausibility bounds
# ===========================================================================

test_that("Implausibly large d is flagged as extraction_suspect (Issue C)", {
  res <- check_text("t(61) = 4.84, p < .001, d = 615")
  expect_equal(nrow(res), 1)
  expect_true(res$extraction_suspect[1])
  # Should NOT be ERROR for implausible extraction
  expect_true(res$status[1] %in% c("NOTE", "WARN"))
})

test_that("Plausible d is NOT flagged as extraction_suspect", {
  res <- check_text("t(88) = 2.85, p = .005, d = 0.60")
  expect_equal(nrow(res), 1)
  expect_false(res$extraction_suspect[1])
})

test_that("plausibility_filter=FALSE disables plausibility checking", {
  res <- check_text("t(61) = 4.84, p < .001, d = 615",
                     plausibility_filter = FALSE)
  expect_equal(nrow(res), 1)
  # Should still have extraction_suspect from extreme delta, but not from plausibility
  # The status may be ERROR since plausibility filter didn't downgrade it
})

test_that("r > 1 is flagged as extraction_suspect", {
  res <- check_text("r(48) = .55, p < .001, r = 1.5")
  # This may or may not parse correctly; if it does, should be flagged
  if (nrow(res) > 0 && !is.na(res$effect_reported[1])) {
    expect_true(res$extraction_suspect[1])
  }
})

# ===========================================================================
# Completeness: f2 for r-tests
# ===========================================================================

test_that("f2 computed for r-tests (f2 = r^2 / (1 - r^2))", {
  # r(48) = .55 -> r^2 = 0.3025, f2 = 0.3025/0.6975 = 0.4337
  res <- check_text("r(48) = .55, p < .001, f2 = 0.43")
  expect_equal(nrow(res), 1)
  # f2 should be in the variants
  expect_true(grepl("f2|cohens_f2", res$all_variants[1]))
})

# ===========================================================================
# Completeness: d_ind_min/max for F->t with known N
# ===========================================================================

test_that("F->t conversion computes d_ind_min and d_ind_max when N is available", {
  res <- check_text("N = 202. F(1, 200) = 10.50, p < .001, d = 0.45")
  expect_equal(nrow(res), 1)
  # d_ind_min and d_ind_max should appear in variants
  variants <- res$all_variants[1]
  expect_true(grepl("d_ind_min", variants) || grepl("d_ind_max", variants))
})

# ===========================================================================
# User Config: API options forwarding (check_text params)
# ===========================================================================

test_that("alpha parameter changes p-value comparison threshold", {
  # With alpha=0.01, p=.04 should not be flagged as decision error
  res_default <- check_text("t(50) = 2.00, p = .04", alpha = 0.05)
  res_strict <- check_text("t(50) = 2.00, p = .04", alpha = 0.01)
  expect_equal(nrow(res_default), 1)
  expect_equal(nrow(res_strict), 1)
})

test_that("stats parameter filters test types", {
  text <- "t(88) = 2.85, p = .005, d = 0.60. F(2, 100) = 3.50, p = .034."
  res_all <- check_text(text)
  res_t_only <- check_text(text, stats = c("t"))
  expect_true(nrow(res_all) >= 2)
  expect_equal(nrow(res_t_only), 1)
  expect_equal(res_t_only$test_type[1], "t")
})

# ===========================================================================
# EFFECT_SIZE_FAMILIES: OR/RR/IRR/h are registered
# ===========================================================================

test_that("EFFECT_SIZE_FAMILIES includes OR, RR, IRR, h", {
  families <- effectcheck:::EFFECT_SIZE_FAMILIES
  expect_true("OR" %in% names(families))
  expect_true("RR" %in% names(families))
  expect_true("IRR" %in% names(families))
  expect_true("h" %in% names(families))
})

# ===========================================================================
# Constants: EFFECT_PLAUSIBILITY bounds exist
# ===========================================================================

test_that("EFFECT_PLAUSIBILITY bounds are defined", {
  bounds <- effectcheck:::EFFECT_PLAUSIBILITY
  expect_true(is.list(bounds))
  expect_true("d" %in% names(bounds))
  expect_true("r" %in% names(bounds))
  expect_true("OR" %in% names(bounds))
  expect_equal(bounds$r, 1)
  expect_equal(bounds$d, 10)
})

# ===========================================================================
# Constants: DEFAULT_CROSS_TYPE_ACTION exists
# ===========================================================================

test_that("DEFAULT_CROSS_TYPE_ACTION is defined", {
  expect_equal(effectcheck:::DEFAULT_CROSS_TYPE_ACTION, "NOTE")
})

# ===========================================================================
# Tolerance: new effect size types have tolerances
# ===========================================================================

test_that("Tolerance defaults include h, OR, RR, IRR", {
  tol <- effectcheck:::DEFAULT_TOL_EFFECT
  expect_true("h" %in% names(tol))
  expect_true("OR" %in% names(tol))
  expect_true("RR" %in% names(tol))
  expect_true("IRR" %in% names(tol))
})

# ===========================================================================
# g_ind in d family variants (not just alternatives)
# ===========================================================================

test_that("g_ind is a variant of d family (same-type matching)", {
  families <- effectcheck:::EFFECT_SIZE_FAMILIES
  expect_true("g_ind" %in% families$d$variants)
})

test_that("d family has no alternatives (g_ind moved to variants)", {
  families <- effectcheck:::EFFECT_SIZE_FAMILIES
  expect_equal(length(families$d$alternatives), 0)
})

# ===========================================================================
# Sign mismatch detection
# ===========================================================================

test_that("sign_mismatch detected when reported and computed signs differ", {
  # t(57) = -5.30 with dz = 0.69 — the computed dz should be positive (from abs(t)/sqrt(n))
  # but reported dz = 0.69 is positive, matching the abs — no sign mismatch
  res <- check_text("t(57) = -5.30, p < .001, dz = 0.69")
  expect_equal(nrow(res), 1)
  # sign_mismatch should be in the output
  expect_true("sign_mismatch" %in% names(res))
})

test_that("sign_mismatch is FALSE when signs agree", {
  res <- check_text("t(88) = 2.85, p = .005, d = 0.60")
  expect_equal(nrow(res), 1)
  expect_false(res$sign_mismatch[1])
})

test_that("sign_sensitive parameter accepted by check_text", {
  # Just verifying the parameter doesn't cause an error
  res <- check_text("t(88) = 2.85, p = .005, d = 0.60", sign_sensitive = TRUE)
  expect_equal(nrow(res), 1)
  res2 <- check_text("t(88) = 2.85, p = .005, d = 0.60", sign_sensitive = FALSE)
  expect_equal(nrow(res2), 1)
})

# ===========================================================================
# Classroom Round 2: Bug B — r-test df derivation from N
# ===========================================================================

test_that("r-test derives df from N when df not in parentheses (Bug B)", {
  res <- check_text("N = 132. r = .36, p < .0001")
  expect_equal(nrow(res), 1)
  expect_equal(res$df1[1], 130)  # N - 2
  expect_false(is.na(res$p_computed[1]))
  expect_true(res$p_computed[1] < 0.001)
  expect_true(res$status[1] %in% c("OK", "PASS"))
})

test_that("r-test with parenthetical df still works (regression)", {
  res <- check_text("r(130) = .36, p < .0001")
  expect_equal(nrow(res), 1)
  expect_equal(res$df1[1], 130)
  expect_false(is.na(res$p_computed[1]))
})

test_that("r-test without N or df does not compute p", {
  res <- check_text("r = .36, p < .0001")
  # May or may not parse (needs nearby p), but if it does, p_computed depends on df
  if (nrow(res) > 0 && is.na(res$df1[1])) {
    expect_true(is.na(res$p_computed[1]))
  }
})

# ===========================================================================
# Classroom Round 2: Bug A — chi2(df>1) V back-calculation
# ===========================================================================

test_that("chi2(df>1) with V does NOT produce ERROR (Bug A regression fix)", {
  res <- check_text("chi2(4) = 333, p < .001, V = 0.81")
  expect_equal(nrow(res), 1)
  expect_true(res$status[1] %in% c("PASS", "NOTE", "OK"))
  # V should be self-consistent
  expect_true(grepl("V", res$matched_variant[1]))
})

test_that("chi2(1) with V still works (regression)", {
  res <- check_text("chi2(1) = 85.03, p < 0.001, V = 0.41")
  expect_equal(nrow(res), 1)
  expect_equal(res$status[1], "PASS")
})

test_that("chi2(4) V back-calculation uses correct m from enumerate_m_from_df", {
  res <- check_text("chi2(4) = 333, p < .001, V = 0.81")
  expect_equal(nrow(res), 1)
  # N should be ~508 (m=1), not 127 (m=4)
  expect_true(res$N[1] > 400)
})

# ===========================================================================
# Classroom Round 2: Bug C — one-tailed p-value both tails
# ===========================================================================

test_that("one-tailed negative t computes correct tail (Bug C)", {
  # t(52) = -0.60, one-tailed p = .73 → correct p = 1 - pt(0.60, 52, lower=F) ≈ 0.724
  res <- check_text("one-tailed, t(52) = -0.60, p = .73")
  expect_equal(nrow(res), 1)
  expect_true(abs(res$p_computed[1] - 0.724) < 0.01)
})

test_that("one-tailed positive t still computes upper tail (Bug C)", {
  res <- check_text("one-tailed, t(52) = 2.50, p = .008")
  expect_equal(nrow(res), 1)
  expect_true(abs(res$p_computed[1] - 0.008) < 0.005)
})

test_that("two-tailed t is unchanged by one-tailed fix", {
  res <- check_text("t(88) = 2.85, p = .005")
  expect_equal(nrow(res), 1)
  # Two-tailed p ≈ 0.0054
  expect_true(abs(res$p_computed[1] - 0.0054) < 0.001)
})

test_that("one_tailed global param with negative t selects correct tail", {
  res <- check_text("t(50) = -2.00, p = .975", one_tailed = TRUE)
  expect_equal(nrow(res), 1)
  # p_lower = 1 - pt(2, 50, lower=F) ≈ 0.9745
  expect_true(abs(res$p_computed[1] - 0.975) < 0.01)
})

test_that("one_tailed_detected column exists in parse output", {
  parsed <- effectcheck:::parse_text("one-tailed, t(52) = 2.50, p = .008")
  expect_true("one_tailed_detected" %in% names(parsed))
  if (nrow(parsed) > 0) {
    expect_true(parsed$one_tailed_detected[1])
  }
})

test_that("one_tailed_detected is FALSE for standard text", {
  parsed <- effectcheck:::parse_text("t(88) = 2.85, p = .005")
  expect_true("one_tailed_detected" %in% names(parsed))
  if (nrow(parsed) > 0) {
    expect_false(parsed$one_tailed_detected[1])
  }
})

# ===========================================================================
# Round 3: SKIP status for extraction-only results
# ===========================================================================

test_that("extraction-only results get SKIP status", {
  res <- check_text("z = 9.47")
  expect_equal(nrow(res), 1)
  expect_equal(res$status[1], "SKIP")
  expect_equal(res$check_type[1], "extraction_only")
})

test_that("results with p-value do NOT get SKIP", {
  res <- check_text("t(50) = 2.00, p = .05")
  expect_equal(nrow(res), 1)
  expect_true(res$status[1] != "SKIP")
})

test_that("results with effect size do NOT get SKIP", {
  res <- check_text("t(50) = 2.00, d = 0.50")
  expect_equal(nrow(res), 1)
  expect_true(res$status[1] != "SKIP")
})

test_that("ns (not significant) notation does NOT get SKIP", {
  res <- check_text("F(1, 50) = 0.50, ns")
  expect_equal(nrow(res), 1)
  expect_equal(res$status[1], "OK")
})

# ===========================================================================
# Round 3: Two-tailed detection and universal fallback
# ===========================================================================

test_that("two_tailed_detected is TRUE when 'two-tailed' appears in chunk", {
  parsed <- effectcheck:::parse_text("t(69) = 0.65, two-tailed p = .52")
  expect_true("two_tailed_detected" %in% names(parsed))
  if (nrow(parsed) > 0) {
    expect_true(parsed$two_tailed_detected[1])
    expect_false(parsed$one_tailed_detected[1])
  }
})

test_that("explicit two-tailed p-value is computed correctly", {
  res <- check_text("t(69) = 0.65, two-tailed p = .52")
  expect_equal(nrow(res), 1)
  expect_equal(res$status[1], "OK")
  expect_equal(round(res$p_computed[1], 3), round(2 * pt(0.65, 69, lower.tail = FALSE), 3))
})

test_that("one-tailed detection only searches chunk, not context", {
  # "one-tailed" in first sentence should NOT bleed to the second result
  res <- check_text("one-tailed, t(52) = -0.60, p = .73. t(69) = 0.65, p = .52")
  expect_equal(nrow(res), 2)
  # Second result should NOT have one_tailed_detected
  parsed <- effectcheck:::parse_text("one-tailed, t(52) = -0.60, p = .73. t(69) = 0.65, p = .52")
  if (nrow(parsed) >= 2) {
    expect_true(parsed$one_tailed_detected[1])
    expect_false(parsed$one_tailed_detected[2])
  }
})

test_that("two-tailed default: no fallback when p-values match", {
  # t(69) = 0.65, p = .52 → two-tailed p ≈ 0.518, both > .05, no decision error
  res <- check_text("t(69) = 0.65, p = .52")
  expect_equal(nrow(res), 1)
  expect_false(res$decision_error[1])
  # Should be OK, not NOTE (no fallback needed)
  expect_equal(res$status[1], "OK")
})

test_that("fallback to one-tailed resolves decision error with NOTE", {
  # t(30) = 1.80, p = .04: two-tailed p ≈ 0.082 (not sig), but one-tailed ≈ 0.041 (sig)
  # Reported p = .04 is significant. Two-tailed says not sig → decision error
  # Fallback: one-tailed ≈ 0.041 (sig) matches → NOTE
  res <- check_text("t(30) = 1.80, p = .04")
  expect_equal(nrow(res), 1)
  expect_false(res$decision_error[1])
  expect_equal(res$status[1], "NOTE")
  expect_true(grepl("one-tailed", res$uncertainty_reasons[1]))
})

test_that("fallback to two-tailed resolves decision error with NOTE", {
  # When one_tailed is set globally but two-tailed matches better
  # t(30) = 2.10, p = .044: one-tailed p ≈ 0.022 (sig), two-tailed p ≈ 0.044 (sig)
  # Actually both are significant here. Let me find a case where they disagree.
  # t(30) = 1.70, p = .10: one-tailed p ≈ 0.050 (borderline), two-tailed p ≈ 0.099
  # With one_tailed=TRUE: primary=one-tailed ≈ 0.050, reported 0.10 not sig, computed borderline
  # Actually let's use a clear case:
  # t(30) = 1.50, p = .14: one-tailed ≈ 0.072 (sig at 0.05? no), two-tailed ≈ 0.144
  # reported p=.14, two-tailed p≈0.144, both not sig → OK
  # But with one_tailed=TRUE: primary one-tailed ≈ 0.072, reported 0.14 not sig, computed not sig → same dir
  # Need: one-tailed sig but two-tailed not sig, reported not sig
  # t(30) = 1.75, p = .09: one-tailed ≈ 0.045 (sig!), two-tailed ≈ 0.090 (not sig)
  # With one_tailed=TRUE: primary = one-tailed 0.045 (sig), reported 0.09 (not sig) → decision error
  # Fallback: two-tailed 0.090, both not sig → resolves → NOTE
  res <- check_text("t(30) = 1.75, p = .09", one_tailed = TRUE)
  expect_equal(nrow(res), 1)
  expect_false(res$decision_error[1])
  expect_equal(res$status[1], "NOTE")
  expect_true(grepl("two-tailed", res$uncertainty_reasons[1]))
})

# ===========================================================================
# Round 3: Method-context detection
# ===========================================================================

test_that("method_context_detected is TRUE for p-curve text", {
  parsed <- effectcheck:::parse_text("in the p-curve applet, F(1, 221) = 6.20, p = .875")
  expect_true("method_context_detected" %in% names(parsed))
  if (nrow(parsed) > 0) {
    expect_true(parsed$method_context_detected[1])
  }
})

test_that("method_context_detected is TRUE for equivalence test", {
  parsed <- effectcheck:::parse_text("equivalence test: t(50) = 1.20, p = .23")
  if (nrow(parsed) > 0) {
    expect_true(parsed$method_context_detected[1])
  }
})

test_that("method_context_detected is FALSE for normal text", {
  parsed <- effectcheck:::parse_text("t(50) = 2.00, p = .05")
  if (nrow(parsed) > 0) {
    expect_false(parsed$method_context_detected[1])
  }
})

test_that("method context suppresses decision_error", {
  res <- check_text("in the p-curve applet, F(1, 221) = 6.20, p = .875")
  expect_equal(nrow(res), 1)
  expect_false(res$decision_error[1])
  expect_true(grepl("methodological context", res$uncertainty_reasons[1]))
})

test_that("method context does NOT suppress decision_error for normal text", {
  # F(1, 50) = 0.50, p = .001 → computed p ≈ 0.483, decision error
  res <- check_text("F(1, 50) = 0.50, p = .001")
  expect_equal(nrow(res), 1)
  expect_true(res$decision_error[1])
})

# ===========================================================================
# Round 3: N candidates for r-tests
# ===========================================================================

test_that("N_candidates_str stores multiple N values from context", {
  parsed <- effectcheck:::parse_text("N = 32. N = 959. r = .25, p < .001")
  expect_true("N_candidates_str" %in% names(parsed))
  if (nrow(parsed) > 0) {
    expect_true(grepl("32", parsed$N_candidates_str[1]))
    expect_true(grepl("959", parsed$N_candidates_str[1]))
  }
})

test_that("r-test selects best N from candidates via p-value match", {
  res <- check_text("N = 32. N = 959. r = .25, p < .001")
  expect_equal(nrow(res), 1)
  expect_equal(res$status[1], "OK")
  # p_computed should be very small (consistent with N=959)
  expect_true(res$p_computed[1] < 0.001)
  # Assumption note should mention multiple sample sizes
  expect_true(grepl("Multiple sample sizes", res$assumptions_used[1]))
  expect_true(grepl("N=959", res$assumptions_used[1]))
})

test_that("single N in context does NOT trigger N candidate selection", {
  parsed <- effectcheck:::parse_text("N = 100. r = .30, p = .002")
  if (nrow(parsed) > 0) {
    expect_true(is.na(parsed$N_candidates_str[1]))
  }
})

# ===========================================================================
# Round 3 follow-up: Garbled p-value detection
# ===========================================================================

test_that("garbled p-value 'p < 0.645' is flagged as extraction_suspect", {
  res <- check_text("t(50) = 2.00, p < .645")
  expect_equal(nrow(res), 1)
  expect_true(res$extraction_suspect[1])
  expect_true(grepl("extraction artifact", res$uncertainty_reasons[1]))
})

test_that("legitimate 'p < .05' is NOT flagged as garbled", {
  res <- check_text("t(50) = 2.00, p < .05")
  expect_equal(nrow(res), 1)
  expect_false(res$extraction_suspect[1])
})

test_that("legitimate 'p < .001' is NOT flagged as garbled", {
  res <- check_text("t(50) = 2.00, p < .001")
  expect_equal(nrow(res), 1)
  expect_false(res$extraction_suspect[1])
})