# Tests for audit fixes (Issues 1-5)
# See: MetaESCI/preregistration/escicheck_audit_report.md

# ===========================================================================
# Issue 1: Cross-type effect size matching
# ===========================================================================

test_that("F(1,k) with reported d matches d variant, not eta", {
  # An F-test with df1 = 1 reported together with Cohen's d: the comparison
  # is expected to land on a d-family variant instead of an eta variant.
  checked <- check_text("F(1, 58) = 8.33, p = .006, d = 0.75")

  expect_true(nrow(checked) > 0)
  expect_equal(checked$test_type[1], "F")

  # Variant names accepted as "d family" start with "d_" or "dz".
  variant <- checked$matched_variant[1]
  expect_true(
    grepl("^d_|^dz", variant),
    info = paste("Expected d-family match, got:", variant)
  )

  # The match must not be classified as a highly ambiguous cross-type one.
  level <- checked$ambiguity_level[1]
  expect_false(
    level == "highly_ambiguous",
    info = "F(1,k) with d should not be highly_ambiguous"
  )

  # An ERROR status is tolerated only if it does not stem from the
  # "No same-type" cross-type fallback.
  status <- checked$status[1]
  reason <- checked$ambiguity_reason[1]
  expect_true(
    status != "ERROR" || !grepl("No same-type", reason),
    info = "Cross-type fallback should not produce ERROR"
  )
})

test_that("F(1,k) d_equiv is computed correctly from t equivalence", {
  # F(1, 30) = 9.00 => t = sqrt(9.00) = 3.0. With n1 = n2 = 16,
  # d_ind = t * sqrt(1/16 + 1/16) = 3 * sqrt(1/8) = 1.061.
  text <- "F(1, 30) = 9.00, p = .005, d = 1.06"
  result <- check_text(text)

  expect_true(nrow(result) > 0)

  # The reported d (1.06) is the rounded form of the value derived above, so
  # delta_effect must be present (non-NA) and stay below this bound. The
  # bound is named once so the comment and the assertion cannot drift apart.
  max_delta <- 0.05
  delta <- result$delta_effect[1]
  expect_true(!is.na(delta) && delta < max_delta,
    info = paste("Delta:", delta))
})

test_that("F(1,k) computes r_equiv as alternative", {
  # Expected r for F(1, 48) = 6.25:
  #   sqrt(F / (F + df2)) = sqrt(6.25 / 54.25) = sqrt(0.1152) = 0.3394
  # The input carries no effect size on purpose: the parser does not
  # currently extract "r = .34" after an F-test (it is ambiguous with a
  # correlation test), but r_equiv should be computed all the same.
  f_value <- 6.25
  df2 <- 48
  checked <- check_text("F(1, 48) = 6.25, p = .016")

  expect_true(nrow(checked) > 0)

  r_expected <- sqrt(f_value / (f_value + df2))
  r_actual <- checked$r_from_t_or_reported[1]

  expect_true(
    !is.na(r_actual),
    info = "r_equiv should be computed for F(1,k)"
  )
  expect_true(
    abs(r_actual - r_expected) < 0.001,
    info = sprintf("r_equiv=%.4f expected=%.4f", r_actual, r_expected)
  )
})

test_that("F(1,k) with d does not produce 'unusual for F-test' warning", {
  checked <- check_text("F(1, 40) = 5.00, p = .031, d = 0.70")

  expect_true(nrow(checked) > 0)

  # Neither wording of the "unusual" note may show up in the uncertainty
  # reasons of the first result.
  reasons <- checked$uncertainty_reasons[1]
  expect_false(
    grepl("unusual for F-test|unusual for F/ANOVA", reasons),
    info = paste("Unexpected uncertainty:", reasons)
  )
})

test_that("F(2,k) with d still flags cross-type (df1 != 1)", {
  # df1 = 2 has no t-equivalence, so reporting d alongside this F must be
  # flagged: either as a highly ambiguous match or via an "unusual" note.
  checked <- check_text("F(2, 27) = 4.56, p = .02, d = 0.80")

  expect_true(nrow(checked) > 0)

  level <- checked$ambiguity_level[1]
  reasons <- checked$uncertainty_reasons[1]
  expect_true(
    level == "highly_ambiguous" || grepl("unusual", reasons),
    info = "F(df1>1) with d should flag mismatch"
  )
})

test_that("F-test with R2 reported computes R2 variant", {
  # F(2, 97) = 5.50 => eta2 = (5.50 * 2) / (5.50 * 2 + 97) = 11/108 = 0.1019
  checked <- check_text("F(2, 97) = 5.50, p = .005, R2 = .10")

  expect_true(nrow(checked) > 0)

  # The variant name is inspected only when a match was recorded; an NA
  # matched_variant leaves this test with the row-count check alone.
  variant <- checked$matched_variant[1]
  has_match <- !is.na(variant)
  if (has_match) {
    expect_true(
      grepl("R2", variant),
      info = paste("Expected R2 match, got:", variant)
    )
  }
})

test_that("cross-type fallback is capped at NOTE, not ERROR", {
  # F(3, 50) with d reported: there is no way to compute d from a multi-df F,
  # so any comparison has to be a cross-type fallback.
  text <- "F(3, 50) = 10.00, p < .001, d = 2.50"
  result <- check_text(text)

  expect_true(nrow(result) > 0)

  # isTRUE() keeps the guard strictly TRUE/FALSE. With a bare `==`, an NA
  # ambiguity_level combined with a matching reason yields `if (NA)`, which
  # aborts the test with "missing value where TRUE/FALSE needed" instead of
  # simply skipping the status check.
  is_highly_ambiguous <-
    isTRUE(result$ambiguity_level[1] == "highly_ambiguous")
  no_same_type <-
    isTRUE(grepl("No same-type", result$ambiguity_reason[1]))

  if (is_highly_ambiguous && no_same_type) {
    expect_true(result$status[1] != "ERROR",
      info = "Cross-type (no same-type variants) should not be ERROR")
  }
})

# ===========================================================================
# Issue 2: Paired N bug in t-test design inference
# ===========================================================================

test_that("dz uses df+1 not df+2 for paired computation", {
  # Paired computation uses n = df + 1 = 30, so t(29) = 3.50 gives
  # dz = 3.50 / sqrt(30) = 0.63901.
  checked <- check_text("t(29) = 3.50, p = .001, dz = 0.64")

  expect_true(nrow(checked) > 0)

  dz_expected <- 3.50 / sqrt(30)
  dz_actual <- checked$dz[1]

  expect_true(
    !is.na(dz_actual),
    info = "dz should be computed"
  )
  expect_true(
    abs(dz_actual - dz_expected) < 0.001,
    info = sprintf(
      "dz=%.4f expected=%.4f (diff=%.4f)",
      dz_actual, dz_expected, abs(dz_actual - dz_expected)
    )
  )
})

test_that("paired dz uses df+1 even when design inferred as independent", {
  # Reporting d leads to an independent-design inference (N = df + 2), yet
  # the paired dz column should still be based on n = df + 1:
  #   dz = 2.80 / sqrt(50) = 0.39598
  t_value <- 2.80
  n_paired <- 50
  checked <- check_text("t(49) = 2.80, p = .007, d = 0.56")

  expect_true(nrow(checked) > 0)

  dz_expected <- t_value / sqrt(n_paired)
  dz_actual <- checked$dz[1]

  expect_true(
    !is.na(dz_actual),
    info = "dz should be computed even when design is independent"
  )
  expect_true(
    abs(dz_actual - dz_expected) < 0.001,
    info = sprintf("dz=%.4f expected=%.4f", dz_actual, dz_expected)
  )
})

# ===========================================================================
# Issue 3: NOTE results that should be PASS
# ===========================================================================

test_that("ambiguous design with good match is PASS not NOTE", {
  # t(28) with a reported d and no design context => ambiguous design.
  # d_ind_equalN ~ 2 * 2.21 / sqrt(30) = 0.807, against a reported d = 0.81.
  checked <- check_text("t(28) = 2.21, p = .035, d = 0.81")

  expect_true(nrow(checked) > 0)

  # The status is asserted only when delta is available and within
  # tolerance; in that case an ambiguous design must still yield PASS.
  delta <- checked$delta_effect[1]
  within_tolerance <- !is.na(delta) && delta < 0.02
  if (within_tolerance) {
    expect_equal(
      checked$status[1], "PASS",
      info = paste(
        "Status:", checked$status[1], "delta:", delta,
        "ambiguity:", checked$ambiguity_level[1]
      )
    )
  }
})

test_that("highly_ambiguous with good delta stays NOTE", {
  # Counterpart to the PASS upgrade for merely ambiguous designs: a truly
  # cross-type match must remain NOTE. "xyz" is an unknown effect-size label.
  checked <- check_text("t(28) = 2.21, p = .035, xyz = 0.81")

  # The expectation applies only if a row came back, it is classified as
  # highly_ambiguous, and its delta is small; otherwise nothing is asserted.
  level <- checked$ambiguity_level[1]
  if (nrow(checked) > 0 && !is.na(level) && level == "highly_ambiguous") {
    delta <- checked$delta_effect[1]
    if (!is.na(delta) && delta < 0.02) {
      expect_equal(checked$status[1], "NOTE")
    }
  }
})

# ===========================================================================
# Issue 4: check_type column
# ===========================================================================

test_that("check_type column is present in output", {
  checked <- check_text("t(28) = 2.21, p = .035, d = 0.80")

  expect_true(nrow(checked) > 0)

  # Presence check only; the column's value is not inspected here.
  output_columns <- names(checked)
  expect_true(
    "check_type" %in% output_columns,
    info = "check_type column should be in output"
  )
})

test_that("check_type is 'effect_size' when ES is reported", {
  # The input reports an effect size (d) next to the test statistic.
  checked <- check_text("t(28) = 2.21, p = .035, d = 0.80")

  expect_true(nrow(checked) > 0)

  first_check_type <- checked$check_type[1]
  expect_equal(first_check_type, "effect_size")
})

test_that("check_type is 'p_value' when only p-value reported", {
  # No effect size in the input: only the test statistic and its p-value.
  checked <- check_text("t(28) = 2.21, p = .035")

  expect_true(nrow(checked) > 0)

  first_check_type <- checked$check_type[1]
  expect_equal(first_check_type, "p_value")
})

# ===========================================================================
# Issue 5: Extreme delta / extraction_suspect flag
# ===========================================================================

test_that("extraction_suspect column is present in output", {
  checked <- check_text("t(28) = 2.21, p = .035, d = 0.80")

  expect_true(nrow(checked) > 0)

  # Presence check only; the flag's value is not inspected here.
  output_columns <- names(checked)
  expect_true(
    "extraction_suspect" %in% output_columns,
    info = "extraction_suspect column should be in output"
  )
})

test_that("extraction_suspect is FALSE for normal deltas", {
  checked <- check_text("t(28) = 2.21, p = .035, d = 0.80")

  expect_true(nrow(checked) > 0)

  # The flag must be off for this input.
  suspect_flag <- checked$extraction_suspect[1]
  expect_false(suspect_flag)
})

test_that("extraction_suspect is TRUE for extreme deltas", {
  # d = 5.00 is a wildly wrong value for t(28) = 2.21, chosen to provoke an
  # extreme delta.
  checked <- check_text("t(28) = 2.21, p = .035, d = 5.00")

  expect_true(nrow(checked) > 0)

  # The flag is asserted only when delta is available and exceeds 1.0.
  delta <- checked$delta_effect[1]
  is_extreme <- !is.na(delta) && delta > 1.0
  if (is_extreme) {
    expect_true(
      checked$extraction_suspect[1],
      info = paste("delta:", delta, "should flag extraction_suspect")
    )
  }
})

test_that("extraction_suspect does NOT change status", {
  # Same wildly wrong d as in the extreme-delta test: flagging the row as an
  # extraction suspect must leave its status at ERROR.
  checked <- check_text("t(28) = 2.21, p = .035, d = 5.00")

  expect_true(nrow(checked) > 0)

  # The status is asserted only when delta is available and exceeds 1.0.
  delta <- checked$delta_effect[1]
  is_extreme <- !is.na(delta) && delta > 1.0
  if (is_extreme) {
    # Still ERROR, i.e. not downgraded to NOTE.
    expect_equal(
      checked$status[1], "ERROR",
      info = "extraction_suspect should not change status"
    )
  }
})