test_that("Calculations are correct - two class", {
  lst <- data_altman()
  pathology <- lst$pathology

  sens <- 231 / 258
  spec <- 54 / 86
  H <- sens
  Fa <- 1 - spec
  expected <- (log(Fa) - log(H) - log(1 - Fa) + log(1 - H)) /
    (log(Fa) + log(H) + log(1 - Fa) + log(1 - H))

  expect_equal(
    sedi_vec(truth = pathology$pathology, estimate = pathology$scan),
    expected
  )
})

test_that("Perfect predictions yield SEDI of 1", {
  truth <- factor(c("a", "a", "b", "b"), levels = c("a", "b"))
  estimate <- truth
  expect_equal(sedi_vec(truth, estimate), 1, tolerance = 1e-6)
})

test_that("Random predictions yield SEDI near 0", {
  # When sens ≈ 1 - spec (i.e. H ≈ F), SEDI → 0
  set.seed(42)
  n <- 10000
  truth <- factor(sample(c("a", "b"), n, replace = TRUE), levels = c("a", "b"))
  estimate <- factor(
    sample(c("a", "b"), n, replace = TRUE),
    levels = c("a", "b")
  )
  result <- sedi_vec(truth, estimate)
  expect_true(abs(result) < 0.1)
})

test_that("Calculations are correct - three class (macro)", {
  multi_ex <- data_three_by_three()

  expect_equal(
    sedi(multi_ex, estimator = "macro")[[".estimate"]],
    macro_metric(sedi_binary)
  )
})

test_that("Calculations are correct - three class (macro_weighted)", {
  multi_ex <- data_three_by_three()

  expect_equal(
    sedi(multi_ex, estimator = "macro_weighted")[[".estimate"]],
    macro_weighted_metric(sedi_binary)
  )
})

test_that("Calculations are correct - three class (micro)", {
  multi_ex <- data_three_by_three()
  micro <- data_three_by_three_micro()

  # Micro: pool counts, compute single SEDI
  H <- sum(micro$tp) / sum(micro$p)
  Fa <- sum(micro$fp) / (sum(micro$fp) + sum(micro$tn))

  small <- 1e-9
  H <- max(min(H, 1 - small), small)
  Fa <- max(min(Fa, 1 - small), small)

  expected <- (log(Fa) - log(H) - log(1 - Fa) + log(1 - H)) /
    (log(Fa) + log(H) + log(1 - Fa) + log(1 - H))

  expect_equal(
    sedi(multi_ex, estimator = "micro")[[".estimate"]],
    expected
  )
})

test_that("Multiclass auto-selects macro estimator", {
  truth <- factor(c("a", "b", "c", "a", "b", "c"), levels = c("a", "b", "c"))
  estimate <- factor(c("a", "b", "a", "a", "c", "c"), levels = c("a", "b", "c"))

  result <- sedi(
    data.frame(truth = truth, estimate = estimate),
    truth,
    estimate
  )

  expect_identical(result$.estimator, "macro")
})

test_that("Multiclass SEDI is bounded [-1, 1]", {
  truth <- factor(
    c("a", "b", "c", "a", "b", "c", "a", "b", "c"),
    levels = c("a", "b", "c")
  )
  estimate <- factor(
    c("a", "c", "b", "b", "a", "c", "c", "b", "a"),
    levels = c("a", "b", "c")
  )

  result <- sedi_vec(truth, estimate)
  expect_true(result >= -1 && result <= 1)
})

test_that("Perfect multiclass predictions yield SEDI of 1", {
  truth <- factor(c("a", "a", "b", "b", "c", "c"), levels = c("a", "b", "c"))
  estimate <- truth
  expect_equal(sedi_vec(truth, estimate), 1, tolerance = 1e-6)
})

test_that("All interfaces give the same results", {
  lst <- data_altman()
  pathology <- lst$pathology
  path_tbl <- lst$path_tbl
  path_mat <- unclass(path_tbl)

  exp <- sedi_vec(pathology$pathology, pathology$scan)

  expect_identical(
    sedi(path_tbl)[[".estimate"]],
    exp
  )
  expect_identical(
    sedi(path_mat)[[".estimate"]],
    exp
  )
  expect_identical(
    sedi(pathology, truth = pathology, estimate = scan)[[".estimate"]],
    exp
  )
})

test_that("Calculations handle NAs", {
  lst <- data_altman()
  pathology <- lst$pathology

  sens <- 230 / 256
  spec <- 53 / 85
  H <- sens
  Fa <- 1 - spec
  expected <- (log(Fa) - log(H) - log(1 - Fa) + log(1 - H)) /
    (log(Fa) + log(H) + log(1 - Fa) + log(1 - H))

  expect_equal(
    sedi_vec(truth = pathology$pathology, estimate = pathology$scan_na),
    expected
  )
})

test_that("Case weights calculations are correct", {
  df <- data.frame(
    truth = factor(c("x", "x", "y"), levels = c("x", "y")),
    estimate = factor(c("x", "y", "x"), levels = c("x", "y")),
    case_weights = c(1L, 10L, 2L)
  )

  result <- sedi(df, truth, estimate, case_weights = case_weights)[[
    ".estimate"
  ]]
  expect_type(result, "double")
  expect_true(result >= -1 && result <= 1)
})

test_that("work with class_pred input", {
  skip_if_not_installed("probably")

  cp_truth <- probably::as_class_pred(two_class_example$truth, which = 1)
  cp_estimate <- probably::as_class_pred(two_class_example$predicted, which = 2)

  fct_truth <- two_class_example$truth
  fct_truth[1] <- NA

  fct_estimate <- two_class_example$predicted
  fct_estimate[2] <- NA

  expect_identical(
    sedi_vec(fct_truth, cp_estimate),
    sedi_vec(fct_truth, fct_estimate)
  )

  expect_identical(
    sedi_vec(fct_truth, cp_estimate, na_rm = FALSE),
    NA_real_
  )

  expect_snapshot(
    error = TRUE,
    sedi_vec(cp_truth, cp_estimate)
  )
})

test_that("works with hardhat case weights", {
  lst <- data_altman()
  df <- lst$pathology
  imp_wgt <- hardhat::importance_weights(seq_len(nrow(df)))
  freq_wgt <- hardhat::frequency_weights(seq_len(nrow(df)))

  expect_no_error(
    sedi_vec(df$pathology, df$scan, case_weights = imp_wgt)
  )

  expect_no_error(
    sedi_vec(df$pathology, df$scan, case_weights = freq_wgt)
  )
})

test_that("na_rm argument check", {
  expect_snapshot(
    error = TRUE,
    sedi_vec(1, 1, na_rm = "yes")
  )
})

test_that("`event_level = 'second'` works", {
  lst <- data_altman()
  df <- lst$pathology

  df_rev <- df
  df_rev$pathology <- stats::relevel(df_rev$pathology, "norm")
  df_rev$scan <- stats::relevel(df_rev$scan, "norm")

  expect_equal(
    sedi_vec(df$pathology, df$scan),
    sedi_vec(df_rev$pathology, df_rev$scan, event_level = "second")
  )
})

test_that("SEDI is prevalence-independent", {
  # Same sens and spec should give same SEDI regardless of sample size
  # 10:90 prevalence
  truth1 <- factor(c(rep("a", 10), rep("b", 90)), levels = c("a", "b"))
  est1 <- factor(
    c(rep("a", 8), rep("b", 2), rep("b", 85), rep("a", 5)),
    levels = c("a", "b")
  )

  # 1:999 prevalence (same rates, different N)
  truth2 <- factor(c(rep("a", 100), rep("b", 9000)), levels = c("a", "b"))
  est2 <- factor(
    c(rep("a", 80), rep("b", 20), rep("b", 8500), rep("a", 500)),
    levels = c("a", "b")
  )

  # sens = 0.8, spec ≈ 0.944 for both
  expect_equal(
    sedi_vec(truth1, est1),
    sedi_vec(truth2, est2),
    tolerance = 0.01
  )
})

test_that("Multiclass SEDI is prevalence-independent per class", {
  # Three-class problem: class "a" is rare in both, same per-class rates
  # 5:45:50 prevalence
  truth1 <- factor(
    c(rep("a", 5), rep("b", 45), rep("c", 50)),
    levels = c("a", "b", "c")
  )
  est1 <- factor(
    c(
      rep("a", 4),
      "b",
      rep("b", 40),
      rep("c", 3),
      rep("a", 2),
      rep("c", 45),
      rep("b", 3),
      rep("a", 2)
    ),
    levels = c("a", "b", "c")
  )

  # 50:450:500 prevalence (same rates, 10x more data)
  truth2 <- factor(
    c(rep("a", 50), rep("b", 450), rep("c", 500)),
    levels = c("a", "b", "c")
  )
  est2 <- factor(
    c(
      rep("a", 40),
      rep("b", 10),
      rep("b", 400),
      rep("c", 30),
      rep("a", 20),
      rep("c", 450),
      rep("b", 30),
      rep("a", 20)
    ),
    levels = c("a", "b", "c")
  )

  expect_equal(
    sedi_vec(truth1, est1),
    sedi_vec(truth2, est2),
    tolerance = 0.02
  )
})

test_that("range values are correct", {
  direction <- metric_direction(sedi)
  range <- metric_range(sedi)
  perfect <- ifelse(direction == "minimize", range[1], range[2])

  df <- tibble::tibble(
    truth = factor(c("A", "A", "B", "B", "B")),
    off = factor(c("B", "B", "A", "A", "A"))
  )

  expect_equal(
    sedi_vec(df$truth, df$truth),
    perfect,
    tolerance = 1e-6
  )

  if (direction == "maximize") {
    expect_lt(sedi_vec(df$truth, df$off), perfect)
  }
})