# Predictions identical to the reference labels: the macro and micro F1 and
# every per-class F1 are expected to equal 1.
test_that("confusion_summary computes correct metrics for perfect agreement", {
  pred <- c("a", "b", "c", "a", "b")
  true <- c("a", "b", "c", "a", "b")
  cs <- confusion_summary(pred, true)

  expect_equal(cs$macro_f1, 1.0)
  expect_equal(cs$micro_f1, 1.0)
  # The inputs contain three distinct labels ("a", "b", "c").
  expect_equal(nrow(cs$per_class), 3)
  # Use expect_equal() (numeric tolerance) instead of `==` on doubles; on
  # failure it also reports which per-class values deviate from 1.
  expect_equal(cs$per_class$f1, rep(1.0, 3))
})

# No prediction matches its reference label, so the F1 reported for each of
# the two labels involved is expected to be 0.
test_that("confusion_summary handles complete disagreement", {
  predicted <- rep("a", 3)
  actual <- rep("b", 3)
  by_class <- confusion_summary(predicted, actual)$per_class

  f1_a <- by_class$f1[by_class$class == "a"]
  f1_b <- by_class$f1[by_class$class == "b"]

  expect_equal(f1_a, 0)
  expect_equal(f1_b, 0)
})

# One "pos" item is predicted as "neu"; all other predictions are correct.
# Checks the per-class row count and the precision/recall/F1 of "pos".
test_that("confusion_summary handles partial agreement", {
  pred <- c("pos", "neg", "neu", "pos", "neg")
  true <- c("pos", "neg", "pos", "pos", "neg")
  cs <- confusion_summary(pred, true)

  # Labels seen across both vectors: "pos", "neg", "neu".
  expect_equal(nrow(cs$per_class), 3)
  # pos: tp=2, fp=0, fn=1 -> prec=1, rec=2/3, f1=0.8
  pos_row <- cs$per_class[cs$per_class$class == "pos", ]
  expect_equal(pos_row$precision, 1.0)
  # An explicit tolerance replaces rounding both sides to 4 decimals: same
  # precision, without the edge cases of comparing two rounded values.
  expect_equal(pos_row$recall, 2 / 3, tolerance = 1e-4)
  # F1 = 2 * prec * rec / (prec + rec) = (4/3) / (5/3) = 0.8.
  expect_equal(pos_row$f1, 0.8, tolerance = 1e-4)
})

# Structural check of the object returned by llm_human_reliability(): its S3
# class, non-missing agreement coefficients, metrics bounded to [0, 1], and
# the number of coded items.
test_that("llm_human_reliability returns correct structure", {
  # Skipped when the optional 'irr' package is unavailable (presumably used
  # for the kappa/alpha computations -- confirm against the implementation).
  skip_if_not_installed("irr")

  # The two codings differ only at the third item ("neu" vs "pos").
  llm <- c("pos", "neg", "neu", "pos", "neg", "pos", "neg", "neu", "pos", "neg")
  human <- c("pos", "neg", "pos", "pos", "neg", "pos", "neg", "neu", "pos", "neg")
  rel <- llm_human_reliability(llm, human)

  expect_s3_class(rel, "llm_reliability")
  expect_false(is.na(rel$cohens_kappa))
  expect_false(is.na(rel$krippendorffs_alpha))
  # Separate bound checks so a failure reports the offending value and which
  # bound was violated, instead of a bare "not TRUE".
  expect_gte(rel$accuracy, 0)
  expect_lte(rel$accuracy, 1)
  expect_gte(rel$macro_f1, 0)
  expect_lte(rel$macro_f1, 1)
  expect_equal(rel$n, 10)
})

# Comparing a coding against itself: accuracy and Cohen's kappa are both
# expected to be exactly 1.
test_that("llm_human_reliability perfect agreement gives kappa = 1", {
  skip_if_not_installed("irr")

  # Cycles "a", "b", "c" up to eight items: a b c a b c a b.
  identical_codes <- rep_len(c("a", "b", "c"), 8)
  result <- llm_human_reliability(identical_codes, identical_codes)

  expect_equal(result$accuracy, 1.0)
  expect_equal(result$cohens_kappa, 1.0)
})

# Coding vectors of different lengths must make llm_human_reliability() raise
# an error.
# NOTE(review): any error satisfies this expectation; consider matching the
# validation message or condition class once it is confirmed.
test_that("llm_human_reliability validates input lengths", {
  two_codes <- c("a", "b")
  one_code <- "a"

  expect_error(llm_human_reliability(two_codes, one_code))
})

# Two models coding the same three prompts: the result is expected to hold a
# single row (one model pair) with kappa and percent-agreement columns.
test_that("llm_intermodel_reliability computes pairwise stats", {
  texts <- c("text1", "text2", "text3")
  gpt4_codes <- c("pos", "neg", "neu")
  claude_codes <- c("pos", "neg", "pos")

  # Long format: one row per (prompt, model) combination.
  results <- tibble::tibble(
    prompt = c(texts, texts),
    model = c(rep("gpt4", 3), rep("claude", 3)),
    response = c(gpt4_codes, claude_codes)
  )

  inter <- llm_intermodel_reliability(results)
  inter_cols <- names(inter)

  expect_equal(nrow(inter), 1)  # 1 pair
  expect_true("cohens_kappa" %in% inter_cols)
  expect_true("pct_agree" %in% inter_cols)
})