# Tests for confusion_summary(), llm_human_reliability() and
# llm_intermodel_reliability().

# ---- confusion_summary ----

test_that("confusion_summary computes correct metrics for perfect agreement", {
  pred <- c("a", "b", "c", "a", "b")
  true <- c("a", "b", "c", "a", "b")

  cs <- confusion_summary(pred, true)

  expect_equal(cs$macro_f1, 1.0)
  expect_equal(cs$micro_f1, 1.0)
  expect_equal(nrow(cs$per_class), 3)
  # Compare with tolerance instead of exact `==` on doubles; unname() so
  # only the values (not any vector names) are compared.
  expect_equal(unname(cs$per_class$f1), rep(1.0, nrow(cs$per_class)))
})

test_that("confusion_summary handles complete disagreement", {
  pred <- c("a", "a", "a")
  true <- c("b", "b", "b")

  cs <- confusion_summary(pred, true)

  # "a" is only ever predicted and "b" is only ever the truth, so neither
  # class has a true positive.
  expect_equal(cs$per_class$f1[cs$per_class$class == "a"], 0)
  expect_equal(cs$per_class$f1[cs$per_class$class == "b"], 0)
})

test_that("confusion_summary handles partial agreement", {
  pred <- c("pos", "neg", "neu", "pos", "neg")
  true <- c("pos", "neg", "pos", "pos", "neg")

  cs <- confusion_summary(pred, true)

  expect_equal(nrow(cs$per_class), 3)

  # pos: tp = 2, fp = 0, fn = 1 -> precision = 1, recall = 2/3, f1 = 0.8
  pos_row <- cs$per_class[cs$per_class$class == "pos", ]
  expect_equal(pos_row$precision, 1.0)
  # A 1e-4 tolerance keeps the same four-decimal precision the check had
  # when both sides were passed through round(., 4).
  expect_equal(pos_row$recall, 2 / 3, tolerance = 1e-4)
  expect_equal(pos_row$f1, 0.8, tolerance = 1e-4)
})

# ---- llm_human_reliability ----

test_that("llm_human_reliability returns correct structure", {
  skip_if_not_installed("irr")

  llm <- c(
    "pos", "neg", "neu", "pos", "neg",
    "pos", "neg", "neu", "pos", "neg"
  )
  human <- c(
    "pos", "neg", "pos", "pos", "neg",
    "pos", "neg", "neu", "pos", "neg"
  )

  rel <- llm_human_reliability(llm, human)

  expect_s3_class(rel, "llm_reliability")
  expect_false(is.na(rel$cohens_kappa))
  expect_false(is.na(rel$krippendorffs_alpha))
  # Separate bound checks report which side of [0, 1] was violated.
  expect_gte(rel$accuracy, 0)
  expect_lte(rel$accuracy, 1)
  expect_gte(rel$macro_f1, 0)
  expect_lte(rel$macro_f1, 1)
  expect_equal(rel$n, 10)
})

test_that("llm_human_reliability perfect agreement gives kappa = 1", {
  skip_if_not_installed("irr")

  codes <- c("a", "b", "c", "a", "b", "c", "a", "b")

  rel <- llm_human_reliability(codes, codes)

  expect_equal(rel$cohens_kappa, 1.0)
  expect_equal(rel$accuracy, 1.0)
})

test_that("llm_human_reliability validates input lengths", {
  # Vectors of length 2 and 1 must be rejected.
  expect_error(llm_human_reliability(c("a", "b"), c("a")))
})

# ---- llm_intermodel_reliability ----

test_that("llm_intermodel_reliability computes pairwise stats", {
  # The fixture below is built with tibble::tibble().
  skip_if_not_installed("tibble")
  # NOTE(review): the kappa tests above skip when `irr` is missing; confirm
  # whether this function needs `irr` too and add the same skip if so.

  results <- tibble::tibble(
    prompt = rep(c("text1", "text2", "text3"), 2),
    model = rep(c("gpt4", "claude"), each = 3),
    response = c("pos", "neg", "neu", "pos", "neg", "pos")
  )

  inter <- llm_intermodel_reliability(results)

  expect_equal(nrow(inter), 1) # two models -> exactly one pair
  expect_true("cohens_kappa" %in% names(inter))
  expect_true("pct_agree" %in% names(inter))
})