# Default call to validate(): the result should be named "confusion" and
# "reliability", and each of those components should be a list.
test_that("validate bundles confusion and reliability", {
  # Six rows: samples 1-3 listed once for model "m1" and once for "m2".
  # NOTE(review): `truth` differs between the m1 and m2 rows for samples 2
  # and 3 -- confirm this is intentional.
  label_values <- c("pos", "neg", "pos", "pos", "neg", "neg")
  truth_values <- c("pos", "neg", "pos", "pos", "pos", "neg")
  fixture <- data.frame(
    sample_id = c(1:3, 1:3),
    model_id = c(rep("m1", 3), rep("m2", 3)),
    label = label_values,
    truth = truth_values,
    stringsAsFactors = FALSE
  )

  result <- validate(fixture)

  # Top-level shape: exactly these two names, in this order.
  expect_named(result, c("confusion", "reliability"))

  confusion <- result$confusion
  expect_true(is.list(confusion))
  reliability <- result$reliability
  expect_true(is.list(reliability))

  # The confusion component must expose both comparison sets.
  confusion_names <- names(confusion)
  expect_true("vs_gold" %in% confusion_names)
  expect_true("pairwise" %in% confusion_names)

  # The fixture uses two distinct labels, so m1's table is expected to be 2 x 2.
  m1_table <- confusion$vs_gold$m1
  expect_equal(dim(m1_table), c(2, 2))

  # Expect a single row in the `cohen` reliability table for this fixture.
  cohen_table <- reliability$cohen
  expect_equal(nrow(cohen_table), 1)
})

# Switching off one of the include_* flags should leave only the other
# component in the result returned by validate().
test_that("validate respects include flags", {
  # Six rows: samples 1-3 listed once for model "m1" and once for "m2".
  # NOTE(review): `truth` differs between the m1 and m2 rows for samples 2
  # and 3 -- confirm this is intentional.
  label_values <- c("pos", "neg", "pos", "pos", "neg", "neg")
  truth_values <- c("pos", "neg", "pos", "pos", "pos", "neg")
  fixture <- data.frame(
    sample_id = c(1:3, 1:3),
    model_id = c(rep("m1", 3), rep("m2", 3)),
    label = label_values,
    truth = truth_values,
    stringsAsFactors = FALSE
  )

  # Reliability disabled: only "confusion" should remain.
  expect_named(
    validate(fixture, include_reliability = FALSE),
    "confusion"
  )

  # Confusion disabled: only "reliability" should remain.
  expect_named(
    validate(fixture, include_confusion = FALSE),
    "reliability"
  )
})