test_that("explore works with predictors", { dataset <- data.frame( doc_id = c("doc_a", "doc_b", "doc_c"), text = c("Doc A", "Doc B", "Doc C"), truth = c("POS", "NEG", "POS"), stringsAsFactors = FALSE ) predictor_a <- function(prompts, data, spec) rep("POS", length(prompts)) predictor_b <- function(prompts, data, spec) rep(c("POS", "NEG", "POS"), length.out = length(prompts)) models <- list( list(id = "model_a", predictor = predictor_a), list(id = "model_b", predictor = predictor_b) ) builder <- function(spec) { data.frame( sample_id = dataset$doc_id, prompt = paste(spec$id, dataset$text), truth = dataset$truth, stringsAsFactors = FALSE ) } res <- explore( models = models, prompts = builder, keep_prompts = TRUE ) expect_true(is.data.frame(res$annotations)) expect_equal(nrow(res$annotations), nrow(dataset) * length(models)) expect_true(all(c("sample_id", "model_id", "label", "truth", "prompt") %in% names(res$annotations))) expect_false(is.factor(res$annotations$label)) }) test_that("template prompt builder generates structured prompts", { dataset <- data.frame( doc_id = c("doc_a", "doc_b"), text = c("Doc A text", "Doc B text"), stringsAsFactors = FALSE ) # Use proper field names - they render as-is template <- list( "Annotation Task" = "Classify whether the text is positive or negative.", "Coding Rules" = "Return only POS or NEG. Respond in JSON.", "Examples" = data.frame( text = c("I love it", "Hate it"), label = c("POS", "NEG"), stringsAsFactors = FALSE ), "Target Text" = dataset$text, sample_id = dataset$doc_id ) models <- list(list(id = "tmpl", predictor = function(prompts, ...) rep("POS", length(prompts)))) res <- explore(models = models, prompts = template, keep_prompts = TRUE) expect_equal(nrow(res$annotations), nrow(dataset)) # Field names render as-is expect_true(all(grepl("## Annotation Task", res$annotations$prompt, fixed = TRUE))) expect_true(all(grepl("## Target Text", res$annotations$prompt, fixed = TRUE))) expect_true(all(mapply(function(txt, prompt) grepl(txt, prompt, fixed = TRUE), dataset$text, res$annotations$prompt))) }) test_that("character vector prompt builder passes prompts through", { ready <- c("Prompt 1", "Prompt 2", "Prompt 3") models <- list(list(id = "vec", predictor = function(prompts, ...) prompts)) res <- explore(models = models, prompts = ready, keep_prompts = TRUE) prompts_seen <- res$annotations$prompt[res$annotations$model_id == "vec"] expect_equal(sort(unique(prompts_seen)), sort(ready)) }) test_that("confusion matrices and reliability stats are returned", { annotations <- data.frame( sample_id = rep(1:3, times = 2), model_id = rep(c("m1", "m2"), each = 3), label = c("A", "B", "A", "A", "B", "B"), truth = rep(c("A", "B", "A"), times = 2), stringsAsFactors = FALSE ) cms <- compute_confusion_matrices(annotations) expect_true("vs_gold" %in% names(cms)) expect_length(cms$vs_gold, 2) rel <- intercoder_reliability(annotations, label_levels = c("A", "B")) expect_true("cohen" %in% names(rel)) expect_true("krippendorff" %in% names(rel)) expect_equal(ncol(rel$cohen), 5) }) test_that("annotation sink streams chunks to CSV", { tmp <- tempfile(fileext = ".csv") sink_fn <- annotation_sink_csv(tmp) data <- data.frame(id = c(1, 2), text = c("Doc A", "Doc B"), stringsAsFactors = FALSE) models <- list(list(id = "mock", predictor = function(prompts, ...) 
rep("X", length(prompts)))) builder <- function(spec) data.frame(sample_id = data$id, prompt = data$text, stringsAsFactors = FALSE) res <- explore(models = models, prompts = builder, sink = sink_fn) expect_null(res$annotations) expect_true(file.exists(tmp)) streamed <- utils::read.csv(tmp, stringsAsFactors = FALSE) expect_equal(nrow(streamed), nrow(data)) expect_equal(unique(streamed$model_id), "mock") })