# Helper function to extract metric value from qlm_comparison result
get_comparison_metric <- function(result, metric_name, variable = NULL) {
  if (!is.null(variable)) {
    rows <- result[result$variable == variable & result$measure == metric_name, ]
  } else {
    rows <- result[result$measure == metric_name, ]
  }
  if (nrow(rows) == 0) return(NA_real_)
  rows$value[1]
}

test_that("qlm_compare validates inputs correctly", {
  skip_if_not_installed("ellmer")

  type_obj <- ellmer::type_object(score = ellmer::type_number("Score"))
  codebook <- qlm_codebook("Test", "Test prompt", type_obj)

  mock_results1 <- data.frame(id = 1:3, score = c(1, 2, 3))
  mock_coded1 <- new_qlm_coded(
    results = mock_results1,
    codebook = codebook,
    data = c("text1", "text2", "text3"),
    input_type = "text",
    chat_args = list(name = "test/model"),
    execution_args = list(),
    metadata = list(timestamp = Sys.time(), n_units = 3),
    name = "original",
    call = quote(qlm_code(...)),
    parent = NULL
  )

  # Should error with only one object
  expect_error(
    qlm_compare(mock_coded1, by = "score"),
    "At least two.*qlm_coded.*objects"
  )

  # Should error with non-qlm_coded objects
  expect_error(
    qlm_compare(mock_coded1, list(a = 1), by = "score"),
    "must be.*qlm_coded.*objects"
  )
})


test_that("qlm_compare checks for 'by' variable", {
  skip_if_not_installed("ellmer")

  type_obj <- ellmer::type_object(score = ellmer::type_number("Score"))
  codebook <- qlm_codebook("Test", "Test prompt", type_obj)

  mock_results1 <- data.frame(id = 1:3, score = c(1, 2, 3))
  mock_coded1 <- new_qlm_coded(
    results = mock_results1,
    codebook = codebook,
    data = c("text1", "text2", "text3"),
    input_type = "text",
    chat_args = list(name = "test/model"),
    execution_args = list(),
    metadata = list(timestamp = Sys.time(), n_units = 3),
    name = "original",
    call = quote(qlm_code(...)),
    parent = NULL
  )

  mock_results2 <- data.frame(id = 1:3, score = c(1, 2, 2))
  mock_coded2 <- new_qlm_coded(
    results = mock_results2,
    codebook = codebook,
    data = c("text1", "text2", "text3"),
    input_type = "text",
    chat_args = list(name = "test/model"),
    execution_args = list(),
    metadata = list(timestamp = Sys.time(), n_units = 3),
    name = "original",
    call = quote(qlm_code(...)),
    parent = NULL
  )

  # Should error if 'by' variable doesn't exist
  expect_error(
    qlm_compare(mock_coded1, mock_coded2, by = "nonexistent"),
    "Variable.*nonexistent.*not found"
  )
})


test_that("qlm_compare works with matching units", {
  skip_if_not_installed("ellmer")
  skip_if_not_installed("irr")

  type_obj <- ellmer::type_object(score = ellmer::type_number("Score"))
  codebook <- qlm_codebook("Test", "Test prompt", type_obj)

  # Create two coded objects with same units
  mock_results1 <- data.frame(id = 1:5, score = c(1, 2, 3, 1, 2))
  mock_coded1 <- new_qlm_coded(
    results = mock_results1,
    codebook = codebook,
    data = paste0("text", 1:5),
    input_type = "text",
    chat_args = list(name = "test/model"),
    execution_args = list(),
    metadata = list(timestamp = Sys.time(), n_units = 5),
    name = "original",
    call = quote(qlm_code(...)),
    parent = NULL
  )

  mock_results2 <- data.frame(id = 1:5, score = c(1, 2, 2, 1, 3))
  mock_coded2 <- new_qlm_coded(
    results = mock_results2,
    codebook = codebook,
    data = paste0("text", 1:5),
    input_type = "text",
    chat_args = list(name = "test/model"),
    execution_args = list(),
    metadata = list(timestamp = Sys.time(), n_units = 5),
    name = "original",
    call = quote(qlm_code(...)),
    parent = NULL
  )

  # Compare using interval level
  comparison <- qlm_compare(mock_coded1, mock_coded2, by = "score", level = "interval")

  expect_true(inherits(comparison, "qlm_comparison"))
  expect_true(all(comparison$level == "interval"))
  expect_true(is.numeric(get_comparison_metric(comparison, "alpha_interval")))
  expect_true(is.numeric(get_comparison_metric(comparison, "icc")))
  expect_true(is.numeric(get_comparison_metric(comparison, "r")))
  expect_true(is.numeric(get_comparison_metric(comparison, "percent_agreement")))
  expect_equal(attr(comparison, "n"), 5)
  expect_equal(attr(comparison, "raters"), 2)
})


test_that("qlm_compare handles Cohen's kappa for 2 raters", {
  skip_if_not_installed("ellmer")
  skip_if_not_installed("irr")

  type_obj <- ellmer::type_object(
    category = ellmer::type_string("Category")
  )
  codebook <- qlm_codebook("Test", "Test prompt", type_obj)

  # Create categorical data
  mock_results1 <- data.frame(id = 1:10, category = rep(c("A", "B"), 5))
  mock_coded1 <- new_qlm_coded(
    results = mock_results1,
    codebook = codebook,
    data = paste0("text", 1:10),
    input_type = "text",
    chat_args = list(name = "test/model"),
    execution_args = list(),
    metadata = list(timestamp = Sys.time(), n_units = 10),
    name = "original",
    call = quote(qlm_code(...)),
    parent = NULL
  )

  mock_results2 <- data.frame(id = 1:10, category = c(rep("A", 8), "B", "B"))
  mock_coded2 <- new_qlm_coded(
    results = mock_results2,
    codebook = codebook,
    data = paste0("text", 1:10),
    input_type = "text",
    chat_args = list(name = "test/model"),
    execution_args = list(),
    metadata = list(timestamp = Sys.time(), n_units = 10),
    name = "original",
    call = quote(qlm_code(...)),
    parent = NULL
  )

  # Compare using nominal level (should compute Cohen's kappa for 2 raters)
  comparison <- qlm_compare(mock_coded1, mock_coded2,
                           by = "category",
                           level = "nominal")

  expect_true(all(comparison$level == "nominal"))
  expect_equal(attr(comparison, "raters"), 2)
  # kappa_type is not returned in the data frame, just check kappa exists
  expect_true(is.numeric(get_comparison_metric(comparison, "kappa")))
  expect_true(is.numeric(get_comparison_metric(comparison, "alpha_nominal")))
  expect_true(is.numeric(get_comparison_metric(comparison, "percent_agreement")))
})


test_that("qlm_compare handles Fleiss' kappa for 3+ raters", {
  skip_if_not_installed("ellmer")
  skip_if_not_installed("irr")

  type_obj <- ellmer::type_object(
    category = ellmer::type_string("Category")
  )
  codebook <- qlm_codebook("Test", "Test prompt", type_obj)

  # Create three coded objects
  mock_results1 <- data.frame(id = 1:8, category = rep(c("A", "B"), 4))
  mock_coded1 <- new_qlm_coded(
    results = mock_results1,
    codebook = codebook,
    data = paste0("text", 1:8),
    input_type = "text",
    chat_args = list(name = "test/model"),
    execution_args = list(),
    metadata = list(timestamp = Sys.time(), n_units = 8),
    name = "original",
    call = quote(qlm_code(...)),
    parent = NULL
  )

  mock_results2 <- data.frame(id = 1:8, category = c(rep("A", 6), "B", "B"))
  mock_coded2 <- new_qlm_coded(
    results = mock_results2,
    codebook = codebook,
    data = paste0("text", 1:8),
    input_type = "text",
    chat_args = list(name = "test/model"),
    execution_args = list(),
    metadata = list(timestamp = Sys.time(), n_units = 8),
    name = "original",
    call = quote(qlm_code(...)),
    parent = NULL
  )

  mock_results3 <- data.frame(id = 1:8, category = c(rep("A", 7), "B"))
  mock_coded3 <- new_qlm_coded(
    results = mock_results3,
    codebook = codebook,
    data = paste0("text", 1:8),
    input_type = "text",
    chat_args = list(name = "test/model"),
    execution_args = list(),
    metadata = list(timestamp = Sys.time(), n_units = 8),
    name = "original",
    call = quote(qlm_code(...)),
    parent = NULL
  )

  # Compare using nominal level (should compute Fleiss' kappa for 3 raters)
  comparison <- qlm_compare(mock_coded1, mock_coded2, mock_coded3,
                           by = "category",
                           level = "nominal")

  expect_true(all(comparison$level == "nominal"))
  expect_equal(attr(comparison, "raters"), 3)
  # kappa_type not returned in data frame
  expect_true(is.numeric(get_comparison_metric(comparison, "kappa")))
  expect_true(is.numeric(get_comparison_metric(comparison, "alpha_nominal")))
  expect_true(is.numeric(get_comparison_metric(comparison, "percent_agreement")))
})


test_that("qlm_compare computes percent agreement", {
  skip_if_not_installed("ellmer")

  type_obj <- ellmer::type_object(score = ellmer::type_number("Score"))
  codebook <- qlm_codebook("Test", "Test prompt", type_obj)

  # Create two coded objects with high agreement
  mock_results1 <- data.frame(id = 1:5, score = c(1, 2, 3, 1, 2))
  mock_coded1 <- new_qlm_coded(
    results = mock_results1,
    codebook = codebook,
    data = paste0("text", 1:5),
    input_type = "text",
    chat_args = list(name = "test/model"),
    execution_args = list(),
    metadata = list(timestamp = Sys.time(), n_units = 5),
    name = "original",
    call = quote(qlm_code(...)),
    parent = NULL
  )

  mock_results2 <- data.frame(id = 1:5, score = c(1, 2, 3, 1, 3))  # 4/5 agree
  mock_coded2 <- new_qlm_coded(
    results = mock_results2,
    codebook = codebook,
    data = paste0("text", 1:5),
    input_type = "text",
    chat_args = list(name = "test/model"),
    execution_args = list(),
    metadata = list(timestamp = Sys.time(), n_units = 5),
    name = "original",
    call = quote(qlm_code(...)),
    parent = NULL
  )

  # Compare using nominal level (includes percent agreement)
  comparison <- qlm_compare(mock_coded1, mock_coded2,
                           by = "score",
                           level = "nominal")

  expect_true(all(comparison$level == "nominal"))
  expect_equal(get_comparison_metric(comparison, "percent_agreement"), 0.8)  # 4 out of 5
  expect_true(is.numeric(get_comparison_metric(comparison, "kappa")))
  expect_true(is.numeric(get_comparison_metric(comparison, "alpha_nominal")))
})


test_that("qlm_compare handles mismatched units", {
  skip_if_not_installed("ellmer")

  type_obj <- ellmer::type_object(score = ellmer::type_number("Score"))
  codebook <- qlm_codebook("Test", "Test prompt", type_obj)

  # Create objects with different units
  mock_results1 <- data.frame(id = 1:3, score = c(1, 2, 3))
  mock_coded1 <- new_qlm_coded(
    results = mock_results1,
    codebook = codebook,
    data = c("text1", "text2", "text3"),
    input_type = "text",
    chat_args = list(name = "test/model"),
    execution_args = list(),
    metadata = list(timestamp = Sys.time(), n_units = 3),
    name = "original",
    call = quote(qlm_code(...)),
    parent = NULL
  )

  mock_results2 <- data.frame(id = 4:6, score = c(1, 2, 3))  # Different IDs
  mock_coded2 <- new_qlm_coded(
    results = mock_results2,
    codebook = codebook,
    data = c("text4", "text5", "text6"),
    input_type = "text",
    chat_args = list(name = "test/model"),
    execution_args = list(),
    metadata = list(timestamp = Sys.time(), n_units = 3),
    name = "original",
    call = quote(qlm_code(...)),
    parent = NULL
  )

  # Should error with no common units
  expect_error(
    qlm_compare(mock_coded1, mock_coded2, by = "score", level = "interval"),
    "No valid comparisons could be computed"
  )
})


test_that("print.qlm_comparison displays correctly", {
  skip_if_not_installed("ellmer")
  skip_if_not_installed("irr")

  type_obj <- ellmer::type_object(score = ellmer::type_number("Score"))
  codebook <- qlm_codebook("Test", "Test prompt", type_obj)

  mock_results1 <- data.frame(id = 1:5, score = c(1, 2, 3, 1, 2))
  mock_coded1 <- new_qlm_coded(
    results = mock_results1,
    codebook = codebook,
    data = paste0("text", 1:5),
    input_type = "text",
    chat_args = list(name = "test/model"),
    execution_args = list(),
    metadata = list(timestamp = Sys.time(), n_units = 5),
    name = "original",
    call = quote(qlm_code(...)),
    parent = NULL
  )

  mock_results2 <- data.frame(id = 1:5, score = c(1, 2, 2, 1, 3))
  mock_coded2 <- new_qlm_coded(
    results = mock_results2,
    codebook = codebook,
    data = paste0("text", 1:5),
    input_type = "text",
    chat_args = list(name = "test/model"),
    execution_args = list(),
    metadata = list(timestamp = Sys.time(), n_units = 5),
    name = "original",
    call = quote(qlm_code(...)),
    parent = NULL
  )

  comparison <- qlm_compare(mock_coded1, mock_coded2, by = "score", level = "interval")

  # Just verify print doesn't error (cli output isn't captured well)
  expect_no_error(print(comparison))

  # Verify structure
  expect_true(inherits(comparison, "qlm_comparison"))
  expect_equal(unique(comparison$variable), "score")
  expect_equal(unique(comparison$level), "interval")
  expect_equal(attr(comparison, "n"), 5)
  expect_equal(attr(comparison, "raters"), 2)
})

test_that("qlm_compare accepts plain data.frames for all arguments", {
  skip_if_not_installed("ellmer")
  skip_if_not_installed("irr")

  # Create two plain data.frames (simulating human coders)
  coder1 <- data.frame(.id = 1:10, category = rep(c("A", "B"), 5))
  coder2 <- data.frame(.id = 1:10, category = rep(c("A", "B"), 5))

  # Should work with plain data.frames
  comparison <- qlm_compare(coder1, coder2, by = category, level = "nominal")

  expect_true(inherits(comparison, "qlm_comparison"))
  expect_equal(get_comparison_metric(comparison, "percent_agreement"), 1.0)  # Perfect agreement
  expect_equal(attr(comparison, "n"), 10)
  expect_equal(attr(comparison, "raters"), 2)
})

test_that("qlm_compare works with plain data.frames and imperfect agreement", {
  skip_if_not_installed("ellmer")
  skip_if_not_installed("irr")

  # Create two plain data.frames with different values
  coder1 <- data.frame(.id = 1:10, category = c(rep("A", 7), rep("B", 3)))
  coder2 <- data.frame(.id = 1:10, category = rep(c("A", "B"), 5))

  comparison <- qlm_compare(coder1, coder2, by = category, level = "nominal")

  expect_true(inherits(comparison, "qlm_comparison"))
  pct_agree <- get_comparison_metric(comparison, "percent_agreement")
  expect_true(pct_agree < 1.0)
  expect_true(is.numeric(get_comparison_metric(comparison, "alpha_nominal")))
  expect_true(is.numeric(get_comparison_metric(comparison, "kappa")))
})

test_that("qlm_compare works with three plain data.frames", {
  skip_if_not_installed("ellmer")
  skip_if_not_installed("irr")

  # Create three plain data.frames
  coder1 <- data.frame(.id = 1:10, category = rep(c("A", "B"), 5))
  coder2 <- data.frame(.id = 1:10, category = rep(c("A", "B"), 5))
  coder3 <- data.frame(.id = 1:10, category = rep(c("A", "B"), 5))

  comparison <- qlm_compare(coder1, coder2, coder3, by = category, level = "nominal")

  expect_true(inherits(comparison, "qlm_comparison"))
  expect_equal(attr(comparison, "raters"), 3)
  # kappa_type not returned in data frame
})

test_that("qlm_compare supports non-standard evaluation for by argument", {
  skip_if_not_installed("ellmer")
  skip_if_not_installed("irr")

  type_obj <- ellmer::type_object(category = ellmer::type_string("Category"))
  codebook <- qlm_codebook("Test", "Test prompt", type_obj)

  # Create two coded objects
  mock_results1 <- data.frame(id = 1:10, category = rep(c("A", "B"), 5))
  mock_coded1 <- new_qlm_coded(
    results = mock_results1,
    codebook = codebook,
    data = paste0("text", 1:10),
    input_type = "text",
    chat_args = list(name = "test/model"),
    execution_args = list(),
    metadata = list(timestamp = Sys.time(), n_units = 10),
    name = "original",
    call = quote(qlm_code(...)),
    parent = NULL
  )

  mock_results2 <- data.frame(id = 1:10, category = rep(c("A", "B"), 5))
  mock_coded2 <- new_qlm_coded(
    results = mock_results2,
    codebook = codebook,
    data = paste0("text", 1:10),
    input_type = "text",
    chat_args = list(name = "test/model"),
    execution_args = list(),
    metadata = list(timestamp = Sys.time(), n_units = 10),
    name = "original",
    call = quote(qlm_code(...)),
    parent = NULL
  )

  # Test with unquoted variable name (NSE)
  comparison_nse <- qlm_compare(mock_coded1, mock_coded2, by = category, level = "nominal")

  # Test with quoted variable name (traditional)
  comparison_quoted <- qlm_compare(mock_coded1, mock_coded2, by = "category", level = "nominal")

  # Both should work and produce identical results
  expect_true(inherits(comparison_nse, "qlm_comparison"))
  expect_true(inherits(comparison_quoted, "qlm_comparison"))
  expect_equal(get_comparison_metric(comparison_nse, "alpha_nominal"),
               get_comparison_metric(comparison_quoted, "alpha_nominal"))
  expect_equal(get_comparison_metric(comparison_nse, "kappa"),
               get_comparison_metric(comparison_quoted, "kappa"))
  expect_equal(get_comparison_metric(comparison_nse, "percent_agreement"),
               get_comparison_metric(comparison_quoted, "percent_agreement"))
  expect_equal(attr(comparison_nse, "n"), attr(comparison_quoted, "n"))
  expect_equal(attr(comparison_nse, "raters"), attr(comparison_quoted, "raters"))
})