test_that("compare_datasets_from_yaml uses key parameter over YAML rules", {
  # Rules file: declares customer_id as the key and an absolute numeric
  # tolerance of 0.5. Written to a temp file that is removed on exit.
  rules_text <- '
version: 1
defaults:
  na_equal: yes
  keys: ["customer_id"]
by_type:
  numeric:
    abs: 0.5
'
  rules_file <- tempfile(fileext = ".yaml")
  on.exit(unlink(rules_file), add = TRUE)
  writeLines(rules_text, con = rules_file)

  # Reference rows; both id and customer_id could serve as a key.
  ref <- data.frame(
    id          = 1:3,
    customer_id = c("A", "B", "C"),
    value       = c(10, 20, 30),
    name        = c("Alice", "Bob", "Charlie")
  )

  # Candidate rows: first two rows swapped and every value shifted by 0.1,
  # so a positional comparison would misalign rows 1 and 2.
  cand <- data.frame(
    id          = c(2, 1, 3),
    customer_id = c("B", "A", "C"),
    value       = c(20.1, 10.1, 30.1),
    name        = c("Bob", "Alice", "Charlie")
  )

  # Case 1: no key argument, so the YAML key (customer_id) is expected to apply.
  outcome_yaml_key <- compare_datasets_from_yaml(ref, cand, path = rules_file)
  expect_true(outcome_yaml_key$all_passed)

  # Case 2: key = "id" is expected to replace the key declared in the YAML.
  outcome_id_key <- compare_datasets_from_yaml(
    ref, cand,
    key = "id",
    path = rules_file
  )
  expect_true(outcome_id_key$all_passed)

  # Case 3: a composite key passed as a character vector.
  outcome_composite <- compare_datasets_from_yaml(
    ref, cand,
    key = c("id", "customer_id"),
    path = rules_file
  )
  expect_true(outcome_composite$all_passed)
})

test_that("compare_datasets_from_yaml key parameter handles NULL and missing keys", {
  # Rules without any `keys` entry; numeric columns get an absolute
  # tolerance of 0.5.
  rules_text <- '
version: 1
defaults:
  na_equal: yes
by_type:
  numeric:
    abs: 0.5
'
  rules_file <- tempfile(fileext = ".yaml")
  on.exit(unlink(rules_file), add = TRUE)
  writeLines(rules_text, con = rules_file)

  # Rows are already aligned; values differ by 0.1, inside the tolerance.
  ref <- data.frame(id = 1:3, value = c(10, 20, 30))
  cand <- data.frame(id = 1:3, value = c(10.1, 20.1, 30.1))

  # Case 1: key passed explicitly as NULL while the YAML has no keys.
  outcome_explicit_null <- compare_datasets_from_yaml(
    ref, cand,
    key = NULL,
    path = rules_file
  )
  expect_true(outcome_explicit_null$all_passed)

  # Case 2: key argument omitted entirely while the YAML has no keys.
  outcome_omitted <- compare_datasets_from_yaml(ref, cand, path = rules_file)
  expect_true(outcome_omitted$all_passed)
})

test_that("compare_datasets_from_yaml key parameter validation works", {
  # Rules without keys; only the numeric tolerance is configured.
  rules_text <- '
version: 1
defaults:
  na_equal: yes
by_type:
  numeric:
    abs: 0.5
'
  rules_file <- tempfile(fileext = ".yaml")
  on.exit(unlink(rules_file), add = TRUE)
  writeLines(rules_text, con = rules_file)

  ref <- data.frame(id = 1:3, value = c(10, 20, 30))
  cand <- data.frame(id = 1:3, value = c(10.1, 20.1, 30.1))

  # A key naming a column that exists in neither data frame should yield a
  # failed, empty result rather than a populated comparison.
  res <- compare_datasets_from_yaml(
    ref, cand,
    key = "nonexistent",
    path = rules_file
  )

  expect_false(res$all_passed)
  expect_null(res$agent)
  expect_null(res$response)
})

test_that("key parameter takes precedence over YAML in edge cases", {
  # Rules file naming category as the key, with a tight 0.1 numeric tolerance.
  rules_text <- '
version: 1
defaults:
  na_equal: yes
  keys: ["category"]
by_type:
  numeric:
    abs: 0.1
'
  rules_file <- tempfile(fileext = ".yaml")
  on.exit(unlink(rules_file), add = TRUE)
  writeLines(rules_text, con = rules_file)

  ref <- data.frame(
    id       = 1:3,
    category = c("X", "Y", "Z"),
    value    = c(1, 2, 3)
  )

  # Candidate has rows 2 and 3 swapped; values are off by 0.01.
  # A positional comparison would pair Y with Z and Z with Y.
  # NOTE(review): id and category pair up identically in both frames
  # (1/X, 2/Y, 3/Z), so keying on category would seemingly align the same
  # rows as keying on id - confirm this test really distinguishes the two.
  cand <- data.frame(
    id       = c(1, 3, 2),
    category = c("X", "Z", "Y"),
    value    = c(1.01, 3.01, 2.01)
  )

  # The explicit key argument is expected to win over the YAML key.
  outcome <- compare_datasets_from_yaml(
    ref, cand,
    key = "id",
    path = rules_file
  )
  expect_true(outcome$all_passed)
})

# --- Duplicate key detection tests ---

test_that("warning is raised when reference data has duplicate keys", {
  # id 1 appears twice in the reference: one duplicated key value, two rows.
  ref <- data.frame(id = c(1, 1, 2, 3), value = c(10, 11, 20, 30))
  cand <- data.frame(id = c(1, 2, 3), value = c(10.5, 20.5, 30.5))

  # The message must name the reference dataset and report the counts.
  expected_msg <- "Duplicate keys detected.*data_reference.*1 duplicate key value.*affecting 2 rows"

  expect_warning(
    compare_datasets_from_yaml(ref, cand, key = "id"),
    regexp = expected_msg,
    ignore.case = TRUE
  )
})

test_that("warning is raised when candidate data has duplicate keys", {
  ref <- data.frame(id = c(1, 2, 3), value = c(10, 20, 30))
  # id 2 appears twice in the candidate: one duplicated key value, two rows.
  cand <- data.frame(id = c(1, 2, 2, 3), value = c(10.5, 20.5, 21, 30.5))

  # The message must name the candidate dataset and report the counts.
  expected_msg <- "Duplicate keys detected.*data_candidate.*1 duplicate key value.*affecting 2 rows"

  expect_warning(
    compare_datasets_from_yaml(ref, cand, key = "id"),
    regexp = expected_msg,
    ignore.case = TRUE
  )
})

test_that("warning is raised when both datasets have duplicate keys", {
  # Reference repeats id 1, candidate repeats id 2.
  ref <- data.frame(id = c(1, 1, 2, 3), value = c(10, 11, 20, 30))
  cand <- data.frame(id = c(1, 2, 2, 3), value = c(10.5, 20.5, 21, 30.5))

  # One warning message is expected to mention both datasets, reference first.
  expected_msg <- "Duplicate keys detected.*data_reference.*data_candidate"

  expect_warning(
    compare_datasets_from_yaml(ref, cand, key = "id"),
    regexp = expected_msg,
    ignore.case = TRUE
  )
})

test_that("warning shows multiple duplicate key examples", {
  # ids 1, 2 and 3 each occur twice in the reference: three duplicated key
  # values covering six rows. id 4 is unique.
  ref <- data.frame(
    id    = c(1, 1, 2, 2, 3, 3, 4),
    value = c(10, 11, 20, 21, 30, 31, 40)
  )
  cand <- data.frame(
    id    = c(1, 2, 3, 4),
    value = c(10.5, 20.5, 30.5, 40.5)
  )

  expected_msg <- "3 duplicate key value.*affecting 6 rows"

  expect_warning(
    compare_datasets_from_yaml(ref, cand, key = "id"),
    regexp = expected_msg,
    ignore.case = TRUE
  )
})

test_that("warning works with composite keys", {
  # The pair (id1 = 1, id2 = "A") occurs twice in the reference; the other
  # (id1, id2) combinations are unique.
  ref <- data.frame(
    id1   = c(1, 1, 2, 2),
    id2   = c("A", "A", "B", "C"),
    value = c(10, 11, 20, 30)
  )
  cand <- data.frame(
    id1   = c(1, 2, 2),
    id2   = c("A", "B", "C"),
    value = c(10.5, 20.5, 30.5)
  )

  # The message should list both key columns and name the reference dataset.
  expected_msg <- "Duplicate keys detected.*key column.*id1.*id2.*data_reference"

  expect_warning(
    compare_datasets_from_yaml(ref, cand, key = c("id1", "id2")),
    regexp = expected_msg,
    ignore.case = TRUE
  )
})

test_that("no warning when keys are unique", {
  # Every id occurs exactly once in each dataset, so the duplicate-key
  # warning must not fire.
  ref <- data.frame(id = c(1, 2, 3), value = c(10, 20, 30))
  cand <- data.frame(id = c(1, 2, 3), value = c(10.5, 20.5, 30.5))

  expect_no_warning(compare_datasets_from_yaml(ref, cand, key = "id"))
})

# NOTE(review): this test has the same description and body as the first
# test in this file - consider removing one copy or making them differ.
test_that("compare_datasets_from_yaml uses key parameter over YAML rules", {
  # Reference data offering more than one candidate key column.
  ref <- data.frame(
    id          = 1:3,
    customer_id = c("A", "B", "C"),
    value       = c(10, 20, 30),
    name        = c("Alice", "Bob", "Charlie")
  )

  # Same records with the first two rows swapped and values shifted by 0.1,
  # so rows only line up when matched on a key.
  cand <- data.frame(
    id          = c(2, 1, 3),
    customer_id = c("B", "A", "C"),
    value       = c(20.1, 10.1, 30.1),
    name        = c("Bob", "Alice", "Charlie")
  )

  # The YAML rules name customer_id as key and tolerate abs differences <= 0.5.
  yaml_rules <- '
version: 1
defaults:
  na_equal: yes
  keys: ["customer_id"]
by_type:
  numeric:
    abs: 0.5
'

  tmp_yaml <- tempfile(fileext = ".yaml")
  on.exit(unlink(tmp_yaml), add = TRUE)
  writeLines(yaml_rules, tmp_yaml)

  # Key taken from the YAML file (no key argument supplied).
  via_yaml <- compare_datasets_from_yaml(ref, cand, path = tmp_yaml)
  expect_true(via_yaml$all_passed)

  # Key supplied as an argument; expected to override the YAML key.
  via_argument <- compare_datasets_from_yaml(
    ref, cand,
    path = tmp_yaml,
    key = "id"
  )
  expect_true(via_argument$all_passed)

  # Several key columns supplied at once.
  via_multiple <- compare_datasets_from_yaml(
    ref, cand,
    path = tmp_yaml,
    key = c("id", "customer_id")
  )
  expect_true(via_multiple$all_passed)
})


test_that("warning truncates to 3 examples when more than 3 duplicate key values in reference", {
  # ids 1-4 each occur twice in the reference: four duplicated key values,
  # one more than the three examples the message is expected to list.
  ref <- data.frame(
    id    = c(1, 1, 2, 2, 3, 3, 4, 4, 5),
    value = c(10, 11, 20, 21, 30, 31, 40, 41, 50)
  )
  cand <- data.frame(
    id    = c(1, 2, 3, 4, 5),
    value = c(10.5, 20.5, 30.5, 40.5, 50.5)
  )

  # Use expect_warning() rather than tryCatch() + expect_match(): if no
  # warning is raised, tryCatch() would hand the function's return value to
  # expect_match(), which errors on non-character input instead of reporting
  # a clear "no warning" failure. The pattern is a literal "..." which is
  # expected to mark the truncated example list.
  expect_warning(
    compare_datasets_from_yaml(ref, cand, key = "id"),
    regexp = "\\.\\.\\."
  )
})

test_that("warning truncates to 3 examples when more than 3 duplicate key values in candidate", {
  ref <- data.frame(
    id    = c(1, 2, 3, 4, 5),
    value = c(10, 20, 30, 40, 50)
  )
  # ids 1-4 each occur twice in the candidate: four duplicated key values,
  # one more than the three examples the message is expected to list.
  cand <- data.frame(
    id    = c(1, 1, 2, 2, 3, 3, 4, 4, 5),
    value = c(10, 11, 20, 21, 30, 31, 40, 41, 50)
  )

  # Use expect_warning() rather than tryCatch() + expect_match(): if no
  # warning is raised, tryCatch() would hand the function's return value to
  # expect_match(), which errors on non-character input instead of reporting
  # a clear "no warning" failure. The pattern is a literal "..." which is
  # expected to mark the truncated example list.
  expect_warning(
    compare_datasets_from_yaml(ref, cand, key = "id"),
    regexp = "\\.\\.\\."
  )
})