# `to_ignore` exists only in the reference and `another_ignore` only in the
# candidate. Both are listed under `defaults: ignore_columns` in the rules
# file, and the test checks that neither shows up as a missing or extra column.
test_that("compare_datasets_from_yaml handles ignore_columns", {
  ref <- data.frame(a = 1, b = 2, c = 3, to_ignore = 99)
  cand <- data.frame(a = 1, b = 2, c = 3, another_ignore = 100)

  # Rules file naming both one-sided columns in ignore_columns
  yaml_content <- '
version: 1
defaults:
  na_equal: yes
  ignore_columns: ["to_ignore", "another_ignore"]
row_validation:
  check_count: no
by_type:
  numeric:
    abs: 0.01
'

  # Register cleanup before writing so the temp file is removed even if a
  # later step fails.
  yaml_path <- tempfile(fileext = ".yaml")
  on.exit(unlink(yaml_path), add = TRUE)
  writeLines(yaml_content, yaml_path)

  # Full integration run: load rules from disk and compare the two frames
  result <- compare_datasets_from_yaml(ref, cand, path = yaml_path)

  # Ignored columns must not be reported as missing or extra. expect_length()
  # reports the observed length directly when the expectation fails.
  expect_length(result$missing_in_candidate, 0)
  expect_length(result$extra_in_candidate, 0)
})

test_that("compare_datasets_from_yaml works with keys", {
  # Minimal rules file: only a numeric `abs` setting of 0.5
  rules_yaml <- '
version: 1
defaults:
  na_equal: yes
by_type:
  numeric:
    abs: 0.5
'
  rules_file <- tempfile(fileext = ".yaml")
  on.exit(unlink(rules_file), add = TRUE)
  writeLines(rules_yaml, rules_file)

  # Same ids in both frames; every candidate value is shifted by 0.1
  ref <- data.frame(id = c(1, 2, 3), value = c(10, 20, 30))
  cand <- data.frame(id = c(1, 2, 3), value = c(10.1, 20.1, 30.1))

  comparison <- compare_datasets_from_yaml(
    ref, cand,
    key = "id",
    path = rules_file
  )

  # Only the shape of the result is checked here: a list carrying an agent
  expect_type(comparison, "list")
  expect_s3_class(comparison$agent, "ptblank_agent")
})

test_that("compare_datasets_from_yaml handles row validation", {
  # Rules enable the row-count check and state an expected count of 3
  rules_yaml <- '
version: 1
defaults:
  na_equal: yes
row_validation:
  check_count: yes
  expected_count: 3
by_type:
  numeric:
    abs: 0.01
'
  rules_file <- tempfile(fileext = ".yaml")
  on.exit(unlink(rules_file), add = TRUE)
  writeLines(rules_yaml, rules_file)

  # Both frames have three rows with identical contents
  ref <- data.frame(a = 1:3, b = 2:4)
  cand <- data.frame(a = 1:3, b = 2:4)

  # With matching row counts the call must complete without raising
  expect_no_error(
    compare_datasets_from_yaml(ref, cand, path = rules_file)
  )
})

test_that("column-specific rules override type rules for numeric and character columns", {
  # by_type gives the per-type settings; by_name overrides them for the two
  # *_specific columns.
  rules_yaml <- '
version: 1
defaults:
  na_equal: yes
  keys: ["id"]
by_type:
  numeric:
    abs: 0.01
  character:
    equal_mode: exact
    case_insensitive: false
    trim: false
by_name:
  num_specific:
    abs: 0.1
  char_specific:
    case_insensitive: true
'
  rules_file <- tempfile(fileext = ".yaml")
  on.exit(unlink(rules_file), add = TRUE)
  writeLines(rules_yaml, rules_file)

  # Baseline values
  ref <- data.frame(
    id = 1:2,
    num_type_only = c(1.0, 2.0),
    num_specific = c(1.0, 2.0),
    char_type_only = c("Hello", "World"),
    char_specific = c("Hello", "World")
  )

  # Candidate values chosen so each column passes only under the rule that
  # is expected to apply to it.
  cand <- data.frame(
    id = 1:2,
    num_type_only = c(1.005, 2.005),       # off by 0.005, within the type-level abs of 0.01
    num_specific = c(1.06, 2.06),          # off by 0.06, within only the by_name abs of 0.1
    char_type_only = c("Hello", "World"),  # unchanged
    char_specific = c("hello", "world")    # lower-cased; by_name sets case_insensitive: true
  )

  outcome <- compare_datasets_from_yaml(ref, cand, path = rules_file)

  # Per-column rules as reported back by the comparison
  rules <- outcome$applied_rules

  # Numeric column without an override keeps the type-level tolerance
  type_level_num <- rules$num_type_only
  expect_equal(type_level_num$abs, 0.01)

  # Numeric column with a by_name entry carries the overriding tolerance
  overridden_num <- rules$num_specific
  expect_equal(overridden_num$abs, 0.1)

  # Character column without an override keeps the type-level settings
  type_level_chr <- rules$char_type_only
  expect_false(type_level_chr$case_insensitive)
  expect_equal(type_level_chr$equal_mode, "exact")

  # Character column with a by_name entry is case-insensitive
  overridden_chr <- rules$char_specific
  expect_true(overridden_chr$case_insensitive)

  # Every difference sits inside its applicable rule, so the run passes overall
  expect_true(outcome$all_passed)
})


test_that("compare_datasets_from_yaml handles different row counts with keys", {
  # Rules file with the row-count check switched off
  rules_yaml <- '
version: 1
defaults:
  na_equal: yes
row_validation:
  check_count: false  # Row count validation disabled
by_type:
  numeric:
    abs: 0.5
'
  rules_file <- tempfile(fileext = ".yaml")
  on.exit(unlink(rules_file), add = TRUE)
  writeLines(rules_yaml, rules_file)

  # Three reference rows against four candidate rows; id = 4 exists only in
  # the candidate.
  ref <- data.frame(id = 1:3, value = c(10, 20, 30))
  cand <- data.frame(id = 1:4, value = c(10.1, 20.1, 30.1, 40.1))

  # The keyed comparison must complete without raising despite the mismatch
  expect_no_error(
    comparison <- compare_datasets_from_yaml(
      ref, cand,
      key = "id",
      path = rules_file
    )
  )

  # Result is still well-formed ...
  expect_type(comparison, "list")
  expect_s3_class(comparison$agent, "ptblank_agent")
  # ... but the unmatched candidate row makes the overall outcome a failure
  expect_false(comparison$all_passed)
})

test_that("compare_datasets_from_yaml uses English language by default", {
  # Identical inputs; neither lang nor locale is passed to the call
  values <- c(1.0, 2.0, 3.0)
  ref <- data.frame(id = 1:3, value = values)
  cand <- data.frame(id = 1:3, value = values)

  comparison <- compare_datasets_from_yaml(ref, cand, key = "id")

  # The agent is expected to report English / en_US when nothing is specified
  expect_equal(comparison$agent$lang, "en")
  expect_equal(comparison$agent$locale, "en_US")
})

test_that("compare_datasets_from_yaml accepts custom language parameters", {
  ref <- data.frame(id = 1:3, value = c(1.0, 2.0, 3.0))
  cand <- data.frame(id = 1:3, value = c(1.0, 2.0, 3.0))

  # lang -> locale pairs, exercised in order: English, German, Spanish
  locale_for <- c(en = "en_US", de = "de_DE", es = "es_ES")

  for (lang_code in names(locale_for)) {
    locale_code <- locale_for[[lang_code]]

    comparison <- compare_datasets_from_yaml(
      ref, cand,
      key = "id",
      lang = lang_code,
      locale = locale_code
    )

    # The agent must echo back exactly what was requested
    expect_equal(comparison$agent$lang, lang_code)
    expect_equal(comparison$agent$locale, locale_code)
  }
})

test_that("compare_datasets_from_yaml propagates language to agent with YAML rules", {
  # Rules file declares the key column itself and a numeric abs of 0.5
  rules_yaml <- '
version: 1
defaults:
  na_equal: yes
  keys: ["id"]
by_type:
  numeric:
    abs: 0.5
'
  rules_file <- tempfile(fileext = ".yaml")
  on.exit(unlink(rules_file), add = TRUE)
  writeLines(rules_yaml, rules_file)

  # Candidate values are shifted by 0.1 relative to the reference
  ref <- data.frame(id = 1:3, value = c(1.0, 2.0, 3.0))
  cand <- data.frame(id = 1:3, value = c(1.1, 2.1, 3.1))

  # English language combined with a British locale
  comparison <- compare_datasets_from_yaml(
    ref, cand,
    path = rules_file,
    lang = "en",
    locale = "en_GB"
  )

  agent <- comparison$agent
  expect_equal(agent$lang, "en")
  expect_equal(agent$locale, "en_GB")
  expect_true(comparison$all_passed)
})

test_that("compare_datasets_from_yaml accepts extract_failed parameter", {
  ref <- data.frame(id = 1:5, value = c(1.0, 2.0, 3.0, 4.0, 5.0))
  cand <- data.frame(id = 1:5, value = c(1.0, 2.0, 3.0, 4.0, 5.0))

  # Run once with extraction switched on, then once with it switched off;
  # identical inputs must pass in both modes.
  for (extract_flag in c(TRUE, FALSE)) {
    comparison <- compare_datasets_from_yaml(
      ref, cand,
      key = "id",
      extract_failed = extract_flag
    )
    expect_type(comparison, "list")
    expect_true(comparison$all_passed)
  }
})

test_that("compare_datasets_from_yaml accepts get_first_n parameter", {
  # Rules file with a numeric abs of 0.001 and `id` declared as key
  rules_yaml <- '
version: 1
defaults:
  na_equal: yes
  keys: ["id"]
by_type:
  numeric:
    abs: 0.001
'
  rules_file <- tempfile(fileext = ".yaml")
  on.exit(unlink(rules_file), add = TRUE)
  writeLines(rules_yaml, rules_file)

  # 100 rows where every candidate value differs from the reference by 1
  ref <- data.frame(id = 1:100, value = rep(1.0, 100))
  cand <- data.frame(id = 1:100, value = rep(2.0, 100))

  # Pass get_first_n = 5 alongside the rules file
  comparison <- compare_datasets_from_yaml(
    ref, cand,
    path = rules_file,
    get_first_n = 5
  )

  expect_type(comparison, "list")
  # All values differ, so the overall outcome is a failure
  expect_false(comparison$all_passed)
})

test_that("compare_datasets_from_yaml accepts sample_n parameter", {
  # 50 rows; every candidate value differs from its reference value
  ref <- data.frame(id = 1:50, value = rep(1.0, 50))
  cand <- data.frame(id = 1:50, value = rep(2.0, 50))

  # No rules file here; only the key and sample_n = 10 are supplied
  comparison <- compare_datasets_from_yaml(
    ref, cand,
    key = "id",
    sample_n = 10
  )

  expect_type(comparison, "list")
  expect_false(comparison$all_passed)
})

test_that("compare_datasets_from_yaml accepts sample_frac and sample_limit parameters", {
  # 100 rows; every candidate value differs from its reference value
  ref <- data.frame(id = 1:100, value = rep(1.0, 100))
  cand <- data.frame(id = 1:100, value = rep(2.0, 100))

  # Supply sample_frac = 0.1 together with sample_limit = 5
  comparison <- compare_datasets_from_yaml(
    ref, cand,
    key = "id", sample_frac = 0.1, sample_limit = 5
  )

  expect_type(comparison, "list")
  expect_false(comparison$all_passed)
})

test_that("compare_datasets_from_yaml extraction parameters work together", {
  # Ten identical rows in both frames, values 1 through 10 as doubles
  values <- as.numeric(1:10)
  ref <- data.frame(id = 1:10, value = values)
  cand <- data.frame(id = 1:10, value = values)

  # extract_failed = FALSE combined with a sample_limit must still be accepted
  comparison <- compare_datasets_from_yaml(
    ref, cand,
    key = "id", extract_failed = FALSE, sample_limit = 100
  )

  expect_type(comparison, "list")
  expect_true(comparison$all_passed)
})