box::use(
  artma / data / column_recognition[
    get_column_patterns,
    match_column_name,
    recognize_columns,
    get_required_column_names,
    check_mapping_completeness,
    string_similarity,
    analyze_column_values,
    is_likely_numeric_id,
    is_likely_study_key,
    score_candidate_values,
    resolve_multiple_matches
  ]
)

test_that <- getFromNamespace("test_that", "testthat")
expect_equal <- getFromNamespace("expect_equal", "testthat")
expect_true <- getFromNamespace("expect_true", "testthat")
expect_false <- getFromNamespace("expect_false", "testthat")
expect_gte <- getFromNamespace("expect_gte", "testthat")
expect_lte <- getFromNamespace("expect_lte", "testthat")


test_that("get_column_patterns returns valid structure", {
  patterns <- get_column_patterns()

  expect_true(is.list(patterns))
  expect_true(length(patterns) > 0)

  # Check required columns are present
  required <- c("study_id", "effect", "se", "n_obs")
  for (col in required) {
    expect_true(col %in% names(patterns))
    expect_true("patterns" %in% names(patterns[[col]]))
    expect_true("keywords" %in% names(patterns[[col]]))
    expect_true("priority" %in% names(patterns[[col]]))
  }
})


test_that("string_similarity returns 1.0 for identical strings", {
  sim <- string_similarity("effect", "effect")
  expect_equal(sim, 1.0)
})


test_that("string_similarity returns 0.0 for completely different strings", {
  sim <- string_similarity("effect", "xyz123")
  expect_true(sim < 0.5)
})


test_that("string_similarity returns high score for similar strings", {
  sim <- string_similarity("effect", "Effect")
  expect_gte(sim, 0.9)
})


test_that("string_similarity handles substring matches", {
  sim <- string_similarity("effect_size", "effect")
  expect_gte(sim, 0.7)
})


test_that("match_column_name recognizes exact effect column", {
  patterns <- get_column_patterns()
  result <- match_column_name("effect", patterns)

  expect_equal(result$match, "effect")
  expect_equal(result$score, 1.0)
  expect_equal(result$method, "regex")
})


test_that("match_column_name recognizes exact se column", {
  patterns <- get_column_patterns()
  result <- match_column_name("se", patterns)

  expect_equal(result$match, "se")
  expect_equal(result$score, 1.0)
  expect_equal(result$method, "regex")
})


test_that("match_column_name recognizes study_name as study_id", {
  patterns <- get_column_patterns()
  result <- match_column_name("study_name", patterns)

  expect_equal(result$match, "study_id")
  expect_equal(result$score, 1.0)
  expect_equal(result$method, "regex")
})


test_that("match_column_name recognizes n_obs variants", {
  patterns <- get_column_patterns()

  test_cases <- list(
    list(name = "n_obs", expected = "n_obs"),
    list(name = "obs_n", expected = "n_obs"),
    list(name = "sample_size", expected = "n_obs")
  )

  for (test_case in test_cases) {
    result <- match_column_name(test_case$name, patterns)
    expect_equal(result$match, test_case$expected,
      info = paste("Testing", test_case$name)
    )
  }
})


test_that("match_column_name recognizes t_stat variants", {
  patterns <- get_column_patterns()

  test_cases <- list(
    list(name = "t_stat", expected = "t_stat"),
    list(name = "t_statistic", expected = "t_stat"),
    list(name = "tval", expected = "t_stat")
  )

  for (test_case in test_cases) {
    result <- match_column_name(test_case$name, patterns)
    expect_equal(result$match, test_case$expected,
      info = paste("Testing", test_case$name)
    )
  }
})


test_that("match_column_name handles exclude keywords correctly", {
  patterns <- get_column_patterns()

  # "study_id" should match study_id pattern
  result <- match_column_name("study_id", patterns)
  expect_equal(result$match, "study_id")
})


test_that("match_column_name returns NA for unrecognized column", {
  patterns <- get_column_patterns()
  result <- match_column_name("completely_unknown_xyz", patterns)

  expect_true(is.na(result$match))
  expect_equal(result$score, 0)
})


test_that("match_column_name avoids false positives with region names", {
  patterns <- get_column_patterns()

  # These should NOT match obs_id due to exclude keywords
  test_cases <- c(
    "region_middle_east_and_north_africa",
    "region_asia",
    "region_europe"
  )

  for (col_name in test_cases) {
    result <- match_column_name(col_name, patterns)
    expect_false(result$match == "obs_id" && result$score >= 0.7,
      info = paste("Testing", col_name, "should not match obs_id")
    )
  }
})


test_that("recognize_columns correctly identifies standard meta-analysis columns", {
  df <- data.frame(
    study_name = c("Study A", "Study B"),
    effect = c(10.5, 8.2),
    se = c(2.3, 1.8),
    n_obs = c(100, 150)
  )

  mapping <- recognize_columns(df, min_confidence = 0.7)

  expect_equal(mapping$study_id, "study_name")
  expect_equal(mapping$effect, "effect")
  expect_equal(mapping$se, "se")
  expect_equal(mapping$n_obs, "n_obs")
})


test_that("recognize_columns handles different naming conventions", {
  df <- data.frame(
    author_name = c("Author A", "Author B"),
    estimate = c(10.5, 8.2),
    std_error = c(2.3, 1.8),
    sample_size = c(100, 150)
  )

  mapping <- recognize_columns(df, min_confidence = 0.7)

  expect_equal(mapping$study_id, "author_name")
  expect_equal(mapping$effect, "estimate")
  expect_equal(mapping$se, "std_error")
  expect_equal(mapping$n_obs, "sample_size")
})


test_that("recognize_columns prioritizes required columns", {
  df <- data.frame(
    study_id = c(1L, 2L),
    study_name = c("Study A", "Study B"),
    effect = c(10.5, 8.2),
    se = c(2.3, 1.8),
    n_obs = c(100, 150)
  )

  mapping <- recognize_columns(df, min_confidence = 0.7)

  # Prefer string keys when both numeric IDs and labels are available
  expect_true("study_id" %in% names(mapping))
  expect_equal(mapping$study_id, "study_name")
})


test_that("recognize_columns does not reuse columns", {
  df <- data.frame(
    effect = c(10.5, 8.2),
    se = c(2.3, 1.8),
    n_obs = c(100, 150),
    study_id = c(1L, 2L)
  )

  mapping <- recognize_columns(df, min_confidence = 0.7)

  # Each column should be mapped at most once
  mapped_cols <- unlist(mapping)
  expect_equal(length(mapped_cols), length(unique(mapped_cols)))
})


test_that("recognize_columns respects min_confidence threshold", {
  df <- data.frame(
    something_vaguely_like_effect = c(10.5, 8.2),
    maybe_error_ish = c(2.3, 1.8),
    n_obs = c(100, 150),
    study_id = c(1L, 2L)
  )

  # With high threshold, should only match clear cases
  mapping_strict <- recognize_columns(df, min_confidence = 0.95)

  # Should recognize the clear ones
  expect_equal(mapping_strict$n_obs, "n_obs")
  expect_equal(mapping_strict$study_id, "study_id")
})


test_that("recognize_columns handles empty data frame", {
  df <- data.frame()

  # Empty data frames should be validated, not throw error in recognize_columns
  # The validation happens elsewhere in the pipeline
  withr::local_options(list("artma.verbose" = 1))
  mapping <- recognize_columns(df, min_confidence = 0.7)

  expect_true(is.list(mapping))
  expect_equal(length(mapping), 0)
})


test_that("recognize_columns handles data frame with no recognizable columns", {
  df <- data.frame(
    xyz = c(1, 2),
    abc = c(3, 4),
    def = c(5, 6)
  )

  mapping <- recognize_columns(df, min_confidence = 0.7)

  # Should return empty or minimal mapping
  expect_true(is.list(mapping))
  expect_true(length(mapping) < ncol(df))
})


test_that("get_required_column_names returns expected columns", {
  required <- get_required_column_names()

  expect_true(is.character(required))
  expect_true(length(required) >= 4)
  expect_true("study_id" %in% required)
  expect_true("effect" %in% required)
  expect_true("se" %in% required)
  expect_true("n_obs" %in% required)
})


test_that("check_mapping_completeness identifies complete mapping", {
  mapping <- list(
    study_id = "study_name",
    effect = "effect",
    se = "se",
    n_obs = "n_obs"
  )

  result <- check_mapping_completeness(mapping)

  expect_true(result$complete)
  expect_equal(length(result$missing), 0)
})


test_that("check_mapping_completeness identifies missing required columns", {
  mapping <- list(
    study_id = "study_name",
    effect = "effect"
    # Missing se and n_obs
  )

  result <- check_mapping_completeness(mapping)

  expect_false(result$complete)
  expect_true("se" %in% result$missing)
  expect_true("n_obs" %in% result$missing)
})


test_that("check_mapping_completeness handles empty mapping", {
  mapping <- list()

  result <- check_mapping_completeness(mapping)

  expect_false(result$complete)
  expect_true(length(result$missing) >= 4)
})


test_that("recognize_columns with realistic meta-analysis data", {
  # Simulate realistic column names from published meta-analyses
  df <- data.frame(
    obs_n = 1:5,
    study_id = 1:5,
    study_name = paste("Study", LETTERS[1:5]),
    effect = rnorm(5, 10, 2),
    se = runif(5, 1, 3),
    t_stat = rnorm(5, 3, 1),
    n_obs = sample(100:500, 5),
    reg_df = sample(50:200, 5),
    study_size = rep(1, 5)
  )

  withr::local_options(list("artma.verbose" = 1))
  mapping <- recognize_columns(df, min_confidence = 0.7)

  # Verify key columns are recognized; prefer string study key when available
  expect_equal(mapping$study_id, "study_name")
  expect_equal(mapping$effect, "effect")
  expect_equal(mapping$se, "se")
  # n_obs could be matched to either "obs_n" or "n_obs" - both are valid patterns
  expect_true(mapping$n_obs %in% c("n_obs", "obs_n"))
  expect_equal(mapping$t_stat, "t_stat")

  # Verify no false positives for obs_id
  if ("obs_id" %in% names(mapping)) {
    # If obs_id is mapped, it should be to obs_n, not to anything else
    expect_true(mapping$obs_id %in% c("obs_n", "obs_id"))
  }
})


test_that("is_likely_study_key detects citation-like string labels", {
  values <- c("Albeigh (2008)", "Baker (2009)", "Chou 2010")
  expect_true(is_likely_study_key(values))
  expect_false(is_likely_numeric_id(values))
})


test_that("is_likely_numeric_id detects sequential numeric identifiers", {
  values <- 1:20
  expect_true(is_likely_numeric_id(values))
  expect_false(is_likely_study_key(values))
})


test_that("recognize_columns prefers string study keys over sequential numeric IDs", {
  df <- data.frame(
    study = 1:8,
    study_id = c(
      "Albeigh (2008)", "Baker (2009)", "Chou (2010)", "Davis (2011)",
      "Evans (2012)", "Frost (2013)", "Gale (2014)", "Holt (2015)"
    ),
    effect = rnorm(8),
    se = runif(8, 0.1, 0.3),
    n_obs = sample(100:300, 8)
  )

  mapping <- recognize_columns(df, min_confidence = 0.7)

  expect_equal(mapping$study_id, "study_id")
})


test_that("column recognition handles case insensitivity", {
  df <- data.frame(
    EFFECT = c(10.5, 8.2),
    SE = c(2.3, 1.8),
    Study = c("A", "B"),
    N_OBS = c(100, 150)
  )

  mapping <- recognize_columns(df, min_confidence = 0.7)

  expect_equal(mapping$effect, "EFFECT")
  expect_equal(mapping$se, "SE")
  expect_equal(mapping$study_id, "Study")
  expect_equal(mapping$n_obs, "N_OBS")
})