box::use(
  artma / data / column_recognition[
    analyze_column_values,
    score_candidate_values,
    resolve_multiple_matches,
    recognize_columns,
    match_column_name,
    get_column_patterns
  ]
)

# Bind the testthat helpers used in this file to local names. Each one is
# looked up with getFromNamespace() instead of attaching testthat.
# NOTE(review): presumably done so the file works alongside box::use without
# putting testthat on the search path -- confirm.
test_that <- getFromNamespace("test_that", "testthat")
expect_equal <- getFromNamespace("expect_equal", "testthat")
expect_true <- getFromNamespace("expect_true", "testthat")
expect_false <- getFromNamespace("expect_false", "testthat")
expect_gte <- getFromNamespace("expect_gte", "testthat")
expect_lte <- getFromNamespace("expect_lte", "testthat")
expect_lt <- getFromNamespace("expect_lt", "testthat")


# Tests for analyze_column_values
test_that("analyze_column_values detects sequential integer patterns", {
  # The integers 1..10: all distinct, each step exactly +1.
  result <- analyze_column_values(seq_len(10))

  expect_true(result$is_numeric)
  expect_true(result$is_unique)
  expect_true(result$is_sequential)
  expect_equal(result$uniqueness_ratio, 1.0)
})


test_that("analyze_column_values detects non-sequential patterns", {
  # Numeric values in no particular order, with uneven gaps between them.
  scattered <- c(100, 150, 200, 120, 180)
  result <- analyze_column_values(scattered)

  expect_true(result$is_numeric)
  expect_false(result$is_sequential)
})


test_that("analyze_column_values handles repeated values", {
  # Three distinct values, each appearing twice: 3 unique out of 6 entries.
  doubled <- rep(c(100, 150, 200), each = 2)
  result <- analyze_column_values(doubled)

  expect_equal(result$uniqueness_ratio, 0.5)
  expect_false(result$is_unique)
})


test_that("analyze_column_values computes statistical properties", {
  # Evenly spaced 100..500, so the summary statistics are easy to verify.
  result <- analyze_column_values(seq(100, 500, by = 100))

  expect_equal(result$min, 100)
  expect_equal(result$max, 500)
  expect_equal(result$mean, 300)
  expect_true(result$variance > 0)
})


test_that("analyze_column_values handles NA values", {
  # Missing entries are interleaved with the observed values 100, 200, 300;
  # the summary statistics are expected to reflect the observed values only.
  with_gaps <- c(100, NA, 200, NA, 300)
  result <- analyze_column_values(with_gaps)

  expect_equal(result$min, 100)
  expect_equal(result$max, 300)
  expect_equal(result$mean, 200)
})


test_that("analyze_column_values handles all-NA columns", {
  # A column with no observed data at all.
  result <- analyze_column_values(rep(NA, 3))

  expect_true(is.na(result$mean))
  expect_equal(result$uniqueness_ratio, 0)
  expect_false(result$is_unique)
  expect_false(result$is_sequential)
})


test_that("analyze_column_values handles character columns", {
  # Three distinct text labels: "Study A", "Study B", "Study C".
  labels <- paste("Study", c("A", "B", "C"))
  result <- analyze_column_values(labels)

  expect_true(result$is_unique)
  expect_false(result$is_numeric)
})


test_that("analyze_column_values detects sequential pattern starting from non-1", {
  # Consecutive integers count as sequential wherever the run starts:
  # 5:14 still advances by exactly 1 at every step.
  values <- 5:14
  analysis <- analyze_column_values(values)

  expect_true(analysis$is_sequential)
})


# Tests for score_candidate_values
test_that("score_candidate_values penalizes sequential pattern for n_obs", {
  # obs_n looks like a row counter; n_obs looks like real sample sizes.
  fixture <- data.frame(
    obs_n = seq_len(10),
    n_obs = c(100, 150, 200, 120, 180, 90, 110, 130, 160, 140)
  )

  # Score both columns as candidates for the n_obs role, starting from the
  # same base score of 1.0.
  counter_score <- score_candidate_values(fixture, "obs_n", "n_obs", 1.0)
  sizes_score <- score_candidate_values(fixture, "n_obs", "n_obs", 1.0)

  # The row counter must rank lower, and by a clearly noticeable margin.
  expect_lt(counter_score, sizes_score)
  expect_lt(counter_score, 0.8)
})


test_that("score_candidate_values penalizes high uniqueness for n_obs", {
  # Fix the RNG for this test only so the sampled fixture is reproducible;
  # the previous RNG state is restored when the test exits.
  withr::local_seed(42)

  df <- data.frame(
    col_unique = 1:100, # All unique, like an ID
    col_data = sample(c(50, 100, 150, 200, 250), 100, replace = TRUE) # Repeated values
  )

  score_unique <- score_candidate_values(df, "col_unique", "n_obs", 1.0)
  score_repeated <- score_candidate_values(df, "col_data", "n_obs", 1.0)

  # An ID-like column must score below one with recurring sample sizes.
  expect_lt(score_unique, score_repeated)
})


test_that("score_candidate_values rewards sequential pattern for obs_id", {
  # One row-counter column and one column of unordered values.
  fixture <- data.frame(
    obs_n = seq_len(10),
    other = c(100, 150, 200, 120, 180, 90, 110, 130, 160, 140)
  )

  # Both are scored as candidates for the obs_id role from the same base.
  counter_score <- score_candidate_values(fixture, "obs_n", "obs_id", 1.0)
  unordered_score <- score_candidate_values(fixture, "other", "obs_id", 1.0)

  # The row counter must score at least as high as the unordered column.
  expect_gte(counter_score, unordered_score)
})


test_that("score_candidate_values penalizes non-unique values for obs_id", {
  # col_unique never repeats; col_repeated holds every value twice.
  fixture <- data.frame(
    col_unique = seq_len(10),
    col_repeated = rep(seq_len(5), each = 2)
  )

  distinct_score <- score_candidate_values(fixture, "col_unique", "obs_id", 1.0)
  duplicated_score <- score_candidate_values(fixture, "col_repeated", "obs_id", 1.0)

  # Duplicated values must not outrank the fully distinct column.
  expect_gte(distinct_score, duplicated_score)
})


test_that("score_candidate_values handles effect columns appropriately", {
  # Fix the RNG for this test only so the simulated effects are reproducible;
  # the previous RNG state is restored when the test exits.
  withr::local_seed(42)

  df <- data.frame(
    effect_real = rnorm(100, mean = 0.5, sd = 0.2),
    effect_seq = 1:100
  )

  score_real <- score_candidate_values(df, "effect_real", "effect", 1.0)
  score_seq <- score_candidate_values(df, "effect_seq", "effect", 1.0)

  # Realistic effect sizes must score at least as high as a row counter.
  expect_gte(score_real, score_seq)
})


test_that("score_candidate_values penalizes zero variance columns", {
  # col_constant never changes; col_varying spreads over a range of values.
  fixture <- data.frame(
    col_constant = rep_len(100, 10),
    col_varying = c(100, 150, 200, 120, 180, 90, 110, 130, 160, 140)
  )

  flat_score <- score_candidate_values(fixture, "col_constant", "effect", 1.0)
  spread_score <- score_candidate_values(fixture, "col_varying", "effect", 1.0)

  # A constant column must score strictly below a varying one.
  expect_lt(flat_score, spread_score)
})


# Tests for resolve_multiple_matches
test_that("resolve_multiple_matches picks non-sequential for n_obs", {
  # Fix the RNG for this test only so the filler columns are reproducible;
  # the previous RNG state is restored when the test exits.
  withr::local_seed(42)

  df <- data.frame(
    obs_n = 1:5,
    n_obs = c(100, 150, 200, 120, 180),
    study_id = paste("Study", LETTERS[1:5]),
    effect = rnorm(5),
    se = runif(5, 1, 3)
  )

  # Both candidates matched the n_obs pattern with an identical name score,
  # so only their values can break the tie.
  matches <- list(
    obs_n = list(match = "n_obs", score = 1.0, method = "regex"),
    n_obs = list(match = "n_obs", score = 1.0, method = "regex")
  )

  withr::local_options(list("artma.verbose" = 1))
  best <- resolve_multiple_matches(df, c("obs_n", "n_obs"), "n_obs", matches)

  # Should pick n_obs over obs_n because obs_n is sequential
  expect_equal(best, "n_obs")
})


test_that("resolve_multiple_matches picks sequential for obs_id", {
  # Fix the RNG for this test only so the filler columns are reproducible;
  # the previous RNG state is restored when the test exits.
  withr::local_seed(42)

  df <- data.frame(
    obs_n = 1:5,
    other_id = c(101, 102, 105, 103, 104),
    study_id = paste("Study", LETTERS[1:5]),
    effect = rnorm(5),
    se = runif(5, 1, 3)
  )

  # Two candidates for obs_id: a regex match and a slightly weaker keyword
  # match.
  matches <- list(
    obs_n = list(match = "obs_id", score = 1.0, method = "regex"),
    other_id = list(match = "obs_id", score = 0.9, method = "keyword")
  )

  withr::local_options(list("artma.verbose" = 1))
  best <- resolve_multiple_matches(df, c("obs_n", "other_id"), "obs_id", matches)

  # Should pick obs_n because it's sequential and unique
  expect_equal(best, "obs_n")
})


test_that("resolve_multiple_matches returns single candidate unchanged", {
  # With exactly one candidate there is nothing to disambiguate.
  single_col <- data.frame(n_obs = c(100, 150, 200))
  single_match <- list(
    n_obs = list(match = "n_obs", score = 1.0, method = "regex")
  )

  withr::local_options(list("artma.verbose" = 1))
  chosen <- resolve_multiple_matches(single_col, "n_obs", "n_obs", single_match)

  expect_equal(chosen, "n_obs")
})


# Integration tests with recognize_columns
test_that("recognize_columns correctly resolves obs_n vs n_obs conflict", {
  # Fix the RNG for this test only so the simulated data set is reproducible;
  # the previous RNG state is restored when the test exits.
  withr::local_seed(42)

  # This is the main test case from the issue
  df <- data.frame(
    obs_n = 1:10,
    study_id = 1:10,
    study_name = paste("Study", LETTERS[1:10]),
    effect = rnorm(10, 10, 2),
    se = runif(10, 1, 3),
    t_stat = rnorm(10, 3, 1),
    n_obs = sample(100:500, 10),
    reg_df = sample(50:200, 10),
    study_size = rep(1, 10)
  )

  withr::local_options(list("artma.verbose" = 1))
  mapping <- recognize_columns(df, min_confidence = 0.7)

  # Should map n_obs standard column to n_obs data column, not obs_n
  expect_equal(mapping$n_obs, "n_obs")

  # obs_n may or may not be mapped to obs_id depending on what's already used
  # The key test is that n_obs is correctly identified
  if ("obs_id" %in% names(mapping)) {
    # If obs_id is mapped, it should be a sequential column
    obs_id_col <- mapping$obs_id
    analysis <- analyze_column_values(df[[obs_id_col]])
    expect_true(analysis$is_sequential || analysis$is_unique)
  }
})


test_that("recognize_columns handles case with only sequential column", {
  # Fix the RNG for this test only so the simulated data set is reproducible;
  # the previous RNG state is restored when the test exits.
  withr::local_seed(42)

  # Edge case: only obs_n exists, no n_obs
  df <- data.frame(
    obs_n = 1:10,
    study_name = paste("Study", LETTERS[1:10]),
    effect = rnorm(10, 10, 2),
    se = runif(10, 1, 3)
  )

  withr::local_options(list("artma.verbose" = 1))
  mapping <- recognize_columns(df, min_confidence = 0.7)

  # Should still map to n_obs despite being sequential (it's the only candidate)
  expect_equal(mapping$n_obs, "obs_n")
})


test_that("recognize_columns handles case with multiple effect-like columns", {
  # Fix the RNG for this test only so the simulated data set is reproducible;
  # the previous RNG state is restored when the test exits.
  withr::local_seed(42)

  df <- data.frame(
    study_id = paste("Study", LETTERS[1:10]),
    effect_id = 1:10, # Sequential, should be rejected
    effect = rnorm(10, 0.5, 0.2), # Real effect sizes
    se = runif(10, 0.1, 0.3),
    n_obs = sample(50:200, 10)
  )

  withr::local_options(list("artma.verbose" = 1))
  mapping <- recognize_columns(df, min_confidence = 0.7)

  # Should pick the non-sequential effect column
  expect_equal(mapping$effect, "effect")
})


test_that("recognize_columns prioritizes better semantic match", {
  # Fix the RNG for this test only so the simulated data set is reproducible;
  # the previous RNG state is restored when the test exits.
  withr::local_seed(42)

  df <- data.frame(
    row_number = 1:10,
    sample_size = sample(100:500, 10),
    study_id = paste("Study", LETTERS[1:10]),
    estimate = rnorm(10, 0.5, 0.2),
    se = runif(10, 0.1, 0.3)
  )

  withr::local_options(list("artma.verbose" = 1))
  mapping <- recognize_columns(df, min_confidence = 0.7)

  # Should prefer sample_size for n_obs even if row_number matches the pattern
  expect_equal(mapping$n_obs, "sample_size")
})


test_that("value-based resolution works with realistic meta-analysis data", {
  # Fix the RNG for this test only so the simulated data set is reproducible;
  # the previous RNG state is restored when the test exits.
  withr::local_seed(42)

  # Simulate realistic data where column names are ambiguous
  df <- data.frame(
    obs_n = 1:50,
    study_id = rep(1:10, each = 5),
    study_name = rep(paste("Study", LETTERS[1:10]), each = 5),
    effect = rnorm(50, 0.3, 0.15),
    se = runif(50, 0.05, 0.2),
    n_obs = sample(50:300, 50, replace = TRUE),
    precision = 1 / runif(50, 0.05, 0.2)^2
  )

  withr::local_options(list("artma.verbose" = 1))
  mapping <- recognize_columns(df, min_confidence = 0.7)

  # Verify correct mappings; prefer string study keys when available
  expect_equal(mapping$study_id, "study_name")
  expect_equal(mapping$effect, "effect")
  expect_equal(mapping$se, "se")
  expect_equal(mapping$n_obs, "n_obs") # Not obs_n
})


test_that("value-based resolution handles edge case with same scores", {
  # Seed the RNG for this test only. Unlike a bare set.seed(), local_seed()
  # restores the previous RNG state on exit, so later tests are not affected.
  withr::local_seed(42)

  # Create a case where both columns have very similar properties
  df <- data.frame(
    n_obs_1 = sample(50:200, 20),
    n_obs_2 = sample(50:200, 20),
    study_id = paste("Study", 1:20),
    effect = rnorm(20),
    se = runif(20, 0.1, 0.3)
  )

  # Both n_obs_1 and n_obs_2 match the n_obs pattern
  withr::local_options(list("artma.verbose" = 1))
  mapping <- recognize_columns(df, min_confidence = 0.7)

  # Should pick one of them (whichever has slightly better properties)
  expect_true(mapping$n_obs %in% c("n_obs_1", "n_obs_2"))
})


test_that("analyze_column_values handles empty data frame columns", {
  # Zero-length numeric input, as produced by a data frame with no rows.
  result <- analyze_column_values(double(0))

  expect_equal(result$uniqueness_ratio, 0)
  expect_false(result$is_unique)
  expect_false(result$is_sequential)
})


test_that("score_candidate_values handles columns with NA values", {
  # The candidate column has missing entries among valid ones.
  fixture <- data.frame(
    col_with_na = c(100, NA, 200, NA, 300),
    n_obs = c(150, 160, 170, 180, 190)
  )

  # Scoring must complete without error and return a number in [0, 1].
  na_score <- score_candidate_values(fixture, "col_with_na", "n_obs", 1.0)

  expect_true(is.numeric(na_score))
  expect_lte(na_score, 1)
  expect_gte(na_score, 0)
})


test_that("value-based resolution prefers exact name match when values are similar", {
  # Fix the RNG for this test only so the fixture does not depend on RNG
  # state left behind by earlier tests; the previous state is restored on
  # exit.
  withr::local_seed(42)

  df <- data.frame(
    observations = sample(50:200, 20),
    n_obs = sample(50:200, 20),
    study_id = paste("Study", 1:20),
    effect = rnorm(20),
    se = runif(20, 0.1, 0.3)
  )

  withr::local_options(list("artma.verbose" = 1))
  mapping <- recognize_columns(df, min_confidence = 0.7)

  # When both have similar value properties, exact name match should win
  expect_equal(mapping$n_obs, "n_obs")
})