# Tests for id_var functionality with occup_panel dataset
# and other features demonstrated in vignettes

data("occup_panel", package = "cat2cat")
data("trans", package = "cat2cat")

# Split by encoding periods (2009 = old encoding, 2010 = new encoding)
panel_old <- occup_panel[occup_panel$year == 2009, ]
panel_new <- occup_panel[occup_panel$year == 2010, ]

# For quarterly granularity tests
panel_2009Q4 <- occup_panel[occup_panel$quarter == "2009Q4", ]
panel_2010Q1 <- occup_panel[occup_panel$quarter == "2010Q1", ]

# Subjects in both encoding periods
shared_ids <- intersect(panel_old$panel_id, panel_new$panel_id)

# -----------------------------------------------------------------------------
# id_var basic functionality tests
# -----------------------------------------------------------------------------

testthat::test_that("id_var direct matching reduces replication", {
  # Without id_var
  result_no_id <- cat2cat(
    data = list(
      old = panel_2009Q4,
      new = panel_2010Q1,
      cat_var = "code",
      time_var = "quarter"
    ),
    mappings = list(trans = trans, direction = "backward")
  )

  # With id_var
  result_with_id <- cat2cat(
    data = list(
      old = panel_2009Q4,
      new = panel_2010Q1,
      id_var = "panel_id",
      cat_var = "code",
      time_var = "quarter"
    ),
    mappings = list(trans = trans, direction = "backward")
  )

  # With id_var should have fewer rows due to direct matching
  expect_lt(nrow(result_with_id$old), nrow(result_no_id$old))

  # But weighted sums should equal original counts
  expect_equal(sum(result_no_id$old$wei_freq_c2c), nrow(panel_2009Q4))
  expect_equal(sum(result_with_id$old$wei_freq_c2c), nrow(panel_2009Q4))
})

testthat::test_that("id_var direct matches have rep_c2c = 1", {
  result <- cat2cat(
    data = list(
      old = panel_2009Q4,
      new = panel_2010Q1,
      id_var = "panel_id",
      cat_var = "code",
      time_var = "quarter"
    ),
    mappings = list(trans = trans, direction = "backward")
  )

  # Get direct matched subjects
  crossing_ids <- intersect(panel_2009Q4$panel_id, panel_2010Q1$panel_id)
  direct_matches <- result$old[result$old$panel_id %in% crossing_ids, ]

  # Direct matches should have rep_c2c = 1 (no replication)
  expect_true(all(direct_matches$rep_c2c == 1))

  # Direct matches should have wei_freq_c2c = 1
  expect_true(all(direct_matches$wei_freq_c2c == 1))
})

testthat::test_that("id_var non-matched subjects still get replicated", {
  result <- cat2cat(
    data = list(
      old = panel_2009Q4,
      new = panel_2010Q1,
      id_var = "panel_id",
      cat_var = "code",
      time_var = "quarter"
    ),
    mappings = list(trans = trans, direction = "backward")
  )

  crossing_ids <- intersect(panel_2009Q4$panel_id, panel_2010Q1$panel_id)
  non_matched <- result$old[!result$old$panel_id %in% crossing_ids, ]

  # Non-matched subjects should have some with rep_c2c > 1
  expect_true(any(non_matched$rep_c2c > 1))
})

testthat::test_that("id_var g_new_c2c comes from base period for direct matches", {
  result <- cat2cat(
    data = list(
      old = panel_2009Q4,
      new = panel_2010Q1,
      id_var = "panel_id",
      cat_var = "code",
      time_var = "quarter"
    ),
    mappings = list(trans = trans, direction = "backward")
  )

  crossing_ids <- intersect(panel_2009Q4$panel_id, panel_2010Q1$panel_id)

  # For each direct match, g_new_c2c should equal their 2010Q1 code
  for (pid in head(crossing_ids, 10)) {
    matched_row <- result$old[result$old$panel_id == pid, ]
    new_code <- panel_2010Q1$code[panel_2010Q1$panel_id == pid]
    expect_equal(matched_row$g_new_c2c, new_code)
  }
})

testthat::test_that("id_var complete matched panel uses only direct matches", {
  complete_ids <- intersect(panel_2009Q4$panel_id, panel_2010Q1$panel_id)
  complete_old <- panel_2009Q4[panel_2009Q4$panel_id %in% complete_ids, ]
  complete_new <- panel_2010Q1[panel_2010Q1$panel_id %in% complete_ids, ]

  result <- cat2cat(
    data = list(
      old = complete_old,
      new = complete_new,
      id_var = "panel_id",
      cat_var = "code",
      time_var = "quarter"
    ),
    mappings = list(trans = trans, direction = "backward")
  )

  # No observations should go through the replication path when every id matches.
  expect_equal(nrow(result$old), nrow(complete_old))
  expect_equal(nrow(result$new), nrow(complete_new))
  expect_true(all(result$old$rep_c2c == 1))
  expect_true(all(result$new$rep_c2c == 1))
  expect_true(all(result$old$wei_freq_c2c == 1))
  expect_true(all(result$old$wei_naive_c2c == 1))

  new_codes <- complete_new$code[match(result$old$panel_id, complete_new$panel_id)]
  expect_equal(result$old$g_new_c2c, new_codes)
})

# -----------------------------------------------------------------------------
# Weight sum preservation tests
# -----------------------------------------------------------------------------

testthat::test_that("weights sum to original n for all weight types", {
  result <- cat2cat(
    data = list(
      old = panel_2009Q4,
      new = panel_2010Q1,
      id_var = "panel_id",
      cat_var = "code",
      time_var = "quarter"
    ),
    mappings = list(trans = trans, direction = "backward")
  )

  n_original <- nrow(panel_2009Q4)
  expect_equal(sum(result$old$wei_freq_c2c), n_original)
  expect_equal(sum(result$old$wei_naive_c2c), n_original)
})

testthat::test_that("per-subject weights sum to 1", {
  result <- cat2cat(
    data = list(
      old = panel_2009Q4,
      new = panel_2010Q1,
      cat_var = "code",
      time_var = "quarter"
    ),
    mappings = list(trans = trans, direction = "backward")
  )

  # Group by index_c2c (original observation) and sum weights
  per_subject_freq <- tapply(result$old$wei_freq_c2c,
                              result$old$index_c2c, sum)
  per_subject_naive <- tapply(result$old$wei_naive_c2c,
                               result$old$index_c2c, sum)

  # All should sum to 1 (within floating point tolerance)
  expect_true(all(abs(per_subject_freq - 1) < 1e-6))
  expect_true(all(abs(per_subject_naive - 1) < 1e-6))
})

# -----------------------------------------------------------------------------
# prune_c2c tests
# -----------------------------------------------------------------------------

testthat::test_that("prune_c2c highest1 returns one row per original obs", {
  result <- cat2cat(
    data = list(
      old = panel_2009Q4,
      new = panel_2010Q1,
      cat_var = "code",
      time_var = "quarter"
    ),
    mappings = list(trans = trans, direction = "backward")
  )

  pruned <- prune_c2c(result$old, method = "highest1")
  expect_equal(nrow(pruned), nrow(panel_2009Q4))
})

testthat::test_that("prune_c2c nonzero removes zero-weight rows", {
  result <- cat2cat(
    data = list(
      old = panel_2009Q4,
      new = panel_2010Q1,
      cat_var = "code",
      time_var = "quarter"
    ),
    mappings = list(trans = trans, direction = "backward")
  )

  pruned <- prune_c2c(result$old, method = "nonzero")

  # Should have no zero weights
  expect_true(all(pruned$wei_freq_c2c > 0))

  # Fewer or equal rows

  expect_lte(nrow(pruned), nrow(result$old))
})

testthat::test_that("prune_c2c preserves weight sum after renormalization", {
  result <- cat2cat(
    data = list(
      old = panel_2009Q4,
      new = panel_2010Q1,
      cat_var = "code",
      time_var = "quarter"
    ),
    mappings = list(trans = trans, direction = "backward")
  )

  pruned <- prune_c2c(result$old, method = "highest")

  # Weight sum should still equal original n
  expect_equal(sum(pruned$wei_freq_c2c), nrow(panel_2009Q4))
})

# -----------------------------------------------------------------------------
# cross_c2c ensemble tests
# -----------------------------------------------------------------------------

testthat::test_that("cross_c2c creates ensemble weight column", {
  result <- cat2cat(
    data = list(
      old = panel_2009Q4,
      new = panel_2010Q1,
      cat_var = "code",
      time_var = "quarter"
    ),
    mappings = list(trans = trans, direction = "backward")
  )

  crossed <- cross_c2c(result$old,
                        c("wei_freq_c2c", "wei_naive_c2c"),
                        c(0.5, 0.5))

  expect_true("wei_cross_c2c" %in% names(crossed))
})

testthat::test_that("cross_c2c weights sum correctly", {
  result <- cat2cat(
    data = list(
      old = panel_2009Q4,
      new = panel_2010Q1,
      cat_var = "code",
      time_var = "quarter"
    ),
    mappings = list(trans = trans, direction = "backward")
  )

  crossed <- cross_c2c(result$old,
                        c("wei_freq_c2c", "wei_naive_c2c"),
                        c(0.7, 0.3))

  # Ensemble weights should sum to original n
  expect_equal(sum(crossed$wei_cross_c2c), nrow(panel_2009Q4))
})

testthat::test_that("cross_c2c weights are between 0 and 1", {
  result <- cat2cat(
    data = list(
      old = panel_2009Q4,
      new = panel_2010Q1,
      cat_var = "code",
      time_var = "quarter"
    ),
    mappings = list(trans = trans, direction = "backward")
  )

  crossed <- cross_c2c(result$old,
                        c("wei_freq_c2c", "wei_naive_c2c"),
                        c(0.5, 0.5))

  expect_true(all(crossed$wei_cross_c2c >= 0))
  expect_true(all(crossed$wei_cross_c2c <= 1))
})

# -----------------------------------------------------------------------------
# Quarterly panel structure validation
# -----------------------------------------------------------------------------

testthat::test_that("occup_panel has expected BAEL-style structure", {
  # 8 quarters
  expect_equal(length(unique(occup_panel$quarter)), 8)

  # Subjects observed across encoding change
  crossing <- intersect(
    occup_panel$panel_id[occup_panel$quarter == "2009Q4"],
    occup_panel$panel_id[occup_panel$quarter == "2010Q1"]
  )
  expect_gt(length(crossing), 400)  # Should be ~450

  # Most subjects observed 4 quarters
  appearances <- table(table(occup_panel$panel_id))
  expect_true("4" %in% names(appearances))
  expect_gt(appearances["4"], 500)  # Should be ~750
})

testthat::test_that("occup_panel codes have correct length per encoding", {
  old_codes <- occup_panel$code[occup_panel$year == 2009]
  new_codes <- occup_panel$code[occup_panel$year == 2010]

  # Old encoding: 4-digit
  expect_true(all(nchar(old_codes) == 4))

  # New encoding: 6-digit
  expect_true(all(nchar(new_codes) == 6))
})

testthat::test_that("boundary transitions are valid per trans table", {
  p2009Q4 <- occup_panel[occup_panel$quarter == "2009Q4",
                          c("panel_id", "code")]
  p2010Q1 <- occup_panel[occup_panel$quarter == "2010Q1",
                          c("panel_id", "code")]

  # Merge to get transitions
  crossing <- merge(p2009Q4, p2010Q1, by = "panel_id",
                     suffixes = c("_old", "_new"))

  # Check each transition is valid per trans table
  valid <- mapply(function(o, n) n %in% trans$new[trans$old == o],
                  crossing$code_old, crossing$code_new)

  # All transitions should be valid
  expect_true(all(valid))
})

# -----------------------------------------------------------------------------
# Forward direction with id_var
# -----------------------------------------------------------------------------

testthat::test_that("id_var works with forward direction", {
  # Some 2010 codes don't have backward mapping (genuinely new occupations)
  # So we expect a warning about missing coverage
  result <- suppressWarnings(cat2cat(
    data = list(
      old = panel_2009Q4,
      new = panel_2010Q1,
      id_var = "panel_id",
      cat_var = "code",
      time_var = "quarter"
    ),
    mappings = list(trans = trans, direction = "forward")
  ))

  # Forward maps new to old, so $new is replicated
  # Some observations may be dropped due to missing mapping coverage
  expect_lte(sum(result$new$wei_freq_c2c), nrow(panel_2010Q1))
  expect_gt(sum(result$new$wei_freq_c2c), nrow(panel_2010Q1) - 10)

  # Direct matches should have rep_c2c = 1
  crossing_ids <- intersect(panel_2009Q4$panel_id, panel_2010Q1$panel_id)
  direct_matches <- result$new[result$new$panel_id %in% crossing_ids, ]
  expect_true(all(direct_matches$rep_c2c == 1))
})

# -----------------------------------------------------------------------------
# ML weights with id_var
# -----------------------------------------------------------------------------

testthat::test_that("id_var works with ML weights", {
  skip_if_not_installed("caret")
  skip_if_not_installed("randomForest")

  result <- cat2cat(
    data = list(
      old = panel_2009Q4,
      new = panel_2010Q1,
      id_var = "panel_id",
      cat_var = "code",
      time_var = "quarter"
    ),
    mappings = list(trans = trans, direction = "backward"),
    ml = list(
      data = panel_2010Q1,
      cat_var = "code",
      method = "knn",
      features = c("age", "sex", "edu", "exp", "salary"),
      args = list(k = 5)
    )
  )

  # ML weights should be added
  expect_true("wei_knn_c2c" %in% names(result$old))

  # ML weights sum to original n
  expect_equal(sum(result$old$wei_knn_c2c), nrow(panel_2009Q4))

  # Direct matches should have ML weight = 1 (no ML prediction needed)
  crossing_ids <- intersect(panel_2009Q4$panel_id, panel_2010Q1$panel_id)
  direct_matches <- result$old[result$old$panel_id %in% crossing_ids, ]
  expect_true(all(direct_matches$wei_knn_c2c == 1))
})