# Tests for starting points functions and bundled dataset
# Covers: pnadc_series_starting_points (bundled data),
#          compute_z_aggregates(), compute_series_starting_points()

# =============================================================================
# BUNDLED DATASET TESTS (pnadc_series_starting_points)
# =============================================================================

test_that("bundled dataset loads successfully via data()", {
  data("pnadc_series_starting_points", package = "PNADCperiods")
  expect_s3_class(pnadc_series_starting_points, "data.table")
  expect_true(nrow(pnadc_series_starting_points) > 0,
              label = "Dataset has rows")
})

test_that("bundled dataset has exactly 3 columns: series_name, mesnotrim, y0", {
  data("pnadc_series_starting_points", package = "PNADCperiods")
  expect_equal(ncol(pnadc_series_starting_points), 3)
  expect_named(pnadc_series_starting_points,
               c("series_name", "mesnotrim", "y0"),
               ignore.order = FALSE)
})

test_that("mesnotrim only contains values 1, 2, 3", {
  data("pnadc_series_starting_points", package = "PNADCperiods")
  expect_true(all(pnadc_series_starting_points$mesnotrim %in% c(1L, 2L, 3L)),
              label = "All mesnotrim values are 1, 2, or 3")
  # Confirm all three positions are present

  expect_setequal(unique(pnadc_series_starting_points$mesnotrim), c(1L, 2L, 3L))
})

test_that("no NA in series_name or mesnotrim", {
  data("pnadc_series_starting_points", package = "PNADCperiods")
  expect_false(any(is.na(pnadc_series_starting_points$series_name)),
               label = "No NA in series_name")
  expect_false(any(is.na(pnadc_series_starting_points$mesnotrim)),
               label = "No NA in mesnotrim")
})

test_that("all y0 values are finite (no NA, NaN, Inf)", {
  data("pnadc_series_starting_points", package = "PNADCperiods")
  expect_true(all(is.finite(pnadc_series_starting_points$y0)),
              label = "All y0 values are finite")
})

test_that("key labor market series are present in bundled dataset", {
  data("pnadc_series_starting_points", package = "PNADCperiods")
  available <- unique(pnadc_series_starting_points$series_name)

  key_series <- c("popocup", "popdesocup", "popnaforca", "pop14mais")
  for (s in key_series) {
    expect_true(s %in% available,
                label = paste0("Series '", s, "' is present"))
  }
})

test_that("each series has exactly 3 rows (one per mesnotrim)", {
  data("pnadc_series_starting_points", package = "PNADCperiods")
  counts <- pnadc_series_starting_points[, .N, by = series_name]

  expect_true(all(counts$N == 3L),
              label = "Every series has exactly 3 rows")

  # Verify each series has all three positions
  for (s in unique(pnadc_series_starting_points$series_name)) {
    positions <- pnadc_series_starting_points[series_name == s, mesnotrim]
    expect_setequal(positions, c(1L, 2L, 3L))
  }
})

# =============================================================================
# compute_z_aggregates() TESTS
# =============================================================================

# Helper: create mock calibrated microdata for compute_z_aggregates()
create_mock_calibrated <- function(n_per_month = 10L,
                                    months = c(202301L, 202302L, 202303L),
                                    seed = 100L) {
  set.seed(seed)
  n <- n_per_month * length(months)

  dt <- data.table::data.table(
    weight_monthly = rep(1000, n),
    ref_month_yyyymm = rep(months, each = n_per_month),
    ref_month_in_quarter = rep(1:3, each = n_per_month),
    Ano = 2023L,
    Trimestre = 1L,
    V2009 = sample(c(5L, 20L, 35L, 50L, 70L), n, replace = TRUE),
    VD4001 = sample(c(1L, 2L), n, replace = TRUE),
    VD4002 = sample(c(1L, 2L, NA_integer_), n, replace = TRUE, prob = c(0.45, 0.15, 0.40)),
    VD4009 = sample(c(1L:10L, NA_integer_), n, replace = TRUE),
    VD4010 = sample(c(1L:11L, NA_integer_), n, replace = TRUE),
    V4019 = sample(c(1L, 2L, NA_integer_), n, replace = TRUE),
    VD4004 = sample(c(1L, 2L, NA_integer_), n, replace = TRUE),
    VD4004A = sample(c(1L, 2L, NA_integer_), n, replace = TRUE),
    VD4019 = round(runif(n, 1000, 5000), 2),
    VD4020 = round(runif(n, 900, 4500), 2),
    VD4016 = round(runif(n, 500, 3000), 2),
    VD4017 = round(runif(n, 500, 3000), 2),
    VD4031 = round(runif(n, 20, 44), 0),
    VD4035 = round(runif(n, 20, 50), 0)
  )
  dt
}


test_that("compute_z_aggregates errors when weight_monthly is missing", {
  dt <- data.table::data.table(
    ref_month_yyyymm = c(202301L, 202302L),
    ref_month_in_quarter = c(1L, 2L),
    V2009 = c(30L, 40L)
  )
  expect_error(
    compute_z_aggregates(dt, verbose = FALSE),
    "weight_monthly"
  )
})

test_that("compute_z_aggregates computes correct z_popocup and z_popdesocup", {
  # Construct a deterministic example:
  # 4 obs in one month, weights all 1000
  # VD4002: 1, 1, 2, 2  -> 2 occupied, 2 unemployed
  dt <- data.table::data.table(
    weight_monthly = rep(1000, 4),
    ref_month_yyyymm = rep(202301L, 4),
    ref_month_in_quarter = rep(1L, 4),
    Ano = 2023L,
    Trimestre = 1L,
    V2009 = c(30L, 35L, 25L, 40L),
    VD4001 = c(1L, 1L, 1L, 1L),
    VD4002 = c(1L, 1L, 2L, 2L)
  )

  result <- compute_z_aggregates(dt, verbose = FALSE)

  # z_popocup = sum(indicator * weight): 2 obs with VD4002==1, weight=1000 each
  expect_equal(result$z_popocup, 2 * 1000)
  # z_popdesocup = sum(indicator * weight): 2 obs with VD4002==2
  expect_equal(result$z_popdesocup, 2 * 1000)
})

test_that("z_populacao equals 1 for every observation (total pop = sum of weights)", {
  # 6 obs across 2 months, various weights
  dt <- data.table::data.table(
    weight_monthly = c(500, 800, 1200, 600, 900, 1100),
    ref_month_yyyymm = c(rep(202301L, 3), rep(202302L, 3)),
    ref_month_in_quarter = c(rep(1L, 3), rep(2L, 3)),
    Ano = 2023L,
    Trimestre = 1L,
    V2009 = c(30L, 35L, 25L, 40L, 50L, 20L)
  )

  result <- compute_z_aggregates(dt, verbose = FALSE)
  data.table::setorder(result, anomesexato)

  # z_populacao should be sum of weights per month
  # Month 1: 500 + 800 + 1200 = 2500
  # Month 2: 600 + 900 + 1100 = 2600
  expect_equal(result[anomesexato == 202301L, z_populacao], 2500)
  expect_equal(result[anomesexato == 202302L, z_populacao], 2600)
})

test_that("compute_z_aggregates groups correctly by month across multiple months", {
  dt <- create_mock_calibrated(n_per_month = 20L)
  result <- compute_z_aggregates(dt, verbose = FALSE)

  # Should have one row per month
  expect_equal(nrow(result), 3)
  expect_true(all(c(202301L, 202302L, 202303L) %in% result$anomesexato))

  # All z_ columns should be present and numeric
  z_cols <- grep("^z_", names(result), value = TRUE)
  expect_true(length(z_cols) > 0, label = "z_ columns exist")
  for (col in z_cols) {
    expect_true(is.numeric(result[[col]]),
                label = paste0(col, " is numeric"))
  }
})

test_that("compute_z_aggregates converts character VD variables to integer", {
  # VD variables stored as character in some PNADC files ("01", "02")
  dt <- data.table::data.table(
    weight_monthly = rep(1000, 4),
    ref_month_yyyymm = rep(202301L, 4),
    ref_month_in_quarter = rep(1L, 4),
    Ano = 2023L,
    Trimestre = 1L,
    V2009 = c(30L, 35L, 25L, 40L),
    VD4001 = c("01", "01", "02", "02"),
    VD4002 = c("01", "01", "02", "02")
  )

  # Should not error, and should produce correct results
  result <- compute_z_aggregates(dt, verbose = FALSE)
  expect_equal(result$z_popocup, 2 * 1000)
  expect_equal(result$z_popdesocup, 2 * 1000)
})

# =============================================================================
# compute_series_starting_points() TESTS
# =============================================================================

# Helper: create paired mock data for compute_series_starting_points()
# Returns a list with $monthly_estimates and $rolling_quarters
create_mock_for_starting_points <- function(n_months = 36L,
                                             start_yyyymm = 201301L,
                                             base_rq = 100,
                                             scale_factor = 1000,
                                             seed = 200L) {
  set.seed(seed)

  # Generate YYYYMM sequence
  months_seq <- generate_yyyymm_seq(start_yyyymm, n_months)
  mesnotrim_seq <- ((months_seq %% 100L - 1L) %% 3L) + 1L

  # Rolling quarters: base value with slight trend
  rq_values <- base_rq * (1 + 0.001 * seq_len(n_months))

  rolling_quarters <- data.table::data.table(
    anomesfinaltrimmovel = months_seq,
    mesnotrim = mesnotrim_seq,
    popocup = rq_values,
    popdesocup = rq_values * 0.1,
    pop14mais = rq_values * 1.2
  )

  # Monthly estimates: z_ columns = scale_factor * rq + noise
  # (so that z/scale_factor is close to rq, giving small y0)
  monthly_estimates <- data.table::data.table(
    anomesexato = months_seq,
    z_popocup = rq_values * scale_factor + rnorm(n_months, 0, 50),
    z_popdesocup = rq_values * 0.1 * scale_factor + rnorm(n_months, 0, 5),
    z_pop14mais = rq_values * 1.2 * scale_factor + rnorm(n_months, 0, 60)
  )

  list(
    monthly_estimates = monthly_estimates,
    rolling_quarters = rolling_quarters
  )
}


test_that("compute_series_starting_points returns correct columns", {
  mock <- create_mock_for_starting_points()
  result <- compute_series_starting_points(
    mock$monthly_estimates,
    mock$rolling_quarters,
    calibration_start = 201301L,
    calibration_end = 201912L,
    scale_factor = 1000,
    use_series_specific_periods = FALSE,
    verbose = FALSE
  )

  expect_s3_class(result, "data.table")
  expect_named(result, c("series_name", "mesnotrim", "y0"), ignore.order = FALSE)
})

test_that("compute_series_starting_points returns 3 rows per series", {
  mock <- create_mock_for_starting_points()
  result <- compute_series_starting_points(
    mock$monthly_estimates,
    mock$rolling_quarters,
    calibration_start = 201301L,
    calibration_end = 201912L,
    scale_factor = 1000,
    use_series_specific_periods = FALSE,
    verbose = FALSE
  )

  counts <- result[, .N, by = series_name]
  expect_true(all(counts$N == 3L),
              label = "Each series has exactly 3 rows")

  # Each series should have all 3 mesnotrim values
  for (s in unique(result$series_name)) {
    positions <- result[series_name == s, mesnotrim]
    expect_setequal(positions, c(1L, 2L, 3L))
  }
})

test_that("custom calibration_start/end are respected", {
  # Create data spanning a wide range
  mock <- create_mock_for_starting_points(n_months = 120L, start_yyyymm = 201201L)

  # Use narrow calibration period
  result_narrow <- compute_series_starting_points(
    mock$monthly_estimates,
    mock$rolling_quarters,
    calibration_start = 201601L,
    calibration_end = 201712L,
    scale_factor = 1000,
    use_series_specific_periods = FALSE,
    verbose = FALSE
  )

  # Use wider calibration period
  result_wide <- compute_series_starting_points(
    mock$monthly_estimates,
    mock$rolling_quarters,
    calibration_start = 201301L,
    calibration_end = 201912L,
    scale_factor = 1000,
    use_series_specific_periods = FALSE,
    verbose = FALSE
  )

  # Both should produce valid results
  expect_true(all(is.finite(result_narrow$y0)))
  expect_true(all(is.finite(result_wide$y0)))

  # y0 values should differ because calibration periods differ
  merged <- merge(result_narrow, result_wide,
                  by = c("series_name", "mesnotrim"),
                  suffixes = c("_narrow", "_wide"))
  # Not all y0 pairs should be identical (noise + different averaging window)
  expect_false(all(merged$y0_narrow == merged$y0_wide),
               label = "Different calibration periods produce different y0")
})

test_that("use_series_specific_periods=FALSE uses uniform calibration for all series", {
  # Create data with a "CNPJ-like" series name to verify it does NOT get special treatment
  months_seq <- generate_yyyymm_seq(201201L, 120)
  mesnotrim_seq <- ((months_seq %% 100L - 1L) %% 3L) + 1L

  rq_vals <- 100 * (1 + 0.001 * seq_along(months_seq))

  rq <- data.table::data.table(
    anomesfinaltrimmovel = months_seq,
    mesnotrim = mesnotrim_seq,
    popocup = rq_vals,
    empregadorcomcnpj = rq_vals * 0.05
  )

  me <- data.table::data.table(
    anomesexato = months_seq,
    z_popocup = rq_vals * 1000 + rnorm(120, 0, 10),
    z_empregadorcomcnpj = rq_vals * 0.05 * 1000 + rnorm(120, 0, 1)
  )

  result <- compute_series_starting_points(
    me, rq,
    calibration_start = 201301L,
    calibration_end = 201912L,
    scale_factor = 1000,
    use_series_specific_periods = FALSE,
    verbose = FALSE
  )

  # Both series should be present with 3 rows each
  expect_true("popocup" %in% result$series_name)
  expect_true("empregadorcomcnpj" %in% result$series_name)
  expect_equal(nrow(result[series_name == "empregadorcomcnpj"]), 3)
})

test_that("scale_factor=1 produces correct y0 (no scaling applied)", {
  mock <- create_mock_for_starting_points(scale_factor = 1)

  # Rebuild monthly estimates with scale_factor=1 (z_ values close to rq)
  months_seq <- mock$rolling_quarters$anomesfinaltrimmovel
  rq_popocup <- mock$rolling_quarters$popocup

  me <- data.table::data.table(
    anomesexato = months_seq,
    z_popocup = rq_popocup + rnorm(length(months_seq), 0, 0.1)
  )

  result <- compute_series_starting_points(
    me,
    mock$rolling_quarters,
    calibration_start = 201301L,
    calibration_end = 201912L,
    scale_factor = 1,
    use_series_specific_periods = FALSE,
    verbose = FALSE
  )

  # With scale_factor=1 and z ~ rq, y0 should be close to the base rq value
  # (y0 is the starting level, not zero; ~100 for base_rq=100)
  expect_true(all(is.finite(result$y0)))
  # The three y0 values per mesnotrim should be consistent with each other
  y0_vals <- result[series_name == "popocup", y0]
  expect_true(sd(y0_vals) < 5,
              label = "y0 values consistent across mesnotrim positions")
})

test_that("mathematical property: backprojection e0 averaged gives y0", {
  # The defining formula:
  #   e0 = z / scale_factor - cumsum_by_mesnotrim(rq)
  #   y0[pos] = mean(e0[mesnotrim == pos & in_calibration])
  #
  # We verify this by constructing data where we can compute e0 manually.

  set.seed(42)
  n <- 12L  # 12 months = 4 quarters
  months_seq <- generate_yyyymm_seq(201301L, n)
  mesnotrim_seq <- ((months_seq %% 100L - 1L) %% 3L) + 1L

  rq_vals <- rep(100, n)  # Constant rolling quarter
  z_vals <- rep(100000, n)  # Constant z values (scale 1000)
  scale_factor <- 1000

  rq <- data.table::data.table(
    anomesfinaltrimmovel = months_seq,
    mesnotrim = mesnotrim_seq,
    popocup = rq_vals
  )
  me <- data.table::data.table(
    anomesexato = months_seq,
    z_popocup = z_vals
  )

  result <- compute_series_starting_points(
    me, rq,
    calibration_start = 201301L,
    calibration_end = 201312L,
    scale_factor = scale_factor,
    use_series_specific_periods = FALSE,
    verbose = FALSE
  )

  # With constant rq, d3 = 3*(rq - lag(rq)) = 0, so cumsum = 0 for all periods
  # e0 = z/scale_factor - cum = 100000/1000 - 0 = 100
  # y0 should be 100 for all positions
  expect_equal(result[series_name == "popocup", y0],
               rep(100, 3),
               tolerance = 0.001)
})

test_that("compute_series_starting_points errors when no z_ columns present", {
  months_seq <- generate_yyyymm_seq(201301L, 12)
  mesnotrim_seq <- ((months_seq %% 100L - 1L) %% 3L) + 1L

  rq <- data.table::data.table(
    anomesfinaltrimmovel = months_seq,
    mesnotrim = mesnotrim_seq,
    popocup = rep(100, 12)
  )
  # monthly_estimates with NO z_ columns
  me <- data.table::data.table(
    anomesexato = months_seq,
    popocup = rep(100000, 12)
  )

  expect_error(
    compute_series_starting_points(me, rq, verbose = FALSE),
    "No z_ columns"
  )
})

test_that("compute_series_starting_points skips series not in rolling_quarters", {
  months_seq <- generate_yyyymm_seq(201301L, 12)
  mesnotrim_seq <- ((months_seq %% 100L - 1L) %% 3L) + 1L

  rq <- data.table::data.table(
    anomesfinaltrimmovel = months_seq,
    mesnotrim = mesnotrim_seq,
    popocup = rep(100, 12)
    # NOTE: no "popdesocup" column here
  )
  me <- data.table::data.table(
    anomesexato = months_seq,
    z_popocup = rep(100000, 12),
    z_popdesocup = rep(10000, 12)  # z_ column exists but series not in rq
  )

  result <- compute_series_starting_points(
    me, rq,
    calibration_start = 201301L,
    calibration_end = 201312L,
    scale_factor = 1000,
    use_series_specific_periods = FALSE,
    verbose = FALSE
  )

  # Only popocup should appear (popdesocup skipped because not in rq)
  expect_true("popocup" %in% result$series_name)
  expect_false("popdesocup" %in% result$series_name)
})

test_that("compute_series_starting_points handles anomesexato column name in monthly_estimates", {
  # The function renames "anomesexato" to "anomesfinaltrimmovel" internally
  months_seq <- generate_yyyymm_seq(201301L, 12)
  mesnotrim_seq <- ((months_seq %% 100L - 1L) %% 3L) + 1L

  rq <- data.table::data.table(
    anomesfinaltrimmovel = months_seq,
    mesnotrim = mesnotrim_seq,
    popocup = rep(100, 12)
  )
  me <- data.table::data.table(
    anomesexato = months_seq,  # Using anomesexato, not anomesfinaltrimmovel
    z_popocup = rep(100000, 12)
  )

  # Should not error
  result <- compute_series_starting_points(
    me, rq,
    calibration_start = 201301L,
    calibration_end = 201312L,
    scale_factor = 1000,
    verbose = FALSE
  )
  expect_s3_class(result, "data.table")
  expect_true(nrow(result) > 0)
})

test_that("non-finite y0 values are replaced with 0", {
  # If all z_ values are NA for a calibration period, e0 would be NaN
  # The function should replace non-finite y0 with 0
  months_seq <- generate_yyyymm_seq(201301L, 12)
  mesnotrim_seq <- ((months_seq %% 100L - 1L) %% 3L) + 1L

  rq <- data.table::data.table(
    anomesfinaltrimmovel = months_seq,
    mesnotrim = mesnotrim_seq,
    popocup = rep(100, 12)
  )

  # z_ values are all NA
  me <- data.table::data.table(
    anomesexato = months_seq,
    z_popocup = rep(NA_real_, 12)
  )

  result <- compute_series_starting_points(
    me, rq,
    calibration_start = 201301L,
    calibration_end = 201312L,
    scale_factor = 1000,
    use_series_specific_periods = FALSE,
    verbose = FALSE
  )

  # All y0 should be 0 (non-finite replaced)
  expect_true(all(result$y0 == 0),
              label = "Non-finite y0 replaced with 0")
})

# =============================================================================
# INTEGRATION: compute_z_aggregates() -> compute_series_starting_points()
# =============================================================================

test_that("z_aggregates output is compatible with compute_series_starting_points input", {
  # Build calibrated mock data -> compute z_aggregates -> feed into starting points
  dt <- create_mock_calibrated(n_per_month = 30L,
                                months = generate_yyyymm_seq(201301L, 12))
  z_agg <- compute_z_aggregates(dt, verbose = FALSE)

  # Verify z_aggregates has anomesexato column

  expect_true("anomesexato" %in% names(z_agg))

  # Create matching rolling quarter data with same series
  z_cols <- grep("^z_", names(z_agg), value = TRUE)
  series_names <- sub("^z_", "", z_cols)

  rq <- data.table::data.table(
    anomesfinaltrimmovel = z_agg$anomesexato,
    mesnotrim = ((z_agg$anomesexato %% 100L - 1L) %% 3L) + 1L
  )
  # Add matching series columns with plausible rolling quarter values
  for (s in series_names) {
    z_col <- paste0("z_", s)
    if (z_col %in% names(z_agg)) {
      rq[, (s) := z_agg[[z_col]] / 1000]
    }
  }

  # Should not error
  result <- compute_series_starting_points(
    z_agg, rq,
    calibration_start = 201301L,
    calibration_end = 201312L,
    scale_factor = 1000,
    use_series_specific_periods = FALSE,
    verbose = FALSE
  )

  expect_s3_class(result, "data.table")
  expect_true(nrow(result) > 0, label = "Starting points computed from z_aggregates")
  expect_true(all(is.finite(result$y0)),
              label = "All y0 from pipeline are finite")
})

test_that("compute_z_aggregates filters out zero and NA weights", {
  dt <- data.table::data.table(
    weight_monthly = c(1000, 0, NA_real_, 1000),
    ref_month_yyyymm = rep(202301L, 4),
    ref_month_in_quarter = rep(1L, 4),
    Ano = 2023L,
    Trimestre = 1L,
    V2009 = c(30L, 35L, 25L, 40L)
  )

  result <- compute_z_aggregates(dt, verbose = FALSE)

  # z_populacao should only count obs with valid weight (2 obs with weight=1000)
  expect_equal(result$z_populacao, 2 * 1000)
})

test_that("compute_z_aggregates errors when all weights are zero/NA", {
  dt <- data.table::data.table(
    weight_monthly = c(0, NA_real_, 0),
    ref_month_yyyymm = rep(202301L, 3),
    ref_month_in_quarter = rep(1L, 3),
    Ano = 2023L,
    Trimestre = 1L,
    V2009 = c(30L, 35L, 25L)
  )

  expect_error(
    compute_z_aggregates(dt, verbose = FALSE),
    "No observations with valid weight_monthly"
  )
})

test_that("compute_z_aggregates produces z_pop14mais only for age >= 14", {
  dt <- data.table::data.table(
    weight_monthly = rep(1000, 6),
    ref_month_yyyymm = rep(202301L, 6),
    ref_month_in_quarter = rep(1L, 6),
    Ano = 2023L,
    Trimestre = 1L,
    V2009 = c(5L, 10L, 13L, 14L, 30L, 65L)  # 3 under 14, 3 at/above 14
  )

  result <- compute_z_aggregates(dt, verbose = FALSE)

  # z_pop14mais: only ages 14, 30, 65 -> 3 obs * 1000 weight
  expect_equal(result$z_pop14mais, 3 * 1000)
  # z_populacao: all 6 obs
  expect_equal(result$z_populacao, 6 * 1000)
})