test_that("raw_to_lead processes 2018 data correctly", {
  # Two-row fixture using the raw column names handed to raw_to_lead() for the
  # "2018" vintage. BLD values are space-separated tokens; only the first row
  # carries the DETACHED keyword.
  raw_input <- data.frame(
    FIP = c("37183020100", "37051003400"),
    ABV = rep("NC", 2),
    TEN = c("OWNER", "RENTER"),
    YBL6 = c("2000-2009", "1990-1999"),
    BLD = c("1 1 DETACHED", "2 4"),
    HFL = c("Natural gas", "Electricity"),
    AMI68 = c("0-30% AMI", "30-50% AMI"),
    UNITS = c(100, 150),
    HINCP = c(25000, 35000),
    ELEP = c(1200, 1500),
    GASP = c(800, 0),
    FULP = c(200, 100)
  )

  lead <- raw_to_lead(raw_input, "2018")
  output_cols <- names(lead)

  # Every standardized column name must appear in the output
  standardized <- c("geoid", "state_abbr", "income_bracket", "households")
  expect_length(setdiff(standardized, output_cols), 0)

  # The first geoid is expected back unchanged, as an 11-character string
  first_geoid <- lead$geoid[1]
  expect_equal(nchar(first_geoid), 11)
  expect_equal(first_geoid, "37183020100")

  # min_units and detached columns must exist; detached is expected to be 1
  # for the row whose BLD contains DETACHED and 0 for the row without it
  expect_length(setdiff(c("min_units", "detached"), output_cols), 0)
  expect_equal(lead$detached[1:2], c(1, 0))

  # Numeric values are expected to carry over under their new names
  expect_equal(lead$households, c(100, 150))
  expect_equal(lead$income, c(25000, 35000))
})

test_that("raw_to_lead handles short geoids with padding", {
  # FIP values are deliberately shorter than 11 characters: the first has 10
  # digits and the second has 9.
  short_fips <- c("1234567890", "123456789")

  raw_input <- data.frame(
    FIP = short_fips,
    ABV = rep("NC", 2),
    TEN = rep("OWNER", 2),
    YBL6 = rep("2000-2009", 2),
    BLD = rep("1 1 DETACHED", 2),
    HFL = c("Natural gas", "Electricity"),
    AMI68 = c("0-30% AMI", "30-50% AMI"),
    UNITS = c(100, 150),
    HINCP = c(25000, 35000),
    ELEP = c(1200, 1500),
    GASP = c(800, 0),
    FULP = c(200, 100)
  )

  padded <- raw_to_lead(raw_input, "2018")

  # Both geoids are expected back left-padded with zeros to 11 characters
  expect_equal(padded$geoid[1:2], c("01234567890", "00123456789"))
})

test_that("raw_to_lead rejects 2016 vintage", {
  # The fixture holds only a FIP column; the call is expected to raise an
  # error whose message matches the pattern below.
  minimal_input <- data.frame(FIP = "37183020100")
  unsupported_pattern <- "2016 vintage processing not fully implemented"

  expect_error(raw_to_lead(minimal_input, "2016"), regexp = unsupported_pattern)
})

test_that("lead_to_poverty creates binary poverty indicator for FPL", {
  # Four rows sharing one geoid and identical housing attributes; they differ
  # only in income bracket and in the numeric columns.
  n_rows <- 4
  cohort <- data.frame(
    geoid = rep("37183020100", n_rows),
    primary_heating_fuel = rep("Natural gas", n_rows),
    income_bracket = c("0-100%", "100-150%", "150-200%", "200%+"),
    households = c(50, 75, 100, 125),
    income = c(15000, 30000, 45000, 75000),
    electricity_spend = c(1200, 1400, 1600, 1800),
    gas_spend = c(800, 900, 1000, 1100),
    other_spend = c(100, 150, 200, 250),
    min_units = rep(1, n_rows),
    detached = rep(1, n_rows),
    housing_tenure = rep("OWNER", n_rows),
    year_constructed = rep("2000-2009", n_rows),
    building_type = rep("1 1 DETACHED", n_rows)
  )

  binary <- lead_to_poverty(cohort, "fpl")

  # income_bracket must still exist and must now be a factor
  expect_length(setdiff("income_bracket", names(binary)), 0)
  expect_s3_class(binary$income_bracket, "factor")

  # Both FPL poverty labels must be among the factor levels
  fpl_labels <- c("Below Federal Poverty Line", "Above Federal Poverty Line")
  expect_length(setdiff(fpl_labels, levels(binary$income_bracket)), 0)

  # The output must not have more rows than the input.
  # NOTE(review): `<=` also holds when no rows are collapsed, so this only
  # rules out row growth rather than proving that aggregation happened.
  expect_lte(nrow(binary), nrow(cohort))
})

test_that("lead_to_poverty creates binary poverty indicator for AMI", {
  # Three rows sharing one geoid, one per AMI-style income bracket label
  n_rows <- 3
  cohort <- data.frame(
    geoid = rep("37183020100", n_rows),
    primary_heating_fuel = rep("Natural gas", n_rows),
    income_bracket = c("very_low", "low_mod", "mid_high"),
    households = c(50, 75, 100),
    income = c(15000, 35000, 60000),
    electricity_spend = c(1200, 1400, 1600),
    gas_spend = c(800, 900, 1000),
    other_spend = c(100, 150, 200),
    min_units = rep(1, n_rows),
    detached = rep(1, n_rows),
    housing_tenure = rep("OWNER", n_rows),
    year_constructed = rep("2000-2009", n_rows),
    building_type = rep("1 1 DETACHED", n_rows)
  )

  binary <- lead_to_poverty(cohort, "ami")

  # Both AMI poverty labels must be among the levels of income_bracket
  ami_labels <- c("Below AMI Poverty Line", "Above AMI Poverty Line")
  expect_length(setdiff(ami_labels, levels(binary$income_bracket)), 0)
})

test_that("lead_to_poverty consolidates housing tenure", {
  # One OWNER row and one RENTER row; all other housing attributes match
  n_rows <- 2
  cohort <- data.frame(
    geoid = rep("37183020100", n_rows),
    primary_heating_fuel = rep("Natural gas", n_rows),
    income_bracket = c("0-100%", "100-150%"),
    housing_tenure = c("OWNER", "RENTER"),
    households = c(50, 75),
    income = c(15000, 30000),
    electricity_spend = c(1200, 1400),
    gas_spend = c(800, 900),
    other_spend = c(100, 150),
    min_units = rep(1, n_rows),
    detached = rep(1, n_rows),
    year_constructed = rep("2000-2009", n_rows),
    building_type = rep("1 1 DETACHED", n_rows)
  )

  recoded <- lead_to_poverty(cohort, "fpl")
  tenure <- recoded$housing_tenure

  # Tenure must come back as a factor holding no value other than the two
  # consolidated labels
  expect_s3_class(tenure, "factor")
  allowed_tenure <- c("owned", "rented")
  expect_false(any(!(tenure %in% allowed_tenure)))
})

test_that("lead_to_poverty creates number_of_units category", {
  # The two rows differ in min_units (1 vs 5) and in the detached flag (1 vs 0)
  n_rows <- 2
  cohort <- data.frame(
    geoid = rep("37183020100", n_rows),
    primary_heating_fuel = rep("Natural gas", n_rows),
    income_bracket = c("0-100%", "100-150%"),
    housing_tenure = rep("OWNER", n_rows),
    min_units = c(1, 5),
    detached = c(1, 0),
    households = c(50, 75),
    income = c(15000, 30000),
    electricity_spend = c(1200, 1400),
    gas_spend = c(800, 900),
    other_spend = c(100, 150),
    year_constructed = rep("2000-2009", n_rows),
    building_type = rep("1 1 DETACHED", n_rows)
  )

  categorized <- lead_to_poverty(cohort, "fpl")

  # number_of_units must exist, be a factor, and hold only the two category
  # labels listed below
  expect_length(setdiff("number_of_units", names(categorized)), 0)
  unit_category <- categorized$number_of_units
  expect_s3_class(unit_category, "factor")
  allowed_categories <- c("single-family", "multi-family")
  expect_false(any(!(unit_category %in% allowed_categories)))
})

test_that("process_lead_cohort_data calculates energy_burden correctly", {
  # Spend and income vectors are named up front so the expected values below
  # are derived from exactly the numbers placed in the fixture.
  electricity <- c(1200, 1500)
  gas <- c(800, 0)
  other_fuel <- c(200, 100)
  household_income <- c(50000, 40000)

  raw_input <- data.frame(
    FIP = c("37183020100", "37051003400"),
    ABV = rep("NC", 2),
    TEN = c("OWNER", "RENTER"),
    YBL6 = c("2000-2009", "1990-1999"),
    BLD = c("1 1 DETACHED", "2 4"),
    HFL = c("Natural gas", "Electricity"),
    AMI68 = c("0-30% AMI", "30-50% AMI"),
    UNITS = c(100, 150),
    HINCP = household_income,
    ELEP = electricity,
    GASP = gas,
    FULP = other_fuel
  )

  processed <- process_lead_cohort_data(
    raw_input, "ami", "2018",
    aggregate_poverty = FALSE
  )
  output_cols <- names(processed)

  # energy_cost is expected to be the sum of the three spend columns
  # (2200 for the first row, 1600 for the second)
  expected_cost <- electricity + gas + other_fuel
  expect_length(setdiff("energy_cost", output_cols), 0)
  expect_equal(processed$energy_cost[1:2], expected_cost)

  # energy_burden is expected to be that cost divided by income
  expect_length(setdiff("energy_burden", output_cols), 0)
  expect_equal(processed$energy_burden[1:2], expected_cost / household_income)
})

test_that("process_lead_cohort_data filters zero-energy records", {
  # The middle row has zero electricity, gas, and other-fuel spend; the test
  # expects that row to be absent from the output.
  zero_energy_fip <- "37051003400"

  raw_input <- data.frame(
    FIP = c("37183020100", zero_energy_fip, "37119000100"),
    ABV = rep("NC", 3),
    TEN = c("OWNER", "RENTER", "OWNER"),
    YBL6 = c("2000-2009", "1990-1999", "2010-2019"),
    BLD = rep("1 1 DETACHED", 3),
    HFL = rep("Natural gas", 3),
    AMI68 = rep("0-30% AMI", 3),
    UNITS = c(100, 150, 200),
    HINCP = c(50000, 40000, 60000),
    ELEP = c(1200, 0, 1800),
    GASP = c(800, 0, 900),
    FULP = c(200, 0, 0)
  )

  kept <- process_lead_cohort_data(
    raw_input, "ami", "2018",
    aggregate_poverty = FALSE
  )

  # Two rows must remain, and the zero-energy geoid must not be among them
  expect_equal(nrow(kept), 2)
  expect_true(is.na(match(zero_energy_fip, kept$geoid)))
})

test_that("process_lead_cohort_data handles aggregate_poverty = TRUE", {
  # Four rows for one geoid with identical housing attributes, one per FPL15
  # bracket
  n_rows <- 4
  raw_input <- data.frame(
    FIP = rep("37183020100", n_rows),
    ABV = rep("NC", n_rows),
    TEN = rep("OWNER", n_rows),
    YBL6 = rep("2000-2009", n_rows),
    BLD = rep("1 1 DETACHED", n_rows),
    HFL = rep("Natural gas", n_rows),
    FPL15 = c("0-100%", "100-150%", "150-200%", "200%+"),
    UNITS = c(50, 75, 100, 125),
    HINCP = c(15000, 30000, 45000, 75000),
    ELEP = c(1200, 1400, 1600, 1800),
    GASP = c(800, 900, 1000, 1100),
    FULP = c(100, 150, 200, 250)
  )

  aggregated <- process_lead_cohort_data(
    raw_input, "fpl", "2018",
    aggregate_poverty = TRUE
  )

  # Output must have fewer than four rows and still expose income_bracket
  expect_lt(nrow(aggregated), 4)
  expect_length(setdiff("income_bracket", names(aggregated)), 0)

  # At least one distinct bracket label must mention "Poverty Line"
  bracket_labels <- unique(as.character(aggregated$income_bracket))
  mentions_poverty <- grepl("Poverty Line", bracket_labels)
  expect_true(any(mentions_poverty))
})

test_that("process_lead_cohort_data handles NA income gracefully", {
  # The first row has a missing HINCP; the second row has a valid income
  raw_input <- data.frame(
    FIP = c("37183020100", "37051003400"),
    ABV = rep("NC", 2),
    TEN = c("OWNER", "RENTER"),
    YBL6 = c("2000-2009", "1990-1999"),
    BLD = rep("1 1 DETACHED", 2),
    HFL = rep("Natural gas", 2),
    AMI68 = rep("0-30% AMI", 2),
    UNITS = c(100, 150),
    HINCP = c(NA, 40000),
    ELEP = c(1200, 1500),
    GASP = c(800, 0),
    FULP = c(200, 100)
  )

  processed <- process_lead_cohort_data(
    raw_input, "ami", "2018",
    aggregate_poverty = FALSE
  )
  burden <- processed$energy_burden

  # The first burden value is expected to be NA and the second non-missing
  expect_true(is.na(burden[1]))
  expect_false(is.na(burden[2]))
})