# Tests for Housing Dimension Preservation Feature
# Tests that TEN, TEN-YBL6, TEN-BLD, and TEN-HFL columns are preserved
# during cohort data aggregation

library(testthat)

# Global variables to avoid R CMD check notes
geoid <- income_bracket <- TEN <- `TEN-YBL6` <- `TEN-BLD` <- `TEN-HFL` <- NULL
UNITS <- `HINCP.UNITS` <- `ELEP.UNITS` <- `GASP.UNITS` <- `FULP.UNITS` <- NULL

# Helper: Create test data with housing dimensions
#
# Builds a data frame with FIP, AMI and the five *.UNITS / UNITS count
# columns, optionally followed by the four housing dimension columns
# (TEN, TEN-YBL6, TEN-BLD, TEN-HFL) as character vectors.
#
# Args:
#   n: Number of rows to generate.
#   include_housing_cols: If TRUE, append the four housing dimension columns.
#
# Returns:
#   A data.frame with `n` rows.
#
# Note: calls set.seed(42), so the same `n` always yields the same data and
# the session's RNG state is reset as a side effect.
create_housing_dimension_data <- function(n = 100,
                                          include_housing_cols = TRUE) {
  set.seed(42)

  # The order of the random draws below determines the generated values;
  # keep it stable so fixtures do not change between runs.
  data <- data.frame(
    FIP = rep(c("37001", "37003", "37005"), length.out = n),
    AMI = sample(c("very_low", "low_mod", "mid_high"), n, replace = TRUE),
    UNITS = rpois(n, 10),
    `HINCP.UNITS` = rpois(n, 500000),
    `ELEP.UNITS` = rpois(n, 1200),
    `GASP.UNITS` = rpois(n, 800),
    `FULP.UNITS` = rpois(n, 400),
    check.names = FALSE,
    stringsAsFactors = FALSE
  )

  if (include_housing_cols) {
    data$TEN <- sample(c("1", "2", "3", "4"), n, replace = TRUE)
    data$`TEN-YBL6` <- sample(
      c("1-1", "1-2", "2-1", "2-2"), n, replace = TRUE
    )
    data$`TEN-BLD` <- sample(
      c("1-A", "1-B", "2-A", "2-B"), n, replace = TRUE
    )
    data$`TEN-HFL` <- sample(
      c("1-H1", "1-H2", "2-H1", "2-H2"), n, replace = TRUE
    )
  }

  data
}

# ============================================================================
# TEST SUITE: Housing Dimension Column Preservation
# ============================================================================

test_that("aggregate_cohort_data preserves all housing dimension columns", {
  # Setup
  data <- create_housing_dimension_data(n = 200)

  # Execute - call internal function
  result <- emburden:::aggregate_cohort_data(
    data, "ami", "2022", verbose = FALSE
  )

  # Verify all housing columns are present
  expect_true("TEN" %in% names(result))
  expect_true("TEN-YBL6" %in% names(result))
  expect_true("TEN-BLD" %in% names(result))
  expect_true("TEN-HFL" %in% names(result))
})

test_that("aggregate_cohort_data groups by housing dimensions", {
  # Setup - create data with known pattern
  data <- data.frame(
    FIP = rep("37001", 4),
    AMI = rep("low_mod", 4),
    TEN = c("1", "1", "2", "2"),
    `TEN-YBL6` = c("1-1", "1-1", "2-1", "2-1"),
    `TEN-BLD` = c("1-A", "1-A", "2-A", "2-A"),
    `TEN-HFL` = c("1-H1", "1-H1", "2-H1", "2-H1"),
    UNITS = c(10, 15, 20, 25),
    `HINCP.UNITS` = c(100, 150, 200, 250),
    `ELEP.UNITS` = c(50, 75, 100, 125),
    `GASP.UNITS` = c(30, 45, 60, 75),
    `FULP.UNITS` = c(20, 30, 40, 50),
    check.names = FALSE,
    stringsAsFactors = FALSE
  )

  # Execute
  result <- emburden:::aggregate_cohort_data(
    data, "ami", "2022", verbose = FALSE
  )

  # Verify - should result in 2 rows (one for each TEN value)
  # because all other housing dimensions align with TEN
  expect_equal(nrow(result), 2)
  expect_equal(sum(result$UNITS), 70) # 10+15+20+25

  # Check that distinct housing dimension combinations are preserved
  expect_setequal(result$TEN, c("1", "2"))
})

test_that("housing dimensions create separate aggregation groups", {
  # Setup - same FIP and income, but different housing dimensions
  data <- data.frame(
    FIP = rep("37001", 3),
    AMI = rep("low_mod", 3),
    TEN = c("1", "1", "1"), # Same tenure
    `TEN-HFL` = c("1-H1", "1-H2", "1-H3"), # Different heating fuel
    UNITS = c(100, 200, 300),
    `HINCP.UNITS` = c(1000, 2000, 3000),
    `ELEP.UNITS` = c(500, 1000, 1500),
    check.names = FALSE,
    stringsAsFactors = FALSE
  )

  # Execute
  result <- emburden:::aggregate_cohort_data(
    data, "ami", "2022", verbose = FALSE
  )

  # Verify - should result in 3 rows (one per heating fuel)
  expect_equal(nrow(result), 3)
  expect_setequal(result$`TEN-HFL`, c("1-H1", "1-H2", "1-H3"))
})

test_that("aggregation sums correctly when grouped by housing dimensions", {
  # Setup
  data <- data.frame(
    FIP = rep("37001", 4),
    AMI = rep("very_low", 4),
    TEN = c("1", "1", "2", "2"),
    `TEN-HFL` = c("1-H1", "1-H1", "2-H1", "2-H1"),
    UNITS = c(10, 20, 30, 40),
    `HINCP.UNITS` = c(100, 200, 300, 400),
    `ELEP.UNITS` = c(50, 100, 150, 200),
    check.names = FALSE,
    stringsAsFactors = FALSE
  )

  # Execute
  result <- emburden:::aggregate_cohort_data(
    data, "ami", "2022", verbose = FALSE
  )

  # Verify - two groups: TEN=1 and TEN=2
  expect_equal(nrow(result), 2)

  ten1_row <- result[result$TEN == "1", ]
  expect_equal(ten1_row$UNITS, 30) # 10 + 20
  expect_equal(ten1_row$`HINCP.UNITS`, 300) # 100 + 200
  expect_equal(ten1_row$`ELEP.UNITS`, 150) # 50 + 100

  ten2_row <- result[result$TEN == "2", ]
  expect_equal(ten2_row$UNITS, 70) # 30 + 40
  expect_equal(ten2_row$`HINCP.UNITS`, 700) # 300 + 400
  expect_equal(ten2_row$`ELEP.UNITS`, 350) # 150 + 200
})

test_that("backward compatibility: works without housing dimension columns", {
  # Setup - data without housing dimensions
  data <- create_housing_dimension_data(n = 100, include_housing_cols = FALSE)

  # Execute
  result <- emburden:::aggregate_cohort_data(
    data, "ami", "2022", verbose = FALSE
  )

  # Verify - should still work, just without housing columns
  expect_false("TEN" %in% names(result))
  expect_false("TEN-YBL6" %in% names(result))
  expect_false("TEN-BLD" %in% names(result))
  expect_false("TEN-HFL" %in% names(result))

  # Should still have aggregation columns
  expect_true("UNITS" %in% names(result))
  expect_true("FIP" %in% names(result))
  expect_true("AMI" %in% names(result))
})

test_that("partial housing columns: some present, some missing", {
  # Setup - only TEN and TEN-HFL present
  data <- data.frame(
    FIP = rep("37001", 4),
    AMI = rep("low_mod", 4),
    TEN = c("1", "1", "2", "2"),
    `TEN-HFL` = c("1-H1", "1-H1", "2-H1", "2-H1"),
    # TEN-YBL6 and TEN-BLD intentionally missing
    UNITS = c(10, 20, 30, 40),
    `HINCP.UNITS` = c(100, 200, 300, 400),
    check.names = FALSE,
    stringsAsFactors = FALSE
  )

  # Execute
  result <- emburden:::aggregate_cohort_data(
    data, "ami", "2022", verbose = FALSE
  )

  # Verify - only present housing columns are preserved
  expect_true("TEN" %in% names(result))
  expect_true("TEN-HFL" %in% names(result))
  expect_false("TEN-YBL6" %in% names(result))
  expect_false("TEN-BLD" %in% names(result))
})

# ============================================================================
# TEST SUITE: Integration with load_cohort_data
# ============================================================================

test_that("housing columns survive full data loading pipeline (mock test)", {
  skip("Requires real data or comprehensive mocking")

  # This test would verify that housing columns are preserved through
  # the entire load_cohort_data -> aggregate_cohort_data pipeline.
  # Skipped because it requires either:
  # 1. Real LEAD data files
  # 2. Complex mocking of the entire loading pipeline

  # Future enhancement: Mock the database/CSV loading to return data
  # with housing dimensions, then verify they survive to the final output
})

# ============================================================================
# TEST SUITE: Data Types and Edge Cases
# ============================================================================

test_that("housing dimension columns maintain character type", {
  # Setup
  data <- create_housing_dimension_data(n = 50)

  # Execute
  result <- emburden:::aggregate_cohort_data(
    data, "ami", "2022", verbose = FALSE
  )

  # Verify column types
  expect_type(result$TEN, "character")
  expect_type(result$`TEN-YBL6`, "character")
  expect_type(result$`TEN-BLD`, "character")
  expect_type(result$`TEN-HFL`, "character")
})

test_that("NA values in housing dimensions are handled correctly", {
  # Setup - include some NA values
  data <- create_housing_dimension_data(n = 50)
  data$TEN[1:5] <- NA
  data$`TEN-HFL`[6:10] <- NA

  # Execute
  result <- emburden:::aggregate_cohort_data(
    data, "ami", "2022", verbose = FALSE
  )

  # Verify - NA groups should be created (dplyr groups NAs together)
  expect_true(any(is.na(result$TEN)))
  expect_true(any(is.na(result$`TEN-HFL`)))
})

test_that("works with FPL dataset (different income bracket column)", {
  # Setup - FPL data uses FPL150 instead of AMI
  # Seed the RNG so the randomly drawn fixture is reproducible, matching the
  # other randomised fixtures in this file.
  set.seed(2022)
  data <- data.frame(
    FIP = rep(c("37001", "37003"), each = 4),
    FPL150 = rep(c("0-100%", "100-150%"), 4),
    TEN = sample(c("1", "2"), 8, replace = TRUE),
    `TEN-HFL` = sample(c("1-H1", "2-H1"), 8, replace = TRUE),
    UNITS = rpois(8, 10),
    `HINCP.UNITS` = rpois(8, 50000),
    `ELEP.UNITS` = rpois(8, 1000),
    check.names = FALSE,
    stringsAsFactors = FALSE
  )

  # Execute
  result <- emburden:::aggregate_cohort_data(
    data, "fpl", "2022", verbose = FALSE
  )

  # Verify housing columns are preserved with FPL data too
  expect_true("TEN" %in% names(result))
  expect_true("TEN-HFL" %in% names(result))
  expect_true("FPL150" %in% names(result))
})

test_that("large number of housing dimension combinations", {
  # Setup - create data with many unique housing dimension combinations
  n <- 500
  set.seed(123)

  # paste0() recycles its arguments element-wise, so each housing dimension
  # below draws from max(length) distinct labels (6, 10 and 8 respectively),
  # not from the full cross product of prefix and suffix.
  data <- data.frame(
    FIP = sample(sprintf("37%03d", 1:20), n, replace = TRUE),
    AMI = sample(c("very_low", "low_mod", "mid_high"), n, replace = TRUE),
    TEN = sample(c("1", "2", "3", "4"), n, replace = TRUE),
    `TEN-YBL6` = sample(paste0(1:4, "-", 1:6), n, replace = TRUE),
    `TEN-BLD` = sample(paste0(1:4, "-", LETTERS[1:10]), n, replace = TRUE),
    `TEN-HFL` = sample(paste0(1:4, "-H", 1:8), n, replace = TRUE),
    UNITS = rpois(n, 5),
    `HINCP.UNITS` = rpois(n, 50000),
    `ELEP.UNITS` = rpois(n, 1000),
    check.names = FALSE,
    stringsAsFactors = FALSE
  )

  # Execute
  result <- emburden:::aggregate_cohort_data(
    data, "ami", "2022", verbose = FALSE
  )

  # Verify
  # Result should have fewer rows than input (aggregation happened)
  expect_lt(nrow(result), nrow(data))

  # But should preserve the variety of housing dimensions
  expect_gt(length(unique(result$TEN)), 1)
  expect_gt(length(unique(result$`TEN-HFL`)), 1)

  # All housing columns present
  expect_true(
    all(c("TEN", "TEN-YBL6", "TEN-BLD", "TEN-HFL") %in% names(result))
  )
})

# ============================================================================
# TEST SUITE: Verbose Output
# ============================================================================

test_that("verbose mode reports housing dimension preservation", {
  # Setup
  data <- create_housing_dimension_data(n = 50)

  # Execute with verbose = TRUE and capture messages
  expect_message(
    emburden:::aggregate_cohort_data(data, "ami", "2022", verbose = TRUE),
    "Preserving housing dimensions"
  )

  # Verify the message contains column names
  expect_message(
    emburden:::aggregate_cohort_data(data, "ami", "2022", verbose = TRUE),
    "TEN"
  )
})

test_that("verbose mode reports when housing dimensions are absent", {
  # Setup - no housing columns
  data <- create_housing_dimension_data(n = 50, include_housing_cols = FALSE)

  # Execute with verbose = TRUE
  expect_message(
    emburden:::aggregate_cohort_data(data, "ami", "2022", verbose = TRUE),
    "No housing dimension columns found"
  )
})