# Helper functions for creating test fixtures # These functions generate sample data for testing without requiring # external data sources or API calls #' Create sample LEAD data for testing #' #' @param n Number of rows to generate #' @param seed Random seed for reproducibility #' @param dataset Either "ami" or "fpl" #' @param vintage Either "2018" or "2022" #' @return A data.frame with sample LEAD data create_sample_lead_data <- function(n = 100, seed = 123, dataset = "ami", vintage = "2022") { set.seed(seed) # Generate realistic income brackets based on dataset if (dataset == "ami") { income_brackets <- c("0-30%", "30-60%", "60-80%", "80-100%", "100%+") # Income ranges by bracket (rough estimates) income_ranges <- list( "0-30%" = c(0, 25000), "30-60%" = c(25000, 50000), "60-80%" = c(50000, 65000), "80-100%" = c(65000, 85000), "100%+" = c(85000, 200000) ) } else { # fpl income_brackets <- c("0-100%", "100-150%", "150-200%", "200%+") income_ranges <- list( "0-100%" = c(0, 27000), "100-150%" = c(27000, 40000), "150-200%" = c(40000, 54000), "200%+" = c(54000, 150000) ) } # Generate data data <- data.frame( geoid = sprintf("37%03d%06d", sample(1:100, n, replace = TRUE), sample(1:999999, n, replace = TRUE)), state_abbr = "NC", county_name = sample(c("Wake", "Mecklenburg", "Durham", "Guilford", "Forsyth"), n, replace = TRUE), income_bracket = sample(income_brackets, n, replace = TRUE), stringsAsFactors = FALSE ) # Generate income based on bracket data$income <- mapply(function(bracket) { range <- income_ranges[[bracket]] runif(1, range[1], range[2]) }, data$income_bracket) # Energy costs correlate weakly with income (but not perfectly) data$energy_cost <- pmax(800, data$income * runif(n, 0.02, 0.08) + rnorm(n, 0, 300)) # Other fields data$electricity_spend <- data$energy_cost * runif(n, 0.5, 0.7) data$gas_spend <- data$energy_cost * runif(n, 0.2, 0.4) data$other_spend <- data$energy_cost - data$electricity_spend - data$gas_spend data$households <- pmax(1, round(rnorm(n, 100, 50))) data$housing_tenure <- sample(c("OWNER", "RENTER"), n, replace = TRUE, prob = c(0.65, 0.35)) data$primary_heating_fuel <- sample( c("Electricity", "Natural gas", "Fuel oil", "Propane"), n, replace = TRUE, prob = c(0.5, 0.3, 0.1, 0.1) ) data$building_type <- ifelse( sample(c(TRUE, FALSE), n, replace = TRUE, prob = c(0.7, 0.3)), "Single-Family", "Multi-Family" ) # Calculate derived metrics data$net_income <- data$income - data$energy_cost data$ner <- data$net_income / data$energy_cost data$energy_burden <- data$energy_cost / data$income # Add vintage identifier data$vintage <- vintage return(data) } #' Create corrupted test data with all-NA income_bracket #' #' This replicates the bug reported in issue #15 create_corrupted_fpl_data <- function(n = 100, seed = 456) { data <- create_sample_lead_data(n, seed, dataset = "fpl") data$income_bracket <- NA_character_ return(data) } #' Create incomplete schema data (missing required columns) create_incomplete_schema_data <- function(n = 100, seed = 789) { data <- create_sample_lead_data(n, seed) # Remove critical column data$income <- NULL return(data) } #' Create test data with edge cases create_edge_case_data <- function() { data.frame( geoid = c("37001000001", "37001000002", "37001000003", "37001000004"), income_bracket = c("0-30%", "0-30%", "100%+", "0-30%"), income = c(0, 5000, 150000, -1000), # Zero, low, high, negative energy_cost = c(1000, 0, 3000, 2000), # Normal, zero, high, normal households = c(100, 200, 50, 0), # Normal, normal, low, zero (invalid) housing_tenure = c("OWNER", "RENTER", "OWNER", NA), stringsAsFactors = FALSE ) } #' Create a temporary test cache directory #' #' @return Path to temporary directory create_test_cache <- function() { cache_dir <- file.path(tempdir(), "emburden_test_cache", paste0("test_", Sys.getpid())) dir.create(cache_dir, recursive = TRUE, showWarnings = FALSE) return(cache_dir) } #' Write sample data to CSV file #' #' @param data Data frame to write #' @param filename Filename (will be placed in temp directory) #' @return Path to created file write_test_csv <- function(data, filename) { filepath <- file.path(tempdir(), filename) write.csv(data, filepath, row.names = FALSE) return(filepath) } #' Create sample database for testing #' #' @return Path to created SQLite database create_test_database <- function() { if (!requireNamespace("DBI", quietly = TRUE) || !requireNamespace("RSQLite", quietly = TRUE)) { skip("DBI and RSQLite required for database tests") } db_path <- file.path(tempdir(), paste0("test_emburden_", Sys.getpid(), ".db")) con <- DBI::dbConnect(RSQLite::SQLite(), db_path) # Create tables for different datasets/vintages DBI::dbWriteTable(con, "ami_2022", create_sample_lead_data(500, seed = 1, dataset = "ami", vintage = "2022")) DBI::dbWriteTable(con, "ami_2018", create_sample_lead_data(500, seed = 2, dataset = "ami", vintage = "2018")) DBI::dbWriteTable(con, "fpl_2022", create_sample_lead_data(500, seed = 3, dataset = "fpl", vintage = "2022")) DBI::dbWriteTable(con, "fpl_2018", create_sample_lead_data(500, seed = 4, dataset = "fpl", vintage = "2018")) DBI::dbDisconnect(con) return(db_path) } #' Clean up test files and directories #' #' @param paths Character vector of paths to remove cleanup_test_files <- function(paths) { for (path in paths) { if (file.exists(path)) { if (dir.exists(path)) { unlink(path, recursive = TRUE) } else { file.remove(path) } } } } #' Skip test if offline (no internet connection) skip_if_offline <- function() { # Simple internet connectivity check without external dependencies has_internet <- tryCatch({ con <- url("http://www.google.com", open = "rb", timeout = 2) close(con) TRUE }, error = function(e) { FALSE }) if (!has_internet) { skip("No internet connection available") } } #' Skip test if specific package not available skip_if_not_installed <- function(pkg) { if (!requireNamespace(pkg, quietly = TRUE)) { skip(paste0("Package '", pkg, "' not available")) } }