# Tests for experimental period identification strategies # Tests for pnadc_experimental_periods() - probabilistic and UPA aggregation strategies # ============================================================================= # PROBABILISTIC STRATEGY TESTS # ============================================================================= test_that("probabilistic strategy requires store_date_bounds=TRUE", { # 1. Setup: Create crosswalk WITHOUT date bounds set.seed(400) data <- create_realistic_pnadc(n_quarters = 4, n_upas = 15) crosswalk <- pnadc_identify_periods(data, verbose = FALSE, store_date_bounds = FALSE) # 2. Verify: Should error when date bounds not stored # Error can be about missing columns like date_min, date_max, or week columns expect_error( pnadc_experimental_periods( crosswalk, strategy = "probabilistic", verbose = FALSE ), label = "Probabilistic strategy requires date bounds" ) # 3. Context: Probabilistic strategy needs date ranges to calculate confidence }) test_that("probabilistic strategy improves month determination", { # 1. Setup: Create data with date bounds set.seed(401) data <- create_realistic_pnadc(n_quarters = 4, n_upas = 20) crosswalk <- pnadc_identify_periods(data, verbose = FALSE, store_date_bounds = TRUE) # Store strict determination rate strict_rate <- crosswalk[, mean(determined_month, na.rm = TRUE)] # 2. Execute: Apply probabilistic strategy result <- pnadc_experimental_periods( crosswalk, strategy = "probabilistic", confidence_threshold = 0.9, verbose = FALSE ) # Calculate experimental rate exp_rate <- result[, mean(determined_month, na.rm = TRUE)] # 3. Verify: Experimental rate should be >= strict rate expect_true(exp_rate >= strict_rate, label = paste0("Probabilistic should maintain or improve rate: ", "strict=", round(strict_rate, 3), ", exp=", round(exp_rate, 3))) # 4. Context: Probabilistic adds determinations, never removes strict ones }) test_that("confidence_threshold parameter affects determination", { # 1. Setup: Create data with store_date_bounds set.seed(402) data <- create_realistic_pnadc(n_quarters = 4, n_upas = 20) crosswalk <- pnadc_identify_periods(data, verbose = FALSE, store_date_bounds = TRUE) # 2. Execute: Test with different confidence thresholds result_08 <- pnadc_experimental_periods( crosswalk, strategy = "probabilistic", confidence_threshold = 0.8, verbose = FALSE ) result_09 <- pnadc_experimental_periods( crosswalk, strategy = "probabilistic", confidence_threshold = 0.9, verbose = FALSE ) result_095 <- pnadc_experimental_periods( crosswalk, strategy = "probabilistic", confidence_threshold = 0.95, verbose = FALSE ) # 3. Verify: Lower threshold should give >= determinations than higher threshold rate_08 <- result_08[, mean(determined_month, na.rm = TRUE)] rate_09 <- result_09[, mean(determined_month, na.rm = TRUE)] rate_095 <- result_095[, mean(determined_month, na.rm = TRUE)] expect_true(rate_08 >= rate_09 - 0.01, # Allow tiny tolerance label = paste0("threshold=0.8 (", round(rate_08, 3), ") should be >= threshold=0.9 (", round(rate_09, 3), ")")) expect_true(rate_09 >= rate_095 - 0.01, label = paste0("threshold=0.9 (", round(rate_09, 3), ") should be >= threshold=0.95 (", round(rate_095, 3), ")")) # 4. Context: More conservative thresholds assign fewer probabilistic months }) test_that("probabilistic strategy preserves strict determinations", { # 1. Setup: Create data set.seed(403) data <- create_realistic_pnadc(n_quarters = 4, n_upas = 15) crosswalk <- pnadc_identify_periods(data, verbose = FALSE, store_date_bounds = TRUE) # Identify strict determinations strict_determined <- crosswalk[determined_month == TRUE] # 2. Execute: Apply probabilistic result <- pnadc_experimental_periods( crosswalk, strategy = "probabilistic", confidence_threshold = 0.9, verbose = FALSE ) # 3. Verify: All strict determinations must be preserved result_determined <- result[determined_month == TRUE] # All strict IDs should still be determined in result # Note: Crosswalk is at household level; V2003 is not in the crosswalk output strict_ids <- paste(strict_determined$Ano, strict_determined$Trimestre, strict_determined$UPA, strict_determined$V1008, strict_determined$V1014, sep = "_") result_ids <- paste(result_determined$Ano, result_determined$Trimestre, result_determined$UPA, result_determined$V1008, result_determined$V1014, sep = "_") all_preserved <- all(strict_ids %in% result_ids) expect_true(all_preserved, label = "Probabilistic must preserve all strict determinations") # 4. Context: Experimental strategies only ADD, never remove determinations }) test_that("probabilistic assignment only for 2-period ranges", { # 1. Setup: Create data set.seed(404) data <- create_realistic_pnadc(n_quarters = 4, n_upas = 15) crosswalk <- pnadc_identify_periods(data, verbose = FALSE, store_date_bounds = TRUE) # 2. Execute: Apply probabilistic result <- pnadc_experimental_periods( crosswalk, strategy = "probabilistic", confidence_threshold = 0.8, verbose = FALSE ) # 3. Verify: Check the experimental assignment pattern # Probabilistic only assigns when there's ambiguity between 2 periods # This is verified indirectly - the function should not error expect_true(is.data.frame(result)) expect_true("determined_month" %in% names(result)) # 4. Context: Probabilistic works on 2-month ranges (implementation detail) # Single-month or 3-month ranges are not probabilistically assigned }) # ============================================================================= # UPA AGGREGATION STRATEGY TESTS # ============================================================================= test_that("UPA aggregation extends determinations via consensus", { # 1. Setup: Create data set.seed(500) data <- create_realistic_pnadc(n_quarters = 4, n_upas = 20) crosswalk <- pnadc_identify_periods(data, verbose = FALSE, store_date_bounds = TRUE) # Store strict rate strict_rate <- crosswalk[, mean(determined_month, na.rm = TRUE)] # 2. Execute: Apply UPA aggregation result <- pnadc_experimental_periods( crosswalk, strategy = "upa_aggregation", upa_proportion_threshold = 0.5, verbose = FALSE ) # Calculate rate after UPA aggregation upa_rate <- result[, mean(determined_month, na.rm = TRUE)] # 3. Verify: UPA aggregation should maintain or improve rate expect_true(upa_rate >= strict_rate, label = paste0("UPA aggregation maintains or improves: ", "strict=", round(strict_rate, 3), ", upa=", round(upa_rate, 3))) # 4. Context: UPA aggregation extends via UPA-level consensus }) test_that("upa_proportion_threshold affects determination", { # 1. Setup: Create data set.seed(501) data <- create_realistic_pnadc(n_quarters = 4, n_upas = 20) crosswalk <- pnadc_identify_periods(data, verbose = FALSE, store_date_bounds = TRUE) # 2. Execute: Test with different thresholds result_05 <- pnadc_experimental_periods( crosswalk, strategy = "upa_aggregation", upa_proportion_threshold = 0.5, verbose = FALSE ) result_08 <- pnadc_experimental_periods( crosswalk, strategy = "upa_aggregation", upa_proportion_threshold = 0.8, verbose = FALSE ) # 3. Verify: Lower threshold should give >= determinations rate_05 <- result_05[, mean(determined_month, na.rm = TRUE)] rate_08 <- result_08[, mean(determined_month, na.rm = TRUE)] expect_true(rate_05 >= rate_08 - 0.01, label = paste0("threshold=0.5 (", round(rate_05, 3), ") should be >= threshold=0.8 (", round(rate_08, 3), ")")) # 4. Context: Higher proportion threshold requires stronger UPA consensus }) # ============================================================================= # "BOTH" STRATEGY TESTS # ============================================================================= test_that("'both' strategy combines probabilistic and UPA aggregation", { # 1. Setup: Create data set.seed(600) data <- create_realistic_pnadc(n_quarters = 4, n_upas = 20) crosswalk <- pnadc_identify_periods(data, verbose = FALSE, store_date_bounds = TRUE) # 2. Execute: Apply each strategy separately and combined result_prob <- pnadc_experimental_periods( crosswalk, strategy = "probabilistic", confidence_threshold = 0.9, verbose = FALSE ) result_upa <- pnadc_experimental_periods( crosswalk, strategy = "upa_aggregation", upa_proportion_threshold = 0.5, verbose = FALSE ) result_both <- pnadc_experimental_periods( crosswalk, strategy = "both", confidence_threshold = 0.9, upa_proportion_threshold = 0.5, verbose = FALSE ) # 3. Verify: "both" should be >= max(individual strategies) rate_prob <- result_prob[, mean(determined_month, na.rm = TRUE)] rate_upa <- result_upa[, mean(determined_month, na.rm = TRUE)] rate_both <- result_both[, mean(determined_month, na.rm = TRUE)] max_individual <- max(rate_prob, rate_upa) expect_true(rate_both >= max_individual - 0.01, # Allow tiny tolerance label = paste0("'both' (", round(rate_both, 3), ") should be >= max(individual) (", round(max_individual, 3), ")")) # 4. Context: Combined strategy should identify at least as many as either alone }) test_that("'both' strategy preserves strict determinations", { # 1. Setup: Create data set.seed(601) data <- create_realistic_pnadc(n_quarters = 4, n_upas = 15) crosswalk <- pnadc_identify_periods(data, verbose = FALSE, store_date_bounds = TRUE) # Count strict determinations n_strict <- crosswalk[, sum(determined_month, na.rm = TRUE)] # 2. Execute: Apply "both" strategy result <- pnadc_experimental_periods( crosswalk, strategy = "both", verbose = FALSE ) # Count experimental determinations n_exp <- result[, sum(determined_month, na.rm = TRUE)] # 3. Verify: Should have at least as many determinations as strict expect_true(n_exp >= n_strict, label = "Combined strategy must preserve strict determinations") # 4. Context: All experimental strategies are additive only }) # ============================================================================= # NESTING PRESERVATION TESTS # ============================================================================= test_that("experimental strategies maintain nesting (week requires fortnight)", { # 1. Setup: Create data set.seed(700) data <- create_realistic_pnadc(n_quarters = 4, n_upas = 15) crosswalk <- pnadc_identify_periods(data, verbose = FALSE, store_date_bounds = TRUE) # 2. Execute: Apply experimental strategy result <- pnadc_experimental_periods( crosswalk, strategy = "both", verbose = FALSE ) # 3. Verify: Nesting violation check # All week-determined must have fortnight-determined violation <- result[determined_week == TRUE & (is.na(determined_fortnight) | determined_fortnight == FALSE)] expect_equal(nrow(violation), 0, label = "Week determination requires fortnight determination (nesting)") # 4. Context: Nesting is a fundamental invariant }) test_that("experimental strategies maintain nesting (fortnight requires month)", { # 1. Setup: Create data set.seed(701) data <- create_realistic_pnadc(n_quarters = 4, n_upas = 15) crosswalk <- pnadc_identify_periods(data, verbose = FALSE, store_date_bounds = TRUE) # 2. Execute: Apply experimental strategy result <- pnadc_experimental_periods( crosswalk, strategy = "both", verbose = FALSE ) # 3. Verify: Nesting violation check # All fortnight-determined must have month-determined violation <- result[determined_fortnight == TRUE & (is.na(determined_month) | determined_month == FALSE)] expect_equal(nrow(violation), 0, label = "Fortnight determination requires month determination (nesting)") }) # ============================================================================= # EDGE CASES # ============================================================================= test_that("experimental strategies handle small data sets", { # 1. Setup: Create data with enough observations for experimental strategies set.seed(800) data <- create_realistic_pnadc(n_quarters = 2, n_upas = 10) crosswalk <- pnadc_identify_periods(data, verbose = FALSE, store_date_bounds = TRUE) # 2. Execute: Should not error with reasonable data size expect_no_error({ result <- pnadc_experimental_periods( crosswalk, strategy = "both", verbose = FALSE ) }) # 3. Context: Implementation should handle various data sizes gracefully }) test_that("experimental strategies work with multi-quarter data", { # 1. Setup: Use multi-quarter data for experimental strategies # Single quarter may not have enough variation for experimental strategies set.seed(801) data <- create_realistic_pnadc(n_quarters = 3, n_upas = 15) crosswalk <- pnadc_identify_periods(data, verbose = FALSE, store_date_bounds = TRUE) # 2. Execute: Should work with sufficient data expect_no_error({ result <- pnadc_experimental_periods( crosswalk, strategy = "both", verbose = FALSE ) }) # 3. Verify: Basic structure maintained expect_true(is.data.frame(result)) expect_equal(nrow(result), nrow(crosswalk)) # 4. Context: Experimental strategies need sufficient data variation })