expect_equal_saved_prep <- function( metadata, is_aggregated = TRUE, time_freq = NULL, file = NULL ) { workflow <- setup_test_workflow( metadata = metadata, is_aggregated = is_aggregated, time_freq = time_freq, link_geo = NULL, link = FALSE ) saved <- paste0( "snapshots/data_processing/", file ) %>% testthat::test_path() %>% read_saved_csv() expect_equal( workflow$preprocessed_data(), saved, tolerance = 0.01, ignore_attr = TRUE ) } test_that("prepprocess is consistent", { skip_on_cran() set.seed(123) # individual-level COVID data expect_equal_saved_prep( metadata = list( is_timevar = TRUE, special_case = "covid", family = "binomial" ), is_aggregated = FALSE, time_freq = "week", file = "covid_binomial_indiv.csv" ) # aggregated COVID data expect_equal_saved_prep( metadata = list( is_timevar = TRUE, special_case = "covid", family = "binomial" ), is_aggregated = TRUE, time_freq = NULL, file = "covid_binomial_agg.csv" ) # individual-level general time-varying data # with binary outcome expect_equal_saved_prep( metadata = list( is_timevar = TRUE, special_case = NULL, family = "binomial" ), is_aggregated = FALSE, time_freq = "week", file = "timevar_binomial_indiv.csv" ) # aggregated general time-varying data # with binary outcome expect_equal_saved_prep( metadata = list( is_timevar = TRUE, special_case = NULL, family = "binomial" ), is_aggregated = TRUE, time_freq = NULL, file = "timevar_binomial_agg.csv" ) # individual-level general time-varying data # with continuous outcome expect_equal_saved_prep( metadata = list( is_timevar = TRUE, special_case = NULL, family = "normal" ), is_aggregated = FALSE, time_freq = "week", file = "timevar_normal_indiv.csv" ) # individual-level polling data expect_equal_saved_prep( metadata = list( is_timevar = FALSE, special_case = "poll", family = "binomial" ), is_aggregated = FALSE, time_freq = NULL, file = "poll_binomial_indiv.csv" ) # aggregated polling data expect_equal_saved_prep( metadata = list( is_timevar = FALSE, special_case = "poll", family = "binomial" ), is_aggregated = TRUE, time_freq = NULL, file = "poll_binomial_agg.csv" ) # individual-level general cross-sectional data # with binary outcome expect_equal_saved_prep( metadata = list( is_timevar = FALSE, special_case = NULL, family = "binomial" ), is_aggregated = FALSE, time_freq = NULL, file = "crosssec_binomial_indiv.csv" ) # aggregated general cross-sectional data # with binary outcome expect_equal_saved_prep( metadata = list( is_timevar = FALSE, special_case = NULL, family = "binomial" ), is_aggregated = TRUE, time_freq = NULL, file = "crosssec_binomial_agg.csv" ) # individual-level general cross-sectional data # with continuous outcome expect_equal_saved_prep( metadata = list( is_timevar = FALSE, special_case = NULL, family = "normal" ), is_aggregated = FALSE, time_freq = NULL, file = "crosssec_normal_indiv.csv" ) }) test_that("link_acs works with all linking geographies", { skip_on_cran() # No linking geography expect_no_error( setup_test_workflow( metadata = list( is_timevar = FALSE, special_case = NULL, family = "binomial" ), is_aggregated = TRUE, time_freq = NULL, link_geo = NULL, link = TRUE ) ) # Linking through zip expect_no_error( setup_test_workflow( metadata = list( is_timevar = FALSE, special_case = NULL, family = "binomial" ), is_aggregated = TRUE, time_freq = NULL, link_geo = "zip", link = TRUE ) ) # Linking through county expect_no_error( setup_test_workflow( metadata = list( is_timevar = FALSE, special_case = NULL, family = "binomial" ), is_aggregated = TRUE, time_freq = NULL, link_geo = "county", link = TRUE ) ) # Linking through state expect_no_error( setup_test_workflow( metadata = list( is_timevar = FALSE, special_case = NULL, family = "binomial" ), is_aggregated = TRUE, time_freq = NULL, link_geo = "state", link = TRUE ) ) }) test_that("load_pstrat works", { skip_on_cran() pstrat_data <- example_pstrat_data() # For general time-varying data # with binary outcome workflow <- setup_test_workflow( metadata = list( is_timevar = TRUE, special_case = NULL, family = "binomial" ), is_aggregated = TRUE, time_freq = NULL, link = FALSE ) capture.output({ workflow$load_pstrat(pstrat_data) }, type = "message") expect_no_error(workflow$demo_bars("sex")) # For general time-varying data # with continuous outcome workflow <- setup_test_workflow( metadata = list( is_timevar = TRUE, special_case = NULL, family = "normal" ), is_aggregated = FALSE, time_freq = "week", link = FALSE ) capture.output({ workflow$load_pstrat(pstrat_data) }, type = "message") expect_no_error(workflow$demo_bars("sex")) # For general cross-sectional data # with binary outcome workflow <- setup_test_workflow( metadata = list( is_timevar = FALSE, special_case = NULL, family = "binomial" ), is_aggregated = TRUE, time_freq = NULL, link = FALSE ) capture.output({ workflow$load_pstrat(pstrat_data) }, type = "message") expect_no_error(workflow$demo_bars("sex")) # For general cross-sectional data # with continuous outcome workflow <- setup_test_workflow( metadata = list( is_timevar = FALSE, special_case = NULL, family = "normal" ), is_aggregated = FALSE, time_freq = NULL, link = FALSE ) capture.output({ workflow$load_pstrat(pstrat_data) }, type = "message") expect_no_error(workflow$demo_bars("sex")) # For COVID data workflow <- setup_test_workflow( metadata = list( is_timevar = TRUE, special_case = "covid", family = "binomial" ), is_aggregated = TRUE, time_freq = NULL, link = FALSE ) expect_error( workflow$load_pstrat(pstrat_data), "Custom poststratification data is not supported for special cases" ) # For polling data workflow <- setup_test_workflow( metadata = list( is_timevar = FALSE, special_case = "poll", family = "binomial" ), link = FALSE ) expect_error( workflow$load_pstrat(pstrat_data), "Custom poststratification data is not supported for special cases" ) }) test_that(".impute is consistent", { skip_on_cran() set.seed(123) n <- 20 cols <- c("sex", "race", "age") data <- example_sample_data( is_timevar = FALSE, is_aggregated = FALSE, special_case = NULL, family = "binomial" ) %>% mutate( across(all_of(cols), ~ replace(., row_number() <= n, NA)) ) workflow <- mrp_workflow() capture.output( workflow$preprocess( data, is_timevar = FALSE, is_aggregated = FALSE, special_case = NULL, family = "binomial" ) , type = "message") saved <- testthat::test_path("snapshots/data_processing/impute.csv") %>% read_saved_csv() expect_equal( workflow$preprocessed_data(), saved ) }) test_that(".data_type defaults classify common cases correctly", { # binary expect_equal(.data_type(c(TRUE, FALSE, NA)), "bin") expect_equal(.data_type(c(0, 1), num = TRUE), 1) # categorical (character / factor) expect_equal(.data_type(c("a", "b", "c", NA)), "cat") expect_equal(.data_type(factor(c("x", "y", "z")), num = TRUE), 2) # integer-like with few distinct values → categorical expect_equal(.data_type(c(1,1,2,2,3,3,NA)), "cat") # numeric with any decimal → continuous expect_equal(.data_type(c(1.0, 2.5, 3.0)), "cont") expect_equal(.data_type(c(1, 2.2), num = TRUE), 3) # even with only 2 values # integer-like with many distinct values → continuous expect_equal(.data_type(1:100), "cont") # dates/times → continuous expect_equal(.data_type(as.Date("2024-01-01") + 0:5), "cont") expect_equal(.data_type(as.POSIXct("2024-01-01 00:00:00", tz = "UTC") + 0:5), "cont") # empty or all-NA → categorical expect_error(.data_type(c(NA, NA)), "Column does not contain any non-NA values.") expect_error(.data_type(logical()), "Column does not contain any non-NA values.") })