set.seed(1234) data("occup", package = "cat2cat") data("trans", package = "cat2cat") occup_2006 <- occup[occup$year == 2006, ] occup_2008 <- occup[occup$year == 2008, ] occup_2010 <- occup[occup$year == 2010, ] occup_2012 <- occup[occup$year == 2012, ] ml_setup <- list( data = occup_2010, cat_var = "code", method = c("knn"), features = c("age", "sex", "edu", "exp", "parttime", "salary"), args = list(k = 10, ntree = 50) ) occup_back_2008_2010 <- cat2cat( data = list( old = occup_2008, new = occup_2010, cat_var = "code", time_var = "year" ), mappings = list(trans = trans, direction = "backward"), ml = ml_setup ) # the counts could be any of wei_* or their combination freqs_df <- occup_back_2008_2010$old[, c("g_new_c2c", "wei_freq_c2c")] %>% cross_c2c() %>% group_by(g_new_c2c) %>% summarise(counts = round(sum(wei_cross_c2c, na.rm = TRUE))) occup_back_2006_2008_1 <- cat2cat( data = list( old = occup_2006, new = occup_back_2008_2010$old, cat_var_old = "code", cat_var_new = "g_new_c2c", time_var = "year" ), mappings = list(trans = trans, direction = "backward"), ml = ml_setup ) testthat::test_that("multi-period cat2cat neutral for base period", { expect_identical(occup_back_2008_2010$old, occup_back_2006_2008_1$new) }) testthat::test_that("multi-period cat2cat probabilities", { expect_true( !identical( occup_back_2006_2008_1$old$wei_freq_c2c, occup_back_2006_2008_1$old$wei_naive_c2c ) ) expect_true( !identical( occup_back_2006_2008_1$old$wei_freq_c2c, occup_back_2006_2008_1$old$wei_knn_c2c ) ) expect_true(!identical( occup_back_2006_2008_1$old$wei_naive_c2c, occup_back_2006_2008_1$old$wei_knn_c2c )) expect_equal( sum(occup_back_2006_2008_1$old$wei_freq_c2c), nrow(occup_2006) ) expect_equal( sum(occup_back_2006_2008_1$old$wei_knn_c2c), nrow(occup_2006) ) expect_true((all(occup_back_2006_2008_1$old$wei_freq_c2c <= 1 & occup_back_2006_2008_1$old$wei_freq_c2c >= 0))) expect_true((all(occup_back_2006_2008_1$old$wei_knn_c2c <= 1 & occup_back_2006_2008_1$old$wei_knn_c2c >= 0))) }) occup_2006_new <- occup_back_2006_2008_1$old occup_2008_new <- occup_back_2008_2010$old # or occup_back_2006_2008$new occup_2010_new <- occup_back_2008_2010$new occup_2012_new <- dummy_c2c(occup_2012, cat_var = "code", ml = c("knn") ) final_data <- do.call(rbind, list( occup_2006_new, occup_2008_new, occup_2010_new, occup_2012_new )) counts_new <- final_data %>% cross_c2c() %>% group_by(year) %>% summarise( n = as.integer(round(sum(wei_freq_c2c))), n2 = as.integer(round(sum(wei_cross_c2c))), .groups = "drop" ) counts_old <- occup %>% group_by(year) %>% summarise(n = n(), n2 = n(), .groups = "drop") testthat::test_that("multi-period cat2cat persist the number of observations", { expect_identical(counts_new, counts_old) }) data(occup) data(trans) occup_2006 <- occup[occup$year == 2006, ] occup_2008 <- occup[occup$year == 2008, ] occup_2010 <- occup[occup$year == 2010, ] occup_2012 <- occup[occup$year == 2012, ] trans2 <- rbind( trans, data.frame( old = "no_cat", new = setdiff(c(occup_2010$code, occup_2012$code), trans$new) ) ) # from 2008 to 2010 occup_for_2008_2010 <- cat2cat( data = list( old = occup_2008, new = occup_2010, cat_var = "code", time_var = "year" ), mappings = list(trans = trans2, direction = "forward"), ml = ml_setup ) # optional, give more control # the counts could be any of wei_* or their combination freqs_df <- occup_for_2008_2010$new[, c("g_new_c2c", "wei_freq_c2c")] %>% group_by(g_new_c2c) %>% summarise(counts = round(sum(wei_freq_c2c))) # from 2010 to 2012 occup_for_2010_2012 <- cat2cat( data = list( old = occup_for_2008_2010$new, new = occup_2012, cat_var_old = "g_new_c2c", cat_var_new = "code", time_var = "year", freqs_df = freqs_df ), mappings = list(trans = trans2, direction = "forward"), ml = ml_setup ) # from 2010 to 2012 occup_for_2010_2012_2 <- cat2cat( data = list( old = occup_for_2008_2010$new, new = occup_2012, cat_var_old = "g_new_c2c", cat_var_new = "code", time_var = "year" ), mappings = list(trans = trans2, direction = "forward", freqs_df = freqs_df), ml = ml_setup ) expect_identical(occup_for_2010_2012_2, occup_for_2010_2012) # use ml argument when applied ml models occup_2006_new <- dummy_c2c(occup_2006, "code", ml = c("knn")) occup_2008_new <- occup_for_2008_2010$old occup_2010_new <- occup_for_2008_2010$new # or occup_for_2010_2012$old occup_2012_new <- occup_for_2010_2012$new final_data_for <- do.call( rbind, list(occup_2006_new, occup_2008_new, occup_2010_new, occup_2012_new) ) # We persist the number of observations counts_new <- final_data_for %>% cross_c2c() %>% group_by(year) %>% summarise( n = as.integer(round(sum(wei_freq_c2c))), n2 = as.integer(round(sum(wei_cross_c2c))) ) counts_old <- occup %>% group_by(year) %>% summarise(n = n(), n2 = n(), .groups = "drop") testthat::test_that("", { testthat::expect_identical(counts_new, counts_old) })