test_that("collapsing factors", { skip_if_not_installed("stringdist") skip_if_not_installed("modeldata") data(ames, package = "modeldata") expect_error( { rec_1 <- recipe(Sale_Price ~ ., data = ames) %>% step_collapse_stringdist(MS_SubClass, distance = 5) %>% prep() }, regex = NA ) expect_true(length(rec_1$steps[[1]]$results) == 1) expect_equal(names(rec_1$steps[[1]]$results), "MS_SubClass") expect_true( length(unique(rec_1$steps[[1]]$results$Neighborhood$.group)) < length(levels(ames$Neighborhood)) ) expect_equal( ames %>% select(-MS_SubClass, -Sale_Price), bake(rec_1, new_data = NULL) %>% select(-MS_SubClass, -Sale_Price) ) expect_false( isTRUE( all.equal(bake(rec_1, new_data = NULL)$MS_SubClass, ames$MS_SubClass) ) ) expect_error( { rec_2 <- recipe(Sale_Price ~ ., data = ames) %>% step_collapse_stringdist(MS_SubClass, Overall_Cond, distance = 10) %>% prep() }, regex = NA ) expect_true(length(rec_2$steps[[1]]$results) == 2) expect_equal( names(rec_2$steps[[1]]$results), c("MS_SubClass", "Overall_Cond") ) expect_true( length(rec_2$steps[[1]]$results$MS_SubClass) < length(rec_1$steps[[1]]$results$MS_SubClass) ) }) test_that("collapsing factors manual test", { skip_if_not_installed("stringdist") data0 <- tibble( x1 = c("a", "b", "d", "e", "aaaaaa", "bbbbbb"), x2 = c("ak", "b", "djj", "e", "aaaaaa", "aaaaaa") ) rec <- recipe(~., data = data0) %>% step_collapse_stringdist(all_predictors(), distance = 1) %>% prep() exp_result <- tibble( x1 = factor(c("a", "a", "a", "a", "aaaaaa", "bbbbbb")), x2 = factor(c("ak", "b", "djj", "b", "aaaaaa", "aaaaaa")) ) expect_equal( bake(rec, new_data = NULL), exp_result ) rec <- recipe(~., data = data0) %>% step_collapse_stringdist(all_predictors(), distance = 2) %>% prep() exp_result <- tibble( x1 = factor(c("a", "a", "a", "a", "aaaaaa", "bbbbbb")), x2 = factor(c("ak", "ak", "djj", "ak", "aaaaaa", "aaaaaa")) ) expect_equal( bake(rec, new_data = NULL), exp_result ) }) test_that("method argument", { skip_if_not_installed("stringdist") data0 <- tibble( x1 = c("a", "b", "d", "e", "aaaaaa", "bbbbbb"), x2 = c("ak", "b", "djj", "e", "aaaaaa", "aaaaaa") ) rec <- recipe(~., data = data0) %>% step_collapse_stringdist( all_predictors(), distance = 0.5, method = "cosine" ) %>% prep() exp_result <- tibble( x1 = factor(c("a", "b", "d", "e", "a", "b")), x2 = factor(c("aaaaaa", "b", "djj", "e", "aaaaaa", "aaaaaa")) ) expect_equal( bake(rec, new_data = NULL), exp_result ) rec <- recipe(~., data = data0) %>% step_collapse_stringdist( all_predictors(), distance = 1, method = "cosine" ) %>% prep() exp_result <- tibble( x1 = factor(c("a", "a", "a", "a", "a", "a")), x2 = factor(c("aaaaaa", "aaaaaa", "aaaaaa", "aaaaaa", "aaaaaa", "aaaaaa")) ) expect_equal( bake(rec, new_data = NULL), exp_result ) }) test_that("options argument", { skip_if_not_installed("stringdist") data0 <- tibble( x1 = c("a", "b", "d", "e", "aaaaaa", "bbbbbb"), x2 = c("ak", "b", "djj", "e", "aaaaaa", "aaaaaa") ) rec <- recipe(~., data = data0) %>% step_collapse_stringdist( all_predictors(), distance = 1, options = list(weight = c(d = 0.1, i = 1, s = 1, t = 1)) ) %>% prep() exp_result <- tibble( x1 = factor(c("a", "a", "a", "a", "a", "b")), x2 = factor(c("ak", "b", "djj", "b", "aaaaaa", "aaaaaa")) ) expect_equal( bake(rec, new_data = NULL), exp_result ) }) test_that("failed collapsing", { skip_if_not_installed("stringdist") skip_if_not_installed("modeldata") data(ames, package = "modeldata") # too many splits expect_error( { rec_4 <- recipe(Sale_Price ~ ., data = ames) %>% step_collapse_stringdist(MS_SubClass, distance = 0) %>% prep() }, regex = NA ) expect_equal( length(rec_4$steps[[1]]$results$MS_SubClass), length(levels(ames$MS_SubClass)) ) # too few splits expect_error( { rec_5 <- recipe(Sale_Price ~ ., data = ames) %>% step_collapse_stringdist(MS_SubClass, distance = 10000) %>% prep() }, regex = NA ) expect_equal( length(rec_5$steps[[1]]$results$MS_SubClass), 1 ) }) # Infrastructure --------------------------------------------------------------- test_that("bake method errors when needed non-standard role columns are missing", { skip_if_not_installed("stringdist") skip_if_not_installed("modeldata") data(ames, package = "modeldata") rec <- recipe(Sale_Price ~ ., data = ames) %>% step_collapse_stringdist(MS_SubClass, distance = 2) %>% update_role(MS_SubClass, new_role = "potato") %>% update_role_requirements(role = "potato", bake = FALSE) rec_trained <- prep(rec, training = ames, verbose = FALSE) expect_error( bake(rec_trained, new_data = ames[, -1]), class = "new_data_missing_column" ) }) test_that("empty printing", { rec <- recipe(mpg ~ ., mtcars) rec <- step_collapse_stringdist(rec, distance = 1) expect_snapshot(rec) rec <- prep(rec, mtcars) expect_snapshot(rec) }) test_that("empty selection prep/bake is a no-op", { rec1 <- recipe(mpg ~ ., mtcars) rec2 <- step_collapse_stringdist(rec1, distance = 1) rec1 <- prep(rec1, mtcars) rec2 <- prep(rec2, mtcars) baked1 <- bake(rec1, mtcars) baked2 <- bake(rec2, mtcars) expect_identical(baked1, baked2) }) test_that("empty selection tidy method works", { rec <- recipe(mpg ~ ., mtcars) rec <- step_collapse_stringdist(rec, distance = 1) expect <- tibble(terms = character(), id = character()) expect_identical(tidy(rec, number = 1), expect) rec <- prep(rec, mtcars) expect_identical(tidy(rec, number = 1), expect) }) test_that("printing", { skip_if_not_installed("stringdist") skip_if_not_installed("modeldata") data(ames, package = "modeldata") rec <- recipe(Sale_Price ~ ., data = ames) %>% step_collapse_stringdist(MS_SubClass, distance = 5) expect_snapshot(print(rec)) expect_snapshot(prep(rec)) })