library(testthat) library(dplyr) skip_on_cran() skip_if_not_installed("xgboost") skip_if_not_installed("modeldata") source(test_path("make_binned_data.R")) data("credit_data", package = "modeldata") data("ames", package = "modeldata") data("attrition", package = "modeldata") # Data for classification problem testing set.seed(42) credit_data_split <- rsample::initial_split(credit_data, strata = "Status") credit_data_train <- rsample::training(credit_data_split) credit_data_test <- rsample::testing(credit_data_split) set.seed(2393) credit_data_small <- dplyr::sample_n(credit_data_train, 30) rec_credit <- credit_data_train %>% select(-Status) %>% recipe(~.) %>% step_integer(all_predictors()) %>% prep(retain = TRUE) xgb_credit_train <- xgboost::xgb.DMatrix( data = as.matrix(bake(rec_credit, new_data = NULL)), label = ifelse(credit_data_train[["Status"]] == "bad", 0, 1), nthread = 1 ) xgb_credit_test <- xgboost::xgb.DMatrix( data = as.matrix(bake(rec_credit, new_data = credit_data_test)), label = ifelse(credit_data_test[["Status"]] == "bad", 0, 1), nthread = 1 ) # Data for multi-classification problem testing set.seed(42) attrition <- attrition %>% mutate(EducationField = as.integer(EducationField) - 1) attrition_data_split <- rsample::initial_split(attrition, strata = "EducationField") attrition_data_train <- rsample::training(attrition_data_split) attrition_data_test <- rsample::testing(attrition_data_split) set.seed(2393) attrition_data_small <- dplyr::sample_n(attrition_data_train, 10) rec_attrition <- attrition_data_train %>% select(-EducationField) %>% recipe(~.) %>% step_integer(all_predictors()) %>% prep(retain = TRUE) xgb_attrition_train <- xgboost::xgb.DMatrix( data = as.matrix(bake(rec_attrition, new_data = NULL)), label = attrition_data_train$EducationField, nthread = 1 ) xgb_attrition_test <- xgboost::xgb.DMatrix( data = as.matrix(bake(rec_attrition, new_data = attrition_data_test)), label = attrition_data_test$EducationField, nthread = 1 ) ames$Sale_Price <- log10(ames$Sale_Price) # Data for regression problem testing (naive) set.seed(1773) ames_data_split <- rsample::initial_split(ames, strata = "Sale_Price") ames_data_train <- rsample::training(ames_data_split) ames_data_test <- rsample::testing(ames_data_split) set.seed(8134) ames_data_small <- dplyr::sample_n(ames_data_train, 10) ames_rec <- ames_data_train %>% select(-Sale_Price) %>% recipe(~.) %>% step_integer(all_predictors()) %>% prep(retain = TRUE) xgb_ames_train <- xgboost::xgb.DMatrix( data = as.matrix(bake(ames_rec, new_data = NULL)), label = ames_data_train[["Sale_Price"]], nthread = 1 ) xgb_ames_test <- xgboost::xgb.DMatrix( data = as.matrix(bake(ames_rec, new_data = ames_data_test)), label = ames_data_test[["Sale_Price"]], nthread = 1 ) set.seed(8497) sim_tr_cls <- sim_data_2class(1000) sim_te_cls <- sim_data_2class(100) set.seed(8497) sim_tr_mcls <- sim_data_3class(1000) sim_te_mcls <- sim_data_3class(100) set.seed(8497) sim_tr_reg <- sim_data_reg(1000) sim_te_reg <- sim_data_reg(100) test_that("run_xgboost for classification", { skip_on_cran() # because data.table uses all cores by default xgboost <- embed:::run_xgboost( xgb_credit_train, xgb_credit_test, .learn_rate = 0.3, .num_breaks = 10, .tree_depth = 1, .min_n = 5, .objective = "binary:logistic", .num_class = NA ) expect_snapshot(xgboost) expect_equal(length(xgboost$params), 8) expect_equal(xgboost$nfeatures, 13) expect_equal(xgboost$params$tree_method, "hist") expect_equal(xgboost$params$objective, "binary:logistic") }) test_that("run_xgboost for multi-classification", { skip_on_cran() # because data.table uses all cores by default xgboost <- embed:::run_xgboost( xgb_attrition_train, xgb_attrition_test, .learn_rate = 0.3, .num_breaks = 10, .tree_depth = 1, .min_n = 5, .num_class = 6, # label must be in [0, num_class) .objective = "multi:softprob" ) expect_snapshot(xgboost) expect_equal(length(xgboost$params), 9) expect_equal(xgboost$nfeatures, 30) expect_equal(xgboost$params$tree_method, "hist") expect_equal(xgboost$params$objective, "multi:softprob") }) test_that("run_xgboost for regression", { skip_on_cran() # because data.table uses all cores by default xgboost <- embed:::run_xgboost( xgb_ames_train, xgb_ames_test, .learn_rate = 0.3, .num_breaks = 10, .tree_depth = 1, .min_n = 5, .objective = "reg:squarederror", .num_class = NA ) expect_snapshot(xgboost) expect_true(length(xgboost$params) > 1) expect_true(xgboost$nfeatures > 1) expect_equal(xgboost$params$tree_method, "hist") expect_equal(xgboost$params$objective, "reg:squarederror") }) test_that("xgb_binning for classification", { skip_on_cran() # because data.table uses all cores by default less_than_3.6 <- function() { utils::compareVersion("3.5.3", as.character(getRversion())) >= 0 } skip_if(less_than_3.6()) # Usual case set.seed(8497) xgb_binning <- embed:::xgb_binning( credit_data_train, "Status", "Seniority", sample_val = 0.20, learn_rate = 0.3, num_breaks = 10, tree_depth = 1, min_n = 5 ) expect_snapshot(xgb_binning) expect_true(length(xgb_binning) > 1) expect_type(xgb_binning, "double") skip_if(packageVersion("xgboost") > "1.5.2.1") # Algorithm runs on a too small training set/ insufficient variation in data expect_snapshot( embed:::xgb_binning( credit_data_small, "Status", "Seniority", sample_val = 0.30, learn_rate = 0.3, num_breaks = 10, tree_depth = 1, min_n = 5 ) ) }) test_that("xgb_binning for multi-classification", { skip_on_cran() # because data.table uses all cores by default less_than_3.6 <- function() { utils::compareVersion("3.5.3", as.character(getRversion())) >= 0 } skip_if(less_than_3.6()) # Usual case set.seed(8497) xgb_binning <- embed:::xgb_binning( attrition_data_train, "EducationField", "Age", sample_val = 0.20, learn_rate = 0.3, num_breaks = 10, tree_depth = 1, min_n = 5 ) expect_snapshot(xgb_binning) expect_true(length(xgb_binning) > 1) expect_type(xgb_binning, "double") # Algorithm runs on a too small training set/ insufficient variation in data expect_snapshot( embed:::xgb_binning( attrition_data_small, "EducationField", "Age", sample_val = 0.30, learn_rate = 0.3, num_breaks = 10, tree_depth = 1, min_n = 5 ) ) }) test_that("xgb_binning for regression", { skip_on_cran() # because data.table uses all cores by default less_than_3.6 <- function() { utils::compareVersion("3.5.3", as.character(getRversion())) >= 0 } skip_if(less_than_3.6()) set.seed(4235) # Usual case xgb_binning <- embed:::xgb_binning( ames_data_train, "Sale_Price", "Latitude", sample_val = 0.20, learn_rate = 0.3, num_breaks = 10, tree_depth = 1, min_n = 5 ) expect_snapshot(xgb_binning) expect_true(length(xgb_binning) > 1) expect_type(xgb_binning, "double") # Algorithm runs on a too small training set/ insufficient variation in data expect_snapshot( embed:::xgb_binning( ames_data_small, "Sale_Price", "Latitude", sample_val = 0.30, learn_rate = 0.3, num_breaks = 10, tree_depth = 1, min_n = 5 ) ) }) test_that("step_discretize_xgb for classification", { skip_on_cran() # because data.table uses all cores by default set.seed(125) # General use xgb_rec <- recipe(class ~ ., data = sim_tr_cls) %>% step_discretize_xgb(all_predictors(), outcome = "class") set.seed(28) xgb_rec <- prep(xgb_rec, training = sim_tr_cls) xgb_train_bins <- bake(xgb_rec, sim_tr_cls) xgb_test_bins <- bake(xgb_rec, sim_te_cls) expect_snapshot(xgb_train_bins[1:10, ]) expect_snapshot(xgb_test_bins[1:10, ]) expect_true(length(levels(xgb_train_bins$x)) > 1) expect_true(length(levels(xgb_train_bins$z)) > 1) expect_equal( levels(xgb_train_bins$x), levels(xgb_test_bins$x) ) expect_equal( levels(xgb_train_bins$z), levels(xgb_test_bins$z) ) # Too few data expect_snapshot(error = TRUE, { recipe(class ~ ., data = sim_tr_cls[1:9, ]) %>% step_discretize_xgb(all_predictors(), outcome = "class") %>% prep() }) # No numeric variables present predictors_non_numeric <- c( "Status", "Home", "Marital" ) xgb_rec <- credit_data_train %>% select(one_of(predictors_non_numeric)) %>% recipe(Status ~ .) %>% step_impute_median(all_numeric()) %>% step_discretize_xgb(all_numeric(), outcome = "Status") # Information about insufficient datapoints for Time predictor expect_snapshot({ set.seed(1) recipe(Status ~ ., data = credit_data_train) %>% step_discretize_xgb(Time, outcome = "Status") %>% prep(retain = TRUE) }) }) test_that("step_discretize_xgb for multi-classification", { skip_on_cran() # because data.table uses all cores by default set.seed(125) # General use xgb_rec <- recipe(class ~ ., data = sim_tr_mcls) %>% step_discretize_xgb(all_predictors(), outcome = "class") set.seed(28) xgb_rec <- prep(xgb_rec, training = sim_tr_mcls) xgb_train_bins <- bake(xgb_rec, sim_tr_mcls) xgb_test_bins <- bake(xgb_rec, sim_te_mcls) expect_snapshot(xgb_train_bins[1:10, ]) expect_snapshot(xgb_test_bins[1:10, ]) expect_true(length(levels(xgb_train_bins$x)) > 0) expect_true(length(levels(xgb_train_bins$z)) > 0) expect_equal( levels(xgb_train_bins$x), levels(xgb_test_bins$x) ) expect_equal( levels(xgb_train_bins$z), levels(xgb_test_bins$z) ) # Too few data expect_snapshot( error = TRUE, recipe(class ~ ., data = sim_tr_mcls[1:9, ]) %>% step_discretize_xgb(all_predictors(), outcome = "class") %>% prep() ) # No numeric variables present predictors_non_numeric <- c( "Attrition", "BusinessTravel", "Department", "Education", "EnvironmentSatisfaction", "Gender", "JobInvolvement", "JobRole", "JobSatisfaction", "MaritalStatus", "OverTime", "PerformanceRating", "RelationshipSatisfaction", "WorkLifeBalance" ) xgb_rec <- attrition_data_train %>% select(one_of(predictors_non_numeric)) %>% recipe(BusinessTravel ~ .) %>% step_impute_median(all_numeric()) %>% step_discretize_xgb(all_numeric(), outcome = "BusinessTravel") }) test_that("step_discretize_xgb for regression", { skip_on_cran() # because data.table uses all cores by default # Skip on R < 3.6 since the rng is different. skip("Needs to determine why random numbers are different") less_than_3.6 <- function() { utils::compareVersion("3.5.3", as.character(getRversion())) >= 0 } skip_if(less_than_3.6()) # General use set.seed(83834) xgb_rec <- recipe(y ~ ., data = sim_tr_reg) %>% step_discretize_xgb(all_predictors(), outcome = "y") tidy_untrained <- tidy(xgb_rec, 1) xgb_rec <- prep(xgb_rec, training = sim_tr_reg) tidy_trained <- tidy(xgb_rec, 1) xgb_train_bins <- bake(xgb_rec, sim_tr_reg) xgb_test_bins <- bake(xgb_rec, sim_te_reg) expect_snapshot(xgb_train_bins) expect_snapshot(xgb_test_bins) expect_true(length(levels(xgb_train_bins$x)) > 0) expect_true(length(levels(xgb_train_bins$z)) > 0) expect_equal( levels(xgb_train_bins$x), levels(xgb_test_bins$x) ) expect_equal( levels(xgb_train_bins$z), levels(xgb_test_bins$z) ) expect_true(tibble::is_tibble(tidy_untrained)) expect_equal( tidy_untrained$variable, "all_predictors()" ) expect_equal( tidy_untrained$values, NA_character_ ) expect_true(tibble::is_tibble(tidy_trained)) expect_equal( tidy_trained$terms, rep(c("x", "z"), c(4, 7)) ) expect_equal( tidy_trained$values, unlist(xgb_rec$steps[[1]]$rules, use.names = FALSE) ) # one bad predictor sim_tr_reg$x <- round(sim_tr_reg$x, 1) expect_snapshot({ xgb_rec <- recipe(y ~ ., data = sim_tr_reg[1:100, ]) %>% step_discretize_xgb(all_predictors(), outcome = "y") %>% prep() }) # No numeric variables present predictors_non_numeric <- c( "Neighborhood" ) xgb_rec <- ames_data_train %>% select(Sale_Price, one_of(predictors_non_numeric)) %>% recipe(Sale_Price ~ .) %>% step_medianimpute(all_numeric()) %>% step_discretize_xgb(all_predictors(), outcome = "Sale_Price") }) test_that("xgb_binning() errors if only one class in outcome", { skip_on_cran() # because data.table uses all cores by default const_outcome <- data.frame( outcome = factor(rep("a", 1000)), predictor = rep(1, 1000) ) expect_snapshot( error = TRUE, embed:::xgb_binning( const_outcome, "outcome", "predictor", sample_val = 0.20, learn_rate = 0.3, num_breaks = 10, tree_depth = 1, min_n = 5 ) ) }) test_that("case weights step_discretize_xgb", { skip_on_cran() # because data.table uses all cores by default sim_tr_cls_cw <- sim_tr_cls %>% mutate(weight = importance_weights(rep(1:0, each = 500))) sim_tr_mcls_cw <- sim_tr_mcls %>% mutate(weight = importance_weights(rep(1:0, each = 500))) sim_tr_reg_cw <- sim_tr_reg %>% mutate(weight = importance_weights(rep(1:0, each = 500))) # classification ------------------------------------------------------------ set.seed(125) # General use xgb_rec_cw <- recipe(class ~ ., data = sim_tr_cls_cw) %>% step_discretize_xgb(all_predictors(), outcome = "class") set.seed(28) xgb_rec_cw <- prep(xgb_rec_cw, training = sim_tr_cls_cw) exp_rules <- list() set.seed(28) for (col_names in c("x", "z")) { exp_rules[[col_names]] <- xgb_binning( sim_tr_cls_cw %>% select(-weight), "class", col_names, sample_val = 0.20, learn_rate = 0.3, num_breaks = 10, tree_depth = 1, min_n = 5, as.numeric(sim_tr_cls_cw$weight) ) } expect_identical( exp_rules, xgb_rec_cw$steps[[1]]$rules ) # multi-classification ------------------------------------------------------ set.seed(125) # General use xgb_rec_cw <- recipe(class ~ ., data = sim_tr_mcls_cw) %>% step_discretize_xgb(all_predictors(), outcome = "class") set.seed(28) xgb_rec_cw <- prep(xgb_rec_cw, training = sim_tr_mcls_cw) exp_rules <- list() set.seed(28) for (col_names in c("x", "z")) { exp_rules[[col_names]] <- xgb_binning( sim_tr_mcls_cw %>% select(-weight), "class", col_names, sample_val = 0.20, learn_rate = 0.3, num_breaks = 10, tree_depth = 1, min_n = 5, as.numeric(sim_tr_mcls_cw$weight) ) } expect_identical( exp_rules, xgb_rec_cw$steps[[1]]$rules ) # regression ---------------------------------------------------------------- set.seed(125) # General use xgb_rec_cw <- recipe(y ~ ., data = sim_tr_reg_cw) %>% step_discretize_xgb(all_predictors(), outcome = "y") set.seed(28) xgb_rec_cw <- prep(xgb_rec_cw, training = sim_tr_reg_cw) exp_rules <- list() set.seed(28) for (col_names in c("x", "z")) { exp_rules[[col_names]] <- xgb_binning( sim_tr_reg_cw %>% select(-weight), "y", col_names, sample_val = 0.20, learn_rate = 0.3, num_breaks = 10, tree_depth = 1, min_n = 5, as.numeric(sim_tr_reg_cw$weight) ) } expect_identical( exp_rules, xgb_rec_cw$steps[[1]]$rules ) # printing ------------------------------------------------------------------ expect_snapshot(xgb_rec_cw) }) test_that("tunable", { rec <- recipe(~., data = mtcars) %>% step_discretize_xgb(all_predictors(), outcome = "mpg") rec_param <- tunable.step_discretize_xgb(rec$steps[[1]]) expect_equal( rec_param$name, c("sample_val", "learn_rate", "num_breaks", "tree_depth", "min_n") ) expect_true(all(rec_param$source == "recipe")) expect_true(is.list(rec_param$call_info)) expect_equal(nrow(rec_param), 5) expect_equal( names(rec_param), c("name", "call_info", "source", "component", "component_id") ) }) # Infrastructure --------------------------------------------------------------- test_that("bake method errors when needed non-standard role columns are missing", { rec <- recipe(class ~ ., data = sim_tr_cls) %>% step_discretize_xgb(x, z, outcome = "class") %>% update_role(x, new_role = "potato") %>% update_role_requirements(role = "potato", bake = FALSE) rec_trained <- prep(rec, training = sim_tr_cls, verbose = FALSE) expect_error( bake(rec_trained, new_data = sim_tr_cls[, -1]), class = "new_data_missing_column" ) }) test_that("empty printing", { rec <- recipe(mpg ~ ., mtcars) rec <- step_discretize_xgb(rec, outcome = "mpg") expect_snapshot(rec) rec <- prep(rec, mtcars) expect_snapshot(rec) }) test_that("empty selection prep/bake is a no-op", { rec1 <- recipe(mpg ~ ., mtcars) rec2 <- step_discretize_xgb(rec1, outcome = "mpg") rec1 <- prep(rec1, mtcars) rec2 <- prep(rec2, mtcars) baked1 <- bake(rec1, mtcars) baked2 <- bake(rec2, mtcars) expect_identical(baked1, baked2) }) test_that("empty selection tidy method works", { rec <- recipe(mpg ~ ., mtcars) rec <- step_discretize_xgb(rec, outcome = "mpg") expect <- tibble(terms = character(), value = double(), id = character()) expect_identical(tidy(rec, number = 1), expect) rec <- prep(rec, mtcars) expect_identical(tidy(rec, number = 1), expect) }) test_that("printing", { skip_on_cran() # because data.table uses all cores by default rec <- recipe(class ~ ., data = sim_tr_cls) %>% step_discretize_xgb(all_predictors(), outcome = "class") expect_snapshot(print(rec)) expect_snapshot(prep(rec)) }) test_that("tunable is setup to works with extract_parameter_set_dials", { skip_if_not_installed("dials") rec <- recipe(~., data = mtcars) %>% step_discretize_xgb( all_predictors(), outcome = "mpg", sample_val = hardhat::tune(), learn_rate = hardhat::tune(), num_breaks = hardhat::tune(), tree_depth = hardhat::tune(), min_n = hardhat::tune() ) params <- extract_parameter_set_dials(rec) expect_s3_class(params, "parameters") expect_identical(nrow(params), 5L) })