library(ranger) library(survival) context("ranger_regularization") n <- 50 p <- 4 dat_reg <- data.frame(y = rnorm(n), x = replicate(p, runif(n))) dat_class <- data.frame(y = factor(rbinom(n, 1, .5)), x = replicate(p, runif(n))) dat_surv <- data.frame(time = runif(n), status = rbinom(n, 1, .5), x = replicate(p, runif(n))) get_num_splitvars <- function(rf) { all_splitvars <- do.call(c, lapply(1:rf$num.trees, function(t) { treeInfo(rf, t)[, "splitvarID"] })) length(unique(all_splitvars[!is.na(all_splitvars)])) } test_that("same results with 1 and p regularization coefficients, regression", { seed <- runif(1 , 0, .Machine$integer.max) set.seed(seed) rf1 <- ranger(y ~ ., dat_reg, num.trees = 5, num.threads = 1, regularization.factor = .1) set.seed(seed) rf2 <- ranger(y ~ ., dat_reg, num.trees = 5, num.threads = 1, regularization.factor = rep(.1, p)) expect_equal(rf1$prediction.error, rf2$prediction.error) }) test_that("same results with 1 and p regularization coefficients, classification", { seed <- runif(1 , 0, .Machine$integer.max) set.seed(seed) rf1 <- ranger(y ~ ., dat_class, num.trees = 5, num.threads = 1, regularization.factor = .1) set.seed(seed) rf2 <- ranger(y ~ ., dat_class, num.trees = 5, num.threads = 1, regularization.factor = rep(.1, p)) expect_equal(rf1$prediction.error, rf2$prediction.error) }) test_that("Error if maxstat splitrule and regularization", { expect_error(ranger(y ~ ., dat_reg, num.trees = 5, splitrule = "maxstat", num.threads = 1, regularization.factor = .0001, regularization.usedepth = TRUE), "Error: Regularization cannot be used with 'maxstat' splitrule\\.") }) # Regression test_that("Fewer variables used with regularization, regression", { rf_noreg <- ranger(y ~ ., dat_reg, num.trees = 5, min.node.size = 10, mtry = 4) rf_reg <- ranger(y ~ ., dat_reg, num.trees = 5, min.node.size = 10, mtry = 4, num.threads = 1, regularization.factor = .0001, regularization.usedepth = TRUE) expect_lt(get_num_splitvars(rf_reg), get_num_splitvars(rf_noreg)) }) test_that("Fewer variables used with regularization, regression extratrees", { rf_noreg <- ranger(y ~ ., dat_reg, num.trees = 5, min.node.size = 10, mtry = 4, splitrule = "extratrees") rf_reg <- ranger(y ~ ., dat_reg, num.trees = 5, min.node.size = 10, mtry = 4, splitrule = "extratrees", num.threads = 1, regularization.factor = .0001, regularization.usedepth = TRUE) expect_lt(get_num_splitvars(rf_reg), get_num_splitvars(rf_noreg)) }) test_that("Fewer variables used with regularization, regression beta", { dat <- data.frame(y = rbinom(n, 1, .5), x = replicate(p, runif(n))) rf_noreg <- ranger(y ~ ., dat, num.trees = 5, min.node.size = 10, mtry = 4, splitrule = "beta") rf_reg <- ranger(y ~ ., dat, num.trees = 5, min.node.size = 10, mtry = 4, splitrule = "beta", num.threads = 1, regularization.factor = .0001, regularization.usedepth = TRUE) expect_lt(get_num_splitvars(rf_reg), get_num_splitvars(rf_noreg)) }) # Classification test_that("Fewer variables used with regularization, classification", { rf_noreg <- ranger(y ~ ., dat_class, num.trees = 5, min.node.size = 10, mtry = 4) rf_reg <- ranger(y ~ ., dat_class, num.trees = 5, min.node.size = 10, mtry = 4, num.threads = 1, regularization.factor = .0001, regularization.usedepth = TRUE) expect_lt(get_num_splitvars(rf_reg), get_num_splitvars(rf_noreg)) }) test_that("Fewer variables used with regularization, classification extratrees", { rf_noreg <- ranger(y ~ ., dat_class, num.trees = 5, min.node.size = 10, mtry = 4, splitrule = "extratrees") rf_reg <- ranger(y ~ ., dat_class, num.trees = 5, min.node.size = 10, mtry = 4, splitrule = "extratrees", num.threads = 1, regularization.factor = .0001, regularization.usedepth = TRUE) expect_lt(get_num_splitvars(rf_reg), get_num_splitvars(rf_noreg)) }) test_that("Fewer variables used with regularization, classification hellinger", { rf_noreg <- ranger(y ~ ., dat_class, num.trees = 5, min.node.size = 10, mtry = 4, splitrule = "hellinger") rf_reg <- ranger(y ~ ., dat_class, num.trees = 5, min.node.size = 10, mtry = 4, splitrule = "hellinger", num.threads = 1, regularization.factor = .0001, regularization.usedepth = TRUE) expect_lt(get_num_splitvars(rf_reg), get_num_splitvars(rf_noreg)) }) # Probability test_that("Fewer variables used with regularization, probability", { rf_noreg <- ranger(y ~ ., dat_class, num.trees = 5, min.node.size = 10, mtry = 4, probability = TRUE) rf_reg <- ranger(y ~ ., dat_class, num.trees = 5, min.node.size = 10, mtry = 4, probability = TRUE, num.threads = 1, regularization.factor = .0001, regularization.usedepth = TRUE) expect_lt(get_num_splitvars(rf_reg), get_num_splitvars(rf_noreg)) }) test_that("Fewer variables used with regularization, probability extratrees", { rf_noreg <- ranger(y ~ ., dat_class, num.trees = 5, min.node.size = 10, mtry = 4, probability = TRUE, splitrule = "extratrees") rf_reg <- ranger(y ~ ., dat_class, num.trees = 5, min.node.size = 10, mtry = 4, probability = TRUE, splitrule = "extratrees", num.threads = 1, regularization.factor = .0001, regularization.usedepth = TRUE) expect_lt(get_num_splitvars(rf_reg), get_num_splitvars(rf_noreg)) }) test_that("Fewer variables used with regularization, probability hellinger", { rf_noreg <- ranger(y ~ ., dat_class, num.trees = 5, min.node.size = 10, mtry = 4, probability = TRUE, splitrule = "hellinger") rf_reg <- ranger(y ~ ., dat_class, num.trees = 5, min.node.size = 10, mtry = 4, probability = TRUE, splitrule = "hellinger", num.threads = 1, regularization.factor = .0001, regularization.usedepth = TRUE) expect_lt(get_num_splitvars(rf_reg), get_num_splitvars(rf_noreg)) }) # Survival test_that("Fewer variables used with regularization, survival", { rf_noreg <- ranger(Surv(time, status) ~ ., dat_surv, num.trees = 5, min.node.size = 10, mtry = 4) rf_reg <- ranger(Surv(time, status) ~ ., dat_surv, num.trees = 5, min.node.size = 10, mtry = 4, num.threads = 1, regularization.factor = .0001, regularization.usedepth = TRUE) expect_lt(get_num_splitvars(rf_reg), get_num_splitvars(rf_noreg)) }) test_that("Fewer variables used with regularization, survival extratrees", { rf_noreg <- ranger(Surv(time, status) ~ ., dat_surv, num.trees = 5, min.node.size = 10, mtry = 4, splitrule = "extratrees") rf_reg <- ranger(Surv(time, status) ~ ., dat_surv, num.trees = 5, min.node.size = 10, mtry = 4, splitrule = "extratrees", num.threads = 1, regularization.factor = .0001, regularization.usedepth = TRUE) expect_lt(get_num_splitvars(rf_reg), get_num_splitvars(rf_noreg)) }) test_that("Fewer variables used with regularization, survival C", { rf_noreg <- ranger(Surv(time, status) ~ ., dat_surv, num.trees = 5, min.node.size = 10, mtry = 4, splitrule = "C") rf_reg <- ranger(Surv(time, status) ~ ., dat_surv, num.trees = 5, min.node.size = 10, mtry = 4, splitrule = "C", num.threads = 1, regularization.factor = .0001, regularization.usedepth = TRUE) expect_lt(get_num_splitvars(rf_reg), get_num_splitvars(rf_noreg)) })