tmp = tsk("iris", id = "iris_small")$select("Sepal.Length") tasks = c(mlr_tasks$mget(c("iris", "sonar")), list(tmp)) learners = mlr_learners$mget(c("classif.featureless", "classif.rpart")) resamplings = rsmp("cv", folds = 3) design = benchmark_grid(tasks, learners, resamplings) bmr = benchmark(design) test_that("Basic benchmarking", { expect_benchmark_result(bmr) expect_names(names(as.data.table(bmr)), permutation.of = c(mlr_reflections$rr_names, "uhash", "prediction", "task_id", "learner_id", "resampling_id")) tab = as.data.table(bmr) expect_data_table(tab, nrows = 18L, ncols = 9L) expect_names(names(tab), permutation.of = c("uhash", "prediction", mlr_reflections$rr_names, "task_id", "learner_id", "resampling_id")) measures = list(msr("classif.acc")) tab = bmr$score(measures, ids = FALSE, predictions = TRUE) expect_data_table(tab, nrows = 18L, ncols = 7L + length(measures)) expect_names(names(tab), must.include = c("nr", "uhash", "prediction_test", mlr_reflections$rr_names, ids(measures))) expect_list(tab$prediction_test, "Prediction") tab = bmr$tasks expect_data_table(tab, nrows = 3L, any.missing = FALSE) expect_names(names(tab), identical.to = c("task_hash", "task_id", "task")) expect_hash(tab$task_hash, len = 3L) tab = bmr$learners expect_data_table(tab, nrows = 2L, any.missing = FALSE) expect_names(names(tab), identical.to = c("learner_hash", "learner_id", "learner")) expect_hash(tab$learner_hash, len = 2L) qexpectr(map(tab$learner, "state"), "0") tab = bmr$resamplings expect_data_table(tab, nrows = 3L, any.missing = FALSE) expect_names(names(tab), permutation.of = c("resampling_hash", "resampling", "resampling_id")) expect_hash(tab$resampling_hash, len = 3L) tab = bmr$aggregate(measures) expect_data_table(tab, nrows = 6L) expect_names(names(tab), type = "unique", identical.to = c("nr", "resample_result", "task_id", "learner_id", "resampling_id", "iters", ids(measures))) m = measures[[1L]] expect_numeric(tab[[m$id]], any.missing = FALSE, lower = m$range[1], upper = m$range[2]) }) test_that("ResampleResult / hash", { m = msr("classif.ce") aggr = bmr$aggregate(m, uhashes = TRUE) nr = aggr$nr expect_integer(nr, len = 6L, any.missing = FALSE, unique = TRUE) for (i in nr) { rr = aggr$resample_result[[i]] expect_resample_result(rr) expect_equal(unname(rr$aggregate(m)), aggr[["classif.ce"]][i]) expect_equal(bmr$uhashes[i], rr$uhash) } }) test_that("discarding model", { bmr2 = benchmark(benchmark_grid(tasks[1L], learners[1L], resamplings), store_models = FALSE) expect_benchmark_result(bmr2) expect_true(every(map(as.data.table(bmr2)$learner, "model"), is.null)) bmr2 = benchmark(benchmark_grid(tasks[1L], learners[1L], resamplings), store_models = TRUE) expect_benchmark_result(bmr2) expect_false(every(map(as.data.table(bmr2)$learner, "model"), is.null)) }) test_that("bmr$combine()", { bmr_new = benchmark(benchmark_grid(mlr_tasks$mget("pima"), learners, resamplings)) combined = list( bmr$clone(deep = TRUE)$combine(bmr_new), c(bmr, bmr_new) ) for (bmr_combined in combined) { expect_benchmark_result(bmr) expect_benchmark_result(bmr_new) expect_benchmark_result(bmr_combined) expect_data_table(get_private(bmr)$.data$data$fact, nrows = 18L) expect_data_table(get_private(bmr_new)$.data$data$fact, nrows = 6L) expect_data_table(get_private(bmr_combined)$.data$data$fact, nrows = 24L) expect_false("pima" %chin% bmr$tasks$task_id) expect_true("pima" %chin% bmr_new$tasks$task_id) expect_true("pima" %chin% bmr_combined$tasks$task_id) } rr = resample(tsk("zoo"), lrn("classif.rpart"), rsmp("holdout")) bmr2 = c(combined[[1]], rr) expect_benchmark_result(bmr2) expect_data_table(get_private(bmr2)$.data$data$fact, nrows = 25L) }) test_that("empty bmr", { bmr_new = BenchmarkResult$new() expect_benchmark_result(bmr_new) bmr_new$combine(NULL) expect_benchmark_result(bmr_new) bmr_new$combine(bmr) expect_benchmark_result(bmr_new) expect_data_table(get_private(bmr_new)$.data$data$fact, nrows = nrow(get_private(bmr)$.data)) }) test_that("bmr$resample_result()", { uhashes = bmr$uhashes expect_resample_result(bmr$resample_result(1L)) expect_resample_result(bmr$resample_result(uhash = uhashes[1])) expect_resample_result(bmr$resample_result(learner_id = "classif.featureless", task_id = "iris")) expect_error(bmr$resample_result(learner_id = "classif.featureless"), "requires selecting exactly one") expect_error(bmr$resample_result(0)) expect_error(bmr$resample_result(100)) expect_error(bmr$resample_result(uhash = "a")) expect_error(bmr$resample_result(i = 1, uhash = uhashes[1])) }) test_that("inputs are cloned", { task = tsk("iris") learner = lrn("classif.featureless") resampling = rsmp("holdout") expect_error(benchmark(data.table(task = list(task), learner = list(learner), resampling = list(resampling))), "instantiated") bmr = benchmark(design = data.table(task = list(task), learner = list(learner), resampling = list(resampling$instantiate(task)))) rr = bmr$resample_result(1) expect_different_address(task, rr$task) expect_different_address(learner, get_private(rr)$.data$data$learner_components$learner[[1L]]) expect_different_address(resampling, rr$resampling) }) test_that("memory footprint", { expect_equal(uniqueN(map_chr(design$task, address)), 3L) expect_equal(uniqueN(map_chr(design$learner, address)), 2L) expect_equal(uniqueN(map_chr(design$resampling, address)), 3L) x = as.data.table(bmr) expect_equal(uniqueN(map_chr(x$task, address)), 3L) expect_equal(uniqueN(map_chr(x$learner, address)), 18L) expect_equal(uniqueN(map_chr(x$resampling, address)), 3L) }) test_that("resampling validation in benchmark_grid", { task1 = tsk("iris") task2 = tsk("pima") resampling_1 = rsmp("holdout") resampling_2 = rsmp("holdout") # should work when resamplings are instantiated on their corresponding tasks resampling_1$instantiate(task1) resampling_2$instantiate(task2) expect_data_table(benchmark_grid(list(task1, task2), lrn("classif.rpart"), list(resampling_1, resampling_2), paired = TRUE)) # should fail when resamplings are not instantiated resampling_1 = rsmp("holdout") resampling_2 = rsmp("holdout") expect_error(benchmark_grid(list(task1, task2), lrn("classif.rpart"), list(resampling_1, resampling_2), paired = TRUE), "is not instantiated") # should fail when resampling is instantiated on wrong task resampling_1$instantiate(task1) resampling_2$instantiate(task1) expect_error(benchmark_grid(list(task1, task2), lrn("classif.rpart"), list(resampling_1, resampling_2), paired = TRUE), "not instantiated") # should fail when task row hashes don't match task1 = tsk("iris") task2 = tsk("iris")$filter(1:100) resampling_1 = rsmp("holdout") resampling_1$instantiate(task1) expect_error(benchmark_grid(list(task1, task2), lrn("classif.rpart"), list(resampling_1)), "not instantiated") # should fail when the tasks have the same number of rows but different row hashes task1 = tsk("iris")$filter(1:75) task2 = tsk("iris")$filter(76:150) resampling_1 = rsmp("holdout") resampling_1$instantiate(task1) expect_error(benchmark_grid(list(task1, task2), lrn("classif.rpart"), list(resampling_1)), "not instantiated") # should work when all resamplings are uninstantiated res1 = rsmp("holdout") res2 = rsmp("holdout") expect_data_table(benchmark_grid(list(task1, task2), lrn("classif.rpart"), list(res1, res2))) # should fail when some resamplings are instantiated and others are not res1$instantiate(task1) expect_error(benchmark_grid(list(task1, task2), lrn("classif.rpart"), list(res1, res2)), "All resamplings must be instantiated, or none at all") }) test_that("multiple measures", { tasks = list(tsk("iris"), tsk("sonar")) learner = lrn("classif.featureless") measures = list(msr("classif.ce"), msr("classif.acc")) bmr = benchmark(design = benchmark_grid(tasks, learner, rsmp("cv", folds = 3))) expect_subset(c("classif.ce", "classif.acc"), names(bmr$aggregate(measures))) }) test_that("predict_type is checked", { task = tsk("sonar") learner = lrn("classif.featureless") resampling = rsmp("cv", folds = 3L) design = benchmark_grid(task, learner, resampling) bmr = benchmark(design) expect_error(bmr$aggregate("classif.auc", "predict_type")) }) test_that("custom resampling (#245)", { task_boston = tsk("california_housing") lrn = lrn("regr.featureless") rdesc = rsmp("custom") train_sets = list((1:200), (1:300), (1:400)) test_sets = list((201:301), (301:401), (401:501)) rdesc$instantiate(task_boston, train_sets, test_sets) expect_resample_result(mlr3::resample(task_boston, lrn, rdesc)) design = data.table(task = list(task_boston), learner = list(lrn), resampling = list(rdesc)) bmr = benchmark(design) expect_benchmark_result(bmr) # Issue #451 design = benchmark_grid(tasks = task_boston, learners = lrn, resamplings = rdesc) expect_data_table(design, nrows = 1) }) test_that("extract params in aggregate and score", { # set params differently in a few learners lrns = list( lrn("classif.rpart", id = "rp1", xval = 0), lrn("classif.rpart", id = "rp2", xval = 0, cp = 0.2, minsplit = 2), lrn("classif.rpart", id = "rp3", xval = 0, cp = 0.1) ) bmr = benchmark(benchmark_grid(tsk("wine"), lrns, rsmp("cv", folds = 3))) aggr = bmr$aggregate(params = TRUE) setorder(aggr, "learner_id") expect_list(aggr$params[[1]], names = "unique", len = 1L) expect_list(aggr$params[[2]], names = "unique", len = 3L) expect_list(aggr$params[[3]], names = "unique", len = 2L) scores = bmr$score() pvs = map(scores$learner, function(l) l$param_set$values) expect_true(all(sapply(split(lengths(pvs), scores$nr), uniqueN) == 1)) expect_set_equal(lengths(pvs), 1:3) # only one params lrns = mlr_learners$mget("classif.featureless") bmr = benchmark(benchmark_grid(tsk("wine"), lrns, rsmp("cv", folds = 3))) aggr = bmr$aggregate(params = TRUE) expect_list(aggr$params[[1]], names = "unique", len = 1L) # no params lrns = mlr_learners$mget("classif.debug") lrns$classif.debug$param_set$values = list() bmr = benchmark(benchmark_grid(tsk("wine"), lrns, rsmp("cv", folds = 3))) aggr = bmr$aggregate(params = TRUE) expect_list(aggr$params[[1]], names = "unique", len = 0L) expect_true(all(c("warnings", "errors") %chin% names(bmr$score(conditions = TRUE)))) }) test_that("benchmark_grid", { learner = lrn("classif.rpart") tasks = tsks(c("iris", "iris")) resamp = rsmp("cv")$instantiate(tasks[[1]]) expect_data_table(benchmark_grid(tasks, learner, resamp)) tasks = tsks(c("iris", "sonar")) resamp = rsmp("cv")$instantiate(tasks[[1]]) expect_error(benchmark_grid(tasks, learner, resamp), "not instantiated") }) test_that("filter", { tasks = lapply(c("iris", "sonar"), tsk) learners = lapply(c("classif.featureless", "classif.rpart"), lrn) resamplings = list(rsmp("cv", folds = 3), rsmp("holdout")) design = benchmark_grid(tasks, learners, resamplings) bmr = benchmark(design) expect_data_table(get_private(bmr)$.data$data$fact, nrows = 16) bmr$filter(task_ids = "sonar") expect_data_table(get_private(bmr)$.data$data$fact, nrows = 8) expect_resultdata(get_private(bmr)$.data, TRUE) bmr$filter(learner_ids = "classif.rpart") expect_data_table(get_private(bmr)$.data$data$fact, nrows = 4) expect_resultdata(get_private(bmr)$.data, TRUE) bmr2 = bmr$clone(deep = TRUE)$filter(resampling_ids = "cv") expect_data_table(get_private(bmr2)$.data$data$fact, nrows = 3) expect_resultdata(get_private(bmr2)$.data, TRUE) bmr$filter(i = 2) expect_data_table(get_private(bmr)$.data$data$fact, nrows = 1) expect_resultdata(get_private(bmr)$.data, TRUE) expect_benchmark_result(bmr) expect_benchmark_result(bmr2) }) test_that("aggregated performance values are calculated correctly (#555)", { task = tsk("spam") learner1 = lrn("classif.featureless") learner2 = lrn("classif.rpart") resampling = rsmp("subsampling", repeats = 2) design = benchmark_grid(task, list(learner1, learner2), resampling) bmr = benchmark(design = design, store_models = TRUE) y = bmr$aggregate()$classif.ce expect_gt(y[1], y[2]) y = c( bmr$resample_result(1)$aggregate(msr("classif.ce")), bmr$resample_result(2)$aggregate(msr("classif.ce")) ) expect_gt(y[1], y[2]) }) test_that("save/load roundtrip", { path = tempfile() saveRDS(bmr, file = path) bmr2 = readRDS(path) expect_benchmark_result(bmr2) }) test_that("debug branch", { tmp = tsk("iris", id = "iris_small")$select("Sepal.Length") tasks = c(mlr_tasks$mget(c("iris", "sonar")), list(tmp)) learners = mlr_learners$mget(c("classif.featureless", "classif.rpart")) resamplings = rsmp("cv", folds = 2) design = benchmark_grid(tasks, learners, resamplings) bmr = invoke(benchmark, design, .opts = list(mlr3.debug = TRUE)) expect_benchmark_result(bmr) }) # uncomment when evaluate 1.0.4 is released # test_that("encapsulatiion", { # learners = list(lrn("classif.debug", error_train = 1), lrn("classif.rpart")) # grid = benchmark_grid(tasks, learners, resamplings) # expect_error(benchmark(grid), "classif.debug->train()") # bmr = benchmark(grid, encapsulate = "evaluate") # aggr = bmr$aggregate(conditions = TRUE) # expect_true(all(aggr[learner_id == "classif.debug", errors] == 3L)) # expect_true(all(aggr[learner_id != "classif.debug", errors] == 0L)) # for (learner in bmr$learners$learner) { # expect_class(learner$fallback, "LearnerClassifFeatureless") # expect_equal(learner$encapsulation[["train"]], "evaluate") # expect_equal(learner$encapsulation[["predict"]], "evaluate") # } # }) test_that("disable cloning", { grid = benchmark_grid( tasks = tsk("iris"), learners = lrn("classif.featureless"), resamplings = rsmp("holdout") ) task = grid$task[[1L]] learner = grid$learner[[1L]] resampling = grid$resampling[[1L]] bmr = benchmark(grid, clone = c()) expect_same_address(task, bmr$tasks$task[[1]]) expect_same_address(learner, get_private(bmr)$.data$data$learners$learner[[1]]) expect_same_address(resampling, bmr$resamplings$resampling[[1]]) expect_identical(task$hash, bmr$tasks$task[[1]]$hash) expect_identical(learner$hash, bmr$learners$learner[[1]]$hash) expect_identical(resampling$hash, bmr$resamplings$resampling[[1]]$hash) }) test_that("task and learner assertions", { grid = benchmark_grid( tasks = tsks(c("iris", "california_housing")), learners = lrn("classif.rpart"), resamplings = rsmp("holdout") ) expect_error(benchmark(grid), "task types") grid = benchmark_grid( tasks = tsk("iris"), learners = lrns(c("classif.rpart", "regr.rpart")), resamplings = rsmp("holdout") ) expect_error(benchmark(grid), "learner types") grid = benchmark_grid( tasks = tsk("iris"), learners = lrn("regr.rpart"), resamplings = rsmp("holdout") ) expect_error(benchmark(grid), "not match type") }) test_that("benchmark_grid works if paired = TRUE", { tasks = mlr3::tsks(c("pima", "iris")) learners = lrns(c("classif.featureless", "classif.rpart")) resampling = rsmp("cv") resamplings = pmap( list(tasks, rsmps(c("cv", "holdout"))), function(task, resampling) resampling$instantiate(task) ) design = benchmark_grid(tasks, learners, resamplings, paired = TRUE) expect_class(design, "benchmark_grid") # design[, identical(task), by = task]] # expect(identical(design$resampling[class(learner)[[1]] ==)])) expect_true(nrow(design) == 4L) # expect_identical(design$task[[1]], design$task[[2]]) expect_identical(design$task[[3]], design$task[[4]]) expect_false(identical(design$task[[1]], design$task[[3]])) expect_identical(design$resampling[[1]], design$resampling[[2]]) expect_identical(design$resampling[[3]], design$resampling[[4]]) expect_false(identical(design$resampling[[1]], design$resampling[[3]])) expect_identical(design$learner[[1]], design$learner[[3]]) expect_identical(design$learner[[2]], design$learner[[4]]) expect_false(identical(design$learner[[2]], design$learner[[3]])) # Resamplings must be instantiated tasks = tsks(c("pima", "iris")) learners = lrns(c("classif.featureless", "classif.rpart")) resamplings = mlr3::rsmps(c("cv", "holdout")) expect_error(benchmark_grid(tasks, learners, resamplings, paired = TRUE)) # Resamplings and tasks must have the same length tasks = tsks(c("pima", "iris")) learners = lrns(c("classif.featureless", "classif.rpart")) resamplings = pmap( list(tasks, mlr3::rsmps(c("cv", "holdout"))), function(task, resampling) resampling$instantiate(task) ) resamplings = c(resamplings, resamplings) expect_error(benchmark_grid(tasks, learners, resamplings, paired = TRUE)) # Resamplings and tasks must have corresponding hashes tasks = tsks(c("pima", "iris")) learners = lrns(c("classif.featureless", "classif.rpart")) resamplings = pmap( list(tasks, mlr3::rsmps(c("cv", "holdout"))), function(task, resampling) resampling$instantiate(task) ) resamplings = rev(resamplings) expect_error(benchmark_grid(tasks, learners, resamplings, paired = TRUE)) }) test_that("param_values in benchmark", { # setup tasks = tsks("iris") resamplings = list(rsmp("cv", folds = 3)$instantiate(tasks[[1]])) learners = lrns("classif.debug") # single parameter set via manual design design = data.table(task = tasks, learner = learners, resampling = resamplings, param_values = list(list(list(x = 1)))) bmr = benchmark(design) expect_benchmark_result(bmr) expect_equal(bmr$n_resample_results, 1) expect_equal(nrow(as.data.table(bmr)), 3) learner = bmr$resample_result(1)$learner expect_equal(learner$param_set$values$x, 1) expect_equal(nrow(as.data.table(bmr)), 3) # multiple parameters set via manual design design = data.table(task = tasks, learner = learners, resampling = resamplings, param_values = list(list(list(x = 1), list(x = 0.5)))) bmr = benchmark(design) expect_benchmark_result(bmr) expect_equal(bmr$n_resample_results, 2) expect_equal(nrow(as.data.table(bmr)), 6) learner = bmr$resample_result(1)$learner expect_equal(learner$param_set$values$x, 1) learner = bmr$resample_result(2)$learner expect_equal(learner$param_set$values$x, 0.5) # benchmark grid does not attach param_values if empty design = benchmark_grid(tasks, learners, resamplings) expect_names(names(design), permutation.of = c("task", "learner", "resampling")) # benchmark grid with param_values design = benchmark_grid(tasks, learners, resamplings, param_values = list(list(list(x = 1)))) expect_data_table(design, nrows = 1) expect_names(names(design), permutation.of = c("task", "learner", "resampling", "param_values")) bmr = benchmark(design) expect_benchmark_result(bmr) # benchmark grid with param_values and paired = TRUE design = benchmark_grid(tasks, learners, resamplings, param_values = list(list(list(x = 1))), paired = TRUE) expect_data_table(design, nrows = 1) bmr = benchmark(design) expect_benchmark_result(bmr) expect_equal(bmr$n_resample_results, 1) # benchmark grid with multiple params design = benchmark_grid(tasks, learners, resamplings, param_values = list(list(list(x = 1), list(x = 0.5)))) expect_data_table(design, nrows = 1) bmr = benchmark(design) expect_benchmark_result(bmr) expect_equal(bmr$n_resample_results, 2) # benchmark grid with multiple params and multiple learners design = benchmark_grid(tasks, lrns(c("classif.debug", "classif.rpart")), rsmp("holdout"), param_values = list(list(list(x = 1), list(x = 0.5)), list())) bmr = benchmark(design) expect_benchmark_result(bmr) expect_equal(bmr$n_resample_results, 3) # constant values are inserted learners = lrns("classif.rpart", minsplit = 12) design = data.table(task = tasks, learner = learners, resampling = resamplings, param_values = list(list(list(cp = 0.1), list(minbucket = 2)))) bmr = benchmark(design) sortnames = function(x) { if (!is.null(names(x))) { x <- x[order(names(x))] } x } trained = bmr$learners$learner ii = which(map_lgl(trained, function(x) "cp" %chin% names(x$param_set$values))) # find learner with cp expect_count(ii) expect_equal(sortnames(bmr$learners$learner[-ii][[1]]$param_set$values), list(minbucket = 2, minsplit = 12, xval = 0)) expect_equal(sortnames(bmr$learners$learner[[ii]]$param_set$values), list(cp = 0.1, minsplit = 12, xval = 0)) }) test_that("learner's validate cannot be 'test' if internal_valid_set is present", { # otherwise, predict_set = "internal_valid" would be ambiguous learner = lrn("classif.debug", validate = "test", predict_sets = c("train", "internal_valid")) task = tsk("iris") task$internal_valid_task = 1 expect_error(benchmark(benchmark_grid(task, learner, rsmp("holdout"))), "cannot be set to ") }) test_that("learner's validate cannot be a ratio if internal_valid_set is present", { # otherwise, predict_set = "internal_valid" would be ambiguous learner = lrn("classif.debug", validate = 0.5, predict_sets = c("train", "internal_valid")) task = tsk("iris") task$internal_valid_task = 1 expect_error(benchmark(benchmark_grid(task, learner, rsmp("holdout"))), "cannot be set to ") }) test_that("properties are also checked on validation task", { task = tsk("iris") row = task$data(1) row[[1]][1] = NA row$..row_id = 151 task$rbind(row) task$internal_valid_task = 151 learner = lrn("classif.debug", validate = "predefined") learner$properties = setdiff(learner$properties, "missings") expect_error(benchmark(benchmark_grid(task, learner, rsmp("holdout"))), "missing values") }) test_that("unmarshal parameter is respected", { learner = lrn("classif.debug", count_marshaling = TRUE) learner$encapsulate("callr", lrn("classif.featureless")) task = tsk("iris") resampling = rsmp("holdout") design = benchmark_grid(task, learner, resampling) bmr = with_future(future::multisession, { list( marshaled = benchmark(design, store_models = TRUE, unmarshal = FALSE), unmarshaled = benchmark(design, store_models = TRUE, unmarshal = TRUE) ) }) expect_false(bmr$unmarshaled$resample_result(1)$learners[[1]]$marshaled) expect_true(bmr$marshaled$resample_result(1)$learners[[1]]$marshaled) }) test_that("BenchmarkResult can be (un)marshaled", { bmr = benchmark(benchmark_grid(tsk("iris"), lrn("classif.debug"), rsmp("holdout")), store_models = TRUE) expect_false(bmr$resample_result(1)$learners[[1]]$marshaled) bmr$marshal() expect_true(bmr$resample_result(1)$learners[[1]]$marshaled) bmr$unmarshal() expect_false(bmr$resample_result(1)$learners[[1]]$marshaled) # also works with non-marshalable learner bmr1 = benchmark(benchmark_grid(tsk("iris"), lrn("classif.featureless"), rsmp("holdout")), store_models = TRUE) model = bmr1$resample_result(1)$learners[[1]]$model bmr1$unmarshal() expect_equal(bmr1$resample_result(1)$learners[[1]]$model, model) }) test_that("obs_loss", { bmr = benchmark(benchmark_grid( tsk("iris"), lrn("classif.rpart"), rsmp("holdout") )) tbl = bmr$obs_loss(msrs(c("classif.acc", "classif.auc"))) expect_true(all(is.na(tbl$classif.auc))) expect_integer(tbl$classif.acc) }) test_that("predictions retrieved with as.data.table and predictions method are equal", { tab = as.data.table(bmr) predictions = unlist(map(bmr$resample_results$resample_result, function(rr) rr$predictions()), recursive = FALSE) expect_equal(tab$prediction, predictions) tab = as.data.table(bmr, predict_sets = "train") predictions = unlist(map(bmr$resample_results$resample_result, function(rr) rr$predictions(predict_sets = "train")), recursive = FALSE) expect_equal(tab$prediction, predictions) }) test_that("score works with predictions and empty predictions", { learner_1 = lrn("classif.rpart", predict_sets = "train", id = "learner_1") learner_2 = lrn("classif.rpart", predict_sets = "test", id = "learner_2") task = tsk("pima") design = benchmark_grid(task, list(learner_1, learner_2), rsmp("holdout")) bmr = benchmark(design) expect_warning({tab = bmr$score(msr("classif.ce", predict_sets = "test"))}, "Measure") expect_equal(tab$classif.ce[1], NaN) }) test_that("benchmark_grid only allows unique learner ids", { task = tsk("iris") learner = lrn("classif.rpart") resampling = rsmp("holdout") expect_error(benchmark_grid(task, list(learner, learner), resampling), "unique") }) test_that("benchmark allows that param_values overwrites tune token", { learner = lrn("classif.rpart", cp = to_tune(0.01, 0.1)) design = benchmark_grid(tsk("pima"), learner, rsmp("cv", folds = 3), param_values = list(list(list(cp = 0.01)))) expect_benchmark_result(benchmark(design)) learner = lrn("classif.rpart", cp = to_tune(0.01, 0.1)) design = benchmark_grid(tsk("pima"), learner, rsmp("cv", folds = 3)) expect_error(benchmark(design), "cannot be trained with TuneToken present in hyperparameter") }) test_that("uhash_table works", { design = benchmark_grid(tsks(c("iris", "sonar")), lrns(c("classif.debug", "classif.featureless")), rsmps(c("holdout", "insample"))) bmr = benchmark(design) u = bmr$uhash_table # results agree with uhash_table from resample result, which is also tested for correctness for (i in seq_len(nrow(u))) { rr = bmr$resample_result(i) learner_id = rr$learner$id task_id = rr$task$id resampling_id = rr$resampling$id expect_equal(u$learner_id[i], learner_id) expect_equal(u$task_id[i], task_id) expect_equal(u$resampling_id[i], resampling_id) expect_equal(u$uhash[i], rr$uhash) } # uhash is in correct order expect_equal(u$uhash, bmr$uhashes) expect_equal(u$uhash, as.data.table(bmr)$uhash) }) test_that("can change the threshold", { task = tsk("iris")$filter(1:80)$droplevels("Species") design = benchmark_grid(task, lrn("classif.featureless", predict_type = "prob"), rsmp("insample")) bmr = benchmark(design) # we can set the threshold and pass ties_method correctly expect_true(all(bmr$resample_result(1)$prediction()$response == "setosa")) bmr$set_threshold(0.9) expect_true(all(bmr$resample_result(1)$prediction()$response == "versicolor")) bmr$set_threshold(0.1) expect_true(all(bmr$resample_result(1)$prediction()$response == "setosa")) bmr$set_threshold(0.625, ties_method = "first") expect_true(all(bmr$resample_result(1)$prediction()$response == "setosa")) bmr$set_threshold(0.625, ties_method = "last") expect_true(all(bmr$resample_result(1)$prediction()$response == "versicolor")) with_seed(1, { bmr$set_threshold(0.625, ties_method = "random") expect_true("setosa" %in% bmr$resample_result(1)$prediction()$response && "versicolor" %in% bmr$resample_result(1)$prediction()$response) }) # Don't modify any threshold when at least one operation is invalid design = suppressWarnings(benchmark_grid( task, c(lrn("classif.featureless", predict_type = "prob"), lrn("classif.debug")), rsmp("insample") )) bmr = benchmark(design) response = bmr$resample_result(1)$prediction()$response expect_error(bmr$set_threshold(0.9), "Cannot set threshold, no probabilities available") # the other prediction was also not affected, we want to avoid partial updates expect_equal(bmr$resample_result(1)$prediction()$response, response) bmr$set_threshold(0.9, uhashes = uhashes(bmr, learner_ids = "classif.featureless")) expect_true(all(bmr$resample_result(1)$prediction()$response == "versicolor")) # can also use the iters argument design = benchmark_grid( task, c(lrn("classif.featureless", predict_type = "prob"), lrn("classif.debug", predict_type = "prob")), rsmp("insample") ) bmr = benchmark(design) bmr$set_threshold(0.9, i = 1) expect_true(all(bmr$resample_result(1)$prediction()$response == "versicolor")) expect_false(all(bmr$resample_result(2)$prediction()$response == "versicolor")) }) test_that("uhashe(s) work", { design = benchmark_grid( tsks(c("iris", "sonar")), lrns(c("classif.featureless", "classif.rpart")), rsmp("holdout") ) bmr = benchmark(design) tbl = bmr$uhash_table expect_equal(bmr$uhashes, uhashes(bmr)) expect_equal(tbl[get("learner_id") == "classif.debug", "uhash"]$uhash, uhashes(bmr, learner_ids = "classif.debug")) expect_equal(tbl[get("task_id") == "sonar", "uhash"]$uhash, uhashes(bmr, task_ids = "sonar")) expect_equal(tbl[get("resampling_id") == "holdout", "uhash"]$uhash, uhashes(bmr, resampling_ids = "holdout")) all_uhashes = bmr$uhashes expect_equal(length(all_uhashes), 4) # 2 tasks * 2 learners # Test filtering by single ID featureless_uhashes = uhashes(bmr, learner_ids = "classif.featureless") expect_equal(length(featureless_uhashes), 2) expect_true(all(featureless_uhashes %in% all_uhashes)) iris_uhashes = uhashes(bmr, task_ids = "iris") expect_equal(length(iris_uhashes), 2) expect_true(all(iris_uhashes %in% all_uhashes)) holdout_uhashes = uhashes(bmr, resampling_ids = "holdout") expect_equal(length(holdout_uhashes), 4) expect_true(all(holdout_uhashes %in% all_uhashes)) # Test filtering by multiple IDs learner_subset = uhashes(bmr, learner_ids = c("classif.featureless", "classif.rpart")) expect_equal(length(learner_subset), 4) expect_setequal(learner_subset, all_uhashes) task_subset = uhashes(bmr, task_ids = c("iris", "sonar")) expect_equal(length(task_subset), 4) expect_setequal(task_subset, all_uhashes) # Test combined filtering iris_featureless = uhashes(bmr, learner_ids = "classif.featureless", task_ids = "iris" ) expect_equal(length(iris_featureless), 1) # Test uhash function with single valid combination single_uhash = uhash(bmr, learner_id = "classif.featureless", task_id = "iris", resampling_id = "holdout" ) expect_string(single_uhash) expect_true(single_uhash %in% all_uhashes) expect_error(uhash(bmr), "got 4") # no match expect_equal(uhashes(bmr, "not-existing"), character(0)) expect_error(uhash(bmr, "not-existing"), "Expected exactly one uhash") expect_equal(bmr$uhashes, uhashes(bmr)) expect_equal(bmr$filter(1)$uhashes, uhash(bmr)) }) test_that("resampling validation", { # test with uninstantiated resampling task = tsk("iris") learner = lrn("classif.rpart") resampling = rsmp("holdout") design = data.table(task = list(task), learner = list(learner), resampling = list(resampling)) expect_error(benchmark(design), "instantiated") # test with resampling instantiated on wrong task task1 = tsk("iris") task2 = tsk("pima") resampling = rsmp("holdout") resampling$instantiate(task1) design = data.table(task = list(task2), learner = list(learner), resampling = list(resampling)) expect_error(benchmark(design), "not instantiated") # test with resampling instantiated on filtered task task = tsk("iris") resampling = rsmp("holdout") resampling$instantiate(task) task$filter(1:100) design = data.table(task = list(task), learner = list(learner), resampling = list(resampling)) expect_error(benchmark(design), "not instantiated") # test with resampling instantiated on correct task task = tsk("iris") resampling = rsmp("holdout") resampling$instantiate(task) design = data.table(task = list(task), learner = list(learner), resampling = list(resampling)) expect_benchmark_result(benchmark(design)) }) test_that("warning when mixing predict types", { expect_warning(benchmark_grid( tsk("iris"), list( lrn("classif.debug", predict_type = "prob"), lrn("classif.featureless", predict_type = "response") ), rsmp("cv", folds = 3) ), regexp = "Multiple predict types detected") }) test_that("benchmark with tasks with weights", { learners = list( lrn("classif.featureless", use_weights = "ignore", predict_type = "prob", id = "ignores_weights"), lrn("classif.featureless", use_weights = "use", predict_type = "prob", id = "uses_weights") ) measures = list( msr("classif.acc", use_weights = "ignore", id = "acc_ignore"), msr("classif.acc", use_weights = "use", id = "acc_use") ) tasks = list( iris_weights_learner, iris_weights_measure, tsk("iris") ) resamplings = list( rsmp("custom")$instantiate(tsk("iris"), train_sets = list(c(1:50, 140:150)), test_sets = list(c(1, 150)) ) ) design = benchmark_grid(tasks, learners, resamplings) bmr = benchmark(design) predictions = map(1:6, function(i) bmr$resample_result(i)$prediction()) # learner ignores weights, sees 50 'setosa' and 10 'virginica' -> predicts setosa expect_equal(predictions[[1]]$response[1], factor("setosa", levels = levels(iris$Species))) # learner uses weights, sees 50 'setosa' (weight 1) and 10 'virginica' (weight 100) -> predicts virginica expect_equal(predictions[[2]]$response[1], factor("virginica", levels = levels(iris$Species))) # task has no weights_learner -> unweighted 'setosa' predictions expect_equal(predictions[[3]]$response[1], factor("setosa", levels = levels(iris$Species))) expect_equal(predictions[[4]]$response[1], factor("setosa", levels = levels(iris$Species))) expect_equal(predictions[[5]]$response[1], factor("setosa", levels = levels(iris$Species))) expect_equal(predictions[[6]]$response[1], factor("setosa", levels = levels(iris$Species))) # 'weights' is NULL for all predictions not made for iris_weights_measure expect_null(predictions[[1]]$weights) expect_null(predictions[[2]]$weights) expect_null(predictions[[5]]$weights) expect_null(predictions[[6]]$weights) agpred = bmr$aggregate(measures) expect_equal(agpred$acc_ignore, rep(0.5, 6)) # made one correct, one incorrect prediction, unweighted # made one correct, one incorrect prediction, but 2nd task weighs the incorrect prediction x100 expect_equal(agpred$acc_use, c(0.5, 0.5, 1 / 101, 1 / 101, 0.5, 0.5)) expect_error(bmr$aggregate(msr("classif.acc", use_weights = "error")), "'use_weights' was set to 'error'") # no error when task has no weights_measure bmr = benchmark(design[c(1:2, 5:6)]) expect_equal(bmr$aggregate(msr("classif.acc", use_weights = "error"))$classif.acc, c(0.5, 0.5, 0.5, 0.5)) learners[[1]]$use_weights = "error" design = benchmark_grid(tasks, learners, resamplings) expect_error(benchmark(design), "'use_weights' was set to 'error'") # no error when task has no weights_learner design = benchmark_grid(tasks[2:3], learners, resamplings) bmr = benchmark(design) })