data(agaricus.train, package = "lightgbm") data(agaricus.test, package = "lightgbm") train <- agaricus.train test <- agaricus.test set.seed(708L) # [description] Every time this function is called, it adds 0.1 # to an accumulator then returns the current value. # This is used to mock the situation where an evaluation # metric increases every iteration ACCUMULATOR_NAME <- "INCREASING_METRIC_ACUMULATOR" assign(x = ACCUMULATOR_NAME, value = 0.0, envir = .GlobalEnv) .increasing_metric <- function(preds, dtrain) { if (!exists(ACCUMULATOR_NAME, envir = .GlobalEnv)) { assign(ACCUMULATOR_NAME, 0.0, envir = .GlobalEnv) } assign( x = ACCUMULATOR_NAME , value = get(ACCUMULATOR_NAME, envir = .GlobalEnv) + 0.1 , envir = .GlobalEnv ) return(list( name = "increasing_metric" , value = get(ACCUMULATOR_NAME, envir = .GlobalEnv) , higher_better = TRUE )) } # [description] Evaluation function that always returns the # same value CONSTANT_METRIC_VALUE <- 0.2 .constant_metric <- function(preds, dtrain) { return(list( name = "constant_metric" , value = CONSTANT_METRIC_VALUE , higher_better = FALSE )) } # sample datasets to test early stopping DTRAIN_RANDOM_REGRESSION <- lgb.Dataset( data = as.matrix(rnorm(100L), ncol = 1L, drop = FALSE) , label = rnorm(100L) , params = list(num_threads = .LGB_MAX_THREADS) ) DVALID_RANDOM_REGRESSION <- lgb.Dataset( data = as.matrix(rnorm(50L), ncol = 1L, drop = FALSE) , label = rnorm(50L) , params = list(num_threads = .LGB_MAX_THREADS) ) DTRAIN_RANDOM_CLASSIFICATION <- lgb.Dataset( data = as.matrix(rnorm(120L), ncol = 1L, drop = FALSE) , label = sample(c(0L, 1L), size = 120L, replace = TRUE) , params = list(num_threads = .LGB_MAX_THREADS) ) DVALID_RANDOM_CLASSIFICATION <- lgb.Dataset( data = as.matrix(rnorm(37L), ncol = 1L, drop = FALSE) , label = sample(c(0L, 1L), size = 37L, replace = TRUE) , params = list(num_threads = .LGB_MAX_THREADS) ) test_that("train and predict binary classification", { nrounds <- 10L bst <- lightgbm( data = train$data , label = train$label , params = list( num_leaves = 5L , objective = "binary" , metric = "binary_error" , verbose = .LGB_VERBOSITY , num_threads = .LGB_MAX_THREADS ) , nrounds = nrounds , valids = list( "train" = lgb.Dataset( data = train$data , label = train$label ) ) ) expect_false(is.null(bst$record_evals)) record_results <- lgb.get.eval.result(bst, "train", "binary_error") expect_lt(min(record_results), 0.02) pred <- predict(bst, test$data) expect_equal(length(pred), 1611L) pred1 <- predict(bst, train$data, num_iteration = 1L) expect_equal(length(pred1), 6513L) err_pred1 <- sum((pred1 > 0.5) != train$label) / length(train$label) err_log <- record_results[1L] expect_lt(abs(err_pred1 - err_log), .LGB_NUMERIC_TOLERANCE) }) test_that("train and predict softmax", { set.seed(708L) X_mat <- as.matrix(iris[, -5L]) lb <- as.numeric(iris$Species) - 1L bst <- lightgbm( data = X_mat , label = lb , params = list( num_leaves = 4L , learning_rate = 0.05 , min_data = 20L , min_hessian = 10.0 , objective = "multiclass" , metric = "multi_error" , num_class = 3L , verbose = .LGB_VERBOSITY , num_threads = .LGB_MAX_THREADS ) , nrounds = 20L , valids = list( "train" = lgb.Dataset( data = X_mat , label = lb ) ) ) expect_false(is.null(bst$record_evals)) record_results <- lgb.get.eval.result(bst, "train", "multi_error") expect_lt(min(record_results), 0.06) pred <- predict(bst, as.matrix(iris[, -5L])) expect_equal(length(pred), nrow(iris) * 3L) }) test_that("use of multiple eval metrics works", { metrics <- list("binary_error", "auc", "binary_logloss") bst <- 
lightgbm( data = train$data , label = train$label , params = list( num_leaves = 4L , learning_rate = 1.0 , objective = "binary" , metric = metrics , verbose = .LGB_VERBOSITY , num_threads = .LGB_MAX_THREADS ) , nrounds = 10L , valids = list( "train" = lgb.Dataset( data = train$data , label = train$label , params = list(num_threads = .LGB_MAX_THREADS) ) ) ) expect_false(is.null(bst$record_evals)) expect_named( bst$record_evals[["train"]] , unlist(metrics) , ignore.order = FALSE , ignore.case = FALSE ) }) test_that("lgb.Booster.upper_bound() and lgb.Booster.lower_bound() work as expected for binary classification", { set.seed(708L) nrounds <- 10L bst <- lightgbm( data = train$data , label = train$label , params = list( num_leaves = 5L , objective = "binary" , metric = "binary_error" , verbose = .LGB_VERBOSITY , num_threads = .LGB_MAX_THREADS ) , nrounds = nrounds ) expect_true(abs(bst$lower_bound() - -1.590853) < .LGB_NUMERIC_TOLERANCE) expect_true(abs(bst$upper_bound() - 1.871015) < .LGB_NUMERIC_TOLERANCE) }) test_that("lgb.Booster.upper_bound() and lgb.Booster.lower_bound() work as expected for regression", { set.seed(708L) nrounds <- 10L bst <- lightgbm( data = train$data , label = train$label , params = list( num_leaves = 5L , objective = "regression" , metric = "l2" , verbose = .LGB_VERBOSITY , num_threads = .LGB_MAX_THREADS ) , nrounds = nrounds ) expect_true(abs(bst$lower_bound() - 0.1513859) < .LGB_NUMERIC_TOLERANCE) expect_true(abs(bst$upper_bound() - 0.9080349) < .LGB_NUMERIC_TOLERANCE) }) test_that("lightgbm() rejects negative or 0 value passed to nrounds", { dtrain <- lgb.Dataset(train$data, label = train$label) params <- list(objective = "regression", metric = "l2,l1", num_threads = .LGB_MAX_THREADS) for (nround_value in c(-10L, 0L)) { expect_error({ bst <- lightgbm( data = dtrain , params = params , nrounds = nround_value ) }, "nrounds should be greater than zero") } }) test_that("lightgbm() accepts nrounds as either a top-level argument or parameter", { nrounds <- 15L set.seed(708L) top_level_bst <- lightgbm( data = train$data , label = train$label , nrounds = nrounds , params = list( objective = "regression" , metric = "l2" , num_leaves = 5L , verbose = .LGB_VERBOSITY , num_threads = .LGB_MAX_THREADS ) ) set.seed(708L) param_bst <- lightgbm( data = train$data , label = train$label , params = list( objective = "regression" , metric = "l2" , num_leaves = 5L , nrounds = nrounds , verbose = .LGB_VERBOSITY , num_threads = .LGB_MAX_THREADS ) ) set.seed(708L) both_customized <- lightgbm( data = train$data , label = train$label , nrounds = 20L , params = list( objective = "regression" , metric = "l2" , num_leaves = 5L , nrounds = nrounds , verbose = .LGB_VERBOSITY , num_threads = .LGB_MAX_THREADS ) ) top_level_l2 <- top_level_bst$eval_train()[[1L]][["value"]] params_l2 <- param_bst$eval_train()[[1L]][["value"]] both_l2 <- both_customized$eval_train()[[1L]][["value"]] # check type just to be sure the subsetting didn't return a NULL expect_true(is.numeric(top_level_l2)) expect_true(is.numeric(params_l2)) expect_true(is.numeric(both_l2)) # check that model produces identical performance expect_identical(top_level_l2, params_l2) expect_identical(both_l2, params_l2) expect_identical(param_bst$current_iter(), top_level_bst$current_iter()) expect_identical(param_bst$current_iter(), both_customized$current_iter()) expect_identical(param_bst$current_iter(), nrounds) }) test_that("lightgbm() performs evaluation on validation sets if they are provided", { set.seed(708L) dvalid1 <- lgb.Dataset( 
data = train$data , label = train$label , params = list(num_threads = .LGB_MAX_THREADS) ) dvalid2 <- lgb.Dataset( data = train$data , label = train$label , params = list(num_threads = .LGB_MAX_THREADS) ) nrounds <- 10L bst <- lightgbm( data = train$data , label = train$label , params = list( num_leaves = 5L , objective = "binary" , metric = c( "binary_error" , "auc" ) , verbose = .LGB_VERBOSITY , num_threads = .LGB_MAX_THREADS ) , nrounds = nrounds , valids = list( "valid1" = dvalid1 , "valid2" = dvalid2 , "train" = lgb.Dataset( data = train$data , label = train$label , params = list(num_threads = .LGB_MAX_THREADS) ) ) ) expect_named( bst$record_evals , c("train", "valid1", "valid2", "start_iter") , ignore.order = TRUE , ignore.case = FALSE ) for (valid_name in c("train", "valid1", "valid2")) { eval_results <- bst$record_evals[[valid_name]][["binary_error"]] expect_length(eval_results[["eval"]], nrounds) } expect_true(abs(bst$record_evals[["train"]][["binary_error"]][["eval"]][[1L]] - 0.02226317) < .LGB_NUMERIC_TOLERANCE) expect_true(abs(bst$record_evals[["valid1"]][["binary_error"]][["eval"]][[1L]] - 0.02226317) < .LGB_NUMERIC_TOLERANCE) expect_true(abs(bst$record_evals[["valid2"]][["binary_error"]][["eval"]][[1L]] - 0.02226317) < .LGB_NUMERIC_TOLERANCE) }) test_that("training continuation works", { dtrain <- lgb.Dataset( train$data , label = train$label , free_raw_data = FALSE , params = list(num_threads = .LGB_MAX_THREADS) ) watchlist <- list(train = dtrain) param <- list( objective = "binary" , metric = "binary_logloss" , num_leaves = 5L , learning_rate = 1.0 , verbose = .LGB_VERBOSITY , num_threads = .LGB_MAX_THREADS ) # train for 10 consecutive iterations bst <- lgb.train(param, dtrain, nrounds = 10L, watchlist) err_bst <- lgb.get.eval.result(bst, "train", "binary_logloss", 10L) # train for 5 iterations, save, load, train for 5 more bst1 <- lgb.train(param, dtrain, nrounds = 5L, watchlist) model_file <- tempfile(fileext = ".model") lgb.save(bst1, model_file) bst2 <- lgb.train(param, dtrain, nrounds = 5L, watchlist, init_model = bst1) err_bst2 <- lgb.get.eval.result(bst2, "train", "binary_logloss", 10L) # evaluation metrics should be nearly identical for the model trained in 10 consecutive # iterations and the one trained in 5-then-5. 
expect_lt(abs(err_bst - err_bst2), 0.01) }) test_that("cv works", { dtrain <- lgb.Dataset(train$data, label = train$label) params <- list( objective = "regression" , metric = "l2,l1" , min_data = 1L , learning_rate = 1.0 , verbose = .LGB_VERBOSITY , num_threads = .LGB_MAX_THREADS ) bst <- lgb.cv( params , dtrain , 10L , nfold = 5L , early_stopping_rounds = 10L ) expect_false(is.null(bst$record_evals)) }) test_that("CVBooster$reset_parameter() works as expected", { dtrain <- lgb.Dataset(train$data, label = train$label) n_folds <- 2L cv_bst <- lgb.cv( params = list( objective = "regression" , min_data = 1L , num_leaves = 7L , verbose = .LGB_VERBOSITY , num_threads = .LGB_MAX_THREADS ) , data = dtrain , nrounds = 3L , nfold = n_folds ) expect_true(methods::is(cv_bst, "lgb.CVBooster")) expect_length(cv_bst$boosters, n_folds) for (bst in cv_bst$boosters) { expect_equal(bst[["booster"]]$params[["num_leaves"]], 7L) } cv_bst$reset_parameter(list(num_leaves = 11L)) for (bst in cv_bst$boosters) { expect_equal(bst[["booster"]]$params[["num_leaves"]], 11L) } }) test_that("lgb.cv() rejects negative or 0 value passed to nrounds", { dtrain <- lgb.Dataset(train$data, label = train$label, params = list(num_threads = 2L)) params <- list( objective = "regression" , metric = "l2,l1" , min_data = 1L , num_threads = .LGB_MAX_THREADS ) for (nround_value in c(-10L, 0L)) { expect_error({ bst <- lgb.cv( params , dtrain , nround_value , nfold = 5L ) }, "nrounds should be greater than zero") } }) test_that("lgb.cv() throws an informative error if 'data' is not an lgb.Dataset and labels are not given", { bad_values <- list( 4L , "hello" , list(a = TRUE, b = seq_len(10L)) , data.frame(x = seq_len(5L), y = seq_len(5L)) , data.table::data.table(x = seq_len(5L), y = seq_len(5L)) , matrix(data = seq_len(10L), 2L, 5L) ) for (val in bad_values) { expect_error({ bst <- lgb.cv( params = list( objective = "regression" , metric = "l2,l1" , min_data = 1L ) , data = val , 10L , nfold = 5L ) }, regexp = "'label' must be provided for lgb.cv if 'data' is not an 'lgb.Dataset'", fixed = TRUE) } }) test_that("lightgbm.cv() gives the correct best_score and best_iter for a metric where higher values are better", { set.seed(708L) dtrain <- lgb.Dataset( data = as.matrix(runif(n = 500L, min = 0.0, max = 15.0), drop = FALSE) , label = rep(c(0L, 1L), 250L) , params = list(num_threads = .LGB_MAX_THREADS) ) nrounds <- 10L cv_bst <- lgb.cv( data = dtrain , nfold = 5L , nrounds = nrounds , params = list( objective = "binary" , metric = "auc,binary_error" , learning_rate = 1.5 , num_leaves = 5L , verbose = .LGB_VERBOSITY , num_threads = .LGB_MAX_THREADS ) ) expect_true(methods::is(cv_bst, "lgb.CVBooster")) expect_named( cv_bst$record_evals , c("start_iter", "valid") , ignore.order = FALSE , ignore.case = FALSE ) auc_scores <- unlist(cv_bst$record_evals[["valid"]][["auc"]][["eval"]]) expect_length(auc_scores, nrounds) expect_identical(cv_bst$best_iter, which.max(auc_scores)) expect_identical(cv_bst$best_score, auc_scores[which.max(auc_scores)]) }) test_that("lgb.cv() fit on linearly-related data improves when using linear learners", { set.seed(708L) .new_dataset <- function() { X <- matrix(rnorm(1000L), ncol = 1L) return(lgb.Dataset( data = X , label = 2L * X + runif(nrow(X), 0L, 0.1) , params = list(num_threads = .LGB_MAX_THREADS) )) } params <- list( objective = "regression" , verbose = -1L , metric = "mse" , seed = 0L , num_leaves = 2L , num_threads = .LGB_MAX_THREADS ) dtrain <- .new_dataset() cv_bst <- lgb.cv( data = dtrain , nrounds = 10L , 
params = params , nfold = 5L ) expect_true(methods::is(cv_bst, "lgb.CVBooster")) dtrain <- .new_dataset() cv_bst_linear <- lgb.cv( data = dtrain , nrounds = 10L , params = utils::modifyList(params, list(linear_tree = TRUE)) , nfold = 5L ) expect_true(methods::is(cv_bst_linear, "lgb.CVBooster")) expect_true(cv_bst_linear$best_score < cv_bst$best_score) }) test_that("lgb.cv() respects showsd argument", { dtrain <- lgb.Dataset(train$data, label = train$label, params = list(num_threads = .LGB_MAX_THREADS)) params <- list( objective = "regression" , metric = "l2" , min_data = 1L , verbose = .LGB_VERBOSITY , num_threads = .LGB_MAX_THREADS ) nrounds <- 5L set.seed(708L) bst_showsd <- lgb.cv( params = params , data = dtrain , nrounds = nrounds , nfold = 3L , showsd = TRUE ) evals_showsd <- bst_showsd$record_evals[["valid"]][["l2"]] set.seed(708L) bst_no_showsd <- lgb.cv( params = params , data = dtrain , nrounds = nrounds , nfold = 3L , showsd = FALSE ) evals_no_showsd <- bst_no_showsd$record_evals[["valid"]][["l2"]] expect_equal( evals_showsd[["eval"]] , evals_no_showsd[["eval"]] ) expect_true(methods::is(evals_showsd[["eval_err"]], "list")) expect_equal(length(evals_showsd[["eval_err"]]), nrounds) expect_identical(evals_no_showsd[["eval_err"]], list()) }) test_that("lgb.cv() raises an informative error for unrecognized objectives", { dtrain <- lgb.Dataset( data = train$data , label = train$label , params = list(num_threads = .LGB_MAX_THREADS) ) expect_error({ capture.output({ bst <- lgb.cv( data = dtrain , params = list( objective_type = "not_a_real_objective" , verbosity = .LGB_VERBOSITY , num_threads = .LGB_MAX_THREADS ) ) }, type = "message") }, regexp = "Unknown objective type name: not_a_real_objective") }) test_that("lgb.cv() respects parameter aliases for objective", { nrounds <- 3L nfold <- 4L dtrain <- lgb.Dataset( data = train$data , label = train$label , params = list(num_threads = .LGB_MAX_THREADS) ) cv_bst <- lgb.cv( data = dtrain , params = list( num_leaves = 5L , application = "binary" , num_iterations = nrounds , verbose = .LGB_VERBOSITY , num_threads = .LGB_MAX_THREADS ) , nfold = nfold ) expect_equal(cv_bst$best_iter, nrounds) expect_named(cv_bst$record_evals[["valid"]], "binary_logloss") expect_length(cv_bst$record_evals[["valid"]][["binary_logloss"]][["eval"]], nrounds) expect_length(cv_bst$boosters, nfold) }) test_that("lgb.cv() prefers objective in params to keyword argument", { data("EuStockMarkets") cv_bst <- lgb.cv( data = lgb.Dataset( data = EuStockMarkets[, c("SMI", "CAC", "FTSE")] , label = EuStockMarkets[, "DAX"] , params = list(num_threads = .LGB_MAX_THREADS) ) , params = list( application = "regression_l1" , verbosity = .LGB_VERBOSITY , num_threads = .LGB_MAX_THREADS ) , nrounds = 5L , obj = "regression_l2" ) for (bst_list in cv_bst$boosters) { bst <- bst_list[["booster"]] expect_equal(bst$params$objective, "regression_l1") # NOTE: using save_model_to_string() since that is the simplest public API in the R package # allowing access to the "objective" attribute of the Booster object on the C++ side model_txt_lines <- strsplit( x = bst$save_model_to_string() , split = "\n" , fixed = TRUE )[[1L]] expect_true(any(model_txt_lines == "objective=regression_l1")) expect_false(any(model_txt_lines == "objective=regression_l2")) } }) test_that("lgb.cv() respects parameter aliases for metric", { nrounds <- 3L nfold <- 4L dtrain <- lgb.Dataset( data = train$data , label = train$label , params = list(num_threads = .LGB_MAX_THREADS) ) cv_bst <- lgb.cv( data = dtrain , params = 
list( num_leaves = 5L , objective = "binary" , num_iterations = nrounds , metric_types = c("auc", "binary_logloss") , verbose = .LGB_VERBOSITY , num_threads = .LGB_MAX_THREADS ) , nfold = nfold ) expect_equal(cv_bst$best_iter, nrounds) expect_named(cv_bst$record_evals[["valid"]], c("auc", "binary_logloss")) expect_length(cv_bst$record_evals[["valid"]][["binary_logloss"]][["eval"]], nrounds) expect_length(cv_bst$record_evals[["valid"]][["auc"]][["eval"]], nrounds) expect_length(cv_bst$boosters, nfold) }) test_that("lgb.cv() respects eval_train_metric argument", { dtrain <- lgb.Dataset(train$data, label = train$label) params <- list( objective = "regression" , metric = "l2" , min_data = 1L , verbose = .LGB_VERBOSITY , num_threads = .LGB_MAX_THREADS ) nrounds <- 5L set.seed(708L) bst_train <- lgb.cv( params = params , data = dtrain , nrounds = nrounds , nfold = 3L , showsd = FALSE , eval_train_metric = TRUE ) set.seed(708L) bst_no_train <- lgb.cv( params = params , data = dtrain , nrounds = nrounds , nfold = 3L , showsd = FALSE , eval_train_metric = FALSE ) expect_equal( bst_train$record_evals[["valid"]][["l2"]] , bst_no_train$record_evals[["valid"]][["l2"]] ) expect_true("train" %in% names(bst_train$record_evals)) expect_false("train" %in% names(bst_no_train$record_evals)) expect_true(methods::is(bst_train$record_evals[["train"]][["l2"]][["eval"]], "list")) expect_equal( length(bst_train$record_evals[["train"]][["l2"]][["eval"]]) , nrounds ) }) test_that("lgb.train() works as expected with multiple eval metrics", { metrics <- c("binary_error", "auc", "binary_logloss") bst <- lgb.train( data = lgb.Dataset( train$data , label = train$label , params = list(num_threads = .LGB_MAX_THREADS) ) , nrounds = 10L , params = list( objective = "binary" , metric = metrics , learning_rate = 1.0 , verbose = .LGB_VERBOSITY , num_threads = .LGB_MAX_THREADS ) , valids = list( "train" = lgb.Dataset( train$data , label = train$label , params = list(num_threads = .LGB_MAX_THREADS) ) ) ) expect_false(is.null(bst$record_evals)) expect_named( bst$record_evals[["train"]] , unlist(metrics) , ignore.order = FALSE , ignore.case = FALSE ) }) test_that("lgb.train() raises an informative error for unrecognized objectives", { dtrain <- lgb.Dataset( data = train$data , label = train$label ) expect_error({ capture.output({ bst <- lgb.train( data = dtrain , params = list( objective_type = "not_a_real_objective" , verbosity = .LGB_VERBOSITY ) ) }, type = "message") }, regexp = "Unknown objective type name: not_a_real_objective") }) test_that("lgb.train() respects parameter aliases for objective", { nrounds <- 3L dtrain <- lgb.Dataset( data = train$data , label = train$label , params = list(num_threads = .LGB_MAX_THREADS) ) bst <- lgb.train( data = dtrain , params = list( num_leaves = 5L , application = "binary" , num_iterations = nrounds , verbose = .LGB_VERBOSITY , num_threads = .LGB_MAX_THREADS ) , valids = list( "the_training_data" = dtrain ) ) expect_named(bst$record_evals[["the_training_data"]], "binary_logloss") expect_length(bst$record_evals[["the_training_data"]][["binary_logloss"]][["eval"]], nrounds) expect_equal(bst$params[["objective"]], "binary") }) test_that("lgb.train() prefers objective in params to keyword argument", { data("EuStockMarkets") bst <- lgb.train( data = lgb.Dataset( data = EuStockMarkets[, c("SMI", "CAC", "FTSE")] , label = EuStockMarkets[, "DAX"] , params = list(num_threads = .LGB_MAX_THREADS) ) , params = list( loss = "regression_l1" , verbosity = .LGB_VERBOSITY , num_threads = .LGB_MAX_THREADS ) 
, nrounds = 5L , obj = "regression_l2" ) expect_equal(bst$params$objective, "regression_l1") # NOTE: using save_model_to_string() since that is the simplest public API in the R package # allowing access to the "objective" attribute of the Booster object on the C++ side model_txt_lines <- strsplit( x = bst$save_model_to_string() , split = "\n" , fixed = TRUE )[[1L]] expect_true(any(model_txt_lines == "objective=regression_l1")) expect_false(any(model_txt_lines == "objective=regression_l2")) }) test_that("lgb.train() respects parameter aliases for metric", { nrounds <- 3L dtrain <- lgb.Dataset( data = train$data , label = train$label , params = list(num_threads = .LGB_MAX_THREADS) ) bst <- lgb.train( data = dtrain , params = list( num_leaves = 5L , objective = "binary" , num_iterations = nrounds , metric_types = c("auc", "binary_logloss") , verbose = .LGB_VERBOSITY , num_threads = .LGB_MAX_THREADS ) , valids = list( "train" = dtrain ) ) record_results <- bst$record_evals[["train"]] expect_equal(sort(names(record_results)), c("auc", "binary_logloss")) expect_length(record_results[["auc"]][["eval"]], nrounds) expect_length(record_results[["binary_logloss"]][["eval"]], nrounds) expect_equal(bst$params[["metric"]], list("auc", "binary_logloss")) }) test_that("lgb.train() rejects negative or 0 value passed to nrounds", { dtrain <- lgb.Dataset(train$data, label = train$label, params = list(num_threads = .LGB_MAX_THREADS)) params <- list( objective = "regression" , metric = "l2,l1" , verbose = .LGB_VERBOSITY , num_threads = .LGB_MAX_THREADS ) for (nround_value in c(-10L, 0L)) { expect_error({ bst <- lgb.train( params , dtrain , nround_value ) }, "nrounds should be greater than zero") } }) test_that("lgb.train() accepts nrounds as either a top-level argument or parameter", { nrounds <- 15L set.seed(708L) top_level_bst <- lgb.train( data = lgb.Dataset( train$data , label = train$label , params = list(num_threads = .LGB_MAX_THREADS) ) , nrounds = nrounds , params = list( objective = "regression" , metric = "l2" , num_leaves = 5L , verbose = .LGB_VERBOSITY , num_threads = .LGB_MAX_THREADS ) ) set.seed(708L) param_bst <- lgb.train( data = lgb.Dataset( train$data , label = train$label , params = list(num_threads = .LGB_MAX_THREADS) ) , params = list( objective = "regression" , metric = "l2" , num_leaves = 5L , nrounds = nrounds , verbose = .LGB_VERBOSITY ) ) set.seed(708L) both_customized <- lgb.train( data = lgb.Dataset( train$data , label = train$label , params = list(num_threads = .LGB_MAX_THREADS) ) , nrounds = 20L , params = list( objective = "regression" , metric = "l2" , num_leaves = 5L , nrounds = nrounds , verbose = .LGB_VERBOSITY , num_threads = .LGB_MAX_THREADS ) ) top_level_l2 <- top_level_bst$eval_train()[[1L]][["value"]] params_l2 <- param_bst$eval_train()[[1L]][["value"]] both_l2 <- both_customized$eval_train()[[1L]][["value"]] # check type just to be sure the subsetting didn't return a NULL expect_true(is.numeric(top_level_l2)) expect_true(is.numeric(params_l2)) expect_true(is.numeric(both_l2)) # check that model produces identical performance expect_identical(top_level_l2, params_l2) expect_identical(both_l2, params_l2) expect_identical(param_bst$current_iter(), top_level_bst$current_iter()) expect_identical(param_bst$current_iter(), both_customized$current_iter()) expect_identical(param_bst$current_iter(), nrounds) }) test_that("lgb.train() throws an informative error if 'data' is not an lgb.Dataset", { bad_values <- list( 4L , "hello" , list(a = TRUE, b = seq_len(10L)) , data.frame(x = 
seq_len(5L), y = seq_len(5L)) , data.table::data.table(x = seq_len(5L), y = seq_len(5L)) , matrix(data = seq_len(10L), 2L, 5L) ) for (val in bad_values) { expect_error({ bst <- lgb.train( params = list( objective = "regression" , metric = "l2,l1" , verbose = .LGB_VERBOSITY ) , data = val , 10L ) }, regexp = "data must be an lgb.Dataset instance", fixed = TRUE) } }) test_that("lgb.train() throws an informative error if 'valids' is not a list of lgb.Dataset objects", { valids <- list( "valid1" = data.frame(x = rnorm(5L), y = rnorm(5L)) , "valid2" = data.frame(x = rnorm(5L), y = rnorm(5L)) ) expect_error({ bst <- lgb.train( params = list( objective = "regression" , metric = "l2,l1" , verbose = .LGB_VERBOSITY ) , data = lgb.Dataset(train$data, label = train$label) , 10L , valids = valids ) }, regexp = "valids must be a list of lgb.Dataset elements") }) test_that("lgb.train() errors if 'valids' is a list of lgb.Dataset objects but some do not have names", { valids <- list( "valid1" = lgb.Dataset(matrix(rnorm(10L), 5L, 2L)) , lgb.Dataset(matrix(rnorm(10L), 2L, 5L)) ) expect_error({ bst <- lgb.train( params = list( objective = "regression" , metric = "l2,l1" , verbose = .LGB_VERBOSITY ) , data = lgb.Dataset(train$data, label = train$label) , 10L , valids = valids ) }, regexp = "each element of valids must have a name") }) test_that("lgb.train() throws an informative error if 'valids' contains lgb.Dataset objects but none have names", { valids <- list( lgb.Dataset(matrix(rnorm(10L), 5L, 2L)) , lgb.Dataset(matrix(rnorm(10L), 2L, 5L)) ) expect_error({ bst <- lgb.train( params = list( objective = "regression" , metric = "l2,l1" , verbose = .LGB_VERBOSITY ) , data = lgb.Dataset(train$data, label = train$label) , 10L , valids = valids ) }, regexp = "each element of valids must have a name") }) test_that("lgb.train() works with force_col_wise and force_row_wise", { set.seed(1234L) nrounds <- 10L dtrain <- lgb.Dataset( train$data , label = train$label , params = list(num_threads = .LGB_MAX_THREADS) ) params <- list( objective = "binary" , metric = "binary_error" , force_col_wise = TRUE , verbose = .LGB_VERBOSITY , num_threads = .LGB_MAX_THREADS ) bst_col_wise <- lgb.train( params = params , data = dtrain , nrounds = nrounds ) params <- list( objective = "binary" , metric = "binary_error" , force_row_wise = TRUE , verbose = .LGB_VERBOSITY , num_threads = .LGB_MAX_THREADS ) bst_row_wise <- lgb.train( params = params , data = dtrain , nrounds = nrounds ) expected_error <- 0.003070782 expect_equal(bst_col_wise$eval_train()[[1L]][["value"]], expected_error) expect_equal(bst_row_wise$eval_train()[[1L]][["value"]], expected_error) # check some basic details of the boosters just to be sure force_col_wise # and force_row_wise are not causing any weird side effects for (bst in list(bst_row_wise, bst_col_wise)) { expect_equal(bst$current_iter(), nrounds) parsed_model <- jsonlite::fromJSON(bst$dump_model()) expect_equal(parsed_model$objective, "binary sigmoid:1") expect_false(parsed_model$average_output) } }) test_that("lgb.train() works as expected with sparse features", { set.seed(708L) num_obs <- 70000L trainDF <- data.frame( y = sample(c(0L, 1L), size = num_obs, replace = TRUE) , x = sample(c(1.0:10.0, rep(NA_real_, 50L)), size = num_obs, replace = TRUE) ) dtrain <- lgb.Dataset( data = as.matrix(trainDF[["x"]], drop = FALSE) , label = trainDF[["y"]] , params = list(num_threads = .LGB_MAX_THREADS) ) nrounds <- 1L bst <- lgb.train( params = list( objective = "binary" , min_data = 1L , min_data_in_bin = 1L , 
verbose = .LGB_VERBOSITY , num_threads = .LGB_MAX_THREADS ) , data = dtrain , nrounds = nrounds ) expect_true(.is_Booster(bst)) expect_equal(bst$current_iter(), nrounds) parsed_model <- jsonlite::fromJSON(bst$dump_model()) expect_equal(parsed_model$objective, "binary sigmoid:1") expect_false(parsed_model$average_output) expected_error <- 0.6931268 expect_true(abs(bst$eval_train()[[1L]][["value"]] - expected_error) < .LGB_NUMERIC_TOLERANCE) }) test_that("lgb.train() works with early stopping for classification", { trainDF <- data.frame( "feat1" = rep(c(5.0, 10.0), 500L) , "target" = rep(c(0L, 1L), 500L) ) validDF <- data.frame( "feat1" = rep(c(5.0, 10.0), 50L) , "target" = rep(c(0L, 1L), 50L) ) dtrain <- lgb.Dataset( data = as.matrix(trainDF[["feat1"]], drop = FALSE) , label = trainDF[["target"]] , params = list(num_threads = .LGB_MAX_THREADS) ) dvalid <- lgb.Dataset( data = as.matrix(validDF[["feat1"]], drop = FALSE) , label = validDF[["target"]] , params = list(num_threads = .LGB_MAX_THREADS) ) nrounds <- 10L ################################ # train with no early stopping # ################################ bst <- lgb.train( params = list( objective = "binary" , metric = "binary_error" , verbose = .LGB_VERBOSITY , num_threads = .LGB_MAX_THREADS ) , data = dtrain , nrounds = nrounds , valids = list( "valid1" = dvalid ) ) # a perfect model should be trivial to obtain, but all 10 rounds # should happen expect_equal(bst$best_score, 0.0) expect_equal(bst$best_iter, 1L) expect_equal(length(bst$record_evals[["valid1"]][["binary_error"]][["eval"]]), nrounds) ############################# # train with early stopping # ############################# early_stopping_rounds <- 5L bst <- lgb.train( params = list( objective = "binary" , metric = "binary_error" , early_stopping_rounds = early_stopping_rounds , verbose = .LGB_VERBOSITY , num_threads = .LGB_MAX_THREADS ) , data = dtrain , nrounds = nrounds , valids = list( "valid1" = dvalid ) ) # a perfect model should be trivial to obtain, and only 6 rounds # should have happen (1 with improvement, 5 consecutive with no improvement) expect_equal(bst$best_score, 0.0) expect_equal(bst$best_iter, 1L) expect_equal( length(bst$record_evals[["valid1"]][["binary_error"]][["eval"]]) , early_stopping_rounds + 1L ) }) test_that("lgb.train() treats early_stopping_rounds<=0 as disabling early stopping", { set.seed(708L) trainDF <- data.frame( "feat1" = rep(c(5.0, 10.0), 500L) , "target" = rep(c(0L, 1L), 500L) ) validDF <- data.frame( "feat1" = rep(c(5.0, 10.0), 50L) , "target" = rep(c(0L, 1L), 50L) ) dtrain <- lgb.Dataset( data = as.matrix(trainDF[["feat1"]], drop = FALSE) , label = trainDF[["target"]] , params = list(num_threads = .LGB_MAX_THREADS) ) dvalid <- lgb.Dataset( data = as.matrix(validDF[["feat1"]], drop = FALSE) , label = validDF[["target"]] , params = list(num_threads = .LGB_MAX_THREADS) ) nrounds <- 5L for (value in c(-5L, 0L)) { #----------------------------# # passed as keyword argument # #----------------------------# bst <- lgb.train( params = list( objective = "binary" , metric = "binary_error" , verbose = .LGB_VERBOSITY , num_threads = .LGB_MAX_THREADS ) , data = dtrain , nrounds = nrounds , valids = list( "valid1" = dvalid ) , early_stopping_rounds = value ) # a perfect model should be trivial to obtain, but all 10 rounds # should happen expect_equal(bst$best_score, 0.0) expect_equal(bst$best_iter, 1L) expect_equal(length(bst$record_evals[["valid1"]][["binary_error"]][["eval"]]), nrounds) #---------------------------# # passed as parameter alias # 
#---------------------------# bst <- lgb.train( params = list( objective = "binary" , metric = "binary_error" , n_iter_no_change = value , verbose = .LGB_VERBOSITY , num_threads = .LGB_MAX_THREADS ) , data = dtrain , nrounds = nrounds , valids = list( "valid1" = dvalid ) ) # a perfect model should be trivial to obtain, but all 10 rounds # should happen expect_equal(bst$best_score, 0.0) expect_equal(bst$best_iter, 1L) expect_equal(length(bst$record_evals[["valid1"]][["binary_error"]][["eval"]]), nrounds) } }) test_that("lgb.train() works with early stopping for classification with a metric that should be maximized", { set.seed(708L) dtrain <- lgb.Dataset( data = train$data , label = train$label , params = list(num_threads = .LGB_MAX_THREADS) ) dvalid <- lgb.Dataset( data = test$data , label = test$label , params = list(num_threads = .LGB_MAX_THREADS) ) nrounds <- 10L ############################# # train with early stopping # ############################# early_stopping_rounds <- 5L # the harsh max_depth guarantees that AUC improves over at least the first few iterations bst_auc <- lgb.train( params = list( objective = "binary" , metric = "auc" , max_depth = 3L , early_stopping_rounds = early_stopping_rounds , verbose = .LGB_VERBOSITY , num_threads = .LGB_MAX_THREADS ) , data = dtrain , nrounds = nrounds , valids = list( "valid1" = dvalid ) ) bst_binary_error <- lgb.train( params = list( objective = "binary" , metric = "binary_error" , max_depth = 3L , early_stopping_rounds = early_stopping_rounds , verbose = .LGB_VERBOSITY , num_threads = .LGB_MAX_THREADS ) , data = dtrain , nrounds = nrounds , valids = list( "valid1" = dvalid ) ) # early stopping should have been hit for binary_error (higher_better = FALSE) eval_info <- bst_binary_error$.__enclos_env__$private$get_eval_info() expect_identical(eval_info, "binary_error") expect_identical( unname(bst_binary_error$.__enclos_env__$private$higher_better_inner_eval) , FALSE ) expect_identical(bst_binary_error$best_iter, 1L) expect_identical(bst_binary_error$current_iter(), early_stopping_rounds + 1L) expect_true(abs(bst_binary_error$best_score - 0.01613904) < .LGB_NUMERIC_TOLERANCE) # early stopping should not have been hit for AUC (higher_better = TRUE) eval_info <- bst_auc$.__enclos_env__$private$get_eval_info() expect_identical(eval_info, "auc") expect_identical( unname(bst_auc$.__enclos_env__$private$higher_better_inner_eval) , TRUE ) expect_identical(bst_auc$best_iter, 9L) expect_identical(bst_auc$current_iter(), nrounds) expect_true(abs(bst_auc$best_score - 0.9999969) < .LGB_NUMERIC_TOLERANCE) }) test_that("lgb.train() works with early stopping for regression", { set.seed(708L) trainDF <- data.frame( "feat1" = rep(c(10.0, 100.0), 500L) , "target" = rep(c(-50.0, 50.0), 500L) ) validDF <- data.frame( "feat1" = rep(50.0, 4L) , "target" = rep(50.0, 4L) ) dtrain <- lgb.Dataset( data = as.matrix(trainDF[["feat1"]], drop = FALSE) , label = trainDF[["target"]] , params = list(num_threads = .LGB_MAX_THREADS) ) dvalid <- lgb.Dataset( data = as.matrix(validDF[["feat1"]], drop = FALSE) , label = validDF[["target"]] , params = list(num_threads = .LGB_MAX_THREADS) ) nrounds <- 10L ################################ # train with no early stopping # ################################ bst <- lgb.train( params = list( objective = "regression" , metric = "rmse" , verbose = .LGB_VERBOSITY , num_threads = .LGB_MAX_THREADS ) , data = dtrain , nrounds = nrounds , valids = list( "valid1" = dvalid ) ) # the best possible model should come from the first iteration, but 
# all 10 training iterations should happen expect_equal(bst$best_score, 55.0) expect_equal(bst$best_iter, 1L) expect_equal(length(bst$record_evals[["valid1"]][["rmse"]][["eval"]]), nrounds) ############################# # train with early stopping # ############################# early_stopping_rounds <- 5L bst <- lgb.train( params = list( objective = "regression" , metric = "rmse" , early_stopping_rounds = early_stopping_rounds , verbose = .LGB_VERBOSITY , num_threads = .LGB_MAX_THREADS ) , data = dtrain , nrounds = nrounds , valids = list( "valid1" = dvalid ) ) # the best model should be from the first iteration, and only 6 rounds # should have happened (1 with improvement, 5 consecutive with no improvement) expect_equal(bst$best_score, 55.0) expect_equal(bst$best_iter, 1L) expect_equal( length(bst$record_evals[["valid1"]][["rmse"]][["eval"]]) , early_stopping_rounds + 1L ) }) test_that("lgb.train() does not stop early if early_stopping_rounds is not given", { set.seed(708L) increasing_metric_starting_value <- get( ACCUMULATOR_NAME , envir = .GlobalEnv ) nrounds <- 10L metrics <- list( .constant_metric , .increasing_metric ) bst <- lgb.train( params = list( objective = "regression" , metric = "None" , verbose = .LGB_VERBOSITY , num_threads = .LGB_MAX_THREADS ) , data = DTRAIN_RANDOM_REGRESSION , nrounds = nrounds , valids = list("valid1" = DVALID_RANDOM_REGRESSION) , eval = metrics ) # Only the two functions provided to "eval" should have been evaluated expect_equal(length(bst$record_evals[["valid1"]]), 2L) # all 10 iterations should have happened, and the best_iter should be # the first one (based on constant_metric) best_iter <- 1L expect_equal(bst$best_iter, best_iter) # best_score should be taken from the first metric expect_equal( bst$best_score , bst$record_evals[["valid1"]][["constant_metric"]][["eval"]][[best_iter]] ) # early stopping should not have happened. 
Even though constant_metric # had 9 consecutive iterations with no improvement, it is ignored because of # first_metric_only = TRUE expect_equal( length(bst$record_evals[["valid1"]][["constant_metric"]][["eval"]]) , nrounds ) expect_equal( length(bst$record_evals[["valid1"]][["increasing_metric"]][["eval"]]) , nrounds ) }) test_that("If first_metric_only is not given or is FALSE, lgb.train() decides to stop early based on all metrics", { set.seed(708L) early_stopping_rounds <- 3L param_variations <- list( list( objective = "regression" , metric = "None" , early_stopping_rounds = early_stopping_rounds , verbose = .LGB_VERBOSITY , num_threads = .LGB_MAX_THREADS ) , list( objective = "regression" , metric = "None" , early_stopping_rounds = early_stopping_rounds , first_metric_only = FALSE , verbose = .LGB_VERBOSITY , num_threads = .LGB_MAX_THREADS ) ) for (params in param_variations) { nrounds <- 10L bst <- lgb.train( params = params , data = DTRAIN_RANDOM_REGRESSION , nrounds = nrounds , valids = list( "valid1" = DVALID_RANDOM_REGRESSION ) , eval = list( .increasing_metric , .constant_metric ) ) # Only the two functions provided to "eval" should have been evaluated expect_equal(length(bst$record_evals[["valid1"]]), 2L) # early stopping should have happened, and should have stopped early_stopping_rounds + 1 rounds in # because constant_metric never improves # # the best iteration should be the last one, because increasing_metric was first # and gets better every iteration best_iter <- early_stopping_rounds + 1L expect_equal(bst$best_iter, best_iter) # best_score should be taken from "increasing_metric" because it was first expect_equal( bst$best_score , bst$record_evals[["valid1"]][["increasing_metric"]][["eval"]][[best_iter]] ) # early stopping should not have happened. even though increasing_metric kept # getting better, early stopping should have happened because "constant_metric" # did not improve expect_equal( length(bst$record_evals[["valid1"]][["constant_metric"]][["eval"]]) , early_stopping_rounds + 1L ) expect_equal( length(bst$record_evals[["valid1"]][["increasing_metric"]][["eval"]]) , early_stopping_rounds + 1L ) } }) test_that("If first_metric_only is TRUE, lgb.train() decides to stop early based on only the first metric", { set.seed(708L) nrounds <- 10L early_stopping_rounds <- 3L increasing_metric_starting_value <- get(ACCUMULATOR_NAME, envir = .GlobalEnv) bst <- lgb.train( params = list( objective = "regression" , metric = "None" , early_stopping_rounds = early_stopping_rounds , first_metric_only = TRUE , verbose = .LGB_VERBOSITY , num_threads = .LGB_MAX_THREADS ) , data = DTRAIN_RANDOM_REGRESSION , nrounds = nrounds , valids = list( "valid1" = DVALID_RANDOM_REGRESSION ) , eval = list( .increasing_metric , .constant_metric ) ) # Only the two functions provided to "eval" should have been evaluated expect_equal(length(bst$record_evals[["valid1"]]), 2L) # all 10 iterations should happen, and the best_iter should be the final one expect_equal(bst$best_iter, nrounds) # best_score should be taken from "increasing_metric" expect_equal( bst$best_score , increasing_metric_starting_value + 0.1 * nrounds ) # early stopping should not have happened. 
Even though constant_metric # had 9 consecutive iterations with no improvement, it is ignored because of # first_metric_only = TRUE expect_equal( length(bst$record_evals[["valid1"]][["constant_metric"]][["eval"]]) , nrounds ) expect_equal( length(bst$record_evals[["valid1"]][["increasing_metric"]][["eval"]]) , nrounds ) }) test_that("lgb.train() works when a mixture of functions and strings are passed to eval", { set.seed(708L) nrounds <- 10L increasing_metric_starting_value <- get(ACCUMULATOR_NAME, envir = .GlobalEnv) bst <- lgb.train( params = list( objective = "regression" , metric = "None" , verbose = .LGB_VERBOSITY , num_threads = .LGB_MAX_THREADS ) , data = DTRAIN_RANDOM_REGRESSION , nrounds = nrounds , valids = list( "valid1" = DVALID_RANDOM_REGRESSION ) , eval = list( .increasing_metric , "rmse" , .constant_metric , "l2" ) ) # all 4 metrics should have been used expect_named( bst$record_evals[["valid1"]] , expected = c("rmse", "l2", "increasing_metric", "constant_metric") , ignore.order = TRUE , ignore.case = FALSE ) # the difference metrics shouldn't have been mixed up with each other results <- bst$record_evals[["valid1"]] expect_true(abs(results[["rmse"]][["eval"]][[1L]] - 1.105012) < .LGB_NUMERIC_TOLERANCE) expect_true(abs(results[["l2"]][["eval"]][[1L]] - 1.221051) < .LGB_NUMERIC_TOLERANCE) expected_increasing_metric <- increasing_metric_starting_value + 0.1 expect_true( abs( results[["increasing_metric"]][["eval"]][[1L]] - expected_increasing_metric ) < .LGB_NUMERIC_TOLERANCE ) expect_true(abs(results[["constant_metric"]][["eval"]][[1L]] - CONSTANT_METRIC_VALUE) < .LGB_NUMERIC_TOLERANCE) }) test_that("lgb.train() works when a list of strings or a character vector is passed to eval", { # testing list and character vector, as well as length-1 and length-2 eval_variations <- list( c("binary_error", "binary_logloss") , "binary_logloss" , list("binary_error", "binary_logloss") , list("binary_logloss") ) for (eval_variation in eval_variations) { set.seed(708L) nrounds <- 10L increasing_metric_starting_value <- get(ACCUMULATOR_NAME, envir = .GlobalEnv) bst <- lgb.train( params = list( objective = "binary" , metric = "None" , verbose = .LGB_VERBOSITY , num_threads = .LGB_MAX_THREADS ) , data = DTRAIN_RANDOM_CLASSIFICATION , nrounds = nrounds , valids = list( "valid1" = DVALID_RANDOM_CLASSIFICATION ) , eval = eval_variation ) # both metrics should have been used expect_named( bst$record_evals[["valid1"]] , expected = unlist(eval_variation) , ignore.order = TRUE , ignore.case = FALSE ) # the difference metrics shouldn't have been mixed up with each other results <- bst$record_evals[["valid1"]] if ("binary_error" %in% unlist(eval_variation)) { expect_true(abs(results[["binary_error"]][["eval"]][[1L]] - 0.4864865) < .LGB_NUMERIC_TOLERANCE) } if ("binary_logloss" %in% unlist(eval_variation)) { expect_true(abs(results[["binary_logloss"]][["eval"]][[1L]] - 0.6932548) < .LGB_NUMERIC_TOLERANCE) } } }) test_that("lgb.train() works when you specify both 'metric' and 'eval' with strings", { set.seed(708L) nrounds <- 10L increasing_metric_starting_value <- get(ACCUMULATOR_NAME, envir = .GlobalEnv) bst <- lgb.train( params = list( objective = "binary" , metric = "binary_error" , verbose = .LGB_VERBOSITY , num_threads = .LGB_MAX_THREADS ) , data = DTRAIN_RANDOM_CLASSIFICATION , nrounds = nrounds , valids = list( "valid1" = DVALID_RANDOM_CLASSIFICATION ) , eval = "binary_logloss" ) # both metrics should have been used expect_named( bst$record_evals[["valid1"]] , expected = c("binary_error", 
"binary_logloss") , ignore.order = TRUE , ignore.case = FALSE ) # the difference metrics shouldn't have been mixed up with each other results <- bst$record_evals[["valid1"]] expect_true(abs(results[["binary_error"]][["eval"]][[1L]] - 0.4864865) < .LGB_NUMERIC_TOLERANCE) expect_true(abs(results[["binary_logloss"]][["eval"]][[1L]] - 0.6932548) < .LGB_NUMERIC_TOLERANCE) }) test_that("lgb.train() works when you give a function for eval", { set.seed(708L) nrounds <- 10L increasing_metric_starting_value <- get(ACCUMULATOR_NAME, envir = .GlobalEnv) bst <- lgb.train( params = list( objective = "binary" , metric = "None" , verbose = .LGB_VERBOSITY , num_threads = .LGB_MAX_THREADS ) , data = DTRAIN_RANDOM_CLASSIFICATION , nrounds = nrounds , valids = list( "valid1" = DVALID_RANDOM_CLASSIFICATION ) , eval = .constant_metric ) # the difference metrics shouldn't have been mixed up with each other results <- bst$record_evals[["valid1"]] expect_true(abs(results[["constant_metric"]][["eval"]][[1L]] - CONSTANT_METRIC_VALUE) < .LGB_NUMERIC_TOLERANCE) }) test_that("lgb.train() works with early stopping for regression with a metric that should be minimized", { set.seed(708L) trainDF <- data.frame( "feat1" = rep(c(10.0, 100.0), 500L) , "target" = rep(c(-50.0, 50.0), 500L) ) validDF <- data.frame( "feat1" = rep(50.0, 4L) , "target" = rep(50.0, 4L) ) dtrain <- lgb.Dataset( data = as.matrix(trainDF[["feat1"]], drop = FALSE) , label = trainDF[["target"]] , params = list(num_threads = .LGB_MAX_THREADS) ) dvalid <- lgb.Dataset( data = as.matrix(validDF[["feat1"]], drop = FALSE) , label = validDF[["target"]] , params = list(num_threads = .LGB_MAX_THREADS) ) nrounds <- 10L ############################# # train with early stopping # ############################# early_stopping_rounds <- 5L bst <- lgb.train( params = list( objective = "regression" , metric = c( "mape" , "rmse" , "mae" ) , min_data_in_bin = 5L , early_stopping_rounds = early_stopping_rounds , verbose = .LGB_VERBOSITY , num_threads = .LGB_MAX_THREADS ) , data = dtrain , nrounds = nrounds , valids = list( "valid1" = dvalid ) ) # the best model should be from the first iteration, and only 6 rounds # should have happened (1 with improvement, 5 consecutive with no improvement) expect_equal(bst$best_score, 1.1) expect_equal(bst$best_iter, 1L) expect_equal( length(bst$record_evals[["valid1"]][["mape"]][["eval"]]) , early_stopping_rounds + 1L ) # Booster should understand thatt all three of these metrics should be minimized eval_info <- bst$.__enclos_env__$private$get_eval_info() expect_identical(eval_info, c("mape", "rmse", "l1")) expect_identical( unname(bst$.__enclos_env__$private$higher_better_inner_eval) , rep(FALSE, 3L) ) }) test_that("lgb.train() supports non-ASCII feature names", { dtrain <- lgb.Dataset( data = matrix(rnorm(400L), ncol = 4L) , label = rnorm(100L) , params = list(num_threads = .LGB_MAX_THREADS) ) # content below is equivalent to # # feature_names <- c("F_零", "F_一", "F_二", "F_三") # # but using rawToChar() to avoid weird issues when {testthat} # sources files and converts their encodings prior to evaluating the code feature_names <- c( rawToChar(as.raw(c(0x46, 0x5f, 0xe9, 0x9b, 0xb6))) , rawToChar(as.raw(c(0x46, 0x5f, 0xe4, 0xb8, 0x80))) , rawToChar(as.raw(c(0x46, 0x5f, 0xe4, 0xba, 0x8c))) , rawToChar(as.raw(c(0x46, 0x5f, 0xe4, 0xb8, 0x89))) ) bst <- lgb.train( data = dtrain , nrounds = 5L , obj = "regression" , params = list( metric = "rmse" , verbose = .LGB_VERBOSITY , num_threads = .LGB_MAX_THREADS ) , colnames = feature_names ) 
expect_true(.is_Booster(bst)) dumped_model <- jsonlite::fromJSON(bst$dump_model()) # UTF-8 strings are not well-supported on Windows # * https://developer.r-project.org/Blog/public/2020/05/02/utf-8-support-on-windows/ # * https://developer.r-project.org/Blog/public/2020/07/30/windows/utf-8-build-of-r-and-cran-packages/index.html if (.LGB_UTF8_LOCALE && !.LGB_ON_WINDOWS) { expect_identical( dumped_model[["feature_names"]] , feature_names ) } else { expect_identical( dumped_model[["feature_names"]] , iconv(feature_names, to = "UTF-8") ) } }) test_that("lgb.train() works with integer, double, and numeric data", { data(mtcars) X <- as.matrix(mtcars[, -1L]) y <- mtcars[, 1L, drop = TRUE] expected_mae <- 4.263667 for (data_mode in c("numeric", "double", "integer")) { mode(X) <- data_mode nrounds <- 10L bst <- lightgbm( data = X , label = y , params = list( objective = "regression" , min_data_in_bin = 1L , min_data_in_leaf = 1L , learning_rate = 0.01 , seed = 708L , verbose = .LGB_VERBOSITY ) , nrounds = nrounds ) # should have trained for 10 iterations and found splits modelDT <- lgb.model.dt.tree(bst) expect_equal(modelDT[, max(tree_index)], nrounds - 1L) expect_gt(nrow(modelDT), nrounds * 3L) # should have achieved expected performance preds <- predict(bst, X) mae <- mean(abs(y - preds)) expect_true(abs(mae - expected_mae) < .LGB_NUMERIC_TOLERANCE) } }) test_that("lgb.train() updates params based on keyword arguments", { dtrain <- lgb.Dataset( data = matrix(rnorm(400L), ncol = 4L) , label = rnorm(100L) , params = list(num_threads = .LGB_MAX_THREADS) ) # defaults from keyword arguments should be used if not specified in params invisible( capture.output({ bst <- lgb.train( data = dtrain , obj = "regression" , params = list(num_threads = .LGB_MAX_THREADS) ) }) ) expect_equal(bst$params[["verbosity"]], 1L) expect_equal(bst$params[["num_iterations"]], 100L) # main param names should be preferred to keyword arguments invisible( capture.output({ bst <- lgb.train( data = dtrain , obj = "regression" , params = list( "verbosity" = 5L , "num_iterations" = 2L , num_threads = .LGB_MAX_THREADS ) ) }) ) expect_equal(bst$params[["verbosity"]], 5L) expect_equal(bst$params[["num_iterations"]], 2L) # aliases should be preferred to keyword arguments, and converted to main parameter name invisible( capture.output({ bst <- lgb.train( data = dtrain , obj = "regression" , params = list( "verbose" = 5L , "num_boost_round" = 2L , num_threads = .LGB_MAX_THREADS ) ) }) ) expect_equal(bst$params[["verbosity"]], 5L) expect_false("verbose" %in% bst$params) expect_equal(bst$params[["num_iterations"]], 2L) expect_false("num_boost_round" %in% bst$params) }) test_that("when early stopping is not activated, best_iter and best_score come from valids and not training data", { set.seed(708L) trainDF <- data.frame( "feat1" = rep(c(10.0, 100.0), 500L) , "target" = rep(c(-50.0, 50.0), 500L) ) validDF <- data.frame( "feat1" = rep(50.0, 4L) , "target" = rep(50.0, 4L) ) dtrain <- lgb.Dataset( data = as.matrix(trainDF[["feat1"]], drop = FALSE) , label = trainDF[["target"]] , params = list(num_threads = .LGB_MAX_THREADS) ) dvalid1 <- lgb.Dataset( data = as.matrix(validDF[["feat1"]], drop = FALSE) , label = validDF[["target"]] , params = list(num_threads = .LGB_MAX_THREADS) ) dvalid2 <- lgb.Dataset( data = as.matrix(validDF[1L:10L, "feat1"], drop = FALSE) , label = validDF[1L:10L, "target"] , params = list(num_threads = .LGB_MAX_THREADS) ) nrounds <- 10L train_params <- list( objective = "regression" , metric = "rmse" , learning_rate = 1.5 
, num_leaves = 5L , verbose = .LGB_VERBOSITY , num_threads = .LGB_MAX_THREADS ) # example 1: two valids, neither are the training data bst <- lgb.train( data = dtrain , nrounds = nrounds , valids = list( "valid1" = dvalid1 , "valid2" = dvalid2 ) , params = train_params ) expect_named( bst$record_evals , c("start_iter", "valid1", "valid2") , ignore.order = FALSE , ignore.case = FALSE ) rmse_scores <- unlist(bst$record_evals[["valid1"]][["rmse"]][["eval"]]) expect_length(rmse_scores, nrounds) expect_identical(bst$best_iter, which.min(rmse_scores)) expect_identical(bst$best_score, rmse_scores[which.min(rmse_scores)]) # example 2: train first (called "train") and two valids bst <- lgb.train( data = dtrain , nrounds = nrounds , valids = list( "train" = dtrain , "valid1" = dvalid1 , "valid2" = dvalid2 ) , params = train_params ) expect_named( bst$record_evals , c("start_iter", "train", "valid1", "valid2") , ignore.order = FALSE , ignore.case = FALSE ) rmse_scores <- unlist(bst$record_evals[["valid1"]][["rmse"]][["eval"]]) expect_length(rmse_scores, nrounds) expect_identical(bst$best_iter, which.min(rmse_scores)) expect_identical(bst$best_score, rmse_scores[which.min(rmse_scores)]) # example 3: train second (called "train") and two valids bst <- lgb.train( data = dtrain , nrounds = nrounds , valids = list( "valid1" = dvalid1 , "train" = dtrain , "valid2" = dvalid2 ) , params = train_params ) # note that "train" still ends up as the first one expect_named( bst$record_evals , c("start_iter", "train", "valid1", "valid2") , ignore.order = FALSE , ignore.case = FALSE ) rmse_scores <- unlist(bst$record_evals[["valid1"]][["rmse"]][["eval"]]) expect_length(rmse_scores, nrounds) expect_identical(bst$best_iter, which.min(rmse_scores)) expect_identical(bst$best_score, rmse_scores[which.min(rmse_scores)]) # example 4: train third (called "train") and two valids bst <- lgb.train( data = dtrain , nrounds = nrounds , valids = list( "valid1" = dvalid1 , "valid2" = dvalid2 , "train" = dtrain ) , params = train_params ) # note that "train" still ends up as the first one expect_named( bst$record_evals , c("start_iter", "train", "valid1", "valid2") , ignore.order = FALSE , ignore.case = FALSE ) rmse_scores <- unlist(bst$record_evals[["valid1"]][["rmse"]][["eval"]]) expect_length(rmse_scores, nrounds) expect_identical(bst$best_iter, which.min(rmse_scores)) expect_identical(bst$best_score, rmse_scores[which.min(rmse_scores)]) # example 5: train second (called "something-random-we-would-not-hardcode") and two valids bst <- lgb.train( data = dtrain , nrounds = nrounds , valids = list( "valid1" = dvalid1 , "something-random-we-would-not-hardcode" = dtrain , "valid2" = dvalid2 ) , params = train_params ) # note that "something-random-we-would-not-hardcode" was recognized as the training # data even though it isn't named "train" expect_named( bst$record_evals , c("start_iter", "something-random-we-would-not-hardcode", "valid1", "valid2") , ignore.order = FALSE , ignore.case = FALSE ) rmse_scores <- unlist(bst$record_evals[["valid1"]][["rmse"]][["eval"]]) expect_length(rmse_scores, nrounds) expect_identical(bst$best_iter, which.min(rmse_scores)) expect_identical(bst$best_score, rmse_scores[which.min(rmse_scores)]) # example 6: the only valid supplied is the training data bst <- lgb.train( data = dtrain , nrounds = nrounds , valids = list( "train" = dtrain ) , params = train_params ) expect_identical(bst$best_iter, -1L) expect_identical(bst$best_score, NA_real_) }) test_that("lightgbm.train() gives the correct best_score and 
best_iter for a metric where higher values are better", { set.seed(708L) trainDF <- data.frame( "feat1" = runif(n = 500L, min = 0.0, max = 15.0) , "target" = rep(c(0L, 1L), 500L) ) validDF <- data.frame( "feat1" = runif(n = 50L, min = 0.0, max = 15.0) , "target" = rep(c(0L, 1L), 50L) ) dtrain <- lgb.Dataset( data = as.matrix(trainDF[["feat1"]], drop = FALSE) , label = trainDF[["target"]] , params = list(num_threads = .LGB_MAX_THREADS) ) dvalid1 <- lgb.Dataset( data = as.matrix(validDF[1L:25L, "feat1"], drop = FALSE) , label = validDF[1L:25L, "target"] , params = list(num_threads = .LGB_MAX_THREADS) ) nrounds <- 10L bst <- lgb.train( data = dtrain , nrounds = nrounds , valids = list( "valid1" = dvalid1 , "something-random-we-would-not-hardcode" = dtrain ) , params = list( objective = "binary" , metric = "auc" , learning_rate = 1.5 , num_leaves = 5L , verbose = .LGB_VERBOSITY , num_threads = .LGB_MAX_THREADS ) ) # note that "something-random-we-would-not-hardcode" was recognized as the training # data even though it isn't named "train" expect_named( bst$record_evals , c("start_iter", "something-random-we-would-not-hardcode", "valid1") , ignore.order = FALSE , ignore.case = FALSE ) auc_scores <- unlist(bst$record_evals[["valid1"]][["auc"]][["eval"]]) expect_length(auc_scores, nrounds) expect_identical(bst$best_iter, which.max(auc_scores)) expect_identical(bst$best_score, auc_scores[which.max(auc_scores)]) }) test_that("using lightgbm() without early stopping, best_iter and best_score come from valids and not training data", { set.seed(708L) # example: train second (called "something-random-we-would-not-hardcode"), two valids, # and a metric where higher values are better ("auc") trainDF <- data.frame( "feat1" = runif(n = 500L, min = 0.0, max = 15.0) , "target" = rep(c(0L, 1L), 500L) ) validDF <- data.frame( "feat1" = runif(n = 50L, min = 0.0, max = 15.0) , "target" = rep(c(0L, 1L), 50L) ) dtrain <- lgb.Dataset( data = as.matrix(trainDF[["feat1"]], drop = FALSE) , label = trainDF[["target"]] , params = list(num_threads = .LGB_MAX_THREADS) ) dvalid1 <- lgb.Dataset( data = as.matrix(validDF[1L:25L, "feat1"], drop = FALSE) , label = validDF[1L:25L, "target"] , params = list(num_threads = .LGB_MAX_THREADS) ) dvalid2 <- lgb.Dataset( data = as.matrix(validDF[26L:50L, "feat1"], drop = FALSE) , label = validDF[26L:50L, "target"] , params = list(num_threads = .LGB_MAX_THREADS) ) nrounds <- 10L bst <- lightgbm( data = dtrain , nrounds = nrounds , valids = list( "valid1" = dvalid1 , "something-random-we-would-not-hardcode" = dtrain , "valid2" = dvalid2 ) , params = list( objective = "binary" , metric = "auc" , learning_rate = 1.5 , num_leaves = 5L , num_threads = .LGB_MAX_THREADS ) , verbose = -7L ) # when verbose <= 0 is passed to lightgbm(), 'valids' is passed through to lgb.train() # untouched. 
If you set verbose to > 0, the training data will still be first but called "train" expect_named( bst$record_evals , c("start_iter", "something-random-we-would-not-hardcode", "valid1", "valid2") , ignore.order = FALSE , ignore.case = FALSE ) auc_scores <- unlist(bst$record_evals[["valid1"]][["auc"]][["eval"]]) expect_length(auc_scores, nrounds) expect_identical(bst$best_iter, which.max(auc_scores)) expect_identical(bst$best_score, auc_scores[which.max(auc_scores)]) }) test_that("lgb.cv() works when you specify both 'metric' and 'eval' with strings", { set.seed(708L) nrounds <- 10L nfolds <- 4L increasing_metric_starting_value <- get(ACCUMULATOR_NAME, envir = .GlobalEnv) bst <- lgb.cv( params = list( objective = "binary" , metric = "binary_error" , verbose = .LGB_VERBOSITY , num_threads = .LGB_MAX_THREADS ) , data = DTRAIN_RANDOM_CLASSIFICATION , nrounds = nrounds , nfold = nfolds , eval = "binary_logloss" ) # both metrics should have been used expect_named( bst$record_evals[["valid"]] , expected = c("binary_error", "binary_logloss") , ignore.order = TRUE , ignore.case = FALSE ) # the difference metrics shouldn't have been mixed up with each other results <- bst$record_evals[["valid"]] expect_true(abs(results[["binary_error"]][["eval"]][[1L]] - 0.5005654) < .LGB_NUMERIC_TOLERANCE) expect_true(abs(results[["binary_logloss"]][["eval"]][[1L]] - 0.7011232) < .LGB_NUMERIC_TOLERANCE) # all boosters should have been created expect_length(bst$boosters, nfolds) }) test_that("lgb.cv() works when you give a function for eval", { set.seed(708L) nrounds <- 10L nfolds <- 3L increasing_metric_starting_value <- get(ACCUMULATOR_NAME, envir = .GlobalEnv) bst <- lgb.cv( params = list( objective = "binary" , metric = "None" , verbose = .LGB_VERBOSITY , num_threads = .LGB_MAX_THREADS ) , data = DTRAIN_RANDOM_CLASSIFICATION , nfold = nfolds , nrounds = nrounds , eval = .constant_metric ) # the difference metrics shouldn't have been mixed up with each other results <- bst$record_evals[["valid"]] expect_true(abs(results[["constant_metric"]][["eval"]][[1L]] - CONSTANT_METRIC_VALUE) < .LGB_NUMERIC_TOLERANCE) expect_named(results, "constant_metric") }) test_that("If first_metric_only is TRUE, lgb.cv() decides to stop early based on only the first metric", { set.seed(708L) nrounds <- 10L nfolds <- 5L early_stopping_rounds <- 3L increasing_metric_starting_value <- get(ACCUMULATOR_NAME, envir = .GlobalEnv) bst <- lgb.cv( params = list( objective = "regression" , metric = "None" , early_stopping_rounds = early_stopping_rounds , first_metric_only = TRUE , verbose = .LGB_VERBOSITY , num_threads = .LGB_MAX_THREADS ) , data = DTRAIN_RANDOM_REGRESSION , nfold = nfolds , nrounds = nrounds , eval = list( .increasing_metric , .constant_metric ) ) # Only the two functions provided to "eval" should have been evaluated expect_named(bst$record_evals[["valid"]], c("increasing_metric", "constant_metric")) # all 10 iterations should happen, and the best_iter should be the final one expect_equal(bst$best_iter, nrounds) # best_score should be taken from "increasing_metric" # # this expected value looks magical and confusing, but it's because # evaluation metrics are averaged over all folds. 
# # consider 5-fold CV with a metric that adds 0.1 to a global accumulator # each time it's called # # * iter 1: [0.1, 0.2, 0.3, 0.4, 0.5] (mean = 0.3) # * iter 2: [0.6, 0.7, 0.8, 0.9, 1.0] (mean = 0.8) # * iter 3: [1.1, 1.2, 1.3, 1.4, 1.5] (mean = 1.3) # cv_value <- increasing_metric_starting_value + mean(seq_len(nfolds) / 10.0) + (nrounds - 1L) * 0.1 * nfolds expect_equal(bst$best_score, cv_value) # early stopping should not have happened. Even though constant_metric # had 9 consecutive iterations with no improvement, it is ignored because of # first_metric_only = TRUE expect_equal( length(bst$record_evals[["valid"]][["constant_metric"]][["eval"]]) , nrounds ) expect_equal( length(bst$record_evals[["valid"]][["increasing_metric"]][["eval"]]) , nrounds ) }) test_that("early stopping works with lgb.cv()", { set.seed(708L) nrounds <- 10L nfolds <- 5L early_stopping_rounds <- 3L increasing_metric_starting_value <- get(ACCUMULATOR_NAME, envir = .GlobalEnv) bst <- lgb.cv( params = list( objective = "regression" , metric = "None" , early_stopping_rounds = early_stopping_rounds , first_metric_only = TRUE , verbose = .LGB_VERBOSITY , num_threads = .LGB_MAX_THREADS ) , data = DTRAIN_RANDOM_REGRESSION , nfold = nfolds , nrounds = nrounds , eval = list( .constant_metric , .increasing_metric ) ) # only the two functions provided to "eval" should have been evaluated expect_named(bst$record_evals[["valid"]], c("constant_metric", "increasing_metric")) # best_iter should be based on the first metric. Since constant_metric # never changes, its first iteration was the best one expect_equal(bst$best_iter, 1L) # best_score should be taken from the first metric expect_equal(bst$best_score, 0.2) # early stopping should have happened, since constant_metric was the first # one passed to eval and it will not improve over consecutive iterations # # note that this test is identical to the previous one, but with the # order of the eval metrics switched expect_equal( length(bst$record_evals[["valid"]][["constant_metric"]][["eval"]]) , early_stopping_rounds + 1L ) expect_equal( length(bst$record_evals[["valid"]][["increasing_metric"]][["eval"]]) , early_stopping_rounds + 1L ) # every booster's predict method should use best_iter as num_iteration in predict random_data <- as.matrix(rnorm(10L), ncol = 1L, drop = FALSE) for (x in bst$boosters) { expect_equal(x$booster$best_iter, bst$best_iter) expect_gt(x$booster$current_iter(), bst$best_iter) preds_iter <- predict(x$booster, random_data, num_iteration = bst$best_iter) preds_no_iter <- predict(x$booster, random_data) expect_equal(preds_iter, preds_no_iter) } }) test_that("lgb.cv() respects changes to logging verbosity", { dtrain <- lgb.Dataset( data = train$data , label = train$label , params = list(num_threads = .LGB_MAX_THREADS) ) # (verbose = 1) should be INFO and WARNING level logs lgb_cv_logs <- capture.output({ cv_bst <- lgb.cv( params = list(num_threads = .LGB_MAX_THREADS) , nfold = 2L , nrounds = 5L , data = dtrain , obj = "binary" , verbose = 1L ) }) expect_true(any(grepl("[LightGBM] [Info]", lgb_cv_logs, fixed = TRUE))) expect_true(any(grepl("[LightGBM] [Warning]", lgb_cv_logs, fixed = TRUE))) # (verbose = 0) should be WARNING level logs only lgb_cv_logs <- capture.output({ cv_bst <- lgb.cv( params = list(num_threads = .LGB_MAX_THREADS) , nfold = 2L , nrounds = 5L , data = dtrain , obj = "binary" , verbose = 0L ) }) expect_false(any(grepl("[LightGBM] [Info]", lgb_cv_logs, fixed = TRUE))) expect_true(any(grepl("[LightGBM] [Warning]", lgb_cv_logs, fixed = TRUE))) 
# (verbose = -1) no logs lgb_cv_logs <- capture.output({ cv_bst <- lgb.cv( params = list(num_threads = .LGB_MAX_THREADS) , nfold = 2L , nrounds = 5L , data = dtrain , obj = "binary" , verbose = -1L ) }) # NOTE: this is not length(lgb_cv_logs) == 0 because lightgbm's # dependencies might print other messages expect_false(any(grepl("[LightGBM] [Info]", lgb_cv_logs, fixed = TRUE))) expect_false(any(grepl("[LightGBM] [Warning]", lgb_cv_logs, fixed = TRUE))) }) test_that("lgb.cv() updates params based on keyword arguments", { dtrain <- lgb.Dataset( data = matrix(rnorm(400L), ncol = 4L) , label = rnorm(100L) , params = list(num_threads = .LGB_MAX_THREADS) ) # defaults from keyword arguments should be used if not specified in params invisible( capture.output({ cv_bst <- lgb.cv( data = dtrain , obj = "regression" , params = list(num_threads = .LGB_MAX_THREADS) , nfold = 2L ) }) ) for (bst in cv_bst$boosters) { bst_params <- bst[["booster"]]$params expect_equal(bst_params[["verbosity"]], 1L) expect_equal(bst_params[["num_iterations"]], 100L) } # main param names should be preferred to keyword arguments invisible( capture.output({ cv_bst <- lgb.cv( data = dtrain , obj = "regression" , params = list( "verbosity" = 5L , "num_iterations" = 2L , num_threads = .LGB_MAX_THREADS ) , nfold = 2L ) }) ) for (bst in cv_bst$boosters) { bst_params <- bst[["booster"]]$params expect_equal(bst_params[["verbosity"]], 5L) expect_equal(bst_params[["num_iterations"]], 2L) } # aliases should be preferred to keyword arguments, and converted to main parameter name invisible( capture.output({ cv_bst <- lgb.cv( data = dtrain , obj = "regression" , params = list( "verbose" = 5L , "num_boost_round" = 2L , num_threads = .LGB_MAX_THREADS ) , nfold = 2L ) }) ) for (bst in cv_bst$boosters) { bst_params <- bst[["booster"]]$params expect_equal(bst_params[["verbosity"]], 5L) expect_false("verbose" %in% names(bst_params)) expect_equal(bst_params[["num_iterations"]], 2L) expect_false("num_boost_round" %in% names(bst_params)) } }) test_that("lgb.train() fit on linearly-related data improves when using linear learners", { set.seed(708L) .new_dataset <- function() { X <- matrix(rnorm(100L), ncol = 1L) return(lgb.Dataset( data = X , label = 2L * X + runif(nrow(X), 0L, 0.1) , params = list(num_threads = .LGB_MAX_THREADS) )) } params <- list( objective = "regression" , verbose = .LGB_VERBOSITY , metric = "mse" , seed = 0L , num_leaves = 2L , num_threads = .LGB_MAX_THREADS ) dtrain <- .new_dataset() bst <- lgb.train( data = dtrain , nrounds = 10L , params = params , valids = list("train" = dtrain) ) expect_true(.is_Booster(bst)) dtrain <- .new_dataset() bst_linear <- lgb.train( data = dtrain , nrounds = 10L , params = utils::modifyList(params, list(linear_tree = TRUE)) , valids = list("train" = dtrain) ) expect_true(.is_Booster(bst_linear)) bst_last_mse <- bst$record_evals[["train"]][["l2"]][["eval"]][[10L]] bst_lin_last_mse <- bst_linear$record_evals[["train"]][["l2"]][["eval"]][[10L]] expect_true(bst_lin_last_mse < bst_last_mse) }) test_that("lgb.train() with linear learner fails on an already-constructed Dataset with linear_tree=false", { set.seed(708L) params <- list( objective = "regression" , verbose = .LGB_VERBOSITY , metric = "mse" , seed = 0L , num_leaves = 2L , num_threads = .LGB_MAX_THREADS ) dtrain <- lgb.Dataset( data = matrix(rnorm(100L), ncol = 1L) , label = rnorm(100L) , params = list(num_threads = .LGB_MAX_THREADS) ) dtrain$construct() expect_error({ capture.output({ bst_linear <- lgb.train( data = dtrain , nrounds = 10L , params = 
utils::modifyList(params, list(linear_tree = TRUE)) ) }, type = "message") }, regexp = "Cannot change linear_tree after constructed Dataset handle") }) test_that("lgb.train() works with linear learners even if Dataset has missing values", { set.seed(708L) .new_dataset <- function() { values <- rnorm(100L) values[sample(seq_len(length(values)), size = 10L)] <- NA_real_ X <- matrix( data = sample(values, size = 100L) , ncol = 1L ) return(lgb.Dataset( data = X , label = 2L * X + runif(nrow(X), 0L, 0.1) , params = list(num_threads = .LGB_MAX_THREADS) )) } params <- list( objective = "regression" , verbose = .LGB_VERBOSITY , metric = "mse" , seed = 0L , num_leaves = 2L , num_threads = .LGB_MAX_THREADS ) dtrain <- .new_dataset() bst <- lgb.train( data = dtrain , nrounds = 10L , params = params , valids = list("train" = dtrain) ) expect_true(.is_Booster(bst)) dtrain <- .new_dataset() bst_linear <- lgb.train( data = dtrain , nrounds = 10L , params = utils::modifyList(params, list(linear_tree = TRUE)) , valids = list("train" = dtrain) ) expect_true(.is_Booster(bst_linear)) bst_last_mse <- bst$record_evals[["train"]][["l2"]][["eval"]][[10L]] bst_lin_last_mse <- bst_linear$record_evals[["train"]][["l2"]][["eval"]][[10L]] expect_true(bst_lin_last_mse < bst_last_mse) }) test_that("lgb.train() works with linear learners, bagging, and a Dataset that has missing values", { set.seed(708L) .new_dataset <- function() { values <- rnorm(100L) values[sample(seq_len(length(values)), size = 10L)] <- NA_real_ X <- matrix( data = sample(values, size = 100L) , ncol = 1L ) return(lgb.Dataset( data = X , label = 2L * X + runif(nrow(X), 0L, 0.1) , params = list(num_threads = .LGB_MAX_THREADS) )) } params <- list( objective = "regression" , verbose = .LGB_VERBOSITY , metric = "mse" , seed = 0L , num_leaves = 2L , bagging_freq = 1L , subsample = 0.8 , num_threads = .LGB_MAX_THREADS ) dtrain <- .new_dataset() bst <- lgb.train( data = dtrain , nrounds = 10L , params = params , valids = list("train" = dtrain) ) expect_true(.is_Booster(bst)) dtrain <- .new_dataset() bst_linear <- lgb.train( data = dtrain , nrounds = 10L , params = utils::modifyList(params, list(linear_tree = TRUE)) , valids = list("train" = dtrain) ) expect_true(.is_Booster(bst_linear)) bst_last_mse <- bst$record_evals[["train"]][["l2"]][["eval"]][[10L]] bst_lin_last_mse <- bst_linear$record_evals[["train"]][["l2"]][["eval"]][[10L]] expect_true(bst_lin_last_mse < bst_last_mse) }) test_that("lgb.train() works with linear learners and data where a feature has only 1 non-NA value", { set.seed(708L) .new_dataset <- function() { values <- c(rnorm(100L), rep(NA_real_, 100L)) values[118L] <- rnorm(1L) X <- matrix( data = values , ncol = 2L ) return(lgb.Dataset( data = X , label = 2L * X[, 1L] + runif(nrow(X), 0L, 0.1) , params = list( feature_pre_filter = FALSE , num_threads = .LGB_MAX_THREADS ) )) } params <- list( objective = "regression" , verbose = -1L , metric = "mse" , seed = 0L , num_leaves = 2L , num_threads = .LGB_MAX_THREADS ) dtrain <- .new_dataset() bst_linear <- lgb.train( data = dtrain , nrounds = 10L , params = utils::modifyList(params, list(linear_tree = TRUE)) ) expect_true(.is_Booster(bst_linear)) }) test_that("lgb.train() works with linear learners when Dataset has categorical features", { set.seed(708L) .new_dataset <- function() { X <- matrix(numeric(200L), nrow = 100L, ncol = 2L) X[, 1L] <- rnorm(100L) X[, 2L] <- sample(seq_len(4L), size = 100L, replace = TRUE) return(lgb.Dataset( data = X , label = 2L * X[, 1L] + runif(nrow(X), 0L, 0.1) , 
params = list(num_threads = .LGB_MAX_THREADS) )) } params <- list( objective = "regression" , verbose = -1L , metric = "mse" , seed = 0L , num_leaves = 2L , categorical_feature = 1L , num_threads = .LGB_MAX_THREADS ) dtrain <- .new_dataset() bst <- lgb.train( data = dtrain , nrounds = 10L , params = params , valids = list("train" = dtrain) ) expect_true(.is_Booster(bst)) dtrain <- .new_dataset() bst_linear <- lgb.train( data = dtrain , nrounds = 10L , params = utils::modifyList(params, list(linear_tree = TRUE)) , valids = list("train" = dtrain) ) expect_true(.is_Booster(bst_linear)) bst_last_mse <- bst$record_evals[["train"]][["l2"]][["eval"]][[10L]] bst_lin_last_mse <- bst_linear$record_evals[["train"]][["l2"]][["eval"]][[10L]] expect_true(bst_lin_last_mse < bst_last_mse) }) test_that("lgb.train() throws an informative error if interaction_constraints is not a list", { dtrain <- lgb.Dataset(train$data, label = train$label) params <- list(objective = "regression", interaction_constraints = "[1,2],[3]") expect_error({ bst <- lightgbm( data = dtrain , params = params , nrounds = 2L ) }, "interaction_constraints must be a list") }) test_that(paste0("lgb.train() throws an informative error if the members of interaction_constraints ", "are not character or numeric vectors"), { dtrain <- lgb.Dataset(train$data, label = train$label) params <- list(objective = "regression", interaction_constraints = list(list(1L, 2L), list(3L))) expect_error({ bst <- lightgbm( data = dtrain , params = params , nrounds = 2L ) }, "every element in interaction_constraints must be a character vector or numeric vector") }) test_that("lgb.train() throws an informative error if interaction_constraints contains a too large index", { dtrain <- lgb.Dataset(train$data, label = train$label) params <- list(objective = "regression", interaction_constraints = list(c(1L, length(colnames(train$data)) + 1L), 3L)) expect_error({ bst <- lightgbm( data = dtrain , params = params , nrounds = 2L ) }, "supplied a too large value in interaction_constraints") }) test_that(paste0("lgb.train() gives same result when interaction_constraints is specified as a list of ", "character vectors, numeric vectors, or a combination"), { set.seed(1L) dtrain <- lgb.Dataset(train$data, label = train$label, params = list(num_threads = .LGB_MAX_THREADS)) params <- list( objective = "regression" , interaction_constraints = list(c(1L, 2L), 3L) , verbose = .LGB_VERBOSITY , num_threads = .LGB_MAX_THREADS ) bst <- lightgbm( data = dtrain , params = params , nrounds = 2L ) pred1 <- bst$predict(test$data) cnames <- colnames(train$data) params <- list( objective = "regression" , interaction_constraints = list(c(cnames[[1L]], cnames[[2L]]), cnames[[3L]]) , verbose = .LGB_VERBOSITY , num_threads = .LGB_MAX_THREADS ) bst <- lightgbm( data = dtrain , params = params , nrounds = 2L ) pred2 <- bst$predict(test$data) params <- list( objective = "regression" , interaction_constraints = list(c(cnames[[1L]], cnames[[2L]]), 3L) , verbose = .LGB_VERBOSITY , num_threads = .LGB_MAX_THREADS ) bst <- lightgbm( data = dtrain , params = params , nrounds = 2L ) pred3 <- bst$predict(test$data) expect_equal(pred1, pred2) expect_equal(pred2, pred3) }) test_that(paste0("lgb.train() gives same results when using interaction_constraints and specifying colnames"), { set.seed(1L) dtrain <- lgb.Dataset(train$data, label = train$label, params = list(num_threads = .LGB_MAX_THREADS)) params <- list( objective = "regression" , interaction_constraints = list(c(1L, 2L), 3L) , verbose = 
.LGB_VERBOSITY , num_threads = .LGB_MAX_THREADS ) bst <- lightgbm( data = dtrain , params = params , nrounds = 2L ) pred1 <- bst$predict(test$data) new_colnames <- paste0(colnames(train$data), "_x") params <- list( objective = "regression" , interaction_constraints = list(c(new_colnames[1L], new_colnames[2L]), new_colnames[3L]) , verbose = .LGB_VERBOSITY , num_threads = .LGB_MAX_THREADS ) bst <- lightgbm( data = dtrain , params = params , nrounds = 2L , colnames = new_colnames ) pred2 <- bst$predict(test$data) expect_equal(pred1, pred2) }) .generate_trainset_for_monotone_constraints_tests <- function(x3_to_categorical) { n_samples <- 3000L x1_positively_correlated_with_y <- runif(n = n_samples, min = 0.0, max = 1.0) x2_negatively_correlated_with_y <- runif(n = n_samples, min = 0.0, max = 1.0) x3_negatively_correlated_with_y <- runif(n = n_samples, min = 0.0, max = 1.0) if (x3_to_categorical) { x3_negatively_correlated_with_y <- as.integer(x3_negatively_correlated_with_y / 0.01) categorical_features <- "feature_3" } else { categorical_features <- NULL } X <- matrix( data = c( x1_positively_correlated_with_y , x2_negatively_correlated_with_y , x3_negatively_correlated_with_y ) , ncol = 3L ) zs <- rnorm(n = n_samples, mean = 0.0, sd = 0.01) scales <- 10.0 * (runif(n = 6L, min = 0.0, max = 1.0) + 0.5) y <- ( scales[1L] * x1_positively_correlated_with_y + sin(scales[2L] * pi * x1_positively_correlated_with_y) - scales[3L] * x2_negatively_correlated_with_y - cos(scales[4L] * pi * x2_negatively_correlated_with_y) - scales[5L] * x3_negatively_correlated_with_y - cos(scales[6L] * pi * x3_negatively_correlated_with_y) + zs ) return(lgb.Dataset( data = X , label = y , categorical_feature = categorical_features , free_raw_data = FALSE , colnames = c("feature_1", "feature_2", "feature_3") , params = list(num_threads = .LGB_MAX_THREADS) )) } .is_increasing <- function(y) { return(all(diff(y) >= 0.0)) } .is_decreasing <- function(y) { return(all(diff(y) <= 0.0)) } .is_non_monotone <- function(y) { return(any(diff(y) < 0.0) & any(diff(y) > 0.0)) } # R equivalent of numpy.linspace() .linspace <- function(start_val, stop_val, num) { weights <- (seq_len(num) - 1L) / (num - 1L) return(start_val + weights * (stop_val - start_val)) } .is_correctly_constrained <- function(learner, x3_to_categorical) { iterations <- 10L n <- 1000L variable_x <- .linspace(0L, 1L, n) fixed_xs_values <- .linspace(0L, 1L, n) for (i in seq_len(iterations)) { fixed_x <- fixed_xs_values[i] * rep(1.0, n) monotonically_increasing_x <- matrix( data = c(variable_x, fixed_x, fixed_x) , ncol = 3L ) monotonically_increasing_y <- predict( learner , monotonically_increasing_x ) monotonically_decreasing_x <- matrix( data = c(fixed_x, variable_x, fixed_x) , ncol = 3L ) monotonically_decreasing_y <- predict( learner , monotonically_decreasing_x ) if (x3_to_categorical) { non_monotone_data <- c( fixed_x , fixed_x , as.integer(variable_x / 0.01) ) } else { non_monotone_data <- c(fixed_x, fixed_x, variable_x) } non_monotone_x <- matrix( data = non_monotone_data , ncol = 3L ) non_monotone_y <- predict( learner , non_monotone_x ) if (!(.is_increasing(monotonically_increasing_y) && .is_decreasing(monotonically_decreasing_y) && .is_non_monotone(non_monotone_y) )) { return(FALSE) } } return(TRUE) } for (x3_to_categorical in c(TRUE, FALSE)) { set.seed(708L) dtrain <- .generate_trainset_for_monotone_constraints_tests( x3_to_categorical = x3_to_categorical ) for (monotone_constraints_method in c("basic", "intermediate", "advanced")) { test_msg <- paste0( 
"lgb.train() supports monotone constraints (" , "categoricals=" , x3_to_categorical , ", method=" , monotone_constraints_method , ")" ) test_that(test_msg, { params <- list( min_data = 20L , num_leaves = 20L , monotone_constraints = c(1L, -1L, 0L) , monotone_constraints_method = monotone_constraints_method , use_missing = FALSE , verbose = .LGB_VERBOSITY , num_threads = .LGB_MAX_THREADS ) constrained_model <- lgb.train( params = params , data = dtrain , obj = "regression_l2" , nrounds = 100L ) expect_true({ .is_correctly_constrained( learner = constrained_model , x3_to_categorical = x3_to_categorical ) }) }) } } test_that("lightgbm() accepts objective as function argument and under params", { bst1 <- lightgbm( data = train$data , label = train$label , params = list(objective = "regression_l1", num_threads = .LGB_MAX_THREADS) , nrounds = 5L , verbose = .LGB_VERBOSITY ) expect_equal(bst1$params$objective, "regression_l1") model_txt_lines <- strsplit( x = bst1$save_model_to_string() , split = "\n" , fixed = TRUE )[[1L]] expect_true(any(model_txt_lines == "objective=regression_l1")) expect_false(any(model_txt_lines == "objective=regression_l2")) bst2 <- lightgbm( data = train$data , label = train$label , objective = "regression_l1" , nrounds = 5L , verbose = .LGB_VERBOSITY ) expect_equal(bst2$params$objective, "regression_l1") model_txt_lines <- strsplit( x = bst2$save_model_to_string() , split = "\n" , fixed = TRUE )[[1L]] expect_true(any(model_txt_lines == "objective=regression_l1")) expect_false(any(model_txt_lines == "objective=regression_l2")) }) test_that("lightgbm() prioritizes objective under params over objective as function argument", { bst1 <- lightgbm( data = train$data , label = train$label , objective = "regression" , params = list(objective = "regression_l1", num_threads = .LGB_MAX_THREADS) , nrounds = 5L , verbose = .LGB_VERBOSITY ) expect_equal(bst1$params$objective, "regression_l1") model_txt_lines <- strsplit( x = bst1$save_model_to_string() , split = "\n" , fixed = TRUE )[[1L]] expect_true(any(model_txt_lines == "objective=regression_l1")) expect_false(any(model_txt_lines == "objective=regression_l2")) bst2 <- lightgbm( data = train$data , label = train$label , objective = "regression" , params = list(loss = "regression_l1", num_threads = .LGB_MAX_THREADS) , nrounds = 5L , verbose = .LGB_VERBOSITY ) expect_equal(bst2$params$objective, "regression_l1") model_txt_lines <- strsplit( x = bst2$save_model_to_string() , split = "\n" , fixed = TRUE )[[1L]] expect_true(any(model_txt_lines == "objective=regression_l1")) expect_false(any(model_txt_lines == "objective=regression_l2")) }) test_that("lightgbm() accepts init_score as function argument", { bst1 <- lightgbm( data = train$data , label = train$label , objective = "binary" , nrounds = 5L , verbose = .LGB_VERBOSITY , params = list(num_threads = .LGB_MAX_THREADS) ) pred1 <- predict(bst1, train$data, type = "raw") bst2 <- lightgbm( data = train$data , label = train$label , init_score = pred1 , objective = "binary" , nrounds = 5L , verbose = .LGB_VERBOSITY , params = list(num_threads = .LGB_MAX_THREADS) ) pred2 <- predict(bst2, train$data, type = "raw") expect_true(any(pred1 != pred2)) }) test_that("lightgbm() defaults to 'regression' objective if objective not otherwise provided", { bst <- lightgbm( data = train$data , label = train$label , nrounds = 5L , verbose = .LGB_VERBOSITY , params = list(num_threads = .LGB_MAX_THREADS) ) expect_equal(bst$params$objective, "regression") model_txt_lines <- strsplit( x = 
bst$save_model_to_string() , split = "\n" , fixed = TRUE )[[1L]] expect_true(any(model_txt_lines == "objective=regression")) expect_false(any(model_txt_lines == "objective=regression_l1")) }) test_that("lightgbm() accepts 'num_threads' as either top-level argument or under params", { bst <- lightgbm( data = train$data , label = train$label , nrounds = 5L , verbose = .LGB_VERBOSITY , num_threads = 1L ) expect_equal(bst$params$num_threads, 1L) model_txt_lines <- strsplit( x = bst$save_model_to_string() , split = "\n" , fixed = TRUE )[[1L]] expect_true(any(grepl("[num_threads: 1]", model_txt_lines, fixed = TRUE))) bst <- lightgbm( data = train$data , label = train$label , nrounds = 5L , verbose = .LGB_VERBOSITY , params = list(num_threads = 1L) ) expect_equal(bst$params$num_threads, 1L) model_txt_lines <- strsplit( x = bst$save_model_to_string() , split = "\n" , fixed = TRUE )[[1L]] expect_true(any(grepl("[num_threads: 1]", model_txt_lines, fixed = TRUE))) bst <- lightgbm( data = train$data , label = train$label , nrounds = 5L , verbose = .LGB_VERBOSITY , num_threads = 10L , params = list(num_threads = 1L) ) expect_equal(bst$params$num_threads, 1L) model_txt_lines <- strsplit( x = bst$save_model_to_string() , split = "\n" , fixed = TRUE )[[1L]] expect_true(any(grepl("[num_threads: 1]", model_txt_lines, fixed = TRUE))) }) test_that("lightgbm() accepts 'weight' and 'weights'", { data(mtcars) X <- as.matrix(mtcars[, -1L]) y <- as.numeric(mtcars[, 1L]) w <- rep(1.0, nrow(X)) model <- lightgbm( X , y , weights = w , obj = "regression" , nrounds = 5L , verbose = .LGB_VERBOSITY , params = list( min_data_in_bin = 1L , min_data_in_leaf = 1L , num_threads = .LGB_MAX_THREADS ) ) expect_equal(model$.__enclos_env__$private$train_set$get_field("weight"), w) # Avoid a bad CRAN check due to partial argument matches lgb_args <- list( X , y , weight = w , obj = "regression" , nrounds = 5L , verbose = -1L ) model <- do.call(lightgbm, lgb_args) expect_equal(model$.__enclos_env__$private$train_set$get_field("weight"), w) }) .assert_has_expected_logs <- function(log_txt, lgb_info, lgb_warn, early_stopping, valid_eval_msg) { expect_identical( object = any(grepl("[LightGBM] [Info]", log_txt, fixed = TRUE)) , expected = lgb_info ) expect_identical( object = any(grepl("[LightGBM] [Warning]", log_txt, fixed = TRUE)) , expected = lgb_warn ) expect_identical( object = any(grepl("Will train until there is no improvement in 5 rounds", log_txt, fixed = TRUE)) , expected = early_stopping ) expect_identical( object = any(grepl("Did not meet early stopping", log_txt, fixed = TRUE)) , expected = early_stopping ) expect_identical( object = any(grepl("valid's auc\\:[0-9]+", log_txt)) , expected = valid_eval_msg ) } .assert_has_expected_record_evals <- function(fitted_model) { record_evals <- fitted_model$record_evals expect_equal(record_evals$start_iter, 1L) if (inherits(fitted_model, "lgb.CVBooster")) { expected_valid_auc <- c(0.979056, 0.9844697, 0.9900813, 0.9908026, 0.9935588) } else { expected_valid_auc <- c(0.9805752, 0.9805752, 0.9934957, 0.9934957, 0.9949372) } expect_equal( object = unlist(record_evals[["valid"]][["auc"]][["eval"]]) , expected = expected_valid_auc , tolerance = .LGB_NUMERIC_TOLERANCE ) expect_named(record_evals, c("start_iter", "valid"), ignore.order = TRUE, ignore.case = FALSE) expect_equal(record_evals[["valid"]][["auc"]][["eval_err"]], list()) } .train_for_verbosity_test <- function(train_function, verbose_kwarg, verbose_param) { set.seed(708L) nrounds <- 5L params <- list( num_leaves = 5L , objective 
= "binary" , metric = "auc" , early_stopping_round = nrounds , num_threads = .LGB_MAX_THREADS # include a nonsense parameter just to trigger a WARN-level log , nonsense_param = 1.0 ) if (!is.null(verbose_param)) { params[["verbose"]] <- verbose_param } train_kwargs <- list( params = params , nrounds = nrounds ) if (!is.null(verbose_kwarg)) { train_kwargs[["verbose"]] <- verbose_kwarg } function_name <- deparse(substitute(train_function)) if (function_name == "lgb.train") { train_kwargs[["data"]] <- lgb.Dataset( data = train$data , label = train$label , params = list(num_threads = .LGB_MAX_THREADS) ) train_kwargs[["valids"]] <- list( "valid" = lgb.Dataset(data = test$data, label = test$label) ) } else if (function_name == "lightgbm") { train_kwargs[["data"]] <- train$data train_kwargs[["label"]] <- train$label train_kwargs[["valids"]] <- list( "valid" = lgb.Dataset(data = test$data, label = test$label) ) } else if (function_name == "lgb.cv") { train_kwargs[["data"]] <- lgb.Dataset( data = train$data , label = train$label ) train_kwargs[["nfold"]] <- 3L train_kwargs[["showsd"]] <- FALSE } log_txt <- capture.output({ bst <- do.call( what = train_function , args = train_kwargs ) }) return(list(booster = bst, logs = log_txt)) } test_that("lgb.train() only prints eval metrics when expected to", { # regardless of value passed to keyword argument 'verbose', value in params # should take precedence for (verbose_keyword_arg in c(-5L, -1L, 0L, 1L, 5L)) { # (verbose = -1) should not be any logs, should be record evals out <- .train_for_verbosity_test( train_function = lgb.train , verbose_kwarg = verbose_keyword_arg , verbose_param = -1L ) .assert_has_expected_logs( log_txt = out[["logs"]] , lgb_info = FALSE , lgb_warn = FALSE , early_stopping = FALSE , valid_eval_msg = FALSE ) .assert_has_expected_record_evals( fitted_model = out[["booster"]] ) # (verbose = 0) should be only WARN-level LightGBM logs out <- .train_for_verbosity_test( train_function = lgb.train , verbose_kwarg = verbose_keyword_arg , verbose_param = 0L ) .assert_has_expected_logs( log_txt = out[["logs"]] , lgb_info = FALSE , lgb_warn = TRUE , early_stopping = FALSE , valid_eval_msg = FALSE ) .assert_has_expected_record_evals( fitted_model = out[["booster"]] ) # (verbose > 0) should be INFO- and WARN-level LightGBM logs, and record eval messages out <- .train_for_verbosity_test( train_function = lgb.train , verbose_kwarg = verbose_keyword_arg , verbose_param = 1L ) .assert_has_expected_logs( log_txt = out[["logs"]] , lgb_info = TRUE , lgb_warn = TRUE , early_stopping = TRUE , valid_eval_msg = TRUE ) .assert_has_expected_record_evals( fitted_model = out[["booster"]] ) } # if verbosity isn't specified in `params`, changing keyword argument `verbose` should # alter what messages are printed # (verbose = -1) should not be any logs, should be record evals out <- .train_for_verbosity_test( train_function = lgb.train , verbose_kwarg = -1L , verbose_param = NULL ) .assert_has_expected_logs( log_txt = out[["logs"]] , lgb_info = FALSE , lgb_warn = FALSE , early_stopping = FALSE , valid_eval_msg = FALSE ) .assert_has_expected_record_evals( fitted_model = out[["booster"]] ) # (verbose = 0) should be only WARN-level LightGBM logs out <- .train_for_verbosity_test( train_function = lgb.train , verbose_kwarg = 0L , verbose_param = NULL ) .assert_has_expected_logs( log_txt = out[["logs"]] , lgb_info = FALSE , lgb_warn = TRUE , early_stopping = FALSE , valid_eval_msg = FALSE ) .assert_has_expected_record_evals( fitted_model = out[["booster"]] ) # (verbose 
> 0) should be INFO- and WARN-level LightGBM logs, and record eval messages out <- .train_for_verbosity_test( train_function = lgb.train , verbose_kwarg = 1L , verbose_param = NULL ) .assert_has_expected_logs( log_txt = out[["logs"]] , lgb_info = TRUE , lgb_warn = TRUE , early_stopping = TRUE , valid_eval_msg = TRUE ) .assert_has_expected_record_evals( fitted_model = out[["booster"]] ) }) test_that("lightgbm() only prints eval metrics when expected to", { # regardless of value passed to keyword argument 'verbose', value in params # should take precedence for (verbose_keyword_arg in c(-5L, -1L, 0L, 1L, 5L)) { # (verbose = -1) should not be any logs, train should not be in valids out <- .train_for_verbosity_test( train_function = lightgbm , verbose_kwarg = verbose_keyword_arg , verbose_param = -1L ) .assert_has_expected_logs( log_txt = out[["logs"]] , lgb_info = FALSE , lgb_warn = FALSE , early_stopping = FALSE , valid_eval_msg = FALSE ) .assert_has_expected_record_evals( fitted_model = out[["booster"]] ) # (verbose = 0) should be only WARN-level LightGBM logs, train should not be in valids out <- .train_for_verbosity_test( train_function = lightgbm , verbose_kwarg = verbose_keyword_arg , verbose_param = 0L ) .assert_has_expected_logs( log_txt = out[["logs"]] , lgb_info = FALSE , lgb_warn = TRUE , early_stopping = FALSE , valid_eval_msg = FALSE ) .assert_has_expected_record_evals( fitted_model = out[["booster"]] ) # (verbose > 0) should be INFO- and WARN-level LightGBM logs, and record eval messages, and # train should be in valids out <- .train_for_verbosity_test( train_function = lightgbm , verbose_kwarg = verbose_keyword_arg , verbose_param = 1L ) .assert_has_expected_logs( log_txt = out[["logs"]] , lgb_info = TRUE , lgb_warn = TRUE , early_stopping = TRUE , valid_eval_msg = TRUE ) .assert_has_expected_record_evals( fitted_model = out[["booster"]] ) } # if verbosity isn't specified in `params`, changing keyword argument `verbose` should # alter what messages are printed # (verbose = -1) should not be any logs, train should not be in valids out <- .train_for_verbosity_test( train_function = lightgbm , verbose_kwarg = -1L , verbose_param = NULL ) .assert_has_expected_logs( log_txt = out[["logs"]] , lgb_info = FALSE , lgb_warn = FALSE , early_stopping = FALSE , valid_eval_msg = FALSE ) .assert_has_expected_record_evals( fitted_model = out[["booster"]] ) # (verbose = 0) should be only WARN-level LightGBM logs, train should not be in valids out <- .train_for_verbosity_test( train_function = lightgbm , verbose_kwarg = 0L , verbose_param = NULL ) .assert_has_expected_logs( log_txt = out[["logs"]] , lgb_info = FALSE , lgb_warn = TRUE , early_stopping = FALSE , valid_eval_msg = FALSE ) .assert_has_expected_record_evals( fitted_model = out[["booster"]] ) # (verbose > 0) should be INFO- and WARN-level LightGBM logs, and record eval messages, and # train should be in valids out <- .train_for_verbosity_test( train_function = lightgbm , verbose_kwarg = 1L , verbose_param = NULL ) .assert_has_expected_logs( log_txt = out[["logs"]] , lgb_info = TRUE , lgb_warn = TRUE , early_stopping = TRUE , valid_eval_msg = TRUE ) .assert_has_expected_record_evals( fitted_model = out[["booster"]] ) }) test_that("lgb.cv() only prints eval metrics when expected to", { # regardless of value passed to keyword argument 'verbose', value in params # should take precedence for (verbose_keyword_arg in c(-5L, -1L, 0L, 1L, 5L)) { # (verbose = -1) should not be any logs, should be record evals out <- .train_for_verbosity_test( 
verbose_kwarg = verbose_keyword_arg , verbose_param = -1L , train_function = lgb.cv ) .assert_has_expected_logs( log_txt = out[["logs"]] , lgb_info = FALSE , lgb_warn = FALSE , early_stopping = FALSE , valid_eval_msg = FALSE ) .assert_has_expected_record_evals( fitted_model = out[["booster"]] ) # (verbose = 0) should be only WARN-level LightGBM logs out <- .train_for_verbosity_test( verbose_kwarg = verbose_keyword_arg , verbose_param = 0L , train_function = lgb.cv ) .assert_has_expected_logs( log_txt = out[["logs"]] , lgb_info = FALSE , lgb_warn = TRUE , early_stopping = FALSE , valid_eval_msg = FALSE ) .assert_has_expected_record_evals( fitted_model = out[["booster"]] ) # (verbose > 0) should be INFO- and WARN-level LightGBM logs, and record eval messages out <- .train_for_verbosity_test( verbose_kwarg = verbose_keyword_arg , verbose_param = 1L , train_function = lgb.cv ) .assert_has_expected_logs( log_txt = out[["logs"]] , lgb_info = TRUE , lgb_warn = TRUE , early_stopping = TRUE , valid_eval_msg = TRUE ) .assert_has_expected_record_evals( fitted_model = out[["booster"]] ) } # if verbosity isn't specified in `params`, changing keyword argument `verbose` should # alter what messages are printed # (verbose = -1) should not be any logs, should be record evals out <- .train_for_verbosity_test( verbose_kwarg = -1L , verbose_param = NULL , train_function = lgb.cv ) .assert_has_expected_logs( log_txt = out[["logs"]] , lgb_info = FALSE , lgb_warn = FALSE , early_stopping = FALSE , valid_eval_msg = FALSE ) .assert_has_expected_record_evals( fitted_model = out[["booster"]] ) # (verbose = 0) should be only WARN-level LightGBM logs out <- .train_for_verbosity_test( verbose_kwarg = 0L , verbose_param = NULL , train_function = lgb.cv ) .assert_has_expected_logs( log_txt = out[["logs"]] , lgb_info = FALSE , lgb_warn = TRUE , early_stopping = FALSE , valid_eval_msg = FALSE ) .assert_has_expected_record_evals( fitted_model = out[["booster"]] ) # (verbose > 0) should be INFO- and WARN-level LightGBM logs, and record eval messages out <- .train_for_verbosity_test( verbose_kwarg = 1L , verbose_param = NULL , train_function = lgb.cv ) .assert_has_expected_logs( log_txt = out[["logs"]] , lgb_info = TRUE , lgb_warn = TRUE , early_stopping = TRUE , valid_eval_msg = TRUE ) .assert_has_expected_record_evals( fitted_model = out[["booster"]] ) }) test_that("lightgbm() changes objective='auto' appropriately", { # Regression data("mtcars") y <- mtcars$mpg x <- as.matrix(mtcars[, -1L]) model <- lightgbm(x, y, objective = "auto", verbose = .LGB_VERBOSITY, nrounds = 5L, num_threads = .LGB_MAX_THREADS) expect_equal(model$params$objective, "regression") model_txt_lines <- strsplit( x = model$save_model_to_string() , split = "\n" , fixed = TRUE )[[1L]] expect_true(any(grepl("objective=regression", model_txt_lines, fixed = TRUE))) expect_false(any(grepl("objective=regression_l1", model_txt_lines, fixed = TRUE))) # Binary classification x <- train$data y <- factor(train$label) model <- lightgbm(x, y, objective = "auto", verbose = .LGB_VERBOSITY, nrounds = 5L, num_threads = .LGB_MAX_THREADS) expect_equal(model$params$objective, "binary") model_txt_lines <- strsplit( x = model$save_model_to_string() , split = "\n" , fixed = TRUE )[[1L]] expect_true(any(grepl("objective=binary", model_txt_lines, fixed = TRUE))) # Multi-class classification data("iris") y <- factor(iris$Species) x <- as.matrix(iris[, -5L]) model <- lightgbm(x, y, objective = "auto", verbose = .LGB_VERBOSITY, nrounds = 
5L, num_threads = .LGB_MAX_THREADS) expect_equal(model$params$objective, "multiclass") expect_equal(model$params$num_class, 3L) model_txt_lines <- strsplit( x = model$save_model_to_string() , split = "\n" , fixed = TRUE )[[1L]] expect_true(any(grepl("objective=multiclass", model_txt_lines, fixed = TRUE))) }) test_that("lightgbm() determines number of classes for non-default multiclass objectives", { data("iris") y <- factor(iris$Species) x <- as.matrix(iris[, -5L]) model <- lightgbm( x , y , objective = "multiclassova" , verbose = .LGB_VERBOSITY , nrounds = 5L , num_threads = .LGB_MAX_THREADS ) expect_equal(model$params$objective, "multiclassova") expect_equal(model$params$num_class, 3L) model_txt_lines <- strsplit( x = model$save_model_to_string() , split = "\n" , fixed = TRUE )[[1L]] expect_true(any(grepl("objective=multiclassova", model_txt_lines, fixed = TRUE))) }) test_that("lightgbm() doesn't accept binary classification with non-binary factors", { data("iris") y <- factor(iris$Species) x <- as.matrix(iris[, -5L]) expect_error({ lightgbm(x, y, objective = "binary", verbose = .LGB_VERBOSITY, nrounds = 5L, num_threads = .LGB_MAX_THREADS) }, regexp = "Factors with >2 levels as labels only allowed for multi-class objectives") }) test_that("lightgbm() doesn't accept multi-class classification with binary factors", { data("iris") y <- as.character(iris$Species) y[y == "setosa"] <- "versicolor" y <- factor(y) x <- as.matrix(iris[, -5L]) expect_error({ lightgbm(x, y, objective = "multiclass", verbose = .LGB_VERBOSITY, nrounds = 5L, num_threads = .LGB_MAX_THREADS) }, regexp = "Two-level factors as labels only allowed for objective='binary'") }) test_that("lightgbm() model predictions retain factor levels for multiclass classification", { data("iris") y <- factor(iris$Species) x <- as.matrix(iris[, -5L]) model <- lightgbm(x, y, objective = "auto", verbose = .LGB_VERBOSITY, nrounds = 5L, num_threads = .LGB_MAX_THREADS) pred <- predict(model, x, type = "class") expect_true(is.factor(pred)) expect_equal(levels(pred), levels(y)) pred <- predict(model, x, type = "response") expect_equal(colnames(pred), levels(y)) pred <- predict(model, x, type = "raw") expect_equal(colnames(pred), levels(y)) }) test_that("lightgbm() model predictions retain factor levels for binary classification", { data("iris") y <- as.character(iris$Species) y[y == "setosa"] <- "versicolor" y <- factor(y) x <- as.matrix(iris[, -5L]) model <- lightgbm(x, y, objective = "auto", verbose = .LGB_VERBOSITY, nrounds = 5L, num_threads = .LGB_MAX_THREADS) pred <- predict(model, x, type = "class") expect_true(is.factor(pred)) expect_equal(levels(pred), levels(y)) pred <- predict(model, x, type = "response") expect_true(is.vector(pred)) expect_true(is.numeric(pred)) expect_false(any(pred %in% y)) pred <- predict(model, x, type = "raw") expect_true(is.vector(pred)) expect_true(is.numeric(pred)) expect_false(any(pred %in% y)) }) test_that("lightgbm() accepts named categorical_features", { data(mtcars) y <- mtcars$mpg x <- as.matrix(mtcars[, -1L]) model <- lightgbm( x , y , categorical_feature = "cyl" , verbose = .LGB_VERBOSITY , nrounds = 5L , num_threads = .LGB_MAX_THREADS ) expect_true(length(model$params$categorical_feature) > 0L) }) test_that("lightgbm() correctly sets objective when passing lgb.Dataset as input", { data(mtcars) y <- mtcars$mpg x <- as.matrix(mtcars[, -1L]) ds <- lgb.Dataset(x, label = y) model <- lightgbm( ds , objective = "auto" , verbose = .LGB_VERBOSITY , nrounds = 5L , num_threads = .LGB_MAX_THREADS ) 
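# an lgb.Dataset input carries a plain numeric label (not a factor), so objective = "auto"
# is expected to fall back to the default "regression" objective here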
expect_equal(model$params$objective, "regression") }) test_that("Evaluation metrics aren't printed as a single-element vector", { log_txt <- capture_output({ data(mtcars) y <- mtcars$mpg x <- as.matrix(mtcars[, -1L]) cv_result <- lgb.cv( data = lgb.Dataset(x, label = y) , params = list( objective = "regression" , metric = "l2" , min_data_in_leaf = 5L , max_depth = 3L , num_threads = .LGB_MAX_THREADS ) , nrounds = 2L , nfold = 3L , verbose = 1L , eval_train_metric = TRUE ) }) expect_false(grepl("[1] \"[1]", log_txt, fixed = TRUE)) })
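# [editor's note] The sketch below is not part of the test suite; it is a minimal,
# self-contained illustration of the arithmetic behind the `cv_value` expectation in
# the first_metric_only / early-stopping tests above. The helper name
# `.sketch_expected_cv_value` is hypothetical (it does not exist in lightgbm).
# Assumption: with k-fold CV and an eval function that adds 0.1 to a shared
# accumulator on every call, the CV score reported for an iteration is the mean of
# the k accumulator values observed during that iteration.
.sketch_expected_cv_value <- function(starting_value, nrounds, nfolds) {
  acc <- starting_value
  per_iter_means <- numeric(nrounds)
  for (iter in seq_len(nrounds)) {
    fold_values <- numeric(nfolds)
    for (fold in seq_len(nfolds)) {
      # each fold's booster bumps the shared accumulator once per iteration
      acc <- acc + 0.1
      fold_values[fold] <- acc
    }
    per_iter_means[iter] <- mean(fold_values)
  }
  return(per_iter_means)
}
# the final element matches the closed form used in the tests:
#   starting_value + mean(seq_len(nfolds) / 10.0) + (nrounds - 1L) * 0.1 * nfolds
# e.g. .sketch_expected_cv_value(0.0, nrounds = 10L, nfolds = 5L)
# yields 0.3, 0.8, 1.3, ..., 4.8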