####################
# Author: James Hickey
#
# Series of tests to check the behaviour of the predict S3 method.
#
####################

context("Test input checking - predict.GBMFit")

# Helper: simulate the Gaussian training set used by every Gaussian test.
# The seed is set inside so repeated calls reproduce the exact data the
# original (copy-pasted) tests generated.
make_gaussian_train_data <- function(seed = 1) {
  set.seed(seed)
  N <- 1000
  X1 <- runif(N)
  X2 <- 2 * runif(N)
  X3 <- factor(sample(letters[1:4], N, replace = TRUE))
  X4 <- ordered(sample(letters[1:6], N, replace = TRUE))
  X5 <- factor(sample(letters[1:3], N, replace = TRUE))
  X6 <- 3 * runif(N)
  mu <- c(-1, 0, 1, 2)[as.numeric(X3)]
  SNR <- 10  # signal-to-noise ratio
  Y <- X1^1.5 + 2 * (X2^0.5) + mu
  sigma <- sqrt(var(Y) / SNR)
  Y <- Y + rnorm(N, 0, sigma)
  # create a bunch of missing values
  X1[sample(seq_len(N), size = 100)] <- NA
  X3[sample(seq_len(N), size = 300)] <- NA
  data.frame(Y = Y, X1 = X1, X2 = X2, X3 = X3, X4 = X4, X5 = X5, X6 = X6)
}

# Helper: fit the Gaussian GBM used across the input-checking tests.
# offset_in_formula = TRUE reproduces the one test that supplies the offset
# via the model formula instead of the `offset` argument.
fit_gaussian_gbm <- function(data, num_trees = 200, offset_in_formula = FALSE) {
  N <- nrow(data)
  w <- rep(1, N)
  offset <- rep(0, N)
  params <- training_params(num_trees = num_trees, interaction_depth = 3,
                            min_num_obs_in_node = 10, shrinkage = 0.005,
                            bag_fraction = 0.5, id = seq(nrow(data)),
                            num_train = N / 2, num_features = 6)
  dist <- gbm_dist("Gaussian")
  if (offset_in_formula) {
    gbmt(Y ~ X1 + X2 + X3 + X4 + X5 + X6 + offset(offset), data = data,
         distribution = dist, weights = w, train_params = params,
         var_monotone = c(0, 0, 0, 0, 0, 0), keep_gbm_data = TRUE,
         cv_folds = 10, is_verbose = FALSE)
  } else {
    gbmt(Y ~ X1 + X2 + X3 + X4 + X5 + X6, data = data, distribution = dist,
         weights = w, offset = offset, train_params = params,
         var_monotone = c(0, 0, 0, 0, 0, 0), keep_gbm_data = TRUE,
         cv_folds = 10, is_verbose = FALSE)
  }
}

# Helper: simulate fresh Gaussian data (no noise, no NAs) used as newdata
# in the predict calls.
make_gaussian_new_data <- function(seed = 2) {
  set.seed(seed)
  N <- 1000
  X1 <- runif(N)
  X2 <- 2 * runif(N)
  X3 <- factor(sample(letters[1:4], N, replace = TRUE))
  X4 <- ordered(sample(letters[1:6], N, replace = TRUE))
  X5 <- factor(sample(letters[1:3], N, replace = TRUE))
  X6 <- 3 * runif(N)
  mu <- c(-1, 0, 1, 2)[as.numeric(X3)]
  # Actual underlying signal - check how close to this we are
  Y <- X1^1.5 + 2 * (X2^0.5) + mu
  data.frame(Y = Y, X1 = X1, X2 = X2, X3 = X3, X4 = X4, X5 = X5, X6 = X6)
}

# Helper: simulate Bernoulli data used by the basic-functionality tests.
make_bernoulli_data <- function(seed) {
  set.seed(seed)
  N <- 1000
  X1 <- runif(N)
  X2 <- runif(N)
  X3 <- factor(sample(letters[1:4], N, replace = TRUE))
  mu <- c(-1, 0, 1, 2)[as.numeric(X3)]
  p <- 1 / (1 + exp(-(sin(3 * X1) - 4 * X2 + mu)))
  Y <- rbinom(N, 1, p)
  data.frame(Y = Y, X1 = X1, X2 = X2, X3 = X3)
}

# Helper: fit the Bernoulli GBM (seed 1 training data, random exponential
# weights, zero offset) shared by the basic-functionality tests.
fit_bernoulli_gbm <- function() {
  data <- make_bernoulli_data(seed = 1)
  N <- nrow(data)
  # random weights if you want to experiment with them
  w <- rexp(N)
  w <- N * w / sum(w)
  offset <- rep(0, N)
  params <- training_params(num_trees = 300, interaction_depth = 3,
                            min_num_obs_in_node = 10, shrinkage = 0.001,
                            bag_fraction = 0.5, id = seq(nrow(data)),
                            num_train = N / 2, num_features = 3)
  dist <- gbm_dist("Bernoulli")
  gbmt(Y ~ X1 + X2 + X3, data = data, distribution = dist, weights = w,
       offset = offset, train_params = params, var_monotone = c(0, 0, 0),
       keep_gbm_data = TRUE, cv_folds = 5, is_verbose = FALSE)
}

test_that("Error thrown if type is not 'link' or 'response'", {
  # Given a Gaussian fit and new data
  data <- make_gaussian_train_data()
  fit <- fit_gaussian_gbm(data)
  data2 <- make_gaussian_new_data()

  # When calling predict with type not 'link' or 'response'
  # Then an error will be thrown
  expect_error(predict(fit, data2, length(fit$trees), type = 'wrong'),
               "type must be either 'link' or 'response'")
})

test_that("Error thrown if data is missing", {
  # Given a Gaussian fit (new data is not passed to predict)
  data <- make_gaussian_train_data()
  fit <- fit_gaussian_gbm(data)

  # When calling predict without data
  # Then an error will be thrown
  expect_error(predict(fit, n.trees = length(fit$trees)),
               "newdata must be provided as a data frame")
})

test_that("Error thrown if data is not in data.frame", {
  # Given a Gaussian fit and new data
  data <- make_gaussian_train_data()
  fit <- fit_gaussian_gbm(data)
  data2 <- make_gaussian_new_data()

  # When calling predict with data not in a data.frame
  # Then an error will be thrown
  expect_error(predict(fit, newdata = as.list(data2),
                       n.trees = length(fit$trees)),
               "newdata must be provided as a data frame")
})

test_that("Error thrown if n.trees not provided", {
  # Given a Gaussian fit and new data
  data <- make_gaussian_train_data()
  fit <- fit_gaussian_gbm(data)
  data2 <- make_gaussian_new_data()

  # When calling predict without the number of trees specified
  # Then an error will be thrown
  expect_error(predict(fit, data2),
               "Number of trees to be used in prediction must be provided.")
})

test_that("Error thrown if n.trees is NULL or vector of 0 length", {
  # Given a Gaussian fit and new data
  data <- make_gaussian_train_data()
  fit <- fit_gaussian_gbm(data)
  data2 <- make_gaussian_new_data()

  # When calling predict with number of trees NULL or length 0
  # Then an error will be thrown
  expect_error(predict(fit, data2, NULL),
               "n.trees cannot be NULL or a vector of zero length")
  expect_error(predict(fit, data2, c()),
               "n.trees cannot be NULL or a vector of zero length")
})

test_that("Error thrown if n.trees has element which is not an integer", {
  # Given a Gaussian fit and new data
  # NOTE(review): the original used num_trees = 2000 here while every other
  # test uses 200 - possibly a typo, preserved to keep behaviour identical.
  data <- make_gaussian_train_data()
  fit <- fit_gaussian_gbm(data, num_trees = 2000)
  data2 <- make_gaussian_new_data()

  # When calling predict with n.trees containing a non-integer element
  # Then an error will be thrown
  expect_error(predict(fit, data2, n.trees = c(1, -1.2)),
               "n.trees must be a vector of non-negative integers")
})

test_that("Warning thrown if offset was in original fit", {
  # Given a Gaussian fit whose formula contains an offset term
  data <- make_gaussian_train_data()
  fit <- fit_gaussian_gbm(data, offset_in_formula = TRUE)
  data2 <- make_gaussian_new_data()

  # When calling predict with the original fit containing the offset
  # Then a warning will be thrown
  expect_warning(predict(fit, data2, length(fit$trees)),
                 "predict.GBMFit does not add the offset to the predicted values.")
})

test_that("Warning thrown if n.trees exceeds number in original fit", {
  # Given a Gaussian fit and new data
  data <- make_gaussian_train_data()
  fit <- fit_gaussian_gbm(data)
  data2 <- make_gaussian_new_data()

  # When calling predict with more trees than were fit
  # Then a warning will be thrown.
  # Bug fix: the expected message must be a single pattern string - the
  # original passed it as three separate arguments, which fell into
  # expect_warning's `...` and were never matched against the warning.
  expect_warning(predict(fit, data2, length(fit$trees) + 1),
                 paste0("Number of trees exceeded number fit so far. Using ",
                        length(fit$trees), "."))
})

context("Test basic functionality of predict.GBMFit")

test_that("predict works if Terms defined in original fit", {
  # Given a Gaussian fit and new data
  data <- make_gaussian_train_data()
  fit <- fit_gaussian_gbm(data)
  data2 <- make_gaussian_new_data()

  # When calling predict correctly
  # Then no error is thrown
  expect_error(predict(fit, data2, length(fit$trees)), NA)
})

test_that("type='response' scales predictions", {
  # Given a Bernoulli fit and new data
  fit <- fit_bernoulli_gbm()
  data2 <- make_bernoulli_data(seed = 2)

  # When calling predict with type = 'response' and type = 'link'
  preds <- predict(fit, data2, n.trees = length(fit$trees), type = 'response')
  preds_link <- predict(fit, data2, n.trees = length(fit$trees), type = 'link')

  # Then type='response' predictions are the scaled link predictions
  expect_equal(preds, adjust_pred_scale(preds_link, fit$distribution))
})

test_that("Output is matrix if length(n.trees) > 1", {
  # Given a Bernoulli fit and new data
  fit <- fit_bernoulli_gbm()
  data2 <- make_bernoulli_data(seed = 2)

  # When calling predict with length(n.trees) > 1
  preds <- predict(fit, data2, n.trees = c(50, 100))

  # Then the output is a matrix (one column per n.trees value)
  expect_true(is.matrix(preds))
})

test_that("Output is vector if length(n.trees) == 1", {
  # Given a Bernoulli fit and new data
  fit <- fit_bernoulli_gbm()
  data2 <- make_bernoulli_data(seed = 2)

  # When calling predict with a single n.trees value
  preds <- predict(fit, data2, n.trees = length(fit$trees), type = 'response')

  # Then the output is a vector
  expect_true(is.vector(preds))
})

test_that("When n.trees specified exceeds total number in fit then number in fit used", {
  # Given a Bernoulli fit and new data
  fit <- fit_bernoulli_gbm()
  data2 <- make_bernoulli_data(seed = 2)

  # When calling predict with n.trees > length(fit$trees)
  # Then predictions are evaluated at n.trees = length(fit$trees)
  expect_equal(predict(fit, data2, n.trees = length(fit$trees) + 1),
               predict(fit, data2, n.trees = length(fit$trees)))
})