#################### # Author: James Hickey # # Series of tests to check the extraction of observations # into different folds # #################### context("Error checking") test_that("split_and_join throws an error if gbm_data_obj is not GBMData object", { # Given gbm_data, a distribution obj, rows_in_fold and rows_in_training ## test Gaussian distribution gbm model set.seed(1) # create some data N <- 1000 X1 <- runif(N) X2 <- 2*runif(N) X3 <- factor(sample(letters[1:4],N,replace=T)) X4 <- ordered(sample(letters[1:6],N,replace=T)) X5 <- factor(sample(letters[1:3],N,replace=T)) X6 <- 3*runif(N) mu <- c(-1,0,1,2)[as.numeric(X3)] SNR <- 10 # signal-to-noise ratio Y <- X1**1.5 + 2 * (X2**.5) + mu sigma <- sqrt(var(Y)/SNR) Y <- Y + rnorm(N,0,sigma) # create a bunch of missing values X1[sample(1:N,size=100)] <- NA X3[sample(1:N,size=300)] <- NA w <- rep(1,N) offset <- rep(0, N) data <- data.frame(X1=X1,X2=X2,X3=X3,X4=X4,X5=X5,X6=X6) # Set up for new API params <- training_params(num_trees=2000, interaction_depth=3, min_num_obs_in_node=10, shrinkage=0.005, bag_fraction=0.5, id=seq_len(nrow(data)), num_train=N/2, num_features=6) dist <- gbm_dist("Gaussian") gdata <- gbm_data(data, Y, w, offset) cv_folds <- 5 cv_groups <- create_cv_groups(gbm_data_obj = gdata, dist, params, cv_folds, FALSE, NULL) # Observations in the training set rows_in_training_set <- params$id %in% seq_len(params$num_train_rows) params$id <- params$id[rows_in_training_set] # Observations in cv_group - interested in first fold rows_in_fold <- params$id %in% seq_len(params$num_train_rows)[(cv_groups == 1)] # Extract relevent data - split into training and validation sets # Calculate new number of training rows params$num_train_rows <- length(which(cv_groups != 1)) params$num_train <- length(unique(params$id[!rows_in_fold])) # When try to split_and_join not GBMData class(gdata) <- "wrong" # Then error is thrown expect_error(split_and_join(gdata, params, rows_in_training_set, rows_in_fold)) }) test_that("split_and_join throws an error if train_params is not GBMTrainParams object", { # Given gbm_data, a distribution obj, rows_in_fold and rows_in_training ## test Gaussian distribution gbm model set.seed(1) # create some data N <- 1000 X1 <- runif(N) X2 <- 2*runif(N) X3 <- factor(sample(letters[1:4],N,replace=T)) X4 <- ordered(sample(letters[1:6],N,replace=T)) X5 <- factor(sample(letters[1:3],N,replace=T)) X6 <- 3*runif(N) mu <- c(-1,0,1,2)[as.numeric(X3)] SNR <- 10 # signal-to-noise ratio Y <- X1**1.5 + 2 * (X2**.5) + mu sigma <- sqrt(var(Y)/SNR) Y <- Y + rnorm(N,0,sigma) # create a bunch of missing values X1[sample(1:N,size=100)] <- NA X3[sample(1:N,size=300)] <- NA w <- rep(1,N) offset <- rep(0, N) data <- data.frame(X1=X1,X2=X2,X3=X3,X4=X4,X5=X5,X6=X6) # Set up for new API params <- training_params(num_trees=2000, interaction_depth=3, min_num_obs_in_node=10, shrinkage=0.005, bag_fraction=0.5, id=seq_len(nrow(data)), num_train=N/2, num_features=6) dist <- gbm_dist("Gaussian") gdata <- gbm_data(data, Y, w, offset) cv_folds <- 5 cv_groups <- create_cv_groups(gbm_data_obj = gdata, dist, params, cv_folds, FALSE, NULL) # Observations in the training set rows_in_training_set <- params$id %in% seq_len(params$num_train_rows) params$id <- params$id[rows_in_training_set] # Observations in cv_group - interested in first fold rows_in_fold <- params$id %in% seq_len(params$num_train_rows)[(cv_groups == 1)] # Extract relevent data - split into training and validation sets # Calculate new number of training rows params$num_train_rows <- length(which(cv_groups != 1)) params$num_train <- length(unique(params$id[!rows_in_fold])) # When try to split_and_join not GBMTrainParams class(params) <- "wrong" # Then error is thrown expect_error(split_and_join(gdata, params, rows_in_training_set, rows_in_fold)) }) test_that("split_and_join throws an error if rows_in_training is not an atomic of logicals", { # Given gbm_data, a distribution obj, rows_in_fold and rows_in_training ## test Gaussian distribution gbm model set.seed(1) # create some data N <- 1000 X1 <- runif(N) X2 <- 2*runif(N) X3 <- factor(sample(letters[1:4],N,replace=T)) X4 <- ordered(sample(letters[1:6],N,replace=T)) X5 <- factor(sample(letters[1:3],N,replace=T)) X6 <- 3*runif(N) mu <- c(-1,0,1,2)[as.numeric(X3)] SNR <- 10 # signal-to-noise ratio Y <- X1**1.5 + 2 * (X2**.5) + mu sigma <- sqrt(var(Y)/SNR) Y <- Y + rnorm(N,0,sigma) # create a bunch of missing values X1[sample(1:N,size=100)] <- NA X3[sample(1:N,size=300)] <- NA w <- rep(1,N) offset <- rep(0, N) data <- data.frame(X1=X1,X2=X2,X3=X3,X4=X4,X5=X5,X6=X6) # Set up for new API params <- training_params(num_trees=2000, interaction_depth=3, min_num_obs_in_node=10, shrinkage=0.005, bag_fraction=0.5, id=seq_len(nrow(data)), num_train=N/2, num_features=6) dist <- gbm_dist("Gaussian") gdata <- gbm_data(data, Y, w, offset) cv_folds <- 5 cv_groups <- create_cv_groups(gbm_data_obj = gdata, dist, params, cv_folds, FALSE, NULL) # Observations in the training set rows_in_training_set <- params$id %in% seq_len(params$num_train_rows) params$id <- params$id[rows_in_training_set] # Observations in cv_group - interested in first fold rows_in_fold <- params$id %in% seq_len(params$num_train_rows)[(cv_groups == 1)] # Extract relevent data - split into training and validation sets # Calculate new number of training rows params$num_train_rows <- length(which(cv_groups != 1)) params$num_train <- length(unique(params$id[!rows_in_fold])) # When try to split_and_join with rows_in_training_set not an atomic of logicals # Then error is thrown expect_error(split_and_join(gdata, params, rows_in_training_set=c(1, 2), rows_in_fold)) expect_error(split_and_join(gdata, params, rows_in_training_set="what", rows_in_fold)) expect_error(split_and_join(gdata, params, rows_in_training_set= NaN, rows_in_fold)) }) test_that("split_and_join throws an error if rows_in_fold is not an atomic of logicals of length num_train", { # Given gbm_data, a distribution obj, rows_in_fold and rows_in_training ## test Gaussian distribution gbm model set.seed(1) # create some data N <- 1000 X1 <- runif(N) X2 <- 2*runif(N) X3 <- factor(sample(letters[1:4],N,replace=T)) X4 <- ordered(sample(letters[1:6],N,replace=T)) X5 <- factor(sample(letters[1:3],N,replace=T)) X6 <- 3*runif(N) mu <- c(-1,0,1,2)[as.numeric(X3)] SNR <- 10 # signal-to-noise ratio Y <- X1**1.5 + 2 * (X2**.5) + mu sigma <- sqrt(var(Y)/SNR) Y <- Y + rnorm(N,0,sigma) # create a bunch of missing values X1[sample(1:N,size=100)] <- NA X3[sample(1:N,size=300)] <- NA w <- rep(1,N) offset <- rep(0, N) data <- data.frame(X1=X1,X2=X2,X3=X3,X4=X4,X5=X5,X6=X6) # Set up for new API params <- training_params(num_trees=2000, interaction_depth=3, min_num_obs_in_node=10, shrinkage=0.005, bag_fraction=0.5, id=seq_len(nrow(data)), num_train=N/2, num_features=6) dist <- gbm_dist("Gaussian") gdata <- gbm_data(data, Y, w, offset) cv_folds <- 5 cv_groups <- create_cv_groups(gbm_data_obj = gdata, dist, params, cv_folds, FALSE, NULL) # Observations in the training set rows_in_training_set <- params$id %in% seq_len(params$num_train_rows) params$id <- params$id[rows_in_training_set] # Observations in cv_group - interested in first fold rows_in_fold <- params$id %in% seq_len(params$num_train_rows)[(cv_groups == 1)] # Extract relevent data - split into training and validation sets # Calculate new number of training rows params$num_train_rows <- length(which(cv_groups != 1)) params$num_train <- length(unique(params$id[!rows_in_fold])) # When try to split_and_join with rows_in_fold not an atomic of logicals of length num_train # Then error is thrown expect_error(split_and_join(gdata, params, rows_in_training_set, rows_in_fold =sample(c(TRUE, FALSE), params$num_train+1, replace=TRUE))) expect_error(split_and_join(gdata, params, rows_in_training_set, rows_in_fold=c(1, 2))) expect_error(split_and_join(gdata, params, rows_in_training_set, rows_in_fold=NaN)) expect_error(split_and_join(gdata, params, rows_in_training_set, rows_in_fold="Where")) }) test_that("update_fold_dist_data throws an error if gbm_data_obj is not GBMData object", { # Given gbm_data, a distribution obj, rows_in_fold and rows_in_training ## test Gaussian distribution gbm model set.seed(1) # create some data N <- 1000 X1 <- runif(N) X2 <- 2*runif(N) X3 <- factor(sample(letters[1:4],N,replace=T)) X4 <- ordered(sample(letters[1:6],N,replace=T)) X5 <- factor(sample(letters[1:3],N,replace=T)) X6 <- 3*runif(N) mu <- c(-1,0,1,2)[as.numeric(X3)] SNR <- 10 # signal-to-noise ratio Y <- X1**1.5 + 2 * (X2**.5) + mu sigma <- sqrt(var(Y)/SNR) Y <- Y + rnorm(N,0,sigma) # create a bunch of missing values X1[sample(1:N,size=100)] <- NA X3[sample(1:N,size=300)] <- NA w <- rep(1,N) offset <- rep(0, N) data <- data.frame(X1=X1,X2=X2,X3=X3,X4=X4,X5=X5,X6=X6) # Set up for new API params <- training_params(num_trees=2000, interaction_depth=3, min_num_obs_in_node=10, shrinkage=0.005, bag_fraction=0.5, id=seq_len(nrow(data)), num_train=N/2, num_features=6) dist <- gbm_dist("Gaussian") gdata <- gbm_data(data, Y, w, offset) cv_folds <- 5 cv_groups <- create_cv_groups(gbm_data_obj = gdata, dist, params, cv_folds, FALSE, NULL) # Observations in the training set rows_in_training_set <- params$id %in% seq_len(params$num_train_rows) params$id <- params$id[rows_in_training_set] # Observations in cv_group - interested in first fold rows_in_fold <- params$id %in% seq_len(params$num_train_rows)[(cv_groups == 1)] # Extract relevent data - split into training and validation sets # Calculate new number of training rows params$num_train_rows <- length(which(cv_groups != 1)) params$num_train <- length(unique(params$id[!rows_in_fold])) # When try to update_fold_dist_data not GBMData class(gdata) <- "wrong" # Then error is thrown expect_error(update_fold_dist_data(dist, gdata, params, rows_in_training_set, rows_in_fold)) }) test_that("update_fold_dist_data throws an error if gbm_dist_obj is not GBMDist object", { # Given gbm_data, a distribution obj, rows_in_fold and rows_in_training ## test Gaussian distribution gbm model set.seed(1) # create some data N <- 1000 X1 <- runif(N) X2 <- 2*runif(N) X3 <- factor(sample(letters[1:4],N,replace=T)) X4 <- ordered(sample(letters[1:6],N,replace=T)) X5 <- factor(sample(letters[1:3],N,replace=T)) X6 <- 3*runif(N) mu <- c(-1,0,1,2)[as.numeric(X3)] SNR <- 10 # signal-to-noise ratio Y <- X1**1.5 + 2 * (X2**.5) + mu sigma <- sqrt(var(Y)/SNR) Y <- Y + rnorm(N,0,sigma) # create a bunch of missing values X1[sample(1:N,size=100)] <- NA X3[sample(1:N,size=300)] <- NA w <- rep(1,N) offset <- rep(0, N) data <- data.frame(X1=X1,X2=X2,X3=X3,X4=X4,X5=X5,X6=X6) # Set up for new API params <- training_params(num_trees=2000, interaction_depth=3, min_num_obs_in_node=10, shrinkage=0.005, bag_fraction=0.5, id=seq_len(nrow(data)), num_train=N/2, num_features=6) dist <- gbm_dist("Gaussian") gdata <- gbm_data(data, Y, w, offset) cv_folds <- 5 cv_groups <- create_cv_groups(gbm_data_obj = gdata, dist, params, cv_folds, FALSE, NULL) # Observations in the training set rows_in_training_set <- params$id %in% seq_len(params$num_train_rows) params$id <- params$id[rows_in_training_set] # Observations in cv_group - interested in first fold rows_in_fold <- params$id %in% seq_len(params$num_train_rows)[(cv_groups == 1)] # Extract relevent data - split into training and validation sets # Calculate new number of training rows params$num_train_rows <- length(which(cv_groups != 1)) params$num_train <- length(unique(params$id[!rows_in_fold])) # When try to update_fold_dist_data not GBMDist class(dist) <- "wrong" # Then error is thrown expect_error(update_fold_dist_data(dist, gdata, params, rows_in_training_set, rows_in_fold)) }) test_that("update_fold_dist_data throws an error if train_params is not GBMTrainParams object", { # Given gbm_data, a distribution obj, rows_in_fold and rows_in_training ## test Gaussian distribution gbm model set.seed(1) # create some data N <- 1000 X1 <- runif(N) X2 <- 2*runif(N) X3 <- factor(sample(letters[1:4],N,replace=T)) X4 <- ordered(sample(letters[1:6],N,replace=T)) X5 <- factor(sample(letters[1:3],N,replace=T)) X6 <- 3*runif(N) mu <- c(-1,0,1,2)[as.numeric(X3)] SNR <- 10 # signal-to-noise ratio Y <- X1**1.5 + 2 * (X2**.5) + mu sigma <- sqrt(var(Y)/SNR) Y <- Y + rnorm(N,0,sigma) # create a bunch of missing values X1[sample(1:N,size=100)] <- NA X3[sample(1:N,size=300)] <- NA w <- rep(1,N) offset <- rep(0, N) data <- data.frame(X1=X1,X2=X2,X3=X3,X4=X4,X5=X5,X6=X6) # Set up for new API params <- training_params(num_trees=2000, interaction_depth=3, min_num_obs_in_node=10, shrinkage=0.005, bag_fraction=0.5, id=seq_len(nrow(data)), num_train=N/2, num_features=6) dist <- gbm_dist("Gaussian") gdata <- gbm_data(data, Y, w, offset) cv_folds <- 5 cv_groups <- create_cv_groups(gbm_data_obj = gdata, dist, params, cv_folds, FALSE, NULL) # Observations in the training set rows_in_training_set <- params$id %in% seq_len(params$num_train_rows) params$id <- params$id[rows_in_training_set] # Observations in cv_group - interested in first fold rows_in_fold <- params$id %in% seq_len(params$num_train_rows)[(cv_groups == 1)] # Extract relevent data - split into training and validation sets # Calculate new number of training rows params$num_train_rows <- length(which(cv_groups != 1)) params$num_train <- length(unique(params$id[!rows_in_fold])) # When try to update_fold_dist_data not GBMTrainParams class(params) <- "wrong" # Then error is thrown expect_error(update_fold_dist_data(dist, gdata, params, rows_in_training_set, rows_in_fold)) }) context("Test splitting and joining functionality") test_that("split_and_join returns a GBMData object", { # Given gbm_data, a distribution obj, rows_in_fold and rows_in_training ## test Gaussian distribution gbm model set.seed(1) # create some data N <- 1000 X1 <- runif(N) X2 <- 2*runif(N) X3 <- factor(sample(letters[1:4],N,replace=T)) X4 <- ordered(sample(letters[1:6],N,replace=T)) X5 <- factor(sample(letters[1:3],N,replace=T)) X6 <- 3*runif(N) mu <- c(-1,0,1,2)[as.numeric(X3)] SNR <- 10 # signal-to-noise ratio Y <- X1**1.5 + 2 * (X2**.5) + mu sigma <- sqrt(var(Y)/SNR) Y <- Y + rnorm(N,0,sigma) # create a bunch of missing values X1[sample(1:N,size=100)] <- NA X3[sample(1:N,size=300)] <- NA w <- rep(1,N) offset <- rep(0, N) data <- data.frame(X1=X1,X2=X2,X3=X3,X4=X4,X5=X5,X6=X6) # Set up for new API params <- training_params(num_trees=2000, interaction_depth=3, min_num_obs_in_node=10, shrinkage=0.005, bag_fraction=0.5, id=seq_len(nrow(data)), num_train=N/2, num_features=6) dist <- gbm_dist("Gaussian") gdata <- gbm_data(data, Y, w, offset) cv_folds <- 5 cv_groups <- create_cv_groups(gbm_data_obj = gdata, dist, params, cv_folds, FALSE, NULL) # Observations in the training set rows_in_training_set <- params$id %in% seq_len(params$num_train_rows) params$id <- params$id[rows_in_training_set] # Observations in cv_group - interested in first fold rows_in_fold <- params$id %in% seq_len(params$num_train_rows)[(cv_groups == 1)] # Extract relevent data - split into training and validation sets # Calculate new number of training rows params$num_train_rows <- length(which(cv_groups != 1)) params$num_train <- length(unique(params$id[!rows_in_fold])) # When split_and_join called returned_obj <- split_and_join(gdata, params, rows_in_training_set, rows_in_fold) # Then returned object is a GBMData object expect_error(check_if_gbm_data(returned_obj), NA) }) test_that("split_and_join modifies gbm_data_obj so data in validation fold is at the 'end' of each field", { # Given gbm_data, a distribution obj, rows_in_fold and rows_in_training ## test Gaussian distribution gbm model set.seed(1) # create some data N <- 1000 X1 <- runif(N) X2 <- 2*runif(N) X3 <- factor(sample(letters[1:4],N,replace=T)) X4 <- ordered(sample(letters[1:6],N,replace=T)) X5 <- factor(sample(letters[1:3],N,replace=T)) X6 <- 3*runif(N) mu <- c(-1,0,1,2)[as.numeric(X3)] SNR <- 10 # signal-to-noise ratio Y <- X1**1.5 + 2 * (X2**.5) + mu sigma <- sqrt(var(Y)/SNR) Y <- Y + rnorm(N,0,sigma) # create a bunch of missing values X1[sample(1:N,size=100)] <- NA X3[sample(1:N,size=300)] <- NA w <- rep(1,N) offset <- rep(0, N) data <- data.frame(X1=X1,X2=X2,X3=X3,X4=X4,X5=X5,X6=X6) # Set up for new API params <- training_params(num_trees=2000, interaction_depth=3, min_num_obs_in_node=10, shrinkage=0.005, bag_fraction=0.5, id=seq_len(nrow(data)), num_train=N/2, num_features=6) dist <- gbm_dist("Gaussian") gdata <- gbm_data(data, Y, w, offset) cv_folds <- 5 cv_groups <- create_cv_groups(gbm_data_obj = gdata, dist, params, cv_folds, FALSE, NULL) # Observations in the training set rows_in_training_set <- params$id %in% seq_len(params$num_train_rows) params$id <- params$id[rows_in_training_set] # Observations in cv_group - interested in first fold rows_in_fold <- params$id %in% seq_len(params$num_train_rows)[(cv_groups == 1)] # Extract relevent data - split into training and validation sets # Calculate new number of training rows params$num_train_rows <- length(which(cv_groups != 1)) params$num_train <- length(unique(params$id[!rows_in_fold])) # When split_and_join called returned_obj <- split_and_join(gdata, params, rows_in_training_set, rows_in_fold) # Then returned object is a GBMData object with validation fold data at end of rows expect_equal(returned_obj$x[rows_in_training_set, ,drop=FALSE][(N/2 - length(which(cv_groups==1)) + 1):(N/2), ,drop=FALSE], data[rows_in_training_set, ,drop=FALSE][rows_in_fold, ,drop=FALSE]) expect_equal(returned_obj$y[rows_in_training_set, ,drop=FALSE][(N/2 - length(which(cv_groups==1)) + 1):(N/2), ,drop=FALSE]$V1, Y[rows_in_training_set][rows_in_fold]) expect_equal(returned_obj$weights[rows_in_training_set][(N/2 - length(which(cv_groups==1)) + 1):(N/2)], w[rows_in_training_set][rows_in_fold]) expect_equal(returned_obj$offset[rows_in_training_set][(N/2 - length(which(cv_groups==1)) + 1):(N/2)], offset[rows_in_training_set][rows_in_fold]) }) test_that("split_and_join updates x_order so as to only use data in training folds", { # Given gbm_data, a distribution obj, rows_in_fold and rows_in_training ## test Gaussian distribution gbm model set.seed(1) # create some data N <- 1000 X1 <- runif(N) X2 <- 2*runif(N) X3 <- factor(sample(letters[1:4],N,replace=T)) X4 <- ordered(sample(letters[1:6],N,replace=T)) X5 <- factor(sample(letters[1:3],N,replace=T)) X6 <- 3*runif(N) mu <- c(-1,0,1,2)[as.numeric(X3)] SNR <- 10 # signal-to-noise ratio Y <- X1**1.5 + 2 * (X2**.5) + mu sigma <- sqrt(var(Y)/SNR) Y <- Y + rnorm(N,0,sigma) # create a bunch of missing values X1[sample(1:N,size=100)] <- NA X3[sample(1:N,size=300)] <- NA w <- rep(1,N) offset <- rep(0, N) data <- data.frame(X1=X1,X2=X2,X3=X3,X4=X4,X5=X5,X6=X6) # Set up for new API params <- training_params(num_trees=2000, interaction_depth=3, min_num_obs_in_node=10, shrinkage=0.005, bag_fraction=0.5, id=seq_len(nrow(data)), num_train=N/2, num_features=6) dist <- gbm_dist("Gaussian") gdata <- gbm_data(data, Y, w, offset) cv_folds <- 5 cv_groups <- create_cv_groups(gbm_data_obj = gdata, dist, params, cv_folds, FALSE, NULL) # Observations in the training set rows_in_training_set <- params$id %in% seq_len(params$num_train_rows) params$id <- params$id[rows_in_training_set] # Observations in cv_group - interested in first fold rows_in_fold <- params$id %in% seq_len(params$num_train_rows)[(cv_groups == 1)] # Extract relevent data - split into training and validation sets # Calculate new number of training rows params$num_train_rows <- length(which(cv_groups != 1)) params$num_train <- length(unique(params$id[!rows_in_fold])) # When split_and_join called returned_obj <- split_and_join(gdata, params, rows_in_training_set, rows_in_fold) # Get gdata training folds data gdata_train <- gdata gdata_train$x <- as.data.frame(gdata_train$x[rows_in_training_set, ,drop=FALSE][!rows_in_fold, ,drop=FALSE]) gdata_train$y <- as.data.frame(as.matrix(gdata_train$y)[rows_in_training_set, ,drop=FALSE][!rows_in_fold, ,drop=FALSE]) gdata_train$offset <- gdata_train$offset[rows_in_training_set][!rows_in_fold] gdata_train$weights <- gdata_train$weights[rows_in_training_set][!rows_in_fold] x_order_with_training_folds <- predictor_order(gdata_train, params)$x_order # Then returned object is a GBMData object with x_order updated so as to only expect_equal(as.matrix(returned_obj$x_order), as.matrix(x_order_with_training_folds)) }) test_that("update_fold_dist_data returns the original distribution object if NOT Pairwise or CoxPH", { # Given gbm_data, a distribution obj, rows_in_fold and rows_in_training ## test Gaussian distribution gbm model set.seed(1) # create some data N <- 1000 X1 <- runif(N) X2 <- 2*runif(N) X3 <- factor(sample(letters[1:4],N,replace=T)) X4 <- ordered(sample(letters[1:6],N,replace=T)) X5 <- factor(sample(letters[1:3],N,replace=T)) X6 <- 3*runif(N) mu <- c(-1,0,1,2)[as.numeric(X3)] SNR <- 10 # signal-to-noise ratio Y <- X1**1.5 + 2 * (X2**.5) + mu sigma <- sqrt(var(Y)/SNR) Y <- Y + rnorm(N,0,sigma) # create a bunch of missing values X1[sample(1:N,size=100)] <- NA X3[sample(1:N,size=300)] <- NA w <- rep(1,N) offset <- rep(0, N) data <- data.frame(X1=X1,X2=X2,X3=X3,X4=X4,X5=X5,X6=X6) # Set up for new API params <- training_params(num_trees=2000, interaction_depth=3, min_num_obs_in_node=10, shrinkage=0.005, bag_fraction=0.5, id=seq_len(nrow(data)), num_train=N/2, num_features=6) dist <- gbm_dist("Gaussian") gdata <- gbm_data(data, Y, w, offset) cv_folds <- 5 cv_groups <- create_cv_groups(gbm_data_obj = gdata, dist, params, cv_folds, FALSE, NULL) # Observations in the training set rows_in_training_set <- params$id %in% seq_len(params$num_train_rows) params$id <- params$id[rows_in_training_set] # Observations in cv_group - interested in first fold rows_in_fold <- params$id %in% seq_len(params$num_train_rows)[(cv_groups == 1)] # Extract relevent data - split into training and validation sets # Calculate new number of training rows params$num_train_rows <- length(which(cv_groups != 1)) params$num_train <- length(unique(params$id[!rows_in_fold])) # When try to update_fold_dist_data with distributions which are not CoxPH or Pairwise # Not assuming no stratification in cv_groups or fold_id so same for all dists # Then nothing changes for those distributions expect_equal(update_fold_dist_data(gbm_dist("AdaBoost"), gdata, params, rows_in_training_set, rows_in_fold), gbm_dist("AdaBoost")) expect_equal(update_fold_dist_data(gbm_dist("Bernoulli"), gdata, params, rows_in_training_set, rows_in_fold), gbm_dist("Bernoulli")) expect_equal(update_fold_dist_data(gbm_dist("Gamma"), gdata, params, rows_in_training_set, rows_in_fold), gbm_dist("Gamma")) expect_equal(update_fold_dist_data(gbm_dist("Gaussian"), gdata, params, rows_in_training_set, rows_in_fold), gbm_dist("Gaussian")) expect_equal(update_fold_dist_data(gbm_dist("Huberized"), gdata, params, rows_in_training_set, rows_in_fold), gbm_dist("Huberized")) expect_equal(update_fold_dist_data(gbm_dist("Laplace"), gdata, params, rows_in_training_set, rows_in_fold), gbm_dist("Laplace")) expect_equal(update_fold_dist_data(gbm_dist("Poisson"), gdata, params, rows_in_training_set, rows_in_fold), gbm_dist("Poisson")) expect_equal(update_fold_dist_data(gbm_dist("Quantile"), gdata, params, rows_in_training_set, rows_in_fold), gbm_dist("Quantile")) expect_equal(update_fold_dist_data(gbm_dist("TDist"), gdata, params, rows_in_training_set, rows_in_fold), gbm_dist("TDist")) expect_equal(update_fold_dist_data(gbm_dist("Tweedie"), gdata, params, rows_in_training_set, rows_in_fold), gbm_dist("Tweedie")) }) test_that("update_fold_dist_data updates the dist objects strata and sorted fields if CoxPH", { # Given gbm_data, a distribution obj (CoxPH!!), rows_in_fold and rows_in_training # Require Surv to be available require(survival) # create some data set.seed(1) N <- 3000 X1 <- runif(N) X2 <- runif(N) X3 <- factor(sample(letters[1:4],N,replace=T)) mu <- c(-1,0,1,2)[as.numeric(X3)] f <- 0.5*sin(3*X1 + 5*X2^2 + mu/10) tt.surv <- rexp(N,exp(f)) tt.cens <- rexp(N,0.5) delta <- as.numeric(tt.surv <= tt.cens) tt <- apply(cbind(tt.surv,tt.cens),1,min) # throw in some missing values X1[sample(1:N,size=100)] <- NA X3[sample(1:N,size=300)] <- NA # random weights if you want to experiment with them w <- rep(1,N) data <- data.frame(tt=tt,delta=delta,X1=X1,X2=X2,X3=X3) # Put into new API params <- training_params(num_trees=3000, interaction_depth=3, min_num_obs_in_node=10, shrinkage=0.001, bag_fraction=0.5, id=seq_len(nrow(data)), num_train=N/2, num_features=3) # Set up for new API dist <- gbm_dist("CoxPH") gdata <- gbm_data(data, Surv(tt, delta), w, offset=rep(0, N)) cv_folds <- 5 cv_groups <- create_cv_groups(gbm_data_obj = gdata, dist, params, cv_folds, FALSE, NULL) # Observations in the training set rows_in_training_set <- params$id %in% seq_len(params$num_train_rows) params$id <- params$id[rows_in_training_set] # Observations in cv_group - interested in first fold rows_in_fold <- params$id %in% seq_len(params$num_train_rows)[(cv_groups == 1)] # Extract relevent data - split into training and validation sets # Calculate new number of training rows params$num_train_rows <- length(which(cv_groups != 1)) params$num_train <- length(unique(params$id[!rows_in_fold])) # When try to update_fold_dist_data with distributions dist_update <- update_fold_dist_data(dist, gdata, params, rows_in_training_set, rows_in_fold) # Then strata and sorted are updated expect_true(length(dist_update$strata) != length(dist$strata)) expect_false(all(is.na(dist_update$sorted))) }) test_that("update_fold_dist_data updates the dist objects group if Pairwise", { # Given gbm_data, a distribution obj (Pairwise!! + groups), rows_in_fold and rows_in_training # create some data - DOESN'T NEED TO REFLECT DIST set.seed(1) N <- 1000 X1 <- runif(N) X2 <- 2*runif(N) X3 <- factor(sample(letters[1:4],N,replace=T)) X4 <- ordered(sample(letters[1:6],N,replace=T)) X5 <- factor(sample(letters[1:3],N,replace=T)) X6 <- 3*runif(N) mu <- c(-1,0,1,2)[as.numeric(X3)] SNR <- 10 # signal-to-noise ratio Y <- X1**1.5 + 2 * (X2**.5) + mu sigma <- sqrt(var(Y)/SNR) Y <- Y + rnorm(N,0,sigma) # create a bunch of missing values X1[sample(1:N,size=100)] <- NA X3[sample(1:N,size=300)] <- NA w <- rep(1,N) offset <- rep(0, N) X <- data.frame(X1=X1,X2=X2,X3=X3,X4=X4,X5=X5,X6=X6) data <- gbm_data(X, Y, w, offset) # Put into new API params <- training_params(num_trees=3000, interaction_depth=3, min_num_obs_in_node=10, shrinkage=0.001, bag_fraction=0.5, id=seq_len(nrow(X)), num_train=N/2, num_features=3) # Set up for new API dist <- gbm_dist("Pairwise") dist$group <- as.factor(sample(seq_len(5), N, replace=TRUE)) cv_folds <- 5 cv_groups <- create_cv_groups(gbm_data_obj = data, dist, params, cv_folds, FALSE, NULL) # DEFINE GROUPINGS # Observations in the training set rows_in_training_set <- params$id %in% seq_len(params$num_train_rows) params$id <- params$id[rows_in_training_set] # Observations in cv_group - interested in first fold rows_in_fold <- params$id %in% seq_len(params$num_train_rows)[(cv_groups == 1)] # Extract relevent data - split into training and validation sets # Calculate new number of training rows params$num_train_rows <- length(which(cv_groups != 1)) params$num_train <- length(unique(params$id[!rows_in_fold])) # When try to update_fold_dist_data with distributions dist_update <- update_fold_dist_data(dist, data, params, rows_in_training_set, rows_in_fold) # Then groupings are now updated expect_equal(dist_update$group, c(dist$group[rows_in_training_set][!rows_in_fold], dist$group[rows_in_training_set][rows_in_fold])) }) test_that("extract_obs_in_fold updates the training parameters so as to take into account the number of observations in the validation fold", { # Given gbm_data, a distribution obj (CoxPH!!), rows_in_fold and rows_in_training # Require Surv to be available require(survival) # create some data set.seed(1) N <- 3000 X1 <- runif(N) X2 <- runif(N) X3 <- factor(sample(letters[1:4],N,replace=T)) mu <- c(-1,0,1,2)[as.numeric(X3)] f <- 0.5*sin(3*X1 + 5*X2^2 + mu/10) tt.surv <- rexp(N,exp(f)) tt.cens <- rexp(N,0.5) delta <- as.numeric(tt.surv <= tt.cens) tt <- apply(cbind(tt.surv,tt.cens),1,min) # throw in some missing values X1[sample(1:N,size=100)] <- NA X3[sample(1:N,size=300)] <- NA # random weights if you want to experiment with them w <- rep(1,N) data <- data.frame(tt=tt,delta=delta,X1=X1,X2=X2,X3=X3) # Put into new API dist <- gbm_dist("CoxPH", prior_node_coeff_var=10) params <- training_params(num_trees=3000, interaction_depth=3, min_num_obs_in_node=10, shrinkage=0.001, bag_fraction=0.5, id=seq(nrow(data)), num_train=N/2, num_features=3) # Set up for new API dist <- gbm_dist("CoxPH") gdata <- gbm_data(data, Surv(tt, delta), w, offset=rep(0, N)) cv_folds <- 5 cv_groups <- create_cv_groups(gbm_data_obj = gdata, dist, params, cv_folds, FALSE, NULL) # When extract_obs_in_fold is called returned_obj <- extract_obs_in_fold(gdata, dist, params, cv_groups, fold_num=1) # Then updates the num of training observations in train_params expect_equal(returned_obj$params$num_train, params$num_train * (cv_folds-1)/cv_folds) }) test_that("extract_obs_in_fold returns a list of the updated gbm_data_obj, dist obj and training params", { # Given gbm_data, a distribution obj (CoxPH!!), rows_in_fold and rows_in_training # Require Surv to be available require(survival) # create some data set.seed(1) N <- 3000 X1 <- runif(N) X2 <- runif(N) X3 <- factor(sample(letters[1:4],N,replace=T)) mu <- c(-1,0,1,2)[as.numeric(X3)] f <- 0.5*sin(3*X1 + 5*X2^2 + mu/10) tt.surv <- rexp(N,exp(f)) tt.cens <- rexp(N,0.5) delta <- as.numeric(tt.surv <= tt.cens) tt <- apply(cbind(tt.surv,tt.cens),1,min) # throw in some missing values X1[sample(1:N,size=100)] <- NA X3[sample(1:N,size=300)] <- NA # random weights if you want to experiment with them w <- rep(1,N) data <- data.frame(X1=X1,X2=X2,X3=X3) # Put into new API params <- training_params(num_trees=3000, interaction_depth=3, min_num_obs_in_node=10, shrinkage=0.001, bag_fraction=0.5, id=seq(nrow(data)), num_train=N/2, num_features=3) # Set up for new API dist <- gbm_dist("CoxPH") gdata <- gbm_data(data, Surv(tt, delta), w, rep(0, N)) cv_folds <- 5 cv_groups <- create_cv_groups(gdata, dist, params, cv_folds, FALSE, NULL) # When extract_obs_in_fold is called returned_obj <- extract_obs_in_fold(gdata, dist, params, cv_groups, fold_num=1) # Then returns a list with correct fields expect_true(is.list(returned_obj)) expect_equal(names(returned_obj), c("data", "dist", "params")) expect_error(check_if_gbm_data(returned_obj$data), NA) expect_error(check_if_gbm_dist(returned_obj$dist), NA) expect_error(check_if_gbm_train_params(returned_obj$params), NA) })