#################### # Author: James Hickey # # Series of test to check the data ordering functionality # #################### context("Test data ordering functionality") test_that("Reordering method does nothing to fit unless CoxPH or Pairwise selected", { # Given mock gbm_fit N <- 1500 gbm_fit <- structure(list("fit"=1:N), class="GBMFit") # When ordering according to any distribution other than CoxPH or Pairwise dist <- gbm_dist("AdaBoost") dist_2 <- gbm_dist("Bernoulli") dist_3 <- gbm_dist("Gamma") dist_4 <- gbm_dist("Gaussian") dist_5 <- gbm_dist("Huberized") dist_6 <- gbm_dist("Laplace") dist_7 <- gbm_dist("Poisson") dist_8 <- gbm_dist("Quantile") dist_9 <- gbm_dist("TDist") dist_10 <- gbm_dist("Tweedie") # Then fit order does not change expect_equal(reorder_fit(gbm_fit, dist), gbm_fit) expect_equal(reorder_fit(gbm_fit, dist_2), gbm_fit) expect_equal(reorder_fit(gbm_fit, dist_3), gbm_fit) expect_equal(reorder_fit(gbm_fit, dist_4), gbm_fit) expect_equal(reorder_fit(gbm_fit, dist_5), gbm_fit) expect_equal(reorder_fit(gbm_fit, dist_6), gbm_fit) expect_equal(reorder_fit(gbm_fit, dist_7), gbm_fit) expect_equal(reorder_fit(gbm_fit, dist_8), gbm_fit) expect_equal(reorder_fit(gbm_fit, dist_9), gbm_fit) expect_equal(reorder_fit(gbm_fit, dist_10), gbm_fit) }) test_that("CoxPH- reorders fit according to time and strata", { # Require Surv to be available require(survival) # create some data set.seed(1) N <- 3000 X1 <- runif(N) X2 <- runif(N) X3 <- factor(sample(letters[1:4],N,replace=T)) mu <- c(-1,0,1,2)[as.numeric(X3)] f <- 0.5*sin(3*X1 + 5*X2^2 + mu/10) tt.surv <- rexp(N,exp(f)) tt.cens <- rexp(N,0.5) delta <- as.numeric(tt.surv <= tt.cens) tt <- apply(cbind(tt.surv,tt.cens),1,min) # throw in some missing values X1[sample(1:N,size=100)] <- NA X3[sample(1:N,size=300)] <- NA # random weights if you want to experiment with them w <- rep(1,N) Resp <- Surv(tt, delta) data <- gbm_data(data.frame(X1, X2, X3), Resp, w, offset=rep(0, N)) train_p <- training_params(id=c(rep(1, N/5), rep(2, N/5), rep(3, N/5), rep(4, N/5), rep(5, N/5)), num_train = 5, num_features = 3, bag_fraction=1, min_num_obs_in_node=1) dist <- gbm_dist("CoxPH") # Reorder Fit dist <- create_strata(data, train_p, dist) gbm_fit <- structure(list("fit"=1:N), class="GBMFit") gbm_fit_ordered <- reorder_fit(gbm_fit, dist) # Expect order according to time_order gbm_fit$fit[dist$time_order] <- gbm_fit$fit expect_equal(gbm_fit_ordered$fit, gbm_fit$fit) }) test_that("Pairwise - reorders fit according to group order", { skip("Skipping pairwise") # Given Pairwise data and fit set.seed(1) N <- 1000 X1 <- runif(N) X2 <- 2*runif(N) X3 <- factor(sample(letters[1:4],N,replace=T)) X4 <- ordered(sample(letters[1:6],N,replace=T)) X5 <- factor(sample(letters[1:3],N,replace=T)) X6 <- 3*runif(N) mu <- c(-1,0,1,2)[as.numeric(X3)] SNR <- 10 # signal-to-noise ratio Y <- X1**1.5 + 2 * (X2**.5) + mu sigma <- sqrt(var(Y)/SNR) Y <- Y + rnorm(N,0,sigma) # create a bunch of missing values X1[sample(1:N,size=100)] <- NA X3[sample(1:N,size=300)] <- NA w <- rep(1,N) offset <- rep(0, N) X <- data.frame(X1=X1,X2=X2,X3=X3,X4=X4,X5=X5,X6=X6) data <- gbm_data(X, Y, w, offset) # Put into new API params <- training_params(num_trees=3000, interaction_depth=3, min_num_obs_in_node=10, shrinkage=0.001, bag_fraction=0.5, id=seq_len(nrow(X)), num_train=N/2, num_features=3) # Set up for new API dist <- gbm_dist("Pairwise") dist$group <- as.factor(sample(seq_len(5), N, replace=TRUE)) dist$group_order <-order(dist$group, -Y) fit <- structure(list("fit"=1:N), class="GBMFit") # When fit is reordered reordered_fit <- reorder_fit(fit, dist) # Then correctly reorders fit expect_equal(reordered_fit$fit, fit$fit[order(dist$group_order)]) }) test_that("Order data requires a gbm_data object and distribution object", { # Given gbm_data and dist # Require Surv to be available require(survival) # create some data set.seed(1) N <- 3000 X1 <- runif(N) X2 <- runif(N) X3 <- factor(sample(letters[1:4],N,replace=T)) mu <- c(-1,0,1,2)[as.numeric(X3)] f <- 0.5*sin(3*X1 + 5*X2^2 + mu/10) tt.surv <- rexp(N,exp(f)) tt.cens <- rexp(N,0.5) delta <- as.numeric(tt.surv <= tt.cens) tt <- apply(cbind(tt.surv,tt.cens),1,min) # throw in some missing values X1[sample(1:N,size=100)] <- NA X3[sample(1:N,size=300)] <- NA # random weights if you want to experiment with them w <- rep(1,N) offset <- rep(0, N/2) Resp <- Surv(tt, delta) data <- gbm_data(data.frame(X1, X2, X3), Resp, w, offset) train_p <- training_params(id=rep(1, N), num_train = N, num_features = 3) dist <- gbm_dist("CoxPH") # When classes is removed data_2 <- data dist_2 <- dist attr(data, "class") <- "NOTData" attr(dist, "class") <- "NOTDist" # Then error thrown when trying to order data expect_error(order_data(data, train_p, dist_2)) expect_error(order_data(data_2, train_p, dist)) }) test_that("Can order data by id", { # Given GBM data and training parameters require(survival) # create some data set.seed(1) N <- 3000 X1 <- runif(N) X2 <- runif(N) X3 <- factor(sample(letters[1:4],N,replace=T)) mu <- c(-1,0,1,2)[as.numeric(X3)] f <- 0.5*sin(3*X1 + 5*X2^2 + mu/10) tt.surv <- rexp(N,exp(f)) tt.cens <- rexp(N,0.5) delta <- as.numeric(tt.surv <= tt.cens) tt <- apply(cbind(tt.surv,tt.cens),1,min) # throw in some missing values X1[sample(1:N,size=100)] <- NA X3[sample(1:N,size=300)] <- NA # random weights if you want to experiment with them w <- rep(1,N) offset <- rep(0, N/2) Resp <- Surv(tt, delta) data <- gbm_data(data.frame(X1, X2, X3), Resp, w, offset) train_p <- training_params(id=sample(1:5, N, replace=TRUE), num_train = 5, bag_fraction=1, min_num_obs_in_node = 1, num_features = 3) # When ordered by id data_copy <- data data <- order_by_id(data, train_p) # Then data is ordered by id expect_equal(data$x, data_copy$x[order(train_p$id), , drop=FALSE]) expect_equal(data$y, data_copy$y[order(train_p$id)]) }) test_that("Can order data by groupings", { # Given pairwise data and dist set.seed(1) N <- 1000 X1 <- runif(N) X2 <- 2*runif(N) X3 <- factor(sample(letters[1:4],N,replace=T)) X4 <- ordered(sample(letters[1:6],N,replace=T)) X5 <- factor(sample(letters[1:3],N,replace=T)) X6 <- 3*runif(N) mu <- c(-1,0,1,2)[as.numeric(X3)] SNR <- 10 # signal-to-noise ratio Y <- X1**1.5 + 2 * (X2**.5) + mu sigma <- sqrt(var(Y)/SNR) Y <- Y + rnorm(N,0,sigma) # create a bunch of missing values X1[sample(1:N,size=100)] <- NA X3[sample(1:N,size=300)] <- NA w <- rep(1,N) offset <- rep(0, N) X <- data.frame(X1=X1,X2=X2,X3=X3,X4=X4,X5=X5,X6=X6) data <- gbm_data(X, Y, w, offset) # Put into new API params <- training_params(num_trees=3000, interaction_depth=3, min_num_obs_in_node=10, shrinkage=0.001, bag_fraction=0.5, id=seq_len(nrow(X)), num_train=N/2, num_features=3) # Set up for new API dist <- gbm_dist("Pairwise") dist$group <- as.factor(sample(seq_len(5), N, replace=TRUE)) dist$group_order <-order(dist$group, -Y) # When order_by_groupings is called data_ordered <- order_by_groupings(data, dist) # Then correctly orders by groupings expect_equal(data_ordered$x, data$x[dist$group_order,,drop=FALSE]) expect_equal(data_ordered$y, data$y[dist$group_order]) expect_equal(data_ordered$weights, data$weights[dist$group_order]) }) test_that("Ordering by groupings does nothing if not Pairwise", { # Given data and not pairwise (using CoxPH data for example) require(survival) # create some data set.seed(1) N <- 3000 X1 <- runif(N) X2 <- runif(N) X3 <- factor(sample(letters[1:4],N,replace=T)) mu <- c(-1,0,1,2)[as.numeric(X3)] f <- 0.5*sin(3*X1 + 5*X2^2 + mu/10) tt.surv <- rexp(N,exp(f)) tt.cens <- rexp(N,0.5) delta <- as.numeric(tt.surv <= tt.cens) tt <- apply(cbind(tt.surv,tt.cens),1,min) # throw in some missing values X1[sample(1:N,size=100)] <- NA X3[sample(1:N,size=300)] <- NA # random weights if you want to experiment with them w <- rep(1,N) offset <- rep(0, N/2) Resp <- Surv(tt, delta) data <- gbm_data(data.frame(X1, X2, X3), Resp, w, offset) train_p <- training_params(id=sample(1:5, N, replace=TRUE), num_train = 5, bag_fraction=1, min_num_obs_in_node = 1, num_features = 3) # When ordered by grouping - NOT Pairwise dist # Then nothing changes expect_equal(data, order_by_groupings(data, gbm_dist("AdaBoost"))) expect_equal(data, order_by_groupings(data, gbm_dist("Bernoulli"))) expect_equal(data, order_by_groupings(data, gbm_dist("Gamma"))) expect_equal(data, order_by_groupings(data, gbm_dist("Gaussian"))) expect_equal(data, order_by_groupings(data, gbm_dist("Huberized"))) expect_equal(data, order_by_groupings(data, gbm_dist("Laplace"))) expect_equal(data, order_by_groupings(data, gbm_dist("Poisson"))) expect_equal(data, order_by_groupings(data, gbm_dist("Quantile"))) expect_equal(data, order_by_groupings(data, gbm_dist("TDist"))) expect_equal(data, order_by_groupings(data, gbm_dist("Tweedie"))) }) test_that("Can order by predictors - check predictor_order", { # Given gbm data and training parameters require(survival) # create some data set.seed(1) N <- 3000 X1 <- runif(N) X2 <- runif(N) X3 <- factor(sample(letters[1:4],N,replace=T)) mu <- c(-1,0,1,2)[as.numeric(X3)] f <- 0.5*sin(3*X1 + 5*X2^2 + mu/10) tt.surv <- rexp(N,exp(f)) tt.cens <- rexp(N,0.5) delta <- as.numeric(tt.surv <= tt.cens) tt <- apply(cbind(tt.surv,tt.cens),1,min) # throw in some missing values X1[sample(1:N,size=100)] <- NA X3[sample(1:N,size=300)] <- NA # random weights if you want to experiment with them w <- rep(1,N) offset <- rep(0, N/2) Resp <- Surv(tt, delta) data <- gbm_data(data.frame(X1, X2, X3), Resp, w, offset) train_p <- training_params(id=sample(1:5, N, replace=TRUE), num_train = 5, bag_fraction = 1, min_num_obs_in_node = 1, num_features = 3) # Then can order by predictors data <- predictor_order(data, train_p) expect_equal(data$x_order, apply(data$x[seq_len(train_p$num_train_rows),,drop=FALSE], 2, order,na.last=FALSE)-1) })