test_that("Prediction from trees with constant leaf", {
  # Create dataset and forest container
  num_trees <- 10
  # fmt: skip
  X = matrix(c(1.5, 8.7, 1.2, 
                 2.7, 3.4, 5.4, 
                 3.6, 1.2, 9.3, 
                 4.4, 5.4, 10.4, 
                 5.3, 9.3, 3.6, 
                 6.1, 10.4, 4.4), 
               byrow = TRUE, nrow = 6)
  n <- nrow(X)
  p <- ncol(X)
  forest_dataset = createForestDataset(X)
  forest_samples <- createForestSamples(num_trees, 1, TRUE)

  # Initialize a forest with constant root predictions
  forest_samples$add_forest_with_constant_leaves(0.)

  # Check that regular and "raw" predictions are the same (since the leaf is constant)
  pred <- forest_samples$predict(forest_dataset)
  pred_raw <- forest_samples$predict_raw(forest_dataset)

  # Assertion
  expect_equal(pred, pred_raw)

  # Split the root of the first tree in the ensemble at X[,1] > 4.0
  forest_samples$add_numeric_split_tree(0, 0, 0, 0, 4.0, -5., 5.)

  # Check that regular and "raw" predictions are the same (since the leaf is constant)
  pred <- forest_samples$predict(forest_dataset)
  pred_raw <- forest_samples$predict_raw(forest_dataset)

  # Assertion
  expect_equal(pred, pred_raw)

  # Split the left leaf of the first tree in the ensemble at X[,2] > 4.0
  forest_samples$add_numeric_split_tree(0, 0, 1, 1, 4.0, -7.5, -2.5)

  # Check that regular and "raw" predictions are the same (since the leaf is constant)
  pred <- forest_samples$predict(forest_dataset)
  pred_raw <- forest_samples$predict_raw(forest_dataset)

  # Assertion
  expect_equal(pred, pred_raw)

  # Check the split count for the first tree in the ensemble
  split_counts <- forest_samples$get_tree_split_counts(0, 0, p)
  split_counts_expected <- c(1, 1, 0)

  # Assertion
  expect_equal(split_counts, split_counts_expected)
})

test_that("Prediction from trees with univariate leaf basis", {
  # Create dataset and forest container
  num_trees <- 10
  # fmt: skip
  X = matrix(c(1.5, 8.7, 1.2, 
                 2.7, 3.4, 5.4, 
                 3.6, 1.2, 9.3, 
                 4.4, 5.4, 10.4, 
                 5.3, 9.3, 3.6, 
                 6.1, 10.4, 4.4), 
               byrow = TRUE, nrow = 6)
  W = as.matrix(c(-1, -1, -1, 1, 1, 1))
  n <- nrow(X)
  p <- ncol(X)
  forest_dataset = createForestDataset(X, W)
  forest_samples <- createForestSamples(num_trees, 1, FALSE)

  # Initialize a forest with constant root predictions
  forest_samples$add_forest_with_constant_leaves(0.)

  # Check that regular and "raw" predictions are the same (since the leaf is constant)
  pred <- forest_samples$predict(forest_dataset)
  pred_raw <- forest_samples$predict_raw(forest_dataset)

  # Assertion
  expect_equal(pred, pred_raw)

  # Split the root of the first tree in the ensemble at X[,1] > 4.0
  forest_samples$add_numeric_split_tree(0, 0, 0, 0, 4.0, -5., 5.)

  # Check that regular and "raw" predictions are the same (since the leaf is constant)
  pred <- forest_samples$predict(forest_dataset)
  pred_raw <- forest_samples$predict_raw(forest_dataset)
  pred_manual <- pred_raw * W

  # Assertion
  expect_equal(pred, pred_manual)

  # Split the left leaf of the first tree in the ensemble at X[,2] > 4.0
  forest_samples$add_numeric_split_tree(0, 0, 1, 1, 4.0, -7.5, -2.5)

  # Check that regular and "raw" predictions are the same (since the leaf is constant)
  pred <- forest_samples$predict(forest_dataset)
  pred_raw <- forest_samples$predict_raw(forest_dataset)
  pred_manual <- pred_raw * W

  # Assertion
  expect_equal(pred, pred_manual)

  # Check the split count for the first tree in the ensemble
  split_counts <- forest_samples$get_tree_split_counts(0, 0, p)
  split_counts_expected <- c(1, 1, 0)

  # Assertion
  expect_equal(split_counts, split_counts_expected)
})

test_that("Prediction from trees with multivariate leaf basis", {
  # Create dataset and forest container
  num_trees <- 10
  output_dim <- 2
  num_samples <- 0
  # fmt: skip
  X = matrix(c(1.5, 8.7, 1.2, 
                 2.7, 3.4, 5.4, 
                 3.6, 1.2, 9.3, 
                 4.4, 5.4, 10.4, 
                 5.3, 9.3, 3.6, 
                 6.1, 10.4, 4.4), 
               byrow = TRUE, nrow = 6)
  n <- nrow(X)
  p <- ncol(X)
  W = matrix(c(1, 1, 1, 1, 1, 1, -1, -1, -1, 1, 1, 1), byrow = FALSE, nrow = 6)
  forest_dataset = createForestDataset(X, W)
  forest_samples <- createForestSamples(num_trees, output_dim, FALSE)

  # Initialize a forest with constant root predictions
  forest_samples$add_forest_with_constant_leaves(c(1., 1.))
  num_samples <- num_samples + 1

  # Check that regular and "raw" predictions are the same (since the leaf is constant)
  pred <- forest_samples$predict(forest_dataset)
  pred_raw <- forest_samples$predict_raw(forest_dataset)
  pred_intermediate <- as.numeric(pred_raw) * as.numeric(W)
  dim(pred_intermediate) <- c(n, output_dim, num_samples)
  pred_manual <- apply(pred_intermediate, 3, function(x) rowSums(x))

  # Assertion
  expect_equal(pred, pred_manual)

  # Split the root of the first tree in the ensemble at X[,1] > 4.0
  forest_samples$add_numeric_split_tree(0, 0, 0, 0, 4.0, c(-5., -1.), c(5., 1.))

  # Check that regular and "raw" predictions are the same (since the leaf is constant)
  pred <- forest_samples$predict(forest_dataset)
  pred_raw <- forest_samples$predict_raw(forest_dataset)
  pred_intermediate <- as.numeric(pred_raw) * as.numeric(W)
  dim(pred_intermediate) <- c(n, output_dim, num_samples)
  pred_manual <- apply(pred_intermediate, 3, function(x) rowSums(x))

  # Assertion
  expect_equal(pred, pred_manual)

  # Split the left leaf of the first tree in the ensemble at X[,2] > 4.0
  forest_samples$add_numeric_split_tree(
    0,
    0,
    1,
    1,
    4.0,
    c(-7.5, 2.5),
    c(-2.5, 7.5)
  )

  # Check that regular and "raw" predictions are the same (since the leaf is constant)
  pred <- forest_samples$predict(forest_dataset)
  pred_raw <- forest_samples$predict_raw(forest_dataset)
  pred_intermediate <- as.numeric(pred_raw) * as.numeric(W)
  dim(pred_intermediate) <- c(n, output_dim, num_samples)
  pred_manual <- apply(pred_intermediate, 3, function(x) rowSums(x))

  # Assertion
  expect_equal(pred, pred_manual)

  # Check the split count for the first tree in the ensemble
  split_counts <- forest_samples$get_tree_split_counts(0, 0, 3)
  split_counts_expected <- c(1, 1, 0)

  # Assertion
  expect_equal(split_counts, split_counts_expected)
})

test_that("BART predictions with pre-summarization", {
  # Generate data and test-train split
  n <- 100
  p <- 5
  X <- matrix(runif(n * p), ncol = p)
  # fmt: skip
  f_XW <- (((0 <= X[, 1]) & (0.25 > X[, 1])) * (-7.5) +
                 ((0.25 <= X[, 1]) & (0.5 > X[, 1])) * (-2.5) +
                 ((0.5 <= X[, 1]) & (0.75 > X[, 1])) * (2.5) +
                 ((0.75 <= X[, 1]) & (1 > X[, 1])) * (7.5))
  noise_sd <- 1
  y <- f_XW + rnorm(n, 0, noise_sd)
  test_set_pct <- 0.2
  n_test <- round(test_set_pct * n)
  n_train <- n - n_test
  test_inds <- sort(sample(1:n, n_test, replace = FALSE))
  train_inds <- (1:n)[!((1:n) %in% test_inds)]
  X_test <- X[test_inds, ]
  X_train <- X[train_inds, ]
  y_test <- y[test_inds]
  y_train <- y[train_inds]

  # Fit a "classic" BART model
  bart_model <- bart(
    X_train = X_train,
    y_train = y_train,
    num_gfr = 10,
    num_burnin = 0,
    num_mcmc = 10
  )

  # Check that the default predict method returns a list
  pred <- predict(bart_model, X = X_test)
  y_hat_posterior_test <- pred$y_hat
  expect_equal(dim(y_hat_posterior_test), c(20, 10))

  # Check that the pre-aggregated predictions match with those computed by rowMeans
  pred_mean <- predict(bart_model, X = X_test, type = "mean")
  y_hat_mean_test <- pred_mean$y_hat
  expect_equal(y_hat_mean_test, rowMeans(y_hat_posterior_test))

  # Check that we warn and return a NULL when requesting terms that weren't fit
  expect_warning({
    pred_mean <- predict(
      bart_model,
      X = X_test,
      type = "mean",
      terms = c("rfx", "variance_forest")
    )
  })
  expect_equal(NULL, pred_mean)

  # Fit a heteroskedastic BART model
  var_params <- list(num_trees = 20)
  het_bart_model <- bart(
    X_train = X_train,
    y_train = y_train,
    num_gfr = 10,
    num_burnin = 0,
    num_mcmc = 10,
    general_params = list(sample_sigma2_global = FALSE),
    variance_forest_params = var_params
  )

  # Check that the default predict method returns a list
  pred <- predict(het_bart_model, X = X_test)
  y_hat_posterior_test <- pred$y_hat
  sigma2_hat_posterior_test <- pred$variance_forest_predictions

  # Assertion
  expect_equal(dim(y_hat_posterior_test), c(20, 10))
  expect_equal(dim(sigma2_hat_posterior_test), c(20, 10))

  # Check that the pre-aggregated predictions match with those computed by rowMeans
  pred_mean <- predict(het_bart_model, X = X_test, type = "mean")
  y_hat_mean_test <- pred_mean$y_hat
  sigma2_hat_mean_test <- pred_mean$variance_forest_predictions

  # Assertion
  expect_equal(y_hat_mean_test, rowMeans(y_hat_posterior_test))
  expect_equal(sigma2_hat_mean_test, rowMeans(sigma2_hat_posterior_test))

  # Check that the "single-term" pre-aggregated predictions
  # match those computed by pre-aggregated predictions returned in a list
  y_hat_mean_test_single_term <- predict(
    het_bart_model,
    X = X_test,
    type = "mean",
    terms = "y_hat"
  )
  sigma2_hat_mean_test_single_term <- predict(
    het_bart_model,
    X = X_test,
    type = "mean",
    terms = "variance_forest"
  )

  # Assertion
  expect_equal(y_hat_mean_test, y_hat_mean_test_single_term)
  expect_equal(sigma2_hat_mean_test, sigma2_hat_mean_test_single_term)
})

test_that("BART predictions with random effects", {
  # Generate data and test-train split
  n <- 100
  p <- 5
  X <- matrix(runif(n * p), ncol = p)
  # fmt: skip
  f_XW <- (((0 <= X[, 1]) & (0.25 > X[, 1])) * (-7.5) +
                 ((0.25 <= X[, 1]) & (0.5 > X[, 1])) * (-2.5) +
                 ((0.5 <= X[, 1]) & (0.75 > X[, 1])) * (2.5) +
                 ((0.75 <= X[, 1]) & (1 > X[, 1])) * (7.5))
  noise_sd <- 1
  rfx_group_ids <- sample(1:3, n, replace = TRUE)
  rfx_coefs <- c(-2, 0, 2)
  rfx_term <- rfx_coefs[rfx_group_ids]
  rfx_basis <- matrix(1, nrow = n, ncol = 1)
  y <- f_XW + rfx_term + rnorm(n, 0, noise_sd)
  test_set_pct <- 0.2
  n_test <- round(test_set_pct * n)
  n_train <- n - n_test
  test_inds <- sort(sample(1:n, n_test, replace = FALSE))
  train_inds <- (1:n)[!((1:n) %in% test_inds)]
  X_test <- X[test_inds, ]
  X_train <- X[train_inds, ]
  rfx_group_ids_test <- rfx_group_ids[test_inds]
  rfx_group_ids_train <- rfx_group_ids[train_inds]
  rfx_basis_test <- rfx_basis[test_inds, ]
  rfx_basis_train <- rfx_basis[train_inds, ]
  y_test <- y[test_inds]
  y_train <- y[train_inds]

  # Fit a "classic" BART model
  rfx_params <- list(model_spec = "intercept_only")
  bart_model <- bart(
    X_train = X_train,
    y_train = y_train,
    rfx_group_ids_train = rfx_group_ids_train,
    random_effects_params = rfx_params,
    num_gfr = 10,
    num_burnin = 0,
    num_mcmc = 10
  )

  # Check that the default predict method returns a list
  pred <- predict(bart_model, X = X_test, rfx_group_ids = rfx_group_ids_test)
  y_hat_posterior_test <- pred$y_hat
  expect_equal(dim(y_hat_posterior_test), c(20, 10))

  # Check that the pre-aggregated predictions match with those computed by rowMeans
  pred_mean <- predict(
    bart_model,
    X = X_test,
    rfx_group_ids = rfx_group_ids_test,
    type = "mean"
  )
  y_hat_mean_test <- pred_mean$y_hat
  expect_equal(y_hat_mean_test, rowMeans(y_hat_posterior_test))

  # Check that we warn and return a NULL when requesting terms that weren't fit
  expect_warning({
    pred_mean <- predict(
      bart_model,
      X = X_test,
      rfx_group_ids = rfx_group_ids_test,
      type = "mean",
      terms = c("variance_forest")
    )
  })
  expect_equal(NULL, pred_mean)
})

test_that("BCF predictions with pre-summarization", {
  # Generate data and test-train split
  n <- 100
  g <- function(x) {
    ifelse(x[, 5] == 1, 2, ifelse(x[, 5] == 2, -1, -4))
  }
  x1 <- rnorm(n)
  x2 <- rnorm(n)
  x3 <- rnorm(n)
  x4 <- as.numeric(rbinom(n, 1, 0.5))
  x5 <- as.numeric(sample(1:3, n, replace = TRUE))
  X <- cbind(x1, x2, x3, x4, x5)
  p <- ncol(X)
  mu_x <- 1 + g(X) + X[, 1] * X[, 3]
  tau_x <- 1 + 2 * X[, 2] * X[, 4]
  pi_x <- 0.8 *
    pnorm((3 * mu_x / sd(mu_x)) - 0.5 * X[, 1]) +
    0.05 +
    runif(n) / 10
  Z <- rbinom(n, 1, pi_x)
  E_XZ <- mu_x + Z * tau_x
  snr <- 2
  y <- E_XZ + rnorm(n, 0, 1) * (sd(E_XZ) / snr)
  X <- as.data.frame(X)
  X$x4 <- factor(X$x4, ordered = TRUE)
  X$x5 <- factor(X$x5, ordered = TRUE)
  test_set_pct <- 0.2
  n_test <- round(test_set_pct * n)
  n_train <- n - n_test
  test_inds <- sort(sample(1:n, n_test, replace = FALSE))
  train_inds <- (1:n)[!((1:n) %in% test_inds)]
  X_test <- X[test_inds, ]
  X_train <- X[train_inds, ]
  pi_test <- pi_x[test_inds]
  pi_train <- pi_x[train_inds]
  Z_test <- Z[test_inds]
  Z_train <- Z[train_inds]
  y_test <- y[test_inds]
  y_train <- y[train_inds]

  # Fit a "classic" BCF model
  bcf_model <- bcf(
    X_train = X_train,
    Z_train = Z_train,
    y_train = y_train,
    propensity_train = pi_train,
    X_test = X_test,
    Z_test = Z_test,
    propensity_test = pi_test,
    num_gfr = 10,
    num_burnin = 0,
    num_mcmc = 10
  )

  # Check that the default predict method returns a list
  pred <- predict(
    bcf_model,
    X = X_test,
    Z = Z_test,
    propensity = pi_test
  )
  y_hat_posterior_test <- pred$y_hat
  expect_equal(dim(y_hat_posterior_test), c(20, 10))

  # Check that the pre-aggregated predictions match with those computed by rowMeans
  pred_mean <- predict(
    bcf_model,
    X = X_test,
    Z = Z_test,
    propensity = pi_test,
    type = "mean"
  )
  y_hat_mean_test <- pred_mean$y_hat
  expect_equal(y_hat_mean_test, rowMeans(y_hat_posterior_test))

  # Check that we warn and return a NULL when requesting terms that weren't fit
  expect_warning({
    pred_mean <- predict(
      bcf_model,
      X = X_test,
      Z = Z_test,
      propensity = pi_test,
      type = "mean",
      terms = c("rfx", "variance_forest")
    )
  })
  expect_equal(NULL, pred_mean)

  # Fit a heteroskedastic BCF model
  var_params <- list(num_trees = 20)
  expect_warning(
    het_bcf_model <- bcf(
      X_train = X_train,
      Z_train = Z_train,
      y_train = y_train,
      propensity_train = pi_train,
      X_test = X_test,
      Z_test = Z_test,
      propensity_test = pi_test,
      num_gfr = 10,
      num_burnin = 0,
      num_mcmc = 10,
      variance_forest_params = var_params
    )
  )

  # Check that the default predict method returns a list
  pred <- predict(
    het_bcf_model,
    X = X_test,
    Z = Z_test,
    propensity = pi_test
  )
  y_hat_posterior_test <- pred$y_hat
  sigma2_hat_posterior_test <- pred$variance_forest_predictions

  # Assertion
  expect_equal(dim(y_hat_posterior_test), c(20, 10))
  expect_equal(dim(sigma2_hat_posterior_test), c(20, 10))

  # Check that the pre-aggregated predictions match with those computed by rowMeans
  pred_mean <- predict(
    het_bcf_model,
    X = X_test,
    Z = Z_test,
    propensity = pi_test,
    type = "mean"
  )
  y_hat_mean_test <- pred_mean$y_hat
  sigma2_hat_mean_test <- pred_mean$variance_forest_predictions

  # Assertion
  expect_equal(y_hat_mean_test, rowMeans(y_hat_posterior_test))
  expect_equal(sigma2_hat_mean_test, rowMeans(sigma2_hat_posterior_test))

  # Check that the "single-term" pre-aggregated predictions
  # match those computed by pre-aggregated predictions returned in a list
  y_hat_mean_test_single_term <- predict(
    het_bcf_model,
    X = X_test,
    Z = Z_test,
    propensity = pi_test,
    type = "mean",
    terms = "y_hat"
  )
  sigma2_hat_mean_test_single_term <- predict(
    het_bcf_model,
    X = X_test,
    Z = Z_test,
    propensity = pi_test,
    type = "mean",
    terms = "variance_forest"
  )

  # Assertion
  expect_equal(y_hat_mean_test, y_hat_mean_test_single_term)
  expect_equal(sigma2_hat_mean_test, sigma2_hat_mean_test_single_term)
})

test_that("BCF predictions with random effects", {
  # Generate data and test-train split
  n <- 100
  g <- function(x) {
    ifelse(x[, 5] == 1, 2, ifelse(x[, 5] == 2, -1, -4))
  }
  x1 <- rnorm(n)
  x2 <- rnorm(n)
  x3 <- rnorm(n)
  x4 <- as.numeric(rbinom(n, 1, 0.5))
  x5 <- as.numeric(sample(1:3, n, replace = TRUE))
  X <- cbind(x1, x2, x3, x4, x5)
  p <- ncol(X)
  mu_x <- 1 + g(X) + X[, 1] * X[, 3]
  tau_x <- 1 + 2 * X[, 2] * X[, 4]
  pi_x <- 0.8 *
    pnorm((3 * mu_x / sd(mu_x)) - 0.5 * X[, 1]) +
    0.05 +
    runif(n) / 10
  Z <- rbinom(n, 1, pi_x)
  E_XZ <- mu_x + Z * tau_x
  rfx_group_ids <- sample(1:3, n, replace = TRUE)
  rfx_basis <- cbind(1, Z)
  rfx_coefs <- matrix(
    c(
      -2,
      -0.5,
      0,
      0.0,
      2,
      0.5
    ),
    byrow = T,
    ncol = 2
  )
  rfx_term <- rowSums(rfx_basis * rfx_coefs[rfx_group_ids, ])
  snr <- 2
  y <- E_XZ + rfx_term + rnorm(n, 0, 1) * (sd(E_XZ + rfx_term) / snr)
  X <- as.data.frame(X)
  X$x4 <- factor(X$x4, ordered = TRUE)
  X$x5 <- factor(X$x5, ordered = TRUE)
  test_set_pct <- 0.2
  n_test <- round(test_set_pct * n)
  n_train <- n - n_test
  test_inds <- sort(sample(1:n, n_test, replace = FALSE))
  train_inds <- (1:n)[!((1:n) %in% test_inds)]
  X_test <- X[test_inds, ]
  X_train <- X[train_inds, ]
  pi_test <- pi_x[test_inds]
  pi_train <- pi_x[train_inds]
  rfx_group_ids_test <- rfx_group_ids[test_inds]
  rfx_group_ids_train <- rfx_group_ids[train_inds]
  rfx_basis_test <- rfx_basis[test_inds, ]
  rfx_basis_train <- rfx_basis[train_inds, ]
  Z_test <- Z[test_inds]
  Z_train <- Z[train_inds]
  y_test <- y[test_inds]
  y_train <- y[train_inds]

  # Fit a BCF model with random intercept and random slope on Z
  rfx_params = list(model_spec = "intercept_plus_treatment")
  bcf_model <- bcf(
    X_train = X_train,
    Z_train = Z_train,
    y_train = y_train,
    propensity_train = pi_train,
    rfx_group_ids_train = rfx_group_ids_train,
    X_test = X_test,
    Z_test = Z_test,
    propensity_test = pi_test,
    rfx_group_ids_test = rfx_group_ids_test,
    random_effects_params = rfx_params,
    num_gfr = 10,
    num_burnin = 0,
    num_mcmc = 10
  )

  # Check that the default predict method returns a list
  pred <- predict(
    bcf_model,
    X = X_test,
    Z = Z_test,
    propensity = pi_test,
    rfx_group_ids = rfx_group_ids_test
  )
  y_hat_posterior_test <- pred$y_hat
  expect_equal(dim(y_hat_posterior_test), c(20, 10))

  # Check that the pre-aggregated predictions match with those computed by rowMeans
  pred_mean <- predict(
    bcf_model,
    X = X_test,
    Z = Z_test,
    propensity = pi_test,
    rfx_group_ids = rfx_group_ids_test,
    type = "mean"
  )
  y_hat_mean_test <- pred_mean$y_hat
  expect_equal(y_hat_mean_test, rowMeans(y_hat_posterior_test))

  # Check that we warn and return a NULL when requesting terms that weren't fit
  expect_warning({
    pred_mean <- predict(
      bcf_model,
      X = X_test,
      Z = Z_test,
      propensity = pi_test,
      type = "mean",
      terms = c("variance_forest")
    )
  })
  expect_equal(NULL, pred_mean)

  # Fit a BCF model with random intercept only
  # Fit a BCF model with random intercept and random slope on Z
  rfx_params = list(model_spec = "intercept_only")
  bcf_model <- bcf(
    X_train = X_train,
    Z_train = Z_train,
    y_train = y_train,
    propensity_train = pi_train,
    rfx_group_ids_train = rfx_group_ids_train,
    X_test = X_test,
    Z_test = Z_test,
    propensity_test = pi_test,
    rfx_group_ids_test = rfx_group_ids_test,
    random_effects_params = rfx_params,
    num_gfr = 10,
    num_burnin = 0,
    num_mcmc = 10
  )

  # Check that the default predict method returns a list
  pred <- predict(
    bcf_model,
    X = X_test,
    Z = Z_test,
    propensity = pi_test,
    rfx_group_ids = rfx_group_ids_test
  )
  y_hat_posterior_test <- pred$y_hat
  expect_equal(dim(y_hat_posterior_test), c(20, 10))

  # Check that the pre-aggregated predictions match with those computed by rowMeans
  pred_mean <- predict(
    bcf_model,
    X = X_test,
    Z = Z_test,
    propensity = pi_test,
    rfx_group_ids = rfx_group_ids_test,
    type = "mean"
  )
  y_hat_mean_test <- pred_mean$y_hat
  expect_equal(y_hat_mean_test, rowMeans(y_hat_posterior_test))

  # Check that we warn and return a NULL when requesting terms that weren't fit
  expect_warning({
    pred_mean <- predict(
      bcf_model,
      X = X_test,
      Z = Z_test,
      propensity = pi_test,
      type = "mean",
      terms = c("variance_forest")
    )
  })
})

test_that("BART cloglog binary: posterior interval and contrast", {
  # Generate binary cloglog data
  set.seed(42)
  n <- 100
  p <- 3
  X <- matrix(runif(n * p), ncol = p)
  beta <- rep(1 / sqrt(p), p)
  true_lambda <- X %*% beta
  prob_y1 <- 1 - exp(-exp(true_lambda))
  y <- rbinom(n, 1, prob_y1)

  # Train/test split
  test_set_pct <- 0.2
  n_test <- round(test_set_pct * n)
  test_inds <- sort(sample(1:n, n_test, replace = FALSE))
  train_inds <- (1:n)[!((1:n) %in% test_inds)]
  X_train <- X[train_inds, ]
  X_test <- X[test_inds, ]
  y_train <- y[train_inds]

  # Fit binary cloglog BART model
  bart_model <- bart(
    X_train = X_train,
    y_train = y_train,
    num_gfr = 10,
    num_burnin = 0,
    num_mcmc = 10,
    general_params = list(
      sample_sigma2_global = FALSE,
      outcome_model = OutcomeModel(outcome = "binary", link = "cloglog")
    )
  )

  # Test posterior interval on linear scale
  interval_linear <- computeBARTPosteriorInterval(
    bart_model,
    terms = "mean_forest",
    level = 0.95,
    scale = "linear",
    X = X_test
  )
  expect_true(is.list(interval_linear))
  expect_true("lower" %in% names(interval_linear))
  expect_true("upper" %in% names(interval_linear))
  expect_equal(length(interval_linear$lower), n_test)
  expect_equal(length(interval_linear$upper), n_test)
  expect_true(all(interval_linear$lower <= interval_linear$upper))

  # Test posterior interval on probability scale
  interval_prob <- computeBARTPosteriorInterval(
    bart_model,
    terms = "mean_forest",
    level = 0.95,
    scale = "probability",
    X = X_test
  )
  expect_true(is.list(interval_prob))
  expect_equal(length(interval_prob$lower), n_test)
  expect_true(all(interval_prob$lower >= 0 & interval_prob$lower <= 1))
  expect_true(all(interval_prob$upper >= 0 & interval_prob$upper <= 1))
  expect_true(all(interval_prob$lower <= interval_prob$upper))

  # Test contrast on linear scale
  X0 <- X_test
  X1 <- X_test + 0.5
  contrast_linear <- computeContrastBARTModel(
    bart_model,
    X_0 = X0,
    X_1 = X1,
    type = "posterior",
    scale = "linear"
  )
  expect_true(is.matrix(contrast_linear))
  expect_equal(nrow(contrast_linear), n_test)

  # Test contrast on probability scale
  contrast_prob <- computeContrastBARTModel(
    bart_model,
    X_0 = X0,
    X_1 = X1,
    type = "posterior",
    scale = "probability"
  )
  expect_true(is.matrix(contrast_prob))
  expect_equal(nrow(contrast_prob), n_test)

  # Test contrast with type = "mean"
  contrast_mean_linear <- computeContrastBARTModel(
    bart_model,
    X_0 = X0,
    X_1 = X1,
    type = "mean",
    scale = "linear"
  )
  expect_true(is.numeric(contrast_mean_linear))
  expect_equal(length(contrast_mean_linear), n_test)

  contrast_mean_prob <- computeContrastBARTModel(
    bart_model,
    X_0 = X0,
    X_1 = X1,
    type = "mean",
    scale = "probability"
  )
  expect_true(is.numeric(contrast_mean_prob))
  expect_equal(length(contrast_mean_prob), n_test)
})

test_that("BART cloglog ordinal: posterior interval and contrast", {
  # Generate ordinal cloglog data (3 categories)
  set.seed(42)
  n <- 500
  p <- 3
  n_categories <- 3
  X <- matrix(runif(n * p), ncol = p)
  beta <- rep(1 / sqrt(p), p)
  true_lambda <- X %*% beta
  gamma_true <- c(-1.5, -0.5)

  # Compute class probabilities
  true_probs <- matrix(0, nrow = n, ncol = n_categories)
  for (j in 1:n_categories) {
    if (j == 1) {
      true_probs[, j] <- 1 - exp(-exp(gamma_true[j] + true_lambda))
    } else if (j == n_categories) {
      true_probs[, j] <- 1 - rowSums(true_probs[, 1:(j - 1), drop = FALSE])
    } else {
      true_probs[, j] <- exp(-exp(gamma_true[j - 1] + true_lambda)) *
        (1 - exp(-exp(gamma_true[j] + true_lambda)))
    }
  }

  # Generate ordinal outcomes
  y <- sapply(1:n, function(i) {
    sample(1:n_categories, 1, prob = true_probs[i, ])
  })

  # Train/test split
  test_set_pct <- 0.2
  n_test <- round(test_set_pct * n)
  test_inds <- sort(sample(1:n, n_test, replace = FALSE))
  train_inds <- (1:n)[!((1:n) %in% test_inds)]
  X_train <- X[train_inds, ]
  X_test <- X[test_inds, ]
  y_train <- y[train_inds]

  # Fit ordinal cloglog BART model
  bart_model <- bart(
    X_train = X_train,
    y_train = y_train,
    num_gfr = 10,
    num_burnin = 0,
    num_mcmc = 10,
    general_params = list(
      sample_sigma2_global = FALSE,
      outcome_model = OutcomeModel(outcome = "ordinal", link = "cloglog")
    )
  )

  # Test posterior interval on linear scale
  interval_linear <- computeBARTPosteriorInterval(
    bart_model,
    terms = "mean_forest",
    level = 0.95,
    scale = "linear",
    X = X_test
  )
  expect_true(is.list(interval_linear))
  expect_equal(length(interval_linear$lower), n_test)
  expect_true(all(interval_linear$lower <= interval_linear$upper))

  # Test posterior interval on probability scale
  # For ordinal models, probability scale returns survival probabilities P(Y > k)
  # which are (n_test, n_categories - 1) matrices
  interval_prob <- computeBARTPosteriorInterval(
    bart_model,
    terms = "mean_forest",
    level = 0.95,
    scale = "probability",
    X = X_test
  )
  expect_true(is.list(interval_prob))
  expect_equal(nrow(interval_prob$lower), n_test)
  expect_equal(ncol(interval_prob$lower), n_categories - 1)
  expect_equal(nrow(interval_prob$upper), n_test)
  expect_equal(ncol(interval_prob$upper), n_categories - 1)
  expect_true(all(interval_prob$lower >= 0 & interval_prob$lower <= 1))
  expect_true(all(interval_prob$upper >= 0 & interval_prob$upper <= 1))
  expect_true(all(interval_prob$lower <= interval_prob$upper))

  # Test contrast on linear scale
  X0 <- X_test
  X1 <- X_test + 0.5
  contrast_linear <- computeContrastBARTModel(
    bart_model,
    X_0 = X0,
    X_1 = X1,
    type = "posterior",
    scale = "linear"
  )
  expect_true(is.matrix(contrast_linear))
  expect_equal(nrow(contrast_linear), n_test)

  # Test contrast on probability scale (ordinal returns 3D array)
  contrast_prob <- computeContrastBARTModel(
    bart_model,
    X_0 = X0,
    X_1 = X1,
    type = "posterior",
    scale = "probability"
  )
  expect_true(is.array(contrast_prob))
  expect_equal(dim(contrast_prob)[1], n_test)
  expect_equal(dim(contrast_prob)[2], n_categories - 1)

  # Test contrast with type = "mean" on linear scale
  contrast_mean_linear <- computeContrastBARTModel(
    bart_model,
    X_0 = X0,
    X_1 = X1,
    type = "mean",
    scale = "linear"
  )
  expect_true(is.numeric(contrast_mean_linear))
  expect_equal(length(contrast_mean_linear), n_test)

  # Test contrast with type = "mean" on probability scale
  # For ordinal, mean contrast should be (n_test, n_categories - 1) matrix
  contrast_mean_prob <- computeContrastBARTModel(
    bart_model,
    X_0 = X0,
    X_1 = X1,
    type = "mean",
    scale = "probability"
  )
  expect_true(is.matrix(contrast_mean_prob))
  expect_equal(nrow(contrast_mean_prob), n_test)
  expect_equal(ncol(contrast_mean_prob), n_categories - 1)
})

test_that("BART cloglog binary: sample posterior predictive", {
  # Generate binary cloglog data
  set.seed(42)
  n <- 100
  p <- 3
  X <- matrix(runif(n * p), ncol = p)
  beta <- rep(1 / sqrt(p), p)
  true_lambda <- X %*% beta
  prob_y1 <- 1 - exp(-exp(true_lambda))
  y <- rbinom(n, 1, prob_y1)

  # Train/test split
  test_set_pct <- 0.2
  n_test <- round(test_set_pct * n)
  test_inds <- sort(sample(1:n, n_test, replace = FALSE))
  train_inds <- (1:n)[!((1:n) %in% test_inds)]
  X_train <- X[train_inds, ]
  X_test <- X[test_inds, ]
  y_train <- y[train_inds]
  num_mcmc <- 10

  # Fit binary cloglog BART model
  bart_model <- bart(
    X_train = X_train,
    y_train = y_train,
    num_gfr = 10,
    num_burnin = 0,
    num_mcmc = num_mcmc,
    general_params = list(
      sample_sigma2_global = FALSE,
      outcome_model = OutcomeModel(outcome = "binary", link = "cloglog")
    )
  )

  # Test with multiple draws per sample
  ppd <- sampleBARTPosteriorPredictive(
    bart_model,
    X = X_test,
    num_draws_per_sample = 3
  )
  expect_equal(dim(ppd), c(n_test, num_mcmc, 3))
  expect_true(all(ppd %in% c(0, 1)))

  # Test with single draw per sample
  ppd1 <- sampleBARTPosteriorPredictive(
    bart_model,
    X = X_test,
    num_draws_per_sample = 1
  )
  expect_equal(dim(ppd1), c(n_test, num_mcmc))
  expect_true(all(ppd1 %in% c(0, 1)))
})

test_that("BART cloglog ordinal: sample posterior predictive", {
  # Generate ordinal cloglog data (3 categories)
  set.seed(42)
  n <- 500
  p <- 3
  n_categories <- 3
  X <- matrix(runif(n * p), ncol = p)
  beta <- rep(1 / sqrt(p), p)
  true_lambda <- X %*% beta
  gamma_true <- c(-1.5, -0.5)

  # Compute class probabilities
  true_probs <- matrix(0, nrow = n, ncol = n_categories)
  for (j in 1:n_categories) {
    if (j == 1) {
      true_probs[, j] <- 1 - exp(-exp(gamma_true[j] + true_lambda))
    } else if (j == n_categories) {
      true_probs[, j] <- 1 - rowSums(true_probs[, 1:(j - 1), drop = FALSE])
    } else {
      true_probs[, j] <- exp(-exp(gamma_true[j - 1] + true_lambda)) *
        (1 - exp(-exp(gamma_true[j] + true_lambda)))
    }
  }

  # Generate ordinal outcomes
  y <- sapply(1:n, function(i) {
    sample(1:n_categories, 1, prob = true_probs[i, ])
  })

  # Train/test split
  test_set_pct <- 0.2
  n_test <- round(test_set_pct * n)
  test_inds <- sort(sample(1:n, n_test, replace = FALSE))
  train_inds <- (1:n)[!((1:n) %in% test_inds)]
  X_train <- X[train_inds, ]
  X_test <- X[test_inds, ]
  y_train <- y[train_inds]
  num_mcmc <- 10

  # Fit ordinal cloglog BART model
  bart_model <- bart(
    X_train = X_train,
    y_train = y_train,
    num_gfr = 10,
    num_burnin = 0,
    num_mcmc = num_mcmc,
    general_params = list(
      sample_sigma2_global = FALSE,
      outcome_model = OutcomeModel(outcome = "ordinal", link = "cloglog")
    )
  )

  # Test with multiple draws per sample
  ppd <- sampleBARTPosteriorPredictive(
    bart_model,
    X = X_test,
    num_draws_per_sample = 3
  )
  expect_equal(dim(ppd), c(n_test, num_mcmc, 3))
  expect_true(all(ppd %in% 1:n_categories))

  # Test with single draw per sample
  ppd1 <- sampleBARTPosteriorPredictive(
    bart_model,
    X = X_test,
    num_draws_per_sample = 1
  )
  expect_equal(dim(ppd1), c(n_test, num_mcmc))
  expect_true(all(ppd1 %in% 1:n_categories))
})

test_that("BART gaussian: posterior interval and contrast", {
  # Generate gaussian data
  set.seed(42)
  n <- 100
  p <- 3
  X <- matrix(runif(n * p), ncol = p)
  beta <- rep(1 / sqrt(p), p)
  mu <- X %*% beta
  y <- rnorm(n, mu, 1)

  # Train/test split
  test_set_pct <- 0.2
  n_test <- round(test_set_pct * n)
  test_inds <- sort(sample(1:n, n_test, replace = FALSE))
  train_inds <- (1:n)[!((1:n) %in% test_inds)]
  X_train <- X[train_inds, ]
  X_test <- X[test_inds, ]
  y_train <- y[train_inds]
  num_mcmc <- 10

  # Fit gaussian BART model (default link)
  bart_model <- bart(
    X_train = X_train,
    y_train = y_train,
    num_gfr = 10,
    num_burnin = 0,
    num_mcmc = num_mcmc
  )

  # Test posterior interval on linear scale (mean_forest term)
  interval_linear <- computeBARTPosteriorInterval(
    bart_model,
    terms = "mean_forest",
    level = 0.95,
    scale = "linear",
    X = X_test
  )
  expect_true(is.list(interval_linear))
  expect_true("lower" %in% names(interval_linear))
  expect_true("upper" %in% names(interval_linear))
  expect_equal(length(interval_linear$lower), n_test)
  expect_equal(length(interval_linear$upper), n_test)
  expect_true(all(interval_linear$lower <= interval_linear$upper))

  # Test posterior interval for y_hat term
  interval_yhat <- computeBARTPosteriorInterval(
    bart_model,
    terms = "y_hat",
    level = 0.95,
    scale = "linear",
    X = X_test
  )
  expect_true(is.list(interval_yhat))
  expect_equal(length(interval_yhat$lower), n_test)
  expect_true(all(interval_yhat$lower <= interval_yhat$upper))

  # Test contrast on linear scale, type = "posterior"
  X0 <- X_test
  X1 <- X_test + 0.5
  contrast_posterior <- computeContrastBARTModel(
    bart_model,
    X_0 = X0,
    X_1 = X1,
    type = "posterior",
    scale = "linear"
  )
  expect_true(is.matrix(contrast_posterior))
  expect_equal(nrow(contrast_posterior), n_test)
  expect_equal(ncol(contrast_posterior), num_mcmc)

  # Test contrast with type = "mean"
  contrast_mean <- computeContrastBARTModel(
    bart_model,
    X_0 = X0,
    X_1 = X1,
    type = "mean",
    scale = "linear"
  )
  expect_true(is.numeric(contrast_mean))
  expect_equal(length(contrast_mean), n_test)
})

test_that("BART binary probit: posterior interval and contrast", {
  # Generate binary probit data
  set.seed(42)
  n <- 100
  p <- 3
  X <- matrix(runif(n * p), ncol = p)
  beta <- rep(1 / sqrt(p), p)
  mu <- X %*% beta
  prob_y1 <- pnorm(mu)
  y <- rbinom(n, 1, prob_y1)

  # Train/test split
  test_set_pct <- 0.2
  n_test <- round(test_set_pct * n)
  test_inds <- sort(sample(1:n, n_test, replace = FALSE))
  train_inds <- (1:n)[!((1:n) %in% test_inds)]
  X_train <- X[train_inds, ]
  X_test <- X[test_inds, ]
  y_train <- y[train_inds]
  num_mcmc <- 10

  # Fit binary probit BART model
  bart_model <- bart(
    X_train = X_train,
    y_train = y_train,
    num_gfr = 10,
    num_burnin = 0,
    num_mcmc = num_mcmc,
    general_params = list(
      sample_sigma2_global = FALSE,
      outcome_model = OutcomeModel(outcome = "binary", link = "probit")
    )
  )

  # Test posterior interval on linear scale
  interval_linear <- computeBARTPosteriorInterval(
    bart_model,
    terms = "mean_forest",
    level = 0.95,
    scale = "linear",
    X = X_test
  )
  expect_true(is.list(interval_linear))
  expect_true("lower" %in% names(interval_linear))
  expect_true("upper" %in% names(interval_linear))
  expect_equal(length(interval_linear$lower), n_test)
  expect_equal(length(interval_linear$upper), n_test)
  expect_true(all(interval_linear$lower <= interval_linear$upper))

  # Test posterior interval on probability scale
  interval_prob <- computeBARTPosteriorInterval(
    bart_model,
    terms = "mean_forest",
    level = 0.95,
    scale = "probability",
    X = X_test
  )
  expect_true(is.list(interval_prob))
  expect_equal(length(interval_prob$lower), n_test)
  expect_equal(length(interval_prob$upper), n_test)
  expect_true(all(interval_prob$lower >= 0 & interval_prob$lower <= 1))
  expect_true(all(interval_prob$upper >= 0 & interval_prob$upper <= 1))
  expect_true(all(interval_prob$lower <= interval_prob$upper))

  # Test contrast on linear scale
  X0 <- X_test
  X1 <- X_test + 0.5
  contrast_linear <- computeContrastBARTModel(
    bart_model,
    X_0 = X0,
    X_1 = X1,
    type = "posterior",
    scale = "linear"
  )
  expect_true(is.matrix(contrast_linear))
  expect_equal(nrow(contrast_linear), n_test)
  expect_equal(ncol(contrast_linear), num_mcmc)

  # Test contrast on probability scale
  contrast_prob <- computeContrastBARTModel(
    bart_model,
    X_0 = X0,
    X_1 = X1,
    type = "posterior",
    scale = "probability"
  )
  expect_true(is.matrix(contrast_prob))
  expect_equal(nrow(contrast_prob), n_test)
  expect_equal(ncol(contrast_prob), num_mcmc)

  # Test contrast with type = "mean" on linear scale
  contrast_mean_linear <- computeContrastBARTModel(
    bart_model,
    X_0 = X0,
    X_1 = X1,
    type = "mean",
    scale = "linear"
  )
  expect_true(is.numeric(contrast_mean_linear))
  expect_equal(length(contrast_mean_linear), n_test)

  # Test contrast with type = "mean" on probability scale
  contrast_mean_prob <- computeContrastBARTModel(
    bart_model,
    X_0 = X0,
    X_1 = X1,
    type = "mean",
    scale = "probability"
  )
  expect_true(is.numeric(contrast_mean_prob))
  expect_equal(length(contrast_mean_prob), n_test)
})

test_that("BART gaussian: sample posterior predictive", {
  # Generate gaussian data
  set.seed(42)
  n <- 100
  p <- 3
  X <- matrix(runif(n * p), ncol = p)
  beta <- rep(1 / sqrt(p), p)
  mu <- X %*% beta
  y <- rnorm(n, mu, 1)

  # Train/test split
  test_set_pct <- 0.2
  n_test <- round(test_set_pct * n)
  test_inds <- sort(sample(1:n, n_test, replace = FALSE))
  train_inds <- (1:n)[!((1:n) %in% test_inds)]
  X_train <- X[train_inds, ]
  X_test <- X[test_inds, ]
  y_train <- y[train_inds]
  num_mcmc <- 10

  # Fit gaussian BART model (default link)
  bart_model <- bart(
    X_train = X_train,
    y_train = y_train,
    num_gfr = 10,
    num_burnin = 0,
    num_mcmc = num_mcmc
  )

  # Test with multiple draws per sample
  ppd <- sampleBARTPosteriorPredictive(
    bart_model,
    X = X_test,
    num_draws_per_sample = 3
  )
  expect_equal(dim(ppd), c(n_test, num_mcmc, 3))
  expect_true(is.numeric(ppd))

  # Test with single draw per sample
  ppd1 <- sampleBARTPosteriorPredictive(
    bart_model,
    X = X_test,
    num_draws_per_sample = 1
  )
  expect_equal(dim(ppd1), c(n_test, num_mcmc))
  expect_true(is.numeric(ppd1))
})

test_that("BART binary probit: sample posterior predictive", {
  # Generate binary probit data
  set.seed(42)
  n <- 100
  p <- 3
  X <- matrix(runif(n * p), ncol = p)
  beta <- rep(1 / sqrt(p), p)
  mu <- X %*% beta
  prob_y1 <- pnorm(mu)
  y <- rbinom(n, 1, prob_y1)

  # Train/test split
  test_set_pct <- 0.2
  n_test <- round(test_set_pct * n)
  test_inds <- sort(sample(1:n, n_test, replace = FALSE))
  train_inds <- (1:n)[!((1:n) %in% test_inds)]
  X_train <- X[train_inds, ]
  X_test <- X[test_inds, ]
  y_train <- y[train_inds]
  num_mcmc <- 10

  # Fit binary probit BART model
  bart_model <- bart(
    X_train = X_train,
    y_train = y_train,
    num_gfr = 10,
    num_burnin = 0,
    num_mcmc = num_mcmc,
    general_params = list(
      sample_sigma2_global = FALSE,
      outcome_model = OutcomeModel(outcome = "binary", link = "probit")
    )
  )

  # Test with multiple draws per sample
  ppd <- sampleBARTPosteriorPredictive(
    bart_model,
    X = X_test,
    num_draws_per_sample = 3
  )
  expect_equal(dim(ppd), c(n_test, num_mcmc, 3))
  expect_true(all(ppd %in% c(0, 1)))

  # Test with single draw per sample
  ppd1 <- sampleBARTPosteriorPredictive(
    bart_model,
    X = X_test,
    num_draws_per_sample = 1
  )
  expect_equal(dim(ppd1), c(n_test, num_mcmc))
  expect_true(all(ppd1 %in% c(0, 1)))
})

test_that("BCF gaussian: posterior interval and contrast", {
  # Generate gaussian causal data
  set.seed(42)
  n <- 100
  p <- 3
  X <- matrix(runif(n * p), ncol = p)
  pi_x <- pnorm(X[, 1])
  Z <- rbinom(n, 1, pi_x)
  mu_x <- X[, 1] + X[, 2]
  tau_x <- rep(1, n)
  y <- mu_x + tau_x * Z + rnorm(n)

  # Train/test split
  test_set_pct <- 0.2
  n_test <- round(test_set_pct * n)
  test_inds <- sort(sample(1:n, n_test, replace = FALSE))
  train_inds <- (1:n)[!((1:n) %in% test_inds)]
  X_train <- X[train_inds, ]
  X_test <- X[test_inds, ]
  Z_train <- Z[train_inds]
  Z_test <- Z[test_inds]
  pi_train <- pi_x[train_inds]
  pi_test <- pi_x[test_inds]
  y_train <- y[train_inds]
  num_mcmc <- 10

  # Fit gaussian BCF model
  bcf_model <- bcf(
    X_train = X_train,
    Z_train = Z_train,
    y_train = y_train,
    propensity_train = pi_train,
    X_test = X_test,
    Z_test = Z_test,
    propensity_test = pi_test,
    num_gfr = 10,
    num_burnin = 0,
    num_mcmc = num_mcmc
  )

  # Test posterior interval on linear scale (prognostic_function term)
  interval_prog <- computeBCFPosteriorInterval(
    bcf_model,
    terms = "prognostic_function",
    level = 0.95,
    scale = "linear",
    X = X_test,
    Z = Z_test,
    propensity = pi_test
  )
  expect_true(is.list(interval_prog))
  expect_true("lower" %in% names(interval_prog))
  expect_true("upper" %in% names(interval_prog))
  expect_equal(length(interval_prog$lower), n_test)
  expect_equal(length(interval_prog$upper), n_test)
  expect_true(all(interval_prog$lower <= interval_prog$upper))

  # Test posterior interval for cate term
  interval_cate <- computeBCFPosteriorInterval(
    bcf_model,
    terms = "cate",
    level = 0.95,
    scale = "linear",
    X = X_test,
    Z = Z_test,
    propensity = pi_test
  )
  expect_true(is.list(interval_cate))
  expect_equal(length(interval_cate$lower), n_test)
  expect_true(all(interval_cate$lower <= interval_cate$upper))

  # Test posterior interval for y_hat term
  interval_yhat <- computeBCFPosteriorInterval(
    bcf_model,
    terms = "y_hat",
    level = 0.95,
    scale = "linear",
    X = X_test,
    Z = Z_test,
    propensity = pi_test
  )
  expect_true(is.list(interval_yhat))
  expect_equal(length(interval_yhat$lower), n_test)
  expect_true(all(interval_yhat$lower <= interval_yhat$upper))

  # Test contrast on linear scale (CATE: Z=1 vs Z=0)
  contrast_posterior <- computeContrastBCFModel(
    bcf_model,
    X_0 = X_test,
    X_1 = X_test,
    Z_0 = rep(0, n_test),
    Z_1 = rep(1, n_test),
    propensity_0 = pi_test,
    propensity_1 = pi_test,
    type = "posterior",
    scale = "linear"
  )
  expect_true(is.matrix(contrast_posterior))
  expect_equal(nrow(contrast_posterior), n_test)
  expect_equal(ncol(contrast_posterior), num_mcmc)

  # Test contrast with type = "mean"
  contrast_mean <- computeContrastBCFModel(
    bcf_model,
    X_0 = X_test,
    X_1 = X_test,
    Z_0 = rep(0, n_test),
    Z_1 = rep(1, n_test),
    propensity_0 = pi_test,
    propensity_1 = pi_test,
    type = "mean",
    scale = "linear"
  )
  expect_true(is.numeric(contrast_mean))
  expect_equal(length(contrast_mean), n_test)
})

test_that("BCF binary probit: posterior interval and contrast", {
  # Generate binary probit causal data
  set.seed(42)
  n <- 100
  p <- 3
  X <- matrix(runif(n * p), ncol = p)
  pi_x <- pnorm(X[, 1])
  Z <- rbinom(n, 1, pi_x)
  mu_x <- X[, 1] + X[, 2]
  tau_x <- rep(0.5, n)
  prob_y1 <- pnorm(mu_x + tau_x * Z)
  y <- rbinom(n, 1, prob_y1)

  # Train/test split
  test_set_pct <- 0.2
  n_test <- round(test_set_pct * n)
  test_inds <- sort(sample(1:n, n_test, replace = FALSE))
  train_inds <- (1:n)[!((1:n) %in% test_inds)]
  X_train <- X[train_inds, ]
  X_test <- X[test_inds, ]
  Z_train <- Z[train_inds]
  Z_test <- Z[test_inds]
  pi_train <- pi_x[train_inds]
  pi_test <- pi_x[test_inds]
  y_train <- y[train_inds]
  num_mcmc <- 10

  # Fit binary probit BCF model
  bcf_model <- bcf(
    X_train = X_train,
    Z_train = Z_train,
    y_train = y_train,
    propensity_train = pi_train,
    X_test = X_test,
    Z_test = Z_test,
    propensity_test = pi_test,
    num_gfr = 10,
    num_burnin = 0,
    num_mcmc = num_mcmc,
    general_params = list(
      sample_sigma2_global = FALSE,
      outcome_model = OutcomeModel(outcome = "binary", link = "probit")
    )
  )

  # Test posterior interval on linear scale
  interval_linear <- computeBCFPosteriorInterval(
    bcf_model,
    terms = "prognostic_function",
    level = 0.95,
    scale = "linear",
    X = X_test,
    Z = Z_test,
    propensity = pi_test
  )
  expect_true(is.list(interval_linear))
  expect_true("lower" %in% names(interval_linear))
  expect_true("upper" %in% names(interval_linear))
  expect_equal(length(interval_linear$lower), n_test)
  expect_equal(length(interval_linear$upper), n_test)
  expect_true(all(interval_linear$lower <= interval_linear$upper))

  # Test posterior interval on probability scale
  interval_prob <- computeBCFPosteriorInterval(
    bcf_model,
    terms = "prognostic_function",
    level = 0.95,
    scale = "probability",
    X = X_test,
    Z = Z_test,
    propensity = pi_test
  )
  expect_true(is.list(interval_prob))
  expect_equal(length(interval_prob$lower), n_test)
  expect_equal(length(interval_prob$upper), n_test)
  expect_true(all(interval_prob$lower >= 0 & interval_prob$lower <= 1))
  expect_true(all(interval_prob$upper >= 0 & interval_prob$upper <= 1))
  expect_true(all(interval_prob$lower <= interval_prob$upper))

  # Test contrast on linear scale (CATE: Z=1 vs Z=0)
  contrast_linear <- computeContrastBCFModel(
    bcf_model,
    X_0 = X_test,
    X_1 = X_test,
    Z_0 = rep(0, n_test),
    Z_1 = rep(1, n_test),
    propensity_0 = pi_test,
    propensity_1 = pi_test,
    type = "posterior",
    scale = "linear"
  )
  expect_true(is.matrix(contrast_linear))
  expect_equal(nrow(contrast_linear), n_test)
  expect_equal(ncol(contrast_linear), num_mcmc)

  # Test contrast on probability scale
  contrast_prob <- computeContrastBCFModel(
    bcf_model,
    X_0 = X_test,
    X_1 = X_test,
    Z_0 = rep(0, n_test),
    Z_1 = rep(1, n_test),
    propensity_0 = pi_test,
    propensity_1 = pi_test,
    type = "posterior",
    scale = "probability"
  )
  expect_true(is.matrix(contrast_prob))
  expect_equal(nrow(contrast_prob), n_test)
  expect_equal(ncol(contrast_prob), num_mcmc)

  # Test contrast with type = "mean" on linear scale
  contrast_mean_linear <- computeContrastBCFModel(
    bcf_model,
    X_0 = X_test,
    X_1 = X_test,
    Z_0 = rep(0, n_test),
    Z_1 = rep(1, n_test),
    propensity_0 = pi_test,
    propensity_1 = pi_test,
    type = "mean",
    scale = "linear"
  )
  expect_true(is.numeric(contrast_mean_linear))
  expect_equal(length(contrast_mean_linear), n_test)

  # Test contrast with type = "mean" on probability scale
  contrast_mean_prob <- computeContrastBCFModel(
    bcf_model,
    X_0 = X_test,
    X_1 = X_test,
    Z_0 = rep(0, n_test),
    Z_1 = rep(1, n_test),
    propensity_0 = pi_test,
    propensity_1 = pi_test,
    type = "mean",
    scale = "probability"
  )
  expect_true(is.numeric(contrast_mean_prob))
  expect_equal(length(contrast_mean_prob), n_test)
})

test_that("BCF gaussian: sample posterior predictive", {
  # Generate gaussian causal data
  set.seed(42)
  n <- 100
  p <- 3
  X <- matrix(runif(n * p), ncol = p)
  pi_x <- pnorm(X[, 1])
  Z <- rbinom(n, 1, pi_x)
  mu_x <- X[, 1] + X[, 2]
  tau_x <- rep(1, n)
  y <- mu_x + tau_x * Z + rnorm(n)

  # Train/test split
  test_set_pct <- 0.2
  n_test <- round(test_set_pct * n)
  test_inds <- sort(sample(1:n, n_test, replace = FALSE))
  train_inds <- (1:n)[!((1:n) %in% test_inds)]
  X_train <- X[train_inds, ]
  X_test <- X[test_inds, ]
  Z_train <- Z[train_inds]
  Z_test <- Z[test_inds]
  pi_train <- pi_x[train_inds]
  pi_test <- pi_x[test_inds]
  y_train <- y[train_inds]
  num_mcmc <- 10

  # Fit gaussian BCF model
  bcf_model <- bcf(
    X_train = X_train,
    Z_train = Z_train,
    y_train = y_train,
    propensity_train = pi_train,
    X_test = X_test,
    Z_test = Z_test,
    propensity_test = pi_test,
    num_gfr = 10,
    num_burnin = 0,
    num_mcmc = num_mcmc
  )

  # Test with multiple draws per sample
  ppd <- sampleBCFPosteriorPredictive(
    bcf_model,
    X = X_test,
    Z = Z_test,
    propensity = pi_test,
    num_draws_per_sample = 3
  )
  expect_equal(dim(ppd), c(n_test, num_mcmc, 3))
  expect_true(is.numeric(ppd))

  # Test with single draw per sample
  ppd1 <- sampleBCFPosteriorPredictive(
    bcf_model,
    X = X_test,
    Z = Z_test,
    propensity = pi_test,
    num_draws_per_sample = 1
  )
  expect_equal(dim(ppd1), c(n_test, num_mcmc))
  expect_true(is.numeric(ppd1))
})

test_that("BCF binary probit: sample posterior predictive", {
  # Generate binary probit causal data
  set.seed(42)
  n <- 100
  p <- 3
  X <- matrix(runif(n * p), ncol = p)
  pi_x <- pnorm(X[, 1])
  Z <- rbinom(n, 1, pi_x)
  mu_x <- X[, 1] + X[, 2]
  tau_x <- rep(0.5, n)
  prob_y1 <- pnorm(mu_x + tau_x * Z)
  y <- rbinom(n, 1, prob_y1)

  # Train/test split
  test_set_pct <- 0.2
  n_test <- round(test_set_pct * n)
  test_inds <- sort(sample(1:n, n_test, replace = FALSE))
  train_inds <- (1:n)[!((1:n) %in% test_inds)]
  X_train <- X[train_inds, ]
  X_test <- X[test_inds, ]
  Z_train <- Z[train_inds]
  Z_test <- Z[test_inds]
  pi_train <- pi_x[train_inds]
  pi_test <- pi_x[test_inds]
  y_train <- y[train_inds]
  num_mcmc <- 10

  # Fit binary probit BCF model
  bcf_model <- bcf(
    X_train = X_train,
    Z_train = Z_train,
    y_train = y_train,
    propensity_train = pi_train,
    X_test = X_test,
    Z_test = Z_test,
    propensity_test = pi_test,
    num_gfr = 10,
    num_burnin = 0,
    num_mcmc = num_mcmc,
    general_params = list(
      sample_sigma2_global = FALSE,
      outcome_model = OutcomeModel(outcome = "binary", link = "probit")
    )
  )

  # Test with multiple draws per sample
  ppd <- sampleBCFPosteriorPredictive(
    bcf_model,
    X = X_test,
    Z = Z_test,
    propensity = pi_test,
    num_draws_per_sample = 3
  )
  expect_equal(dim(ppd), c(n_test, num_mcmc, 3))
  expect_true(all(ppd %in% c(0, 1)))

  # Test with single draw per sample
  ppd1 <- sampleBCFPosteriorPredictive(
    bcf_model,
    X = X_test,
    Z = Z_test,
    propensity = pi_test,
    num_draws_per_sample = 1
  )
  expect_equal(dim(ppd1), c(n_test, num_mcmc))
  expect_true(all(ppd1 %in% c(0, 1)))
})