# load required functions and packages
library("testthat")
suppressWarnings(library("SuperLearner"))

# generate the data -- note that this is a simple setting, for speed -----------
set.seed(4747)
p <- 2
n <- 5e4
x <- replicate(p, stats::rnorm(n, 0, 1))
# apply the function to the x's
y <- 1 + 0.5 * x[, 1] + 0.75 * x[, 2] + stats::rnorm(n, 0, 1)
true_var <- mean((y - mean(y)) ^ 2)
# note that true difference in R-squareds for variable j, under independence, is 
# beta_j^2 * var(x_j) / var(y)
r2_one <- 0.5 ^ 2 * 1 / true_var
r2_two <- 0.75 ^ 2 * 1 / true_var

# split the data in half
samp <- sample(1:n, n/2, replace = FALSE)
y_samp <- y[samp]
x_samp <- x[samp, ]
y_nsamp <- y[-samp]
x_nsamp <- x[-samp, ]
# set up hypothesis testing folds
folds <- sample(rep(seq_len(2), length = length(samp)))

# set up a library for SuperLearner
learners <- c("SL.glm", "SL.mean")
V <- 2

# average variable importance --------------------------------------------------
# fit the data with all covariates
set.seed(1234)
full_fit_1 <- SuperLearner::SuperLearner(Y = y_samp, 
                                         X = as.data.frame(x_samp),
                           SL.library = learners, cvControl = list(V = V))
full_fitted_1 <- SuperLearner::predict.SuperLearner(full_fit_1)$pred

full_fit_2 <- SuperLearner::SuperLearner(Y = y_nsamp,
                                         X = as.data.frame(x_nsamp), 
                           SL.library = learners, cvControl = list(V = V))
full_fitted_2 <- SuperLearner::predict.SuperLearner(full_fit_2)$pred

# fit the first split; importance for X2
reduced_fit_1 <- SuperLearner::SuperLearner(Y = full_fitted_1, 
                              X = as.data.frame(x_samp[, -2, drop = FALSE]), 
                              SL.library = learners, cvControl = list(V = V))
reduced_fitted_1 <- SuperLearner::predict.SuperLearner(reduced_fit_1)$pred

# fit the second split; importance for X2
reduced_fit_2 <- SuperLearner::SuperLearner(Y = full_fitted_2, 
                              X = as.data.frame(x_nsamp[, -2, drop = FALSE]), 
                              SL.library = learners, cvControl = list(V = V))
reduced_fitted_2 <- SuperLearner::predict.SuperLearner(reduced_fit_2)$pred

test_that("Averaging variable importance estimates works", {
  est_1 <- vim(Y = y_samp, f1 = full_fitted_1, f2 = reduced_fitted_1, 
               run_regression = FALSE, indx = 2, type = "r_squared", 
               sample_splitting = FALSE)
  est_2 <- vim(Y = y_nsamp, f1 = full_fitted_2, f2 = reduced_fitted_2, 
               run_regression = FALSE, indx = 2, type = "r_squared", 
               sample_splitting = FALSE)

  est <- average_vim(est_1, est_2)
  expect_equal(est$est, r2_two, 
               tolerance = 0.2, scale = 1)
  expect_length(est$mat, 7)
  expect_output(print(est), "Estimate", fixed = TRUE)
})