test_that("test full enumeration cases: r and r-lean", {

#' @srrstats {G5.4} **Correctness tests** *to test that statistical algorithms
#' produce expected results to some fixed test data sets (potentially through
#' comparisons using binding frameworks such as
#' [RStata](https://github.com/lbraglia/RStata)).* Several correctness
#' tests are implemented. First, it is tested if the non-bootstrapped
#' t-statistics
#' produced via boottest() *exactly* match those computed by the fixest package
#' (see test_tstat_equivalence). Second, `fwildclusterboot` is heavily tested
#'  against `WildBootTests.jl` - see "test-r-vs-julia". Last, multiple R
#'  implementations of the WCB are tested against each other.
#' @srrstats {G5.6} **Parameter recovery tests** *to test that the
#' implementation produce expected results given data with known properties.
#' For instance, a linear regression algorithm should return expected
#' coefficient values for a simulated data set generated from a linear model.*
#' Done. Non-bootstrapped t-stats are tested against t-stats and F-stats
#' computed by the fixest package (see test_tstat_equivalence.R). Also, tests
#'  if bootstrapped p-values are deterministic under "full enumeration"
#'  (test-seed.R).
#' @srrstats {G5.6a} *Parameter recovery tests should generally be expected
#' to succeed within a defined tolerance rather than recovering exact values.*
#'  t-stat equivalence is tested "exactly", r vs Julia is tested with tolerance.


  # note: these tests are deterministic! therefore exact.

  reltol <- 0.05

  N <- 2000

  set.seed(2351)
  dqrng::dqset.seed(2351)

  data1 <<- fwildclusterboot:::create_data(
    N = N,
    N_G1 = 7,
    icc1 = 0.5,
    N_G2 = 20,
    icc2 = 0.2,
    numb_fe1 = 10,
    numb_fe2 = 10,
    seed = 55530,
    weights = 1:N / N
  )

  lm_fit <- lm(proposition_vote ~ treatment + log_income,
    data = data1
  )



  lm_fit_weights <- lm(proposition_vote ~ treatment + log_income,
    weights = data1$weights,
    data = data1
  )

  lm_fits <- list(
    ols = lm_fit
    # ,wls = lm_fit_weights
  )



  for (object in lm_fits) {
    # for (p_val_type in c("two-tailed", "equal-tailed", ">", "<")) {
    for (impose_null in c(TRUE)) {
      boot_r <- suppressWarnings(
        boottest(
          object,
          clustid = "group_id1",
          B = 999,
          param = "treatment",
          impose_null = impose_null,
          conf_int = FALSE,
          ssc = boot_ssc(adj = FALSE, cluster.adj = FALSE)
        )
      )


      boot_r_lean <- suppressWarnings(
        boottest(
          object,
          clustid = "group_id1",
          B = 999,
          param = "treatment",
          impose_null = impose_null,
          engine = "R-lean",
          conf_int = FALSE,
          ssc = boot_ssc(adj = FALSE, cluster.adj = FALSE)
        )
      )

      boot_fnr <- suppressWarnings(
        boottest(
          object,
          clustid = "group_id1",
          B = 999,
          param = "treatment",
          impose_null = impose_null,
          engine = "R",
          bootstrap_type = "11",
          conf_int = FALSE,
          ssc = boot_ssc(adj = FALSE, cluster.adj = FALSE)
        )
      )

      # note: difference in p-values due to discrete jumps:
      # t-statistics are not directly identical (discrepancies at order e-14)
      # therefore, the actual t-statistic t can be smaller, larger, or lie within
      # the bootstrapped test statisics for which all weights are 1 or -1
      # as there are only 2^N_G_bootcluster t-stats and only
      # 2^(N_G_bootcluster -1 )
      # different t-stats in abs values, the p-value needs to be
      # x / (2^N_G_bootcluster),
      # with x any integer
      # and depending where t lies (smaller, larger, or within), the p-value can
      # be x / (2^N_G_bootcluster), (x-1) / (2^N_G_bootcluster),
      # (x+1) / (2^N_G_bootcluster)
      # I therefore set the tolerance to 1 / (2^N_G_bootcluster) for all p-values
      # see sum(sort(boot_r$t_boot) - sort(boot_jl$t_boot)) != 0L


      expect_true(
        boot_r_lean$p_val %in% (boot_r$p_val + -2:2 * 1 / 2^boot_r$N_G)
      )
      expect_true(
        boot_fnr$p_val %in% (boot_r$p_val + -2:2 * 1 / 2^boot_r$N_G)
      )

      expect_equal(
        boot_r_lean$t_stat,
        boot_r$t_stat,
        ignore_attr = TRUE
      )
      expect_equal(
        boot_fnr$t_stat,
        boot_r$t_stat,
        ignore_attr = TRUE
      )

      expect_equal(
        sort(boot_r_lean$t_boot),
        sort(boot_r$t_boot),
        ignore_attr = TRUE
      )
      expect_equal(
        sort(boot_fnr$t_boot),
        sort(boot_r$t_boot),
        ignore_attr = TRUE
      )


      # check: with 9 clusters, 2^9 unique draws -
      # hence the p-values calculated
      # must be an integer if multiplied by 512

      expect_true(
        (boot_r$p_val * 2^boot_r$N_G) %% 1 == 0
      )
      expect_true(
        (boot_r_lean$p_val * 2^boot_r_lean$N_G) %% 1 == 0
      )


      # multi-param hypotheses
      # cat("Check 2:", "\n")

      boot_r <- suppressWarnings(
        boottest(
          object,
          clustid = "group_id1",
          B = 999,
          param = c("treatment", "log_income"),
          R = c(-0.1, 0.1),
          r = 0.1,
          impose_null = impose_null,
          conf_int = FALSE
        )
      )
      boot_r_lean <- suppressWarnings(
        boottest(
          object,
          clustid = "group_id1",
          B = 999,
          param = c("treatment", "log_income"),
          R = c(-0.1, 0.1),
          r = 0.1,
          impose_null = impose_null,
          engine = "R-lean",
          conf_int = FALSE
        )
      )


      expect_true(
        boot_r_lean$p_val %in% (boot_r$p_val + -2:2 * 1 / 2^boot_r$N_G)
      )
      expect_equal(
        boot_r_lean$t_stat,
        boot_r$t_stat,
        ignore_attr = TRUE
      )
      expect_equal(
        sort(boot_r_lean$t_boot),
        sort(boot_r$t_boot),
        ignore_attr = TRUE
      )

      # check: with 9 clusters, 2^9 unique draws -
      # hence the p-values calculated
      # must be an integer if multiplied by 512

      expect_true(
        (boot_r$p_val * 2^boot_r$N_G) %% 1 == 0
      )
      expect_true(
        (boot_r_lean$p_val * 2^boot_r_lean$N_G) %% 1 == 0
      )
    }
  }
})




test_that("test full enumeration cases: r and julia", {
  # note: these tests are deterministic! therefore exact.

  skip_on_cran()
  skip_if_not(
    fwildclusterboot:::find_proglang("julia"),
    message = "skip test as julia installation not found."
  )

  reltol <- 0.05

  N <- 2000
  seed <- 187019

  data1 <<- fwildclusterboot:::create_data(
    N = N,
    N_G1 = 8,
    icc1 = 0.5,
    N_G2 = 20,
    icc2 = 0.2,
    numb_fe1 = 10,
    numb_fe2 = 10,
    seed = seed,
    weights = 1:N / N
  )

  lm_fit <- lm(proposition_vote ~ treatment + log_income,
    data = data1
  )



  lm_fit_weights <- lm(proposition_vote ~ treatment + log_income,
    weights = data1$weights,
    data = data1
  )

  lm_fits <- list(
    ols = lm_fit
    # ,wls = lm_fit_weights
  )

  for (object in lm_fits) {
    for (impose_null in c(TRUE, FALSE)) {
      boot_r <- suppressWarnings(
        boottest(
          object,
          clustid = "group_id1",
          B = 999,
          param = "treatment",
          impose_null = impose_null,
          conf_int = FALSE
        )
      )

      boot_jl <- suppressWarnings(
        boottest(object,
          clustid = "group_id1",
          B = 999,
          param = "treatment",
          impose_null = impose_null,
          engine = "WildBootTests.jl",
          conf_int = FALSE
        )
      )

      # note: difference in p-values due to discrete jumps:
      # t-statistics are not directly identical
      # (discrepancies at order e-14)
      # therefore, the actual t-statistic t can be smaller, larger,
      # or lie within
      # the bootstrapped test statisics for which all weights are 1 or -1
      # as there are only 2^N_G_bootcluster t-stats and only
      # 2^(N_G_bootcluster -1 )
      # different t-stats in abs values, the p-value needs to be
      # x / (2^N_G_bootcluster),
      # with x any integer
      # and depending where t lies (smaller, larger, or within),
      # the p-value can
      # be x / (2^N_G_bootcluster), (x-1) / (2^N_G_bootcluster),
      # (x+1) / (2^N_G_bootcluster)
      # I therefore set the tolerance to 1 / (2^N_G_bootcluster) for
      # all p-values
      # see sum(sort(boot_r$t_boot) - sort(boot_jl$t_boot)) != 0L

      expect_true(
        boot_r$p_val %in% (boot_jl$p_val + -2:2 * 1 / 2^boot_r$N_G)
      )

      expect_equal(
        boot_r$t_stat,
        boot_jl$t_stat,
        ignore_attr = TRUE
      )
      expect_equal(
        sort(boot_r$t_boot),
        sort(boot_jl$t_boot),
        ignore_attr = TRUE
      )



      # check: with 9 clusters, 2^9 unique draws -
      # hence the p-values calculated
      # must be an integer if multiplied by 512

      expect_true(
        (boot_r$p_val * 2^boot_r$N_G) %% 1 == 0
      )

      expect_true(
        (boot_jl$p_val * 2^boot_jl$N_G) %% 1 == 0
      )

      # multi-param hypotheses
      # cat("Check 2:", "\n")

      boot_r <- suppressWarnings(
        boottest(
          object,
          clustid = "group_id1",
          B = 999,
          param = c("treatment", "log_income"),
          R = c(-0.1, 0.1),
          r = 0.1,
          impose_null = impose_null,
          conf_int = FALSE
        )
      )

      boot_jl <- suppressWarnings(
        boottest(
          object,
          clustid = "group_id1",
          B = 999,
          param = c("treatment", "log_income"),
          R = c(-0.1, 0.1),
          r = 0.1,
          impose_null = impose_null,
          engine = "WildBootTests.jl",
          conf_int = FALSE
        )
      )

      expect_true(
        boot_r$p_val %in% (boot_jl$p_val + -2:2 * 1 / 2^boot_r$N_G)
      )
      expect_equal(
        boot_r$t_stat,
        boot_jl$t_stat,
        ignore_attr = TRUE
      )
      expect_equal(
        sort(boot_r$t_boot),
        sort(boot_jl$t_boot),
        ignore_attr = TRUE
      )



      # check: with 9 clusters, 2^9 unique draws -
      # hence the p-values calculated
      # must be an integer if multiplied by 512

      expect_true(
        (boot_r$p_val * 2^boot_r$N_G) %% 1 == 0
      )

      expect_true(
        (boot_jl$p_val * 2^boot_jl$N_G) %% 1 == 0
      )
    }
  }
})