set.seed(20051920) # 20051920 is derived from 'test'

n <- 100
d <- dplyr::tibble(
    uid = 1:n,
    cov1 = sample(rep(1:2, c(n / 2, n / 2)), n, replace = TRUE),
    cov2 = sample(rep(1:2, c(n / 2, n / 2)), n, replace = TRUE)
)

test_that("make_splits output", {
        checkmate::expect_data_frame(
            make_splits(d, uid, .num_splits = 2),
            any.missing = FALSE,
            nrows = 100,
            ncols = 4,
            col.names = "named"
        )
        expect_true(".split_id" %in% names(make_splits(d, uid, .num_splits = 2)))
})

s_df <- make_splits(d, uid, .num_splits = 2)
s_id <- s_df$.split_id
test_that("2 splits are ok", {
    expect_length(unique(s_id), 2)
    checkmate::expect_integerish(s_id, len = n, any.missing = FALSE, lower = 1, upper = 2)
    expect_true(all(table(s_id) == (n / 2)))
})

test_that("resplitting works", {
    expect_error(s_df2 <- make_splits(s_df, uid, .num_splits = 4), NA)
    checkmate::expect_data_frame(s_df2, min.rows = n, max.rows = n)
    expect_true(".split_id" %in% names(s_df2))
})

test_that("check_splits", {
    expect_error(check_splits(d), "You must first construct splits with `tidyhte::make_splits`.")
})

test_that("check that splits can be used to create dataframes", {
    checkmate::expect_r6(split_data(s_df, 1), classes = "HTEFold")
    expect_error(split_data(d, 1), "Must construct split identifiers before splitting.")
})

test_that("train split is larger than holdout", {
    s_df <- make_splits(d, uid, .num_splits = 4)
    fold <- split_data(s_df, 1)
    checkmate::expect_r6(fold, classes = "HTEFold")
    expect_true(nrow(fold$train) == (3 * nrow(fold$holdout)))
    expect_true(nrow(fold$holdout) == sum(fold$in_holdout))
})

s_id <- make_splits(d, uid, .num_splits = 7)$.split_id
test_that("check that 7 splits are ok", {
    expect_length(unique(s_id), 7)
    checkmate::expect_integerish(s_id, len = n, any.missing = FALSE, lower = 1, upper = 10)
    checkmate::expect_integer(table(s_id), lower = floor(n / 7), upper = ceiling(n / 7))
})

n <- 100
d <- dplyr::tibble(
    uid = 1:n,
    cov1 = sample(rep(1:2, c(n / 2, n / 2)), n, replace = TRUE),
    cov2 = sample(rep(1:2, c(n / 20, 19 * n / 20)), n, replace = TRUE)
)

s_df <- make_splits(d, uid, cov2, .num_splits = 2)
test_that("check that stratification on one variable works", {
    s_df %>%
    dplyr::group_by(cov2) %>%
    dplyr::summarize(all_splits = all(1:2 %in% .split_id)) %>%
    dplyr::summarize(all_splits = all(all_splits)) %>%
    unlist() -> result
    expect_true(result)
})

s_df <- make_splits(d, uid, cov1, cov2, .num_splits = 2)
test_that("check that stratification on multiple variables works", {
    s_df %>%
    dplyr::group_by(cov2) %>%
    dplyr::summarize(all_splits = all(1:2 %in% .split_id)) %>%
    dplyr::summarize(all_splits = all(all_splits)) %>%
    unlist() -> result
    expect_true(result)

    s_df %>%
    dplyr::group_by(cov1) %>%
    dplyr::summarize(all_splits = all(1:2 %in% .split_id)) %>%
    dplyr::summarize(all_splits = all(all_splits)) %>%
    unlist() -> result
    expect_true(result)
})


d <- dplyr::tibble(
    uid = paste0("uid is ", as.character(1:n)),
    cov1 = sample(rep(1:2, c(n / 2, n / 2)), n, replace = TRUE),
    cov2 = sample(rep(1:2, c(n / 2, n / 2)), n, replace = TRUE)
)

test_that("character uids", {
        checkmate::expect_data_frame(
            make_splits(d, uid, .num_splits = 2),
            any.missing = FALSE,
            nrows = 100,
            ncols = 4,
            col.names = "named"
        )
        expect_true(".split_id" %in% names(make_splits(d, uid, .num_splits = 2)))
})


d <- dplyr::tibble(
    uid = paste0("uid is ", as.character(1:n)),
    cov1 = sample(rep(1:2, c(n / 2, n / 2)), n, replace = TRUE),
    cov2 = sample(rep(1:2, c(n / 2, n / 2)), n, replace = TRUE),
    cov3 = rnorm(n),
    cov4 = runif(n)
)

test_that("lots of covariates informing the strata", {
        newd <- make_splits(d, uid, cov1, cov2, cov3, cov4, .num_splits = 2)
        checkmate::expect_data_frame(
            newd,
            any.missing = FALSE,
            nrows = 100,
            ncols = 6,
            col.names = "named"
        )
        expect_true(".split_id" %in% names(newd))
})

d <- dplyr::tibble(
    uid = rep(paste0("uid is ", as.character(1:(n / 2))), 2),
    cov1 = sample(rep(1:2, c(n / 2, n / 2)), n, replace = TRUE),
    cov2 = sample(rep(1:2, c(n / 2, n / 2)), n, replace = TRUE),
    cov3 = rnorm(n),
    cov4 = runif(n)
)

test_that("clustered data", {
        newd <- make_splits(d, uid, cov1, cov2, cov3, cov4, .num_splits = 2)
        checkmate::expect_data_frame(
            newd,
            any.missing = FALSE,
            nrows = 100,
            ncols = 6,
            col.names = "named"
        )
        expect_true(".split_id" %in% names(newd))
})

test_that("splitting works when quickblock isn't installed", {
    expect_message(
        mockr::with_mock(
            package_present = function(x) FALSE,
            {
                make_splits(d, uid, cov1, .num_splits = 4)
            }
        ),
        "`quickblock` is not installed, so falling back to un-stratified CV."
    )
})