################################################################################

context("SPLIT_LD")

################################################################################

test_that("get_L() and get_C() work", {

  corr <- as(outer(1:4 / 10, 1:4 / 10, '+'), "dgCMatrix")
  diag(corr) <- 1

  L <- matrix(0, 4, 4)
  L[1, 2] <- sum(corr[1, 2:4]^2)
  L[1, 3] <- sum(corr[1, 3:4]^2)
  L[1, 4] <- sum(corr[1, 4:4]^2)
  L[2, 3] <- sum(corr[2, 3:4]^2)
  L[2, 4] <- sum(corr[2, 4:4]^2)
  L[3, 4] <- sum(corr[3, 4:4]^2)

  # Rcpp::sourceCpp('src/split-LD.cpp')
  library(magrittr)
  L2 <- corr %>%
    Matrix::tril() %>%
    { bigsnpr:::get_L(.@p, .@i, .@x, thr_r2 = 0, max_r2 = 1) } %>%
    # L now has an extra column with all 0s for convenience
    { Matrix::sparseMatrix(i = .[[1]], j = .[[2]], x = .[[3]], dims = c(4, 5),
                           triangular = FALSE, index1 = FALSE) }

  expect_equal(L2[, 1:4], as(L, "dgCMatrix"))

  # E <- matrix(0, 4, 4)
  # E[1, 1] <- sum(corr[  1, 2:4]^2)
  # E[1, 2] <- sum(corr[1:2, 3:4]^2)
  # E[1, 3] <- sum(corr[1:3,   4]^2)
  # E[2, 2] <- sum(corr[  2, 3:4]^2)
  # E[2, 3] <- sum(corr[2:3,   4]^2)
  # E[3, 3] <- sum(corr[  3,   4]^2)

  path_cost1 <- bigsnpr:::get_C(L2, min_size = 1, max_size = 4,
                                max_K = 5, max_cost = Inf,
                                pos_scaled = rep(0, ncol(corr)))
  # one block remaining
  expect_identical(path_cost1$best_ind[, 1], rep(4L, 4))
  expect_identical(path_cost1$C[, 1], rep(0, 4))
  # two blocks remaining -> last is impossible
  expect_identical(path_cost1$best_ind[, 2], c(1L, 2L, 3L, NA))
  expect_equal(path_cost1$C[, 2], c(0.5, 0.61, 0.49, Inf))
  # three blocks remaining
  expect_identical(path_cost1$best_ind[, 3], c(1L, 2L, NA, NA))
  expect_equal(path_cost1$C[, 3], c(1.11, 1.1, Inf, Inf), tolerance = 1e-6)
  # four blocks remaining
  expect_identical(path_cost1$best_ind[, 4], c(1L, NA, NA, NA))
  expect_equal(path_cost1$C[, 4], c(1.6, Inf, Inf, Inf))
  # five blocks remaining -> all impossible
  expect_identical(path_cost1$best_ind[, 5], rep(NA_integer_, 4))
  expect_identical(path_cost1$C[, 5], rep(Inf, 4))

  # must use two blocks of two
  path_cost2 <- bigsnpr:::get_C(L2, min_size = 2, max_size = 2,
                                max_K = 3, max_cost = Inf,
                                pos_scaled = rep(1, ncol(corr)))
  # one block remaining
  expect_identical(path_cost2$best_ind[, 1], c(NA, NA, 4L, NA))
  expect_identical(path_cost2$C[, 1], c(Inf, Inf, 0, Inf))
  # two blocks remaining
  expect_identical(path_cost2$best_ind[, 2], c(2L, NA, NA, NA))
  expect_equal(path_cost2$C[, 2], c(1.02, Inf, Inf, Inf), tolerance = 1e-6)
  # three blocks remaining -> always impossible
  expect_identical(path_cost2$best_ind[, 3], rep(NA_integer_, 4))
  expect_identical(path_cost2$C[, 3], rep(Inf, 4))

  path_cost3 <- bigsnpr:::get_C(L2, min_size = 1, max_size = 3,
                                max_K = 3, max_cost = Inf,
                                pos_scaled = seq(0, 1, length.out = ncol(corr)))
  # one block remaining
  expect_identical(path_cost3$best_ind[, 1], c(NA, 4L, 4L, 4L))
  expect_identical(path_cost3$C[, 1], c(Inf, 0, 0, 0))
  # two blocks remaining
  expect_identical(path_cost3$best_ind[, 2], c(1L, 2L, 3L, NA))
  expect_equal(path_cost3$C[, 2], c(0.5, 0.61, 0.49, Inf))
  # three blocks remaining
  expect_identical(path_cost3$best_ind[, 3], c(1L, 2L, NA, NA))
  expect_equal(path_cost3$C[, 3], c(1.11, 1.10, Inf, Inf), tolerance = 1e-6)

  # pos_scaled works
  expect_null(snp_ldsplit(corr, thr_r2 = 0, max_r2 = 1, min_size = 1,
                          max_size = 3, max_K = 3, max_cost = Inf,
                          pos_scaled = cols_along(corr) * 2))

  test <- snp_ldsplit(corr, thr_r2 = 0, max_r2 = 1, min_size = 1,
                      max_size = 3, max_K = 4, max_cost = Inf,
                      pos_scaled = cols_along(corr) * 2)
  expect_equal(nrow(test), 1)

  path_cost4 <- bigsnpr:::get_C(L2, min_size = 1, max_size = 3,
                                max_K = 4, max_cost = Inf,
                                pos_scaled = cols_along(corr) * 2)


  # one block remaining
  expect_identical(path_cost4$best_ind[, 1], c(NA, NA, NA, 4L))
  expect_identical(path_cost4$C[, 1], c(Inf, Inf, Inf, 0))
  # two blocks remaining
  err <- corr[3, 4]^2
  expect_identical(path_cost4$best_ind[, 2], c(NA, NA, 3L, NA))
  expect_equal(path_cost4$C[, 2], c(Inf, Inf, err, Inf))
  # three blocks remaining
  err <- err + sum(corr[2, 3:4]^2)
  expect_identical(path_cost4$best_ind[, 3], c(NA, 2L, NA, NA))
  expect_equal(path_cost4$C[, 3], c(Inf, err, Inf, Inf), tolerance = 1e-6)
  # four blocks remaining
  err <- err + sum(corr[1, 2:4]^2)
  expect_identical(path_cost4$best_ind[, 4], c(1L, NA, NA, NA))
  expect_equal(path_cost4$C[, 4], c(err, Inf, Inf, Inf), tolerance = 1e-6)
})

################################################################################

test_that("$perc_kept in snp_ldsplit() is exact", {

  corr <- as(outer(1:4 / 10, 1:4 / 10, '+'), "dgCMatrix")
  diag(corr) <- 1

  test <- snp_ldsplit(corr, 0, 1, 2, 4, max_r2 = 1, max_cost = Inf)
  expect_identical(test$n_block, 2:4)
  expect_identical(test$perc_kept, c(8, 6, 4) / 16)
})

################################################################################

test_that("snp_ldsplit() gives consistent results", {

  compute_cost <- function(block_num, corr, thr_r2) {
    corr %>%
      Matrix::tril() %>%
      as("dgTMatrix") %>%
      { .@x[block_num[.@i + 1L] != block_num[.@j + 1L]]^2 } %>%
      { sum(.[. >= thr_r2]) }
  }

  compute_max_out <- function(block_num, corr) {
    corr %>%
      Matrix::tril() %>%
      as("dgTMatrix") %>%
      { .@x[block_num[.@i + 1L] != block_num[.@j + 1L]]^2 } %>%
      max()
  }

  corr <- readRDS(test_path("testdata/spMat.rds"))

  res1 <- snp_ldsplit(corr, thr_r2 = 0.02, min_size = 10, max_size = 30,
                      max_K = 50, max_r2 = 1, max_cost = Inf)
  # 30 * 13 = 390 < 401
  # 10 * 40 = 400 < 401
  expect_identical(res1$n_block, 14:40)

  block_num <- lapply(res1$all_size, function(sizes) rep(seq_along(sizes), sizes))
  costs <- sapply(block_num, function(nums) compute_cost(nums, corr, thr_r2 = 0.02))
  expect_equal(res1$cost, costs)

  prev_res1 <- readRDS(test_path("testdata/split_before.rds"))  # < v1.10.1
  expect_equal(res1$cost, prev_res1$cost)  # only costs are the same

  res2 <- snp_ldsplit(corr, thr_r2 = runif(1, 0, 0.2), min_size = 20, max_size = 40,
                      max_K = 50, max_r2 = 1, max_cost = Inf)
  expect_identical(res2$n_block, 11:20)

  res3 <- snp_ldsplit(corr, thr_r2 = runif(1, 0, 0.2), min_size = 20, max_size = 40,
                      max_K = 15, max_r2 = 1, max_cost = Inf)
  expect_identical(res3$n_block, 11:15)

  # parameter 'max_cost' works
  max_cost <- runif(1, min(res1$cost), max(res1$cost))
  res4 <- snp_ldsplit(corr, thr_r2 = 0.02, min_size = 10, max_size = 30,
                      max_K = 50, max_r2 = 1, max_cost = max_cost)
  expect_true(all(res4$cost <= max_cost))
  res1.bad <- subset(res1, !n_block %in% res4$n_block)
  expect_true(all(res1.bad$cost > max_cost))

  # parameter 'max_r2' works
  max_r2 <- runif(1, 0.1, 0.4)
  res5 <- snp_ldsplit(corr, thr_r2 = 0.02, min_size = 10, max_size = 50,
                      max_K = 100, max_r2 = max_r2, max_cost = Inf)
  sapply(res5$all_size, function(sizes) {
     block_num <- rep(seq_along(sizes), sizes)
     max_r2_out <- compute_max_out(block_num, corr)
     expect_lte(max_r2_out, max_r2)
     max_r2_out / max_r2
  })

  # can provide multiple 'max_size'
  max_r2 <- runif(1, 0.3, 1)
  res6 <- snp_ldsplit(corr, thr_r2 = 0.02, min_size = 10, max_size = 30,
                      max_K = 50, max_r2 = max_r2, max_cost = Inf)
  res7 <- snp_ldsplit(corr, thr_r2 = 0.02, min_size = 10, max_size = 40,
                      max_K = 50, max_r2 = max_r2, max_cost = Inf)
  res67 <- snp_ldsplit(corr, thr_r2 = 0.02, min_size = 10,
                       max_size = sample(c(30, 40)),
                       max_K = 50, max_r2 = max_r2, max_cost = Inf)
  expect_equal(res67[-1], unique(rbind(res6, res7)[-1]))
})

################################################################################