context("Case of NA values in LD estimates")

check_missing_ld <- function() {
  skip_if_not_installed("snpStats")
  
  #Sys.setenv("OMP_THREAD_LIMIT" = 2)
  data("ld.example", package = "snpStats")
  p <- ncol(ceph.1mb)
  h <- p - 1
  ld.ceph <- snpStats::ld(ceph.1mb, depth = h, stats = "R.squared")
  if (!any(is.na(ld.ceph)))  {
    skip("No NA value: nothing to test here!")
  }
  sf <- system.file("data/ld.example.RData", package="snpStats")
  expected <- "497fcd532b5c2bcb082a0dad7ca0d44d"
  if (!(tools::md5sum(sf) == expected)) {
    skip("Different version of data('ld.example', package = 'snpStats')")
  }
}

test_that("NA values in LD estimates gives a warning/error in 'snpClust'", {
  skip_if_not_installed("snpStats")
  check_missing_ld()
  
  #Sys.setenv("OMP_THREAD_LIMIT" = 2)
  data("ld.example", package = "snpStats")
  p <- ncol(ceph.1mb)
  h <- p - 1
    
  ld.ceph <- snpStats::ld(ceph.1mb, depth = h, stats = "R.squared")
  ## In this unrealistically small example with only 90 subjects, it turns out
  ## that one of the LD estimates is NA due to the lack of genetic diversity in
  ## the sample:
  expect_warning(snpClust(ceph.1mb, h = h, stats = "R.squared"))
  expect_error(snpClust(ld.ceph, h = h))
})

test_that("NA values in LD estimates gives a warning/error in 'snpClust' (second version)", {
  # when check_missing_ld() skips the previous test: it means that snpClust does not produce NA
  skip_if_not_installed("snpStats")

  #Sys.setenv("OMP_THREAD_LIMIT" = 2)
  data("ld.example", package = "snpStats")
  p <- ncol(ceph.1mb)
  h <- p - 1
  
  ld.ceph <- snpStats::ld(ceph.1mb, depth = h, stats = "R.squared")
  ld.ceph[9,8] <- ld.ceph[8,9] <- NA # manually add NA
  
  expect_error(snpClust(ld.ceph, h = h))

  mat <- matrix(1, nrow = 10, ncol = 2)
  mat <- as(mat, "SnpMatrix")
  expect_warning(snpClust(mat))
})

## One way to correct this is to drop one of the incriminated SNPs
test_that("Dropping a SNP yielding NA values in LD fixes the NA problem", {
  skip_if_not_installed("snpStats")
  check_missing_ld()
  
  #Sys.setenv("OMP_THREAD_LIMIT" = 2)
  geno <- ceph.1mb[, -316]  ## drop one SNP leading to one missing LD value
  p <- ncol(geno)
  h <- p - 1
  
  ld.ceph <- snpStats::ld(geno, depth = h, stats = "R.squared", 
                          symmetric = TRUE)
  expect_true(all(!is.na(ld.ceph)))
  
  ## avoid LD values slightly greater than 1
  expect_warning(fit <- snpClust(geno, h = h, stats = "R.squared"),
                 "Forcing the LD similarity to be smaller than or equal to 1")
  expect_warning(fit10 <- snpClust(ld.ceph, h = h, stats = "R.squared"),
                 "Forcing the LD similarity to be smaller than or equal to 1")
  expect_identical(fit, fit10)
})

## Another way to correct this is to modify the genotype of a single sample
test_that("Modifying one genotype also fixes the NA problem", {
  skip_if_not_installed("snpStats")
  check_missing_ld()
  
  #Sys.setenv("OMP_THREAD_LIMIT" = 2)
  data("ld.example", package = "snpStats")
  p <- ncol(ceph.1mb)
  h <- p - 1
  nSamples <- nrow(ceph.1mb)

  ceph.1mb[4,286]@.Data[1,1] <- as.raw(3) ## to avoid NaNs
  ld.ceph <- snpStats::ld(ceph.1mb, depth = h, stats = "R.squared",
                          symmetric = TRUE)
  expect_true(all(!is.na(ld.ceph)))
  
  ## avoid LD values slightly greater than 1
  # ld.ceph10 <- round(ld.ceph, digits = 10)  
  expect_warning(fit <- snpClust(ceph.1mb, h = h, stats = "R.squared"),
                 "Forcing the LD similarity to be smaller than or equal to 1")
  expect_warning(fit10 <- snpClust(ld.ceph, h = h, stats = "R.squared"),
                 "Forcing the LD similarity to be smaller than or equal to 1")
  expect_identical(fit, fit10)
})