context("disclapmix")

# https://bugs.r-project.org/bugzilla/show_bug.cgi?id=17494
suppressWarnings(RNGversion("3.5.0"))

ESTIMATION_TOL_DUE_TO_SAMPLING <- 1e-3

##################################################
# One center, one locus
##################################################
set.seed(1)
p <- 0.3 # Dispersion parameter 
y <- 13 # Location parameter 
x <- disclap::rdisclap(1000, p) + y # Generate a sample using the rdisclap function

y_hat <- median(x)
mu_hat <- mean(abs(x - y_hat))
p_hat <- mu_hat^(-1) * (sqrt(mu_hat^2 + 1) - 1)

test_that("rdisclap", {
  expect_equal(y_hat, y)
  expect_equal(p_hat, p, tol = ESTIMATION_TOL_DUE_TO_SAMPLING)
})

xmat <- as.matrix(as.integer(x))

init_y <- 12L
fit <- disclapmix(x = xmat, 
                  clusters = 1L, 
                  init_y = matrix(init_y), 
                  init_y_method = NULL)

test_that("disclapmix 1 center 1 locus", {
  expect_equal(as.integer(fit$init_y), init_y)
  expect_equal(as.integer(fit$y), y)
  expect_equal(c(fit$disclap_parameters), p, tol = ESTIMATION_TOL_DUE_TO_SAMPLING)
  expect_equal(c(fit$disclap_parameters), p_hat)
  expect_output(print(fit), "disclapmixfit from 1000 observations on 1 loci with 1 clusters.", 
                fixed = TRUE)
  expect_output(summary(fit), "disclapmixfit from 1000 observations on 1 loci with 1 clusters.

EM converged:                                                       FALSE
Number of central haplotype changes:                                0
Total number of EM iterations:                                      2
Model observations (n*loci*clusters):                               1000
Model parameters ((clusters*loci)+(loci+clusters-1)+(clusters-1)):  2
GLM method:                                                         internal_coef
Initial central haplotypes supplied:                                TRUE", 
                fixed = TRUE)
})

clusprob <- clusterprob(fit, xmat)
test_that("clusterprob", {
  expect_equal(1L, length(unique(clusprob)))
  expect_equal(rep(0, nrow(xmat)), c(abs(clusprob - 1)))
})

test_that("clusterdist", {
  expect_error(clusterdist(fit))
})

##################################################
# Multiple centres, multiple loci -- some overlap
##################################################
set.seed(1)
ps_loc <- c(0.3, 0.2, 0.4)
ps_ys <- c(0.1, 0)
ys <- matrix(c(13L, 19L, 20L,
               13L, 20L, 22L), nrow = 2L, byrow = TRUE) # Location parameter 
tau <- c(0.2, 0.8)
N <- 1000L
z <- sample(seq_along(tau), N, prob = tau, replace = TRUE)
x <- do.call(cbind, lapply(seq_along(ps_loc), function(k) {
  p <- ps_loc[k] + ps_ys[z]
  return(disclap::rdisclap(N, p) + ys[z, k])
}))

fit <- disclapmix(x = x, clusters = 2L)

order_by_ys_true <- order(ys[, 1L], ys[, 2L], ys[, 3L])
order_by_ys_fit <- order(fit$y[, 1L], fit$y[, 2L], fit$y[, 3L])

p_true <- outer(ps_ys, ps_loc, "+")[order_by_ys_true, ]
p_fit <- fit$disclap_parameters[order_by_ys_fit, ]
dimnames(p_fit) <- NULL

y_true <- ys[order_by_ys_true, ]
y_fit <- fit$y[order_by_ys_fit, ]
dimnames(y_fit) <- NULL

test_that("disclapmix multiple centers multiple loci", {
  expect_equal(p_true, p_fit, tol = 1e-1)
  expect_equal(y_true, y_fit)
})


test_that("clusterprob", {
  expect_equal(clusterprob(fit, x), fit$v_matrix, tol = 1e-3)
})


##################################################
# Estimation methods
##################################################

fit_std_int_coef <- disclapmix(x = x, clusters = 2L, glm_method = "internal_coef")
fit_std_int_dev <- disclapmix(x = x, clusters = 2L, glm_method = "internal_dev")
fit_std_glmfit <- disclapmix(x = x, clusters = 2L, glm_method = "glm.fit")

# reordering
order_int_coef <- order(fit_std_int_coef$y[, 1L], fit_std_int_coef$y[, 2L], fit_std_int_coef$y[, 3L])
order_int_dev <- order(fit_std_int_dev$y[, 1L], fit_std_int_dev$y[, 2L], fit_std_int_dev$y[, 3L])
order_glmfit <- order(fit_std_glmfit$y[, 1L], fit_std_glmfit$y[, 2L], fit_std_glmfit$y[, 3L])

test_that("disclapmix multiple centers multiple loci", {
  expect_equal(fit_std_int_coef$y[order_int_coef, ], 
               fit_std_int_dev$y[order_int_dev, ])
  expect_equal(fit_std_int_coef$y[order_int_coef, ], 
               fit_std_glmfit$y[order_glmfit, ])
  
  expect_equal(fit_std_int_coef$disclap_parameters[order_int_coef, ], 
               fit_std_int_dev$disclap_parameters[order_int_dev, ], tol = 1e-6)
  expect_equal(fit_std_int_coef$disclap_parameters[order_int_coef, ], 
               fit_std_glmfit$disclap_parameters[order_glmfit, ], tol = 1e-6)
})


test_that("clusterdist", {
  expect_equal(clusterdist(fit_std_int_coef), clusterdist(fit_std_int_dev), tol = 1e-6)
})


clusterwise_probs <- predict_clusterwise(fit, newdata = x)

pred_man <- apply(clusterwise_probs, 1, function(p) sum(fit$tau*p))


test_that("predict", {
  expect_equal(predict(fit_std_int_coef, newdata = x), predict(fit_std_int_dev, newdata = x), tol = 1e-6)
  expect_equal(predict(fit_std_int_coef, newdata = x), predict(fit_std_glmfit, newdata = x), tol = 1e-6)
  expect_equal(predict(fit_std_int_coef, newdata = x), pred_man)
})


##################################################
# Simulate
##################################################
sim <- simulate(fit_std_int_coef, 1000L)
sim_fit <- disclapmix(x = sim, clusters = 2L, glm_method = "internal_coef")

test_that("simulate", {
  expect_equal(sim_fit$disclap_parameters, fit_std_int_coef$disclap_parameters, tol = 1e-1)
  expect_equal(sim_fit$y, fit_std_int_coef$y)
})


##################################################
# Multiple centres, multiple loci -- more difference between components
##################################################
set.seed(1)
ps_loc <- c(0.2, 0.2, 0.2)
ps_ys <- c(0.1, 0)
ys <- matrix(c(10L, 10L, 10L,
               13L, 13L, 13L), nrow = 2L, byrow = TRUE) # Location parameter 
tau <- c(0.2, 0.8)
N <- 1000L
z <- sample(seq_along(tau), N, prob = tau, replace = TRUE)
x <- do.call(cbind, lapply(seq_along(ps_loc), function(k) {
  p <- ps_loc[k] + ps_ys[z]
  return(disclap::rdisclap(N, p) + ys[z, k])
}))

fit <- disclapmix(x = x, clusters = 2L)

order_by_ys_true <- order(ys[, 1L], ys[, 2L], ys[, 3L])
order_by_ys_fit <- order(fit$y[, 1L], fit$y[, 2L], fit$y[, 3L])

p_true <- outer(ps_ys, ps_loc, "+")[order_by_ys_true, ]
p_fit <- fit$disclap_parameters[order_by_ys_fit, ]
dimnames(p_fit) <- NULL

y_true <- ys[order_by_ys_true, ]
y_fit <- fit$y[order_by_ys_fit, ]
dimnames(y_fit) <- NULL

test_that("disclapmix multiple centers multiple loci", {
  expect_equal(p_true, p_fit, tol = 1e-1)
  expect_equal(y_true, y_fit)
})

# Also vmatrix
v_true <- matrix(0, nrow = N, ncol = nrow(fit$y))
for (i in 1L:N) v_true[i, z[i]] <- 1
v_fit <- fit$v_matrix[, order_by_ys_fit]

test_that("disclapmix v matrix", {
  expect_true(mean(abs(v_true - v_fit)) < 0.05)
})


##################################################
# Others
##################################################

test_that("rcpp_find_haplotype_in_matrix", {
  expect_equal(1L, find_haplotype_in_matrix(x, x[1L, ]))
  expect_equal(33L, find_haplotype_in_matrix(x, x[33L, ]))
  expect_null(find_haplotype_in_matrix(x, rep(1000L, ncol(x))))
})

test_that("disclapglm_linkfun/disclapglm_mu_eta", {
  expect_true(disclapglm_linkfun(0.1) < 0)
  expect_true(disclapglm_mu_eta(-2) < 1)
  expect_equal(0.1, disclapglm_mu_eta(disclapglm_linkfun(0.1)), tol = 1e-3)
})

test_that("convert_to_compact_db", {
  expect_equal(nrow(x), sum(convert_to_compact_db(x)$Ndb))
})

##################################################
# Errors
##################################################

test_that("disclapmix should give error", {
  expect_error(disclapmix(x, centers = 4L))
  expect_error(disclapmix(x, use.parallel = TRUE))
  expect_error(disclapmix(x, calculate.logLs = TRUE))
  expect_error(disclapmix(x, plots.prefix = 'foo'))
  expect_error(disclapmix(x, clusters = NULL))
  expect_error(disclapmix(x, clusters = -1))
  expect_error(disclapmix(x, clusters = 2.1))
  
  expect_error(simulate(fit_std_int_coef, 100))
  
  expect_error(convert_to_compact_db(c()))
  
  expect_error(check_x(data.frame(x = 1:10)))
  expect_error(check_x(matrix(1L:10L, nrow = 1L)))
  
  expect_error(check_y(data.frame(x = 1:10), clusters = 0L, loci = 10L))
  expect_error(check_y(matrix(1L:10L, nrow = 1L), clusters = 0L, loci = 10L))
  expect_error(check_y(matrix(1L:10L, nrow = 1L), clusters = 0L, loci = 2L))
})