library(testthat)
library(aricode)

## define the ARI as in the mclust package
adjustedRandIndex <- function(x, y) {
  x <- as.vector(x)
  y <- as.vector(y)
  if (length(x) != length(y)) {
    stop("arguments must be vectors of the same length")
  }
  tab <- table(x, y)
  if (all(dim(tab) == c(1, 1))) {
    return(1)
  }
  a <- sum(choose(tab, 2))
  b <- sum(choose(rowSums(tab), 2)) - a
  c <- sum(choose(colSums(tab), 2)) - a
  d <- choose(sum(tab), 2) - a - b - c
  ARI <- (a - (a + b) * (a + c) / (a + b + c + d)) / ((a + b + a + c) / 2 - (a + b) * (a + c) / (a + b + c + d))
  return(ARI)
}

EMI <- function(counts) {
  s_emi <- 0
  n <- sum(counts)
  s1 <- margin.table(counts, 1)
  s2 <- margin.table(counts, 2)
  for (i in 1:nrow(counts)) {
    for (j in 1:ncol(counts)) {
      ai <- s1[i]
      bj <- s2[j]
      min_nij <- max(1, ai + bj - n)
      max_nij <- min(ai, bj)
      if (min_nij >= max_nij) next
      n.ij <- seq(min_nij, max_nij) # sequence of consecutive numbers
      t1 <- (n.ij / n) * log((n.ij * n) / (ai * bj))
      t2 <- exp(lfactorial(ai) + lfactorial(bj) + lfactorial(n - ai) + lfactorial(n - bj) - lfactorial(n) - lfactorial(n.ij) - lfactorial(ai - n.ij) - lfactorial(bj - n.ij) - lfactorial(n - ai - bj + n.ij))
      emi <- sum(t1 * t2)
      # if (is.nan(emi)) browser()
      s_emi <- s_emi + emi
    }
  }
  s_emi
}

Chi2_ref <- function(c1, c2) {
  suppressWarnings(as.numeric(chisq.test(c1, c2, correct = F)$stat[1]))
}

## Martina's code
ARI_estimated <- function(c1, c2) {
  # c1, c2, two classifications of the same observations
  n_kl <- table(c1, c2)

  n <- sum(n_kl)
  n_k <- rowSums(n_kl)
  n_l <- colSums(n_kl)

  T1 <- sum(choose(n_kl, 2))
  T2 <- 2 * n + sum(outer(n_k, n_l) * n_kl) - sum(n_kl^2) - sum(n_k^2) - sum(n_l^2)
  T3 <- 1 / (6 * choose(n, 4)) * (sum(outer(n_k^2, n_l^2)) - 4 * T1 - 4 * T2 - n * 2 * (sum(choose(n_k, 2)) + sum(choose(n_l, 2))) - n^2) / 4

  RI_estim <- (1 / choose(n, 2)) * T1
  ARI_estim <- RI_estim - T3
  ARI_estim
}

test_that("Testing coherence of the adjusted Rand Index", {
  ## "\n-large random vectors-
  n <- 1e5
  c1 <- as.numeric(sample(1:(n / 100), n, replace = TRUE))
  c2 <- as.numeric(sample(1:(n / 100), n, replace = TRUE))
  expect_equal(ARI(c1, c2), adjustedRandIndex(c1, c2))

  ## "\n-real data-
  data(iris)
  cl <- cutree(hclust(dist(iris[, -5])), 4)
  expect_equal(ARI(cl, iris$Species), adjustedRandIndex(cl, iris$Species))

  ## "\n-completely equal vectors with no groups-")
  c1 <- 1:100
  c2 <- 1:100
  expect_equal(ARI(c1, c2), adjustedRandIndex(c1, c2))

  ## "\n-completely equal vectors with one groups-")
  c1 <- rep(1, 100)
  c2 <- rep(2, 100)
  expect_equal(ARI(c1, c2), adjustedRandIndex(c1, c2))

  ## "\n-completely different vectors with one groups-")
  c1 <- c(rep(0, 99), 1)
  c2 <- rep(1, 100)
  expect_equal(ARI(c1, c2), adjustedRandIndex(c1, c2))
})

test_that("Testing coherence of the expected mutual information", {
  ## "\n-real data-
  data(iris)
  cl <- cutree(hclust(dist(iris[, -5])), 4)
  counts <- table(cl, iris$Species)
  ni. <- rowSums(counts)
  n.j <- colSums(counts)
  expect_equal(aricode:::expected_MI(ni., n.j), EMI(counts))

  ## "\n-completely equal vectors with one groups-")
  c1 <- rep(1, 100)
  c2 <- rep(2, 100)
  counts <- table(c1, c2)
  ni. <- rowSums(counts)
  n.j <- colSums(counts)
  expect_equal(aricode:::expected_MI(ni., n.j), EMI(counts))

  ## "\n-completely different vectors with one groups-")
  c1 <- c(rep(0, 99), 1)
  c2 <- rep(1, 100)
  counts <- table(c1, c2)
  ni. <- rowSums(counts)
  n.j <- colSums(counts)
  expect_equal(aricode:::expected_MI(ni., n.j), EMI(counts))
})


test_that("Testing coherence of the Chi2 statistics information", {
  ## "\n-large random vectors-
  n <- 1e5
  c1 <- as.numeric(sample(1:(n / 100), n, replace = TRUE))
  c2 <- as.numeric(sample(1:(n / 100), n, replace = TRUE))
  expect_equal(ARI(c1, c2), adjustedRandIndex(c1, c2))

  ## "\n-moderate random-
  n <- rpois(lambda = 100, n = 1) + 3
  k1 <- rpois(lambda = 5, n = 1) + 2
  k2 <- rpois(lambda = 5, n = 1) + 2
  c1 <- sample(1:k1, n, replace = T)
  c2 <- sample(1:k2, n, replace = T)
  expect_equal(aricode::Chi2(c1, c2), Chi2_ref(c1, c2))

  ## "\n-real data-
  data(iris)
  cl <- cutree(hclust(dist(iris[, -5])), 4)
  expect_equal(aricode::Chi2(cl, iris$Species), Chi2_ref(cl, iris$Species))

  ## "\n-completely equal vectors with no groups-")
  c1 <- 1:100
  c2 <- 1:100
  expect_equal(aricode::Chi2(c1, c2), Chi2_ref(c1, c2))
})

test_that("Testing coherence of the MARI", {
  ## "\n-large random vectors-
  n <- 1e5
  c1 <- as.numeric(sample(1:(n / 100), n, replace = TRUE))
  c2 <- as.numeric(sample(1:(n / 100), n, replace = TRUE))
  expect_equal(aricode::MARI(c1, c2, raw = TRUE), ARI_estimated(c1, c2))

  ## "\n-moderate random-
  n <- rpois(lambda = 100, n = 1) + 3
  k1 <- rpois(lambda = 5, n = 1) + 2
  k2 <- rpois(lambda = 5, n = 1) + 2
  c1 <- sample(1:k1, n, replace = T)
  c2 <- sample(1:k2, n, replace = T)
  expect_equal(aricode::MARI(c1, c2, raw = TRUE), ARI_estimated(c1, c2))

  ## "\n-real data-
  data(iris)
  cl <- cutree(hclust(dist(iris[, -5])), 4)
  expect_equal(aricode::MARI(c1, c2, raw = TRUE), ARI_estimated(c1, c2))

  ## "\n-completely equal vectors with no groups-")
  c1 <- 1:100
  c2 <- 1:100
  expect_equal(aricode::MARI(c1, c2, raw = TRUE), ARI_estimated(c1, c2))

  ## "\n-completely equal vectors with one groups-")
  c1 <- rep(1, 100)
  c2 <- rep(2, 100)
  expect_equal(aricode::MARI(c1, c2, raw = TRUE), ARI_estimated(c1, c2))

  ## "\n-completely different vectors with one groups-")
  c1 <- c(rep(0, 99), 1)
  c2 <- rep(1, 100)
  expect_equal(aricode::MARI(c1, c2, raw = TRUE), ARI_estimated(c1, c2))
})

test_that("Testing coherence of the MARI with ARI : in case of independance their values should be close", {
  n <- rpois(lambda = 100, n = 1) + 3
  k1 <- rpois(lambda = 5, n = 1) + 2
  k2 <- rpois(lambda = 5, n = 1) + 2
  class1 <- sample(1:k1, n, replace = T)
  class2 <- sample(1:k2, n, replace = T)
  ## expect_equal or almost equal in test_that
  expect_equal(ARI(class1, class2), MARI(class1, class2), tolerance = 1e-3)
})