library(polmineR)

# Corpora referenced by the tests below: REUTERS is requested explicitly from
# RcppCWB; GERMAPARLMINI is presumably provided by use("polmineR") -- confirm.
use("polmineR")
use(pkg = "RcppCWB", corpus = "REUTERS")

testthat::context("pmi")

test_that(
  "check calculation of pointwise mutual information",
  {
    cooc <- cooccurrences("REUTERS", query = "oil", method = "pmi")

    # Recompute the statistic by hand from the sizes and counts reported by
    # the result object itself.
    sizes <- size(cooc)
    cnt <- count(cooc)
    total <- sizes[["ref"]] + sizes[["coi"]] + cnt

    rel_coi <- cooc[["count_coi"]] / total
    rel_obj <- cnt / total
    rel_partition <- cooc[["count_partition"]] / total
    pmi_by_hand <- log2(rel_coi / (rel_obj * rel_partition))

    # The "pmi" column has to match the manual computation up to the given
    # tolerance.
    expect_equal(cooc[["pmi"]], pmi_by_hand, tolerance = 1e-3)
  }
)

test_that(
  "identity of phrase detection of decode-workflow and Cooccurrences workflow",
  {
    # Word counts handed to pmi() as `observed` in the decode-based workflow.
    word_counts <- count("GERMAPARLMINI", p_attribute = "word")

    # Workflow 1: decode the corpus, build bigrams, then compute pmi.
    via_ngrams <- corpus("GERMAPARLMINI") %>%
      decode(
        p_attribute = "word",
        s_attribute = character(),
        to = "data.table",
        verbose = FALSE
      ) %>%
      ngrams(n = 2L, p_attribute = "word") %>%
      pmi(observed = word_counts)

    # Workflow 2: Cooccurrences with a window of 0 left / 1 right, decoded,
    # then pmi.
    via_cooc <- Cooccurrences(
      "GERMAPARLMINI",
      p_attribute = "word",
      left = 0L,
      right = 1L,
      verbose = FALSE
    ) %>%
      decode() %>%
      pmi()

    # Keep only rows with a count of exactly 5 in each result and sort both
    # tables by their two word columns so they can be compared row by row.
    ngrams_n5 <- subset(via_ngrams, ngram_count == 5L)
    stat_ngrams <- data.table::setorderv(
      slot(ngrams_n5, "stat"),
      cols = c("word_1", "word_2")
    )

    cooc_n5 <- subset(via_cooc, ab_count == 5L)
    stat_cooc <- data.table::setorderv(
      slot(cooc_n5, "stat"),
      cols = c("a_word", "b_word")
    )

    expect_identical(nrow(stat_ngrams), nrow(stat_cooc))
    expect_identical(stat_ngrams[["word_1"]], stat_cooc[["a_word"]])
    expect_identical(stat_ngrams[["word_2"]], stat_cooc[["b_word"]])
  }
)