####################################################################### # nuggets: An R framework for exploration of patterns in data # Copyright (C) 2025 Michal Burda # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program. If not, see . ####################################################################### test_that("dig_associations with contingency table", { d <- data.frame(a = c(T, T, F, F, F), b = c(T, T, T, T, F), c = c(F, F, F, T, T)) res <- dig_associations(d, antecedent = everything(), consequent = everything(), min_support = 0.0001, min_confidence = 0.0001) res <- res[order(res$antecedent_length, res$antecedent, res$consequent), ] expect_true(is_nugget(res, "associations")) expect_true(is_tibble(res)) expect_equal(attr(res, "call_function"), "dig_associations") expect_true(is.list(attr(res, "call_data"))) expect_equal(attr(res, "call_data")$nrow, 5) expect_equal(attr(res, "call_data")$ncol, 3) expect_equal(attr(res, "call_data")$colnames, c("a", "b", "c")) expect_true(is.list(attr(res, "call_args"))) expect_equal(attr(res, "call_args")$x, "d") expect_equal(attr(res, "call_args")$antecedent, c("a", "b", "c")) expect_equal(attr(res, "call_args")$consequent, c("a", "b", "c")) expect_equal(attr(res, "call_args")$min_support, 0.0001) expect_equal(attr(res, "call_args")$min_confidence, 0.0001) expect_equal(attr(res, "call_args")$contingency_table, TRUE) expect_true(is_tibble(res)) expect_equal(nrow(res), 7) expect_equal(colnames(res), c("antecedent", "consequent", "support", "confidence", "coverage", "conseq_support", "lift", "count", "antecedent_length", "pp", "pn", "np", "nn")) expect_true(is.character(res$antecedent)) expect_true(is.character(res$consequent)) expect_true(is.double(res$support)) expect_true(is.double(res$confidence)) expect_equal(res$antecedent, c( "{}", "{}", "{}", "{a}", "{b}", "{b}", "{c}")) expect_equal(res$consequent, c("{a}", "{b}", "{c}", "{b}", "{a}", "{c}", "{b}")) expect_equal(round(res$support, 6), c(0.4, 0.8, 0.4, 0.4, 0.4, 0.2, 0.2)) expect_equal(round(res$conseq_support, 6), c(0.4, 0.8, 0.4, 0.8, 0.4, 0.4, 0.8)) expect_equal(round(res$confidence, 6), c(0.4, 0.8, 0.4, 1.0, 0.5, 0.25, 0.5)) expect_equal(res$antecedent_length, c(0, 0, 0, 1, 1, 1, 1)) expect_equal(res$pp, c(2, 4, 2, 2, 2, 1, 1)) expect_equal(res$np, c(0, 0, 0, 2, 0, 1, 3)) expect_equal(res$pn, c(3, 1, 3, 0, 2, 3, 1)) expect_equal(res$nn, c(0, 0, 0, 1, 1, 0, 0)) }) test_that("dig_associations with disjoint", { d <- data.frame(a = c(T, T, F, F, F), b = c(T, T, T, T, F), c = c(F, F, F, T, T)) res <- dig_associations(d, antecedent = everything(), consequent = everything(), disjoint = c(1, 2, 2), min_support = 0.0001, min_confidence = 0.0001) expect_true(is_nugget(res, "associations")) expect_true(is_tibble(res)) expect_equal(attr(res, "call_function"), "dig_associations") expect_true(is.list(attr(res, "call_data"))) expect_equal(attr(res, "call_data")$nrow, 5) expect_equal(attr(res, "call_data")$ncol, 3) expect_equal(attr(res, "call_data")$colnames, c("a", "b", "c")) expect_true(is.list(attr(res, "call_args"))) expect_equal(attr(res, "call_args")$x, "d") expect_equal(attr(res, "call_args")$antecedent, c("a", "b", "c")) expect_equal(attr(res, "call_args")$consequent, c("a", "b", "c")) expect_equal(attr(res, "call_args")$min_support, 0.0001) expect_equal(attr(res, "call_args")$min_confidence, 0.0001) expect_equal(attr(res, "call_args")$disjoint, c(1, 2, 2)) expect_true(is_tibble(res)) expect_equal(nrow(res), 5) expect_equal(colnames(res), c("antecedent", "consequent", "support", "confidence", "coverage", "conseq_support", "lift", "count", "antecedent_length", "pp", "pn", "np", "nn")) expect_true(is.character(res$antecedent)) expect_true(is.character(res$consequent)) expect_true(is.double(res$support)) expect_true(is.double(res$confidence)) expect_equal(res$antecedent, c("{}", "{}", "{}", "{b}", "{a}")) expect_equal(res$consequent, c("{b}", "{a}", "{c}", "{a}", "{b}")) expect_equal(res$support, c(0.8, 0.4, 0.4, 0.4, 0.4)) expect_equal(round(res$conseq_support, 6), c(0.8, 0.4, 0.4, 0.4, 0.8)) expect_equal(res$confidence, c(0.8, 0.4, 0.4, 0.5, 1.0)) expect_equal(res$antecedent_length, c(0, 0, 0, 1, 1)) }) test_that("dig_associations min_support", { # min_support is the support of the whole rule d <- data.frame(a = c(T, T, F, F, F), b = c(T, T, T, T, F), c = c(T, F, F, T, T)) res <- dig_associations(d, antecedent = everything(), consequent = everything(), disjoint = c(1, 2, 3), min_support = 0.2, min_confidence = 0.0001) expect_true(is_tibble(res)) expect_equal(nrow(res), 12) res <- dig_associations(d, antecedent = everything(), consequent = everything(), disjoint = c(1, 2, 3), min_support = 0.3, min_confidence = 0.0001) expect_true(is_nugget(res, "associations")) expect_true(is_tibble(res)) expect_equal(attr(res, "call_function"), "dig_associations") expect_true(is.list(attr(res, "call_data"))) expect_equal(attr(res, "call_data")$nrow, 5) expect_equal(attr(res, "call_data")$ncol, 3) expect_equal(attr(res, "call_data")$colnames, c("a", "b", "c")) expect_true(is.list(attr(res, "call_args"))) expect_equal(attr(res, "call_args")$x, "d") expect_equal(attr(res, "call_args")$antecedent, c("a", "b", "c")) expect_equal(attr(res, "call_args")$consequent, c("a", "b", "c")) expect_equal(attr(res, "call_args")$min_support, 0.3) expect_equal(attr(res, "call_args")$min_confidence, 0.0001) expect_equal(attr(res, "call_args")$disjoint, c(1, 2, 3)) expect_true(is_tibble(res)) expect_equal(nrow(res), 7) res <- dig_associations(d, antecedent = everything(), consequent = everything(), disjoint = c(1, 2, 3), min_support = 0.8, min_confidence = 0.0001) expect_true(is_nugget(res, "associations")) expect_true(is_tibble(res)) expect_equal(attr(res, "call_function"), "dig_associations") expect_true(is.list(attr(res, "call_data"))) expect_equal(attr(res, "call_data")$nrow, 5) expect_equal(attr(res, "call_data")$ncol, 3) expect_equal(attr(res, "call_data")$colnames, c("a", "b", "c")) expect_true(is.list(attr(res, "call_args"))) expect_equal(attr(res, "call_args")$x, "d") expect_equal(attr(res, "call_args")$antecedent, c("a", "b", "c")) expect_equal(attr(res, "call_args")$consequent, c("a", "b", "c")) expect_equal(attr(res, "call_args")$min_support, 0.8) expect_equal(attr(res, "call_args")$min_confidence, 0.0001) expect_equal(attr(res, "call_args")$disjoint, c(1, 2, 3)) expect_true(is_tibble(res)) expect_equal(nrow(res), 1) res <- dig_associations(d, antecedent = everything(), consequent = everything(), disjoint = c(1, 2, 3), min_support = 0.81, min_confidence = 0.0001) expect_true(is_nugget(res, "associations")) expect_true(is_tibble(res)) expect_equal(attr(res, "call_function"), "dig_associations") expect_true(is.list(attr(res, "call_data"))) expect_equal(attr(res, "call_data")$nrow, 5) expect_equal(attr(res, "call_data")$ncol, 3) expect_equal(attr(res, "call_data")$colnames, c("a", "b", "c")) expect_true(is.list(attr(res, "call_args"))) expect_equal(attr(res, "call_args")$x, "d") expect_equal(attr(res, "call_args")$antecedent, c("a", "b", "c")) expect_equal(attr(res, "call_args")$consequent, c("a", "b", "c")) expect_equal(attr(res, "call_args")$min_support, 0.81) expect_equal(attr(res, "call_args")$min_confidence, 0.0001) expect_equal(attr(res, "call_args")$disjoint, c(1, 2, 3)) expect_true(is_tibble(res)) expect_equal(nrow(res), 0) }) test_that("dig_associations min_coverage", { # min_coverage is the support of the antecedent d <- data.frame(a = c(T, T, F, F, F), b = c(T, T, T, T, F), c = c(T, F, F, T, T)) res <- dig_associations(d, antecedent = everything(), consequent = everything(), disjoint = c(1, 2, 3), min_coverage = 0.2, min_confidence = 0.0001) expect_true(is_nugget(res, "associations")) expect_true(is_tibble(res)) expect_equal(attr(res, "call_function"), "dig_associations") expect_true(is.list(attr(res, "call_data"))) expect_equal(attr(res, "call_data")$nrow, 5) expect_equal(attr(res, "call_data")$ncol, 3) expect_equal(attr(res, "call_data")$colnames, c("a", "b", "c")) expect_true(is.list(attr(res, "call_args"))) expect_equal(attr(res, "call_args")$x, "d") expect_equal(attr(res, "call_args")$antecedent, c("a", "b", "c")) expect_equal(attr(res, "call_args")$consequent, c("a", "b", "c")) expect_equal(attr(res, "call_args")$min_coverage, 0.2) expect_equal(attr(res, "call_args")$min_confidence, 0.0001) expect_equal(attr(res, "call_args")$disjoint, c(1, 2, 3)) expect_true(is_tibble(res)) expect_equal(nrow(res), 12) res <- dig_associations(d, antecedent = everything(), consequent = everything(), disjoint = c(1, 2, 3), min_coverage = 0.3, min_confidence = 0.0001) expect_true(is_nugget(res, "associations")) expect_true(is_tibble(res)) expect_equal(attr(res, "call_function"), "dig_associations") expect_true(is.list(attr(res, "call_data"))) expect_equal(attr(res, "call_data")$nrow, 5) expect_equal(attr(res, "call_data")$ncol, 3) expect_equal(attr(res, "call_data")$colnames, c("a", "b", "c")) expect_true(is.list(attr(res, "call_args"))) expect_equal(attr(res, "call_args")$x, "d") expect_equal(attr(res, "call_args")$antecedent, c("a", "b", "c")) expect_equal(attr(res, "call_args")$consequent, c("a", "b", "c")) expect_equal(attr(res, "call_args")$min_coverage, 0.3) expect_equal(attr(res, "call_args")$min_confidence, 0.0001) expect_equal(attr(res, "call_args")$disjoint, c(1, 2, 3)) expect_true(is_tibble(res)) expect_equal(nrow(res), 11) res <- dig_associations(d, antecedent = everything(), consequent = everything(), disjoint = c(1, 2, 3), min_coverage = 0.8, min_confidence = 0.0001) expect_true(is_nugget(res, "associations")) expect_true(is_tibble(res)) expect_equal(attr(res, "call_function"), "dig_associations") expect_true(is.list(attr(res, "call_data"))) expect_equal(attr(res, "call_data")$nrow, 5) expect_equal(attr(res, "call_data")$ncol, 3) expect_equal(attr(res, "call_data")$colnames, c("a", "b", "c")) expect_true(is.list(attr(res, "call_args"))) expect_equal(attr(res, "call_args")$x, "d") expect_equal(attr(res, "call_args")$antecedent, c("a", "b", "c")) expect_equal(attr(res, "call_args")$consequent, c("a", "b", "c")) expect_equal(attr(res, "call_args")$min_coverage, 0.8) expect_equal(attr(res, "call_args")$min_confidence, 0.0001) expect_equal(attr(res, "call_args")$disjoint, c(1, 2, 3)) expect_true(is_tibble(res)) expect_equal(nrow(res), 5) res <- dig_associations(d, antecedent = everything(), consequent = everything(), disjoint = c(1, 2, 3), min_coverage = 1, min_confidence = 0.0001) expect_true(is_nugget(res, "associations")) expect_true(is_tibble(res)) expect_equal(attr(res, "call_function"), "dig_associations") expect_true(is.list(attr(res, "call_data"))) expect_equal(attr(res, "call_data")$nrow, 5) expect_equal(attr(res, "call_data")$ncol, 3) expect_equal(attr(res, "call_data")$colnames, c("a", "b", "c")) expect_true(is.list(attr(res, "call_args"))) expect_equal(attr(res, "call_args")$x, "d") expect_equal(attr(res, "call_args")$antecedent, c("a", "b", "c")) expect_equal(attr(res, "call_args")$consequent, c("a", "b", "c")) expect_equal(attr(res, "call_args")$min_coverage, 1) expect_equal(attr(res, "call_args")$min_confidence, 0.0001) expect_equal(attr(res, "call_args")$disjoint, c(1, 2, 3)) expect_true(is_tibble(res)) expect_equal(nrow(res), 3) }) test_that("compare dig_associations to arules::apriori", { skip_if_not_installed("arules") set.seed(2123) rows <- 100 cols <- 5 m <- matrix(sample(c(T, F), rows * cols, replace = TRUE), nrow = rows, ncol = cols) colnames(m) <- letters[seq_len(cols)] afit <- arules::apriori(m, parameter = list(minlen = 1, maxlen = 6, supp=0.001, conf = 0.5), control = list(verbose = FALSE)) expected <- arules::DATAFRAME(afit) expected$LHS <- as.character(expected$LHS) expected$RHS <- as.character(expected$RHS) for (inter in c("addedValue", "centeredConfidence", "conviction")) { expected[[inter]] <- arules::interestMeasure(afit, inter) } expected <- expected[order(expected$LHS, expected$RHS), ] res <- dig_associations(m, min_support = 0.001, min_length = 0, max_length = 5, min_confidence = 0.5) expect_true(is_nugget(res, "associations")) expect_true(is_tibble(res)) expect_equal(attr(res, "call_function"), "dig_associations") expect_true(is.list(attr(res, "call_data"))) expect_equal(attr(res, "call_data")$nrow, 100) expect_equal(attr(res, "call_data")$ncol, 5) expect_equal(attr(res, "call_data")$colnames, c("a", "b", "c", "d", "e")) expect_true(is.list(attr(res, "call_args"))) expect_equal(attr(res, "call_args")$x, "m") expect_equal(attr(res, "call_args")$min_support, 0.001) expect_equal(attr(res, "call_args")$min_length, 0) expect_equal(attr(res, "call_args")$max_length, 5) expect_equal(attr(res, "call_args")$min_confidence, 0.5) expect_true(is_tibble(res)) res <- res[order(res$antecedent, res$consequent), ] expect_equal(res$antecedent, expected$LHS) expect_equal(res$consequent, expected$RHS) expect_equal(res$support, expected$support, tolerance = 1e-6) expect_equal(res$confidence, expected$confidence, tolerance = 1e-6) expect_equal(res$coverage, expected$coverage, tolerance = 1e-6) expect_equal(res$lift, expected$lift, tolerance = 1e-6) expect_equal(res$count, expected$count) }) test_that("dig_associations return object details", { d <- data.frame(a = c(T, T, F, F, F), b = c(T, T, T, T, F), c = c(T, F, F, T, T)) res <- dig_associations(d, antecedent = a:b, consequent = b:c, disjoint = c(1, 2, 2), excluded = list("a"), min_length = 1L, max_length = Inf, min_coverage = 0.2, min_support = 0.3, min_confidence = 0.5, t_norm = "lukas", max_results = 10, threads = 1) expect_true(is_nugget(res, "associations")) expect_true(is_tibble(res)) expect_equal(attr(res, "call_function"), "dig_associations") expect_true(is.list(attr(res, "call_data"))) expect_equal(attr(res, "call_data")$nrow, 5) expect_equal(attr(res, "call_data")$ncol, 3) expect_equal(attr(res, "call_data")$colnames, c("a", "b", "c")) expect_true(is.list(attr(res, "call_args"))) expect_equal(attr(res, "call_args")$x, "d") expect_equal(attr(res, "call_args")$antecedent, c("a", "b")) expect_equal(attr(res, "call_args")$consequent, c("b", "c")) expect_equal(attr(res, "call_args")$disjoint, c(1, 2, 2)) expect_equal(attr(res, "call_args")$excluded, list("a")) expect_equal(attr(res, "call_args")$min_length, 1) expect_equal(attr(res, "call_args")$max_length, Inf) expect_equal(attr(res, "call_args")$min_coverage, 0.2) expect_equal(attr(res, "call_args")$min_support, 0.3) expect_equal(attr(res, "call_args")$min_confidence, 0.5) expect_equal(attr(res, "call_args")$contingency_table, TRUE) expect_equal(attr(res, "call_args")$t_norm, "lukas") expect_equal(attr(res, "call_args")$max_results, 10) expect_equal(attr(res, "call_args")$threads, 1) expect_true(is_tibble(res)) }) test_that("dig_associations errors", { d <- data.frame(a = c(T, T, F, F, F), b = c(T, T, T, T, F), c = c(T, F, F, T, T)) d2 <- data.frame(a = c(T, T, F, F, F), b = c(T, T, T, T, F), c = as.character(c(T, F, F, T, T))) expect_error(dig_associations(as.list(d)), "`x` must be a matrix or a data frame.") expect_error(dig_associations(d2, antecedent = b:c, consequent = a), "All columns selected by `antecedent` must be logical or numeric from the interval") expect_error(dig_associations(d2, antecedent = a:b, consequent = c), "All columns selected by `consequent` must be logical or numeric from the interval") expect_error(dig_associations(d, disjoint = "foo"), "The length of `disjoint` must be 0 or must be equal to the number of columns in `x`") expect_error(dig_associations(d, excluded = "foo"), "`excluded` must be a list or NULL") expect_error(dig_associations(d, min_length = "x"), "`min_length` must be an integerish scalar.") expect_error(dig_associations(d, max_length = "x"), "`max_length` must be an integerish scalar.") expect_error(dig_associations(d, min_coverage = "x"), "`min_coverage` must be a double scalar.") expect_error(dig_associations(d, min_support = "x"), "`min_support` must be a double scalar.") expect_error(dig_associations(d, min_confidence = "x"), "`min_confidence` must be a double scalar.") expect_error(dig_associations(d, t_norm = "x"), "`t_norm` must be equal to one of") expect_error(dig_associations(d, max_results = "x"), "`max_results` must be an integerish scalar.") expect_error(dig_associations(d, verbose = "x"), "`verbose` must be a flag.") expect_error(dig_associations(d, threads = "x"), "`threads` must be an integerish scalar.") }) test_that("dig_associations return nothing", { d <- data.frame(a = c(T, T, F, F, F), b = c(T, T, T, T, F)) res <- dig_associations(d, antecedent = a, consequent = b, min_length = 3, max_length = 3, disjoint = c(1, 2), min_support = 0.1, min_confidence = 0.2, t_norm = "lukas", max_results = 5, verbose = FALSE, threads = 1) expect_true(is_nugget(res, "associations")) expect_true(is_tibble(res)) expect_equal(nrow(res), 0) expect_equal(attr(res, "call_function"), "dig_associations") expect_true(is.list(attr(res, "call_data"))) expect_true(is.list(attr(res, "call_args"))) }) test_that("dig_associations limiting max_results", { # dig() limits max_results by antecedent, dig_associations() must limit by itself d <- data.frame(a = c(T, T, F, F, F), b = c(T, T, T, T, F), c = c(F, F, F, T, T)) res <- dig_associations(d, antecedent = everything(), consequent = everything(), min_support = 0.0001, min_confidence = 0.0001, max_results = 2) expect_true(is_nugget(res, "associations")) expect_true(is_tibble(res)) expect_equal(attr(res, "call_function"), "dig_associations") expect_true(is.list(attr(res, "call_data"))) expect_equal(attr(res, "call_data")$nrow, 5) expect_equal(attr(res, "call_data")$ncol, 3) expect_equal(attr(res, "call_data")$colnames, c("a", "b", "c")) expect_true(is.list(attr(res, "call_args"))) expect_equal(attr(res, "call_args")$x, "d") expect_equal(attr(res, "call_args")$antecedent, c("a", "b", "c")) expect_equal(attr(res, "call_args")$consequent, c("a", "b", "c")) expect_equal(attr(res, "call_args")$min_support, 0.0001) expect_equal(attr(res, "call_args")$min_confidence, 0.0001) expect_equal(attr(res, "call_args")$max_results, 2) expect_equal(nrow(res), 2) }) test_that("bug with cache", { d <- partition(mtcars, .breaks = 2) d <- d[, c(1,7,9)] colnames(d) <- c("a", "b", "c") ia <- sum(d$a) / nrow(d) ib <- sum(d$b) / nrow(d) ic <- sum(d$c) / nrow(d) iab <- sum(d$a & d$b) / nrow(d) iac <- sum(d$a & d$c) / nrow(d) ibc <- sum(d$b & d$c) / nrow(d) iabc <- sum(d$a & d$b & d$c) / nrow(d) ex <- data.frame(rule = c("{} => {a}", "{} => {b}", "{} => {c}", "{a} => {b}", "{a} => {c}", "{b} => {a}", "{b} => {c}", "{c} => {a}", "{c} => {b}", "{a,b} => {c}", "{a,c} => {b}", "{b,c} => {a}"), support = c(ia, ib, ic, iab, iac, iab, ibc, iac, ibc, iabc, iabc, iabc)) ex05 <- ex[ex$support >= 0.5, ] ex05 <- ex05[order(ex05$rule), ] # here was the error that itemset "abc" was not added to the cache # because its parent "bc" was not frequent, but later "abc" was searched # in the cache because both parents "ac" and "ab" were frequent res <- dig_associations(d, antecedent = everything(), consequent = everything(), min_support = 0.5) expect_true(is_nugget(res, "associations")) expect_true(is_tibble(res)) res$rule <- paste0(res$antecedent, " => ", res$consequent) res <- res[order(res$rule), c("rule", "support")] attributes(res) <- NULL names(res) <- c("rule", "support") expect_equal(as.list(res), as.list(ex05)) })