library("quanteda") mt <- corpus_subset(data_corpus_inaugural, Year > 1980 & Year < 2021) %>% tokens() %>% dfm() mt <- dfm_trim(mt, min_termfreq = 10) test_that("y errors if not a dfm", { expect_error( textstat_simil(mt, y = c("mr", "president"), margin = "features"), "y must be a dfm matching x in the margin specified" ) }) test_that("selection takes integer or logical vector", { expect_equivalent(textstat_simil(mt, y = mt[, c(2, 5)], margin = "features"), textstat_simil(mt, y = mt[, c("mr", "president")], margin = "features")) suppressWarnings(expect_equivalent(textstat_simil(mt, y = mt[, c(2, 5)], margin = "features"), textstat_simil(mt, y = mt[, c("mr", "president")], margin = "features"))) l1 <- featnames(mt) %in% c("mr", "president") expect_equivalent(textstat_simil(mt, y = mt[, l1], margin = "features"), textstat_simil(mt, y = mt[, c("mr", "president")], margin = "features")) expect_error(textstat_simil(mt, "xxxx", margin = "features")) expect_error(textstat_simil(mt, 1000, margin = "features")) expect_equivalent(textstat_simil(mt, y = mt[c(2, 4), ], margin = "documents"), textstat_simil(mt, y = mt[c("1985-Reagan", "1993-Clinton"), ], margin = "documents")) l2 <- docnames(mt) %in% c("1985-Reagan", "1993-Clinton") expect_equivalent(textstat_simil(mt, y = mt[l2, ], margin = "documents"), textstat_simil(mt, y = mt[c("1985-Reagan", "1993-Clinton"), ], margin = "documents")) expect_error(textstat_simil(mt, y = "nothing", margin = "documents")) expect_error(textstat_simil(mt, y = 100, margin = "documents")) }) test_that("textstat_simil() returns NA for empty dfm", { skip_if_not_installed("proxy") mt <- dfm_trim(data_dfm_lbgexample, 1000) # # cor is wrong # expect_equivalent( # unclass(as.matrix(textstat_simil(mt, method = "correlation"))), # unclass(cor(t(as.matrix(mt)), method = "pearson")) # ) expect_equivalent( unclass(as.dist(textstat_simil(mt, method = "correlation"))), unclass(proxy::simil(as.matrix(mt), method = "correlation")) ) expect_equivalent( unclass(as.dist(textstat_simil(mt, method = "cosine"))), unclass(proxy::simil(as.matrix(mt), method = "cosine")) ) # proxy::simil is wrong # expect_equivalent( # unclass(as.dist(textstat_simil(mt, method = "jaccard"))), # unclass(as.dist(proxy::simil(as.matrix(mt), method = "jaccard"))) # ) # proxy::simil is wrong # expect_equivalent( # unclass(as.dist(textstat_simil(mt, method = "ejaccard"))), # unclass(as.dist(proxy::simil(as.matrix(mt), method = "ejaccard"))) # ) # proxy::simil is wrong # expect_equivalent( # unclass(as.dist(textstat_simil(mt, method = "dice"))), # unclass(as.dist(proxy::simil(as.matrix(mt), method = "dice"))) # ) # expect_equivalent( # unclass(as.dist(textstat_simil(mt, method = "edice"))), # unclass(as.dist(proxy::simil(as.matrix(mt), method = "edice"))) # ) expect_equivalent( unclass(as.dist(textstat_simil(mt, method = "hamman"))), unclass(proxy::dist(as.matrix(mt), method = "hamman")) ) expect_equivalent( unclass(as.dist(textstat_simil(mt, method = "simple matching"))), unclass(as.dist(proxy::simil(as.matrix(mt), method = "simple matching"))) ) }) test_that("textstat_simil() returns NA for zero-variance documents", { mt <- data_dfm_lbgexample[1:5, 1:20] mt[1:2, ] <- 0 mt[3:4, ] <- 1 mt <- as.dfm(mt) mt_na_all <- matrix(NA, nrow = 5, ncol = 5, dimnames = list(paste0("R", 1:5), paste0("R", 1:5))) mt_na_some <- mt_na_all mt_na_some[3:4, 3:4] <- 1 expect_equivalent( as.matrix(textstat_simil(mt, method = "correlation")), mt_na_all ) expect_equal( as.matrix(textstat_simil(mt, method = "cosine")), mt_na_some ) # proxy::simil is wrong # expect_equivalent( # as.matrix(textstat_simil(mt, method = "jaccard")), # mt_na_some # ) # proxy::simil is wrong # expect_equivalent( # as.matrix(textstat_simil(mt, method = "ejaccard")), # mt_na_some # ) # proxy::simil is wrong # expect_equivalent( # as.matrix(textstat_simil(mt, method = "dice")), # mt_na_some # ) # proxy::simil is wrong # expect_equal( # as.matrix(textstat_simil(mt, method = "edice")), # mt_na_some # ) # proxyC::simil is wrong (#44) # expect_equal( # as.matrix(textstat_simil(mt, method = "hamman")), # mt_na_some # ) # proxy::simil is wrong # expect_equal( # as.matrix(textstat_simil(mt, method = "simple matching")), # mt_na_some # ) }) test_that("selection is always on columns (#1549)", { mt <- dfm(tokens( corpus_subset(data_corpus_inaugural, Year > 1980) )) suppressWarnings(expect_equal( textstat_simil(mt, margin = "documents", selection = c("1985-Reagan", "1989-Bush")) %>% as.matrix() %>% colnames(), c("1985-Reagan", "1989-Bush") )) suppressWarnings(expect_equal( textstat_simil(mt, margin = "documents", selection = c(2, 3)) %>% as.matrix() %>% colnames(), c("1985-Reagan", "1989-Bush") )) suppressWarnings(expect_equal( textstat_simil(mt, margin = "features", selection = c("justice", "and")) %>% as.matrix() %>% colnames(), c("justice", "and") )) suppressWarnings(expect_equal( textstat_simil(mt, margin = "features", selection = c(4, 6)) %>% as.matrix() %>% colnames(), c("mr", "chief") )) }) test_that("all similarities are between 0 and 1", { methods <- c("correlation", "cosine", "jaccard", "ejaccard", "dice", "edice", "hamann", "simple matching") for (m in methods) { minmax <- range(textstat_simil(mt, method = m, margin = "documents")) tol <- .000001 expect_gte(minmax[1], 0) expect_lte(minmax[2], 1.0 + tol) } }) test_that("textstat_simil is stable across repetitions", { res <- textstat_simil(mt, y = mt[c(2, 4), ], margin = "documents") set.seed(10) resv <- list() for (i in 1:100) { resv[[i]] <- as.matrix(textstat_simil(mt, y = mt[2, ], margin = "documents")) } rescols <- do.call(cbind, resv) expect_true(all(apply(rescols, 1, sd) == 0)) }) test_that("textstat_simil coercion methods work with options", { mt2 <- mt[6:10, ] # upper = TRUE, diag = TRUE tstat <- textstat_simil(mt2, margin = "documents") expect_equal( nrow(as.data.frame(tstat, diag = TRUE, upper = TRUE)), nrow(mt2) ^ 2 ) mat <- as.matrix(tstat) expect_equal(dim(mat), c(ndoc(mt2), ndoc(mt2))) # in matrix, diagonal is 1.0 iden <- rep(1, ndoc(mt2)); names(iden) <- docnames(mt2) expect_equal(diag(mat), iden) lis <- as.list(tstat, sort = TRUE, diag = TRUE) lislen <- rep(ndoc(mt2), 5); names(lislen) <- docnames(mt2) expect_equivalent(lengths(lis), rep(ndoc(mt2), ndoc(mt2))) # in list, sorted first item is comparison to itself expect_identical(names(lis), names(sapply(lis, "[[", 1))) expect_equal(iden, sapply(lis, "[[", 1)) # upper = TRUE, diag = FALSE tstat <- textstat_simil(mt2, margin = "documents") expect_equal( nrow(as.data.frame(tstat, upper = TRUE, diag = FALSE)), nrow(mt2) ^ 2 - ndoc(mt2) ) mat <- as.matrix(tstat) expect_equal(dim(mat), c(ndoc(mt2), ndoc(mt2))) # # in matrix, diagonal is NA # iden <- rep(as.numeric(NA), ndoc(mt2)); names(iden) <- docnames(mt2) # expect_equal(diag(as.matrix(tstat)), iden) # in matrix, diagonal is 1.0 iden <- rep(1, ndoc(mt2)) names(iden) <- docnames(mt2) expect_equal(diag(mat), iden) lis <- as.list(tstat, sort = TRUE, diag = FALSE) expect_equivalent(lengths(lis), rep(ndoc(mt2) - 1, ndoc(mt2))) expect_identical(names(lis), names(sapply(lis, "[[", 1))) # in list, item not compared to itself expect_true(all(sapply(seq_along(lis), function(y) ! names(lis[y]) %in% names(y)))) # upper = FALSE, diag = TRUE tstat <- textstat_simil(mt2, margin = "documents") expect_equal( nrow(as.data.frame(tstat, upper = FALSE, diag = TRUE)), (nrow(mt2) ^ 2 - ndoc(mt2)) / 2 + ndoc(mt2) ) mat <- as.matrix(tstat) # expect_true(all(is.na(mat[upper.tri(mat)]))) # in matrix, diagonal is 1.0 iden <- rep(1, ndoc(mt2)); names(iden) <- docnames(mt2) expect_equal(diag(as.matrix(tstat)), iden) # in matrix, lower is NA lis <- as.list(tstat, sort = TRUE, diag = TRUE) lislen <- rep(ndoc(mt2), ndoc(mt2)) names(lislen) <- docnames(mt2) expect_equivalent(lengths(lis), rep(ndoc(mt2), ndoc(mt2))) # in list, sorted first item is comparison to itself expect_identical(names(lis), names(sapply(lis, "[[", 1))) expect_equal(iden, sapply(lis, "[[", 1)) # upper = FALSE, diag = FALSE tstat <- textstat_simil(mt2, margin = "documents") expect_equal( nrow(as.data.frame(tstat, upper = FALSE, diag = FALSE)), (nrow(mt2) ^ 2 - ndoc(mt2)) / 2 ) mat <- as.matrix(tstat) loweranddiag <- upper.tri(mat) diag(loweranddiag) <- TRUE # expect_true(all(is.na(mat[upper.tri(mat)]))) # in matrix, diagonal is 1.0 iden <- rep(1, ndoc(mt2)) names(iden) <- docnames(mt2) expect_equal(diag(mat), iden) lis <- as.list(tstat, sort = TRUE, diag = FALSE) expect_equivalent(lengths(lis), rep(ndoc(mt2) - 1, ndoc(mt2))) # in list, item not compared to itself expect_true(all(sapply(seq_along(lis), function(y) ! names(lis[y]) %in% names(y)))) }) test_that("as.list.texstat_simil() is robust", { expect_error( as.list(textstat_simil(mt), n = 0), "n must be 1 or greater" ) expect_equivalent( lengths(as.list(textstat_simil(mt), n = 2)), rep(2, ndoc(mt)) ) expect_equivalent( lengths(as.list(textstat_simil(mt), n = ndoc(mt) + 20, diag = TRUE)), rep(ndoc(mt), ndoc(mt)) ) expect_warning( as.list(textstat_simil(mt), n = 2, sort = FALSE), "ignoring n when sorted = FALSE" ) }) test_that("as.list.textstat_simil works with features margin", { tstat <- textstat_simil(mt, y = mt[, c("world", "freedom")], method = "cosine", margin = "features") lis <- as.list(tstat, n = 5, diag = FALSE) expect_equal( sapply(lis, head, 1), c("world.today" = 0.952, "freedom.independence" = 0.937), tol = .01 ) expect_identical(names(lis), c("world", "freedom")) tstat <- textstat_simil(mt, y = mt[, "freedom"], method = "cosine", margin = "features") lis <- as.list(tstat, n = 5, diag = TRUE) expect_equal( sapply(lis, head, 1), c("freedom.freedom" = 1) ) }) test_that("as.data.frame.textstat_simildist works with selection", { mt2 <- mt[6:10, ] tstat <- textstat_simil(mt2, y = mt[c("2017-Trump", "2001-Bush"), ], method = "cosine") expect_equal( as.character(as.data.frame(tstat, diag = FALSE, upper = FALSE)$document2), c(rep("2017-Trump", 4), rep("2001-Bush", 4)) ) expect_equal( as.character(as.data.frame(tstat, diag = TRUE, upper = FALSE)$document2), c(rep("2017-Trump", 5), rep("2001-Bush", 5)) ) suppressWarnings(expect_equal( as.character(as.data.frame(tstat, diag = FALSE, upper = TRUE)$document2), c(rep("2017-Trump", 4), rep("2001-Bush", 4)) )) suppressWarnings(expect_equal( as.character(as.data.frame(tstat, diag = TRUE, upper = TRUE)$document2), c(rep("2017-Trump", 5), rep("2001-Bush", 5)) )) expect_warning( as.data.frame(tstat, upper = TRUE), "upper = TRUE has no effect when columns have been selected" ) expect_identical( names(as.data.frame(textstat_simil(mt2, method = "cosine")))[3], "cosine" ) expect_identical( names(as.data.frame(textstat_simil(mt2, method = "correlation")))[3], "correlation" ) expect_identical( names(as.data.frame(textstat_dist(mt2, method = "euclidean")))[3], "euclidean" ) }) test_that("textstat_simil validator works", { expect_error( textstat_simil(data_dfm_lbgexample, min_simil = -1.1), "min_simil must range from -1.0 to 1.0" ) }) test_that("textstat_simil show/head/tail methods work", { # skip("until this is separate from quanteda") expect_output( show(textstat_simil(data_dfm_lbgexample, method = "cosine")), "textstat_simil object;" ) expect_equal( as.matrix(head(textstat_simil(data_dfm_lbgexample, method = "cosine"), n = 3)), as.matrix(textstat_simil(data_dfm_lbgexample, method = "cosine"))[1:3, ] ) expect_equal( as.matrix(tail(textstat_simil(data_dfm_lbgexample, method = "cosine"), n = 3)), as.matrix(textstat_simil(data_dfm_lbgexample, method = "cosine"))[4:6, ] ) }) test_that("min_simil argument works", { tstat <- textstat_simil(mt, method = "cosine", min_simil = 0.98) expect_output( show(tstat), "1.000 0.982 . . ", fixed = TRUE ) expect_equal( as.data.frame(tstat, diag = FALSE, upper = FALSE), data.frame(document1 = factor(c("1981-Reagan"), levels = rownames(tstat)), document2 = factor(c("1985-Reagan"), levels = rownames(tstat)), cosine = c(0.9817)), tol = .0001 ) expect_equal( as.data.frame(tstat, diag = FALSE, upper = TRUE), data.frame(document1 = factor(c("1985-Reagan", "1981-Reagan"), levels = rownames(tstat)), document2 = factor(c("1981-Reagan", "1985-Reagan"), levels = rownames(tstat)), cosine = c(0.9817, 0.9817)), tol = .0001 ) expect_equal( as.list(tstat, diag = FALSE), list("1981-Reagan" = c("1985-Reagan" = 0.981771), "1985-Reagan" = c("1981-Reagan" = 0.981771)), tol = .001 ) expect_equal( sapply(as.list(tstat, diag = TRUE), "[", 1), structure(rep(1, ndoc(mt)), names = paste(docnames(mt), docnames(mt), sep = ".")) ) }) test_that("test that min_simil coercion to matrix works as expected", { library("quanteda") dfmat <- corpus_subset(data_corpus_inaugural, Year > 2000) %>% tokens(remove_punct = TRUE) %>% tokens_remove(stopwords("english")) %>% dfm() tstat1 <- textstat_simil(dfmat, method = "cosine", margin = "documents", min_simil = 0.6) expect_equal( as.matrix(tstat1)[3, 4:5], c("2013-Obama" = 0.6373, "2017-Trump" = NA), tol = .0001 ) expect_equal( as.matrix(tstat1, omitted = 0)[3, 4:5], c("2013-Obama" = 0.6373, "2017-Trump" = 0), tol = .0001 ) tstat2 <- textstat_simil(dfmat, y = dfmat[c("2009-Obama", "2013-Obama"), ], method = "cosine", margin = "documents", min_simil = 0.6) expect_equal( as.matrix(tstat2)[3:5, 1], c("2009-Obama" = 1, "2013-Obama" = 0.6373, "2017-Trump" = NA), tol = .0001 ) expect_equal( as.matrix(tstat2, omitted = 0)[3:5, 1], c("2009-Obama" = 1, "2013-Obama" = 0.6373, "2017-Trump" = 0), tol = .0001 ) }) test_that("y is working in the same way as selection (#1714)", { suppressWarnings({ expect_identical(textstat_simil(mt, selection = c("2009-Obama", "2013-Obama"), margin = "documents"), textstat_simil(mt, mt[c("2009-Obama", "2013-Obama"), ], margin = "documents")) expect_identical(textstat_simil(mt, selection = c("world", "freedom"), margin = "features"), textstat_simil(mt, mt[, c("world", "freedom")], margin = "features")) expect_identical(textstat_dist(mt, selection = c("2009-Obama", "2013-Obama"), margin = "documents"), textstat_dist(mt, mt[c("2009-Obama", "2013-Obama"), ], margin = "documents")) expect_identical(textstat_dist(mt, selection = c("world", "freedom"), margin = "features"), textstat_dist(mt, mt[, c("world", "freedom")], margin = "features")) }) }) test_that("diag2na is working", { mat1 <- Matrix::Matrix(1:9, nrow = 3, dimnames = list(c("a", "b", "c"), c("b", "c", "d"))) expect_equal(as.matrix(quanteda.textstats:::diag2na(as(as(mat1, "TsparseMatrix"), "dgTMatrix"))), matrix(c(1, NA, 3, 4, 5, NA, 7, 8, 9), nrow = 3, dimnames = list(c("a", "b", "c"), c("b", "c", "d")))) mat2 <- Matrix::Matrix(1:9, nrow = 3, dimnames = list(c("a", "b", "c"), c("d", "c", "b"))) expect_equal(as.matrix(quanteda.textstats:::diag2na(as(as(mat2, "TsparseMatrix"), "dgTMatrix"))), matrix(c(1, 2, 3, 4, 5, NA, 7, NA, 9), nrow = 3, dimnames = list(c("a", "b", "c"), c("d", "c", "b")))) mat3 <- Matrix::Matrix(1:6, nrow = 3, dimnames = list(c("a", "b", "c"), c("c", "b"))) expect_equal(as.matrix(quanteda.textstats:::diag2na(as(as(mat3, "TsparseMatrix"), "dgTMatrix"))), matrix(c(1, 2, NA, 4, NA, 6), nrow = 3, dimnames = list(c("a", "b", "c"), c("c", "b")))) mat4 <- Matrix::forceSymmetric(mat1) expect_equal(as.matrix(quanteda.textstats:::diag2na(as(as(mat4, "TsparseMatrix"), "dsTMatrix"))), matrix(c(NA, 4, 7, 4, NA, 8, 7, 8, NA), nrow = 3, dimnames = list(c("b", "c", "d"), c("b", "c", "d")))) mat5 <- Matrix::Matrix(rep(0, 9), nrow = 3, dimnames = list(c("a", "b", "c"), c("b", "c", "d"))) expect_equal(as.matrix(quanteda.textstats:::diag2na(as(as(mat5, "TsparseMatrix"), "generalMatrix"))), matrix(c(0, NA, 0, 0, 0, NA, 0, 0, 0), nrow = 3, dimnames = list(c("a", "b", "c"), c("b", "c", "d")))) # expect_equal(as.matrix(quanteda.textstats:::diag2na(as(as(mat5, "TsparseMatrix"), "dgTMatrix"))), # matrix(c(0, NA, 0, 0, 0, NA, 0, 0, 0), nrow = 3, # dimnames = list(c("a", "b", "c"), c("b", "c", "d")))) }) test_that("symmetric class is correctly given", { # skip("Until these classes are separate from quanteda") dist1 <- textstat_dist(mt) expect_identical( Matrix::tril(dist1), t(Matrix::triu(dist1)) ) dist2 <- textstat_dist(mt, mt) expect_identical( Matrix::tril(dist2), t(Matrix::triu(dist2)) ) siml1 <- textstat_simil(mt) expect_identical( Matrix::tril(siml1), t(Matrix::triu(siml1)) ) siml2 <- textstat_simil(mt, mt) expect_identical( Matrix::tril(siml2), t(Matrix::triu(siml2)) ) }) test_that("as.data.frame works with subsetted object", { levs <- c(paste0("R", 1:5), "V1") simildf <- textstat_simil(data_dfm_lbgexample[-1, ], data_dfm_lbgexample[1, ]) %>% as.data.frame() expect_equal( simildf, data.frame(document1 = factor(levs[-1], levels = levs), document2 = factor(rep("R1", 5), levels = levs), correlation = c(0.18, -0.29, -0.32, -0.32, -0.12)), tol = .01 ) expect_identical(levels(simildf$document1), levels(simildf$document1)) simildf <- textstat_simil(data_dfm_lbgexample, data_dfm_lbgexample[c(1, 3), ]) %>% as.data.frame() levs2 <- levs[c(1, 3, 2, 4:6)] expect_identical( simildf[, -3], data.frame(document1 = factor(c(levs[-1], levs[-3]), levels = levs2), document2 = factor(c(rep("R1", 5), c(rep("R3", 5))), levels = levs2)) ) expect_identical(levels(simildf$document1), levels(simildf$document1)) }) test_that("hamman still works", { expect_identical( textstat_simil(data_dfm_lbgexample, method = "hamman"), textstat_simil(data_dfm_lbgexample, method = "hamann") ) })