txt <- c(doc1 = "a B c D e", doc2 = "a BBB c D e", doc3 = "Aaaa BBB cc") toks <- tokens(txt) testdfm <- dfm(toks, tolower = FALSE) test_that("test dfm_select, fixed", { expect_equal( featnames(dfm_select(testdfm, c("a", "b", "c"), selection = "keep", valuetype = "fixed", verbose = FALSE)), c("a", "B", "c") ) expect_equal( featnames(dfm_select(testdfm, c("a", "b", "c"), selection = "remove", valuetype = "fixed", verbose = FALSE)), setdiff(featnames(testdfm), c("a", "B", "c")) ) expect_equal( featnames(dfm_select(testdfm, c("a", "b", "c"), selection = "keep", valuetype = "fixed", case_insensitive = FALSE, verbose = FALSE)), c("a", "c") ) expect_equal( featnames(dfm_select(testdfm, c("a", "b", "c"), selection = "remove", valuetype = "fixed", case_insensitive = FALSE, verbose = FALSE)), setdiff(featnames(testdfm), c("a", "c")) ) expect_equal( featnames(dfm_select(testdfm, c("aaaa", "bbb", "cc"), selection = "keep", valuetype = "fixed", min_nchar = 3, verbose = FALSE)), c("BBB", "Aaaa") ) expect_equal( featnames(dfm_select(testdfm, c("bbb"), selection = "remove", valuetype = "fixed", min_nchar = 3, verbose = FALSE)), c("Aaaa") ) expect_equal( featnames(dfm_select(testdfm, c("aaaa", "bbb", "cc"), selection = "keep", valuetype = "fixed", min_nchar = 3, max_nchar = 3, verbose = FALSE)), c("BBB") ) expect_equal( featnames(dfm_select(testdfm, c("bbb"), selection = "remove", valuetype = "fixed", min_nchar = 3, max_nchar = 3, verbose = FALSE)), character() ) }) test_that("test dfm_select, glob", { feats <- c("a*", "B*", "c") expect_equal( featnames(dfm_select(testdfm, feats, selection = "keep", valuetype = "glob", verbose = FALSE)), c("a", "B", "c", "BBB", "Aaaa") ) expect_equal( featnames(dfm_select(testdfm, feats, selection = "remove", valuetype = "glob", verbose = FALSE)), c("D", "e", "cc") ) expect_equal( featnames(dfm_select(testdfm, feats, selection = "keep", valuetype = "glob", case_insensitive = FALSE, verbose = FALSE)), c("a", "B", "c", "BBB") ) expect_equal( featnames(dfm_select(testdfm, feats, selection = "remove", valuetype = "glob", case_insensitive = FALSE, verbose = FALSE)), c("D", "e", "Aaaa", "cc") ) expect_equal( featnames(dfm_select(testdfm, feats, selection = "keep", valuetype = "glob", min_nchar = 3, verbose = FALSE)), c("BBB", "Aaaa") ) expect_equal( featnames(dfm_select(testdfm, feats, selection = "remove", valuetype = "glob", min_nchar = 3, verbose = FALSE)), character() ) }) test_that("test dfm_select, regex", { feats <- c("[A-Z].*", "c.+") expect_equal( featnames(dfm_select(testdfm, feats, selection = "keep", valuetype = "regex", verbose = FALSE)), c("a", "B", "c", "D", "e", "BBB", "Aaaa", "cc") ) expect_equal( featnames(dfm_select(testdfm, feats, selection = "remove", valuetype = "regex", verbose = FALSE)), character() ) expect_equal( featnames(dfm_select(testdfm, feats, selection = "keep", valuetype = "regex", case_insensitive = FALSE, verbose = FALSE)), c("B", "D", "BBB", "Aaaa", "cc") ) expect_equal( featnames(dfm_select(testdfm, feats, selection = "remove", valuetype = "regex", case_insensitive = FALSE, verbose = FALSE)), c("a", "c", "e") ) expect_equal( featnames(dfm_select(testdfm, feats, selection = "keep", valuetype = "regex", min_nchar = 3, verbose = FALSE)), c("BBB", "Aaaa") ) expect_equal( featnames(dfm_select(testdfm, feats, selection = "remove", valuetype = "regex", min_nchar = 3, verbose = FALSE)), character() ) }) test_that("glob works if results in no features", { expect_equal(featnames(dfm_select(testdfm, "notthere")), character()) }) test_that("selection that is out of bounds", { expect_equal(dfm_select(testdfm), testdfm) expect_equal( featnames(dfm_select(testdfm, selection = "keep", min_nchar = 5)), character() ) expect_equal( featnames(dfm_select(testdfm, selection = "remove", min_nchar = 5)), character() ) }) test_that("longer selection than longer than features that exist (related to #447)", { dfmtest <- dfm(tokens(c(d1 = "a b", d2 = "a b c d e"))) feat <- c("b", "c", "d", "e", "f", "g") expect_message(dfm_select(dfmtest, feat, verbose = TRUE), "kept 4 features") expect_message(dfm_remove(dfmtest, feat, verbose = TRUE), "removed 4 features") expect_equivalent( as.matrix(dfm_select(dfmtest, feat)), matrix(c(1, 1, 0, 1, 0, 1, 0, 1), nrow = 2) ) }) test_that("test dfm_select with ngrams #589", { ngramdfm <- dfm(tokens(c("of_the", "in_the", "to_the", "of_our", "and_the", " it_is", "by_the", "for_the"))) expect_equal(featnames(dfm_select(ngramdfm, pattern = c("of_the", "in_the"), valuetype = "fixed")), c("of_the", "in_the")) expect_equal(featnames(dfm_select(ngramdfm, pattern = "*_the", valuetype = "glob")), c("of_the", "in_the", "to_the", "and_the", "by_the", "for_the")) }) test_that("test dfm_select with ngrams concatenated with whitespace", { ngramdfm <- dfm(tokens(c("of_the", "in_the", "to_the", "of_our", "and_the", " it_is", "by_the", "for_the"))) colnames(ngramdfm) <- stringi::stri_replace_all_fixed(colnames(ngramdfm), "_", " ") expect_equal( featnames(dfm_select(ngramdfm, pattern = c("of the", "in the"), valuetype = "fixed")), c("of the", "in the") ) expect_equal( featnames(dfm_select(ngramdfm, pattern = "* the", valuetype = "glob")), c("of the", "in the", "to the", "and the", "by the", "for the") ) }) test_that("dfm_select on a dfm returns equal feature sets", { txts <- c(d1 = "This is text one", d2 = "The second text", d3 = "This is text three") dfmt1 <- dfm(tokens(txts[1:2])) dfmt2 <- dfm(tokens(txts[2:3])) expect_error({ dfmt3 <- dfm_select(dfmt1, dfmt2) }, "The `pattern` argument of `dfm_select()` cannot be a dfm", fixed = TRUE) }) test_that("dfm_select removes padding", { txts <- c(d1 = "This is text one", d2 = "The second text", d3 = "This is text three") toks <- tokens(txts) toks <- tokens_remove(toks, stopwords(), padding = TRUE) testdfm <- dfm(toks) expect_true("" %in% featnames(testdfm)) testdfm <- dfm_remove(dfm(toks), "") expect_false("" %in% featnames(testdfm)) }) # test_that("dfm_select raises warning when padding = TRUE but not valuetype = fixed", { # # expect_warning(dfm_select(testdfm, c("z", "d", "e"), padding = TRUE), # "padding is used only when valuetype is 'fixed'") # # }) test_that("dfm_select returns empty dfm when not maching features", { expect_equal(dim(dfm_select(testdfm, pattern = c("x", "y", "z"))), c(3, 0)) }) test_that("dfm_remove works even when it does not remove anything, issue 711", { txts <- c(d1 = "This is text one", d2 = "The second text", d3 = "This is text three") testdfm <- dfm(tokens(txts)) expect_silent(dfm_remove(testdfm, c("xxx", "yyy", "x y"))) expect_equal(featnames(dfm_remove(testdfm, c("xxx", "yyy", "x y"))), featnames(testdfm)) }) test_that("dfm_select errors when dictionary has multi-word features, issue 775", { dfm_inaug <- dfm(tokens(data_corpus_inaugural[50:58])) testdict1 <- dictionary(list(eco = c("compan*", "factory worker*"), pol = c("political part*", "election*")), separator = " ") testdict2 <- dictionary(list(eco = c("compan*", "factory_worker"), pol = c("political_part*", "election*")), separator = "_") expect_equal( featnames(dfm_select(dfm_inaug, pattern = testdict1, valuetype = "glob")), c("election", "elections", "company", "companies") ) expect_equal( featnames(dfm_select(dfm_inaug, pattern = phrase(testdict1), valuetype = "glob")), c("political", "election", "part", "parties", "elections", "partisan", "company", "participation", "party", "partisanship", "partial", "companies") ) expect_equal( featnames(dfm_select(dfm_inaug, pattern = testdict2, valuetype = "glob")), c("election", "elections", "company", "companies") ) expect_equal( featnames(dfm_select(dfm_inaug, pattern = phrase(testdict2), valuetype = "glob")), c("political", "election", "part", "parties", "elections", "partisan", "company", "participation", "party", "partisanship", "partial", "companies") ) }) # test_that("dfm_select works when selecting on collocations", { # txt <- c(d1 = "a b c d e g h", d2 = "a b e g h i j") # toks_uni <- tokens(txt) # dfm_uni <- dfm(toks_uni) # toks_bi <- tokens(txt) |> tokens_ngrams(n = 2, concatenator = " ") # dfm_bi <- dfm(toks_bi) # coll_bi <- textstat_collocations(toks_uni, size = 2, min_count = 2) # coll_tri <- textstat_collocations(toks_uni, size = 3, min_count = 2) # # expect_equal( # dim(dfm_select(dfm_uni, coll_bi)), # c(2, 0) # ) # expect_equal( # dim(dfm_select(dfm_uni, coll_tri)), # c(2, 0) # ) # # expect_equal(sum(dfm_select(dfm_bi, coll_bi)), 6) # expect_equal(featnames(dfm_select(dfm_bi, coll_bi)), c("a b", "e g", "g h")) # # # wrong # expect_equal(dim(dfm_select(dfm_bi, coll_tri)), c(2, 0)) # expect_equal(featnames(dfm_select(dfm_bi, coll_tri)), character()) # }) test_that("shortcut functions works", { testdfm <- dfm(tokens(data_corpus_inaugural[1:5])) expect_equal(dfm_select(testdfm, stopwords("english"), selection = "keep"), dfm_keep(testdfm, stopwords("english"))) expect_equal(dfm_select(testdfm, stopwords("english"), selection = "remove"), dfm_remove(testdfm, stopwords("english"))) }) test_that("dfm_remove/keep fail if selection argument is used", { dfmt <- dfm(tokens(c("a b c d d", "a a b c d"))) expect_error( dfm_remove(dfmt, c("b", "c"), selection = "remove"), "dfm_remove cannot include selection argument" ) expect_error( dfm_keep(dfmt, c("b", "c"), selection = "keep"), "dfm_keep cannot include selection argument" ) }) test_that("dfm_remove works when selection is a dfm (#1320)", { d1 <- dfm(tokens("a b b c c c d d d d")) d2 <- dfm(tokens("d d d a a")) expect_error({ d3 <- dfm_remove(d1, pattern = d2) }, "The `pattern` argument of `dfm_select()` cannot be a dfm", fixed = TRUE) expect_error({ d4 <- dfm_select(d1, pattern = d2, selection = "remove") }, "The `pattern` argument of `dfm_select()` cannot be a dfm", fixed = TRUE) }) test_that("really long words are not removed in tokens() (#1713)", { dfmat <- dfm(tokens("one two DonaudampfschiffahrtselektrizittenhauptbetriebswerkbauunterbeamtengesellschaftXXX")) expect_equivalent(nfeat(dfmat), 3) }) test_that("padding in dfm_select works in the same way as tokens_select (#2152)", { corp <- corpus(c("a b c d d", "a a b c d"), docvars = data.frame(var1 = c(1, 2), var2 = c(TRUE, FALSE))) toks1 <- tokens(corp) dfmt1 <- dfm(toks1) expect_equal( as.vector(dfm_remove(dfmt1, c("a", "b"), padding = TRUE)[,1]), c(2, 3) ) expect_equal( dim(dfm_remove(dfmt1, c("a", "b"), padding = TRUE)), c(2, 3) ) expect_equal( dim(dfm_remove(dfmt1, c("z"), padding = TRUE)), c(2, 4) ) expect_equal(dfm(tokens_select(toks1, "d", padding = TRUE)), dfm_select(dfmt1, "d", padding = TRUE)) expect_equal(dfm(tokens_select(toks1, "z", padding = TRUE)), dfm_select(dfmt1, "z", padding = TRUE)) expect_equal(dfm(tokens_remove(toks1, "d", padding = TRUE)), dfm_remove(dfmt1, "d", padding = TRUE)) expect_equal(dfm(tokens_remove(toks1, "z", padding = TRUE)), dfm_remove(dfmt1, "z", padding = TRUE)) # objects that already have padding toks2 <- tokens_remove(toks1, "c", padding = TRUE) dfmt2 <- dfm(toks2, remove_padding = FALSE) expect_equal( as.vector(dfm_remove(dfmt2, c("a", "b"), padding = TRUE)[,1]), c(3, 4) ) expect_equal( dim(dfm_remove(dfmt2, c("a", "b"), padding = TRUE)), c(2, 2) ) expect_equal( dim(dfm_remove(dfmt2, c("z"), padding = TRUE)), c(2, 4) ) expect_equal(dfm(tokens_select(toks2, "d", padding = TRUE)), dfm_select(dfmt2, "d", padding = TRUE)) expect_equal(dfm(tokens_select(toks2, "z", padding = TRUE)), dfm_select(dfmt2, "z", padding = TRUE)) expect_equal(dfm(tokens_remove(toks2, "d", padding = TRUE)), dfm_remove(dfmt2, "d", padding = TRUE)) expect_equal(dfm(tokens_remove(toks2, "z", padding = TRUE)), dfm_remove(dfmt2, "z", padding = TRUE)) })