# Tests for dfm_subset(): subsetting by docvar expressions, by logical
# vectors, and by the min_ntoken / max_ntoken arguments.

test_that("dfm_subset works in a basic way", {
  # Fixture: inaugural addresses with 1980 < Year < 2018. The expectations
  # below rely on this selection holding 10 documents, 5 of them after 2000.
  dfmat <- dfm(tokens(corpus_subset(
    data_corpus_inaugural,
    Year > 1980 & Year < 2018
  )))

  expect_equal(ndoc(dfm_subset(dfmat, Year > 2000)), 5)

  # drop_docid controls whether the docid factor keeps levels for documents
  # that were removed by the subset.
  expect_length(
    levels(docid(dfm_subset(dfmat, Year > 2000, drop_docid = TRUE))),
    5
  )
  expect_length(
    levels(docid(dfm_subset(dfmat, Year > 2000, drop_docid = FALSE))),
    10
  )

  # Subsetting by a docvar expression.
  expect_equal(
    docnames(dfm_subset(dfmat, President == "Clinton")),
    c("1993-Clinton", "1997-Clinton")
  )

  # Subsetting by a logical vector with one element per document.
  expect_equal(
    docnames(dfm_subset(dfmat, c(TRUE, TRUE, rep(FALSE, 8)))),
    c("1981-Reagan", "1985-Reagan")
  )

  # Unknown arguments are reported with a warning.
  expect_warning(
    dfm_subset(dfmat, Year > 2000, something = 10),
    "something argument is not used.",
    fixed = TRUE
  )
})

test_that("dfm_subset works with docvars", {
  dfmat <- dfm(tokens(corpus_subset(data_corpus_inaugural, Year > 1900)))

  # Docvars of the corpus must still be attached to the dfm, in document order.
  expect_equal(
    docvars(head(dfmat, 5))$President,
    c("McKinley", "Roosevelt", "Taft", "Wilson", "Wilson")
  )
})

test_that("dfm_subset works with min_ntoken and max_ntoken", {
  dfmt <- dfm(tokens(corpus_subset(data_corpus_inaugural, Year > 1900)))

  # Each argument alone must match the equivalent explicit ntoken() filter.
  expect_equal(
    dfm_subset(dfmt, 1000 <= ntoken(dfmt)),
    dfm_subset(dfmt, min_ntoken = 1000)
  )
  expect_equal(
    dfm_subset(dfmt, ntoken(dfmt) <= 3000),
    dfm_subset(dfmt, max_ntoken = 3000)
  )

  # Both bounds combined with a docvar condition. The reference expression
  # must apply the lower AND the upper bound so that max_ntoken is exercised.
  expect_equal(
    dfm_subset(
      dfmt,
      Year > 2000 & 1000 <= ntoken(dfmt) & ntoken(dfmt) <= 3000
    ),
    dfm_subset(dfmt, Year > 2000, min_ntoken = 1000, max_ntoken = 3000)
  )

  # Argument validation: values must be non-negative scalars.
  expect_error(
    dfm_subset(dfmt, min_ntoken = -1),
    "The value of min_ntoken must be between 0 and Inf"
  )
  expect_error(
    dfm_subset(dfmt, min_ntoken = c(10, 20)),
    "The length of min_ntoken must be 1"
  )
  expect_error(
    dfm_subset(dfmt, max_ntoken = -1),
    "The value of max_ntoken must be between 0 and Inf"
  )
  expect_error(
    dfm_subset(dfmt, max_ntoken = c(10, 20)),
    "The length of max_ntoken must be 1"
  )
})