# Unit tests for dfm_trim(): feature trimming by term/document frequency
# thresholds expressed as counts, proportions, quantiles or ranks.

test_that("dfm_trim", {
  # Fixture frequencies, read off the three documents below (assuming each
  # whitespace-separated letter becomes one feature):
  #   term frequency: a = 3, b = 4, c = 2, d = 1, e = 4, f = 4 (total 18)
  #   doc frequency:  a = 2, b = 3, c = 2, d = 1, e = 3, f = 2
  mydfm <- dfm(tokens(c(d1 = "a b c d e",
                        d2 = "a a b b e f",
                        d3 = "b c e e f f f")))
  s <- sum(mydfm)

  # Each group states the same threshold as a quantile, a proportion and a
  # raw count; all three are expected to keep the same number of features.
  expect_equal(
    nfeat(dfm_trim(mydfm, min_termfreq = 0.5, termfreq_type = "quantile")),
    4
  )
  expect_equal(
    nfeat(dfm_trim(mydfm, min_termfreq = 3 / s, termfreq_type = "prop")),
    4
  )
  expect_equal(nfeat(dfm_trim(mydfm, min_termfreq = 3)), 4)

  expect_equal(
    nfeat(dfm_trim(mydfm, max_termfreq = 0.8, termfreq_type = "quantile")),
    6
  )
  expect_equal(
    nfeat(dfm_trim(mydfm, max_termfreq = 4 / s, termfreq_type = "prop")),
    6
  )
  expect_equal(nfeat(dfm_trim(mydfm, max_termfreq = 4)), 6)

  expect_equal(
    nfeat(dfm_trim(mydfm, min_docfreq = 0.5, docfreq_type = "quantile")),
    5
  )
  expect_equal(
    nfeat(dfm_trim(mydfm, min_docfreq = 2 / 3, docfreq_type = "prop")),
    5
  )
  expect_equal(nfeat(dfm_trim(mydfm, min_docfreq = 2)), 5)

  expect_equal(
    nfeat(dfm_trim(mydfm, max_docfreq = 0.5, docfreq_type = "quantile")),
    4
  )
  expect_equal(
    nfeat(dfm_trim(mydfm, max_docfreq = 2 / 3, docfreq_type = "prop")),
    4
  )
  expect_equal(nfeat(dfm_trim(mydfm, max_docfreq = 2)), 4)

  # Rank-based thresholds, each paired with the count threshold that is
  # expected to keep the same number of features.
  expect_equal(
    nfeat(dfm_trim(mydfm, min_termfreq = 2, termfreq_type = "rank")),
    3
  )
  expect_equal(nfeat(dfm_trim(mydfm, min_termfreq = 4)), 3)

  expect_equal(
    nfeat(dfm_trim(mydfm, max_termfreq = 4, termfreq_type = "rank")),
    3
  )
  expect_equal(nfeat(dfm_trim(mydfm, max_termfreq = 3)), 3)

  expect_equal(
    nfeat(dfm_trim(mydfm, min_docfreq = 1, docfreq_type = "rank")),
    2
  )
  expect_equal(nfeat(dfm_trim(mydfm, min_docfreq = 3)), 2)

  expect_equal(
    nfeat(dfm_trim(mydfm, max_docfreq = 2, docfreq_type = "rank")),
    4
  )
  expect_equal(nfeat(dfm_trim(mydfm, max_docfreq = 2)), 4)
})

# Verbose output when trimming with minimum thresholds: each expectation
# re-runs the same call and matches one fragment of the emitted messages.
test_that("dfm_trim works as expected", {
  mydfm <- dfm(tokens(c("This is a sentence.",
                        "This is a second sentence.",
                        "Third sentence.",
                        "Fouth sentence.",
                        "Fifth sentence.")))
  expect_message(
    dfm_trim(mydfm, min_termfreq = 2, min_docfreq = 2, verbose = TRUE),
    regexp = "Removing features occurring:"
  )
  expect_message(
    dfm_trim(mydfm, min_termfreq = 2, min_docfreq = 2, verbose = TRUE),
    regexp = "fewer than 2 times: 4"
  )
  expect_message(
    dfm_trim(mydfm, min_termfreq = 2, min_docfreq = 2, verbose = TRUE),
    regexp = "in fewer than 2 documents: 4"
  )
  expect_message(
    dfm_trim(mydfm, min_termfreq = 2, min_docfreq = 2, verbose = TRUE),
    regexp = " Total features removed: 4 \\(44.4%\\)."
  )
})

# Verbose output when trimming with maximum thresholds, including the case
# where the thresholds are loose enough that nothing is removed.
test_that("dfm_trim works as expected", {
  mydfm <- dfm(tokens(c("This is a sentence.",
                        "This is a second sentence.",
                        "Third sentence.",
                        "Fouth sentence.",
                        "Fifth sentence.")))
  expect_message(
    dfm_trim(mydfm, max_termfreq = 2, max_docfreq = 2, verbose = TRUE),
    regexp = "more than 2 times: 2"
  )
  expect_message(
    dfm_trim(mydfm, max_termfreq = 2, max_docfreq = 2, verbose = TRUE),
    regexp = "in more than 2 documents: 2"
  )
  expect_message(
    dfm_trim(mydfm, max_termfreq = 5, max_docfreq = 5, verbose = TRUE),
    regexp = "No features removed."
  )
})

test_that("dfm_trim works without trimming arguments #509", {
  mydfm <- dfm(tokens(c("This is a sentence.",
                        "This is a second sentence.",
                        "Third sentence.")))
  # Dropping the second document keeps all 7 feature columns; dfm_trim()
  # called with no thresholds is then expected to return 6 of them.
  expect_equal(dim(mydfm[-2, ]), c(2, 7))
  expect_equal(dim(dfm_trim(mydfm[-2, ], verbose = FALSE)), c(2, 6))
})

test_that("dfm_trim doesn't break because of duplicated feature names (#829)", {
  mydfm <- dfm(tokens(c(d1 = "a b c d e",
                        d2 = "a a b b e f",
                        d3 = "b c e e f f f")))
  # Rename the third column so that two columns share the name "b".
  colnames(mydfm)[3] <- "b"
  doc_names <- c("d1", "d2", "d3")

  expect_equal(
    as.matrix(dfm_trim(mydfm, min_termfreq = 1)),
    matrix(
      c(1, 1, 1, 1, 1, 0,
        2, 2, 0, 0, 1, 1,
        0, 1, 1, 0, 2, 3),
      byrow = TRUE, nrow = 3,
      dimnames = list(docs = doc_names,
                      features = letters[c(1, 2, 2, 4:6)])
    )
  )
  expect_equal(
    as.matrix(dfm_trim(mydfm, min_termfreq = 2)),
    matrix(
      c(1, 1, 1, 1, 0,
        2, 2, 0, 1, 1,
        0, 1, 1, 2, 3),
      byrow = TRUE, nrow = 3,
      dimnames = list(docs = doc_names,
                      features = letters[c(1, 2, 2, 5:6)])
    )
  )
  expect_equal(
    as.matrix(dfm_trim(mydfm, min_termfreq = 3)),
    matrix(
      c(1, 1, 1, 0,
        2, 2, 1, 1,
        0, 1, 2, 3),
      byrow = TRUE, nrow = 3,
      dimnames = list(docs = doc_names,
                      features = letters[c(1, 2, 5:6)])
    )
  )
})

test_that("dfm_trim works with min_termfreq larger than total number (#1181)", {
  testdfm <- dfm(tokens(c(d1 = "a a a a b b", d2 = "a b b c")))
  # Thresholds that no feature can satisfy should leave both documents in
  # place with an empty feature set.
  expect_equal(
    dimnames(dfm_trim(testdfm, min_termfreq = 6)),
    list(docs = c("d1", "d2"), features = character())
  )
  expect_equal(
    dimnames(dfm_trim(testdfm, min_docfreq = 3)),
    list(docs = c("d1", "d2"), features = character())
  )
})

test_that("dfm_trim works on previously weighted dfms (#1237)", {
  dfm1 <- dfm(tokens(c("the quick brown fox jumps over the lazy dog",
                       "the quick brown foxy ox jumps over the lazy god")))
  dfm2 <- dfm_tfidf(dfm1)

  # The expected matrix is filled column-wise: "fox" and "dog" are non-zero
  # only in text1; "foxy", "ox" and "god" only in text2.
  expect_equal(
    suppressWarnings(
      as.matrix(dfm_trim(dfm2, min_termfreq = 0, min_docfreq = .5,
                         termfreq_type = "prop", docfreq_type = "prop"))
    ),
    matrix(
      c(.30103, 0, .30103, 0, 0, .30103, 0, .30103, 0, .30103),
      nrow = 2,
      dimnames = list(docs = c("text1", "text2"),
                      features = c("fox", "dog", "foxy", "ox", "god"))
    ),
    tolerance = .0001
  )

  # NOTE(review): the outer suppressWarnings() presumably silences any extra
  # warnings that expect_warning() does not itself capture -- confirm.
  suppressWarnings(expect_warning(
    dfm_trim(dfm2, min_docfreq = .5, docfreq_type = "prop"),
    "dfm has been previously weighted"
  ))
  suppressWarnings(expect_warning(
    dfm_trim(dfm2, min_termfreq = 1, min_docfreq = .5, docfreq_type = "prop"),
    "dfm has been previously weighted"
  ))
  expect_equal(
    dim(suppressWarnings(
      dfm_trim(dfm2, min_termfreq = 1, min_docfreq = .5, docfreq_type = "prop")
    )),
    c(2, 0)
  )
})

# Each threshold argument is probed with one out-of-range value per
# frequency type, checking the bounds quoted in the error message.
test_that("dfm_trim error with invalid input", {
  dfmat <- dfm(tokens(c("the quick brown fox jumps over the lazy dog",
                        "the quick brown foxy ox jumps over the lazy god")))

  # min_termfreq
  expect_error(
    dfm_trim(dfmat, min_termfreq = -1, termfreq_type = "count"),
    "The value of min_termfreq must be between 0 and Inf"
  )
  expect_error(
    dfm_trim(dfmat, min_termfreq = 1.1, termfreq_type = "prop"),
    "The value of min_termfreq must be between 0 and 1"
  )
  expect_error(
    dfm_trim(dfmat, min_termfreq = 1.1, termfreq_type = "quantile"),
    "The value of min_termfreq must be between 0 and 1"
  )
  expect_error(
    dfm_trim(dfmat, min_termfreq = 0, termfreq_type = "rank"),
    "The value of min_termfreq must be between 1 and Inf"
  )

  # max_termfreq
  expect_error(
    dfm_trim(dfmat, max_termfreq = -1, termfreq_type = "count"),
    "The value of max_termfreq must be between 0 and Inf"
  )
  expect_error(
    dfm_trim(dfmat, max_termfreq = 1.1, termfreq_type = "prop"),
    "The value of max_termfreq must be between 0 and 1"
  )
  expect_error(
    dfm_trim(dfmat, max_termfreq = 1.1, termfreq_type = "quantile"),
    "The value of max_termfreq must be between 0 and 1"
  )
  expect_error(
    dfm_trim(dfmat, max_termfreq = 0, termfreq_type = "rank"),
    "The value of max_termfreq must be between 1 and Inf"
  )

  # min_docfreq
  expect_error(
    dfm_trim(dfmat, min_docfreq = -1, docfreq_type = "count"),
    "The value of min_docfreq must be between 0 and Inf"
  )
  expect_error(
    dfm_trim(dfmat, min_docfreq = 1.1, docfreq_type = "prop"),
    "The value of min_docfreq must be between 0 and 1"
  )
  expect_error(
    dfm_trim(dfmat, min_docfreq = 1.1, docfreq_type = "quantile"),
    "The value of min_docfreq must be between 0 and 1"
  )
  expect_error(
    dfm_trim(dfmat, min_docfreq = 0, docfreq_type = "rank"),
    "The value of min_docfreq must be between 1 and Inf"
  )

  # max_docfreq
  expect_error(
    dfm_trim(dfmat, max_docfreq = -1, docfreq_type = "count"),
    "The value of max_docfreq must be between 0 and Inf"
  )
  expect_error(
    dfm_trim(dfmat, max_docfreq = 1.1, docfreq_type = "prop"),
    "The value of max_docfreq must be between 0 and 1"
  )
  expect_error(
    dfm_trim(dfmat, max_docfreq = 1.1, docfreq_type = "quantile"),
    "The value of max_docfreq must be between 0 and 1"
  )
  expect_error(
    dfm_trim(dfmat, max_docfreq = 0, docfreq_type = "rank"),
    "The value of max_docfreq must be between 1 and Inf"
  )
})