test_that("test dfm_lookup, issue #389", { toks <- tokens(data_corpus_inaugural[1:5]) dict <- dictionary(list(Country = "united states", HOR = c("House of Re*"), law = c("law*", "constitution"), freedom = c("free*", "libert*"))) expect_equal(featnames(dfm(tokens_lookup(toks, dictionary = dict), tolower = FALSE)), c("Country", "HOR", "law", "freedom")) # expect_error(dfm_lookup(dfm(toks), dictionary = dict), # "dfm_lookup not implemented for ngrams > 1 and multi-word dictionary values") dict2 <- dictionary(list(Country = "united", HOR = c("House"), law = c("law*", "constitution"), freedom = c("free*", "libert*"))) expect_equal( as.numeric(dfm_lookup(dfm(toks), dictionary = dict2)[, "Country"]), c(4, 1, 3, 0, 1) ) }) test_that("#459 apply a hierarchical dictionary to a dfm", { txt <- c(d1 = "The United States is bordered by the Atlantic Ocean and the Pacific Ocean.", d2 = "The Supreme Court of the United States is seldom in a united state.") testdfm <- dfm(tokens(txt)) dict <- dictionary(list("geo" = list( Countries = c("States"), oceans = c("Atlantic", "Pacific")), "other" = list( gameconsoles = c("Xbox", "Nintendo"), swords = c("States")))) expect_equal( as.matrix(dfm_lookup(testdfm, dict, valuetype = "fixed", levels = 1)), matrix(c(3, 1, 1, 1), ncol = 2, dimnames = list(docs = c("d1", "d2"), features = c("geo", "other"))) ) expect_equal( as.matrix(dfm_lookup(testdfm, dict, valuetype = "fixed", levels = 1:2)), matrix(c(1, 1, 2, 0, 0, 0, 1, 1), ncol = 4, dimnames = list(docs = c("d1", "d2"), features = c("geo.Countries", "geo.oceans", "other.gameconsoles", "other.swords"))) ) expect_equal( as.matrix(dfm_lookup(testdfm, dict, valuetype = "fixed", levels = 2)), matrix(c(1, 1, 2, 0, 0, 0, 1, 1), ncol = 4, dimnames = list(docs = c("d1", "d2"), features = c("Countries", "oceans", "gameconsoles", "swords"))) ) }) test_that("#459 extract the lower levels of a dictionary using a dfm", { txt <- c(d1 = "The United States has the Atlantic Ocean and the Pacific Ocean.", d2 = "Britain and Ireland have the Irish Sea and the English Channel.") dict <- dictionary(list("US" = list( Countries = c("States"), oceans = c("Atlantic", "Pacific")), "Europe" = list( Countries = c("Britain", "Ireland"), oceans = list(west = "Sea", east = "Channel")))) testdfm <- dfm(tokens(txt)) expect_equal(as.matrix(dfm_lookup(testdfm, dict, levels = 1)), matrix(c(3, 0, 0, 4), nrow = 2, dimnames = list(docs = c("d1", "d2"), features = c("US", "Europe")))) expect_equal(as.matrix(dfm_lookup(testdfm, dict, levels = 2)), matrix(c(1, 2, 2, 2), nrow = 2, dimnames = list(docs = c("d1", "d2"), features = c("Countries", "oceans")))) expect_equal(as.matrix(dfm_lookup(testdfm, dict, levels = 1:2)), matrix(c(1, 0, 2, 0, 0, 2, 0, 2), nrow = 2, dimnames = list(docs = c("d1", "d2"), features = c("US.Countries", "US.oceans", "Europe.Countries", "Europe.oceans")))) expect_equal(as.matrix(dfm_lookup(testdfm, dict, levels = 3)), matrix(c(0, 1, 0, 1), nrow = 2, dimnames = list(docs = c("d1", "d2"), features = c("west", "east")))) expect_equal(as.matrix(dfm_lookup(testdfm, dict, levels = c(1, 3))), matrix(c(3, 0, 0, 2, 0, 1, 0, 1), nrow = 2, dimnames = list(docs = c("d1", "d2"), features = c("US", "Europe", "Europe.west", "Europe.east")))) expect_equal(as.matrix(dfm_lookup(testdfm, dict, levels = c(2, 3))), matrix(c(1, 2, 2, 0, 0, 1, 0, 1), nrow = 2, dimnames = list(docs = c("d1", "d2"), features = c("Countries", "oceans", "oceans.west", "oceans.east")))) expect_equal(as.matrix(dfm_lookup(testdfm, dict, levels = c(1, 4))), matrix(c(3, 0, 0, 4), nrow = 2, dimnames = list(docs = c("d1", "d2"), features = c("US", "Europe")))) expect_equal(as.matrix(dfm_lookup(testdfm, dict, levels = 4)), matrix(numeric(), nrow = 2, ncol = 0, dimnames = list(docs = c("d1", "d2"), features = NULL))) }) test_that("dfm_lookup raises error when dictionary has multi-word entries", { toks <- tokens(data_corpus_inaugural[1:5]) dict <- dictionary(list(Country = "united states"), separator = " ") expect_equal( featnames(dfm_lookup(dfm(tokens_ngrams(toks, n = 2, concatenator = " ")), dictionary = dict)), c("Country") ) }) test_that("dfm_lookup works with multi-word keys, issue #704", { dict <- dictionary(list("en" = list("foreign policy" = "foreign", "domestic politics" = "domestic"))) testdfm <- dfm(tokens(data_corpus_inaugural[1:5])) expect_equal(featnames(dfm_lookup(testdfm, dict)), c("en.foreign policy", "en.domestic politics")) }) test_that("dfm_lookup return dfm even if no matches, issue #704", { dict <- dictionary(list("en" = list("foreign policy" = "aaaaa", "domestic politics" = "bbbbb"))) dfmt <- dfm(tokens(data_corpus_inaugural[1:5])) expect_identical( featnames(dfm_lookup(dfmt, dict)), c("en.foreign policy", "en.domestic politics") ) expect_identical( colSums(dfm_lookup(dfmt, dict)), c("en.foreign policy" = 0, "en.domestic politics" = 0) ) }) test_that("dfm_lookup return all features even if no matches when exclusive = FALSE, issue #116", { dict <- dictionary(list("en" = list("foreign policy" = "aaaaa", "domestic politics" = "bbbbb"))) testdfm <- dfm(tokens(data_corpus_inaugural[1:5])) expect_equivalent(testdfm, dfm_lookup(testdfm, dict, exclusive = FALSE)) }) test_that("dfm_lookup verbose output works correctly", { expect_message( dfm_lookup(dfm(tokens(c(d1 = "a b c d", d2 = "c d e f g"))), dictionary(list(one = "a", two = c("d", "e"))), verbose = TRUE), "applying a dictionary consisting of 2 keys" ) expect_silent( dfm_lookup(dfm(tokens(c(d1 = "a b c d", d2 = "c d e f g"))), dictionary(list(one = "a", two = c("d", "e"))), verbose = FALSE) ) }) test_that("dfm_lookup with nomatch works", { txt <- c(d1 = "a c d d", d2 = "a a b c c c e f") dfm1 <- dfm(tokens(txt)) dict <- dictionary(list(one = c("a", "b"), two = c("e", "f"))) expect_equal( as.matrix(dfm_lookup(dfm1, dict)), as.matrix(dfm_lookup(dfm1, dict, nomatch = "_unmatched"))[, 1:2] ) expect_equal( as.matrix(dfm_lookup(dfm1, dict, nomatch = "_unmatched")), matrix(c(1, 3, 0, 2, 3, 3), nrow = 2, dimnames = list(docs = c("d1", "d2"), features = c("one", "two", "_unmatched"))) ) expect_warning( dfm_lookup(dfm1, dict, nomatch = "ANYTHING", exclusive = FALSE), "nomatch only applies if exclusive = TRUE" ) }) test_that("dfm_lookup works with exclusive = TRUE, #958", { txt <- c("word word2 document documents documenting", "use using word word2") dict <- dictionary(list( document = "document*", use = c("use", "using") )) mx <- dfm(tokens(txt)) expect_equal( as.matrix(dfm_lookup(mx, dict, exclusive = TRUE)), matrix(c(3, 0, 0, 2), ncol = 2, dimnames = list(docs = c("text1", "text2"), features = c("document", "use"))) ) expect_equal( as.matrix(dfm_lookup(mx, dict, exclusive = FALSE, capkeys = TRUE)), matrix(c(1, 1, 1, 1, 3, 0, 0, 2), ncol = 4, dimnames = list(docs = c("text1", "text2"), features = c("word", "word2", "DOCUMENT", "USE"))) ) expect_equal( as.matrix(dfm_lookup(mx, dict, exclusive = FALSE, capkeys = FALSE)), matrix(c(1, 1, 1, 1, 3, 0, 0, 2), ncol = 4, dimnames = list(docs = c("text1", "text2"), features = c("word", "word2", "document", "use"))) ) }) test_that("dfm_lookup works with zero count features, #958", { dict <- dictionary(list(A = "aa", B = "bb", D = "dd")) mx1 <- as.dfm(matrix(c(0, 0, 0, 0, 1, 2), nrow = 2, dimnames = list(c("doc1", "doc2"), c("aa", "bb", "cc")))) mx2 <- as.dfm(matrix(c(4, 5, 0, 0, 0, 0), nrow = 2, dimnames = list(c("doc1", "doc2"), c("aa", "bb", "cc")))) expect_equal(as.matrix(dfm_lookup(mx1, dict, exclusive = TRUE)), matrix(c(0, 0, 0, 0, 0, 0), nrow = 2, dimnames = list(docs = c("doc1", "doc2"), features = c("A", "B", "D")))) expect_equal(as.matrix(dfm_lookup(mx2, dict, exclusive = TRUE)), matrix(c(4, 5, 0, 0, 0, 0), nrow = 2, dimnames = list(docs = c("doc1", "doc2"), features = c("A", "B", "D")))) expect_equal(as.matrix(dfm_lookup(mx1, dict, exclusive = FALSE)), matrix(c(0, 0, 0, 0, 1, 2), nrow = 2, dimnames = list(docs = c("doc1", "doc2"), features = c("A", "B", "cc")))) expect_equal(as.matrix(dfm_lookup(mx2, dict, exclusive = FALSE)), matrix(c(4, 5, 0, 0, 0, 0), nrow = 2, dimnames = list(docs = c("doc1", "doc2"), features = c("A", "B", "cc")))) }) test_that("dfm_lookup works on a weighted dfm", { dict <- dictionary(list(first = LETTERS[1:5], second = LETTERS[6:10])) d <- dfm_weight(data_dfm_lbgexample, "prop") expect_equal( colSums(dfm_lookup(d, dictionary = dict)), c(first = 0.082, second = .740), tol = .001 ) }) test_that("dfm_lookup with nomatch works with key that do not appear in the text, #1347", { txt <- c("12032 Musgrave rd red hill", "13 rad street windermore park queensland", "130 right road", "130 rtn road") toks <- tokens(txt) dict <- dictionary(list(CR = c("rd", "red"), CB = c("street", "feet"), CA = c("parl", "dark"))) # CA does not appear at all dfm_dict <- dfm_lookup(dfm(toks), dict, nomatch = "NONE") expect_identical(as.matrix(dfm_dict), matrix(c(2, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 3, 5, 3, 3), nrow = 4, dimnames = list(docs = c("text1", "text2", "text3", "text4"), features = c("CR", "CB", "CA", "NONE")))) }) test_that("dfm_lookup works exclusive = TRUE and FALSE, #970", { dfmat <- dfm(tokens("say good bye to Hollywood"), tolower = FALSE) dict <- dictionary(list(pos = "good", farewell = "bye")) dfmat_ex <- dfm_lookup(dfmat, dict, exclusive = TRUE) expect_true(dfmat_ex@meta$object$what == "dictionary") expect_equal(featnames(dfmat_ex), c("pos", "farewell")) dfmat_ne <- dfm_lookup(dfmat, dict, exclusive = FALSE) expect_true(dfmat_ne@meta$object$what == "word") expect_equal(featnames(dfmat_ne), c("say", "POS", "FAREWELL", "to", "Hollywood")) }) test_that("dfm_lookup handle nested patterns correctly, #2159", { dict <- dictionary(list(irish = c("ire*", "ireland", "irish"), anger = c("ire*", "mad"))) toks <- tokens("I am mad about Ireland") dfmat <- dfm(toks) expect_equal( as.matrix(dfm_lookup(dfmat, dict, exclusive = TRUE)), matrix(c(1, 2), nrow = 1, dimnames = list(docs = "text1", features = c("irish", "anger"))) ) expect_equal( as.matrix(dfm_lookup(dfmat, dict, exclusive = FALSE)), matrix(c(1, 1, 1, 2, 1), nrow = 1, dimnames = list(docs = "text1", features = c("i", "am", "IRISH", "ANGER", "about"))) ) })