# Unit tests for kwic(): keyword-in-context extraction from tokens objects.
# Covers window/separator handling, boundary tokens, multi-document input,
# valuetype matching, index-based lookup, printing, padding, phrase and
# dictionary patterns, the pattern column, subsetting and object structure.

# Default window, explicit window and custom separators around a mid token.
test_that("test kwic general", {
  toks <- tokens(paste(LETTERS, collapse = " "))

  expect_equal(
    as.data.frame(kwic(toks, "D")),
    data.frame(
      docname = "text1",
      from = 4L,
      to = 4L,
      pre = "A B C",
      keyword = "D",
      post = "E F G H I",
      pattern = factor("D"),
      stringsAsFactors = FALSE
    )
  )

  expect_equal(
    as.data.frame(kwic(toks, "D", window = 2)),
    data.frame(
      docname = "text1",
      from = 4L,
      to = 4L,
      pre = "B C",
      keyword = "D",
      post = "E F",
      pattern = factor("D"),
      stringsAsFactors = FALSE
    )
  )

  expect_equal(
    as.data.frame(kwic(toks, "D", window = 2, separator = "_")),
    data.frame(
      docname = "text1",
      from = 4L,
      to = 4L,
      pre = "B_C",
      keyword = "D",
      post = "E_F",
      pattern = factor("D"),
      stringsAsFactors = FALSE
    )
  )

  expect_equal(
    as.data.frame(kwic(toks, "D", separator = "")),
    data.frame(
      docname = "text1",
      from = 4L,
      to = 4L,
      pre = "ABC",
      keyword = "D",
      post = "EFGHI",
      pattern = factor("D"),
      stringsAsFactors = FALSE
    )
  )
})

# A match on the first token has an empty left context.
test_that("test kwic on first token", {
  testkwic <- kwic(tokens(paste(LETTERS, collapse = " ")), "A")
  expect_equivalent(
    as.data.frame(testkwic),
    data.frame(
      docname = "text1",
      from = 1L,
      to = 1L,
      pre = "",
      keyword = "A",
      post = "B C D E F",
      pattern = factor("A"),
      stringsAsFactors = FALSE
    )
  )
})

# A match on the last token has an empty right context.
test_that("test kwic on last token", {
  testkwic <- kwic(tokens(paste(LETTERS, collapse = " ")), "Z")
  expect_equivalent(
    as.data.frame(testkwic),
    data.frame(
      docname = "text1",
      from = 26L,
      to = 26L,
      pre = "U V W X Y",
      keyword = "Z",
      post = "",
      pattern = factor("Z"),
      stringsAsFactors = FALSE
    )
  )
})

# Two separate patterns yield rows ordered by position in the text.
test_that("test kwic on two tokens", {
  txt <- "A B C D E F G D H"
  testkwic <- kwic(tokens(txt), c("D", "E"), window = 3)
  expect_equivalent(
    as.data.frame(testkwic),
    data.frame(
      docname = "text1",
      from = c(4L, 5L, 8L),
      to = c(4L, 5L, 8L),
      pre = c("A B C", "B C D", "E F G"),
      keyword = c("D", "E", "D"),
      post = c("E F G", "F G D", "H"),
      pattern = factor(c("D", "E", "D")),
      stringsAsFactors = FALSE
    )
  )
})

# A pattern that matches nothing still returns a data.frame, with no rows.
test_that("test kwic on non-existent token", {
  testkwic <- kwic(tokens(paste(LETTERS, collapse = " ")), "É")
  expect_true(is.data.frame(testkwic))
  expect_equal(nrow(testkwic), 0)
})

# Only the document that contains the keyword is reported.
test_that("test kwic on multiple texts", {
  testcorpus <- corpus(c(
    paste(LETTERS[2:26], collapse = " "),
    paste(LETTERS, collapse = " ")
  ))
  testkwic <- kwic(tokens(testcorpus), "A")
  expect_equal(
    as.data.frame(testkwic),
    data.frame(
      docname = "text2",
      from = 1L,
      to = 1L,
      pre = "",
      keyword = "A",
      post = "B C D E F",
      pattern = factor("A"),
      stringsAsFactors = FALSE
    )
  )
})

# Repeated occurrences within one document each get their own row.
test_that("test kwic with multiple matches", {
  testcorpus <- corpus(paste(c(LETTERS, LETTERS), collapse = " "))
  testkwic <- kwic(tokens(testcorpus), "A")
  expect_equal(
    as.data.frame(testkwic),
    data.frame(
      docname = c("text1", "text1"),
      from = c(1L, 27L),
      to = c(1L, 27L),
      pre = c("", "V W X Y Z"),
      keyword = c("A", "A"),
      post = c("B C D E F", "B C D E F"),
      pattern = factor(c("A", "A")),
      stringsAsFactors = FALSE
    )
  )
})

# Regression test: the second match is the final token of the text.
test_that("test kwic with multiple matches, where one is the last (fixed bug)", {
  testkwic <- kwic(tokens("what does the fox say fox"), "fox")
  expect_equal(
    as.data.frame(testkwic),
    data.frame(
      docname = c("text1", "text1"),
      from = c(4L, 6L),
      to = c(4L, 6L),
      pre = c("what does the", "what does the fox say"),
      keyword = c("fox", "fox"),
      post = c("say fox", ""),
      pattern = factor("fox"),
      stringsAsFactors = FALSE
    )
  )
})

# Glob patterns, with and without case folding.
test_that("test that kwic works for glob types", {
  txt <- data_corpus_inaugural["2005-Bush"]

  kwic_glob <- kwic(tokens(txt), "secur*", valuetype = "glob",
                    case_insensitive = TRUE)
  expect_true(
    setequal(c("security", "secured", "securing", "Security"),
             as.data.frame(kwic_glob)$keyword)
  )

  kwic_glob2 <- kwic(tokens(txt), "secur*", valuetype = "glob",
                     case_insensitive = FALSE)
  expect_true(
    setequal(c("security", "secured", "securing"),
             as.data.frame(kwic_glob2)$keyword)
  )
})

# Regular-expression patterns, with and without case folding.
test_that("test that kwic works for regex types", {
  txt <- data_corpus_inaugural["2005-Bush"]

  kwic_regex <- kwic(tokens(txt), "^secur", valuetype = "regex",
                     case_insensitive = TRUE)
  expect_true(
    setequal(c("security", "secured", "securing", "Security"),
             as.data.frame(kwic_regex)$keyword)
  )

  kwic_regex2 <- kwic(tokens(txt), "^secur", valuetype = "regex",
                      case_insensitive = FALSE)
  expect_true(
    setequal(c("security", "secured", "securing"),
             as.data.frame(kwic_regex2)$keyword)
  )
})

# Fixed (literal) patterns, with and without case folding.
test_that("test that kwic works for fixed types", {
  kwic_fixed <- kwic(tokens(data_corpus_inaugural), "security",
                     valuetype = "fixed", case_insensitive = TRUE)
  expect_true(
    setequal(c("security", "Security"), as.data.frame(kwic_fixed)$keyword)
  )

  kwic_fixed2 <- kwic(tokens(data_corpus_inaugural), "security",
                      valuetype = "fixed", case_insensitive = FALSE)
  expect_true(
    setequal("security", as.data.frame(kwic_fixed2)$keyword)
  )
})

# Supplying a precomputed index must give the same result as a pattern,
# including for reordered and empty indexes.
test_that("test that kwic works with index", {
  toks <- tokens(data_corpus_inaugural)
  idx <- index(toks, "security")

  kwic_idx <- kwic(toks, index = idx)
  kwic_pat <- kwic(toks, pattern = "security")
  expect_identical(kwic_idx, kwic_pat)

  kwic_idx2 <- kwic(toks, index = idx[c(2, 3, 1), ])
  kwic_pat2 <- kwic(toks, pattern = "security")[c(2, 3, 1), ]
  expect_identical(kwic_idx2, kwic_pat2)

  kwic_idx3 <- kwic(toks, index = idx[0, ])
  kwic_pat3 <- kwic(toks, pattern = "security")[0, ]
  expect_identical(kwic_idx3, kwic_pat3)

  # The expected message is matched as written against the error kwic() raises.
  expect_error(kwic(toks), "Either pattten or index must be provided")
  expect_error(kwic(toks, index = data.frame(1:5)), "Invalid index object")
})

# is.kwic() is TRUE for kwic results (even empty ones) and FALSE otherwise.
test_that("is.kwic works as expected", {
  kwic1 <- kwic(tokens(data_corpus_inaugural[1:3]), "provident*")
  expect_true(is.kwic(kwic1))
  expect_false(is.kwic("Not a kwic"))

  kwic2 <- kwic(tokens(data_corpus_inaugural[1:3]), "abcdefg")
  expect_true(is.kwic(kwic2))
})

# Printed output: summary line, rows, max_nrow truncation and show_summary.
test_that("print method works as expected", {
  kw_fox <- kwic(tokens("what does the fox say fox"), "fox")
  expect_output(
    print(kw_fox),
    paste("Keyword-in-context with 2 matches.",
          "[text1, 4] what does the | fox | say fox",
          "[text1, 6] what does the fox say | fox |",
          sep = "\\s*")
  )

  kw_none <- kwic(tokens("what does the fox say fox"), "foox")
  expect_output(print(kw_none), "Keyword-in-context with 0 matches.",
                fixed = TRUE)

  toks <- tokens(data_corpus_inaugural[1:8])
  kw <- kwic(toks, "secure*", window = 1)

  # Expected rows are shared by several expectations below; pieces are joined
  # with "\\s*" so that the amount of whitespace between them is not checked.
  header <- "Keyword-in-context with 6 matches."
  rows <- c(
    "[1797-Adams, 478] and | secure | the",
    "[1797-Adams, 1512] and | secured | immortal",
    "[1805-Jefferson, 2367] shall | secure | to",
    "[1817-Monroe, 1754] To | secure | us",
    "[1817-Monroe, 1814] to | secure | our",
    "[1817-Monroe, 3009] to | secure | economy"
  )
  out <- paste(c(header, rows), collapse = "\\s*")

  expect_output(print(kw, max_nrow = -1), out)
  expect_output(print(kw, max_nrow = 6), out)
  expect_output(print(kw, max_nrow = 7), out)
  expect_output(print(kw, show_summary = FALSE),
                paste(rows, collapse = "\\s*"))
  expect_output(
    print(kw, 3),
    paste(c(header, rows[1:3], "[ reached max_nrow ... 3 more matches ]"),
          collapse = "\\s*")
  )

  # NOTE(review): with fixed = TRUE this literal (including its embedded line
  # break) must match the printed text exactly; confirm the whitespace.
  expect_output(
    print(kwic(toks, "secured", window = 1)),
    "Keyword-in-context with 1 match. \n[1797-Adams, 1512] and | secured | immortal",
    fixed = TRUE
  )
  expect_output(print(kwic(toks, "XXX", window = 1)),
                "Keyword-in-context with 0 matches.")
})

# Tokens removed with padding = TRUE are not shown in the context.
test_that("kwic works with padding", {
  testtoks <- tokens("what does the fox say cat")

  expect_output(
    print(kwic(tokens_remove(testtoks, c("what", "the"), padding = TRUE),
               "fox")),
    paste("Keyword-in-context with 1 match.",
          "[text1, 4] does | fox | say cat",
          sep = "\\s*")
  )

  expect_output(
    print(kwic(tokens_remove(testtoks, "*", padding = TRUE), "fox")),
    "Keyword-in-context with 0 matches."
  )
})

# Multi-word patterns only match as sequences when given as phrases,
# list elements of tokens, dictionaries or collocations.
test_that("kwic works as expected with and without phrases", {
  txt <- c(d1 = "a b c d e g h", d2 = "a b e g h i j")
  toks_uni <- tokens(txt)
  toks_bi <- tokens(txt) |>
    tokens_ngrams(n = 2, concatenator = " ")

  char_uni <- c("a", "b", "g", "j")
  char_bi <- c("a b", "g j")
  list_uni <- list("a", "b", "g", "j")
  dict_bi <- dictionary(list(one = "a b", two = "g j"))

  coll_bi <- data.frame(collocation = c("a b", "e g", "g h"),
                        stringsAsFactors = FALSE)
  class(coll_bi) <- c("collocations", "data.frame")
  coll_tri <- data.frame(collocation = "e g h", stringsAsFactors = FALSE)
  class(coll_tri) <- c("collocations", "data.frame")

  # on tokens
  expect_equal(
    as.data.frame(kwic(toks_uni, char_uni))$keyword,
    c("a", "b", "g", "a", "b", "g", "j")
  )
  expect_equal(
    as.data.frame(kwic(toks_uni, list_uni))$keyword,
    c("a", "b", "g", "a", "b", "g", "j")
  )
  expect_equal(nrow(kwic(toks_uni, char_bi)), 0)
  expect_equal(nrow(kwic(toks_uni, list("c d", "g h"))), 0)
  expect_equal(
    as.data.frame(kwic(toks_uni, list(c("c", "d"), c("g", "h"))))$keyword,
    c("c d", "g h", "g h")
  )
  expect_equal(
    as.data.frame(kwic(toks_uni, phrase(c("c d", "g h"))))$keyword,
    c("c d", "g h", "g h")
  )

  expect_equal(nrow(kwic(toks_uni, coll_bi)), 6)
  expect_equal(nrow(kwic(toks_uni, coll_tri)), 2)
  expect_equal(
    as.data.frame(kwic(toks_uni, as.phrase(coll_bi)))$keyword,
    c("a b", "e g", "g h", "a b", "e g", "g h")
  )
  expect_equal(nrow(kwic(toks_bi, as.phrase(coll_bi))), 0)

  expect_equal(nrow(kwic(toks_uni, dict_bi)), 2)
})

# A dfm is not a valid pattern.
test_that("kwic error when dfm is given, #1006", {
  toks <- tokens("a b c")
  expect_error(kwic(toks, dfm(tokens("b c d"))))
})

# The pattern column is a factor whose levels follow the order in which the
# patterns (or dictionary keys / collocations) were supplied.
test_that("keywords attribute is set correctly in textplot_kwic (#1514)", {
  corp <- corpus(c(alpha1 = paste(letters, collapse = " "),
                   alpha2 = paste(LETTERS, collapse = " ")))
  toks <- tokens(corp)

  kwic1 <- kwic(toks, "f")
  kwic2 <- kwic(toks, "u")
  kwic3 <- kwic(toks, c("u", "f"))
  expect_identical(kwic1$pattern, factor(c("f", "f")))
  expect_identical(kwic2$pattern, factor(c("u", "u")))
  expect_identical(
    kwic3$pattern,
    factor(c("f", "u", "f", "u"), levels = c("u", "f"))
  )

  kwic_dict1 <- kwic(tokens(corp), dictionary(list(ukey = "u")))
  kwic_dict2 <- kwic(toks, dictionary(list(ukey = "u")))
  kwic_dict3 <- kwic(tokens(corp), dictionary(list(ukey = "u", fkey = "f")))
  kwic_dict4 <- kwic(toks, dictionary(list(ukey = "u", fkey = "f")))
  expect_identical(kwic_dict1, kwic_dict2)
  expect_identical(kwic_dict3, kwic_dict4)
  expect_identical(kwic_dict1$pattern, factor(c("ukey", "ukey")))
  expect_identical(
    kwic_dict3$pattern,
    factor(rep(c("fkey", "ukey"), 2), levels = c("ukey", "fkey"))
  )

  col <- data.frame(collocations = c("u v", "e f"), stringsAsFactors = FALSE)
  class(col) <- c("collocations", "data.frame")
  kwic_col <- kwic(toks, col)
  expect_identical(
    kwic_col$pattern,
    factor(c("e f", "u v", "e f", "u v"), levels = c("u v", "e f"))
  )
})

# Dictionary keys are reported per match; level order follows key order.
test_that("keywords match pattern match and map_keywords() is working as expected", {
  toks <- tokens(c(alpha1 = paste(letters, collapse = " "),
                   alpha2 = paste(LETTERS, collapse = " ")))

  kwic1 <- kwic(toks, dictionary(list(key1 = c("a", "b"),
                                      key2 = c("x", "y"))))
  expect_equal(
    kwic1$pattern,
    factor(c("key1", "key1", "key2", "key2", "key1", "key1", "key2", "key2"),
           levels = c("key1", "key2"))
  )

  kwic2 <- kwic(toks, dictionary(list(key2 = c("x", "y"),
                                      key1 = c("a", "b"))))
  expect_equal(
    kwic2$pattern,
    factor(c("key1", "key1", "key2", "key2", "key1", "key1", "key2", "key2"),
           levels = c("key2", "key1"))
  )

  kwic3 <- kwic(toks, dictionary(list(key2 = c("b", "c"),
                                      key1 = c("a", "b"))))
  expect_equal(
    kwic3$pattern,
    factor(c("key1", "key2", "key1", "key2", "key1", "key2", "key1", "key2"),
           levels = c("key2", "key1"))
  )
})

# For fixed patterns, the pattern column equals the lower-cased keyword.
test_that("kwic pattern column works for phrases", {
  txt <- c("This is a test",
           "This is it.",
           "What is in a train?",
           "Is it a question?",
           "Sometimes you don't know if this is it.",
           "Is it a bird or a plane or is it a train?")
  toks <- tokens(txt)

  kw1 <- kwic(toks, c("is", "a"), valuetype = "fixed")
  expect_equal(
    as.character(kw1$pattern),
    char_tolower(as.data.frame(kw1)$keyword)
  )

  kw2 <- kwic(toks, phrase("is a"), valuetype = "fixed")
  expect_equal(
    as.character(kw2$pattern),
    char_tolower(as.data.frame(kw2)$keyword)
  )
})

# Duplicated patterns must not break the pattern/keyword correspondence.
test_that("kwic with pattern overlaps works as expected", {
  kw <- c(d2 = "one two three four", d1 = "four three two one") |>
    tokens() |>
    kwic(pattern = c("two", "two", "three"))
  expect_equal(
    as.character(kw$pattern),
    char_tolower(as.data.frame(kw)$keyword)
  )
})

# Row subsetting keeps the kwic class and its print method.
test_that("subsetting of kwic works", {
  kw <- kwic(tokens(data_corpus_inaugural), "terror")
  kw2 <- kw[1:3, ]
  expect_true("kwic" %in% class(kw2))
  expect_output(
    print(kw2),
    "^Keyword-in-context with 3 matches\\."
  )
})

# Context windows are measured from the edges of a multi-token match.
test_that("pre and post for phrases are working", {
  toks <- tokens(c(doc1 = "a a a b c d d d", doc2 = "a b c d e"))
  expect_identical(
    as.data.frame(kwic(toks, phrase("b c"), window = 2)),
    structure(
      list(
        docname = c("doc1", "doc2"),
        from = c(4L, 2L),
        to = c(5L, 3L),
        pre = c("a a", "a"),
        keyword = c("b c", "b c"),
        post = c("d d", "d e"),
        pattern = factor(c("b c", "b c"))
      ),
      class = "data.frame",
      row.names = c(NA, -2L)
    )
  )
})

# The returned object is a data.frame with class "kwic" and an ntoken
# attribute for the documents that have matches.
test_that("kwic structure is as expected", {
  toks <- tokens(c(doc1 = "a a a b c d d d",
                   doc2 = "a b c d e",
                   doc3 = "b b a a"))
  kw <- kwic(toks, phrase("a a"), window = 2)
  expect_identical(
    kw,
    structure(
      data.frame(
        docname = c("doc1", "doc1", "doc3"),
        from = 1L:3L,
        to = 2L:4L,
        pre = c("", "a", "b b"),
        keyword = c("a a", "a a", "a a"),
        post = c("a b", "b c", ""),
        pattern = factor(rep("a a", 3)),
        stringsAsFactors = FALSE
      ),
      class = c("kwic", "data.frame"),
      ntoken = c(doc1 = 8L, doc3 = 4L)
    )
  )
})

# Passing a tokens() argument to kwic() warns that it is ignored.
test_that("kwic deprecations work as expected", {
  txt <- "A b c d e."
  expect_warning(
    kwic(tokens(txt), "e", window = 2, remove_punct = TRUE),
    "remove_punct argument is not used"
  )
})