quanteda_options("tokens_tokenizer_word" = "word3") test_that("tokens indexing works as expected", { toks <- tokens(c(d1 = "one two three", d2 = "four five six", d3 = "seven eight")) expect_equal(toks[[1]], c("one", "two", "three")) expect_equal(as.list(toks[c(FALSE, TRUE, FALSE)]), list(d2 = c("four", "five", "six"))) expect_equal(as.list(toks["d2"]), list(d2 = c("four", "five", "six"))) expect_equal(as.list(toks[2]), list(d2 = c("four", "five", "six"))) expect_equal(as.list(toks[c(-1, -3)]), list(d2 = c("four", "five", "six"))) # issue #1830 # issue #370 expect_equal(attr(toks[1], "types"), c("one", "two", "three")) expect_equal(attr(toks[2], "types"), c("four", "five", "six")) # issue #1308 expect_error(toks[4], "Subscript out of bounds") expect_error(toks[1:4], "Subscript out of bounds") expect_error(toks["d4"], "Subscript out of bounds") expect_error(toks[c("d1", "d4")], "Subscript out of bounds") }) test_that("tokens_recompile combine duplicates is working", { toks <- tokens(c(one = "a b c d A B C D", two = "A B C d")) expect_equivalent(attr(toks, "types"), c("a", "b", "c", "d", "A", "B", "C", "D")) expect_equivalent(attr(tokens_tolower(toks), "types"), c("a", "b", "c", "d")) attr(toks, "types") <- char_tolower(attr(toks, "types")) expect_equivalent(attr(quanteda:::tokens_recompile(toks), "types"), c("a", "b", "c", "d")) }) test_that("test `ngrams` with padding = FALSE: #428", { toks <- tokens(c(doc1 = "a b c d e f g")) toks2 <- tokens_remove(toks, c("b", "e"), padding = FALSE) expect_equal(as.list(tokens_ngrams(toks2, n = 2)), list(doc1 = c("a_c", "c_d", "d_f", "f_g"))) expect_equal(as.list(tokens_ngrams(toks2, n = 3)), list(doc1 = c("a_c_d", "c_d_f", "d_f_g"))) expect_equal(as.list(tokens_ngrams(toks2, n = 2, skip = 2)), list(doc1 = c("a_f", "c_g"))) }) test_that("test `ngrams` with padding = TRUE: #428", { toks <- tokens(c(doc1 = "a b c d e f g")) toks3 <- tokens_remove(toks, c("b", "e"), padding = TRUE) expect_equal(as.list(tokens_ngrams(toks3, n = 2)), list(doc1 = c("c_d", "f_g"))) expect_equal(as.list(tokens_ngrams(toks3, n = 3)), list(doc1 = character(0))) expect_equal(as.list(tokens_ngrams(toks3, n = 2, skip = 2)), list(doc1 = c("a_d", "c_f", "d_g"))) }) test_that("test dfm with padded tokens, padding = FALSE", { toks <- tokens(c(doc1 = "a b c d e f g", doc2 = "a b c g", doc3 = "")) toks3 <- tokens_remove(toks, c("b", "e"), padding = FALSE) expect_equivalent(as.matrix(dfm(toks3)), matrix(c(1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0), nrow = 3, byrow = TRUE)) }) test_that("test dfm with padded tokens, padding = TRUE", { toks <- tokens(c(doc1 = "a b c d e f g", doc2 = "a b c g", doc3 = "")) toks3 <- tokens_remove(toks, c("b", "e"), padding = TRUE) expect_equivalent(as.matrix(dfm(toks3)), matrix(c(2, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0), nrow = 3, byrow = TRUE)) }) test_that("docnames works for tokens", { expect_equal(names(data_char_ukimmig2010), docnames(tokens(data_char_ukimmig2010))) }) test_that("longer features longer than documents do not crash (#447)", { toks <- tokens(c(d1 = "a b", d2 = "a b c d e")) feat <- "b c d e" # bugs in C++ needs repeated tests expect_silent(replicate(10, tokens_select(toks, feat))) expect_equal( as.list(tokens_select(toks, feat)), list(d1 = character(0), d2 = character(0)) ) expect_equal( as.list(tokens_select(toks, phrase(feat))), list(d1 = character(0), d2 = c("b", "c", "d", "e")) ) }) test_that("tokens works as expected for what = character", { expect_equal( as.character(tokens("one, two three.", what = "character", 
                            remove_separators = TRUE)),
        c("o", "n", "e", ",", "t", "w", "o", "t", "h", "r", "e", "e", ".")
    )
    expect_equal(
        as.character(tokens("one, two three.", what = "character",
                            remove_separators = FALSE)),
        c("o", "n", "e", ",", " ", "t", "w", "o", " ", "t", "h", "r", "e", "e", ".")
    )
    expect_equal(
        as.character(tokens("one, two three.", what = "character",
                            remove_punct = TRUE, remove_separators = TRUE)),
        c("o", "n", "e", "t", "w", "o", "t", "h", "r", "e", "e")
    )
})

test_that("tokens works with unusual hiragana #554", {
    skip("Behaviour changed - consider removing test")
    skip_on_cran()
    skip_on_os("windows")
    skip_on_os("mac")
    txts <- c("づい゙", "゛ん゙", "たー゚")
    expect_equivalent(as.list(tokens(txts)),
                      list(c("づ", "い゙"), c("゛", "ん゙"), c("た", "ー゚")))
})

test_that("types attribute is a character vector", {
    toks <- tokens("one two three")
    expect_true(is.character(attr(toks, "types")))
    expect_equal(length(attributes(attr(toks, "types"))), 0)
})

test_that("remove_url works as expected", {
    txt <- c("The URL was http://t.co/something.",
             "The URL was http://quanteda.io",
             "https://github.com/quanteda/quanteda/issue/1 is another URL")
    toks <- tokens(txt, what = "word", remove_url = TRUE)
    expect_equal(
        as.list(toks),
        list(text1 = c("The", "URL", "was"),
             text2 = c("The", "URL", "was"),
             text3 = c("is", "another", "URL"))
    )
})

test_that("+ operator works with tokens", {
    txt1 <- c(d1 = "This is sample document one.",
              d2 = "Here is the second sample document.")
    txt2 <- c(d3 = "And the third document.")
    toks_added <- tokens(txt1) + tokens(txt2)
    expect_equal(
        length(unique(as.character(toks_added))),
        length(attr(toks_added, "types"))
    )
    expect_equal(ndoc(toks_added), 3)
    # expect_error(
    #     tokens(txt1, what = "word") + tokens(txt2, what = "sentence"),
    #     "Cannot combine tokens in different tokenization units"
    # )
})

test_that("+ works with empty padded tokens (#1695)", {
    toks1 <- tokens(c(d1 = "a b"))
    toks2 <- tokens(c(d2 = ""))
    toks3 <- tokens(c(d3 = "c"))
    toks4 <- tokens(c(d4 = "c d"))
    expect_identical(
        as.list(toks1 + toks2),
        list(d1 = c("a", "b"), d2 = character(0))
    )
    expect_identical(
        as.list(toks1 + toks3),
        list(d1 = c("a", "b"), d3 = "c")
    )
    expect_identical(
        as.list(toks1 + tokens_remove(toks3, pattern = "c", pad = FALSE)),
        list(d1 = c("a", "b"), d3 = character(0))
    )
    expect_identical(
        as.list(toks1 + tokens_remove(toks3, pattern = "c", pad = TRUE)),
        list(d1 = c("a", "b"), d3 = "")
    )
    expect_identical(
        as.list(tokens_remove(toks3, pattern = "c", pad = TRUE) + toks1),
        list(d3 = "", d1 = c("a", "b"))
    )
    expect_identical(
        as.list(toks1 + tokens_remove(toks4, pattern = "c", pad = FALSE)),
        list(d1 = c("a", "b"), d4 = "d")
    )
    expect_identical(
        as.list(toks1 + tokens_remove(toks4, pattern = "c", pad = TRUE)),
        list(d1 = c("a", "b"), d4 = c("", "d"))
    )
    expect_identical(
        as.list(tokens_remove(toks4, pattern = "c", pad = TRUE) +
                    tokens_remove(toks3, pattern = "c", pad = TRUE)),
        list(d4 = c("", "d"), d3 = "")
    )
})

test_that("c() works with tokens", {
    toks1 <- tokens(c(d1 = "This is sample document one.",
                      d2 = "Here is the second sample document."))
    toks2 <- tokens(c(d3 = "And the third document."))
    toks3 <- tokens(c(d4 = "This is sample document 4."))
    toks4 <- tokens(c(d5 = "This is sample document five!"))
    expect_error(
        c(toks1, list()),
        "Cannot combine different types of objects"
    )
    expect_equal(
        c(toks1),
        toks1
    )
    expect_equal(
        c(toks1, toks2),
        toks1 + toks2
    )
    expect_equal(
        c(toks1, toks2, toks3, toks4),
        toks1 + toks2 + toks3 + toks4
    )
    expect_error(
        c(toks1, toks1),
        "Cannot combine tokens with duplicated document names"
    )

    # issue #1835
    toks <- c(tokens(data_corpus_inaugural[1:2]),
              tokens(data_corpus_inaugural[3:5]),
              tokens(data_corpus_inaugural[6:10]))
    expect_equivalent(
        as.list(toks),
        as.list(tokens(data_corpus_inaugural[1:10]))
    )
    expect_equal(
        docvars(toks),
        docvars(tokens(data_corpus_inaugural[1:10]))
    )
})

test_that("docvars are erased for tokens added", {
    corp <- corpus(c(d1 = "This is sample document one.",
                     d2 = "Here is the second sample document."),
                   docvars = data.frame(dvar1 = c("A", "B"), dvar2 = c(1, 2)))
    toks1 <- tokens(corp, include_docvars = TRUE)
    expect_equivalent(
        docvars(toks1),
        data.frame(dvar1 = c("A", "B"), dvar2 = c(1, 2))
    )
    toks2 <- tokens(c(d3 = "And the third sample document."))
    expect_equivalent(
        docvars(toks1 + toks2),
        data.frame(dvar1 = c("A", "B", NA), dvar2 = c(1, 2, NA))
    )
})

test_that("what = character works with @ and #, issue #637", {
    expect_identical(as.list(tokens("This: is, a @test! #tag", what = "character",
                                    remove_punct = FALSE)),
                     list(text1 = c("T", "h", "i", "s", ":", "i", "s", ",", "a", "@",
                                    "t", "e", "s", "t", "!", "#", "t", "a", "g")))
    expect_identical(as.list(tokens("This: is, a @test! #tag", what = "character",
                                    remove_punct = TRUE)),
                     list(text1 = c("T", "h", "i", "s", "i", "s", "a",
                                    "t", "e", "s", "t", "t", "a", "g")))
})

test_that("unlist returns character vector, issue #716", {
    expect_equal(unlist(tokens(c(doc1 = "aaa bbb cccc", doc2 = "aaa bbb dddd"))),
                 c(doc11 = "aaa", doc12 = "bbb", doc13 = "cccc",
                   doc21 = "aaa", doc22 = "bbb", doc23 = "dddd"))
    expect_equal(unlist(tokens(c(doc1 = "aaa bbb cccc", doc2 = "aaa bbb dddd")), use.names = FALSE),
                 c("aaa", "bbb", "cccc", "aaa", "bbb", "dddd"))
})

test_that("unused argument warnings for tokens work as expected", {
    # for tokens
    expect_identical(
        as.character(tokens(c(d1 = "This: punctuation"), remove_punct = TRUE)),
        c("This", "punctuation")
    )
    expect_warning(
        tokens(c(d1 = "This: punctuation"), notarg1 = TRUE),
        "^notarg1 argument is not used"
    )
    expect_warning(
        tokens(c(d1 = "This: punctuation"), notarg1 = TRUE, notarg2 = FALSE),
        "^notarg1, notarg2 arguments are not used\\."
    )
})

test_that("tokens arguments work with values from parent frame (#721)", {
    expect_identical(
        tokens("This contains 99 numbers.", remove_numbers = T),
        tokens("This contains 99 numbers.", remove_numbers = TRUE)
    )
    suppressWarnings({
        expect_identical(
            dfm(tokens("This contains 99 numbers."), remove_numbers = T),
            dfm(tokens("This contains 99 numbers."), remove_numbers = TRUE)
        )
    })
    val <- FALSE
    expect_identical(
        tokens("This contains 99 numbers.", remove_numbers = val),
        tokens("This contains 99 numbers.", remove_numbers = FALSE)
    )
    suppressWarnings({
        expect_identical(
            dfm(tokens("This contains 99 numbers."), remove_numbers = val),
            dfm(tokens("This contains 99 numbers."), remove_numbers = FALSE)
        )
    })
})

test_that("tokens works for strange spaces (#796)", {
    txt <- "space tab\t newline\n non-breakingspace\u00A0, variationselector16 \uFE0F."
    expect_identical(ntoken(txt, remove_punct = FALSE, remove_separators = TRUE),
                     c(text1 = 7L))
    expect_identical(
        as.character(tokens(txt, what = "word", remove_punct = TRUE, remove_separators = TRUE)),
        c("space", "tab", "newline", "non-breakingspace", "variationselector16")
    )
    toks <- tokens(txt, what = "word1", remove_punct = FALSE, remove_separators = FALSE)
    expect_identical(ntoken(toks), c(text1 = 15L))
    expect_identical(
        as.character(tokens(txt, what = "word1", remove_punct = FALSE, remove_separators = FALSE))[13:15],
        c("variationselector16", " ", ".")
    )
    expect_identical(
        ntoken(txt, remove_punct = TRUE, remove_separators = FALSE, what = "word1"),
        c(text1 = 13L)
    )
    expect_identical(
        as.character(tokens(txt, remove_punct = TRUE, remove_separators = FALSE, what = "word1"))[12:13],
        c("variationselector16", " ")
    )
    expect_silent(
        tokens(txt, what = "word", remove_separators = FALSE)
    )
    expect_warning(
        tokens(txt, what = "sentence", remove_separators = FALSE),
        "remove_separators is always TRUE for this type"
    )
})

test_that("tokens works with control characters", {
    txt <- "Left-to-Right Override \u202D Zero-Width Non-Breaking Space \ufeff"
    expect_equal(ntoken(txt), c(text1 = 5))
})

test_that("tokens remove whitespace with combining characters (#882)", {
    skip_on_travis()
    skip_on_cran()
    skip_on_appveyor()
    skip_on_os("windows")
    txt <- "( \u0361\u00b0 \u035c\u0296 \u0361\u00b0)"
    tok <- tokens(txt)
    expect_equal(as.list(tok)[[1]], c("(", "°", "ʖ", "°", ")"))
})

test_that("split_hyphens is working correctly", {
    txt <- "a b-c d . !"
    expect_equal(as.character(tokens(txt, split_hyphens = FALSE, remove_punct = FALSE)[[1]]),
                 c("a", "b-c", "d", ".", "!"))
    expect_equal(as.character(tokens(txt, split_hyphens = FALSE, remove_punct = TRUE)[[1]]),
                 c("a", "b-c", "d"))
    expect_equal(as.character(tokens(txt, split_hyphens = TRUE, remove_punct = FALSE)[[1]]),
                 c("a", "b", "-", "c", "d", ".", "!"))
    expect_equal(as.character(tokens(txt, split_hyphens = TRUE, remove_punct = TRUE)[[1]]),
                 c("a", "b", "c", "d"))
})

test_that("tokens.tokens() does nothing by default", {
    toks <- tokens(data_corpus_inaugural,
                   remove_numbers = FALSE, remove_punct = FALSE,
                   remove_symbols = FALSE, remove_separators = TRUE,
                   split_hyphens = FALSE, remove_url = FALSE)
    expect_equal(toks, tokens(toks))
})

test_that("test that features removed by tokens.tokens are comparable to tokens.character", {
    chars <- c("a b c 12345 ! @ # $ % ^ & * ( ) _ + { } | : \' \" < > ? ! , . \t \n \u2028 \u00A0 \u2003 \uFE0F",
               "#tag @user",
               "abc be-fg hi 100kg 2017",
               "https://github.com/kbenoit/quanteda",
               "a b c d e")
    toks1 <- as.tokens(stringi::stri_split_fixed(chars[1], " "))
    toks2 <- as.tokens(stringi::stri_split_fixed(chars[2], " "))
    toks3 <- as.tokens(stringi::stri_split_fixed(chars[3], " "))
    toks4 <- as.tokens(stringi::stri_split_fixed(chars[4], " "))
    toks5 <- as.tokens(stringi::stri_split_fixed(chars[5], " "))
    expect_equal(tokens(chars[1], remove_numbers = TRUE) |> as.list(),
                 tokens(toks1, remove_numbers = TRUE) |> as.list())
    expect_equal(tokens(chars[1], remove_punct = TRUE) |> as.list(),
                 tokens(toks1, remove_punct = TRUE) |> as.list())
    expect_equal(tokens(chars[1], remove_separator = TRUE) |> as.list(),
                 tokens(toks1, remove_separator = TRUE) |> as.list())
    expect_equal(tokens(chars[1], remove_symbols = TRUE) |> as.list(),
                 tokens(toks1, remove_symbols = TRUE) |> as.list())
    expect_equal(tokens(chars[4], remove_url = TRUE) |> as.list(),
                 tokens(toks4, remove_url = TRUE) |> as.list())
    expect_equal(tokens(chars[3], split_hyphens = TRUE) |> as.list(),
                 tokens(toks3, split_hyphens = TRUE) |> as.list())
    # This fails because there is no separator in toks1
    # expect_equal(tokens(chars[1], remove_symbols = TRUE, remove_separator = FALSE),
    #              tokens(toks1, remove_symbols = TRUE, remove_separator = FALSE))
})

test_that("tokens works as expected with NA, and blanks", {
    expect_equal(
        as.list(tokens(c("one", "two", ""))),
        list(text1 = "one", text2 = "two", text3 = character())
    )
    expect_equal(
        as.list(suppressWarnings(tokens(c("one", NA, "")))),
        list(text1 = "one", text2 = character(), text3 = character())
    )
    expect_equal(
        as.list(suppressWarnings(tokens(c(NA, "one", "")))),
        list(text1 = character(), text2 = "one", text3 = character())
    )
    expect_equal(
        as.list(tokens("")),
        list(text1 = character())
    )
    expect_equal(
        as.list(suppressWarnings(tokens(c(d1 = "", d2 = NA)))),
        list(d1 = character(), d2 = character())
    )
    expect_equal(
        as.list(suppressWarnings(tokens(c(d1 = NA, d2 = "")))),
        list(d1 = character(), d2 = character())
    )
    expect_equal(
        as.character(as.tokens(list(""))),
        character()
    )
})

test_that("assignment operators are disabled for tokens object", {
    toks <- tokens(c(d1 = "a b c d", d2 = "c d e"))
    try(toks[[1]] <- c(6, 100, "z"), silent = TRUE)
    expect_equal(as.list(toks),
                 list(d1 = c("a", "b", "c", "d"), d2 = c("c", "d", "e")))
    expect_error(toks[[1]] <- c(6, 100, "z"),
                 "assignment to tokens objects is not allowed")
    expect_error(toks[1] <- list(c(6, 100, "z")),
                 "assignment to tokens objects is not allowed")
})

test_that("empty tokens are removed correctly", {
    txt <- "a b c d e "
    tok <- c("a", "b", "c", "d", "e")
    expect_equal(as.list(tokens(txt, what = "word"))[[1]], tok)
    expect_equal(as.list(tokens(txt, what = "fasterword"))[[1]], tok)
    expect_equal(as.list(tokens(txt, what = "fastestword"))[[1]], tok)
})

test_that("combined tokens objects have all the attributes", {
    toks1 <- tokens(c(text1 = "a b c"))
    toks2 <- tokens_compound(tokens(c(text2 = "d e f")), phrase("e f"), concatenator = "+")
    toks3 <- tokens(c(text3 = "d e f"), what = "sentence")
    expect_warning(
        toks4 <-
tokens(c(text4 = "d e f"), ngram = 1:2, skip = 2), "ngram, skip arguments are not used." ) toks5 <- tokens(c(text5 = "d e f")) expect_error(c(toks1, toks1), "Cannot combine tokens with duplicated document names") expect_error(c(toks1, toks2), "Cannot combine tokens with different concatenators") # expect_error(c(toks1, toks3), # "Cannot combine tokens in different tokenization units") expect_identical(names(attributes(c(toks1, toks4))), names(attributes(toks1))) expect_identical(attr(c(toks1, toks4), "meta")$object$what, "word") expect_identical(attr(c(toks1, toks4), "meta")$object$tokenizer, "tokenize_word3") expect_identical(attr(c(toks1, toks4), "meta")$object$concatenator, "_") expect_identical(attr(c(toks1, toks4), "meta")$object$ngram, c(1L)) expect_identical(attr(c(toks1, toks4), "meta")$object$skip, c(0L)) expect_identical(docnames(dfm(c(toks1, toks4))), c("text1", "text4")) expect_identical(names(attributes(c(toks1, toks5))), names(attributes(toks1))) expect_identical(attr(c(toks1, toks5), "meta")$object$what, "word") expect_identical(attr(c(toks1, toks5), "meta")$object$tokenizer, "tokenize_word3") expect_identical(attr(c(toks1, toks5), "meta")$object$concatenator, "_") expect_identical(attr(c(toks1, toks5), "meta")$object$ngram, 1L) expect_identical(attr(c(toks1, toks5), "meta")$object$skip, 0L) expect_identical(docnames(dfm(c(toks1, toks5))), c("text1", "text5")) }) test_that("tokens fasterword handles newlines correctly (#1447)", { expect_identical( as.list(tokens("one\ntwo\tthree", what = "fastestword")), list(text1 = c("one\ntwo\tthree")) ) expect_identical( suppressWarnings(as.list(tokens("one\ntwo\tthree", what = "fastestword", remove_separators = FALSE))), list(text1 = c("one\ntwo\tthree")) ) expect_identical( as.list(tokens("one\ntwo\tthree", what = "fasterword", remove_separators = TRUE)), list(text1 = c("one", "two", "three")) ) expect_identical( suppressWarnings(as.list(tokens("one\ntwo\tthree", what = "fasterword", remove_separators = FALSE))), list(text1 = c("one", "two", "three")) ) expect_identical( as.list(tokens("one\ntwo\tthree", what = "word", remove_separators = TRUE)), list(text1 = c("one", "two", "three")) ) }) test_that("warn when remove_separators = FALSE fasterword and fastestword", { expect_silent(tokens("a b c", what = "word")) expect_warning(tokens("a b c", what = "fasterword", remove_separators = FALSE), "remove_separators is always TRUE for this type") expect_warning(tokens("a b c", what = "fastestword", remove_separators = FALSE), "remove_separators is always TRUE for this type") }) test_that("tokens.tokens(x, split_hyphens = TRUE) behaves same as tokens.character(...)", { # issue #1498 txt <- "Auto-immune system." expect_identical( as.character(tokens(txt, split_hyphens = FALSE) |> tokens(split_hyphens = TRUE)), c("Auto", "-", "immune", "system", ".") ) txt <- c("There's shrimp-kabobs, shrimp creole. 
Deep-deep-fried, stir-fried.", "Stir-fried shrimp.") expect_identical( tokens(txt, split_hyphens = TRUE) |> as.list(), tokens(txt, split_hyphens = FALSE) |> tokens(split_hyphens = TRUE) |> as.list() ) }) test_that("types are encoded when necessarly", { toks <- tokens(c("まずは最初の文書。そして、次の文書。", "最後の文書")) expect_true(all(Encoding(types(toks)) == "UTF-8")) expect_true(all(Encoding(types(tokens_wordstem(toks))) == "UTF-8")) expect_true(all(Encoding(types(tokens_sample(toks, 1))) == "UTF-8")) expect_true(all(Encoding(types(tokens_tolower(toks))) == "UTF-8")) expect_true(all(Encoding(types(tokens_toupper(toks))) == "UTF-8")) expect_true(all(Encoding(types(tokens_ngrams(toks))) == "UTF-8")) expect_true(all(Encoding(types(tokens_remove(toks, "の"))) == "UTF-8")) expect_true(all(Encoding(types(tokens_replace(toks, phrase("次 の"), phrase("次 は")))) == "UTF-8")) expect_true(all(Encoding(types(tokens_split(toks, "は"))) == "UTF-8")) expect_true(all(Encoding(types(tokens_chunk(toks, 2))) == "UTF-8")) expect_true(all(Encoding(types(tokens_subset(toks, c(TRUE, FALSE)))) == "UTF-8")) }) test_that("tokens verbose = TRUE produces expected messages", { expect_message( tokens(c("one two three", "four five."), verbose = TRUE), "^Creating a tokens from a character object" ) expect_message( tokens(corpus(c("one two three", "four five.")), xptr = TRUE, verbose = TRUE), "^Creating a tokens_xptr from a corpus object" ) }) test_that("types<- with wrong value generates error", { toks <- tokens(c("one two three", "four five.")) expect_error( quanteda:::`types<-.tokens`(toks, value = 1:6), "replacement value must be character" ) }) test_that("tokens.tokens warns about unused arguments", { expect_warning( tokens(tokens("one two three"), notanarg = TRUE), "^notanarg argument is not used" ) }) test_that("tokens.tokens(x, split_hyphens = TRUE, verbose = TRUE) works as expected (#1683)", { expect_message( tokens(tokens("No hyphens here."), split_hyphens = TRUE, verbose = TRUE), "splitting hyphens" ) expect_message( tokens(tokens("Hyphens oft-cited here."), split_hyphens = TRUE, verbose = TRUE), "splitting hyphens" ) expect_identical( as.character(tokens(tokens("Hyphens oft-cited here."), split_hyphens = TRUE)), c("Hyphens", "oft", "-", "cited", "here", ".") ) }) test_that("tokens.tokens(x, split_tags = TRUE, verbose = TRUE) works as expected (#1683)", { expect_warning( tokens(tokens("No Twitter."), split_tags = TRUE), "split_tags argument is not used" ) expect_message( tokens(tokens("Removing #hashtags.", verbose = TRUE)), "preserving social media tags" ) expect_identical( as.character(tokens(tokens("Removing #hashtags.", split_tags = TRUE))), c("Removing", "#", "hashtags", ".") ) }) test_that("tokens.tokens(x, remove_numbers = TRUE, verbose = TRUE) works as expected (#1683)", { expect_message( tokens(tokens("Removing no number words."), remove_numbers = TRUE, verbose = TRUE), "...removing separators, numbers" ) expect_message( tokens(tokens("Removing 1 number words."), remove_numbers = TRUE, verbose = TRUE), "...removing separators, numbers" ) expect_identical( as.character(tokens(tokens("Removing 1 number words."), remove_numbers = TRUE)), c("Removing", "number", "words", ".") ) }) test_that("tokens.tokens(x, remove_punct = TRUE, verbose = TRUE) works as expected (#1683)", { expect_message( tokens(tokens("Removing no £ punctuation"), remove_punct = TRUE, verbose = TRUE), "...removing separators, punctuation" ) expect_message( tokens(tokens("Removing £ punctuation."), remove_symbols = TRUE, verbose = TRUE), "removing separators, 
symbols" ) expect_message( tokens(tokens("Removing £ punctuation."), remove_symbols = TRUE, remove_separators = TRUE, verbose = TRUE), "removing separators, symbols" ) expect_identical( as.character(tokens(tokens("Removing £ punctuation."), remove_punct = TRUE, remove_symbol = FALSE)), c("Removing", "£", "punctuation") ) }) test_that("tokens.tokens(x, remove_symbols = TRUE, verbose = TRUE) works as expected (#1683)", { expect_message( tokens(tokens("Removing no symbols."), remove_symbols = TRUE, verbose = TRUE), "removing separators, symbols" ) expect_message( tokens(tokens("Removing € symbols."), remove_symbols = TRUE, verbose = TRUE), "removing separators, symbols" ) expect_identical( as.character(tokens(tokens("Removing € symbols."), remove_symbols = TRUE)), c("Removing", "symbols", ".") ) }) test_that("tokens.tokens(x, remove_separators = TRUE, verbose = TRUE) works as expected (#1683)", { skip("the verbose message has been changed") expect_message( tokens(tokens("Removing separators", remove_separators = FALSE, what = "word1"), remove_separators = TRUE, verbose = TRUE), "...removing separators" ) expect_message( tokens(tokens("Removing no separators", remove_separators = TRUE), remove_separators = TRUE, verbose = TRUE), "removing separators" ) expect_identical( as.character( tokens(tokens("Removing separators", remove_separators = FALSE, what = "word1"), remove_separators = TRUE) ), c("Removing", "separators") ) expect_message( tokens(tokens("Removing separators", remove_separators = TRUE), verbose = TRUE), c("elapsed time: .+ seconds") ) }) test_that("tokens.tokens(x, remove_url = TRUE, verbose = TRUE) works as expected (#1683)", { expect_message( tokens(tokens("Removing https://quanteda.org URLs", what = "fasterword"), remove_url = TRUE, verbose = TRUE), "removing separators, URLs" ) expect_message( tokens(tokens("Removing no URLs"), remove_url = TRUE, verbose = TRUE), "removing separators, URLs" ) expect_identical( as.character(tokens(tokens("Removing https://quanteda.org URLs", what = "fasterword"), remove_url = TRUE)), c("Removing", "URLs") ) }) test_that("symbols and punctuation are handled separately (#1445)", { txt <- "£ € 👏 Rock on❗ 💪️🎸" expect_identical( as.character(tokens(txt, what = "word", remove_symbols = FALSE, remove_punct = TRUE)), as.character(tokens(txt, what = "word", remove_symbols = FALSE, remove_punct = FALSE)) ) expect_identical( as.character(tokens(txt, what = "fasterword", remove_symbols = FALSE, remove_punct = TRUE)), as.character(tokens(txt, what = "fasterword", remove_symbols = FALSE, remove_punct = FALSE)) ) expect_identical( as.character(tokens(txt, what = "fastestword", remove_symbols = FALSE, remove_punct = TRUE)), as.character(tokens(txt, what = "fastestword", remove_symbols = FALSE, remove_punct = FALSE)) ) }) test_that("test that what = \"word\" works the same as \"word2\"", { skip("we no longer expect these to be the same") chars <- c("a b c 12345 ! @ # $ % ^ & * ( ) _ + { } | : \' \" < > ? ! , . 
\t \n \u2028 \u00A0 \u2003", "#tag @user", "abc be-fg hi 100kg 2017", "a b c d e") expect_equal(tokens(chars, what = "word", remove_numbers = TRUE) |> as.list(), tokens(chars, what = "word1", remove_numbers = TRUE) |> as.list()) expect_equal(tokens(chars, what = "word", remove_numbers = TRUE) |> as.list(), tokens(chars, what = "word1", remove_numbers = TRUE) |> as.list()) expect_equal(tokens(chars, what = "word", remove_symbols = TRUE) |> as.list(), tokens(chars, what = "word1", remove_symbols = TRUE) |> as.list()) expect_equal(tokens(chars, what = "word", remove_symbols = TRUE) |> as.list(), tokens(chars, what = "word1", remove_symbols = TRUE) |> as.list()) expect_equal(tokens(chars, what = "word", remove_punct = TRUE) |> as.list(), tokens(chars, what = "word1", remove_punct = TRUE) |> as.list()) expect_equal(tokens(chars, what = "word", remove_punct = TRUE) |> as.list(), tokens(chars, what = "word1", remove_punct = TRUE) |> as.list()) expect_equal(tokens(chars, what = "word", remove_punct = TRUE, split_tags = TRUE) |> as.list(), tokens(chars, what = "word1", remove_punct = TRUE, split_tags = TRUE) |> as.list()) expect_equal(tokens(chars, what = "word", remove_punct = TRUE, split_tags = TRUE) |> as.list(), tokens(chars, what = "word1", remove_punct = TRUE, split_tags = TRUE) |> as.list()) suppressWarnings( expect_equal(tokens(chars, what = "word", remove_punct = FALSE, split_tags = TRUE) |> as.list(), tokens(chars, what = "word1", remove_punct = FALSE, split_tags = TRUE) |> as.list()) ) suppressWarnings( expect_equal(tokens(chars, what = "word", remove_punct = FALSE, split_tags = TRUE) |> as.list(), tokens(chars, what = "word1", remove_punct = FALSE, split_tags = TRUE) |> as.list()) ) expect_equal(tokens(chars, what = "word", split_hyphens = TRUE) |> as.list(), tokens(chars, what = "word1", split_hyphens = TRUE) |> as.list()) expect_equal(tokens(chars, what = "word", split_hyphens = TRUE) |> as.list(), tokens(chars, what = "word1", split_hyphens = TRUE) |> as.list()) }) test_that("tokens printing works", { toks <- tokens(data_corpus_inaugural[1:14]) expect_silent( print(toks, max_ndoc = 0, max_ntoken = 0, show_summary = FALSE) ) expect_output( print(toks, max_ndoc = 0, max_ntoken = 0, show_summary = TRUE), "Tokens consisting of 14 documents and 4 docvars.", fixed = TRUE ) expect_output( print(toks, max_ndoc = 2, max_ntoken = 3, show_summary = TRUE), paste0('Tokens consisting of 14 documents and 4 docvars.\n', '1789-Washington :\n', '[1] "Fellow-Citizens" "of" "the" \n', '[ ... and 1,534 more ]\n\n', '1793-Washington :\n', '[1] "Fellow" "citizens" "," \n', '[ ... and 144 more ]\n\n', '[ reached max_ndoc ... 12 more documents ]'), fixed = TRUE ) expect_output( print(toks, max_ndoc = 2, max_ntoken = 3, show_summary = FALSE), paste0('1789-Washington :\n', '[1] "Fellow-Citizens" "of" "the" \n', '[ ... and 1,534 more ]\n\n', '1793-Washington :\n', '[1] "Fellow" "citizens" "," \n', '[ ... and 144 more ]\n\n', '[ reached max_ndoc ... 12 more documents ]'), fixed = TRUE ) expect_output( print(toks[1:2], max_ndoc = 2, max_ntoken = 3, show_summary = FALSE), paste0('1789-Washington :\n', '[1] "Fellow-Citizens" "of" "the" \n', '[ ... and 1,534 more ]\n\n', '1793-Washington :\n', '[1] "Fellow" "citizens" "," \n', '[ ... and 144 more ]\n'), fixed = TRUE ) expect_output( print(tokens("a b c d"), max_ndoc = -1, max_ntoken = 2), paste0('Tokens consisting of 1 document.\n', 'text1 :\n', '[1] "a" "b"\n', '[ ... 
and 2 more ]\n'), fixed = TRUE ) expect_output( print(tokens("a b c d"), max_ndoc = -1, max_ntoken = 4), paste0('Tokens consisting of 1 document.\n', 'text1 :\n', '[1] "a" "b" "c" "d"'), fixed = TRUE ) expect_output( print(tokens("a b c d"), max_ndoc = -1, max_ntoken = -1), paste0('Tokens consisting of 1 document.\n', 'text1 :\n', '[1] "a" "b" "c" "d"'), fixed = TRUE ) }) test_that("tokens.list() works", { lis <- list(d1 = c("one", "two-three", "@test"), d2 = c("four", ".")) expect_identical(as.list(tokens(lis)), lis) expect_identical(as.list(tokens(lis, split_hyphens = TRUE)), list(d1 = c("one", "two", "-", "three", "@test"), d2 = c("four", "."))) }) test_that("tokens.character(x, padding = TRUE) works", { txt <- c(doc1 = "One 2, £ https://qunteda.org one-two.") # punct expect_identical( as.list(tokens(txt, what = "word", remove_punct = TRUE, padding = TRUE)), list(doc1 = c("One", "2", "", "£", "https://qunteda.org", "one-two", "")) ) expect_identical( as.list(tokens(txt, what = "word", remove_punct = TRUE, padding = FALSE)), list(doc1 = c("One", "2", "£", "https://qunteda.org", "one-two")) ) # symbols expect_identical( as.list(tokens(txt, what = "word", remove_symbols = TRUE, padding = TRUE)), list(doc1 = c("One", "2", ",", "", "https://qunteda.org", "one-two", ".")) ) expect_identical( as.list(tokens(txt, what = "word", remove_symbols = TRUE, padding = FALSE)), list(doc1 = c("One", "2", ",", "https://qunteda.org", "one-two", ".")) ) # numbers expect_identical( as.list(tokens(txt, what = "word", remove_numbers = TRUE, padding = TRUE)), list(doc1 = c("One", "", ",", "£", "https://qunteda.org", "one-two", ".")) ) expect_identical( as.list(tokens(txt, what = "word", remove_numbers = TRUE, padding = FALSE)), list(doc1 = c("One", ",", "£", "https://qunteda.org", "one-two", ".")) ) # url expect_identical( as.list(tokens(txt, what = "word", remove_url = TRUE, padding = TRUE)), list(doc1 = c("One", "2", ",", "£", "", "one-two", ".")) ) expect_identical( as.list(tokens(txt, what = "word", remove_url = TRUE, padding = FALSE)), list(doc1 = c("One", "2", ",", "£", "one-two", ".")) ) }) test_that("tokens.tokens(x, padding = TRUE) works", { txt <- c(doc1 = "One 2, £ https://qunteda.org one-two.") toks <- tokens(txt, what = "word") # punct expect_identical( as.list(tokens(toks, what = "word", remove_punct = TRUE, padding = TRUE)), list(doc1 = c("One", "2", "", "£", "https://qunteda.org", "one-two", "")) ) expect_identical( as.list(tokens(toks, what = "word", remove_punct = TRUE, padding = FALSE)), list(doc1 = c("One", "2", "£", "https://qunteda.org", "one-two")) ) # symbols expect_identical( as.list(tokens(toks, what = "word", remove_symbols = TRUE, padding = TRUE)), list(doc1 = c("One", "2", ",", "", "https://qunteda.org", "one-two", ".")) ) expect_identical( as.list(tokens(toks, what = "word", remove_symbols = TRUE, padding = FALSE)), list(doc1 = c("One", "2", ",", "https://qunteda.org", "one-two", ".")) ) # numbers expect_identical( as.list(tokens(toks, what = "word", remove_numbers = TRUE, padding = TRUE)), list(doc1 = c("One", "", ",", "£", "https://qunteda.org", "one-two", ".")) ) expect_identical( as.list(tokens(toks, what = "word", remove_numbers = TRUE, padding = FALSE)), list(doc1 = c("One", ",", "£", "https://qunteda.org", "one-two", ".")) ) # url expect_identical( as.list(tokens(toks, what = "word", remove_url = TRUE, padding = TRUE)), list(doc1 = c("One", "2", ",", "£", "", "one-two", ".")) ) expect_identical( as.list(tokens(toks, what = "word", remove_url = TRUE, padding = FALSE)), 
list(doc1 = c("One", "2", ",", "£", "one-two", ".")) ) }) test_that("special1 functions are working", { expect_identical( quanteda:::preserve_special1("#quanteda #q-x #q_y #q100 #q", split_hyphens = TRUE, split_tags = FALSE), "_ht_quanteda _ht_q-x _ht_q_y _ht_q100 _ht_q" ) expect_identical( quanteda:::preserve_special1("#quanteda #q-x #q_y #q100 #q", split_hyphens = FALSE, split_tags = FALSE), "_ht_quanteda _ht_q_hy_x _ht_q_y _ht_q100 _ht_q" ) toks1 <- as.tokens(list(c("_ht_quanteda", "_ht_q-x", "_ht_q_y", "_ht_q100", "_ht_q"))) expect_identical( attr(quanteda:::restore_special1(toks1, split_hyphens = TRUE, split_tags = FALSE), "types"), c("#quanteda", "#q-x", "#q_y", "#q100", "#q") ) toks2 <- as.tokens(list(c("_ht_quanteda", "_ht_q_hy_x", "_ht_q_y", "_ht_q100", "_ht_q"))) expect_identical( attr(quanteda:::restore_special1(toks2, split_hyphens = FALSE, split_tags = FALSE), "types"), c("#quanteda", "#q-x", "#q_y", "#q100", "#q") ) }) test_that("tokenizing Japanese with URLs works", { txt <- c(d1 = "私のユーザー名は@quantedainitです。") expect_identical( as.list(tokens(txt, what = "word")), list(d1 = c("私", "の", "ユーザー", "名", "は", "@quantedainit", "です", "。")) ) txt <- c(d1 = "私のウェブサイトはhttps://www.nichibenren.or.jp/です。") expect_identical( as.list(tokens(txt, what = "word")), list(d1 = c("私", "の", "ウェブサイト", "は", "https://www.nichibenren.or.jp/", "です", "。")) ) txt <- c(d1 = "10,000人のフォロワーがいます。") expect_identical( as.list(tokens(txt, what = "word")), list(d1 = c("10,000", "人", "の", "フォロワー", "がい", "ます", "。")) ) txt <- c(d1 = "私のウェブサイトはhttps://www.nichibenren.or.jp/です。10,000人のフォロワーがいます。") expect_identical( as.list(tokens(txt, what = "word")), list(d1 = c("私", "の", "ウェブサイト", "は", "https://www.nichibenren.or.jp/", "です", "。", "10,000", "人", "の", "フォロワー", "がい", "ます", "。")) ) }) test_that("Non-ASCII hashtags are preserved", { txt <- c(d1 = "オリンピック延期決定! #政治 #安倍政権") expect_identical( as.list(tokens(txt, what = "word")), list(d1 = c("オリンピック", "延期", "決定", "!", "#政治", "#安倍政権")) ) }) test_that("Weibo-style hashtags are preserved", { txt <- c(d1 = "#英国首相#仍在ICU") expect_identical( as.list(tokens(txt, what = "word")), list(d1 = c("#英国首相#", "仍在", "ICU")) ) }) test_that("emails address is preserved", { # prevents test failing on Ubuntu 20.04 on GitHub Actions skip_if( as.numeric(stringi::stri_info()$Unicode.version) > 10 && as.numeric(stringi::stri_info()$ICU.version) > 61.1 ) txt <- c(d1 = "support@quanteda.io K.Watanabe@qi1234.co.jp") expect_identical( as.list(tokens(txt, what = "word")), list(d1 = c("support@quanteda.io", "K.Watanabe@qi1234.co.jp")) ) }) test_that("split_tags works", { txt1 <- c(d1 = "@quanteda @koheiw7 @QUANTEDA_INITIATIVE") expect_identical( as.list(tokens(txt1, what = "word")), list(d1 = c("@quanteda", "@koheiw7", "@QUANTEDA_INITIATIVE")) ) # expect_identical( # as.list(tokens(txt1, what = "word", split_tags = TRUE)), # list(d1 = c("@", "quanteda", "@", "koheiw7", "@", "QUANTEDA_INITIATIVE")) # ) txt2 <- c(d1 = "#quanteda #q-x #q_y #q100 #q") expect_identical( as.list(tokens(txt2, what = "word")), list(d1 = c("#quanteda", "#q-x", "#q_y", "#q100", "#q")) ) expect_identical( as.list(tokens(txt2, what = "word", split_tags = TRUE)), list(d1 = c("#", "quanteda", "#", "q-x", "#", "q_y", "#", "q100", "#", "q")) ) }) test_that("old preserve_special works", { txt <- "This @username used this #hashtag." expect_identical( quanteda:::preserve_special1(txt, split_tags = FALSE), "This _as_username used this _ht_hashtag." 
    )
    expect_identical(
        quanteda:::preserve_special1(txt, split_tags = TRUE),
        txt
    )
})

test_that("output is correct for word1", {
    expect_message(
        toks <- tokens(data_char_ukimmig2010, what = "word1", split_hyphens = FALSE, verbose = TRUE),
        "preserving hyphens"
    )
    expect_message(
        toks <- tokens(data_char_ukimmig2010, what = "word1", split_hyphens = FALSE, verbose = TRUE),
        "Finished constructing tokens from 9 documents"
    )
    expect_message(
        toks <- tokens(data_char_ukimmig2010, what = "word1", split_hyphens = FALSE, verbose = TRUE),
        "^Creating a tokens from a character object"
    )
    expect_message(
        toks <- tokens(data_char_ukimmig2010, what = "sentence", verbose = TRUE),
        "segmenting into sentences"
    )
})

test_that("remove_numbers functions correctly", {
    txt <- "1 and 12 123 1975 12345 100,000 $1,000.00 123,123,456 and 50¢ 1.200,34 100bn 20-year-old 4ever gr8"
    toks1 <- tokens(txt, remove_numbers = TRUE)
    toks2 <- tokens(txt, what = "fasterword", remove_numbers = TRUE)
    expect_identical(
        as.character(toks1),
        c("and", "$", "and", "¢", "100bn", "20-year-old", "4ever", "gr8")
    )
    expect_identical(
        as.character(toks2),
        c("and", "and", "100bn", "20-year-old", "4ever", "gr8")
    )
})

test_that("edge case usernames are correctly recognized", {
    toks <- tokens("Valid username: @_", remove_punct = TRUE)
    expect_identical(
        as.character(toks),
        c("Valid", "username", "@_")
    )
})

test_that("concatenator is working", {
    txt <- c(d1 = "The United States is bordered by the Atlantic Ocean and the Pacific Ocean.",
             d2 = "The Supreme Court of the United States is seldom in a united state.")

    # construct
    toks <- tokens(txt, remove_punct = TRUE, concatenator = " ")
    expect_error(
        tokens(txt, remove_punct = TRUE, concatenator = c(" ", " ")),
        "The length of concatenator must be 1"
    )
    expect_equal(
        concatenator(toks),
        " "
    )
    expect_equal(
        concat(toks),
        " "
    )

    # compound
    dict <- dictionary(list(Countries = c("* States", "Federal Republic of *"),
                            Oceans = c("* Ocean")), tolower = FALSE)
    toks <- tokens_compound(toks, dict, concatenator = "_")
    expect_equal(
        concatenator(toks),
        "_"
    )
    expect_equal(
        concat(toks),
        "_"
    )
    expect_equal(
        ntoken(tokens_select(toks, c("United_States"))),
        c(d1 = 1, d2 = 1)
    )

    # update
    toks <- tokens(toks, concatenator = "+")
    expect_error(
        tokens(toks, concatenator = c(" ", " ")),
        "The length of concatenator must be 1"
    )
    expect_equal(
        concatenator(toks),
        "+"
    )
    expect_equal(
        concat(toks),
        "+"
    )
    expect_equal(
        ntoken(tokens_select(toks, "United+States")),
        c(d1 = 1, d2 = 1)
    )
})

test_that("concatenator is passed downstream", {
    txt <- c(d1 = "The United States is bordered by the Atlantic Ocean and the Pacific Ocean.",
             d2 = "The Supreme Court of the United States is seldom in a united state.")
    dict <- dictionary(list(Countries = c("* States", "Federal Republic of *"),
                            Oceans = c("* Ocean")), tolower = FALSE)

    # construct
    toks <- tokens(txt, remove_punct = TRUE, concatenator = "+")
    toks_dict <- tokens_lookup(toks, dict, append_key = TRUE)
    expect_true("United+States/Countries" %in% types(toks_dict))
    expect_identical(concat(toks_dict), "+")
    toks_ngram <- tokens_ngrams(toks)
    expect_true("United+States" %in% types(toks_ngram))
    expect_identical(concat(toks_ngram), "+")
    toks_comp <- tokens_compound(toks, dict)
    expect_true("United+States" %in% types(toks_comp))
    expect_identical(concat(toks_comp), "+")

    # re-construct with a different concatenator
    toks2 <- tokens(toks, remove_punct = TRUE, concatenator = "*")
    toks_dict2 <- tokens_lookup(toks2, dict, append_key = TRUE)
    expect_true("United*States/Countries" %in% types(toks_dict2))
    expect_identical(concat(toks_dict2), "*")
    toks_ngram2 <- tokens_ngrams(toks2)
    expect_true("United*States" %in% types(toks_ngram2))
    expect_identical(concat(toks_ngram2), "*")
    toks_comp2 <- tokens_compound(toks2, dict)
    expect_true("United*States" %in% types(toks_comp2))
    expect_identical(concat(toks_comp2), "*")
})

quanteda_options(reset = TRUE)