quanteda_options("tokens_tokenizer_word" = "word4")

test_that("tokens indexing works as expected", {
    toks <- tokens(c(d1 = "one two three",
                     d2 = "four five six",
                     d3 = "seven eight"))

    expect_equal(toks[[1]], c("one", "two", "three"))
    expect_equal(as.list(toks[c(FALSE, TRUE, FALSE)]), list(d2 = c("four", "five", "six")))
    expect_equal(as.list(toks["d2"]), list(d2 = c("four", "five", "six")))
    expect_equal(as.list(toks[2]), list(d2 = c("four", "five", "six")))
    expect_equal(as.list(toks[c(-1, -3)]), list(d2 = c("four", "five", "six"))) # issue #1830

    # issue #370
    expect_equal(attr(toks[1], "types"), c("one", "two", "three"))
    expect_equal(attr(toks[2], "types"), c("four", "five", "six"))

    # issue #1308
    expect_error(toks[4], "Subscript out of bounds")
    expect_error(toks[1:4], "Subscript out of bounds")
    expect_error(toks["d4"], "Subscript out of bounds")
    expect_error(toks[c("d1", "d4")], "Subscript out of bounds")
})

test_that("test `ngrams` with padding = FALSE: #428", {
    toks <- tokens(c(doc1 = "a b c d e f g"))
    toks2 <- tokens_remove(toks, c("b", "e"), padding = FALSE)

    expect_equal(as.list(tokens_ngrams(toks2, n = 2)),
                 list(doc1 = c("a_c", "c_d", "d_f", "f_g")))
    expect_equal(as.list(tokens_ngrams(toks2, n = 3)),
                 list(doc1 = c("a_c_d", "c_d_f", "d_f_g")))
    expect_equal(as.list(tokens_ngrams(toks2, n = 2, skip = 2)),
                 list(doc1 = c("a_f", "c_g")))
})

test_that("test `ngrams` with padding = TRUE: #428", {
    toks <- tokens(c(doc1 = "a b c d e f g"))
    toks3 <- tokens_remove(toks, c("b", "e"), padding = TRUE)

    expect_equal(as.list(tokens_ngrams(toks3, n = 2)),
                 list(doc1 = c("c_d", "f_g")))
    expect_equal(as.list(tokens_ngrams(toks3, n = 3)),
                 list(doc1 = character(0)))
    expect_equal(as.list(tokens_ngrams(toks3, n = 2, skip = 2)),
                 list(doc1 = c("a_d", "c_f", "d_g")))
})

test_that("test dfm with padded tokens, padding = FALSE", {
    toks <- tokens(c(doc1 = "a b c d e f g",
                     doc2 = "a b c g",
                     doc3 = ""))
    toks3 <- tokens_remove(toks, c("b", "e"), padding = FALSE)
    expect_equivalent(as.matrix(dfm(toks3)),
                      matrix(c(1, 1, 1, 1, 1,
                               1, 1, 0, 0, 1,
                               0, 0, 0, 0, 0), nrow = 3, byrow = TRUE))
})

test_that("test dfm with padded tokens, padding = TRUE", {
    toks <- tokens(c(doc1 = "a b c d e f g",
                     doc2 = "a b c g",
                     doc3 = ""))
    toks3 <- tokens_remove(toks, c("b", "e"), padding = TRUE)
    expect_equivalent(as.matrix(dfm(toks3)),
                      matrix(c(2, 1, 1, 1, 1, 1,
                               1, 1, 1, 0, 0, 1,
                               0, 0, 0, 0, 0, 0), nrow = 3, byrow = TRUE))
})

test_that("docnames works for tokens", {
    expect_equal(names(data_char_ukimmig2010),
                 docnames(tokens(data_char_ukimmig2010)))
})

test_that("longer features longer than documents do not crash (#447)", {
    toks <- tokens(c(d1 = "a b", d2 = "a b c d e"))
    feat <- "b c d e"
    # bugs in C++ needs repeated tests
    expect_silent(replicate(10, tokens_select(toks, feat)))
    expect_equal(
        as.list(tokens_select(toks, feat)),
        list(d1 = character(0), d2 = character(0))
    )
    expect_equal(
        as.list(tokens_select(toks, phrase(feat))),
        list(d1 = character(0), d2 = c("b", "c", "d", "e"))
    )
})

test_that("tokens works as expected for what = character", {
    expect_equal(
        as.character(tokens("one, two three.", what = "character", remove_separators = TRUE)),
        c("o", "n", "e", ",", "t", "w", "o", "t", "h", "r", "e", "e", ".")
    )
    expect_equal(
        as.character(tokens("one, two three.", what = "character", remove_separators = FALSE)),
        c("o", "n", "e", ",", " ", "t", "w", "o", " ", "t", "h", "r", "e", "e", ".")
    )
    expect_equal(
        as.character(tokens("one, two three.", what = "character", remove_punct = TRUE,
                            remove_separators = TRUE)),
        c("o", "n", "e", "t", "w", "o", "t", "h", "r", "e", "e")
    )
})

test_that("tokens works with unusual hiragana #554", {
    skip_on_cran()
    skip_on_os("windows")
    skip_on_os("mac")
    txts <- c("づいﾞ", "゛んﾞ", "たーﾟ")
    expect_equivalent(as.list(tokens(txts)),
                      list(c("づ", "いﾞ"), c("゛", "んﾞ"), c("た", "ーﾟ")))
})

test_that("types attribute is a character vector", {
    toks <- tokens("one two three")
    expect_true(is.character(attr(toks, "types")))
    expect_equal(length(attributes(attr(toks, "types"))), 0)
})

test_that("remove_url works as expected", {
    txt <- c("The URL was http://t.co/something.",
             "The URL was http://quanteda.io",
             "https://github.com/quanteda/quanteda/issue/1 is another URL",
             "www.r-project.org/about.html is a specific page without protocol",
             "https://www.google.com/search?q=quanteda+package is a google search",
             "ftp://user@host/foo/bar.txt is a FTP-hosted file",
             "kohei.watanabe@quanteda.org is an email address",
             "The U.S. is not an url")
    toks <- tokens(txt, remove_url = TRUE)
    expect_equal(
        as.list(toks),
        list(text1 = c("The", "URL", "was"),
             text2 = c("The", "URL", "was"),
             text3 = c("is", "another", "URL"),
             text4 = c("is", "a", "specific", "page", "without", "protocol"),
             text5 = c("is", "a", "google", "search"),
             text6 = c("is", "a", "FTP-hosted", "file"),
             text7 = c("is", "an", "email", "address"),
             text8 = c("The", "U.S", ".", "is", "not", "an", "url"))
    )
})

test_that("+ operator works with tokens", {
    txt1 <- c(d1 = "This is sample document one.",
              d2 = "Here is the second sample document.")
    txt2 <- c(d3 = "And the third document.")
    toks_added <- tokens(txt1) + tokens(txt2)
    expect_equal(
        length(unique(as.character(toks_added))),
        length(attr(toks_added, "types"))
    )
    expect_equal(ndoc(toks_added), 3)
    # expect_error(
    #     tokens(txt1, what = "word") + tokens(txt2, what = "sentence"),
    #     "Cannot combine tokens in different tokenization units"
    # )
})

test_that("+ works with empty padded tokens (#1695)", {
    toks1 <- tokens(c(d1 = "a b"))
    toks2 <- tokens(c(d2 = ""))
    toks3 <- tokens(c(d3 = "c"))
    toks4 <- tokens(c(d4 = "c d"))

    expect_identical(
        as.list(toks1 + toks2),
        list(d1 = c("a", "b"), d2 = character(0))
    )
    expect_identical(
        as.list(toks1 + toks3),
        list(d1 = c("a", "b"), d3 = "c")
    )
    expect_identical(
        as.list(toks1 + tokens_remove(toks3, pattern = "c", pad = FALSE)),
        list(d1 = c("a", "b"), d3 = character(0))
    )
    expect_identical(
        as.list(toks1 + tokens_remove(toks3, pattern = "c", pad = TRUE)),
        list(d1 = c("a", "b"), d3 = "")
    )
    expect_identical(
        as.list(tokens_remove(toks3, pattern = "c", pad = TRUE) + toks1),
        list(d3 = "", d1 = c("a", "b"))
    )
    expect_identical(
        as.list(toks1 + tokens_remove(toks4, pattern = "c", pad = FALSE)),
        list(d1 = c("a", "b"), d4 = "d")
    )
    expect_identical(
        as.list(toks1 + tokens_remove(toks4, pattern = "c", pad = TRUE)),
        list(d1 = c("a", "b"), d4 = c("", "d"))
    )
    expect_identical(
        as.list(tokens_remove(toks4, pattern = "c", pad = TRUE) +
                    tokens_remove(toks3, pattern = "c", pad = TRUE)),
        list(d4 = c("", "d"), d3 = "")
    )
})

test_that("c() works with tokens", {

    toks1 <- tokens(c(d1 = "This is sample document one.",
                      d2 = "Here is the second sample document."))
    toks2 <- tokens(c(d3 = "And the third document."))
    toks3 <- tokens(c(d4 = "This is sample document 4."))
    toks4 <- tokens(c(d1 = "This is sample document five!"))

    expect_equal(
        c(toks1),
        toks1
    )

    expect_equal(
        c(toks1, toks2),
        toks1 + toks2
    )

    expect_equal(
        c(toks1, toks2, toks3),
        toks1 + toks2 + toks3
    )

    expect_error(
        c(toks1, toks4),
        "Cannot combine tokens with duplicated document names"
    )

    # issue #1835
    toks <- c(tokens(data_corpus_inaugural[1:2]),
              tokens(data_corpus_inaugural[3:5]),
              tokens(data_corpus_inaugural[6:10]))

    expect_equivalent(
         as.list(toks),
         as.list(tokens(data_corpus_inaugural[1:10]))
    )

    expect_equal(
        docvars(toks),
        docvars(tokens(data_corpus_inaugural[1:10]))
    )
})

test_that("docvars are erased for tokens added", {
    corp <- corpus(c(d1 = "This is sample document one.",
                     d2 = "Here is the second sample document."),
                   docvars = data.frame(dvar1 = c("A", "B"), dvar2 = c(1, 2)))
    toks1 <- tokens(corp, include_docvars = TRUE)
    expect_equivalent(
        docvars(toks1),
        data.frame(dvar1 = c("A", "B"), dvar2 = c(1, 2))
    )
    toks2 <- tokens(c(d3 = "And the third sample document."))
    expect_equivalent(
        docvars(toks1 + toks2),
        data.frame(dvar1 = c("A", "B", NA), dvar2 = c(1, 2, NA))
    )
})

test_that("what = character works with @ and #, issue #637", {
    expect_identical(as.list(tokens("This: is, a @test! #tag", what = "character",
                                remove_punct = FALSE)),
                 list(text1 = c("T", "h", "i", "s", ":", "i", "s", ",",
                                "a", "@", "t", "e", "s", "t", "!", "#", "t", "a", "g")))
    expect_identical(as.list(tokens("This: is, a @test! #tag", what = "character",
                                remove_punct = TRUE)),
                 list(text1 = c("T", "h", "i", "s", "i", "s",
                                "a", "t", "e", "s", "t", "t", "a", "g")))
})

test_that("unlist retuns character vector, issue #716", {
    expect_equal(unlist(tokens(c(doc1 = "aaa bbb cccc", doc2 = "aaa bbb dddd"))),
                 c(doc11 = "aaa", doc12 = "bbb", doc13 = "cccc",
                   doc21 = "aaa", doc22 = "bbb", doc23 = "dddd"))
    expect_equal(unlist(tokens(c(doc1 = "aaa bbb cccc", doc2 = "aaa bbb dddd")), use.names = FALSE),
                 c("aaa", "bbb", "cccc", "aaa", "bbb", "dddd"))
})

test_that("unused argument warnings for tokens work as expected", {

    # for tokens
    expect_identical(
        as.character(tokens(c(d1 = "This: punctuation"), remove_punct = TRUE)),
        c("This", "punctuation")
    )
    expect_warning(
        tokens(c(d1 = "This: punctuation"), notarg1 = TRUE),
        "^notarg1 argument is not used"
    )
    expect_warning(
        tokens(c(d1 = "This: punctuation"), notarg1 = TRUE, notarg2 = FALSE),
        "^notarg1, notarg2 arguments are not used\\."
    )

})

test_that("tokens arguments works with values from parent frame (#721)", {
    expect_identical(
        tokens("This contains 99 numbers.", remove_numbers = T),
        tokens("This contains 99 numbers.", remove_numbers = TRUE),
    )

    suppressWarnings({
        expect_identical(
        dfm(tokens("This contains 99 numbers."), remove_numbers = T),
        dfm(tokens("This contains 99 numbers."), remove_numbers = TRUE)
    )
    })

    val <- FALSE
    expect_identical(
        tokens("This contains 99 numbers.", remove_numbers = val),
        tokens("This contains 99 numbers.", remove_numbers = FALSE)
    )
    suppressWarnings({
        expect_identical(
        dfm(tokens("This contains 99 numbers."), remove_numbers = val),
        dfm(tokens("This contains 99 numbers."), remove_numbers = FALSE)
    )
    })
})

test_that("tokens works for strange spaces (#796)", {
    txt <- "space tab\t newline\n non-breakingspace\u00A0, variationselector16 \uFE0F."
    expect_identical(ntoken(txt, remove_punct = FALSE, remove_separators = TRUE),
                     c(text1 = 7L))
    expect_identical(
        as.character(tokens(txt, what = "word", remove_punct = TRUE, remove_separators = TRUE)),
        c("space", "tab", "newline", "non-breakingspace", "variationselector16")
    )
    expect_silent(
        tokens(txt, what = "word", remove_separators = FALSE)
        # "remove_separators is always TRUE for this type"
    )
})

test_that("tokens works with control characters", {
    txt <- "Left-to-Right Override \u202D Zero-Width Non-Breaking Space \ufeff"
    expect_equal(ntoken(txt), c(text1 = 5))
})

test_that("tokens remove whitespace with combining characters (#882)", {

    skip_on_travis()
    skip_on_cran()
    skip_on_appveyor()
    skip_on_os("windows")

    txt <- "( \u0361\u00b0 \u035c\u0296 \u0361\u00b0)"
    tok <- tokens(txt)
    expect_equal(as.list(tok)[[1]],
                 c("(", "°", "ʖ", "°", ")"))

})

test_that("split_hyphens is working correctly", {
    txt <- "a b-c d . !"
    expect_equal(as.character(tokens(txt, split_hyphens = FALSE, remove_punct = FALSE)[[1]]),
                 c("a", "b-c", "d", ".", "!"))
    expect_equal(as.character(tokens(txt, split_hyphens = FALSE, remove_punct = TRUE)[[1]]),
                 c("a", "b-c", "d"))
    expect_equal(as.character(tokens(txt, split_hyphens = TRUE, remove_punct = FALSE)[[1]]),
                 c("a", "b", "-", "c", "d", ".", "!"))
    expect_equal(as.character(tokens(txt, split_hyphens = TRUE, remove_punct = TRUE)[[1]]),
                 c("a", "b", "c", "d"))
})

test_that("tokens.tokens() does nothing by default", {
    toks <- tokens(data_corpus_inaugural,
                   remove_numbers = FALSE,
                   remove_punct = FALSE,
                   remove_symbols = FALSE,
                   remove_separators = TRUE,
                   split_hyphens = FALSE,
                   remove_url = FALSE)
    expect_equal(toks, tokens(toks))
})

test_that("tokens works as expected with NA, and blanks", {
    expect_equal(
        as.list(tokens(c("one", "two", ""))),
        list(text1 = "one", text2 = "two", text3 = character())
    )
    expect_equal(
        as.list(suppressWarnings(tokens(c("one", NA, "")))),
        list(text1 = "one", text2 = character(), text3 = character())
    )
    expect_equal(
        as.list(suppressWarnings(tokens(c(NA, "one", "")))),
        list(text1 = character(), text2 = "one", text3 = character())
    )
    expect_equal(
        as.list(tokens("")),
        list(text1 = character())
    )
    expect_equal(
        as.list(suppressWarnings(tokens(c(d1 = "", d2 = NA)))),
        list(d1 = character(), d2 = character())
    )
    expect_equal(
        as.list(suppressWarnings(tokens(c(d1 = NA, d2 = "")))),
        list(d1 = character(), d2 = character())
    )
    expect_equal(
        as.character(as.tokens(list(""))),
        character()
    )
})

test_that("assignment operators are disabled for tokens object", {
    toks <- tokens(c(d1 = "a b c d", d2 = "c d e"))

    try(toks[[1]] <- c(6, 100, "z"), silent = TRUE)
    expect_equal(as.list(toks),
                 list(d1 = c("a", "b", "c", "d"), d2 = c("c", "d", "e")))

    expect_error(toks[[1]] <- c(6, 100, "z"), "assignment to tokens objects is not allowed")
    expect_error(toks[1] <- list(c(6, 100, "z")), "assignment to tokens objects is not allowed")
})

test_that("assignment operators are disabled for tokens object", {
    toks <- tokens(c(d1 = "a b c d", d2 = "c d e"))

    try(toks[[1]] <- c(6, 100, "z"), silent = TRUE)
    expect_equal(as.list(toks),
                 list(d1 = c("a", "b", "c", "d"), d2 = c("c", "d", "e")))

    expect_error(toks[[1]] <- c(6, 100, "z"), "assignment to tokens objects is not allowed")
    expect_error(toks[1] <- list(c(6, 100, "z")), "assignment to tokens objects is not allowed")
})

test_that("empty tokens are removed correctly", {
    txt <- "a   b  c d e "
    tok <- c("a", "b", "c", "d", "e")
    expect_equal(as.list(tokens(txt, what = "word"))[[1]], tok)
})

test_that("combined tokens objects have all the attributes", {

    toks1 <- tokens(c(text1 = "a b c"))
    toks2 <- tokens_compound(tokens(c(text2 = "d e f")), phrase("e f"), concatenator = "+")
    toks3 <- tokens(c(text3 = "d e f"), what = "sentence")
    expect_warning(
        toks4 <- tokens(c(text4 = "d e f"), ngram = 1:2, skip = 2),
        "ngram, skip arguments are not used."
    )
    toks5 <- tokens(c(text5 = "d e f"))

    expect_error(c(toks1, toks1),
                 "Cannot combine tokens with duplicated document names")
    expect_error(c(toks1, toks2),
                 "Cannot combine tokens with different concatenators")
    # expect_error(c(toks1, toks3),
    #              "Cannot combine tokens in different tokenization units")

    expect_identical(names(attributes(c(toks1, toks4))),
                     names(attributes(toks1)))
    expect_identical(attr(c(toks1, toks4), "meta")$object$what, "word")
    expect_identical(attr(c(toks1, toks4), "meta")$object$concatenator, "_")
    expect_identical(attr(c(toks1, toks4), "meta")$object$ngram, c(1L))
    expect_identical(attr(c(toks1, toks4), "meta")$object$skip, c(0L))

    expect_identical(docnames(dfm(c(toks1, toks4))), c("text1", "text4"))
    expect_identical(names(attributes(c(toks1, toks5))),
                     names(attributes(toks1)))
    expect_identical(attr(c(toks1, toks5), "meta")$object$what, "word")
    expect_identical(attr(c(toks1, toks5), "meta")$object$concatenator, "_")
    expect_identical(attr(c(toks1, toks5), "meta")$object$ngram, 1L)
    expect_identical(attr(c(toks1, toks5), "meta")$object$skip, 0L)
    expect_identical(docnames(dfm(c(toks1, toks5))), c("text1", "text5"))
})

test_that("tokens.tokens(x, split_hyphens = TRUE) behaves same as tokens.character(...)", {
    # issue #1498
    txt <- "Auto-immune system."
    expect_identical(
        as.character(tokens(txt, split_hyphens = FALSE) |> tokens(split_hyphens = TRUE)),
        c("Auto", "-", "immune", "system", ".")
    )

    txt <- c("There's shrimp-kabobs, shrimp creole. Deep-deep-fried, stir-fried.",
             "Stir-fried shrimp.")
    expect_identical(
        tokens(txt, split_hyphens = TRUE) |> as.list(),
        tokens(txt, split_hyphens = FALSE) |> tokens(split_hyphens = TRUE) |> as.list()
    )
})

test_that("types are encoded when necessarly", {
    toks <- tokens(c("まずは最初の文書。そして、次の文書。", "最後の文書"))
    expect_true(all(Encoding(types(toks)) == "UTF-8"))
    expect_true(all(Encoding(types(tokens_wordstem(toks))) == "UTF-8"))
    expect_true(all(Encoding(types(tokens_sample(toks, 1))) == "UTF-8"))
    expect_true(all(Encoding(types(tokens_tolower(toks))) == "UTF-8"))
    expect_true(all(Encoding(types(tokens_toupper(toks))) == "UTF-8"))
    expect_true(all(Encoding(types(tokens_ngrams(toks))) == "UTF-8"))
    expect_true(all(Encoding(types(tokens_remove(toks, "の"))) == "UTF-8"))
    expect_true(all(Encoding(types(tokens_replace(toks, phrase("次 の"), phrase("次 は")))) == "UTF-8"))
    expect_true(all(Encoding(types(tokens_split(toks, "は"))) == "UTF-8"))
    expect_true(all(Encoding(types(tokens_chunk(toks, 2))) == "UTF-8"))
    expect_true(all(Encoding(types(tokens_subset(toks, c(TRUE, FALSE)))) == "UTF-8"))

})

test_that("tokens verbose = TRUE produces expected messages", {
    expect_message(
        tokens(c("one two three", "four five."), verbose = TRUE),
        "starting tokenization"
    )
})

test_that("types<- with wrong value generates error", {
    toks <- tokens(c("one two three", "four five."))
    expect_error(
        quanteda:::`types<-.tokens`(toks, value = 1:6),
        "replacement value must be character"
    )
})

test_that("tokens.tokens warns about unused arguments", {
    expect_warning(
        tokens(tokens("one two three"), notanarg = TRUE),
        "^notanarg argument is not used"
    )
})

test_that("tokens.tokens(x, split_hyphens = TRUE, verbose = TRUE) works as expected  (#1683)", {
    expect_message(
        tokens(tokens("No hyphens here."), split_hyphens = TRUE, verbose = TRUE),
        "splitting hyphens"
    )
    expect_message(
        tokens(tokens("Hyphens oft-cited here."), split_hyphens = TRUE, verbose = TRUE),
        "splitting hyphens"
    )
    expect_identical(
        as.character(tokens(tokens("Hyphens oft-cited here."), split_hyphens = TRUE)),
        c("Hyphens", "oft", "-", "cited", "here", ".")
    )
})

test_that("tokens.tokens(x, split_tags = TRUE, verbose = TRUE) works as expected  (#1683)", {
    expect_warning(
        tokens(tokens("No Twitter."), split_tags = TRUE),
        "split_tags argument is not used"
    )
    expect_message(
        tokens(tokens("Removing #hashtags.", what = "word", verbose = TRUE)),
        "preserving social media tags"
    )
})

test_that("tokens.tokens(x, remove_numbers = TRUE, verbose = TRUE) works as expected (#1683)", {
    expect_message(
        tokens(tokens("Removing no number words."), remove_numbers = TRUE, verbose = TRUE),
        "...removing separators, numbers"
    )
    expect_message(
        tokens(tokens("Removing 1 number words."), remove_numbers = TRUE, verbose = TRUE),
        "...removing separators, numbers"
    )
    expect_identical(
        as.character(tokens(tokens("Removing 1 number words."), remove_numbers = TRUE)),
        c("Removing", "number", "words", ".")
    )
})

test_that("tokens.tokens(x, remove_punct = TRUE, verbose = TRUE) works as expected (#1683)", {
    expect_message(
        tokens(tokens("Removing no £ punctuation"), remove_punct = TRUE, verbose = TRUE),
        "...removing separators, punctuation"
    )
    expect_message(
        tokens(tokens("Removing £ punctuation."), remove_symbols = TRUE, verbose = TRUE),
        "removing separators, symbols"
    )
    expect_message(
        tokens(tokens("Removing £ punctuation."), remove_symbols = TRUE, remove_separators = TRUE, verbose = TRUE),
        "removing separators, symbols"
    )
    expect_identical(
        as.character(tokens(tokens("Removing £ punctuation."), remove_punct = TRUE,
                            remove_symbol = FALSE)),
        c("Removing", "£", "punctuation")
    )
})

test_that("tokens.tokens(x, remove_symbols = TRUE, verbose = TRUE) works as expected (#1683)", {
    expect_message(
        tokens(tokens("Removing no symbols."), remove_symbols = TRUE, verbose = TRUE),
        "removing separators, symbols"
    )
    expect_message(
        tokens(tokens("Removing € symbols."), remove_symbols = TRUE, verbose = TRUE),
        "removing separators, symbols"
    )
    expect_identical(
        as.character(tokens(tokens("Removing € symbols."), remove_symbols = TRUE)),
        c("Removing", "symbols", ".")
    )
})

test_that("tokens.tokens(x, remove_separators = TRUE, verbose = TRUE) works as expected (#1683)", {
    skip("the verbose message has been changed")
    expect_message(
        tokens(tokens("Removing separators", remove_separators = FALSE, what = "word"),
               remove_separators = TRUE, verbose = TRUE),
        "...removing separators"
    )
    expect_message(
        tokens(tokens("Removing no separators", remove_separators = TRUE), remove_separators = TRUE, verbose = TRUE),
        "removing separators"
    )
    expect_identical(
        as.character(
            tokens(tokens("Removing separators", remove_separators = FALSE, what = "word"), remove_separators = TRUE)
        ),
        c("Removing", "separators")
    )
    expect_message(
        tokens(tokens("Removing separators", remove_separators = TRUE), verbose = TRUE),
        c("elapsed time: .+ seconds")
    )
})

test_that("tokens printing works", {
    toks <- tokens(data_corpus_inaugural[1:14])
    expect_silent(
        print(toks, max_ndoc = 0, max_ntoken = 0, show_summary = FALSE)
    )
    expect_output(
        print(toks, max_ndoc = 0, max_ntoken = 0, show_summary = TRUE),
        "Tokens consisting of 14 documents and 4 docvars.",
        fixed = TRUE
    )
    expect_output(
        print(toks, max_ndoc = 2, max_ntoken = 3, show_summary = TRUE),
        paste0('Tokens consisting of 14 documents and 4 docvars.\n',
               '1789-Washington :\n',
               '[1] "Fellow-Citizens" "of"              "the"            \n',
               '[ ... and 1,534 more ]\n\n',
               '1793-Washington :\n',
               '[1] "Fellow"   "citizens" ","       \n',
               '[ ... and 144 more ]\n\n',
               '[ reached max_ndoc ... 12 more documents ]'),
        fixed = TRUE
    )
    expect_output(
        print(toks, max_ndoc = 2, max_ntoken = 3, show_summary = FALSE),
        paste0('1789-Washington :\n',
               '[1] "Fellow-Citizens" "of"              "the"            \n',
               '[ ... and 1,534 more ]\n\n',
               '1793-Washington :\n',
               '[1] "Fellow"   "citizens" ","       \n',
               '[ ... and 144 more ]\n\n',
               '[ reached max_ndoc ... 12 more documents ]'),
        fixed = TRUE
    )
    expect_output(
        print(toks[1:2], max_ndoc = 2, max_ntoken = 3, show_summary = FALSE),
        paste0('1789-Washington :\n',
               '[1] "Fellow-Citizens" "of"              "the"            \n',
               '[ ... and 1,534 more ]\n\n',
               '1793-Washington :\n',
               '[1] "Fellow"   "citizens" ","       \n',
               '[ ... and 144 more ]\n'),
        fixed = TRUE
    )

    expect_output(
        print(tokens("a b c d"), max_ndoc = -1, max_ntoken = 2),
        paste0('Tokens consisting of 1 document.\n',
               'text1 :\n',
               '[1] "a" "b"\n',
               '[ ... and 2 more ]\n'),
        fixed = TRUE
    )
    expect_output(
        print(tokens("a b c d"), max_ndoc = -1, max_ntoken = 4),
        paste0('Tokens consisting of 1 document.\n',
               'text1 :\n',
               '[1] "a" "b" "c" "d"'),
        fixed = TRUE
    )
    expect_output(
        print(tokens("a b c d"), max_ndoc = -1, max_ntoken = -1),
        paste0('Tokens consisting of 1 document.\n',
               'text1 :\n',
               '[1] "a" "b" "c" "d"'),
        fixed = TRUE
    )
})

test_that("tokens.list() works", {
    lis <- list(d1 = c("one", "two-three", "@test"), d2 = c("four", "."))
    expect_identical(as.list(tokens(lis)), lis)
    expect_identical(as.list(tokens(lis, split_hyphens = TRUE)),
                     list(d1 = c("one", "two", "-", "three", "@test"),
                          d2 = c("four", ".")))
})

test_that("tokens.character(x, padding = TRUE) works", {
    txt <- c(doc1 = "One 2, £ https://qunteda.org one-two.")

    # punct
    expect_identical(
        as.list(tokens(txt, what = "word", remove_punct = TRUE, padding = TRUE)),
        list(doc1 = c("One", "2", "", "£", "https://qunteda.org", "one-two", ""))
    )
    expect_identical(
        as.list(tokens(txt, what = "word", remove_punct = TRUE, padding = FALSE)),
        list(doc1 = c("One", "2", "£", "https://qunteda.org", "one-two"))
    )

    # symbols
    expect_identical(
        as.list(tokens(txt, what = "word", remove_symbols = TRUE, padding = TRUE)),
        list(doc1 = c("One", "2", ",", "", "https://qunteda.org", "one-two", "."))
    )
    expect_identical(
        as.list(tokens(txt, what = "word", remove_symbols = TRUE, padding = FALSE)),
        list(doc1 = c("One", "2", ",", "https://qunteda.org", "one-two", "."))
    )

    # numbers
    expect_identical(
        as.list(tokens(txt, what = "word", remove_numbers = TRUE, padding = TRUE)),
        list(doc1 = c("One", "", ",", "£", "https://qunteda.org", "one-two", "."))
    )
    expect_identical(
        as.list(tokens(txt, what = "word", remove_numbers = TRUE, padding = FALSE)),
        list(doc1 = c("One", ",", "£", "https://qunteda.org", "one-two", "."))
    )

    # url
    expect_identical(
        as.list(tokens(txt, what = "word", remove_url = TRUE, padding = TRUE)),
        list(doc1 = c("One", "2", ",", "£", "", "one-two", "."))
    )
    expect_identical(
        as.list(tokens(txt, what = "word", remove_url = TRUE, padding = FALSE)),
        list(doc1 = c("One", "2", ",", "£", "one-two", "."))
    )
})

test_that("tokens.tokens(x, padding = TRUE) works", {
    txt <- c(doc1 = "One 2, £ https://qunteda.org one-two.")
    toks <- tokens(txt, what = "word")

    # punct
    expect_identical(
        as.list(tokens(toks, what = "word", remove_punct = TRUE, padding = TRUE)),
        list(doc1 = c("One", "2", "", "£", "https://qunteda.org", "one-two", ""))
    )
    expect_identical(
        as.list(tokens(toks, what = "word", remove_punct = TRUE, padding = FALSE)),
        list(doc1 = c("One", "2", "£", "https://qunteda.org", "one-two"))
    )

    # symbols
    expect_identical(
        as.list(tokens(toks, what = "word", remove_symbols = TRUE, padding = TRUE)),
        list(doc1 = c("One", "2", ",", "", "https://qunteda.org", "one-two", "."))
    )
    expect_identical(
        as.list(tokens(toks, what = "word", remove_symbols = TRUE, padding = FALSE)),
        list(doc1 = c("One", "2", ",", "https://qunteda.org", "one-two", "."))
    )

    # numbers
    expect_identical(
        as.list(tokens(toks, what = "word", remove_numbers = TRUE, padding = TRUE)),
        list(doc1 = c("One", "", ",", "£", "https://qunteda.org", "one-two", "."))
    )
    expect_identical(
        as.list(tokens(toks, what = "word", remove_numbers = TRUE, padding = FALSE)),
        list(doc1 = c("One", ",", "£", "https://qunteda.org", "one-two", "."))
    )

    # url
    expect_identical(
        as.list(tokens(toks, what = "word", remove_url = TRUE, padding = TRUE)),
        list(doc1 = c("One", "2", ",", "£", "", "one-two", "."))
    )
    expect_identical(
        as.list(tokens(toks, what = "word", remove_url = TRUE, padding = FALSE)),
        list(doc1 = c("One", "2", ",", "£", "one-two", "."))
    )
})

test_that("tokenizing Japanese with URLs works", {
    txt <- c(d1 = "私のユーザー名は@quantedainitです。")
    expect_identical(
        as.list(tokens(txt, what = "word")),
        list(d1 = c("私", "の", "ユーザー", "名", "は", "@quantedainit", "です", "。"))
    )

    txt <- c(d1 = "私のウェブサイトはhttps://www.nichibenren.or.jp/です。")
    expect_identical(
        as.list(tokens(txt, what = "word")),
        list(d1 = c("私", "の", "ウェブサイト", "は", "https://www.nichibenren.or.jp/", "です", "。"))
    )

    txt <- c(d1 = "10,000人のフォロワーがいます。")
    expect_identical(
        as.list(tokens(txt, what = "word")),
        list(d1 = c("10,000", "人", "の", "フォロワー", "がい", "ます", "。"))
    )

    txt <- c(d1 = "私のウェブサイトはhttps://www.nichibenren.or.jp/です。10,000人のフォロワーがいます。")
    expect_identical(
        as.list(tokens(txt, what = "word")),
        list(d1 = c("私", "の", "ウェブサイト", "は", "https://www.nichibenren.or.jp/", "です", "。", "10,000",
                     "人", "の", "フォロワー", "がい", "ます", "。"))
    )
})

test_that("Non-ASCII hashtags are preserved", {
    txt <- c(d1 = "オリンピック延期決定！ #政治 #安部政権")
    expect_identical(
        as.list(tokens(txt, what = "word")),
        list(d1 = c("オリンピック", "延期", "決定", "！", "#政治", "#安部政権"))
    )
})

test_that("Weibo-style hashtags are preserved", {
    txt <- c(d1 = "#英国首相#仍在ICU")
    expect_identical(
        as.list(tokens(txt, what = "word")),
        list(d1 = c("#英国首相#", "仍在", "ICU"))
    )
})

test_that("email addresses are preserved", {
    txt <- c(d1 = "support-team@e-mail.quanteda.io SupportTeam@quanteda.org",
             d2 = "K.Watanabe@qi1234.co.jp K_Watanabe@qi1234.com",
             d3 = "support+noreply@qi-japan.tokyo")
    expect_identical(
        as.list(tokens(txt, what = "word")),
        list(d1 = c("support-team@e-mail.quanteda.io", "SupportTeam@quanteda.org"),
             d2 = c("K.Watanabe@qi1234.co.jp", "K_Watanabe@qi1234.com"),
             d3 = "support+noreply@qi-japan.tokyo")
    )
})

test_that("split_tags works", {
    txt1 <- c(d1 = "@quanteda @koheiw7 @QUANTEDA_INITIATIVE")
    expect_identical(
        as.list(tokens(txt1, what = "word")),
        list(d1 = c("@quanteda", "@koheiw7", "@QUANTEDA_INITIATIVE"))
    )
    expect_identical(
        as.list(tokens(txt1, what = "word", split_tags = TRUE)),
        list(d1 = c("@", "quanteda", "@", "koheiw7", "@", "QUANTEDA_INITIATIVE"))
    )
    
    txt2 <- c(d1 = "#quanteda #q-x #q_y #q100 #q")
    expect_identical(
        as.list(tokens(txt2, what = "word")),
        list(d1 = c("#quanteda", "#q", "-", "x", "#q_y", "#q100", "#q"))
    )
    expect_identical(
        as.list(tokens(txt2, what = "word", split_tags = TRUE)),
        list(d1 = c("#", "quanteda", "#", "q-x", "#", "q_y", "#", "q100", "#", "q"))
    )
})

test_that("edge case usernames are correctly recognized", {
    toks <- tokens("Valid username: @_", remove_punct = TRUE)
    expect_identical(
        as.character(toks),
        c("Valid", "username", "@_")
    )
})

test_that("split_elisions is working", {
    expect_equal(
        tokens("Qu'est-ce que c'est?", split_elisions = FALSE)[[1]],
        c("Qu'est-ce", "que", "c'est", "?")
    )
    expect_equal(
        tokens("Qu'est-ce que c'est?", split_elisions = TRUE)[[1]],
        c("Qu'", "est-ce", "que", "c'", "est", "?")
    )
})

quanteda_options(reset = TRUE)