# Label the tests in this file as the "Tokenizer" context.
context("Tokenizer")

test_that("Parser works for large strings", {
  # Build one long input by joining 100 copies of a 13-word sentence.
  sentence <- "I want to see the lovely, handier version of sing or singing dogs"
  long_text <- paste(rep(sentence, 100), collapse = " ")
  expected_tokens <- 1300

  # Parse the same input ten times with the default dictionary and with the
  # "ru_RU" and "russian-aot" dictionaries; each run must give 1300 tokens.
  for (run in seq_len(10)) {
    tokens_default <- hunspell_parse(long_text)[[1]]
    expect_length(tokens_default, expected_tokens)

    tokens_ru <- hunspell_parse(long_text, dict = "ru_RU")[[1]]
    expect_length(tokens_ru, expected_tokens)

    tokens_aot <- hunspell_parse(long_text, dict = "russian-aot")[[1]]
    expect_length(tokens_aot, expected_tokens)
  }
})

# Contractions and possessives are the most challenging to check
test_that("Test English apostrophe", {
  # Phrases containing contractions, singular possessives and plural
  # possessives (trailing apostrophe).
  text <- c(
    "let's, don't, couldn't, it's, she's",
    "the lawyer's fee",
    "the child's toy",
    "children's toys",
    "Tom Jones's first album",
    "anyone's guess",
    "excessive lawyers' fees",
    "the twins' parents",
    "the student teachers' supervisor",
    "the boys' baseball team"
  )

  # Do not split at the apostrophe: `len` holds the expected number of
  # tokens for each phrase in `text`, in the same order.
  len <- c(5, 3, 3, 2, 4, 2, 3, 3, 4, 4)
  # vapply() guarantees one integer per phrase, unlike sapply(), whose return
  # type depends on its input.
  expect_equal(
    vapply(hunspell_parse(text, dict = "en_US"), length, integer(1)),
    len
  )
  expect_equal(
    vapply(hunspell_parse(text, dict = "en_GB"), length, integer(1)),
    len
  )

  # Do not find any typos: each phrase, checked on its own, must produce no
  # misspelled words with either English dictionary.
  for (phrase in text) {
    expect_length(hunspell(phrase, dict = "en_GB")[[1]], 0)
    expect_length(hunspell(phrase, dict = "en_US")[[1]], 0)
  }
})

test_that("Quotes around words are ignored", {
  quoted_phrase <- "The 'hunspell' library"
  bare_tokens <- c("The", "hunspell", "library")

  # The tokenizer must return the quoted word without its surrounding quotes.
  parsed_us <- hunspell_parse(quoted_phrase, dict = "en_US")
  expect_equal(parsed_us[[1]], bare_tokens)
  parsed_gb <- hunspell_parse(quoted_phrase, dict = "en_GB")
  expect_equal(parsed_gb[[1]], bare_tokens)

  # The spell checker must report exactly "hunspell", again without quotes.
  flagged_us <- hunspell(quoted_phrase, dict = "en_US")
  expect_equal(flagged_us[[1]], "hunspell")
  flagged_gb <- hunspell(quoted_phrase, dict = "en_GB")
  expect_equal(flagged_gb[[1]], "hunspell")
})