# Raw example sentences shared by every test in this file.
corpus_original <- c(
  "The rabbit munched the orange carrot.",
  "The snake hugged the green lizard.",
  "The hedgehog impaled the orange orange.",
  "The squirrel buried the brown nut."
)

# Text preprocessing, applied innermost call first: lowercase, strip
# punctuation, delete stopwords (together with any spaces that follow them),
# then trim leftover leading/trailing whitespace.
# Note: this is only an example and not the best way to handle larger
# amounts of text.
stopwords <- c("the", "a", "an", "and")
corpus <- trimws(
  gsub(
    paste0("\\b(", paste(stopwords, collapse = "|"), ") *\\b"),
    "",
    gsub("[[:punct:]]", "", tolower(corpus_original))
  )
)

# Metadata attached to the corpus: the untouched original text and a source
# label ("book1", "book2", ...) for each document.
metadata <- data.frame(
  text_original = corpus_original,
  source = paste0("book", seq_along(corpus_original))
)

test_that("BM25 works", {
  # Build the object under test from the preprocessed corpus and its metadata.
  bm <- BM25$new(data = corpus, metadata = metadata)

  # Class vector and the language reported when none is passed explicitly.
  expect_equal(class(bm), c("BM25", "R6"))
  expect_equal(bm$get_lang(), "Detect")

  # The stored data is expected to be the corpus text followed by the
  # metadata columns, in that order.
  expect_equal(bm$get_data(), data.frame(text = corpus, metadata))

  # Full table of language codes the object is expected to advertise.
  language_table <- c(
    ar = "arabic", da = "danish", nl = "dutch",
    en = "english", fr = "french", de = "german",
    el = "greek", hu = "hungarian", it = "italian",
    no = "norwegian", pt = "portuguese", ro = "romanian",
    ru = "russian", es = "spanish", sv = "swedish",
    ta = "tamil", tr = "turkish", auto = "detect"
  )
  expect_equal(bm$available_languages(), language_table)

  # Query for "orange", keeping at most two hits. Document 3 (two
  # occurrences) is expected to rank ahead of document 1 (one occurrence).
  top_ids <- c(3, 1)
  expected_hits <- data.frame(
    id = top_ids,
    score = c(0.49042809, 0.35667497),
    rank = c(1, 2),
    text = corpus[top_ids],
    text_original = corpus_original[top_ids],
    source = c("book3", "book1")
  )
  hits <- bm$query(query = "orange", max_n = 2)

  expect_equal(hits, expected_hits)
})

test_that("bm25_score works", {
  # One score per document, in corpus order; documents 2 and 4 do not
  # contain "orange" and are expected to score zero.
  expected_scores <- c(0.35667497, 0, 0.49042809, 0)
  actual_scores <- bm25_score(data = corpus, query = "orange")

  expect_equal(actual_scores, expected_scores)
})