test_that("html tags removed", { testing <- data.frame(ID = c(1,2), page_notes = c(" The review .", "text with
a page break", "tag without spaces", " color ")) results <- token_comments(testing) expect_identical(results[[1]], c("the", "review")) expect_identical(results[[2]], c("text", "with", "a", "page", "break")) expect_identical(results[[3]], c("tag", "without", "spaces")) expect_identical(results[[4]], "color") }) test_that("dollar sign removed", { testing <- data.frame(ID = c(1), page_notes = c("$4.50")) results <- token_comments(testing) expect_identical(results[[1]], c("450")) }) test_that("period between characters removed to keep characters together", { testing <- data.frame(ID = c(1), page_notes = c("This is a sentence. No.2")) results <- token_comments(testing) expect_identical(results[[1]], c("this","is","a","sentence","no2")) }) test_that("comma between characters removed to keep characters together", { testing <- data.frame(ID = c(1), page_notes = c("This, is a sentence. 5,000")) results <- token_comments(testing) expect_identical(results[[1]], c("this","is","a","sentence","5000")) }) test_that("dash removed and separates characters", { testing <- data.frame(ID = c(1,2), page_notes = c("dash-name","1877-1777")) results <- token_comments(testing) expect_identical(results[[1]], c("dash","name")) expect_identical(results[[2]], c("1877","1777")) })