testthat::context("encode") test_that( "encode corpus", { library(data.table) testthat::skip_if_not_installed("janeaustenr") testthat::skip_if_not_installed("tidytext") cwb_dirs <- create_cwb_directories(prefix = tempdir(), ask = FALSE, verbose = FALSE) austen_data_dir_tmp <- fs::path(cwb_dirs[["corpus_dir"]], "austen") dir.create(austen_data_dir_tmp) Austen <- CorpusData$new() # Generate tokenstream -------------------------- books <- janeaustenr::austen_books() tbl <- tidytext::unnest_tokens(books, word, text, to_lower = FALSE) Austen$tokenstream <- as.data.table(tbl) Austen$tokenstream[, stem := SnowballC::wordStem(tbl[["word"]], language = "english")] Austen$tokenstream[, cpos := 0L:(nrow(tbl) - 1L)] setcolorder(Austen$tokenstream, c("cpos", "word", "stem")) # Generate metadata ---------------------------------------- cpos_max_min <- function(x) list(cpos_left = min(x[["cpos"]]), cpos_right = max(x[["cpos"]])) Austen$metadata <- Austen$tokenstream[, cpos_max_min(.SD), by = book] Austen$metadata[, book := as.character(book)] setcolorder(Austen$metadata, c("cpos_left", "cpos_right", "book")) # Clean up -------------------------------------------------- Austen$tokenstream[, book := NULL] # Encode ----------------------------------------- Austen$encode( corpus = "AUSTEN", encoding = "utf8", p_attributes = c("word", "stem"), s_attributes = "book", registry_dir = cwb_dirs[["registry_dir"]], data_dir = austen_data_dir_tmp, method = "R", compress = FALSE ) # Load corpus ------------------------------------------- # if (RcppCWB::cqp_is_initialized()){ # RcppCWB::cqp_reset_registry(registry = cwb_dirs[["registry_dir"]]) # } else { # RcppCWB::cqp_initialize(registry = cwb_dirs[["registry_dir"]]) # } # Check ------------------------------------------------- id <- RcppCWB::cl_str2id( corpus = "AUSTEN", p_attribute = "word", str = "pride", registry = cwb_dirs[["registry_dir"]] ) cpos <- RcppCWB::cl_id2cpos( corpus = "AUSTEN", p_attribute = "word", id = id, registry = cwb_dirs[["registry_dir"]] ) expect_identical(length(cpos), length(which(Austen$tokenstream[["word"]] == "pride"))) # Check CWB encoded version against R version skip_on_os(os = "windows") skip_on_cran() tryCatch( {success <- cwb_install(cwb_dir = fs::path(tempdir(), "cwb"))}, error = function(e) testthat::skip("cannot download CWB") ) if (is.null(success)) testthat::skip("cannot install CWB") Austen$encode( corpus = "AUSTEN2", encoding = "utf8", p_attributes = c("word", "stem"), s_attributes = "book", registry_dir = cwb_dirs[["registry_dir"]], data_dir = austen_data_dir_tmp, method = "CWB", compress = FALSE, reload = TRUE ) id2 <- RcppCWB::cl_str2id( corpus = "AUSTEN2", p_attribute = "word", str = "pride", registry = cwb_dirs[["registry_dir"]] ) cpos2 <- RcppCWB::cl_id2cpos( corpus = "AUSTEN2", p_attribute = "word", id = id2, registry = cwb_dirs[["registry_dir"]] ) expect_identical(length(cpos), length(cpos2)) expect_identical(cpos, cpos2) expect_identical( RcppCWB::cl_attribute_size(corpus = "AUSTEN", attribute = "word", attribute_type = "p"), RcppCWB::cl_attribute_size(corpus = "AUSTEN2", attribute = "word", attribute_type = "p") ) lex1 <- RcppCWB::cl_lexicon_size(corpus = "AUSTEN", p_attribute = "word") lex2 <- RcppCWB::cl_lexicon_size(corpus = "AUSTEN2", p_attribute = "word") expect_identical(lex1, lex2) vocab1 <- RcppCWB::cl_id2str(corpus = "AUSTEN", p_attribute = "word", id = 0L:(lex1 - 1L)) vocab2 <- RcppCWB::cl_id2str(corpus = "AUSTEN2", p_attribute = "word", id = 0L:(lex1 - 1L)) expect_identical(vocab1, vocab2) # check availability of s-attributes s_attrs1 <- RcppCWB::corpus_s_attributes( corpus = "AUSTEN", registry = cwb_dirs[["registry_dir"]] ) s_attrs2 <- RcppCWB::corpus_s_attributes( corpus = "AUSTEN2", registry = cwb_dirs[["registry_dir"]] ) expect_identical(s_attrs1, s_attrs2) lapply(cwb_dirs, unlink) # clean up } ) test_that( "check p_attribute with R/CWB, without/with compression", { testthat::skip_if_not_installed("janeaustenr") testthat::skip_if_not_installed("tidytext") registry_tmp <- fs::path(tempdir(), "registry") dir.create (registry_tmp) token_stream <- tidytext::unnest_tokens( janeaustenr::austen_books(), word, text, to_lower = FALSE )[["word"]] # pure R ------------------------------------------------------------------- data_dir_tmp1a <- fs::path(tempdir(), "data_dir", "austen1a") dir.create(data_dir_tmp1a, recursive = TRUE) data_dir_tmp1b <- fs::path(tempdir(), "data_dir", "austen1b") dir.create(data_dir_tmp1b, recursive = TRUE) p_attribute_encode( token_stream = token_stream, registry_dir = registry_tmp, corpus = "AUSTEN1A", data_dir = data_dir_tmp1a, method = "R", verbose = TRUE, quietly = FALSE, encoding = "utf8", compress = FALSE ) p_attribute_encode( token_stream = token_stream, registry_dir = registry_tmp, corpus = "AUSTEN1B", data_dir = data_dir_tmp1b, method = "R", verbose = TRUE, quietly = FALSE, encoding = "utf8", compress = TRUE ) # CWB ---------------------------------------------------------------------- skip_on_os("windows") # we know that compression does not work on Windows skip_on_cran() # installation on Linux requires sudo privileges if (!cwb_is_installed()) cwb_install() data_dir_tmp2a <- fs::path(tempdir(), "data_dir", "austen2a") dir.create(data_dir_tmp2a, recursive = TRUE) data_dir_tmp2b <- fs::path(tempdir(), "data_dir", "austen2b") dir.create(data_dir_tmp2b, recursive = TRUE) p_attribute_encode( token_stream = token_stream, registry_dir = registry_tmp, corpus = "AUSTEN2A", data_dir = data_dir_tmp2a, method = "CWB", verbose = TRUE, quietly = FALSE, encoding = "utf8", compress = FALSE ) p_attribute_encode( token_stream = token_stream, registry_dir = registry_tmp, corpus = "AUSTEN2B", data_dir = data_dir_tmp2b, method = "CWB", verbose = TRUE, quietly = FALSE, encoding = "utf8", compress = TRUE ) # tests -------------------------------------------------------------------- austen1a_md5 <- tools::md5sum(list.files(data_dir_tmp1a, full.names = TRUE)) names(austen1a_md5) <- basename(names(austen1a_md5)) austen2a_md5 <- tools::md5sum(list.files(data_dir_tmp2a, full.names = TRUE)) names(austen2a_md5) <- basename(names(austen2a_md5)) testthat::expect_identical(names(austen1a_md5), names(austen2a_md5)) for (file in names(austen1a_md5)) expect_identical( austen1a_md5[[file]], austen2a_md5[[file]] ) austen1b_md5 <- tools::md5sum(list.files(data_dir_tmp1b, full.names = TRUE)) names(austen1b_md5) <- basename(names(austen1b_md5)) austen2b_md5 <- tools::md5sum(list.files(data_dir_tmp2b, full.names = TRUE)) names(austen2b_md5) <- basename(names(austen2b_md5)) testthat::expect_identical(names(austen1b_md5), names(austen2b_md5)) for (file in names(austen1b_md5)) expect_identical( austen1b_md5[[file]], austen2b_md5[[file]] ) } )