context("Connecting to Tika")

# We are not allowed to install the Tika jar automatically on CRAN.
# The plan is to use skip_on_cran() so tests run on travis but are
# skipped on CRAN. Install the jar after the skip_on_cran() call.

# Install the Tika jar when rtika::tika_jar() reports NA (no jar found).
# Takes no arguments; called for its side effect only.
install_if_needed <- function() {
  if (is.na(rtika::tika_jar())) {
    rtika::install_tika()
  }
}

# Sample documents shipped in the package's extdata directory.
input <- c(
  system.file("extdata", "jsonlite.pdf", package = "rtika"),
  system.file("extdata", "curl.pdf", package = "rtika"),
  system.file("extdata", "table.docx", package = "rtika"),
  system.file("extdata", "xml2.pdf", package = "rtika"),
  system.file("extdata", "R-FAQ.html", package = "rtika"),
  system.file("extdata", "calculator.jpg", package = "rtika"),
  system.file("extdata", "tika.apache.org.zip", package = "rtika")
)

test_that("gets valid path", {
  skip_on_cran()
  install_if_needed()
  path <- tika_jar()
  expect_equal(length(path), 1)
  # is.character() instead of class(path) == "character": the comparison
  # yields a vector when an object carries more than one class.
  expect_true(is.character(path))
  expect_true(file.exists(path))
})

test_that("check_md5_sum only works with character strings ", {
  skip_on_cran()
  install_if_needed()
  expect_error(tika_check(777))
})

test_that("check_md5_sum fails with wrong checksum", {
  skip_on_cran()
  install_if_needed()
  expect_false(tika_check("not a checksum"))
})

# test_that("md5_sum is correct for this version", {
#   skip_on_cran()
#   install_if_needed()
#
#   expect_true(tika_check("e2720c2392c1bd6634cc4a8801f7363a"))
# })

# causes problem with travis, but not locally. May need to skip
test_that("tika warns with url to nowhere without curl package", {
  skip_on_cran()
  install_if_needed()
  nowhere <- "http://www.predict-r.com/rtika_testing_coverage_file_not_here.txt"
  expect_warning(tika(nowhere, lib.loc = ""))
})

# causes problem with travis, but not locally. May need to skip
test_that("tika stops when output_dir is root", {
  skip_on_cran()
  install_if_needed()
  expect_error(tika(input[1], output_dir = file.path("/")))
})

test_that("tika parses a local pdf without curl packages", {
  skip_on_cran()
  install_if_needed()
  text <- tika(input[1], lib.loc = "")
  expect_equal(length(text), 1)
  expect_true(!is.na(text[1]))
  expect_true(nchar(text[1]) > 0)
})

test_that("tika handles fake file without curl packages", {
  skip_on_cran()
  install_if_needed()
  nowhere <- file.path("/rtika_fake_file_not_here.txt")
  expect_warning(text <- tika(nowhere, lib.loc = ""))
  expect_equal(length(text), 1)
  expect_equal(text[1], as.character(NA))
})

test_that("tika parses a single remote pdf without curl package", {
  skip_on_cran()
  install_if_needed()
  # Only the first URL is fetched by this test.
  urls <- c(
    "https://cran.r-project.org/doc/manuals/r-release/R-data.pdf",
    "https://cran.r-project.org/doc/manuals/r-release/R-exts.epub",
    "https://cran.r-project.org/doc/manuals/r-release/R-FAQ.html"
  )
  text <- tika(urls[1], lib.loc = "")
  expect_equal(length(text), 1)
  expect_true(!is.na(text[1]))
  expect_true(nchar(text[1]) > 0)
})

# causes problem with travis, but not locally. Skipping on github for now.
test_that("tika warns with url to nowhere with curl package", {
  skip_on_cran()
  install_if_needed()
  nowhere <- "http://www.predict-r.com/rtika_testing_coverage_file_not_here.txt"
  expect_warning(tika(nowhere))
})

test_that("tika parses single local pdf", {
  skip_on_cran()
  install_if_needed()
  text <- tika(input[1])
  expect_equal(length(text), 1)
  expect_true(!is.na(text[1]))
  expect_true(nchar(text[1]) > 0)
})

test_that("tika parses multiple local files", {
  skip_on_cran()
  install_if_needed()
  text <- tika(input)
  expect_equal(length(text), length(input))
  expect_true(!any(is.na(text)))
  expect_true(all(nchar(text) > 0))
})

test_that("tika parses a single remote pdf", {
  skip_on_cran()
  install_if_needed()
  # Only the first URL is fetched by this test.
  urls <- c(
    "https://cran.r-project.org/doc/manuals/r-release/R-data.pdf",
    "https://cran.r-project.org/doc/manuals/r-release/R-exts.epub",
    "https://cran.r-project.org/doc/manuals/r-release/R-FAQ.html"
  )
  text <- tika(urls[1])
  expect_equal(length(text), 1)
  expect_true(!is.na(text[1]))
  expect_true(nchar(text[1]) > 0)
})

test_that("tika warns with path to nowhere", {
  skip_on_cran()
  install_if_needed()
  expect_warning(tika(file.path("/rtika_fake_file_not_here.txt")))
})

test_that("tika outputs NA with a path to nowhere", {
  skip_on_cran()
  install_if_needed()
  nowhere <- file.path("/rtika_fake_file_not_here.txt")
  # Assign inside expect_warning(): the expectation's own return value is
  # not guaranteed to be tika()'s output (it can be the captured condition).
  expect_warning(text <- tika(nowhere))
  expect_equal(length(text), 1)
  expect_equal(text[1], as.character(NA))
})

test_that("tika outputs NA with a path to nowhere in right order", {
  skip_on_cran()
  install_if_needed()
  nowhere <- file.path("/rtika_fake_file_not_here.txt")
  text <- tika(c(input[1], nowhere, input[2]))
  expect_equal(text[2], as.character(NA))
  expect_equal(length(text), 3)
  expect_true(all(nchar(text[c(1, 3)]) > 0))
})

test_that("tika warns with NA input", {
  skip_on_cran()
  install_if_needed()
  nowhere <- as.character(NA)
  expect_warning(text <- tika(nowhere))
  expect_equal(length(text), 1)
  expect_equal(text[1], as.character(NA))
})

test_that("tika outputs parsable xml", {
  skip_on_cran()
  install_if_needed()
  # xml2 is called through `::`, so no library() call is needed.
  text <- tika_xml(input)
  # Parse the first three results; a parse failure raises an error and
  # fails the test.
  for (i in seq_len(3)) {
    processed_xml <- xml2::read_xml(text[i])
    expect_true(!is.na(processed_xml))
  }
})

test_that("tika outputs parsable html", {
  skip_on_cran()
  install_if_needed()
  # xml2 is called through `::`, so no library() call is needed.
  text <- tika_html(input)
  # Parse the first three results; a parse failure raises an error and
  # fails the test.
  for (i in seq_len(3)) {
    processed_html <- xml2::read_html(text[i])
    expect_true(!is.na(processed_html))
  }
})

test_that("tika outputs parsable json", {
  skip_on_cran()
  install_if_needed()
  # jsonlite is called through `::`, so no library() call is needed.
  text <- tika_json(input)
  for (i in seq_along(text)) {
    processed_json <- jsonlite::fromJSON(text[i])
    expect_true(nrow(processed_json) >= 1)
  }
})

test_that("tika_text works", {
  skip_on_cran()
  install_if_needed()
  text <- tika_text(input[1])
  expect_equal(length(text), 1)
  expect_true(!is.na(text[1]))
  expect_true(nchar(text[1]) > 0)
})

test_that("tika puts files into the specified output_dir", {
  skip_on_cran()
  install_if_needed()
  test_dir <- tempfile("testthat_rtika_test")
  dir.create(test_dir)
  test_dir <- normalizePath(test_dir, winslash = "/")
  text <- tika(input[1], output_dir = test_dir, cleanup = TRUE)
  files <- list.files(
    test_dir,
    include.dirs = FALSE,
    recursive = TRUE
  )
  expect_true(length(files) > 0)
  full_path <- file.path(test_dir, files)
  expect_true(all(file.exists(full_path)))
  file_info <- file.info(full_path)
  # At least one of the listed entries must be a regular file.
  expect_true(!all(file_info$isdir))
})

test_that("tika cleans up", {
  skip_on_cran()
  install_if_needed()
  text <- tika(input[1], cleanup = TRUE)
  # No entries matching the "^rtika_file" / "^rtika_dir" patterns may remain
  # in the session temp directory after the call.
  leftover_files <- list.files(tempdir(), pattern = "^rtika_file")
  expect_equal(length(leftover_files), 0)
  leftover_dirs <- list.files(tempdir(), pattern = "^rtika_dir")
  expect_equal(length(leftover_dirs), 0)
})