test_that("find_duplicates() doesn't flag list-col NULLs as identical", { result <- list(1, NULL, NULL, 2, 1) |> find_duplicates() expect_equal(as.integer(result), c(1, 2, 3, 4, 1)) }) test_that("find_duplicates() works", { # with no duplicates x <- read_refs("testdata/ASP_ris_example.ris") result <- x |> dplyr::pull("doi") |> find_duplicates() expect_equal(as.integer(result), c(1:4)) # with duplicates x <- rbind(x, x[1,]) result <- x |> dplyr::pull("doi") |> find_duplicates() expect_equal(as.integer(result), c(1:4,1)) }) # example from `overview.Rmd` test_that("review_duplicates() doesn't fail", { bibfiles <- list.files( system.file("extdata/", package = "synthesisr"), full.names = TRUE) imported_files <- read_refs( filename = bibfiles, return_df = TRUE) possible_duplicates <- find_duplicates(imported_files$title, to_lower = TRUE, rm_punctuation = TRUE) # run review_duplicates manual_checks <- review_duplicates(imported_files$title, possible_duplicates) check_list <- manual_checks$title |> tolower() |> stringr::str_replace_all("[[:punct:]]", " ") |> split(manual_checks$matches) # ensure every entry in the list contains identical text lapply(check_list, \(a){all(a == a[[1]])}) |> unlist() |> all() |> expect_true() }) test_that("`deduplicate()` works using titles", { my_df <- tibble::tibble( title = c( "EviAtlas: a tool for visualising evidence synthesis databases", "revtools: An R package to support article screening for evidence synthesis", "An automated approach to identifying search terms for systematic reviews using keyword co-occurrence networks", "Reproducible, flexible and high-throughput data extraction from primary literature: The metaDigitise r package", "eviatlas:tool for visualizing evidence synthesis databases.", "REVTOOLS a package to support article-screening for evidence synthsis"), year = c("2019", "2019", "2019", "2019", NA, NA), authors = c("Haddaway et al", "Westgate", "Grames et al", "Pick et al", NA, NA)) # run deduplication on dataset with duplicates deduped <- deduplicate(my_df, "title", rm_punctuation = TRUE, to_lower = TRUE) expect_equal(deduped[1:3],my_df[1:4,]) # run deduplicate on dataset without duplicates deduped_2 <- deduplicate(my_df[1:4,], "title", rm_punctuation = TRUE, to_lower = TRUE) expect_equal(deduped_2[1:3], my_df[1:4,]) }) test_that("deduplicate() works using dois by default", { bibfiles <- list.files( system.file("extdata/", package = "synthesisr"), full.names = TRUE) df_initial <- read_refs( filename = bibfiles, return_df = TRUE) df <- deduplicate(df_initial) # tests go here. Seem to be two problems: # DOIs shouldn't be imported as a list here # - maybe need some post-hoc check to unlist() length-1 lists? # DOIs are returning n = 93, which is too low; # NA uniqueness not being recognized? })