## tests/testthat/test-read-biblio.R
## Target: >= 85% line coverage of R/read-biblio.R
## No network calls; all fixtures via tempfile().

## ── helpers ───────────────────────────────────────────────────────────────────

## Scopus-style CSV fixture with `n` rows. The EID column is what the
## detection tests rely on for the "scopus" result. Returns the file path.
make_scopus_file <- function(n = 2L) {
  idx <- seq_len(n)
  records <- data.frame(
    Title = paste0("Paper ", idx),
    Authors = paste0("Author", idx, " A."),
    Year = 2020L + (idx - 1L),
    `Source title` = paste0("Journal ", idx),
    `Cited by` = idx,
    EID = paste0("2-s2.0-00", idx),
    DOI = paste0("10.1/sc", idx),
    check.names = FALSE,
    stringsAsFactors = FALSE
  )
  path <- tempfile(fileext = ".csv")
  write.csv(records, file = path, row.names = FALSE)
  path
}

## Dimensions-style CSV fixture: an "About the data: …" preamble line, then
## the column header, then a single record. Returns the file path.
make_dimensions_file <- function() {
  preamble <- '"About the data: Export created 2024-01-01"'
  header <- '"Publication ID","Title","PubYear","Source title","DOI","Times cited","Abstract","Publication Type","Authors","Cited references","Authors Affiliations - Name of Research organization","Authors Affiliations - Country of Research organization","Keywords"'
  record <- '"pub.111","A dimensions paper","2021","Dim Journal","10.1/dim","3","Some abstract.","article","Smith, John","Jones A 2019 Jour","Uni A","USA","network"'
  path <- tempfile(fileext = ".csv")
  writeLines(c(preamble, header, record), con = path)
  path
}

## Lens-style CSV fixture: header carrying "Lens ID" plus a single record.
## Returns the file path.
make_lens_file <- function() {
  header <- '"Lens ID","Title","Publication Year","Source Title","DOI","Citing Works Count","Abstract","Publication Type","Authors","References","Keywords"'
  record <- '"000-001","A lens paper","2022","Lens Journal","10.1/lens","7","Lens abstract.","journal article","Smith J; Jones K","Ref A 2020","networks"'
  path <- tempfile(fileext = ".csv")
  writeLines(c(header, record), con = path)
  path
}

## WoS plaintext fixture (first line begins with "FN") holding one record.
## Returns the file path.
make_wos_file <- function() {
  tagged_lines <- c(
    "FN Web of Science",
    "VR 1.0",
    "UT WOS:000001",
    "TI A wos paper",
    "AU Smith, John",
    "PY 2021",
    "SO WoS Journal",
    "TC 4",
    "ER",
    "EF"
  )
  path <- tempfile(fileext = ".txt")
  writeLines(tagged_lines, con = path)
  path
}

## BibTeX fixture (first line begins with "@") holding one @article entry.
## Returns the file path.
make_bibtex_file <- function() {
  entry <- c(
    "@article{key1,",
    " title = {A bibtex paper},",
    " author = {Smith, John},",
    " year = {2023},",
    " journal = {Bib Journal}",
    "}"
  )
  path <- tempfile(fileext = ".bib")
  writeLines(entry, con = path)
  path
}

## RIS fixture (first line begins with "TY -") holding one journal record.
## Returns the file path.
make_ris_file <- function() {
  record <- c(
    "TY - JOUR",
    "TI - A ris paper",
    "AU - Smith, John",
    "PY - 2022",
    "JO - RIS Journal",
    "DO - 10.1/ris1",
    "ER - "
  )
  path <- tempfile(fileext = ".ris")
  writeLines(record, con = path)
  path
}

## Three-row CSV with pipe-delimited multi-value columns, used by the
## format = "generic" tests. Returns the file path.
make_generic_file <- function() {
  records <- data.frame(
    doc_id = c("D1", "D2", "D3"),
    headline = c("Paper One", "Paper Two", "Paper Three"),
    Authors = c("Alice|Bob", "Carol", "Dave|Eve|Frank"),
    Tags = c("network|graph", "statistics", "learning|AI"),
    year = c(2020L, 2021L, 2022L),
    stringsAsFactors = FALSE
  )
  path <- tempfile(fileext = ".csv")
  write.csv(records, file = path, row.names = FALSE)
  path
}

## ════════════════════════════════════════════════════════════════════════════
## 1.
## detect_format() — uncovered branches
## ════════════════════════════════════════════════════════════════════════════

test_that("detect_format returns 'unknown' for an empty file", {
  empty_path <- tempfile(fileext = ".csv")
  writeLines(character(0), con = empty_path)
  ## detect_format() is not exported, hence the ::: access.
  expect_equal(bibnets:::detect_format(empty_path), "unknown")
})

test_that("detect_format returns 'wos' for a file starting with FN", {
  wos_path <- tempfile(fileext = ".txt")
  writeLines(c("FN Web of Science", "VR 1.0"), con = wos_path)
  detected <- bibnets:::detect_format(wos_path)
  expect_equal(detected, "wos")
})

test_that("detect_format returns 'wos' for a file starting with PT", {
  wos_path <- tempfile(fileext = ".txt")
  writeLines(c("PT J", "AU Smith, J"), con = wos_path)
  detected <- bibnets:::detect_format(wos_path)
  expect_equal(detected, "wos")
})

test_that("detect_format returns 'scopus' for CSV with EID header", {
  scopus_path <- make_scopus_file(1L)
  detected <- bibnets:::detect_format(scopus_path)
  expect_equal(detected, "scopus")
})

test_that("detect_format returns 'lens' for CSV with Lens ID header", {
  lens_path <- make_lens_file()
  detected <- bibnets:::detect_format(lens_path)
  expect_equal(detected, "lens")
})

test_that("detect_format returns 'unknown' for an unrecognised CSV", {
  plain_csv <- tempfile(fileext = ".csv")
  writeLines(c("col_a,col_b,col_c", "1,2,3"), con = plain_csv)
  detected <- bibnets:::detect_format(plain_csv)
  expect_equal(detected, "unknown")
})

## ════════════════════════════════════════════════════════════════════════════
## 2.
## resolve_paths() — directory and nonexistent path branches
## ════════════════════════════════════════════════════════════════════════════

test_that("resolve_paths returns files from a directory", {
  tmp_root <- tempdir()
  ## The PID in the stem keeps the fixture names unique per R process.
  stem <- paste0("test_rb_", Sys.getpid())
  csv_path <- file.path(tmp_root, paste0(stem, "_a.csv"))
  bib_path <- file.path(tmp_root, paste0(stem, "_b.bib"))
  writeLines("col\n1", con = csv_path)
  writeLines("@misc{k}", con = bib_path)
  on.exit(unlink(c(csv_path, bib_path)), add = TRUE)

  found <- bibnets:::resolve_paths(tmp_root)
  expect_true(csv_path %in% found)
  expect_true(bib_path %in% found)
})

test_that("resolve_paths returns character(0) for nonexistent path", {
  found <- bibnets:::resolve_paths("/tmp/no_such_path_xyz_bibnets")
  expect_length(found, 0L)
  expect_type(found, "character")
})

test_that("resolve_paths handles mix of existing file and nonexistent path", {
  existing <- make_ris_file()
  candidates <- c(existing, "/tmp/definitely_missing_xyz")
  found <- bibnets:::resolve_paths(candidates)
  expect_equal(found, existing)
})

## ════════════════════════════════════════════════════════════════════════════
## 3. read_biblio() — empty path stops with informative error
## ════════════════════════════════════════════════════════════════════════════

test_that("read_biblio stops with informative error when no files found", {
  missing_file <- "/tmp/absolutely_no_file_bibnets_xyz.csv"
  expect_error(read_biblio(missing_file), regexp = "No files found")
})

test_that("read_biblio stops when given a nonexistent directory", {
  missing_dir <- "/tmp/nonexistent_dir_bibnets_xyz/"
  expect_error(read_biblio(missing_dir), regexp = "No files found")
})

## ════════════════════════════════════════════════════════════════════════════
## 4.
## read_single_biblio() — format = "generic" short-circuit path
## ════════════════════════════════════════════════════════════════════════════

test_that("read_biblio with format='generic' invokes read_generic correctly", {
  csv_path <- make_generic_file()
  parsed <- read_biblio(
    csv_path,
    format = "generic",
    id = "doc_id",
    actors = c("Authors", "Tags"),
    sep = "|"
  )

  expect_equal(nrow(parsed), 3L)
  expect_true(is.list(parsed$Authors))
  expect_true(is.list(parsed$Tags))
  ## Row 1 holds "Alice|Bob" → two authors after splitting on "|".
  expect_length(parsed$Authors[[1]], 2L)
  ## Row 3 holds "Dave|Eve|Frank" → three authors.
  expect_length(parsed$Authors[[3]], 3L)
  ## The id column is taken from doc_id.
  expect_equal(parsed$id, c("D1", "D2", "D3"))
})

test_that("read_biblio generic: actors not in file are silently skipped", {
  csv_path <- make_generic_file()
  parsed <- read_biblio(
    csv_path,
    format = "generic",
    actors = c("Authors", "NonExistent")
  )

  expect_true(is.list(parsed$Authors))
  ## The unknown actor name must neither error nor appear as a new column.
  expect_false("NonExistent" %in% names(parsed))
})

test_that("read_biblio generic: NULL id uses row numbers as character id", {
  csv_path <- make_generic_file()
  parsed <- read_biblio(csv_path, format = "generic", id = NULL)
  expect_equal(parsed$id, c("1", "2", "3"))
})

test_that("read_biblio generic: CSV already has an 'id' column → used as-is", {
  csv_path <- tempfile(fileext = ".csv")
  writeLines(c("id,title", "A001,Title One", "A002,Title Two"), con = csv_path)
  parsed <- read_biblio(csv_path, format = "generic")
  ## The CSV's own id values are kept (as character).
  expect_equal(parsed$id, c("A001", "A002"))
})

## ════════════════════════════════════════════════════════════════════════════
## 5.
## read_single_biblio() — unknown format error path
## ════════════════════════════════════════════════════════════════════════════

test_that("read_biblio with undetectable format stops with actionable error", {
  plain_csv <- tempfile(fileext = ".csv")
  writeLines(c("col_x,col_y", "1,2"), con = plain_csv)
  ## No explicit format: detection yields "unknown" and read_biblio() stops.
  expect_error(read_biblio(plain_csv), regexp = "Could not detect file format")
})

test_that("read_single_biblio unknown format message names the file", {
  plain_csv <- tempfile(fileext = ".csv")
  writeLines(c("col_x,col_y", "1,2"), con = plain_csv)
  msg <- tryCatch(
    read_biblio(plain_csv),
    error = function(cnd) conditionMessage(cnd)
  )
  mentions_file <- grepl(basename(plain_csv), msg)
  mentions_detection <- grepl("Could not detect", msg)
  expect_true(mentions_file || mentions_detection)
})

## ════════════════════════════════════════════════════════════════════════════
## 6. read_biblio() auto-detect dispatch — formats not yet covered
## ════════════════════════════════════════════════════════════════════════════

test_that("read_biblio auto-detects scopus format", {
  scopus_path <- make_scopus_file(2L)
  parsed <- suppressMessages(read_biblio(scopus_path))
  expect_equal(nrow(parsed), 2L)
  ## Ids come from the EID column, so each carries the "2-s2.0-" prefix.
  has_eid_prefix <- grepl("^2-s2\\.0-", parsed$id)
  expect_true(all(has_eid_prefix))
})

test_that("read_biblio auto-detects wos format (FN line)", {
  wos_path <- make_wos_file()
  parsed <- suppressMessages(read_biblio(wos_path))
  expect_equal(nrow(parsed), 1L)
  expect_equal(parsed$id, "WOS:000001")
})

test_that("read_biblio auto-detects lens format", {
  lens_path <- make_lens_file()
  parsed <- suppressMessages(read_biblio(lens_path))
  expect_equal(nrow(parsed), 1L)
  expect_equal(parsed$id, "000-001")
})

test_that("read_biblio auto-detects dimensions format", {
  dimensions_path <- make_dimensions_file()
  parsed <- suppressMessages(read_biblio(dimensions_path))
  expect_equal(nrow(parsed), 1L)
  expect_true(grepl("pub\\.111", parsed$id))
})

test_that("read_biblio explicit format='wos' bypasses auto-detection", {
  wos_path <- make_wos_file()
  parsed <- suppressMessages(read_biblio(wos_path, format = "wos"))
  expect_equal(nrow(parsed), 1L)
})

test_that("read_biblio explicit format='wos_tab' dispatches to tab parser", {
  ## One-record WoS tab-delimited export: names are the header tags, values
  ## are the record's fields.
  record <- c(
    UT = "WOS:T001",
    TI = "Tab paper",
    AU = "Smith, J",
    PY = "2021",
    SO = "Tab Journal",
    DI = "10.1/t",
    TC = "7",
    AB = "An abstract.",
    DT = "Article",
    DE = "networks",
    ID = "deep learning",
    CR = ""
  )
  tab_path <- tempfile(fileext = ".txt")
  writeLines(
    c(paste(names(record), collapse = "\t"), paste(record, collapse = "\t")),
    con = tab_path
  )

  parsed <- suppressMessages(read_biblio(tab_path, format = "wos_tab"))
  expect_equal(nrow(parsed), 1L)
  expect_equal(parsed$id, "WOS:T001")
})

test_that("read_biblio explicit format='bibtex' dispatches correctly", {
  bib_path <- make_bibtex_file()
  parsed <- suppressMessages(read_biblio(bib_path, format = "bibtex"))
  expect_equal(nrow(parsed), 1L)
  expect_equal(parsed$title, "A bibtex paper")
})

test_that("read_biblio explicit format='ris' dispatches correctly", {
  ris_path <- make_ris_file()
  parsed <- suppressMessages(read_biblio(ris_path, format = "ris"))
  expect_equal(nrow(parsed), 1L)
  expect_equal(parsed$title, "A ris paper")
})

test_that("read_biblio explicit format='dimensions' dispatches correctly", {
  dimensions_path <- make_dimensions_file()
  parsed <- suppressMessages(read_biblio(dimensions_path, format = "dimensions"))
  expect_equal(nrow(parsed), 1L)
})

test_that("read_biblio explicit format='lens' dispatches correctly", {
  lens_path <- make_lens_file()
  parsed <- suppressMessages(read_biblio(lens_path, format = "lens"))
  expect_equal(nrow(parsed), 1L)
})

## ════════════════════════════════════════════════════════════════════════════
## 7.
## Multi-file ingest — message and row-count assertions
## ════════════════════════════════════════════════════════════════════════════

## Two single-record RIS files passed as a vector must trigger the summary
## message with the file and row totals.
test_that("read_biblio emits 'Read N files: M rows total' message for multiple files", {
  f1 <- make_ris_file()
  f2 <- make_ris_file()
  expect_message(
    read_biblio(c(f1, f2)),
    regexp = "Read 2 files: 2 rows total"
  )
})

## Same input as above, but checking the combined data rather than the message.
test_that("read_biblio combines two RIS files into correct row count", {
  f1 <- make_ris_file()
  f2 <- make_ris_file()
  d <- suppressMessages(read_biblio(c(f1, f2)))
  expect_equal(nrow(d), 2L)
})

## A directory path is expanded to the files it contains; each RIS file below
## holds exactly one record.
test_that("read_biblio directory input reads all matching files", {
  d_dir <- tempfile()
  dir.create(d_dir)
  on.exit(unlink(d_dir, recursive = TRUE), add = TRUE)

  ## Write three RIS files into the temp directory
  writeLines(
    c("TY - JOUR", "TI - Paper A", "PY - 2020", "ER - "),
    file.path(d_dir, "a.ris")
  )
  writeLines(
    c("TY - JOUR", "TI - Paper B", "PY - 2021", "ER - "),
    file.path(d_dir, "b.ris")
  )
  writeLines(
    c("TY - JOUR", "TI - Paper C", "PY - 2022", "ER - "),
    file.path(d_dir, "c.ris")
  )

  d <- suppressMessages(read_biblio(d_dir))
  expect_equal(nrow(d), 3L)
})

## The summary message must also fire when the files come from a directory.
test_that("read_biblio directory: message fires when > 1 file found", {
  d_dir <- tempfile()
  dir.create(d_dir)
  on.exit(unlink(d_dir, recursive = TRUE), add = TRUE)

  writeLines(
    c("TY - JOUR", "TI - P1", "PY - 2020", "ER - "),
    file.path(d_dir, "r1.ris")
  )
  writeLines(
    c("TY - JOUR", "TI - P2", "PY - 2021", "ER - "),
    file.path(d_dir, "r2.ris")
  )

  expect_message(read_biblio(d_dir), "Read 2 files")
})

## Mixed formats (2 x RIS + 1 x BibTeX) in one call: check message and data.
test_that("read_biblio vector of 3 files: combined row count and message", {
  f1 <- make_ris_file() ## 1 row
  f2 <- make_ris_file() ## 1 row
  f3 <- make_bibtex_file() ## 1 row
  ## Assign inside the braces so the result is still available afterwards.
  expect_message(
    {
      d <- read_biblio(c(f1, f2, f3))
    },
    regexp = "Read 3 files: 3 rows total"
  )
  expect_equal(nrow(d), 3L)
})

## Only the multi-file summary is forbidden here. The assertion is scoped with
## `message =` so that any unrelated message read_biblio() may emit for a single
## file cannot fail this test for the wrong reason.
test_that("read_biblio single file does NOT emit the multi-file message", {
  f <- make_ris_file()
  expect_no_message(read_biblio(f), message = "Read [0-9]+ files")
})

## ════════════════════════════════════════════════════════════════════════════
##
## 8. align_biblio_columns() — missing list-column filled with empty vectors
## ════════════════════════════════════════════════════════════════════════════

## Combining an OpenAlex-style CSV (which carries authorships.countries) with a
## RIS file (which has no country data) is meant to exercise the list-column
## fill branch of align_biblio_columns().
test_that("align_biblio_columns fills missing list-columns with empty vectors", {
  oa_f <- tempfile(fileext = ".csv")
  writeLines(c(
    "id,display_name,publication_year,primary_location.source.display_name,doi,cited_by_count,type,authorships.author.display_name,authorships.institutions.display_name,authorships.countries,primary_topic.display_name",
    "https://openalex.org/W9,OA paper,2024,Journal OA,https://doi.org/10.1/oa,2,article,Alice|Bob,Uni A|Uni B,US|GB,Networks"
  ), oa_f)
  ris_f <- make_ris_file()

  ## Files are passed OA first, RIS second, so the RIS record is row 2.
  d <- suppressMessages(read_biblio(c(oa_f, ris_f)))
  expect_equal(nrow(d), 2L)

  ## 'countries' is supplied only by the OA file; after alignment it must still
  ## be a list-column covering both rows (not NA, not a missing column).
  expect_true("countries" %in% names(d))
  expect_true(is.list(d$countries))
  ## The RIS row's countries entry should be an empty vector.
  expect_length(d$countries[[2]], 0L)
})

## BibTeX + Scopus are combined to provoke a scalar-column mismatch. The
## 'language' check is conditional because the column only appears if one of
## the parsers adds it; the row count is asserted unconditionally.
test_that("align_biblio_columns fills missing scalar columns with NA", {
  bib_f <- make_bibtex_file()
  sc_f <- make_scopus_file(1L)
  d <- suppressMessages(read_biblio(c(sc_f, bib_f)))
  ## If a 'language' column is present, the source lacking it must show NA.
  if ("language" %in% names(d)) {
    expect_true(any(is.na(d$language)))
  }
  expect_equal(nrow(d), 2L)
})

## ════════════════════════════════════════════════════════════════════════════
## 9.
## read_generic() — edge-cases
## ════════════════════════════════════════════════════════════════════════════

test_that("read_generic errors on a nonexistent file", {
  ## A missing path is rejected by read_biblio()'s path resolution before
  ## read_generic() is ever reached, so "No files found" is the expected
  ## user-facing error; the alternatives in the pattern cover a failure
  ## surfacing from a later stage instead.
  missing_csv <- "/no/such/generic/file_bibnets_xyz.csv"
  expect_error(
    read_biblio(missing_csv, format = "generic"),
    regexp = "No files found|File not found|not found|cannot open"
  )
})

test_that("read_generic sep parameter splits on custom delimiter", {
  records <- data.frame(
    id = c("X1", "X2"),
    authors = c("A::B::C", "D::E"),
    stringsAsFactors = FALSE
  )
  csv_path <- tempfile(fileext = ".csv")
  write.csv(records, file = csv_path, row.names = FALSE)

  parsed <- read_biblio(
    csv_path,
    format = "generic",
    actors = "authors",
    sep = "::"
  )
  expect_length(parsed$authors[[1]], 3L)
  expect_length(parsed$authors[[2]], 2L)
})

test_that("read_generic with no actors argument leaves columns as-is", {
  csv_path <- make_generic_file()
  parsed <- read_biblio(csv_path, format = "generic")
  ## Without `actors`, Authors stays a plain character column, not a list.
  expect_false(is.list(parsed$Authors))
})