## Tests for read_scopus() — synthetic CSV fixtures only (no network calls)

## ---------------------------------------------------------------------------
## Helper: build a proper quoted CSV file from a data frame
## ---------------------------------------------------------------------------

## Write `df` to a fresh temporary ".csv" file and return the file's path.
##
## Row names are omitted so the file holds only the data-frame columns.
## UTF-8 is requested explicitly so that fixtures with non-ASCII text
## (e.g. "Müller", "Çelik") are not written in whatever the platform's native
## encoding happens to be.
## NOTE(review): presumably read_scopus() expects UTF-8 input — confirm.
write_scopus_csv <- function(df) {
  f <- tempfile(fileext = ".csv")
  write.csv(df, f, row.names = FALSE, fileEncoding = "UTF-8")
  f
}

## Full-featured 3-record Scopus data frame.
##
## `check.names = FALSE` keeps the column headers that contain spaces
## (e.g. "Source title") exactly as written. Record 3 carries non-ASCII
## author names and empty DOI / keyword fields; record 2 has an empty
## References field.
make_full_df <- function() {
  data.frame(
    Authors = c("Smith J.; Doe A.", "Garcia M.", "Müller K.; Çelik O."),
    `Author full names` = c(
      "Smith, John; Doe, Alice",
      "Garcia, Maria",
      "Müller, Klaus; Çelik, Osman"
    ),
    Title = c("Network Analysis Study", "Survey Paper", "European Study"),
    Year = c(2020L, 2021L, 2022L),
    `Source title` = c(
      "Journal of Networks", "Review Journal", "European Journal"
    ),
    `Cited by` = c(15L, 5L, 0L),
    DOI = c("10.1000/test001", "10.1000/test002", ""),
    `Author Keywords` = c("network analysis; machine learning", "survey", ""),
    `Index Keywords` = c("graph theory; clustering", "review methodology", ""),
    References = c("Ref A, 2019; Ref B, 2018", "", "Ref C, 2020"),
    Affiliations = c("Uni A", "Uni B", "Uni C"),
    Abstract = c(
      "Abstract text here.", "Survey abstract.", "European abstract."
    ),
    `Document Type` = c("Article", "Review", "Conference Paper"),
    `Language of Original Document` = c("English", "Spanish", "German"),
    EID = c("2-s2.0-001", "2-s2.0-002", "2-s2.0-003"),
    check.names = FALSE,
    stringsAsFactors = FALSE
  )
}

## ---------------------------------------------------------------------------
## 1.
##    Standard columns present in correct order
## ---------------------------------------------------------------------------

## Parses the full fixture and checks the standard output columns twice over:
## first that none is missing, then that they occur in the documented order.
test_that("read_scopus returns all standard columns in correct order", {
  parsed <- read_scopus(write_scopus_csv(make_full_df()))
  expected_cols <- c(
    "id", "title", "year", "journal", "doi", "cited_by_count",
    "abstract", "type", "authors", "references", "keywords",
    "index_keywords", "affiliations", "language"
  )
  col_names <- names(parsed)

  ## Every standard column has to be present
  expect_true(all(expected_cols %in% col_names))

  ## The columns must appear in the expected order; any extra columns in the
  ## result are filtered out before comparing.
  present <- col_names[col_names %in% expected_cols]
  expect_equal(present, expected_cols)
})

## ---------------------------------------------------------------------------
## 2. Row count
## ---------------------------------------------------------------------------

## The full fixture has three records, so three rows are expected back.
test_that("read_scopus returns correct number of rows", {
  parsed <- read_scopus(write_scopus_csv(make_full_df()))
  expect_equal(nrow(parsed), 3L)
})

## ---------------------------------------------------------------------------
## 3.
##    Column types
## ---------------------------------------------------------------------------

## Scalar text fields must come back as character vectors.
test_that("read_scopus: character columns have correct types", {
  records <- read_scopus(write_scopus_csv(make_full_df()))
  expect_type(records$id, "character")
  expect_type(records$title, "character")
  expect_type(records$journal, "character")
  expect_type(records$doi, "character")
  expect_type(records$abstract, "character")
  expect_type(records$type, "character")
  expect_type(records$affiliations, "character")
  expect_type(records$language, "character")
})

## Publication year is stored as an integer vector.
test_that("read_scopus: year is integer", {
  records <- read_scopus(write_scopus_csv(make_full_df()))
  expect_type(records$year, "integer")
})

## Citation counts are stored as an integer vector.
test_that("read_scopus: cited_by_count is integer", {
  records <- read_scopus(write_scopus_csv(make_full_df()))
  expect_type(records$cited_by_count, "integer")
})

## Multi-valued fields are list-columns (one element per record).
test_that("read_scopus: list-columns are lists", {
  records <- read_scopus(write_scopus_csv(make_full_df()))
  expect_true(is.list(records$authors))
  expect_true(is.list(records$references))
  expect_true(is.list(records$keywords))
  expect_true(is.list(records$index_keywords))
})

## ---------------------------------------------------------------------------
## 4. EID used as id when present
## ---------------------------------------------------------------------------

## The fixture's EID values must be carried over verbatim as the record ids.
test_that("read_scopus uses EID column as id", {
  records <- read_scopus(write_scopus_csv(make_full_df()))
  expect_equal(records$id, c("2-s2.0-001", "2-s2.0-002", "2-s2.0-003"))
})

## ---------------------------------------------------------------------------
## 5. EID fallback: no EID column → sequential S1, S2, ...
## ---------------------------------------------------------------------------

## Without any EID column the ids are expected to be generated as S1, S2, ...
test_that("read_scopus falls back to sequential id when no EID column", {
  df <- data.frame(
    Title = c("Paper One", "Paper Two"),
    Authors = c("Smith J.", "Doe A."),
    Year = c(2020L, 2021L),
    `Source title` = c("Some Journal", "Other Journal"),
    `Cited by` = c(3L, 0L),
    check.names = FALSE,
    stringsAsFactors = FALSE
  )
  d <- read_scopus(write_scopus_csv(df))
  expect_equal(d$id, c("S1", "S2"))
})

## ---------------------------------------------------------------------------
## 6. EID fallback: EID column present but empty → sequential id
## ---------------------------------------------------------------------------

## An EID column that holds only empty strings is treated like a missing one.
test_that("read_scopus falls back to sequential id when EID is empty string", {
  df <- data.frame(
    Title = c("Paper One", "Paper Two"),
    Authors = c("Smith J.", "Doe A."),
    Year = c(2020L, 2021L),
    `Source title` = c("Some Journal", "Other Journal"),
    `Cited by` = c(3L, 0L),
    EID = c("", ""),
    check.names = FALSE,
    stringsAsFactors = FALSE
  )
  d <- read_scopus(write_scopus_csv(df))
  expect_equal(d$id, c("S1", "S2"))
})

## ---------------------------------------------------------------------------
## 7. Author splitting: semicolon-delimited, uppercased, dots removed
## ---------------------------------------------------------------------------

test_that("read_scopus splits multiple authors into list elements", {
  f <- write_scopus_csv(make_full_df())
  d <- read_scopus(f)
  ## Record 1: "Smith J.; Doe A." → 2 elements
  expect_equal(length(d$authors[[1]]), 2L)
})

test_that("read_scopus single author yields one-element list", {
  f <- write_scopus_csv(make_full_df())
  d <- read_scopus(f)
  ## Record 2: "Garcia M." → 1 element
  expect_equal(length(d$authors[[2]]), 1L)
})

test_that("read_scopus uppercases author names via standardize_authors", {
  f <- write_scopus_csv(make_full_df())
  d <- read_scopus(f)
  ## Guard against a vacuous pass: if every author vector were empty the loop
  ## below would run no expectation at all.
  expect_true(any(lengths(d$authors) > 0L))
  ## All author tokens should already be uppercase
  for (i in seq_along(d$authors)) {
    if (length(d$authors[[i]]) > 0) {
      expect_equal(d$authors[[i]], toupper(d$authors[[i]]))
    }
  }
})

test_that("read_scopus removes dots from author initials", {
  df <- data.frame(
    Authors = "Jones, F.J.; Brown, A.M.",
    Title = "Dots Test",
    Year = 2020L,
    `Source title` = "J",
    `Cited by` = 0L,
    check.names = FALSE,
    stringsAsFactors = FALSE
  )
  d <- read_scopus(write_scopus_csv(df))
  ## An empty author vector would satisfy the dot check trivially
  ## (any(logical(0)) is FALSE), so require parsed authors first.
  expect_gt(length(d$authors[[1]]), 0L)
  expect_false(any(grepl("\\.", d$authors[[1]])))
})

## ---------------------------------------------------------------------------
## 8. Reference splitting: semicolons, uppercased via standardize_refs
## ---------------------------------------------------------------------------

test_that("read_scopus splits references on semicolon", {
  f <- write_scopus_csv(make_full_df())
  d <- read_scopus(f)
  ## Record 1: "Ref A, 2019; Ref B, 2018" → 2 references
  expect_equal(length(d$references[[1]]), 2L)
})

test_that("read_scopus returns empty character vector for blank references", {
  f <- write_scopus_csv(make_full_df())
  d <- read_scopus(f)
  ## Record 2 has empty References field
  expect_equal(length(d$references[[2]]), 0L)
  expect_type(d$references[[2]], "character")
})

test_that("read_scopus uppercases references via standardize_refs", {
  f <- write_scopus_csv(make_full_df())
  d <- read_scopus(f)
  expect_equal(d$references[[1]], c("REF A, 2019", "REF B, 2018"))
})

## ---------------------------------------------------------------------------
## 9.
##    Keyword splitting
## ---------------------------------------------------------------------------

## Author keywords are separated on ";" and returned without padding.
test_that("read_scopus splits author keywords on semicolon", {
  parsed <- read_scopus(write_scopus_csv(make_full_df()))
  ## Record 1: "network analysis; machine learning" → 2
  first_keywords <- parsed$keywords[[1]]
  expect_equal(length(first_keywords), 2L)
  expect_equal(first_keywords, c("network analysis", "machine learning"))
})

## A blank Author Keywords cell becomes a zero-length element.
test_that("read_scopus returns empty vector for missing author keywords", {
  parsed <- read_scopus(write_scopus_csv(make_full_df()))
  ## Record 3 has empty Author Keywords
  expect_equal(length(parsed$keywords[[3]]), 0L)
})

## Index keywords are split the same way as author keywords.
test_that("read_scopus splits index keywords on semicolon", {
  parsed <- read_scopus(write_scopus_csv(make_full_df()))
  first_index_keywords <- parsed$index_keywords[[1]]
  expect_equal(length(first_index_keywords), 2L)
  expect_equal(first_index_keywords, c("graph theory", "clustering"))
})

## A blank Index Keywords cell (record 3) becomes a zero-length element.
test_that("read_scopus returns empty vector for missing index keywords", {
  parsed <- read_scopus(write_scopus_csv(make_full_df()))
  expect_equal(length(parsed$index_keywords[[3]]), 0L)
})

## ---------------------------------------------------------------------------
## 10.
##     Scalar field values
## ---------------------------------------------------------------------------

## "Cited by" values are carried into cited_by_count unchanged.
test_that("read_scopus parses cited_by_count correctly", {
  result <- read_scopus(write_scopus_csv(make_full_df()))
  expect_equal(result$cited_by_count, c(15L, 5L, 0L))
})

## "Year" values are carried into year unchanged.
test_that("read_scopus parses year correctly", {
  result <- read_scopus(write_scopus_csv(make_full_df()))
  expect_equal(result$year, c(2020L, 2021L, 2022L))
})

## Only the two records with a non-empty DOI are checked here.
test_that("read_scopus parses DOI correctly", {
  result <- read_scopus(write_scopus_csv(make_full_df()))
  expect_equal(result$doi[1], "10.1000/test001")
  expect_equal(result$doi[2], "10.1000/test002")
})

## "Language of Original Document" maps to the language column.
test_that("read_scopus parses language field", {
  result <- read_scopus(write_scopus_csv(make_full_df()))
  expect_equal(result$language, c("English", "Spanish", "German"))
})

## "Document Type" maps to the type column.
test_that("read_scopus parses document type field", {
  result <- read_scopus(write_scopus_csv(make_full_df()))
  expect_equal(result$type, c("Article", "Review", "Conference Paper"))
})

## ---------------------------------------------------------------------------
## 11. Missing optional columns → NA defaults (no error)
## ---------------------------------------------------------------------------

## A CSV with only Title / Year / Source title must still parse: scalar
## fields fall back to NA, list fields to zero-length elements, and the id
## to the sequential "S1".
test_that("read_scopus handles missing optional columns gracefully", {
  minimal <- data.frame(
    Title = "Minimal Paper",
    Year = 2023L,
    `Source title` = "Minimal Journal",
    check.names = FALSE,
    stringsAsFactors = FALSE
  )

  ## The assignment happens inside the expectation so that a parse error is
  ## reported as a failed expectation.
  expect_no_error({
    result <- read_scopus(write_scopus_csv(minimal))
  })

  expect_equal(result$id, "S1")

  expect_true(is.na(result$doi))
  expect_true(is.na(result$abstract))
  expect_true(is.na(result$affiliations))
  expect_true(is.na(result$language))

  expect_equal(length(result$authors[[1]]), 0L)
  expect_equal(length(result$references[[1]]), 0L)
  expect_equal(length(result$keywords[[1]]), 0L)
  expect_equal(length(result$index_keywords[[1]]), 0L)
})

## ---------------------------------------------------------------------------
## 12.
##     cited_by_count defaults to 0 (not NA) when column is absent
## ---------------------------------------------------------------------------

## With no "Cited by" column the count must be a real zero, never NA.
test_that("read_scopus defaults cited_by_count to 0 when column absent", {
  no_citations <- data.frame(
    Title = "No Cited By",
    Year = 2023L,
    `Source title` = "Some Journal",
    check.names = FALSE,
    stringsAsFactors = FALSE
  )
  parsed <- read_scopus(write_scopus_csv(no_citations))
  expect_equal(parsed$cited_by_count, 0L)
  expect_false(is.na(parsed$cited_by_count))
})

## ---------------------------------------------------------------------------
## 13. UTF-8 / non-ASCII characters in author names
## ---------------------------------------------------------------------------

## Non-ASCII names must survive parsing as two non-empty author tokens.
test_that("read_scopus handles non-ASCII author names (UTF-8)", {
  parsed <- read_scopus(write_scopus_csv(make_full_df()))
  ## Record 3: Müller K. and Çelik O. → 2 authors, non-empty after uppercasing
  third_authors <- parsed$authors[[3]]
  expect_equal(length(third_authors), 2L)
  expect_true(all(nchar(third_authors) > 0))
})

## ---------------------------------------------------------------------------
## 14.
##     Case-insensitive alternate column spellings
## ---------------------------------------------------------------------------

## "Document Title" is accepted in place of "Title".
test_that("read_scopus accepts 'Document Title' as alternate title column", {
  alt_title <- data.frame(
    `Document Title` = "Alternate Title Paper",
    Year = 2023L,
    `Source title` = "Alt Journal",
    `Cited by` = 2L,
    check.names = FALSE,
    stringsAsFactors = FALSE
  )
  out <- read_scopus(write_scopus_csv(alt_title))
  expect_equal(out$title, "Alternate Title Paper")
})

## "Author full names" is accepted when there is no "Authors" column.
test_that("read_scopus accepts 'Author full names' as alternate authors column", {
  alt_authors <- data.frame(
    Title = "Some Paper",
    `Author full names` = "Jones A.; Brown B.",
    Year = 2022L,
    `Source title` = "Some Journal",
    `Cited by` = 1L,
    check.names = FALSE,
    stringsAsFactors = FALSE
  )
  out <- read_scopus(write_scopus_csv(alt_authors))
  expect_equal(length(out$authors[[1]]), 2L)
})

## "Authors with affiliations" is accepted when there is no "Affiliations"
## column; its text is carried over unchanged.
test_that("read_scopus accepts 'Authors with affiliations' as alternate affiliations column", {
  alt_affil <- data.frame(
    Title = "Affil Paper",
    Authors = "Smith J.",
    Year = 2021L,
    `Source title` = "Journal X",
    `Cited by` = 0L,
    `Authors with affiliations` = "Smith J., Uni D",
    check.names = FALSE,
    stringsAsFactors = FALSE
  )
  out <- read_scopus(write_scopus_csv(alt_affil))
  expect_false(is.na(out$affiliations))
  expect_equal(out$affiliations, "Smith J., Uni D")
})

## ---------------------------------------------------------------------------
## 15. File not found → informative error from check_file()
## ---------------------------------------------------------------------------

## A path that does not exist must raise an error mentioning "File not found".
test_that("read_scopus errors informatively on missing file", {
  expect_error(
    read_scopus("no_such_file_xyz.csv"),
    regexp = "File not found"
  )
})

## ---------------------------------------------------------------------------
## 16.
##     Single-record file
## ---------------------------------------------------------------------------

## A one-row CSV must yield a one-row result with its scalar fields intact.
test_that("read_scopus handles a single-row CSV correctly", {
  single <- data.frame(
    Authors = "Adams P.",
    Title = "Single Record Paper",
    Year = 2019L,
    `Source title` = "Journal One",
    `Cited by` = 10L,
    DOI = "10.1/single",
    EID = "2-s2.0-999",
    check.names = FALSE,
    stringsAsFactors = FALSE
  )
  rec <- read_scopus(write_scopus_csv(single))
  expect_equal(nrow(rec), 1L)
  expect_equal(rec$id, "2-s2.0-999")
  expect_equal(rec$year, 2019L)
  expect_equal(rec$cited_by_count, 10L)
})

test_that("read_scopus normalizes empty-string DOI to NA (regression)", {
  ## Regression: empty DOI cells were previously stored as "" rather than NA,
  ## breaking is.na(doi) deduplication checks on downstream code.
  doi_fixture <- data.frame(
    Title = c("Has DOI", "Empty DOI"),
    Year = c(2020L, 2021L),
    `Source title` = c("J1", "J2"),
    DOI = c("10.1/x", ""),
    EID = c("E1", "E2"),
    check.names = FALSE,
    stringsAsFactors = FALSE
  )
  rec <- read_scopus(write_scopus_csv(doi_fixture))
  ## A populated DOI passes through untouched; the blank one becomes NA.
  expect_equal(rec$doi[1], "10.1/x")
  expect_true(is.na(rec$doi[2]))
})