## tests/testthat/test-read-lens.R
## Coverage target: >=80% of R/read-lens.R

## ── helpers ──────────────────────────────────────────────────────────────────

## Build a minimal but realistic Lens.org CSV in a temp file.
## Column names are taken verbatim from R/read-lens.R get_col() calls.
make_lens_csv <- function(...) {
  ## Fixed fixture header; every argument in `...` is written beneath it as
  ## one already-formatted CSV line.
  col_names <- c(
    "Lens ID", "Title", "Publication Year", "Source Title",
    "DOI", "Citing Works Count", "Abstract", "Publication Type",
    "Authors", "References", "Keywords"
  )
  csv_lines <- c(paste(col_names, collapse = ","), c(...))
  ## The path of the freshly written temp file is the return value.
  csv_path <- tempfile(fileext = ".csv")
  writeLines(csv_lines, con = csv_path)
  csv_path
}

## Escape a CSV field that may contain commas
q <- function(x) {
  ## Wrap each value in double quotes so embedded commas stay inside a single
  ## CSV cell. A double quote already present in the value is doubled first
  ## (the RFC 4180 convention); left as-is it would terminate the quoted field
  ## early and corrupt the row. Values without quotes are unaffected.
  ## Vectorised over `x`; returns a character vector.
  ## NOTE: inside this test file the helper masks base::q() (quit).
  escaped <- gsub('"', '""', x, fixed = TRUE)
  paste0('"', escaped, '"')
}

## ── standard columns ─────────────────────────────────────────────────────────

test_that("read_lens returns all standard bibnets columns", {
  ## One record in which every column of the fixture header carries a value.
  row1 <- paste(
    "000-001", q("Network science"), "2021", q("Journal of Networks"),
    "10.1/net", "42", q("A study of networks."), "journal article",
    q("Smith, J.; Jones, M."), q("Doe, A. 2018; Roe, B. 2019"), q("networks; science"),
    sep = ","
  )
  f <- make_lens_csv(row1)
  d <- read_lens(f)

  expected_cols <- c("id", "title", "year", "journal", "doi",
                     "cited_by_count", "abstract", "type",
                     "authors", "references", "keywords")
  ## Compare the set of missing names against "none" instead of asserting a
  ## bare TRUE, so a failure reports exactly which columns are absent.
  ## Extra columns in the result are still tolerated.
  expect_identical(setdiff(expected_cols, names(d)), character(0))
})

## ── column types ─────────────────────────────────────────────────────────────

test_that("read_lens year is integer", {
  ## Single record whose Publication Year cell holds the text "2020".
  fields <- c(
    "000-001", q("A paper"), "2020", "Some Journal",
    "10.1/x", "5", q("Abstract here."), "journal article",
    q("Author A"), "", ""
  )
  csv_path <- make_lens_csv(paste(fields, collapse = ","))
  parsed <- read_lens(csv_path)

  ## Check the storage type first, then the value itself.
  expect_type(parsed$year, "integer")
  expect_equal(parsed$year, 2020L)
})

test_that("read_lens cited_by_count is integer with no NAs", {
  ## Single record whose Citing Works Count cell holds the text "7".
  fields <- c(
    "000-001", q("A paper"), "2020", "Some Journal",
    "10.1/x", "7", q("Abstract here."), "journal article",
    q("Author A"), "", ""
  )
  csv_path <- make_lens_csv(paste(fields, collapse = ","))
  counts <- read_lens(csv_path)$cited_by_count

  expect_type(counts, "integer")
  expect_false(anyNA(counts))
  expect_equal(counts, 7L)
})

test_that("read_lens authors, references, keywords are list-columns", {
  ## All three multi-value cells hold two ";"-separated entries.
  row1 <- paste(
    "000-001", q("A paper"), "2020", "Some Journal",
    "10.1/x", "0", q("Abs."), "journal article",
    q("Smith, J.; Jones, M."), q("Ref A; Ref B"), q("kw1; kw2"),
    sep = ","
  )
  f <- make_lens_csv(row1)
  d <- read_lens(f)
  ## expect_type() is the dedicated storage-type expectation: on failure it
  ## reports the actual type instead of a bare "is not TRUE".
  expect_type(d$authors, "list")
  expect_type(d$references, "list")
  expect_type(d$keywords, "list")
})

## ── multi-value splitting ─────────────────────────────────────────────────────

test_that("read_lens splits authors on semicolon", {
  ## Three authors separated by ";" in a single quoted cell; each name also
  ## contains a comma, which must not be treated as a separator.
  row1 <- paste(
    "000-001", q("A paper"), "2021", "Some Journal",
    "10.1/x", "3", q("Abstract."), "journal article",
    q("Smith, J.; Jones, M.; Doe, A."), "", "",
    sep = ","
  )
  f <- make_lens_csv(row1)
  d <- read_lens(f)
  ## expect_length() reports the observed length when the check fails.
  expect_length(d$authors[[1]], 3L)
})

test_that("read_lens splits keywords on semicolon", {
  ## Three keywords separated by ";" in a single quoted cell.
  row1 <- paste(
    "000-001", q("A paper"), "2021", "Some Journal",
    "10.1/x", "0", q("Abstract."), "journal article",
    q("Author A"), "", q("network analysis; bibliometrics; citation"),
    sep = ","
  )
  f <- make_lens_csv(row1)
  d <- read_lens(f)
  ## expect_length() reports the observed length when the check fails.
  expect_length(d$keywords[[1]], 3L)
})

test_that("read_lens splits references on semicolon", {
  ## Three references separated by ";" in a single quoted cell.
  row1 <- paste(
    "000-001", q("A paper"), "2021", "Some Journal",
    "10.1/x", "0", q("Abstract."), "journal article",
    q("Author A"), q("Ref A 2018; Ref B 2019; Ref C 2020"), "",
    sep = ","
  )
  f <- make_lens_csv(row1)
  d <- read_lens(f)
  ## expect_length() reports the observed length when the check fails.
  expect_length(d$references[[1]], 3L)
})

## ── author standardization ────────────────────────────────────────────────────

test_that("read_lens uppercases author names", {
  ## One mixed-case author; references and keywords are left empty.
  fields <- c(
    "000-001", q("A paper"), "2021", "Some Journal",
    "10.1/x", "0", q("Abstract."), "journal article",
    q("Smith, Jane"), "", ""
  )
  parsed <- read_lens(make_lens_csv(paste(fields, collapse = ",")))
  expect_equal(parsed$authors[[1]], "SMITH, JANE")
})

test_that("read_lens removes dots from author initials", {
  ## One author whose initials are written with full stops.
  fields <- c(
    "000-001", q("A paper"), "2021", "Some Journal",
    "10.1/x", "0", q("Abstract."), "journal article",
    q("Smith, J.K."), "", ""
  )
  parsed <- read_lens(make_lens_csv(paste(fields, collapse = ",")))

  ## The first parsed author name must not contain a literal "." anywhere.
  first_author <- parsed$authors[[1]][1]
  expect_false(grepl("\\.", first_author))
})

## ── reference standardization ────────────────────────────────────────────────

test_that("read_lens uppercases references", {
  ## standardize_refs uppercases but does NOT strip dots (only
  ## standardize_authors does), so the "." after the initial is expected to
  ## survive in the output.
  fields <- c(
    "000-001", q("A paper"), "2021", "Some Journal",
    "10.1/x", "0", q("Abstract."), "journal article",
    "", q("doe, a. 2019"), ""
  )
  parsed <- read_lens(make_lens_csv(paste(fields, collapse = ",")))
  expect_equal(parsed$references[[1]], "DOE, A. 2019")
})

## ── empty / missing fields ────────────────────────────────────────────────────

test_that("read_lens empty references field becomes empty list element", {
  ## The References cell (second to last) is an empty string.
  fields <- c(
    "000-001", q("A paper"), "2021", "Some Journal",
    "10.1/x", "0", q("Abstract."), "journal article",
    q("Author A"), "", ""
  )
  parsed <- read_lens(make_lens_csv(paste(fields, collapse = ",")))

  n_refs <- length(parsed$references[[1]])
  expect_equal(n_refs, 0L)
})

test_that("read_lens empty keywords field becomes empty list element", {
  ## The Keywords cell (last) is an empty string.
  fields <- c(
    "000-001", q("A paper"), "2021", "Some Journal",
    "10.1/x", "0", q("Abstract."), "journal article",
    q("Author A"), "", ""
  )
  parsed <- read_lens(make_lens_csv(paste(fields, collapse = ",")))

  n_keywords <- length(parsed$keywords[[1]])
  expect_equal(n_keywords, 0L)
})

test_that("read_lens handles single-author record", {
  ## The Authors cell contains no ";" separator: the comma inside
  ## "Lone, A." belongs to the name and must not split it.
  row1 <- paste(
    "000-002", q("Solo paper"), "2022", "Solo Journal",
    "10.2/solo", "1", q("Only one author."), "journal article",
    q("Lone, A."), "", q("solo"),
    sep = ","
  )
  f <- make_lens_csv(row1)
  d <- read_lens(f)
  expect_equal(nrow(d), 1L)
  ## expect_length() reports the observed length when the check fails.
  expect_length(d$authors[[1]], 1L)
})

## ── multiple rows ─────────────────────────────────────────────────────────────

test_that("read_lens returns correct row count for multiple records", {
  ## Three records: single-valued, multi-valued, and one whose DOI, authors,
  ## references and keywords cells are all empty.
  records <- list(
    c("000-001", q("Paper 1"), "2020", "Journal A",
      "10.1/a", "10", q("Abs 1"), "journal article",
      q("Alpha, A."), "", q("kw1")),
    c("000-002", q("Paper 2"), "2021", "Journal B",
      "10.1/b", "5", q("Abs 2"), "journal article",
      q("Beta, B.; Gamma, G."), q("Ref X; Ref Y"), q("kw2; kw3")),
    c("000-003", q("Paper 3"), "2022", "Journal C",
      "", "0", q("Abs 3"), "conference paper",
      "", "", "")
  )
  csv_rows <- vapply(records, paste, character(1), collapse = ",")
  parsed <- read_lens(make_lens_csv(csv_rows))
  expect_equal(nrow(parsed), 3L)
})

test_that("read_lens preserves Lens ID values", {
  ## Two records with distinct Lens IDs, written in this order.
  records <- list(
    c("LNS-001", q("Paper 1"), "2020", "Journal A",
      "10.1/a", "10", q("Abs 1"), "journal article",
      q("Alpha, A."), "", q("kw1")),
    c("LNS-002", q("Paper 2"), "2021", "Journal B",
      "10.1/b", "5", q("Abs 2"), "journal article",
      q("Beta, B."), "", "")
  )
  csv_rows <- vapply(records, paste, character(1), collapse = ",")
  parsed <- read_lens(make_lens_csv(csv_rows))

  ## IDs come back unchanged and in file order.
  expect_equal(parsed$id, c("LNS-001", "LNS-002"))
})

## ── non-ASCII characters ──────────────────────────────────────────────────────

test_that("read_lens handles non-ASCII characters in title and abstract", {
  ## Title, abstract and keyword cells all contain accented characters.
  row1 <- paste(
    "000-005", q("Réseau éducatif"), "2023", "Revue Francophone",
    "10.5/fr", "2", q("Une étude sur les réseaux."), "journal article",
    q("Dupont, Jean-Pierre"), "", q("réseaux"),
    sep = ","
  )
  f <- make_lens_csv(row1)
  d <- read_lens(f)
  expect_equal(nrow(d), 1L)
  ## The accented character must survive the round trip in both fields named
  ## in the test description, not only in the title.
  expect_true(grepl("é", d$title, fixed = TRUE))
  expect_true(grepl("é", d$abstract, fixed = TRUE))
})

## ── alternate column name fallbacks ──────────────────────────────────────────

test_that("read_lens accepts 'Author/s' instead of 'Authors'", {
  ## Same column layout as the standard fixture, except that the authors
  ## column is headed "Author/s". The file is written by hand because
  ## make_lens_csv() always emits the standard header.
  header <- paste(
    "Lens ID", "Title", "Publication Year", "Source Title",
    "DOI", "Citing Works Count", "Abstract", "Publication Type",
    "Author/s", "References", "Keywords",
    sep = ","
  )
  row1 <- paste(
    "000-010", q("Alt author paper"), "2021", "Alt Journal",
    "10.1/alt", "3", q("Abstract alt."), "journal article",
    q("Alt, A.; Alt, B."), "", "",
    sep = ","
  )
  f <- tempfile(fileext = ".csv")
  writeLines(c(header, row1), f)
  d <- read_lens(f)
  ## Two ";"-separated authors must be picked up from the alternate column.
  expect_length(d$authors[[1]], 2L)
})

test_that("read_lens accepts 'Cited Works' instead of 'References'", {
  ## Standard layout except that the references column is headed
  ## "Cited Works"; written by hand because make_lens_csv() fixes the header.
  header <- paste(
    "Lens ID", "Title", "Publication Year", "Source Title",
    "DOI", "Citing Works Count", "Abstract", "Publication Type",
    "Authors", "Cited Works", "Keywords",
    sep = ","
  )
  row1 <- paste(
    "000-011", q("Cited works test"), "2020", "Test Journal",
    "10.1/cw", "1", q("Abstract cw."), "journal article",
    q("Test, T."), q("Ref One; Ref Two"), "",
    sep = ","
  )
  f <- tempfile(fileext = ".csv")
  writeLines(c(header, row1), f)
  d <- read_lens(f)
  ## Two ";"-separated references must be picked up from the alternate column.
  expect_length(d$references[[1]], 2L)
})

test_that("read_lens accepts 'MeSH Terms' instead of 'Keywords'", {
  ## Standard layout except that the keywords column is headed "MeSH Terms";
  ## written by hand because make_lens_csv() fixes the header.
  header <- paste(
    "Lens ID", "Title", "Publication Year", "Source Title",
    "DOI", "Citing Works Count", "Abstract", "Publication Type",
    "Authors", "References", "MeSH Terms",
    sep = ","
  )
  row1 <- paste(
    "000-012", q("MeSH test"), "2019", "Medical Journal",
    "10.1/med", "8", q("Abstract med."), "journal article",
    q("Med, M."), "", q("brain; neuron; cortex"),
    sep = ","
  )
  f <- tempfile(fileext = ".csv")
  writeLines(c(header, row1), f)
  d <- read_lens(f)
  ## Three ";"-separated terms must be picked up from the alternate column.
  expect_length(d$keywords[[1]], 3L)
})

test_that("read_lens accepts 'Fields of Study' instead of 'Keywords'", {
  ## Standard layout except that the keywords column is headed
  ## "Fields of Study"; written by hand because make_lens_csv() fixes the
  ## header.
  header <- paste(
    "Lens ID", "Title", "Publication Year", "Source Title",
    "DOI", "Citing Works Count", "Abstract", "Publication Type",
    "Authors", "References", "Fields of Study",
    sep = ","
  )
  row1 <- paste(
    "000-013", q("FoS test"), "2022", "Science Journal",
    "10.1/fos", "4", q("Abstract fos."), "journal article",
    q("Fos, F."), "", q("physics; mathematics"),
    sep = ","
  )
  f <- tempfile(fileext = ".csv")
  writeLines(c(header, row1), f)
  d <- read_lens(f)
  ## Two ";"-separated fields must be picked up from the alternate column.
  expect_length(d$keywords[[1]], 2L)
})

test_that("read_lens accepts 'Year of Publication' instead of 'Publication Year'", {
  ## Standard layout with the year column headed "Year of Publication".
  col_names <- c(
    "Lens ID", "Title", "Year of Publication", "Source Title",
    "DOI", "Citing Works Count", "Abstract", "Publication Type",
    "Authors", "References", "Keywords"
  )
  fields <- c(
    "000-014", q("Year test"), "2018", "Old Journal",
    "10.1/yr", "0", q("Abstract yr."), "journal article",
    q("Old, O."), "", q("history")
  )
  csv_path <- tempfile(fileext = ".csv")
  csv_lines <- c(paste(col_names, collapse = ","), paste(fields, collapse = ","))
  writeLines(csv_lines, csv_path)

  parsed <- read_lens(csv_path)
  expect_equal(parsed$year, 2018L)
})

test_that("read_lens accepts 'Document Type' instead of 'Publication Type'", {
  ## Standard layout with the type column headed "Document Type".
  col_names <- c(
    "Lens ID", "Title", "Publication Year", "Source Title",
    "DOI", "Citing Works Count", "Abstract", "Document Type",
    "Authors", "References", "Keywords"
  )
  fields <- c(
    "000-015", q("DocType test"), "2023", "Doc Journal",
    "10.1/dt", "2", q("Abstract dt."), "conference paper",
    q("Doc, D."), "", q("doc")
  )
  csv_path <- tempfile(fileext = ".csv")
  csv_lines <- c(paste(col_names, collapse = ","), paste(fields, collapse = ","))
  writeLines(csv_lines, csv_path)

  parsed <- read_lens(csv_path)
  expect_equal(parsed$type, "conference paper")
})

test_that("read_lens accepts 'Cited By Count' instead of 'Citing Works Count'", {
  ## Standard layout with the citation column headed "Cited By Count".
  col_names <- c(
    "Lens ID", "Title", "Publication Year", "Source Title",
    "DOI", "Cited By Count", "Abstract", "Publication Type",
    "Authors", "References", "Keywords"
  )
  fields <- c(
    "000-016", q("Count test"), "2021", "Count Journal",
    "10.1/ct", "99", q("Abstract ct."), "journal article",
    q("Count, C."), "", q("counting")
  )
  csv_path <- tempfile(fileext = ".csv")
  csv_lines <- c(paste(col_names, collapse = ","), paste(fields, collapse = ","))
  writeLines(csv_lines, csv_path)

  parsed <- read_lens(csv_path)
  expect_equal(parsed$cited_by_count, 99L)
})

test_that("read_lens falls back to the 'ID' column when 'Lens ID' is absent", {
  ## The identifier column is headed plain "ID". The value stored in it must
  ## be used verbatim as the record id; nothing synthetic is expected here
  ## (generated ids are covered by the tests that omit every ID column).
  header <- paste(
    "ID", "Title", "Publication Year", "Source Title",
    "DOI", "Citing Works Count", "Abstract", "Publication Type",
    "Authors", "References", "Keywords",
    sep = ","
  )
  row1 <- paste(
    "ID-999", q("ID fallback"), "2020", "Fallback Journal",
    "10.1/fb", "0", q("Abstract fb."), "journal article",
    q("Fallback, F."), "", "",
    sep = ","
  )
  f <- tempfile(fileext = ".csv")
  writeLines(c(header, row1), f)
  d <- read_lens(f)
  expect_equal(d$id, "ID-999")
})

## ── missing columns fallback to NA ────────────────────────────────────────────

test_that("read_lens falls back to NA for missing optional columns", {
  ## Minimal CSV: only the Lens ID and Title columns exist in the file.
  csv_path <- tempfile(fileext = ".csv")
  writeLines(c("Lens ID,Title", "000-020,Minimal paper"), csv_path)
  parsed <- read_lens(csv_path)

  expect_equal(nrow(parsed), 1L)
  ## Columns that had no source in the file are reported as NA.
  expect_true(is.na(parsed$doi))
  expect_true(is.na(parsed$abstract))
  expect_true(is.na(parsed$journal))
})

test_that("read_lens assigns synthetic LENS-prefixed IDs when no ID column present", {
  ## Neither "Lens ID" nor "ID" appears in the header.
  csv_path <- tempfile(fileext = ".csv")
  writeLines(c("Title,Publication Year", "Solo Paper,2021"), csv_path)
  parsed <- read_lens(csv_path)

  first_id <- parsed$id[[1]]
  expect_equal(first_id, "LENS1")
  expect_true(grepl("^LENS", first_id))
})

test_that("read_lens synthetic IDs do not duplicate rows for multi-row files (regression)", {
  ## Regression guard: an earlier version inflated the output to n^2 rows
  ## when neither 'Lens ID' nor 'ID' existed and the default ID vector was
  ## passed to rep(default, n).
  csv_lines <- c(
    "Title,Publication Year",
    "Paper A,2020",
    "Paper B,2021",
    "Paper C,2022"
  )
  csv_path <- tempfile(fileext = ".csv")
  writeLines(csv_lines, csv_path)
  parsed <- read_lens(csv_path)

  ## One output row per input row, numbered in file order.
  expect_equal(nrow(parsed), 3L)
  expect_equal(parsed$id, c("LENS1", "LENS2", "LENS3"))
})

## ── error on missing file ─────────────────────────────────────────────────────

test_that("read_lens errors on non-existent file", {
  ## Relative path that is not expected to exist in the test directory.
  missing_path <- "no_such_file_lens.csv"
  expect_error(read_lens(missing_path), regexp = "File not found")
})

## ── read_biblio auto-detection ────────────────────────────────────────────────

test_that("read_biblio auto-detects lens format via 'Lens ID' header", {
  ## Standard fixture header (first column "Lens ID"); no `format` argument
  ## is passed, so read_biblio() has to pick the reader itself.
  row1 <- paste("LNS-100", q("Auto-detect paper"), "2021", "Some Journal",
                "10.1/ad", "2", q("Abstract."), "journal article",
                q("Smith, A."), "", q("detection"),
                sep = ",")
  f <- make_lens_csv(row1)
  d <- read_biblio(f)
  expect_equal(nrow(d), 1L)
  ## expect_type() is the dedicated storage-type expectation: on failure it
  ## reports the actual type instead of a bare "is not TRUE".
  expect_type(d$authors, "list")
})

test_that("read_biblio with format='lens' works explicitly", {
  ## Same fixture shape as the standard tests, read with the format named
  ## explicitly instead of being detected.
  fields <- c(
    "LNS-200", q("Explicit lens"), "2022", "Explicit Journal",
    "10.1/ex", "5", q("Abstract."), "journal article",
    q("Explicit, E."), "", q("explicit")
  )
  csv_path <- make_lens_csv(paste(fields, collapse = ","))
  parsed <- read_biblio(csv_path, format = "lens")

  expect_equal(nrow(parsed), 1L)
  expect_equal(parsed$id, "LNS-200")
})