## Tests for read_wos() — plaintext (tagged) and tab-delimited formats
## Synthetic files only; no network/API calls.

## ---------------------------------------------------------------------------
## Helpers
## ---------------------------------------------------------------------------

## Write `records_text` (one element per output line) to a fresh temporary
## ".txt" file and return the path of that file.
wos_pt_file <- function(records_text) {
  path <- tempfile(fileext = ".txt")
  writeLines(text = records_text, con = path)
  path
}

## Build the lines of a single WoS plaintext record.
##
## `fields` is a named list mapping a field tag (e.g. "AU") to a vector of
## values. The first value is written on the tag line ("AU Smith, J"); every
## further value becomes a continuation line indented by three spaces. The
## record is closed with an "ER" line.
##
## Returns a plain (unnamed) character vector with one element per line.
make_record <- function(fields) {
  per_field <- lapply(names(fields), function(tag) {
    vals <- fields[[tag]]
    ## sprintf() is vectorised and yields character(0) for zero-length input,
    ## so a single-valued field contributes no continuation lines.
    c(sprintf("%-2s %s", tag, vals[1]), sprintf("   %s", vals[-1]))
  })
  c(unlist(per_field, use.names = FALSE), "ER")
}

## ---------------------------------------------------------------------------
## Plaintext: standard columns are returned
## ---------------------------------------------------------------------------

test_that("read_wos plaintext returns standard bibnets columns", {
  ## A fully populated record, so every expected column has a source tag.
  fields <- list(
    UT = "WOS:000001",
    TI = "A great paper",
    AU = c("Smith, John", "Doe, Jane"),
    PY = "2020",
    SO = "Journal of Testing",
    DI = "10.1000/test.001",
    TC = "5",
    AB = "This is an abstract.",
    DT = "Article",
    DE = "network analysis; bibliometrics",
    ID = "machine learning; deep learning",
    CR = "Ref A, 2010; Ref B, 2011"
  )
  path <- wos_pt_file(make_record(fields))
  parsed <- read_wos(path, format = "plaintext")

  scalar_cols <- c("id", "title", "year", "journal", "doi",
                   "cited_by_count", "abstract", "type")
  list_cols <- c("authors", "references", "keywords", "keywords_plus")
  ## Every expected column must be present; extra columns are tolerated.
  absent <- setdiff(c(scalar_cols, list_cols), names(parsed))
  expect_true(length(absent) == 0L)
})

test_that("read_wos plaintext column order matches standard schema", {
  fields <- list(UT = "WOS:000001", TI = "Title", AU = "Smith, J",
                 PY = "2020", SO = "Journal", DI = "10.1/x", TC = "1",
                 AB = "Abstract.", DT = "Article", DE = "kw1", ID = "kw2")
  parsed <- read_wos(wos_pt_file(make_record(fields)), format = "plaintext")

  ## The eight scalar columns must come first, in exactly this order.
  leading_cols <- c("id", "title", "year", "journal", "doi",
                    "cited_by_count", "abstract", "type")
  expect_equal(names(parsed)[seq_len(8L)], leading_cols)
})

## ---------------------------------------------------------------------------
## Plaintext: scalar field values
## ---------------------------------------------------------------------------

test_that("read_wos plaintext parses a full record correctly", {
  fields <- list(
    UT = "WOS:000001",
    TI = "A great paper",
    AU = c("Smith, John", "Doe, Jane"),
    PY = "2020",
    SO = "Journal of Testing",
    DI = "10.1000/test.001",
    TC = "5",
    AB = "This is an abstract.",
    DT = "Article",
    DE = "network analysis; bibliometrics",
    ID = "machine learning; deep learning",
    CR = "Ref A, 2010; Ref B, 2011"
  )
  parsed <- read_wos(wos_pt_file(make_record(fields)), format = "plaintext")

  expect_equal(nrow(parsed), 1L)

  ## Expected value of each scalar column for the record written above.
  expected <- list(
    id             = "WOS:000001",
    title          = "A great paper",
    year           = 2020L,
    journal        = "Journal of Testing",
    doi            = "10.1000/test.001",
    cited_by_count = 5L,
    abstract       = "This is an abstract.",
    type           = "Article"
  )
  for (col in names(expected)) {
    expect_equal(parsed[[col]], expected[[col]])
  }
})

test_that("read_wos plaintext year is integer", {
  fields <- list(UT = "WOS:1", TI = "T", AU = "Smith, J",
                 PY = "2018", SO = "J")
  path <- wos_pt_file(make_record(fields))
  parsed <- read_wos(path, format = "plaintext")
  ## PY is written as text; the parsed column must have integer storage.
  expect_type(parsed$year, "integer")
})

test_that("read_wos plaintext cited_by_count is integer", {
  fields <- list(UT = "WOS:1", TI = "T", AU = "Smith, J",
                 PY = "2018", SO = "J", TC = "42")
  path <- wos_pt_file(make_record(fields))
  parsed <- read_wos(path, format = "plaintext")
  ## TC is written as text; check both the storage type and the value.
  citations <- parsed$cited_by_count
  expect_type(citations, "integer")
  expect_equal(citations, 42L)
})

## ---------------------------------------------------------------------------
## Plaintext: list-columns
## ---------------------------------------------------------------------------

test_that("read_wos plaintext authors is a list-column", {
  fields <- list(UT = "WOS:1", TI = "T", AU = c("Smith, J", "Doe, A"),
                 PY = "2020", SO = "J")
  parsed <- read_wos(wos_pt_file(make_record(fields)), format = "plaintext")
  ## One row, several authors: the column has to be a list to hold them.
  expect_true(is.list(parsed$authors))
})

test_that("read_wos plaintext author names are uppercased", {
  ## Mixed-case input so that uppercasing is observable in the output.
  rec <- make_record(list(UT = "WOS:1", TI = "T",
                          AU = c("Smith, John", "Doe, Jane"),
                          PY = "2020", SO = "J"))
  f <- wos_pt_file(rec)
  d <- read_wos(f, format = "plaintext")
  authors <- d$authors[[1]]
  ## all() over an empty vector is TRUE, so first make sure at least one
  ## author was parsed; otherwise the case check below would pass vacuously.
  expect_gt(length(authors), 0L)
  expect_true(all(authors == toupper(authors)))
})

test_that("read_wos plaintext multi-line author list collects all authors", {
  ## make_record() puts the first author on the AU line and each further
  ## author on its own continuation line.
  fields <- list(UT = "WOS:1", TI = "T",
                 AU = c("Alpha, A", "Beta, B", "Gamma, G"),
                 PY = "2020", SO = "J")
  path <- wos_pt_file(make_record(fields))
  parsed <- read_wos(path, format = "plaintext")
  expect_length(parsed$authors[[1]], 3L)
})

test_that("read_wos plaintext references is a list-column", {
  fields <- list(UT = "WOS:1", TI = "T", AU = "Smith, J",
                 PY = "2020", SO = "J",
                 CR = "Ref A, 2010; Ref B, 2011")
  parsed <- read_wos(wos_pt_file(make_record(fields)), format = "plaintext")
  ## Only the container type is checked here, not its contents.
  expect_true(is.list(parsed$references))
})

test_that("read_wos plaintext references are split on semicolon", {
  fields <- list(UT = "WOS:1", TI = "T", AU = "Smith, J",
                 PY = "2020", SO = "J",
                 CR = "Ref A, 2010; Ref B, 2011")
  path <- wos_pt_file(make_record(fields))
  parsed <- read_wos(path, format = "plaintext")
  ## Two ';'-separated entries on the CR line give two references.
  expect_length(parsed$references[[1]], 2L)
})

test_that("read_wos plaintext references are uppercased", {
  ## Lower-case input so that uppercasing is observable in the output.
  rec <- make_record(list(UT = "WOS:1", TI = "T", AU = "Smith, J",
                          PY = "2020", SO = "J", CR = "ref a, 2010"))
  f <- wos_pt_file(rec)
  d <- read_wos(f, format = "plaintext")
  refs <- d$references[[1]]
  ## all() over an empty vector is TRUE, so first make sure at least one
  ## reference was parsed; otherwise the case check would pass vacuously.
  expect_gt(length(refs), 0L)
  expect_true(all(refs == toupper(refs)))
})

test_that("read_wos plaintext keywords is a list-column split on semicolon", {
  fields <- list(UT = "WOS:1", TI = "T", AU = "Smith, J",
                 PY = "2020", SO = "J",
                 DE = "network analysis; bibliometrics")
  parsed <- read_wos(wos_pt_file(make_record(fields)), format = "plaintext")
  expect_true(is.list(parsed$keywords))

  ## Two ';'-separated author keywords; the first must survive verbatim.
  first_row_kw <- parsed$keywords[[1]]
  expect_length(first_row_kw, 2L)
  expect_true("network analysis" %in% first_row_kw)
})

test_that("read_wos plaintext keywords_plus is a list-column split on semicolon", {
  fields <- list(UT = "WOS:1", TI = "T", AU = "Smith, J",
                 PY = "2020", SO = "J",
                 ID = "machine learning; deep learning")
  parsed <- read_wos(wos_pt_file(make_record(fields)), format = "plaintext")
  ## The ID tag feeds keywords_plus; two ';'-separated entries are expected.
  kw_plus <- parsed$keywords_plus
  expect_true(is.list(kw_plus))
  expect_length(kw_plus[[1]], 2L)
})

## ---------------------------------------------------------------------------
## Plaintext: missing / optional tags → NA / empty list
## ---------------------------------------------------------------------------

test_that("read_wos plaintext missing DOI returns NA", {
  ## Record without a DI tag.
  fields <- list(UT = "WOS:1", TI = "T", AU = "Smith, J",
                 PY = "2020", SO = "J")
  path <- wos_pt_file(make_record(fields))
  parsed <- read_wos(path, format = "plaintext")
  expect_true(is.na(parsed$doi))
})

test_that("read_wos plaintext missing TC defaults to 0L", {
  ## Record without a TC tag: the citation count falls back to zero.
  fields <- list(UT = "WOS:1", TI = "T", AU = "Smith, J",
                 PY = "2020", SO = "J")
  parsed <- read_wos(wos_pt_file(make_record(fields)), format = "plaintext")
  expect_equal(parsed$cited_by_count, 0L)
})

test_that("read_wos plaintext missing CR returns empty list element", {
  ## Record without a CR tag.
  fields <- list(UT = "WOS:1", TI = "T", AU = "Smith, J",
                 PY = "2020", SO = "J")
  path <- wos_pt_file(make_record(fields))
  parsed <- read_wos(path, format = "plaintext")
  expect_length(parsed$references[[1]], 0L)
})

test_that("read_wos plaintext missing DE returns empty keywords list element", {
  ## Record without a DE tag.
  fields <- list(UT = "WOS:1", TI = "T", AU = "Smith, J",
                 PY = "2020", SO = "J")
  parsed <- read_wos(wos_pt_file(make_record(fields)), format = "plaintext")
  expect_length(parsed$keywords[[1]], 0L)
})

test_that("read_wos plaintext missing ID returns empty keywords_plus list element", {
  ## Record without an ID tag.
  fields <- list(UT = "WOS:1", TI = "T", AU = "Smith, J",
                 PY = "2020", SO = "J")
  path <- wos_pt_file(make_record(fields))
  parsed <- read_wos(path, format = "plaintext")
  expect_length(parsed$keywords_plus[[1]], 0L)
})

test_that("read_wos plaintext missing PY returns NA_integer_", {
  ## Record without a PY tag.
  fields <- list(UT = "WOS:1", TI = "T", AU = "Smith, J", SO = "J")
  parsed <- read_wos(wos_pt_file(make_record(fields)), format = "plaintext")
  ## The value must be missing while the column keeps integer storage.
  year <- parsed$year
  expect_true(is.na(year))
  expect_type(year, "integer")
})

test_that("read_wos plaintext missing UT falls back to generated id", {
  ## Record without a UT tag: some non-missing, non-empty id is still expected.
  fields <- list(TI = "T", AU = "Smith, J", PY = "2020", SO = "J")
  path <- wos_pt_file(make_record(fields))
  parsed <- read_wos(path, format = "plaintext")
  generated_id <- parsed$id
  expect_false(is.na(generated_id))
  expect_true(nchar(generated_id) > 0)
})

## ---------------------------------------------------------------------------
## Plaintext: multi-record file
## ---------------------------------------------------------------------------

test_that("read_wos plaintext parses multiple records", {
  ## Three records with different sets of optional tags.
  first <- make_record(list(UT = "WOS:001", TI = "First paper",
                            AU = "Alpha, A", PY = "2020", SO = "Journal A"))
  second <- make_record(list(UT = "WOS:002", TI = "Second paper",
                             AU = "Beta, B", PY = "2021", SO = "Journal B",
                             DI = "10.99/x", TC = "3"))
  third <- make_record(list(UT = "WOS:003", TI = "Third paper",
                            AU = "Gamma, G", PY = "2022", SO = "Journal C",
                            CR = "Some ref, 2000", DE = "keyword"))

  ## Consecutive records are separated by a blank line.
  file_lines <- c(first, "", second, "", third)
  parsed <- read_wos(wos_pt_file(file_lines), format = "plaintext")

  expect_equal(nrow(parsed), 3L)
  expect_equal(parsed$id, c("WOS:001", "WOS:002", "WOS:003"))
  expect_equal(parsed$year, c(2020L, 2021L, 2022L))
  expect_equal(parsed$cited_by_count[2], 3L)
})

## ---------------------------------------------------------------------------
## Plaintext: FN/VR/EF header lines are skipped
## ---------------------------------------------------------------------------

test_that("read_wos plaintext ignores FN/VR/EF file-header lines", {
  record <- make_record(list(UT = "WOS:1", TI = "T", AU = "Smith, J",
                             PY = "2020", SO = "J"))
  ## Surround the record with file-level FN/VR lines and a closing EF line.
  file_lines <- c("FN Web of Science", "VR 1.0", record, "EF")
  parsed <- read_wos(wos_pt_file(file_lines), format = "plaintext")
  ## Only the record itself may produce a row.
  expect_equal(nrow(parsed), 1L)
})

## ---------------------------------------------------------------------------
## Plaintext: CRLF line endings
## ---------------------------------------------------------------------------

test_that("read_wos plaintext handles Windows CRLF line endings", {
  rec <- make_record(list(UT = "WOS:CRLF", TI = "CRLF paper",
                          AU = "Smith, J", PY = "2019", SO = "J",
                          TC = "2"))
  f <- tempfile(fileext = ".txt")
  ## Write through a binary-mode connection so that no platform newline
  ## translation takes place, and end every line (the last one included) with
  ## CR+LF. writeBin() on a character string is deliberately avoided: it
  ## appends a NUL terminator, which would put a stray 0x00 byte in the file.
  con <- file(f, open = "wb")
  writeLines(rec, con, sep = "\r\n")
  close(con)
  d <- read_wos(f, format = "plaintext")
  expect_equal(nrow(d), 1L)
  expect_equal(d$year, 2019L)
})

## ---------------------------------------------------------------------------
## Plaintext: non-ASCII characters in title/abstract
## ---------------------------------------------------------------------------

test_that("read_wos plaintext handles non-ASCII characters", {
  ## Accented characters in the title, author and abstract fields.
  fields <- list(UT = "WOS:NONASCII",
                 TI = "Über die Netzwerkanalyse",
                 AU = "Müller, Hans",
                 PY = "2021",
                 SO = "Zeitschrift",
                 AB = "Résumé: données bibliométriques")
  path <- wos_pt_file(make_record(fields))
  parsed <- read_wos(path, format = "plaintext")
  expect_equal(nrow(parsed), 1L)
  ## Only the ASCII part of the title is matched.
  expect_true(grepl("Netzwerkanalyse", parsed$title))
})

## ---------------------------------------------------------------------------
## Plaintext: trailing ", DOI ..." suffix is stripped from cited references
## ---------------------------------------------------------------------------

test_that("read_wos plaintext CR DOI suffix is stripped from references", {
  ## Both cited references end in a ", DOI <value>" suffix.
  fields <- list(
    UT = "WOS:DOI",
    TI = "DOI test",
    AU = "Smith, J",
    PY = "2020",
    SO = "J",
    CR = "Author A, 2010, DOI 10.1000/abc; Author B, 2011, DOI 10.9999/xyz"
  )
  parsed <- read_wos(wos_pt_file(make_record(fields)), format = "plaintext")
  cited <- parsed$references[[1]]
  expect_length(cited, 2L)
  ## Case-sensitive check that no reference still contains the DOI marker.
  has_doi_marker <- grepl("DOI", cited, ignore.case = FALSE)
  expect_false(any(has_doi_marker))
})

## ---------------------------------------------------------------------------
## Plaintext: empty file → empty data frame
## ---------------------------------------------------------------------------

test_that("read_wos plaintext empty file returns empty data frame", {
  ## character(0) produces a zero-line file.
  empty_path <- wos_pt_file(character(0))
  parsed <- read_wos(empty_path, format = "plaintext")
  expect_equal(nrow(parsed), 0L)

  ## The scalar columns must exist even though there are no rows.
  scalar_cols <- c("id", "title", "year", "journal", "doi",
                   "cited_by_count", "abstract", "type")
  expect_true(all(scalar_cols %in% names(parsed)))

  ## Regression: the empty result must also carry keywords_plus, as a list,
  ## so that its schema matches the non-empty case.
  expect_true("keywords_plus" %in% names(parsed))
  expect_true(is.list(parsed$keywords_plus))
})

## ---------------------------------------------------------------------------
## Plaintext: trailing whitespace in field values
## ---------------------------------------------------------------------------

test_that("read_wos plaintext trims trailing whitespace from field values", {
  ## Hand-written record: every value is followed by three trailing spaces.
  padded_lines <- c("UT WOS:TRIM   ",
                    "TI  Title with trailing spaces   ",
                    "AU  Smith, J   ",
                    "PY  2020   ",
                    "SO  Journal   ",
                    "ER")
  path <- wos_pt_file(padded_lines)
  parsed <- read_wos(path, format = "plaintext")
  ## Neither text column may end in whitespace.
  trailing_ws <- "\\s+$"
  expect_false(grepl(trailing_ws, parsed$title))
  expect_false(grepl(trailing_ws, parsed$journal))
})

## ---------------------------------------------------------------------------
## Error handling
## ---------------------------------------------------------------------------

test_that("read_wos errors on non-existent file", {
  ## A path that is not expected to exist in the working directory.
  missing_path <- "no_such_file.txt"
  expect_error(read_wos(missing_path), regexp = "not found")
})

test_that("read_wos errors on invalid format argument", {
  ## The file exists (it is empty), so only the format value can be at fault.
  existing_path <- wos_pt_file(character(0))
  expect_error(read_wos(existing_path, format = "invalid"), regexp = "format")
})

## ---------------------------------------------------------------------------
## Tab-delimited format
## ---------------------------------------------------------------------------

## Write a tab-delimited export to a fresh temporary ".txt" file and return
## its path. `header` is a character vector of column names; `rows` is a list
## of character vectors, one per data row. Cells are joined with a tab.
wos_tab_file <- function(header, rows) {
  header_line <- paste(header, collapse = "\t")
  row_lines <- vapply(rows, paste, character(1), collapse = "\t")
  path <- tempfile(fileext = ".txt")
  writeLines(c(header_line, row_lines), path)
  path
}

## Header row of two-letter WoS field tags shared by the tab-format tests
## below; each test row supplies one cell per tag, in this order.
wos_tab_header <- c(
  "UT", "TI", "AU", "PY", "SO", "DI",
  "TC", "AB", "DT", "DE", "ID", "CR"
)

test_that("read_wos tab returns standard bibnets columns", {
  ## One fully populated row, cells in wos_tab_header order.
  row <- c("WOS:T001", "Tab paper", "Smith, J; Doe, A",
           "2021", "Tab Journal", "10.1/t", "7",
           "An abstract.", "Article",
           "networks; analysis", "deep learning",
           "Ref A, 2010; Ref B, 2011")
  path <- wos_tab_file(wos_tab_header, list(row))
  parsed <- read_wos(path, format = "tab")

  scalar_cols <- c("id", "title", "year", "journal", "doi",
                   "cited_by_count", "abstract", "type")
  list_cols <- c("authors", "references", "keywords", "keywords_plus")
  ## Every expected column must be present; extra columns are tolerated.
  absent <- setdiff(c(scalar_cols, list_cols), names(parsed))
  expect_true(length(absent) == 0L)
})

test_that("read_wos tab parses scalar fields correctly", {
  row <- c("WOS:T001", "Tab paper", "Smith, J",
           "2021", "Tab Journal", "10.1/t", "7",
           "An abstract.", "Article", "networks", "deep learning", "")
  parsed <- read_wos(wos_tab_file(wos_tab_header, list(row)), format = "tab")

  expect_equal(nrow(parsed), 1L)

  ## Expected value of each scalar column for the row written above.
  expected <- list(
    id             = "WOS:T001",
    title          = "Tab paper",
    year           = 2021L,
    journal        = "Tab Journal",
    doi            = "10.1/t",
    cited_by_count = 7L,
    abstract       = "An abstract.",
    type           = "Article"
  )
  for (col in names(expected)) {
    expect_equal(parsed[[col]], expected[[col]])
  }
})

test_that("read_wos tab year is integer", {
  row <- c("WOS:T001", "T", "S, J", "2019", "J", "", "0", "", "A", "", "", "")
  path <- wos_tab_file(wos_tab_header, list(row))
  parsed <- read_wos(path, format = "tab")
  ## The PY cell is text in the file; the column must have integer storage.
  expect_type(parsed$year, "integer")
})

test_that("read_wos tab cited_by_count is integer", {
  row <- c("WOS:T001", "T", "S, J", "2019", "J", "", "15", "", "A", "", "", "")
  parsed <- read_wos(wos_tab_file(wos_tab_header, list(row)), format = "tab")
  ## The TC cell is text in the file; check the storage type and the value.
  citations <- parsed$cited_by_count
  expect_type(citations, "integer")
  expect_equal(citations, 15L)
})

test_that("read_wos tab authors are split on semicolon and uppercased", {
  ## The AU cell holds two ';'-separated, mixed-case names.
  row <- c("WOS:T001", "T", "Smith, J; Doe, A", "2020", "J",
           "", "0", "", "A", "", "", "")
  path <- wos_tab_file(wos_tab_header, list(row))
  parsed <- read_wos(path, format = "tab")
  expect_true(is.list(parsed$authors))

  first_row_authors <- parsed$authors[[1]]
  expect_length(first_row_authors, 2L)
  expect_true(all(first_row_authors == toupper(first_row_authors)))
})

test_that("read_wos tab references are split on semicolon", {
  ## The CR cell holds three ';'-separated references.
  row <- c("WOS:T001", "T", "S, J", "2020", "J", "", "0", "", "A",
           "", "", "Ref A, 2010; Ref B, 2011; Ref C, 2012")
  parsed <- read_wos(wos_tab_file(wos_tab_header, list(row)), format = "tab")
  cited <- parsed$references
  expect_true(is.list(cited))
  expect_length(cited[[1]], 3L)
})

test_that("read_wos tab keywords split on semicolon", {
  ## The DE cell holds three ';'-separated author keywords.
  row <- c("WOS:T001", "T", "S, J", "2020", "J", "", "0", "", "A",
           "networks; analysis; visualization", "", "")
  path <- wos_tab_file(wos_tab_header, list(row))
  parsed <- read_wos(path, format = "tab")
  author_kw <- parsed$keywords
  expect_true(is.list(author_kw))
  expect_length(author_kw[[1]], 3L)
})

test_that("read_wos tab keywords_plus split on semicolon", {
  ## The ID cell holds two ';'-separated Keywords Plus terms.
  row <- c("WOS:T001", "T", "S, J", "2020", "J", "", "0", "", "A",
           "", "machine learning; NLP", "")
  parsed <- read_wos(wos_tab_file(wos_tab_header, list(row)), format = "tab")
  kw_plus <- parsed$keywords_plus
  expect_true(is.list(kw_plus))
  expect_length(kw_plus[[1]], 2L)
})

test_that("read_wos tab multiple records parsed correctly", {
  ## Three rows; the later ones leave progressively more cells empty.
  paper_one <- c("WOS:T001", "Paper One", "Alpha, A", "2019", "J1",
                 "10.1/a", "3", "Abs1", "Article", "kw1", "kp1", "r1; r2")
  paper_two <- c("WOS:T002", "Paper Two", "Beta, B; Gamma, G", "2020", "J2",
                 "10.1/b", "8", "Abs2", "Review", "kw2; kw3", "", "")
  paper_three <- c("WOS:T003", "Paper Three", "Delta, D", "2021", "J3",
                   "", "0", "", "Article", "", "", "")
  path <- wos_tab_file(wos_tab_header,
                       list(paper_one, paper_two, paper_three))
  parsed <- read_wos(path, format = "tab")

  expect_equal(nrow(parsed), 3L)
  expect_equal(parsed$id, c("WOS:T001", "WOS:T002", "WOS:T003"))
  expect_equal(parsed$year, c(2019L, 2020L, 2021L))
  ## Row 1 cites two references; row 2 has two authors.
  expect_length(parsed$references[[1]], 2L)
  expect_length(parsed$authors[[2]], 2L)
})

test_that("read_wos tab with alternative 'Title' / 'Source Title' headers", {
  ## Long-form column names in place of the two-letter tags.
  long_header <- c("Accession Number", "Title", "Authors",
                   "Publication Year", "Source Title", "DOI",
                   "Times Cited", "Abstract", "Document Type",
                   "Author Keywords", "Keywords Plus", "Cited References")
  row <- c("WOS:ALT1", "Alt Title", "Smith, J", "2022", "Alt Journal",
           "10.99/alt", "12", "Alt abstract.", "Article",
           "alt kw", "alt kp", "")
  parsed <- read_wos(wos_tab_file(long_header, list(row)), format = "tab")

  expect_equal(nrow(parsed), 1L)
  expect_equal(parsed$title, "Alt Title")
  expect_equal(parsed$journal, "Alt Journal")
})

test_that("read_wos tab empty AU cell returns empty authors list element", {
  ## The third cell (AU) is an empty string.
  row <- c("WOS:T001", "T", "", "2020", "J", "", "0", "", "A", "", "", "")
  path <- wos_tab_file(wos_tab_header, list(row))
  parsed <- read_wos(path, format = "tab")
  expect_length(parsed$authors[[1]], 0L)
})

test_that("read_wos tab errors on non-existent file", {
  ## A path that is not expected to exist in the working directory.
  missing_path <- "no_such_file.txt"
  expect_error(read_wos(missing_path, format = "tab"), regexp = "not found")
})

## ---------------------------------------------------------------------------
## Coverage: line 85 — plaintext record with no AU tag at all
## ---------------------------------------------------------------------------

test_that("read_wos plaintext record without AU tag returns empty authors list", {
  ## Hand-written record with no AU line at all.
  record_lines <- c("UT WOS:NOAU",
                    "TI  Paper Without Author",
                    "PY  2022",
                    "SO  Journal",
                    "ER")
  parsed <- read_wos(wos_pt_file(record_lines), format = "plaintext")
  expect_equal(nrow(parsed), 1L)
  expect_length(parsed$authors[[1]], 0L)
})

## ---------------------------------------------------------------------------
## Coverage: line 160 — tab get_col fallback when column name absent
## ---------------------------------------------------------------------------

test_that("read_wos tab get_col returns NA default when column absent", {
  ## Same tags as wos_tab_header but without "DI", so the file has no DOI
  ## column; the row has eleven cells to match.
  tags_without_doi <- c("UT", "TI", "AU", "PY", "SO", "TC",
                        "AB", "DT", "DE", "ID", "CR")
  row <- c("WOS:NODOI", "No DOI paper", "Smith, J",
           "2023", "Journal", "0", "Abstract.", "Article", "kw", "kp", "")
  path <- wos_tab_file(tags_without_doi, list(row))
  parsed <- read_wos(path, format = "tab")
  expect_equal(nrow(parsed), 1L)
  expect_true(is.na(parsed$doi))
})