## ── helpers ──────────────────────────────────────────────────────────────────

make_dim_csv <- function(rows, extra_cols = character(0), file = tempfile(fileext = ".csv")) {
  ## Write a minimal Dimensions-style CSV fixture and return its path.
  ##
  ## Arguments:
  ##   rows        list of vectors, one per record; each vector holds the
  ##               field values in the order of `col_names` below.  NA fields
  ##               are written as empty, unquoted cells.
  ##   extra_cols  extra column names appended after "Keywords".
  ##   file        destination path (defaults to a fresh temporary .csv).
  ## Returns:
  ##   `file`, after the CSV has been written to it.
  ##
  ## Dimensions CSVs always begin with a one-line metadata header then the
  ## real column-name row.  The parser (read_dimensions.R line 26) detects
  ## lines starting with ^"?About the data and skips 1 row.
  header <- paste0(
    '"About the data: Export created ',
    format(Sys.Date(), "%Y-%m-%d"),
    '"'
  )
  col_names <- c(
    "Publication ID", "Title", "PubYear", "Source title", "DOI",
    "Times cited", "Abstract", "Publication Type",
    "Authors", "Cited references",
    "Authors Affiliations - Name of Research organization",
    "Authors Affiliations - Country of Research organization",
    "Keywords",
    extra_cols
  )

  ## CSV quoting: wrap in double quotes and double any embedded quote.
  quote_field <- function(v) {
    paste0('"', gsub('"', '""', v, fixed = TRUE), '"')
  }

  ## One CSV line per record; NA becomes an empty (unquoted) cell.
  write_row <- function(r) {
    cells <- vapply(
      r,
      function(v) if (is.na(v)) "" else quote_field(v),
      character(1),
      USE.NAMES = FALSE
    )
    paste(cells, collapse = ",")
  }

  lines <- c(
    header,
    paste(quote_field(col_names), collapse = ","),
    vapply(rows, write_row, character(1), USE.NAMES = FALSE)
  )

  ## The fixtures contain non-ASCII text.  Plain writeLines() re-encodes to
  ## the session's native encoding, which mangles those characters outside a
  ## UTF-8 locale; convert to UTF-8 and write the bytes untouched instead.
  writeLines(enc2utf8(lines), file, useBytes = TRUE)
  file
}

## Shared fixture: three records, each a character vector of 13 fields given
## in the column order written by make_dim_csv().
typical_rows <- list(
  ## Record 1 -- complete record with two authors and two references
  c(
    "pub.1111111111",                         # Publication ID
    "Learning Networks",                      # Title
    "2021",                                   # PubYear
    "Journal of Education",                   # Source title
    "10.1000/xyz001",                         # DOI
    "15",                                     # Times cited
    "This paper studies learning networks.",  # Abstract
    "article",                                # Publication Type
    "Smith, John; Jones, Mary",               # Authors
    "Brown A, Title X, 2018, Journal Z; Green B, Title Y, 2019, Journal W",
    "University A; University B",             # Affiliation names
    "USA; UK",                                # Affiliation countries
    "network; learning"                       # Keywords
  ),
  ## Record 2 -- three affiliations/countries and non-ASCII text
  c(
    "pub.2222222222",
    "Réseaux éducatifs",
    "2022",
    "Revue Française",
    "10.1000/xyz002",
    "3",
    "Study of éducation réseaux.",
    "article",
    "Müller, Hans; Lefèvre, Claire; Tanaka, Yuki",
    "White C, Study Z, 2020, Jour Q",
    "Universität Berlin; Université Paris; Kyoto University",
    "Germany; France; Japan",
    "éducation; réseaux; multilingual"
  ),
  ## Record 3 -- the references field is an empty string
  c(
    "pub.3333333333",
    "Citation-free Article",
    "2023",
    "Open Access Journal",
    "10.1000/xyz003",
    "0",
    "An article with no references.",
    "conference paper",
    "Doe, Jane",
    "",
    "State College",
    "USA",
    "open access"
  )
)

## ── Standard columns ──────────────────────────────────────────────────────────

test_that("read_dimensions returns all required standard columns", {
  ## Every standard column must be present; extra columns and ordering are
  ## not checked here.
  required_cols <- c(
    "id", "title", "year", "journal", "doi",
    "cited_by_count", "abstract", "type",
    "authors", "references", "keywords",
    "affiliations", "countries"
  )
  parsed <- read_dimensions(make_dim_csv(typical_rows))

  absent <- setdiff(required_cols, names(parsed))
  expect_identical(absent, character(0))
})

test_that("read_dimensions returns columns in correct order", {
  ## The standard columns must lead the result, in exactly this order.
  leading_cols <- c(
    "id", "title", "year", "journal", "doi",
    "cited_by_count", "abstract", "type",
    "authors", "references", "keywords",
    "affiliations", "countries"
  )
  parsed <- read_dimensions(make_dim_csv(typical_rows))

  n_std <- length(leading_cols)
  expect_equal(head(names(parsed), n_std), leading_cols)
})

test_that("read_dimensions returns 3 rows from 3-record fixture", {
  ## One output row per record in the fixture.
  csv_path <- make_dim_csv(typical_rows)
  n_records <- nrow(read_dimensions(csv_path))
  expect_equal(n_records, 3L)
})

## ── Scalar column types ───────────────────────────────────────────────────────

test_that("year is integer", {
  parsed <- read_dimensions(make_dim_csv(typical_rows))
  years <- parsed$year
  ## Check both the storage type and the parsed values.
  expect_type(years, "integer")
  expect_equal(years, c(2021L, 2022L, 2023L))
})

test_that("cited_by_count is integer with no NAs", {
  parsed <- read_dimensions(make_dim_csv(typical_rows))
  counts <- parsed$cited_by_count
  ## Storage type, absence of missing values, then the values themselves.
  expect_type(counts, "integer")
  expect_false(anyNA(counts))
  expect_equal(counts, c(15L, 3L, 0L))
})

test_that("id column is populated from Publication ID", {
  ## The ids must match the "Publication ID" values of the fixture, in order.
  fixture_ids <- c("pub.1111111111", "pub.2222222222", "pub.3333333333")
  parsed <- read_dimensions(make_dim_csv(typical_rows))
  expect_equal(parsed$id, fixture_ids)
})

test_that("title, journal, doi, abstract, type are character", {
  parsed <- read_dimensions(make_dim_csv(typical_rows))
  text_cols <- c("title", "journal", "doi", "abstract", "type")

  ## Gather the storage type of each column into one named vector so that a
  ## mismatch is reported together with the column name.
  observed <- vapply(
    text_cols,
    function(nm) typeof(parsed[[nm]]),
    character(1)
  )
  wanted <- rep("character", length(text_cols))
  names(wanted) <- text_cols
  expect_equal(observed, wanted)
})

## ── List-columns ──────────────────────────────────────────────────────────────

test_that("authors, references, keywords, affiliations, countries are list-columns", {
  parsed <- read_dimensions(make_dim_csv(typical_rows))
  multi_cols <- c("authors", "references", "keywords", "affiliations", "countries")

  ## One named flag per column, compared in a single expectation so that a
  ## failure names the column that is not a list.
  is_list_col <- vapply(
    multi_cols,
    function(nm) is.list(parsed[[nm]]),
    logical(1)
  )
  wanted <- rep(TRUE, length(multi_cols))
  names(wanted) <- multi_cols
  expect_equal(is_list_col, wanted)
})

test_that("semicolons split authors into multiple elements", {
  parsed <- read_dimensions(make_dim_csv(typical_rows))
  author_lists <- parsed$authors
  ## Fixture rows hold two, three and one semicolon-separated author(s).
  expect_length(author_lists[[1]], 2L)   ## "Smith, John; Jones, Mary"
  expect_length(author_lists[[2]], 3L)   ## three authors
  expect_length(author_lists[[3]], 1L)   ## "Doe, Jane"
})

test_that("authors are uppercased", {
  f <- make_dim_csv(typical_rows)
  d <- read_dimensions(f)
  all_authors <- unlist(d$authors)
  ## Guard against a vacuous pass: comparing an empty vector with itself
  ## always succeeds, so first pin the number of authors in the fixture
  ## (2 + 3 + 1).
  expect_length(all_authors, 6L)
  ## Comparing the vectors directly (rather than all(x == y)) makes a failure
  ## show which names are not uppercase.
  expect_equal(all_authors, toupper(all_authors))
})

test_that("semicolons split references correctly", {
  parsed <- read_dimensions(make_dim_csv(typical_rows))
  ref_lists <- parsed$references
  ## Row 1 cites two works, row 2 cites one.
  expect_length(ref_lists[[1]], 2L)
  expect_length(ref_lists[[2]], 1L)
})

test_that("references are uppercased", {
  f <- make_dim_csv(typical_rows)
  d <- read_dimensions(f)
  present <- unlist(d$references)
  ## Guard against a vacuous pass: comparing an empty vector with itself
  ## always succeeds, so first pin the number of references in the fixture
  ## (2 + 1 + 0).
  expect_length(present, 3L)
  ## Comparing the vectors directly (rather than all(x == y)) makes a failure
  ## show which references are not uppercase.
  expect_equal(present, toupper(present))
})

test_that("empty references yield character(0)", {
  ## Fixture row 3 has "" in the "Cited references" column.
  parsed <- read_dimensions(make_dim_csv(typical_rows))
  third_refs <- parsed$references[[3]]
  expect_equal(third_refs, character(0))
})

test_that("keywords are split by semicolon", {
  parsed <- read_dimensions(make_dim_csv(typical_rows))
  kw <- parsed$keywords
  ## Fixture rows carry two, three and one keyword(s) respectively.
  expect_length(kw[[1]], 2L)   ## "network; learning"
  expect_length(kw[[2]], 3L)
  expect_length(kw[[3]], 1L)
})

test_that("affiliations are split by semicolon", {
  parsed <- read_dimensions(make_dim_csv(typical_rows))
  ## Number of affiliations found in each of the three fixture rows.
  n_affiliations <- vapply(
    seq_len(3L),
    function(i) length(parsed$affiliations[[i]]),
    integer(1)
  )
  expect_equal(n_affiliations, c(2L, 3L, 1L))
})

test_that("countries are split by semicolon", {
  parsed <- read_dimensions(make_dim_csv(typical_rows))
  ## Expected country count per fixture row: "USA; UK", three countries, "USA".
  wanted_counts <- c(2L, 3L, 1L)
  for (i in seq_along(wanted_counts)) {
    expect_length(parsed$countries[[i]], wanted_counts[i])
  }
})

## ── Dimensions metadata header quirk ─────────────────────────────────────────

test_that("file with About-the-data header is parsed correctly", {
  csv_path <- make_dim_csv(typical_rows)

  ## Sanity-check the fixture: its first line is the metadata header.
  top_line <- readLines(csv_path, n = 1L)
  expect_true(grepl('^"About the data', top_line))

  ## The parser should skip that line and return only the data rows.
  parsed <- read_dimensions(csv_path)
  expect_equal(nrow(parsed), 3L)
  ## The first title must come from the first real record, not the header.
  expect_equal(parsed$title[1], "Learning Networks")
})

test_that("file without About-the-data header is also parsed (skip_rows=0)", {
  ## Build a plain CSV with no metadata line — parser falls back to skip=0.
  ## Reuse the shared fixture writer instead of duplicating its CSV logic:
  ## write the usual file, then drop its first line (the metadata header) so
  ## the file starts directly with the column-name row.
  rows <- typical_rows[1:2]
  with_header <- readLines(make_dim_csv(rows), warn = FALSE)
  f <- tempfile(fileext = ".csv")
  ## useBytes keeps the non-ASCII fixture text byte-for-byte as it was read.
  writeLines(with_header[-1L], f, useBytes = TRUE)

  ## Guard the fixture itself: the metadata line really is gone.
  first <- readLines(f, n = 1L, warn = FALSE)
  expect_false(grepl("About the data", first, fixed = TRUE))

  d <- read_dimensions(f)
  expect_equal(nrow(d), 2L)
  ## The first record must still be the first data row, as in the
  ## with-header case.
  expect_equal(d$title[1], "Learning Networks")
})

## ── Column name aliases ───────────────────────────────────────────────────────

test_that("Publication Year alias is accepted for year column", {
  ## The year column is headed "Publication Year" instead of "PubYear".
  csv_lines <- c(
    '"About the data: test"',
    '"Publication ID","Title","Publication Year","Source title","DOI","Times cited","Abstract","Publication Type","Authors","Cited references","Authors Affiliations - Name of Research organization","Authors Affiliations - Country of Research organization","Keywords"',
    '"pub.99","Alt Year Test","2020","Some Journal","10.0/x","5","Abstract here.","article","Author, A","","Org A","CountryX","kw1"'
  )
  csv_path <- tempfile(fileext = ".csv")
  writeLines(csv_lines, csv_path)

  parsed <- read_dimensions(csv_path)
  expect_equal(parsed$year, 2020L)
})

test_that("Dimensions URL alias is accepted for id column", {
  ## The id column is headed "Dimensions URL" instead of "Publication ID".
  csv_lines <- c(
    '"About the data: test"',
    '"Dimensions URL","Title","PubYear","Source title","DOI","Times cited","Abstract","Publication Type","Authors","Cited references","Authors Affiliations - Name of Research organization","Authors Affiliations - Country of Research organization","Keywords"',
    '"https://app.dimensions.ai/pub.555","URL ID Test","2021","Journal","10.0/u","2","Abs.","article","Auth, B","","Org B","CountryY","kw2"'
  )
  csv_path <- tempfile(fileext = ".csv")
  writeLines(csv_lines, csv_path)

  parsed <- read_dimensions(csv_path)
  first_id <- parsed$id[1]
  expect_equal(first_id, "https://app.dimensions.ai/pub.555")
})

## ── Fallback ID generation ────────────────────────────────────────────────────

test_that("rows with empty Publication ID get DIMn fallback id", {
  ## The single record has an empty "Publication ID" cell; the expected
  ## generated id for this first row is "DIM1".
  csv_lines <- c(
    '"About the data: test"',
    '"Publication ID","Title","PubYear","Source title","DOI","Times cited","Abstract","Publication Type","Authors","Cited references","Authors Affiliations - Name of Research organization","Authors Affiliations - Country of Research organization","Keywords"',
    '"","No ID Paper","2020","Journal","10.0/n","1","Abs.","article","Auth, C","","Org C","CountryZ","kw3"'
  )
  csv_path <- tempfile(fileext = ".csv")
  writeLines(csv_lines, csv_path)

  parsed <- read_dimensions(csv_path)
  expect_equal(parsed$id[1], "DIM1")
})

## ── Missing optional columns ──────────────────────────────────────────────────

test_that("missing Keywords column produces empty list-column", {
  ## The export has no "Keywords" column at all.
  csv_lines <- c(
    '"About the data: test"',
    '"Publication ID","Title","PubYear","Source title","DOI","Times cited","Abstract","Publication Type","Authors","Cited references","Authors Affiliations - Name of Research organization","Authors Affiliations - Country of Research organization"',
    '"pub.1","No KW","2021","J","10.0/k","0","A.","article","Auth, D","","Org D","CountryA"'
  )
  csv_path <- tempfile(fileext = ".csv")
  writeLines(csv_lines, csv_path)

  parsed <- read_dimensions(csv_path)
  kw <- parsed$keywords
  ## The column must still exist as a list whose entry is empty.
  expect_true(is.list(kw))
  expect_equal(kw[[1]], character(0))
})

test_that("missing affiliations/countries columns produce empty list-columns", {
  ## The export has neither of the two "Authors Affiliations - ..." columns.
  csv_lines <- c(
    '"About the data: test"',
    '"Publication ID","Title","PubYear","Source title","DOI","Times cited","Abstract","Publication Type","Authors","Cited references","Keywords"',
    '"pub.2","No Aff","2022","J","10.0/a","2","Abs.","article","Auth, E","ref1","kw"'
  )
  csv_path <- tempfile(fileext = ".csv")
  writeLines(csv_lines, csv_path)

  parsed <- read_dimensions(csv_path)
  ## Both columns must still exist as lists ...
  expect_true(is.list(parsed$affiliations))
  expect_true(is.list(parsed$countries))
  ## ... and hold an empty character vector for the record.
  expect_equal(parsed$affiliations[[1]], character(0))
  expect_equal(parsed$countries[[1]], character(0))
})

## ── Non-ASCII / special characters ───────────────────────────────────────────

test_that("non-ASCII characters in author names are preserved", {
  f <- make_dim_csv(typical_rows)
  d <- read_dimensions(f)
  ## Row 2 has Müller, Hans; Lefèvre, Claire; Tanaka, Yuki
  expect_equal(length(d$authors[[2]]), 3L)

  ## Actually look for the accented letters: a non-empty string and a count
  ## of three would also pass if the characters had been dropped or mangled.
  ## Either letter case is accepted, because whether the accented letter is
  ## itself uppercased is not what this test is about.
  author_str <- enc2utf8(paste(d$authors[[2]], collapse = " "))
  expect_match(author_str, "M[\u00dc\u00fc]LLER")
  expect_match(author_str, "LEF[\u00c8\u00e8]VRE")
})

## ── Error on bad file ─────────────────────────────────────────────────────────

test_that("read_dimensions errors on non-existent file", {
  ## Use a fresh tempfile() path (never created) rather than a fixed relative
  ## name, so the test does not depend on the working directory's contents.
  missing_path <- tempfile(pattern = "no_such_file_", fileext = ".csv")
  expect_false(file.exists(missing_path))
  expect_error(read_dimensions(missing_path))
})

## ── read_biblio integration ───────────────────────────────────────────────────

test_that("read_biblio auto-detects Dimensions format", {
  ## Pass the Dimensions fixture to the generic reader and check that the
  ## result has the expected row count and list-columns.
  csv_path <- make_dim_csv(typical_rows)
  parsed <- read_biblio(csv_path)

  expect_equal(nrow(parsed), 3L)
  for (list_col in list(parsed$authors, parsed$affiliations)) {
    expect_true(is.list(list_col))
  }
})