## Tests for read_openalex() — uses synthetic data frames only; NO network calls.
##
## read_openalex() takes the output of openalexR::oa_fetch(entity="works") —
## a nested tibble/data frame — and returns the standard bibnets schema.
##
## Key openalexR column names (confirmed from parser source):
##   id                (OpenAlex work URL, e.g. "https://openalex.org/W123")
##   display_name      (title)
##   publication_year  (integer or character)
##   so                (source/journal name)
##   doi               (may be prefixed with "https://doi.org/")
##   cited_by_count    (integer)
##   ab                (abstract string)
##   type              (e.g. "article", "book-chapter")
##   author            (list-col of data.frames: au_display_name, au_id, ...)
##   referenced_works  (list-col of character vectors of OpenAlex IDs)
##   concepts          (list-col of data.frames: display_name, ...)
##   keywords          (list-col of data.frames or char vectors: display_name, keyword)

## ── helpers ──────────────────────────────────────────────────────────────────

## Build one `author` list-column cell: a data.frame holding the given names
## in a single `au_display_name` column (strings are kept as character).
make_author_df <- function(names) {
  author_tbl <- data.frame(
    au_display_name  = names,
    stringsAsFactors = FALSE
  )
  author_tbl
}

## Build one `concepts` list-column cell: a data.frame holding the given
## concept names in a single `display_name` column.
make_concept_df <- function(names) {
  concept_tbl <- data.frame(
    display_name     = names,
    stringsAsFactors = FALSE
  )
  concept_tbl
}

## Build one `keywords` list-column cell: a data.frame holding the given
## keyword strings in a single `display_name` column.
make_keyword_df <- function(names) {
  keyword_tbl <- data.frame(
    display_name     = names,
    stringsAsFactors = FALSE
  )
  keyword_tbl
}

## Synthetic stand-in for oa_fetch() output: three minimal work records made
## of scalar columns only. Tests add list-columns (author, concepts, ...) on
## top of this as needed.
make_synthetic <- function() {
  work_ids <- paste0("https://openalex.org/W", 1001:1003)
  ## The third record deliberately has no DOI.
  dois <- c(
    paste0("https://doi.org/10.1000/", c("aaa", "bbb")),
    NA_character_
  )
  data.frame(
    id               = work_ids,
    display_name     = paste("Title", c("One", "Two", "Three")),
    publication_year = c(2021L, 2019L, 2023L),
    so               = paste("Journal", LETTERS[1:3]),
    doi              = dois,
    cited_by_count   = c(10L, 0L, 5L),
    ## The second record deliberately has no abstract.
    ab               = c("Abstract one.", NA_character_, "Abstract three."),
    type             = c("article", "book-chapter", "article"),
    stringsAsFactors = FALSE
  )
}

## ── Column presence ───────────────────────────────────────────────────────────

test_that("read_openalex returns all standard columns", {
  ## Every column of the standard schema must be present in the output;
  ## extra columns are tolerated.
  schema_cols <- c(
    "id", "title", "year", "journal", "doi", "cited_by_count",
    "abstract", "type", "authors", "references", "keywords"
  )
  parsed <- read_openalex(make_synthetic())
  expect_true(all(is.element(schema_cols, names(parsed))))
})

test_that("standard columns appear in the correct order", {
  ## Look up where each schema column sits in the output: every column must
  ## be found, and the positions must already be ascending.
  schema_cols <- c(
    "id", "title", "year", "journal", "doi", "cited_by_count",
    "abstract", "type", "authors", "references", "keywords"
  )
  parsed <- read_openalex(make_synthetic())
  idx    <- match(schema_cols, names(parsed))
  expect_true(!anyNA(idx))
  expect_equal(idx, sort(idx))
})

test_that("read_openalex returns the correct number of rows", {
  ## Three synthetic records in, three rows out.
  parsed <- read_openalex(make_synthetic())
  expect_equal(nrow(parsed), 3L)
})

## ── Types ─────────────────────────────────────────────────────────────────────

test_that("year column is integer", {
  ## Storage type of the standardized `year` column.
  years <- read_openalex(make_synthetic())$year
  expect_type(years, "integer")
})

test_that("cited_by_count is integer", {
  ## Storage type of the standardized citation-count column.
  counts <- read_openalex(make_synthetic())$cited_by_count
  expect_type(counts, "integer")
})

test_that("authors, references, keywords are list-columns", {
  ## The three multi-valued fields must come back as lists, even though the
  ## synthetic input carries no nested columns at all.
  parsed    <- read_openalex(make_synthetic())
  list_cols <- list(parsed$authors, parsed$references, parsed$keywords)
  for (col in list_cols) {
    expect_true(is.list(col))
  }
})

## ── ID handling ───────────────────────────────────────────────────────────────

test_that("id column is preserved verbatim from oa_fetch (no URL stripping)", {
  ## read_openalex() is expected to hand `id` back untouched, so the full
  ## OpenAlex URL form must survive unchanged.
  input  <- make_synthetic()
  parsed <- read_openalex(input)
  expect_equal(parsed$id, input$id)
})

test_that("id falls back to OA-prefixed sequence when id column is absent", {
  ## Regression guard: safe_col() used to push an already length-n default
  ## through rep(default, n), blowing the result up to n*n rows. A length-n
  ## default must now be used as-is, so the row count stays at n.
  input <- make_synthetic()
  input[["id"]] <- NULL
  parsed <- read_openalex(input)
  expect_equal(nrow(parsed), 3L)
  expect_equal(parsed$id, sprintf("OA%d", 1:3))
})

## ── DOI handling ──────────────────────────────────────────────────────────────

test_that("DOI URL prefix is stripped", {
  ## The first two synthetic DOIs carry the "https://doi.org/" prefix; only
  ## the bare DOI should remain.
  dois <- read_openalex(make_synthetic())$doi
  expect_equal(dois[1], "10.1000/aaa")
  expect_equal(dois[2], "10.1000/bbb")
})

test_that("NA doi passes through as NA", {
  ## The third synthetic record has no DOI; it must stay NA after parsing.
  dois <- read_openalex(make_synthetic())$doi
  expect_true(is.na(dois[3]))
})

test_that("doi without URL prefix is preserved unchanged", {
  ## DOIs that are already bare must not be altered by prefix stripping.
  bare_dois <- c("10.1000/plain", NA_character_, "10.5555/xyz")
  input     <- make_synthetic()
  input$doi <- bare_dois
  parsed    <- read_openalex(input)
  expect_equal(parsed$doi[1], "10.1000/plain")
  expect_equal(parsed$doi[3], "10.5555/xyz")
})

## ── Other scalar columns ──────────────────────────────────────────────────────

test_that("title maps from display_name", {
  ## Output `title` is the input `display_name`, value for value.
  input  <- make_synthetic()
  parsed <- read_openalex(input)
  expect_equal(parsed$title, input$display_name)
})

test_that("journal maps from so column", {
  ## Output `journal` is the input `so` (source name), value for value.
  input  <- make_synthetic()
  parsed <- read_openalex(input)
  expect_equal(parsed$journal, input$so)
})

test_that("year values are correct", {
  ## Publication years of the three synthetic records, in input order.
  wanted <- c(2021L, 2019L, 2023L)
  parsed <- read_openalex(make_synthetic())
  expect_equal(parsed$year, wanted)
})

test_that("cited_by_count maps correctly", {
  ## Citation counts of the three synthetic records, in input order.
  wanted <- c(10L, 0L, 5L)
  parsed <- read_openalex(make_synthetic())
  expect_equal(parsed$cited_by_count, wanted)
})

test_that("abstract passes through correctly including NA", {
  ## Records 1 and 3 have abstract text; record 2 has none and must stay NA.
  abstracts <- read_openalex(make_synthetic())$abstract
  expect_equal(abstracts[1], "Abstract one.")
  expect_true(is.na(abstracts[2]))
  expect_equal(abstracts[3], "Abstract three.")
})

test_that("type column is preserved as-is", {
  ## Work types are expected back exactly as supplied.
  wanted <- c("article", "book-chapter", "article")
  parsed <- read_openalex(make_synthetic())
  expect_equal(parsed$type, wanted)
})

## ── Missing optional columns ─────────────────────────────────────────────────

test_that("missing display_name yields NA titles", {
  ## Drop the title source column; the parser should still emit a `title`
  ## column, filled with NA for every record.
  d              <- make_synthetic()
  d$display_name <- NULL
  out            <- read_openalex(d)
  ## Guard against a vacuous pass: all(is.na(NULL)) is TRUE, so without the
  ## length check a completely absent `title` column would go unnoticed.
  expect_length(out$title, nrow(d))
  expect_true(all(is.na(out$title)))
})

test_that("missing so column yields NA journals", {
  ## Drop the journal source column; the parser should still emit a
  ## `journal` column, filled with NA for every record.
  d    <- make_synthetic()
  d$so <- NULL
  out  <- read_openalex(d)
  ## Guard against a vacuous pass: all(is.na(NULL)) is TRUE, so without the
  ## length check a completely absent `journal` column would go unnoticed.
  expect_length(out$journal, nrow(d))
  expect_true(all(is.na(out$journal)))
})

test_that("missing cited_by_count column defaults to 0L", {
  ## Without a source column, every count should fall back to integer zero.
  input <- make_synthetic()
  input[["cited_by_count"]] <- NULL
  counts <- read_openalex(input)$cited_by_count
  expect_true(all(counts == 0L))
  expect_type(counts, "integer")
})

test_that("missing type column yields NA types", {
  ## Drop the work-type source column; the parser should still emit a `type`
  ## column, filled with NA for every record.
  d      <- make_synthetic()
  d$type <- NULL
  out    <- read_openalex(d)
  ## Guard against a vacuous pass: all(is.na(NULL)) is TRUE, so without the
  ## length check a completely absent `type` column would go unnoticed.
  expect_length(out$type, nrow(d))
  expect_true(all(is.na(out$type)))
})

test_that("missing ab column yields NA abstracts", {
  ## Drop the abstract source column; the parser should still emit an
  ## `abstract` column, filled with NA for every record.
  d    <- make_synthetic()
  d$ab <- NULL
  out  <- read_openalex(d)
  ## Guard against a vacuous pass: all(is.na(NULL)) is TRUE, so without the
  ## length check a completely absent `abstract` column would go unnoticed.
  expect_length(out$abstract, nrow(d))
  expect_true(all(is.na(out$abstract)))
})

## ── Authors ───────────────────────────────────────────────────────────────────

test_that("authors are extracted from nested author data.frame (au_display_name)", {
  ## One author data.frame per record, each using the `au_display_name`
  ## column.
  names_per_work <- list(
    c("Alice Smith", "Bob Jones"),
    "Carlos García",
    "Dana Lee"
  )
  input        <- make_synthetic()
  input$author <- lapply(names_per_work, make_author_df)
  parsed       <- read_openalex(input)
  expect_equal(length(parsed$authors[[1]]), 2L)
  ## Names are expected back upper-cased (done by standardize_authors).
  expect_equal(parsed$authors[[1]], c("ALICE SMITH", "BOB JONES"))
  expect_equal(parsed$authors[[2]], "CARLOS GARCÍA")
})

test_that("author display_name column also accepted when au_display_name absent", {
  ## Only the first record has authors, stored under `display_name` rather
  ## than `au_display_name`; the other two cells stay NULL.
  cells      <- vector("list", 3L)
  cells[[1]] <- data.frame(
    display_name     = c("Eve White", "Frank Black"),
    stringsAsFactors = FALSE
  )
  input        <- make_synthetic()
  input$author <- cells
  parsed       <- read_openalex(input)
  expect_equal(parsed$authors[[1]], c("EVE WHITE", "FRANK BLACK"))
})

test_that("au_name column also accepted as fallback for author names", {
  ## Only the first record has an author, stored under `au_name`; the other
  ## two cells stay NULL.
  cells      <- vector("list", 3L)
  cells[[1]] <- data.frame(
    au_name          = c("Grace Hopper"),
    stringsAsFactors = FALSE
  )
  input        <- make_synthetic()
  input$author <- cells
  parsed       <- read_openalex(input)
  expect_equal(parsed$authors[[1]], "GRACE HOPPER")
})

test_that("NULL author cell yields empty character vector", {
  ## Every record has a NULL author cell; each must map to a zero-length
  ## entry in the output list-column.
  d        <- make_synthetic()
  d$author <- list(NULL, NULL, NULL)
  out      <- read_openalex(d)
  ## Guard against a vacuous pass: all() over zero lengths is TRUE, so a
  ## missing `authors` column would otherwise satisfy the check below.
  expect_length(out$authors, nrow(d))
  expect_true(all(lengths(out$authors) == 0L))
})

test_that("non-data.frame author cell yields empty character vector", {
  ## A bare string in place of the expected author data.frame should be
  ## treated as "no authors".
  cells      <- vector("list", 3L)
  cells[[1]] <- "not a data frame"
  input        <- make_synthetic()
  input$author <- cells
  parsed       <- read_openalex(input)
  expect_equal(parsed$authors[[1]], character(0))
})

test_that("author data.frame with no recognized name column yields empty vector", {
  ## The author cell is a data.frame, but none of its columns is one of the
  ## accepted name columns.
  cells      <- vector("list", 3L)
  cells[[1]] <- data.frame(some_other_col = "x", stringsAsFactors = FALSE)
  input        <- make_synthetic()
  input$author <- cells
  parsed       <- read_openalex(input)
  expect_equal(parsed$authors[[1]], character(0))
})

test_that("missing author column yields list of empty character vectors", {
  ## The synthetic input has no `author` column at all; the output must
  ## still hold one (empty) entry per record.
  d   <- make_synthetic()
  out <- read_openalex(d)
  ## Guard against a vacuous pass: all() over zero lengths is TRUE, so a
  ## missing `authors` column would otherwise satisfy the check below.
  expect_length(out$authors, nrow(d))
  expect_true(all(lengths(out$authors) == 0L))
})

test_that("non-ASCII author names are uppercased correctly", {
  ## The accented name is compared against toupper() of the same string, so
  ## the expectation follows whatever the current locale's upper-casing is.
  accented <- "José Martínez"
  input        <- make_synthetic()
  input$author <- list(
    make_author_df(accented),
    make_author_df("Li Wei"),
    NULL
  )
  parsed <- read_openalex(input)
  expect_equal(parsed$authors[[1]], toupper(accented))
  expect_equal(parsed$authors[[2]], "LI WEI")
})

## ── References ────────────────────────────────────────────────────────────────

test_that("referenced_works list-col is flattened to character vectors", {
  ## Three cell shapes: a populated vector, an empty vector, and NULL. The
  ## last two should both come back as character(0).
  cited <- c("https://openalex.org/W999", "https://openalex.org/W888")
  input <- make_synthetic()
  input$referenced_works <- list(cited, character(0), NULL)
  parsed <- read_openalex(input)
  expect_equal(parsed$references[[1]],
               c("https://openalex.org/W999", "https://openalex.org/W888"))
  for (i in 2:3) {
    expect_equal(parsed$references[[i]], character(0))
  }
})

test_that("NULL referenced_works cell yields empty character vector", {
  ## Every record has a NULL referenced_works cell; each must map to a
  ## zero-length entry in the output list-column.
  d                  <- make_synthetic()
  d$referenced_works <- list(NULL, NULL, NULL)
  out                <- read_openalex(d)
  ## Guard against a vacuous pass: all() over zero lengths is TRUE, so a
  ## missing `references` column would otherwise satisfy the check below.
  expect_length(out$references, nrow(d))
  expect_true(all(lengths(out$references) == 0L))
})

test_that("missing referenced_works column yields empty reference lists", {
  ## The synthetic input has no `referenced_works` column at all; the output
  ## must still hold one (empty) entry per record.
  d   <- make_synthetic()
  out <- read_openalex(d)
  ## Guard against a vacuous pass: all() over zero lengths is TRUE, so a
  ## missing `references` column would otherwise satisfy the check below.
  expect_length(out$references, nrow(d))
  expect_true(all(lengths(out$references) == 0L))
})

test_that("referenced_works as scalar character (non-list) is split by comma", {
  ## Parser uses split_field(as.character(...), sep=",") for a non-list column.
  d                  <- make_synthetic()
  d$referenced_works <- c("W1,W2", "W3", NA_character_)
  ## Precondition: assigning a plain character vector already produces a
  ## non-list column, so the parser's non-list branch is the one exercised.
  ## (The previous `class(x) <- "character"` on a character vector changed
  ## nothing.)
  expect_false(is.list(d$referenced_works))
  out <- read_openalex(d)
  expect_equal(out$references[[1]], c("W1", "W2"))
  expect_equal(out$references[[2]], "W3")
})

## ── Keywords (concepts branch) ────────────────────────────────────────────────

test_that("keywords extracted from concepts column (display_name)", {
  ## Concepts supply the keywords: two for record 1, one for record 2, and a
  ## NULL cell for record 3.
  input          <- make_synthetic()
  input$concepts <- list(
    make_concept_df(c("Bibliometrics", "Network Analysis")),
    make_concept_df("Education"),
    NULL
  )
  kw <- read_openalex(input)$keywords
  expect_equal(kw[[1]], c("Bibliometrics", "Network Analysis"))
  expect_equal(kw[[2]], "Education")
  expect_equal(kw[[3]], character(0))
})

test_that("concepts column: concept_name accepted as fallback", {
  ## The concept names live under `concept_name` instead of `display_name`.
  cells      <- vector("list", 3L)
  cells[[1]] <- data.frame(
    concept_name     = c("Topic A", "Topic B"),
    stringsAsFactors = FALSE
  )
  input          <- make_synthetic()
  input$concepts <- cells
  parsed         <- read_openalex(input)
  expect_equal(parsed$keywords[[1]], c("Topic A", "Topic B"))
})

test_that("concepts column: NULL cell yields empty character vector", {
  ## Every record has a NULL concepts cell; each must map to a zero-length
  ## entry in the output `keywords` list-column.
  d          <- make_synthetic()
  d$concepts <- list(NULL, NULL, NULL)
  out        <- read_openalex(d)
  ## Guard against a vacuous pass: all() over zero lengths is TRUE, so a
  ## missing `keywords` column would otherwise satisfy the check below.
  expect_length(out$keywords, nrow(d))
  expect_true(all(lengths(out$keywords) == 0L))
})

test_that("concepts column: non-data.frame cell yields empty character vector", {
  ## A bare string in place of the expected concepts data.frame should be
  ## treated as "no keywords".
  cells      <- vector("list", 3L)
  cells[[1]] <- "not a df"
  input          <- make_synthetic()
  input$concepts <- cells
  parsed         <- read_openalex(input)
  expect_equal(parsed$keywords[[1]], character(0))
})

test_that("concepts column: data.frame with no recognized name column yields empty", {
  ## The concepts cell is a data.frame carrying only a `score` column.
  cells      <- vector("list", 3L)
  cells[[1]] <- data.frame(score = 0.9, stringsAsFactors = FALSE)
  input          <- make_synthetic()
  input$concepts <- cells
  parsed         <- read_openalex(input)
  expect_equal(parsed$keywords[[1]], character(0))
})

## ── Keywords (keywords branch, no concepts column) ───────────────────────────

test_that("keywords extracted from keywords data.frame column (display_name)", {
  ## No concepts column here, so the input `keywords` column is the source:
  ## two keywords for record 1, one for record 2, NULL for record 3.
  input          <- make_synthetic()
  input$keywords <- list(
    make_keyword_df(c("Machine Learning", "Citation Analysis")),
    make_keyword_df("Open Access"),
    NULL
  )
  kw <- read_openalex(input)$keywords
  expect_equal(kw[[1]], c("Machine Learning", "Citation Analysis"))
  expect_equal(kw[[2]], "Open Access")
  expect_equal(kw[[3]], character(0))
})

test_that("keywords data.frame: keyword column accepted as fallback", {
  ## The keyword strings live under `keyword` instead of `display_name`.
  cells      <- vector("list", 3L)
  cells[[1]] <- data.frame(
    keyword          = c("scientometrics", "altmetrics"),
    stringsAsFactors = FALSE
  )
  input          <- make_synthetic()
  input$keywords <- cells
  parsed         <- read_openalex(input)
  expect_equal(parsed$keywords[[1]], c("scientometrics", "altmetrics"))
})

test_that("keywords as plain character vector (non-data.frame) is accepted", {
  ## The keywords cell is a bare character vector rather than a data.frame.
  cells      <- vector("list", 3L)
  cells[[1]] <- c("keyword one", "keyword two")
  input          <- make_synthetic()
  input$keywords <- cells
  parsed         <- read_openalex(input)
  expect_equal(parsed$keywords[[1]], c("keyword one", "keyword two"))
})

test_that("keywords column: NULL cell yields empty character vector", {
  ## Every record has a NULL keywords cell; each must map to a zero-length
  ## entry in the output `keywords` list-column.
  d          <- make_synthetic()
  d$keywords <- list(NULL, NULL, NULL)
  out        <- read_openalex(d)
  ## Guard against a vacuous pass: all() over zero lengths is TRUE, so a
  ## missing `keywords` column would otherwise satisfy the check below.
  expect_length(out$keywords, nrow(d))
  expect_true(all(lengths(out$keywords) == 0L))
})

test_that("keywords column: data.frame with no recognized name yields empty", {
  ## The keywords cell is a data.frame carrying only a `score` column.
  cells      <- vector("list", 3L)
  cells[[1]] <- data.frame(score = 0.5, stringsAsFactors = FALSE)
  input          <- make_synthetic()
  input$keywords <- cells
  parsed         <- read_openalex(input)
  expect_equal(parsed$keywords[[1]], character(0))
})

test_that("concepts takes precedence over keywords when both columns present", {
  ## Both sources are populated for record 1 with distinguishable values.
  concept_cells      <- vector("list", 3L)
  concept_cells[[1]] <- make_concept_df("From Concepts")
  keyword_cells      <- vector("list", 3L)
  keyword_cells[[1]] <- make_keyword_df("From Keywords")

  input          <- make_synthetic()
  input$concepts <- concept_cells
  input$keywords <- keyword_cells
  parsed         <- read_openalex(input)
  ## The parser checks concepts first, so its value is the one expected.
  expect_equal(parsed$keywords[[1]], "From Concepts")
})

test_that("missing both concepts and keywords yields empty keyword lists", {
  ## The synthetic input has neither a `concepts` nor a `keywords` column;
  ## the output must still hold one (empty) entry per record.
  d   <- make_synthetic()
  out <- read_openalex(d)
  ## Guard against a vacuous pass: all() over zero lengths is TRUE, so a
  ## missing `keywords` column would otherwise satisfy the check below.
  expect_length(out$keywords, nrow(d))
  expect_true(all(lengths(out$keywords) == 0L))
})

## ── Edge cases ────────────────────────────────────────────────────────────────

test_that("empty input data frame returns zero-row result with correct columns", {
  ## A zero-row slice keeps all input columns; the output should have no
  ## rows but still the full set of standard columns.
  schema_cols <- c(
    "id", "title", "year", "journal", "doi", "cited_by_count",
    "abstract", "type", "authors", "references", "keywords"
  )
  template <- make_synthetic()
  parsed   <- read_openalex(template[0, ])
  expect_equal(nrow(parsed), 0L)
  expect_true(all(is.element(schema_cols, names(parsed))))
})

test_that("non-data.frame input raises an error", {
  ## A plain list, a string and an integer are all invalid inputs; each one
  ## must make read_openalex() signal an error.
  bad_inputs <- list(
    list(a = 1, b = 2),
    "not a data frame",
    42L
  )
  for (bad in bad_inputs) {
    expect_error(read_openalex(bad))
  }
})

test_that("single-row input works correctly", {
  ## A one-record input must stay one row and still get DOI stripping and an
  ## intact year.
  solo <- data.frame(
    id               = "https://openalex.org/W9999",
    display_name     = "Solo Paper",
    publication_year = 2020L,
    so               = "Solo Journal",
    doi              = "https://doi.org/10.9999/solo",
    cited_by_count   = 3L,
    ab               = "Solo abstract.",
    type             = "article",
    stringsAsFactors = FALSE
  )
  parsed <- read_openalex(solo)
  expect_equal(nrow(parsed), 1L)
  expect_equal(parsed$doi, "10.9999/solo")
  expect_equal(parsed$year, 2020L)
})

test_that("publication_year as character is coerced to integer", {
  ## Years supplied as strings should come back as integers with the same
  ## values.
  input <- make_synthetic()
  input$publication_year <- c("2021", "2019", "2023")
  years <- read_openalex(input)$year
  expect_type(years, "integer")
  expect_equal(years, c(2021L, 2019L, 2023L))
})

test_that("all columns may contain NA without error", {
  ## A single record in which every scalar field is missing: parsing must
  ## not error, and the record must be kept.
  na_chr <- NA_character_
  na_int <- NA_integer_
  blank  <- data.frame(
    id               = na_chr,
    display_name     = na_chr,
    publication_year = na_int,
    so               = na_chr,
    doi              = na_chr,
    cited_by_count   = na_int,
    ab               = na_chr,
    type             = na_chr,
    stringsAsFactors = FALSE
  )
  expect_no_error(read_openalex(blank))
  parsed <- read_openalex(blank)
  expect_equal(nrow(parsed), 1L)
})

test_that("result is a plain data.frame (not tibble or other subclass)", {
  d   <- make_synthetic()
  out <- read_openalex(d)
  expect_true(is.data.frame(out))
  ## is.data.frame() is also TRUE for tibbles and other subclasses, so the
  ## class vector must be exactly "data.frame" to back up the "plain" claim.
  expect_identical(class(out), "data.frame")
})