# Checks that build_gemini_batch_requests() yields one request per pair: a
# tibble with custom_id / request columns whose first request carries a user
# message that mentions both sample labels.
testthat::test_that("build_gemini_batch_requests builds valid requests", {
  # data() assigns into the global environment by default; load the example
  # data into this test's own environment so nothing leaks between tests.
  data(
    "example_writing_samples",
    package = "pairwiseLLM",
    envir = environment()
  )

  pairs <- make_pairs(example_writing_samples)
  pairs <- pairs[1:2, ]

  td <- trait_description("overall_quality")
  tmpl <- set_prompt_template()

  batch <- build_gemini_batch_requests(
    pairs             = pairs,
    model             = "gemini-3-pro-preview",
    trait_name        = td$name,
    trait_description = td$description,
    prompt_template   = tmpl,
    thinking_level    = "low"
  )

  # One row per input pair, with at least the custom_id and request columns
  testthat::expect_s3_class(batch, "tbl_df")
  testthat::expect_equal(nrow(batch), 2L)
  testthat::expect_true(all(c("custom_id", "request") %in% names(batch)))

  # Basic structure checks on first request
  r1 <- batch$request[[1]]
  testthat::expect_true(is.list(r1$contents))
  testthat::expect_true(is.list(r1$generationConfig))

  # User message should contain SAMPLE_1 / SAMPLE_2 labels in the text
  msg1 <- r1$contents[[1]]
  testthat::expect_equal(msg1$role, "user")
  parts <- msg1$parts
  testthat::expect_true(is.list(parts))
  text_block <- parts[[1]]$text

  # Only require that the labels SAMPLE_1 / SAMPLE_2 appear somewhere,
  # not necessarily wrapped in angle brackets.
  testthat::expect_true(grepl("SAMPLE_1", text_block, fixed = TRUE))
  testthat::expect_true(grepl("SAMPLE_2", text_block, fixed = TRUE))
})

# Writes a two-line JSONL results file (one succeeded, one errored record) and
# checks that parse_gemini_batch_output() returns one row per line with the
# expected fields. The description is kept on a single line: a string literal
# that wraps across lines embeds the newline and indentation in the test name.
testthat::test_that(
  "parse_gemini_batch_output handles succeeded and errored results",
  {
    tmp <- tempfile(fileext = ".jsonl")
    on.exit(unlink(tmp), add = TRUE)

    # Succeeded result line, similar in spirit to live responses
    succ_resp <- list(
      model = "gemini-3-pro-preview",
      candidates = list(
        list(
          content = list(
            parts = list(
              list(
                text = "<BETTER_SAMPLE>SAMPLE_2</BETTER_SAMPLE> Hello!"
              )
            )
          )
        )
      ),
      usageMetadata = list(
        promptTokenCount     = 10L,
        candidatesTokenCount = 5L,
        totalTokenCount      = 15L
      )
    )

    line_ok <- list(
      custom_id = "GEM_S01_vs_S02",
      result = list(
        type     = "succeeded",
        response = succ_resp
      )
    )

    # Errored result line
    line_err <- list(
      custom_id = "GEM_S03_vs_S04",
      result = list(
        type = "errored",
        error = list(
          code    = 400L,
          message = "Validation error",
          status  = "INVALID_ARGUMENT"
        )
      )
    )

    # One JSON document per line; auto_unbox keeps scalars as JSON scalars
    json_lines <- c(
      jsonlite::toJSON(line_ok, auto_unbox = TRUE),
      jsonlite::toJSON(line_err, auto_unbox = TRUE)
    )
    writeLines(json_lines, con = tmp, useBytes = TRUE)

    # parse_gemini_batch_output() expects a requests_tbl with
    # custom_id / ID1 / ID2 in the same order as the requests.
    requests_tbl <- tibble::tibble(
      custom_id = c("GEM_S01_vs_S02", "GEM_S03_vs_S04"),
      ID1       = c("S01", "S03"),
      ID2       = c("S02", "S04")
    )

    res <- parse_gemini_batch_output(
      results_path = tmp,
      requests_tbl = requests_tbl
    )

    testthat::expect_s3_class(res, "tbl_df")
    testthat::expect_equal(nrow(res), 2L)

    # First row: succeeded
    r1 <- res[1, ]
    testthat::expect_equal(r1$custom_id, "GEM_S01_vs_S02")
    testthat::expect_equal(r1$ID1, "S01")
    testthat::expect_equal(r1$ID2, "S02")
    testthat::expect_equal(r1$result_type, "succeeded")
    testthat::expect_equal(r1$status_code, 200L)
    testthat::expect_true(is.na(r1$error_message))
    testthat::expect_equal(r1$model, "gemini-3-pro-preview")
    testthat::expect_equal(r1$better_sample, "SAMPLE_2")
    testthat::expect_equal(r1$better_id, "S02")
    testthat::expect_equal(r1$prompt_tokens, 10)
    testthat::expect_equal(r1$completion_tokens, 5)
    testthat::expect_equal(r1$total_tokens, 15)

    # Second row: errored
    r2 <- res[2, ]
    testthat::expect_equal(r2$custom_id, "GEM_S03_vs_S04")
    testthat::expect_equal(r2$ID1, "S03")
    testthat::expect_equal(r2$ID2, "S04")
    testthat::expect_equal(r2$result_type, "errored")
    testthat::expect_true(is.na(r2$status_code))
    testthat::expect_match(r2$error_message, "Validation error")
    testthat::expect_true(is.na(r2$content))
    testthat::expect_true(is.na(r2$better_sample))
    testthat::expect_true(is.na(r2$better_id))
  }
)

# Feeds parse_gemini_batch_output() a file whose only line is not JSON and
# checks that it returns a single row with an NA custom_id and a parse-failure
# message. The description is kept on a single line: a string literal that
# wraps across lines embeds the newline and indentation in the test name.
testthat::test_that(
  "parse_gemini_batch_output handles invalid JSON lines gracefully",
  {
    tmp <- tempfile(fileext = ".jsonl")
    on.exit(unlink(tmp), add = TRUE)

    writeLines("not-json", con = tmp, useBytes = TRUE)

    # A requests_tbl must still be passed for invalid JSON; IDs are dummies.
    requests_tbl <- tibble::tibble(
      custom_id = "GEM_S01_vs_S02",
      ID1       = "S01",
      ID2       = "S02"
    )

    res <- parse_gemini_batch_output(
      results_path = tmp,
      requests_tbl = requests_tbl
    )

    testthat::expect_equal(nrow(res), 1L)
    testthat::expect_true(is.na(res$custom_id))
    testthat::expect_match(res$error_message, "Failed to parse JSON line")
  }
)

# Runs run_gemini_batch_pipeline() with every collaborator mocked and checks
# that the batch is created, polled, downloaded and parsed, and that the
# returned list has the standard shape. The description is kept on a single
# line: a string literal that wraps across lines embeds the newline and
# indentation in the test name.
testthat::test_that(
  "run_gemini_batch_pipeline works with polling and parsing (mocked)",
  {
    pairs <- tibble::tibble(
      ID1   = "S01",
      text1 = "Text 1",
      ID2   = "S02",
      text2 = "Text 2"
    )

    # requests_tbl shape: custom_id + ID1 + ID2 + request
    fake_req_tbl <- tibble::tibble(
      custom_id = "GEM_S01_vs_S02",
      ID1       = "S01",
      ID2       = "S02",
      request   = list(list(dummy = TRUE))
    )

    # Batch object as returned by the create mock (still running)
    fake_batch_initial <- list(
      name     = "batches/123",
      metadata = list(state = "JOB_STATE_RUNNING")
    )

    # Batch object as returned by the polling mock (finished)
    fake_batch_final <- list(
      name     = "batches/123",
      metadata = list(state = "JOB_STATE_SUCCEEDED")
    )

    fake_results <- tibble::tibble(
      custom_id         = "GEM_S01_vs_S02",
      ID1               = "S01",
      ID2               = "S02",
      model             = "gemini-3-pro-preview",
      object_type       = "generateContent",
      status_code       = 200L,
      result_type       = "succeeded",
      error_message     = NA_character_,
      content           = "<BETTER_SAMPLE>SAMPLE_1</BETTER_SAMPLE>",
      better_sample     = "SAMPLE_1",
      better_id         = "S01",
      prompt_tokens     = 10,
      completion_tokens = 5,
      total_tokens      = 15
    )

    # Filled in by the mocks (via <<-) so the test can inspect what each
    # stage received.
    created_batch_name <- NULL
    polled_batch_name <- NULL
    download_batch_obj <- NULL
    parsed_path <- NULL

    td <- list(name = "Overall quality", description = "Quality")
    tmpl <- set_prompt_template()

    testthat::with_mocked_bindings(
      build_gemini_batch_requests = function(pairs, model, trait_name,
                                             trait_description,
                                             prompt_template, thinking_level,
                                             ...) {
        fake_req_tbl
      },
      gemini_create_batch = function(requests, model, api_key, api_version,
                                     display_name = NULL) {
        created_batch_name <<- "batches/123"
        fake_batch_initial
      },
      gemini_poll_batch_until_complete = function(batch_name,
                                                  interval_seconds,
                                                  timeout_seconds, api_key,
                                                  api_version, verbose) {
        polled_batch_name <<- batch_name
        fake_batch_final
      },
      gemini_download_batch_results = function(batch, requests_tbl,
                                               output_path, api_key,
                                               api_version) {
        download_batch_obj <<- batch
        # Write a dummy .jsonl file so that parse_* can read it
        writeLines('{"dummy": true}', con = output_path)
        invisible(output_path)
      },
      # Signature: parse_gemini_batch_output(results_path, requests_tbl)
      parse_gemini_batch_output = function(results_path, requests_tbl) {
        parsed_path <<- results_path
        fake_results
      },
      {
        res <- run_gemini_batch_pipeline(
          pairs             = pairs,
          model             = "gemini-3-pro-preview",
          trait_name        = td$name,
          trait_description = td$description,
          prompt_template   = tmpl,
          thinking_level    = "low",
          interval_seconds  = 0,
          timeout_seconds   = 10,
          verbose           = FALSE
        )

        testthat::expect_equal(created_batch_name, "batches/123")
        testthat::expect_equal(polled_batch_name, "batches/123")
        testthat::expect_identical(download_batch_obj, fake_batch_final)
        testthat::expect_true(file.exists(res$batch_input_path))
        testthat::expect_true(file.exists(res$batch_output_path))
        testthat::expect_true(file.exists(parsed_path))

        # Return structure should mirror other batch pipelines
        testthat::expect_true(all(c(
          "batch_input_path", "batch_output_path", "file", "batch", "results"
        ) %in% names(res)))

        testthat::expect_null(res$file)
        testthat::expect_equal(
          res$batch$metadata$state,
          "JOB_STATE_SUCCEEDED"
        )
        testthat::expect_equal(res$results$better_id, "S01")
      }
    )
  }
)

# Runs run_gemini_batch_pipeline() with poll = FALSE and mocks that fail if
# polling, download or parsing is attempted; checks the standard return shape
# with no output path or results. The description is kept on a single line: a
# string literal that wraps across lines embeds the newline and indentation in
# the test name.
testthat::test_that(
  "run_gemini_batch_pipeline does not poll or parse when poll = FALSE",
  {
    pairs <- tibble::tibble(
      ID1   = "S01",
      text1 = "Text 1",
      ID2   = "S02",
      text2 = "Text 2"
    )

    fake_req_tbl <- tibble::tibble(
      custom_id = "GEM_S01_vs_S02",
      ID1       = "S01",
      ID2       = "S02",
      request   = list(list(dummy = TRUE))
    )

    fake_batch_initial <- list(
      name     = "batches/123",
      metadata = list(state = "JOB_STATE_RUNNING")
    )

    # Flipped by the mocks (via <<-) if the corresponding stage is reached
    poll_called <- FALSE
    download_called <- FALSE
    parse_called <- FALSE

    td <- list(name = "Overall quality", description = "Quality")
    tmpl <- set_prompt_template()

    testthat::with_mocked_bindings(
      build_gemini_batch_requests = function(pairs, model, trait_name,
                                             trait_description,
                                             prompt_template, thinking_level,
                                             ...) {
        fake_req_tbl
      },
      gemini_create_batch = function(requests, model, api_key, api_version,
                                     display_name = NULL) {
        fake_batch_initial
      },
      gemini_poll_batch_until_complete = function(batch_name,
                                                  interval_seconds,
                                                  timeout_seconds, api_key,
                                                  api_version, verbose) {
        poll_called <<- TRUE
        stop("Polling should not be called when poll = FALSE")
      },
      gemini_download_batch_results = function(batch, requests_tbl,
                                               output_path, api_key,
                                               api_version) {
        download_called <<- TRUE
        stop("Download should not be called when poll = FALSE")
      },
      # Signature: parse_gemini_batch_output(results_path, requests_tbl)
      parse_gemini_batch_output = function(results_path, requests_tbl) {
        parse_called <<- TRUE
        stop("Parse should not be called when poll = FALSE")
      },
      {
        res <- run_gemini_batch_pipeline(
          pairs             = pairs,
          model             = "gemini-3-pro-preview",
          trait_name        = td$name,
          trait_description = td$description,
          prompt_template   = tmpl,
          thinking_level    = "low",
          poll              = FALSE
        )

        testthat::expect_false(poll_called)
        testthat::expect_false(download_called)
        testthat::expect_false(parse_called)

        testthat::expect_true(file.exists(res$batch_input_path))
        testthat::expect_null(res$batch_output_path)
        testthat::expect_null(res$results)

        # Standardised return structure
        testthat::expect_true(all(c(
          "batch_input_path", "batch_output_path", "file", "batch", "results"
        ) %in% names(res)))
        testthat::expect_null(res$file)
        testthat::expect_equal(res$batch$metadata$state, "JOB_STATE_RUNNING")
      }
    )
  }
)

# tests/testthat/test-gemini-batch-api.R

# ------------------------------------------------------------------------------
# build_gemini_batch_requests
# ------------------------------------------------------------------------------

# Checks that build_gemini_batch_requests() yields one request per pair: a
# tibble with custom_id / request columns whose first request carries a user
# message that mentions both sample labels.
testthat::test_that("build_gemini_batch_requests builds valid requests", {
  # data() assigns into the global environment by default; load the example
  # data into this test's own environment so nothing leaks between tests.
  data(
    "example_writing_samples",
    package = "pairwiseLLM",
    envir = environment()
  )

  pairs <- make_pairs(example_writing_samples)
  pairs <- pairs[1:2, ]

  td <- trait_description("overall_quality")
  tmpl <- set_prompt_template()

  batch <- build_gemini_batch_requests(
    pairs             = pairs,
    model             = "gemini-3-pro-preview",
    trait_name        = td$name,
    trait_description = td$description,
    prompt_template   = tmpl,
    thinking_level    = "low"
  )

  # One row per input pair, with at least the custom_id and request columns
  testthat::expect_s3_class(batch, "tbl_df")
  testthat::expect_equal(nrow(batch), 2L)
  testthat::expect_true(all(c("custom_id", "request") %in% names(batch)))

  # Basic structure checks on first request
  r1 <- batch$request[[1]]
  testthat::expect_true(is.list(r1$contents))
  testthat::expect_true(is.list(r1$generationConfig))

  # User message should contain SAMPLE_1 / SAMPLE_2 labels in the text
  msg1 <- r1$contents[[1]]
  testthat::expect_equal(msg1$role, "user")
  parts <- msg1$parts
  testthat::expect_true(is.list(parts))
  text_block <- parts[[1]]$text

  # Only require that the labels SAMPLE_1 / SAMPLE_2 appear somewhere,
  # not necessarily wrapped in angle brackets.
  testthat::expect_true(grepl("SAMPLE_1", text_block, fixed = TRUE))
  testthat::expect_true(grepl("SAMPLE_2", text_block, fixed = TRUE))
})

# ------------------------------------------------------------------------------
# parse_gemini_batch_output: normal + error-shaped lines
# ------------------------------------------------------------------------------

# A results file with one succeeded and one errored record should parse into
# two rows whose fields reflect each record.
testthat::test_that(
  "parse_gemini_batch_output handles succeeded and errored results",
  {
    results_file <- tempfile(fileext = ".jsonl")
    on.exit(unlink(results_file), add = TRUE)

    # --- Record 1: a succeeded response, shaped like a live reply ----------
    answer_part <- list(
      text = "<BETTER_SAMPLE>SAMPLE_2</BETTER_SAMPLE> Hello!"
    )
    candidate <- list(content = list(parts = list(answer_part)))
    token_usage <- list(
      promptTokenCount     = 10L,
      candidatesTokenCount = 5L,
      totalTokenCount      = 15L
    )
    ok_record <- list(
      custom_id = "GEM_S01_vs_S02",
      result = list(
        type     = "succeeded",
        response = list(
          model         = "gemini-3-pro-preview",
          candidates    = list(candidate),
          usageMetadata = token_usage
        )
      )
    )

    # --- Record 2: an errored result ----------------------------------------
    api_error <- list(
      code    = 400L,
      message = "Validation error",
      status  = "INVALID_ARGUMENT"
    )
    failed_record <- list(
      custom_id = "GEM_S03_vs_S04",
      result = list(type = "errored", error = api_error)
    )

    # One JSON document per line of the results file
    writeLines(
      c(
        jsonlite::toJSON(ok_record, auto_unbox = TRUE),
        jsonlite::toJSON(failed_record, auto_unbox = TRUE)
      ),
      con = results_file,
      useBytes = TRUE
    )

    # The parser needs the custom_id / ID1 / ID2 lookup for the requests
    id_lookup <- tibble::tibble(
      custom_id = c("GEM_S01_vs_S02", "GEM_S03_vs_S04"),
      ID1       = c("S01", "S03"),
      ID2       = c("S02", "S04")
    )

    parsed <- parse_gemini_batch_output(
      results_path = results_file,
      requests_tbl = id_lookup
    )

    testthat::expect_s3_class(parsed, "tbl_df")
    testthat::expect_equal(nrow(parsed), 2L)

    # Row for the succeeded record
    ok_row <- parsed[1, ]
    testthat::expect_equal(ok_row$custom_id, "GEM_S01_vs_S02")
    testthat::expect_equal(ok_row$ID1, "S01")
    testthat::expect_equal(ok_row$ID2, "S02")
    testthat::expect_equal(ok_row$result_type, "succeeded")
    testthat::expect_equal(ok_row$status_code, 200L)
    testthat::expect_true(is.na(ok_row$error_message))
    testthat::expect_equal(ok_row$model, "gemini-3-pro-preview")
    testthat::expect_equal(ok_row$better_sample, "SAMPLE_2")
    testthat::expect_equal(ok_row$better_id, "S02")
    testthat::expect_equal(ok_row$prompt_tokens, 10)
    testthat::expect_equal(ok_row$completion_tokens, 5)
    testthat::expect_equal(ok_row$total_tokens, 15)

    # Row for the errored record
    failed_row <- parsed[2, ]
    testthat::expect_equal(failed_row$custom_id, "GEM_S03_vs_S04")
    testthat::expect_equal(failed_row$ID1, "S03")
    testthat::expect_equal(failed_row$ID2, "S04")
    testthat::expect_equal(failed_row$result_type, "errored")
    testthat::expect_true(is.na(failed_row$status_code))
    testthat::expect_match(failed_row$error_message, "Validation error")
    testthat::expect_true(is.na(failed_row$content))
    testthat::expect_true(is.na(failed_row$better_sample))
    testthat::expect_true(is.na(failed_row$better_id))
  }
)

# A line that is not JSON should come back as a single row carrying a
# parse-failure message rather than raising an error.
testthat::test_that(
  "parse_gemini_batch_output handles invalid JSON lines gracefully",
  {
    garbage_file <- tempfile(fileext = ".jsonl")
    on.exit(unlink(garbage_file), add = TRUE)
    writeLines("not-json", con = garbage_file, useBytes = TRUE)

    # requests_tbl is still a required argument; these IDs are placeholders.
    placeholder_ids <- tibble::tibble(
      custom_id = "GEM_S01_vs_S02",
      ID1       = "S01",
      ID2       = "S02"
    )

    parsed <- parse_gemini_batch_output(
      results_path = garbage_file,
      requests_tbl = placeholder_ids
    )

    testthat::expect_equal(nrow(parsed), 1L)
    testthat::expect_true(is.na(parsed$custom_id))
    testthat::expect_match(parsed$error_message, "Failed to parse JSON line")
  }
)

# ------------------------------------------------------------------------------
# gemini_download_batch_results: validations + mismatch warning + JSONL writing
# ------------------------------------------------------------------------------

# A requests_tbl without a custom_id column should be rejected with an error
# that mentions both requests_tbl and custom_id.
testthat::test_that(
  "gemini_download_batch_results validates requests_tbl structure",
  {
    out_file <- tempfile(fileext = ".jsonl")
    on.exit(unlink(out_file), add = TRUE)

    # Batch with inlined responses present, so only requests_tbl is at fault
    batch_obj <- list(response = list(inlinedResponses = data.frame(x = 1)))
    no_custom_id <- tibble::tibble(id = 1:2)

    testthat::expect_error(
      gemini_download_batch_results(
        batch        = batch_obj,
        requests_tbl = no_custom_id,
        output_path  = out_file,
        api_key      = "TEST_KEY",
        api_version  = "v1beta"
      ),
      "requests_tbl.*custom_id",
      fixed = FALSE
    )
  }
)

# A batch whose response$inlinedResponses is NULL should trigger the
# "does not contain" error.
testthat::test_that(
  "gemini_download_batch_results errors when inline responses are missing",
  {
    out_file <- tempfile(fileext = ".jsonl")
    on.exit(unlink(out_file), add = TRUE)

    empty_batch <- list(response = list(inlinedResponses = NULL))
    one_request <- tibble::tibble(custom_id = "id1")

    testthat::expect_error(
      gemini_download_batch_results(
        batch        = empty_batch,
        requests_tbl = one_request,
        output_path  = out_file,
        api_key      = "TEST_KEY",
        api_version  = "v1beta"
      ),
      "Batch does not contain response\\$inlinedResponses\\$inlinedResponses",
      fixed = FALSE
    )
  }
)

# With three requests but only two inlined responses, the download should
# warn about the count mismatch and still write a two-line JSONL file.
testthat::test_that(
  "gemini_download_batch_results writes JSONL and warns on count mismatch",
  {
    jsonl_file <- tempfile(fileext = ".jsonl")
    on.exit(unlink(jsonl_file), add = TRUE)

    # inlinedResponses supplied directly as a data.frame of two responses
    two_responses <- data.frame(score = c(1, 2), stringsAsFactors = FALSE)
    batch_obj <- list(response = list(inlinedResponses = two_responses))

    # Three requests against two responses -> mismatch
    three_requests <- tibble::tibble(custom_id = c("req1", "req2", "req3"))

    testthat::expect_warning(
      {
        written_path <- gemini_download_batch_results(
          batch        = batch_obj,
          requests_tbl = three_requests,
          output_path  = jsonl_file,
          api_key      = "TEST_KEY",
          api_version  = "v1beta"
        )

        testthat::expect_true(file.exists(written_path))
        written <- readLines(written_path, warn = FALSE, encoding = "UTF-8")
        # min(3, 2) = 2 lines are expected in the output
        testthat::expect_equal(length(written), 2L)
      },
      "Number of inlined responses",
      fixed = FALSE
    )
  }
)

# ------------------------------------------------------------------------------
# gemini_poll_batch_until_complete: validation + timeout branch
# ------------------------------------------------------------------------------

# An empty batch name should be rejected with the exact validation message.
testthat::test_that(
  "gemini_poll_batch_until_complete validates batch_name",
  {
    empty_name <- ""
    expected_msg <- "`batch_name` must be a non-empty character scalar."

    testthat::expect_error(
      gemini_poll_batch_until_complete(
        batch_name       = empty_name,
        interval_seconds = 0,
        timeout_seconds  = 1,
        api_key          = "TEST_KEY",
        api_version      = "v1beta",
        verbose          = FALSE
      ),
      expected_msg,
      fixed = TRUE
    )
  }
)

# When the batch never leaves a running state, polling should give up with a
# timeout warning and hand back the most recent batch object.
testthat::test_that(
  "gemini_poll_batch_until_complete respects timeout_seconds and returns last batch",
  {
    running_state <- "BATCH_STATE_RUNNING"

    # Stub for gemini_get_batch that always reports a non-terminal state, so
    # the timeout path is the only way out of the polling loop.
    never_finishes <- function(batch_name, api_key, api_version = "v1beta") {
      list(
        name     = batch_name,
        metadata = list(state = running_state)
      )
    }

    testthat::with_mocked_bindings(
      gemini_get_batch = never_finishes,
      {
        testthat::expect_warning(
          {
            last_batch <- gemini_poll_batch_until_complete(
              batch_name       = "batches/timeout",
              interval_seconds = 0,
              timeout_seconds  = 0,
              api_key          = "TEST_KEY",
              api_version      = "v1beta",
              verbose          = TRUE
            )

            testthat::expect_type(last_batch, "list")
            testthat::expect_equal(last_batch$metadata$state, running_state)
          },
          "Timeout reached while waiting for Gemini batch to complete",
          fixed = FALSE
        )
      }
    )
  }
)

# ------------------------------------------------------------------------------
# run_gemini_batch_pipeline (mocked end-to-end)
# ------------------------------------------------------------------------------

# End-to-end run of run_gemini_batch_pipeline() with all collaborators
# replaced by stubs: build -> create -> poll -> download -> parse.
testthat::test_that(
  "run_gemini_batch_pipeline works with polling and parsing (mocked)",
  {
    batch_id <- "batches/123"
    pair_id <- "GEM_S01_vs_S02"

    one_pair <- tibble::tibble(
      ID1   = "S01",
      text1 = "Text 1",
      ID2   = "S02",
      text2 = "Text 2"
    )

    # Requests table shape: custom_id + ID1 + ID2 + request
    stub_requests <- tibble::tibble(
      custom_id = pair_id,
      ID1       = "S01",
      ID2       = "S02",
      request   = list(list(dummy = TRUE))
    )

    running_batch <- list(
      name     = batch_id,
      metadata = list(state = "JOB_STATE_RUNNING")
    )
    finished_batch <- list(
      name     = batch_id,
      metadata = list(state = "JOB_STATE_SUCCEEDED")
    )

    stub_results <- tibble::tibble(
      custom_id         = pair_id,
      ID1               = "S01",
      ID2               = "S02",
      model             = "gemini-3-pro-preview",
      object_type       = "generateContent",
      status_code       = 200L,
      result_type       = "succeeded",
      error_message     = NA_character_,
      content           = "<BETTER_SAMPLE>SAMPLE_1</BETTER_SAMPLE>",
      better_sample     = "SAMPLE_1",
      better_id         = "S01",
      prompt_tokens     = 10,
      completion_tokens = 5,
      total_tokens      = 15
    )

    # The stubs record what they were given in this environment; fields that
    # are never written read back as NULL.
    seen <- new.env(parent = emptyenv())

    trait <- list(name = "Overall quality", description = "Quality")
    template <- set_prompt_template()

    testthat::with_mocked_bindings(
      build_gemini_batch_requests = function(pairs, model, trait_name,
                                             trait_description,
                                             prompt_template, thinking_level,
                                             ...) {
        stub_requests
      },
      gemini_create_batch = function(requests, model, api_key, api_version,
                                     display_name = NULL) {
        seen$created_name <- batch_id
        running_batch
      },
      gemini_poll_batch_until_complete = function(batch_name, interval_seconds,
                                                  timeout_seconds, api_key,
                                                  api_version, verbose) {
        seen$polled_name <- batch_name
        finished_batch
      },
      gemini_download_batch_results = function(batch, requests_tbl, output_path,
                                               api_key, api_version) {
        seen$downloaded_batch <- batch
        # Leave a placeholder .jsonl on disk for the parse step to find
        writeLines('{"dummy": true}', con = output_path)
        invisible(output_path)
      },
      parse_gemini_batch_output = function(results_path, requests_tbl) {
        seen$parsed_path <- results_path
        stub_results
      },
      {
        out <- run_gemini_batch_pipeline(
          pairs             = one_pair,
          model             = "gemini-3-pro-preview",
          trait_name        = trait$name,
          trait_description = trait$description,
          prompt_template   = template,
          thinking_level    = "low",
          interval_seconds  = 0,
          timeout_seconds   = 10,
          verbose           = FALSE
        )

        # Each stage saw the expected input
        testthat::expect_equal(seen$created_name, "batches/123")
        testthat::expect_equal(seen$polled_name, "batches/123")
        testthat::expect_identical(seen$downloaded_batch, finished_batch)
        testthat::expect_true(file.exists(out$batch_input_path))
        testthat::expect_true(file.exists(out$batch_output_path))
        testthat::expect_true(file.exists(seen$parsed_path))

        # Standardised return structure
        expected_fields <- c(
          "batch_input_path", "batch_output_path", "file", "batch", "results"
        )
        testthat::expect_true(all(expected_fields %in% names(out)))

        testthat::expect_null(out$file)
        testthat::expect_equal(out$batch$metadata$state, "JOB_STATE_SUCCEEDED")
        testthat::expect_equal(out$results$better_id, "S01")
      }
    )
  }
)

# With poll = FALSE the pipeline should stop after creating the batch: the
# poll, download and parse stubs all fail loudly if they are ever reached.
testthat::test_that(
  "run_gemini_batch_pipeline does not poll or parse when poll = FALSE",
  {
    one_pair <- tibble::tibble(
      ID1   = "S01",
      text1 = "Text 1",
      ID2   = "S02",
      text2 = "Text 2"
    )

    stub_requests <- tibble::tibble(
      custom_id = "GEM_S01_vs_S02",
      ID1       = "S01",
      ID2       = "S02",
      request   = list(list(dummy = TRUE))
    )

    running_batch <- list(
      name     = "batches/123",
      metadata = list(state = "JOB_STATE_RUNNING")
    )

    # Call flags live in an environment so the stubs can flip them in place
    reached <- new.env(parent = emptyenv())
    reached$poll <- FALSE
    reached$download <- FALSE
    reached$parse <- FALSE

    trait <- list(name = "Overall quality", description = "Quality")
    template <- set_prompt_template()

    testthat::with_mocked_bindings(
      build_gemini_batch_requests = function(pairs, model, trait_name,
                                             trait_description,
                                             prompt_template, thinking_level,
                                             ...) {
        stub_requests
      },
      gemini_create_batch = function(requests, model, api_key, api_version,
                                     display_name = NULL) {
        running_batch
      },
      gemini_poll_batch_until_complete = function(batch_name, interval_seconds,
                                                  timeout_seconds, api_key,
                                                  api_version, verbose) {
        reached$poll <- TRUE
        stop("Polling should not be called when poll = FALSE")
      },
      gemini_download_batch_results = function(batch, requests_tbl, output_path,
                                               api_key, api_version) {
        reached$download <- TRUE
        stop("Download should not be called when poll = FALSE")
      },
      parse_gemini_batch_output = function(results_path, requests_tbl) {
        reached$parse <- TRUE
        stop("Parse should not be called when poll = FALSE")
      },
      {
        out <- run_gemini_batch_pipeline(
          pairs             = one_pair,
          model             = "gemini-3-pro-preview",
          trait_name        = trait$name,
          trait_description = trait$description,
          prompt_template   = template,
          thinking_level    = "low",
          poll              = FALSE
        )

        # None of the post-creation stages ran
        testthat::expect_false(reached$poll)
        testthat::expect_false(reached$download)
        testthat::expect_false(reached$parse)

        # Input file exists, but there is no output file or parsed result
        testthat::expect_true(file.exists(out$batch_input_path))
        testthat::expect_null(out$batch_output_path)
        testthat::expect_null(out$results)

        # Standardised return structure
        expected_fields <- c(
          "batch_input_path", "batch_output_path", "file", "batch", "results"
        )
        testthat::expect_true(all(expected_fields %in% names(out)))
        testthat::expect_null(out$file)
        testthat::expect_equal(out$batch$metadata$state, "JOB_STATE_RUNNING")
      }
    )
  }
)

# Covers input validation, the two thinking-related warnings, and passthrough
# of generation parameters into generationConfig.
testthat::test_that("build_gemini_batch_requests validates inputs and handles parameters", {
  trait <- trait_description("overall_quality")
  tmpl <- set_prompt_template()

  # --- Validation: required columns -----------------------------------------
  incomplete_pairs <- tibble::tibble(ID1 = "A", text1 = "txt")
  testthat::expect_error(
    build_gemini_batch_requests(
      incomplete_pairs, "gemini-model", trait$name, trait$description
    ),
    "must contain columns"
  )

  # --- Validation: model name -----------------------------------------------
  one_pair <- tibble::tibble(ID1 = "A", text1 = "t", ID2 = "B", text2 = "t")
  testthat::expect_error(
    build_gemini_batch_requests(
      one_pair, "", trait$name, trait$description
    ),
    "model.*must be a non-empty character"
  )

  # Shorthand for a valid call that varies only in its extra arguments
  build_with <- function(...) {
    build_gemini_batch_requests(
      one_pair, "gemini-model", trait$name, trait$description,
      ...
    )
  }

  # --- Warnings: thinking_budget and the "medium" level ---------------------
  testthat::expect_warning(
    build_with(thinking_budget = 1000),
    "thinking_budget.*is ignored"
  )

  testthat::expect_warning(
    build_with(thinking_level = "medium"),
    "thinking_level = \"medium\".*mapping to \"High\""
  )

  # --- Passthrough: temperature, top_p, include_thoughts --------------------
  built <- build_with(
    temperature = 0.7,
    top_p = 0.9,
    include_thoughts = TRUE
  )

  gen_config <- built$request[[1]]$generationConfig
  testthat::expect_equal(gen_config$temperature, 0.7)
  testthat::expect_equal(gen_config$topP, 0.9)
  testthat::expect_true(gen_config$thinkingConfig$includeThoughts)
})

# gemini_create_batch() should reject an empty request list and an empty
# model string before doing anything else.
testthat::test_that("gemini_create_batch validates inputs", {
  no_requests <- list()
  some_requests <- list(a = 1)

  # Empty request list
  testthat::expect_error(
    gemini_create_batch(no_requests, "model"),
    "must be a non-empty list"
  )

  # Empty model string
  testthat::expect_error(
    gemini_create_batch(some_requests, ""),
    "model.*must be a non-empty character"
  )
})

# Passing a batch *name* (a string) should make the download look the batch up
# through gemini_get_batch(); a request/response count mismatch should warn.
testthat::test_that("gemini_download_batch_results handles batch name string and count mismatch", {
  # Two inlined responses whose `response` column is itself a data frame, so
  # is.data.frame() checks on that column succeed.
  inlined_rows <- data.frame(row_id = 1:2)
  inlined_rows$response <- data.frame(a = 1:2)

  looked_up_batch <- list(
    response = list(inlinedResponses = inlined_rows)
  )

  # Three requests against the two responses above -> mismatch
  three_requests <- tibble::tibble(custom_id = c("1", "2", "3"))
  out_file <- tempfile()
  on.exit(unlink(out_file), add = TRUE)

  testthat::with_mocked_bindings(
    # Whatever name is asked for, return the fake batch object
    gemini_get_batch = function(...) looked_up_batch,
    {
      # The count mismatch must surface as a warning
      testthat::expect_warning(
        gemini_download_batch_results("batches/123", three_requests, out_file),
        "Number of inlined responses.*does not match"
      )

      # The string "batches/123" was resolved and output was written
      testthat::expect_true(file.exists(out_file))
    }
  )
})

testthat::test_that(".parse_gemini_pair_response logic extracts thoughts correctly", {
  # Local fixture builder: a response with a single candidate whose content
  # parts carry the given texts, in order.
  response_with_parts <- function(texts) {
    list(
      candidates = list(
        list(
          content = list(
            parts = lapply(texts, function(txt) list(text = txt))
          )
        )
      )
    )
  }

  # 1. A response carrying an `error` element is reported as errored, with
  #    the error message passed through.
  blocked <- list(error = list(message = "Blocked"))
  parsed_err <- .parse_gemini_pair_response("id", "A", "B", blocked)
  testthat::expect_equal(parsed_err$result_type, "errored")
  testthat::expect_equal(parsed_err$error_message, "Blocked")

  # 2. include_thoughts = TRUE with two parts: the first part is the thought,
  #    the second part is the answer.
  two_parts <- response_with_parts(c("Thinking...", "Answer"))
  parsed_two <- .parse_gemini_pair_response(
    "id", "A", "B", two_parts,
    include_thoughts = TRUE
  )
  testthat::expect_equal(parsed_two$thoughts, "Thinking...")
  testthat::expect_equal(parsed_two$content, "Answer")

  # 3. include_thoughts = TRUE with a single part: the part is treated as
  #    content and thoughts is NA.
  one_part <- response_with_parts("Just answer")
  parsed_one <- .parse_gemini_pair_response(
    "id", "A", "B", one_part,
    include_thoughts = TRUE
  )
  testthat::expect_true(is.na(parsed_one$thoughts))
  testthat::expect_equal(parsed_one$content, "Just answer")
})

testthat::test_that("parse_gemini_batch_output detects include_thoughts from request column", {
  results_path <- tempfile()
  on.exit(unlink(results_path), add = TRUE)

  # One succeeded result line whose response has two parts
  # (thought followed by content).
  two_part_response <- list(
    candidates = list(
      list(
        content = list(
          parts = list(
            list(text = "My thought process"),
            list(text = "Final Answer")
          )
        )
      )
    )
  )
  result_line <- list(
    custom_id = "CID",
    result = list(type = "succeeded", response = two_part_response)
  )
  json_line <- jsonlite::toJSON(result_line, auto_unbox = TRUE)
  writeLines(json_line, results_path)

  # Builds a one-row request table whose `request` column carries the given
  # includeThoughts flag under generationConfig$thinkingConfig.
  requests_with_flag <- function(flag) {
    tibble::tibble(
      custom_id = "CID", ID1 = "A", ID2 = "B",
      request = list(
        list(generationConfig = list(thinkingConfig = list(includeThoughts = flag)))
      )
    )
  }

  # Case A: includeThoughts = TRUE -> first part is split off as thoughts.
  parsed_on <- parse_gemini_batch_output(results_path, requests_with_flag(TRUE))
  testthat::expect_equal(parsed_on$thoughts, "My thought process")
  testthat::expect_equal(parsed_on$content, "Final Answer")

  # Case B: includeThoughts = FALSE -> no thoughts; all parts are
  # concatenated into content.
  parsed_off <- parse_gemini_batch_output(results_path, requests_with_flag(FALSE))
  testthat::expect_true(is.na(parsed_off$thoughts))
  testthat::expect_equal(parsed_off$content, "My thought processFinal Answer")
})

testthat::test_that("build_gemini_batch_requests validates inputs and warns on medium thinking", {
  td <- trait_description("overall_quality")
  # NOTE(review): `tmpl` is not referenced again in this test.
  tmpl <- set_prompt_template()

  model_id <- "gemini-1.5-pro"
  complete_pairs <- tibble::tibble(ID1 = "A", text1 = "t", ID2 = "B", text2 = "t")
  incomplete_pairs <- tibble::tibble(ID1 = "A", text1 = "t")

  # A pairs table lacking the ID2 / text2 columns must be rejected.
  testthat::expect_error(
    build_gemini_batch_requests(incomplete_pairs, model_id, td$name, td$description),
    "must contain columns"
  )

  # thinking_level = "medium" must emit a warning; the built requests are
  # captured so the mapped level can be inspected afterwards.
  testthat::expect_warning(
    built <- build_gemini_batch_requests(
      complete_pairs, model_id, td$name, td$description,
      thinking_level = "medium"
    ),
    "mapping to \"High\" internally"
  )

  # The request body must carry the mapped level.
  thinking_cfg <- built$request[[1]]$generationConfig$thinkingConfig
  testthat::expect_equal(thinking_cfg$thinkingLevel, "High")
})

testthat::test_that("gemini_download_batch_results warns if response count mismatches request count", {
  # Builds one candidate entry whose single content part holds `txt`.
  candidate_with_text <- function(txt) {
    list(content = list(parts = list(list(text = txt))))
  }

  # Fake batch with two inlined responses. The `response` column is a nested
  # data frame (so it satisfies is.data.frame()), and its `candidates`
  # column is a list column protected with I().
  nested_response <- data.frame(
    candidates = I(lapply(c("A", "B"), candidate_with_text))
  )
  inlined_tbl <- data.frame(row_id = 1:2)
  inlined_tbl$response <- nested_response

  fake_batch <- list(response = list(inlinedResponses = inlined_tbl))

  # Stand-in for gemini_get_batch(): always returns the fake batch.
  stub_get_batch <- function(...) fake_batch

  # Three requests against two responses -> mismatch.
  requests <- tibble::tibble(custom_id = c("1", "2", "3"))

  out_path <- tempfile()
  on.exit(unlink(out_path), add = TRUE)

  testthat::with_mocked_bindings(
    gemini_get_batch = stub_get_batch,
    {
      testthat::expect_warning(
        gemini_download_batch_results("batch_123", requests, out_path),
        "does not match number of requests"
      )
    }
  )
})