# tests/testthat/test-llm-batch.R
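#
# Unit tests for the batch-mode pairwise comparison helpers:
#   * llm_submit_pairs_batch()       -- input validation and backend dispatch
#   * llm_download_batch_results()   -- extracting the results tibble
#   * build_openai_batch_requests() / write_openai_batch_file() -- JSONL requests
#   * parse_openai_batch_output()    -- parsing batch output lines
# The network-facing backend pipelines are mocked with
# testthat::with_mocked_bindings(), so these tests make no API calls.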
test_that("llm_submit_pairs_batch validates pairs and model", {
td <- trait_description("overall_quality")
tmpl <- set_prompt_template()
bad_pairs <- tibble::tibble(
ID1 = "S01",
text1 = "Sample 1"
# missing ID2, text2
)
expect_error(
llm_submit_pairs_batch(
pairs = bad_pairs,
backend = "openai",
model = "gpt-4o-mini",
trait_name = td$name,
trait_description = td$description,
prompt_template = tmpl
),
"`pairs` must contain columns",
fixed = FALSE
)
good_pairs <- tibble::tibble(
ID1 = "S01",
text1 = "Sample 1",
ID2 = "S02",
text2 = "Sample 2"
)
expect_error(
llm_submit_pairs_batch(
pairs = good_pairs,
backend = "openai",
model = "",
trait_name = td$name,
trait_description = td$description,
prompt_template = tmpl
),
"`model` must be a non-empty character scalar",
fixed = TRUE
)
})
test_that("llm_submit_pairs_batch dispatches to the correct backend pipelines", {
pairs <- tibble::tibble(
ID1 = c("S01", "S02"),
text1 = c("Text 1a", "Text 2a"),
ID2 = c("S03", "S04"),
text2 = c("Text 1b", "Text 2b")
)
td <- trait_description("overall_quality")
tmpl <- set_prompt_template()
openai_calls <- list()
anthropic_calls <- list()
gemini_calls <- list()
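
  # Stand-in for a pipeline's return value: real (empty) temp files so that
  # the file.exists() expectations below pass, plus a one-row results tibble
  # shaped like parsed batch output. The custom_id "BATCH_<ID1>_vs_<ID2>"
  # follows the same convention the parsing tests rely on later in this file.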
  fake_batch_return <- function(backend_name) {
    input_path <- tempfile(
      pattern = paste0("input_", backend_name, "_"),
      fileext = ".jsonl"
    )
    output_path <- tempfile(
      pattern = paste0("output_", backend_name, "_"),
      fileext = ".jsonl"
    )
    # Create the files so file.exists() expectations pass
    file.create(input_path)
    file.create(output_path)

    list(
      backend = backend_name,
      batch_input_path = input_path,
      batch_output_path = output_path,
      batch = list(id = paste0("batch_", backend_name)),
      results = tibble::tibble(
        custom_id = "BATCH_S01_vs_S02",
        ID1 = "S01",
        ID2 = "S02",
        model = paste0("model_", backend_name),
        object_type = "batch",
        status_code = 200L,
        error_message = NA_character_,
        thoughts = NA_character_,
        content = "SAMPLE_1",
        better_sample = "SAMPLE_1",
        better_id = "S01",
        prompt_tokens = 10L,
        completion_tokens = 2L,
        total_tokens = 12L
      )
    )
  }
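
  # Mock each backend pipeline so no network call is made; each mock records
  # the arguments it receives and returns the canned object above.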
  testthat::with_mocked_bindings(
    run_openai_batch_pipeline = function(pairs,
                                         model,
                                         trait_name,
                                         trait_description,
                                         prompt_template,
                                         endpoint = c("chat.completions", "responses"),
                                         batch_input_path = tempfile("openai_batch_input_", fileext = ".jsonl"),
                                         batch_output_path = tempfile("openai_batch_output_", fileext = ".jsonl"),
                                         poll = TRUE,
                                         interval_seconds = 5,
                                         timeout_seconds = 600,
                                         max_attempts = Inf,
                                         metadata = NULL,
                                         api_key = Sys.getenv("OPENAI_API_KEY"),
                                         include_thoughts = FALSE,
                                         include_raw = FALSE,
                                         ...) {
      openai_calls <<- append(openai_calls, list(
        list(
          model = model,
          trait_name = trait_name,
          trait_description = trait_description,
          include_thoughts = include_thoughts,
          include_raw = include_raw
        )
      ))
      fake_batch_return("openai")
    },
    run_anthropic_batch_pipeline = function(pairs,
                                            model,
                                            trait_name,
                                            trait_description,
                                            prompt_template,
                                            include_thoughts = FALSE,
                                            include_raw = FALSE,
                                            ...) {
      anthropic_calls <<- append(anthropic_calls, list(
        list(
          model = model,
          trait_name = trait_name,
          trait_description = trait_description,
          include_thoughts = include_thoughts,
          include_raw = include_raw
        )
      ))
      fake_batch_return("anthropic")
    },
    run_gemini_batch_pipeline = function(pairs,
                                         model,
                                         trait_name,
                                         trait_description,
                                         prompt_template,
                                         include_thoughts = FALSE,
                                         include_raw = FALSE,
                                         ...) {
      gemini_calls <<- append(gemini_calls, list(
        list(
          model = model,
          trait_name = trait_name,
          trait_description = trait_description,
          include_thoughts = include_thoughts,
          include_raw = include_raw
        )
      ))
      fake_batch_return("gemini")
    },
    {
      # OpenAI
      batch_openai <- llm_submit_pairs_batch(
        pairs = pairs,
        backend = "openai",
        model = "gpt-4o-mini",
        trait_name = td$name,
        trait_description = td$description,
        prompt_template = tmpl,
        include_thoughts = FALSE,
        include_raw = TRUE
      )
      expect_s3_class(batch_openai, "pairwiseLLM_batch")
      expect_equal(batch_openai$backend, "openai")
      expect_equal(length(openai_calls), 1L)
      expect_true(file.exists(batch_openai$batch_input_path))
      expect_true(file.exists(batch_openai$batch_output_path))

      # Anthropic
      batch_anthropic <- llm_submit_pairs_batch(
        pairs = pairs,
        backend = "anthropic",
        model = "claude-3-5-sonnet-latest",
        trait_name = td$name,
        trait_description = td$description,
        prompt_template = tmpl,
        include_thoughts = TRUE,
        include_raw = FALSE
      )
      expect_s3_class(batch_anthropic, "pairwiseLLM_batch")
      expect_equal(batch_anthropic$backend, "anthropic")
      expect_equal(length(anthropic_calls), 1L)

      # Gemini
      batch_gemini <- llm_submit_pairs_batch(
        pairs = pairs,
        backend = "gemini",
        model = "gemini-3-pro-preview",
        trait_name = td$name,
        trait_description = td$description,
        prompt_template = tmpl,
        include_thoughts = TRUE,
        include_raw = FALSE
      )
      expect_s3_class(batch_gemini, "pairwiseLLM_batch")
      expect_equal(batch_gemini$backend, "gemini")
      expect_equal(length(gemini_calls), 1L)
    }
  )
})

test_that("llm_submit_pairs_batch chooses OpenAI responses endpoint for gpt-5.1/5.2 with thoughts or reasoning", {
pairs <- tibble::tibble(
ID1 = "S01",
text1 = "Text 1a",
ID2 = "S02",
text2 = "Text 1b"
)
td <- trait_description("overall_quality")
tmpl <- set_prompt_template()
endpoints <- list()
  fake_batch_return <- function(endpoint_value) {
    input_path <- tempfile(
      pattern = paste0("input_", endpoint_value, "_"),
      fileext = ".jsonl"
    )
    output_path <- tempfile(
      pattern = paste0("output_", endpoint_value, "_"),
      fileext = ".jsonl"
    )
    file.create(input_path)
    file.create(output_path)

    list(
      batch_input_path = input_path,
      batch_output_path = output_path,
      batch = list(id = paste0("batch_", endpoint_value)),
      results = tibble::tibble(
        custom_id = "BATCH_S01_vs_S02",
        ID1 = "S01",
        ID2 = "S02",
        model = "gpt-5.1-mini",
        object_type = "batch",
        status_code = 200L,
        error_message = NA_character_,
        thoughts = NA_character_,
        content = "SAMPLE_1",
        better_sample = "SAMPLE_1",
        better_id = "S01",
        prompt_tokens = 10L,
        completion_tokens = 2L,
        total_tokens = 12L
      )
    )
  }
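
  # Mock only the OpenAI pipeline, recording the endpoint chosen on each call.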
  testthat::with_mocked_bindings(
    run_openai_batch_pipeline = function(pairs,
                                         model,
                                         trait_name,
                                         trait_description,
                                         prompt_template,
                                         endpoint = c("chat.completions", "responses"),
                                         batch_input_path = tempfile("openai_batch_input_", fileext = ".jsonl"),
                                         batch_output_path = tempfile("openai_batch_output_", fileext = ".jsonl"),
                                         poll = TRUE,
                                         interval_seconds = 5,
                                         timeout_seconds = 600,
                                         max_attempts = Inf,
                                         metadata = NULL,
                                         api_key = Sys.getenv("OPENAI_API_KEY"),
                                         include_thoughts = FALSE,
                                         include_raw = FALSE,
                                         ...) {
      endpoints <<- append(endpoints, list(endpoint))
      fake_batch_return(endpoint)
    },
    {
      # 1) gpt-5.1 with include_thoughts = TRUE -> responses endpoint
      batch_resp <- llm_submit_pairs_batch(
        pairs = pairs,
        backend = "openai",
        model = "gpt-5.1-mini",
        trait_name = td$name,
        trait_description = td$description,
        prompt_template = tmpl,
        include_thoughts = TRUE
      )
      expect_s3_class(batch_resp, "pairwiseLLM_batch")

      # 2) gpt-5.1 with include_thoughts = FALSE and reasoning = "none" ->
      #    chat.completions. "gpt-5.1-mini" still matches the gpt-5.1 family,
      #    but with thoughts off and reasoning disabled the dispatcher should
      #    fall back to chat.completions.
      batch_chat <- llm_submit_pairs_batch(
        pairs = pairs,
        backend = "openai",
        model = "gpt-5.1-mini",
        trait_name = td$name,
        trait_description = td$description,
        prompt_template = tmpl,
        include_thoughts = FALSE,
        reasoning = "none"
      )
      expect_s3_class(batch_chat, "pairwiseLLM_batch")

      # 3) gpt-5.2 date-stamped with thoughts -> responses endpoint
      batch_resp_52 <- llm_submit_pairs_batch(
        pairs = pairs,
        backend = "openai",
        model = "gpt-5.2-2025-12-11",
        trait_name = td$name,
        trait_description = td$description,
        prompt_template = tmpl,
        include_thoughts = TRUE
      )
      expect_s3_class(batch_resp_52, "pairwiseLLM_batch")

      expect_equal(length(endpoints), 3L)
      # First call (with thoughts) should use responses
      expect_equal(endpoints[[1]], "responses")
      # Second call (no thoughts, reasoning = "none") should use chat.completions
      expect_equal(endpoints[[2]], "chat.completions")
      # Third call (gpt-5.2, date-stamped) should use responses
      expect_equal(endpoints[[3]], "responses")
    }
  )
})

test_that("llm_download_batch_results extracts results tibble", {
  fake_batch <- list(
    backend = "openai",
    batch_input_path = "input.jsonl",
    batch_output_path = "output.jsonl",
    results = tibble::tibble(
      custom_id = "BATCH_S01_vs_S02",
      ID1 = "S01",
      ID2 = "S02",
      model = "model_openai",
      better_sample = "SAMPLE_1",
      better_id = "S01"
    )
  )
  class(fake_batch) <- c("pairwiseLLM_batch", class(fake_batch))

  res <- llm_download_batch_results(fake_batch)
  expect_s3_class(res, "tbl_df")
  expect_equal(nrow(res), 1L)
  expect_equal(res$ID1, "S01")
  expect_equal(res$better_sample, "SAMPLE_1")

  # An unclassed list that still carries a `results` element should also work
  res2 <- llm_download_batch_results(unclass(fake_batch))
  expect_equal(res2$ID2, "S02")

  # Invalid input
  expect_error(
    llm_download_batch_results(list(foo = "bar")),
    "Unsupported input to `llm_download_batch_results",
    fixed = FALSE
  )
})

test_that("build_openai_batch_requests builds valid chat.completions JSONL objects", {
data("example_writing_samples", package = "pairwiseLLM")
pairs <- make_pairs(example_writing_samples)
pairs <- pairs[1:2, ]
td <- trait_description("overall_quality")
tmpl <- set_prompt_template()
batch <- build_openai_batch_requests(
pairs = pairs,
model = "gpt-4.1",
trait_name = td$name,
trait_description = td$description,
prompt_template = tmpl,
endpoint = "chat.completions",
temperature = 0,
top_p = 1,
logprobs = NULL
)
expect_s3_class(batch, "tbl_df")
expect_equal(nrow(batch), 2L)
expect_true(all(c("custom_id", "method", "url", "body") %in% names(batch)))
# Body structure check
b1 <- batch$body[[1]]
expect_equal(b1$model, "gpt-4.1")
expect_true(is.list(b1$messages))
roles <- vapply(b1$messages, function(m) m[["role"]], character(1))
expect_true(any(roles == "user"))
})
test_that("write_openai_batch_file writes JSONL file", {
data("example_writing_samples", package = "pairwiseLLM")
pairs <- make_pairs(example_writing_samples)
pairs <- pairs[1:2, ]
td <- trait_description("overall_quality")
tmpl <- set_prompt_template()
batch <- build_openai_batch_requests(
pairs = pairs,
model = "gpt-4.1",
trait_name = td$name,
trait_description = td$description,
prompt_template = tmpl,
endpoint = "chat.completions"
)
tmp <- tempfile("openai-batch-", fileext = ".jsonl")
write_openai_batch_file(batch, tmp)
expect_true(file.exists(tmp))
lines <- readLines(tmp, warn = FALSE)
expect_equal(length(lines), nrow(batch))
# Each line should be valid JSON with required top-level keys
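  # (shape only; this example line is illustrative, not asserted):
  #   {"custom_id":"BATCH_S01_vs_S02","method":"POST",
  #    "url":"/v1/chat/completions","body":{"model":"gpt-4.1","messages":[...]}}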
  objs <- lapply(lines, jsonlite::fromJSON)
  keys <- lapply(objs, names)
  expect_true(all(vapply(keys, function(k) {
    all(c("custom_id", "method", "url", "body") %in% k)
  }, logical(1))))
})

test_that("build_openai_batch_requests supports gpt-5.1 with reasoning = 'none' on responses", {
data("example_writing_samples", package = "pairwiseLLM")
pairs <- make_pairs(example_writing_samples)
pairs <- pairs[1:1, ]
td <- trait_description("overall_quality")
tmpl <- set_prompt_template()
# For gpt-5.1 + reasoning = "none", temperature/top_p/logprobs are allowed
batch <- build_openai_batch_requests(
pairs = pairs,
model = "gpt-5.1",
trait_name = td$name,
trait_description = td$description,
prompt_template = tmpl,
endpoint = "responses",
reasoning = "none",
temperature = 0,
top_p = 1,
logprobs = NULL
)
expect_s3_class(batch, "tbl_df")
expect_equal(nrow(batch), 1L)
b1 <- batch$body[[1]]
expect_equal(b1$model, "gpt-5.1")
expect_equal(b1$input, build_prompt(
template = tmpl,
trait_name = td$name,
trait_desc = td$description,
text1 = pairs$text1[1],
text2 = pairs$text2[1]
))
# reasoning should be present with effort = "none"
expect_true("reasoning" %in% names(b1) || is.null(b1$reasoning) ||
identical(b1$reasoning$effort, "none"))
})
test_that("build_openai_batch_requests errors for gpt-5.1/5.2 + reasoning != 'none' with temp", {
data("example_writing_samples", package = "pairwiseLLM")
pairs <- make_pairs(example_writing_samples)[1:1, ]
td <- trait_description("overall_quality")
tmpl <- set_prompt_template()
# GPT-5.1
expect_error(
build_openai_batch_requests(
pairs = pairs, model = "gpt-5.1", trait_name = td$name, trait_description = td$description,
endpoint = "responses", reasoning = "low", temperature = 0
),
regexp = "For gpt-5.1/5.2 with reasoning, temperature/top_p/logprobs must be NULL."
)
# GPT-5.2
expect_error(
build_openai_batch_requests(
pairs = pairs, model = "gpt-5.2", trait_name = td$name, trait_description = td$description,
endpoint = "responses", reasoning = "high", top_p = 0.5
),
regexp = "For gpt-5.1/5.2 with reasoning, temperature/top_p/logprobs must be NULL."
)
})
test_that("build_openai_batch_requests allows other gpt-5* models with default temp=0", {
data("example_writing_samples", package = "pairwiseLLM")
pairs <- make_pairs(example_writing_samples)
pairs <- pairs[1:1, ]
td <- trait_description("overall_quality")
tmpl <- set_prompt_template()
# Pass NULL temp; since reasoning logic doesn't match 5.1/5.2 regex,
# it should default to 0 inside the function and succeed.
batch <- build_openai_batch_requests(
pairs = pairs,
model = "gpt-5-mini",
trait_name = td$name,
trait_description = td$description,
prompt_template = tmpl,
endpoint = "responses",
reasoning = "low",
temperature = NULL,
top_p = NULL,
logprobs = NULL
)
expect_s3_class(batch, "tbl_df")
expect_equal(nrow(batch), 1L)
b1 <- batch$body[[1]]
expect_equal(b1$model, "gpt-5-mini")
# Verify temperature defaulted to 0
expect_equal(b1$temperature, 0)
})
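# The fixture in the next test mimics one line of gpt-5.1 batch output from
# the responses endpoint: a body-level reasoning summary plus an output array
# holding a reasoning item and the assistant message.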
testthat::test_that("parse_openai_batch_output collects thoughts and message text separately for responses", {
tmp <- tempfile(fileext = ".jsonl")
on.exit(unlink(tmp), add = TRUE)
# Construct a fake batch output line similar to gpt-5.1 responses
line_obj <- list(
custom_id = "LIVE_S01_vs_S02",
response = list(
status_code = 200L,
body = list(
object = "response",
model = "gpt-5.1",
reasoning = list(
effort = "low",
summary = list(text = "Reasoning summary. ")
),
output = list(
list(
id = "rs_x",
type = "reasoning",
summary = list()
),
list(
id = "msg_x",
type = "message",
status = "completed",
content = list(
list(
type = "output_text",
text = "SAMPLE_2 Final answer."
)
),
role = "assistant"
)
),
usage = list(
input_tokens = 10L,
output_tokens = 5L,
total_tokens = 15L
)
)
),
error = NULL
)
json_line <- jsonlite::toJSON(line_obj, auto_unbox = TRUE)
writeLines(json_line, con = tmp, useBytes = TRUE)
res <- parse_openai_batch_output(tmp)
testthat::expect_s3_class(res, "tbl_df")
testthat::expect_equal(nrow(res), 1L)
# IDs from custom_id
testthat::expect_equal(res$custom_id, "LIVE_S01_vs_S02")
testthat::expect_equal(res$ID1, "S01")
testthat::expect_equal(res$ID2, "S02")
# Basic metadata
testthat::expect_equal(res$model, "gpt-5.1")
testthat::expect_equal(res$object_type, "response")
testthat::expect_equal(res$status_code, 200L)
testthat::expect_true(is.na(res$error_message))
# Reasoning summary should go to thoughts
testthat::expect_equal(res$thoughts, "Reasoning summary. ")
# Content should be assistant message only
testthat::expect_equal(
res$content,
"SAMPLE_2 Final answer."
)
# Tag parsing and better_id mapping
testthat::expect_equal(res$better_sample, "SAMPLE_2")
testthat::expect_equal(res$better_id, "S02")
# Token usage
testthat::expect_equal(res$prompt_tokens, 10)
testthat::expect_equal(res$completion_tokens, 5)
testthat::expect_equal(res$total_tokens, 15)
})
test_that("build_openai_batch_requests adds reasoning summary when include_thoughts = TRUE", {
data("example_writing_samples", package = "pairwiseLLM")
pairs <- make_pairs(example_writing_samples)
pairs <- pairs[1:1, ]
td <- trait_description("overall_quality")
tmpl <- set_prompt_template()
# include_thoughts = TRUE, reasoning != "none" -> summary = "auto"
batch <- build_openai_batch_requests(
pairs = pairs,
model = "gpt-5.1",
trait_name = td$name,
trait_description = td$description,
prompt_template = tmpl,
endpoint = "responses",
reasoning = "low",
include_thoughts = TRUE
)
testthat::expect_s3_class(batch, "tbl_df")
testthat::expect_equal(nrow(batch), 1L)
b1 <- batch$body[[1]]
testthat::expect_equal(b1$model, "gpt-5.1")
testthat::expect_true("reasoning" %in% names(b1))
testthat::expect_equal(b1$reasoning$effort, "low")
testthat::expect_equal(b1$reasoning$summary, "auto")
# include_thoughts = TRUE but reasoning = "none" -> no summary field
batch_none <- build_openai_batch_requests(
pairs = pairs,
model = "gpt-5.1",
trait_name = td$name,
trait_description = td$description,
prompt_template = tmpl,
endpoint = "responses",
reasoning = "none",
include_thoughts = TRUE
)
b2 <- batch_none$body[[1]]
testthat::expect_true("reasoning" %in% names(b2))
testthat::expect_equal(b2$reasoning$effort, "none")
testthat::expect_false("summary" %in% names(b2$reasoning))
})