# tests/testthat/test-llm-batch.R

test_that("llm_submit_pairs_batch validates pairs and model", {
  td <- trait_description("overall_quality")
  tmpl <- set_prompt_template()

  bad_pairs <- tibble::tibble(
    ID1 = "S01",
    text1 = "Sample 1"
    # missing ID2, text2
  )

  expect_error(
    llm_submit_pairs_batch(
      pairs = bad_pairs,
      backend = "openai",
      model = "gpt-4o-mini",
      trait_name = td$name,
      trait_description = td$description,
      prompt_template = tmpl
    ),
    "`pairs` must contain columns",
    fixed = FALSE
  )

  good_pairs <- tibble::tibble(
    ID1 = "S01",
    text1 = "Sample 1",
    ID2 = "S02",
    text2 = "Sample 2"
  )

  expect_error(
    llm_submit_pairs_batch(
      pairs = good_pairs,
      backend = "openai",
      model = "",
      trait_name = td$name,
      trait_description = td$description,
      prompt_template = tmpl
    ),
    "`model` must be a non-empty character scalar",
    fixed = TRUE
  )
})

test_that("llm_submit_pairs_batch dispatches to the correct backend pipelines", {
  pairs <- tibble::tibble(
    ID1 = c("S01", "S02"),
    text1 = c("Text 1a", "Text 2a"),
    ID2 = c("S03", "S04"),
    text2 = c("Text 1b", "Text 2b")
  )
  td <- trait_description("overall_quality")
  tmpl <- set_prompt_template()

  openai_calls <- list()
  anthropic_calls <- list()
  gemini_calls <- list()

  fake_batch_return <- function(backend_name) {
    input_path <- tempfile(
      pattern = paste0("input_", backend_name, "_"),
      fileext = ".jsonl"
    )
    output_path <- tempfile(
      pattern = paste0("output_", backend_name, "_"),
      fileext = ".jsonl"
    )
    # Create the files so file.exists() expectations pass
    file.create(input_path)
    file.create(output_path)

    list(
      backend = backend_name,
      batch_input_path = input_path,
      batch_output_path = output_path,
      batch = list(id = paste0("batch_", backend_name)),
      results = tibble::tibble(
        custom_id = "BATCH_S01_vs_S02",
        ID1 = "S01",
        ID2 = "S02",
        model = paste0("model_", backend_name),
        object_type = "batch",
        status_code = 200L,
        error_message = NA_character_,
        thoughts = NA_character_,
        content = "SAMPLE_1",
        better_sample = "SAMPLE_1",
        better_id = "S01",
        prompt_tokens = 10L,
        completion_tokens = 2L,
        total_tokens = 12L
      )
    )
  }

  testthat::with_mocked_bindings(
    run_openai_batch_pipeline = function(pairs,
                                         model,
                                         trait_name,
                                         trait_description,
                                         prompt_template,
                                         endpoint = c("chat.completions", "responses"),
                                         batch_input_path = tempfile("openai_batch_input_", fileext = ".jsonl"),
                                         batch_output_path = tempfile("openai_batch_output_", fileext = ".jsonl"),
                                         poll = TRUE,
                                         interval_seconds = 5,
                                         timeout_seconds = 600,
                                         max_attempts = Inf,
                                         metadata = NULL,
                                         api_key = Sys.getenv("OPENAI_API_KEY"),
                                         include_thoughts = FALSE,
                                         include_raw = FALSE,
                                         ...) {
      openai_calls <<- append(openai_calls, list(
        list(
          model = model,
          trait_name = trait_name,
          trait_description = trait_description,
          include_thoughts = include_thoughts,
          include_raw = include_raw
        )
      ))
      fake_batch_return("openai")
    },
    run_anthropic_batch_pipeline = function(pairs,
                                            model,
                                            trait_name,
                                            trait_description,
                                            prompt_template,
                                            include_thoughts = FALSE,
                                            include_raw = FALSE,
                                            ...) {
      anthropic_calls <<- append(anthropic_calls, list(
        list(
          model = model,
          trait_name = trait_name,
          trait_description = trait_description,
          include_thoughts = include_thoughts,
          include_raw = include_raw
        )
      ))
      fake_batch_return("anthropic")
    },
    run_gemini_batch_pipeline = function(pairs,
                                         model,
                                         trait_name,
                                         trait_description,
                                         prompt_template,
                                         include_thoughts = FALSE,
                                         include_raw = FALSE,
                                         ...) {
      gemini_calls <<- append(gemini_calls, list(
        list(
          model = model,
          trait_name = trait_name,
          trait_description = trait_description,
          include_thoughts = include_thoughts,
          include_raw = include_raw
        )
      ))
      fake_batch_return("gemini")
    },
    {
      # OpenAI
      batch_openai <- llm_submit_pairs_batch(
        pairs = pairs,
        backend = "openai",
        model = "gpt-4o-mini",
        trait_name = td$name,
        trait_description = td$description,
        prompt_template = tmpl,
        include_thoughts = FALSE,
        include_raw = TRUE
      )
      expect_s3_class(batch_openai, "pairwiseLLM_batch")
      expect_equal(batch_openai$backend, "openai")
      expect_equal(length(openai_calls), 1L)
      expect_true(file.exists(batch_openai$batch_input_path))
      expect_true(file.exists(batch_openai$batch_output_path))

      # Anthropic
      batch_anthropic <- llm_submit_pairs_batch(
        pairs = pairs,
        backend = "anthropic",
        model = "claude-3-5-sonnet-latest",
        trait_name = td$name,
        trait_description = td$description,
        prompt_template = tmpl,
        include_thoughts = TRUE,
        include_raw = FALSE
      )
      expect_s3_class(batch_anthropic, "pairwiseLLM_batch")
      expect_equal(batch_anthropic$backend, "anthropic")
      expect_equal(length(anthropic_calls), 1L)

      # Gemini
      batch_gemini <- llm_submit_pairs_batch(
        pairs = pairs,
        backend = "gemini",
        model = "gemini-3-pro-preview",
        trait_name = td$name,
        trait_description = td$description,
        prompt_template = tmpl,
        include_thoughts = TRUE,
        include_raw = FALSE
      )
      expect_s3_class(batch_gemini, "pairwiseLLM_batch")
      expect_equal(batch_gemini$backend, "gemini")
      expect_equal(length(gemini_calls), 1L)
    }
  )
})

test_that("llm_submit_pairs_batch chooses OpenAI responses endpoint for gpt-5.1/5.2 with thoughts or reasoning", {
  pairs <- tibble::tibble(
    ID1 = "S01",
    text1 = "Text 1a",
    ID2 = "S02",
    text2 = "Text 1b"
  )
  td <- trait_description("overall_quality")
  tmpl <- set_prompt_template()

  endpoints <- list()

  fake_batch_return <- function(endpoint_value) {
    input_path <- tempfile(
      pattern = paste0("input_", endpoint_value, "_"),
      fileext = ".jsonl"
    )
    output_path <- tempfile(
      pattern = paste0("output_", endpoint_value, "_"),
      fileext = ".jsonl"
    )
    file.create(input_path)
    file.create(output_path)

    list(
      batch_input_path = input_path,
      batch_output_path = output_path,
      batch = list(id = paste0("batch_", endpoint_value)),
      results = tibble::tibble(
        custom_id = "BATCH_S01_vs_S02",
        ID1 = "S01",
        ID2 = "S02",
        model = "gpt-5.1-mini",
        object_type = "batch",
        status_code = 200L,
        error_message = NA_character_,
        thoughts = NA_character_,
        content = "SAMPLE_1",
        better_sample = "SAMPLE_1",
        better_id = "S01",
        prompt_tokens = 10L,
        completion_tokens = 2L,
        total_tokens = 12L
      )
    )
  }

  testthat::with_mocked_bindings(
    run_openai_batch_pipeline = function(pairs,
                                         model,
                                         trait_name,
                                         trait_description,
                                         prompt_template,
                                         endpoint = c("chat.completions", "responses"),
                                         batch_input_path = tempfile("openai_batch_input_", fileext = ".jsonl"),
                                         batch_output_path = tempfile("openai_batch_output_", fileext = ".jsonl"),
                                         poll = TRUE,
                                         interval_seconds = 5,
                                         timeout_seconds = 600,
                                         max_attempts = Inf,
                                         metadata = NULL,
                                         api_key = Sys.getenv("OPENAI_API_KEY"),
                                         include_thoughts = FALSE,
                                         include_raw = FALSE,
                                         ...) {
      endpoints <<- append(endpoints, list(endpoint))
      fake_batch_return(endpoint)
    },
    {
      # 1) gpt-5.1 with include_thoughts = TRUE -> responses endpoint
      batch_resp <- llm_submit_pairs_batch(
        pairs = pairs,
        backend = "openai",
        model = "gpt-5.1-mini",
        trait_name = td$name,
        trait_description = td$description,
        prompt_template = tmpl,
        include_thoughts = TRUE
      )
      expect_s3_class(batch_resp, "pairwiseLLM_batch")

      # 2) gpt-5.1 with include_thoughts = FALSE and reasoning = "none" -> chat.completions
      # Note: "gpt-5.1-mini" still matches the gpt-5.1 family, so the endpoint
      # choice here is driven by include_thoughts/reasoning alone.
      batch_chat <- llm_submit_pairs_batch(
        pairs = pairs,
        backend = "openai",
        model = "gpt-5.1-mini",
        trait_name = td$name,
        trait_description = td$description,
        prompt_template = tmpl,
        include_thoughts = FALSE,
        reasoning = "none"
      )
      expect_s3_class(batch_chat, "pairwiseLLM_batch")

      # 3) gpt-5.2 date-stamped with thoughts -> responses endpoint
      batch_resp_52 <- llm_submit_pairs_batch(
        pairs = pairs,
        backend = "openai",
        model = "gpt-5.2-2025-12-11",
        trait_name = td$name,
        trait_description = td$description,
        prompt_template = tmpl,
        include_thoughts = TRUE
      )
      expect_s3_class(batch_resp_52, "pairwiseLLM_batch")

      expect_equal(length(endpoints), 3L)
      # First call (with thoughts) should use responses
      expect_equal(endpoints[[1]], "responses")
      # Second call (no thoughts, reasoning = "none") should use chat.completions
      expect_equal(endpoints[[2]], "chat.completions")
      # Third call (gpt-5.2 date stamped) should use responses
      expect_equal(endpoints[[3]], "responses")
    }
  )
})

test_that("llm_download_batch_results extracts results tibble", {
  fake_batch <- list(
    backend = "openai",
    batch_input_path = "input.jsonl",
    batch_output_path = "output.jsonl",
    results = tibble::tibble(
      custom_id = "BATCH_S01_vs_S02",
      ID1 = "S01",
      ID2 = "S02",
      model = "model_openai",
      better_sample = "SAMPLE_1",
      better_id = "S01"
    )
  )
  class(fake_batch) <- c("pairwiseLLM_batch", class(fake_batch))

  res <- llm_download_batch_results(fake_batch)
  expect_s3_class(res, "tbl_df")
  expect_equal(nrow(res), 1L)
  expect_equal(res$ID1, "S01")
  expect_equal(res$better_sample, "SAMPLE_1")

  # An unclassed list with a `results` element should still work
  res2 <- llm_download_batch_results(unclass(fake_batch))
  expect_equal(res2$ID2, "S02")

  # Invalid input
  expect_error(
    llm_download_batch_results(list(foo = "bar")),
    "Unsupported input to `llm_download_batch_results",
    fixed = FALSE
  )
})

test_that("build_openai_batch_requests builds valid chat.completions JSONL objects", {
  data("example_writing_samples", package = "pairwiseLLM")
  pairs <- make_pairs(example_writing_samples)
  pairs <- pairs[1:2, ]

  td <- trait_description("overall_quality")
  tmpl <- set_prompt_template()

  batch <- build_openai_batch_requests(
    pairs = pairs,
    model = "gpt-4.1",
    trait_name = td$name,
    trait_description = td$description,
    prompt_template = tmpl,
    endpoint = "chat.completions",
    temperature = 0,
    top_p = 1,
    logprobs = NULL
  )

  expect_s3_class(batch, "tbl_df")
  expect_equal(nrow(batch), 2L)
  expect_true(all(c("custom_id", "method", "url", "body") %in% names(batch)))

  # Body structure check
  b1 <- batch$body[[1]]
  expect_equal(b1$model, "gpt-4.1")
  expect_true(is.list(b1$messages))
  roles <- vapply(b1$messages, function(m) m[["role"]], character(1))
  expect_true(any(roles == "user"))
})

test_that("write_openai_batch_file writes JSONL file", {
  data("example_writing_samples", package = "pairwiseLLM")
  pairs <- make_pairs(example_writing_samples)
  pairs <- pairs[1:2, ]

  td <- trait_description("overall_quality")
  tmpl <- set_prompt_template()

  batch <- build_openai_batch_requests(
    pairs = pairs,
model = "gpt-4.1", trait_name = td$name, trait_description = td$description, prompt_template = tmpl, endpoint = "chat.completions" ) tmp <- tempfile("openai-batch-", fileext = ".jsonl") write_openai_batch_file(batch, tmp) expect_true(file.exists(tmp)) lines <- readLines(tmp, warn = FALSE) expect_equal(length(lines), nrow(batch)) # Each line should be valid JSON with required top-level keys objs <- lapply(lines, jsonlite::fromJSON) keys <- lapply(objs, names) expect_true(all(vapply(keys, function(k) { all(c( "custom_id", "method", "url", "body" ) %in% k) }, logical(1)))) }) test_that("build_openai_batch_requests supports gpt-5.1 with reasoning = 'none' on responses", { data("example_writing_samples", package = "pairwiseLLM") pairs <- make_pairs(example_writing_samples) pairs <- pairs[1:1, ] td <- trait_description("overall_quality") tmpl <- set_prompt_template() # For gpt-5.1 + reasoning = "none", temperature/top_p/logprobs are allowed batch <- build_openai_batch_requests( pairs = pairs, model = "gpt-5.1", trait_name = td$name, trait_description = td$description, prompt_template = tmpl, endpoint = "responses", reasoning = "none", temperature = 0, top_p = 1, logprobs = NULL ) expect_s3_class(batch, "tbl_df") expect_equal(nrow(batch), 1L) b1 <- batch$body[[1]] expect_equal(b1$model, "gpt-5.1") expect_equal(b1$input, build_prompt( template = tmpl, trait_name = td$name, trait_desc = td$description, text1 = pairs$text1[1], text2 = pairs$text2[1] )) # reasoning should be present with effort = "none" expect_true("reasoning" %in% names(b1) || is.null(b1$reasoning) || identical(b1$reasoning$effort, "none")) }) test_that("build_openai_batch_requests errors for gpt-5.1/5.2 + reasoning != 'none' with temp", { data("example_writing_samples", package = "pairwiseLLM") pairs <- make_pairs(example_writing_samples)[1:1, ] td <- trait_description("overall_quality") tmpl <- set_prompt_template() # GPT-5.1 expect_error( build_openai_batch_requests( pairs = pairs, model = "gpt-5.1", trait_name = td$name, trait_description = td$description, endpoint = "responses", reasoning = "low", temperature = 0 ), regexp = "For gpt-5.1/5.2 with reasoning, temperature/top_p/logprobs must be NULL." ) # GPT-5.2 expect_error( build_openai_batch_requests( pairs = pairs, model = "gpt-5.2", trait_name = td$name, trait_description = td$description, endpoint = "responses", reasoning = "high", top_p = 0.5 ), regexp = "For gpt-5.1/5.2 with reasoning, temperature/top_p/logprobs must be NULL." ) }) test_that("build_openai_batch_requests allows other gpt-5* models with default temp=0", { data("example_writing_samples", package = "pairwiseLLM") pairs <- make_pairs(example_writing_samples) pairs <- pairs[1:1, ] td <- trait_description("overall_quality") tmpl <- set_prompt_template() # Pass NULL temp; since reasoning logic doesn't match 5.1/5.2 regex, # it should default to 0 inside the function and succeed. 
  batch <- build_openai_batch_requests(
    pairs = pairs,
    model = "gpt-5-mini",
    trait_name = td$name,
    trait_description = td$description,
    prompt_template = tmpl,
    endpoint = "responses",
    reasoning = "low",
    temperature = NULL,
    top_p = NULL,
    logprobs = NULL
  )

  expect_s3_class(batch, "tbl_df")
  expect_equal(nrow(batch), 1L)

  b1 <- batch$body[[1]]
  expect_equal(b1$model, "gpt-5-mini")
  # Verify temperature defaulted to 0
  expect_equal(b1$temperature, 0)
})

testthat::test_that("parse_openai_batch_output collects thoughts and message text separately for responses", {
  tmp <- tempfile(fileext = ".jsonl")
  on.exit(unlink(tmp), add = TRUE)

  # Construct a fake batch output line similar to gpt-5.1 responses
  line_obj <- list(
    custom_id = "LIVE_S01_vs_S02",
    response = list(
      status_code = 200L,
      body = list(
        object = "response",
        model = "gpt-5.1",
        reasoning = list(
          effort = "low",
          summary = list(text = "Reasoning summary. ")
        ),
        output = list(
          list(
            id = "rs_x",
            type = "reasoning",
            summary = list()
          ),
          list(
            id = "msg_x",
            type = "message",
            status = "completed",
            content = list(
              list(
                type = "output_text",
                text = "SAMPLE_2 Final answer."
              )
            ),
            role = "assistant"
          )
        ),
        usage = list(
          input_tokens = 10L,
          output_tokens = 5L,
          total_tokens = 15L
        )
      )
    ),
    error = NULL
  )

  json_line <- jsonlite::toJSON(line_obj, auto_unbox = TRUE)
  writeLines(json_line, con = tmp, useBytes = TRUE)

  res <- parse_openai_batch_output(tmp)

  testthat::expect_s3_class(res, "tbl_df")
  testthat::expect_equal(nrow(res), 1L)

  # IDs from custom_id
  testthat::expect_equal(res$custom_id, "LIVE_S01_vs_S02")
  testthat::expect_equal(res$ID1, "S01")
  testthat::expect_equal(res$ID2, "S02")

  # Basic metadata
  testthat::expect_equal(res$model, "gpt-5.1")
  testthat::expect_equal(res$object_type, "response")
  testthat::expect_equal(res$status_code, 200L)
  testthat::expect_true(is.na(res$error_message))

  # Reasoning summary should go to thoughts
  testthat::expect_equal(res$thoughts, "Reasoning summary. ")

  # Content should be the assistant message only
  testthat::expect_equal(
    res$content,
    "SAMPLE_2 Final answer."
  )

  # Tag parsing and better_id mapping
  testthat::expect_equal(res$better_sample, "SAMPLE_2")
  testthat::expect_equal(res$better_id, "S02")

  # Token usage
  testthat::expect_equal(res$prompt_tokens, 10)
  testthat::expect_equal(res$completion_tokens, 5)
  testthat::expect_equal(res$total_tokens, 15)
})

test_that("build_openai_batch_requests adds reasoning summary when include_thoughts = TRUE", {
  data("example_writing_samples", package = "pairwiseLLM")
  pairs <- make_pairs(example_writing_samples)
  pairs <- pairs[1:1, ]

  td <- trait_description("overall_quality")
  tmpl <- set_prompt_template()

  # include_thoughts = TRUE, reasoning != "none" -> summary = "auto"
  batch <- build_openai_batch_requests(
    pairs = pairs,
    model = "gpt-5.1",
    trait_name = td$name,
    trait_description = td$description,
    prompt_template = tmpl,
    endpoint = "responses",
    reasoning = "low",
    include_thoughts = TRUE
  )

  testthat::expect_s3_class(batch, "tbl_df")
  testthat::expect_equal(nrow(batch), 1L)

  b1 <- batch$body[[1]]
  testthat::expect_equal(b1$model, "gpt-5.1")
  testthat::expect_true("reasoning" %in% names(b1))
  testthat::expect_equal(b1$reasoning$effort, "low")
  testthat::expect_equal(b1$reasoning$summary, "auto")

  # include_thoughts = TRUE but reasoning = "none" -> no summary field
  batch_none <- build_openai_batch_requests(
    pairs = pairs,
    model = "gpt-5.1",
    trait_name = td$name,
    trait_description = td$description,
    prompt_template = tmpl,
    endpoint = "responses",
    reasoning = "none",
    include_thoughts = TRUE
  )
  b2 <- batch_none$body[[1]]
  testthat::expect_true("reasoning" %in% names(b2))
  testthat::expect_equal(b2$reasoning$effort, "none")
  testthat::expect_false("summary" %in% names(b2$reasoning))
})
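
# Hedged sketch, not a confirmed fixture: the responses-endpoint test above
# covers parse_openai_batch_output() for gpt-5.1 output; the sketch below
# exercises the chat.completions output shape, which is the pipeline's default
# endpoint. The line layout follows OpenAI's documented batch output format
# (choices/message/usage), and the expected columns mirror fake_batch_return()
# earlier in this file. Adjust the expectations if the parser's column
# contract differs.
testthat::test_that("parse_openai_batch_output handles chat.completions output (sketch)", {
  tmp <- tempfile(fileext = ".jsonl")
  on.exit(unlink(tmp), add = TRUE)

  line_obj <- list(
    custom_id = "BATCH_S01_vs_S02",
    response = list(
      status_code = 200L,
      body = list(
        object = "chat.completion",
        model = "gpt-4.1",
        choices = list(
          list(
            index = 0L,
            message = list(role = "assistant", content = "SAMPLE_1"),
            finish_reason = "stop"
          )
        ),
        usage = list(
          prompt_tokens = 10L,
          completion_tokens = 2L,
          total_tokens = 12L
        )
      )
    ),
    error = NULL
  )

  writeLines(jsonlite::toJSON(line_obj, auto_unbox = TRUE), con = tmp, useBytes = TRUE)

  res <- parse_openai_batch_output(tmp)

  testthat::expect_s3_class(res, "tbl_df")
  testthat::expect_equal(nrow(res), 1L)
  testthat::expect_equal(res$ID1, "S01")
  testthat::expect_equal(res$ID2, "S02")
  testthat::expect_equal(res$better_sample, "SAMPLE_1")
  testthat::expect_equal(res$better_id, "S01")
  testthat::expect_equal(res$total_tokens, 12)
})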