# =====================================================================
# test-openai_batch.R
# Tests for build_openai_batch_requests() and related batch helpers
# =====================================================================

testthat::test_that("build_openai_batch_requests builds valid chat.completions JSONL objects", {
  data("example_writing_samples", package = "pairwiseLLM")
  pairs <- make_pairs(example_writing_samples)
  pairs <- pairs[1:2, ]

  td <- trait_description("overall_quality")
  tmpl <- set_prompt_template()

  batch <- build_openai_batch_requests(
    pairs = pairs,
    model = "gpt-4.1",
    trait_name = td$name,
    trait_description = td$description,
    prompt_template = tmpl,
    endpoint = "chat.completions",
    temperature = 0,
    top_p = 1,
    logprobs = NULL
  )

  testthat::expect_s3_class(batch, "tbl_df")
  testthat::expect_equal(nrow(batch), 2L)
  testthat::expect_true(all(c("custom_id", "method", "url", "body") %in% names(batch)))

  # Body structure check
  b1 <- batch$body[[1]]
  testthat::expect_equal(b1$model, "gpt-4.1")
  testthat::expect_true(is.list(b1$messages))

  roles <- vapply(b1$messages, function(m) m[["role"]], character(1))
  testthat::expect_true(any(roles == "user"))
})

testthat::test_that("write_openai_batch_file writes JSONL file", {
  data("example_writing_samples", package = "pairwiseLLM")
  pairs <- make_pairs(example_writing_samples)
  pairs <- pairs[1:2, ]

  td <- trait_description("overall_quality")
  tmpl <- set_prompt_template()

  batch <- build_openai_batch_requests(
    pairs = pairs,
    model = "gpt-4.1",
    trait_name = td$name,
    trait_description = td$description,
    prompt_template = tmpl,
    endpoint = "chat.completions"
  )

  tmp <- tempfile("openai-batch-", fileext = ".jsonl")
  write_openai_batch_file(batch, tmp)

  testthat::expect_true(file.exists(tmp))

  lines <- readLines(tmp, warn = FALSE)
  testthat::expect_equal(length(lines), nrow(batch))

  # Each line should be valid JSON with required top-level keys
  objs <- lapply(lines, jsonlite::fromJSON)
  keys <- lapply(objs, names)
  testthat::expect_true(all(vapply(keys, function(k) {
    all(c("custom_id", "method", "url", "body") %in% k)
  }, logical(1))))
})
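# For orientation, each line that write_openai_batch_file() emits should be a
# single JSON object in the OpenAI Batch request format. The sketch below is
# illustrative only: the custom_id scheme shown is an assumption, and the url
# follows from the endpoint argument ("POST" and "/v1/chat/completions" are
# the standard Batch API values for the chat.completions endpoint):
#
#   {"custom_id": "S01_vs_S02",
#    "method": "POST",
#    "url": "/v1/chat/completions",
#    "body": {"model": "gpt-4.1", "messages": [...], "temperature": 0}}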
testthat::test_that("build_openai_batch_requests supports gpt-5.1 with reasoning = 'none' on responses", {
  data("example_writing_samples", package = "pairwiseLLM")
  pairs <- make_pairs(example_writing_samples)
  pairs <- pairs[1:1, ]

  td <- trait_description("overall_quality")
  tmpl <- set_prompt_template()

  # For gpt-5.1 + reasoning = "none", temperature/top_p/logprobs are allowed
  batch <- build_openai_batch_requests(
    pairs = pairs,
    model = "gpt-5.1",
    trait_name = td$name,
    trait_description = td$description,
    prompt_template = tmpl,
    endpoint = "responses",
    reasoning = "none",
    temperature = 0,
    top_p = 1,
    logprobs = NULL
  )

  testthat::expect_s3_class(batch, "tbl_df")
  testthat::expect_equal(nrow(batch), 1L)

  b1 <- batch$body[[1]]
  testthat::expect_equal(b1$model, "gpt-5.1")
  testthat::expect_equal(b1$input, build_prompt(
    template = tmpl,
    trait_name = td$name,
    trait_desc = td$description,
    text1 = pairs$text1[1],
    text2 = pairs$text2[1]
  ))

  # reasoning should be present with effort = "none" (the original disjunction
  # here was a tautology and could never fail)
  testthat::expect_true("reasoning" %in% names(b1))
  testthat::expect_identical(b1$reasoning$effort, "none")
})

testthat::test_that("build_openai_batch_requests errors for gpt-5.1 + reasoning != 'none' with temp/top_p/logprobs", {
  data("example_writing_samples", package = "pairwiseLLM")
  pairs <- make_pairs(example_writing_samples)
  pairs <- pairs[1:1, ]

  td <- trait_description("overall_quality")
  tmpl <- set_prompt_template()

  testthat::expect_error(
    build_openai_batch_requests(
      pairs = pairs,
      model = "gpt-5.1",
      trait_name = td$name,
      trait_description = td$description,
      prompt_template = tmpl,
      endpoint = "responses",
      reasoning = "low", # <- not 'none'
      temperature = 0,
      top_p = 1,
      logprobs = NULL
    ),
    regexp = "For gpt-5.1/5.2 with reasoning"
  )
})

testthat::test_that("build_openai_batch_requests allows other gpt-5* models with default temp=0", {
  data("example_writing_samples", package = "pairwiseLLM")
  pairs <- make_pairs(example_writing_samples)
  pairs <- pairs[1:1, ]

  td <- trait_description("overall_quality")
  tmpl <- set_prompt_template()

  batch <- build_openai_batch_requests(
    pairs = pairs,
    model = "gpt-5-mini",
    trait_name = td$name,
    trait_description = td$description,
    prompt_template = tmpl,
    endpoint = "responses",
    reasoning = "low",
    temperature = NULL,
    top_p = NULL,
    logprobs = NULL
  )

  testthat::expect_s3_class(batch, "tbl_df")
  testthat::expect_equal(nrow(batch), 1L)

  # Verify temperature defaulted to 0
  testthat::expect_equal(batch$body[[1]]$temperature, 0)
})

testthat::test_that("parse_openai_batch_output collects thoughts and message text separately for responses", {
  tmp <- tempfile(fileext = ".jsonl")
  on.exit(unlink(tmp), add = TRUE)

  # Construct a fake batch output line similar to gpt-5.1 responses
  line_obj <- list(
    custom_id = "LIVE_S01_vs_S02",
    response = list(
      status_code = 200L,
      body = list(
        object = "response",
        model = "gpt-5.1",
        reasoning = list(
          effort = "low",
          summary = list(text = "Reasoning summary. ")
        ),
        output = list(
          list(
            id = "rs_x",
            type = "reasoning",
            summary = list()
          ),
          list(
            id = "msg_x",
            type = "message",
            status = "completed",
            content = list(
              list(
                type = "output_text",
                text = "SAMPLE_2 Final answer."
              )
            ),
            role = "assistant"
          )
        ),
        usage = list(
          input_tokens = 10L,
          output_tokens = 5L,
          total_tokens = 15L
        )
      )
    ),
    error = NULL
  )

  json_line <- jsonlite::toJSON(line_obj, auto_unbox = TRUE)
  writeLines(json_line, con = tmp, useBytes = TRUE)

  res <- parse_openai_batch_output(tmp)

  testthat::expect_s3_class(res, "tbl_df")
  testthat::expect_equal(nrow(res), 1L)

  # IDs from custom_id
  testthat::expect_equal(res$custom_id, "LIVE_S01_vs_S02")
  testthat::expect_equal(res$ID1, "S01")
  testthat::expect_equal(res$ID2, "S02")

  # Basic metadata
  testthat::expect_equal(res$model, "gpt-5.1")
  testthat::expect_equal(res$object_type, "response")
  testthat::expect_equal(res$status_code, 200L)
  testthat::expect_true(is.na(res$error_message))

  # Reasoning summary should go to thoughts
  testthat::expect_equal(res$thoughts, "Reasoning summary. ")

  # Content should be the assistant message only
  testthat::expect_equal(
    res$content,
    "SAMPLE_2 Final answer."
  )

  # Tag parsing and better_id mapping
  testthat::expect_equal(res$better_sample, "SAMPLE_2")
  testthat::expect_equal(res$better_id, "S02")

  # Token usage
  testthat::expect_equal(res$prompt_tokens, 10)
  testthat::expect_equal(res$completion_tokens, 5)
  testthat::expect_equal(res$total_tokens, 15)
})
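# A possible companion test (sketch): the error branch of
# parse_openai_batch_output(). The assertions assume that a failed line
# (non-200 status, non-NULL error) yields a row with the status code and a
# populated error_message -- that contract is an assumption, not something
# established above, so the test is skipped until confirmed against the
# implementation.
testthat::test_that("parse_openai_batch_output surfaces errors for failed lines (sketch)", {
  testthat::skip("Illustrative sketch; confirm the error contract before enabling.")

  tmp <- tempfile(fileext = ".jsonl")
  on.exit(unlink(tmp), add = TRUE)

  line_obj <- list(
    custom_id = "LIVE_S01_vs_S02",
    response = list(status_code = 500L, body = NULL),
    error = list(code = "server_error", message = "Upstream failure")
  )
  writeLines(jsonlite::toJSON(line_obj, auto_unbox = TRUE), con = tmp, useBytes = TRUE)

  res <- parse_openai_batch_output(tmp)

  testthat::expect_equal(nrow(res), 1L)
  testthat::expect_equal(res$status_code, 500L)
  testthat::expect_false(is.na(res$error_message))
})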
testthat::test_that("build_openai_batch_requests adds reasoning summary when include_thoughts = TRUE", {
  data("example_writing_samples", package = "pairwiseLLM")
  pairs <- make_pairs(example_writing_samples)
  pairs <- pairs[1:1, ]

  td <- trait_description("overall_quality")
  tmpl <- set_prompt_template()

  # include_thoughts = TRUE, reasoning != "none" -> summary = "auto"
  batch <- build_openai_batch_requests(
    pairs = pairs,
    model = "gpt-5.1",
    trait_name = td$name,
    trait_description = td$description,
    prompt_template = tmpl,
    endpoint = "responses",
    reasoning = "low",
    include_thoughts = TRUE
  )

  testthat::expect_s3_class(batch, "tbl_df")
  testthat::expect_equal(nrow(batch), 1L)

  b1 <- batch$body[[1]]
  testthat::expect_equal(b1$model, "gpt-5.1")
  testthat::expect_true("reasoning" %in% names(b1))
  testthat::expect_equal(b1$reasoning$effort, "low")
  testthat::expect_equal(b1$reasoning$summary, "auto")

  # include_thoughts = TRUE but reasoning = "none" -> no summary field
  batch_none <- build_openai_batch_requests(
    pairs = pairs,
    model = "gpt-5.1",
    trait_name = td$name,
    trait_description = td$description,
    prompt_template = tmpl,
    endpoint = "responses",
    reasoning = "none",
    include_thoughts = TRUE
  )

  b2 <- batch_none$body[[1]]
  testthat::expect_true("reasoning" %in% names(b2))
  testthat::expect_equal(b2$reasoning$effort, "none")
  testthat::expect_false("summary" %in% names(b2$reasoning))
})

# ---------------------------------------------------------------------
# Coverage improvements
# ---------------------------------------------------------------------

testthat::test_that("build_openai_batch_requests handles empty pairs tibble", {
  # Covers the n == 0L check
  empty_pairs <- tibble::tibble(
    ID1 = character(),
    text1 = character(),
    ID2 = character(),
    text2 = character()
  )

  td <- trait_description("overall_quality")
  tmpl <- set_prompt_template()

  batch <- build_openai_batch_requests(
    pairs = empty_pairs,
    model = "gpt-4.1",
    trait_name = td$name,
    trait_description = td$description,
    prompt_template = tmpl
  )

  testthat::expect_s3_class(batch, "tbl_df")
  testthat::expect_equal(nrow(batch), 0L)
  testthat::expect_named(batch, c("custom_id", "method", "url", "body"))
})

testthat::test_that("build_openai_batch_requests warns if include_thoughts=TRUE for non-reasoning model", {
  # Covers the warning branch when is_reasoning_model is FALSE but include_thoughts is TRUE
  data("example_writing_samples", package = "pairwiseLLM")
  pairs <- make_pairs(example_writing_samples)[1:1, ]

  td <- trait_description("overall_quality")
  tmpl <- set_prompt_template()

  testthat::expect_warning(
    build_openai_batch_requests(
      pairs = pairs,
      model = "gpt-4o", # not a reasoning model
      trait_name = td$name,
      trait_description = td$description,
      prompt_template = tmpl,
      endpoint = "responses",
      include_thoughts = TRUE
    ),
    "include_thoughts requested for non-reasoning model"
  )
})

testthat::test_that("build_openai_batch_requests passes top_p and logprobs to body", {
  # Covers the lines adding optional parameters to the body list for both endpoints
  data("example_writing_samples", package = "pairwiseLLM")
  pairs <- make_pairs(example_writing_samples)[1:1, ]

  td <- trait_description("overall_quality")
  tmpl <- set_prompt_template()

  # 1. Chat Completions
  batch_chat <- build_openai_batch_requests(
    pairs = pairs,
    model = "gpt-4.1",
    trait_name = td$name,
    trait_description = td$description,
    prompt_template = tmpl,
    endpoint = "chat.completions",
    temperature = 0.5,
    top_p = 0.9,
    logprobs = TRUE
  )

  body_chat <- batch_chat$body[[1]]
  testthat::expect_equal(body_chat$temperature, 0.5)
  testthat::expect_equal(body_chat$top_p, 0.9)
  testthat::expect_equal(body_chat$logprobs, TRUE)

  # 2. Responses
  batch_resp <- build_openai_batch_requests(
    pairs = pairs,
    model = "gpt-4.1",
    trait_name = td$name,
    trait_description = td$description,
    prompt_template = tmpl,
    endpoint = "responses",
    temperature = 0.7,
    top_p = 0.8,
    logprobs = FALSE
  )

  body_resp <- batch_resp$body[[1]]
  testthat::expect_equal(body_resp$temperature, 0.7)
  testthat::expect_equal(body_resp$top_p, 0.8)
  testthat::expect_equal(body_resp$logprobs, FALSE)
})
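# Round-trip sanity check: optional sampling parameters set at build time
# should survive serialization to JSONL and deserialization with jsonlite.
# This is a sketch that only leans on behavior exercised above
# (build_openai_batch_requests(), write_openai_batch_file(),
# jsonlite::fromJSON()); it assumes the body is written to disk verbatim.
testthat::test_that("batch request bodies round-trip through the JSONL file", {
  data("example_writing_samples", package = "pairwiseLLM")
  pairs <- make_pairs(example_writing_samples)[1:1, ]

  td <- trait_description("overall_quality")
  tmpl <- set_prompt_template()

  batch <- build_openai_batch_requests(
    pairs = pairs,
    model = "gpt-4.1",
    trait_name = td$name,
    trait_description = td$description,
    prompt_template = tmpl,
    endpoint = "chat.completions",
    temperature = 0.5,
    top_p = 0.9,
    logprobs = TRUE
  )

  tmp <- tempfile(fileext = ".jsonl")
  on.exit(unlink(tmp), add = TRUE)
  write_openai_batch_file(batch, tmp)

  obj <- jsonlite::fromJSON(readLines(tmp, warn = FALSE)[[1]])
  testthat::expect_equal(obj$custom_id, batch$custom_id[[1]])
  testthat::expect_equal(obj$body$model, "gpt-4.1")
  testthat::expect_equal(obj$body$temperature, 0.5)
  testthat::expect_equal(obj$body$top_p, 0.9)
})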