# =====================================================================
# test-openai_live.R
# Tests for openai_compare_pair_live() and submit_openai_pairs_live()
# =====================================================================
# Chat Completions happy path: a mocked 200 response must be flattened into a
# one-row tibble carrying IDs, model, status, content, winner and token usage.
testthat::test_that("openai_compare_pair_live parses chat.completions correctly", {
  # The sample texts below are literals, so no packaged dataset is loaded.
  td <- trait_description("overall_quality")
  tmpl <- set_prompt_template()
  ID1 <- "S01"
  ID2 <- "S02"
  text1 <- "Text 1"
  text2 <- "Text 2"

  # Minimal chat.completion payload: one assistant message plus usage counts.
  fake_body <- list(
    object = "chat.completion",
    model = "gpt-4.1",
    choices = list(list(
      message = list(
        role = "assistant",
        content = "SAMPLE_1 Some explanation."
      )
    )),
    usage = list(
      prompt_tokens = 10L,
      completion_tokens = 5L,
      total_tokens = 15L
    )
  )

  # All network-facing helpers are mocked, so no HTTP request is performed.
  testthat::with_mocked_bindings(
    .openai_api_key = function(...) "FAKEKEY",
    .openai_req_body_json = function(req, body) req,
    .openai_req_perform = function(req) structure(list(), class = "fake_resp"),
    .openai_resp_body_json = function(...) fake_body,
    .openai_resp_status = function(...) 200L,
    {
      res <- openai_compare_pair_live(
        ID1 = ID1,
        text1 = text1,
        ID2 = ID2,
        text2 = text2,
        model = "gpt-4.1",
        trait_name = td$name,
        trait_description = td$description,
        prompt_template = tmpl,
        endpoint = "chat.completions",
        temperature = 0,
        include_raw = TRUE
      )

      # Shape and identifiers
      testthat::expect_s3_class(res, "tbl_df")
      testthat::expect_equal(nrow(res), 1L)
      testthat::expect_equal(res$custom_id, sprintf("LIVE_%s_vs_%s", ID1, ID2))
      testthat::expect_equal(res$ID1, ID1)
      testthat::expect_equal(res$ID2, ID2)

      # Response metadata
      testthat::expect_equal(res$model, "gpt-4.1")
      testthat::expect_equal(res$object_type, "chat.completion")
      testthat::expect_equal(res$status_code, 200L)
      testthat::expect_true(is.na(res$error_message))

      # Parsed content and winner
      testthat::expect_equal(
        res$content,
        "SAMPLE_1 Some explanation."
      )
      testthat::expect_equal(res$better_sample, "SAMPLE_1")
      testthat::expect_equal(res$better_id, ID1)

      # Token usage
      testthat::expect_equal(res$prompt_tokens, 10)
      testthat::expect_equal(res$completion_tokens, 5)
      testthat::expect_equal(res$total_tokens, 15)

      # raw_response is a list-column holding the parsed body
      testthat::expect_true("raw_response" %in% names(res))
      testthat::expect_type(res$raw_response, "list")
      testthat::expect_equal(res$raw_response[[1]]$object, "chat.completion")
      testthat::expect_equal(res$raw_response[[1]]$model, "gpt-4.1")
    }
  )
})
# ---------------------------------------------------------------------
# Responses endpoint happy path: the two output_text chunks of the single
# output item are expected to be joined into one content string.
testthat::test_that("openai_compare_pair_live parses responses endpoint correctly", {
  trait <- trait_description("overall_quality")
  template <- set_prompt_template()
  first_id <- "S01"
  second_id <- "S02"

  text_chunks <- list(
    list(type = "output_text", text = "SAMPLE_2 A "),
    list(type = "output_text", text = "B")
  )
  mock_body <- list(
    object = "response",
    model = "gpt-5.1",
    output = list(list(content = text_chunks)),
    usage = list(input_tokens = 7L, output_tokens = 3L, total_tokens = 10L)
  )

  # Network-facing helpers stay mocked for the remainder of this test.
  testthat::local_mocked_bindings(
    .openai_api_key = function(...) "FAKEKEY",
    .openai_req_body_json = function(req, body) req,
    .openai_req_perform = function(req) structure(list(), class = "fake_resp"),
    .openai_resp_body_json = function(...) mock_body,
    .openai_resp_status = function(...) 200L
  )

  out <- openai_compare_pair_live(
    ID1 = first_id,
    text1 = "Text A",
    ID2 = second_id,
    text2 = "Text B",
    model = "gpt-5.1",
    trait_name = trait$name,
    trait_description = trait$description,
    prompt_template = template,
    endpoint = "responses",
    reasoning = "none",
    include_raw = TRUE
  )

  testthat::expect_s3_class(out, "tbl_df")
  testthat::expect_equal(out$object_type, "response")
  testthat::expect_equal(out$model, "gpt-5.1")

  # Concatenated text and the winner derived from it
  testthat::expect_equal(out$content, "SAMPLE_2 A B")
  testthat::expect_equal(out$better_sample, "SAMPLE_2")
  testthat::expect_equal(out$better_id, second_id)

  # input/output token counts surface as prompt/completion tokens
  testthat::expect_equal(out$prompt_tokens, 7)
  testthat::expect_equal(out$completion_tokens, 3)
  testthat::expect_equal(out$total_tokens, 10)

  testthat::expect_true("raw_response" %in% names(out))
  testthat::expect_equal(out$raw_response[[1]]$model, "gpt-5.1")
})
# ---------------------------------------------------------------------
# If the body parser throws, the function should still return a row that
# records the HTTP status and a fixed parse-failure message.
testthat::test_that("openai_compare_pair_live returns error row on JSON parse failure", {
  trait <- trait_description("overall_quality")
  template <- set_prompt_template()

  # The body parser is mocked to raise, simulating an unparseable response.
  testthat::local_mocked_bindings(
    .openai_api_key = function(...) "FAKEKEY",
    .openai_req_body_json = function(req, body) req,
    .openai_req_perform = function(req) structure(list(), class = "fake_resp"),
    .openai_resp_body_json = function(...) stop("boom"),
    .openai_resp_status = function(...) 500L
  )

  out <- openai_compare_pair_live(
    ID1 = "S01",
    text1 = "X",
    ID2 = "S02",
    text2 = "Y",
    model = "gpt-4.1",
    trait_name = trait$name,
    trait_description = trait$description,
    prompt_template = template,
    endpoint = "chat.completions",
    include_raw = TRUE
  )

  testthat::expect_equal(out$status_code, 500L)
  testthat::expect_equal(out$error_message, "Failed to parse JSON.")
  testthat::expect_true(is.na(out$better_sample))
  # No body could be parsed, so the raw_response entry is empty.
  testthat::expect_null(out$raw_response[[1]])
})
# ---------------------------------------------------------------------
# Parameter guard for gpt-5.1/5.2 on the responses endpoint: a reasoning
# effort other than "none" combined with temperature or top_p must raise an
# error, while reasoning = "none" proceeds to a (mocked) request.
testthat::test_that("openai_compare_pair_live enforces gpt-5.1/5.2 + reasoning constraints", {
  td <- trait_description("overall_quality")
  tmpl <- set_prompt_template()

  # The expected message fragment contains regex metacharacters ("."), so it
  # is matched literally via fixed = TRUE rather than as a pattern.

  # 1. gpt-5.1 with reasoning = "low" and temperature set should error
  testthat::expect_error(
    openai_compare_pair_live(
      ID1 = "A", text1 = "x", ID2 = "B", text2 = "y",
      model = "gpt-5.1",
      trait_name = td$name,
      trait_description = td$description,
      prompt_template = tmpl,
      endpoint = "responses",
      reasoning = "low",
      temperature = 0
    ),
    regexp = "gpt-5.1/5.2",
    fixed = TRUE
  )

  # 2. Date-stamped gpt-5.2 with reasoning = "medium" and top_p set should error
  testthat::expect_error(
    openai_compare_pair_live(
      ID1 = "A", text1 = "x", ID2 = "B", text2 = "y",
      model = "gpt-5.2-2025-12-11",
      trait_name = td$name,
      trait_description = td$description,
      prompt_template = tmpl,
      endpoint = "responses",
      reasoning = "medium",
      top_p = 0.5
    ),
    regexp = "gpt-5.1/5.2",
    fixed = TRUE
  )

  # 3. Allowed case: reasoning = "none" goes through to the mocked request
  fake_body <- list(
    object = "response",
    model = "gpt-5.1",
    output = list(list(
      content = list(list(
        type = "output_text", text = "SAMPLE_1"
      ))
    )),
    usage = list(input_tokens = 1L, output_tokens = 1L, total_tokens = 2L)
  )
  testthat::with_mocked_bindings(
    .openai_api_key = function(...) "FAKEKEY",
    .openai_req_body_json = function(req, body) req,
    .openai_req_perform = function(req) structure(list(), class = "fake_resp"),
    .openai_resp_body_json = function(...) fake_body,
    .openai_resp_status = function(...) 200L,
    {
      res <- openai_compare_pair_live(
        ID1 = "A", text1 = "x", ID2 = "B", text2 = "y",
        model = "gpt-5.1",
        trait_name = td$name,
        trait_description = td$description,
        prompt_template = tmpl,
        endpoint = "responses",
        reasoning = "none",
        include_raw = TRUE
      )
      testthat::expect_equal(res$better_id, "A")
    }
  )
})
# ---------------------------------------------------------------------
# A gpt-5 family model other than 5.1/5.2 should run without an explicit
# temperature, and the outgoing request body should carry temperature 0.
testthat::test_that("openai_compare_pair_live allows other gpt-5* models with temp=0", {
  trait <- trait_description("overall_quality")
  template <- set_prompt_template()

  # Successful responses payload declaring SAMPLE_2 the winner
  mock_body <- list(
    object = "response",
    model = "gpt-5-mini",
    output = list(list(
      content = list(list(type = "output_text", text = "SAMPLE_2"))
    )),
    usage = list(input_tokens = 1L, output_tokens = 1L, total_tokens = 2L)
  )

  # Holds the request body handed to the (mocked) JSON body setter.
  sent <- new.env()
  sent$body <- NULL

  testthat::local_mocked_bindings(
    .openai_api_key = function(...) "FAKEKEY",
    .openai_req_body_json = function(req, body) {
      sent$body <- body
      req
    },
    .openai_req_perform = function(req) structure(list(), class = "fake_resp"),
    .openai_resp_body_json = function(...) mock_body,
    .openai_resp_status = function(...) 200L
  )

  out <- openai_compare_pair_live(
    ID1 = "A", text1 = "x", ID2 = "B", text2 = "y",
    model = "gpt-5-mini",
    trait_name = trait$name,
    trait_description = trait$description,
    prompt_template = template,
    endpoint = "responses",
    include_raw = TRUE
  )

  testthat::expect_equal(out$better_id, "B")
  # Temperature was not supplied by the caller; the body should hold 0.
  testthat::expect_equal(sent$body$temperature, 0)
})
# ---------------------------------------------------------------------
# Zero input pairs: the result has no rows, still exposes a thoughts column,
# and has no raw_response column when include_raw is not requested.
testthat::test_that("submit_openai_pairs_live returns empty tibble for zero rows", {
  trait <- trait_description("overall_quality")
  template <- set_prompt_template()

  no_text <- character(0)
  no_pairs <- tibble::tibble(
    ID1 = no_text,
    text1 = no_text,
    ID2 = no_text,
    text2 = no_text
  )

  out <- submit_openai_pairs_live(
    pairs = no_pairs,
    model = "gpt-4.1",
    trait_name = trait$name,
    trait_description = trait$description,
    prompt_template = template,
    endpoint = "chat.completions"
  )

  out_cols <- names(out)
  testthat::expect_equal(nrow(out), 0L)
  testthat::expect_true("thoughts" %in% out_cols)
  testthat::expect_false("raw_response" %in% out_cols)
})
# ---------------------------------------------------------------------
# Zero input pairs with include_raw = TRUE: the empty result must also carry
# a list-typed raw_response column.
testthat::test_that("submit_openai_pairs_live with include_raw=TRUE returns raw_response column", {
  trait <- trait_description("overall_quality")
  template <- set_prompt_template()

  no_text <- character(0)
  no_pairs <- tibble::tibble(
    ID1 = no_text,
    text1 = no_text,
    ID2 = no_text,
    text2 = no_text
  )

  out <- submit_openai_pairs_live(
    pairs = no_pairs,
    model = "gpt-4.1",
    trait_name = trait$name,
    trait_description = trait$description,
    prompt_template = template,
    endpoint = "chat.completions",
    include_raw = TRUE
  )

  out_cols <- names(out)
  testthat::expect_equal(nrow(out), 0L)
  testthat::expect_true("thoughts" %in% out_cols)
  testthat::expect_true("raw_response" %in% out_cols)
  testthat::expect_type(out$raw_response, "list")
})
# ---------------------------------------------------------------------
# The batch helper should invoke the single-pair function once per input row
# and stack the returned rows in order.
testthat::test_that("submit_openai_pairs_live calls openai_compare_pair_live row-wise", {
  pair_tbl <- tibble::tibble(
    ID1 = c("S01", "S03"),
    text1 = c("Text 1", "Text 3"),
    ID2 = c("S02", "S04"),
    text2 = c("Text 2", "Text 4")
  )
  trait <- trait_description("overall_quality")
  template <- set_prompt_template()

  # Records the ID pair of every call made to the mocked single-pair function.
  recorder <- new.env()
  recorder$calls <- list()

  # Builds the one-row tibble the mock returns for a given pair and verdict.
  make_row <- function(ID1, ID2, chosen) {
    winner <- if (chosen == "SAMPLE_1") ID1 else ID2
    tibble::tibble(
      custom_id = sprintf("LIVE_%s_vs_%s", ID1, ID2),
      ID1 = ID1,
      ID2 = ID2,
      model = "gpt-4.1",
      object_type = "chat.completion",
      status_code = 200L,
      error_message = NA_character_,
      content = sprintf("%s", chosen),
      better_sample = chosen,
      better_id = winner,
      prompt_tokens = 10,
      completion_tokens = 5,
      total_tokens = 15
    )
  }

  testthat::local_mocked_bindings(
    openai_compare_pair_live = function(ID1, text1, ID2, text2, model, trait_name,
                                        trait_description, prompt_template, endpoint, api_key,
                                        include_raw, ...) {
      recorder$calls <- append(recorder$calls, list(list(ID1 = ID1, ID2 = ID2)))
      # First pair picks SAMPLE_1, every other pair picks SAMPLE_2.
      verdict <- if (ID1 == "S01") "SAMPLE_1" else "SAMPLE_2"
      make_row(ID1, ID2, verdict)
    }
  )

  out <- submit_openai_pairs_live(
    pairs = pair_tbl,
    model = "gpt-4.1",
    trait_name = trait$name,
    trait_description = trait$description,
    prompt_template = template,
    endpoint = "chat.completions",
    include_raw = FALSE,
    verbose = FALSE,
    progress = FALSE
  )

  testthat::expect_length(recorder$calls, 2L)
  testthat::expect_equal(out$better_id, c("S01", "S04"))
})
# ---------------------------------------------------------------------
# With a top-level reasoning summary plus a message output item, the summary
# should land in `thoughts` and only the message text in `content`.
testthat::test_that("openai_compare_pair_live collects thoughts and message text separately for responses", {
  trait <- trait_description("overall_quality")
  template <- set_prompt_template()
  first_id <- "S01"
  second_id <- "S02"

  # Output holds a reasoning item with an empty summary and the final message.
  reasoning_item <- list(id = "rs_x", type = "reasoning", summary = list())
  message_item <- list(
    id = "msg_x",
    type = "message",
    status = "completed",
    content = list(
      list(type = "output_text", text = "SAMPLE_2 Final answer.")
    ),
    role = "assistant"
  )
  mock_body <- list(
    object = "response",
    model = "gpt-5.1",
    reasoning = list(
      effort = "low",
      summary = list(text = "Reasoning summary. ")
    ),
    output = list(reasoning_item, message_item),
    usage = list(input_tokens = 10L, output_tokens = 5L, total_tokens = 15L)
  )

  testthat::local_mocked_bindings(
    .openai_api_key = function(...) "FAKEKEY",
    .openai_req_body_json = function(req, body) req,
    .openai_req_perform = function(req) structure(list(), class = "fake_resp"),
    .openai_resp_body_json = function(...) mock_body,
    .openai_resp_status = function(...) 200L
  )

  out <- openai_compare_pair_live(
    ID1 = first_id,
    text1 = "Text A",
    ID2 = second_id,
    text2 = "Text B",
    model = "gpt-5.1",
    trait_name = trait$name,
    trait_description = trait$description,
    prompt_template = template,
    endpoint = "responses",
    reasoning = "low",
    include_thoughts = TRUE,
    include_raw = TRUE
  )

  testthat::expect_s3_class(out, "tbl_df")
  testthat::expect_equal(out$object_type, "response")
  # The reasoning summary is reported as thoughts ...
  testthat::expect_equal(out$thoughts, "Reasoning summary. ")
  # ... while content holds the assistant message only.
  testthat::expect_equal(out$content, "SAMPLE_2 Final answer.")
  testthat::expect_equal(out$better_sample, "SAMPLE_2")
  testthat::expect_equal(out$better_id, second_id)
})
# ---------------------------------------------------------------------
# When there is no top-level reasoning summary, summary_text entries inside
# the reasoning output item should be collected into `thoughts`.
testthat::test_that("openai_compare_pair_live picks up reasoning summary from output items", {
  trait <- trait_description("overall_quality")
  template <- set_prompt_template()
  first_id <- "S01"
  second_id <- "S02"

  reasoning_item <- list(
    id = "rs_x",
    type = "reasoning",
    summary = list(
      list(type = "summary_text", text = "Reasoning sentence 1."),
      list(type = "summary_text", text = "Reasoning sentence 2.")
    )
  )
  message_item <- list(
    id = "msg_x",
    type = "message",
    status = "completed",
    content = list(
      list(type = "output_text", text = "SAMPLE_2 Final answer.")
    ),
    role = "assistant"
  )
  mock_body <- list(
    object = "response",
    model = "gpt-5.1",
    # Deliberately no reasoning$summary at the top level
    reasoning = list(effort = "low"),
    output = list(reasoning_item, message_item),
    usage = list(input_tokens = 5L, output_tokens = 5L, total_tokens = 10L)
  )

  testthat::local_mocked_bindings(
    .openai_api_key = function(...) "FAKEKEY",
    .openai_req_body_json = function(req, body) req,
    .openai_req_perform = function(req) structure(list(), class = "fake_resp"),
    .openai_resp_body_json = function(...) mock_body,
    .openai_resp_status = function(...) 200L
  )

  out <- openai_compare_pair_live(
    ID1 = first_id,
    text1 = "Text A",
    ID2 = second_id,
    text2 = "Text B",
    model = "gpt-5.1",
    trait_name = trait$name,
    trait_description = trait$description,
    prompt_template = template,
    endpoint = "responses",
    reasoning = "low",
    include_thoughts = TRUE,
    include_raw = TRUE
  )

  testthat::expect_s3_class(out, "tbl_df")
  testthat::expect_equal(out$object_type, "response")
  # Both summary_text entries must appear in thoughts.
  testthat::expect_match(out$thoughts, "Reasoning sentence 1.", fixed = TRUE)
  testthat::expect_match(out$thoughts, "Reasoning sentence 2.", fixed = TRUE)
  # Content holds the assistant message only.
  testthat::expect_equal(out$content, "SAMPLE_2 Final answer.")
  testthat::expect_equal(out$better_sample, "SAMPLE_2")
  testthat::expect_equal(out$better_id, second_id)
})
# Argument validation: a numeric ID1 and a list-valued text2 must each be
# rejected with a message naming the offending argument.
testthat::test_that("openai_compare_pair_live validates input types", {
  trait <- trait_description("overall_quality")

  # Numeric ID1
  testthat::expect_error(
    openai_compare_pair_live(
      ID1 = 123, text1 = "t", ID2 = "B", text2 = "t",
      model = "gpt-4",
      trait_name = trait$name,
      trait_description = trait$description
    ),
    regexp = "ID1 invalid"
  )

  # List-valued text2
  testthat::expect_error(
    openai_compare_pair_live(
      ID1 = "A", text1 = "t", ID2 = "B", text2 = list(),
      model = "gpt-4",
      trait_name = trait$name,
      trait_description = trait$description
    ),
    regexp = "text2 invalid"
  )
})
# A 400 response whose body has an `error$message` should produce a row with
# that status, that message, and missing content.
testthat::test_that("openai_compare_pair_live handles HTTP errors gracefully", {
  trait <- trait_description("overall_quality")

  # Body shape of a simulated 400 Bad Request
  error_body <- list(error = list(message = "Invalid parameter"))

  testthat::local_mocked_bindings(
    .openai_api_key = function(...) "KEY",
    .openai_req_body_json = function(req, ...) req,
    .openai_req_perform = function(...) "RESP",
    .openai_resp_status = function(...) 400L,
    .openai_resp_body_json = function(...) error_body
  )

  out <- openai_compare_pair_live(
    ID1 = "A", text1 = "t", ID2 = "B", text2 = "t",
    model = "gpt-4",
    trait_name = trait$name,
    trait_description = trait$description
  )

  testthat::expect_equal(out$status_code, 400L)
  testthat::expect_equal(out$error_message, "Invalid parameter")
  testthat::expect_true(is.na(out$content))
})
# Older payload layout: the summary text sits at body$reasoning$summary$text
# and should still be reported as thoughts.
testthat::test_that("openai_compare_pair_live parses legacy reasoning summary location", {
  trait <- trait_description("overall_quality")

  message_item <- list(
    type = "message",
    content = list(list(type = "output_text", text = "Content"))
  )
  legacy_body <- list(
    object = "response",
    model = "gpt-5.1",
    reasoning = list(
      effort = "low",
      summary = list(text = "Legacy summary.")
    ),
    output = list(message_item)
  )

  testthat::local_mocked_bindings(
    .openai_api_key = function(...) "KEY",
    .openai_req_body_json = function(req, ...) req,
    .openai_req_perform = function(...) "RESP",
    .openai_resp_status = function(...) 200L,
    .openai_resp_body_json = function(...) legacy_body
  )

  out <- openai_compare_pair_live(
    ID1 = "A", text1 = "t", ID2 = "B", text2 = "t",
    model = "gpt-5.1",
    trait_name = trait$name,
    trait_description = trait$description,
    endpoint = "responses"
  )

  testthat::expect_equal(out$thoughts, "Legacy summary.")
  testthat::expect_equal(out$content, "Content")
})
# Input validation for the batch helper: required columns must be present
# and status_every must be a positive integer.
testthat::test_that("submit_openai_pairs_live validates inputs", {
  trait <- trait_description("overall_quality")

  # A pairs table lacking ID2/text2 is rejected.
  incomplete_pairs <- tibble::tibble(ID1 = "A", text1 = "t")
  testthat::expect_error(
    submit_openai_pairs_live(
      incomplete_pairs, "gpt-4", trait$name, trait$description
    ),
    regexp = "must contain columns"
  )

  # A complete pairs table with status_every = 0 is rejected.
  complete_pairs <- tibble::tibble(
    ID1 = "A", text1 = "t", ID2 = "B", text2 = "t"
  )
  testthat::expect_error(
    submit_openai_pairs_live(
      complete_pairs, "gpt-4", trait$name, trait$description,
      status_every = 0
    ),
    regexp = "status_every` must be a single positive integer"
  )
})
# Remaining argument checks: numeric ID2, list-valued text1 and numeric model
# must each fail with a message naming the argument.
testthat::test_that("openai_compare_pair_live validates ID2, text1, and model", {
  trait <- trait_description("overall_quality")
  template <- set_prompt_template()

  # ID2 given as a number
  testthat::expect_error(
    openai_compare_pair_live(
      ID1 = "A", text1 = "A", ID2 = 123, text2 = "B",
      model = "gpt-4",
      trait_name = trait$name,
      trait_description = trait$description,
      prompt_template = template
    ),
    regexp = "ID2 invalid"
  )

  # text1 given as a list
  testthat::expect_error(
    openai_compare_pair_live(
      ID1 = "A", text1 = list(), ID2 = "B", text2 = "B",
      model = "gpt-4",
      trait_name = trait$name,
      trait_description = trait$description,
      prompt_template = template
    ),
    regexp = "text1 invalid"
  )

  # model given as a number
  testthat::expect_error(
    openai_compare_pair_live(
      ID1 = "A", text1 = "A", ID2 = "B", text2 = "B",
      model = 123,
      trait_name = trait$name,
      trait_description = trait$description,
      prompt_template = template
    ),
    regexp = "model invalid"
  )
})
# top_p and logprobs supplied by the caller should appear unchanged in the
# request body for both endpoints.
testthat::test_that("openai_compare_pair_live passes optional parameters (top_p, logprobs)", {
  trait <- trait_description("overall_quality")
  template <- set_prompt_template()

  # Holds the most recent request body seen by the mocked JSON body setter.
  sent <- new.env()
  sent$body <- NULL

  testthat::local_mocked_bindings(
    .openai_api_key = function(...) "KEY",
    .openai_req_body_json = function(req, body) {
      sent$body <- body
      req
    },
    .openai_req_perform = function(req) structure(list(), class = "fake_resp"),
    .openai_resp_body_json = function(...) list(),
    .openai_resp_status = function(...) 200L
  )

  # 1. Chat Completions
  openai_compare_pair_live(
    "A", "t", "B", "t", "gpt-4", trait$name, trait$description, template,
    endpoint = "chat.completions",
    top_p = 0.9,
    logprobs = TRUE
  )
  testthat::expect_equal(sent$body$top_p, 0.9)
  testthat::expect_equal(sent$body$logprobs, TRUE)

  # 2. Responses (values differ from step 1, so a stale capture would fail)
  openai_compare_pair_live(
    "A", "t", "B", "t", "gpt-5.1", trait$name, trait$description, template,
    endpoint = "responses",
    top_p = 0.8,
    logprobs = FALSE
  )
  testthat::expect_equal(sent$body$top_p, 0.8)
  testthat::expect_equal(sent$body$logprobs, FALSE)
})
# An error status with an empty body (no `error` element) should yield the
# generic "HTTP <status>" message.
testthat::test_that("openai_compare_pair_live constructs generic HTTP error message", {
  trait <- trait_description("overall_quality")

  testthat::local_mocked_bindings(
    .openai_api_key = function(...) "KEY",
    .openai_req_body_json = function(req, body) req,
    .openai_req_perform = function(req) structure(list(), class = "fake_resp"),
    # Empty body: nothing to extract an API error message from
    .openai_resp_body_json = function(...) list(),
    .openai_resp_status = function(...) 418L
  )

  out <- openai_compare_pair_live(
    "A", "t", "B", "t", "gpt-4", trait$name, trait$description
  )

  testthat::expect_equal(out$status_code, 418L)
  testthat::expect_equal(out$error_message, "HTTP 418")
})
# A reasoning output item whose summary is a data.frame with a `text` column
# should still have its text reported as thoughts.
testthat::test_that("openai_compare_pair_live parses dataframe reasoning summaries", {
  trait <- trait_description("overall_quality")

  summary_df <- data.frame(text = "DF Summary", stringsAsFactors = FALSE)
  mock_body <- list(
    object = "response",
    model = "gpt-5.1",
    output = list(
      list(type = "reasoning", summary = summary_df)
    )
  )

  testthat::local_mocked_bindings(
    .openai_api_key = function(...) "KEY",
    .openai_req_body_json = function(req, body) req,
    .openai_req_perform = function(req) structure(list(), class = "fake_resp"),
    .openai_resp_body_json = function(...) mock_body,
    .openai_resp_status = function(...) 200L
  )

  out <- openai_compare_pair_live(
    "A", "t", "B", "t", "gpt-5.1", trait$name, trait$description,
    endpoint = "responses"
  )

  testthat::expect_equal(out$thoughts, "DF Summary")
})