# =====================================================================
# test-openai_live.R
# Tests for openai_compare_pair_live() and submit_openai_pairs_live()
# =====================================================================
testthat::test_that("openai_compare_pair_live parses chat.completions
correctly", {
  # Drives the chat.completions endpoint with the `.openai_*` helpers below
  # replaced by mocks, then checks every column derived from the canned body.
  #
  # The previous `data("example_writing_samples", ...)` call was dropped: the
  # dataset was never referenced here and `data()` loads into the global
  # environment by default, leaking state out of the test.
  td <- trait_description("overall_quality")
  tmpl <- set_prompt_template()

  ID1 <- "S01"
  ID2 <- "S02"
  text1 <- "Text 1"
  text2 <- "Text 2"

  # Canned chat.completion payload returned by the mocked body parser.
  fake_body <- list(
    object = "chat.completion",
    model = "gpt-4.1",
    choices = list(list(
      message = list(
        role = "assistant",
        content = "SAMPLE_1 Some explanation."
      )
    )),
    usage = list(
      prompt_tokens = 10L,
      completion_tokens = 5L,
      total_tokens = 15L
    )
  )

  testthat::with_mocked_bindings(
    .openai_api_key = function(...) "FAKEKEY",
    .openai_req_body_json = function(req, body) req,
    .openai_req_perform = function(req) structure(list(), class = "fake_resp"),
    .openai_resp_body_json = function(...) fake_body,
    .openai_resp_status = function(...) 200L,
    {
      res <- openai_compare_pair_live(
        ID1 = ID1,
        text1 = text1,
        ID2 = ID2,
        text2 = text2,
        model = "gpt-4.1",
        trait_name = td$name,
        trait_description = td$description,
        prompt_template = tmpl,
        endpoint = "chat.completions",
        temperature = 0,
        include_raw = TRUE
      )

      # Shape and identifiers
      testthat::expect_s3_class(res, "tbl_df")
      testthat::expect_equal(nrow(res), 1L)
      testthat::expect_equal(res$custom_id, sprintf("LIVE_%s_vs_%s", ID1, ID2))
      testthat::expect_equal(res$ID1, ID1)
      testthat::expect_equal(res$ID2, ID2)

      # Fields copied from the mocked response
      testthat::expect_equal(res$model, "gpt-4.1")
      testthat::expect_equal(res$object_type, "chat.completion")
      testthat::expect_equal(res$status_code, 200L)
      testthat::expect_true(is.na(res$error_message))
      testthat::expect_equal(
        res$content,
        "SAMPLE_1 Some explanation."
      )

      # Winner extracted from the content
      testthat::expect_equal(res$better_sample, "SAMPLE_1")
      testthat::expect_equal(res$better_id, ID1)

      # Token usage
      testthat::expect_equal(res$prompt_tokens, 10)
      testthat::expect_equal(res$completion_tokens, 5)
      testthat::expect_equal(res$total_tokens, 15)

      # raw_response
      testthat::expect_true("raw_response" %in% names(res))
      testthat::expect_type(res$raw_response, "list")
      testthat::expect_equal(res$raw_response[[1]]$object, "chat.completion")
      testthat::expect_equal(res$raw_response[[1]]$model, "gpt-4.1")
    }
  )
})
# ---------------------------------------------------------------------
testthat::test_that("openai_compare_pair_live parses responses endpoint
correctly", {
  # The mocked body carries two `output_text` fragments inside one output
  # item; the assertions expect them joined into a single content string.
  trait <- trait_description("overall_quality")
  template <- set_prompt_template()
  first_id <- "S01"
  second_id <- "S02"

  text_fragments <- list(
    list(type = "output_text", text = "SAMPLE_2 A "),
    list(type = "output_text", text = "B")
  )
  token_usage <- list(
    input_tokens = 7L,
    output_tokens = 3L,
    total_tokens = 10L
  )
  mock_body <- list(
    object = "response",
    model = "gpt-5.1",
    output = list(list(content = text_fragments)),
    usage = token_usage
  )

  testthat::with_mocked_bindings(
    .openai_api_key = function(...) "FAKEKEY",
    .openai_req_body_json = function(req, body) req,
    .openai_req_perform = function(req) structure(list(), class = "fake_resp"),
    .openai_resp_body_json = function(...) mock_body,
    .openai_resp_status = function(...) 200L,
    {
      result <- openai_compare_pair_live(
        ID1 = first_id,
        text1 = "Text A",
        ID2 = second_id,
        text2 = "Text B",
        model = "gpt-5.1",
        trait_name = trait$name,
        trait_description = trait$description,
        prompt_template = template,
        endpoint = "responses",
        reasoning = "none",
        include_raw = TRUE
      )

      testthat::expect_s3_class(result, "tbl_df")
      testthat::expect_equal(result$object_type, "response")
      testthat::expect_equal(result$model, "gpt-5.1")

      # Both fragments, in order, with no separator added.
      testthat::expect_equal(result$content, "SAMPLE_2 A B")
      testthat::expect_equal(result$better_sample, "SAMPLE_2")
      testthat::expect_equal(result$better_id, second_id)

      # input/output token counts surface as prompt/completion tokens.
      testthat::expect_equal(result$prompt_tokens, 7)
      testthat::expect_equal(result$completion_tokens, 3)
      testthat::expect_equal(result$total_tokens, 10)

      testthat::expect_true("raw_response" %in% names(result))
      testthat::expect_equal(result$raw_response[[1]]$model, "gpt-5.1")
    }
  )
})
# ---------------------------------------------------------------------
testthat::test_that("openai_compare_pair_live returns error row
on JSON parse failure", {
  # The mocked body parser throws, while the status helper reports 500. The
  # call is expected to return a row describing the failure, not to error.
  trait <- trait_description("overall_quality")
  template <- set_prompt_template()
  left_id <- "S01"
  right_id <- "S02"

  testthat::with_mocked_bindings(
    .openai_api_key = function(...) "FAKEKEY",
    .openai_req_body_json = function(req, body) req,
    .openai_req_perform = function(req) structure(list(), class = "fake_resp"),
    .openai_resp_body_json = function(...) stop("boom"),
    .openai_resp_status = function(...) 500L,
    {
      out <- openai_compare_pair_live(
        ID1 = left_id,
        text1 = "X",
        ID2 = right_id,
        text2 = "Y",
        model = "gpt-4.1",
        trait_name = trait$name,
        trait_description = trait$description,
        prompt_template = template,
        endpoint = "chat.completions",
        include_raw = TRUE
      )

      testthat::expect_equal(out$status_code, 500L)
      testthat::expect_equal(
        out$error_message,
        "Failed to parse response body as JSON."
      )
      # No winner and no raw payload when the body could not be parsed.
      testthat::expect_true(is.na(out$better_sample))
      testthat::expect_true(is.null(out$raw_response[[1]]))
    }
  )
})
# ---------------------------------------------------------------------
testthat::test_that("openai_compare_pair_live enforces gpt-5.1 + reasoning
constraints", {
  # Two cases for model "gpt-5.1" on the responses endpoint:
  #   * reasoning = "low" together with temperature = 0 must be rejected;
  #   * reasoning = "none" must be accepted and parsed normally.
  # Both run under the same mocks so the test never depends on a real API
  # key or network access.
  td <- trait_description("overall_quality")
  tmpl <- set_prompt_template()

  fake_body <- list(
    object = "response",
    model = "gpt-5.1",
    output = list(list(
      content = list(list(
        type = "output_text",
        text = "SAMPLE_1"
      ))
    )),
    usage = list(input_tokens = 1L, output_tokens = 1L, total_tokens = 2L)
  )

  testthat::with_mocked_bindings(
    .openai_api_key = function(...) "FAKEKEY",
    .openai_req_body_json = function(req, body) req,
    .openai_req_perform = function(req) structure(list(), class = "fake_resp"),
    .openai_resp_body_json = function(...) fake_body,
    .openai_resp_status = function(...) 200L,
    {
      # Should error. `fixed = TRUE` makes the "." in the model name match
      # literally instead of acting as a regex wildcard.
      testthat::expect_error(
        openai_compare_pair_live(
          ID1 = "A", text1 = "x", ID2 = "B", text2 = "y",
          model = "gpt-5.1",
          trait_name = td$name,
          trait_description = td$description,
          prompt_template = tmpl,
          endpoint = "responses",
          reasoning = "low",
          temperature = 0
        ),
        regexp = "gpt-5.1",
        fixed = TRUE
      )

      # Allowed case
      res <- openai_compare_pair_live(
        ID1 = "A", text1 = "x", ID2 = "B", text2 = "y",
        model = "gpt-5.1",
        trait_name = td$name,
        trait_description = td$description,
        prompt_template = tmpl,
        endpoint = "responses",
        reasoning = "none",
        include_raw = TRUE
      )
      testthat::expect_equal(res$better_id, "A")
    }
  )
})
# ---------------------------------------------------------------------
testthat::test_that("openai_compare_pair_live enforces other gpt-5*
constraints", {
  # For model "gpt-5-mini" on the responses endpoint, passing temperature = 0
  # must raise an error, while the same call without temperature succeeds.
  td <- trait_description("overall_quality")
  tmpl <- set_prompt_template()

  fake_body <- list(
    object = "response",
    model = "gpt-5-mini",
    output = list(list(
      content = list(list(
        type = "output_text",
        text = "SAMPLE_2"
      ))
    )),
    usage = list(input_tokens = 1L, output_tokens = 1L, total_tokens = 2L)
  )

  testthat::with_mocked_bindings(
    .openai_api_key = function(...) "FAKEKEY",
    .openai_req_body_json = function(req, body) req,
    .openai_req_perform = function(req) structure(list(), class = "fake_resp"),
    .openai_resp_body_json = function(...) fake_body,
    .openai_resp_status = function(...) 200L,
    {
      # This expectation has no message pattern, so it is run under the mocks:
      # outside them, an unrelated failure (missing API key, network error)
      # would satisfy it even if the argument validation were absent. With
      # the HTTP layer stubbed, only the function's own checks can raise here.
      testthat::expect_error(
        openai_compare_pair_live(
          ID1 = "A", text1 = "x", ID2 = "B", text2 = "y",
          model = "gpt-5-mini",
          trait_name = td$name,
          trait_description = td$description,
          prompt_template = tmpl,
          endpoint = "responses",
          temperature = 0
        )
      )

      # Same model without temperature: accepted and parsed.
      res <- openai_compare_pair_live(
        ID1 = "A", text1 = "x", ID2 = "B", text2 = "y",
        model = "gpt-5-mini",
        trait_name = td$name,
        trait_description = td$description,
        prompt_template = tmpl,
        endpoint = "responses",
        include_raw = TRUE
      )
      testthat::expect_equal(res$better_id, "B")
    }
  )
})
# ---------------------------------------------------------------------
testthat::test_that("submit_openai_pairs_live returns empty tibble for
zero rows", {
  # A zero-row input is submitted without any mocks. The result is expected
  # to have no rows, a `thoughts` column, and (include_raw left at its
  # default) no `raw_response` column.
  trait <- trait_description("overall_quality")
  template <- set_prompt_template()

  no_text <- character(0)
  zero_row_pairs <- tibble::tibble(
    ID1 = no_text,
    text1 = no_text,
    ID2 = no_text,
    text2 = no_text
  )

  out <- submit_openai_pairs_live(
    pairs = zero_row_pairs,
    model = "gpt-4.1",
    trait_name = trait$name,
    trait_description = trait$description,
    prompt_template = template,
    endpoint = "chat.completions"
  )

  out_cols <- names(out)
  testthat::expect_equal(nrow(out), 0L)
  testthat::expect_true("thoughts" %in% out_cols)
  testthat::expect_false("raw_response" %in% out_cols)
})
# ---------------------------------------------------------------------
testthat::test_that("submit_openai_pairs_live with include_raw=TRUE returns
raw_response column", {
  # Same zero-row submission as the default case, but with include_raw = TRUE:
  # the empty result must additionally carry a list-typed `raw_response`.
  trait <- trait_description("overall_quality")
  template <- set_prompt_template()

  no_text <- character(0)
  zero_row_pairs <- tibble::tibble(
    ID1 = no_text,
    text1 = no_text,
    ID2 = no_text,
    text2 = no_text
  )

  out <- submit_openai_pairs_live(
    pairs = zero_row_pairs,
    model = "gpt-4.1",
    trait_name = trait$name,
    trait_description = trait$description,
    prompt_template = template,
    endpoint = "chat.completions",
    include_raw = TRUE
  )

  out_cols <- names(out)
  testthat::expect_equal(nrow(out), 0L)
  testthat::expect_true("thoughts" %in% out_cols)
  testthat::expect_true("raw_response" %in% out_cols)
  testthat::expect_type(out$raw_response, "list")
})
# ---------------------------------------------------------------------
testthat::test_that("submit_openai_pairs_live calls
openai_compare_pair_live row-wise", {
  # Replaces openai_compare_pair_live() with a recorder that logs each
  # (ID1, ID2) it receives and returns a canned one-row tibble. Two input
  # rows should produce two recorded calls and two winners.
  trait <- trait_description("overall_quality")
  template <- set_prompt_template()

  input_pairs <- tibble::tibble(
    ID1 = c("S01", "S03"),
    text1 = c("Text 1", "Text 3"),
    ID2 = c("S02", "S04"),
    text2 = c("Text 2", "Text 4")
  )

  # Filled by the mock through `<<-`.
  recorded <- list()

  # Builds the one-row result the mock hands back for a given winner label.
  canned_row <- function(id_left, id_right, winner) {
    winner_id <- if (winner == "SAMPLE_1") id_left else id_right
    tibble::tibble(
      custom_id = sprintf("LIVE_%s_vs_%s", id_left, id_right),
      ID1 = id_left,
      ID2 = id_right,
      model = "gpt-4.1",
      object_type = "chat.completion",
      status_code = 200L,
      error_message = NA_character_,
      content = sprintf("%s", winner),
      better_sample = winner,
      better_id = winner_id,
      prompt_tokens = 10,
      completion_tokens = 5,
      total_tokens = 15
    )
  }

  testthat::with_mocked_bindings(
    openai_compare_pair_live = function(ID1, text1, ID2, text2, model, trait_name,
                                        trait_description, prompt_template, endpoint, api_key,
                                        include_raw, ...) {
      recorded <<- append(recorded, list(list(ID1 = ID1, ID2 = ID2)))
      # The pair starting with "S01" picks sample 1; any other picks sample 2.
      winner <- if (ID1 == "S01") "SAMPLE_1" else "SAMPLE_2"
      canned_row(ID1, ID2, winner)
    },
    {
      out <- submit_openai_pairs_live(
        pairs = input_pairs,
        model = "gpt-4.1",
        trait_name = trait$name,
        trait_description = trait$description,
        prompt_template = template,
        endpoint = "chat.completions",
        include_raw = FALSE,
        verbose = FALSE,
        progress = FALSE
      )

      testthat::expect_equal(length(recorded), 2L)
      testthat::expect_equal(out$better_id, c("S01", "S04"))
    }
  )
})
# ---------------------------------------------------------------------
testthat::test_that("openai_compare_pair_live collects thoughts and
message text separately for responses", {
  # The mocked response has a top-level reasoning summary, a reasoning output
  # item with an empty summary, and an assistant message item. The summary is
  # expected in `thoughts` and only the message text in `content`.
  trait <- trait_description("overall_quality")
  template <- set_prompt_template()
  first_id <- "S01"
  second_id <- "S02"

  reasoning_item <- list(
    id = "rs_x",
    type = "reasoning",
    summary = list()
  )
  message_item <- list(
    id = "msg_x",
    type = "message",
    status = "completed",
    content = list(
      list(type = "output_text", text = "SAMPLE_2 Final answer.")
    ),
    role = "assistant"
  )
  mock_body <- list(
    object = "response",
    model = "gpt-5.1",
    reasoning = list(
      effort = "low",
      summary = list(text = "Reasoning summary. ")
    ),
    output = list(reasoning_item, message_item),
    usage = list(
      input_tokens = 10L,
      output_tokens = 5L,
      total_tokens = 15L
    )
  )

  testthat::with_mocked_bindings(
    .openai_api_key = function(...) "FAKEKEY",
    .openai_req_body_json = function(req, body) req,
    .openai_req_perform = function(req) structure(list(), class = "fake_resp"),
    .openai_resp_body_json = function(...) mock_body,
    .openai_resp_status = function(...) 200L,
    {
      result <- openai_compare_pair_live(
        ID1 = first_id,
        text1 = "Text A",
        ID2 = second_id,
        text2 = "Text B",
        model = "gpt-5.1",
        trait_name = trait$name,
        trait_description = trait$description,
        prompt_template = template,
        endpoint = "responses",
        reasoning = "low",
        include_thoughts = TRUE,
        include_raw = TRUE
      )

      testthat::expect_s3_class(result, "tbl_df")
      testthat::expect_equal(result$object_type, "response")

      # The summary text lands in thoughts, trailing space included.
      testthat::expect_equal(result$thoughts, "Reasoning summary. ")

      # Content holds the assistant message only.
      testthat::expect_equal(result$content, "SAMPLE_2 Final answer.")
      testthat::expect_equal(result$better_sample, "SAMPLE_2")
      testthat::expect_equal(result$better_id, second_id)
    }
  )
})
# ---------------------------------------------------------------------
testthat::test_that("openai_compare_pair_live picks up reasoning summary
from output items", {
  # Here the top-level `reasoning` entry has no summary; the two
  # `summary_text` parts live on the reasoning output item instead. Both
  # sentences are expected to show up in `thoughts`.
  trait <- trait_description("overall_quality")
  template <- set_prompt_template()
  first_id <- "S01"
  second_id <- "S02"

  summary_parts <- list(
    list(type = "summary_text", text = "Reasoning sentence 1."),
    list(type = "summary_text", text = "Reasoning sentence 2.")
  )
  reasoning_item <- list(
    id = "rs_x",
    type = "reasoning",
    summary = summary_parts
  )
  message_item <- list(
    id = "msg_x",
    type = "message",
    status = "completed",
    content = list(
      list(type = "output_text", text = "SAMPLE_2 Final answer.")
    ),
    role = "assistant"
  )
  mock_body <- list(
    object = "response",
    model = "gpt-5.1",
    # Effort only: no top-level reasoning$summary in this payload.
    reasoning = list(effort = "low"),
    output = list(reasoning_item, message_item),
    usage = list(
      input_tokens = 5L,
      output_tokens = 5L,
      total_tokens = 10L
    )
  )

  testthat::with_mocked_bindings(
    .openai_api_key = function(...) "FAKEKEY",
    .openai_req_body_json = function(req, body) req,
    .openai_req_perform = function(req) structure(list(), class = "fake_resp"),
    .openai_resp_body_json = function(...) mock_body,
    .openai_resp_status = function(...) 200L,
    {
      result <- openai_compare_pair_live(
        ID1 = first_id,
        text1 = "Text A",
        ID2 = second_id,
        text2 = "Text B",
        model = "gpt-5.1",
        trait_name = trait$name,
        trait_description = trait$description,
        prompt_template = template,
        endpoint = "responses",
        reasoning = "low",
        include_thoughts = TRUE,
        include_raw = TRUE
      )

      testthat::expect_s3_class(result, "tbl_df")
      testthat::expect_equal(result$object_type, "response")

      # Each summary_text entry must appear somewhere in thoughts; how they
      # are joined is not pinned down by this test.
      testthat::expect_match(
        result$thoughts, "Reasoning sentence 1.",
        fixed = TRUE
      )
      testthat::expect_match(
        result$thoughts, "Reasoning sentence 2.",
        fixed = TRUE
      )

      # Content holds the assistant message only.
      testthat::expect_equal(result$content, "SAMPLE_2 Final answer.")
      testthat::expect_equal(result$better_sample, "SAMPLE_2")
      testthat::expect_equal(result$better_id, second_id)
    }
  )
})
testthat::test_that("openai_compare_pair_live validates input types", {
  # Argument checks: a numeric ID1 and a list-valued text2 must each be
  # rejected with a message naming the offending argument.
  trait <- trait_description("overall_quality")

  # Non-character ID1
  testthat::expect_error(
    openai_compare_pair_live(
      ID1 = 123,
      text1 = "t",
      ID2 = "B",
      text2 = "t",
      model = "gpt-4",
      trait_name = trait$name,
      trait_description = trait$description
    ),
    "`ID1` must be a single character"
  )

  # Non-character text2
  testthat::expect_error(
    openai_compare_pair_live(
      ID1 = "A",
      text1 = "t",
      ID2 = "B",
      text2 = list(),
      model = "gpt-4",
      trait_name = trait$name,
      trait_description = trait$description
    ),
    "`text2` must be a single character"
  )
})
testthat::test_that("openai_compare_pair_live handles HTTP errors gracefully", {
  # Mocks a 400 status whose body carries an `error$message`. The call should
  # return a row with that status and message, and NA content.
  trait <- trait_description("overall_quality")

  # Body of the simulated 400 Bad Request
  error_payload <- list(error = list(message = "Invalid parameter"))

  testthat::with_mocked_bindings(
    .openai_api_key = function(...) "KEY",
    .openai_req_body_json = function(req, ...) req,
    .openai_req_perform = function(...) "RESP",
    .openai_resp_status = function(...) 400L,
    .openai_resp_body_json = function(...) error_payload,
    {
      out <- openai_compare_pair_live(
        ID1 = "A",
        text1 = "t",
        ID2 = "B",
        text2 = "t",
        model = "gpt-4",
        trait_name = trait$name,
        trait_description = trait$description
      )

      testthat::expect_equal(out$status_code, 400L)
      testthat::expect_equal(out$error_message, "Invalid parameter")
      testthat::expect_true(is.na(out$content))
    }
  )
})
testthat::test_that("openai_compare_pair_live parses legacy reasoning summary location", {
  # Older payload layout: the summary sits at body$reasoning$summary$text and
  # the output holds only a message item.
  trait <- trait_description("overall_quality")

  message_item <- list(
    type = "message",
    content = list(list(type = "output_text", text = "Content"))
  )
  legacy_body <- list(
    object = "response",
    model = "gpt-5.1",
    reasoning = list(
      effort = "low",
      summary = list(text = "Legacy summary.")
    ),
    output = list(message_item)
  )

  testthat::with_mocked_bindings(
    .openai_api_key = function(...) "KEY",
    .openai_req_body_json = function(req, ...) req,
    .openai_req_perform = function(...) "RESP",
    .openai_resp_status = function(...) 200L,
    .openai_resp_body_json = function(...) legacy_body,
    {
      out <- openai_compare_pair_live(
        ID1 = "A",
        text1 = "t",
        ID2 = "B",
        text2 = "t",
        model = "gpt-5.1",
        trait_name = trait$name,
        trait_description = trait$description,
        endpoint = "responses"
      )

      testthat::expect_equal(out$thoughts, "Legacy summary.")
      testthat::expect_equal(out$content, "Content")
    }
  )
})
testthat::test_that("submit_openai_pairs_live validates inputs", {
  # Two rejection paths: a pairs table lacking ID2/text2, and a
  # non-positive `status_every`.
  trait <- trait_description("overall_quality")

  # Missing columns
  incomplete_pairs <- tibble::tibble(ID1 = "A", text1 = "t")
  testthat::expect_error(
    submit_openai_pairs_live(
      incomplete_pairs, "gpt-4", trait$name, trait$description
    ),
    "must contain columns"
  )

  # Invalid status_every
  complete_pairs <- tibble::tibble(
    ID1 = "A",
    text1 = "t",
    ID2 = "B",
    text2 = "t"
  )
  testthat::expect_error(
    submit_openai_pairs_live(
      complete_pairs, "gpt-4", trait$name, trait$description,
      status_every = 0
    ),
    "status_every` must be a single positive integer"
  )
})