# =====================================================================
# test-openai_live.R
# Tests for openai_compare_pair_live() and submit_openai_pairs_live()
#
# The low-level HTTP helpers (.openai_api_key, .openai_req_body_json,
# .openai_req_perform, .openai_resp_body_json, .openai_resp_status)
# are swapped out with testthat::with_mocked_bindings(), so no live
# API calls are made.
# =====================================================================

testthat::test_that("openai_compare_pair_live parses chat.completions correctly", {
  data("example_writing_samples", package = "pairwiseLLM")
  td <- trait_description("overall_quality")
  tmpl <- set_prompt_template()

  ID1 <- "S01"
  ID2 <- "S02"
  text1 <- "Text 1"
  text2 <- "Text 2"

  fake_body <- list(
    object = "chat.completion",
    model = "gpt-4.1",
    choices = list(list(
      message = list(
        role = "assistant",
        content = "SAMPLE_1 Some explanation."
      )
    )),
    usage = list(
      prompt_tokens = 10L,
      completion_tokens = 5L,
      total_tokens = 15L
    )
  )

  testthat::with_mocked_bindings(
    .openai_api_key = function(...) "FAKEKEY",
    .openai_req_body_json = function(req, body) req,
    .openai_req_perform = function(req) structure(list(), class = "fake_resp"),
    .openai_resp_body_json = function(...) fake_body,
    .openai_resp_status = function(...) 200L,
    {
      res <- openai_compare_pair_live(
        ID1 = ID1, text1 = text1,
        ID2 = ID2, text2 = text2,
        model = "gpt-4.1",
        trait_name = td$name,
        trait_description = td$description,
        prompt_template = tmpl,
        endpoint = "chat.completions",
        temperature = 0,
        include_raw = TRUE
      )

      testthat::expect_s3_class(res, "tbl_df")
      testthat::expect_equal(nrow(res), 1L)
      testthat::expect_equal(res$custom_id, sprintf("LIVE_%s_vs_%s", ID1, ID2))
      testthat::expect_equal(res$ID1, ID1)
      testthat::expect_equal(res$ID2, ID2)
      testthat::expect_equal(res$model, "gpt-4.1")
      testthat::expect_equal(res$object_type, "chat.completion")
      testthat::expect_equal(res$status_code, 200L)
      testthat::expect_true(is.na(res$error_message))
      testthat::expect_equal(res$content, "SAMPLE_1 Some explanation.")
      testthat::expect_equal(res$better_sample, "SAMPLE_1")
      testthat::expect_equal(res$better_id, ID1)
      testthat::expect_equal(res$prompt_tokens, 10)
      testthat::expect_equal(res$completion_tokens, 5)
      testthat::expect_equal(res$total_tokens, 15)

      # raw_response
      testthat::expect_true("raw_response" %in% names(res))
      testthat::expect_type(res$raw_response, "list")
      testthat::expect_equal(res$raw_response[[1]]$object, "chat.completion")
      testthat::expect_equal(res$raw_response[[1]]$model, "gpt-4.1")
    }
  )
})

# ---------------------------------------------------------------------
testthat::test_that("openai_compare_pair_live parses responses endpoint correctly", {
  td <- trait_description("overall_quality")
  tmpl <- set_prompt_template()

  ID1 <- "S01"
  ID2 <- "S02"
  text1 <- "Text A"
  text2 <- "Text B"

  fake_body <- list(
    object = "response",
    model = "gpt-5.1",
    output = list(list(
      content = list(
        list(type = "output_text", text = "SAMPLE_2 A "),
        list(type = "output_text", text = "B")
      )
    )),
    usage = list(
      input_tokens = 7L,
      output_tokens = 3L,
      total_tokens = 10L
    )
  )

  testthat::with_mocked_bindings(
    .openai_api_key = function(...) "FAKEKEY",
    .openai_req_body_json = function(req, body) req,
    .openai_req_perform = function(req) structure(list(), class = "fake_resp"),
    .openai_resp_body_json = function(...) fake_body,
    .openai_resp_status = function(...) 200L,
    {
      res <- openai_compare_pair_live(
        ID1 = ID1, text1 = text1,
        ID2 = ID2, text2 = text2,
        model = "gpt-5.1",
        trait_name = td$name,
        trait_description = td$description,
        prompt_template = tmpl,
        endpoint = "responses",
        reasoning = "none",
        include_raw = TRUE
      )

      testthat::expect_s3_class(res, "tbl_df")
      testthat::expect_equal(res$object_type, "response")
      testthat::expect_equal(res$model, "gpt-5.1")
      testthat::expect_equal(res$content, "SAMPLE_2 A B")
      testthat::expect_equal(res$better_sample, "SAMPLE_2")
      testthat::expect_equal(res$better_id, ID2)
      testthat::expect_equal(res$prompt_tokens, 7)
      testthat::expect_equal(res$completion_tokens, 3)
      testthat::expect_equal(res$total_tokens, 10)
      testthat::expect_true("raw_response" %in% names(res))
      testthat::expect_equal(res$raw_response[[1]]$model, "gpt-5.1")
    }
  )
})

# ---------------------------------------------------------------------
testthat::test_that("openai_compare_pair_live returns error row on JSON parse failure", {
  td <- trait_description("overall_quality")
  tmpl <- set_prompt_template()
  ID1 <- "S01"
  ID2 <- "S02"

  testthat::with_mocked_bindings(
    .openai_api_key = function(...) "FAKEKEY",
    .openai_req_body_json = function(req, body) req,
    .openai_req_perform = function(req) structure(list(), class = "fake_resp"),
    .openai_resp_body_json = function(...) stop("boom"),
    .openai_resp_status = function(...) 500L,
    {
      res <- openai_compare_pair_live(
        ID1 = ID1, text1 = "X",
        ID2 = ID2, text2 = "Y",
        model = "gpt-4.1",
        trait_name = td$name,
        trait_description = td$description,
        prompt_template = tmpl,
        endpoint = "chat.completions",
        include_raw = TRUE
      )

      testthat::expect_equal(res$status_code, 500L)
      testthat::expect_equal(
        res$error_message,
        "Failed to parse response body as JSON."
      )
      testthat::expect_true(is.na(res$better_sample))
      testthat::expect_true(is.null(res$raw_response[[1]]))
    }
  )
})

# ---------------------------------------------------------------------
testthat::test_that("openai_compare_pair_live enforces gpt-5.1 + reasoning constraints", {
  td <- trait_description("overall_quality")
  tmpl <- set_prompt_template()

  # Should error
  testthat::expect_error(
    openai_compare_pair_live(
      ID1 = "A", text1 = "x",
      ID2 = "B", text2 = "y",
      model = "gpt-5.1",
      trait_name = td$name,
      trait_description = td$description,
      prompt_template = tmpl,
      endpoint = "responses",
      reasoning = "low",
      temperature = 0
    ),
    regexp = "gpt-5.1"
  )

  # Allowed case
  fake_body <- list(
    object = "response",
    model = "gpt-5.1",
    output = list(list(
      content = list(list(type = "output_text", text = "SAMPLE_1"))
    )),
    usage = list(input_tokens = 1L, output_tokens = 1L, total_tokens = 2L)
  )

  testthat::with_mocked_bindings(
    .openai_api_key = function(...) "FAKEKEY",
    .openai_req_body_json = function(req, body) req,
    .openai_req_perform = function(req) structure(list(), class = "fake_resp"),
    .openai_resp_body_json = function(...) fake_body,
    .openai_resp_status = function(...) 200L,
    {
      res <- openai_compare_pair_live(
        ID1 = "A", text1 = "x",
        ID2 = "B", text2 = "y",
        model = "gpt-5.1",
        trait_name = td$name,
        trait_description = td$description,
        prompt_template = tmpl,
        endpoint = "responses",
        reasoning = "none",
        include_raw = TRUE
      )
      testthat::expect_equal(res$better_id, "A")
    }
  )
})

# ---------------------------------------------------------------------
testthat::test_that("openai_compare_pair_live enforces other gpt-5* constraints", {
  td <- trait_description("overall_quality")
  tmpl <- set_prompt_template()

  testthat::expect_error(
    openai_compare_pair_live(
      ID1 = "A", text1 = "x",
      ID2 = "B", text2 = "y",
      model = "gpt-5-mini",
      trait_name = td$name,
      trait_description = td$description,
      prompt_template = tmpl,
      endpoint = "responses",
      temperature = 0
    )
  )

  fake_body <- list(
    object = "response",
    model = "gpt-5-mini",
    output = list(list(
      content = list(list(type = "output_text", text = "SAMPLE_2"))
    )),
    usage = list(input_tokens = 1L, output_tokens = 1L, total_tokens = 2L)
  )

  testthat::with_mocked_bindings(
    .openai_api_key = function(...) "FAKEKEY",
    .openai_req_body_json = function(req, body) req,
    .openai_req_perform = function(req) structure(list(), class = "fake_resp"),
    .openai_resp_body_json = function(...) fake_body,
    .openai_resp_status = function(...) 200L,
    {
      res <- openai_compare_pair_live(
        ID1 = "A", text1 = "x",
        ID2 = "B", text2 = "y",
        model = "gpt-5-mini",
        trait_name = td$name,
        trait_description = td$description,
        prompt_template = tmpl,
        endpoint = "responses",
        include_raw = TRUE
      )
      testthat::expect_equal(res$better_id, "B")
    }
  )
})

# ---------------------------------------------------------------------
testthat::test_that("submit_openai_pairs_live returns empty tibble for zero rows", {
  td <- trait_description("overall_quality")
  tmpl <- set_prompt_template()

  empty_pairs <- tibble::tibble(
    ID1 = character(0),
    text1 = character(0),
    ID2 = character(0),
    text2 = character(0)
  )

  res <- submit_openai_pairs_live(
    pairs = empty_pairs,
    model = "gpt-4.1",
    trait_name = td$name,
    trait_description = td$description,
    prompt_template = tmpl,
    endpoint = "chat.completions"
  )

  testthat::expect_equal(nrow(res), 0L)
  testthat::expect_true("thoughts" %in% names(res))
  testthat::expect_false("raw_response" %in% names(res))
})

# ---------------------------------------------------------------------
testthat::test_that("submit_openai_pairs_live with include_raw=TRUE returns raw_response column", {
  td <- trait_description("overall_quality")
  tmpl <- set_prompt_template()

  empty_pairs <- tibble::tibble(
    ID1 = character(0),
    text1 = character(0),
    ID2 = character(0),
    text2 = character(0)
  )

  res <- submit_openai_pairs_live(
    pairs = empty_pairs,
    model = "gpt-4.1",
    trait_name = td$name,
    trait_description = td$description,
    prompt_template = tmpl,
    endpoint = "chat.completions",
    include_raw = TRUE
  )

  testthat::expect_equal(nrow(res), 0L)
  testthat::expect_true("thoughts" %in% names(res))
  testthat::expect_true("raw_response" %in% names(res))
  testthat::expect_type(res$raw_response, "list")
})

# ---------------------------------------------------------------------
testthat::test_that("submit_openai_pairs_live calls openai_compare_pair_live row-wise", {
  pairs <- tibble::tibble(
    ID1 = c("S01", "S03"),
    text1 = c("Text 1", "Text 3"),
    ID2 = c("S02", "S04"),
    text2 = c("Text 2", "Text 4")
  )
  td <- trait_description("overall_quality")
  tmpl <- set_prompt_template()

  calls <- list()
  fake_result <- function(ID1, ID2, chosen) {
    tibble::tibble(
      custom_id = sprintf("LIVE_%s_vs_%s", ID1, ID2),
      ID1 = ID1,
      ID2 = ID2,
      model = "gpt-4.1",
      object_type = "chat.completion",
      status_code = 200L,
      error_message = NA_character_,
      content = sprintf("%s", chosen),
      better_sample = chosen,
      better_id = if (chosen == "SAMPLE_1") ID1 else ID2,
      prompt_tokens = 10,
      completion_tokens = 5,
      total_tokens = 15
    )
  }

  testthat::with_mocked_bindings(
    openai_compare_pair_live = function(ID1, text1, ID2, text2, model,
                                        trait_name, trait_description,
                                        prompt_template, endpoint, api_key,
                                        include_raw, ...) {
      calls <<- append(calls, list(list(ID1 = ID1, ID2 = ID2)))
      if (ID1 == "S01") {
        fake_result(ID1, ID2, "SAMPLE_1")
      } else {
        fake_result(ID1, ID2, "SAMPLE_2")
      }
    },
    {
      res <- submit_openai_pairs_live(
        pairs = pairs,
        model = "gpt-4.1",
        trait_name = td$name,
        trait_description = td$description,
        prompt_template = tmpl,
        endpoint = "chat.completions",
        include_raw = FALSE,
        verbose = FALSE,
        progress = FALSE
      )
      testthat::expect_equal(length(calls), 2L)
      testthat::expect_equal(res$better_id, c("S01", "S04"))
    }
  )
})

# ---------------------------------------------------------------------
testthat::test_that("openai_compare_pair_live collects thoughts and message text separately for responses", {
  td <- trait_description("overall_quality")
  tmpl <- set_prompt_template()

  ID1 <- "S01"
  ID2 <- "S02"
  text1 <- "Text A"
  text2 <- "Text B"

  fake_body <- list(
    object = "response",
    model = "gpt-5.1",
    reasoning = list(
      effort = "low",
      summary = list(text = "Reasoning summary. ")
    ),
    output = list(
      list(
        id = "rs_x",
        type = "reasoning",
        summary = list()
      ),
      list(
        id = "msg_x",
        type = "message",
        status = "completed",
        content = list(
          list(type = "output_text", text = "SAMPLE_2 Final answer.")
        ),
        role = "assistant"
      )
    ),
    usage = list(
      input_tokens = 10L,
      output_tokens = 5L,
      total_tokens = 15L
    )
  )

  testthat::with_mocked_bindings(
    .openai_api_key = function(...) "FAKEKEY",
    .openai_req_body_json = function(req, body) req,
    .openai_req_perform = function(req) structure(list(), class = "fake_resp"),
    .openai_resp_body_json = function(...) fake_body,
    .openai_resp_status = function(...) 200L,
    {
      res <- openai_compare_pair_live(
        ID1 = ID1, text1 = text1,
        ID2 = ID2, text2 = text2,
        model = "gpt-5.1",
        trait_name = td$name,
        trait_description = td$description,
        prompt_template = tmpl,
        endpoint = "responses",
        reasoning = "low",
        include_thoughts = TRUE,
        include_raw = TRUE
      )

      testthat::expect_s3_class(res, "tbl_df")
      testthat::expect_equal(res$object_type, "response")
      # Reasoning summary should go to thoughts
      testthat::expect_equal(res$thoughts, "Reasoning summary. ")
      # Content should be assistant message only
      testthat::expect_equal(res$content, "SAMPLE_2 Final answer.")
      testthat::expect_equal(res$better_sample, "SAMPLE_2")
      testthat::expect_equal(res$better_id, ID2)
    }
  )
})

# ---------------------------------------------------------------------
testthat::test_that("openai_compare_pair_live picks up reasoning summary from output items", {
  td <- trait_description("overall_quality")
  tmpl <- set_prompt_template()

  ID1 <- "S01"
  ID2 <- "S02"
  text1 <- "Text A"
  text2 <- "Text B"

  fake_body <- list(
    object = "response",
    model = "gpt-5.1",
    # No top-level reasoning$summary here
    reasoning = list(
      effort = "low"
    ),
    output = list(
      list(
        id = "rs_x",
        type = "reasoning",
        summary = list(
          list(type = "summary_text", text = "Reasoning sentence 1."),
          list(type = "summary_text", text = "Reasoning sentence 2.")
        )
      ),
      list(
        id = "msg_x",
        type = "message",
        status = "completed",
        content = list(
          list(type = "output_text", text = "SAMPLE_2 Final answer.")
        ),
        role = "assistant"
      )
    ),
    usage = list(
      input_tokens = 5L,
      output_tokens = 5L,
      total_tokens = 10L
    )
  )

  testthat::with_mocked_bindings(
    .openai_api_key = function(...) "FAKEKEY",
    .openai_req_body_json = function(req, body) req,
    .openai_req_perform = function(req) structure(list(), class = "fake_resp"),
    .openai_resp_body_json = function(...) fake_body,
    .openai_resp_status = function(...) 200L,
    {
      res <- openai_compare_pair_live(
        ID1 = ID1, text1 = text1,
        ID2 = ID2, text2 = text2,
        model = "gpt-5.1",
        trait_name = td$name,
        trait_description = td$description,
        prompt_template = tmpl,
        endpoint = "responses",
        reasoning = "low",
        include_thoughts = TRUE,
        include_raw = TRUE
      )

      testthat::expect_s3_class(res, "tbl_df")
      testthat::expect_equal(res$object_type, "response")
      # Thoughts should contain both summary_text entries
      testthat::expect_match(res$thoughts, "Reasoning sentence 1.", fixed = TRUE)
      testthat::expect_match(res$thoughts, "Reasoning sentence 2.", fixed = TRUE)
      # Content should be assistant message only
      testthat::expect_equal(res$content, "SAMPLE_2 Final answer.")
      testthat::expect_equal(res$better_sample, "SAMPLE_2")
      testthat::expect_equal(res$better_id, ID2)
    }
  )
})

# ---------------------------------------------------------------------
testthat::test_that("openai_compare_pair_live validates input types", {
  td <- trait_description("overall_quality")

  testthat::expect_error(
    openai_compare_pair_live(
      ID1 = 123, text1 = "t",
      ID2 = "B", text2 = "t",
      model = "gpt-4",
      trait_name = td$name,
      trait_description = td$description
    ),
    "`ID1` must be a single character"
  )
  testthat::expect_error(
    openai_compare_pair_live(
      ID1 = "A", text1 = "t",
      ID2 = "B", text2 = list(),
      model = "gpt-4",
      trait_name = td$name,
      trait_description = td$description
    ),
    "`text2` must be a single character"
  )
})

# ---------------------------------------------------------------------
testthat::test_that("openai_compare_pair_live handles HTTP errors gracefully", {
  td <- trait_description("overall_quality")

  # Simulate 400 Bad Request
  fake_error_body <- list(
    error = list(message = "Invalid parameter")
  )

  testthat::with_mocked_bindings(
    .openai_api_key = function(...) "KEY",
    .openai_req_body_json = function(req, ...) req,
    .openai_req_perform = function(...) "RESP",
    .openai_resp_status = function(...) 400L,
    .openai_resp_body_json = function(...) fake_error_body,
    {
      res <- openai_compare_pair_live(
        ID1 = "A", text1 = "t",
        ID2 = "B", text2 = "t",
        model = "gpt-4",
        trait_name = td$name,
        trait_description = td$description
      )
      testthat::expect_equal(res$status_code, 400L)
      testthat::expect_equal(res$error_message, "Invalid parameter")
      testthat::expect_true(is.na(res$content))
    }
  )
})

# ---------------------------------------------------------------------
testthat::test_that("openai_compare_pair_live parses legacy reasoning summary location", {
  td <- trait_description("overall_quality")

  # Old structure where summary was at body$reasoning$summary$text
  fake_body <- list(
    object = "response",
    model = "gpt-5.1",
    reasoning = list(
      effort = "low",
      summary = list(text = "Legacy summary.")
    ),
    output = list(
      list(
        type = "message",
        content = list(list(type = "output_text", text = "Content"))
      )
    )
  )

  testthat::with_mocked_bindings(
    .openai_api_key = function(...) "KEY",
    .openai_req_body_json = function(req, ...) req,
    .openai_req_perform = function(...) "RESP",
    .openai_resp_status = function(...) 200L,
    .openai_resp_body_json = function(...) fake_body,
    {
      res <- openai_compare_pair_live(
        ID1 = "A", text1 = "t",
        ID2 = "B", text2 = "t",
        model = "gpt-5.1",
        trait_name = td$name,
        trait_description = td$description,
        endpoint = "responses"
      )
      testthat::expect_equal(res$thoughts, "Legacy summary.")
      testthat::expect_equal(res$content, "Content")
    }
  )
})

# ---------------------------------------------------------------------
testthat::test_that("submit_openai_pairs_live validates inputs", {
  td <- trait_description("overall_quality")

  # Missing columns
  bad_pairs <- tibble::tibble(ID1 = "A", text1 = "t")
  testthat::expect_error(
    submit_openai_pairs_live(bad_pairs, "gpt-4", td$name, td$description),
    "must contain columns"
  )

  # Invalid status_every
  good_pairs <- tibble::tibble(ID1 = "A", text1 = "t", ID2 = "B", text2 = "t")
  testthat::expect_error(
    submit_openai_pairs_live(
      good_pairs, "gpt-4", td$name, td$description,
      status_every = 0
    ),
    "status_every` must be a single positive integer"
  )
})