# tests/testthat/test-gemini_live.R test_that("gemini_compare_pair_live parses a successful response without thoughts", { skip_if_not_installed("httr2") ns <- asNamespace("pairwiseLLM") fake_resp <- structure(list(), class = "httr2_response") fake_body <- list( model = "models/gemini-3-pro-preview", candidates = list( list( content = list( parts = list( list(text = "Some internal text that we treat as content. "), list(text = "SAMPLE_1") ) ) ) ), usageMetadata = list( promptTokenCount = 42L, candidatesTokenCount = 7L, totalTokenCount = 49L ) ) testthat::local_mocked_bindings( # ✅ Avoid hitting .get_api_key() / env .gemini_api_key = function(api_key = NULL) "TEST_GEMINI_KEY", .gemini_req_perform = function(req) fake_resp, .gemini_resp_status = function(resp) 200L, .gemini_resp_body_json = function(resp, ...) fake_body, .env = ns ) td <- trait_description("overall_quality") tmpl <- set_prompt_template() res <- gemini_compare_pair_live( ID1 = "S01", text1 = "Sample 1 text.", ID2 = "S02", text2 = "Sample 2 text.", model = "gemini-3-pro-preview", trait_name = td$name, trait_description = td$description, prompt_template = tmpl, thinking_level = "low", include_thoughts = FALSE, include_raw = TRUE ) expect_s3_class(res, "tbl_df") expect_equal(nrow(res), 1L) expect_equal(res$ID1, "S01") expect_equal(res$ID2, "S02") expect_equal(res$model, "models/gemini-3-pro-preview") expect_equal(res$object_type, "generateContent") expect_equal(res$status_code, 200L) expect_true(is.na(res$error_message) || identical(res$error_message, "")) # Without include_thoughts, everything is collapsed into content expect_true(is.na(res$thoughts)) expect_true(grepl("Some internal text", res$content, fixed = TRUE)) expect_true(grepl("SAMPLE_1", res$content, fixed = TRUE )) expect_equal(res$better_sample, "SAMPLE_1") expect_equal(res$better_id, "S01") expect_equal(res$prompt_tokens, 42) expect_equal(res$completion_tokens, 7) expect_equal(res$total_tokens, 49) expect_true("raw_response" %in% names(res)) expect_type(res$raw_response, "list") expect_identical(res$raw_response[[1]], fake_body) }) test_that("gemini_compare_pair_live parses thoughts and content when include_thoughts = TRUE", { skip_if_not_installed("httr2") ns <- asNamespace("pairwiseLLM") fake_resp <- structure(list(), class = "httr2_response") fake_body <- list( model = "models/gemini-3-pro-preview", candidates = list( list( content = list( parts = list( list(text = "These are my detailed thoughts..."), list(text = "SAMPLE_2") ) ) ) ), usageMetadata = list() ) testthat::local_mocked_bindings( .gemini_api_key = function(api_key = NULL) "TEST_GEMINI_KEY", .gemini_req_perform = function(req) fake_resp, .gemini_resp_status = function(resp) 200L, .gemini_resp_body_json = function(resp, ...) fake_body, .env = ns ) td <- trait_description("overall_quality") tmpl <- set_prompt_template() res <- gemini_compare_pair_live( ID1 = "S01", text1 = "Sample 1 text.", ID2 = "S02", text2 = "Sample 2 text.", model = "gemini-3-pro-preview", trait_name = td$name, trait_description = td$description, prompt_template = tmpl, thinking_level = "high", include_thoughts = TRUE, include_raw = FALSE ) expect_s3_class(res, "tbl_df") expect_equal(nrow(res), 1L) # thoughts should come from first part, content from subsequent parts expect_equal(res$thoughts, "These are my detailed thoughts...") expect_equal(res$content, "SAMPLE_2") expect_equal(res$better_sample, "SAMPLE_2") expect_equal(res$better_id, "S02") # usageMetadata missing -> NAs expect_true(is.na(res$prompt_tokens)) expect_true(is.na(res$completion_tokens)) expect_true(is.na(res$total_tokens)) }) test_that("gemini_compare_pair_live handles responses without tag", { skip_if_not_installed("httr2") ns <- asNamespace("pairwiseLLM") fake_resp <- structure(list(), class = "httr2_response") fake_body <- list( model = "models/gemini-3-pro-preview", candidates = list( list( content = list( parts = list( list(text = "I forgot to include the tag, sorry.") ) ) ) ) ) testthat::local_mocked_bindings( .gemini_api_key = function(api_key = NULL) "TEST_GEMINI_KEY", .gemini_req_perform = function(req) fake_resp, .gemini_resp_status = function(resp) 200L, .gemini_resp_body_json = function(resp, ...) fake_body, .env = ns ) td <- trait_description("overall_quality") tmpl <- set_prompt_template() res <- gemini_compare_pair_live( ID1 = "S01", text1 = "Sample 1 text.", ID2 = "S02", text2 = "Sample 2 text.", model = "gemini-3-pro-preview", trait_name = td$name, trait_description = td$description, prompt_template = tmpl, thinking_level = "low", include_thoughts = FALSE ) expect_true(is.na(res$better_sample)) expect_true(is.na(res$better_id)) }) test_that("gemini_compare_pair_live returns an error row when request fails", { skip_if_not_installed("httr2") ns <- asNamespace("pairwiseLLM") # Simulate a generic error thrown by .gemini_req_perform testthat::local_mocked_bindings( .gemini_api_key = function(api_key = NULL) "TEST_GEMINI_KEY", .gemini_req_perform = function(req) stop("HTTP 500 Internal Server Error"), .gemini_resp_status = function(resp) 500L, .gemini_resp_body_json = function(resp, ...) NULL, .env = ns ) td <- trait_description("overall_quality") tmpl <- set_prompt_template() res <- gemini_compare_pair_live( ID1 = "S01", text1 = "Sample 1 text.", ID2 = "S02", text2 = "Sample 2 text.", model = "gemini-3-pro-preview", trait_name = td$name, trait_description = td$description, prompt_template = tmpl ) expect_equal(nrow(res), 1L) expect_equal(res$ID1, "S01") expect_equal(res$ID2, "S02") expect_true(is.na(res$object_type)) expect_true(is.na(res$content)) expect_true(is.na(res$thoughts)) expect_true(is.na(res$better_sample)) expect_true(is.na(res$better_id)) expect_match(res$error_message, "HTTP 500", fixed = FALSE) }) test_that("gemini_compare_pair_live validates model and maps thinking_level medium to High", { skip_if_not_installed("httr2") ns <- asNamespace("pairwiseLLM") fake_resp <- structure(list(), class = "httr2_response") # Just enough structure to not break parsing fake_body <- list( model = "models/gemini-3-pro-preview", candidates = list( list( content = list( parts = list( list(text = "SAMPLE_2") ) ) ) ) ) testthat::local_mocked_bindings( .gemini_api_key = function(api_key = NULL) "TEST_GEMINI_KEY", .gemini_req_perform = function(req) fake_resp, .gemini_resp_status = function(resp) 200L, .gemini_resp_body_json = function(resp, ...) fake_body, .env = ns ) td <- trait_description("overall_quality") tmpl <- set_prompt_template() # Invalid model expect_error( gemini_compare_pair_live( ID1 = "S01", text1 = "Sample 1 text.", ID2 = "S02", text2 = "Sample 2 text.", model = "", trait_name = td$name, trait_description = td$description, prompt_template = tmpl ), "`model` must be a non-empty character scalar" ) # thinking_level = "medium" should warn but still work expect_warning( { res <- gemini_compare_pair_live( ID1 = "S01", text1 = "Sample 1 text.", ID2 = "S02", text2 = "Sample 2 text.", model = "gemini-3-pro-preview", trait_name = td$name, trait_description = td$description, prompt_template = tmpl, thinking_level = "medium" ) expect_equal(res$better_sample, "SAMPLE_2") }, "mapping to \"High\" internally", fixed = FALSE ) # thinking_budget in ... should be ignored (no error, but warning) expect_warning( { res <- gemini_compare_pair_live( ID1 = "S01", text1 = "Sample 1 text.", ID2 = "S02", text2 = "Sample 2 text.", model = "gemini-3-pro-preview", trait_name = td$name, trait_description = td$description, prompt_template = tmpl, thinking_level = "low", thinking_budget = 2048 # should be ignored ) expect_equal(res$better_sample, "SAMPLE_2") }, "`thinking_budget` is ignored for Gemini 3", fixed = FALSE ) }) test_that("submit_gemini_pairs_live validates inputs and handles zero-row pairs", { td <- trait_description("overall_quality") tmpl <- set_prompt_template() # Missing columns bad_pairs <- tibble::tibble( ID1 = "S01", text1 = "Sample 1" # no ID2/text2 ) expect_error( submit_gemini_pairs_live( pairs = bad_pairs, model = "gemini-3-pro-preview", trait_name = td$name, trait_description = td$description, prompt_template = tmpl ), "`pairs` must contain columns" ) # Zero rows: should return empty tibble with expected columns empty_pairs <- tibble::tibble( ID1 = character(0), text1 = character(0), ID2 = character(0), text2 = character(0) ) res_empty <- submit_gemini_pairs_live( pairs = empty_pairs, model = "gemini-3-pro-preview", trait_name = td$name, trait_description = td$description, prompt_template = tmpl, include_thoughts = TRUE ) expect_s3_class(res_empty, "tbl_df") expect_equal(nrow(res_empty), 0L) expect_setequal( names(res_empty), c( "custom_id", "ID1", "ID2", "model", "object_type", "status_code", "error_message", "thoughts", "content", "better_sample", "better_id", "prompt_tokens", "completion_tokens", "total_tokens" ) ) }) test_that("submit_gemini_pairs_live calls gemini_compare_pair_live for each row and passes include_thoughts", { ns <- asNamespace("pairwiseLLM") pairs <- tibble::tibble( ID1 = c("S01", "S02"), text1 = c("Text 1a", "Text 2a"), ID2 = c("S03", "S04"), text2 = c("Text 1b", "Text 2b") ) td <- trait_description("overall_quality") tmpl <- set_prompt_template() calls <- list() fake_gemini_compare <- function( ID1, text1, ID2, text2, model, trait_name, trait_description, prompt_template, api_key, thinking_level, temperature, top_p, top_k, max_output_tokens, api_version, include_raw, include_thoughts, ... ) { calls <<- append(calls, list( list( ID1 = ID1, ID2 = ID2, model = model, thinking_level = thinking_level, include_thoughts = include_thoughts ) )) tibble::tibble( custom_id = sprintf("LIVE_%s_vs_%s", ID1, ID2), ID1 = ID1, ID2 = ID2, model = model, object_type = "generateContent", status_code = 200L, error_message = NA_character_, thoughts = if (isTRUE(include_thoughts)) { "fake thoughts" } else { NA_character_ }, content = "SAMPLE_1", better_sample = "SAMPLE_1", better_id = ID1, prompt_tokens = 10, completion_tokens = 2, total_tokens = 12 ) } testthat::local_mocked_bindings( gemini_compare_pair_live = fake_gemini_compare, .env = ns ) res <- submit_gemini_pairs_live( pairs = pairs, model = "gemini-3-pro-preview", trait_name = td$name, trait_description = td$description, prompt_template = tmpl, thinking_level = "low", include_thoughts = TRUE, include_raw = FALSE, verbose = FALSE, progress = FALSE ) expect_equal(nrow(res), 2L) expect_equal(length(calls), 2L) expect_equal(calls[[1]]$ID1, "S01") expect_equal(calls[[1]]$ID2, "S03") expect_equal(calls[[1]]$model, "gemini-3-pro-preview") expect_equal(calls[[1]]$thinking_level, "low") expect_true(calls[[1]]$include_thoughts) expect_equal(calls[[2]]$ID1, "S02") expect_equal(calls[[2]]$ID2, "S04") expect_true(all(res$better_sample == "SAMPLE_1")) expect_true(all(res$thoughts == "fake thoughts")) })