# ===================================================================== # test-anthropic_live.R # Tests for anthropic_compare_pair_live() and submit_anthropic_pairs_live() # ===================================================================== testthat::test_that("anthropic_compare_pair_live parses /v1/messages correctly", { td <- trait_description("overall_quality") tmpl <- set_prompt_template() ID1 <- "S01" ID2 <- "S02" text1 <- "Text 1" text2 <- "Text 2" fake_body <- list( model = "claude-sonnet-4-5-20250929", id = "msg_01XYZ", type = "message", role = "assistant", content = list( list( type = "text", text = "SAMPLE_1 Explanation." ) ), usage = list( input_tokens = 14L, output_tokens = 4L, total_tokens = 18L ) ) testthat::with_mocked_bindings( .anthropic_api_key = function(...) "FAKEKEY", .anthropic_req_body_json = function(req, body) req, .anthropic_req_perform = function(req) { structure(list(), class = "fake_resp" ) }, .anthropic_resp_body_json = function(...) fake_body, .anthropic_resp_status = function(...) 200L, { res <- anthropic_compare_pair_live( ID1 = ID1, text1 = text1, ID2 = ID2, text2 = text2, model = "claude-sonnet-4-5", trait_name = td$name, trait_description = td$description, prompt_template = tmpl, temperature = 0, max_tokens = 64, reasoning = "none", include_raw = TRUE ) testthat::expect_s3_class(res, "tbl_df") testthat::expect_equal(nrow(res), 1L) testthat::expect_equal(res$custom_id, sprintf("LIVE_%s_vs_%s", ID1, ID2)) testthat::expect_equal(res$ID1, ID1) testthat::expect_equal(res$ID2, ID2) testthat::expect_equal(res$model, "claude-sonnet-4-5-20250929") testthat::expect_equal(res$object_type, "message") testthat::expect_equal(res$status_code, 200L) testthat::expect_true(is.na(res$error_message)) testthat::expect_equal( res$content, "SAMPLE_1 Explanation." ) testthat::expect_equal(res$better_sample, "SAMPLE_1") testthat::expect_equal(res$better_id, ID1) testthat::expect_equal(res$prompt_tokens, 14) testthat::expect_equal(res$completion_tokens, 4) testthat::expect_equal(res$total_tokens, 18) testthat::expect_true("raw_response" %in% names(res)) testthat::expect_type(res$raw_response, "list") testthat::expect_equal( res$raw_response[[1]]$model, "claude-sonnet-4-5-20250929" ) } ) }) # --------------------------------------------------------------------- testthat::test_that("anthropic_compare_pair_live returns error row on JSON parse failure", { td <- trait_description("overall_quality") tmpl <- set_prompt_template() ID1 <- "S01" ID2 <- "S02" testthat::with_mocked_bindings( .anthropic_api_key = function(...) "FAKEKEY", .anthropic_req_body_json = function(req, body) req, .anthropic_req_perform = function(req) { structure(list(), class = "fake_resp" ) }, .anthropic_resp_body_json = function(...) stop("boom"), .anthropic_resp_status = function(...) 500L, { res <- anthropic_compare_pair_live( ID1 = ID1, text1 = "X", ID2 = ID2, text2 = "Y", model = "claude-sonnet-4-5", trait_name = td$name, trait_description = td$description, prompt_template = tmpl, include_raw = TRUE ) testthat::expect_equal(res$status_code, 500L) testthat::expect_equal( res$error_message, "Failed to parse response body as JSON." ) testthat::expect_true(is.na(res$better_sample)) testthat::expect_true(is.null(res$raw_response[[1]])) } ) }) # --------------------------------------------------------------------- testthat::test_that("anthropic_compare_pair_live applies recommended defaults", { td <- trait_description("overall_quality") tmpl <- set_prompt_template() ID1 <- "S01" ID2 <- "S02" bodies <- list() fake_body <- list( model = "claude-sonnet-4-5-20250929", id = "msg_01XYZ", type = "message", role = "assistant", content = list( list(type = "text", text = "SAMPLE_2") ), usage = list( input_tokens = 10L, output_tokens = 4L, total_tokens = 14L ) ) testthat::with_mocked_bindings( .anthropic_api_key = function(...) "FAKEKEY", .anthropic_req_body_json = function(req, body) { bodies <<- append(bodies, list(body)) req }, .anthropic_req_perform = function(req) { structure(list(), class = "fake_resp" ) }, .anthropic_resp_body_json = function(...) fake_body, .anthropic_resp_status = function(...) 200L, { res <- anthropic_compare_pair_live( ID1 = ID1, text1 = "A", ID2 = ID2, text2 = "B", model = "claude-sonnet-4-5", trait_name = td$name, trait_description = td$description, prompt_template = tmpl, include_raw = FALSE # note: no temperature or max_tokens supplied ) testthat::expect_equal(res$better_id, ID2) # Check that body has our recommended defaults testthat::expect_equal(length(bodies), 1L) b <- bodies[[1]] testthat::expect_equal(b$model, "claude-sonnet-4-5") testthat::expect_equal(b$max_tokens, 768) testthat::expect_equal(b$temperature, 0) } ) }) # --------------------------------------------------------------------- testthat::test_that("submit_anthropic_pairs_live returns empty tibble for zero rows", { td <- trait_description("overall_quality") tmpl <- set_prompt_template() empty_pairs <- tibble::tibble( ID1 = character(0), text1 = character(0), ID2 = character(0), text2 = character(0) ) res <- submit_anthropic_pairs_live( pairs = empty_pairs, model = "claude-sonnet-4-5", trait_name = td$name, trait_description = td$description, prompt_template = tmpl ) testthat::expect_equal(nrow(res), 0L) testthat::expect_false("raw_response" %in% names(res)) res_raw <- submit_anthropic_pairs_live( pairs = empty_pairs, model = "claude-sonnet-4-5", trait_name = td$name, trait_description = td$description, prompt_template = tmpl, include_raw = TRUE ) testthat::expect_equal(nrow(res_raw), 0L) testthat::expect_true("raw_response" %in% names(res_raw)) testthat::expect_type(res_raw$raw_response, "list") }) # --------------------------------------------------------------------- testthat::test_that("submit_anthropic_pairs_live calls anthropic_compare_pair_live row-wise", { pairs <- tibble::tibble( ID1 = c("S01", "S03"), text1 = c("Text 1", "Text 3"), ID2 = c("S02", "S04"), text2 = c("Text 2", "Text 4") ) td <- trait_description("overall_quality") tmpl <- set_prompt_template() calls <- list() fake_result <- function(ID1, ID2, chosen) { tibble::tibble( custom_id = sprintf("LIVE_%s_vs_%s", ID1, ID2), ID1 = ID1, ID2 = ID2, model = "claude-sonnet-4-5-20250929", object_type = "message", status_code = 200L, error_message = NA_character_, content = sprintf("%s", chosen), better_sample = chosen, better_id = if (chosen == "SAMPLE_1") ID1 else ID2, prompt_tokens = 10, completion_tokens = 5, total_tokens = 15 ) } testthat::with_mocked_bindings( anthropic_compare_pair_live = function(ID1, text1, ID2, text2, model, trait_name, trait_description, prompt_template, api_key, anthropic_version, reasoning, include_raw, include_thoughts, ...) { calls <<- append(calls, list(list(ID1 = ID1, ID2 = ID2))) if (ID1 == "S01") { fake_result(ID1, ID2, "SAMPLE_1") } else { fake_result(ID1, ID2, "SAMPLE_2") } }, { res <- submit_anthropic_pairs_live( pairs = pairs, model = "claude-sonnet-4-5", trait_name = td$name, trait_description = td$description, prompt_template = tmpl, include_raw = FALSE, verbose = FALSE, progress = FALSE ) testthat::expect_equal(length(calls), 2L) testthat::expect_equal(res$better_id, c("S01", "S04")) } ) }) # --------------------------------------------------------------------- testthat::test_that("anthropic_compare_pair_live defaults and thinking depend on reasoning", { td <- trait_description("overall_quality") tmpl <- set_prompt_template() bodies <- list() fake_body <- list( model = "claude-sonnet-4-5-20250929", id = "msg_01XYZ", type = "message", role = "assistant", content = list( list(type = "text", text = "SAMPLE_1") ), usage = list( input_tokens = 10L, output_tokens = 4L # total_tokens omitted on purpose so we test computed total ) ) testthat::with_mocked_bindings( .anthropic_api_key = function(...) "FAKEKEY", .anthropic_req_body_json = function(req, body) { bodies <<- append(bodies, list(body)) req }, .anthropic_req_perform = function(req) { structure(list(), class = "fake_resp" ) }, .anthropic_resp_body_json = function(...) fake_body, .anthropic_resp_status = function(...) 200L, { # Call once with reasoning = "none" res_none <- anthropic_compare_pair_live( ID1 = "S1", text1 = "A", ID2 = "S2", text2 = "B", model = "claude-sonnet-4-5", trait_name = td$name, trait_description = td$description, prompt_template = tmpl, reasoning = "none" ) # Call again with reasoning = "enabled" res_reason <- anthropic_compare_pair_live( ID1 = "S1", text1 = "A", ID2 = "S2", text2 = "B", model = "claude-sonnet-4-5", trait_name = td$name, trait_description = td$description, prompt_template = tmpl, reasoning = "enabled" ) # Parsed output correctness (same fake response in both cases) testthat::expect_equal(res_none$better_id, "S1") testthat::expect_equal(res_reason$better_id, "S1") # total_tokens should be computed as input + output (10 + 4) testthat::expect_equal(res_none$total_tokens, 14) testthat::expect_equal(res_reason$total_tokens, 14) # We should have captured two bodies with different defaults testthat::expect_equal(length(bodies), 2L) b_none <- bodies[[1]] b_reason <- bodies[[2]] # Temperatures differ by reasoning mode testthat::expect_equal(b_none$temperature, 0) testthat::expect_equal(b_reason$temperature, 1) # Different max_tokens by reasoning mode testthat::expect_equal(b_none$max_tokens, 768) testthat::expect_equal(b_reason$max_tokens, 2048) # No thinking block when reasoning = "none" testthat::expect_false("thinking" %in% names(b_none)) # Thinking block present and well-formed when reasoning = "enabled" testthat::expect_true("thinking" %in% names(b_reason)) testthat::expect_equal(b_reason$thinking$type, "enabled") # Default matches your function's default; adjust here if you changed it testthat::expect_equal(b_reason$thinking$budget_tokens, 1024) } ) }) testthat::test_that("anthropic_compare_pair_live upgrades reasoning when include_thoughts = TRUE", { td <- trait_description("overall_quality") tmpl <- set_prompt_template() bodies <- list() fake_body <- list( model = "claude-sonnet-4-5-20250929", id = "msg_01XYZ", type = "message", role = "assistant", content = list( list(type = "text", text = "SAMPLE_1") ), usage = list( input_tokens = 10L, output_tokens = 4L ) ) testthat::with_mocked_bindings( .anthropic_api_key = function(...) "FAKEKEY", .anthropic_req_body_json = function(req, body) { bodies <<- append(bodies, list(body)) req }, .anthropic_req_perform = function(req) { structure(list(), class = "fake_resp" ) }, .anthropic_resp_body_json = function(...) fake_body, .anthropic_resp_status = function(...) 200L, { res <- anthropic_compare_pair_live( ID1 = "S1", text1 = "A", ID2 = "S2", text2 = "B", model = "claude-sonnet-4-5", trait_name = td$name, trait_description = td$description, prompt_template = tmpl, reasoning = "none", include_thoughts = TRUE ) testthat::expect_equal(res$better_id, "S1") testthat::expect_equal(length(bodies), 1L) b <- bodies[[1]] # When include_thoughts = TRUE and reasoning = "none", we upgrade to # extended thinking mode, which implies temperature = 1 and # a thinking block testthat::expect_equal(b$temperature, 1) testthat::expect_equal(b$max_tokens, 2048) testthat::expect_true("thinking" %in% names(b)) testthat::expect_equal(b$thinking$type, "enabled") testthat::expect_equal(b$thinking$budget_tokens, 1024) } ) }) testthat::test_that("anthropic_compare_pair_live parses reasoning-enabled output correctly", { td <- trait_description("overall_quality") tmpl <- set_prompt_template() ID1 <- "S01" ID2 <- "S02" fake_body <- list( model = "claude-sonnet-4-5-20250929", id = "msg_01XYZ", type = "message", role = "assistant", content = list( # Simulate a more verbose answer you'd get with extended thinking: list(type = "text", text = "Some hidden reasoning and explanation. "), list( type = "text", text = "Final choice: SAMPLE_2." ) ), usage = list( input_tokens = 20L, output_tokens = 30L, total_tokens = 50L ) ) bodies <- list() testthat::with_mocked_bindings( .anthropic_api_key = function(...) "FAKEKEY", .anthropic_req_body_json = function(req, body) { bodies <<- append(bodies, list(body)) req }, .anthropic_req_perform = function(req) { structure(list(), class = "fake_resp" ) }, .anthropic_resp_body_json = function(...) fake_body, .anthropic_resp_status = function(...) 200L, { res <- anthropic_compare_pair_live( ID1 = ID1, text1 = "Text 1", ID2 = ID2, text2 = "Text 2", model = "claude-sonnet-4-5", trait_name = td$name, trait_description = td$description, prompt_template = tmpl, reasoning = "enabled", include_raw = TRUE ) # Still a single-row tibble testthat::expect_s3_class(res, "tbl_df") testthat::expect_equal(nrow(res), 1L) # We concatenated both blocks correctly testthat::expect_true(grepl( "Some hidden reasoning and explanation\\.", res$content )) testthat::expect_true(grepl("SAMPLE_2", res$content, fixed = TRUE )) # Tag parsing works even with extra explanation testthat::expect_equal(res$better_sample, "SAMPLE_2") testthat::expect_equal(res$better_id, ID2) # Tokens passed through correctly testthat::expect_equal(res$prompt_tokens, 20) testthat::expect_equal(res$completion_tokens, 30) testthat::expect_equal(res$total_tokens, 50) # We did send a thinking block in the request testthat::expect_equal(length(bodies), 1L) b <- bodies[[1]] testthat::expect_true("thinking" %in% names(b)) testthat::expect_equal(b$thinking$type, "enabled") } ) }) testthat::test_that("anthropic_compare_pair_live errors if temperature != 1 with reasoning = 'enabled'", { td <- trait_description("overall_quality") tmpl <- set_prompt_template() # temperature = 0 should be rejected when reasoning = "enabled" testthat::expect_error( anthropic_compare_pair_live( ID1 = "S1", text1 = "A", ID2 = "S2", text2 = "B", model = "claude-sonnet-4-5", trait_name = td$name, trait_description = td$description, prompt_template = tmpl, reasoning = "enabled", temperature = 0 ), "temperature` must be 1", fixed = FALSE ) }) testthat::test_that("anthropic_compare_pair_live enforces thinking_budget_tokens constraints", { td <- trait_description("overall_quality") tmpl <- set_prompt_template() # budget too small testthat::expect_error( anthropic_compare_pair_live( ID1 = "S1", text1 = "A", ID2 = "S2", text2 = "B", model = "claude-sonnet-4-5", trait_name = td$name, trait_description = td$description, prompt_template = tmpl, reasoning = "enabled", thinking_budget_tokens = 512 ), "at least 1024", fixed = TRUE ) # budget >= max_tokens should also error testthat::expect_error( anthropic_compare_pair_live( ID1 = "S1", text1 = "A", ID2 = "S2", text2 = "B", model = "claude-sonnet-4-5", trait_name = td$name, trait_description = td$description, prompt_template = tmpl, reasoning = "enabled", max_tokens = 1500, thinking_budget_tokens = 1500 ), "thinking_budget_tokens.*must be smaller than.*max_tokens", fixed = FALSE ) }) testthat::test_that("anthropic_compare_pair_live validates input types", { td <- trait_description("overall_quality") tmpl <- set_prompt_template() # Test non-character inputs testthat::expect_error( anthropic_compare_pair_live( ID1 = 123, text1 = "t", ID2 = "B", text2 = "t", model = "claude", trait_name = td$name, trait_description = td$description ), "`ID1` must be a single character" ) testthat::expect_error( anthropic_compare_pair_live( ID1 = "A", text1 = "t", ID2 = "B", text2 = "t", model = 123, trait_name = td$name, trait_description = td$description ), "`model` must be a single character" ) }) testthat::test_that("anthropic_compare_pair_live warns on conflicting thoughts settings", { td <- trait_description("overall_quality") tmpl <- set_prompt_template() # If user says reasoning="enabled" but include_thoughts=FALSE, it should warn # but keep reasoning="enabled" testthat::with_mocked_bindings( # Mock internals to avoid actual API call failure .anthropic_api_key = function(...) "FAKE", .anthropic_request = function(...) list(), .anthropic_req_body_json = function(req, ...) req, .anthropic_req_perform = function(...) structure(list(), class = "fake"), .anthropic_resp_status = function(...) 200L, .anthropic_resp_body_json = function(...) list(type = "message"), { testthat::expect_warning( anthropic_compare_pair_live( ID1 = "A", text1 = "t", ID2 = "B", text2 = "t", model = "claude", trait_name = td$name, trait_description = td$description, reasoning = "enabled", include_thoughts = FALSE ), "include_thoughts = FALSE but reasoning = 'enabled'" ) } ) }) testthat::test_that("anthropic_compare_pair_live handles complex blocks (redacted thinking, multi-text)", { td <- trait_description("overall_quality") tmpl <- set_prompt_template() # Complex body: Thinking + Redacted Thinking + Split Text complex_body <- list( id = "msg_complex", type = "message", model = "claude-test", role = "assistant", content = list( list(type = "thinking", thinking = "I am thinking... "), list(type = "redacted_thinking", data = "HIDDEN"), list(type = "text", text = "Here is part 1. "), list(type = "text", text = "SAMPLE_1") ), usage = list(input_tokens = 10, output_tokens = 10, total_tokens = 20) ) testthat::with_mocked_bindings( .anthropic_api_key = function(...) "FAKE", .anthropic_req_body_json = function(req, ...) req, .anthropic_req_perform = function(...) structure(list(), class = "fake"), .anthropic_resp_status = function(...) 200L, .anthropic_resp_body_json = function(...) complex_body, { res <- anthropic_compare_pair_live( ID1 = "S1", text1 = "A", ID2 = "S2", text2 = "B", model = "claude", trait_name = td$name, trait_description = td$description ) # Check thoughts concatenation including redaction tag testthat::expect_match(res$thoughts, "I am thinking... ", fixed = TRUE) testthat::expect_match(res$thoughts, "[REDACTED_THINKING]HIDDEN", fixed = TRUE) # Check content concatenation testthat::expect_match(res$content, "Here is part 1. ", fixed = TRUE) testthat::expect_equal(res$better_sample, "SAMPLE_1") } ) }) testthat::test_that("anthropic_compare_pair_live calculates total_tokens fallback", { td <- trait_description("overall_quality") # Usage body missing total_tokens body_no_total <- list( type = "message", content = list(list(type = "text", text = "HI")), usage = list(input_tokens = 50, output_tokens = 50) # Sum = 100 ) testthat::with_mocked_bindings( .anthropic_api_key = function(...) "FAKE", .anthropic_req_body_json = function(req, ...) req, .anthropic_req_perform = function(...) structure(list(), class = "fake"), .anthropic_resp_status = function(...) 200L, .anthropic_resp_body_json = function(...) body_no_total, { res <- anthropic_compare_pair_live( ID1 = "A", text1 = "t", ID2 = "B", text2 = "t", model = "claude", trait_name = td$name, trait_description = td$description ) testthat::expect_equal(res$total_tokens, 100) } ) }) testthat::test_that("submit_anthropic_pairs_live validates input columns and status_every", { td <- trait_description("overall_quality") # Missing text1 column bad_pairs <- tibble::tibble(ID1 = "A", ID2 = "B", text2 = "t") testthat::expect_error( submit_anthropic_pairs_live(bad_pairs, "claude", td$name, td$description), "must contain columns" ) # Invalid status_every good_pairs <- tibble::tibble(ID1 = "A", text1 = "t", ID2 = "B", text2 = "t") testthat::expect_error( submit_anthropic_pairs_live( good_pairs, "claude", td$name, td$description, status_every = 0 ), "status_every` must be a single positive integer" ) }) testthat::test_that("submit_anthropic_pairs_live is resilient to individual pair failures", { td <- trait_description("overall_quality") pairs <- tibble::tibble( ID1 = c("S1", "S2"), text1 = c("A", "C"), ID2 = c("S2", "S3"), text2 = c("B", "D") ) # Mock the single comparison function directly. # Pair 1 throws error, Pair 2 succeeds. testthat::with_mocked_bindings( anthropic_compare_pair_live = function(ID1, ...) { if (ID1 == "S1") { stop("Simulated Network Error") } else { # Return a valid 1-row tibble for success tibble::tibble( custom_id = "LIVE_S2_vs_S3", ID1 = "S2", ID2 = "S3", model = "claude", object_type = "message", status_code = 200L, error_message = NA_character_, thoughts = NA_character_, content = "Result", better_sample = "SAMPLE_1", better_id = "S2", prompt_tokens = 10, completion_tokens = 10, total_tokens = 20 ) } }, { # Capture output, suppress verbose messages for clean test logs res <- suppressMessages( submit_anthropic_pairs_live( pairs, "claude", td$name, td$description, verbose = FALSE, progress = FALSE ) ) testthat::expect_equal(nrow(res), 2L) # Row 1: Failed testthat::expect_equal(res$ID1[1], "S1") testthat::expect_match(res$error_message[1], "Simulated Network Error") testthat::expect_true(is.na(res$status_code[1])) # Row 2: Succeeded testthat::expect_equal(res$ID1[2], "S2") testthat::expect_true(is.na(res$error_message[2])) testthat::expect_equal(res$better_id[2], "S2") } ) })