# =====================================================================
# test-together_live.R
# Tests for together_compare_pair_live() and submit_together_pairs_live()
# =====================================================================
testthat::test_that(
"together_compare_pair_live parses a successful response without thoughts and respects explicit temperature",
{
pll_ns <- asNamespace("pairwiseLLM")
fake_body <- list(
id = "chatcmpl-123",
object = "chat.completion",
model = "moonshotai/Kimi-K2-Instruct-0905",
choices = list(
list(
index = 0L,
message = list(
role = "assistant",
content = "SAMPLE_1 Explanation."
),
finish_reason = "stop"
)
),
usage = list(
prompt_tokens = 42L,
completion_tokens = 7L,
total_tokens = 49L
)
)
captured_bodies <- list()
testthat::local_mocked_bindings(
.together_api_key = function(api_key = NULL) "TEST_TOGETHER_KEY",
.together_req_body_json = function(req, body) {
captured_bodies <<- append(captured_bodies, list(body))
req
},
.together_req_perform = function(req) "FAKE_RESP",
.together_resp_status = function(resp) 200L,
.together_resp_body_json = function(resp, simplifyVector = FALSE) fake_body,
.env = pll_ns
)
td <- trait_description("overall_quality")
tmpl <- set_prompt_template()
res <- together_compare_pair_live(
ID1 = "S01",
text1 = "Sample 1 text.",
ID2 = "S02",
text2 = "Sample 2 text.",
model = "moonshotai/Kimi-K2-Instruct-0905",
trait_name = td$name,
trait_description = td$description,
prompt_template = tmpl,
include_raw = TRUE,
temperature = 0
)
# Basic structure
testthat::expect_s3_class(res, "tbl_df")
testthat::expect_equal(nrow(res), 1L)
testthat::expect_equal(res$ID1, "S01")
testthat::expect_equal(res$ID2, "S02")
testthat::expect_equal(res$model, "moonshotai/Kimi-K2-Instruct-0905")
testthat::expect_equal(res$object_type, "chat.completion")
testthat::expect_equal(res$status_code, 200L)
testthat::expect_true(is.na(res$error_message) || identical(res$error_message, ""))
# No in content, no thoughts extracted
testthat::expect_true(is.na(res$thoughts))
testthat::expect_true(grepl(
"SAMPLE_1",
res$content,
fixed = TRUE
))
testthat::expect_equal(res$better_sample, "SAMPLE_1")
testthat::expect_equal(res$better_id, "S01")
testthat::expect_equal(res$prompt_tokens, 42)
testthat::expect_equal(res$completion_tokens, 7)
testthat::expect_equal(res$total_tokens, 49)
# raw_response list-column present and correct
testthat::expect_true("raw_response" %in% names(res))
testthat::expect_type(res$raw_response, "list")
testthat::expect_identical(res$raw_response[[1]], fake_body)
# Outgoing request body captured once, with explicit temperature = 0
testthat::expect_equal(length(captured_bodies), 1L)
b <- captured_bodies[[1]]
testthat::expect_equal(b$model, "moonshotai/Kimi-K2-Instruct-0905")
testthat::expect_equal(b$temperature, 0)
testthat::expect_true(is.list(b$messages))
testthat::expect_true(length(b$messages) == 1L)
}
)
# ---------------------------------------------------------------------
testthat::test_that(
"together_compare_pair_live applies default temperatures when not supplied",
{
pll_ns <- asNamespace("pairwiseLLM")
fake_body <- list(
id = "chatcmpl-any",
object = "chat.completion",
model = "dummy",
choices = list(
list(
index = 0L,
message = list(
role = "assistant",
content = "SAMPLE_1"
),
finish_reason = "stop"
)
)
)
captured_bodies <- list()
testthat::local_mocked_bindings(
.together_api_key = function(api_key = NULL) "TEST_TOGETHER_KEY",
.together_req_body_json = function(req, body) {
captured_bodies <<- append(captured_bodies, list(body))
req
},
.together_req_perform = function(req) "FAKE_RESP",
.together_resp_status = function(resp) 200L,
.together_resp_body_json = function(resp, simplifyVector = FALSE) fake_body,
.env = pll_ns
)
td <- trait_description("overall_quality")
tmpl <- set_prompt_template()
# 1) Non-thinking model (Kimi) with no temperature -> default 0
together_compare_pair_live(
ID1 = "S01",
text1 = "Text 1",
ID2 = "S02",
text2 = "Text 2",
model = "moonshotai/Kimi-K2-Instruct-0905",
trait_name = td$name,
trait_description = td$description,
prompt_template = tmpl
)
# 2) DeepSeek-R1 with no temperature -> default 0.6
together_compare_pair_live(
ID1 = "S03",
text1 = "Text 3",
ID2 = "S04",
text2 = "Text 4",
model = "deepseek-ai/DeepSeek-R1",
trait_name = td$name,
trait_description = td$description,
prompt_template = tmpl
)
testthat::expect_equal(length(captured_bodies), 2L)
b1 <- captured_bodies[[1]]
b2 <- captured_bodies[[2]]
testthat::expect_equal(b1$model, "moonshotai/Kimi-K2-Instruct-0905")
testthat::expect_equal(b1$temperature, 0)
testthat::expect_equal(b2$model, "deepseek-ai/DeepSeek-R1")
testthat::expect_equal(b2$temperature, 0.6)
}
)
# ---------------------------------------------------------------------
testthat::test_that(
"together_compare_pair_live parses DeepSeek-R1 thoughts correctly",
{
pll_ns <- asNamespace("pairwiseLLM")
raw_content <- paste0(
"This is internal chain-of-thought.\n",
"SAMPLE_2 Visible explanation."
)
fake_body <- list(
id = "chatcmpl-456",
object = "chat.completion",
model = "deepseek-ai/DeepSeek-R1",
choices = list(
list(
index = 0L,
message = list(
role = "assistant",
content = raw_content
),
finish_reason = "stop"
)
),
usage = list(
prompt_tokens = 100L,
completion_tokens = 20L,
total_tokens = 120L
)
)
testthat::local_mocked_bindings(
.together_api_key = function(api_key = NULL) "TEST_TOGETHER_KEY",
.together_req_body_json = function(req, body) req,
.together_req_perform = function(req) "FAKE_RESP",
.together_resp_status = function(resp) 200L,
.together_resp_body_json = function(resp, simplifyVector = FALSE) fake_body,
.env = pll_ns
)
td <- trait_description("overall_quality")
tmpl <- set_prompt_template()
res <- together_compare_pair_live(
ID1 = "S01",
text1 = "Sample 1 text.",
ID2 = "S02",
text2 = "Sample 2 text.",
model = "deepseek-ai/DeepSeek-R1",
trait_name = td$name,
trait_description = td$description,
prompt_template = tmpl
)
testthat::expect_s3_class(res, "tbl_df")
testthat::expect_equal(nrow(res), 1L)
# Thoughts should be extracted from inside ...
testthat::expect_false(is.na(res$thoughts))
testthat::expect_true(grepl(
"internal chain-of-thought",
res$thoughts,
fixed = TRUE
))
# Content should no longer contain tags, only the visible answer
testthat::expect_false(grepl("", res$content, fixed = TRUE))
testthat::expect_true(grepl(
"SAMPLE_2",
res$content,
fixed = TRUE
))
testthat::expect_equal(res$better_sample, "SAMPLE_2")
testthat::expect_equal(res$better_id, "S02")
# Token counts passed through correctly
testthat::expect_equal(res$prompt_tokens, 100)
testthat::expect_equal(res$completion_tokens, 20)
testthat::expect_equal(res$total_tokens, 120)
}
)
# ---------------------------------------------------------------------
testthat::test_that(
"together_compare_pair_live handles responses without tag",
{
pll_ns <- asNamespace("pairwiseLLM")
fake_body <- list(
id = "chatcmpl-789",
object = "chat.completion",
model = "Qwen/Qwen3-235B-A22B-Instruct-2507-tput",
choices = list(
list(
index = 0L,
message = list(
role = "assistant",
content = "I forgot to include the tag, sorry."
),
finish_reason = "stop"
)
)
)
testthat::local_mocked_bindings(
.together_api_key = function(api_key = NULL) "TEST_TOGETHER_KEY",
.together_req_body_json = function(req, body) req,
.together_req_perform = function(req) "FAKE_RESP",
.together_resp_status = function(resp) 200L,
.together_resp_body_json = function(resp, simplifyVector = FALSE) fake_body,
.env = pll_ns
)
td <- trait_description("overall_quality")
tmpl <- set_prompt_template()
res <- together_compare_pair_live(
ID1 = "S01",
text1 = "Sample 1 text.",
ID2 = "S02",
text2 = "Sample 2 text.",
model = "Qwen/Qwen3-235B-A22B-Instruct-2507-tput",
trait_name = td$name,
trait_description = td$description,
prompt_template = tmpl
)
testthat::expect_true(is.na(res$better_sample))
testthat::expect_true(is.na(res$better_id))
}
)
# ---------------------------------------------------------------------
testthat::test_that(
"together_compare_pair_live returns an error row when JSON parse fails",
{
pll_ns <- asNamespace("pairwiseLLM")
testthat::local_mocked_bindings(
.together_api_key = function(api_key = NULL) "TEST_TOGETHER_KEY",
.together_req_body_json = function(req, body) req,
.together_req_perform = function(req) "FAKE_RESP",
.together_resp_status = function(resp) 500L,
.together_resp_body_json = function(resp, simplifyVector = FALSE) {
stop("boom")
},
.env = pll_ns
)
td <- trait_description("overall_quality")
tmpl <- set_prompt_template()
res <- together_compare_pair_live(
ID1 = "S01",
text1 = "Sample 1 text.",
ID2 = "S02",
text2 = "Sample 2 text.",
model = "deepseek-ai/DeepSeek-V3",
trait_name = td$name,
trait_description = td$description,
prompt_template = tmpl,
include_raw = TRUE
)
testthat::expect_equal(nrow(res), 1L)
testthat::expect_equal(res$ID1, "S01")
testthat::expect_equal(res$ID2, "S02")
testthat::expect_true(is.na(res$model))
testthat::expect_true(is.na(res$object_type))
testthat::expect_equal(res$status_code, 500L)
testthat::expect_true(is.na(res$content))
testthat::expect_true(is.na(res$thoughts))
testthat::expect_true(is.na(res$better_sample))
testthat::expect_true(is.na(res$better_id))
testthat::expect_match(
res$error_message,
"Failed to parse Together.ai response body as JSON",
fixed = FALSE
)
# When parse fails and include_raw = TRUE we expect NULL raw_response[[1]]
testthat::expect_true("raw_response" %in% names(res))
testthat::expect_true(is.null(res$raw_response[[1]]))
}
)
# ---------------------------------------------------------------------
testthat::test_that(
"submit_together_pairs_live validates inputs and handles zero-row pairs",
{
td <- trait_description("overall_quality")
tmpl <- set_prompt_template()
# Missing columns
bad_pairs <- tibble::tibble(
ID1 = "S01",
text1 = "Sample 1"
# missing ID2/text2
)
testthat::expect_error(
submit_together_pairs_live(
pairs = bad_pairs,
model = "deepseek-ai/DeepSeek-R1",
trait_name = td$name,
trait_description = td$description,
prompt_template = tmpl
),
"`pairs` must contain columns",
fixed = FALSE
)
# Zero rows: should return empty tibble with expected columns
empty_pairs <- tibble::tibble(
ID1 = character(0),
text1 = character(0),
ID2 = character(0),
text2 = character(0)
)
res_empty <- submit_together_pairs_live(
pairs = empty_pairs,
model = "deepseek-ai/DeepSeek-R1",
trait_name = td$name,
trait_description = td$description,
prompt_template = tmpl
)
testthat::expect_s3_class(res_empty, "tbl_df")
testthat::expect_equal(nrow(res_empty), 0L)
testthat::expect_setequal(
names(res_empty),
c(
"custom_id", "ID1", "ID2", "model", "object_type",
"status_code", "error_message", "thoughts", "content",
"better_sample", "better_id",
"prompt_tokens", "completion_tokens", "total_tokens"
)
)
# When include_raw = TRUE, raw_response column should be present even for zero rows
res_empty_raw <- submit_together_pairs_live(
pairs = empty_pairs,
model = "deepseek-ai/DeepSeek-R1",
trait_name = td$name,
trait_description = td$description,
prompt_template = tmpl,
include_raw = TRUE
)
testthat::expect_true("raw_response" %in% names(res_empty_raw))
testthat::expect_type(res_empty_raw$raw_response, "list")
}
)
# ---------------------------------------------------------------------
testthat::test_that("submit_together_pairs_live runs correctly (mocking internals)", {
pll_ns <- asNamespace("pairwiseLLM")
pairs <- tibble::tibble(
ID1 = c("S01", "S02"),
text1 = c("Text 1a", "Text 2a"),
ID2 = c("S03", "S04"),
text2 = c("Text 1b", "Text 2b")
)
td <- trait_description("overall_quality")
tmpl <- set_prompt_template()
# We capture the bodies sent to the API to verify arguments were passed down
captured_bodies <- list()
fake_body_resp <- list(
id = "chatcmpl-test",
object = "chat.completion",
choices = list(
list(
message = list(content = "SAMPLE_1")
)
),
usage = list(total_tokens = 10)
)
testthat::with_mocked_bindings(
.together_api_key = function(...) "TEST_KEY",
.together_req_body_json = function(req, body) {
captured_bodies <<- append(captured_bodies, list(body))
req
},
.together_req_perform = function(req) "FAKE_RESP",
.together_resp_status = function(resp) 200L,
.together_resp_body_json = function(resp, simplifyVector = FALSE) fake_body_resp,
.env = pll_ns,
{
res <- submit_together_pairs_live(
pairs = pairs,
model = "deepseek-ai/DeepSeek-R1",
trait_name = td$name,
trait_description = td$description,
prompt_template = tmpl,
include_raw = FALSE,
verbose = FALSE,
progress = FALSE,
temperature = 0.7 # Custom arg to verify passthrough
)
# Verify structure
testthat::expect_s3_class(res, "tbl_df")
testthat::expect_equal(nrow(res), 2L)
testthat::expect_equal(res$ID1, c("S01", "S02"))
# Verify results parsed from the fake body
testthat::expect_true(all(res$better_sample == "SAMPLE_1"))
# Verify arguments were passed down to internals correctly
testthat::expect_equal(length(captured_bodies), 2L)
# Check passthrough of 'temperature' and 'model'
testthat::expect_equal(captured_bodies[[1]]$model, "deepseek-ai/DeepSeek-R1")
testthat::expect_equal(captured_bodies[[1]]$temperature, 0.7)
}
)
})
testthat::test_that("together_compare_pair_live validates input types", {
td <- trait_description("overall_quality")
tmpl <- set_prompt_template()
# ID1 must be character
testthat::expect_error(
together_compare_pair_live(
ID1 = 123, text1 = "t", ID2 = "B", text2 = "t",
model = "model", trait_name = td$name, trait_description = td$description
),
"`ID1` must be a single character"
)
# text1 must be character
testthat::expect_error(
together_compare_pair_live(
ID1 = "A", text1 = list(), ID2 = "B", text2 = "t",
model = "model", trait_name = td$name, trait_description = td$description
),
"`text1` must be a single character"
)
# model must be character
testthat::expect_error(
together_compare_pair_live(
ID1 = "A", text1 = "t", ID2 = "B", text2 = "t",
model = 1, trait_name = td$name, trait_description = td$description
),
"`model` must be a single character"
)
})
testthat::test_that("together_compare_pair_live handles network/HTTP errors gracefully", {
pll_ns <- asNamespace("pairwiseLLM")
td <- trait_description("overall_quality")
# Mock internals to simulate a connection error
testthat::with_mocked_bindings(
.together_api_key = function(...) "KEY",
.together_req_body_json = function(req, ...) req,
.together_req_perform = function(...) {
stop("Simulated connection timeout")
},
.env = pll_ns,
{
res <- together_compare_pair_live(
ID1 = "S1", text1 = "A", ID2 = "S2", text2 = "B",
model = "model", trait_name = td$name, trait_description = td$description,
include_raw = TRUE
)
testthat::expect_s3_class(res, "tbl_df")
testthat::expect_equal(res$ID1, "S1")
# The catch-all error handler sets status_code to NA
testthat::expect_true(is.na(res$status_code))
# Check error message is captured
testthat::expect_match(res$error_message, "Together.ai request error: Simulated connection timeout")
# Check include_raw behavior on error (should be NULL)
testthat::expect_true("raw_response" %in% names(res))
testthat::expect_true(is.null(res$raw_response[[1]]))
}
)
})
testthat::test_that("together_compare_pair_live handles API-level errors (valid JSON, bad status)", {
pll_ns <- asNamespace("pairwiseLLM")
td <- trait_description("overall_quality")
fake_error_body <- list(
error = list(
message = "Rate limit exceeded",
type = "rate_limit_error"
)
)
testthat::with_mocked_bindings(
.together_api_key = function(...) "KEY",
.together_req_body_json = function(req, ...) req,
.together_req_perform = function(...) "RESP",
.together_resp_status = function(...) 429L,
.together_resp_body_json = function(...) fake_error_body,
.env = pll_ns,
{
res <- together_compare_pair_live(
ID1 = "S1", text1 = "A", ID2 = "S2", text2 = "B",
model = "model", trait_name = td$name, trait_description = td$description
)
testthat::expect_equal(res$status_code, 429L)
testthat::expect_match(res$error_message, "Rate limit exceeded")
testthat::expect_true(is.na(res$content))
}
)
})
testthat::test_that("together_compare_pair_live handles incomplete tags", {
pll_ns <- asNamespace("pairwiseLLM")
td <- trait_description("overall_quality")
# Case: is started but never closed (e.g. max tokens reached)
raw_text <- " I am thinking... [cut off]"
fake_body <- list(
choices = list(list(message = list(content = raw_text)))
)
testthat::with_mocked_bindings(
.together_api_key = function(...) "KEY",
.together_req_body_json = function(req, ...) req,
.together_req_perform = function(...) "RESP",
.together_resp_status = function(...) 200L,
.together_resp_body_json = function(...) fake_body,
.env = pll_ns,
{
res <- together_compare_pair_live(
ID1 = "S1", text1 = "A", ID2 = "S2", text2 = "B",
model = "deepseek-ai/DeepSeek-R1", trait_name = td$name, trait_description = td$description
)
# If regex tags aren't both present, it treats the whole thing as visible content
testthat::expect_true(is.na(res$thoughts))
testthat::expect_equal(res$content, raw_text)
}
)
})
testthat::test_that("submit_together_pairs_live validates status_every", {
td <- trait_description("overall_quality")
pairs <- tibble::tibble(ID1 = "A", text1 = "t", ID2 = "B", text2 = "t")
testthat::expect_error(
submit_together_pairs_live(pairs, "model", td$name, td$description, status_every = 0),
"status_every` must be a single positive integer"
)
testthat::expect_error(
submit_together_pairs_live(pairs, "model", td$name, td$description, status_every = "1"),
"status_every` must be a single positive integer"
)
})
testthat::test_that("submit_together_pairs_live handles internal R errors in loop", {
pll_ns <- asNamespace("pairwiseLLM")
td <- trait_description("overall_quality")
pairs <- tibble::tibble(
ID1 = c("S1", "S2"),
text1 = "A", ID2 = "B", text2 = "C"
)
# Here we mock 'together_compare_pair_live' SAFELY because we use with_mocked_bindings
# and this test is isolated at the end. However, to be ultra-safe and consistent with
# the strategy, we can simulate the error by mocking the internals to crash for specific ID.
testthat::with_mocked_bindings(
.together_api_key = function(...) "KEY",
.together_req_perform = function(...) "RESP",
.together_resp_status = function(...) 200L,
.together_resp_body_json = function(...) list(choices = list(list(message = list(content = "OK")))),
# We mock the high-level function only for this specific aggregation test
# to guarantee we trigger the catch block inside the loop without relying on network stack details.
together_compare_pair_live = function(ID1, ...) {
if (ID1 == "S1") stop("Unexpected internal crash")
tibble::tibble(
custom_id = "LIVE_S2_vs_B", ID1 = "S2", ID2 = "B",
model = "mod", object_type = "chat", status_code = 200L,
error_message = NA_character_, thoughts = NA_character_,
content = "Res", better_sample = "SAMPLE_1", better_id = "S2",
prompt_tokens = 1, completion_tokens = 1, total_tokens = 2
)
},
.env = pll_ns,
{
res <- suppressMessages(
submit_together_pairs_live(
pairs, "model", td$name, td$description,
verbose = FALSE, progress = FALSE, include_raw = TRUE
)
)
testthat::expect_equal(nrow(res), 2L)
# First row: Error caught by loop tryCatch
r1 <- res[1, ]
testthat::expect_equal(r1$ID1, "S1")
testthat::expect_match(r1$error_message, "Error during Together.ai comparison: Unexpected internal crash")
testthat::expect_true(is.null(r1$raw_response[[1]]))
# Second row: Success
r2 <- res[2, ]
testthat::expect_equal(r2$ID1, "S2")
testthat::expect_true(is.na(r2$error_message))
}
)
})