# =====================================================================
# test-ollama_live.R
# Tests for ollama_compare_pair_live() and submit_ollama_pairs_live()
# =====================================================================
# ---------------------------------------------------------------------
# ollama_compare_pair_live: happy path
# ---------------------------------------------------------------------
testthat::test_that("ollama_compare_pair_live parses successful response
correctly", {
td <- trait_description("overall_quality")
tmpl <- set_prompt_template()
ID1 <- "S01"
ID2 <- "S02"
text1 <- "Text 1"
text2 <- "Text 2"
fake_body <- list(
model = "mistral-small3.2:24b",
response =
"SAMPLE_1 Some explanation.",
prompt_eval_count = 10L,
eval_count = 5L
)
captured_body <- NULL
testthat::with_mocked_bindings(
# Mock the retry wrapper so no real HTTP requests are made
.retry_httr2_request = function(req) {
structure(list(), class = "fake_resp")
},
# We only need to intercept these; request/req_url_path_append/req_error
# can remain as in the real code.
req_body_json = function(req, body) {
captured_body <<- body
req
},
resp_body_json = function(resp, simplifyVector = FALSE) fake_body,
resp_status = function(resp) 200L,
{
res <- ollama_compare_pair_live(
ID1 = ID1,
text1 = text1,
ID2 = ID2,
text2 = text2,
model = "mistral-small3.2:24b",
trait_name = td$name,
trait_description = td$description,
prompt_template = tmpl,
host = "http://127.0.0.1:11434",
num_ctx = 8192L,
think = FALSE,
include_raw = TRUE
)
# Basic shape
testthat::expect_s3_class(res, "tbl_df")
testthat::expect_equal(nrow(res), 1L)
testthat::expect_equal(res$custom_id, sprintf("LIVE_%s_vs_%s", ID1, ID2))
testthat::expect_equal(res$ID1, ID1)
testthat::expect_equal(res$ID2, ID2)
testthat::expect_equal(res$model, "mistral-small3.2:24b")
testthat::expect_equal(res$object_type, "ollama.generate")
testthat::expect_equal(res$status_code, 200L)
testthat::expect_true(is.na(res$error_message))
testthat::expect_equal(
res$content,
"SAMPLE_1 Some explanation."
)
testthat::expect_equal(res$better_sample, "SAMPLE_1")
testthat::expect_equal(res$better_id, ID1)
# Token counts from prompt_eval_count + eval_count
testthat::expect_equal(res$prompt_tokens, 10)
testthat::expect_equal(res$completion_tokens, 5)
testthat::expect_equal(res$total_tokens, 15)
# raw_response
testthat::expect_true("raw_response" %in% names(res))
testthat::expect_type(res$raw_response, "list")
testthat::expect_equal(
res$raw_response[[1]]$model, "mistral-small3.2:24b"
)
# Request body sanity checks
testthat::expect_type(captured_body, "list")
testthat::expect_equal(captured_body$model, "mistral-small3.2:24b")
testthat::expect_false(isTRUE(captured_body$stream))
# Default context window + temperature for non-Qwen models
testthat::expect_equal(captured_body$options$num_ctx, 8192L)
testthat::expect_equal(captured_body$options$temperature, 0)
}
)
})
# ---------------------------------------------------------------------
# ollama_compare_pair_live: Qwen + think = TRUE → temperature = 0.6
# ---------------------------------------------------------------------
testthat::test_that("ollama_compare_pair_live sets Qwen temperature
when think = TRUE", {
td <- trait_description("overall_quality")
tmpl <- set_prompt_template()
ID1 <- "S01"
ID2 <- "S02"
text1 <- "Text A"
text2 <- "Text B"
fake_body <- list(
model = "qwen3:32b",
response = "SAMPLE_2 Explanation.",
prompt_eval_count = 4L,
eval_count = 6L
)
captured_body <- NULL
testthat::with_mocked_bindings(
.retry_httr2_request = function(req) {
structure(list(), class = "fake_resp")
},
req_body_json = function(req, body) {
captured_body <<- body
req
},
resp_body_json = function(resp, simplifyVector = FALSE) fake_body,
resp_status = function(resp) 200L,
{
res <- ollama_compare_pair_live(
ID1 = ID1,
text1 = text1,
ID2 = ID2,
text2 = text2,
model = "qwen3:32b",
trait_name = td$name,
trait_description = td$description,
prompt_template = tmpl,
host = "http://127.0.0.1:11434",
think = TRUE,
num_ctx = 4096L,
include_raw = FALSE
)
testthat::expect_s3_class(res, "tbl_df")
testthat::expect_equal(res$better_sample, "SAMPLE_2")
testthat::expect_equal(res$better_id, ID2)
# Temperature logic: Qwen + think = TRUE → 0.6
testthat::expect_type(captured_body, "list")
testthat::expect_equal(captured_body$model, "qwen3:32b")
testthat::expect_equal(captured_body$options$num_ctx, 4096L)
testthat::expect_equal(captured_body$options$temperature, 0.6)
}
)
})
# ---------------------------------------------------------------------
# ollama_compare_pair_live: JSON parse failure
# ---------------------------------------------------------------------
testthat::test_that("ollama_compare_pair_live returns error row on
JSON parse failure", {
td <- trait_description("overall_quality")
tmpl <- set_prompt_template()
ID1 <- "S01"
ID2 <- "S02"
testthat::with_mocked_bindings(
.retry_httr2_request = function(req) {
structure(list(), class = "fake_resp")
},
req_body_json = function(req, body) req,
resp_body_json = function(resp, simplifyVector = FALSE) stop("boom"),
resp_status = function(resp) 500L,
{
res <- ollama_compare_pair_live(
ID1 = ID1,
text1 = "X",
ID2 = ID2,
text2 = "Y",
model = "mistral-small3.2:24b",
trait_name = td$name,
trait_description = td$description,
prompt_template = tmpl,
host = "http://127.0.0.1:11434",
include_raw = TRUE
)
testthat::expect_s3_class(res, "tbl_df")
testthat::expect_equal(res$status_code, 500L)
testthat::expect_equal(
res$error_message,
"Failed to parse response body as JSON."
)
testthat::expect_true(is.na(res$better_sample))
testthat::expect_true(is.na(res$better_id))
testthat::expect_true(is.null(res$raw_response[[1]]))
}
)
})
# ---------------------------------------------------------------------
# submit_ollama_pairs_live: zero rows
# ---------------------------------------------------------------------
testthat::test_that("submit_ollama_pairs_live returns empty tibble
for zero rows", {
td <- trait_description("overall_quality")
tmpl <- set_prompt_template()
empty_pairs <- tibble::tibble(
ID1 = character(0),
text1 = character(0),
ID2 = character(0),
text2 = character(0)
)
res <- submit_ollama_pairs_live(
pairs = empty_pairs,
model = "mistral-small3.2:24b",
trait_name = td$name,
trait_description = td$description,
prompt_template = tmpl,
host = "http://127.0.0.1:11434",
verbose = FALSE,
progress = FALSE
)
testthat::expect_s3_class(res, "tbl_df")
testthat::expect_equal(nrow(res), 0L)
testthat::expect_true("thoughts" %in% names(res))
testthat::expect_false("raw_response" %in% names(res))
# With include_raw = TRUE, empty tibble still has raw_response column
res2 <- submit_ollama_pairs_live(
pairs = empty_pairs,
model = "mistral-small3.2:24b",
trait_name = td$name,
trait_description = td$description,
prompt_template = tmpl,
host = "http://127.0.0.1:11434",
verbose = FALSE,
progress = FALSE,
include_raw = TRUE
)
testthat::expect_s3_class(res2, "tbl_df")
testthat::expect_equal(nrow(res2), 0L)
testthat::expect_true("raw_response" %in% names(res2))
testthat::expect_type(res2$raw_response, "list")
})
# ---------------------------------------------------------------------
# submit_ollama_pairs_live: row-wise calling of ollama_compare_pair_live
# ---------------------------------------------------------------------
testthat::test_that("submit_ollama_pairs_live calls
ollama_compare_pair_live row-wise", {
pairs <- tibble::tibble(
ID1 = c("S01", "S03"),
text1 = c("Text 1", "Text 3"),
ID2 = c("S02", "S04"),
text2 = c("Text 2", "Text 4")
)
td <- trait_description("overall_quality")
tmpl <- set_prompt_template()
fake_result <- function(ID1, ID2, chosen) {
tibble::tibble(
custom_id = sprintf("LIVE_%s_vs_%s", ID1, ID2),
ID1 = ID1,
ID2 = ID2,
model = "mistral-small3.2:24b",
object_type = "ollama.generate",
status_code = 200L,
error_message = NA_character_,
thoughts = NA_character_,
content = sprintf("%s", chosen),
better_sample = chosen,
better_id = if (chosen == "SAMPLE_1") ID1 else ID2,
prompt_tokens = 10,
completion_tokens = 5,
total_tokens = 15
)
}
calls <- list()
testthat::with_mocked_bindings(
ollama_compare_pair_live = function(
ID1,
text1,
ID2,
text2,
model,
trait_name,
trait_description,
prompt_template,
host,
tag_prefix,
tag_suffix,
think,
num_ctx,
include_raw,
...
) {
calls <<- append(calls, list(list(
ID1 = ID1,
ID2 = ID2,
model = model,
trait_name = trait_name,
trait_description = trait_description,
prompt_template = prompt_template,
host = host,
think = think,
num_ctx = num_ctx,
include_raw = include_raw,
dots = list(...)
)))
if (ID1 == "S01") {
fake_result(ID1, ID2, "SAMPLE_1")
} else {
fake_result(ID1, ID2, "SAMPLE_2")
}
},
{
res <- submit_ollama_pairs_live(
pairs = pairs,
model = "mistral-small3.2:24b",
trait_name = td$name,
trait_description = td$description,
prompt_template = tmpl,
host = "http://127.0.0.1:11434",
verbose = FALSE,
status_every = 1,
progress = FALSE,
include_raw = FALSE,
think = FALSE,
num_ctx = 8192L
)
# Should call helper once per row
testthat::expect_equal(length(calls), 2L)
testthat::expect_equal(calls[[1]]$ID1, "S01")
testthat::expect_equal(calls[[2]]$ID1, "S03")
# Shared parameters should be forwarded correctly
for (call in calls) {
testthat::expect_equal(call$model, "mistral-small3.2:24b")
testthat::expect_equal(call$trait_name, td$name)
testthat::expect_equal(call$trait_description, td$description)
testthat::expect_identical(call$prompt_template, tmpl)
testthat::expect_equal(call$host, "http://127.0.0.1:11434")
testthat::expect_false(call$think)
testthat::expect_equal(call$num_ctx, 8192L)
testthat::expect_false(call$include_raw)
}
# Aggregated result should be consistent with fake_result logic
testthat::expect_s3_class(res, "tbl_df")
testthat::expect_equal(res$better_id, c("S01", "S04"))
}
)
})
testthat::test_that("ollama_compare_pair_live validates scalar arguments", {
td <- trait_description("overall_quality")
tmpl <- set_prompt_template()
# ID1 must be scalar character
expect_error(
ollama_compare_pair_live(
ID1 = c("S01", "S02"),
text1 = "A",
ID2 = "S02",
text2 = "B",
model = "mistral-small3.2:24b",
trait_name = td$name,
trait_description = td$description,
prompt_template = tmpl
),
"`ID1` must be a single character.",
fixed = TRUE
)
# host must be non-empty
expect_error(
ollama_compare_pair_live(
ID1 = "S01",
text1 = "A",
ID2 = "S02",
text2 = "B",
model = "mistral-small3.2:24b",
trait_name = td$name,
trait_description = td$description,
prompt_template = tmpl,
host = ""
),
"`host` must be a non-empty character scalar.",
fixed = TRUE
)
# num_ctx must be positive scalar
expect_error(
ollama_compare_pair_live(
ID1 = "S01",
text1 = "A",
ID2 = "S02",
text2 = "B",
model = "mistral-small3.2:24b",
trait_name = td$name,
trait_description = td$description,
prompt_template = tmpl,
num_ctx = 0
),
"`num_ctx` must be a single positive number.",
fixed = TRUE
)
})
testthat::test_that("ollama_compare_pair_live exposes thinking only when think = TRUE", {
td <- trait_description("overall_quality")
tmpl <- set_prompt_template()
ID1 <- "S01"
ID2 <- "S02"
text1 <- "Text A"
text2 <- "Text B"
fake_body <- list(
model = "qwen3:32b",
response = "SAMPLE_2 Explanation.",
prompt_eval_count = 4L,
eval_count = 6L,
thinking = "Internal reasoning trace"
)
captured_body <- NULL
testthat::with_mocked_bindings(
.retry_httr2_request = function(req) {
structure(list(), class = "fake_resp")
},
req_body_json = function(req, body) {
captured_body <<- body
req
},
resp_body_json = function(resp, simplifyVector = FALSE) fake_body,
resp_status = function(resp) 200L,
{
res <- ollama_compare_pair_live(
ID1 = ID1,
text1 = text1,
ID2 = ID2,
text2 = text2,
model = "qwen3:32b",
trait_name = td$name,
trait_description = td$description,
prompt_template = tmpl,
host = "http://127.0.0.1:11434",
think = TRUE,
num_ctx = 4096L,
include_raw = FALSE
)
testthat::expect_equal(res$thoughts, "Internal reasoning trace")
testthat::expect_equal(captured_body$options$temperature, 0.6)
}
)
})
testthat::test_that("ollama_compare_pair_live uses error field when status != 200", {
td <- trait_description("overall_quality")
tmpl <- set_prompt_template()
fake_body <- list(
model = "mistral-small3.2:24b",
response = "SAMPLE_1",
prompt_eval_count = 1L,
eval_count = 1L,
error = "OOM in backend"
)
testthat::with_mocked_bindings(
.retry_httr2_request = function(req) structure(list(), class = "fake_resp"),
req_body_json = function(req, body) req,
resp_body_json = function(resp, simplifyVector = FALSE) fake_body,
resp_status = function(resp) 500L,
{
res <- ollama_compare_pair_live(
ID1 = "S01",
text1 = "A",
ID2 = "S02",
text2 = "B",
model = "mistral-small3.2:24b",
trait_name = td$name,
trait_description = td$description,
prompt_template = tmpl,
host = "http://127.0.0.1:11434"
)
testthat::expect_equal(res$status_code, 500L)
testthat::expect_equal(res$error_message, "OOM in backend")
}
)
})
testthat::test_that("ollama_compare_pair_live falls back to generic error message", {
td <- trait_description("overall_quality")
tmpl <- set_prompt_template()
fake_body <- list(
model = "mistral-small3.2:24b",
response = "SAMPLE_1",
prompt_eval_count = 1L,
eval_count = 1L
# no error or message field
)
testthat::with_mocked_bindings(
.retry_httr2_request = function(req) structure(list(), class = "fake_resp"),
req_body_json = function(req, body) req,
resp_body_json = function(resp, simplifyVector = FALSE) fake_body,
resp_status = function(resp) 503L,
{
res <- ollama_compare_pair_live(
ID1 = "S01",
text1 = "A",
ID2 = "S02",
text2 = "B",
model = "mistral-small3.2:24b",
trait_name = td$name,
trait_description = td$description,
prompt_template = tmpl,
host = "http://127.0.0.1:11434"
)
testthat::expect_equal(res$status_code, 503L)
testthat::expect_match(
res$error_message,
"Ollama request failed with status 503",
fixed = FALSE
)
}
)
})
testthat::test_that("submit_ollama_pairs_live validates required columns", {
td <- trait_description("overall_quality")
tmpl <- set_prompt_template()
bad_pairs <- tibble::tibble(
ID1 = "S01",
text1 = "A"
# missing ID2, text2
)
expect_error(
submit_ollama_pairs_live(
pairs = bad_pairs,
model = "mistral-small3.2:24b",
trait_name = td$name,
trait_description = td$description,
prompt_template = tmpl,
verbose = FALSE,
progress = FALSE
),
"`pairs` must contain columns:",
fixed = TRUE
)
})
testthat::test_that("submit_ollama_pairs_live validates status_every", {
td <- trait_description("overall_quality")
tmpl <- set_prompt_template()
pairs <- tibble::tibble(
ID1 = "S01",
text1 = "A",
ID2 = "S02",
text2 = "B"
)
expect_error(
submit_ollama_pairs_live(
pairs = pairs,
model = "mistral-small3.2:24b",
trait_name = td$name,
trait_description = td$description,
prompt_template = tmpl,
status_every = 0,
verbose = FALSE,
progress = FALSE
),
"`status_every` must be a single positive integer.",
fixed = TRUE
)
})
testthat::test_that("submit_ollama_pairs_live emits warning for error rows", {
td <- trait_description("overall_quality")
tmpl <- set_prompt_template()
pairs <- tibble::tibble(
ID1 = "S01",
text1 = "A",
ID2 = "S02",
text2 = "B"
)
fake_row <- tibble::tibble(
custom_id = "LIVE_S01_vs_S02",
ID1 = "S01",
ID2 = "S02",
model = "mistral-small3.2:24b",
object_type = "ollama.generate",
status_code = 500L,
error_message = "backend failure",
thoughts = NA_character_,
content = NA_character_,
better_sample = NA_character_,
better_id = NA_character_,
prompt_tokens = NA_real_,
completion_tokens = NA_real_,
total_tokens = NA_real_
)
testthat::with_mocked_bindings(
ollama_compare_pair_live = function(...) fake_row,
{
res <- submit_ollama_pairs_live(
pairs = pairs,
model = "mistral-small3.2:24b",
trait_name = td$name,
trait_description = td$description,
prompt_template = tmpl,
verbose = TRUE,
status_every = 1,
progress = TRUE, # exercises txtProgressBar path
include_raw = FALSE,
think = FALSE,
num_ctx = 8192L
)
testthat::expect_equal(res$error_message, "backend failure")
testthat::expect_equal(res$status_code, 500L)
}
)
})
testthat::test_that("submit_ollama_pairs_live computes timing when show_status is TRUE", {
td <- trait_description("overall_quality")
tmpl <- set_prompt_template()
pairs <- tibble::tibble(
ID1 = "S01",
text1 = "A",
ID2 = "S02",
text2 = "B"
)
ok_row <- tibble::tibble(
custom_id = "LIVE_S01_vs_S02",
ID1 = "S01",
ID2 = "S02",
model = "mistral-small3.2:24b",
object_type = "ollama.generate",
status_code = 200L,
error_message = NA_character_,
thoughts = NA_character_,
content = "SAMPLE_1",
better_sample = "SAMPLE_1",
better_id = "S01",
prompt_tokens = 10,
completion_tokens = 5,
total_tokens = 15
)
testthat::with_mocked_bindings(
ollama_compare_pair_live = function(...) ok_row,
{
res <- submit_ollama_pairs_live(
pairs = pairs,
model = "mistral-small3.2:24b",
trait_name = td$name,
trait_description = td$description,
prompt_template = tmpl,
verbose = TRUE,
status_every = 1,
progress = FALSE,
include_raw = FALSE
)
testthat::expect_equal(res$better_id, "S01")
testthat::expect_true(all(is.na(res$error_message)))
}
)
})
testthat::test_that("ensure_only_ollama_model_loaded validates model argument", {
expect_error(
ensure_only_ollama_model_loaded(c("m1", "m2")),
"`model` must be a non-empty character scalar.",
fixed = TRUE
)
})
# ---------------------------------------------------------------------
# ensure_only_ollama_model_loaded: system / parsing behaviour
# ---------------------------------------------------------------------
testthat::test_that(
"ensure_only_ollama_model_loaded validates model argument",
{
testthat::expect_error(
ensure_only_ollama_model_loaded(character()),
"`model` must be a non-empty character scalar.",
fixed = TRUE
)
testthat::expect_error(
ensure_only_ollama_model_loaded(c("a", "b")),
"`model` must be a non-empty character scalar.",
fixed = TRUE
)
testthat::expect_error(
ensure_only_ollama_model_loaded(""),
"`model` must be a non-empty character scalar.",
fixed = TRUE
)
}
)
testthat::test_that(
"ensure_only_ollama_model_loaded handles ollama ps failure",
{
testthat::with_mocked_bindings(
.ollama_system2 = function(command, args, stdout = TRUE, stderr = TRUE, ...) {
stop("simulated failure from ollama ps")
},
{
res <- ensure_only_ollama_model_loaded("mistral-small3.2:24b", verbose = TRUE)
# Should return an empty character vector, invisibly
testthat::expect_type(res, "character")
testthat::expect_length(res, 0L)
}
)
}
)
testthat::test_that(
"ensure_only_ollama_model_loaded handles empty and header-only output",
{
# Case 1: completely empty output but status == 0
testthat::with_mocked_bindings(
.ollama_system2 = function(command, args, stdout = TRUE, stderr = TRUE, ...) {
structure(character(0), status = 0L)
},
{
res_empty <- ensure_only_ollama_model_loaded("qwen3:32b", verbose = FALSE)
testthat::expect_type(res_empty, "character")
testthat::expect_length(res_empty, 0L)
}
)
# Case 2: header line only
header_only <- structure(
"NAME ID STATUS",
status = 0L
)
testthat::with_mocked_bindings(
.ollama_system2 = function(command, args, stdout = TRUE, stderr = TRUE, ...) {
header_only
},
{
res_header <- ensure_only_ollama_model_loaded("qwen3:32b", verbose = FALSE)
testthat::expect_type(res_header, "character")
testthat::expect_length(res_header, 0L)
}
)
}
)
testthat::test_that(
"ensure_only_ollama_model_loaded parses models and unloads others",
{
calls <- list()
fake_ps_output <- structure(
c(
"NAME ID STATUS",
"mistral-small3.2:24b abc123 running",
"qwen3:32b def456 running",
"gemma3:27b ghi789 running"
),
status = 0L
)
fake_ollama_system2 <- function(command, args, stdout = TRUE, stderr = TRUE, ...) {
if (identical(args, "ps")) {
# First call: simulate `ollama ps`
fake_ps_output
} else {
# Subsequent calls should be of the form c("stop", )
calls <<- append(calls, list(list(command = command, args = args)))
invisible(NULL)
}
}
testthat::with_mocked_bindings(
.ollama_system2 = fake_ollama_system2,
{
keep_model <- "qwen3:32b"
res <- ensure_only_ollama_model_loaded(keep_model, verbose = FALSE)
# Should request unloading of all models except the one we keep
testthat::expect_setequal(
res,
c("mistral-small3.2:24b", "gemma3:27b")
)
# We should have issued stop commands for the same set
stopped_models <- vapply(
calls,
function(x) x$args[2],
character(1)
)
testthat::expect_setequal(
stopped_models,
c("mistral-small3.2:24b", "gemma3:27b")
)
# All stop commands should target the `ollama` binary
testthat::expect_true(all(vapply(calls, function(x) x$command, "") == "ollama"))
}
)
}
)
testthat::test_that("ensure_only_ollama_model_loaded handles empty or weird CLI output", {
# Scenario 1: ollama ps returns only header
testthat::with_mocked_bindings(
.ollama_system2 = function(...) {
structure("NAME ID SIZE PROCESSOR UNTIL", status = 0L)
},
{
# Should return empty character invisibly (no models found to unload)
res <- ensure_only_ollama_model_loaded("mistral", verbose = FALSE)
testthat::expect_length(res, 0L)
}
)
# Scenario 2: ollama ps returns header and the target model only
testthat::with_mocked_bindings(
.ollama_system2 = function(...) {
structure(c(
"NAME ID SIZE PROCESSOR UNTIL",
"mistral abc 4GB gpu 4m"
), status = 0L)
},
{
# Should return empty character (target matches, nothing to unload)
res <- ensure_only_ollama_model_loaded("mistral", verbose = FALSE)
testthat::expect_length(res, 0L)
}
)
})