# ===================================================================== # test-ollama_live.R # Tests for ollama_compare_pair_live() and submit_ollama_pairs_live() # ===================================================================== # --------------------------------------------------------------------- # ollama_compare_pair_live: happy path # --------------------------------------------------------------------- testthat::test_that("ollama_compare_pair_live parses successful response correctly", { td <- trait_description("overall_quality") tmpl <- set_prompt_template() ID1 <- "S01" ID2 <- "S02" text1 <- "Text 1" text2 <- "Text 2" fake_body <- list( model = "mistral-small3.2:24b", response = "SAMPLE_1 Some explanation.", prompt_eval_count = 10L, eval_count = 5L ) captured_body <- NULL testthat::with_mocked_bindings( # Mock the retry wrapper so no real HTTP requests are made .retry_httr2_request = function(req) { structure(list(), class = "fake_resp") }, # We only need to intercept these; request/req_url_path_append/req_error # can remain as in the real code. req_body_json = function(req, body) { captured_body <<- body req }, resp_body_json = function(resp, simplifyVector = FALSE) fake_body, resp_status = function(resp) 200L, { res <- ollama_compare_pair_live( ID1 = ID1, text1 = text1, ID2 = ID2, text2 = text2, model = "mistral-small3.2:24b", trait_name = td$name, trait_description = td$description, prompt_template = tmpl, host = "http://127.0.0.1:11434", num_ctx = 8192L, think = FALSE, include_raw = TRUE ) # Basic shape testthat::expect_s3_class(res, "tbl_df") testthat::expect_equal(nrow(res), 1L) testthat::expect_equal(res$custom_id, sprintf("LIVE_%s_vs_%s", ID1, ID2)) testthat::expect_equal(res$ID1, ID1) testthat::expect_equal(res$ID2, ID2) testthat::expect_equal(res$model, "mistral-small3.2:24b") testthat::expect_equal(res$object_type, "ollama.generate") testthat::expect_equal(res$status_code, 200L) testthat::expect_true(is.na(res$error_message)) testthat::expect_equal( res$content, "SAMPLE_1 Some explanation." ) testthat::expect_equal(res$better_sample, "SAMPLE_1") testthat::expect_equal(res$better_id, ID1) # Token counts from prompt_eval_count + eval_count testthat::expect_equal(res$prompt_tokens, 10) testthat::expect_equal(res$completion_tokens, 5) testthat::expect_equal(res$total_tokens, 15) # raw_response testthat::expect_true("raw_response" %in% names(res)) testthat::expect_type(res$raw_response, "list") testthat::expect_equal( res$raw_response[[1]]$model, "mistral-small3.2:24b" ) # Request body sanity checks testthat::expect_type(captured_body, "list") testthat::expect_equal(captured_body$model, "mistral-small3.2:24b") testthat::expect_false(isTRUE(captured_body$stream)) # Default context window + temperature for non-Qwen models testthat::expect_equal(captured_body$options$num_ctx, 8192L) testthat::expect_equal(captured_body$options$temperature, 0) } ) }) # --------------------------------------------------------------------- # ollama_compare_pair_live: Qwen + think = TRUE → temperature = 0.6 # --------------------------------------------------------------------- testthat::test_that("ollama_compare_pair_live sets Qwen temperature when think = TRUE", { td <- trait_description("overall_quality") tmpl <- set_prompt_template() ID1 <- "S01" ID2 <- "S02" text1 <- "Text A" text2 <- "Text B" fake_body <- list( model = "qwen3:32b", response = "SAMPLE_2 Explanation.", prompt_eval_count = 4L, eval_count = 6L ) captured_body <- NULL testthat::with_mocked_bindings( .retry_httr2_request = function(req) { structure(list(), class = "fake_resp") }, req_body_json = function(req, body) { captured_body <<- body req }, resp_body_json = function(resp, simplifyVector = FALSE) fake_body, resp_status = function(resp) 200L, { res <- ollama_compare_pair_live( ID1 = ID1, text1 = text1, ID2 = ID2, text2 = text2, model = "qwen3:32b", trait_name = td$name, trait_description = td$description, prompt_template = tmpl, host = "http://127.0.0.1:11434", think = TRUE, num_ctx = 4096L, include_raw = FALSE ) testthat::expect_s3_class(res, "tbl_df") testthat::expect_equal(res$better_sample, "SAMPLE_2") testthat::expect_equal(res$better_id, ID2) # Temperature logic: Qwen + think = TRUE → 0.6 testthat::expect_type(captured_body, "list") testthat::expect_equal(captured_body$model, "qwen3:32b") testthat::expect_equal(captured_body$options$num_ctx, 4096L) testthat::expect_equal(captured_body$options$temperature, 0.6) } ) }) # --------------------------------------------------------------------- # ollama_compare_pair_live: JSON parse failure # --------------------------------------------------------------------- testthat::test_that("ollama_compare_pair_live returns error row on JSON parse failure", { td <- trait_description("overall_quality") tmpl <- set_prompt_template() ID1 <- "S01" ID2 <- "S02" testthat::with_mocked_bindings( .retry_httr2_request = function(req) { structure(list(), class = "fake_resp") }, req_body_json = function(req, body) req, resp_body_json = function(resp, simplifyVector = FALSE) stop("boom"), resp_status = function(resp) 500L, { res <- ollama_compare_pair_live( ID1 = ID1, text1 = "X", ID2 = ID2, text2 = "Y", model = "mistral-small3.2:24b", trait_name = td$name, trait_description = td$description, prompt_template = tmpl, host = "http://127.0.0.1:11434", include_raw = TRUE ) testthat::expect_s3_class(res, "tbl_df") testthat::expect_equal(res$status_code, 500L) testthat::expect_equal( res$error_message, "Failed to parse response body as JSON." ) testthat::expect_true(is.na(res$better_sample)) testthat::expect_true(is.na(res$better_id)) testthat::expect_true(is.null(res$raw_response[[1]])) } ) }) # --------------------------------------------------------------------- # submit_ollama_pairs_live: zero rows # --------------------------------------------------------------------- testthat::test_that("submit_ollama_pairs_live returns empty tibble for zero rows", { td <- trait_description("overall_quality") tmpl <- set_prompt_template() empty_pairs <- tibble::tibble( ID1 = character(0), text1 = character(0), ID2 = character(0), text2 = character(0) ) res <- submit_ollama_pairs_live( pairs = empty_pairs, model = "mistral-small3.2:24b", trait_name = td$name, trait_description = td$description, prompt_template = tmpl, host = "http://127.0.0.1:11434", verbose = FALSE, progress = FALSE ) testthat::expect_s3_class(res, "tbl_df") testthat::expect_equal(nrow(res), 0L) testthat::expect_true("thoughts" %in% names(res)) testthat::expect_false("raw_response" %in% names(res)) # With include_raw = TRUE, empty tibble still has raw_response column res2 <- submit_ollama_pairs_live( pairs = empty_pairs, model = "mistral-small3.2:24b", trait_name = td$name, trait_description = td$description, prompt_template = tmpl, host = "http://127.0.0.1:11434", verbose = FALSE, progress = FALSE, include_raw = TRUE ) testthat::expect_s3_class(res2, "tbl_df") testthat::expect_equal(nrow(res2), 0L) testthat::expect_true("raw_response" %in% names(res2)) testthat::expect_type(res2$raw_response, "list") }) # --------------------------------------------------------------------- # submit_ollama_pairs_live: row-wise calling of ollama_compare_pair_live # --------------------------------------------------------------------- testthat::test_that("submit_ollama_pairs_live calls ollama_compare_pair_live row-wise", { pairs <- tibble::tibble( ID1 = c("S01", "S03"), text1 = c("Text 1", "Text 3"), ID2 = c("S02", "S04"), text2 = c("Text 2", "Text 4") ) td <- trait_description("overall_quality") tmpl <- set_prompt_template() fake_result <- function(ID1, ID2, chosen) { tibble::tibble( custom_id = sprintf("LIVE_%s_vs_%s", ID1, ID2), ID1 = ID1, ID2 = ID2, model = "mistral-small3.2:24b", object_type = "ollama.generate", status_code = 200L, error_message = NA_character_, thoughts = NA_character_, content = sprintf("%s", chosen), better_sample = chosen, better_id = if (chosen == "SAMPLE_1") ID1 else ID2, prompt_tokens = 10, completion_tokens = 5, total_tokens = 15 ) } calls <- list() testthat::with_mocked_bindings( ollama_compare_pair_live = function( ID1, text1, ID2, text2, model, trait_name, trait_description, prompt_template, host, tag_prefix, tag_suffix, think, num_ctx, include_raw, ... ) { calls <<- append(calls, list(list( ID1 = ID1, ID2 = ID2, model = model, trait_name = trait_name, trait_description = trait_description, prompt_template = prompt_template, host = host, think = think, num_ctx = num_ctx, include_raw = include_raw, dots = list(...) ))) if (ID1 == "S01") { fake_result(ID1, ID2, "SAMPLE_1") } else { fake_result(ID1, ID2, "SAMPLE_2") } }, { res <- submit_ollama_pairs_live( pairs = pairs, model = "mistral-small3.2:24b", trait_name = td$name, trait_description = td$description, prompt_template = tmpl, host = "http://127.0.0.1:11434", verbose = FALSE, status_every = 1, progress = FALSE, include_raw = FALSE, think = FALSE, num_ctx = 8192L ) # Should call helper once per row testthat::expect_equal(length(calls), 2L) testthat::expect_equal(calls[[1]]$ID1, "S01") testthat::expect_equal(calls[[2]]$ID1, "S03") # Shared parameters should be forwarded correctly for (call in calls) { testthat::expect_equal(call$model, "mistral-small3.2:24b") testthat::expect_equal(call$trait_name, td$name) testthat::expect_equal(call$trait_description, td$description) testthat::expect_identical(call$prompt_template, tmpl) testthat::expect_equal(call$host, "http://127.0.0.1:11434") testthat::expect_false(call$think) testthat::expect_equal(call$num_ctx, 8192L) testthat::expect_false(call$include_raw) } # Aggregated result should be consistent with fake_result logic testthat::expect_s3_class(res, "tbl_df") testthat::expect_equal(res$better_id, c("S01", "S04")) } ) }) testthat::test_that("ollama_compare_pair_live validates scalar arguments", { td <- trait_description("overall_quality") tmpl <- set_prompt_template() # ID1 must be scalar character expect_error( ollama_compare_pair_live( ID1 = c("S01", "S02"), text1 = "A", ID2 = "S02", text2 = "B", model = "mistral-small3.2:24b", trait_name = td$name, trait_description = td$description, prompt_template = tmpl ), "`ID1` must be a single character.", fixed = TRUE ) # host must be non-empty expect_error( ollama_compare_pair_live( ID1 = "S01", text1 = "A", ID2 = "S02", text2 = "B", model = "mistral-small3.2:24b", trait_name = td$name, trait_description = td$description, prompt_template = tmpl, host = "" ), "`host` must be a non-empty character scalar.", fixed = TRUE ) # num_ctx must be positive scalar expect_error( ollama_compare_pair_live( ID1 = "S01", text1 = "A", ID2 = "S02", text2 = "B", model = "mistral-small3.2:24b", trait_name = td$name, trait_description = td$description, prompt_template = tmpl, num_ctx = 0 ), "`num_ctx` must be a single positive number.", fixed = TRUE ) }) testthat::test_that("ollama_compare_pair_live exposes thinking only when think = TRUE", { td <- trait_description("overall_quality") tmpl <- set_prompt_template() ID1 <- "S01" ID2 <- "S02" text1 <- "Text A" text2 <- "Text B" fake_body <- list( model = "qwen3:32b", response = "SAMPLE_2 Explanation.", prompt_eval_count = 4L, eval_count = 6L, thinking = "Internal reasoning trace" ) captured_body <- NULL testthat::with_mocked_bindings( .retry_httr2_request = function(req) { structure(list(), class = "fake_resp") }, req_body_json = function(req, body) { captured_body <<- body req }, resp_body_json = function(resp, simplifyVector = FALSE) fake_body, resp_status = function(resp) 200L, { res <- ollama_compare_pair_live( ID1 = ID1, text1 = text1, ID2 = ID2, text2 = text2, model = "qwen3:32b", trait_name = td$name, trait_description = td$description, prompt_template = tmpl, host = "http://127.0.0.1:11434", think = TRUE, num_ctx = 4096L, include_raw = FALSE ) testthat::expect_equal(res$thoughts, "Internal reasoning trace") testthat::expect_equal(captured_body$options$temperature, 0.6) } ) }) testthat::test_that("ollama_compare_pair_live uses error field when status != 200", { td <- trait_description("overall_quality") tmpl <- set_prompt_template() fake_body <- list( model = "mistral-small3.2:24b", response = "SAMPLE_1", prompt_eval_count = 1L, eval_count = 1L, error = "OOM in backend" ) testthat::with_mocked_bindings( .retry_httr2_request = function(req) structure(list(), class = "fake_resp"), req_body_json = function(req, body) req, resp_body_json = function(resp, simplifyVector = FALSE) fake_body, resp_status = function(resp) 500L, { res <- ollama_compare_pair_live( ID1 = "S01", text1 = "A", ID2 = "S02", text2 = "B", model = "mistral-small3.2:24b", trait_name = td$name, trait_description = td$description, prompt_template = tmpl, host = "http://127.0.0.1:11434" ) testthat::expect_equal(res$status_code, 500L) testthat::expect_equal(res$error_message, "OOM in backend") } ) }) testthat::test_that("ollama_compare_pair_live falls back to generic error message", { td <- trait_description("overall_quality") tmpl <- set_prompt_template() fake_body <- list( model = "mistral-small3.2:24b", response = "SAMPLE_1", prompt_eval_count = 1L, eval_count = 1L # no error or message field ) testthat::with_mocked_bindings( .retry_httr2_request = function(req) structure(list(), class = "fake_resp"), req_body_json = function(req, body) req, resp_body_json = function(resp, simplifyVector = FALSE) fake_body, resp_status = function(resp) 503L, { res <- ollama_compare_pair_live( ID1 = "S01", text1 = "A", ID2 = "S02", text2 = "B", model = "mistral-small3.2:24b", trait_name = td$name, trait_description = td$description, prompt_template = tmpl, host = "http://127.0.0.1:11434" ) testthat::expect_equal(res$status_code, 503L) testthat::expect_match( res$error_message, "Ollama request failed with status 503", fixed = FALSE ) } ) }) testthat::test_that("submit_ollama_pairs_live validates required columns", { td <- trait_description("overall_quality") tmpl <- set_prompt_template() bad_pairs <- tibble::tibble( ID1 = "S01", text1 = "A" # missing ID2, text2 ) expect_error( submit_ollama_pairs_live( pairs = bad_pairs, model = "mistral-small3.2:24b", trait_name = td$name, trait_description = td$description, prompt_template = tmpl, verbose = FALSE, progress = FALSE ), "`pairs` must contain columns:", fixed = TRUE ) }) testthat::test_that("submit_ollama_pairs_live validates status_every", { td <- trait_description("overall_quality") tmpl <- set_prompt_template() pairs <- tibble::tibble( ID1 = "S01", text1 = "A", ID2 = "S02", text2 = "B" ) expect_error( submit_ollama_pairs_live( pairs = pairs, model = "mistral-small3.2:24b", trait_name = td$name, trait_description = td$description, prompt_template = tmpl, status_every = 0, verbose = FALSE, progress = FALSE ), "`status_every` must be a single positive integer.", fixed = TRUE ) }) testthat::test_that("submit_ollama_pairs_live emits warning for error rows", { td <- trait_description("overall_quality") tmpl <- set_prompt_template() pairs <- tibble::tibble( ID1 = "S01", text1 = "A", ID2 = "S02", text2 = "B" ) fake_row <- tibble::tibble( custom_id = "LIVE_S01_vs_S02", ID1 = "S01", ID2 = "S02", model = "mistral-small3.2:24b", object_type = "ollama.generate", status_code = 500L, error_message = "backend failure", thoughts = NA_character_, content = NA_character_, better_sample = NA_character_, better_id = NA_character_, prompt_tokens = NA_real_, completion_tokens = NA_real_, total_tokens = NA_real_ ) testthat::with_mocked_bindings( ollama_compare_pair_live = function(...) fake_row, { res <- submit_ollama_pairs_live( pairs = pairs, model = "mistral-small3.2:24b", trait_name = td$name, trait_description = td$description, prompt_template = tmpl, verbose = TRUE, status_every = 1, progress = TRUE, # exercises txtProgressBar path include_raw = FALSE, think = FALSE, num_ctx = 8192L ) testthat::expect_equal(res$error_message, "backend failure") testthat::expect_equal(res$status_code, 500L) } ) }) testthat::test_that("submit_ollama_pairs_live computes timing when show_status is TRUE", { td <- trait_description("overall_quality") tmpl <- set_prompt_template() pairs <- tibble::tibble( ID1 = "S01", text1 = "A", ID2 = "S02", text2 = "B" ) ok_row <- tibble::tibble( custom_id = "LIVE_S01_vs_S02", ID1 = "S01", ID2 = "S02", model = "mistral-small3.2:24b", object_type = "ollama.generate", status_code = 200L, error_message = NA_character_, thoughts = NA_character_, content = "SAMPLE_1", better_sample = "SAMPLE_1", better_id = "S01", prompt_tokens = 10, completion_tokens = 5, total_tokens = 15 ) testthat::with_mocked_bindings( ollama_compare_pair_live = function(...) ok_row, { res <- submit_ollama_pairs_live( pairs = pairs, model = "mistral-small3.2:24b", trait_name = td$name, trait_description = td$description, prompt_template = tmpl, verbose = TRUE, status_every = 1, progress = FALSE, include_raw = FALSE ) testthat::expect_equal(res$better_id, "S01") testthat::expect_true(all(is.na(res$error_message))) } ) }) testthat::test_that("ensure_only_ollama_model_loaded validates model argument", { expect_error( ensure_only_ollama_model_loaded(c("m1", "m2")), "`model` must be a non-empty character scalar.", fixed = TRUE ) }) # --------------------------------------------------------------------- # ensure_only_ollama_model_loaded: system / parsing behaviour # --------------------------------------------------------------------- testthat::test_that( "ensure_only_ollama_model_loaded validates model argument", { testthat::expect_error( ensure_only_ollama_model_loaded(character()), "`model` must be a non-empty character scalar.", fixed = TRUE ) testthat::expect_error( ensure_only_ollama_model_loaded(c("a", "b")), "`model` must be a non-empty character scalar.", fixed = TRUE ) testthat::expect_error( ensure_only_ollama_model_loaded(""), "`model` must be a non-empty character scalar.", fixed = TRUE ) } ) testthat::test_that( "ensure_only_ollama_model_loaded handles ollama ps failure", { testthat::with_mocked_bindings( .ollama_system2 = function(command, args, stdout = TRUE, stderr = TRUE, ...) { stop("simulated failure from ollama ps") }, { res <- ensure_only_ollama_model_loaded("mistral-small3.2:24b", verbose = TRUE) # Should return an empty character vector, invisibly testthat::expect_type(res, "character") testthat::expect_length(res, 0L) } ) } ) testthat::test_that( "ensure_only_ollama_model_loaded handles empty and header-only output", { # Case 1: completely empty output but status == 0 testthat::with_mocked_bindings( .ollama_system2 = function(command, args, stdout = TRUE, stderr = TRUE, ...) { structure(character(0), status = 0L) }, { res_empty <- ensure_only_ollama_model_loaded("qwen3:32b", verbose = FALSE) testthat::expect_type(res_empty, "character") testthat::expect_length(res_empty, 0L) } ) # Case 2: header line only header_only <- structure( "NAME ID STATUS", status = 0L ) testthat::with_mocked_bindings( .ollama_system2 = function(command, args, stdout = TRUE, stderr = TRUE, ...) { header_only }, { res_header <- ensure_only_ollama_model_loaded("qwen3:32b", verbose = FALSE) testthat::expect_type(res_header, "character") testthat::expect_length(res_header, 0L) } ) } ) testthat::test_that( "ensure_only_ollama_model_loaded parses models and unloads others", { calls <- list() fake_ps_output <- structure( c( "NAME ID STATUS", "mistral-small3.2:24b abc123 running", "qwen3:32b def456 running", "gemma3:27b ghi789 running" ), status = 0L ) fake_ollama_system2 <- function(command, args, stdout = TRUE, stderr = TRUE, ...) { if (identical(args, "ps")) { # First call: simulate `ollama ps` fake_ps_output } else { # Subsequent calls should be of the form c("stop", ) calls <<- append(calls, list(list(command = command, args = args))) invisible(NULL) } } testthat::with_mocked_bindings( .ollama_system2 = fake_ollama_system2, { keep_model <- "qwen3:32b" res <- ensure_only_ollama_model_loaded(keep_model, verbose = FALSE) # Should request unloading of all models except the one we keep testthat::expect_setequal( res, c("mistral-small3.2:24b", "gemma3:27b") ) # We should have issued stop commands for the same set stopped_models <- vapply( calls, function(x) x$args[2], character(1) ) testthat::expect_setequal( stopped_models, c("mistral-small3.2:24b", "gemma3:27b") ) # All stop commands should target the `ollama` binary testthat::expect_true(all(vapply(calls, function(x) x$command, "") == "ollama")) } ) } ) testthat::test_that("ensure_only_ollama_model_loaded handles empty or weird CLI output", { # Scenario 1: ollama ps returns only header testthat::with_mocked_bindings( .ollama_system2 = function(...) { structure("NAME ID SIZE PROCESSOR UNTIL", status = 0L) }, { # Should return empty character invisibly (no models found to unload) res <- ensure_only_ollama_model_loaded("mistral", verbose = FALSE) testthat::expect_length(res, 0L) } ) # Scenario 2: ollama ps returns header and the target model only testthat::with_mocked_bindings( .ollama_system2 = function(...) { structure(c( "NAME ID SIZE PROCESSOR UNTIL", "mistral abc 4GB gpu 4m" ), status = 0L) }, { # Should return empty character (target matches, nothing to unload) res <- ensure_only_ollama_model_loaded("mistral", verbose = FALSE) testthat::expect_length(res, 0L) } ) })