# Structural check: every field name on the first two events produced by
# translate_to_events() must also appear on the corresponding event of the
# reference Inspect log.
test_that("translate_turns works", {
  skip_on_cran()

  # First sample only; drop = FALSE keeps the two-dimensional shape.
  first_sample <- example_task()$get_samples()[1, , drop = FALSE]
  phase_times <- list(started_at = Sys.time(), completed_at = Sys.time())

  translated_events <- translate_to_events(
    first_sample,
    timestamps = list(solve = phase_times, score = phase_times)
  )

  reference_log <- example_inspect_log()
  reference_events <- reference_log$samples[[1]]$events

  # expect_contains(object, expected): `object` must include all of `expected`.
  expect_contains(names(reference_events[[1]]), names(translated_events[[1]]))
  expect_contains(names(reference_events[[2]]), names(translated_events[[2]]))
})

# End-to-end check: evaluate a small arithmetic task whose scorer chat has a
# tool registered, then validate the resulting log with expect_valid_log().
test_that("vitals writes valid eval logs (scorer tool calls, claude)", {
  vcr::local_cassette("translate-anthropic-scorer-tool-calls")
  # NOTE(review): key_get() is defined outside this chunk -- presumably it
  # resolves the API key for the cassette; confirm against the helper.
  key_get("ANTHROPIC_API_KEY")

  # Point VITALS_LOG_DIR at a temporary directory that is cleaned up
  # automatically when the test finishes.
  log_dir <- withr::local_tempdir()
  withr::local_envvar(list(VITALS_LOG_DIR = log_dir))
  # Install a no-op cli handler and make interactive() report FALSE for the
  # duration of the test.
  withr::local_options(cli.default_handler = function(...) {})
  local_mocked_bindings(interactive = function(...) FALSE)

  library(ellmer)

  claude_model <- "claude-sonnet-4-5-20250929"

  addition_dataset <- tibble::tibble(
    input = c(
      "What is 125 + 267?",
      "What is 543 + 789?",
      "What is 91 + 38?"
    ),
    target = c("392", "1332", "129")
  )

  # Tool handed to the scorer chat: returns the sum of its two arguments.
  adder_tool <- tool(
    function(a, b) {
      list(sum = a + b)
    },
    name = "sum_tool",
    description = "Add two numbers together to verify arithmetic. Always use this tool.",
    arguments = list(
      a = type_number("First number"),
      b = type_number("Second number")
    )
  )

  # The prompt fragments are joined with single spaces, yielding one string.
  scorer_prompt <- paste(
    c(
      "You are a scorer that checks arithmetic answers.",
      "ALWAYS use the sum_tool to verify the answer before grading.",
      "Call sum_tool with the numbers from the question, then compare",
      "the result to the submitted answer."
    ),
    collapse = " "
  )

  scorer_chat <- chat_claude(
    model = claude_model,
    system_prompt = scorer_prompt
  )
  scorer_chat$register_tool(adder_tool)

  arithmetic_task <- Task$new(
    dataset = addition_dataset,
    solver = generate(chat_claude(model = claude_model)),
    scorer = model_graded_qa(scorer_chat = scorer_chat)
  )
  arithmetic_task$eval()

  expect_valid_log(arithmetic_task$log())
})