# =====================================================================
# test-openai_parse.R
# Tests for parse_openai_batch_output()
# =====================================================================

testthat::test_that("parse_openai_batch_output validates input file", {
  # A path that does not exist must be rejected up front.
  testthat::expect_error(
    parse_openai_batch_output("nonexistent.jsonl"),
    "File does not exist"
  )

  # A file that exists but has zero lines must be rejected as well.
  empty_path <- tempfile()
  file.create(empty_path)
  on.exit(unlink(empty_path), add = TRUE)

  testthat::expect_error(
    parse_openai_batch_output(empty_path),
    "File contains no lines"
  )
})

testthat::test_that("parse_openai_batch_output handles malformed JSON and body", {
  jsonl_path <- tempfile()
  on.exit(unlink(jsonl_path), add = TRUE)

  # Four input lines, of which only the last two are expected to yield rows:
  #   1. blank line              -> skipped
  #   2. text that is not JSON   -> skipped
  #   3. record with no response -> row filled with NA
  #   4. record with a null body -> row filled with NA
  input_lines <- c(
    "",
    "NOT JSON",
    '{"custom_id": "bad_id"}',
    '{"custom_id": "LIVE_A_vs_B", "response": {"status_code": 200, "body": null}}'
  )
  writeLines(input_lines, jsonl_path)

  parsed <- parse_openai_batch_output(jsonl_path)
  testthat::expect_equal(nrow(parsed), 2L)

  # First surviving row: "bad_id" has no "_vs_" separator, so neither the
  # IDs nor the model can be filled in.
  no_response_row <- parsed[1, ]
  testthat::expect_equal(no_response_row$custom_id, "bad_id")
  testthat::expect_true(is.na(no_response_row$ID1))
  testthat::expect_true(is.na(no_response_row$model))

  # Second surviving row: "LIVE_A_vs_B" splits into "LIVE_A" and "B"; ID1 is
  # the piece after the last underscore of the left part, i.e. "A". The
  # status code is retained even though the body is null.
  null_body_row <- parsed[2, ]
  testthat::expect_equal(null_body_row$custom_id, "LIVE_A_vs_B")
  testthat::expect_equal(null_body_row$ID1, "A")
  testthat::expect_equal(null_body_row$ID2, "B")
  testthat::expect_equal(null_body_row$status_code, 200L)
  testthat::expect_true(is.na(null_body_row$content))
})

testthat::test_that("parse_openai_batch_output extracts detailed token usage", {
  jsonl_path <- tempfile()
  on.exit(unlink(jsonl_path), add = TRUE)

  # Usage section carrying both the headline counts and the nested
  # cached / reasoning breakdowns.
  usage <- list(
    prompt_tokens = 50,
    completion_tokens = 20,
    total_tokens = 70,
    input_tokens_details = list(cached_tokens = 25),
    output_tokens_details = list(reasoning_tokens = 10)
  )

  # Minimal chat-completion body wrapping that usage section.
  body <- list(
    object = "chat.completion",
    model = "gpt-4",
    choices = list(list(message = list(content = "Hi"))),
    usage = usage
  )

  record <- list(
    custom_id = "LIVE_S1_vs_S2",
    response = list(status_code = 200, body = body)
  )
  writeLines(jsonlite::toJSON(record, auto_unbox = TRUE), jsonl_path)

  parsed <- parse_openai_batch_output(jsonl_path)

  testthat::expect_equal(parsed$prompt_tokens, 50)
  testthat::expect_equal(parsed$prompt_cached_tokens, 25)
  testthat::expect_equal(parsed$reasoning_tokens, 10)
})

testthat::test_that("parse_openai_batch_output extracts better_id correctly from ID1_vs_ID2", {
  # Edge case: the ID on the left of "_vs_" itself contains an underscore.
  jsonl_path <- tempfile()
  on.exit(unlink(jsonl_path), add = TRUE)

  # The model's answer names SAMPLE_1, so better_id should resolve to
  # whatever the parser reports as ID1.
  body <- list(
    object = "chat.completion",
    choices = list(list(message = list(content = "SAMPLE_1")))
  )
  record <- list(
    custom_id = "LIVE_A_1_vs_B_2",
    response = list(body = body)
  )
  writeLines(jsonlite::toJSON(record, auto_unbox = TRUE), jsonl_path)

  parsed <- parse_openai_batch_output(jsonl_path)

  # Pins current behavior: the left part "LIVE_A_1" is reduced to the text
  # after its LAST underscore, so ID1 is "1" (not "A_1"), while the right
  # part is kept whole as "B_2".
  testthat::expect_equal(parsed$ID1, "1")
  testthat::expect_equal(parsed$ID2, "B_2")
  testthat::expect_equal(parsed$better_id, "1")
})

testthat::test_that(
  "parse_openai_batch_output collects thoughts and message text separately for responses",
  {
    jsonl_path <- tempfile(fileext = ".jsonl")
    on.exit(unlink(jsonl_path), add = TRUE)

    # Fake batch output line shaped like a gpt-5.1 "response" object: a
    # top-level reasoning summary plus an output list holding one reasoning
    # item (empty summary) and one assistant message.
    reasoning_item <- list(
      id = "rs_x",
      type = "reasoning",
      summary = list()
    )
    message_item <- list(
      id = "msg_x",
      type = "message",
      status = "completed",
      content = list(
        list(
          type = "output_text",
          text = "SAMPLE_2 Final answer."
        )
      ),
      role = "assistant"
    )
    body <- list(
      object = "response",
      model = "gpt-5.1",
      reasoning = list(
        effort = "low",
        summary = list(text = "Reasoning summary. ")
      ),
      output = list(reasoning_item, message_item),
      usage = list(
        input_tokens = 10L,
        output_tokens = 5L,
        total_tokens = 15L
      )
    )
    record <- list(
      custom_id = "LIVE_S01_vs_S02",
      response = list(status_code = 200L, body = body),
      error = NULL
    )

    serialized <- jsonlite::toJSON(record, auto_unbox = TRUE)
    writeLines(serialized, con = jsonl_path, useBytes = TRUE)

    parsed <- parse_openai_batch_output(jsonl_path)

    # Shape of the result: a single-row tibble.
    testthat::expect_s3_class(parsed, "tbl_df")
    testthat::expect_equal(nrow(parsed), 1L)

    # IDs derived from custom_id.
    testthat::expect_equal(parsed$custom_id, "LIVE_S01_vs_S02")
    testthat::expect_equal(parsed$ID1, "S01")
    testthat::expect_equal(parsed$ID2, "S02")

    # Response-level metadata.
    testthat::expect_equal(parsed$model, "gpt-5.1")
    testthat::expect_equal(parsed$object_type, "response")
    testthat::expect_equal(parsed$status_code, 200L)
    testthat::expect_true(is.na(parsed$error_message))

    # The reasoning summary lands in `thoughts` ...
    testthat::expect_equal(parsed$thoughts, "Reasoning summary. ")

    # ... while `content` holds only the assistant message text.
    testthat::expect_equal(
      parsed$content,
      "SAMPLE_2 Final answer."
    )

    # The SAMPLE_2 tag maps better_id onto ID2.
    testthat::expect_equal(parsed$better_sample, "SAMPLE_2")
    testthat::expect_equal(parsed$better_id, "S02")

    # Token usage: input/output counts surface as prompt/completion tokens.
    testthat::expect_equal(parsed$prompt_tokens, 10)
    testthat::expect_equal(parsed$completion_tokens, 5)
    testthat::expect_equal(parsed$total_tokens, 15)
  }
)