# ──────────────────────────────────────────────────────────────────────────────
# Full integration test: exercises every module with synthetic data.
# No API keys needed — we construct experiment objects manually.
# ──────────────────────────────────────────────────────────────────────────────

# --- Synthetic data ----------------------------------------------------------

fake_texts <- c(
  "The new policy was received warmly by voters.",
  "Critics slammed the proposal as reckless.",
  "The committee met on Tuesday to discuss budgets.",
  "Supporters praised the bold leadership.",
  "Opposition leaders called for an immediate reversal.",
  "The weather on election day was cloudy.",
  "Citizens celebrated in the streets after the result.",
  "Analysts described the outcome as a surprise.",
  "Local businesses reported no change in foot traffic.",
  "Protests erupted across the capital overnight."
)

human_labels <- c(
  "positive", "negative", "neutral", "positive", "negative",
  "neutral", "positive", "neutral", "neutral", "negative"
)

# Simulated LLM labels (90% agreement with human: the vectors differ only at
# item 8, where the human coded "neutral" and the LLM returned "positive").
llm_labels <- c(
  "positive", "negative", "neutral", "positive", "negative",
  "neutral", "positive", "positive", "neutral", "negative"
)

# --- Helper: build a fake executed experiment --------------------------------

# Construct a fully populated "llm_experiment" object without calling any API,
# so that downstream modules (reliability, validation, reporting, logging)
# can be exercised completely offline.
#
# @param texts      Character vector of input prompts.
# @param responses  Character vector of simulated model responses, the same
#                   length as `texts`.
# @param name       Experiment name recorded on the returned object.
# @param model      Model identifier recorded on the object and in `results`.
# @return An object of class "llm_experiment" whose `results` tibble has one
#   row per prompt, all with status "success". Token counts and per-call costs
#   are random (unseeded) — no test asserts on their exact values.
make_fake_experiment <- function(texts, responses, name = "test_exp",
                                 model = "gpt-4o") {
  results <- tibble::tibble(
    prompt_id = seq_along(texts),
    prompt = texts,
    response = responses,
    status = "success",
    input_tokens = sample(80:120, length(texts), replace = TRUE),
    output_tokens = sample(5:15, length(texts), replace = TRUE),
    cost = runif(length(texts), 0.001, 0.005),
    model = model,
    timestamp = Sys.time()
  )

  structure(
    list(
      prompts = as.list(texts),
      chat = NULL,
      name = name,
      description = "Synthetic integration test",
      model = model,
      system_prompt_hash = digest::digest("Classify sentiment.", algo = "sha256"),
      created_at = Sys.time(),
      results = results,
      executed = TRUE,
      run_time = 2.5
    ),
    class = "llm_experiment"
  )
}

# ============================================================================
# 1. DESIGN MODULE
# ============================================================================

test_that("cross_design produces correct factorial grid", {
  # Use a minimal named list (no actual Chat objects needed for design)
  fake_chats <- list(gpt4 = "placeholder", claude = "placeholder")
  class(fake_chats$gpt4) <- "Chat"
  class(fake_chats$claude) <- "Chat"

  design <- cross_design(
    prompts = fake_texts[1:3],
    chats = fake_chats,
    temperatures = c(0, 0.5, 1.0)
  )

  expect_s3_class(design, "tbl_df")
  expect_equal(nrow(design), 3 * 2 * 3) # 3 prompts x 2 models x 3 temps
  expect_true(all(c("condition_id", "prompt", "chat_name", "temperature") %in%
                    names(design)))
})

test_that("replicate_design expands correctly", {
  fake_chats <- list(m1 = structure("x", class = "Chat"))
  design <- cross_design(fake_texts[1:2], fake_chats, temperatures = 0)
  expanded <- replicate_design(design, n = 5)
  expect_equal(nrow(expanded), 2 * 5)
  expect_true("replicate" %in% names(expanded))
})

test_that("randomize_design shuffles with seed", {
  fake_chats <- list(m1 = structure("x", class = "Chat"))
  design <- cross_design(fake_texts[1:5], fake_chats, temperatures = 0)
  r1 <- randomize_design(design, seed = 42)
  r2 <- randomize_design(design, seed = 42)
  expect_equal(r1$condition_id, r2$condition_id)
})

# ============================================================================
# 2.
# RELIABILITY MODULE
# ============================================================================

test_that("llm_human_reliability computes all metrics", {
  rel <- llm_human_reliability(llm_labels, human_labels)
  expect_s3_class(rel, "llm_reliability")
  expect_true(!is.na(rel$cohens_kappa))
  expect_true(!is.na(rel$krippendorffs_alpha))
  expect_equal(rel$n, 10)
  expect_true(rel$accuracy > 0.7) # We designed 90% agreement (9/10 items match)
  expect_true(rel$macro_f1 > 0)
  expect_output(print(rel), "Cohen's kappa")
})

test_that("confusion_summary computes per-class F1", {
  conf <- confusion_summary(llm_labels, human_labels)
  expect_true(is.list(conf))
  expect_s3_class(conf$per_class, "tbl_df")
  expect_true(all(c("class", "precision", "recall", "f1") %in%
                    names(conf$per_class)))
  expect_true(conf$macro_f1 > 0)
  expect_true(conf$micro_f1 > 0)
  # neutral has 1 misclass (human neutral -> llm positive), so recall < 1
  neutral_row <- conf$per_class[conf$per_class$class == "neutral", ]
  expect_true(neutral_row$recall < 1)
})

test_that("llm_intermodel_reliability computes pairwise agreement", {
  # Stack the two label sets as if two models answered the same 10 prompts
  combined <- tibble::tibble(
    prompt = rep(fake_texts, 2),
    response = c(llm_labels, human_labels),
    model = rep(c("gpt4", "claude"), each = 10)
  )
  inter <- llm_intermodel_reliability(combined)
  expect_s3_class(inter, "tbl_df")
  expect_equal(nrow(inter), 1) # 1 pair
  expect_true(inter$pct_agree > 0.7)
})

# ============================================================================
# 3.
# VALIDATION MODULE
# ============================================================================

test_that("gold standard creation and validation workflow", {
  # Build the gold standard from the human-coded labels
  gold <- create_gold_standard(
    texts = fake_texts,
    labels = human_labels,
    metadata = list(coder = "RA1", date = "2025-03-01")
  )

  expect_s3_class(gold, "gold_standard")
  expect_equal(gold$n, 10)
  expect_equal(gold$classes, c("negative", "neutral", "positive"))
  expect_type(gold$hash, "character")
  expect_output(print(gold), "Gold Standard")

  # Score a synthetic experiment against that gold standard
  experiment <- make_fake_experiment(fake_texts, llm_labels)
  validation <- validate_against_gold(experiment, gold)
  expect_s3_class(validation, "llm_reliability")
  expect_equal(validation$n, 10)
  expect_true(validation$accuracy > 0.7)
})

test_that("sample_for_validation draws reproducible samples", {
  # Two draws with the same seed must select identical indices
  draw_a <- sample_for_validation(fake_texts, n = 5, seed = 123)
  draw_b <- sample_for_validation(fake_texts, n = 5, seed = 123)
  expect_equal(draw_a$idx, draw_b$idx)
  expect_equal(nrow(draw_a), 5)
})

# ============================================================================
# 4.
# SENSITIVITY MODULE
# ============================================================================

test_that("sensitivity_summary works on hand-built objects", {
  # Hand-assemble a prompt_sensitivity object: two variants over five texts
  sens <- structure(
    list(
      results = tibble::tibble(
        prompt = rep(fake_texts[1:5], 2),
        response = c("pos", "neg", "neu", "pos", "neg",
                     "pos", "neg", "pos", "pos", "neg"),
        status = "success",
        cost = 0.003,
        variant = rep(c("direct", "cot"), each = 5)
      ),
      agreement = tibble::tibble(
        model_1 = "direct",
        model_2 = "cot",
        n_shared = 5L,
        cohens_kappa = 0.4,
        pct_agree = 0.6
      ),
      n_texts = 5L,
      variants = c("direct", "cot")
    ),
    class = "prompt_sensitivity"
  )

  summary_tab <- sensitivity_summary(sens)
  expect_equal(nrow(summary_tab), 2)
  expect_true(all(summary_tab$n_responses == 5))
  expect_output(print(sens), "Prompt Sensitivity")
})

test_that("downstream_sensitivity runs regressions per condition", {
  # Two variants whose labels are fully flipped relative to each other
  flipped <- structure(
    list(
      results = tibble::tibble(
        prompt = rep(fake_texts, 2),
        response = c(
          rep("positive", 5), rep("negative", 5), # variant 1
          rep("negative", 5), rep("positive", 5)  # variant 2 (flipped)
        ),
        status = "success",
        cost = 0.003,
        variant = rep(c("v1", "v2"), each = 10)
      ),
      agreement = tibble::tibble(),
      n_texts = 10L,
      variants = c("v1", "v2")
    ),
    class = "prompt_sensitivity"
  )

  survey_data <- data.frame(
    text = fake_texts,
    outcome = rnorm(10),
    age = sample(25:65, 10, replace = TRUE)
  )

  fits <- downstream_sensitivity(flipped, survey_data, outcome ~ llm_label + age)
  expect_s3_class(fits, "tbl_df")
  expect_true(all(c("condition", "term", "estimate", "p_value") %in% names(fits)))
  # Should have results for both variants
  expect_true(all(c("v1", "v2") %in% fits$condition))
})

# ============================================================================
# 5.
# PREREGISTRATION MODULE
# ============================================================================

test_that("full preregistration workflow: freeze, verify, export, reload", {
  # Freeze a complete prompt specification
  frozen <- freeze_prompt(
    system_prompt = "Classify the following text as positive, negative, or neutral.",
    user_template = "Text: {text}",
    model = "gpt-4o",
    temperature = 0,
    categories = c("positive", "negative", "neutral"),
    metadata = list(project = "integration_test", version = "1.0")
  )

  expect_s3_class(frozen, "frozen_prompt")
  expect_equal(nchar(frozen$hash), 64)
  expect_true(verify_prompt(frozen))
  expect_output(print(frozen), "gpt-4o")

  # Round-trip: export to JSON, reload, re-verify, and compare hashes
  json_path <- withr::local_tempfile(fileext = ".json")
  export_preregistration(frozen, json_path)
  expect_true(file.exists(json_path))
  reloaded <- load_frozen_prompt(json_path)
  expect_true(verify_prompt(reloaded))
  expect_equal(reloaded$hash, frozen$hash)

  # Any post-freeze edit must break hash verification
  tampered <- reloaded
  tampered$system_prompt <- "TAMPERED PROMPT"
  expect_false(verify_prompt(tampered))
})

# ============================================================================
# 6.
# REPORTING MODULE
# ============================================================================

test_that("methods_section generates text with and without reliability", {
  exp <- make_fake_experiment(fake_texts, llm_labels)

  # Without reliability
  ms <- methods_section(exp, format = "markdown")
  expect_type(ms, "character")
  expect_true(grepl("gpt-4o", ms))
  expect_true(grepl("10 texts", ms))

  # With reliability
  rel <- llm_human_reliability(llm_labels, human_labels)
  ms_rel <- methods_section(exp, reliability = rel, format = "latex")
  expect_true(grepl("kappa", ms_rel, ignore.case = TRUE))
  expect_true(grepl("alpha", ms_rel, ignore.case = TRUE))
})

test_that("results_to_latex writes valid macro file", {
  exp <- make_fake_experiment(fake_texts, llm_labels)
  rel <- llm_human_reliability(llm_labels, human_labels)
  tmp <- withr::local_tempfile(fileext = ".tex")

  results_to_latex(exp, tmp, reliability = rel)

  expect_true(file.exists(tmp))
  lines <- readLines(tmp)
  expect_true(any(grepl("\\\\newcommand", lines)))
  expect_true(any(grepl("llmModel", lines)))
  expect_true(any(grepl("llmKappa", lines)))
  expect_true(any(grepl("llmMacroF", lines)))
})

test_that("export_replication creates full directory structure", {
  exp <- make_fake_experiment(fake_texts, llm_labels)
  rel <- llm_human_reliability(llm_labels, human_labels)
  gs <- create_gold_standard(fake_texts, human_labels)
  tmp_dir <- withr::local_tempdir()
  out <- file.path(tmp_dir, "replication")

  export_replication(exp, out, reliability = rel, gold_standard = gs)

  expect_true(file.exists(file.path(out, "results.csv")))
  expect_true(file.exists(file.path(out, "parameters.txt")))
  expect_true(file.exists(file.path(out, "gold_standard.csv")))
  expect_true(file.exists(file.path(out, "reliability.txt")))
  expect_true(file.exists(file.path(out, "methods_section.md")))

  # Check results.csv has correct rows
  results <- readr::read_csv(file.path(out, "results.csv"), show_col_types = FALSE)
  expect_equal(nrow(results), 10)
})

test_that("estimate_cost returns sensible numbers", {
  # Use a minimal fake Chat just for the model name
  fake_chat <- structure(list(
    get_model = function() "gpt-4o"
  ), class = "Chat")

  est <- estimate_cost(fake_texts, fake_chat, n_reps = 3)
  expect_type(est, "list")
  expect_true(est$est_cost_usd > 0)
  expect_equal(est$n_texts, 10)
  expect_equal(est$n_reps, 3)
})

# ============================================================================
# 7. LOGGING MODULE
# ============================================================================

test_that("audit trail: log, retrieve, export", {
  reset_session()
  # Guarantee the shared session log is cleared even if an expectation below
  # fails; a trailing reset_session() alone would be skipped on failure and
  # leak state into later tests.
  withr::defer(reset_session())

  exp <- make_fake_experiment(fake_texts, llm_labels, name = "audit_test")
  log_experiment(exp)

  log <- get_session_log()
  expect_s3_class(log, "tbl_df")
  expect_equal(nrow(log), 10)
  expect_true("session_id" %in% names(log))
  expect_true("experiment_name" %in% names(log))

  # Export CSV
  tmp <- withr::local_tempfile(fileext = ".csv")
  export_audit_trail(tmp, format = "csv")
  expect_true(file.exists(tmp))
  exported <- readr::read_csv(tmp, show_col_types = FALSE)
  expect_equal(nrow(exported), 10)

  # Export RDS
  tmp_rds <- withr::local_tempfile(fileext = ".rds")
  export_audit_trail(tmp_rds, format = "rds")
  expect_true(file.exists(tmp_rds))
  reloaded <- readRDS(tmp_rds)
  expect_equal(nrow(reloaded), 10)
})

# ============================================================================
# 8. FULL PIPELINE: Design → Experiment → Validate → Report → Pre-register
# ============================================================================

test_that("full pipeline runs end-to-end on synthetic data", {
  # 1. Design
  fake_chats <- list(m1 = structure("x", class = "Chat"))
  design <- cross_design(fake_texts[1:5], fake_chats, temperatures = 0)
  design <- replicate_design(design, n = 2)
  design <- randomize_design(design, seed = 42)
  expect_equal(nrow(design), 10)

  # 2. Experiment (synthetic)
  exp <- make_fake_experiment(fake_texts, llm_labels, name = "pipeline_test")
  expect_output(print(exp), "pipeline_test")
  s <- summary(exp)
  expect_equal(s$n_prompts, 10)
  expect_equal(s$n_successful, 10)

  # 3. Reliability
  rel <- llm_human_reliability(llm_labels, human_labels)
  expect_true(rel$cohens_kappa > 0)

  # 4. Gold standard validation
  gs <- create_gold_standard(fake_texts, human_labels, metadata = list(coder = "RA1"))
  val <- validate_against_gold(exp, gs)
  expect_true(val$accuracy > 0.7)

  # 5. Pre-register
  fp <- freeze_prompt(
    system_prompt = "Classify sentiment.",
    user_template = "Text: {text}",
    model = "gpt-4o",
    temperature = 0,
    categories = c("positive", "negative", "neutral")
  )
  expect_true(verify_prompt(fp))

  # 6. Report
  ms <- methods_section(exp, reliability = rel)
  expect_true(nchar(ms) > 100)

  tmp_dir <- withr::local_tempdir()

  # LaTeX macros
  tex_path <- file.path(tmp_dir, "stats.tex")
  results_to_latex(exp, tex_path, reliability = rel)
  tex_lines <- readLines(tex_path)
  expect_true(any(grepl("llmKappa", tex_lines)))

  # Replication package
  rep_dir <- file.path(tmp_dir, "replication")
  export_replication(exp, rep_dir, reliability = rel, gold_standard = gs)
  expect_true(file.exists(file.path(rep_dir, "results.csv")))

  # Pre-registration export
  prereg_path <- file.path(tmp_dir, "prompt_spec.json")
  export_preregistration(fp, prereg_path)
  reloaded_fp <- load_frozen_prompt(prereg_path)
  expect_true(verify_prompt(reloaded_fp))

  # Audit trail — defer cleanup so failed expectations cannot leak log state
  reset_session()
  withr::defer(reset_session())
  log_experiment(exp)
  audit_path <- file.path(tmp_dir, "audit.csv")
  export_audit_trail(audit_path)
  expect_true(file.exists(audit_path))
})