# ──────────────────────────────────────────────────────────────────────────────
# Full integration test: exercises every module with synthetic data.
# No API keys needed — we construct experiment objects manually.
# ──────────────────────────────────────────────────────────────────────────────

# --- Synthetic data ----------------------------------------------------------

fake_texts <- c(
  "The new policy was received warmly by voters.",
  "Critics slammed the proposal as reckless.",
  "The committee met on Tuesday to discuss budgets.",
  "Supporters praised the bold leadership.",
  "Opposition leaders called for an immediate reversal.",
  "The weather on election day was cloudy.",
  "Citizens celebrated in the streets after the result.",
  "Analysts described the outcome as a surprise.",
  "Local businesses reported no change in foot traffic.",
  "Protests erupted across the capital overnight."
)

human_labels <- c(
  "positive", "negative", "neutral", "positive", "negative",
  "neutral", "positive", "neutral", "neutral", "negative"
)

# Simulated LLM labels (90% agreement with human: the vectors differ only at
# item 8, where the human coded "neutral" and the LLM returned "positive").
llm_labels <- c(
  "positive", "negative", "neutral", "positive", "negative",
  "neutral", "positive", "positive", "neutral", "negative"
)

# --- Helper: build a fake executed experiment --------------------------------

# Construct a fully populated "llm_experiment" object without calling any API,
# so that downstream modules (reliability, validation, reporting, logging)
# can be exercised completely offline.
#
# @param texts      Character vector of input prompts.
# @param responses  Character vector of simulated model responses, the same
#                   length as `texts`.
# @param name       Experiment name recorded on the returned object.
# @param model      Model identifier recorded on the object and in `results`.
# @return An object of class "llm_experiment" whose `results` tibble has one
#   row per prompt, all with status "success". Token counts and per-call costs
#   are random (unseeded) — no test asserts on their exact values.
make_fake_experiment <- function(texts, responses, name = "test_exp",
                                 model = "gpt-4o") {
  results <- tibble::tibble(
    prompt_id = seq_along(texts),
    prompt = texts,
    response = responses,
    status = "success",
    input_tokens = sample(80:120, length(texts), replace = TRUE),
    output_tokens = sample(5:15, length(texts), replace = TRUE),
    cost = runif(length(texts), 0.001, 0.005),
    model = model,
    timestamp = Sys.time()
  )

  structure(
    list(
      prompts = as.list(texts),
      chat = NULL,
      name = name,
      description = "Synthetic integration test",
      model = model,
      system_prompt_hash = digest::digest("Classify sentiment.", algo = "sha256"),
      created_at = Sys.time(),
      results = results,
      executed = TRUE,
      run_time = 2.5
    ),
    class = "llm_experiment"
  )
}

# ============================================================================
# 1. DESIGN MODULE
# ============================================================================

test_that("cross_design produces correct factorial grid", {
  # Use a minimal named list (no actual Chat objects needed for design)
  fake_chats <- list(gpt4 = "placeholder", claude = "placeholder")
  class(fake_chats$gpt4) <- "Chat"
  class(fake_chats$claude) <- "Chat"

  design <- cross_design(
    prompts = fake_texts[1:3],
    chats = fake_chats,
    temperatures = c(0, 0.5, 1.0)
  )

  expect_s3_class(design, "tbl_df")
  expect_equal(nrow(design), 3 * 2 * 3) # 3 prompts x 2 models x 3 temps
  expect_true(all(c("condition_id", "prompt", "chat_name", "temperature") %in%
                    names(design)))
})

test_that("replicate_design expands correctly", {
  fake_chats <- list(m1 = structure("x", class = "Chat"))
  design <- cross_design(fake_texts[1:2], fake_chats, temperatures = 0)
  expanded <- replicate_design(design, n = 5)
  expect_equal(nrow(expanded), 2 * 5)
  expect_true("replicate" %in% names(expanded))
})

test_that("randomize_design shuffles with seed", {
  fake_chats <- list(m1 = structure("x", class = "Chat"))
  design <- cross_design(fake_texts[1:5], fake_chats, temperatures = 0)
  r1 <- randomize_design(design, seed = 42)
  r2 <- randomize_design(design, seed = 42)
  expect_equal(r1$condition_id, r2$condition_id)
})

# ============================================================================
# 2.
# RELIABILITY MODULE
# ============================================================================

test_that("llm_human_reliability computes all metrics", {
  rel <- llm_human_reliability(llm_labels, human_labels)
  expect_s3_class(rel, "llm_reliability")
  expect_true(!is.na(rel$cohens_kappa))
  expect_true(!is.na(rel$krippendorffs_alpha))
  expect_equal(rel$n, 10)
  expect_true(rel$accuracy > 0.7) # We designed 90% agreement (9/10 items match)
  expect_true(rel$macro_f1 > 0)
  expect_output(print(rel), "Cohen's kappa")
})

test_that("confusion_summary computes per-class F1", {
  conf <- confusion_summary(llm_labels, human_labels)
  expect_true(is.list(conf))
  expect_s3_class(conf$per_class, "tbl_df")
  expect_true(all(c("class", "precision", "recall", "f1") %in%
                    names(conf$per_class)))
  expect_true(conf$macro_f1 > 0)
  expect_true(conf$micro_f1 > 0)
  # neutral has 1 misclass (human neutral -> llm positive), so recall < 1
  neutral_row <- conf$per_class[conf$per_class$class == "neutral", ]
  expect_true(neutral_row$recall < 1)
})

test_that("llm_intermodel_reliability computes pairwise agreement", {
  # Stack the two label sets as if two models answered the same 10 prompts
  combined <- tibble::tibble(
    prompt = rep(fake_texts, 2),
    response = c(llm_labels, human_labels),
    model = rep(c("gpt4", "claude"), each = 10)
  )
  inter <- llm_intermodel_reliability(combined)
  expect_s3_class(inter, "tbl_df")
  expect_equal(nrow(inter), 1) # 1 pair
  expect_true(inter$pct_agree > 0.7)
})

# ============================================================================
# 3.
# VALIDATION MODULE
# ============================================================================

test_that("gold standard creation and validation workflow", {
  # Build the gold standard from the human-coded labels
  gold <- create_gold_standard(
    texts = fake_texts,
    labels = human_labels,
    metadata = list(coder = "RA1", date = "2025-03-01")
  )

  expect_s3_class(gold, "gold_standard")
  expect_equal(gold$n, 10)
  expect_equal(gold$classes, c("negative", "neutral", "positive"))
  expect_type(gold$hash, "character")
  expect_output(print(gold), "Gold Standard")

  # Score a synthetic experiment against that gold standard
  experiment <- make_fake_experiment(fake_texts, llm_labels)
  validation <- validate_against_gold(experiment, gold)
  expect_s3_class(validation, "llm_reliability")
  expect_equal(validation$n, 10)
  expect_true(validation$accuracy > 0.7)
})

test_that("sample_for_validation draws reproducible samples", {
  # Two draws with the same seed must select identical indices
  draw_a <- sample_for_validation(fake_texts, n = 5, seed = 123)
  draw_b <- sample_for_validation(fake_texts, n = 5, seed = 123)
  expect_equal(draw_a$idx, draw_b$idx)
  expect_equal(nrow(draw_a), 5)
})

# ============================================================================
# 4.
# SENSITIVITY MODULE
# ============================================================================

test_that("sensitivity_summary works on hand-built objects", {
  # Hand-assemble a prompt_sensitivity object: two variants over five texts
  sens <- structure(
    list(
      results = tibble::tibble(
        prompt = rep(fake_texts[1:5], 2),
        response = c("pos", "neg", "neu", "pos", "neg",
                     "pos", "neg", "pos", "pos", "neg"),
        status = "success",
        cost = 0.003,
        variant = rep(c("direct", "cot"), each = 5)
      ),
      agreement = tibble::tibble(
        model_1 = "direct",
        model_2 = "cot",
        n_shared = 5L,
        cohens_kappa = 0.4,
        pct_agree = 0.6
      ),
      n_texts = 5L,
      variants = c("direct", "cot")
    ),
    class = "prompt_sensitivity"
  )

  summary_tab <- sensitivity_summary(sens)
  expect_equal(nrow(summary_tab), 2)
  expect_true(all(summary_tab$n_responses == 5))
  expect_output(print(sens), "Prompt Sensitivity")
})

test_that("downstream_sensitivity runs regressions per condition", {
  # Two variants whose labels are fully flipped relative to each other
  flipped <- structure(
    list(
      results = tibble::tibble(
        prompt = rep(fake_texts, 2),
        response = c(
          rep("positive", 5), rep("negative", 5), # variant 1
          rep("negative", 5), rep("positive", 5)  # variant 2 (flipped)
        ),
        status = "success",
        cost = 0.003,
        variant = rep(c("v1", "v2"), each = 10)
      ),
      agreement = tibble::tibble(),
      n_texts = 10L,
      variants = c("v1", "v2")
    ),
    class = "prompt_sensitivity"
  )

  survey_data <- data.frame(
    text = fake_texts,
    outcome = rnorm(10),
    age = sample(25:65, 10, replace = TRUE)
  )

  fits <- downstream_sensitivity(flipped, survey_data, outcome ~ llm_label + age)
  expect_s3_class(fits, "tbl_df")
  expect_true(all(c("condition", "term", "estimate", "p_value") %in% names(fits)))
  # Should have results for both variants
  expect_true(all(c("v1", "v2") %in% fits$condition))
})

# ============================================================================
# 5.
# PREREGISTRATION MODULE
# ============================================================================

test_that("full preregistration workflow: freeze, verify, export, reload", {
  # Freeze a complete prompt specification
  frozen <- freeze_prompt(
    system_prompt = "Classify the following text as positive, negative, or neutral.",
    user_template = "Text: {text}",
    model = "gpt-4o",
    temperature = 0,
    categories = c("positive", "negative", "neutral"),
    metadata = list(project = "integration_test", version = "1.0")
  )

  expect_s3_class(frozen, "frozen_prompt")
  expect_equal(nchar(frozen$hash), 64)
  expect_true(verify_prompt(frozen))
  expect_output(print(frozen), "gpt-4o")

  # Round-trip: export to JSON, reload, re-verify, and compare hashes
  json_path <- withr::local_tempfile(fileext = ".json")
  export_preregistration(frozen, json_path)
  expect_true(file.exists(json_path))
  reloaded <- load_frozen_prompt(json_path)
  expect_true(verify_prompt(reloaded))
  expect_equal(reloaded$hash, frozen$hash)

  # Any post-freeze edit must break hash verification
  tampered <- reloaded
  tampered$system_prompt <- "TAMPERED PROMPT"
  expect_false(verify_prompt(tampered))
})

# ============================================================================
# 6.
# REPORTING MODULE
# ============================================================================

test_that("methods_section generates text with and without reliability", {
  exp <- make_fake_experiment(fake_texts, llm_labels)

  # Without reliability
  ms <- methods_section(exp, format = "markdown")
  expect_type(ms, "character")
  expect_true(grepl("gpt-4o", ms))
  expect_true(grepl("10 texts", ms))

  # With reliability
  rel <- llm_human_reliability(llm_labels, human_labels)
  ms_rel <- methods_section(exp, reliability = rel, format = "latex")
  expect_true(grepl("kappa", ms_rel, ignore.case = TRUE))
  expect_true(grepl("alpha", ms_rel, ignore.case = TRUE))
})

test_that("results_to_latex writes valid macro file", {
  exp <- make_fake_experiment(fake_texts, llm_labels)
  rel <- llm_human_reliability(llm_labels, human_labels)
  tmp <- withr::local_tempfile(fileext = ".tex")

  results_to_latex(exp, tmp, reliability = rel)

  expect_true(file.exists(tmp))
  lines <- readLines(tmp)
  expect_true(any(grepl("\\\\newcommand", lines)))
  expect_true(any(grepl("llmModel", lines)))
  expect_true(any(grepl("llmKappa", lines)))
  expect_true(any(grepl("llmMacroF", lines)))
})

test_that("export_replication creates full directory structure", {
  exp <- make_fake_experiment(fake_texts, llm_labels)
  rel <- llm_human_reliability(llm_labels, human_labels)
  gs <- create_gold_standard(fake_texts, human_labels)
  tmp_dir <- withr::local_tempdir()
  out <- file.path(tmp_dir, "replication")

  export_replication(exp, out, reliability = rel, gold_standard = gs)

  expect_true(file.exists(file.path(out, "results.csv")))
  expect_true(file.exists(file.path(out, "parameters.txt")))
  expect_true(file.exists(file.path(out, "gold_standard.csv")))
  expect_true(file.exists(file.path(out, "reliability.txt")))
  expect_true(file.exists(file.path(out, "methods_section.md")))

  # Check results.csv has correct rows
  results <- readr::read_csv(file.path(out, "results.csv"), show_col_types = FALSE)
  expect_equal(nrow(results), 10)
})

test_that("estimate_cost returns sensible numbers", {
  # Use a minimal fake Chat just for the model name
  fake_chat <- structure(list(
    get_model = function() "gpt-4o"
  ), class = "Chat")

  est <- estimate_cost(fake_texts, fake_chat, n_reps = 3)
  expect_type(est, "list")
  expect_true(est$est_cost_usd > 0)
  expect_equal(est$n_texts, 10)
  expect_equal(est$n_reps, 3)
})

# ============================================================================
# 7. LOGGING MODULE
# ============================================================================

test_that("audit trail: log, retrieve, export", {
  reset_session()
  # Guarantee the shared session log is cleared even if an expectation below
  # fails; a trailing reset_session() alone would be skipped on failure and
  # leak state into later tests.
  withr::defer(reset_session())

  exp <- make_fake_experiment(fake_texts, llm_labels, name = "audit_test")
  log_experiment(exp)

  log <- get_session_log()
  expect_s3_class(log, "tbl_df")
  expect_equal(nrow(log), 10)
  expect_true("session_id" %in% names(log))
  expect_true("experiment_name" %in% names(log))

  # Export CSV
  tmp <- withr::local_tempfile(fileext = ".csv")
  export_audit_trail(tmp, format = "csv")
  expect_true(file.exists(tmp))
  exported <- readr::read_csv(tmp, show_col_types = FALSE)
  expect_equal(nrow(exported), 10)

  # Export RDS
  tmp_rds <- withr::local_tempfile(fileext = ".rds")
  export_audit_trail(tmp_rds, format = "rds")
  expect_true(file.exists(tmp_rds))
  reloaded <- readRDS(tmp_rds)
  expect_equal(nrow(reloaded), 10)
})

# ============================================================================
# 8. FULL PIPELINE: Design → Experiment → Validate → Report → Pre-register
# ============================================================================

test_that("full pipeline runs end-to-end on synthetic data", {
  # 1. Design
  fake_chats <- list(m1 = structure("x", class = "Chat"))
  design <- cross_design(fake_texts[1:5], fake_chats, temperatures = 0)
  design <- replicate_design(design, n = 2)
  design <- randomize_design(design, seed = 42)
  expect_equal(nrow(design), 10)

  # 2. Experiment (synthetic)
  exp <- make_fake_experiment(fake_texts, llm_labels, name = "pipeline_test")
  expect_output(print(exp), "pipeline_test")
  s <- summary(exp)
  expect_equal(s$n_prompts, 10)
  expect_equal(s$n_successful, 10)

  # 3. Reliability
  rel <- llm_human_reliability(llm_labels, human_labels)
  expect_true(rel$cohens_kappa > 0)

  # 4. Gold standard validation
  gs <- create_gold_standard(fake_texts, human_labels, metadata = list(coder = "RA1"))
  val <- validate_against_gold(exp, gs)
  expect_true(val$accuracy > 0.7)

  # 5. Pre-register
  fp <- freeze_prompt(
    system_prompt = "Classify sentiment.",
    user_template = "Text: {text}",
    model = "gpt-4o",
    temperature = 0,
    categories = c("positive", "negative", "neutral")
  )
  expect_true(verify_prompt(fp))

  # 6. Report
  ms <- methods_section(exp, reliability = rel)
  expect_true(nchar(ms) > 100)

  tmp_dir <- withr::local_tempdir()

  # LaTeX macros
  tex_path <- file.path(tmp_dir, "stats.tex")
  results_to_latex(exp, tex_path, reliability = rel)
  tex_lines <- readLines(tex_path)
  expect_true(any(grepl("llmKappa", tex_lines)))

  # Replication package
  rep_dir <- file.path(tmp_dir, "replication")
  export_replication(exp, rep_dir, reliability = rel, gold_standard = gs)
  expect_true(file.exists(file.path(rep_dir, "results.csv")))

  # Pre-registration export
  prereg_path <- file.path(tmp_dir, "prompt_spec.json")
  export_preregistration(fp, prereg_path)
  reloaded_fp <- load_frozen_prompt(prereg_path)
  expect_true(verify_prompt(reloaded_fp))

  # Audit trail — defer cleanup so failed expectations cannot leak log state
  reset_session()
  withr::defer(reset_session())
  log_experiment(exp)
  audit_path <- file.path(tmp_dir, "audit.csv")
  export_audit_trail(audit_path)
  expect_true(file.exists(audit_path))
})