# Tests for proper Bayes factor handling
# These tests define the expected behavior after implementing the spec in bayes_factor_spec.md

# =============================================================================
# test_bayes_t() should NOT return pseudo-p-values
# =============================================================================

test_that("test_bayes_t returns bf without p_value", {

  ctrl <- c(100, 110, 105, 108, 103)
  trt <- c(200, 210, 205, 208, 203)

  result <- test_bayes_t(ctrl, trt)

  expect_type(result, "list")
  expect_true("bf" %in% names(result))
  expect_true("effect_size" %in% names(result))
  expect_equal(result$method, "bayes_t")


  # p_value should NOT be returned - this is the key change

  expect_false("p_value" %in% names(result))
})


test_that("test_bayes_t returns valid Bayes factor",
{
  ctrl <- c(100, 110, 105, 108, 103)
  trt <- c(200, 210, 205, 208, 203)

  result <- test_bayes_t(ctrl, trt)

  # BF should be positive
  expect_true(result$bf > 0)

  # For clearly different groups, BF should strongly favor alternative
  expect_true(result$bf > 10)
})


test_that("test_bayes_t returns BF < 1 for similar groups", {
  ctrl <- c(100, 102, 101, 99, 100)
  trt <- c(101, 100, 102, 99, 101)

  result <- test_bayes_t(ctrl, trt)

  # For very similar groups, BF should favor null (BF < 1)
  expect_true(result$bf < 3)  # At least not strong evidence for difference
})


# =============================================================================
# classify_bf_evidence() helper function
# =============================================================================

test_that("classify_bf_evidence returns correct categories", {
  # Test all five evidence categories
  expect_equal(as.character(classify_bf_evidence(0.05)), "strong_null")
  expect_equal(as.character(classify_bf_evidence(0.2)), "moderate_null")
  expect_equal(as.character(classify_bf_evidence(1.0)), "inconclusive")
  expect_equal(as.character(classify_bf_evidence(5.0)), "moderate_alt")
  expect_equal(as.character(classify_bf_evidence(20.0)), "strong_alt")
})


test_that("classify_bf_evidence returns ordered factor", {
  evidence <- classify_bf_evidence(c(0.05, 0.2, 1.0, 5.0, 20.0))

  expect_s3_class(evidence, "factor")
  expect_true(is.ordered(evidence))

  # Check correct order
  expected_levels <- c("strong_null", "moderate_null", "inconclusive",
                       "moderate_alt", "strong_alt")
  expect_equal(levels(evidence), expected_levels)
})


test_that("classify_bf_evidence handles boundary values", {
 # At boundaries (using exact thresholds from spec)
  # BF < 0.1 is strong_null, so 0.1 is moderate_null
  expect_equal(as.character(classify_bf_evidence(0.1)), "moderate_null")
  # BF < 1/3 is moderate_null, so 1/3 is inconclusive
  expect_equal(as.character(classify_bf_evidence(1/3)), "inconclusive")
  # BF < 3 is inconclusive, so 3 is moderate_alt
  expect_equal(as.character(classify_bf_evidence(3)), "moderate_alt")
  # BF < 10 is moderate_alt, so 10 is strong_alt
  expect_equal(as.character(classify_bf_evidence(10)), "strong_alt")
})


# =============================================================================
# compare() with bayes_t should handle results correctly
# =============================================================================

test_that("compare with bayes_t has bf column, not p_value", {
  test_data <- make_minimal_test_data()
  test_file <- write_test_csv(test_data)
  on.exit(unlink(test_file))

  imported <- read_pepdiff(
    file = test_file,
    id = "peptide",
    gene = "gene_id",
    value = "value",
    factors = "treatment",
    replicate = "bio_rep"
  )

  result <- compare(
    imported,
    compare = "treatment",
    ref = "ctrl",
    method = "pairwise",
    test = "bayes_t"
  )

  # bf column should be present
  expect_true("bf" %in% names(result$results))

  # p_value and fdr should be NA for bayes_t
  expect_true(all(is.na(result$results$p_value)))
  expect_true(all(is.na(result$results$fdr)))
})


test_that("compare with bayes_t has evidence column", {
  test_data <- make_minimal_test_data()
  test_file <- write_test_csv(test_data)
  on.exit(unlink(test_file))

  imported <- read_pepdiff(
    file = test_file,
    id = "peptide",
    gene = "gene_id",
    value = "value",
    factors = "treatment",
    replicate = "bio_rep"
  )

  result <- compare(
    imported,
    compare = "treatment",
    ref = "ctrl",
    method = "pairwise",
    test = "bayes_t"
  )

  # evidence column should be present and be an ordered factor
  expect_true("evidence" %in% names(result$results))
  expect_s3_class(result$results$evidence, "factor")
  expect_true(is.ordered(result$results$evidence))

  # Should have correct levels
  expected_levels <- c("strong_null", "moderate_null", "inconclusive",
                       "moderate_alt", "strong_alt")
  expect_equal(levels(result$results$evidence), expected_levels)
})


test_that("compare with bayes_t uses bf_threshold for significance", {
  test_data <- make_test_data(
    n_peptides = 20,
    n_reps = 4,
    effect_size = 3.0,
    affected_fraction = 0.5,
    seed = 42
  )
  test_file <- write_test_csv(test_data)
  on.exit(unlink(test_file))

  imported <- read_pepdiff(
    file = test_file,
    id = "peptide",
    gene = "gene_id",
    value = "value",
    factors = "treatment",
    replicate = "bio_rep"
  )

  # Default threshold = 3
  result <- compare(
    imported,
    compare = "treatment",
    ref = "ctrl",
    method = "pairwise",
    test = "bayes_t"
  )

  # significant should be TRUE where bf > 3
  sig_idx <- result$results$significant
  bf_vals <- result$results$bf

  expect_true(all(bf_vals[sig_idx] > 3, na.rm = TRUE))
  expect_true(all(bf_vals[!sig_idx] <= 3, na.rm = TRUE))
})


test_that("compare with bayes_t respects custom bf_threshold", {
  test_data <- make_test_data(
    n_peptides = 20,
    n_reps = 4,
    effect_size = 3.0,
    affected_fraction = 0.5,
    seed = 42
  )
  test_file <- write_test_csv(test_data)
  on.exit(unlink(test_file))

  imported <- read_pepdiff(
    file = test_file,
    id = "peptide",
    gene = "gene_id",
    value = "value",
    factors = "treatment",
    replicate = "bio_rep"
  )

  # Custom threshold = 10 (strong evidence)
  result <- compare(
    imported,
    compare = "treatment",
    ref = "ctrl",
    method = "pairwise",
    test = "bayes_t",
    bf_threshold = 10
  )

  # significant should be TRUE where bf > 10
  sig_idx <- result$results$significant
  bf_vals <- result$results$bf

  expect_true(all(bf_vals[sig_idx] > 10, na.rm = TRUE))
  expect_true(all(bf_vals[!sig_idx] <= 10, na.rm = TRUE))

  # Params should record the threshold

expect_equal(result$params$bf_threshold, 10)
})


test_that("compare stores bf_threshold in params", {
  test_data <- make_minimal_test_data()
  test_file <- write_test_csv(test_data)
  on.exit(unlink(test_file))

  imported <- read_pepdiff(
    file = test_file,
    id = "peptide",
    gene = "gene_id",
    value = "value",
    factors = "treatment",
    replicate = "bio_rep"
  )

  result <- compare(
    imported,
    compare = "treatment",
    ref = "ctrl",
    method = "pairwise",
    test = "bayes_t"
  )

  # Default bf_threshold should be stored
  expect_equal(result$params$bf_threshold, 3)
})


# =============================================================================
# Other pairwise tests should NOT be affected
# =============================================================================

test_that("compare with wilcoxon still has p_value and fdr", {
  test_data <- make_minimal_test_data()
  test_file <- write_test_csv(test_data)
  on.exit(unlink(test_file))

  imported <- read_pepdiff(
    file = test_file,
    id = "peptide",
    gene = "gene_id",
    value = "value",
    factors = "treatment",
    replicate = "bio_rep"
  )

  result <- compare(
    imported,
    compare = "treatment",
    ref = "ctrl",
    method = "pairwise",
    test = "wilcoxon"
  )

  # wilcoxon should still have p_value and fdr
  expect_true("p_value" %in% names(result$results))
  expect_true("fdr" %in% names(result$results))
  expect_false(all(is.na(result$results$p_value)))
  expect_false(all(is.na(result$results$fdr)))

  # wilcoxon should NOT have bf or evidence columns
  expect_false("bf" %in% names(result$results))
  expect_false("evidence" %in% names(result$results))
})


test_that("compare with bootstrap_t still has p_value and fdr", {
  test_data <- make_minimal_test_data()
  test_file <- write_test_csv(test_data)
  on.exit(unlink(test_file))

  imported <- read_pepdiff(
    file = test_file,
    id = "peptide",
    gene = "gene_id",
    value = "value",
    factors = "treatment",
    replicate = "bio_rep"
  )

  result <- compare(
    imported,
    compare = "treatment",
    ref = "ctrl",
    method = "pairwise",
    test = "bootstrap_t"
  )

  # bootstrap_t should still have p_value and fdr
  expect_true("p_value" %in% names(result$results))
  expect_true("fdr" %in% names(result$results))
  expect_false(all(is.na(result$results$p_value)))

  # bootstrap_t should NOT have bf or evidence columns
  expect_false("bf" %in% names(result$results))
  expect_false("evidence" %in% names(result$results))
})


# =============================================================================
# bf_to_pvalue should be removed
# =============================================================================

test_that("bf_to_pvalue function no longer exists", {
  # This function should be removed as it's statistically invalid
  expect_false(exists("bf_to_pvalue", envir = asNamespace("pepdiff")))
})


# =============================================================================
# Phase 2: Results class methods for bayes_t
# =============================================================================

test_that("print.pepdiff_results shows evidence breakdown for bayes_t", {
  test_data <- make_test_data(
    n_peptides = 20,
    n_reps = 4,
    effect_size = 3.0,
    affected_fraction = 0.5,
    seed = 42
  )
  test_file <- write_test_csv(test_data)
  on.exit(unlink(test_file))

  imported <- read_pepdiff(
    file = test_file,
    id = "peptide",
    gene = "gene_id",
    value = "value",
    factors = "treatment",
    replicate = "bio_rep"
  )

  result <- compare(
    imported,
    compare = "treatment",
    ref = "ctrl",
    method = "pairwise",
    test = "bayes_t"
  )

  output <- capture.output(print(result))
  output_text <- paste(output, collapse = "\n")

  # Should show evidence breakdown, not FDR
  expect_true(grepl("Evidence", output_text, ignore.case = TRUE))
  expect_false(grepl("FDR <", output_text))

  # Should mention BF threshold
  expect_true(grepl("BF", output_text))
})


test_that("print.pepdiff_results shows FDR for p-value tests", {
  test_data <- make_minimal_test_data()
  test_file <- write_test_csv(test_data)
  on.exit(unlink(test_file))

  imported <- read_pepdiff(
    file = test_file,
    id = "peptide",
    gene = "gene_id",
    value = "value",
    factors = "treatment",
    replicate = "bio_rep"
  )

  result <- compare(
    imported,
    compare = "treatment",
    ref = "ctrl",
    method = "pairwise",
    test = "wilcoxon"
  )

  output <- capture.output(print(result))
  output_text <- paste(output, collapse = "\n")

  # Should show FDR for p-value tests
  expect_true(grepl("FDR", output_text))
})


test_that("summary.pepdiff_results shows evidence breakdown for bayes_t", {
  test_data <- make_test_data(
    n_peptides = 20,
    n_reps = 4,
    effect_size = 3.0,
    affected_fraction = 0.5,
    seed = 42
  )
  test_file <- write_test_csv(test_data)
  on.exit(unlink(test_file))

  imported <- read_pepdiff(
    file = test_file,
    id = "peptide",
    gene = "gene_id",
    value = "value",
    factors = "treatment",
    replicate = "bio_rep"
  )

  result <- compare(
    imported,
    compare = "treatment",
    ref = "ctrl",
    method = "pairwise",
    test = "bayes_t"
  )

  output <- capture.output(summ <- summary(result))
  output_text <- paste(output, collapse = "\n")

  # Should show evidence categories
  expect_true(grepl("Evidence|BF", output_text, ignore.case = TRUE))

  # Summary should include bf_threshold
  expect_equal(summ$params$bf_threshold, 3)
})


test_that("significant() works with bayes_t results", {
  test_data <- make_test_data(
    n_peptides = 20,
    n_reps = 4,
    effect_size = 3.0,
    affected_fraction = 0.5,
    seed = 42
  )
  test_file <- write_test_csv(test_data)
  on.exit(unlink(test_file))

  imported <- read_pepdiff(
    file = test_file,
    id = "peptide",
    gene = "gene_id",
    value = "value",
    factors = "treatment",
    replicate = "bio_rep"
  )

  result <- compare(
    imported,
    compare = "treatment",
    ref = "ctrl",
    method = "pairwise",
    test = "bayes_t"
  )

  sig <- significant(result)

  # All returned results should have bf > threshold (default 3)
  expect_true(all(sig$bf > 3, na.rm = TRUE))
})


test_that("significant() respects bf_threshold parameter for bayes_t", {
  test_data <- make_test_data(
    n_peptides = 20,
    n_reps = 4,
    effect_size = 3.0,
    affected_fraction = 0.5,
    seed = 42
  )
  test_file <- write_test_csv(test_data)
  on.exit(unlink(test_file))

  imported <- read_pepdiff(
    file = test_file,
    id = "peptide",
    gene = "gene_id",
    value = "value",
    factors = "treatment",
    replicate = "bio_rep"
  )

  result <- compare(
    imported,
    compare = "treatment",
    ref = "ctrl",
    method = "pairwise",
    test = "bayes_t",
    bf_threshold = 10
  )

  sig <- significant(result)

  # All returned results should have bf > 10
  expect_true(all(sig$bf > 10, na.rm = TRUE))
})


test_that("significant() allows custom bf_threshold override", {
  test_data <- make_test_data(
    n_peptides = 20,
    n_reps = 4,
    effect_size = 3.0,
    affected_fraction = 0.5,
    seed = 42
  )
  test_file <- write_test_csv(test_data)
  on.exit(unlink(test_file))

  imported <- read_pepdiff(
    file = test_file,
    id = "peptide",
    gene = "gene_id",
    value = "value",
    factors = "treatment",
    replicate = "bio_rep"
  )

  result <- compare(
    imported,
    compare = "treatment",
    ref = "ctrl",
    method = "pairwise",
    test = "bayes_t",
    bf_threshold = 3
  )

  # Override with stricter threshold
  sig_strict <- significant(result, bf_threshold = 10)

  # All returned results should have bf > 10
  expect_true(all(sig_strict$bf > 10, na.rm = TRUE))

  # Stricter threshold should return fewer or equal results
  sig_default <- significant(result)
  expect_true(nrow(sig_strict) <= nrow(sig_default))
})


# =============================================================================
# Phase 3: Plot methods for bayes_t
# =============================================================================

test_that("plot.pepdiff_results dispatches to BF plots for bayes_t", {
  test_data <- make_test_data(
    n_peptides = 20,
    n_reps = 4,
    effect_size = 3.0,
    affected_fraction = 0.5,
    seed = 42
  )
  test_file <- write_test_csv(test_data)
  on.exit(unlink(test_file))

  imported <- read_pepdiff(
    file = test_file,
    id = "peptide",
    gene = "gene_id",
    value = "value",
    factors = "treatment",
    replicate = "bio_rep"
  )

  result <- compare(
    imported,
    compare = "treatment",
    ref = "ctrl",
    method = "pairwise",
    test = "bayes_t"
  )

  # Should produce a plot without error
  p <- plot(result)
  expect_s3_class(p, "gg")
})


test_that("plot_volcano_bf creates valid BF volcano plot", {
  test_data <- make_test_data(
    n_peptides = 20,
    n_reps = 4,
    effect_size = 3.0,
    affected_fraction = 0.5,
    seed = 42
  )
  test_file <- write_test_csv(test_data)
  on.exit(unlink(test_file))

  imported <- read_pepdiff(
    file = test_file,
    id = "peptide",
    gene = "gene_id",
    value = "value",
    factors = "treatment",
    replicate = "bio_rep"
  )

  result <- compare(
    imported,
    compare = "treatment",
    ref = "ctrl",
    method = "pairwise",
    test = "bayes_t"
  )

  p <- plot_volcano_bf(result)

  expect_s3_class(p, "gg")

  # Check y-axis is log10(bf) based (should have reference lines at log10(3) and log10(10))
  # The plot should be buildable
  built <- ggplot2::ggplot_build(p)
  expect_true(length(built$data) > 0)
})


test_that("plot_bf_distribution creates valid BF histogram", {
  test_data <- make_test_data(
    n_peptides = 20,
    n_reps = 4,
    effect_size = 3.0,
    affected_fraction = 0.5,
    seed = 42
  )
  test_file <- write_test_csv(test_data)
  on.exit(unlink(test_file))

  imported <- read_pepdiff(
    file = test_file,
    id = "peptide",
    gene = "gene_id",
    value = "value",
    factors = "treatment",
    replicate = "bio_rep"
  )

  result <- compare(
    imported,
    compare = "treatment",
    ref = "ctrl",
    method = "pairwise",
    test = "bayes_t"
  )

  p <- plot_bf_distribution(result)

  expect_s3_class(p, "gg")

  # The plot should be buildable
  built <- ggplot2::ggplot_build(p)
  expect_true(length(built$data) > 0)
})


test_that("plot.pepdiff_results still works for p-value tests", {
  test_data <- make_minimal_test_data()
  test_file <- write_test_csv(test_data)
  on.exit(unlink(test_file))

  imported <- read_pepdiff(
    file = test_file,
    id = "peptide",
    gene = "gene_id",
    value = "value",
    factors = "treatment",
    replicate = "bio_rep"
  )

  result <- compare(
    imported,
    compare = "treatment",
    ref = "ctrl",
    method = "pairwise",
    test = "wilcoxon"
  )

  # Should produce a plot without error (original p-value based plots)
  p <- plot(result)
  expect_s3_class(p, "gg")
})