# Helpers shared by the tuning tests: skip when tidymodels dependencies are
# missing, and run tune_resample() while swallowing console output and warnings.
skip_if_tune_deps <- function() {
  skip_if_not_installed("tune")
  skip_if_not_installed("dials")
  skip_if_not_installed("glmnet")
  skip_if_not_installed("recipes")
  skip_if_not_installed("rsample")
  skip_if_not_installed("yardstick")
  skip_if_not_installed("workflows")
  skip_if_not_installed("parsnip")
}

tune_resample_quiet <- function(...) {
  out <- NULL
  capture.output({
    out <- suppressWarnings(tune_resample(...))
  })
  out
}

test_that("tune_resample selects a deterministic simple model with one_std_err", {
  skip_if_tune_deps()
  df <- make_class_df(24)
  splits <- make_split_plan_quiet(df, outcome = "outcome",
                                  mode = "subject_grouped", group = "subject",
                                  v = 2, nested = TRUE, stratify = FALSE,
                                  seed = 11)
  spec <- parsnip::logistic_reg(penalty = tune::tune(), mixture = 1) |>
    parsnip::set_engine("glmnet")
  rec <- recipes::recipe(outcome ~ x1 + x2, data = df)

  # Mocked inner-tuning metrics: cfg_a (penalty 0.01) has the best mean
  # accuracy (0.78), while cfg_b (penalty 1.0) is simpler and lands within one
  # standard error of it, so "best" and "one_std_err" must pick different configs.
  local_mocked_bindings(
    tune_grid = function(...) structure(list(), class = "mock_tune_results"),
    collect_metrics = function(...) {
      data.frame(
        .metric = rep("accuracy", 4),
        .estimator = rep("binary", 4),
        .estimate = c(0.80, 0.76, 0.77, 0.77),
        .config = c("cfg_a", "cfg_a", "cfg_b", "cfg_b"),
        penalty = c(0.01, 0.01, 1.0, 1.0),
        stringsAsFactors = FALSE
      )
    },
    finalize_workflow = function(x, parameters, ...) x,
    .package = "tune"
  )
  local_mocked_bindings(
    fit_resample = function(x, outcome, splits, ...) {
      truth <- factor(rep(c(0, 1), length.out = 4), levels = c(0, 1))
      methods::new(
        "LeakFit",
        splits = splits,
        metrics = data.frame(fold = 1, learner = "mock", accuracy = 0.75),
        metric_summary = data.frame(learner = "mock", accuracy_mean = 0.75,
                                    accuracy_sd = 0),
        audit = data.frame(),
        predictions = list(data.frame(
          id = as.character(seq_len(4)),
          truth = truth,
          pred = c(0.45, 0.55, 0.40, 0.60),
          fold = 1,
          learner = "mock",
          stringsAsFactors = FALSE
        )),
        preprocess = list(),
        learners = list(),
        outcome = outcome,
        task = "binomial",
        feature_names = c("x1", "x2"),
        info = list(sample_ids = as.character(seq_len(nrow(x))))
      )
    },
    .package = "bioLeak"
  )

  tuned_best <- tune_resample_quiet(df, outcome = "outcome", splits = splits,
                                    learner = spec, preprocess = rec, grid = 2,
                                    metrics = "accuracy", selection = "best",
                                    seed = 11)
  tuned_ose <- tune_resample_quiet(df, outcome = "outcome", splits = splits,
                                   learner = spec, preprocess = rec, grid = 2,
                                   metrics = "accuracy",
                                   selection = "one_std_err", seed = 11)

  expect_true(nrow(tuned_best$best_params) > 0)
  expect_true(nrow(tuned_ose$best_params) > 0)
  expect_true(all(tuned_best$best_params$penalty == 0.01))
  expect_true(all(tuned_ose$best_params$penalty == 1.0))
})
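
# Illustrative sketch (not bioLeak's implementation) of the one-standard-error
# rule the test above exercises, written against the mocked collect_metrics()
# layout. "Simpler" is assumed to mean a larger glmnet penalty; the real
# selection logic lives inside tune_resample().
ose_pick_sketch <- function(metrics) {
  cfgs <- split(metrics, metrics$.config)
  means <- vapply(cfgs, function(d) mean(d$.estimate), numeric(1))
  ses <- vapply(cfgs, function(d) stats::sd(d$.estimate) / sqrt(nrow(d)),
                numeric(1))
  penalties <- vapply(cfgs, function(d) d$penalty[1], numeric(1))
  # keep every config whose mean is within one SE of the best mean
  cutoff <- max(means) - ses[which.max(means)]
  eligible <- names(cfgs)[means >= cutoff]
  # among the eligible configs, the largest penalty is the simplest model
  eligible[which.max(penalties[eligible])]
}
# With the mocked metrics above this returns "cfg_b" (penalty 1.0), matching
# the tuned_ose expectation.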

test_that("tune_resample supports final refit and stores fold status", {
  skip_if_tune_deps()
  df <- make_class_df(80)
  splits <- make_split_plan_quiet(df, outcome = "outcome",
                                  mode = "subject_grouped", group = "subject",
                                  v = 2, nested = TRUE, stratify = FALSE,
                                  seed = 2)
  spec <- parsnip::logistic_reg(penalty = tune::tune(), mixture = 1) |>
    parsnip::set_engine("glmnet")
  rec <- recipes::recipe(outcome ~ x1 + x2, data = df)

  tuned <- tune_resample_quiet(df, outcome = "outcome", splits = splits,
                               learner = spec, preprocess = rec, grid = 2,
                               metrics = "accuracy", selection = "best",
                               refit = TRUE, seed = 2)

  expect_true(isTRUE(tuned$info$refit))
  expect_true(!is.null(tuned$final_model))
  expect_s3_class(tuned$final_workflow, "workflow")
  expect_true(nrow(tuned$final_params) > 0)
  expect_true(is.data.frame(tuned$fold_status))
  expect_true(nrow(tuned$fold_status) == length(splits@indices))
  expect_true(all(tuned$fold_status$status %in% c("success", "skipped", "failed")))
})

test_that("final refit uses aggregated params, not single best outer fold", {
  skip_if_tune_deps()
  df <- make_class_df(80)
  splits <- make_split_plan_quiet(df, outcome = "outcome",
                                  mode = "subject_grouped", group = "subject",
                                  v = 2, nested = TRUE, stratify = FALSE,
                                  seed = 2)
  spec <- parsnip::logistic_reg(penalty = tune::tune(), mixture = 1) |>
    parsnip::set_engine("glmnet")
  rec <- recipes::recipe(outcome ~ x1 + x2, data = df)

  tuned <- tune_resample_quiet(df, outcome = "outcome", splits = splits,
                               learner = spec, preprocess = rec, grid = 3,
                               metrics = "accuracy", selection = "best",
                               refit = TRUE, seed = 2)

  # refit_method must be "aggregate", not a single-fold selection
  expect_identical(tuned$info$refit_method, "aggregate")
  # refit_fold must be NA — no single fold was selected based on outer metrics
  expect_true(is.na(tuned$info$refit_fold))
  # final_params must have exactly the hyperparameter columns, no fold column
  expect_false("fold" %in% names(tuned$final_params))

  # --- Behavioral proof that outer test metrics are not consulted ---
  # final_params must exactly equal the coordinate-wise aggregate of ALL folds'
  # best_params. This is the core independence guarantee: the old (leaky) code
  # did which.max(metrics_df[[metric]]) to pick a single fold, so final_params
  # would equal that fold's row. The new code aggregates all folds equally.
  bp <- tuned$best_params
  param_cols <- setdiff(names(bp), "fold")
  expect_true(nrow(bp) >= 2,
              info = "Need at least 2 outer folds to test aggregation")
  expect_true(length(param_cols) >= 1,
              info = "Need at least 1 hyperparameter column")
  for (col in param_cols) {
    if (is.numeric(bp[[col]])) {
      expected <- stats::median(bp[[col]], na.rm = TRUE)
      expect_equal(tuned$final_params[[col]], expected,
                   info = paste("Param", col,
                                "must equal median across ALL folds, not a single fold's value"))
    } else {
      tbl <- table(bp[[col]])
      expected <- names(tbl)[which.max(tbl)]
      expect_equal(as.character(tuned$final_params[[col]]), expected,
                   info = paste("Param", col,
                                "must equal majority vote across ALL folds"))
    }
  }

  # Verify final_params is not simply copied from the best-metric fold.
  # Identify the fold that had the best outer metric — if it differs from the
  # aggregate, the old (leaky) logic would have returned that fold's params.
  metric_col <- tuned$info$selection_metric
  if (!is.null(metric_col) && metric_col %in% names(tuned$metrics)) {
    metric_vals <- tuned$metrics[[metric_col]]
    minimize <- metric_col %in% c("rmse", "mae", "log_loss", "mn_log_loss")
    best_idx <- if (minimize) which.min(metric_vals) else which.max(metric_vals)
    best_fold <- tuned$metrics$fold[best_idx]
    best_fold_params <- bp[bp$fold == best_fold, param_cols, drop = FALSE]

    # Check if folds actually disagree on params
    folds_disagree <- FALSE
    for (col in param_cols) {
      if (length(unique(bp[[col]])) > 1L) {
        folds_disagree <- TRUE
        break
      }
    }

    if (folds_disagree && nrow(best_fold_params) == 1L) {
      # When folds disagree, the aggregate should NOT equal the best-metric
      # fold's params. This is the direct proof: the old code would have
      # returned best_fold_params, the new code returns the aggregate.
      params_match_best <- all(vapply(param_cols, function(col) {
        identical(tuned$final_params[[col]], best_fold_params[[col]])
      }, logical(1)))
      expect_false(params_match_best,
                   info = paste("final_params must not equal fold", best_fold,
                                "params (the best-metric fold) — that would be",
                                "nested-CV leakage"))
    }
  }
})
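
# Illustrative sketch (not bioLeak's implementation) of the coordinate-wise
# aggregation the test above verifies: numeric hyperparameters collapse to
# their median across outer folds, categorical ones to a majority vote. The
# "fold" bookkeeping column is assumed, mirroring the best_params layout.
aggregate_params_sketch <- function(best_params) {
  param_cols <- setdiff(names(best_params), "fold")
  out <- lapply(best_params[param_cols], function(v) {
    if (is.numeric(v)) {
      stats::median(v, na.rm = TRUE)
    } else {
      tbl <- table(v)
      names(tbl)[which.max(tbl)]
    }
  })
  as.data.frame(out, stringsAsFactors = FALSE)
}
# e.g. aggregate_params_sketch(data.frame(fold = 1:3, penalty = c(0.01, 1, 1)))
# yields penalty = 1, the median, regardless of which fold scored best.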

test_that("LeakTune summary and audit handle skipped outer folds", {
  skip_if_tune_deps()
  set.seed(1)
  df <- data.frame(
    subject = paste0("s", seq_len(60)),
    outcome = factor(c(rep(0, 30), rep(1, 30)), levels = c(0, 1)),
    x1 = rnorm(60),
    x2 = rnorm(60),
    stringsAsFactors = FALSE
  )
  # Hand-built plan: fold 1 trains on rows 1:20 only, a single outcome class,
  # so it must be skipped; fold 2 trains on both classes and should succeed.
  indices <- list(
    list(train = 1:20, test = 21:60, fold = 1L, repeat_id = 1L),
    list(train = c(1:20, 31:60), test = 21:30, fold = 2L, repeat_id = 1L)
  )
  split_info <- list(
    outcome = "outcome", v = 2L, repeats = 1L, seed = 1L,
    mode = "subject_grouped", perm_mode = "subject_grouped",
    group = "subject", batch = NULL, study = NULL, time = NULL,
    stratify = FALSE, nested = FALSE, horizon = 0,
    summary = data.frame(
      fold = c(1L, 2L),
      repeat_id = c(1L, 1L),
      train_n = c(20L, 50L),
      test_n = c(40L, 10L)
    ),
    hash = "manual", inner = NULL, compact = FALSE,
    fold_assignments = NULL, coldata = df
  )
  splits <- bioLeak:::LeakSplits(mode = "subject_grouped", indices = indices,
                                 info = split_info)
  spec <- parsnip::logistic_reg(penalty = tune::tune(), mixture = 1) |>
    parsnip::set_engine("glmnet")
  rec <- recipes::recipe(outcome ~ x1 + x2, data = df)

  tuned <- tune_resample_quiet(df, outcome = "outcome", splits = splits,
                               learner = spec, preprocess = rec, grid = 2,
                               metrics = "accuracy", seed = 1)

  expect_true(is.null(tuned$outer_fits[[1]]))
  expect_true(!is.null(tuned$outer_fits[[2]]))
  expect_true(is.data.frame(tuned$fold_status))
  expect_equal(nrow(tuned$fold_status), 2)
  expect_true(any(tuned$fold_status$status == "skipped"))
  expect_true(any(tuned$fold_status$status == "success"))

  sum_out <- paste(capture.output(summary(tuned)), collapse = "\n")
  expect_match(sum_out, "Outer Folds: 1 successful / 2 total", fixed = TRUE)

  audit <- suppressWarnings(expect_no_error(
    audit_leakage(
      tuned, metric = "accuracy", B = 3, boot_B = 20,
      perm_stratify = FALSE, return_perm = FALSE,
      target_scan = FALSE, target_scan_multivariate = FALSE
    )
  ))
  expect_s4_class(audit, "LeakAudit")
})
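
# Assumed behavior, sketched for illustration (not bioLeak's code): fold 1
# above is skippable because its training rows 1:20 contain only outcome
# level 0, and a binomial learner cannot be fit on a single class. A guard of
# this shape would mark the fold "skipped" rather than erroring.
fold_is_fittable <- function(y, train_idx) {
  nlevels(droplevels(y[train_idx])) >= 2L
}
# fold_is_fittable(factor(c(rep(0, 30), rep(1, 30))), 1:20)  # FALSE -> skip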

test_that("tune_resample tunes binomial thresholds from inner predictions", {
  skip_if_tune_deps()
  df <- make_class_df(24)
  splits <- make_split_plan_quiet(df, outcome = "outcome",
                                  mode = "subject_grouped", group = "subject",
                                  v = 2, nested = TRUE, stratify = FALSE,
                                  seed = 21)
  spec <- parsnip::logistic_reg(penalty = tune::tune(), mixture = 1) |>
    parsnip::set_engine("glmnet")
  rec <- recipes::recipe(outcome ~ x1 + x2, data = df)

  seen_save_pred <- logical()
  seen_thresholds <- numeric()
  local_mocked_bindings(
    tune_grid = function(..., control) {
      seen_save_pred <<- c(seen_save_pred, isTRUE(control$save_pred))
      structure(list(), class = "mock_tune_results")
    },
    collect_metrics = function(...) {
      data.frame(
        .metric = rep("accuracy", 4),
        .estimator = rep("binary", 4),
        .estimate = c(0.8, 0.8, 0.7, 0.7),
        .config = c("cfg_a", "cfg_a", "cfg_b", "cfg_b"),
        penalty = c(0.01, 0.01, 1.0, 1.0),
        stringsAsFactors = FALSE
      )
    },
    # Mocked inner predictions: at threshold 0.2 three of the four rows are
    # classified correctly (accuracy 0.75); at 0.8 only two are (0.5). The
    # tuner should therefore pick 0.2 from the grid passed below.
    collect_predictions = function(...) {
      data.frame(
        .config = rep("cfg_a", 4),
        outcome = factor(c(0, 1, 0, 1), levels = c(0, 1)),
        .pred_0 = c(0.9, 0.6, 0.55, 0.4),
        .pred_1 = c(0.1, 0.4, 0.45, 0.6),
        stringsAsFactors = FALSE
      )
    },
    finalize_workflow = function(x, parameters, ...) x,
    .package = "tune"
  )
  local_mocked_bindings(
    fit_resample = function(x, outcome, splits,
                            classification_threshold = 0.5, ...) {
      seen_thresholds <<- c(seen_thresholds, classification_threshold)
      truth <- factor(rep(c(0, 1), length.out = 4), levels = c(0, 1))
      methods::new(
        "LeakFit",
        splits = splits,
        metrics = data.frame(fold = 1, learner = "mock", accuracy = 0.75),
        metric_summary = data.frame(learner = "mock", accuracy_mean = 0.75,
                                    accuracy_sd = 0),
        audit = data.frame(),
        predictions = list(data.frame(
          id = as.character(seq_len(4)),
          truth = truth,
          pred = c(0.1, 0.4, 0.45, 0.6),
          fold = 1,
          learner = "mock",
          stringsAsFactors = FALSE
        )),
        preprocess = list(),
        learners = list(),
        outcome = outcome,
        task = "binomial",
        feature_names = c("x1", "x2"),
        info = list(sample_ids = as.character(seq_len(nrow(x))))
      )
    },
    .package = "bioLeak"
  )

  tuned <- tune_resample_quiet(df, outcome = "outcome", splits = splits,
                               learner = spec, preprocess = rec, grid = 2,
                               metrics = "accuracy", selection = "best",
                               seed = 21, tune_threshold = TRUE,
                               threshold_grid = c(0.2, 0.8))

  expect_true(all(seen_save_pred))
  expect_equal(length(seen_thresholds), length(splits@indices))
  expect_true(all(abs(seen_thresholds - 0.2) < 1e-12))
  expect_true(is.data.frame(tuned$thresholds))
  expect_equal(nrow(tuned$thresholds), length(splits@indices))
  expect_true(all(abs(tuned$thresholds$threshold - 0.2) < 1e-12))
  expect_true(isTRUE(tuned$info$threshold_tuned))
  expect_equal(tuned$info$threshold_metric, "accuracy")
})
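
# Illustrative sketch (not bioLeak's implementation) of tuning a binomial
# threshold from inner predictions: score accuracy on the held-out class-1
# probabilities at each candidate threshold and keep the argmax. Whether the
# package compares with >= or > at the boundary is an assumption; with the
# mocked predictions above either choice selects 0.2, as the test requires.
pick_threshold_sketch <- function(truth, pred_1, grid) {
  acc <- vapply(grid, function(th) {
    pred_class <- ifelse(pred_1 >= th, levels(truth)[2], levels(truth)[1])
    mean(pred_class == as.character(truth))
  }, numeric(1))
  grid[which.max(acc)]
}
# pick_threshold_sketch(factor(c(0, 1, 0, 1), levels = c(0, 1)),
#                       c(0.1, 0.4, 0.45, 0.6), c(0.2, 0.8))  # -> 0.2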