# CV implementation audit — tests that SHOULD pass but currently FAIL.
#
# These expose real bugs found by auditing R against the Python reference.
# Each test documents the bug, the expected correct behavior, and the
# current broken behavior. Fix the implementation, then remove the skip.

# ---------------------------------------------------------------------------
# BUG 1: R CV does not stratify folds
# ---------------------------------------------------------------------------
# R split.R line 196: fold_ids <- sample(rep(seq_len(folds), length.out = n))
# This is plain random assignment. Python's cv() calls _stratified_kfold()
# when stratify=True, preserving class ratio per fold.
#
# Impact: With imbalanced data, some folds may have 0 minority class samples,
# causing models to fail or produce degenerate predictions.

test_that("BUG: CV folds should preserve class ratio (stratification)", {
  withr::local_seed(42L)

  # Imbalanced binary target: 240 zeros (80%) followed by 60 ones (20%).
  n_rows <- 300L
  labels <- rep(c(0L, 1L), times = c(240L, 60L))
  dat <- data.frame(x = rnorm(n_rows), target = labels)
  cv <- ml_split(dat, "target", seed = 42L, folds = 5L)

  # Fold indices are applied to cv$train, which this test treats as the dev
  # partition (test rows held out).
  dev_target <- cv$train$target
  overall_share <- mean(dev_target == 1L)

  # Checks one fold: the minority share of its validation rows must lie within
  # 0.08 of the dev-wide share. The label reports the actual numbers so a
  # failure is self-explanatory.
  check_fold <- function(k) {
    fold_share <- mean(dev_target[cv$folds[[k]]$valid] == 1L)
    gap <- abs(fold_share - overall_share)
    expect_true(
      gap < 0.08,
      label = sprintf(
        "Fold %d: minority ratio %.3f vs global %.3f (diff=%.3f)",
        k, fold_share, overall_share, gap
      )
    )
  }

  for (k in seq_along(cv$folds)) {
    check_fold(k)
  }
})


# ---------------------------------------------------------------------------
# BUG 2: R temporal CV drops remainder rows
# ---------------------------------------------------------------------------
# R .temporal_cv() line 283: valid_end <- min(train_end + chunk_size, n)
# Python cv_temporal(): last fold sets valid_end = n.
# With n not divisible by (folds+1), R silently drops trailing rows.
#
# Impact: Some data points are never validated. Coverage < 100%.

test_that("BUG: temporal CV must cover all rows (no remainder drop)", {
  # Seed the RNG so the rnorm() columns are identical on every run; without
  # this the fixture changes between runs and a failure cannot be replayed.
  withr::local_seed(42L)

  n <- 503L  # Not divisible by (3+1)=4
  df <- data.frame(t = seq_len(n), x = rnorm(n), target = rnorm(n))
  cv <- ml_split(df, "target", time = "t", folds = 3L)

  # Distinct row indices that appear in ANY fold, as either train or valid.
  # Built in one pass instead of growing a vector with union() per fold.
  all_touched <- unique(unlist(
    lapply(cv$folds, function(f) c(f$train, f$valid)),
    use.names = FALSE
  ))

  # Every dev row should appear in at least one fold, so the number of
  # distinct indices must equal the number of rows in cv$train.
  n_dev <- nrow(cv$train)
  expect_equal(
    length(all_touched), n_dev,
    label = sprintf("Expected %d rows covered, got %d", n_dev, length(all_touched))
  )
})

test_that("BUG: temporal CV last fold must extend to final row", {
  # Seed the RNG so the rnorm() columns are identical on every run; without
  # this the fixture changes between runs and a failure cannot be replayed.
  withr::local_seed(42L)

  n <- 503L  # Not divisible by (3+1)=4, so a remainder exists to be dropped
  df <- data.frame(t = seq_len(n), x = rnorm(n), target = rnorm(n))
  cv <- ml_split(df, "target", time = "t", folds = 3L)

  # The largest validation index of the final fold must be the last row of
  # cv$train; anything smaller means trailing rows were never validated.
  last_fold <- cv$folds[[length(cv$folds)]]
  max_valid <- max(last_fold$valid)
  n_dev <- nrow(cv$train)
  expect_equal(
    max_valid, n_dev,
    label = sprintf("Last fold valid ends at %d, should be %d", max_valid, n_dev)
  )
})


# ---------------------------------------------------------------------------
# BUG 3: R CV has no test holdout
# ---------------------------------------------------------------------------
# Python: cv() takes SplitResult, operates on .dev, preserves .test.
# R: ml_split(folds=) creates CV on ALL data. No test partition.
#
# Impact: Users who do ml_split(folds=5) then ml_fit() have no held-out
# test set for ml_assess(). The entire assess() workflow breaks.
# This is the split/cv separation the Python package already enforces.

test_that("BUG: CV should preserve a test holdout for assess()", {
  # Seed the RNG so the features and the sampled target are identical on every
  # run. ml_split() gets its own seed below, but that does not fix the data
  # generated here.
  withr::local_seed(42L)

  df <- data.frame(
    x1 = rnorm(200L), x2 = rnorm(200L),
    target = sample(0:1, 200L, replace = TRUE)
  )

  # The correct workflow: split first (gets test), then CV on dev
  # R currently: ml_split(folds=5) creates CV on ALL data, no test
  cv <- ml_split(df, "target", seed = 42L, folds = 5L)

  # CV data should be dev (train+valid), not all data
  # With a 60/20/20 split on 200 rows, dev ≈ 160, test ≈ 40
  expect_true(
    nrow(cv$train) < nrow(df),
    label = sprintf(
      "CV dev has %d rows (should be < %d, test should be held out)",
      nrow(cv$train), nrow(df)
    )
  )
  # Test partition should exist and be non-empty
  expect_true(nrow(cv$test) > 0L)
})