# ── ml_cv() verb parity with Python ml.cv() ──────────────────────────────────
#
# Tests structural invariants that MUST hold in both languages:
# 1. Fold count matches requested k
# 2. Fold sizes balanced (±1 row)
# 3. No validation overlap (partition property)
# 4. Complete coverage (every dev row in exactly one valid fold)
# 5. Stratification preserves class ratios per fold
# 6. Group non-overlap (ml_cv_group)
# 7. Temporal ordering (ml_cv_temporal: train indices < valid indices)
# 8. Train + valid = all dev rows (no rows lost or invented)
# 9. ml_fit(cv_result) produces scores_ (non-NULL)

# ── Structural: fold count ───────────────────────────────────────────────────

test_that("ml_cv: fold count matches k for k=2,3,5,10", {
  s <- ml_split(iris, "Species", seed = 42L)
  for (k in c(2L, 3L, 5L, 10L)) {
    c <- ml_cv(s, "Species", folds = k, seed = 42L)
    expect_length(c$folds, k)
  }
})

# ── Structural: fold size balance ────────────────────────────────────────────

test_that("ml_cv: fold sizes balanced within ±1", {
  s <- ml_split(iris, "Species", seed = 42L)
  c <- ml_cv(s, "Species", folds = 5L, seed = 42L)
  sizes <- vapply(c$folds, function(f) length(f$valid), integer(1))
  expect_true(max(sizes) - min(sizes) <= 1L)
})

# ── Structural: no validation overlap ────────────────────────────────────────

test_that("ml_cv: no overlap between validation folds", {
  s <- ml_split(iris, "Species", seed = 42L)
  c <- ml_cv(s, "Species", folds = 5L, seed = 42L)
  valids <- lapply(c$folds, `[[`, "valid")
  for (i in seq_along(valids)) {
    for (j in seq_along(valids)) {
      if (i < j) {
        expect_length(intersect(valids[[i]], valids[[j]]), 0L)
      }
    }
  }
})

# ── Structural: complete coverage ────────────────────────────────────────────

test_that("ml_cv: every dev row appears in exactly one valid fold", {
  s <- ml_split(iris, "Species", seed = 42L)
  c <- ml_cv(s, "Species", folds = 5L, seed = 42L)
  all_valid <- sort(unlist(lapply(c$folds, `[[`, "valid")))
  expect_equal(all_valid, seq_len(nrow(s$dev)))
})

# ── Structural: train + valid = all dev rows ─────────────────────────────────

test_that("ml_cv: train + valid covers all dev rows per fold", {
  s <- ml_split(iris, "Species", seed = 42L)
  c <- ml_cv(s, "Species", folds = 5L, seed = 42L)
  n_dev <- nrow(s$dev)
  for (fold in c$folds) {
    all_idx <- sort(c(fold$train, fold$valid))
    expect_equal(all_idx, seq_len(n_dev))
  }
})

# ── Structural: stratification preserves class ratios ────────────────────────

test_that("ml_cv: stratified folds preserve class proportions within 10%", {
  s <- ml_split(iris, "Species", seed = 42L)
  c <- ml_cv(s, "Species", folds = 5L, seed = 42L, stratify = TRUE)
  dev_data <- s$dev
  overall_freq <- table(dev_data$Species) / nrow(dev_data)
  for (fold in c$folds) {
    fold_y <- dev_data$Species[fold$valid]
    fold_freq <- table(fold_y) / length(fold_y)
    for (cls in names(overall_freq)) {
      expect_true(abs(fold_freq[[cls]] - overall_freq[[cls]]) < 0.10,
                  label = sprintf("class %s: fold=%.2f overall=%.2f",
                                  cls, fold_freq[[cls]], overall_freq[[cls]]))
    }
  }
})

# ── Structural: regression (no stratification) ───────────────────────────────

test_that("ml_cv: works with regression target", {
  df <- data.frame(x = rnorm(200), y = rnorm(200))
  s <- ml_split(df, "y", seed = 42L)
  c <- ml_cv(s, "y", folds = 5L, seed = 42L)
  expect_s3_class(c, "ml_cv_result")
  all_valid <- sort(unlist(lapply(c$folds, `[[`, "valid")))
  expect_equal(all_valid, seq_len(nrow(s$dev)))
})

# ── Integration: ml_fit(cv_result) produces scores_ ─────────────────────────

test_that("ml_fit on ml_cv result produces non-NULL scores_", {
  s <- ml_split(iris, "Species", seed = 42L)
  c <- ml_cv(s, "Species", folds = 3L, seed = 42L)
  model <- ml_fit(c, "Species", seed = 42L)
  expect_false(is.null(model$scores_))
  expect_true(is.list(model$scores_))
  expect_true(length(model$scores_) > 0L)
})

test_that("ml_fit on ml_cv: scores_ has expected metric names", {
  s <- ml_split(iris, "Species", seed = 42L)
  c <- ml_cv(s, "Species", folds = 3L, seed = 42L)
  model <- ml_fit(c, "Species", seed = 42L)
  expect_true("accuracy" %in% names(model$scores_))
})

# ── Integration: ml_assess after ml_cv fit ───────────────────────────────────

test_that("ml_assess works after ml_cv + ml_fit", {
  s <- ml_split(iris, "Species", seed = 42L)
  c <- ml_cv(s, "Species", folds = 3L, seed = 42L)
  model <- ml_fit(c, "Species", seed = 42L)
  evidence <- ml_assess(model, test = s$test)
  expect_s3_class(evidence, "ml_evidence")
})

# ── Temporal CV: train indices < valid indices ───────────────────────────────

test_that("ml_cv_temporal: train indices always before valid indices", {
  df <- data.frame(date = 1:200, x = rnorm(200), y = rnorm(200))
  s <- ml_split_temporal(df, "y", time = "date")
  c <- ml_cv_temporal(s, "y", folds = 5L)
  for (fold in c$folds) {
    expect_true(max(fold$train) < min(fold$valid))
  }
})

test_that("ml_cv_temporal: expanding window (later folds have more training)", {
  df <- data.frame(date = 1:200, x = rnorm(200), y = rnorm(200))
  s <- ml_split_temporal(df, "y", time = "date")
  c <- ml_cv_temporal(s, "y", folds = 5L)
  sizes <- vapply(c$folds, function(f) length(f$train), integer(1))
  # Each fold should have more training data than the previous
  expect_true(all(diff(sizes) > 0L))
})

# ── Group CV: no group leakage ───────────────────────────────────────────────

test_that("ml_cv_group: no group in both train and valid", {
  df <- data.frame(
    pid = rep(1:30, each = 5),
    x = rnorm(150),
    y = sample(c("a", "b"), 150, TRUE)
  )
  s <- ml_split(df, "y", seed = 42L)
  c <- ml_cv_group(s, "y", groups = "pid", folds = 5L, seed = 42L)
  dev_data <- s$dev
  for (fold in c$folds) {
    train_groups <- unique(dev_data$pid[fold$train])
    valid_groups <- unique(dev_data$pid[fold$valid])
    expect_length(intersect(train_groups, valid_groups), 0L)
  }
})

test_that("ml_cv_group: all groups covered across folds", {
  df <- data.frame(
    pid = rep(1:20, each = 5),
    x = rnorm(100),
    y = sample(c("a", "b"), 100, TRUE)
  )
  s <- ml_split(df, "y", seed = 42L)
  c <- ml_cv_group(s, "y", groups = "pid", folds = 4L, seed = 42L)
  dev_data <- s$dev
  all_valid_groups <- unique(unlist(lapply(c$folds, function(f) {
    unique(dev_data$pid[f$valid])
  })))
  expect_equal(sort(all_valid_groups), sort(unique(dev_data$pid)))
})

# ── Parity: ml_cv() vs ml_split(folds=) produce same structure ──────────────

test_that("ml_cv() and ml_split(folds=) both feed into ml_fit with scores_", {
  # Old path: ml_split(folds=)
  s_old <- ml_split(iris, "Species", seed = 42L, folds = 5L)
  m_old <- ml_fit(s_old, "Species", seed = 42L)

  # New path: ml_split() then ml_cv()
  s_new <- ml_split(iris, "Species", seed = 42L)
  c_new <- ml_cv(s_new, "Species", folds = 5L, seed = 42L)
  m_new <- ml_fit(c_new, "Species", seed = 42L)

  # Both should produce scores_
  expect_false(is.null(m_old$scores_))
  expect_false(is.null(m_new$scores_))

  # Both should have the same metric names
  expect_equal(sort(names(m_old$scores_)), sort(names(m_new$scores_)))
})