# W3 User Persona Parity — canonical end-to-end workflows
# Mirrors the canonical Python API section exactly (same verb sequence).
# Max 20 tests.

library(ml)

# ── PERSONA A: BEGINNER — dev loop workflow ────────────────────────────────────
# Python:  s = ml.split(df, "target", seed=42)
#          model = ml.fit(s.train, "target", seed=42)
#          preds = ml.predict(model, s.valid)
#          metrics = ml.evaluate(model, s.valid)
#          imp = ml.explain(model)

test_that("W3-01: split → fit → predict → evaluate pipeline works end-to-end", {
  s      <- ml_split(iris, "Species", seed = 42L)
  model  <- ml_fit(s$train, "Species", seed = 42L)
  preds  <- ml_predict(model, s$valid)
  metrics <- ml_evaluate(model, s$valid)
  expect_true(inherits(s, "ml_split_result"))
  expect_true(inherits(model, "ml_model"))
  expect_equal(length(preds), nrow(s$valid))
  expect_true("accuracy" %in% names(metrics))
  expect_true(metrics[["accuracy"]] >= 0 && metrics[["accuracy"]] <= 1)
})

test_that("W3-02: explain() returns feature importance for default model", {
  s     <- ml_split(iris, "Species", seed = 42L)
  model <- ml_fit(s$train, "Species", algorithm = "random_forest", seed = 42L)
  imp   <- ml_explain(model)
  expect_true(inherits(imp, "ml_explanation"))
  df <- as.data.frame(imp)
  expect_true(nrow(df) > 0L)
  expect_true(all(c("feature", "importance") %in% names(df)))
  expect_true(all(df$importance >= 0))
})

# ── PERSONA B: PRACTITIONER — tuning workflow ──────────────────────────────────
# Python:  tuned = ml.tune(s.train, "target", algorithm="xgboost", seed=42)
#          stacked = ml.stack(s.train, "target", seed=42)
#          lb = ml.compare([tuned, model], s.valid)

test_that("W3-03: tune returns ml_tuning_result that can be used in compare", {
  s     <- ml_split(iris, "Species", seed = 42L)
  tuned <- ml_tune(s$train, "Species", algorithm = "logistic",
                   n_trials = 2L, seed = 42L)
  model <- ml_fit(s$train, "Species", algorithm = "logistic", seed = 42L)
  lb    <- ml_compare(list(tuned, model), s$valid)
  expect_true(inherits(lb, "ml_leaderboard"))
  expect_equal(nrow(lb), 2L)
})

test_that("W3-04: compare([tuned, model], data) is sorted by primary metric", {
  s     <- ml_split(iris, "Species", seed = 42L)
  m1    <- ml_fit(s$train, "Species", algorithm = "logistic", seed = 42L)
  m2    <- ml_fit(s$train, "Species", algorithm = "logistic", seed = 99L)
  lb    <- ml_compare(list(m1, m2), s$valid)
  df    <- as.data.frame(lb)
  # Should be sorted by accuracy (descending) for multiclass without roc_auc
  expect_equal(nrow(df), 2L)
  if (nrow(df) >= 2L && "accuracy" %in% names(df)) {
    expect_true(df$accuracy[1] >= df$accuracy[2] - 1e-6)
  }
})

# ── PERSONA C: PRODUCTION — finalize workflow ──────────────────────────────────
# Python:  final = ml.fit(s.dev, "target", seed=42)   ← retrain on train+valid
#          gate = ml.validate(final, test=s.test, rules={"accuracy": ">0.85"}, baseline=model)
#          verdict = ml.assess(final, test=s.test)

test_that("W3-05: fit on $dev (train+valid) works and produces valid model", {
  s     <- ml_split(iris, "Species", seed = 42L)
  final <- ml_fit(s$dev, "Species", seed = 42L)
  expect_true(inherits(final, "ml_model"))
  # final model should be trained on more data than train-only model
  expect_true(final$n_train >= nrow(s$train))
})

test_that("W3-06: validate passes when accuracy > 0.5 on iris logistic", {
  s     <- ml_split(iris, "Species", seed = 42L)
  model <- ml_fit(s$train, "Species", algorithm = "logistic", seed = 42L)
  gate  <- ml_validate(model, test = s$test,
                        rules = list(accuracy = ">0.5"))
  expect_true(inherits(gate, "ml_validate_result"))
  expect_true(gate$passed, info = paste("Validate failed. failures:",
                                         paste(gate$failures, collapse="; ")))
})

test_that("W3-07: validate fails when impossible threshold set (accuracy > 0.9999)", {
  s     <- ml_split(iris, "Species", seed = 42L)
  model <- ml_fit(s$train, "Species", algorithm = "logistic", seed = 42L)
  gate  <- ml_validate(model, test = s$test,
                        rules = list(accuracy = ">0.9999"))
  # May pass or fail — just check structure is correct
  expect_true(is.logical(gate$passed))
  expect_true(is.character(gate$failures))
})

test_that("W3-08: assess() returns ml_metrics (the 'do once' final exam)", {
  s       <- ml_split(iris, "Species", seed = 42L)
  model   <- ml_fit(s$train, "Species", algorithm = "logistic", seed = 42L)
  verdict <- ml_assess(model, test = s$test)
  expect_true(inherits(verdict, "ml_evidence"))
  expect_true("accuracy" %in% names(verdict))
})

test_that("W3-09: assess() errors on second call (Python: raises ModelError)", {
  s     <- ml_split(iris, "Species", seed = 42L)
  model <- ml_fit(s$train, "Species", algorithm = "logistic", seed = 42L)
  ml_assess(model, test = s$test)  # first call — OK
  expect_error(
    ml_assess(model, test = s$test),  # second call — must error
    regexp = "peeking|times|repeated",
    ignore.case = TRUE
  )
})

# ── PERSONA D: MONITORING — drift + shelf workflow ─────────────────────────────
# Python:  result = ml.drift(reference=s.train, new=new_customers)
#          result = ml.shelf(model, new=labeled_batch, target="churn")

test_that("W3-10: drift detects no drift on same distribution", {
  s   <- ml_split(iris, "Species", seed = 42L)
  res <- ml_drift(reference = s$train, new = s$valid, target = "Species")
  expect_true(inherits(res, "ml_drift_result"))
  expect_true(is.logical(res$shifted))
  # Same underlying distribution → likely no drift
  expect_false(res$shifted,
               info = paste("Unexpected drift detected. severity:", res$severity))
})

test_that("W3-11: drift detects obvious drift (large mean shift)", {
  s      <- ml_split(iris, "Species", seed = 42L)
  new_df <- s$test
  new_df$Sepal.Length <- new_df$Sepal.Length + 10  # massive shift
  new_df$Sepal.Width  <- new_df$Sepal.Width  + 10
  res <- ml_drift(reference = s$train, new = new_df, target = "Species")
  expect_true(res$shifted, info = "Expected drift to be detected with +10 shift")
  expect_true(length(res$features_shifted) >= 2L)
})

test_that("W3-12: shelf() works end-to-end on CV model", {
  cv    <- ml_split(iris, "Species", seed = 42L, folds = 3L)
  model <- ml_fit(cv, "Species", algorithm = "logistic", seed = 42L)
  new_batch <- iris[sample(nrow(iris), 40L), ]
  result <- ml_shelf(model, new = new_batch, target = "Species")
  expect_true(inherits(result, "ml_shelf_result"))
  expect_true(is.logical(result$fresh))
  expect_true(is.character(result$recommendation))
})

# ── PERSONA E: SCREENING — algorithm discovery ─────────────────────────────────
# Python:  leaderboard = ml.screen(s, "target")

test_that("W3-13: screen returns ranked leaderboard", {
  s  <- ml_split(iris, "Species", seed = 42L)
  lb <- ml_screen(s, "Species", seed = 42L,
                   algorithms = c("logistic", "random_forest"))
  expect_true(inherits(lb, "ml_leaderboard"))
  expect_true(nrow(lb) >= 1L)
  expect_true("algorithm" %in% names(lb))
})

test_that("W3-14: screen $best_model is an ml_model", {
  s  <- ml_split(iris, "Species", seed = 42L)
  lb <- ml_screen(s, "Species", seed = 42L, algorithms = c("logistic"))
  # Python: lb.best_model returns the top Model
  # R: lb$best_model should return the top ml_model
  bm <- lb$best_model
  if (!is.null(bm)) {
    expect_true(inherits(bm, "ml_model"))
  }
})

# ── PERSONA F: REGRESSION workflow ─────────────────────────────────────────────

test_that("W3-15: regression workflow — split, fit, evaluate, assess all work", {
  s       <- ml_split(mtcars, "mpg", seed = 42L)
  model   <- ml_fit(s$train, "mpg", algorithm = "random_forest", seed = 42L)
  metrics <- ml_evaluate(model, s$valid)
  verdict <- ml_assess(model, test = s$test)
  expect_true("rmse" %in% names(metrics))
  expect_true("r2"   %in% names(metrics))
  expect_true(metrics[["rmse"]] > 0)
  expect_true(inherits(verdict, "ml_evidence"))
})

test_that("W3-16: regression validate with rmse threshold works", {
  s     <- ml_split(mtcars, "mpg", seed = 42L)
  model <- ml_fit(s$train, "mpg", algorithm = "random_forest", seed = 42L)
  gate  <- ml_validate(model, test = s$test,
                        rules = list(r2 = "> -1.0"))  # always passes
  expect_true(gate$passed,
              info = paste("Validation failed. failures:",
                            paste(gate$failures, collapse="; ")))
})

# ── PERSONA G: SAVE / LOAD WORKFLOW ──────────────────────────────────────────

test_that("W3-17: save and load round-trip preserves predictions", {
  s     <- ml_split(iris, "Species", seed = 42L)
  model <- ml_fit(s$train, "Species", algorithm = "logistic", seed = 42L)
  tmp   <- tempfile(fileext = ".mlr")
  on.exit(unlink(tmp), add = TRUE)
  ml_save(model, tmp)
  loaded <- ml_load(tmp)
  expect_true(inherits(loaded, "ml_model"))
  preds_orig   <- ml_predict(model, s$valid)
  preds_loaded <- ml_predict(loaded, s$valid)
  expect_equal(preds_orig, preds_loaded)
})

# ── PERSONA H: VALIDATE WITH BASELINE ────────────────────────────────────────

test_that("W3-18: validate with baseline detects regression", {
  s        <- ml_split(iris, "Species", seed = 42L)
  old_model <- ml_fit(s$train, "Species", algorithm = "logistic", seed = 42L)
  new_model <- ml_fit(s$train, "Species", algorithm = "logistic", seed = 42L)
  gate <- ml_validate(new_model, test = s$test, baseline = old_model, tolerance = 0.5)
  # With wide tolerance, should pass (same model architecture)
  expect_true(is.logical(gate$passed))
  # Should have baseline_metrics populated
  if (!is.null(gate$baseline_metrics)) {
    expect_true(length(gate$baseline_metrics) > 0L)
  }
})

# ── PERSONA I: PREDICT API ────────────────────────────────────────────────────

test_that("W3-19: predict returns same-length vector as input rows", {
  s     <- ml_split(iris, "Species", seed = 42L)
  model <- ml_fit(s$train, "Species", algorithm = "logistic", seed = 42L)
  preds <- ml_predict(model, s$test)
  expect_equal(length(preds), nrow(s$test))
  # All predictions should be one of the training classes
  valid_classes <- unique(iris$Species)
  expect_true(all(preds %in% as.character(valid_classes)))
})

# ── PERSONA J: MODULE-STYLE ACCESS ($) ────────────────────────────────────────

test_that("W3-20: ml$fit() module style produces same result as ml_fit()", {
  # Python: import ml; ml.fit(...)
  # R: ml$fit(...) — module-style alternative
  s  <- ml_split(iris, "Species", seed = 42L)
  m1 <- ml_fit(s$train, "Species", algorithm = "logistic", seed = 42L)
  m2 <- ml$fit(s$train, "Species", algorithm = "logistic", seed = 42L)
  expect_equal(m1$algorithm, m2$algorithm)
  expect_equal(m1$task,      m2$task)
  expect_equal(m1$target,    m2$target)
})