test_that("nc_backtest returns nowcast_backtest", {
  d <- make_bridge_data(n_quarters = 30, seed = 42)
  bt <- nc_backtest(target ~ ind_1 + ind_2, data = d, start = 15, ar_order = 0)

  expect_s3_class(bt, "nowcast_backtest")
  expect_equal(bt$method, "bridge")
  expect_equal(bt$window_type, "expanding")
})

test_that("nc_backtest results have correct structure", {
  d <- make_bridge_data(n_quarters = 30, seed = 42)
  bt <- nc_backtest(target ~ ind_1, data = d, start = 15)

  expect_true(all(c("date", "nowcast", "actual", "error", "ci_lower", "ci_upper")
                  %in% names(bt$results)))
  expect_true(nrow(bt$results) > 0)
  expect_equal(bt$results$error, bt$results$nowcast - bt$results$actual,
               tolerance = 1e-10)
})

test_that("nc_backtest metrics are consistent with nc_evaluate", {
  d <- make_bridge_data(n_quarters = 30, seed = 42)
  bt <- nc_backtest(target ~ ind_1 + ind_2, data = d, start = 15, ar_order = 0)

  manual <- nc_evaluate(bt$results$nowcast, bt$results$actual)
  expect_equal(bt$metrics$rmse, manual$rmse, tolerance = 1e-10)
  expect_equal(bt$metrics$mae, manual$mae, tolerance = 1e-10)
  expect_equal(bt$metrics$bias, manual$bias, tolerance = 1e-10)
})

test_that("nc_backtest expanding window uses more data over time", {
  d <- make_bridge_data(n_quarters = 30, seed = 42)
  bt <- nc_backtest(target ~ ind_1, data = d, start = 10)
  expect_true(nrow(bt$results) >= 10)
})

test_that("nc_backtest rolling window works", {
  d <- make_bridge_data(n_quarters = 40, seed = 42)
  bt <- nc_backtest(target ~ ind_1, data = d, start = 15,
                    window = "rolling", window_size = 15)

  expect_s3_class(bt, "nowcast_backtest")
  expect_equal(bt$window_type, "rolling")
  expect_true(nrow(bt$results) > 0)
})

test_that("nc_backtest RMSE beats naive benchmark for well-specified DGP", {
  d <- make_bridge_data(n_quarters = 100, beta = c(0.8, 0.5), seed = 42)
  bt <- nc_backtest(target ~ ind_1 + ind_2, data = d, start = 30, ar_order = 0)

  naive_errors <- bt$results$actual - mean(d$target[1:30])
  naive_rmse <- sqrt(mean(naive_errors^2))

  expect_true(bt$metrics$rmse < naive_rmse)
})

test_that("nc_backtest errors when start >= n", {
  d <- make_bridge_data(n_quarters = 10, seed = 42)
  expect_error(nc_backtest(target ~ ind_1, data = d, start = 10), "less than")
})

test_that("nc_backtest works with nc_dataset", {
  target <- data.frame(
    date = seq(as.Date("2015-01-01"), by = "quarter", length.out = 30),
    value = rnorm(30)
  )
  ind <- data.frame(
    date = seq(as.Date("2015-01-01"), by = "month", length.out = 90),
    value = rnorm(90)
  )
  ds <- nc_align(target, ind1 = ind)
  bt <- nc_backtest(target ~ ind1, data = ds, start = 15)
  expect_s3_class(bt, "nowcast_backtest")
})

test_that("nc_backtest handles custom alpha", {
  d <- make_bridge_data(n_quarters = 30, seed = 42)
  bt <- nc_backtest(target ~ ind_1, data = d, start = 15, alpha = 0.01)

  widths <- bt$results$ci_upper - bt$results$ci_lower
  expect_true(all(widths > 0))
})

# --- Prediction interval correctness (audit issue #10) ---

test_that("nc_backtest prediction intervals are wider than naive sigma*z", {
  d <- make_bridge_data(n_quarters = 40, seed = 42)
  bt <- nc_backtest(target ~ ind_1 + ind_2, data = d, start = 20, ar_order = 0)

  # All CI widths should be positive
  widths <- bt$results$ci_upper - bt$results$ci_lower
  expect_true(all(widths > 0))
})

# --- No look-ahead bias test ---

test_that("nc_backtest does not use future data", {
  # Create data where target depends on a lagged indicator
  set.seed(42)
  n <- 40
  d <- data.frame(
    date = seq(as.Date("2015-01-01"), by = "quarter", length.out = n),
    ind = rnorm(n)
  )
  d$target <- c(NA, d$ind[-n]) + rnorm(n, sd = 0.1)  # target[t] = ind[t-1]
  d$target[1] <- rnorm(1)

  bt <- nc_backtest(target ~ ind, data = d, start = 15, ar_order = 0)

  # The first evaluation should be for a date AFTER the start-th observation
  expect_true(bt$results$date[1] > d$date[15])
})

# --- AR terms in backtest ---

test_that("nc_backtest with ar_order includes AR terms", {
  d <- make_bridge_data(n_quarters = 40, seed = 42)
  bt_ar0 <- nc_backtest(target ~ ind_1, data = d, start = 15, ar_order = 0)
  bt_ar1 <- nc_backtest(target ~ ind_1, data = d, start = 15, ar_order = 1)

  # Both should produce results

  expect_s3_class(bt_ar0, "nowcast_backtest")
  expect_s3_class(bt_ar1, "nowcast_backtest")
  # AR model may have fewer evaluations due to lag creation
  expect_true(nrow(bt_ar1$results) > 0)
})

# --- AR lag correctness test (audit phase 2) ---

test_that("nc_backtest AR lags use correct (past) target values", {
  # Create data with a known pattern so we can verify AR values
  set.seed(42)
  n <- 25
  d <- data.frame(
    date = seq(as.Date("2015-01-01"), by = "quarter", length.out = n),
    ind = rnorm(n),
    target = 1:n * 0.1  # deterministic so we can trace values
  )

  bt <- nc_backtest(target ~ ind, data = d, start = 15, ar_order = 1)

  # The first backtest evaluation is for period 16 (start=15, so train on 1:15)
  # The AR(1) lag for period 16 should be target[15]
  # We can't directly inspect the AR values used, but we can verify the
  # nowcast is finite and the error is reasonable
  expect_s3_class(bt, "nowcast_backtest")
  expect_true(all(is.finite(bt$results$nowcast)))
  # All evaluation dates should be strictly after the start-th date
  expect_true(all(bt$results$date > d$date[15]))
})

# --- Prediction interval width test (audit phase 2) ---

test_that("nc_backtest early windows have wider CIs than later windows", {
  d <- make_bridge_data(n_quarters = 60, seed = 42)
  bt <- nc_backtest(target ~ ind_1 + ind_2, data = d, start = 10, ar_order = 0)

  widths <- bt$results$ci_upper - bt$results$ci_lower
  n_eval <- length(widths)
  # Average width of first third should be >= average width of last third
  early_mean <- mean(widths[1:floor(n_eval / 3)])
  late_mean <- mean(widths[ceiling(2 * n_eval / 3):n_eval])
  expect_true(early_mean >= late_mean * 0.9)  # allow small tolerance
})