test_that("errors for non-numeric ps", {
  expect_propensity_error(
    ps_calibrate("not numeric", c(0, 1))
  )
})

test_that("errors for out-of-range ps", {
  expect_propensity_error(
    ps_calibrate(c(-0.1, 0.2), c(0, 1))
  )
  expect_propensity_error(
    ps_calibrate(c(0.5, 1.1), c(0, 1))
  )
})

test_that("errors when ps and .exposure have different lengths", {
  expect_propensity_error(
    ps_calibrate(runif(5), rep(0:1, length.out = 6))
  )
})

test_that("returns a ps_calib object of correct length and range", {
  set.seed(42)
  ps <- rep(0.5, 100)
  treat <- rbinom(100, 1, 0.3)

  out <- ps_calibrate(ps, treat)

  expect_s3_class(out, "ps_calib")
  expect_length(out, 100)
  expect_true(all(as.numeric(out) >= 0 & as.numeric(out) <= 1))
})

test_that("constant ps yields calibrated = observed prevalence", {
  ps <- rep(0.5, 20)
  treat <- rep(c(0, 1), each = 10) # prevalence = 0.5

  out <- ps_calibrate(ps, treat)
  # all values should equal the 0.5 prevalence
  expect_equal(unique(as.numeric(out)), 0.5)
})

test_that("calibration metadata is properly stored", {
  ps <- runif(10)
  treat <- rbinom(10, 1, ps)

  # Test logistic calibration with smooth = TRUE
  out_smooth <- ps_calibrate(ps, treat, method = "logistic", smooth = TRUE)
  meta_smooth <- ps_calib_meta(out_smooth)
  expect_equal(meta_smooth$method, "logistic")
  expect_true(meta_smooth$smooth)

  # Test logistic calibration with smooth = FALSE
  out_simple <- ps_calibrate(ps, treat, method = "logistic", smooth = FALSE)
  meta_simple <- ps_calib_meta(out_simple)
  expect_equal(meta_simple$method, "logistic")
  expect_false(meta_simple$smooth)

  # Test isotonic regression
  out_iso <- ps_calibrate(ps, treat, method = "isoreg")
  meta_iso <- ps_calib_meta(out_iso)
  expect_equal(meta_iso$method, "isoreg")
  expect_false(meta_iso$smooth)
})

test_that("calibration changes the distribution", {
  set.seed(123)
  n <- 1000
  # Generate miscalibrated propensity scores
  true_ps <- runif(n, 0.2, 0.8)
  # Add systematic bias
  obs_ps <- plogis(qlogis(true_ps) + 0.5)
  treat <- rbinom(n, 1, true_ps)

  # Calibrate
  calibrated_ps <- ps_calibrate(obs_ps, treat)

  # Check that calibration changed the values
  expect_false(identical(as.numeric(obs_ps), as.numeric(calibrated_ps)))

  # Check that all values are valid probabilities
  expect_true(all(
    as.numeric(calibrated_ps) >= 0 & as.numeric(calibrated_ps) <= 1
  ))

  # In this specific case with systematic bias, check if calibration helps
  # Calculate mean calibration error
  obs_bins <- cut(obs_ps, breaks = seq(0, 1, by = 0.1), include.lowest = TRUE)
  calib_bins <- cut(
    calibrated_ps,
    breaks = seq(0, 1, by = 0.1),
    include.lowest = TRUE
  )

  obs_calib_error <- abs(
    tapply(treat, obs_bins, mean, na.rm = TRUE) -
      tapply(obs_ps, obs_bins, mean, na.rm = TRUE)
  )
  calib_calib_error <- abs(
    tapply(treat, calib_bins, mean, na.rm = TRUE) -
      tapply(calibrated_ps, calib_bins, mean, na.rm = TRUE)
  )

  # Average calibration error should generally be reduced
  expect_true(
    mean(calib_calib_error, na.rm = TRUE) <=
      mean(obs_calib_error, na.rm = TRUE) + 0.1
  ) # Allow some tolerance
})

test_that("handles edge cases with extreme propensity scores", {
  # Near 0 and 1 values
  ps <- c(0.001, 0.01, 0.1, 0.5, 0.9, 0.99, 0.999)
  treat <- c(0, 0, 0, 1, 1, 1, 1)

  expect_no_error(suppressWarnings(calibrated <- ps_calibrate(ps, treat)))
  expect_true(all(as.numeric(calibrated) >= 0 & as.numeric(calibrated) <= 1))
})

test_that("works with different treatment codings", {
  set.seed(456)
  ps <- runif(50)
  treat_num <- rbinom(50, 1, ps)

  # Test with different treatment encodings
  treat_char <- ifelse(treat_num == 1, "treated", "control")
  treat_factor <- factor(treat_char)
  treat_logical <- as.logical(treat_num)

  calib1 <- ps_calibrate(ps, treat_num)
  calib2 <- ps_calibrate(
    ps,
    treat_char,
    .focal_level = "treated",
    .reference_level = "control"
  )
  calib3 <- ps_calibrate(
    ps,
    treat_factor,
    .focal_level = "treated",
    .reference_level = "control"
  )
  calib4 <- ps_calibrate(ps, treat_logical)

  # All should produce the same result
  expect_equal(as.numeric(calib1), as.numeric(calib2))
  expect_equal(as.numeric(calib1), as.numeric(calib3))
  expect_equal(as.numeric(calib1), as.numeric(calib4))
})

test_that(".focal_level and .reference_level parameters work consistently with package patterns", {
  set.seed(123)
  ps <- runif(30, 0.3, 0.7)

  # Test automatic detection with 0/1 coding
  treat_01 <- rbinom(30, 1, ps)
  calib_auto <- ps_calibrate(ps, treat_01)
  expect_s3_class(calib_auto, "ps_calib")

  # Test explicit specification with 0/1 coding
  calib_explicit <- ps_calibrate(
    ps,
    treat_01,
    .focal_level = 1,
    .reference_level = 0
  )
  expect_equal(as.numeric(calib_auto), as.numeric(calib_explicit))

  # Test with character coding
  treat_char <- ifelse(treat_01 == 1, "treat", "control")
  calib_char_explicit <- ps_calibrate(
    ps,
    treat_char,
    .focal_level = "treat",
    .reference_level = "control"
  )
  expect_equal(as.numeric(calib_auto), as.numeric(calib_char_explicit))

  # Test automatic detection with factor
  treat_factor <- factor(treat_char, levels = c("control", "treat"))
  calib_factor_auto <- ps_calibrate(ps, treat_factor)
  expect_equal(as.numeric(calib_auto), as.numeric(calib_factor_auto))

  # Test with logical coding (should be automatic)
  treat_logical <- as.logical(treat_01)
  calib_logical <- ps_calibrate(ps, treat_logical)
  expect_equal(as.numeric(calib_auto), as.numeric(calib_logical))
})

test_that(".focal_level/.reference_level defaults are NULL like other package functions", {
  # Check that the defaults match the package pattern
  ps_calibrate_formals <- formals(ps_calibrate)
  expect_null(ps_calibrate_formals$.focal_level)
  expect_null(ps_calibrate_formals$.reference_level)

  # Compare with other weight functions to ensure consistency
  wt_ate_formals <- formals(wt_ate)
  expect_equal(ps_calibrate_formals$.focal_level, wt_ate_formals$.focal_level)
  expect_equal(
    ps_calibrate_formals$.reference_level,
    wt_ate_formals$.reference_level
  )
})

test_that("automatic treatment detection works with binary vectors", {
  set.seed(789)
  ps <- runif(40, 0.2, 0.8)

  # Test with different binary representations
  treat_01 <- rbinom(40, 1, ps)
  treat_12 <- treat_01 + 1 # 1/2 coding
  treat_neg <- ifelse(treat_01 == 1, 1, -1) # -1/1 coding

  # All should work with automatic detection
  calib_01 <- ps_calibrate(ps, treat_01)
  expect_s3_class(calib_01, "ps_calib")

  # These require explicit specification
  calib_12 <- ps_calibrate(ps, treat_12, .focal_level = 2, .reference_level = 1)
  calib_neg <- ps_calibrate(
    ps,
    treat_neg,
    .focal_level = 1,
    .reference_level = -1
  )

  # All should produce valid results
  expect_true(all(as.numeric(calib_01) >= 0 & as.numeric(calib_01) <= 1))
  expect_true(all(as.numeric(calib_12) >= 0 & as.numeric(calib_12) <= 1))
  expect_true(all(as.numeric(calib_neg) >= 0 & as.numeric(calib_neg) <= 1))
})

test_that("error handling for ambiguous treatment coding", {
  set.seed(456)
  ps <- runif(20, 0.3, 0.7)

  # Three-level factor should require explicit specification
  treat_three <- factor(sample(c("A", "B", "C"), 20, replace = TRUE))
  expect_propensity_error(
    ps_calibrate(ps, treat_three)
  )

  # Should work with explicit specification
  treat_binary_from_three <- ifelse(treat_three == "A", 1, 0)
  expect_no_error(
    ps_calibrate(ps, treat_binary_from_three)
  )
})

test_that("is_ps_calibrated works correctly", {
  ps <- runif(20)
  treat <- rbinom(20, 1, ps)

  # Regular numeric vector
  expect_false(is_ps_calibrated(ps))

  # Uncalibrated psw object
  ps_wt <- psw(ps, estimand = "ate")
  expect_false(is_ps_calibrated(ps_wt))

  # Calibrated psw object
  calibrated <- ps_calibrate(ps, treat)
  expect_true(is_ps_calibrated(calibrated))
})

test_that("errors when trying to calibrate already calibrated ps", {
  ps <- runif(20)
  treat <- rbinom(20, 1, ps)

  calibrated <- ps_calibrate(ps, treat)

  expect_propensity_error(
    ps_calibrate(calibrated, treat)
  )
})

test_that("handles NA values appropriately", {
  ps <- c(0.1, 0.3, NA, 0.7, 0.9)
  treat <- c(0, 0, 1, 1, 1)

  # Should preserve NAs in output
  suppressWarnings(calibrated <- ps_calibrate(ps, treat))
  expect_length(calibrated, 5)
  expect_true(is.na(calibrated[3]))
  expect_s3_class(calibrated, "ps_calib")

  # Test with isotonic regression too
  calibrated_iso <- ps_calibrate(ps, treat, method = "isoreg")
  expect_length(calibrated_iso, 5)
  expect_true(is.na(calibrated_iso[3]))
})

test_that("isotonic regression calibration works", {
  set.seed(789)
  ps <- runif(100, 0.1, 0.9)
  treat <- rbinom(100, 1, ps)

  # Should not error
  calibrated_iso <- ps_calibrate(ps, treat, method = "isoreg")

  expect_s3_class(calibrated_iso, "ps_calib")
  expect_length(calibrated_iso, 100)
  expect_true(all(
    as.numeric(calibrated_iso) >= 0 & as.numeric(calibrated_iso) <= 1
  ))
  expect_true(is_ps_calibrated(calibrated_iso))
})

test_that("isotonic regression preserves monotonicity", {
  # Create data where treatment probability increases with ps
  set.seed(456)
  n <- 200
  ps <- seq(0.1, 0.9, length.out = n)
  # Add some noise but maintain overall trend
  treat <- rbinom(n, 1, ps + rnorm(n, 0, 0.05))

  calibrated_iso <- ps_calibrate(ps, treat, method = "isoreg")

  # Check that calibrated scores are monotonic (allowing for ties)
  diffs <- diff(as.numeric(calibrated_iso))
  expect_true(all(diffs >= -1e-10)) # Allow for numerical tolerance
})

test_that("method parameter validation works", {
  ps <- runif(20)
  treat <- rbinom(20, 1, ps)

  # Invalid method should error
  expect_propensity_error(
    ps_calibrate(ps, treat, method = "invalid")
  )
})

test_that("isotonic and logistic calibration can differ", {
  set.seed(123)
  # Create data where isotonic might perform differently
  ps <- c(rep(0.2, 50), rep(0.8, 50))
  treat <- c(rbinom(50, 1, 0.3), rbinom(50, 1, 0.7))

  calib_logistic <- ps_calibrate(ps, treat, method = "logistic")
  calib_iso <- ps_calibrate(ps, treat, method = "isoreg")

  # They should produce different results in general
  expect_false(identical(as.numeric(calib_logistic), as.numeric(calib_iso)))
})

test_that("isotonic calibration matches WeightIt isoreg exactly", {
  skip_if_not_installed("WeightIt")

  set.seed(321)
  ps <- runif(100, 0.2, 0.8)
  treat <- rbinom(100, 1, ps)

  our_iso <- ps_calibrate(ps, treat, method = "isoreg")
  weightit_iso <- WeightIt::calibrate(ps, treat, method = "isoreg")

  expect_equal(as.numeric(our_iso), as.numeric(weightit_iso), tolerance = 1e-10)
})

test_that("isotonic regression preserves monotonicity better than logistic", {
  set.seed(888)
  # Create data where isotonic should perform better
  n <- 100
  ps <- seq(0.1, 0.9, length.out = n)
  # Non-linear relationship that violates logistic assumption
  true_prob <- 0.2 + 0.6 * ps^2
  treat <- rbinom(n, 1, true_prob)

  logistic_calib <- ps_calibrate(ps, treat, method = "logistic")
  iso_calib <- ps_calibrate(ps, treat, method = "isoreg")

  # Check isotonic preserves monotonicity
  iso_diffs <- diff(as.numeric(iso_calib))
  expect_true(all(iso_diffs >= -1e-10)) # Allow for numerical tolerance

  # Both should be different from original
  expect_false(identical(as.numeric(ps), as.numeric(logistic_calib)))
  expect_false(identical(as.numeric(ps), as.numeric(iso_calib)))

  # They should produce different results for non-linear data
  expect_false(identical(as.numeric(logistic_calib), as.numeric(iso_calib)))
})

test_that("isotonic regression handles various cases like WeightIt", {
  skip_if_not_installed("WeightIt")

  # Test with ties in propensity scores (but ensure sufficient data)
  set.seed(999)
  ps_ties <- rep(c(0.3, 0.7), each = 4)
  treat_ties <- c(0, 0, 1, 1, 0, 1, 1, 1)

  our_iso_ties <- ps_calibrate(ps_ties, treat_ties, method = "isoreg")
  weightit_iso_ties <- WeightIt::calibrate(
    ps_ties,
    treat_ties,
    method = "isoreg"
  )

  expect_equal(
    as.numeric(our_iso_ties),
    as.numeric(weightit_iso_ties),
    tolerance = 1e-10
  )

  # Test that our implementation handles edge cases gracefully even if WeightIt fails
  ps_extreme <- c(0.001, 0.01, 0.1, 0.5, 0.9, 0.99, 0.999)
  treat_extreme <- c(0, 0, 0, 1, 1, 1, 1)

  # Our implementation should handle this without error
  expect_no_error(
    our_extreme <- ps_calibrate(ps_extreme, treat_extreme, method = "isoreg")
  )
  expect_true(all(as.numeric(our_extreme) >= 0 & as.numeric(our_extreme) <= 1))
  expect_true(all(diff(as.numeric(our_extreme)) >= -1e-10)) # Monotonic
})

test_that("smooth parameter works correctly for logistic calibration", {
  skip_if_not_installed("mgcv")

  set.seed(42)
  ps <- runif(50, 0.2, 0.8)
  treat <- rbinom(50, 1, ps)

  # Both smooth options should work
  calib_smooth <- ps_calibrate(ps, treat, smooth = TRUE)
  calib_simple <- ps_calibrate(ps, treat, smooth = FALSE)

  # Both should be psw objects
  expect_s3_class(calib_smooth, "ps_calib")
  expect_s3_class(calib_simple, "ps_calib")

  # Both should be in valid range
  expect_true(all(
    as.numeric(calib_smooth) >= 0 & as.numeric(calib_smooth) <= 1
  ))
  expect_true(all(
    as.numeric(calib_simple) >= 0 & as.numeric(calib_simple) <= 1
  ))

  # They should generally produce different results
  expect_false(identical(as.numeric(calib_smooth), as.numeric(calib_simple)))
})

test_that("smooth parameter is ignored for isotonic regression", {
  set.seed(123)
  ps <- runif(30, 0.1, 0.9)
  treat <- rbinom(30, 1, ps)

  # smooth should be ignored for isoreg
  iso_smooth_true <- ps_calibrate(ps, treat, method = "isoreg", smooth = TRUE)
  iso_smooth_false <- ps_calibrate(ps, treat, method = "isoreg", smooth = FALSE)

  # Should be identical since smooth is ignored for isoreg
  expect_equal(as.numeric(iso_smooth_true), as.numeric(iso_smooth_false))
})

# Cross-validation tests against WeightIt and the probably package

test_that("ps_calibrate with smooth=FALSE matches WeightIt::calibrate for logistic calibration", {
  skip_if_not_installed("WeightIt")

  set.seed(789)
  n <- 500
  # Create some realistic propensity scores with miscalibration
  X <- rnorm(n)
  true_ps <- plogis(0.5 * X)
  # Add miscalibration
  obs_ps <- plogis(qlogis(true_ps) + 0.3 + 0.2 * X)
  treat <- rbinom(n, 1, true_ps)

  # Our calibration with simple logistic (to match WeightIt)
  our_calib <- ps_calibrate(obs_ps, treat, smooth = FALSE)

  # WeightIt calibration (platt method is logistic calibration)
  weightit_calib <- WeightIt::calibrate(obs_ps, treat, method = "platt")

  # Should be very close (allowing for numerical differences)
  expect_equal(
    as.numeric(our_calib),
    as.numeric(weightit_calib),
    tolerance = 1e-10
  )
})

test_that("ps_calibrate handles different treatment encodings like WeightIt", {
  skip_if_not_installed("WeightIt")

  set.seed(321)
  ps <- runif(100, 0.2, 0.8)
  treat_num <- rbinom(100, 1, ps)
  treat_char <- ifelse(treat_num == 1, "T", "C")

  # Our calibration with character treatment (use smooth=FALSE to match WeightIt)
  our_calib <- ps_calibrate(
    ps,
    treat_char,
    .focal_level = "T",
    .reference_level = "C",
    smooth = FALSE
  )

  # WeightIt with numeric treatment
  weightit_calib <- WeightIt::calibrate(ps, treat_num, method = "platt")

  expect_equal(
    as.numeric(our_calib),
    as.numeric(weightit_calib),
    tolerance = 1e-10
  )
})

test_that("compare calibration performance metrics with WeightIt", {
  skip_if_not_installed("WeightIt")

  set.seed(456)
  n <- 1000
  # Generate data with known miscalibration
  X1 <- rnorm(n)
  X2 <- rbinom(n, 1, 0.5)
  true_ps <- plogis(-1 + 0.5 * X1 + X2)
  # Observed PS with systematic bias
  obs_ps <- plogis(qlogis(true_ps) + 0.5)
  treat <- rbinom(n, 1, true_ps)

  # Calibrate with both methods (use smooth=FALSE to match WeightIt)
  our_calib <- ps_calibrate(obs_ps, treat, smooth = FALSE)
  weightit_calib <- WeightIt::calibrate(obs_ps, treat, method = "platt")

  # Check they produce identical results
  expect_equal(
    as.numeric(our_calib),
    as.numeric(weightit_calib),
    tolerance = 1e-10
  )

  # Test that calibration changes the distribution
  # (calibration doesn't always improve slope toward 1)
  expect_false(identical(as.numeric(obs_ps), as.numeric(our_calib)))
})

test_that("ps_calibrate produces similar results to probably package", {
  skip_if_not_installed("probably")
  skip_if_not_installed("tidyselect")

  set.seed(654)
  n <- 200
  # Create miscalibrated probabilities
  true_ps <- runif(n, 0.2, 0.8)
  obs_ps <- plogis(qlogis(true_ps) + 0.3) # Add miscalibration
  treat <- rbinom(n, 1, true_ps) # Use true ps for treatment

  # Our calibration
  our_calib <- ps_calibrate(obs_ps, treat)

  # probably calibration
  df <- data.frame(
    treat = factor(treat, levels = c("0", "1")),
    .pred_0 = 1 - obs_ps,
    .pred_1 = obs_ps
  )

  suppressWarnings({
    cal_data <- probably::cal_estimate_logistic(
      df,
      truth = treat,
      estimate = tidyselect::starts_with(".pred_")
    )
    df_cal <- probably::cal_apply(df, cal_data)
  })

  # Compare calibrated probabilities
  prob_calib <- df_cal$.pred_1

  # Both should be in valid range
  expect_true(all(as.numeric(our_calib) >= 0 & as.numeric(our_calib) <= 1))
  expect_true(all(prob_calib >= 0 & prob_calib <= 1))

  # Helper function to calculate calibration error using binned approach
  calc_calib_error <- function(pred_probs, true_outcomes, n_bins = 5) {
    bins <- cut(pred_probs, breaks = n_bins, include.lowest = TRUE)
    bin_means_true <- tapply(true_outcomes, bins, mean, na.rm = TRUE)
    bin_means_pred <- tapply(pred_probs, bins, mean, na.rm = TRUE)
    mean(abs(bin_means_true - bin_means_pred), na.rm = TRUE)
  }

  # Calculate calibration errors for comparison
  orig_error <- calc_calib_error(obs_ps, treat)
  our_error <- calc_calib_error(as.numeric(our_calib), treat)
  prob_error <- calc_calib_error(prob_calib, treat)

  # Both should reduce calibration error compared to original
  expect_true(our_error <= orig_error + 0.05) # Allow small tolerance
  expect_true(prob_error <= orig_error + 0.05)

  # The correlation between our calibration and probably's should be high
  expect_true(cor(as.numeric(our_calib), prob_calib) > 0.8)
})

test_that("ps_calibrate with smooth=TRUE matches probably's default behavior exactly", {
  skip_if_not_installed("probably")
  skip_if_not_installed("tidyselect")
  skip_if_not_installed("mgcv")

  set.seed(123)
  n <- 100
  ps <- runif(n, 0.1, 0.9)
  treat <- rbinom(n, 1, ps)

  # Our smoothed calibration (default smooth=TRUE)
  our_smooth <- ps_calibrate(ps, treat, smooth = TRUE)

  # probably calibration with smooth=TRUE (default)
  df <- data.frame(
    treat = factor(treat, levels = c("0", "1")),
    .pred_0 = 1 - ps,
    .pred_1 = ps
  )

  suppressWarnings({
    cal_data <- probably::cal_estimate_logistic(
      df,
      truth = treat,
      estimate = tidyselect::starts_with(".pred_"),
      smooth = TRUE
    )
    df_cal <- probably::cal_apply(df, cal_data)
  })

  prob_smooth <- df_cal$.pred_1

  # Should be very close (allowing for numerical differences in GAM fitting)
  # Use a more reasonable tolerance for GAM differences and ensure both are vectors
  expect_equal(
    as.numeric(our_smooth),
    as.numeric(prob_smooth),
    tolerance = 1e-3
  )
})

test_that("ps_calibrate with smooth=FALSE matches probably's simple logistic exactly", {
  skip_if_not_installed("probably")
  skip_if_not_installed("tidyselect")

  set.seed(456)
  n <- 100
  ps <- runif(n, 0.1, 0.9)
  treat <- rbinom(n, 1, ps)

  # Our simple logistic calibration
  our_simple <- ps_calibrate(ps, treat, smooth = FALSE)

  # probably calibration with smooth=FALSE
  df <- data.frame(
    treat = factor(treat, levels = c("0", "1")),
    .pred_0 = 1 - ps,
    .pred_1 = ps
  )

  suppressWarnings({
    cal_data <- probably::cal_estimate_logistic(
      df,
      truth = treat,
      estimate = tidyselect::starts_with(".pred_"),
      smooth = FALSE
    )
    df_cal <- probably::cal_apply(df, cal_data)
  })

  prob_simple <- df_cal$.pred_1

  # Should be identical for simple logistic regression
  expect_equal(as.numeric(our_simple), prob_simple, tolerance = 1e-10)
})

test_that("extreme values handled consistently with WeightIt", {
  skip_if_not_installed("WeightIt")

  # Test with extreme propensity scores
  ps <- c(0.001, 0.01, 0.1, 0.5, 0.9, 0.99, 0.999)
  treat <- c(0, 0, 0, 1, 1, 1, 1)

  suppressWarnings({
    our_calib <- ps_calibrate(ps, treat)
    weightit_calib <- WeightIt::calibrate(ps, treat, method = "platt")
  })

  # Both should handle extreme values similarly
  expect_equal(
    as.numeric(our_calib),
    as.numeric(weightit_calib),
    tolerance = 1e-10
  )
})

# Standalone pava_weighted() tests ------------------------------------------

test_that("pava_weighted returns input unchanged when already non-decreasing", {
  y <- c(0.1, 0.3, 0.5, 0.8, 1.0)
  x <- seq_along(y)
  result <- pava_weighted(x, y)
  expect_equal(result, y)
})

test_that("pava_weighted merges violating pairs", {
  # y = c(0, 1, 0, 1): middle pair violates, should merge to 0.5
  result <- pava_weighted(1:4, c(0, 1, 0, 1))
  expect_equal(result, c(0, 0.5, 0.5, 1))
})

test_that("pava_weighted handles all-constant y", {
  result <- pava_weighted(1:5, rep(0.5, 5))
  expect_equal(result, rep(0.5, 5))
})

test_that("pava_weighted handles single observation", {
  result <- pava_weighted(1, 0.7)
  expect_equal(result, 0.7)
})

test_that("pava_weighted handles completely decreasing y", {
  result <- pava_weighted(1:4, c(1, 0.75, 0.5, 0.25))
  # All should merge to the grand mean
  expect_equal(result, rep(mean(c(1, 0.75, 0.5, 0.25)), 4))
})

test_that("pava_weighted respects observation weights", {
  # Two observations: y = c(1, 0) with equal weights -> mean = 0.5
  result_equal <- pava_weighted(1:2, c(1, 0), w = c(1, 1))
  expect_equal(result_equal, c(0.5, 0.5))

  # Same but with weight 3 on first obs: weighted mean = (1*3 + 0*1)/4 = 0.75
  result_weighted <- pava_weighted(1:2, c(1, 0), w = c(3, 1))
  expect_equal(result_weighted, c(0.75, 0.75))
})

test_that("pava_weighted handles tied x values", {
  # Tied x-values should NOT be grouped (unlike stats::isoreg)
  x <- c(1, 1, 2, 2)
  y <- c(0, 1, 0, 1)
  result <- pava_weighted(x, y)
  # Each observation is its own block initially; result must be monotonic
  expect_true(all(diff(result) >= -1e-10))
})

test_that("pava_weighted preserves original order", {
  x <- c(3, 1, 2)
  y <- c(0.9, 0.1, 0.5)
  result <- pava_weighted(x, y)
  # After ordering by x: (1, 0.1), (2, 0.5), (3, 0.9) - already non-decreasing
  # So result should be the same as input
  expect_equal(result, y)
})