test_that("importance() accepts all ci_method values", {
	task = sim_dgp_independent(n = 100)

	pfi = PFI$new(
		task = task,
		learner = lrn("regr.rpart"),
		measure = msr("regr.mse"),
		resampling = rsmp("subsampling", repeats = 5),
		n_repeats = 2
	)

	pfi$compute()

	# Test that all variance methods work
	imp_none = pfi$importance(ci_method = "none")
	imp_raw = pfi$importance(ci_method = "raw")
	expect_warning(
		imp_nb <- pfi$importance(ci_method = "nadeau_bengio")
	)
	imp_quantile = pfi$importance(ci_method = "quantile")

	expect_importance_dt(imp_none, features = pfi$features)
	expect_importance_dt(imp_raw, features = pfi$features)
	expect_importance_dt(imp_nb, features = pfi$features)
	expect_importance_dt(imp_quantile, features = pfi$features)
})

test_that("ci_method='none' produces no variance columns", {
	task = sim_dgp_independent(n = 100)

	pfi = PFI$new(
		task = task,
		learner = lrn("regr.rpart"),
		measure = msr("regr.mse"),
		resampling = rsmp("subsampling", repeats = 5),
		n_repeats = 2
	)

	pfi$compute()
	imp_none = pfi$importance(ci_method = "none")

	# Check that only feature and importance columns exist
	expect_equal(names(imp_none), c("feature", "importance"))
})

test_that("raw CIs are narrower than nadeau_bengio corrected CIs", {
	task = sim_dgp_independent(n = 200)

	pfi = PFI$new(
		task = task,
		learner = lrn("regr.rpart"),
		measure = msr("regr.mse"),
		resampling = rsmp("subsampling", repeats = 11, ratio = 0.8),
		n_repeats = 3
	)

	pfi$compute()

	# Use two-sided to compare finite CI widths
	imp_raw = pfi$importance(ci_method = "raw", alternative = "two.sided")
	imp_nb = pfi$importance(ci_method = "nadeau_bengio", alternative = "two.sided")

	# Calculate CI widths
	width_raw = imp_raw$conf_upper - imp_raw$conf_lower
	width_nb = imp_nb$conf_upper - imp_nb$conf_lower

	# Raw CIs should be narrower than corrected ones on average
	# Compare the mean widths instead of individual features
	# The nadeau_bengio correction factor should make CIs wider on average
	expect_true(mean(width_nb) > mean(width_raw))
})

test_that("nadeau_bengio correction requires appropriate resampling", {
	task = sim_dgp_independent(n = 100)

	# Cross-validation is not supported for nadeau_bengio
	pfi = PFI$new(
		task = task,
		learner = lrn("regr.rpart"),
		measure = msr("regr.mse"),
		resampling = rsmp("cv", folds = 3),
		n_repeats = 2
	)

	pfi$compute()

	# Should error for unsupported resampling
	expect_warning(
		pfi$importance(ci_method = "nadeau_bengio"),
		regexp = "recommended for resampling types"
	)

	# But raw variance should still work
	imp_raw = pfi$importance(ci_method = "raw")
	expect_importance_dt(imp_raw, features = pfi$features)
})

test_that("confidence level parameter works correctly", {
	task = sim_dgp_independent(n = 100)

	pfi = PFI$new(
		task = task,
		learner = lrn("regr.rpart"),
		measure = msr("regr.mse"),
		resampling = rsmp("subsampling", repeats = 5),
		n_repeats = 2
	)

	pfi$compute()

	# Test different confidence levels with two-sided CIs to compare widths
	imp_90 = pfi$importance(ci_method = "raw", conf_level = 0.90, alternative = "two.sided")
	imp_95 = pfi$importance(ci_method = "raw", conf_level = 0.95, alternative = "two.sided")
	imp_99 = pfi$importance(ci_method = "raw", conf_level = 0.99, alternative = "two.sided")

	# Calculate CI widths
	width_90 = imp_90$conf_upper - imp_90$conf_lower
	width_95 = imp_95$conf_upper - imp_95$conf_lower
	width_99 = imp_99$conf_upper - imp_99$conf_lower

	# Higher confidence level should produce wider CIs (on average)
	expect_true(mean(width_90) < mean(width_95))
	expect_true(mean(width_95) < mean(width_99))
})

test_that("variance estimation works with bootstrap resampling", {
	task = sim_dgp_independent(n = 100)

	pfi = PFI$new(
		task = task,
		learner = lrn("regr.rpart"),
		measure = msr("regr.mse"),
		resampling = rsmp("bootstrap", repeats = 11),
		n_repeats = 2
	)

	pfi$compute()

	# Both raw and nadeau_bengio should work with bootstrap
	imp_raw = pfi$importance(ci_method = "raw")
	imp_nb = pfi$importance(ci_method = "nadeau_bengio")

	expect_importance_dt(imp_raw, features = pfi$features)
	expect_importance_dt(imp_nb, features = pfi$features)

	# Verify variance columns exist
	expect_true(all(c("se", "conf_lower", "conf_upper") %in% names(imp_raw)))
	expect_true(all(c("se", "conf_lower", "conf_upper") %in% names(imp_nb)))
})

test_that("quantile variance method works", {
	task = sim_dgp_independent(n = 200)

	pfi = PFI$new(
		task = task,
		learner = lrn("regr.rpart"),
		measure = msr("regr.mse"),
		resampling = rsmp("subsampling", repeats = 5),
		n_repeats = 2
	)

	pfi$compute()

	# Use two-sided for testing finite CI bounds
	imp_quantile = pfi$importance(ci_method = "quantile", alternative = "two.sided")

	# Check structure
	expect_importance_dt(imp_quantile, features = pfi$features)

	# Quantile method only returns confidence bounds, not se/statistic/p.value
	expected_cols = c("feature", "importance", "conf_lower", "conf_upper")
	expect_equal(names(imp_quantile), expected_cols)

	# All CIs should be valid intervals (two-sided has finite bounds)
	expect_true(all(imp_quantile$conf_lower <= imp_quantile$conf_upper))

	# Point estimates should be between lower and upper bounds (or close)
	# Due to using mean vs quantiles, this is not guaranteed but usually holds
	expect_true(all(
		imp_quantile$importance >= imp_quantile$conf_lower |
			abs(imp_quantile$importance - imp_quantile$conf_lower) < 0.01
	))
	expect_true(all(
		imp_quantile$importance <= imp_quantile$conf_upper |
			abs(imp_quantile$importance - imp_quantile$conf_upper) < 0.01
	))
})

test_that("quantile CIs differ from parametric methods", {
	task = sim_dgp_independent(n = 200)

	pfi = PFI$new(
		task = task,
		learner = lrn("regr.rpart"),
		measure = msr("regr.mse"),
		resampling = rsmp("subsampling", repeats = 15),
		n_repeats = 3
	)

	pfi$compute()

	# Use two-sided to compare finite CI bounds
	imp_raw = pfi$importance(ci_method = "raw", alternative = "two.sided")
	imp_quantile = pfi$importance(ci_method = "quantile", alternative = "two.sided")

	# Point estimates should be the same (both use mean)
	expect_equal(imp_raw$importance, imp_quantile$importance)

	# CIs should generally differ between methods
	# (quantile is non-parametric, raw assumes normality)
	expect_false(all(imp_raw$conf_lower == imp_quantile$conf_lower))
	expect_false(all(imp_raw$conf_upper == imp_quantile$conf_upper))
})

test_that("alternative='greater' produces one-sided CIs and tests", {
	task = sim_dgp_independent(n = 200)

	pfi = PFI$new(
		task = task,
		learner = lrn("regr.rpart"),
		measure = msr("regr.mse"),
		resampling = rsmp("subsampling", repeats = 11),
		n_repeats = 3
	)

	pfi$compute()

	# Test raw method with greater alternative
	imp_raw = pfi$importance(ci_method = "raw", alternative = "greater")

	# Should have statistic and p.value columns
	expect_true(all(c("statistic", "p.value") %in% names(imp_raw)))

	# Upper bound should be Inf for one-sided
	expect_true(all(is.infinite(imp_raw$conf_upper)))
	expect_true(all(imp_raw$conf_upper > 0)) # Inf, not -Inf

	# Lower bound should be finite
	expect_true(all(is.finite(imp_raw$conf_lower)))

	# Test nadeau_bengio with greater alternative
	imp_nb = pfi$importance(ci_method = "nadeau_bengio", alternative = "greater")
	expect_true(all(is.infinite(imp_nb$conf_upper)))
	expect_true(all(c("statistic", "p.value") %in% names(imp_nb)))

	# Test quantile with greater alternative (no statistic/p.value)
	imp_quantile = pfi$importance(ci_method = "quantile", alternative = "greater")
	expect_true(all(is.infinite(imp_quantile$conf_upper)))
	# Quantile method doesn't have statistic/p.value
	expect_false("statistic" %in% names(imp_quantile))
	expect_false("p.value" %in% names(imp_quantile))
})

test_that("alternative='two.sided' produces two-sided CIs and tests", {
	task = sim_dgp_independent(n = 200)

	pfi = PFI$new(
		task = task,
		learner = lrn("regr.rpart"),
		measure = msr("regr.mse"),
		resampling = rsmp("subsampling", repeats = 11),
		n_repeats = 3
	)

	pfi$compute()

	# Test raw method with two.sided alternative
	imp_raw = pfi$importance(ci_method = "raw", alternative = "two.sided")

	# Should have statistic and p.value columns
	expect_true(all(c("statistic", "p.value") %in% names(imp_raw)))

	# Both bounds should be finite
	expect_true(all(is.finite(imp_raw$conf_lower)))
	expect_true(all(is.finite(imp_raw$conf_upper)))

	# Test nadeau_bengio
	imp_nb = pfi$importance(ci_method = "nadeau_bengio", alternative = "two.sided")
	expect_true(all(is.finite(imp_nb$conf_upper)))
	expect_true(all(c("statistic", "p.value") %in% names(imp_nb)))

	# Test quantile (no statistic/p.value for quantile method)
	imp_quantile = pfi$importance(ci_method = "quantile", alternative = "two.sided")
	expect_true(all(is.finite(imp_quantile$conf_upper)))
	expect_false("statistic" %in% names(imp_quantile))
	expect_false("p.value" %in% names(imp_quantile))
})

test_that("two-sided p-values are larger than one-sided for positive importance", {
	task = sim_dgp_independent(n = 200)

	pfi = PFI$new(
		task = task,
		learner = lrn("regr.rpart"),
		measure = msr("regr.mse"),
		resampling = rsmp("subsampling", repeats = 11),
		n_repeats = 3
	)

	pfi$compute()

	imp_greater = pfi$importance(ci_method = "raw", alternative = "greater")
	imp_twosided = pfi$importance(ci_method = "raw", alternative = "two.sided")

	# For features with positive importance, two-sided p-values should be ~2x one-sided
	positive_mask = imp_greater$importance > 0
	if (any(positive_mask)) {
		expect_true(all(
			imp_twosided$p.value[positive_mask] >= imp_greater$p.value[positive_mask]
		))
	}

	# Test statistics should be identical regardless of alternative
	expect_equal(imp_greater$statistic, imp_twosided$statistic)
})

test_that("p_adjust = 'bonferroni' adjusts p-values and CIs for raw", {
	task = sim_dgp_independent(n = 200)

	pfi = PFI$new(
		task = task,
		learner = lrn("regr.rpart"),
		measure = msr("regr.mse"),
		resampling = rsmp("subsampling", repeats = 11),
		n_repeats = 3
	)
	pfi$compute()

	imp_none = pfi$importance(ci_method = "raw", alternative = "two.sided")
	imp_bonf = pfi$importance(ci_method = "raw", alternative = "two.sided", p_adjust = "bonferroni")

	# p-values should be larger (or equal) after Bonferroni correction
	# (filter out NAs from features with zero variance)
	valid = is.finite(imp_none$p.value) & is.finite(imp_bonf$p.value)
	expect_true(all(imp_bonf$p.value[valid] >= imp_none$p.value[valid] - 1e-10))

	# CIs should be wider with Bonferroni correction
	width_none = imp_none$conf_upper - imp_none$conf_lower
	width_bonf = imp_bonf$conf_upper - imp_bonf$conf_lower
	expect_true(all(width_bonf >= width_none - 1e-10))

	# Point estimates and SEs should be unchanged
	expect_equal(imp_none$importance, imp_bonf$importance)
	expect_equal(imp_none$se, imp_bonf$se)
})

test_that("p_adjust = 'BH' adjusts only p-values for nadeau_bengio", {
	task = sim_dgp_independent(n = 200)

	pfi = PFI$new(
		task = task,
		learner = lrn("regr.rpart"),
		measure = msr("regr.mse"),
		resampling = rsmp("subsampling", repeats = 11, ratio = 0.8),
		n_repeats = 3
	)
	pfi$compute()

	imp_none = pfi$importance(ci_method = "nadeau_bengio", alternative = "two.sided")
	imp_bh = pfi$importance(ci_method = "nadeau_bengio", alternative = "two.sided", p_adjust = "BH")

	# CIs should be identical (BH does not adjust CIs)
	expect_equal(imp_none$conf_lower, imp_bh$conf_lower)
	expect_equal(imp_none$conf_upper, imp_bh$conf_upper)

	# Point estimates and SEs should be unchanged
	expect_equal(imp_none$importance, imp_bh$importance)
	expect_equal(imp_none$se, imp_bh$se)
})

test_that("invalid p_adjust value is rejected", {
	task = sim_dgp_independent(n = 100)

	pfi = PFI$new(
		task = task,
		learner = lrn("regr.rpart"),
		measure = msr("regr.mse"),
		resampling = rsmp("subsampling", repeats = 5),
		n_repeats = 2
	)
	pfi$compute()

	expect_error(
		pfi$importance(ci_method = "raw", p_adjust = "invalid_method"),
		regexp = "p_adjust"
	)
})

test_that("unknown arguments to $importance() are rejected", {
	task = sim_dgp_independent(n = 100)

	pfi = PFI$new(
		task = task,
		learner = lrn("regr.rpart"),
		measure = msr("regr.mse"),
		resampling = rsmp("subsampling", repeats = 5),
		n_repeats = 2
	)
	pfi$compute()

	expect_error(
		pfi$importance(ci_method = "raw", tset = "t"),
		regexp = "Unknown argument"
	)
})

test_that("default alternative is 'two.sided'", {
	task = sim_dgp_independent(n = 100)

	pfi = PFI$new(
		task = task,
		learner = lrn("regr.rpart"),
		measure = msr("regr.mse"),
		resampling = rsmp("subsampling", repeats = 5),
		n_repeats = 2
	)

	pfi$compute()

	# Default should be one-sided
	imp_default = pfi$importance(ci_method = "raw")
	imp_greater = pfi$importance(ci_method = "raw", alternative = "two.sided")

	expect_equal(imp_default$conf_lower, imp_greater$conf_lower)
	expect_equal(imp_default$conf_upper, imp_greater$conf_upper)
	expect_equal(imp_default$p.value, imp_greater$p.value)
})