library(ggplot2)

test_that("check_model_calibration works with basic input", {
  # Create simple test data with a larger sample size
  set.seed(123)
  n <- 500
  predicted <- runif(n, 0, 1)
  actual <- rbinom(n, 1, predicted)

  test_data <- data.frame(
    pred = predicted,
    obs = actual
  )

  # Test basic functionality
  result <- suppress_calibration_warnings(check_model_calibration(
    test_data,
    pred,
    obs
  ))

  expect_s3_class(result, "tbl_df")
  expect_true(all(
    c(".bin", "predicted_rate", "observed_rate", "count", "lower", "upper") %in%
      names(result)
  ))
  expect_true(nrow(result) > 0)
  expect_true(all(result$predicted_rate >= 0 & result$predicted_rate <= 1))
  expect_true(all(result$observed_rate >= 0 & result$observed_rate <= 1))
  expect_true(all(result$count > 0))
})

test_that("check_model_calibration works with quoted column names", {
  # Create simple test data
  set.seed(123)
  n <- 500
  predicted <- runif(n, 0, 1)
  actual <- rbinom(n, 1, predicted)

  test_data <- data.frame(
    pred = predicted,
    obs = actual
  )

  # Test with quoted column names
  result <- suppress_calibration_warnings(check_model_calibration(
    test_data,
    "pred",
    "obs"
  ))

  expect_s3_class(result, "tbl_df")
  expect_true(nrow(result) > 0)
})

test_that("check_model_calibration works with unquoted column names", {
  # Create simple test data
  set.seed(123)
  n <- 500
  predicted <- runif(n, 0, 1)
  actual <- rbinom(n, 1, predicted)

  test_data <- data.frame(
    pred = predicted,
    obs = actual
  )

  # Test with unquoted column names
  result <- suppress_calibration_warnings(check_model_calibration(
    test_data,
    pred,
    obs
  ))

  expect_s3_class(result, "tbl_df")
  expect_true(nrow(result) > 0)
})

test_that("check_model_calibration provides clear error messages for missing columns", {
  # Create test data
  set.seed(123)
  test_data <- data.frame(
    pred = runif(50, 0, 1),
    obs = rbinom(50, 1, 0.5)
  )

  # Test with a non-existent fitted-value column
  expect_halfmoon_error(
    check_model_calibration(test_data, "nonexistent", "obs"),
    "halfmoon_column_error"
  )

  # Test with a non-existent outcome column
  expect_halfmoon_error(
    check_model_calibration(test_data, "pred", "nonexistent"),
    "halfmoon_column_error"
  )
})

test_that("check_model_calibration handles different binning methods", {
  # Create test data
  set.seed(123)
  n <- 500
  predicted <- runif(n, 0, 1)
  actual <- rbinom(n, 1, predicted)

  test_data <- data.frame(
    pred = predicted,
    obs = actual
  )

  # Test equal_width binning
  result_eq <- suppress_calibration_warnings(check_model_calibration(
    test_data,
    "pred",
    "obs",
    binning_method = "equal_width"
  ))

  # Test quantile binning
  result_qt <- suppress_calibration_warnings(check_model_calibration(
    test_data,
    "pred",
    "obs",
    binning_method = "quantile"
  ))

  expect_s3_class(result_eq, "tbl_df")
  expect_s3_class(result_qt, "tbl_df")
  expect_true(nrow(result_eq) > 0)
  expect_true(nrow(result_qt) > 0)
})

test_that("check_model_calibration handles edge cases", {
  # Test empty data
  empty_data <- data.frame(pred = numeric(0), obs = numeric(0))
  result_empty <- check_model_calibration(empty_data, pred, obs)
  expect_s3_class(result_empty, "tbl_df")
  expect_equal(nrow(result_empty), 0)

  # Test all zeros (treatment level 0, so observed_rate should be 1)
  all_zeros <- data.frame(pred = runif(50, 0, 1), obs = rep(0, 50))
  result_zeros <- suppress_calibration_warnings(check_model_calibration(
    all_zeros,
    pred,
    obs
  ))
  expect_s3_class(result_zeros, "tbl_df")
  # All obs are 0, which becomes the treatment level
  expect_true(all(result_zeros$observed_rate == 1))

  # Test all ones (treatment level 1, so observed_rate should be 1)
  all_ones <- data.frame(pred = runif(50, 0, 1), obs = rep(1, 50))
  result_ones <- suppress_calibration_warnings(check_model_calibration(
    all_ones,
    pred,
    obs
  ))
  expect_s3_class(result_ones, "tbl_df")
  # All obs are 1, which becomes the treatment level
  expect_true(all(result_ones$observed_rate == 1))
})

test_that("check_model_calibration handles NA values", {
  # Create test data with NAs
  set.seed(123)
  n <- 100
  predicted <- runif(n, 0, 1)
  actual <- rbinom(n, 1, predicted)

  # Add some NAs
  predicted[1:5] <- NA
  actual[6:10] <- NA

  test_data <- data.frame(
    pred = predicted,
    obs = actual
  )

  # Test with na.rm = TRUE
  result_na_rm <- suppress_calibration_warnings(check_model_calibration(
    test_data,
    pred,
    obs,
    na.rm = TRUE
  ))
  expect_s3_class(result_na_rm, "tbl_df")
  expect_true(nrow(result_na_rm) > 0)

  # Test with na.rm = FALSE (fewer complete cases)
  result_na_keep <- suppress_calibration_warnings(check_model_calibration(
    test_data,
    pred,
    obs,
    na.rm = FALSE
  ))
  expect_s3_class(result_na_keep, "tbl_df")
})

test_that("check_model_calibration works with logistic method", {
  # Create test data
  set.seed(123)
  n <- 200
  predicted <- runif(n, 0, 1)
  actual <- rbinom(n, 1, predicted)

  test_data <- data.frame(
    pred = predicted,
    obs = actual
  )

  # Test logistic without smoothing
  result_logistic <- check_model_calibration(
    test_data,
    pred,
    obs,
    method = "logistic",
    smooth = FALSE
  )

  expect_s3_class(result_logistic, "tbl_df")
  expect_true(all(
    c("predicted_rate", "observed_rate", "lower", "upper") %in%
      names(result_logistic)
  ))
  expect_equal(nrow(result_logistic), 100) # Default 100 prediction points
  expect_true(all(
    result_logistic$predicted_rate >= 0 & result_logistic$predicted_rate <= 1
  ))
  expect_true(all(
    result_logistic$observed_rate >= 0 & result_logistic$observed_rate <= 1
  ))
  expect_true(all(result_logistic$lower >= 0 & result_logistic$lower <= 1))
  expect_true(all(result_logistic$upper >= 0 & result_logistic$upper <= 1))
  expect_true(all(result_logistic$lower <= result_logistic$observed_rate))
  expect_true(all(result_logistic$upper >= result_logistic$observed_rate))

  # Test logistic with smoothing
  result_smooth <- suppress_calibration_warnings(check_model_calibration(
    test_data,
    pred,
    obs,
    method = "logistic",
    smooth = TRUE
  ))
  expect_s3_class(result_smooth, "tbl_df")
  expect_equal(nrow(result_smooth), 100)
  expect_true(all(result_smooth$lower <= result_smooth$observed_rate))
  expect_true(all(result_smooth$upper >= result_smooth$observed_rate))

  # Test that predictions span the range of the input data
  expect_equal(min(result_logistic$predicted_rate), min(test_data$pred))
  expect_equal(max(result_logistic$predicted_rate), max(test_data$pred))
})

test_that("check_model_calibration logistic method handles different confidence levels", {
  set.seed(123)
  test_data <- data.frame(
    pred = runif(100, 0, 1),
    obs = rbinom(100, 1, 0.5)
  )

  # Test with 90% confidence
  result_90 <- suppress_calibration_warnings(check_model_calibration(
    test_data,
    pred,
    obs,
    method = "logistic",
    conf_level = 0.90
  ))

  # Test with 99% confidence
  result_99 <- suppress_calibration_warnings(check_model_calibration(
    test_data,
    pred,
    obs,
    method = "logistic",
    conf_level = 0.99
  ))

  # The 99% CI should be wider than the 90% CI
  expect_true(
    mean(result_99$upper - result_99$lower) >
      mean(result_90$upper - result_90$lower)
  )
})

test_that("check_model_calibration logistic method handles perfect separation", {
  # Create data with perfect separation
  test_data <- data.frame(
    pred = c(rep(0.1, 50), rep(0.9, 50)),
    obs = c(rep(0, 50), rep(1, 50))
  )

  # Should not error with perfect separation
  expect_no_error({
    result <- check_model_calibration(
      test_data,
      pred,
      obs,
      method = "logistic",
      smooth = FALSE
    )
  })
})

test_that("check_model_calibration works with windowed method", {
  # Create test data
  set.seed(123)
  n <- 200
  predicted <- runif(n, 0, 1)
  actual <- rbinom(n, 1, predicted)

  test_data <- data.frame(
    pred = predicted,
    obs = actual
  )

  # Test the windowed method
  result_windowed <- suppress_calibration_warnings(check_model_calibration(
    test_data,
    pred,
    obs,
    method = "windowed",
    window_size = 0.2,
    step_size = 0.1
  ))

  expect_s3_class(result_windowed, "tbl_df")
  expect_true(all(
    c("predicted_rate", "observed_rate", "lower", "upper") %in%
      names(result_windowed)
  ))
  expect_true(nrow(result_windowed) > 0)
  expect_true(all(
    result_windowed$predicted_rate >= 0 & result_windowed$predicted_rate <= 1
  ))
  expect_true(all(
    result_windowed$observed_rate >= 0 & result_windowed$observed_rate <= 1
  ))
  expect_true(all(result_windowed$lower >= 0 & result_windowed$lower <= 1))
  expect_true(all(result_windowed$upper >= 0 & result_windowed$upper <= 1))
  expect_true(all(result_windowed$lower <= result_windowed$observed_rate))
  expect_true(all(result_windowed$upper >= result_windowed$observed_rate))

  # Test with a smaller window size
  result_small_window <- suppress_calibration_warnings(check_model_calibration(
    test_data,
    pred,
    obs,
    method = "windowed",
    window_size = 0.05,
    step_size = 0.025
  ))
  expect_true(nrow(result_small_window) > nrow(result_windowed))

  # Test window centers
  expected_centers <- seq(0, 1, by = 0.1)
  expect_equal(result_windowed$predicted_rate, expected_centers)
})

test_that("check_model_calibration windowed method handles edge cases", {
  set.seed(123)

  # Test with data concentrated at the edges
  test_data <- data.frame(
    pred = c(runif(50, 0, 0.1), runif(50, 0.9, 1)),
    obs = rbinom(100, 1, 0.5)
  )

  result <- check_model_calibration(
    test_data,
    pred,
    obs,
    method = "windowed",
    window_size = 0.2,
    step_size = 0.1
  )
  expect_s3_class(result, "tbl_df")
  expect_true(nrow(result) > 0)

  # Test with a very small window
  result_tiny <- suppress_calibration_warnings(check_model_calibration(
    test_data,
    pred,
    obs,
    method = "windowed",
    window_size = 0.01,
    step_size = 0.1
  ))
  # Some windows might be empty
  expect_true(nrow(result_tiny) <= length(seq(0, 1, by = 0.1)))
})

test_that("check_model_calibration windowed method respects window boundaries", {
  # Create data only in the middle range
  test_data <- data.frame(
    pred = runif(100, 0.4, 0.6),
    obs = rbinom(100, 1, 0.5)
  )

  result <- check_model_calibration(
    test_data,
    pred,
    obs,
    method = "windowed",
    window_size = 0.1,
    step_size = 0.1
  )

  # Windows at 0 and 1 should have no data (depending on window size),
  # but windows around 0.5 should have data
  center_window <- result[result$predicted_rate == 0.5, ]
  expect_equal(nrow(center_window), 1)
  expect_true(center_window$observed_rate >= 0)
})

test_that("check_model_calibration method parameter validation", {
  test_data <- data.frame(pred = runif(50), obs = rbinom(50, 1, 0.5))

  # Test an invalid method
  expect_halfmoon_error(
    check_model_calibration(test_data, pred, obs, method = "invalid")
  )
})

test_that("check_model_calibration handles edge cases with all methods", {
  # Empty data
  empty_data <- data.frame(pred = numeric(0), obs = numeric(0))

  result_breaks <- check_model_calibration(
    empty_data,
    pred,
    obs,
    method = "breaks"
  )
  expect_equal(nrow(result_breaks), 0)
  expect_true("count" %in% names(result_breaks))
  expect_true(".bin" %in% names(result_breaks))

  result_logistic <- check_model_calibration(
    empty_data,
    pred,
    obs,
    method = "logistic"
  )
  expect_equal(nrow(result_logistic), 0)
  expect_false("count" %in% names(result_logistic))
expect_false(".bin" %in% names(result_logistic)) result_windowed <- check_model_calibration( empty_data, pred, obs, method = "windowed" ) expect_equal(nrow(result_windowed), 0) expect_false("count" %in% names(result_windowed)) expect_false(".bin" %in% names(result_windowed)) }) test_that("check_model_calibration handles all zeros and all ones", { set.seed(123) # All zeros - when .focal_level is not specified, 0 becomes the treatment level # so observed_rate will be 1 (all observations match treatment level) zeros_data <- data.frame( pred = runif(50, 0, 1), obs = rep(0, 50) ) # When all observations are 0, default .focal_level will be 0 # Test the actual behavior result_breaks_zeros_default <- suppress_calibration_warnings(check_model_calibration( zeros_data, pred, obs, method = "breaks" )) # All values are 0 and .focal_level=0, so observed_rate=1 expect_true(all(result_breaks_zeros_default$observed_rate == 1)) # Now test with mixed data where 1 is the treatment level mixed_data <- data.frame( pred = runif(50, 0, 1), obs = c(rep(0, 25), rep(1, 25)) ) result_mixed <- suppress_calibration_warnings(check_model_calibration( mixed_data, pred, obs, method = "breaks", .focal_level = 1 )) # This should give us meaningful calibration metrics expect_true(all( result_mixed$observed_rate >= 0 & result_mixed$observed_rate <= 1 )) # All ones - default treatment level will be 1, so observed_rate = 1 ones_data <- data.frame( pred = runif(50, 0, 1), obs = rep(1, 50) ) result_breaks_ones <- suppress_calibration_warnings(check_model_calibration( ones_data, pred, obs, method = "breaks" )) expect_true(all(result_breaks_ones$observed_rate == 1)) result_windowed_ones <- suppress_calibration_warnings(check_model_calibration( ones_data, pred, obs, method = "windowed" )) expect_true(all(result_windowed_ones$observed_rate == 1)) # Test the default behavior with all zeros result_default_zeros <- suppress_calibration_warnings(check_model_calibration( zeros_data, pred, obs, method = "breaks" )) # With default .focal_level, all zeros means .focal_level=0, so observed_rate=1 expect_true(all(result_default_zeros$observed_rate == 1)) }) test_that("check_model_calibration handles NA values correctly", { # Create data with NAs set.seed(123) test_data <- data.frame( pred = c(runif(45), rep(NA, 5)), obs = c(rbinom(45, 1, 0.5), rep(NA, 5)) ) # Test with na.rm = FALSE (default) result_false <- suppress_calibration_warnings(check_model_calibration( test_data, pred, obs, method = "breaks", na.rm = FALSE )) # Test with na.rm = TRUE result_true <- suppress_calibration_warnings(check_model_calibration( test_data, pred, obs, method = "breaks", na.rm = TRUE )) # na.rm = TRUE should have data, na.rm = FALSE might have less expect_true(sum(result_true$count) == 45) }) test_that("check_model_calibration handles factor treatment variables", { set.seed(123) test_data <- data.frame( pred = runif(100, 0, 1), obs = factor(rbinom(100, 1, 0.5), levels = c("0", "1")) ) # Should work with factor result <- suppress_calibration_warnings(check_model_calibration( test_data, pred, obs, method = "breaks", .focal_level = "1" )) expect_s3_class(result, "tbl_df") expect_true(all(result$observed_rate >= 0 & result$observed_rate <= 1)) }) test_that("check_model_calibration validates input parameters", { test_data <- data.frame(pred = runif(50), obs = rbinom(50, 1, 0.5)) # Invalid bins for breaks method expect_halfmoon_error( check_model_calibration(test_data, pred, obs, method = "breaks", bins = 1), "halfmoon_arg_error" ) # Non-integer bins 
  expect_halfmoon_error(
    check_model_calibration(
      test_data,
      pred,
      obs,
      method = "breaks",
      bins = 2.5
    ),
    "halfmoon_arg_error"
  )

  # Missing column
  expect_halfmoon_error(
    check_model_calibration(test_data, nonexistent, obs),
    "halfmoon_column_error"
  )
})

test_that("check_model_calibration produces consistent results across methods", {
  # Create well-calibrated data
  set.seed(123)
  n <- 500
  pred <- runif(n, 0, 1)
  obs <- rbinom(n, 1, pred) # Perfect calibration on average
  test_data <- data.frame(pred = pred, obs = obs)

  # Get results from all methods
  result_breaks <- check_model_calibration(
    test_data,
    pred,
    obs,
    method = "breaks",
    bins = 5
  )
  result_logistic <- suppress_calibration_warnings(check_model_calibration(
    test_data,
    pred,
    obs,
    method = "logistic"
  ))
  result_windowed <- suppress_calibration_warnings(check_model_calibration(
    test_data,
    pred,
    obs,
    method = "windowed",
    window_size = 0.2
  ))

  # All methods should show reasonable calibration
  # (observed_rate ≈ predicted_rate)

  # For the breaks method
  breaks_diff <- mean(abs(
    result_breaks$observed_rate - result_breaks$predicted_rate
  ))
  expect_true(breaks_diff < 0.2) # Allow some deviation

  # For the windowed method
  windowed_diff <- mean(abs(
    result_windowed$observed_rate - result_windowed$predicted_rate
  ))
  expect_true(windowed_diff < 0.2)
})

test_that("check_model_calibration provides helpful warnings for small cell sizes", {
  # Create test data with an imbalanced distribution to create small cells
  set.seed(123)
  n <- 100
  test_data <- data.frame(
    pred = c(runif(90, 0, 0.5), runif(10, 0.9, 1)), # Most values in the lower range
    obs = rbinom(n, 1, 0.3)
  )

  # Test the breaks method with small cells
  expect_halfmoon_warning(
    check_model_calibration(test_data, pred, obs, method = "breaks", bins = 10),
    class = "halfmoon_data_warning"
  )
})

test_that("check_model_calibration provides helpful warnings for extreme proportions", {
  # Create data where some bins have all 0s or all 1s
  set.seed(123)
  test_data <- data.frame(
    pred = c(runif(50, 0, 0.3), runif(50, 0.7, 1)),
    obs = c(rep(0, 50), rep(1, 50)) # Perfect separation
  )

  expect_halfmoon_warning(
    check_model_calibration(test_data, pred, obs, method = "breaks", bins = 10),
    "halfmoon_data_warning"
  )
})

test_that("check_model_calibration windowed method provides helpful warnings", {
  # Create sparse data
  set.seed(123)
  test_data <- data.frame(
    pred = runif(20, 0.4, 0.6), # Concentrated in the middle
    obs = rbinom(20, 1, 0.5)
  )

  # A small window size will create windows with few observations
  expect_halfmoon_warning(
    check_model_calibration(
      test_data,
      pred,
      obs,
      method = "windowed",
      window_size = 0.05,
      step_size = 0.1
    ),
    "halfmoon_data_warning"
  )

  # Capture the specific warning message
  warning_msg <- capture_warnings(
    check_model_calibration(
      test_data,
      pred,
      obs,
      method = "windowed",
      window_size = 0.05,
      step_size = 0.1
    )
  )
  expect_match(warning_msg[1], "windows centered at")
  expect_match(warning_msg[1], "Consider using a larger window size")
})

test_that("check_model_calibration doesn't warn for adequate sample sizes", {
  # Create well-distributed data
  set.seed(123)
  n <- 1000
  test_data <- data.frame(
    pred = runif(n, 0, 1),
    obs = rbinom(n, 1, 0.5)
  )

  # Should not produce warnings
  expect_no_warning(
    check_model_calibration(test_data, pred, obs, method = "breaks", bins = 10)
  )
  expect_no_warning(
    check_model_calibration(
      test_data,
      pred,
      obs,
      method = "windowed",
      window_size = 0.2,
      step_size = 0.1
    )
  )
})

test_that("warnings include specific bin/window information", {
  # Create data that will have problems in specific bins
  set.seed(123)
  test_data <- data.frame(
    pred = c(runif(95, 0, 0.8), runif(5, 0.9, 1)), # Very few in the last bin
    obs = rbinom(100, 1, 0.3)
  )

  warning_msg <- capture_warnings(
    check_model_calibration(test_data, pred, obs, method = "breaks", bins = 10)
  )
  # Should mention specific bins
  expect_match(warning_msg[1], "bins [0-9, ]+")

  # For the windowed method
  test_data2 <- data.frame(
    pred = runif(30, 0.4, 0.6),
    obs = rbinom(30, 1, 0.5)
  )
  warning_msg2 <- capture_warnings(
    check_model_calibration(
      test_data2,
      pred,
      obs,
      method = "windowed",
      window_size = 0.1,
      step_size = 0.1
    )
  )
  # Should mention specific window centers
  expect_match(warning_msg2[1], "windows centered at [0-9., ]+")
})

test_that("warnings handle edge case with no valid results gracefully", {
  # Empty data
  empty_data <- data.frame(pred = numeric(0), obs = numeric(0))

  # Should not produce warnings for empty data
  expect_no_warning(
    check_model_calibration(empty_data, pred, obs, method = "breaks")
  )
  expect_no_warning(
    check_model_calibration(empty_data, pred, obs, method = "windowed")
  )
})

test_that("geom_calibration works with breaks method", {
  # Create test data with a known calibration pattern
  set.seed(123)
  n <- 200
  predicted <- runif(n, 0, 1)
  actual <- rbinom(n, 1, predicted)

  cal_data <- data.frame(
    pred = predicted,
    obs = actual
  )

  # Test the breaks method
  p_breaks <- ggplot(cal_data, aes(.fitted = pred, .exposure = obs)) +
    geom_calibration(method = "breaks", bins = 5) +
    geom_abline(slope = 1, intercept = 0, linetype = "dashed") +
    labs(x = "Predicted Probability", y = "Observed Rate") +
    lims(x = c(0, 1), y = c(0, 1))

  expect_s3_class(p_breaks, "ggplot")
  expect_doppelganger("calibration breaks method", p_breaks)
})

test_that("geom_calibration works with logistic method", {
  # Create test data
  set.seed(123)
  n <- 200
  predicted <- runif(n, 0, 1)
  actual <- rbinom(n, 1, predicted)

  cal_data <- data.frame(
    pred = predicted,
    obs = actual
  )

  # Test the logistic method (linear)
  p_logistic <- ggplot(cal_data, aes(.fitted = pred, .exposure = obs)) +
    geom_calibration(method = "logistic", smooth = FALSE, show_points = FALSE) +
    geom_abline(slope = 1, intercept = 0, linetype = "dashed") +
    labs(x = "Predicted Probability", y = "Observed Rate") +
    lims(x = c(0, 1), y = c(0, 1))

  expect_s3_class(p_logistic, "ggplot")
  expect_doppelganger("calibration logistic method", p_logistic)
})

test_that("geom_calibration works with windowed method", {
  # Create test data
  set.seed(123)
  n <- 200
  predicted <- runif(n, 0, 1)
  actual <- rbinom(n, 1, predicted)

  cal_data <- data.frame(
    pred = predicted,
    obs = actual
  )

  # Test the windowed method
  p_windowed <- ggplot(cal_data, aes(.fitted = pred, .exposure = obs)) +
    geom_calibration(method = "windowed", window_size = 0.2, step_size = 0.1) +
    geom_abline(slope = 1, intercept = 0, linetype = "dashed") +
    labs(x = "Predicted Probability", y = "Observed Rate") +
    lims(x = c(0, 1), y = c(0, 1))

  expect_s3_class(p_windowed, "ggplot")
  expect_doppelganger("calibration windowed method", p_windowed)
})

test_that("geom_calibration works with different show options", {
  # Create test data
  set.seed(123)
  n <- 200
  predicted <- runif(n, 0, 1)
  actual <- rbinom(n, 1, predicted)

  cal_data <- data.frame(
    pred = predicted,
    obs = actual
  )

  # Test with the ribbon hidden
  p_no_ribbon <- ggplot(cal_data, aes(.fitted = pred, .exposure = obs)) +
    geom_calibration(method = "breaks", show_ribbon = FALSE) +
    geom_abline(slope = 1, intercept = 0, linetype = "dashed") +
    lims(x = c(0, 1), y = c(0, 1))

  # Test with the points hidden
  p_no_points <- ggplot(cal_data, aes(.fitted = pred, .exposure = obs)) +
    geom_calibration(method = "breaks", show_points = FALSE) +
    geom_abline(slope = 1, intercept = 0, linetype = "dashed") +
    lims(x = c(0, 1), y = c(0, 1))

  expect_s3_class(p_no_ribbon, "ggplot")
  expect_s3_class(p_no_points, "ggplot")
  expect_doppelganger("calibration no ribbon", p_no_ribbon)
  expect_doppelganger("calibration no points", p_no_points)
})

test_that("geom_calibration handles different confidence levels", {
  # Create test data
  set.seed(123)
  n <- 200
  predicted <- runif(n, 0, 1)
  actual <- rbinom(n, 1, predicted)

  cal_data <- data.frame(
    pred = predicted,
    obs = actual
  )

  # Test with a 90% confidence level
  p_90 <- ggplot(cal_data, aes(.fitted = pred, .exposure = obs)) +
    geom_calibration(method = "breaks", conf_level = 0.90) +
    geom_abline(slope = 1, intercept = 0, linetype = "dashed") +
    lims(x = c(0, 1), y = c(0, 1))

  expect_s3_class(p_90, "ggplot")
  expect_doppelganger("calibration 90% confidence", p_90)
})

test_that("geom_calibration works with nhefs_weights data", {
  # Test with actual package data:
  # create a binary outcome from the propensity score
  set.seed(123)
  cal_data <- nhefs_weights
  cal_data$binary_outcome <- rbinom(nrow(cal_data), 1, cal_data$.fitted)

  p_nhefs <- ggplot(
    cal_data,
    aes(.fitted = .fitted, .exposure = binary_outcome)
  ) +
    geom_calibration(method = "breaks", bins = 8) +
    geom_abline(slope = 1, intercept = 0, linetype = "dashed") +
    labs(x = "Propensity Score", y = "Observed Rate") +
    lims(x = c(0, 1), y = c(0, 1))

  expect_s3_class(p_nhefs, "ggplot")
  expect_doppelganger("calibration nhefs data", p_nhefs)
})

test_that("geom_calibration works with both numeric and factor outcomes", {
  set.seed(123)
  n <- 100
  test_data <- data.frame(
    pred = runif(n, 0, 1)
  )
  test_data$obs_numeric <- rbinom(n, 1, test_data$pred)
  test_data$obs_factor <- factor(test_data$obs_numeric, levels = c(0, 1))

  # Test with a numeric outcome
  p_numeric <- ggplot(test_data, aes(.fitted = pred, .exposure = obs_numeric)) +
    geom_calibration(method = "breaks", bins = 5) +
    geom_abline(slope = 1, intercept = 0, linetype = "dashed") +
    labs(title = "Calibration with numeric outcome")

  # Test with a factor outcome
  p_factor <- ggplot(test_data, aes(.fitted = pred, .exposure = obs_factor)) +
    geom_calibration(method = "breaks", bins = 5) +
    geom_abline(slope = 1, intercept = 0, linetype = "dashed") +
    labs(title = "Calibration with factor outcome")

  # Both should work
  expect_s3_class(p_numeric, "ggplot")
  expect_s3_class(p_factor, "ggplot")
  expect_no_error(suppressWarnings(ggplot_build(p_numeric)))
  expect_no_error(suppressWarnings(ggplot_build(p_factor)))

  # Visual tests
  skip_on_ci()
  expect_doppelganger("calibration-numeric-outcome", p_numeric)
  expect_doppelganger("calibration-factor-outcome", p_factor)
})

test_that("geom_calibration warns with invalid method", {
  # Create test data
  set.seed(123)
  n <- 50
  predicted <- runif(n, 0, 1)
  actual <- rbinom(n, 1, predicted)

  cal_data <- data.frame(
    pred = predicted,
    obs = actual
  )

  # Test with an invalid method; the warning is signaled during rendering
  p <- ggplot(cal_data, aes(.fitted = pred, .exposure = obs)) +
    geom_calibration(method = "invalid_method")

  # We expect a warning with the halfmoon_method_warning class.
  # Use nested expect_warning() calls to capture the warning without it
  # leaking into the test output
  expect_warning(
    expect_warning(
      ggplot_build(p),
      class = "halfmoon_method_warning",
      regexp = "Invalid calibration method"
    )
  )
})

test_that("check_model_calibration errors with invalid bins", {
  # Create test data
  set.seed(123)
  n <- 50
  predicted <- runif(n, 0, 1)
  actual <- rbinom(n, 1, predicted)

  cal_data <- data.frame(
    pred = predicted,
    obs = actual
  )

  # Test with invalid bins
  expect_halfmoon_error(
    check_model_calibration(cal_data, pred, obs, bins = 1)
  )
  expect_halfmoon_error(
    check_model_calibration(cal_data, pred, obs, bins = 2.5)
  )
})

test_that("geom_calibration logistic method works with smooth = TRUE", {
  # Create test data
  set.seed(123)
  n <- 200
  predicted <- runif(n, 0, 1)
  actual <- rbinom(n, 1, predicted)

  cal_data <- data.frame(
    pred = predicted,
    obs = actual
  )

  # Test the logistic method with smooth = TRUE
  # (should work regardless of mgcv availability)
  p_smooth <- ggplot(cal_data, aes(.fitted = pred, .exposure = obs)) +
    geom_calibration(method = "logistic", smooth = TRUE, show_points = FALSE) +
    geom_abline(slope = 1, intercept = 0, linetype = "dashed") +
    labs(x = "Predicted Probability", y = "Observed Rate") +
    lims(x = c(0, 1), y = c(0, 1))

  expect_s3_class(p_smooth, "ggplot")
  expect_doppelganger("calibration logistic smooth", p_smooth)
})

test_that("k parameter works in check_model_calibration", {
  # Create test data with a non-linear relationship
  set.seed(123)
  n <- 500
  x <- runif(n, 0, 1)
  # Create a wavy pattern that would benefit from different k values
  true_prob <- plogis(10 * sin(2 * pi * x))
  actual <- rbinom(n, 1, true_prob)

  test_data <- data.frame(
    pred = x,
    obs = actual
  )

  # Test with the default k = 10
  result_k10 <- check_model_calibration(
    test_data,
    pred,
    obs,
    method = "logistic",
    smooth = TRUE,
    k = 10
  )

  # Test with a smaller k = 5
  result_k5 <- check_model_calibration(
    test_data,
    pred,
    obs,
    method = "logistic",
    smooth = TRUE,
    k = 5
  )

  # Test with a larger k = 20
  result_k20 <- check_model_calibration(
    test_data,
    pred,
    obs,
    method = "logistic",
    smooth = TRUE,
    k = 20
  )

  # All should return valid results
  expect_s3_class(result_k10, "tbl_df")
  expect_s3_class(result_k5, "tbl_df")
  expect_s3_class(result_k20, "tbl_df")

  # All should have the same number of rows (100 prediction points)
  expect_equal(nrow(result_k10), 100)
  expect_equal(nrow(result_k5), 100)
  expect_equal(nrow(result_k20), 100)

  # Results should differ (different smoothness); we can't test exact values,
  # but we can verify the structure
  expect_true(all(result_k5$observed_rate >= 0 & result_k5$observed_rate <= 1))
  expect_true(all(
    result_k10$observed_rate >= 0 & result_k10$observed_rate <= 1
  ))
  expect_true(all(
    result_k20$observed_rate >= 0 & result_k20$observed_rate <= 1
  ))
})

test_that("k parameter is ignored when smooth = FALSE", {
  set.seed(123)
  n <- 200
  predicted <- runif(n, 0, 1)
  actual <- rbinom(n, 1, predicted)

  test_data <- data.frame(
    pred = predicted,
    obs = actual
  )

  # Results should be identical when smooth = FALSE
  result1 <- check_model_calibration(
    test_data,
    pred,
    obs,
    method = "logistic",
    smooth = FALSE,
    k = 5
  )
  result2 <- check_model_calibration(
    test_data,
    pred,
    obs,
    method = "logistic",
    smooth = FALSE,
    k = 20
  )

  expect_equal(result1$observed_rate, result2$observed_rate)
  expect_equal(result1$lower, result2$lower)
  expect_equal(result1$upper, result2$upper)
})

test_that("k parameter is ignored for non-logistic methods", {
  set.seed(123)
  n <- 200
  predicted <- runif(n, 0, 1)
  actual <- rbinom(n, 1, predicted)

  test_data <- data.frame(
    pred = predicted,
    obs = actual
  )

  # k should be ignored for the breaks method
  result1 <- suppress_calibration_warnings(check_model_calibration(
    test_data,
    pred,
    obs,
    method = "breaks",
    k = 5
  ))
  result2 <- suppress_calibration_warnings(check_model_calibration(
    test_data,
    pred,
    obs,
    method = "breaks",
    k = 20
  ))

  # Results should be identical
  expect_equal(result1$observed_rate, result2$observed_rate)

  # k should be ignored for the windowed method
  result3 <- suppress_calibration_warnings(check_model_calibration(
    test_data,
    pred,
    obs,
    method = "windowed",
    k = 5
  ))
  result4 <- suppress_calibration_warnings(check_model_calibration(
    test_data,
    pred,
    obs,
    method = "windowed",
    k = 20
  ))

  # Results should be identical
  expect_equal(result3$observed_rate, result4$observed_rate)
})

test_that("k parameter works in geom_calibration", {
  # Create test data
  set.seed(123)
  n <- 500
  x <- runif(n, 0, 1)
  true_prob <- plogis(10 * sin(2 * pi * x))
  actual <- rbinom(n, 1, true_prob)

  test_data <- data.frame(
    pred = x,
    obs = actual
  )

  # Create plots with different k values
  p_k5 <- ggplot(test_data, aes(.fitted = pred, .exposure = obs)) +
    geom_calibration(method = "logistic", smooth = TRUE, k = 5) +
    geom_abline(intercept = 0, slope = 1, linetype = "dashed") +
    labs(x = "Predicted Probability", y = "Observed Rate") +
    lims(x = c(0, 1), y = c(0, 1))

  p_k10 <- ggplot(test_data, aes(.fitted = pred, .exposure = obs)) +
    geom_calibration(method = "logistic", smooth = TRUE, k = 10) +
    geom_abline(intercept = 0, slope = 1, linetype = "dashed") +
    labs(x = "Predicted Probability", y = "Observed Rate") +
    lims(x = c(0, 1), y = c(0, 1))

  p_k20 <- ggplot(test_data, aes(.fitted = pred, .exposure = obs)) +
    geom_calibration(method = "logistic", smooth = TRUE, k = 20) +
    geom_abline(intercept = 0, slope = 1, linetype = "dashed") +
    labs(x = "Predicted Probability", y = "Observed Rate") +
    lims(x = c(0, 1), y = c(0, 1))

  # Test with vdiffr
  expect_doppelganger("calibration k=5", p_k5)
  expect_doppelganger("calibration k=10", p_k10)
  expect_doppelganger("calibration k=20", p_k20)
})

test_that("k parameter works in plot_model_calibration", {
  # Create test data
  set.seed(123)
  n <- 500
  x <- runif(n, 0, 1)
  true_prob <- plogis(10 * sin(2 * pi * x))
  actual <- rbinom(n, 1, true_prob)

  test_data <- data.frame(
    pred = x,
    obs = actual
  )

  # Create plots with different k values
  p_k5 <- plot_model_calibration(
    test_data,
    pred,
    obs,
    method = "logistic",
    k = 5
  )
  p_k10 <- plot_model_calibration(
    test_data,
    pred,
    obs,
    method = "logistic",
    k = 10
  )
  p_k20 <- plot_model_calibration(
    test_data,
    pred,
    obs,
    method = "logistic",
    k = 20
  )

  # Test with vdiffr
  expect_doppelganger("plot_calibration k=5", p_k5)
  expect_doppelganger("plot_calibration k=10", p_k10)
  expect_doppelganger("plot_calibration k=20", p_k20)
})
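
# ---------------------------------------------------------------------------
# The tests above lean on helpers (`suppress_calibration_warnings()`,
# `expect_halfmoon_error()`, `expect_halfmoon_warning()`, and
# `expect_doppelganger()`) that are assumed to live in a testthat helper file
# (e.g., helper-halfmoon.R), which is not shown here. The guarded definitions
# below are a minimal sketch of what those helpers might look like; the
# condition classes and defaults are assumptions, not halfmoon's confirmed
# API, and the real helper file takes precedence wherever it exists.

if (!exists("suppress_calibration_warnings")) {
  suppress_calibration_warnings <- function(expr) {
    # Muffle halfmoon's classed data warnings so expected-noise tests stay
    # quiet; any other warning still surfaces. (Assumed warning class:
    # "halfmoon_data_warning".)
    withCallingHandlers(
      expr,
      halfmoon_data_warning = function(w) invokeRestart("muffleWarning")
    )
  }
}

if (!exists("expect_halfmoon_error")) {
  expect_halfmoon_error <- function(expr, class = "halfmoon_error") {
    # Assert a classed halfmoon error; the default class is an assumption
    testthat::expect_error(expr, class = class)
  }
}

if (!exists("expect_halfmoon_warning")) {
  expect_halfmoon_warning <- function(expr, class = "halfmoon_data_warning") {
    # Assert a classed halfmoon warning; the default class is an assumption
    testthat::expect_warning(expr, class = class)
  }
}

if (!exists("expect_doppelganger")) {
  expect_doppelganger <- function(title, fig) {
    # Visual regression snapshot, skipped when vdiffr is unavailable
    testthat::skip_if_not_installed("vdiffr")
    vdiffr::expect_doppelganger(title, fig)
  }
}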