test_that("Binary internal calibration: posterior mean ranks with empirical logits; SD tracks binomial information", { skip_if_not_installed("RBesT") set.seed(310) dose_levels <- c(0, 1, 2, 4) # Use separated but not extreme probabilities to avoid separation artifacts p_true <- c(0.12, 0.20, 0.33, 0.52) theta_true <- qlogis(p_true) # Mild priors centered near truth to stabilize CI; still allows data to dominate prior_list <- setNames( lapply(seq_along(dose_levels), function(i) { RBesT::mixnorm(comp1 = c(w = 1, m = theta_true[i], s = 2.0), sigma = 2) }), c("Ctr", paste0("DG_", seq_along(dose_levels[-1]))) ) run_once <- function(n_per_dose) { dat <- data.frame( simulation = 1L, dose = rep(dose_levels, each = n_per_dose), response = unlist(lapply(p_true, function(p) rbinom(n_per_dose, 1, p))) ) attr(dat, "probability_scale") <- TRUE post <- getPosterior(prior_list = prior_list, data = dat, probability_scale = TRUE) smry <- summary(post) # Empirical proportions/logits with continuity correction to avoid +/-Inf p_hat <- tapply(dat$response, dat$dose, mean) eps <- 1/(n_per_dose + 1) p_hat_cc <- pmin(pmax(as.numeric(p_hat), eps), 1 - eps) logit_hat <- qlogis(p_hat_cc) list( mean = as.numeric(smry[, "mean"]), sd = as.numeric(smry[, "sd"]), logit_hat = as.numeric(logit_hat), p_hat = as.numeric(p_hat_cc) ) } # Compare two sample sizes to test shrinkage + scaling small_n <- 40 big_n <- 200 out_small <- run_once(small_n) out_big <- run_once(big_n) # --- Means: plausible & monotone (ranking) --- # rank(posterior mean) should match rank(empirical logit) expect_equal(order(out_small$mean), order(out_small$logit_hat)) expect_equal(order(out_big$mean), order(out_big$logit_hat)) # Means should be reasonably close to empirical logits (looser at small n) expect_equal(out_small$mean, out_small$logit_hat, tolerance = 0.60) expect_equal(out_big$mean, out_big$logit_hat, tolerance = 0.35) # --- SD sanity --- expect_true(all(is.finite(out_small$sd))) expect_true(all(is.finite(out_big$sd))) expect_true(all(out_small$sd > 0)) expect_true(all(out_big$sd > 0)) # --- Shrinkage with n (must hold in aggregate) --- expect_true(mean(out_big$sd) < mean(out_small$sd)) # --- Binomial information scaling on logit scale --- # For Bernoulli with logit link, Fisher info ~ n p (1-p), so sd(theta) ~ 1/sqrt(n p(1-p)) approx_sd <- function(n, p) 1 / sqrt(n * p * (1 - p)) # Compare ratio of SDs between small and big n: should be ~sqrt(big/small) up to slack ratio_obs <- out_small$sd / out_big$sd ratio_exp <- sqrt(big_n / small_n) * approx_sd(small_n, out_small$p_hat) / approx_sd(big_n, out_big$p_hat) # ratio_exp simplifies close to sqrt(big/small) if p's match; keep robust by computing anyway. # Require positive association between observed and expected scaling across doses expect_true(stats::cor(ratio_obs, ratio_exp) > 0.3) # Also require median ratio in a plausible band around sqrt(big/small) (~2.236) target <- sqrt(big_n / small_n) expect_true(stats::median(ratio_obs) > 0.9 * target) expect_true(stats::median(ratio_obs) < 1.6 * target) })