# Tests for integration of gradient() and hessian() with R's optimizers. # ============================================================================ # gradient() with optim() # ============================================================================ test_that("gradient() works as optim() gr argument on a quadratic", { # Minimize f(x) = (x - 3)^2: minimum at x = 3 f_ll <- function(theta) { x <- theta[1] -(x - 3)^2 # negative because we'll negate for minimization } nll <- function(theta) -f_ll(theta) ngr <- function(theta) -gradient(f_ll, theta) fit <- optim(0, fn = nll, gr = ngr, method = "BFGS") expect_equal(fit$par, 3, tolerance = 1e-6) expect_equal(fit$convergence, 0) }) test_that("gradient() with optim() on 2D quadratic", { # Minimize f(x,y) = (x-1)^2 + (y-2)^2 f_ll <- function(theta) { -(theta[1] - 1)^2 - (theta[2] - 2)^2 } nll <- function(theta) -f_ll(theta) ngr <- function(theta) -gradient(f_ll, theta) fit <- optim(c(0, 0), fn = nll, gr = ngr, method = "BFGS") expect_equal(fit$par, c(1, 2), tolerance = 1e-6) }) test_that("negated gradient gives correct minimization direction", { # gradient() gives the gradient of the log-likelihood (ascent direction) # -gradient() gives the descent direction for minimization f_ll <- function(theta) { mu <- theta[1] -0.5 * (5 - mu)^2 } s <- gradient(f_ll, 3) # d/dmu = (5 - 3) = 2 (positive: ascend toward 5) expect_equal(s[1], 2) neg_s <- -gradient(f_ll, 3) expect_equal(neg_s[1], -2) # negative: descend toward 5 from below }) # ============================================================================ # gradient() + hessian() with nlminb() # ============================================================================ test_that("gradient() and hessian() work with nlminb()", { f_ll <- function(theta) { -(theta[1] - 3)^2 - 2 * (theta[2] + 1)^2 } nll <- function(theta) -f_ll(theta) ngr <- function(theta) -gradient(f_ll, theta) nhess <- function(theta) -hessian(f_ll, theta) fit <- nlminb(c(0, 0), objective = nll, gradient = ngr, hessian = nhess) expect_equal(fit$par, c(3, -1), tolerance = 1e-6) expect_equal(fit$convergence, 0) }) # ============================================================================ # AD gradient matches optim's numerical gradient (same solution) # ============================================================================ test_that("optim with AD gradient matches optim without gradient", { set.seed(42) data_norm <- rnorm(50, mean = 5, sd = 2) n <- length(data_norm) sum_x <- sum(data_norm) sum_x2 <- sum(data_norm^2) ll <- function(theta) { mu <- theta[1]; sigma <- theta[2] -n * log(sigma) - (1 / (2 * sigma^2)) * (sum_x2 - 2 * mu * sum_x + n * mu^2) } nll <- function(theta) -ll(theta) ngr <- function(theta) -gradient(ll, theta) # Use L-BFGS-B with lower bound on sigma to prevent log(negative) NaN warnings start <- c(3, 1.5) fit_no_gr <- optim(start, fn = nll, method = "L-BFGS-B", lower = c(-Inf, 0.01)) fit_ad_gr <- optim(start, fn = nll, gr = ngr, method = "L-BFGS-B", lower = c(-Inf, 0.01)) # Both should converge to the same point expect_equal(fit_ad_gr$par, fit_no_gr$par, tolerance = 1e-5) expect_equal(fit_ad_gr$convergence, 0) }) # ============================================================================ # Standard errors from -hessian() match analytical values # ============================================================================ test_that("SEs from -hessian() match analytical for Normal(mu, sigma)", { fix <- make_normal_fixture() n <- fix$n; sum_x <- fix$sum_x; sum_x2 <- fix$sum_x2 ll <- function(theta) { mu <- theta[1]; sigma <- theta[2] -n * log(sigma) - (1 / (2 * sigma^2)) * (sum_x2 - 2 * mu * sum_x + n * mu^2) } # Find MLE mle <- c(fix$mle_mu, fix$mle_sigma) # Observed information at MLE obs_info <- -hessian(ll, mle) vcov <- solve(obs_info) se_ad <- sqrt(diag(vcov)) # Analytical SEs at the MLE: # Var(mu_hat) = sigma^2/n, Var(sigma_hat) = sigma^2/(2n) se_analytical <- c( fix$mle_sigma / sqrt(n), fix$mle_sigma / sqrt(2 * n) ) expect_equal(se_ad, se_analytical, tolerance = 1e-6) }) # ============================================================================ # Poisson MLE via optim() matches sum(x)/n # ============================================================================ test_that("Poisson MLE via optim + AD matches analytical", { set.seed(123) data_pois <- rpois(100, lambda = 3.5) n_pois <- length(data_pois) sum_x_pois <- sum(data_pois) sum_lfact <- sum(lfactorial(data_pois)) ll <- function(theta) { lambda <- theta[1] sum_x_pois * log(lambda) - n_pois * lambda - sum_lfact } nll <- function(theta) -ll(theta) ngr <- function(theta) -gradient(ll, theta) fit <- optim(1, fn = nll, gr = ngr, method = "BFGS") analytical_mle <- sum_x_pois / n_pois expect_equal(fit$par, analytical_mle, tolerance = 1e-6) }) # ============================================================================ # Logistic regression: optim + AD matches glm() # ============================================================================ test_that("logistic regression via optim + AD matches glm()", { set.seed(7) n_lr <- 200 x1 <- rnorm(n_lr) x2 <- rnorm(n_lr) X <- cbind(1, x1, x2) beta_true <- c(-0.5, 1.2, -0.8) eta_true <- X %*% beta_true prob_true <- 1 / (1 + exp(-eta_true)) y <- rbinom(n_lr, 1, prob_true) ll_dual <- function(theta) { result <- dual_constant(0) for (i in seq_len(n_lr)) { eta_i <- theta[1] * X[i, 1] + theta[2] * X[i, 2] + theta[3] * X[i, 3] result <- result + y[i] * eta_i - log(1 + exp(eta_i)) } result } ll_num <- function(beta) { eta <- X %*% beta sum(y * eta - log(1 + exp(eta))) } nll <- function(theta) -ll_num(theta) ngr <- function(theta) -gradient(ll_dual, theta) fit_optim <- optim(c(0, 0, 0), fn = nll, gr = ngr, method = "BFGS") fit_glm <- glm(y ~ x1 + x2, family = binomial) expect_equal(fit_optim$par, unname(coef(fit_glm)), tolerance = 1e-4) }) test_that("logistic regression SEs from AD match glm()", { set.seed(7) n_lr <- 200 x1 <- rnorm(n_lr) x2 <- rnorm(n_lr) X <- cbind(1, x1, x2) beta_true <- c(-0.5, 1.2, -0.8) eta_true <- X %*% beta_true prob_true <- 1 / (1 + exp(-eta_true)) y <- rbinom(n_lr, 1, prob_true) ll_dual <- function(theta) { result <- dual_constant(0) for (i in seq_len(n_lr)) { eta_i <- theta[1] * X[i, 1] + theta[2] * X[i, 2] + theta[3] * X[i, 3] result <- result + y[i] * eta_i - log(1 + exp(eta_i)) } result } ll_num <- function(beta) { eta <- X %*% beta sum(y * eta - log(1 + exp(eta))) } nll <- function(theta) -ll_num(theta) ngr <- function(theta) -gradient(ll_dual, theta) fit_optim <- optim(c(0, 0, 0), fn = nll, gr = ngr, method = "BFGS") fit_glm <- glm(y ~ x1 + x2, family = binomial) # SEs from AD Hessian obs_info <- -hessian(ll_dual, fit_optim$par) vcov_ad <- solve(obs_info) se_ad <- sqrt(diag(vcov_ad)) se_glm <- summary(fit_glm)$coefficients[, "Std. Error"] expect_equal(se_ad, unname(se_glm), tolerance = 0.01) }) # ============================================================================ # nlminb converges for multi-parameter model # ============================================================================ test_that("nlminb with full AD converges for Normal(mu, sigma)", { fix <- make_normal_fixture() n <- fix$n; sum_x <- fix$sum_x; sum_x2 <- fix$sum_x2 ll <- function(theta) { mu <- theta[1]; sigma <- theta[2] -n * log(sigma) - (1 / (2 * sigma^2)) * (sum_x2 - 2 * mu * sum_x + n * mu^2) } nll <- function(theta) -ll(theta) ngr <- function(theta) -gradient(ll, theta) nhess <- function(theta) -hessian(ll, theta) fit <- nlminb(c(0, 1), objective = nll, gradient = ngr, hessian = nhess) expect_equal(fit$par[1], fix$mle_mu, tolerance = 1e-6) expect_equal(fit$par[2], fix$mle_sigma, tolerance = 1e-4) expect_equal(fit$convergence, 0) })