# Tests for multi-parameter derivative functions: gradient, hessian, jacobian.
#
# Each test defines a function with known analytical derivatives and
# verifies the AD results.
#
# IMPORTANT: Functions must keep dual parameters as duals throughout.
# Since base R's sum() doesn't dispatch on dual objects, compute sums
# algebraically (e.g., n*mu^2 - 2*mu*sum(data)) rather than element-wise
# (sum((data - mu)^2)).

tol_deriv <- 1e-6

# -- Normal distribution (mu only) --------------------------------------------

test_that("Normal (mu only): gradient and hessian", {
  data <- c(1, 2, 3, 4, 5)
  n <- length(data)
  sum_x <- sum(data)
  sum_x2 <- sum(data^2)

  # Log-likelihood keeping mu as dual:
  # -n/2*log(2pi) - 0.5 * (sum(x^2) - 2*mu*sum(x) + n*mu^2)
  f <- function(x) {
    mu <- x[1]
    -n/2 * log(2 * pi) - 0.5 * (sum_x2 - 2 * mu * sum_x + n * mu^2)
  }

  theta <- c(2)

  g <- gradient(f, theta)
  # Analytical: d/dmu = sum(x) - n*mu = 15 - 10 = 5
  expect_equal(g[1], sum_x - n * theta[1], tolerance = tol_deriv)

  H <- hessian(f, theta)
  # Analytical: d^2/dmu^2 = -n
  expect_equal(H[1,1], -n, tolerance = tol_deriv)

  # observed_information is just -hessian
  expect_equal(-H[1,1], n, tolerance = tol_deriv)
})

# -- Normal distribution (mu and sigma) ----------------------------------------

test_that("Normal (mu, sigma): gradient and hessian", {
  data <- c(1, 2, 3, 4, 5)
  n <- length(data)
  sum_x <- sum(data)
  sum_x2 <- sum(data^2)

  # sum((x-mu)^2) = sum(x^2) - 2*mu*sum(x) + n*mu^2
  f <- function(x) {
    mu <- x[1]
    sigma <- x[2]
    ss <- sum_x2 - 2 * mu * sum_x + n * mu^2  # sum of squared residuals
    -n/2 * log(2 * pi) - n * log(sigma) - 0.5 * ss / sigma^2
  }

  mu0 <- 3
  sigma0 <- sqrt(2)
  theta <- c(mu0, sigma0)

  g <- gradient(f, theta)
  H <- hessian(f, theta)

  # Analytical gradient:
  ss0 <- sum((data - mu0)^2)
  expected_g1 <- sum(data - mu0) / sigma0^2
  expected_g2 <- -n / sigma0 + ss0 / sigma0^3
  expect_equal(g[1], expected_g1, tolerance = tol_deriv)
  expect_equal(g[2], expected_g2, tolerance = tol_deriv)

  # Analytical Hessian:
  expect_equal(H[1,1], -n/sigma0^2, tolerance = tol_deriv)
  expect_equal(H[1,2], -2*sum(data-mu0)/sigma0^3, tolerance = tol_deriv)
  expect_equal(H[2,1], H[1,2], tolerance = 1e-10)
  expect_equal(H[2,2], n/sigma0^2 - 3*ss0/sigma0^4, tolerance = tol_deriv)
})

# -- Poisson distribution (lambda) --------------------------------------------

test_that("Poisson (lambda): gradient and hessian", {
  data <- c(1, 3, 2, 0, 4, 2)
  n <- length(data)
  sum_x <- sum(data)

  f <- function(x) {
    lambda <- x[1]
    sum_x * log(lambda) - n * lambda - sum(lfactorial(data))
  }

  lambda0 <- 2
  theta <- c(lambda0)

  g <- gradient(f, theta)
  H <- hessian(f, theta)

  expect_equal(g[1], sum_x/lambda0 - n, tolerance = tol_deriv)
  expect_equal(H[1,1], -sum_x/lambda0^2, tolerance = tol_deriv)
})

# -- Gamma distribution (shape, rate) -----------------------------------------

test_that("Gamma (shape, rate): gradient matches numerical", {
  data <- c(1.5, 2.1, 0.8, 3.2, 1.9)
  n <- length(data)
  sum_x <- sum(data)
  sum_log_x <- sum(log(data))

  f <- function(x) {
    alpha <- x[1]
    beta <- x[2]
    n * alpha * log(beta) - n * lgamma(alpha) +
      (alpha - 1) * sum_log_x - beta * sum_x
  }

  theta <- c(2, 1.5)

  g <- gradient(f, theta)
  H <- hessian(f, theta)

  # Compare against numerical derivatives of the same function
  num_f <- function(t) {
    n * t[1] * log(t[2]) - n * lgamma(t[1]) +
      (t[1] - 1) * sum_log_x - t[2] * sum_x
  }
  num_g <- numerical_gradient(num_f, theta)
  num_H <- numerical_hessian(num_f, theta)

  expect_equal(g, num_g, tolerance = tol_deriv)
  expect_equal(H, num_H, tolerance = 1e-4)
})

# -- jacobian -----------------------------------------------------------------

test_that("jacobian of vector-valued function", {
  # f: R^2 -> R^2, f(a, b) = (a*b, a^2 + b)
  # J = [[b, a], [2a, 1]]
  f <- function(x) {
    a <- x[1]; b <- x[2]
    list(a * b, a^2 + b)
  }

  theta <- c(3, 4)
  J <- jacobian(f, theta)

  expect_equal(dim(J), c(2, 2))
  expect_equal(J[1, 1], 4, tolerance = 1e-10)   # df1/da = b = 4
  expect_equal(J[1, 2], 3, tolerance = 1e-10)   # df1/db = a = 3
  expect_equal(J[2, 1], 6, tolerance = 1e-10)   # df2/da = 2a = 6
  expect_equal(J[2, 2], 1, tolerance = 1e-10)   # df2/db = 1
})

test_that("jacobian of R^2 -> R^3 function", {
  # f(a, b) = (a*b, a^2, sin(b))
  # J = [[b, a], [2a, 0], [0, cos(b)]]
  f <- function(x) {
    a <- x[1]; b <- x[2]
    list(a * b, a^2, sin(b))
  }

  a0 <- 2; b0 <- pi/4
  J <- jacobian(f, c(a0, b0))

  expect_equal(dim(J), c(3, 2))
  expect_equal(J[1, 1], b0, tolerance = 1e-10)
  expect_equal(J[1, 2], a0, tolerance = 1e-10)
  expect_equal(J[2, 1], 2 * a0, tolerance = 1e-10)
  expect_equal(J[2, 2], 0, tolerance = 1e-10)
  expect_equal(J[3, 1], 0, tolerance = 1e-10)
  expect_equal(J[3, 2], cos(b0), tolerance = 1e-10)
})

test_that("jacobian of scalar function returns gradient vector", {
  f <- function(x) x[1]^2 + x[2]^2

  J <- jacobian(f, c(3, 4))

  # D returns a vector (not 1xp matrix) for scalar-valued f
  expect_equal(length(J), 2)
  expect_equal(J[1], 6, tolerance = 1e-10)
  expect_equal(J[2], 8, tolerance = 1e-10)
})

test_that("jacobian matches numerical for non-linear R^3 -> R^2", {
  # f(a, b, c) = (exp(a*b), sin(b + c))
  f <- function(x) {
    a <- x[1]; b <- x[2]; cc <- x[3]
    list(exp(a * b), sin(b + cc))
  }

  theta <- c(0.5, 1.0, 0.3)
  J <- jacobian(f, theta)

  # Numerical Jacobian via central differences
  eps <- 1e-7
  p <- length(theta)
  J_num <- matrix(0, nrow = 2, ncol = p)
  f_num <- function(t) c(exp(t[1] * t[2]), sin(t[2] + t[3]))
  for (i in seq_len(p)) {
    tp <- tm <- theta
    tp[i] <- tp[i] + eps
    tm[i] <- tm[i] - eps
    J_num[, i] <- (f_num(tp) - f_num(tm)) / (2 * eps)
  }

  expect_equal(J, J_num, tolerance = 1e-6)
})

# -- Works with optim-style function (vector indexing) -------------------------

test_that("gradient and hessian work with optim-style functions", {
  data <- c(2.1, 3.5, 2.8, 4.0, 3.2)
  n <- length(data)
  sum_x <- sum(data)
  sum_x2 <- sum(data^2)

  f <- function(x) {
    mu <- x[1]
    -0.5 * (sum_x2 - 2 * mu * sum_x + n * mu^2)
  }

  g <- gradient(f, c(3))
  expect_true(is.numeric(g))
  expect_equal(length(g), 1)

  H <- hessian(f, c(3))
  expect_true(is.matrix(H))
  expect_equal(dim(H), c(1, 1))
  expect_equal(H[1,1], -n, tolerance = tol_deriv)
})

# -- Multi-parameter indexing --------------------------------------------------

test_that("multi-parameter function with vector indexing", {
  f <- function(x) {
    a <- x[1]
    b <- x[2]
    -(a - 1)^2 - 2 * (b - 3)^2
  }

  theta <- c(0, 0)
  g <- gradient(f, theta)
  # d/da = -2*(a-1) = 2 at a=0
  # d/db = -4*(b-3) = 12 at b=0
  expect_equal(g[1], 2, tolerance = tol_deriv)
  expect_equal(g[2], 12, tolerance = tol_deriv)

  H <- hessian(f, theta)
  expect_equal(H[1,1], -2, tolerance = tol_deriv)
  expect_equal(H[2,2], -4, tolerance = tol_deriv)
  expect_equal(H[1,2], 0, tolerance = tol_deriv)
  expect_equal(H[2,1], 0, tolerance = tol_deriv)
})

# -- Exponential family --------------------------------------------------------

test_that("Exponential distribution: gradient and hessian", {
  data <- c(0.5, 1.2, 0.3, 2.1, 0.8)
  n <- length(data)
  sum_x <- sum(data)

  f <- function(x) {
    rate <- x[1]
    n * log(rate) - rate * sum_x
  }

  rate0 <- 1.5
  theta <- c(rate0)

  g <- gradient(f, theta)
  H <- hessian(f, theta)

  expect_equal(g[1], n/rate0 - sum_x, tolerance = tol_deriv)
  expect_equal(H[1,1], -n/rate0^2, tolerance = tol_deriv)
})

# == Vector-gradient optimization tests (0.4.0) ================================

# -- gradient() matches numerical gradient for all models -----------------------

test_that("1-pass gradient matches numerical_gradient: Normal(mu,sigma)", {
  data <- c(1, 2, 3, 4, 5)
  n <- length(data)
  sum_x <- sum(data)
  sum_x2 <- sum(data^2)

  f <- function(x) {
    mu <- x[1]
    sigma <- x[2]
    ss <- sum_x2 - 2 * mu * sum_x + n * mu^2
    -n/2 * log(2 * pi) - n * log(sigma) - 0.5 * ss / sigma^2
  }

  theta <- c(3, sqrt(2))
  g <- gradient(f, theta)
  num_g <- numerical_gradient(f, theta)
  expect_equal(g, num_g, tolerance = tol_deriv)
})

test_that("1-pass gradient matches numerical_gradient: Poisson(lambda)", {
  data <- c(1, 3, 2, 0, 4, 2)
  n <- length(data)
  sum_x <- sum(data)

  f <- function(x) {
    lambda <- x[1]
    sum_x * log(lambda) - n * lambda - sum(lfactorial(data))
  }

  theta <- c(2)
  g <- gradient(f, theta)
  num_g <- numerical_gradient(f, theta)
  expect_equal(g, num_g, tolerance = tol_deriv)
})

test_that("1-pass gradient matches numerical_gradient: Gamma(shape,rate)", {
  data <- c(1.5, 2.1, 0.8, 3.2, 1.9)
  n <- length(data)
  sum_x <- sum(data)
  sum_log_x <- sum(log(data))

  f <- function(x) {
    alpha <- x[1]
    beta <- x[2]
    n * alpha * log(beta) - n * lgamma(alpha) +
      (alpha - 1) * sum_log_x - beta * sum_x
  }

  theta <- c(2, 1.5)
  g <- gradient(f, theta)
  num_g <- numerical_gradient(f, theta)
  expect_equal(g, num_g, tolerance = tol_deriv)
})

# -- hessian() symmetry and numerical accuracy --------------------------------

test_that("p-pass hessian is symmetric: Normal(mu,sigma)", {
  data <- c(1, 2, 3, 4, 5)
  n <- length(data)
  sum_x <- sum(data)
  sum_x2 <- sum(data^2)

  f <- function(x) {
    mu <- x[1]
    sigma <- x[2]
    ss <- sum_x2 - 2 * mu * sum_x + n * mu^2
    -n/2 * log(2 * pi) - n * log(sigma) - 0.5 * ss / sigma^2
  }

  theta <- c(3, sqrt(2))
  H <- hessian(f, theta)

  expect_equal(H[1,2], H[2,1], tolerance = 1e-10)
})

test_that("p-pass hessian matches numerical_hessian: Normal(mu,sigma)", {
  data <- c(1, 2, 3, 4, 5)
  n <- length(data)
  sum_x <- sum(data)
  sum_x2 <- sum(data^2)

  f <- function(x) {
    mu <- x[1]
    sigma <- x[2]
    ss <- sum_x2 - 2 * mu * sum_x + n * mu^2
    -n/2 * log(2 * pi) - n * log(sigma) - 0.5 * ss / sigma^2
  }

  theta <- c(3, sqrt(2))
  H <- hessian(f, theta)
  num_H <- numerical_hessian(f, theta)
  expect_equal(H, num_H, tolerance = 1e-4)
})

test_that("p-pass hessian matches numerical_hessian: multi-param quadratic", {
  f <- function(x) {
    a <- x[1]
    b <- x[2]
    -(a - 1)^2 - 2 * (b - 3)^2 - 0.5 * a * b
  }

  theta <- c(0, 0)
  H <- hessian(f, theta)
  num_H <- numerical_hessian(f, theta)

  expect_equal(H, num_H, tolerance = 1e-4)
  # Cross-derivative should be symmetric
  expect_equal(H[1,2], H[2,1], tolerance = 1e-10)
  # Diagonal entries: d2f/da2 = -2, d2f/db2 = -4
  expect_equal(H[1,1], -2, tolerance = tol_deriv)
  expect_equal(H[2,2], -4, tolerance = tol_deriv)
  # Off-diagonal: d2f/dadb = -0.5
  expect_equal(H[1,2], -0.5, tolerance = tol_deriv)
})

test_that("p-pass hessian matches numerical: 3-parameter model", {
  f <- function(x) {
    a <- x[1]; b <- x[2]; c <- x[3]
    -(a - 1)^2 - 2 * (b - 2)^2 - 3 * (c - 3)^2 + a * b - 0.5 * b * c
  }

  theta <- c(1, 2, 3)
  H <- hessian(f, theta)
  num_H <- numerical_hessian(f, theta)

  expect_equal(H, num_H, tolerance = 1e-4)
  # Verify symmetry
  expect_equal(H[1,2], H[2,1], tolerance = 1e-10)
  expect_equal(H[1,3], H[3,1], tolerance = 1e-10)
  expect_equal(H[2,3], H[3,2], tolerance = 1e-10)
})

# -- Edge case: constant function returns zeros --------------------------------

test_that("gradient and hessian handle constant function", {
  f <- function(x) 42

  theta <- c(1, 2)

  g <- gradient(f, theta)
  expect_equal(g, c(0, 0))

  H <- hessian(f, theta)
  expect_equal(H, matrix(0, 2, 2))
})

# -- Edge case: single parameter (p=1) ----------------------------------------

test_that("gradient and hessian work with single parameter", {
  f <- function(x) {
    a <- x[1]
    -0.5 * (a - 3)^2
  }

  theta <- c(1)
  g <- gradient(f, theta)
  H <- hessian(f, theta)

  expect_equal(g[1], 2, tolerance = tol_deriv)  # d/da = -(a-3) = 2 at a=1
  expect_equal(H[1,1], -1, tolerance = tol_deriv)
})

# -- -hessian() gives observed information -------------------------------------

test_that("-hessian() gives observed information", {
  data <- c(1, 2, 3, 4, 5)
  n <- length(data)
  sum_x <- sum(data)
  sum_x2 <- sum(data^2)

  f <- function(x) {
    mu <- x[1]
    sigma <- x[2]
    ss <- sum_x2 - 2 * mu * sum_x + n * mu^2
    -n/2 * log(2 * pi) - n * log(sigma) - 0.5 * ss / sigma^2
  }

  theta <- c(3, sqrt(2))
  I <- -hessian(f, theta)
  H <- hessian(f, theta)

  expect_equal(I, -H, tolerance = 1e-10)
})

# == D operator tests ==========================================================

tol_D1 <- 1e-10   # first-order (exact AD)
tol_D2 <- 1e-10   # second-order
tol_D3 <- 1e-10   # third-order (AD still exact; only numerical comparison is loose)
tol_num <- 1e-5   # for comparing against numerical finite differences

# -- D returns a function ------------------------------------------------------

test_that("D(f) returns a function", {
  f <- function(x) x[1]^2 + x[2]^2
  g <- D(f)
  expect_true(is.function(g))
  result <- g(c(3, 4))
  expect_equal(result, c(6, 8), tolerance = tol_D1)
})

test_that("D(f, order=2) returns a function when x is NULL", {
  f <- function(x) x[1]^2 + x[2]^2
  g <- D(f, order = 2)
  expect_true(is.function(g))
  result <- g(c(3, 4))
  expect_equal(result, diag(2, 2), tolerance = tol_D2)
})

test_that("D rejects order < 1", {
  expect_error(D(sin, order = 0), "positive integer")
})

# -- First-order, scalar f: matches gradient -----------------------------------

test_that("D(f, x) matches gradient for quadratic", {
  f <- function(x) x[1]^2 * x[2] + x[2]^3
  x <- c(3, 4)
  expect_equal(D(f, x), gradient(f, x), tolerance = tol_D1)
})

test_that("D(f, x) matches gradient for exponential", {
  f <- function(x) exp(x[1] + x[2])
  x <- c(1, 2)
  expect_equal(D(f, x), gradient(f, x), tolerance = tol_D1)
})

test_that("D(f, x) matches gradient for trigonometric", {
  f <- function(x) sin(x[1]) * cos(x[2])
  x <- c(pi/3, pi/4)
  expect_equal(D(f, x), gradient(f, x), tolerance = tol_D1)
})

test_that("D(f, x) matches numerical gradient", {
  f <- function(x) x[1]^2 * x[2] + x[2]^3
  x <- c(3, 4)
  expect_equal(D(f, x), numerical_gradient(f, x), tolerance = tol_num)
})

# -- First-order, vector f: matches old jacobian results -----------------------

test_that("D(f, x) produces Jacobian for R^2 -> R^2", {
  f <- function(x) {
    a <- x[1]; b <- x[2]
    list(a * b, a^2 + b)
  }
  J <- D(f, c(3, 4))
  expect_equal(dim(J), c(2, 2))
  expect_equal(J[1, 1], 4, tolerance = tol_D1)   # df1/da = b = 4
  expect_equal(J[1, 2], 3, tolerance = tol_D1)   # df1/db = a = 3
  expect_equal(J[2, 1], 6, tolerance = tol_D1)   # df2/da = 2a = 6
  expect_equal(J[2, 2], 1, tolerance = tol_D1)   # df2/db = 1
})

test_that("D(f, x) produces Jacobian for R^2 -> R^3", {
  f <- function(x) {
    a <- x[1]; b <- x[2]
    list(a * b, a^2, sin(b))
  }
  a0 <- 2; b0 <- pi/4
  J <- D(f, c(a0, b0))
  expect_equal(dim(J), c(3, 2))
  expect_equal(J[1, 1], b0, tolerance = tol_D1)
  expect_equal(J[1, 2], a0, tolerance = tol_D1)
  expect_equal(J[2, 1], 2 * a0, tolerance = tol_D1)
  expect_equal(J[2, 2], 0, tolerance = tol_D1)
  expect_equal(J[3, 1], 0, tolerance = tol_D1)
  expect_equal(J[3, 2], cos(b0), tolerance = tol_D1)
})

test_that("D(f, x) for scalar f returns vector (not matrix)", {
  f <- function(x) x[1]^2 + x[2]^2
  result <- D(f, c(3, 4))
  expect_true(is.numeric(result))
  expect_equal(length(result), 2)
  expect_null(dim(result))  # vector, not 1-row matrix
  expect_equal(result, c(6, 8), tolerance = tol_D1)
})

# -- Second-order, scalar f: matches hessian -----------------------------------

test_that("D(f, x, order=2) matches hessian for quadratic", {
  f <- function(x) -(x[1] - 3)^2 - (x[2] - 5)^2
  x <- c(1, 2)
  expect_equal(D(f, x, order = 2), hessian(f, x), tolerance = tol_D2)
})

test_that("D(f, x, order=2) matches hessian for cross-term", {
  f <- function(x) x[1]^2 * x[2] + x[2]^3
  x <- c(3, 4)
  H <- D(f, x, order = 2)
  expect_equal(dim(H), c(2, 2))
  # d2f/da2 = 2*b = 8
  expect_equal(H[1, 1], 2 * x[2], tolerance = tol_D2)
  # d2f/dadb = 2*a = 6
  expect_equal(H[1, 2], 2 * x[1], tolerance = tol_D2)
  expect_equal(H[2, 1], 2 * x[1], tolerance = tol_D2)  # symmetry
  # d2f/db2 = 6*b = 24
  expect_equal(H[2, 2], 6 * x[2], tolerance = tol_D2)
})

test_that("D(f, x, order=2) matches numerical Hessian", {
  f <- function(x) x[1]^2 * x[2] + x[2]^3
  x <- c(3, 4)
  expect_equal(D(f, x, order = 2), numerical_hessian(f, x), tolerance = tol_num)
})

# -- Second-order via composition: D(D(f)) == D(f, order=2) -------------------

test_that("D(D(f))(x) equals D(f, x, order=2)", {
  f <- function(x) x[1]^2 * x[2]
  x <- c(3, 4)
  Df <- D(f)
  DDf <- D(Df)
  expect_equal(DDf(x), D(f, x, order = 2), tolerance = tol_D2)
})

# -- Third-order ---------------------------------------------------------------

test_that("D(f, x, order=3) for x[1]^4 at x=c(2,1)", {
  f <- function(x) x[1]^4
  x <- c(2, 1)
  T3 <- D(f, x, order = 3)
  # f = x1^4, only x1 derivatives are non-zero
  # d3f/dx1^3 = 24*x1 = 48
  # All other entries are 0 (no x2 dependence)
  expect_equal(dim(T3), c(2, 2, 2))
  expect_equal(T3[1, 1, 1], 48, tolerance = tol_D3)  # d3f/dx1^3
  # All entries involving x2 should be zero
  expect_equal(T3[2, 1, 1], 0, tolerance = tol_D3)
  expect_equal(T3[1, 2, 1], 0, tolerance = tol_D3)
  expect_equal(T3[1, 1, 2], 0, tolerance = tol_D3)
  expect_equal(T3[2, 2, 1], 0, tolerance = tol_D3)
  expect_equal(T3[2, 1, 2], 0, tolerance = tol_D3)
  expect_equal(T3[1, 2, 2], 0, tolerance = tol_D3)
  expect_equal(T3[2, 2, 2], 0, tolerance = tol_D3)
})

test_that("D(f, x, order=3) for x[1]^2 * x[2] gives correct tensor", {
  f <- function(x) x[1]^2 * x[2]
  x <- c(3, 4)
  T3 <- D(f, x, order = 3)
  expect_equal(dim(T3), c(2, 2, 2))
  # d3f/dx1^2 dx2 = 2 (in all permutations of indices)
  # d3f/dx1^3 = 0
  # d3f/dx2^3 = 0
  expect_equal(T3[1, 1, 1], 0, tolerance = tol_D3)
  expect_equal(T3[1, 1, 2], 2, tolerance = tol_D3)
  expect_equal(T3[1, 2, 1], 2, tolerance = tol_D3)
  expect_equal(T3[2, 1, 1], 2, tolerance = tol_D3)
  expect_equal(T3[2, 2, 2], 0, tolerance = tol_D3)
  expect_equal(T3[2, 2, 1], 0, tolerance = tol_D3)
  expect_equal(T3[2, 1, 2], 0, tolerance = tol_D3)
  expect_equal(T3[1, 2, 2], 0, tolerance = tol_D3)
})

# -- Fourth-order --------------------------------------------------------------

test_that("D(f, x, order=4) for exp(x[1]) at x=c(1)", {
  f <- function(x) exp(x[1])
  x <- c(1)
  T4 <- D(f, x, order = 4)
  # exp has all derivatives equal to exp(x)
  # n=1: 4th-order tensor is (1,1,1,1) array
  expect_equal(dim(T4), c(1, 1, 1, 1))
  expect_equal(T4[1, 1, 1, 1], exp(1), tolerance = tol_D3)
})

test_that("D(f, x, order=4) for x[1]^5 at x=c(2)", {
  f <- function(x) x[1]^5
  x <- c(2)
  T4 <- D(f, x, order = 4)
  # f^(4) = 120*x = 240
  expect_equal(dim(T4), c(1, 1, 1, 1))
  expect_equal(T4[1, 1, 1, 1], 240, tolerance = tol_D3)
})

# -- Second-order, vector f: produces (m,n,n) tensor ---------------------------

test_that("D(f, x, order=2) for f: R^2 -> R^2 gives (2,2,2) array", {
  f <- function(x) list(x[1]^2 * x[2], x[1] + x[2]^3)
  x <- c(2, 3)
  T2 <- D(f, x, order = 2)
  expect_equal(dim(T2), c(2, 2, 2))
  # Component 1: x1^2 * x2
  #   d2/dx1^2 = 2*x2 = 6
  #   d2/dx1dx2 = 2*x1 = 4
  #   d2/dx2^2 = 0
  expect_equal(T2[1, 1, 1], 6, tolerance = tol_D2)
  expect_equal(T2[1, 1, 2], 4, tolerance = tol_D2)
  expect_equal(T2[1, 2, 1], 4, tolerance = tol_D2)
  expect_equal(T2[1, 2, 2], 0, tolerance = tol_D2)
  # Component 2: x1 + x2^3
  #   d2/dx1^2 = 0
  #   d2/dx1dx2 = 0
  #   d2/dx2^2 = 6*x2 = 18
  expect_equal(T2[2, 1, 1], 0, tolerance = tol_D2)
  expect_equal(T2[2, 1, 2], 0, tolerance = tol_D2)
  expect_equal(T2[2, 2, 1], 0, tolerance = tol_D2)
  expect_equal(T2[2, 2, 2], 18, tolerance = tol_D2)
})

# -- Edge cases ----------------------------------------------------------------

test_that("D of constant function gives zero tensor", {
  f <- function(x) 42
  x <- c(1, 2)
  expect_equal(D(f, x), c(0, 0))
  expect_equal(D(f, x, order = 2), matrix(0, 2, 2))
})

test_that("D of linear function gives constant Jacobian, zero Hessian", {
  f <- function(x) 3 * x[1] + 5 * x[2]
  x <- c(1, 2)
  expect_equal(D(f, x), c(3, 5), tolerance = tol_D1)
  expect_equal(D(f, x, order = 2), matrix(0, 2, 2))
})

test_that("D works with n=1 (scalar input)", {
  f <- function(x) x[1]^3
  expect_equal(D(f, c(2)), 12, tolerance = tol_D1)        # 3*x^2
  # n=1 higher-order: returns (1,1) matrix, (1,1,1) array, etc.
  expect_equal(D(f, c(2), order = 2)[1, 1], 12, tolerance = tol_D2)  # 6*x
  expect_equal(D(f, c(2), order = 3)[1, 1, 1], 6, tolerance = tol_D3)   # 6
})

test_that("D works with n=1 for exp", {
  f <- function(x) exp(x[1])
  expect_equal(D(f, c(1)), exp(1), tolerance = tol_D1)
  expect_equal(D(f, c(1), order = 2)[1, 1], exp(1), tolerance = tol_D2)
})

# -- Numerical verification for gradient and hessian via D ---------------------

test_that("gradient via D matches numerical for Normal(mu,sigma)", {
  data <- c(1, 2, 3, 4, 5)
  n <- length(data)
  sum_x <- sum(data)
  sum_x2 <- sum(data^2)

  f <- function(x) {
    mu <- x[1]; sigma <- x[2]
    ss <- sum_x2 - 2 * mu * sum_x + n * mu^2
    -n/2 * log(2 * pi) - n * log(sigma) - 0.5 * ss / sigma^2
  }

  theta <- c(3, sqrt(2))
  expect_equal(gradient(f, theta), numerical_gradient(f, theta), tolerance = tol_num)
})

test_that("hessian via D matches numerical for Normal(mu,sigma)", {
  data <- c(1, 2, 3, 4, 5)
  n <- length(data)
  sum_x <- sum(data)
  sum_x2 <- sum(data^2)

  f <- function(x) {
    mu <- x[1]; sigma <- x[2]
    ss <- sum_x2 - 2 * mu * sum_x + n * mu^2
    -n/2 * log(2 * pi) - n * log(sigma) - 0.5 * ss / sigma^2
  }

  theta <- c(3, sqrt(2))
  expect_equal(hessian(f, theta), numerical_hessian(f, theta), tolerance = 1e-4)
})

test_that("hessian via D is symmetric", {
  f <- function(x) x[1]^2 * x[2] + exp(x[1] * x[2])
  x <- c(1, 0.5)
  H <- hessian(f, x)
  expect_equal(H[1, 2], H[2, 1], tolerance = 1e-12)
})