# Tests for ag_sequential, ag_dropout, ag_batch_norm, ag_embedding

src <- function() {
  if (!exists("ag_tensor", mode = "function")) {
    source("R/autograd.R")
    source("R/ag_layers.R")
  }
}
src()

# ============================================================================
# ag_sequential
# ============================================================================

test_that("ag_sequential: forward shape correct", {
  set.seed(1)
  model <- ag_sequential(
    ag_linear(4L, 8L, activation = "relu"),
    ag_linear(8L, 3L, activation = "softmax")
  )
  x <- ag_tensor(matrix(runif(4 * 16), 4, 16))
  out <- model$forward(x)
  expect_equal(dim(out$data), c(3L, 16L))
})

test_that("ag_sequential: parameters() collects all params", {
  model <- ag_sequential(
    ag_linear(4L, 8L),
    ag_linear(8L, 2L)
  )
  params <- model$parameters()
  # 2 layers × 2 params (W, b) = 4
  expect_equal(length(params), 4L)
  expect_true(all(sapply(params, function(p) p$requires_grad)))
})

test_that("ag_sequential: train/eval mode propagates to sub-layers", {
  model <- ag_sequential(
    ag_linear(4L, 8L),
    ag_dropout(0.5),
    ag_linear(8L, 2L)
  )
  ag_eval(model)
  expect_false(model$training)
  expect_false(model$layers[[2L]]$training)  # dropout
  ag_train(model)
  expect_true(model$training)
  expect_true(model$layers[[2L]]$training)
})

test_that("ag_sequential: backward reduces loss", {
  set.seed(2)
  model <- ag_sequential(
    ag_linear(4L, 8L, activation = "relu"),
    ag_linear(8L, 2L)
  )
  params <- model$parameters()
  opt <- optimizer_adam(params, lr = 0.01)
  n <- 32L
  x_mat <- matrix(runif(4 * n), 4, n)
  y_mat <- matrix(0, 2, n)
  y_mat[1L, ] <- 1  # all class 0
  losses <- numeric(10L)
  for (i in seq_len(10L)) {
    with_grad_tape({
      out <- model$forward(ag_tensor(x_mat))
      loss <- ag_mse_loss(out, y_mat)
    })
    grads <- backward(loss)
    opt$step(grads)
    opt$zero_grad()
    losses[i] <- loss$data[1L]
  }
  expect_lt(mean(losses[8:10]), mean(losses[1:3]))
})

# ============================================================================
# ag_dropout
# ============================================================================

test_that("ag_dropout: eval mode is identity", {
  set.seed(10)
  drop <- ag_dropout(0.5)
  ag_eval(drop)
  x <- ag_tensor(matrix(c(1, 2, 3, 4), 2, 2))
  out <- drop$forward(x)
  expect_equal(out$data, x$data)
})

test_that("ag_dropout: train mode changes some values", {
  set.seed(11)
  drop <- ag_dropout(0.5)  # training=TRUE by default
  x <- ag_tensor(matrix(rep(1, 100), 10, 10))
  out <- drop$forward(x)
  # Some values should be 0, some 2 (inverted dropout scale = 1/(1-0.5) = 2)
  vals <- as.numeric(out$data)
  expect_true(any(vals == 0))
  expect_true(any(abs(vals - 2) < 1e-9))
})

test_that("ag_dropout: rate=0 is always identity", {
  drop <- ag_dropout(0.0)
  x <- ag_tensor(matrix(1:4, 2, 2))
  expect_equal(drop$forward(x)$data, x$data)
})

test_that("ag_dropout: gradcheck passes in train mode", {
  set.seed(12)
  W <- ag_param(matrix(runif(8, -1, 1), 2, 4))
  x <- ag_tensor(matrix(runif(4, -1, 1), 4, 1))
  drop <- ag_dropout(0.0)  # rate=0 -> deterministic identity, safe for gradcheck
  ok <- ag_gradcheck(
    fn = function(ins) ag_mse_loss(drop$forward(ag_matmul(ins$W, x)), matrix(0, 2, 1)),
    inputs = list(W = W),
    atol = 1e-4,
    quiet = TRUE
  )
  expect_true(ok)
})
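
# ----------------------------------------------------------------------------
# Reference sketch for the dropout tests above. The "some values are 2" check
# relies on inverted dropout: at train time, surviving units are scaled by
# 1/(1 - rate) so the expected activation matches eval mode. ref_dropout is a
# minimal base-R illustration of that arithmetic only; it is an assumption
# about how ag_dropout behaves, not its actual implementation.
ref_dropout <- function(x, rate) {
  mask <- (matrix(runif(length(x)), nrow(x), ncol(x)) >= rate) * 1
  # kept entries become x / (1 - rate) (so 1 -> 2 at rate = 0.5); dropped -> 0
  x * mask / (1 - rate)
}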

# ============================================================================
# ag_batch_norm
# ============================================================================

test_that("ag_batch_norm: output has approx zero mean and unit var (train)", {
  set.seed(20)
  bn <- ag_batch_norm(4L)
  x <- ag_tensor(matrix(rnorm(4 * 64, mean = 5, sd = 3), 4, 64))
  out <- bn$forward(x)
  mu <- rowMeans(out$data)
  v <- rowMeans((out$data - mu)^2)
  # After BN (gamma=1, beta=0): mean ~ 0, var ~ 1
  expect_lt(max(abs(mu)), 1e-5)
  expect_equal(v, rep(1, 4), tolerance = 0.05)
})

test_that("ag_batch_norm: running stats update in train mode", {
  set.seed(21)
  bn <- ag_batch_norm(2L)
  expect_equal(as.numeric(bn$running_mean), c(0, 0))
  x <- ag_tensor(matrix(rep(c(10, 20), 4L), 2, 4))  # row means are 10 and 20
  bn$forward(x)
  # running_mean should have moved toward [10, 20]
  expect_gt(bn$running_mean[1L], 0)
  expect_gt(bn$running_mean[2L], 0)
})

test_that("ag_batch_norm: eval mode uses running stats", {
  set.seed(22)
  bn <- ag_batch_norm(2L)
  # Train for several batches to build up running stats
  for (i in 1:20) {
    x <- ag_tensor(matrix(rnorm(2 * 32, mean = c(5, -5), sd = 1), 2, 32))
    bn$forward(x)
  }
  running_mean_saved <- bn$running_mean
  # Switch to eval and verify output uses running stats (same input -> same output)
  ag_eval(bn)
  x1 <- ag_tensor(matrix(c(5, -5), 2, 1))
  x2 <- ag_tensor(matrix(c(5, -5), 2, 1))
  out1 <- bn$forward(x1)
  out2 <- bn$forward(x2)
  expect_equal(out1$data, out2$data)
  # running stats should NOT change in eval mode
  expect_equal(bn$running_mean, running_mean_saved)
})

test_that("ag_batch_norm: gradcheck passes (gamma/beta)", {
  set.seed(23)
  bn <- ag_batch_norm(3L)
  x_d <- matrix(rnorm(3 * 8), 3, 8)
  # gradcheck by temporarily swapping bn$gamma for the checked param tensor
  ok_gamma <- ag_gradcheck(
    fn = function(ins) {
      orig_g <- bn$gamma
      bn$gamma <- ins$gamma
      out <- bn$forward(ag_tensor(x_d))
      bn$gamma <- orig_g
      ag_sum(out)
    },
    inputs = list(gamma = bn$gamma),
    atol = 1e-3,
    quiet = TRUE
  )
  expect_true(ok_gamma)
})
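
# ----------------------------------------------------------------------------
# Reference sketch: the running-stats tests above assume the usual exponential
# moving average update for batch norm. The momentum value and exact form are
# assumptions about ag_batch_norm, not taken from its source.
ref_bn_running_update <- function(running, batch_stat, momentum = 0.1) {
  (1 - momentum) * running + momentum * batch_stat
}
# Starting from running_mean = c(0, 0) and a batch with row means c(10, 20),
# one update yields c(1, 2): strictly > 0, which is all the test asserts.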

# ============================================================================
# ag_embedding
# ============================================================================

test_that("ag_embedding: output shape correct", {
  emb <- ag_embedding(10L, 4L)
  idx <- matrix(c(0L, 3L, 7L, 2L), 2L, 2L)
  out <- emb$forward(idx)
  expect_equal(dim(out$data), c(4L, 4L))  # [dim, seq_len * batch]
})

test_that("ag_embedding: same index -> same output", {
  emb <- ag_embedding(10L, 4L)
  idx <- matrix(c(0L, 0L), 1L, 2L)
  out <- emb$forward(idx)
  expect_equal(out$data[, 1L], out$data[, 2L])
})

test_that("ag_embedding: gradcheck passes", {
  set.seed(30)
  emb <- ag_embedding(5L, 3L)
  idx <- matrix(c(0L, 2L, 4L), 1L, 3L)
  # gradcheck: temporarily install the checked tensor as emb$weight so that
  # forward() sees the perturbed values
  ok <- ag_gradcheck(
    fn = function(ins) {
      orig <- emb$weight
      emb$weight <- ins$weight
      out <- emb$forward(idx)
      emb$weight <- orig
      ag_sum(out)
    },
    inputs = list(weight = emb$weight),
    atol = 1e-4,
    quiet = TRUE
  )
  expect_true(ok)
})

test_that("ag_embedding: optimizer updates weight", {
  set.seed(31)
  emb <- ag_embedding(5L, 3L)
  opt <- optimizer_adam(emb$parameters(), lr = 0.1)
  idx <- matrix(c(0L, 1L, 2L), 1L, 3L)
  W0 <- emb$weight$data
  with_grad_tape({
    out <- emb$forward(idx)
    loss <- ag_sum(ag_pow(out, 2))  # push embeddings toward 0
  })
  grads <- backward(loss)
  opt$step(grads)
  expect_false(identical(emb$weight$data, W0))
})

# ============================================================================
# Integration: sequential with dropout + batch_norm
# ============================================================================

test_that("integration: sequential model with BN and dropout trains", {
  set.seed(40)
  model <- ag_sequential(
    ag_linear(4L, 16L, activation = "relu"),
    ag_batch_norm(16L),
    ag_dropout(0.2),
    ag_linear(16L, 2L)
  )
  params <- model$parameters()
  opt <- optimizer_adam(params, lr = 0.01)
  n <- 64L
  x_mat <- matrix(runif(4 * n), 4, n)
  y_mat <- rbind(rep(1, n), rep(0, n))  # all class 0
  losses <- numeric(15L)
  for (i in seq_len(15L)) {
    with_grad_tape({
      out <- model$forward(ag_tensor(x_mat))
      loss <- ag_mse_loss(out, y_mat)
    })
    grads <- backward(loss)
    opt$step(grads)
    opt$zero_grad()
    losses[i] <- loss$data[1L]
  }
  expect_lt(mean(losses[13:15]), mean(losses[1:3]))
})
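
# ----------------------------------------------------------------------------
# Usage sketch, not a spec: this combines behaviors already asserted
# individually above (dropout eval identity, BN eval determinism) into one
# end-to-end check that an eval-mode model gives deterministic predictions.
test_that("integration: eval mode gives deterministic predictions", {
  set.seed(41)
  model <- ag_sequential(
    ag_linear(4L, 16L, activation = "relu"),
    ag_batch_norm(16L),
    ag_dropout(0.2),
    ag_linear(16L, 2L)
  )
  # One training-mode batch so batch norm has non-trivial running stats
  invisible(model$forward(ag_tensor(matrix(runif(4 * 8), 4, 8))))
  # In eval mode, dropout is the identity and BN uses running stats, so
  # repeated forward passes on the same input must agree exactly
  ag_eval(model)
  x_mat <- matrix(runif(4 * 8), 4, 8)
  out1 <- model$forward(ag_tensor(x_mat))
  out2 <- model$forward(ag_tensor(x_mat))
  expect_equal(out1$data, out2$data)
})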