# Tests for ag_dataloader, lr schedulers, clip_grad_norm, and dp_train

src <- function() {
  if (!exists("ag_tensor", mode = "function")) {
    source("R/autograd.R")
    source("R/ag_layers.R")
    source("R/ag_training.R")
  }
}
src()

# ============================================================================
# ag_dataloader
# ============================================================================

test_that("ag_dataloader: basic shape", {
  n <- 64L
  x <- matrix(runif(4 * n), 4, n)
  y <- matrix(runif(2 * n), 2, n)
  dl <- ag_dataloader(x, y, batch_size = 16L, shuffle = FALSE)
  expect_equal(dl$n_batches(), 4L)
  batch <- dl$next_batch()
  expect_s3_class(batch$x, "ag_tensor")
  expect_equal(dim(batch$x$data), c(4L, 16L))
  expect_equal(dim(batch$y$data), c(2L, 16L))
})

test_that("ag_dataloader: covers all samples (no shuffle)", {
  n <- 32L
  x <- matrix(seq_len(n), 1, n)  # single feature, values 1..32
  dl <- ag_dataloader(x, batch_size = 8L, shuffle = FALSE)
  seen <- c()
  while (dl$has_next()) {
    b <- dl$next_batch()
    seen <- c(seen, as.integer(b$x$data))
  }
  expect_equal(sort(seen), seq_len(n))
})

test_that("ag_dataloader: shuffle produces different order", {
  set.seed(42)
  n <- 100L
  x <- matrix(seq_len(n), 1, n)
  dl1 <- ag_dataloader(x, batch_size = 10L, shuffle = TRUE)
  dl2 <- ag_dataloader(x, batch_size = 10L, shuffle = FALSE)
  get_all <- function(dl) {
    out <- c()
    while (dl$has_next()) out <- c(out, as.integer(dl$next_batch()$x$data))
    out
  }
  ord1 <- get_all(dl1)
  ord2 <- get_all(dl2)
  # With seed 42 the shuffle should produce a different order
  expect_false(identical(ord1, ord2))
  # But cover the same elements
  expect_equal(sort(ord1), sort(ord2))
})

test_that("ag_dataloader: reset reshuffles", {
  set.seed(1)
  n <- 20L
  x <- matrix(seq_len(n), 1, n)
  dl <- ag_dataloader(x, batch_size = 5L, shuffle = TRUE)
  ord1 <- dl$order
  dl$reset()
  ord2 <- dl$order
  # Reshuffle should (almost certainly) give a different order.
  # We can't guarantee it, but with n = 20 the collision probability is tiny.
  expect_false(identical(ord1, ord2))
})

test_that("ag_dataloader: has_next FALSE after exhaustion", {
  dl <- ag_dataloader(matrix(1:10, 1, 10), batch_size = 5L)
  dl$next_batch()
  dl$next_batch()
  expect_false(dl$has_next())
})

test_that("ag_dataloader: col_major=FALSE transposes", {
  # row-major input: [n, features]
  n <- 16L
  x <- matrix(runif(n * 4), n, 4)  # [16, 4]
  dl <- ag_dataloader(x, batch_size = 8L, col_major = FALSE)
  b <- dl$next_batch()
  # after transpose: [features, batch] = [4, 8]
  expect_equal(dim(b$x$data), c(4L, 8L))
})

test_that("ag_dataloader: epoch() returns all batches", {
  n <- 32L
  x <- matrix(seq_len(n), 1, n)
  dl <- ag_dataloader(x, batch_size = 8L, shuffle = FALSE)
  batches <- dl$epoch()
  expect_length(batches, 4L)
})

# ============================================================================
# lr_scheduler_step
# ============================================================================

test_that("lr_scheduler_step: decays at correct steps", {
  w <- ag_param(matrix(1, 2, 2))
  opt <- optimizer_sgd(list(w = w), lr = 1.0)
  sch <- lr_scheduler_step(opt, step_size = 5L, gamma = 0.5)
  for (i in 1:5) sch$step()
  expect_equal(opt$lr, 0.5, tolerance = 1e-9)
  for (i in 1:5) sch$step()
  expect_equal(opt$lr, 0.25, tolerance = 1e-9)
  for (i in 1:4) sch$step()  # only 4 steps, no decay yet
  expect_equal(opt$lr, 0.25, tolerance = 1e-9)
  sch$step()  # 5th step -> decay
  expect_equal(opt$lr, 0.125, tolerance = 1e-9)
})

test_that("lr_scheduler_step: no decay before step_size", {
  w <- ag_param(matrix(1, 2, 2))
  opt <- optimizer_sgd(list(w = w), lr = 0.1)
  sch <- lr_scheduler_step(opt, step_size = 10L, gamma = 0.1)
  for (i in 1:9) sch$step()
  expect_equal(opt$lr, 0.1, tolerance = 1e-9)
})
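
# A property-style check of the decay rule the point expectations above imply:
# after t calls to sch$step(), lr should equal lr0 * gamma^floor(t / step_size).
# This closed form is an assumption read off the tests above, not a documented
# contract of lr_scheduler_step, so treat this as a sketch.
test_that("lr_scheduler_step: matches implied closed-form decay (sketch)", {
  w <- ag_param(matrix(1, 2, 2))
  lr0 <- 0.2; step_size <- 3L; gamma <- 0.5
  opt <- optimizer_sgd(list(w = w), lr = lr0)
  sch <- lr_scheduler_step(opt, step_size = step_size, gamma = gamma)
  for (t in 1:12) {
    sch$step()
    expect_equal(opt$lr, lr0 * gamma^(t %/% step_size), tolerance = 1e-9)
  }
})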
# ============================================================================
# lr_scheduler_cosine
# ============================================================================

test_that("lr_scheduler_cosine: lr decreases toward lr_min", {
  w <- ag_param(matrix(1, 2, 2))
  opt <- optimizer_adam(list(w = w), lr = 0.1)
  sch <- lr_scheduler_cosine(opt, T_max = 10L, lr_min = 0)
  lrs <- numeric(10L)
  for (i in 1:10) lrs[i] <- sch$step()
  # First step: slightly below max; last step: near 0
  expect_lt(lrs[10], lrs[1])
  expect_lt(lrs[10], 0.01)
})

test_that("lr_scheduler_cosine: stays at lr_min after T_max (no restart)", {
  w <- ag_param(matrix(1, 2, 2))
  opt <- optimizer_adam(list(w = w), lr = 0.1)
  sch <- lr_scheduler_cosine(opt, T_max = 5L, lr_min = 0.01, restart = FALSE)
  for (i in 1:5) sch$step()
  lr_at_5 <- opt$lr
  sch$step()  # beyond T_max
  expect_equal(opt$lr, lr_at_5, tolerance = 1e-9)
})

test_that("lr_scheduler_cosine: restarts correctly", {
  w <- ag_param(matrix(1, 2, 2))
  opt <- optimizer_adam(list(w = w), lr = 0.1)
  sch <- lr_scheduler_cosine(opt, T_max = 5L, lr_min = 0, restart = TRUE)
  lrs <- numeric(10L)
  for (i in 1:10) lrs[i] <- sch$step()
  # Cycle 1: steps 1-5 should decrease
  expect_lt(lrs[5], lrs[1])
  # Step 5 ends the cycle near 0; step 6 should restart near the max
  expect_gt(lrs[6], lrs[5])
})

# ============================================================================
# clip_grad_norm
# ============================================================================

test_that("clip_grad_norm: does not clip when norm < max_norm", {
  w <- ag_param(matrix(c(0.1, 0.1, 0.1, 0.1), 2, 2))
  x <- ag_tensor(matrix(c(1, 1), 2, 1))
  with_grad_tape({
    out <- ag_matmul(w, x)
    loss <- ag_mse_loss(out, matrix(0, 2, 1))
  })
  grads <- backward(loss)
  orig_g <- get0(as.character(w$id), envir = grads)
  pre_norm <- clip_grad_norm(list(w = w), grads, max_norm = 1000)
  after_g <- get0(as.character(w$id), envir = grads)
  # Should be unchanged
  expect_equal(orig_g, after_g, tolerance = 1e-10)
})

test_that("clip_grad_norm: clips to max_norm", {
  w <- ag_param(matrix(c(1, 2, 3, 4), 2, 2))
  x <- ag_tensor(matrix(c(10, 10), 2, 1))
  with_grad_tape({
    out <- ag_matmul(w, x)
    loss <- ag_mse_loss(out, matrix(0, 2, 1))
  })
  grads <- backward(loss)
  pre_norm <- clip_grad_norm(list(w = w), grads, max_norm = 1.0)
  g_after <- get0(as.character(w$id), envir = grads)
  clipped_norm <- sqrt(sum(g_after^2))
  expect_gt(pre_norm, 1.0)
  expect_equal(clipped_norm, 1.0, tolerance = 1e-5)
})

test_that("clip_grad_norm: works with multiple params", {
  w1 <- ag_param(matrix(runif(4), 2, 2))
  w2 <- ag_param(matrix(runif(4), 2, 2))
  x <- ag_tensor(matrix(c(1, 1), 2, 1))
  with_grad_tape({
    h <- ag_matmul(w1, x)
    out <- ag_matmul(w2, h)
    loss <- ag_mse_loss(out, matrix(0, 2, 1))
  })
  grads <- backward(loss)
  pre_norm <- clip_grad_norm(list(w1 = w1, w2 = w2), grads, max_norm = 0.5)
  g1 <- get0(as.character(w1$id), envir = grads)
  g2 <- get0(as.character(w2$id), envir = grads)
  joint_norm <- sqrt(sum(g1^2) + sum(g2^2))
  expect_equal(joint_norm, 0.5, tolerance = 1e-5)
})
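
# For reference, the rule the clip_grad_norm expectations above imply: compute
# the joint L2 norm over all parameter gradients, then scale every gradient by
# min(1, max_norm / total_norm). A minimal base-R sketch of that rule;
# clip_reference is a hypothetical helper, not part of the package API, and
# says nothing about clip_grad_norm's actual internals.
clip_reference <- function(grad_list, max_norm) {
  total_norm <- sqrt(sum(vapply(grad_list, function(g) sum(g^2), numeric(1))))
  scale <- min(1, max_norm / total_norm)
  lapply(grad_list, function(g) g * scale)
}

test_that("clip_reference sketch: joint norm never exceeds max_norm", {
  gs <- clip_reference(list(matrix(3, 2, 2), matrix(-4, 2, 2)), max_norm = 0.5)
  expect_equal(sqrt(sum(vapply(gs, function(g) sum(g^2), numeric(1)))), 0.5,
               tolerance = 1e-12)
})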
# ============================================================================
# Integration: full training loop with all utils
# ============================================================================

test_that("integration: dataloader + scheduler + clip_grad trains", {
  set.seed(99)
  n <- 64L
  x_mat <- matrix(runif(4 * n), 4, n)
  y_mat <- rbind(rep(1, n), rep(0, n))
  dl <- ag_dataloader(x_mat, y_mat, batch_size = 16L, shuffle = TRUE)
  model <- ag_sequential(
    ag_linear(4L, 8L, activation = "relu"),
    ag_linear(8L, 2L)
  )
  params <- model$parameters()
  opt <- optimizer_adam(params, lr = 0.05)
  sch <- lr_scheduler_step(opt, step_size = 5L, gamma = 0.5)
  all_losses <- c()
  for (epoch in 1:15) {
    epoch_loss <- 0
    batches <- dl$epoch()
    for (batch in batches) {
      with_grad_tape({
        out <- model$forward(batch$x)
        loss <- ag_mse_loss(out, batch$y)
      })
      grads <- backward(loss)
      clip_grad_norm(params, grads, max_norm = 5.0)
      opt$step(grads)
      opt$zero_grad()
      epoch_loss <- epoch_loss + loss$data[1L]
    }
    all_losses <- c(all_losses, epoch_loss / length(batches))
    sch$step()
  }
  expect_lt(mean(tail(all_losses, 3)), mean(head(all_losses, 3)))
  # lr should have decayed after epochs 5 and 10
  expect_lt(opt$lr, 0.05)
})

# ============================================================================
# dp_train
# ============================================================================

test_that("dp_train: 1 replica converges on simple regression", {
  set.seed(7L)
  D_IN <- 3L; D_OUT <- 2L; N <- 32L
  X <- matrix(rnorm(D_IN * N), D_IN, N)
  Y <- X[1:D_OUT, ] * 0.5 + 0.1  # deterministic target to ensure convergence
  data <- lapply(seq_len(N), function(i) {
    list(x = X[, i, drop = FALSE], y = Y[, i, drop = FALSE])
  })
  make_model <- function() {
    W <- ag_param(matrix(rnorm(D_OUT * D_IN) * 0.1, D_OUT, D_IN))
    b <- ag_param(matrix(0.0, D_OUT, 1L))
    list(forward = function(x) ag_add(ag_matmul(W, x), b),
         parameters = function() list(W = W, b = b))
  }
  result <- dp_train(
    make_model = make_model,
    data = data,
    loss_fn = function(out, tgt) ag_mse_loss(out, tgt),
    forward_fn = function(model, s) model$forward(s$x),
    target_fn = function(s) s$y,
    n_gpu = 1L,
    n_iter = 60L,
    lr = 1e-2,
    verbose = FALSE
  )
  expect_named(result, c("params", "loss_history", "model"))
  expect_length(result$loss_history, 60L)
  expect_lt(mean(tail(result$loss_history, 10)), result$loss_history[1])
})

test_that("dp_train: 2 replicas - initial weight sync and no NaN", {
  set.seed(8L)
  D_IN <- 3L; D_OUT <- 2L
  make_model <- function() {
    W <- ag_param(matrix(rnorm(D_OUT * D_IN) * 0.1, D_OUT, D_IN))
    list(forward = function(x) ag_matmul(W, x),
         parameters = function() list(W = W))
  }
  # data: each sample is a list(x, y) with compatible dims
  data <- lapply(1:8, function(i) {
    list(x = matrix(rnorm(D_IN), D_IN, 1L),
         y = matrix(rnorm(D_OUT), D_OUT, 1L))
  })
  result <- dp_train(
    make_model = make_model,
    data = data,
    loss_fn = function(out, tgt) ag_mse_loss(out, tgt),
    forward_fn = function(model, s) model$forward(s$x),
    target_fn = function(s) s$y,
    n_gpu = 2L,
    n_iter = 8L,
    lr = 1e-2,
    verbose = FALSE
  )
  expect_named(result, c("params", "loss_history", "model"))
  expect_length(result$loss_history, 8L)
  expect_false(any(is.nan(result$loss_history)))
})

test_that("dp_train: returns loss_history of correct length", {
  set.seed(9L)
  make_model <- function() {
    W <- ag_param(matrix(rnorm(4) * 0.1, 2, 2))
    list(forward = function(x) ag_matmul(W, x),
         parameters = function() list(W = W))
  }
  data <- lapply(1:8, function(i) matrix(rnorm(2), 2, 1))
  result <- dp_train(
    make_model = make_model,
    data = data,
    loss_fn = function(out, tgt) ag_mse_loss(out, tgt),
    n_gpu = 1L,
    n_iter = 20L,
    lr = 1e-3,
    verbose = FALSE
  )
  expect_length(result$loss_history, 20L)
  expect_true(all(is.finite(result$loss_history)))
})
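
# For reference, the gradient exchange that data-parallel training such as
# dp_train typically performs between replicas: each replica computes gradients
# on its shard, the gradients are averaged element-wise, and every replica
# applies the same averaged update, which is what keeps multi-replica weights
# in sync after the initial copy. A minimal base-R sketch; average_grads is a
# hypothetical helper, not dp_train's internal function.
average_grads <- function(replica_grads) {
  # replica_grads: list of named lists of gradient matrices, one per replica
  param_names <- names(replica_grads[[1]])
  setNames(lapply(param_names, function(nm) {
    Reduce(`+`, lapply(replica_grads, `[[`, nm)) / length(replica_grads)
  }), param_names)
}

test_that("average_grads sketch: element-wise mean across replicas", {
  g <- average_grads(list(list(W = matrix(1, 2, 2)),
                          list(W = matrix(3, 2, 2))))
  expect_equal(g$W, matrix(2, 2, 2))
})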