# Tests for R-level dynamic graph / autograd

test_that("ag_tensor creates tensor with correct shape", {
  x <- ag_tensor(matrix(1:6, 2, 3))
  expect_s3_class(x, "ag_tensor")
  expect_equal(dim(x$data), c(2L, 3L))
  expect_false(x$requires_grad)
  expect_null(x$grad)
})

test_that("ag_param sets requires_grad = TRUE", {
  w <- ag_param(matrix(1:4, 2, 2))
  expect_true(w$requires_grad)
})

test_that("ag_matmul forward pass is correct", {
  A <- ag_param(matrix(c(1, 0, 0, 1), 2, 2))  # identity
  B <- ag_tensor(matrix(c(1, 2, 3, 4), 2, 2))
  out <- ag_matmul(A, B)
  expect_equal(out$data, B$data)
})

test_that("backward computes correct gradient for matmul", {
  # W filled column-major: matrix(c(1,2,3,4), 2, 2) = [[1,3],[2,4]]
  # W %*% [1,1]^T = [1+3, 2+4] = [4, 6]
  W <- ag_param(matrix(c(1, 2, 3, 4), 2, 2))
  x <- ag_tensor(matrix(c(1, 1), 2, 1))
  y_true <- matrix(c(4, 6), 2, 1)  # exact match -> zero loss

  with_grad_tape({
    out <- ag_matmul(W, x)
    loss <- ag_mse_loss(out, y_true)
  })
  grads <- backward(loss)

  # loss should be 0
  expect_lt(abs(loss$data[1]), 1e-10)
  # grad should be near 0
  g <- get0(as.character(W$id), envir = grads)
  expect_lt(max(abs(g)), 1e-10)
})

test_that("backward: nonzero gradient for mse_loss", {
  W <- ag_param(matrix(c(1, 0, 0, 1), 2, 2))
  x <- ag_tensor(matrix(c(2, 3), 2, 1))
  y_true <- matrix(c(0, 0), 2, 1)

  with_grad_tape({
    out <- ag_matmul(W, x)
    loss <- ag_mse_loss(out, y_true)
  })
  grads <- backward(loss)

  g <- get0(as.character(W$id), envir = grads)
  expect_false(is.null(g))
  expect_equal(dim(g), dim(W$data))

  # Analytical: d/dW MSE = 2/n * (Wx - y) %*% t(x)
  # n = 2, Wx = [2,3], y = 0 -> residual = [2,3]
  # grad = 2/2 * [2,3] %*% t([2,3]) = [[4,6],[6,9]]
  expected_g <- matrix(c(4, 6, 6, 9), 2, 2)
  expect_equal(g, expected_g, tolerance = 1e-6)
})

test_that("relu backward: gradient is zero below 0", {
  # column-major fill: matrix(c(-1,2,-3,4), 2, 2) = [[-1,-3],[2,4]]
  # [1,1]=-1 (neg), [2,1]=2 (pos), [1,2]=-3 (neg), [2,2]=4 (pos)
  x <- ag_param(matrix(c(-1, 2, -3, 4), 2, 2))

  with_grad_tape({
    out <- ag_relu(x)
    loss <- ag_mse_loss(out, matrix(0, 2, 2))
  })
  grads <- backward(loss)

  g <- get0(as.character(x$id), envir = grads)
  # grad 0 where x < 0: [1,1] and [1,2]
  expect_equal(g[1, 1], 0)
  expect_equal(g[1, 2], 0)
  # grad nonzero where x > 0: [2,1] and [2,2]
  expect_gt(abs(g[2, 1]), 0)
  expect_gt(abs(g[2, 2]), 0)
})

test_that("sigmoid backward is correct", {
  x <- ag_param(matrix(0, 1, 1))

  with_grad_tape({
    out <- ag_sigmoid(x)
    loss <- ag_mse_loss(out, matrix(0, 1, 1))
  })
  grads <- backward(loss)

  g <- get0(as.character(x$id), envir = grads)
  # sigmoid(0) = 0.5, d_sigmoid = 0.5 * 0.5 = 0.25
  # d_mse/d_out = 2/1 * (0.5 - 0) = 1
  # grad = 1 * 0.25 = 0.25
  expect_equal(as.numeric(g), 0.25, tolerance = 1e-6)
})

test_that("cross_entropy_loss backward: grad shape matches pred", {
  # 3 classes, 4 samples
  set.seed(1)
  logits <- ag_param(matrix(runif(3 * 4), 3, 4))

  with_grad_tape({
    pred <- ag_softmax(logits)
    # one-hot targets
    y <- matrix(c(1, 0, 0,  0, 1, 0,  0, 0, 1,  1, 0, 0), 3, 4)
    loss <- ag_cross_entropy_loss(pred, y)
  })
  grads <- backward(loss)

  g_logits <- get0(as.character(logits$id), envir = grads)
  # CE grad w.r.t. pred flows back through the softmax VJP,
  # yielding the familiar (p - y)/n at the logits
  expect_equal(dim(g_logits), c(3L, 4L))
})

test_that("optimizer_sgd updates parameters", {
  W <- ag_param(matrix(c(1, 0, 0, 1), 2, 2))
  x <- ag_tensor(matrix(c(2, 3), 2, 1))
  y_true <- matrix(c(0, 0), 2, 1)
  W_orig <- W$data

  opt <- optimizer_sgd(list(W = W), lr = 0.1)

  with_grad_tape({
    out <- ag_matmul(W, x)
    loss <- ag_mse_loss(out, y_true)
  })
  grads <- backward(loss)
  opt$step(grads)

  expect_false(identical(opt$params$W$data, W_orig))
})

test_that("optimizer_adam updates parameters", {
  W <- ag_param(matrix(c(1, 0, 0, 1), 2, 2))
  x <- ag_tensor(matrix(c(1, 1), 2, 1))
  y_true <- matrix(c(0, 0), 2, 1)

  opt <- optimizer_adam(list(W = W), lr = 0.01)
  W_orig <- W$data

  with_grad_tape({
    out <- ag_matmul(W, x)
    loss <- ag_mse_loss(out, y_true)
  })
  grads <- backward(loss)
  opt$step(grads)

  expect_false(identical(opt$params$W$data, W_orig))
  expect_equal(opt$t, 1L)
})

test_that("ag_linear forward has correct output shape", {
  layer <- ag_linear(4L, 8L, activation = "relu")
  x <- ag_tensor(matrix(runif(4 * 16), 4, 16))
  out <- layer$forward(x)
  expect_equal(dim(out$data), c(8L, 16L))
  expect_true(all(out$data >= 0))  # ReLU
})

test_that("full training loop reduces loss", {
  set.seed(123)
  # XOR-like: 2 inputs -> 1 output
  n <- 64L
  x_mat <- matrix(sample(c(0, 1), 2 * n, replace = TRUE), 2, n)
  y_mat <- matrix(as.numeric(xor(x_mat[1, ], x_mat[2, ])), 1, n)

  l1 <- ag_linear(2L, 4L, activation = "relu")
  l2 <- ag_linear(4L, 1L, activation = "sigmoid")
  params <- c(l1$params(), l2$params())
  opt <- optimizer_adam(params, lr = 0.05)

  losses <- numeric(50L)
  for (i in seq_len(50L)) {
    x <- ag_tensor(x_mat)
    y <- ag_tensor(y_mat)

    with_grad_tape({
      h <- l1$forward(x)
      out <- l2$forward(h)
      loss <- ag_mse_loss(out, y)
    })
    grads <- backward(loss)
    opt$step(grads)
    opt$zero_grad()

    losses[i] <- loss$data[1]
  }

  # Loss should decrease over training (average last 10 vs first 10)
  expect_lt(mean(losses[41:50]), mean(losses[1:10]))
})
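
# A numerical sanity check for the analytical gradients above: a sketch that
# assumes the same ag_* API used throughout this file (with_grad_tape
# evaluating its block in the caller's frame, backward() returning an
# environment keyed by tensor id, loss$data[1] holding the scalar loss).
test_that("matmul gradient matches central finite differences", {
  set.seed(42)
  W0 <- matrix(rnorm(4), 2, 2)
  x_mat <- matrix(rnorm(2), 2, 1)
  y_true <- matrix(rnorm(2), 2, 1)

  # scalar loss as a plain function of the weight matrix
  loss_at <- function(W_mat) {
    with_grad_tape({
      out <- ag_matmul(ag_param(W_mat), ag_tensor(x_mat))
      loss <- ag_mse_loss(out, y_true)
    })
    loss$data[1]
  }

  # analytical gradient from the tape
  W <- ag_param(W0)
  with_grad_tape({
    out <- ag_matmul(W, ag_tensor(x_mat))
    loss <- ag_mse_loss(out, y_true)
  })
  grads <- backward(loss)
  g <- get0(as.character(W$id), envir = grads)

  # central differences, one weight entry at a time
  eps <- 1e-5
  g_num <- matrix(0, 2, 2)
  for (i in 1:2) {
    for (j in 1:2) {
      Wp <- W0; Wp[i, j] <- Wp[i, j] + eps
      Wm <- W0; Wm[i, j] <- Wm[i, j] - eps
      g_num[i, j] <- (loss_at(Wp) - loss_at(Wm)) / (2 * eps)
    }
  }

  expect_equal(g, g_num, tolerance = 1e-4)
})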