# Chain test: Autograd manual early stopping with weight restore # Pattern from titanic_classification.R variant 4: # ag_sequential + optimizer_adam + manual val_loss monitoring + # best-weights snapshot + restore on plateau # # Uses synthetic data (no external files). # Helper: predict with ag model (softmax, chunked) ag_predict <- function(model, x_col) { n <- ncol(x_col) out <- matrix(0.0, 2L, n) for (s in seq(1L, n, by = 32L)) { e <- min(s + 31L, n) xb <- ag_tensor(x_col[, s:e, drop = FALSE]) lg <- model$forward(xb)$data ev <- exp(lg - matrix(apply(lg, 2, max), nrow = nrow(lg), ncol = ncol(lg), byrow = TRUE)) cs <- matrix(colSums(ev), nrow = nrow(ev), ncol = ncol(ev), byrow = TRUE) out[, s:e] <- ev / cs } out } # Compute BCE loss on predictions bce_loss <- function(probs, targets) { eps <- 1e-7 p <- pmin(pmax(probs, eps), 1 - eps) -mean(targets[1,] * log(p[1,]) + targets[2,] * log(p[2,])) } # ── Manual early stopping: stops and restores weights ────── test_that("chain ag-early-stopping: stops training and restores best weights", { set.seed(42) n <- 120L # Linearly separable x_all <- rbind(matrix(rnorm(n, -1.5, 0.5), n/2, 2), matrix(rnorm(n, 1.5, 0.5), n/2, 2)) y_all <- rbind(matrix(c(1,0), n/2, 2, byrow = TRUE), matrix(c(0,1), n/2, 2, byrow = TRUE)) # Split 80/20 idx_val <- c(1:12, 61:72) idx_tr <- setdiff(1:n, idx_val) x_tr <- t(x_all[idx_tr, ]) # [2, n_tr] y_tr <- t(y_all[idx_tr, ]) # [2, n_tr] x_vl <- t(x_all[idx_val, ]) y_vl <- t(y_all[idx_val, ]) m <- ag_sequential( ag_linear(2L, 16L, activation = "relu"), ag_linear(16L, 2L) ) params <- m$parameters() opt <- optimizer_adam(params, lr = 1e-2) # Early stopping state patience <- 10L best_val_loss <- Inf best_weights <- NULL wait <- 0L stopped_epoch <- NA_integer_ n_tr <- ncol(x_tr) BS <- 16L ag_train(m) for (ep in seq_len(200L)) { perm <- sample(n_tr) for (b in seq_len(ceiling(n_tr / BS))) { idx <- perm[((b-1L)*BS+1L):min(b*BS, n_tr)] xb <- ag_tensor(x_tr[, idx, drop = FALSE]) yb <- y_tr[, idx, drop = FALSE] with_grad_tape({ loss <- ag_softmax_cross_entropy_loss(m$forward(xb), yb) }) grads <- backward(loss) opt$step(grads) opt$zero_grad() } # Val loss ag_eval(m) vl <- bce_loss(ag_predict(m, x_vl), y_vl) ag_train(m) if (vl < best_val_loss - 1e-4) { best_val_loss <- vl best_weights <- lapply(params, function(p) p$data) wait <- 0L } else { wait <- wait + 1L if (wait >= patience) { stopped_epoch <- ep break } } } # Should have stopped early (not run all 200) expect_true(!is.na(stopped_epoch)) expect_true(stopped_epoch < 200L) # Best weights should be saved expect_false(is.null(best_weights)) # Restore best weights for (nm in names(params)) params[[nm]]$data <- best_weights[[nm]] # Predictions after restore should be valid ag_eval(m) probs <- ag_predict(m, x_vl) expect_equal(nrow(probs), 2L) expect_equal(ncol(probs), ncol(x_vl)) expect_true(all(is.finite(probs))) expect_true(all(probs >= 0 & probs <= 1)) # Restored model should have decent accuracy on separable data pred_class <- apply(probs, 2, which.max) true_class <- apply(y_vl, 2, which.max) acc <- mean(pred_class == true_class) expect_true(acc > 0.7) }) # ── Scheduler + clip_grad + dataloader full pipeline ─────── test_that("chain ag-pipeline: scheduler + clip_grad + dataloader trains", { set.seed(42) n <- 80L x_all <- rbind(matrix(rnorm(n, -2, 0.5), n/2, 2), matrix(rnorm(n, 2, 0.5), n/2, 2)) y_all <- rbind(matrix(c(1,0), n/2, 2, byrow = TRUE), matrix(c(0,1), n/2, 2, byrow = TRUE)) x_cm <- t(x_all) # [2, n] y_cm <- t(y_all) # [2, n] m <- ag_sequential( ag_linear(2L, 16L, activation = "relu"), ag_batch_norm(16L), ag_dropout(0.2), ag_linear(16L, 2L) ) params <- m$parameters() opt <- optimizer_adam(params, lr = 1e-2) sch <- lr_scheduler_cosine(opt, T_max = 30L, lr_min = 1e-4) dl <- ag_dataloader(x_cm, y_cm, batch_size = 16L, shuffle = TRUE) losses <- numeric(30L) ag_train(m) for (ep in seq_len(30L)) { ep_loss <- 0 nb <- 0L for (batch in dl$epoch()) { with_grad_tape({ loss <- ag_softmax_cross_entropy_loss(m$forward(batch$x), batch$y$data) }) grads <- backward(loss) clip_grad_norm(params, grads, max_norm = 5.0) opt$step(grads) opt$zero_grad() ep_loss <- ep_loss + loss$data[1] nb <- nb + 1L } losses[ep] <- ep_loss / nb sch$step() } # Loss should decrease expect_true(mean(losses[26:30]) < mean(losses[1:5])) # LR should have decayed expect_true(opt$lr < 1e-2) # Predictions should be valid ag_eval(m) probs <- ag_predict(m, x_cm) expect_true(all(is.finite(probs))) }) # ── Raw ag_param + dp_train ──────────────────────────────── test_that("chain ag-raw-params: dp_train converges on separable data", { set.seed(42) n <- 60L x_all <- rbind(matrix(rnorm(n, -2, 0.3), n/2, 2), matrix(rnorm(n, 2, 0.3), n/2, 2)) y_all <- rbind(matrix(c(1,0), n/2, 2, byrow = TRUE), matrix(c(0,1), n/2, 2, byrow = TRUE)) dp_data <- lapply(seq_len(n), function(i) list(x = matrix(x_all[i, ], 2, 1), y = matrix(y_all[i, ], 2, 1))) make_model <- function() { W1 <- ag_param(matrix(rnorm(16 * 2) * 0.5, 16, 2)) b1 <- ag_param(matrix(0, 16, 1)) W2 <- ag_param(matrix(rnorm(2 * 16) * 0.5, 2, 16)) b2 <- ag_param(matrix(0, 2, 1)) list( forward = function(x) { h <- ag_relu(ag_add(ag_matmul(W1, x), b1)) ag_add(ag_matmul(W2, h), b2) }, parameters = function() list(W1=W1, b1=b1, W2=W2, b2=b2) ) } res <- dp_train( make_model = make_model, data = dp_data, loss_fn = function(out, tgt) ag_softmax_cross_entropy_loss(out, tgt), forward_fn = function(model, s) model$forward(ag_tensor(s$x)), target_fn = function(s) s$y, n_gpu = 1L, n_iter = 500L, lr = 1e-3, verbose = FALSE ) # Loss should have decreased lh <- res$loss_history expect_true(mean(tail(lh, 50)) < mean(head(lh, 50))) })