# Chain tests: Popular ML patterns # 1. Autograd regression (MSE loss) # 2. Transfer learning: freeze backbone, train head only # 3. Sequential save/load with BatchNorm (running stats preserved) # 4. Functional multi-input classification (two feature groups → merge) # ── 1. Autograd regression: MSE loss on synthetic data ───── test_that("chain pattern: autograd regression with MSE loss", { # y = 2*x1 + 3*x2 + 1 + noise set.seed(42) n <- 80L x1 <- rnorm(n) x2 <- rnorm(n) y_true <- 2 * x1 + 3 * x2 + 1 + rnorm(n, sd = 0.1) x_cm <- rbind(x1, x2) # [2, n] y_cm <- matrix(y_true, nrow = 1) # [1, n] m <- ag_sequential( ag_linear(2L, 16L, activation = "relu"), ag_linear(16L, 1L) ) opt <- optimizer_adam(m$parameters(), lr = 1e-2) BS <- 16L losses <- numeric(50) ag_train(m) for (ep in seq_len(50L)) { perm <- sample(n) ep_loss <- 0; nb <- 0L for (b in seq_len(ceiling(n / BS))) { idx <- perm[((b-1L)*BS+1L):min(b*BS, n)] xb <- ag_tensor(x_cm[, idx, drop = FALSE]) yb <- y_cm[, idx, drop = FALSE] with_grad_tape({ pred <- m$forward(xb) loss <- ag_mse_loss(pred, yb) }) grads <- backward(loss) opt$step(grads) opt$zero_grad() ep_loss <- ep_loss + loss$data[1] nb <- nb + 1L } losses[ep] <- ep_loss / nb } # Loss should decrease significantly expect_true(mean(tail(losses, 5)) < mean(head(losses, 5))) # Predictions should be reasonable ag_eval(m) pred <- m$forward(ag_tensor(x_cm))$data rmse <- sqrt(mean((pred[1,] - y_true)^2)) expect_true(rmse < 1.0) # should be much better than naive }) # ── 2. Transfer learning: freeze backbone, train head ────── test_that("chain pattern: transfer learning — freeze backbone, train head", { set.seed(42) n <- 80L x_all <- rbind(matrix(rnorm(n, -2, 0.5), n/2, 2), matrix(rnorm(n, 2, 0.5), n/2, 2)) y_all <- rbind(matrix(c(1,0), n/2, 2, byrow = TRUE), matrix(c(0,1), n/2, 2, byrow = TRUE)) x_cm <- t(x_all) y_cm <- t(y_all) # "Pretrained" backbone backbone <- ag_sequential( ag_linear(2L, 16L, activation = "relu"), ag_linear(16L, 8L, activation = "relu") ) # New head head_layer <- ag_linear(8L, 2L) # Snapshot backbone weights before training bb_params <- backbone$parameters() bb_before <- lapply(bb_params, function(p) p$data) # Only optimize head parameters head_params <- head_layer$params() opt <- optimizer_adam(head_params, lr = 1e-2) BS <- 16L ag_train(backbone) ag_train(head_layer) for (ep in seq_len(30L)) { perm <- sample(n) for (b in seq_len(ceiling(n / BS))) { idx <- perm[((b-1L)*BS+1L):min(b*BS, n)] xb <- ag_tensor(x_cm[, idx, drop = FALSE]) yb <- y_cm[, idx, drop = FALSE] with_grad_tape({ features <- backbone$forward(xb) logits <- head_layer$forward(features) loss <- ag_softmax_cross_entropy_loss(logits, yb) }) grads <- backward(loss) # Only step on head params opt$step(grads) opt$zero_grad() } } # Backbone weights should NOT change (not in optimizer) bb_after <- lapply(bb_params, function(p) p$data) for (nm in names(bb_before)) { expect_equal(bb_before[[nm]], bb_after[[nm]], info = paste("backbone param", nm, "should be frozen")) } # Head should have updated (loss should be finite) ag_eval(backbone) ag_eval(head_layer) features <- backbone$forward(ag_tensor(x_cm)) logits <- head_layer$forward(features)$data expect_true(all(is.finite(logits))) }) # ── 3. Sequential save/load with BatchNorm ───────────────── test_that("chain pattern: save/load sequential+BatchNorm preserves predictions", { set.seed(42) n <- 60L x <- matrix(rnorm(n * 2), n, 2) y <- cbind(as.numeric(x[,1] > 0), as.numeric(x[,1] <= 0)) m <- ggml_model_sequential() |> ggml_layer_dense(8L, activation = "relu", input_shape = 2L) |> ggml_layer_batch_norm() |> ggml_layer_dense(2L, activation = "softmax") |> ggml_compile(optimizer = "adam", loss = "categorical_crossentropy") m <- ggml_fit(m, x, y, epochs = 30L, batch_size = 10L, verbose = 0L) p_before <- ggml_predict(m, x[1:10, ], batch_size = 10L) # Save and load path <- tempfile(fileext = ".ggml") ggml_save_model(m, path) m2 <- ggml_load_model(path) p_after <- ggml_predict(m2, x[1:10, ], batch_size = 10L) # Predictions should match expect_equal(p_before, p_after, tolerance = 1e-4) }) # ── 4. Functional multi-input: two branches → add → classify ── test_that("chain pattern: functional multi-input trains and predicts", { set.seed(42) n <- 80L # Branch 1: continuous features x1 <- matrix(rnorm(n * 3), n, 3) # Branch 2: one-hot categorical (4 categories) cats <- sample(1:4, n, replace = TRUE) x2 <- matrix(0, n, 4) x2[cbind(1:n, cats)] <- 1 # Target: depends on first feature + category y_class <- as.integer(x1[,1] + (cats == 1) > 0) y <- cbind(y_class, 1L - y_class) * 1.0 inp1 <- ggml_input(shape = 3L) inp2 <- ggml_input(shape = 4L) h1 <- inp1 |> ggml_layer_dense(8L, activation = "relu") h2 <- inp2 |> ggml_layer_dense(8L, activation = "relu") merged <- ggml_layer_add(list(h1, h2)) out <- merged |> ggml_layer_dense(2L, activation = "softmax") m <- ggml_model(inputs = list(inp1, inp2), outputs = out) |> ggml_compile(optimizer = "adam", loss = "categorical_crossentropy") m <- ggml_fit(m, x = list(x1, x2), y = y, epochs = 50L, batch_size = 10L, verbose = 0L) p <- ggml_predict(m, x = list(x1, x2), batch_size = 16L) expect_equal(nrow(p), n) expect_equal(ncol(p), 2L) expect_true(all(is.finite(p))) # Probabilities sum to ~1 expect_true(all(abs(rowSums(p) - 1.0) < 0.05)) }) # ── 5. Sequential: dropout eval is deterministic ────────── test_that("chain pattern: sequential dropout predict is deterministic", { set.seed(42) n <- 40L x <- matrix(rnorm(n * 2), n, 2) y <- cbind(as.numeric(x[,1] > 0), as.numeric(x[,1] <= 0)) m <- ggml_model_sequential() |> ggml_layer_dense(16L, activation = "relu", input_shape = 2L) |> ggml_layer_dropout(0.5) |> ggml_layer_dense(2L, activation = "softmax") |> ggml_compile(optimizer = "adam", loss = "categorical_crossentropy") m <- ggml_fit(m, x, y, epochs = 20L, batch_size = 10L, verbose = 0L) # Two predict calls should give identical results (eval mode → no dropout) p1 <- ggml_predict(m, x[1:10, ], batch_size = 10L) p2 <- ggml_predict(m, x[1:10, ], batch_size = 10L) expect_equal(p1, p2) })