test_that("penalty works with all optimizers and mixture values", {
  skip_on_cran()
  skip_if_not(torch::torch_is_installed())
  skip_if_not_installed("modeldata")

  suppressPackageStartupMessages(library(dplyr))

  # Use small dataset for speed
  data("parabolic", package = "modeldata")
  set.seed(1)
  in_train <- sample(seq_len(nrow(parabolic)), 300)
  parabolic_tr <- parabolic[in_train, ]

  # Test parameters
  optimizers <- c("SGD", "ADAMw", "RMSprop", "Adadelta", "Adagrad")
  mixture_values <- c(0, 0.5, 1) # L2, elastic net, L1
  penalty_values <- c(0, 0.1, 10)

  # For each optimizer
  for (opt in optimizers) {
    # Skip long-running combinations for CRAN
    if (
      identical(Sys.getenv("NOT_CRAN"), "false") &&
        opt %in% c("Adadelta", "Adagrad")
    ) {
      next
    }

    # Test pure L2 (mixture = 0)
    set.seed(123)
    fit_no_penalty <- brulee_mlp(
      class ~ .,
      data = parabolic_tr,
      hidden_units = 2,
      epochs = 50L,
      learn_rate = if (opt == "ADAMw") 0.01 else 0.1,
      activation = "elu",
      penalty = 0,
      mixture = 0,
      batch_size = 32,
      optimizer = opt,
      verbose = FALSE
    )

    set.seed(123)
    fit_with_penalty <- brulee_mlp(
      class ~ .,
      data = parabolic_tr,
      hidden_units = 2,
      epochs = 50L,
      learn_rate = if (opt == "ADAMw") 0.01 else 0.1,
      activation = "elu",
      penalty = 10,
      mixture = 0,
      batch_size = 32,
      optimizer = opt,
      verbose = FALSE
    )

    # Penalty should make a difference
    expect_false(
      isTRUE(all.equal(fit_no_penalty$loss, fit_with_penalty$loss)),
      info = paste0(opt, " with mixture=0: penalty should affect loss")
    )

    # With penalty, loss should generally be higher or converge differently
    # (due to regularization)
    expect_true(
      !identical(tail(fit_no_penalty$loss, 1), tail(fit_with_penalty$loss, 1)),
      info = paste0(opt, " with mixture=0: final losses should differ")
    )

    # Test L1 penalty (mixture = 1) - skip for ADAMw as it requires mixture = 0
    if (opt != "ADAMw") {
      set.seed(123)
      fit_no_penalty_l1 <- brulee_mlp(
        class ~ .,
        data = parabolic_tr,
        hidden_units = 2,
        epochs = 50L,
        learn_rate = 0.1,
        activation = "elu",
        penalty = 0,
        mixture = 1,
        batch_size = 32,
        optimizer = opt,
        verbose = FALSE
      )

      set.seed(123)
      fit_with_penalty_l1 <- brulee_mlp(
        class ~ .,
        data = parabolic_tr,
        hidden_units = 2,
        epochs = 50L,
        learn_rate = 0.1,
        activation = "elu",
        penalty = 3,
        mixture = 1,
        batch_size = 32,
        optimizer = opt,
        verbose = FALSE
      )

      # Penalty should make a difference
      expect_false(
        isTRUE(all.equal(fit_no_penalty_l1$loss, fit_with_penalty_l1$loss)),
        info = paste0(opt, " with mixture=1: penalty should affect loss")
      )

      # Test elastic net (mixture = 0.5)
      set.seed(123)
      fit_no_penalty_en <- brulee_mlp(
        class ~ .,
        data = parabolic_tr,
        hidden_units = 2,
        epochs = 50L,
        learn_rate = 0.1,
        activation = "elu",
        penalty = 0,
        mixture = 0.5,
        batch_size = 32,
        optimizer = opt,
        verbose = FALSE
      )

      set.seed(123)
      fit_with_penalty_en <- brulee_mlp(
        class ~ .,
        data = parabolic_tr,
        hidden_units = 2,
        epochs = 50L,
        learn_rate = 0.1,
        activation = "elu",
        penalty = 3,
        mixture = 0.5,
        batch_size = 32,
        optimizer = opt,
        verbose = FALSE
      )

      # Penalty should make a difference
      expect_false(
        isTRUE(all.equal(fit_no_penalty_en$loss, fit_with_penalty_en$loss)),
        info = paste0(opt, " with mixture=0.5: penalty should affect loss")
      )
    }
  }
})

test_that("ADAMw enforces pure L2 penalty", {
  skip_on_cran()
  skip_if_not(torch::torch_is_installed())
  skip_if_not_installed("modeldata")

  data("parabolic", package = "modeldata")
  set.seed(1)
  in_train <- sample(seq_len(nrow(parabolic)), 300)
  parabolic_tr <- parabolic[in_train, ]

  # ADAMw with mixture = 0 should work fine
  expect_no_error({
    set.seed(123)
    fit_adamw_l2 <- brulee_mlp(
      class ~ .,
      data = parabolic_tr,
      hidden_units = 2,
      epochs = 50L,
      learn_rate = 0.01,
      activation = "elu",
      penalty = 0.1,
      mixture = 0,
      batch_size = 256,
      optimizer = "ADAMw",
      verbose = FALSE
    )
  })

  # ADAMw with mixture != 0 should warn and convert to 0
  expect_warning(
    {
      set.seed(123)
      fit_adamw_mixed <- brulee_mlp(
        class ~ .,
        data = parabolic_tr,
        hidden_units = 2,
        epochs = 50L,
        learn_rate = 0.01,
        activation = "elu",
        penalty = 0.1,
        mixture = 0.5,
        batch_size = 256,
        optimizer = "ADAMw",
        verbose = FALSE
      )
    },
    regexp = "pure L2 penalty"
  )

  # The warning case should produce same result as mixture = 0
  expect_equal(fit_adamw_l2$loss, fit_adamw_mixed$loss)
})

test_that("penalty magnitude affects regularization strength", {
  skip_on_cran()
  skip_if_not(torch::torch_is_installed())
  skip_if_not_installed("modeldata")

  data("parabolic", package = "modeldata")
  set.seed(1)
  in_train <- sample(seq_len(nrow(parabolic)), 300)
  parabolic_tr <- parabolic[in_train, ]

  # Test with SGD - different penalty values should produce different results
  set.seed(123)
  fit_pen_0 <- brulee_mlp(
    class ~ .,
    data = parabolic_tr,
    hidden_units = 2,
    epochs = 100L,
    learn_rate = 0.1,
    activation = "elu",
    penalty = 0,
    mixture = 0,
    batch_size = 256,
    optimizer = "SGD",
    verbose = FALSE
  )

  set.seed(123)
  fit_pen_01 <- brulee_mlp(
    class ~ .,
    data = parabolic_tr,
    hidden_units = 2,
    epochs = 100L,
    learn_rate = 0.1,
    activation = "elu",
    penalty = 0.1,
    mixture = 0,
    batch_size = 256,
    optimizer = "SGD",
    verbose = FALSE
  )

  set.seed(123)
  fit_pen_10 <- brulee_mlp(
    class ~ .,
    data = parabolic_tr,
    hidden_units = 2,
    epochs = 100L,
    learn_rate = 0.1,
    activation = "elu",
    penalty = 10,
    mixture = 0,
    batch_size = 256,
    optimizer = "SGD",
    verbose = FALSE
  )

  # All three should be different
  expect_false(isTRUE(all.equal(fit_pen_0$loss, fit_pen_01$loss)))
  expect_false(isTRUE(all.equal(fit_pen_0$loss, fit_pen_10$loss)))
  expect_false(isTRUE(all.equal(fit_pen_01$loss, fit_pen_10$loss)))

  # Higher penalty should generally lead to higher final loss (more regularization)
  # or at least different convergence
  expect_true(tail(fit_pen_10$loss, 1) > tail(fit_pen_01$loss, 1))
})

test_that("L1 vs L2 vs elastic net produce different results", {
  skip_on_cran()
  skip_if_not(torch::torch_is_installed())
  skip_if_not_installed("modeldata")

  data("parabolic", package = "modeldata")
  set.seed(1)
  in_train <- sample(seq_len(nrow(parabolic)), 300)
  parabolic_tr <- parabolic[in_train, ]

  penalty_val <- 1

  # L2 (mixture = 0)
  set.seed(123)
  fit_l2 <- brulee_mlp(
    class ~ .,
    data = parabolic_tr,
    hidden_units = 2,
    epochs = 100L,
    learn_rate = 0.1,
    activation = "elu",
    penalty = penalty_val,
    mixture = 0,
    batch_size = 256,
    optimizer = "SGD",
    verbose = FALSE
  )

  # L1 (mixture = 1)
  set.seed(123)
  fit_l1 <- brulee_mlp(
    class ~ .,
    data = parabolic_tr,
    hidden_units = 2,
    epochs = 100L,
    learn_rate = 0.1,
    activation = "elu",
    penalty = penalty_val,
    mixture = 1,
    batch_size = 256,
    optimizer = "SGD",
    verbose = FALSE
  )

  # Elastic net (mixture = 0.5)
  set.seed(123)
  fit_en <- brulee_mlp(
    class ~ .,
    data = parabolic_tr,
    hidden_units = 2,
    epochs = 100L,
    learn_rate = 0.1,
    activation = "elu",
    penalty = penalty_val,
    mixture = 0.5,
    batch_size = 256,
    optimizer = "SGD",
    verbose = FALSE
  )

  # All three should produce different results
  expect_false(isTRUE(all.equal(fit_l2$loss, fit_l1$loss)))
  expect_false(isTRUE(all.equal(fit_l2$loss, fit_en$loss)))
  expect_false(isTRUE(all.equal(fit_l1$loss, fit_en$loss)))
})

test_that("LBFGS penalty works correctly", {
  skip_on_cran()
  skip_if_not(torch::torch_is_installed())
  skip_if_not_installed("modeldata")

  data("parabolic", package = "modeldata")
  set.seed(1)
  in_train <- sample(seq_len(nrow(parabolic)), 200)
  parabolic_tr <- parabolic[in_train, ]

  # LBFGS with no penalty
  set.seed(123)
  fit_no_pen <- brulee_mlp(
    class ~ .,
    data = parabolic_tr,
    hidden_units = 2,
    epochs = 5L,
    learn_rate = 0.001,
    activation = "elu",
    penalty = 0,
    mixture = 0,
    optimizer = "LBFGS",
    verbose = FALSE
  )

  # LBFGS with penalty (uses loss function, not weight_decay)
  set.seed(123)
  fit_with_pen <- brulee_mlp(
    class ~ .,
    data = parabolic_tr,
    hidden_units = 2,
    epochs = 5L,
    learn_rate = 0.001,
    activation = "elu",
    penalty = 0.1,
    mixture = 0,
    optimizer = "LBFGS",
    verbose = FALSE
  )

  # Should produce different results
  expect_false(isTRUE(all.equal(fit_no_pen$loss, fit_with_pen$loss)))

  # LBFGS with L1 penalty should also work
  set.seed(123)
  fit_with_l1 <- brulee_mlp(
    class ~ .,
    data = parabolic_tr,
    hidden_units = 2,
    epochs = 5L,
    learn_rate = 0.001,
    activation = "elu",
    penalty = 0.1,
    mixture = 1,
    optimizer = "LBFGS",
    verbose = FALSE
  )

  # Should produce different results from both no penalty and L2
  expect_false(isTRUE(all.equal(fit_no_pen$loss, fit_with_l1$loss)))
  expect_false(isTRUE(all.equal(fit_with_pen$loss, fit_with_l1$loss)))
})