# Tests for Softmax Operations

# ============================================================================
# Basic Softmax
# ============================================================================

test_that("ggml_soft_max computes softmax correctly", {
  # Scratch context holding the tensors and graph; freed when the test exits.
  ctx <- ggml_init(16 * 1024 * 1024)
  on.exit(ggml_free(ctx), add = TRUE)

  input <- c(1, 2, 3, 4, 5)
  a <- ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 5)
  ggml_set_f32(a, input)

  result <- ggml_soft_max(ctx, a)
  graph <- ggml_build_forward_expand(ctx, result)
  ggml_graph_compute(ctx, graph)

  output <- ggml_get_f32(result)

  # Softmax properties: strictly inside (0, 1) and normalised.
  expect_true(all(output > 0))
  expect_true(all(output < 1))
  expect_equal(sum(output), 1, tolerance = 1e-5)

  # Any normalised positive vector passes the checks above, so also compare
  # against the closed form exp(x) / sum(exp(x)) evaluated in R.
  expected <- exp(input) / sum(exp(input))
  expect_equal(output, expected, tolerance = 1e-5)
})

test_that("ggml_soft_max outputs sum to 1", {
  ctx <- ggml_init(16 * 1024 * 1024)
  on.exit(ggml_free(ctx), add = TRUE)

  # Fixed seed: a failure on random input must be reproducible.
  set.seed(42)

  # One variable drives both the tensor length and the sample size so the
  # two cannot drift apart.
  n <- 100
  a <- ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n)
  ggml_set_f32(a, rnorm(n))

  result <- ggml_soft_max(ctx, a)
  graph <- ggml_build_forward_expand(ctx, result)
  ggml_graph_compute(ctx, graph)

  output <- ggml_get_f32(result)
  expect_equal(sum(output), 1, tolerance = 1e-5)
})

test_that("ggml_soft_max preserves order", {
  mem <- ggml_init(16 * 1024 * 1024)
  on.exit(ggml_free(mem), add = TRUE)

  # Strictly increasing logits.
  logits <- ggml_new_tensor_1d(mem, GGML_TYPE_F32, 5)
  ggml_set_f32(logits, c(1, 2, 3, 4, 5))

  probs_node <- ggml_soft_max(mem, logits)
  fwd <- ggml_build_forward_expand(mem, probs_node)
  ggml_graph_compute(mem, fwd)

  # Every consecutive difference of the result must be positive, i.e. the
  # output is strictly increasing just like the input.
  steps <- diff(ggml_get_f32(probs_node))
  expect_true(all(steps > 0))
})

test_that("ggml_soft_max with uniform input gives uniform output", {
  mem <- ggml_init(16 * 1024 * 1024)
  on.exit(ggml_free(mem), add = TRUE)

  # A constant vector of length `len`.
  len <- 5
  flat <- ggml_new_tensor_1d(mem, GGML_TYPE_F32, len)
  ggml_set_f32(flat, rep(1, len))

  probs_node <- ggml_soft_max(mem, flat)
  fwd <- ggml_build_forward_expand(mem, probs_node)
  ggml_graph_compute(mem, fwd)

  probs <- ggml_get_f32(probs_node)

  # Identical logits must share the probability mass equally: 1 / len each.
  uniform <- rep(1 / len, len)
  expect_equal(probs, uniform, tolerance = 1e-5)
})

test_that("ggml_soft_max handles negative inputs", {
  mem <- ggml_init(16 * 1024 * 1024)
  on.exit(ggml_free(mem), add = TRUE)

  # Mostly negative logits, with one zero and one positive entry.
  logits <- ggml_new_tensor_1d(mem, GGML_TYPE_F32, 5)
  ggml_set_f32(logits, c(-5, -3, -1, 0, 1))

  probs_node <- ggml_soft_max(mem, logits)
  fwd <- ggml_build_forward_expand(mem, probs_node)
  ggml_graph_compute(mem, fwd)

  probs <- ggml_get_f32(probs_node)

  # The sign of the input must not leak into the output: still positive and
  # normalised.
  expect_true(all(probs > 0))
  expect_equal(sum(probs), 1, tolerance = 1e-5)
})

# ============================================================================
# Softmax In-place
# ============================================================================

test_that("ggml_soft_max_inplace computes in-place", {
  ctx <- ggml_init(16 * 1024 * 1024)
  on.exit(ggml_free(ctx), add = TRUE)

  input <- c(1, 2, 3, 4, 5)
  a <- ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 5)
  ggml_set_f32(a, input)

  result <- ggml_soft_max_inplace(ctx, a)
  graph <- ggml_build_forward_expand(ctx, result)
  ggml_graph_compute(ctx, graph)

  output <- ggml_get_f32(result)

  expect_true(all(output > 0))
  expect_equal(sum(output), 1, tolerance = 1e-5)

  # The values must match the closed-form softmax of the original input.
  expect_equal(output, exp(input) / sum(exp(input)), tolerance = 1e-5)

  # What makes the operation "in-place": after the graph has run, the source
  # tensor itself holds the softmax values.
  # NOTE(review): assumes the result is a view over `a`'s buffer, as in
  # upstream ggml -- confirm for this binding.
  expect_equal(ggml_get_f32(a), output)
})

# ============================================================================
# Extended Softmax (with scale)
# ============================================================================

test_that("ggml_soft_max_ext with scale=1 matches basic softmax", {
  mem <- ggml_init(16 * 1024 * 1024)
  on.exit(ggml_free(mem), add = TRUE)

  # Two separate tensors carrying the same logits, one per operator.
  logits <- c(1, 2, 3, 4, 5)
  plain_in <- ggml_new_tensor_1d(mem, GGML_TYPE_F32, 5)
  ext_in <- ggml_new_tensor_1d(mem, GGML_TYPE_F32, 5)
  ggml_set_f32(plain_in, logits)
  ggml_set_f32(ext_in, logits)

  plain_node <- ggml_soft_max(mem, plain_in)
  ext_node <- ggml_soft_max_ext(
    mem, ext_in,
    mask = NULL, scale = 1.0, max_bias = 0.0
  )

  # Evaluate the basic operator first, then the extended one.
  plain_graph <- ggml_build_forward_expand(mem, plain_node)
  ggml_graph_compute(mem, plain_graph)
  plain_probs <- ggml_get_f32(plain_node)

  ext_graph <- ggml_build_forward_expand(mem, ext_node)
  ggml_graph_compute(mem, ext_graph)
  ext_probs <- ggml_get_f32(ext_node)

  # With no mask, unit scale and zero bias both operators must agree.
  expect_equal(plain_probs, ext_probs, tolerance = 1e-5)
})

test_that("ggml_soft_max_ext with scale < 1 produces flatter distribution", {
  ctx <- ggml_init(16 * 1024 * 1024)
  on.exit(ggml_free(ctx), add = TRUE)

  input <- c(1, 2, 3, 4, 5)
  a1 <- ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 5)
  a2 <- ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 5)
  ggml_set_f32(a1, input)
  ggml_set_f32(a2, input)

  r1 <- ggml_soft_max_ext(ctx, a1, scale = 1.0)
  # The scale multiplies the logits, so it acts as an inverse temperature:
  # scale = 0.5 corresponds to temperature 2 (higher temperature = flatter).
  r2 <- ggml_soft_max_ext(ctx, a2, scale = 0.5)

  g1 <- ggml_build_forward_expand(ctx, r1)
  ggml_graph_compute(ctx, g1)
  out1 <- ggml_get_f32(r1)

  g2 <- ggml_build_forward_expand(ctx, r2)
  ggml_graph_compute(ctx, g2)
  out2 <- ggml_get_f32(r2)

  # Lower scale -> flatter distribution (max is smaller)
  expect_lt(max(out2), max(out1))

  # Pin the actual values: softmax of the scaled logits.
  scaled <- 0.5 * input
  expect_equal(out2, exp(scaled) / sum(exp(scaled)), tolerance = 1e-5)
})

test_that("ggml_soft_max_ext with scale > 1 produces sharper distribution", {
  ctx <- ggml_init(16 * 1024 * 1024)
  on.exit(ggml_free(ctx), add = TRUE)

  input <- c(1, 2, 3, 4, 5)
  a1 <- ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 5)
  a2 <- ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 5)
  ggml_set_f32(a1, input)
  ggml_set_f32(a2, input)

  r1 <- ggml_soft_max_ext(ctx, a1, scale = 1.0)
  # The scale multiplies the logits, so it acts as an inverse temperature:
  # scale = 2 corresponds to temperature 0.5 (lower temperature = sharper).
  r2 <- ggml_soft_max_ext(ctx, a2, scale = 2.0)

  g1 <- ggml_build_forward_expand(ctx, r1)
  ggml_graph_compute(ctx, g1)
  out1 <- ggml_get_f32(r1)

  g2 <- ggml_build_forward_expand(ctx, r2)
  ggml_graph_compute(ctx, g2)
  out2 <- ggml_get_f32(r2)

  # Higher scale -> sharper distribution (max is larger)
  expect_gt(max(out2), max(out1))

  # Pin the actual values: softmax of the scaled logits.
  scaled <- 2.0 * input
  expect_equal(out2, exp(scaled) / sum(exp(scaled)), tolerance = 1e-5)
})

# ============================================================================
# 2D Softmax
# ============================================================================

test_that("ggml_soft_max on 2D tensor", {
  ctx <- ggml_init(16 * 1024 * 1024)
  on.exit(ggml_free(ctx), add = TRUE)

  # 4x3 tensor: first dimension has 4 elements, second has 3.
  a <- ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 4, 3)
  ggml_set_f32(a, as.numeric(1:12))

  result <- ggml_soft_max(ctx, a)
  graph <- ggml_build_forward_expand(ctx, result)
  ggml_graph_compute(ctx, graph)

  output <- ggml_get_f32(result)

  # Every value is a probability strictly inside (0, 1).
  expect_true(all(output > 0))
  expect_true(all(output < 1))

  # Softmax normalises along the first dimension, so each run of 4
  # consecutive values must sum to 1 on its own.
  # NOTE(review): assumes ggml_get_f32() returns the elements in memory
  # order (first dimension fastest) -- confirm for this binding.
  per_row <- matrix(output, nrow = 4)
  expect_equal(colSums(per_row), rep(1, 3), tolerance = 1e-5)
})

# ============================================================================
# Numerical Stability
# ============================================================================

test_that("ggml_soft_max handles large values", {
  mem <- ggml_init(16 * 1024 * 1024)
  on.exit(ggml_free(mem), add = TRUE)

  # Logits far beyond the range where a naive exp() stays finite in F32.
  huge <- ggml_new_tensor_1d(mem, GGML_TYPE_F32, 5)
  ggml_set_f32(huge, c(100, 200, 300, 400, 500))

  probs_node <- ggml_soft_max(mem, huge)
  fwd <- ggml_build_forward_expand(mem, probs_node)
  ggml_graph_compute(mem, fwd)

  probs <- ggml_get_f32(probs_node)

  # An overflowing implementation would produce NaN or Inf entries.
  has_na <- any(is.na(probs))
  has_inf <- any(is.infinite(probs))
  expect_false(has_na)
  expect_false(has_inf)
  expect_equal(sum(probs), 1, tolerance = 1e-4)
})

test_that("ggml_soft_max handles very negative values", {
  ctx <- ggml_init(16 * 1024 * 1024)
  on.exit(ggml_free(ctx), add = TRUE)

  a <- ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 5)
  ggml_set_f32(a, c(-500, -400, -300, -200, -100))

  result <- ggml_soft_max(ctx, a)
  graph <- ggml_build_forward_expand(ctx, result)
  ggml_graph_compute(ctx, graph)

  output <- ggml_get_f32(result)

  # Individual entries may legitimately round to zero here. What must not
  # happen is the normaliser underflowing to zero, which would surface as
  # NaN/Inf entries or a total different from 1.
  expect_false(any(is.na(output)))
  expect_false(any(is.infinite(output)))
  expect_equal(sum(output), 1, tolerance = 1e-4)
})