# Tests for Transformer Operations
# RoPE, Flash Attention, Causal Masking, Get Rows

test_that("ggml_set_i32 and ggml_get_i32 work correctly", {
  ctx <- ggml_init(16 * 1024 * 1024)
  on.exit(ggml_free(ctx))

  # Create I32 tensor
  t <- ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 5)

  # Set data
  ggml_set_i32(t, c(10L, 20L, 30L, 40L, 50L))

  # Get data back
  result <- ggml_get_i32(t)
  expect_equal(result, c(10L, 20L, 30L, 40L, 50L))
})

test_that("ggml_get_rows extracts embeddings by indices", {
  ctx <- ggml_init(32 * 1024 * 1024)
  on.exit(ggml_free(ctx))

  # Create embedding matrix: 4-dim embeddings, 10 tokens
  embeddings <- ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 4, 10)

  # Fill with sequential data for easy verification
  # Row i will have values [i*4, i*4+1, i*4+2, i*4+3]
  data <- as.numeric(0:39)
  ggml_set_f32(embeddings, data)

  # Create index tensor to select rows 0, 2, 5
  indices <- ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 3)
  ggml_set_i32(indices, c(0L, 2L, 5L))

  # Get rows
  result <- ggml_get_rows(ctx, embeddings, indices)

  # Build and compute
  graph <- ggml_build_forward_expand(ctx, result)
  ggml_graph_compute(ctx, graph)

  # Get result
  out <- ggml_get_f32(result)

  # Expected: rows 0, 2, 5 -> [0,1,2,3], [8,9,10,11], [20,21,22,23]
  expected <- c(0, 1, 2, 3, 8, 9, 10, 11, 20, 21, 22, 23)
  expect_equal(out, expected)

  # Check shape
  shape <- ggml_tensor_shape(result)
  expect_equal(shape[1], 4)  # embedding dim
  expect_equal(shape[2], 3)  # number of selected rows
})

test_that("ggml_diag_mask_inf creates causal mask", {
  ctx <- ggml_init(32 * 1024 * 1024)
  on.exit(ggml_free(ctx))

  # Create 4x4 matrix of ones
  m <- ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 4, 4)
  ggml_set_f32(m, rep(1.0, 16))

  # Apply causal mask (upper triangle -> -Inf)
  masked <- ggml_diag_mask_inf(ctx, m, 0)

  # Build and compute
  graph <- ggml_build_forward_expand(ctx, masked)
  ggml_graph_compute(ctx, graph)

  # Get result - GGML is row-major, R is column-major
  out <- ggml_get_f32(masked)

  # GGML stores [row0], [row1], ... so byrow = TRUE to match
  out_mat <- matrix(out, nrow = 4, ncol = 4, byrow = TRUE)

  # Lower triangle and diagonal should be 1
  # Upper triangle should be -Inf
  for (i in 1:4) {
    for (j in 1:4) {
      if (j <= i) {
        expect_equal(out_mat[i, j], 1.0)
      } else {
        expect_true(is.infinite(out_mat[i, j]) && out_mat[i, j] < 0)
      }
    }
  }
})
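# Why -Inf rather than 0: in attention, masked scores are set to -Inf so
# that the subsequent softmax assigns those positions exactly zero weight.
# A pure-R illustration of that effect (plain softmax, no ggml calls;
# this sketches the rationale, it is not part of the ggml API):
test_that("causal -Inf mask yields zero attention weights after softmax", {
  scores <- matrix(1.0, nrow = 4, ncol = 4)
  scores[upper.tri(scores)] <- -Inf  # mask future positions

  softmax_row <- function(x) {
    e <- exp(x - max(x))
    e / sum(e)
  }
  weights <- t(apply(scores, 1, softmax_row))

  # Masked (future) positions contribute nothing ...
  expect_true(all(weights[upper.tri(weights)] == 0))
  # ... and each row still sums to 1
  expect_equal(rowSums(weights), rep(1, 4))
})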
test_that("ggml_diag_mask_zero creates zero mask", {
  ctx <- ggml_init(32 * 1024 * 1024)
  on.exit(ggml_free(ctx))

  # Create 3x3 matrix of ones
  m <- ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 3, 3)
  ggml_set_f32(m, rep(1.0, 9))

  # Apply zero mask
  masked <- ggml_diag_mask_zero(ctx, m, 0)

  # Build and compute
  graph <- ggml_build_forward_expand(ctx, masked)
  ggml_graph_compute(ctx, graph)

  # Get result - GGML is row-major
  out <- ggml_get_f32(masked)
  out_mat <- matrix(out, nrow = 3, ncol = 3, byrow = TRUE)

  # Lower triangle and diagonal should be 1, upper should be 0
  for (i in 1:3) {
    for (j in 1:3) {
      if (j <= i) {
        expect_equal(out_mat[i, j], 1.0)
      } else {
        expect_equal(out_mat[i, j], 0.0)
      }
    }
  }
})

test_that("ggml_rope applies rotary position embedding", {
  ctx <- ggml_init(64 * 1024 * 1024)
  on.exit(ggml_free(ctx))

  # Create input tensor: [head_dim=8, n_head=2, seq_len=4, batch=1]
  head_dim <- 8
  n_head <- 2
  seq_len <- 4
  q <- ggml_new_tensor_4d(ctx, GGML_TYPE_F32, head_dim, n_head, seq_len, 1)

  # Fill with ones
  n_elem <- head_dim * n_head * seq_len
  ggml_set_f32(q, rep(1.0, n_elem))

  # Create position tensor
  pos <- ggml_new_tensor_1d(ctx, GGML_TYPE_I32, seq_len)
  ggml_set_i32(pos, 0:(seq_len - 1))

  # Apply RoPE
  q_rope <- ggml_rope(ctx, q, pos, head_dim, GGML_ROPE_TYPE_NORM)

  # Build and compute
  graph <- ggml_build_forward_expand(ctx, q_rope)
  ggml_graph_compute(ctx, graph)

  # Get result
  out <- ggml_get_f32(q_rope)

  # Basic sanity checks:
  # 1. Output should have same number of elements
  expect_equal(length(out), n_elem)

  # 2. Output should not be all the same (rotation applied)
  expect_true(sd(out) > 0)

  # 3. Shape should be preserved
  shape <- ggml_tensor_shape(q_rope)
  expect_equal(shape[1], head_dim)
  expect_equal(shape[2], n_head)
  expect_equal(shape[3], seq_len)
})
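# What RoPE computes, per pair of values: consecutive entries (x0, x1)
# at position p are rotated by an angle p * theta_i, with
# theta_i = freq_base^(-2i / n_dims). A pure-R sketch of one pair in the
# "normal" (non-NeoX) pairing -- an illustration of the formula, not a
# port of ggml's kernel:
test_that("RoPE rotation is the identity at position 0 (pure R sketch)", {
  rope_pair <- function(x0, x1, pos, theta) {
    c(x0 * cos(pos * theta) - x1 * sin(pos * theta),
      x0 * sin(pos * theta) + x1 * cos(pos * theta))
  }
  theta0 <- 10000^(-2 * 0 / 8)  # first pair (i = 0), head_dim = 8 -> theta = 1

  # Position 0: angle is 0, so values pass through unchanged -- which is
  # why an all-ones input is expected to stay 1.0 for the first token
  expect_equal(rope_pair(1, 1, 0, theta0), c(1, 1))

  # Position 1: values change, but the rotation preserves the pair's norm
  rotated <- rope_pair(1, 1, 1, theta0)
  expect_equal(sum(rotated^2), 2)
})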
test_that("ggml_rope_ext works with frequency scaling", {
  ctx <- ggml_init(64 * 1024 * 1024)
  on.exit(ggml_free(ctx))

  head_dim <- 8
  n_head <- 2
  seq_len <- 4
  q <- ggml_new_tensor_4d(ctx, GGML_TYPE_F32, head_dim, n_head, seq_len, 1)
  ggml_set_f32(q, rep(1.0, head_dim * n_head * seq_len))

  pos <- ggml_new_tensor_1d(ctx, GGML_TYPE_I32, seq_len)
  ggml_set_i32(pos, 0:(seq_len - 1))

  # Apply extended RoPE with custom freq_base
  q_rope <- ggml_rope_ext(ctx, q, pos, NULL,
                          n_dims = head_dim,
                          mode = GGML_ROPE_TYPE_NORM,
                          n_ctx_orig = 2048,
                          freq_base = 10000.0,
                          freq_scale = 1.0,
                          ext_factor = 0.0,
                          attn_factor = 1.0,
                          beta_fast = 32.0,
                          beta_slow = 1.0)

  graph <- ggml_build_forward_expand(ctx, q_rope)
  ggml_graph_compute(ctx, graph)

  out <- ggml_get_f32(q_rope)
  expect_equal(length(out), head_dim * n_head * seq_len)
  expect_true(sd(out) > 0)
})

test_that("ggml_flash_attn_ext computes attention", {
  ctx <- ggml_init(128 * 1024 * 1024)
  on.exit(ggml_free(ctx))

  head_dim <- 8
  n_head <- 2
  n_head_kv <- 2  # Same as n_head (no GQA)
  seq_len <- 4

  # Create Q, K, V tensors
  q <- ggml_new_tensor_4d(ctx, GGML_TYPE_F32, head_dim, n_head, seq_len, 1)
  k <- ggml_new_tensor_4d(ctx, GGML_TYPE_F32, head_dim, n_head_kv, seq_len, 1)
  v <- ggml_new_tensor_4d(ctx, GGML_TYPE_F32, head_dim, n_head_kv, seq_len, 1)

  # Fill with random-ish data
  set.seed(42)
  ggml_set_f32(q, rnorm(head_dim * n_head * seq_len))
  ggml_set_f32(k, rnorm(head_dim * n_head_kv * seq_len))
  ggml_set_f32(v, rnorm(head_dim * n_head_kv * seq_len))

  # Scale = 1/sqrt(head_dim)
  scale <- 1.0 / sqrt(head_dim)

  # Compute attention
  out <- ggml_flash_attn_ext(ctx, q, k, v, NULL, scale, 0.0, 0.0)

  graph <- ggml_build_forward_expand(ctx, out)
  ggml_graph_compute(ctx, graph)

  result <- ggml_get_f32(out)

  # Output should have same number of elements as Q
  expect_equal(length(result), head_dim * n_head * seq_len)

  # Output should not be all zeros or NaN
  expect_true(all(is.finite(result)))
  expect_true(sd(result) > 0)

  # Check head_dim is preserved (first dimension)
  shape <- ggml_tensor_shape(out)
  expect_equal(shape[1], head_dim)

  # Total elements should match
  expect_equal(prod(shape), head_dim * n_head * seq_len)
})

test_that("RoPE type constants are defined", {
  expect_equal(GGML_ROPE_TYPE_NORM, 0L)
  expect_equal(GGML_ROPE_TYPE_NEOX, 2L)
  expect_equal(GGML_ROPE_TYPE_MROPE, 8L)
  expect_equal(GGML_ROPE_TYPE_VISION, 24L)
})
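# ggml_flash_attn_ext fuses the standard attention computation,
#   out = softmax(Q %*% t(K) * scale) %*% V,
# into a single memory-efficient kernel. A pure-R single-head reference
# of that formula (an illustration of what the op computes, not a
# numerical comparison against the ggml kernel):
test_that("plain-R reference attention is finite and row-stochastic", {
  set.seed(42)
  seq_len <- 4
  head_dim <- 8
  Q <- matrix(rnorm(seq_len * head_dim), seq_len, head_dim)
  K <- matrix(rnorm(seq_len * head_dim), seq_len, head_dim)
  V <- matrix(rnorm(seq_len * head_dim), seq_len, head_dim)

  scores <- (Q %*% t(K)) / sqrt(head_dim)
  softmax_row <- function(x) {
    e <- exp(x - max(x))
    e / sum(e)
  }
  weights <- t(apply(scores, 1, softmax_row))
  out <- weights %*% V

  # Same shape as Q, finite values, attention weights sum to 1 per row
  expect_equal(dim(out), c(seq_len, head_dim))
  expect_true(all(is.finite(out)))
  expect_equal(rowSums(weights), rep(1, seq_len))
})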