# Tests for inplace tensor operations

test_that("ggml_abs_inplace computes absolute value", {
  # ggml context for this test; freed when the test block exits.
  mem <- ggml_init(1024 * 1024)
  on.exit(ggml_free(mem))

  # Mixed-sign input so the sign flip is observable.
  src <- ggml_new_tensor_1d(mem, GGML_TYPE_F32, 4)
  ggml_set_f32(src, c(-1, 2, -3, 4))
  ggml_set_input(src)

  # Create the in-place abs node and mark it as the graph output.
  out <- ggml_abs_inplace(mem, src)
  ggml_set_output(out)

  # CPU backend with two threads; freed on exit after the context.
  cpu <- ggml_backend_cpu_init()
  on.exit(ggml_backend_free(cpu), add = TRUE)
  ggml_backend_cpu_set_n_threads(cpu, 2L)

  fwd <- ggml_build_forward_expand(mem, out)
  ggml_backend_graph_compute(cpu, fwd)

  values <- ggml_get_f32(out)
  expect_equal(values, c(1, 2, 3, 4), tolerance = 1e-5)
})

# Shared driver for unary in-place ops.
#
# Registers a testthat test named "<op_name> works". The test loads `input`
# into a fresh 1-D F32 tensor, applies `op_fn(ctx, tensor)`, computes the
# resulting graph on a 2-thread CPU backend, and compares the values read back
# from the op's result tensor with `expected`.
#
# op_name  - label used to build the test description.
# op_fn    - function taking (ctx, tensor) and returning the result tensor.
# input    - numeric vector written into the input tensor.
# expected - numeric vector the result is compared against.
# tol      - tolerance forwarded to expect_equal().
test_inplace_op <- function(op_name, op_fn, input, expected, tol = 1e-4) {
  description <- paste0(op_name, " works")

  test_that(description, {
    # Context is released when the test block exits.
    mem <- ggml_init(1024 * 1024)
    on.exit(ggml_free(mem))

    # Input tensor sized to match the supplied values.
    n_elem <- length(input)
    src <- ggml_new_tensor_1d(mem, GGML_TYPE_F32, n_elem)
    ggml_set_f32(src, input)
    ggml_set_input(src)

    # Apply the op under test and mark its result as the graph output.
    out <- op_fn(mem, src)
    ggml_set_output(out)

    cpu <- ggml_backend_cpu_init()
    on.exit(ggml_backend_free(cpu), add = TRUE)
    ggml_backend_cpu_set_n_threads(cpu, 2L)

    fwd <- ggml_build_forward_expand(mem, out)
    ggml_backend_graph_compute(cpu, fwd)

    observed <- ggml_get_f32(out)
    expect_equal(observed, expected, tolerance = tol)
  })
}

# Unary op cases: op label, op function, input values, expected output.
#
# The activation cases (sigmoid, tanh, silu, gelu) use non-zero inputs on
# purpose: tanh, silu and gelu all map 0 to 0, so an all-zero input cannot
# distinguish them from a no-op or from each other.
test_inplace_op("ggml_neg_inplace", ggml_neg_inplace,
                c(1, -2, 3, -4), c(-1, 2, -3, 4))

test_inplace_op("ggml_sqr_inplace", ggml_sqr_inplace,
                c(1, 2, 3, 4), c(1, 4, 9, 16))

test_inplace_op("ggml_sqrt_inplace", ggml_sqrt_inplace,
                c(1, 4, 9, 16), c(1, 2, 3, 4))

test_inplace_op("ggml_relu_inplace", ggml_relu_inplace,
                c(-1, 0, 1, 2), c(0, 0, 1, 2))

# Inputs shared by the activation cases below.
act_x <- c(-1, 0, 1, 2)

test_inplace_op("ggml_sigmoid_inplace", ggml_sigmoid_inplace,
                act_x, 1 / (1 + exp(-act_x)))

test_inplace_op("ggml_tanh_inplace", ggml_tanh_inplace,
                c(-1, 0, 0.5, 1), tanh(c(-1, 0, 0.5, 1)))

test_inplace_op("ggml_exp_inplace", ggml_exp_inplace,
                c(0, 1, 0, 0), c(1, exp(1), 1, 1))

# silu(x) = x * sigmoid(x).
# NOTE(review): tolerance is loosened in case the CPU kernel evaluates this
# in reduced precision -- tighten once confirmed.
test_inplace_op("ggml_silu_inplace", ggml_silu_inplace,
                act_x, act_x / (1 + exp(-act_x)),
                tol = 1e-3)

# Reference is the tanh approximation of GELU.
# NOTE(review): tolerance is wide enough to also accept the erf form and a
# half-precision lookup table -- confirm which one the backend uses.
test_inplace_op("ggml_gelu_inplace", ggml_gelu_inplace,
                act_x,
                0.5 * act_x *
                  (1 + tanh(sqrt(2 / pi) * (act_x + 0.044715 * act_x^3))),
                tol = 1e-2)

test_inplace_op("ggml_ceil_inplace", ggml_ceil_inplace,
                c(1.1, 2.5, -0.1, 3.0), c(2, 3, 0, 3))

test_inplace_op("ggml_floor_inplace", ggml_floor_inplace,
                c(1.1, 2.5, -0.1, 3.0), c(1, 2, -1, 3))

# 2.5 is expected to round to 3 (half away from zero), unlike R's round().
test_inplace_op("ggml_round_inplace", ggml_round_inplace,
                c(1.1, 2.5, 2.6, 3.0), c(1, 3, 3, 3))

test_inplace_op("ggml_log_inplace", ggml_log_inplace,
                c(1, exp(1), exp(2), exp(3)), c(0, 1, 2, 3))

# Binary inplace ops need two input tensors
test_that("ggml_mul_inplace works", {
  # ggml context for this test; freed when the test block exits.
  mem <- ggml_init(1024 * 1024)
  on.exit(ggml_free(mem))

  # Two equally sized operands: lhs is the in-place target, rhs the factor.
  lhs <- ggml_new_tensor_1d(mem, GGML_TYPE_F32, 4)
  rhs <- ggml_new_tensor_1d(mem, GGML_TYPE_F32, 4)
  ggml_set_f32(lhs, c(1, 2, 3, 4))
  ggml_set_f32(rhs, c(2, 3, 4, 5))
  ggml_set_input(lhs)
  ggml_set_input(rhs)

  product <- ggml_mul_inplace(mem, lhs, rhs)
  ggml_set_output(product)

  cpu <- ggml_backend_cpu_init()
  on.exit(ggml_backend_free(cpu), add = TRUE)
  ggml_backend_cpu_set_n_threads(cpu, 2L)

  fwd <- ggml_build_forward_expand(mem, product)
  ggml_backend_graph_compute(cpu, fwd)

  # Element-wise products: 1*2, 2*3, 3*4, 4*5.
  values <- ggml_get_f32(product)
  expect_equal(values, c(2, 6, 12, 20), tolerance = 1e-5)
})

test_that("ggml_div_inplace works", {
  # ggml context for this test; freed when the test block exits.
  mem <- ggml_init(1024 * 1024)
  on.exit(ggml_free(mem))

  # numer is the in-place target, denom the divisor.
  numer <- ggml_new_tensor_1d(mem, GGML_TYPE_F32, 4)
  denom <- ggml_new_tensor_1d(mem, GGML_TYPE_F32, 4)
  ggml_set_f32(numer, c(10, 20, 30, 40))
  ggml_set_f32(denom, c(2, 4, 5, 8))
  ggml_set_input(numer)
  ggml_set_input(denom)

  quotient <- ggml_div_inplace(mem, numer, denom)
  ggml_set_output(quotient)

  cpu <- ggml_backend_cpu_init()
  on.exit(ggml_backend_free(cpu), add = TRUE)
  ggml_backend_cpu_set_n_threads(cpu, 2L)

  fwd <- ggml_build_forward_expand(mem, quotient)
  ggml_backend_graph_compute(cpu, fwd)

  # Element-wise quotients: 10/2, 20/4, 30/5, 40/8.
  values <- ggml_get_f32(quotient)
  expect_equal(values, c(5, 5, 6, 5), tolerance = 1e-5)
})

test_that("ggml_sub_inplace works", {
  # ggml context for this test; freed when the test block exits.
  mem <- ggml_init(1024 * 1024)
  on.exit(ggml_free(mem))

  # minuend is the in-place target, subtrahend is taken away from it.
  minuend <- ggml_new_tensor_1d(mem, GGML_TYPE_F32, 4)
  subtrahend <- ggml_new_tensor_1d(mem, GGML_TYPE_F32, 4)
  ggml_set_f32(minuend, c(10, 20, 30, 40))
  ggml_set_f32(subtrahend, c(1, 2, 3, 4))
  ggml_set_input(minuend)
  ggml_set_input(subtrahend)

  difference <- ggml_sub_inplace(mem, minuend, subtrahend)
  ggml_set_output(difference)

  cpu <- ggml_backend_cpu_init()
  on.exit(ggml_backend_free(cpu), add = TRUE)
  ggml_backend_cpu_set_n_threads(cpu, 2L)

  fwd <- ggml_build_forward_expand(mem, difference)
  ggml_backend_graph_compute(cpu, fwd)

  # Element-wise differences: 10-1, 20-2, 30-3, 40-4.
  values <- ggml_get_f32(difference)
  expect_equal(values, c(9, 18, 27, 36), tolerance = 1e-5)
})

# dup takes (ctx, tensor) like every other unary op, so it runs through the
# shared unary driver instead of repeating the context/backend/graph
# boilerplate. The output is expected to equal the input; the test name
# ("ggml_dup_inplace works") and the 1e-5 tolerance are unchanged.
test_inplace_op("ggml_dup_inplace", ggml_dup_inplace,
                c(1, 2, 3, 4), c(1, 2, 3, 4),
                tol = 1e-5)

test_that("ggml_scale_inplace works", {
  # ggml context for this test; freed when the test block exits.
  mem <- ggml_init(1024 * 1024)
  on.exit(ggml_free(mem))

  src <- ggml_new_tensor_1d(mem, GGML_TYPE_F32, 4)
  ggml_set_f32(src, c(1, 2, 3, 4))
  ggml_set_input(src)

  # Multiply every element by the scalar factor 3.
  scaled <- ggml_scale_inplace(mem, src, 3.0)
  ggml_set_output(scaled)

  cpu <- ggml_backend_cpu_init()
  on.exit(ggml_backend_free(cpu), add = TRUE)
  ggml_backend_cpu_set_n_threads(cpu, 2L)

  fwd <- ggml_build_forward_expand(mem, scaled)
  ggml_backend_graph_compute(cpu, fwd)

  values <- ggml_get_f32(scaled)
  expect_equal(values, c(3, 6, 9, 12), tolerance = 1e-5)
})

test_that("ggml_soft_max_inplace works", {
  # ggml context for this test; freed when the test block exits.
  ctx <- ggml_init(1024 * 1024)
  on.exit(ggml_free(ctx), add = TRUE)

  logits <- c(1, 2, 3, 4)
  a <- ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4)
  ggml_set_f32(a, logits)
  ggml_set_input(a)

  r <- ggml_soft_max_inplace(ctx, a)
  ggml_set_output(r)

  backend <- ggml_backend_cpu_init()
  on.exit(ggml_backend_free(backend), add = TRUE)
  ggml_backend_cpu_set_n_threads(backend, 2L)

  graph <- ggml_build_forward_expand(ctx, r)
  ggml_backend_graph_compute(backend, graph)
  result <- ggml_get_f32(r)

  # Structural properties: a probability vector that preserves input order.
  expect_equal(sum(result), 1.0, tolerance = 1e-5)
  expect_true(all(diff(result) > 0))

  # The two checks above are also satisfied by plain x / sum(x), so pin the
  # actual softmax values element by element.
  expect_equal(result, exp(logits) / sum(exp(logits)), tolerance = 1e-5)
})

test_that("ggml_elu_inplace works", {
  # ggml context for this test; freed when the test block exits.
  ctx <- ggml_init(1024 * 1024)
  on.exit(ggml_free(ctx), add = TRUE)

  a <- ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4)
  ggml_set_f32(a, c(-1, 0, 1, 2))
  ggml_set_input(a)

  r <- ggml_elu_inplace(ctx, a)
  ggml_set_output(r)

  backend <- ggml_backend_cpu_init()
  on.exit(ggml_backend_free(backend), add = TRUE)
  ggml_backend_cpu_set_n_threads(backend, 2L)

  graph <- ggml_build_forward_expand(ctx, r)
  ggml_backend_graph_compute(backend, graph)
  result <- ggml_get_f32(r)

  # ELU: exp(x) - 1 for negative x, identity for x >= 0. Every element is
  # compared, so the negative branch is checked for its value (not only its
  # sign) and the last element is no longer left unasserted.
  expect_equal(result, c(expm1(-1), 0, 1, 2), tolerance = 1e-5)
})

# softplus takes (ctx, tensor) like every other unary op, so it runs through
# the shared unary driver instead of repeating the boilerplate. The reference
# is log(1 + exp(x)); the test name ("ggml_softplus_inplace works") and the
# 1e-4 tolerance (the driver's default) are unchanged.
test_inplace_op("ggml_softplus_inplace", ggml_softplus_inplace,
                c(0, 1, 2, 3), log(1 + exp(c(0, 1, 2, 3))))

test_that("ggml_diag_mask_inf_inplace works", {
  # ggml context for this test; freed when the test block exits.
  ctx <- ggml_init(1024 * 1024)
  on.exit(ggml_free(ctx), add = TRUE)

  # 3x3 tensor of ones so that any masked element stands out.
  a <- ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 3, 3)
  ggml_set_f32(a, rep(1.0, 9))
  ggml_set_input(a)

  r <- ggml_diag_mask_inf_inplace(ctx, a, 0L)
  ggml_set_output(r)

  backend <- ggml_backend_cpu_init()
  on.exit(ggml_backend_free(backend), add = TRUE)
  ggml_backend_cpu_set_n_threads(backend, 2L)

  graph <- ggml_build_forward_expand(ctx, r)
  ggml_backend_graph_compute(backend, graph)
  # Drop any dim attribute so the checks below use plain linear indexing.
  result <- as.numeric(ggml_get_f32(r))
  expect_length(result, 9)

  # n_past=0: mask where i > j, so (i=1,j=0) at index 2 is -Inf
  expect_equal(result[1], 1.0, tolerance = 1e-5)
  expect_true(is.infinite(result[2]) && result[2] < 0)

  # Full pattern, with i the fastest-varying index (1-based index j * 3 + i + 1):
  # (i=1,j=0) -> 2, (i=2,j=0) -> 3, (i=2,j=1) -> 6 are masked, the rest keep 1.
  masked <- c(2L, 3L, 6L)
  expect_true(all(result[masked] == -Inf))
  expect_equal(result[-masked], rep(1.0, 6), tolerance = 1e-5)
})

test_that("ggml_norm_inplace works", {
  # ggml context for this test; freed when the test block exits.
  ctx <- ggml_init(1024 * 1024)
  on.exit(ggml_free(ctx), add = TRUE)

  x <- c(1, 2, 3, 4)
  eps <- 1e-5

  a <- ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4)
  ggml_set_f32(a, x)
  ggml_set_input(a)

  r <- ggml_norm_inplace(ctx, a, eps)
  ggml_set_output(r)

  backend <- ggml_backend_cpu_init()
  on.exit(ggml_backend_free(backend), add = TRUE)
  ggml_backend_cpu_set_n_threads(backend, 2L)

  graph <- ggml_build_forward_expand(ctx, r)
  ggml_backend_graph_compute(backend, graph)
  result <- ggml_get_f32(r)

  # Normalised output is centred on zero ...
  expect_equal(mean(result), 0.0, tolerance = 1e-4)

  # ... and scaled by the population standard deviation. A zero mean alone is
  # also satisfied by mean-centering without scaling, so compare every element
  # against (x - mean) / sqrt(var + eps).
  centered <- x - mean(x)
  reference <- centered / sqrt(mean(centered^2) + eps)
  expect_equal(result, reference, tolerance = 1e-4)
})

test_that("ggml_rms_norm_inplace works", {
  # ggml context for this test; freed when the test block exits.
  mem <- ggml_init(1024 * 1024)
  on.exit(ggml_free(mem))

  src <- ggml_new_tensor_1d(mem, GGML_TYPE_F32, 4)
  ggml_set_f32(src, c(1, 2, 3, 4))
  ggml_set_input(src)

  # The third argument is the epsilon passed to the op.
  normed <- ggml_rms_norm_inplace(mem, src, 1e-5)
  ggml_set_output(normed)

  cpu <- ggml_backend_cpu_init()
  on.exit(ggml_backend_free(cpu), add = TRUE)
  ggml_backend_cpu_set_n_threads(cpu, 2L)

  fwd <- ggml_build_forward_expand(mem, normed)
  ggml_backend_graph_compute(cpu, fwd)

  # After RMS normalisation the root-mean-square of the output should be 1.
  values <- ggml_get_f32(normed)
  rms <- sqrt(mean(values^2))
  expect_equal(rms, 1.0, tolerance = 1e-3)
})

test_that("ggml_l2_norm_inplace works", {
  # ggml context for this test; freed when the test block exits.
  mem <- ggml_init(1024 * 1024)
  on.exit(ggml_free(mem))

  # A 3-4-5 vector padded with zeros: its Euclidean length is exactly 5.
  src <- ggml_new_tensor_1d(mem, GGML_TYPE_F32, 4)
  ggml_set_f32(src, c(3, 0, 4, 0))
  ggml_set_input(src)

  # The third argument is the epsilon passed to the op.
  unit <- ggml_l2_norm_inplace(mem, src, 1e-5)
  ggml_set_output(unit)

  cpu <- ggml_backend_cpu_init()
  on.exit(ggml_backend_free(cpu), add = TRUE)
  ggml_backend_cpu_set_n_threads(cpu, 2L)

  fwd <- ggml_build_forward_expand(mem, unit)
  ggml_backend_graph_compute(cpu, fwd)

  # After L2 normalisation the Euclidean length of the output should be 1.
  values <- ggml_get_f32(unit)
  euclid_len <- sqrt(sum(values^2))
  expect_equal(euclid_len, 1.0, tolerance = 1e-4)
})

test_that("ggml_rope_inplace works", {
  # ggml context for this test; freed when the test block exits.
  ctx <- ggml_init(1024 * 1024)
  on.exit(ggml_free(ctx), add = TRUE)

  # rope requires 3D+ tensor: ne[2] == b->ne[0] (n_positions)
  # Here the shape is (4, 2, 1), matching the single entry in `pos`.
  a <- ggml_new_tensor_3d(ctx, GGML_TYPE_F32, 4, 2, 1)
  ggml_set_f32(a, rep(1.0, 8))
  ggml_set_input(a)

  # One position index, set to 0.
  pos <- ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 1)
  ggml_set_i32(pos, 0L)
  ggml_set_input(pos)

  r <- ggml_rope_inplace(ctx, a, pos, n_dims = 4L, mode = 0L)
  ggml_set_output(r)

  backend <- ggml_backend_cpu_init()
  on.exit(ggml_backend_free(backend), add = TRUE)
  ggml_backend_cpu_set_n_threads(backend, 2L)

  graph <- ggml_build_forward_expand(ctx, r)
  ggml_backend_graph_compute(backend, graph)
  result <- ggml_get_f32(r)
  expect_length(result, 8)

  # pos=0: no rotation applied, so all 8 elements (both rows of 4) must come
  # back unchanged, not only the first row.
  expect_equal(as.numeric(result), rep(1.0, 8), tolerance = 1e-4)
})