# ---------------------------------------------------------------------------
# Fixture paths
# ---------------------------------------------------------------------------

MODEL_PATH <- "/mnt/Data2/DS_projects/llm_models/tiny-mistral-test-Q2_K.gguf"
LORA_PATH <- "/mnt/Data2/DS_projects/llm_models/test-lora-adapter.gguf"

# ---------------------------------------------------------------------------
# Shared fixtures: the model and one context are loaded a single time and
# released via withr::defer() when the test run tears down.
# ---------------------------------------------------------------------------

HAS_MODEL <- file.exists(MODEL_PATH)

if (HAS_MODEL) {
  shared_model <- llama_load_model(MODEL_PATH)
  shared_info <- llama_model_info(shared_model)
  shared_ctx <- llama_new_context(shared_model, n_ctx = 256L, n_threads = 2L)
  withr::defer(llama_free_context(shared_ctx), teardown_env())
  withr::defer(llama_free_model(shared_model), teardown_env())
}

# Skip the calling test when the on-disk test model is absent.
skip_if_no_model <- function() {
  if (!HAS_MODEL) skip("test model not available")
}

# ---------------------------------------------------------------------------
# Package load (no model required)
# ---------------------------------------------------------------------------

test_that("package loads correctly", {
  expect_true(require(llamaR, quietly = TRUE))
})

# ---------------------------------------------------------------------------
# Verbosity (no model required)
# ---------------------------------------------------------------------------

test_that("verbosity can be set and retrieved", {
  prev <- llama_get_verbosity()
  llama_set_verbosity(0L)
  expect_equal(llama_get_verbosity(), 0L)
  llama_set_verbosity(3L)
  expect_equal(llama_get_verbosity(), 3L)
  llama_set_verbosity(prev)
  expect_equal(llama_get_verbosity(), prev)
})

# ---------------------------------------------------------------------------
# Hardware / system info (no model required)
# ---------------------------------------------------------------------------

test_that("llama_supports_gpu returns logical", {
  res <- llama_supports_gpu()
  expect_true(is.logical(res))
  expect_equal(length(res), 1L)
})

test_that("system_info returns non-empty string", {
  sysinfo <- llama_system_info()
  expect_true(is.character(sysinfo))
  expect_true(nchar(sysinfo) > 0)
})

test_that("supports_mmap returns logical", {
  res <- llama_supports_mmap()
  expect_true(is.logical(res))
  expect_equal(length(res), 1L)
})

test_that("supports_mlock returns logical", {
  res <- llama_supports_mlock()
  expect_true(is.logical(res))
  expect_equal(length(res), 1L)
})

test_that("max_devices returns positive integer", {
  res <- llama_max_devices()
  expect_true(is.integer(res))
  expect_true(res >= 1L)
})

test_that("llama_time_us returns positive numeric", {
  timestamp <- llama_time_us()
  expect_true(is.numeric(timestamp))
  expect_true(timestamp > 0)
})

test_that("llama_numa_init does not error with disabled", {
  expect_no_error(llama_numa_init("disabled"))
})

test_that("llama_numa_init errors on invalid strategy", {
  expect_error(llama_numa_init("bogus"), "invalid NUMA")
})

test_that("llama_backend_devices returns data.frame", {
  devices <- llama_backend_devices()
  expect_true(is.data.frame(devices))
  expect_true(nrow(devices) >= 1L)
  expect_true(all(c("name", "description", "type") %in% names(devices)))
  expect_true(all(devices$type %in% c("cpu", "gpu", "igpu", "accel", "unknown")))
})

test_that("llama_load_model with devices='cpu' works", {
  skip_if_no_model()
  cpu_model <- llama_load_model(MODEL_PATH, devices = "cpu")
  expect_false(is.null(cpu_model))
  llama_free_model(cpu_model)
})

test_that("chat_builtin_templates returns character vector", {
  tmpl_names <- llama_chat_builtin_templates()
  expect_true(is.character(tmpl_names))
  expect_true(length(tmpl_names) > 0)
})

# ---------------------------------------------------------------------------
# Model: load + info
# ---------------------------------------------------------------------------

test_that("model loads and info is returned", {
  skip_if_no_model()
  expect_false(is.null(shared_model))
  expect_true(is.list(shared_info))
  expect_true(shared_info$n_vocab > 0)
  expect_true(shared_info$n_embd > 0)
  expect_true(shared_info$n_layer > 0)
  expect_true(shared_info$n_head > 0)
  expect_true(nchar(shared_info$desc) > 0)
})

test_that("model_info returns extended fields", {
  skip_if_no_model()
  expect_true(is.numeric(shared_info$size))
  expect_true(shared_info$size > 0)
  expect_true(is.numeric(shared_info$n_params))
  expect_true(shared_info$n_params > 0)
  expect_true(is.logical(shared_info$has_encoder))
  expect_true(is.logical(shared_info$has_decoder))
  expect_true(is.logical(shared_info$is_recurrent))
})

# ---------------------------------------------------------------------------
# Model metadata
# ---------------------------------------------------------------------------

test_that("model_meta returns named character vector", {
  skip_if_no_model()
  md <- llama_model_meta(shared_model)
  expect_true(is.character(md))
  expect_true(length(md) > 0)
  expect_false(is.null(names(md)))
})

test_that("model_meta_val retrieves values by key", {
  skip_if_no_model()
  arch_val <- llama_model_meta_val(shared_model, "general.architecture")
  expect_true(is.character(arch_val) || is.null(arch_val))
  missing_val <- llama_model_meta_val(shared_model, "nonexistent.key.12345")
  expect_null(missing_val)
})

# ---------------------------------------------------------------------------
# Vocabulary info
# ---------------------------------------------------------------------------

test_that("vocab_info returns named integer vector", {
  skip_if_no_model()
  vtab <- llama_vocab_info(shared_model)
  expect_true(is.integer(vtab))
  expect_equal(length(vtab), 11L)
  expect_true(all(c("bos", "eos", "eot", "sep", "nl", "pad",
                    "fim_pre", "fim_suf", "fim_mid", "fim_rep",
                    "fim_sep") %in% names(vtab)))
})

# ---------------------------------------------------------------------------
# Chat templates
# ---------------------------------------------------------------------------

test_that("chat template can be retrieved from model", {
  skip_if_no_model()
  tpl <- llama_chat_template(shared_model)
  expect_true(is.null(tpl) || is.character(tpl))
})

test_that("chat_apply_template formats messages", {
  skip_if_no_model()
  tpl <- llama_chat_template(shared_model)
  if (is.null(tpl)) skip("model has no built-in chat template")
  msgs <- list(list(role = "user", content = "Hello"))
  rendered <- llama_chat_apply_template(msgs, template = tpl)
  expect_true(is.character(rendered))
  expect_true(nchar(rendered) > 0)
  expect_true(grepl("Hello", rendered, fixed = TRUE))
})

# ---------------------------------------------------------------------------
# Context: create + config
# ---------------------------------------------------------------------------

test_that("context can be created", {
  skip_if_no_model()
  expect_false(is.null(shared_ctx))
})

test_that("n_ctx returns correct context size", {
  skip_if_no_model()
  ctx_size <- llama_n_ctx(shared_ctx)
  expect_true(is.integer(ctx_size))
  expect_equal(ctx_size, 256L)
})

test_that("set_threads does not error", {
  skip_if_no_model()
  expect_no_error(llama_set_threads(shared_ctx, n_threads = 4L))
  expect_no_error(llama_set_threads(shared_ctx, n_threads = 2L, n_threads_batch = 4L))
  # put the shared context back in its original configuration
  llama_set_threads(shared_ctx, n_threads = 2L)
})

test_that("set_causal_attn does not error", {
  skip_if_no_model()
  expect_no_error(llama_set_causal_attn(shared_ctx, FALSE))
  expect_no_error(llama_set_causal_attn(shared_ctx, TRUE))
})

test_that("n_ctx_seq returns positive integer", {
  skip_if_no_model()
  val <- llama_n_ctx_seq(shared_ctx)
  expect_true(is.integer(val))
  expect_true(val >= 1L)
})

test_that("n_batch returns positive integer", {
  skip_if_no_model()
  val <- llama_n_batch(shared_ctx)
  expect_true(is.integer(val))
  expect_true(val >= 1L)
})

test_that("n_ubatch returns positive integer", {
  skip_if_no_model()
  val <- llama_n_ubatch(shared_ctx)
  expect_true(is.integer(val))
  expect_true(val >= 1L)
})

test_that("n_seq_max returns positive integer", {
  skip_if_no_model()
  val <- llama_n_seq_max(shared_ctx)
  expect_true(is.integer(val))
  expect_true(val >= 1L)
})

test_that("n_threads returns positive integer matching set_threads", {
  skip_if_no_model()
  llama_set_threads(shared_ctx, n_threads = 3L, n_threads_batch = 5L)
  expect_equal(llama_n_threads(shared_ctx), 3L)
  expect_equal(llama_n_threads_batch(shared_ctx), 5L)
  # put the shared context back in its original configuration
  llama_set_threads(shared_ctx, n_threads = 2L)
})

test_that("pooling_type returns known string", {
  skip_if_no_model()
  pooling <- llama_pooling_type(shared_ctx)
  expect_true(is.character(pooling))
  expect_true(pooling %in% c("none", "mean", "cls", "last", "rank", "unspecified"))
})

# ---------------------------------------------------------------------------
# Tokenize / Detokenize
# ---------------------------------------------------------------------------

test_that("tokenize and detokenize are inverse operations", {
  skip_if_no_model()
  input <- "Hello, world!"
  toks <- llama_tokenize(shared_ctx, input)
  expect_true(is.integer(toks))
  expect_true(length(toks) > 0)
  round_trip <- llama_detokenize(shared_ctx, toks)
  expect_true(is.character(round_trip))
  expect_equal(round_trip, input)
})

# ---------------------------------------------------------------------------
# Generation
# ---------------------------------------------------------------------------

test_that("generation produces non-empty output", {
  skip_if_no_model()
  out <- llama_generate(shared_ctx, "The capital of France is",
                        max_new_tokens = 20L, temp = 0.1)
  expect_true(is.character(out))
  expect_true(nchar(out, type = "bytes") > 0)
})

test_that("greedy generation is deterministic", {
  skip_if_no_model()
  first <- llama_generate(shared_ctx, "Once upon a time",
                          max_new_tokens = 30L, temp = 0.0)
  second <- llama_generate(shared_ctx, "Once upon a time",
                           max_new_tokens = 30L, temp = 0.0)
  expect_equal(first, second)
})

# ---------------------------------------------------------------------------
# Advanced sampling
# ---------------------------------------------------------------------------

test_that("generation with min_p produces output", {
  skip_if_no_model()
  out <- llama_generate(shared_ctx, "Hello", max_new_tokens = 10L,
                        temp = 0.8, min_p = 0.05)
  expect_true(is.character(out))
  expect_true(nchar(out, type = "bytes") > 0)
})

test_that("generation with repeat_penalty produces output", {
  skip_if_no_model()
  out <- llama_generate(shared_ctx, "Hello", max_new_tokens = 10L,
                        temp = 0.8, repeat_penalty = 1.1, repeat_last_n = 32L)
  expect_true(is.character(out))
  expect_true(nchar(out, type = "bytes") > 0)
})

test_that("generation with mirostat v2 produces output", {
  skip_if_no_model()
  out <- llama_generate(shared_ctx, "Hello", max_new_tokens = 10L,
                        mirostat = 2L, mirostat_tau = 5.0, mirostat_eta = 0.1)
  expect_true(is.character(out))
  expect_true(nchar(out, type = "bytes") > 0)
})

test_that("generation with typical_p produces output", {
  skip_if_no_model()
  out <- llama_generate(shared_ctx, "Hello", max_new_tokens = 10L,
                        temp = 0.8, typical_p = 0.9)
  expect_true(is.character(out))
  expect_true(nchar(out, type = "bytes") > 0)
})

# ---------------------------------------------------------------------------
# Embeddings
# ---------------------------------------------------------------------------

test_that("embeddings have correct dimensionality", {
  skip_if_no_model()
  emb <- llama_embeddings(shared_ctx, "Hello")
  expect_true(is.numeric(emb))
  expect_equal(length(emb), shared_info$n_embd)
  expect_true(any(emb != 0))
})

test_that("llama_get_embeddings_ith returns correct vector", {
  skip_if_no_model()
  ctx <- llama_new_context(shared_model, n_ctx = 256L, n_threads = 2L)
  on.exit(llama_free_context(ctx))
  # run an embedding pass so the context has output to read back
  emb_full <- llama_embeddings(ctx, "Hello")
  # index -1 should address the same (last) output row
  emb_ith <- llama_get_embeddings_ith(ctx, -1L)
  expect_true(is.numeric(emb_ith))
  expect_equal(length(emb_ith), shared_info$n_embd)
  expect_equal(emb_ith, emb_full)
})

# ---------------------------------------------------------------------------
# Logits
# ---------------------------------------------------------------------------

test_that("get_logits returns numeric vector of n_vocab length", {
  skip_if_no_model()
  llama_generate(shared_ctx, "Hello", max_new_tokens = 1L, temp = 0)
  logits <- llama_get_logits(shared_ctx)
  expect_true(is.numeric(logits))
  expect_equal(length(logits), shared_info$n_vocab)
  expect_true(any(logits != 0))
})

test_that("get_logits_ith(-1) matches get_logits after single-token decode", {
  skip_if_no_model()
  llama_generate(shared_ctx, "Hello", max_new_tokens = 1L, temp = 0)
  logits_all <- llama_get_logits(shared_ctx)
  logits_ith <- llama_get_logits_ith(shared_ctx, -1L)
  expect_true(is.numeric(logits_ith))
  expect_equal(length(logits_ith), shared_info$n_vocab)
  expect_equal(logits_ith, logits_all)
})

test_that("get_logits_ith(0) returns numeric vector of n_vocab length", {
  skip_if_no_model()
  llama_generate(shared_ctx, "Hello", max_new_tokens = 1L, temp = 0)
  logits <- llama_get_logits_ith(shared_ctx, 0L)
  expect_true(is.numeric(logits))
  expect_equal(length(logits), shared_info$n_vocab)
})

# ---------------------------------------------------------------------------
# KV cache operations
# ---------------------------------------------------------------------------

test_that("memory_clear works", {
  skip_if_no_model()
  llama_generate(shared_ctx, "Hello", max_new_tokens = 5L, temp = 0)
  expect_no_error(llama_memory_clear(shared_ctx))
})

test_that("memory_seq_rm works", {
  skip_if_no_model()
  llama_generate(shared_ctx, "Hello", max_new_tokens = 5L, temp = 0)
  removed <- llama_memory_seq_rm(shared_ctx, seq_id = 0L, p0 = -1L, p1 = -1L)
  expect_true(is.logical(removed))
})

test_that("memory_seq_keep works", {
  skip_if_no_model()
  llama_generate(shared_ctx, "Hello", max_new_tokens = 5L, temp = 0)
  expect_no_error(llama_memory_seq_keep(shared_ctx, seq_id = 0L))
})

test_that("memory_seq_pos_range returns named integer", {
  skip_if_no_model()
  pos_range <- llama_memory_seq_pos_range(shared_ctx, seq_id = 0L)
  expect_true(is.integer(pos_range))
  expect_equal(length(pos_range), 2L)
  expect_true(all(c("min", "max") %in% names(pos_range)))
})

test_that("memory_can_shift returns logical", {
  skip_if_no_model()
  res <- llama_memory_can_shift(shared_ctx)
  expect_true(is.logical(res))
  expect_equal(length(res), 1L)
})

# ---------------------------------------------------------------------------
# State save/load
# ---------------------------------------------------------------------------

test_that("state save and load round-trip", {
  skip_if_no_model()
  llama_generate(shared_ctx, "Hello world", max_new_tokens = 5L, temp = 0)
  state_file <- tempfile(fileext = ".bin")
  on.exit(unlink(state_file), add = TRUE)
  saved <- llama_state_save(shared_ctx, state_file)
  expect_true(saved)
  expect_true(file.exists(state_file))
  expect_true(file.info(state_file)$size > 0)
  ctx2 <- llama_new_context(shared_model, n_ctx = 256L, n_threads = 2L)
  loaded <- llama_state_load(ctx2, state_file)
  expect_true(loaded)
  llama_free_context(ctx2)
})

test_that("state_load errors on non-existent file", {
  skip_if_no_model()
  expect_error(llama_state_load(shared_ctx, "nonexistent_state.bin"))
})

# ---------------------------------------------------------------------------
# Performance counters
# ---------------------------------------------------------------------------

test_that("perf returns named list with expected fields", {
  skip_if_no_model()
  llama_generate(shared_ctx, "Hello", max_new_tokens = 5L, temp = 0)
  perf <- llama_perf(shared_ctx)
  expect_true(is.list(perf))
  expect_true(all(c("t_load_ms", "t_p_eval_ms", "t_eval_ms",
                    "n_p_eval", "n_eval", "n_reused") %in% names(perf)))
  expect_true(perf$n_eval > 0)
  expect_no_error(llama_perf_reset(shared_ctx))
})

# ---------------------------------------------------------------------------
# LoRA adapters (use a separate model load — LoRA modifies the model)
# ---------------------------------------------------------------------------

test_that("lora_load returns handle or errors on missing file", {
  skip_if_no_model()
  expect_error(llama_lora_load(shared_model, "nonexistent.gguf"))
  if (file.exists(LORA_PATH)) {
    lora <- llama_lora_load(shared_model, LORA_PATH)
    expect_false(is.null(lora))
  }
})

test_that("lora_apply and lora_remove work on context", {
  skip_if_no_model()
  if (!file.exists(LORA_PATH)) skip("test LoRA adapter not available")
  model <- llama_load_model(MODEL_PATH)
  lora <- llama_lora_load(model, LORA_PATH)
  ctx <- llama_new_context(model, n_ctx = 128L, n_threads = 2L)
  expect_no_error(llama_lora_apply(ctx, lora, scale = 1.0))
  removal <- llama_lora_remove(ctx, lora)
  expect_equal(removal, 0L)
  llama_free_context(ctx)
  llama_free_model(model)
})

test_that("lora_clear works on context", {
  skip_if_no_model()
  if (!file.exists(LORA_PATH)) skip("test LoRA adapter not available")
  model <- llama_load_model(MODEL_PATH)
  lora <- llama_lora_load(model, LORA_PATH)
  ctx <- llama_new_context(model, n_ctx = 128L, n_threads = 2L)
  llama_lora_apply(ctx, lora, scale = 0.5)
  expect_no_error(llama_lora_clear(ctx))
  # adapter was already cleared, so a removal attempt should report failure
  removal <- llama_lora_remove(ctx, lora)
  expect_equal(removal, -1L)
  llama_free_context(ctx)
  llama_free_model(model)
})

# ---------------------------------------------------------------------------
# token_to_piece
# ---------------------------------------------------------------------------

test_that("token_to_piece returns character string", {
  skip_if_no_model()
  # add_special=FALSE to avoid BOS which may render as empty string
  toks <- llama_tokenize(shared_ctx, "Hello", add_special = FALSE)
  piece <- llama_token_to_piece(shared_ctx, toks[1])
  expect_true(is.character(piece))
  expect_equal(length(piece), 1L)
  expect_true(nchar(piece) > 0)
})

test_that("token_to_piece with special=TRUE does not error", {
  skip_if_no_model()
  vtab <- llama_vocab_info(shared_model)
  bos_id <- vtab["bos"]
  if (is.na(bos_id) || bos_id < 0L) skip("model has no BOS token")
  piece <- llama_token_to_piece(shared_ctx, bos_id, special = TRUE)
  expect_true(is.character(piece))
  expect_equal(length(piece), 1L)
})

test_that("token_to_piece round-trips with tokenize", {
  skip_if_no_model()
  input <- "world"
  toks <- llama_tokenize(shared_ctx, input, add_special = FALSE)
  pieces <- vapply(toks, function(tok) llama_token_to_piece(shared_ctx, tok),
                   character(1))
  expect_true(length(pieces) > 0)
  reconstructed <- paste(pieces, collapse = "")
  # the tokenizer may prepend a space, so test containment rather than equality
  expect_true(grepl(input, reconstructed, fixed = TRUE))
})

# ---------------------------------------------------------------------------
# GPU: token_to_piece on GPU context
# ---------------------------------------------------------------------------

test_that("token_to_piece works on GPU context", {
  skip_if_no_model()
  skip_if(!llama_supports_gpu(), "GPU not available")
  gpu_model <- llama_load_model(MODEL_PATH, n_gpu_layers = -1L)
  gpu_ctx <- llama_new_context(gpu_model, n_ctx = 128L)
  on.exit({
    llama_free_context(gpu_ctx)
    llama_free_model(gpu_model)
  }, add = TRUE)
  toks <- llama_tokenize(gpu_ctx, "GPU test", add_special = FALSE)
  piece <- llama_token_to_piece(gpu_ctx, toks[1])
  expect_true(is.character(piece))
  expect_true(nchar(piece) > 0)
})

# ---------------------------------------------------------------------------
# llama_batch_init / llama_batch_free
# ---------------------------------------------------------------------------

test_that("batch_init returns external pointer", {
  batch <- llama_batch_init(512L)
  expect_true(is.list(batch) || inherits(batch, "externalptr"))
  expect_false(is.null(batch))
})

test_that("batch_init with embd mode does not error", {
  expect_no_error(llama_batch_init(64L, embd = 512L, n_seq_max = 4L))
})

test_that("batch_free clears the batch", {
  batch <- llama_batch_init(128L)
  expect_no_error(llama_batch_free(batch))
  # double-free should be safe (pointer already NULLed)
  expect_no_error(llama_batch_free(batch))
})

test_that("batch GC finalizer works (no explicit free)", {
  # allocate inside a local scope — garbage collection should clean up
  local({
    b <- llama_batch_init(256L)
    expect_false(is.null(b))
  })
  gc()
  succeed()
})

# ---------------------------------------------------------------------------
# llama_encode (encoder-decoder)
# ---------------------------------------------------------------------------

test_that("llama_encode returns integer on encoder-decoder model", {
  skip_if_no_model()
  skip_if(!shared_info$has_encoder || !shared_info$has_decoder,
          "model is not encoder-decoder")
  toks <- llama_tokenize(shared_ctx, "Translate: Hello world")
  ret <- llama_encode(shared_ctx, toks)
  expect_true(is.integer(ret))
  expect_equal(ret, 0L)
})

# ---------------------------------------------------------------------------
# embed_llamar
# ---------------------------------------------------------------------------
test_that("llama_embed_batch returns matrix with correct dimensions", {
  skip_if_no_model()
  # embedding=FALSE: sequential last-token decode (works on generative models)
  ctx <- llama_new_context(shared_model, n_ctx = 256L, n_threads = 2L)
  on.exit(llama_free_context(ctx))
  emb_mat <- llama_embed_batch(ctx, c("hello", "world", "test"))
  expect_true(is.matrix(emb_mat))
  expect_equal(nrow(emb_mat), 3L)
  expect_equal(ncol(emb_mat), shared_info$n_embd)
  expect_true(any(emb_mat != 0))
})

test_that("llama_embed_batch single text matches llama_embeddings", {
  skip_if_no_model()
  ctx1 <- llama_new_context(shared_model, n_ctx = 256L, n_threads = 2L)
  on.exit(llama_free_context(ctx1), add = TRUE)
  emb_single <- llama_embeddings(ctx1, "hello")
  ctx2 <- llama_new_context(shared_model, n_ctx = 256L, n_threads = 2L)
  on.exit(llama_free_context(ctx2), add = TRUE)
  emb_mat <- llama_embed_batch(ctx2, "hello")
  expect_equal(nrow(emb_mat), 1L)
  expect_equal(ncol(emb_mat), length(emb_single))
})

test_that("llama_embed_batch empty input returns 0-row matrix", {
  skip_if_no_model()
  ctx <- llama_new_context(shared_model, n_ctx = 256L, n_threads = 2L)
  on.exit(llama_free_context(ctx))
  emb_mat <- llama_embed_batch(ctx, character(0))
  expect_true(is.matrix(emb_mat))
  expect_equal(nrow(emb_mat), 0L)
})

test_that("embed_llamar partial application returns a function", {
  skip_if_no_model()
  embedder <- embed_llamar(model = shared_model)
  expect_true(is.function(embedder))
})

test_that("embed_llamar partial application produces list of vectors", {
  skip_if_no_model()
  embedder <- embed_llamar(model = shared_model, n_ctx = 256L, n_threads = 2L)
  out <- embedder(c("hello", "world"))
  expect_true(is.list(out))
  expect_equal(length(out), 2L)
  expect_true(is.numeric(out[[1]]))
  expect_equal(length(out[[1]]), shared_info$n_embd)
})

test_that("embed_llamar direct call returns matrix", {
  skip_if_no_model()
  emb_mat <- embed_llamar(c("hello", "world"), model = shared_model,
                          n_ctx = 256L, n_threads = 2L)
  expect_true(is.matrix(emb_mat))
  expect_equal(nrow(emb_mat), 2L)
  expect_equal(ncol(emb_mat), shared_info$n_embd)
})

test_that("embed_llamar normalizes by default", {
  skip_if_no_model()
  emb_mat <- embed_llamar("hello", model = shared_model,
                          n_ctx = 256L, n_threads = 2L)
  vec_norm <- sqrt(sum(emb_mat[1, ]^2))
  expect_equal(vec_norm, 1.0, tolerance = 1e-6)
})

test_that("embed_llamar normalize=FALSE skips normalization", {
  skip_if_no_model()
  emb_mat <- embed_llamar("hello", model = shared_model,
                          n_ctx = 256L, n_threads = 2L, normalize = FALSE)
  vec_norm <- sqrt(sum(emb_mat[1, ]^2))
  # raw embeddings are unlikely to have unit norm
  expect_true(is.numeric(emb_mat))
})

test_that("embed_llamar with data.frame returns data.frame with embedding column", {
  skip_if_no_model()
  input_df <- data.frame(text = c("hello", "world"), id = 1:2)
  out <- embed_llamar(input_df, model = shared_model,
                      n_ctx = 256L, n_threads = 2L)
  expect_true(is.data.frame(out))
  expect_true("embedding" %in% names(out))
  expect_true("id" %in% names(out))
  expect_equal(nrow(out), 2L)
  expect_true(is.list(out$embedding))
  expect_equal(length(out$embedding[[1]]), shared_info$n_embd)
})

test_that("embed_llamar errors on data.frame without text column", {
  skip_if_no_model()
  bad_df <- data.frame(content = "hello")
  expect_error(embed_llamar(bad_df, model = shared_model), "text")
})

test_that("embed_llamar with model path loads and frees model", {
  skip_if_no_model()
  emb_mat <- embed_llamar("hello", model = MODEL_PATH,
                          n_ctx = 256L, n_threads = 2L)
  expect_true(is.matrix(emb_mat))
  expect_equal(nrow(emb_mat), 1L)
  expect_equal(ncol(emb_mat), shared_info$n_embd)
})

# ---------------------------------------------------------------------------
# GPU: batch_init + encode on GPU context
# ---------------------------------------------------------------------------

test_that("batch_init works with GPU context loaded", {
  skip_if_no_model()
  skip_if(!llama_supports_gpu(), "GPU not available")
  gpu_model <- llama_load_model(MODEL_PATH, n_gpu_layers = -1L)
  gpu_ctx <- llama_new_context(gpu_model, n_ctx = 128L)
  on.exit({
    llama_free_context(gpu_ctx)
    llama_free_model(gpu_model)
  }, add = TRUE)
  batch <- llama_batch_init(128L)
  expect_false(is.null(batch))
  expect_no_error(llama_batch_free(batch))
})

# ---------------------------------------------------------------------------
# Chain: context introspection
# ---------------------------------------------------------------------------

test_that("context getters are consistent with creation params", {
  skip_if_no_model()
  ctx <- llama_new_context(shared_model, n_ctx = 128L, n_threads = 3L)
  on.exit(llama_free_context(ctx))
  llama_set_threads(ctx, n_threads = 3L, n_threads_batch = 6L)
  expect_true(llama_n_ctx(ctx) >= 128L)
  expect_true(llama_n_ctx_seq(ctx) >= 1L)
  expect_true(llama_n_batch(ctx) >= 1L)
  expect_true(llama_n_ubatch(ctx) >= 1L)
  expect_true(llama_n_seq_max(ctx) >= 1L)
  expect_equal(llama_n_threads(ctx), 3L)
  expect_equal(llama_n_threads_batch(ctx), 6L)
  expect_true(llama_pooling_type(ctx) %in%
                c("none", "mean", "cls", "last", "rank", "unspecified"))
})

# ---------------------------------------------------------------------------
# Chain: generate -> logits -> top token
# ---------------------------------------------------------------------------

test_that("generate then inspect logits for top token", {
  skip_if_no_model()
  llama_generate(shared_ctx, "The capital of France is",
                 max_new_tokens = 1L, temp = 0)
  logits <- llama_get_logits_ith(shared_ctx, -1L)
  expect_equal(length(logits), shared_info$n_vocab)
  top_id <- which.max(logits)
  expect_true(is.integer(top_id))
  expect_true(top_id >= 1L && top_id <= shared_info$n_vocab)
  # which.max is 1-based; the token id space is 0-based
  piece <- llama_token_to_piece(shared_ctx, top_id - 1L)
  expect_true(is.character(piece))
  expect_true(nchar(piece) > 0)
})

# ---------------------------------------------------------------------------
# Chain: generate -> save state -> restore -> continue
# ---------------------------------------------------------------------------

test_that("save state after generation and resume produces output", {
  skip_if_no_model()
  llama_generate(shared_ctx, "Once upon a time", max_new_tokens = 10L, temp = 0)
  state_file <- tempfile(fileext = ".bin")
  on.exit(unlink(state_file), add = TRUE)
  expect_true(llama_state_save(shared_ctx, state_file))
  ctx2 <- llama_new_context(shared_model, n_ctx = 256L, n_threads = 2L)
  on.exit(llama_free_context(ctx2), add = TRUE)
  expect_true(llama_state_load(ctx2, state_file))
  continuation <- llama_generate(ctx2, " there lived a",
                                 max_new_tokens = 10L, temp = 0)
  expect_true(is.character(continuation))
  expect_true(nchar(continuation) > 0)
})

# ---------------------------------------------------------------------------
# Chain: multi-sequence KV cache management
# ---------------------------------------------------------------------------

test_that("generate, copy sequence, remove original, continue", {
  skip_if_no_model()
  skip_if(llama_n_seq_max(shared_ctx) < 2L, "context supports only 1 sequence")
  llama_memory_clear(shared_ctx)
  llama_generate(shared_ctx, "Hello world", max_new_tokens = 5L, temp = 0)
  range0 <- llama_memory_seq_pos_range(shared_ctx, seq_id = 0L)
  expect_true(range0["max"] > range0["min"])
  llama_memory_seq_cp(shared_ctx, seq_src = 0L, seq_dst = 1L, p0 = -1L, p1 = -1L)
  llama_memory_seq_rm(shared_ctx, seq_id = 0L, p0 = -1L, p1 = -1L)
  continuation <- llama_generate(shared_ctx, "More text",
                                 max_new_tokens = 5L, temp = 0)
  expect_true(is.character(continuation))
})

# ---------------------------------------------------------------------------
# Chain: set threads -> verify -> reset
# ---------------------------------------------------------------------------

test_that("set_threads round-trip via n_threads getters", {
  skip_if_no_model()
  orig_t <- llama_n_threads(shared_ctx)
  orig_tb <- llama_n_threads_batch(shared_ctx)
  llama_set_threads(shared_ctx, n_threads = 1L, n_threads_batch = 2L)
  expect_equal(llama_n_threads(shared_ctx), 1L)
  expect_equal(llama_n_threads_batch(shared_ctx), 2L)
  llama_set_threads(shared_ctx, n_threads = orig_t, n_threads_batch = orig_tb)
  expect_equal(llama_n_threads(shared_ctx), orig_t)
  expect_equal(llama_n_threads_batch(shared_ctx), orig_tb)
})

# ---------------------------------------------------------------------------
# supports_rpc
# ---------------------------------------------------------------------------
test_that("supports_rpc returns logical", {
  expect_true(is.logical(llama_supports_rpc()))
  expect_equal(length(llama_supports_rpc()), 1L)
})

# ---------------------------------------------------------------------------
# synchronize
# ---------------------------------------------------------------------------

test_that("synchronize does not error", {
  skip_if_no_model()
  expect_no_error(llama_synchronize(shared_ctx))
})

# ---------------------------------------------------------------------------
# state_get_size
# ---------------------------------------------------------------------------

test_that("state_get_size returns positive numeric", {
  skip_if_no_model()
  state_size <- llama_state_get_size(shared_ctx)
  expect_true(is.numeric(state_size))
  expect_true(state_size > 0)
})

# ---------------------------------------------------------------------------
# memory_seq_div
# ---------------------------------------------------------------------------

test_that("memory_seq_div does not error", {
  skip_if_no_model()
  llama_generate(shared_ctx, "Hello", max_new_tokens = 5L, temp = 0)
  expect_no_error(llama_memory_seq_div(shared_ctx, seq_id = 0L,
                                       p0 = -1L, p1 = -1L, d = 2L))
})

# ---------------------------------------------------------------------------
# vocab_type
# ---------------------------------------------------------------------------

test_that("vocab_type returns known string", {
  skip_if_no_model()
  vocab_kind <- llama_vocab_type(shared_model)
  expect_true(is.character(vocab_kind))
  expect_true(vocab_kind %in% c("none", "spm", "bpe", "wpm", "ugm", "rwkv", "plamo2"))
})

# ---------------------------------------------------------------------------
# vocab_is_eog / vocab_is_control
# ---------------------------------------------------------------------------

test_that("vocab_is_eog returns logical for known tokens", {
  skip_if_no_model()
  vtab <- llama_vocab_info(shared_model)
  eos <- vtab["eos"]
  if (!is.na(eos) && eos >= 0L) {
    expect_true(llama_vocab_is_eog(shared_model, eos))
  }
  # token 0 is almost never EOG
  expect_true(is.logical(llama_vocab_is_eog(shared_model, 0L)))
})

test_that("vocab_is_control returns logical", {
  skip_if_no_model()
  expect_true(is.logical(llama_vocab_is_control(shared_model, 0L)))
})

# ---------------------------------------------------------------------------
# vocab_get_text / vocab_get_score
# ---------------------------------------------------------------------------

test_that("vocab_get_text returns character for valid token", {
  skip_if_no_model()
  toks <- llama_tokenize(shared_ctx, "Hello", add_special = FALSE)
  tok_text <- llama_vocab_get_text(shared_model, toks[1])
  expect_true(is.character(tok_text) || is.null(tok_text))
})

test_that("vocab_get_score returns numeric", {
  skip_if_no_model()
  toks <- llama_tokenize(shared_ctx, "Hello", add_special = FALSE)
  tok_score <- llama_vocab_get_score(shared_model, toks[1])
  expect_true(is.numeric(tok_score))
  expect_equal(length(tok_score), 1L)
})

# ---------------------------------------------------------------------------
# model_info includes n_head_kv
# ---------------------------------------------------------------------------

test_that("model_info contains n_head_kv", {
  skip_if_no_model()
  expect_true("n_head_kv" %in% names(shared_info))
  expect_true(is.integer(shared_info$n_head_kv))
  expect_true(shared_info$n_head_kv >= 1L)
})

# ---------------------------------------------------------------------------
# perf_print / memory_breakdown_print
# ---------------------------------------------------------------------------

test_that("perf_print does not error", {
  skip_if_no_model()
  llama_generate(shared_ctx, "Hello", max_new_tokens = 3L, temp = 0)
  expect_no_error(llama_perf_print(shared_ctx))
})

test_that("memory_breakdown_print does not error", {
  skip_if_no_model()
  expect_no_error(llama_memory_breakdown_print(shared_ctx))
})

# ---------------------------------------------------------------------------
# get_embeddings (flat / matrix)
# ---------------------------------------------------------------------------

test_that("get_embeddings returns matrix of correct shape", {
  skip_if_no_model()
  ctx <- llama_new_context(shared_model, n_ctx = 256L, n_threads = 2L)
  on.exit(llama_free_context(ctx))
  # run one embedding pass so a single output position is populated
  llama_embeddings(ctx, "Hello")
  emb_mat <- llama_get_embeddings(ctx, n_outputs = 1L)
  expect_true(is.matrix(emb_mat))
  expect_equal(nrow(emb_mat), 1L)
  expect_equal(ncol(emb_mat), shared_info$n_embd)
  expect_true(any(emb_mat != 0))
})

# ---------------------------------------------------------------------------
# get_model
# ---------------------------------------------------------------------------

test_that("get_model returns the same model object", {
  skip_if_no_model()
  handle <- llama_get_model(shared_ctx)
  expect_false(is.null(handle))
  expect_true(inherits(handle, "externalptr"))
  # model info queried via the retrieved handle matches the original
  info2 <- llama_model_info(handle)
  expect_equal(info2$n_embd, shared_info$n_embd)
  expect_equal(info2$n_vocab, shared_info$n_vocab)
})

# ---------------------------------------------------------------------------
# set_warmup
# ---------------------------------------------------------------------------

test_that("set_warmup does not error", {
  skip_if_no_model()
  expect_no_error(llama_set_warmup(shared_ctx, TRUE))
  expect_no_error(llama_set_warmup(shared_ctx, FALSE))
})

# ---------------------------------------------------------------------------
# set_abort_callback
# ---------------------------------------------------------------------------

test_that("set_abort_callback NULL clears without error", {
  skip_if_no_model()
  expect_no_error(llama_set_abort_callback(shared_ctx, NULL))
})

test_that("abort callback that always returns FALSE does not abort generation", {
  skip_if_no_model()
  llama_set_abort_callback(shared_ctx, function() FALSE)
  on.exit(llama_set_abort_callback(shared_ctx, NULL))
  out <- llama_generate(shared_ctx, "Hello", max_new_tokens = 5L, temp = 0)
  expect_true(is.character(out))
  expect_true(nchar(out) > 0)
})

test_that("abort callback that returns TRUE aborts immediately", {
  skip_if_no_model()
  llama_set_abort_callback(shared_ctx, function() TRUE)
  on.exit(llama_set_abort_callback(shared_ctx, NULL))
  # aborted generation may either return an empty string or raise an
  # error — both outcomes are acceptable as long as R does not crash
  out <- tryCatch(
    llama_generate(shared_ctx, "Hello", max_new_tokens = 20L, temp = 0),
    error = function(e) ""
  )
  expect_true(is.character(out))
})