# Performance regression test for the cells store path.
#
# Before the temp-table refactor + the O(n^2) collect_dim_rows fix + the
# vectorised normalize_*, storing a ~100k-row pixieweb result took many
# minutes and blocked the user's workflow. This test generates a
# synthetic batch of similar shape (~56k rows) and asserts the whole
# pipeline finishes in well under 30 seconds on any reasonable machine.
# (A sketch of the temp-table pattern is at the bottom of this file.)
#
# The budget is deliberately loose so the test doesn't flake on slow
# machines, but tight enough that a return of the O(n^2) or row-by-row
# pattern would blow it by orders of magnitude.

test_that("storing a ~56k-row pixieweb batch completes within the 30s budget", {
  skip_on_cran()
  skip_on_ci()

  n_regions <- 290L
  months <- sprintf("%dM%02d", rep(2010:2025, each = 12L), rep(1:12, times = 16L))
  n_months <- length(months)
  n_rows <- n_regions * n_months # 55,680

  df <- tibble::tibble(
    table_id = rep("TAB_PERF", n_rows),
    Region = rep(sprintf("%04d", seq_len(n_regions)), each = n_months),
    Region_text = rep(sprintf("Kommun %d", seq_len(n_regions)), each = n_months),
    Tid = rep(months, times = n_regions),
    Tid_text = rep(months, times = n_regions),
    value = runif(n_rows, 0, 1000)
  )

  path <- tempfile(fileext = ".sqlite")
  handle <- nxt_open(path)
  on.exit({
    nxt_close(handle)
    unlink(c(path, paste0(path, c("-wal", "-shm"))), force = TRUE)
  })

  ch <- nxt_cache_handler(
    source = "pixieweb",
    entity = "data",
    cache = TRUE,
    cache_location = handle,
    key_params = list(alias = "scb", table_id = "TAB_PERF", Tid = "all"),
    normalize_extra = list(alias = "scb")
  )

  t_store <- system.time(ch("store", df))["elapsed"]
  expect_lt(t_store, 30)

  # Sanity: all rows were persisted
  n_cells <- DBI::dbGetQuery(handle$con, "SELECT COUNT(*) AS n FROM cells;")$n
  expect_equal(n_cells, n_rows)

  # Sanity: discover + load roundtrip works and is also fast
  expect_true(ch("discover"))
  t_load <- system.time({
    back <- ch("load")
  })["elapsed"]
  expect_lt(t_load, 15)
  expect_equal(nrow(back), n_rows)
})

test_that("vec_build_dims dedupes dim vectors before hashing", {
  # 5000 rows but only 10 unique dim combinations: the region cycle (10)
  # and the cat cycle (5) stay aligned, so each row's (region, cat) pair
  # is determined by its position modulo 10. Hashing therefore runs only
  # for the 10 unique blobs, which means vec_build_dims is fast
  # regardless of total row count. (See the dedupe sketch below.)
  n <- 5000L
  n_unique <- 10L
  df <- tibble::tibble(
    region = rep(sprintf("R%02d", seq_len(n_unique)), length.out = n),
    cat = rep(letters[1:5], length.out = n),
    value = runif(n)
  )

  info <- vec_build_dims(df, c("region", "cat"))
  expect_length(info$dims_hash, n)
  # Exactly n_unique distinct (region, cat) pairs, hence exactly
  # n_unique distinct hashes.
  expect_equal(length(unique(info$dims_hash)), n_unique)
})
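
# A minimal sketch of the dedupe-before-hash idea the test above relies
# on. This is an assumption about what vec_build_dims does internally,
# not its actual code, and hash_one() is a hypothetical stand-in for the
# real hash function: hash each *unique* key once, then map the results
# back onto all rows with match(), so cost scales with the number of
# distinct dim combinations rather than the row count.
test_that("illustration: hashing unique keys then match() covers all rows", {
  # Same shape as the test above: pairs repeat with period 10.
  key <- paste(rep(sprintf("R%02d", 1:10), length.out = 5000L),
               rep(letters[1:5], length.out = 5000L), sep = "\x1f")
  uniq <- unique(key)

  hash_one <- function(x) sprintf("h(%s)", x) # hypothetical stand-in
  # Hash the 10 unique keys, then fan the results out to all 5000 rows.
  hashes <- unname(vapply(uniq, hash_one, character(1))[match(key, uniq)])

  expect_length(hashes, 5000L)
  expect_equal(length(unique(hashes)), length(uniq)) # only 10 distinct
})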
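
# A minimal sketch of the temp-table pattern the header comment refers
# to, assuming the store path works roughly like this; it is not the
# actual nxt_* implementation, and the table and column names here are
# illustrative only. The batch is bulk-loaded into a temporary table
# with a single dbWriteTable(), then moved into the target table with
# one INSERT ... SELECT, instead of one INSERT per row.
test_that("illustration: temp-table bulk insert moves rows in one statement", {
  skip_if_not_installed("RSQLite")
  con <- DBI::dbConnect(RSQLite::SQLite(), ":memory:")
  on.exit(DBI::dbDisconnect(con))

  DBI::dbExecute(con, "CREATE TABLE cells_demo (k TEXT, v REAL);")
  batch <- data.frame(k = sprintf("k%03d", 1:500), v = runif(500))

  # One round trip to load the batch, one statement to copy it over.
  DBI::dbWriteTable(con, "tmp_batch", batch, temporary = TRUE)
  DBI::dbExecute(con, "INSERT INTO cells_demo SELECT k, v FROM tmp_batch;")

  n <- DBI::dbGetQuery(con, "SELECT COUNT(*) AS n FROM cells_demo;")$n
  expect_equal(n, 500L)
})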