skip_if_not_installed("bigmemory")
skip_if_not_installed("RcppAnnoy")

library(bigmemory)

# Helpers ----

# Exhaustive Euclidean k-nearest-neighbour search used as the expected result.
#
# ref          Numeric matrix of reference points, one point per row.
# query        Numeric matrix of query points; defaults to `ref`.
# k            Number of neighbours to keep for each query row.
# exclude_self If TRUE and `ref` is identical to `query`, row i is never
#              reported as its own neighbour.
#
# Returns a list with `index` (n_query x k matrix of row numbers into `ref`)
# and `distance` (n_query x k matrix of Euclidean distances).
brute_force_knn_euclidean <- function(ref,
                                      query = ref,
                                      k,
                                      exclude_self = identical(ref, query)) {
  n_ref <- nrow(ref)
  n_query <- nrow(query)
  index <- matrix(NA_integer_, nrow = n_query, ncol = k)
  distance <- matrix(NA_real_, nrow = n_query, ncol = k)

  for (i in seq_len(n_query)) {
    # Squared distance from query row i to every reference row.
    dists <- rowSums((t(t(ref) - query[i, ]))^2)
    if (exclude_self && identical(ref, query)) {
      # Push the point itself to the end of the ordering.
      dists[i] <- Inf
    }
    # Ties on distance are broken by the lower reference row number.
    order_idx <- order(dists, seq_len(n_ref))
    keep <- order_idx[seq_len(k)]
    index[i, ] <- keep
    distance[i, ] <- sqrt(dists[keep])
  }

  list(index = index, distance = distance)
}

# Create a file-backed big.matrix holding `values`.
#
# values      Matrix whose dimensions and contents are copied.
# type        Storage type passed on to filebacked.big.matrix().
# backingpath Directory that receives the backing and descriptor files.
# name        File stem: the files are "<name>.bin" and "<name>.desc".
#
# Returns the filled big.matrix.
make_filebacked_matrix <- function(values, type, backingpath, name) {
  bm <- filebacked.big.matrix(
    nrow = nrow(values),
    ncol = ncol(values),
    type = type,
    backingfile = sprintf("%s.bin", name),
    descriptorfile = sprintf("%s.desc", name),
    backingpath = backingpath
  )
  bm[,] <- values
  bm
}

# Tests ----

test_that("build metadata includes v3 fields and validate/open round-trip works", {
  ref <- matrix(
    c(0, 0,
      10, 0,
      0, 10,
      10, 10),
    ncol = 2,
    byrow = TRUE
  )
  big_ref <- as.big.matrix(ref)
  path <- tempfile(fileext = ".ann")

  index <- annoy_build_bigmatrix(
    big_ref,
    path = path,
    n_trees = 20,
    seed = 77L,
    load_mode = "lazy"
  )
  # Checked before validation, which is asked to load the index.
  expect_false(annoy_is_loaded(index))

  validation <- annoy_validate_index(index, strict = TRUE, load = TRUE)
  reopened <- annoy_open_index(path, load_mode = "eager")
  loaded <- annoy_load_bigmatrix(path, load_mode = "lazy")
  expected <- brute_force_knn_euclidean(ref, k = 2, exclude_self = TRUE)
  metadata <- read.dcf(index$metadata_path)

  expect_s3_class(index, "bigannoy_index")
  expect_true(file.exists(path))
  expect_true(file.exists(index$metadata_path))
  expect_true(validation$valid)
  expect_true(is.character(index$index_id) && nzchar(index$index_id))
  expect_true(is.numeric(index$file_size) && index$file_size > 0)
  expect_true(is.character(index$file_md5) && nzchar(index$file_md5))
  expect_identical(index$load_mode, "lazy")
  expect_true(annoy_is_loaded(index))
  expect_true(annoy_is_loaded(reopened))
  expect_false(annoy_is_loaded(loaded))
  expect_equal(annoy_search_bigmatrix(reopened, k = 2)$index, expected$index)
  expect_equal(
    annoy_search_bigmatrix(loaded, k = 2)$distance,
    expected$distance,
    tolerance = 1e-12
  )
  expect_true(all(
    c("index_id", "file_size", "file_mtime", "file_md5", "load_mode") %in%
      colnames(metadata)
  ))
})

test_that("search lazily loads, close unloads, and repeated search reloads successfully", {
  ref <- as.big.matrix(
    matrix(c(0, 0, 2, 0, 0, 2, 2, 2), ncol = 2, byrow = TRUE)
  )
  index <- annoy_build_bigmatrix(
    ref,
    tempfile(fileext = ".ann"),
    n_trees = 20,
    load_mode = "lazy"
  )

  expect_false(annoy_is_loaded(index))
  first <- annoy_search_bigmatrix(index, k = 2)
  expect_true(annoy_is_loaded(index))

  annoy_close_index(index)
  expect_false(annoy_is_loaded(index))

  second <- annoy_search_bigmatrix(index, k = 2)
  expect_true(annoy_is_loaded(index))
  expect_equal(second$index, first$index)
  expect_equal(second$distance, first$distance, tolerance = 1e-12)
})

test_that("dense, big.matrix, descriptor object, and descriptor path queries all work", {
  td <- tempfile("bigannoy-descriptor-")
  dir.create(td, recursive = TRUE)

  ref <- matrix(
    c(0, 0,
      5, 0,
      0, 5,
      5, 5,
      9, 9),
    ncol = 2,
    byrow = TRUE
  )
  query <- matrix(
    c(0.2, 0.1,
      4.7, 5.1),
    ncol = 2,
    byrow = TRUE
  )

  ref_fb <- make_filebacked_matrix(
    ref, type = "double", backingpath = td, name = "ref"
  )
  query_fb <- make_filebacked_matrix(
    query, type = "double", backingpath = td, name = "query"
  )
  query_desc <- describe(query_fb)
  # Descriptor file written by make_filebacked_matrix() for `name = "query"`.
  query_desc_path <- file.path(td, "query.desc")

  index <- annoy_build_bigmatrix(
    describe(ref_fb),
    tempfile(tmpdir = td, fileext = ".ann"),
    n_trees = 25,
    seed = 99L
  )

  dense_result <- annoy_search_bigmatrix(index, query = query, k = 2)
  big_result <- annoy_search_bigmatrix(index, query = query_fb, k = 2)
  desc_result <- annoy_search_bigmatrix(index, query = query_desc, k = 2)
  path_result <- annoy_search_bigmatrix(index, query = query_desc_path, k = 2)
  expected <- brute_force_knn_euclidean(
    ref, query = query, k = 2, exclude_self = FALSE
  )

  expect_equal(dense_result$index, expected$index)
  expect_equal(dense_result$distance, expected$distance, tolerance = 1e-6)
  expect_equal(big_result$index, dense_result$index)
  expect_equal(desc_result$distance, dense_result$distance, tolerance = 1e-6)
  expect_equal(path_result$index, dense_result$index)
})

test_that("streaming outputs accept descriptor objects and descriptor paths", {
  td <- tempfile("bigannoy-stream-desc-")
  dir.create(td, recursive = TRUE)

  ref <- matrix(
    c(1, 1,
      8, 1,
      1, 8,
      8, 8),
    ncol = 2,
    byrow = TRUE
  )
  query <- matrix(
    c(2, 2,
      7, 7),
    ncol = 2,
    byrow = TRUE
  )

  ref_fb <- make_filebacked_matrix(
    ref, type = "double", backingpath = td, name = "ref"
  )
  query_fb <- make_filebacked_matrix(
    query, type = "double", backingpath = td, name = "query"
  )

  built <- annoy_build_bigmatrix(
    file.path(td, "ref.desc"),
    tempfile(tmpdir = td, fileext = ".ann"),
    n_trees = 20,
    seed = 101L
  )
  expected <- annoy_search_bigmatrix(built, query = query, k = 2)

  index_store <- filebacked.big.matrix(
    nrow = nrow(query),
    ncol = 2,
    type = "integer",
    backingfile = "index.bin",
    descriptorfile = "index.desc",
    backingpath = td
  )
  distance_store <- filebacked.big.matrix(
    nrow = nrow(query),
    ncol = 2,
    type = "double",
    backingfile = "distance.bin",
    descriptorfile = "distance.desc",
    backingpath = td
  )

  # One output is passed as a descriptor object, the other as a descriptor path.
  streamed <- annoy_search_bigmatrix(
    built,
    query = describe(query_fb),
    k = 2,
    xpIndex = describe(index_store),
    xpDistance = file.path(td, "distance.desc")
  )

  expect_equal(bigmemory::as.matrix(index_store), expected$index)
  expect_equal(
    bigmemory::as.matrix(distance_store),
    expected$distance,
    tolerance = 1e-12
  )
  expect_type(streamed, "list")
})

test_that("all supported metrics build and search, and native and debug backends agree", {
  ref <- matrix(
    c(1, 2,
      2, 1,
      4, 3,
      3, 5,
      7, 2),
    ncol = 2,
    byrow = TRUE
  )
  query <- matrix(
    c(1.1, 2.2,
      3.9, 3.2),
    ncol = 2,
    byrow = TRUE
  )
  metrics <- c("euclidean", "angular", "manhattan", "dot")

  # Build and search one metric with the backend option scoped to this call.
  # The body may switch `bigANNOY.backend` to "cpp"; saving the previous value
  # on entry and restoring it from this function's own exit handler puts the
  # caller's setting back after every metric, so no iteration can record a
  # value that an earlier iteration left behind.
  check_metric <- function(metric) {
    old_options <- options(bigANNOY.backend = "r")
    on.exit(options(old_options), add = TRUE)

    r_index <- suppressWarnings(
      annoy_build_bigmatrix(
        as.big.matrix(ref),
        tempfile(fileext = ".ann"),
        n_trees = 30,
        metric = metric,
        seed = 55L
      )
    )
    r_result <- suppressWarnings(
      annoy_search_bigmatrix(r_index, query = query, k = 2, search_k = 200L)
    )

    expect_identical(r_result$metric, metric)
    expect_identical(dim(r_result$index), c(nrow(query), 2L))
    expect_identical(dim(r_result$distance), c(nrow(query), 2L))

    # Compare against the compiled backend only when its entry points are
    # registered in the package's shared library.
    cpp_available <-
      isTRUE(is.loaded("_bigANNOY_cpp_annoy_open_index", PACKAGE = "bigANNOY")) &&
      isTRUE(is.loaded("_bigANNOY_cpp_annoy_handle_search", PACKAGE = "bigANNOY"))

    if (cpp_available) {
      options(bigANNOY.backend = "cpp")
      cpp_index <- annoy_build_bigmatrix(
        as.big.matrix(ref),
        tempfile(fileext = ".ann"),
        n_trees = 30,
        metric = metric,
        seed = 55L
      )
      cpp_result <- annoy_search_bigmatrix(
        cpp_index, query = query, k = 2, search_k = 200L
      )

      expect_equal(cpp_result$index, r_result$index)
      expect_equal(cpp_result$distance, r_result$distance, tolerance = 1e-6)
    }

    invisible(NULL)
  }

  for (metric in metrics) {
    check_metric(metric)
  }
})

test_that("validation catches impossible k, bad dimensions, missing files, and corrupted metadata", {
  td <- tempfile("bigannoy-validate-")
  dir.create(td, recursive = TRUE)

  ref <- as.big.matrix(matrix(c(0, 0, 1, 0), ncol = 2, byrow = TRUE))
  path <- file.path(td, "index.ann")
  built <- annoy_build_bigmatrix(ref, path, n_trees = 10)

  expect_error(annoy_search_bigmatrix(built, k = 2), "`k` exceeds")
  expect_error(
    annoy_search_bigmatrix(built, query = matrix(1, ncol = 1), k = 1),
    "same number of columns"
  )
  expect_error(
    annoy_open_index(tempfile(fileext = ".ann")),
    "does not exist"
  )

  # Output stores deliberately created with the wrong storage type.
  bad_index <- big.matrix(1, 1, type = "double")
  bad_distance <- big.matrix(1, 1, type = "integer")

  expect_error(
    annoy_search_bigmatrix(
      built,
      query = matrix(c(0, 0), ncol = 2),
      k = 1,
      xpIndex = bad_index
    ),
    "`xpIndex` big.matrix must store integers"
  )
  expect_error(
    annoy_search_bigmatrix(
      built,
      query = matrix(c(0, 0), ncol = 2),
      k = 1,
      xpIndex = big.matrix(1, 1, type = "integer"),
      xpDistance = bad_distance
    ),
    "`xpDistance` big.matrix must store doubles"
  )

  # Overwrite the stored checksum so that it no longer matches the index file.
  metadata <- read.dcf(built$metadata_path)
  metadata[1L, "file_md5"] <- "corrupted"
  write.dcf(
    as.data.frame(metadata, stringsAsFactors = FALSE),
    file = built$metadata_path
  )

  reopened <- annoy_open_index(path, load_mode = "lazy")
  report <- annoy_validate_index(reopened, strict = FALSE, load = FALSE)

  expect_false(report$valid)
  expect_error(
    annoy_validate_index(reopened, strict = TRUE, load = FALSE),
    "checksum"
  )
})

test_that("non-finite build and query inputs are rejected", {
  bad_ref <- as.big.matrix(matrix(c(0, 0, NA, 1), ncol = 2, byrow = TRUE))
  expect_error(
    annoy_build_bigmatrix(bad_ref, tempfile(fileext = ".ann")),
    "contains non-finite values"
  )

  ref <- as.big.matrix(matrix(c(0, 0, 1, 1, 2, 2), ncol = 2, byrow = TRUE))
  built <- annoy_build_bigmatrix(ref, tempfile(fileext = ".ann"), n_trees = 10)
  expect_error(
    annoy_search_bigmatrix(built, query = matrix(c(Inf, 0), ncol = 2), k = 1),
    "contains non-finite values"
  )
})

test_that("file-backed reopen and separated-column query matrices behave across sessions", {
  td <- tempfile("bigannoy-reopen-")
  dir.create(td, recursive = TRUE)

  ref <- matrix(rnorm(60), nrow = 20, ncol = 3)
  query <- matrix(rnorm(15), nrow = 5, ncol = 3)

  ref_fb <- make_filebacked_matrix(
    ref, type = "double", backingpath = td, name = "ref_large"
  )
  query_sep <- big.matrix(
    nrow(query), ncol(query), type = "double", separated = TRUE
  )
  query_sep[,] <- query

  path <- file.path(td, "persist.ann")
  built <- annoy_build_bigmatrix(
    file.path(td, "ref_large.desc"),
    path = path,
    n_trees = 25,
    metric = "euclidean",
    seed = 123L
  )
  reopened <- annoy_open_index(path, prefault = TRUE, load_mode = "eager")

  # The same query is supplied once as a descriptor and once as the
  # big.matrix's `address` slot.
  direct <- annoy_search_bigmatrix(
    built, query = describe(query_sep), k = 3, prefault = TRUE
  )
  reopened_result <- annoy_search_bigmatrix(
    reopened, query = query_sep@address, k = 3, prefault = TRUE
  )

  expect_true(annoy_is_loaded(reopened))
  expect_equal(reopened_result$index, direct$index)
  expect_equal(reopened_result$distance, direct$distance, tolerance = 1e-6)
})

test_that("benchmark interface supports user data, saved outputs, and suite summaries", {
  ref <- matrix(rnorm(80), nrow = 20, ncol = 4)
  query <- matrix(rnorm(16), nrow = 4, ncol = 4)
  single_out <- tempfile(fileext = ".csv")
  suite_out <- tempfile(fileext = ".csv")

  single <- benchmark_annoy_bigmatrix(
    x = ref,
    query = query,
    k = 2L,
    n_trees = 10L,
    exact = FALSE,
    output_path = single_out,
    load_mode = "eager"
  )
  suite <- benchmark_annoy_recall_suite(
    x = ref,
    query = query,
    k = 2L,
    n_trees = c(5L, 10L),
    search_k = c(-1L, 20L),
    exact = FALSE,
    output_path = suite_out,
    load_mode = "eager"
  )

  expect_true(file.exists(single_out))
  expect_true(file.exists(suite_out))
  expect_true(single$validation$valid)
  expect_true(all(
    c("summary", "params", "index_path", "metadata_path",
      "exact_available", "validation") %in% names(single)
  ))
  expect_true(all(
    c("metric", "backend", "self_search", "load_mode", "build_elapsed",
      "search_elapsed", "recall_at_k", "index_id") %in% names(single$summary)
  ))
  expect_equal(nrow(single$summary), 1L)
  expect_true(all(c("summary", "exact_available") %in% names(suite)))
  # Two `n_trees` values crossed with two `search_k` values.
  expect_equal(nrow(suite$summary), 4L)
  expect_true(all(
    c("n_trees", "search_k", "self_search", "load_mode",
      "build_elapsed", "search_elapsed") %in% names(suite$summary)
  ))
})