# Helper functions for tests # This file contains common setup functions used across test files # Load required libraries for testing library(testthat) library(dplyr) library(lubridate) # Mock data creation functions that can be used across multiple test files #' Create standardized test search results #' #' @param n_articles Number of articles to create #' @param include_duplicates Whether to include some duplicate records #' @param missing_abstracts Number of records with missing abstracts #' @return Data frame with standardized search results create_standard_test_data <- function(n_articles = 20, include_duplicates = TRUE, missing_abstracts = 2) { base_data <- data.frame( id = paste0("art", 1:n_articles), title = paste("Article", 1:n_articles, "systematic review"), abstract = c( paste("Abstract", 1:(n_articles - missing_abstracts), "content"), rep(NA, missing_abstracts) ), source = sample(c("PubMed", "Embase", "Scopus", "Web of Science"), n_articles, replace = TRUE), date = as.Date("2020-01-01") + sample(0:1095, n_articles, replace = TRUE), doi = paste0("10.1000/journal.", 2020:2023, ".", sprintf("%03d", 1:n_articles)), authors = paste("Author", letters[1:n_articles], ", Co-author", LETTERS[1:n_articles]), duplicate = rep(FALSE, n_articles), stringsAsFactors = FALSE ) # Add some duplicates if requested if (include_duplicates && n_articles >= 4) { # Make articles 3 and 4 duplicates of articles 1 and 2 duplicate_indices <- c(3, 4) original_indices <- c(1, 2) for (i in seq_along(duplicate_indices)) { base_data$title[duplicate_indices[i]] <- base_data$title[original_indices[i]] base_data$abstract[duplicate_indices[i]] <- base_data$abstract[original_indices[i]] base_data$duplicate[duplicate_indices[i]] <- TRUE } } return(base_data) } #' Create test gold standard (known relevant articles) #' #' @param search_results Data frame of search results #' @param proportion Proportion of articles to include as relevant #' @return Vector of relevant article IDs create_test_gold_standard <- function(search_results, proportion = 0.3) { n_relevant <- ceiling(nrow(search_results) * proportion) sample(search_results$id, n_relevant) } #' Create test search strategy #' #' @param complexity Level of strategy complexity ("simple", "moderate", "complex") #' @return List representing search strategy create_test_search_strategy <- function(complexity = "moderate") { switch(complexity, "simple" = list( terms = c("systematic review"), databases = c("PubMed"), date_range = as.Date(c("2020-01-01", "2023-12-31")) ), "moderate" = list( terms = c("systematic review", "meta-analysis"), databases = c("PubMed", "Embase"), date_range = as.Date(c("2019-01-01", "2023-12-31")), filters = list(language = "English") ), "complex" = list( terms = c("systematic review", "meta-analysis", "evidence synthesis", "literature review"), databases = c("PubMed", "Embase", "Scopus", "Web of Science"), date_range = as.Date(c("2015-01-01", "2023-12-31")), filters = list( language = c("English", "Spanish"), study_type = c("randomized controlled trial", "cohort study"), publication_type = "journal article" ) ) ) } #' Create test benchmark dataset #' #' @param corpus_size Size of the document corpus #' @param relevance_rate Proportion of documents that are relevant #' @return List with corpus and relevant_ids create_test_benchmark <- function(corpus_size = 100, relevance_rate = 0.2) { corpus <- data.frame( id = paste0("bench", 1:corpus_size), title = paste("Benchmark article", 1:corpus_size), abstract = paste("This article discusses", sample(c("systematic review", "meta-analysis", "clinical trial", "observational study"), corpus_size, replace = TRUE)), source = sample(c("PubMed", "Embase", "Scopus"), corpus_size, replace = TRUE), date = as.Date("2018-01-01") + sample(0:1826, corpus_size, replace = TRUE), stringsAsFactors = FALSE ) n_relevant <- ceiling(corpus_size * relevance_rate) relevant_ids <- sample(corpus$id, n_relevant) list( corpus = corpus, relevant_ids = relevant_ids ) } #' Create test metrics object #' #' @param performance_level Performance level ("poor", "moderate", "good") #' @return List of test metrics create_test_metrics <- function(performance_level = "moderate") { base_precision <- switch(performance_level, "poor" = 0.3, "moderate" = 0.7, "good" = 0.9 ) base_recall <- switch(performance_level, "poor" = 0.4, "moderate" = 0.6, "good" = 0.8 ) f1_score <- 2 * (base_precision * base_recall) / (base_precision + base_recall) list( basic = list( total_records = 1000, unique_records = 950, duplicates = 50, missing_abstracts = 25, missing_dates = 10 ), precision_recall = list( precision = base_precision, recall = base_recall, f1_score = f1_score, true_positives = ceiling(base_recall * 100), false_positives = ceiling((1 - base_precision) * 150), false_negatives = ceiling((1 - base_recall) * 100), number_needed_to_read = 1 / base_precision ), efficiency = list( time_per_result = 0.1, time_per_relevant = 2.0, relevant_ratio = base_precision, efficiency_score = base_precision / 10 ), coverage = list( total_coverage = base_recall, by_database = data.frame( database = c("PubMed", "Embase", "Scopus"), coverage_count = c(30, 25, 20), coverage_rate = c(0.6, 0.5, 0.4), unique_coverage = c(10, 8, 5) ), redundancy_rate = 0.3 ), temporal = list( date_range = as.Date(c("2020-01-01", "2023-12-31")), temporal_distribution = data.frame( year = 2020:2023, n = c(200, 250, 300, 250) ) ) ) } #' Skip test if package not available #' #' @param package_name Name of package to check skip_if_not_installed <- function(package_name) { if (!requireNamespace(package_name, quietly = TRUE)) { testthat::skip(paste("Package", package_name, "not available")) } } #' Expect approximately equal with tolerance #' #' @param object Object to test #' @param expected Expected value #' @param tolerance Tolerance for comparison expect_approximately_equal <- function(object, expected, tolerance = 0.01) { testthat::expect_true(abs(object - expected) < tolerance, info = paste("Expected", expected, "but got", object)) } #' Create temporary test directory #' #' @return Path to temporary directory create_temp_test_dir <- function() { temp_dir <- file.path(tempdir(), "searchanalyzer_tests", format(Sys.time(), "%Y%m%d_%H%M%S")) dir.create(temp_dir, recursive = TRUE, showWarnings = FALSE) temp_dir } #' Clean up temporary test files #' #' @param file_paths Vector of file paths to remove cleanup_test_files <- function(file_paths) { for (path in file_paths) { if (file.exists(path)) { if (dir.exists(path)) { unlink(path, recursive = TRUE) } else { file.remove(path) } } } } #' Mock function for external dependencies #' #' @param function_name Name of function to mock #' @param return_value Value to return #' @param envir Environment to assign mock function mock_function <- function(function_name, return_value, envir = globalenv()) { mock_fn <- function(...) return_value assign(function_name, mock_fn, envir = envir) } #' Restore original function after mocking #' #' @param function_name Name of function to restore #' @param envir Environment to remove mock from unmock_function <- function(function_name, envir = globalenv()) { if (exists(function_name, envir = envir)) { rm(list = function_name, envir = envir) } } #' Test data validation helper #' #' @param data Data frame to validate #' @param required_cols Required column names #' @param min_rows Minimum number of rows #' @return TRUE if valid, error message if not validate_test_data <- function(data, required_cols = NULL, min_rows = 1) { if (!is.data.frame(data)) { stop("Input must be a data frame") } if (nrow(data) < min_rows) { stop(paste("Data must have at least", min_rows, "rows")) } if (!is.null(required_cols)) { missing_cols <- setdiff(required_cols, names(data)) if (length(missing_cols) > 0) { stop(paste("Missing required columns:", paste(missing_cols, collapse = ", "))) } } return(TRUE) } # Set up test environment setup_test_environment <- function() { # Set options for consistent testing options(stringsAsFactors = FALSE) # Set seed for reproducible random data set.seed(12345) # Create temporary directory for test outputs test_dir <- create_temp_test_dir() # Return test configuration list( test_dir = test_dir, seed = 12345, start_time = Sys.time() ) } # Cleanup test environment cleanup_test_environment <- function(config) { if (!is.null(config$test_dir) && dir.exists(config$test_dir)) { unlink(config$test_dir, recursive = TRUE) } # Reset random seed set.seed(NULL) cat("Test cleanup completed in", round(difftime(Sys.time(), config$start_time, units = "secs"), 2), "seconds\n") }