# Test fixture generators for pepdiff # These functions create synthetic proteomics data for testing #' Generate synthetic proteomics test data #' #' Creates realistic PRM proteomics data with configurable parameters. #' Data follows Gamma distribution typical of mass spec abundances. #' #' @param n_peptides Number of peptides to generate #' @param n_reps Number of biological replicates per condition #' @param factors Named list of factor levels, e.g., list(treatment = c("ctrl", "trt")) #' @param effect_size Fold change for affected peptides #' @param affected_fraction Proportion of peptides with differential abundance #' @param na_rate Proportion of missing values to introduce #' @param seed Random seed for reproducibility #' @return A tibble suitable for read_pepdiff() or testing make_test_data <- function( n_peptides = 10, n_reps = 3, factors = list(treatment = c("ctrl", "trt")), effect_size = 2.0, affected_fraction = 0.3, na_rate = 0.05, seed = 42 ) { set.seed(seed) # Generate all factor combinations factor_grid <- expand.grid(factors, stringsAsFactors = FALSE) # Create peptide and gene IDs peptide_ids <- paste0("PEP_", sprintf("%03d", seq_len(n_peptides))) gene_ids <- paste0("GENE_", sprintf("%03d", ceiling(seq_len(n_peptides) / 2))) # Determine which peptides are affected (have differential abundance) n_affected <- round(n_peptides * affected_fraction) affected_peptides <- sample(peptide_ids, n_affected) # Base abundance parameters (realistic proteomics scale) base_mean <- 1e6 cv <- 0.3 # Coefficient of variation # Build the full dataset data_list <- list() for (i in seq_len(nrow(factor_grid))) { condition <- factor_grid[i, , drop = FALSE] for (rep_idx in seq_len(n_reps)) { for (pep_idx in seq_len(n_peptides)) { peptide <- peptide_ids[pep_idx] gene <- gene_ids[pep_idx] # Determine if this peptide is affected and if this is treatment is_affected <- peptide %in% affected_peptides # Calculate fold change multiplier # Apply effect to non-reference levels of the first factor fc_multiplier <- 1.0 if (is_affected) { first_factor <- names(factors)[1] ref_level <- factors[[first_factor]][1] current_level <- condition[[first_factor]] if (current_level != ref_level) { fc_multiplier <- effect_size } } # Generate Gamma-distributed value # shape = 1/CV^2, rate = shape/mean shape <- 1 / (cv^2) adjusted_mean <- base_mean * fc_multiplier rate <- shape / adjusted_mean value <- stats::rgamma(1, shape = shape, rate = rate) # Create row row <- data.frame( peptide = peptide, gene_id = gene, bio_rep = rep_idx, value = value, stringsAsFactors = FALSE ) # Add factor columns for (f in names(factors)) { row[[f]] <- condition[[f]] } data_list[[length(data_list) + 1]] <- row } } } result <- do.call(rbind, data_list) # Introduce missing values (MCAR pattern for simplicity) if (na_rate > 0) { n_missing <- round(nrow(result) * na_rate) if (n_missing > 0) { missing_idx <- sample(seq_len(nrow(result)), n_missing) result$value[missing_idx] <- NA } } tibble::as_tibble(result) } #' Generate minimal test data for quick tests #' #' Creates a small dataset with 5 peptides and 2 reps per condition #' #' @param seed Random seed #' @return A tibble make_minimal_test_data <- function(seed = 123) { make_test_data( n_peptides = 5, n_reps = 2, factors = list(treatment = c("ctrl", "trt")), effect_size = 2.0, affected_fraction = 0.4, na_rate = 0, seed = seed ) } #' Generate multi-factor test data #' #' Creates data with two factors (treatment x timepoint) for factorial design tests #' #' @param seed Random seed #' @return A tibble make_factorial_test_data <- function(seed = 456) { make_test_data( n_peptides = 10, n_reps = 3, factors = list( treatment = c("ctrl", "trt"), timepoint = c("0h", "24h") ), effect_size = 2.0, affected_fraction = 0.3, na_rate = 0.05, seed = seed ) } #' Generate test data with technical replicates #' #' @param n_tech_reps Number of technical replicates per biological replicate #' @param seed Random seed #' @return A tibble with tech_rep column make_tech_rep_test_data <- function(n_tech_reps = 2, seed = 789) { base_data <- make_test_data( n_peptides = 5, n_reps = 2, factors = list(treatment = c("ctrl", "trt")), effect_size = 2.0, affected_fraction = 0.4, na_rate = 0, seed = seed ) # Expand with technical replicates set.seed(seed + 1) expanded <- list() for (i in seq_len(nrow(base_data))) { for (tech in seq_len(n_tech_reps)) { row <- base_data[i, ] # Add small technical variation (CV ~10%) row$value <- row$value * stats::rlnorm(1, 0, 0.1) row$tech_rep <- tech expanded[[length(expanded) + 1]] <- row } } tibble::as_tibble(do.call(rbind, expanded)) } #' Write test data to a temporary CSV file #' #' @param data A tibble of test data #' @return Path to the temporary file write_test_csv <- function(data) { temp_file <- tempfile(fileext = ".csv") readr::write_csv(data, temp_file) temp_file } #' Generate test data with zeros (for testing Gamma GLM error handling) #' #' @param seed Random seed #' @return A tibble with some zero values make_zero_test_data <- function(seed = 999) { data <- make_minimal_test_data(seed = seed) # Replace some values with zeros data$value[c(1, 5, 10)] <- 0 data }