# Test file for log_cpm and extract_expression_data functions mock_api_response <- list( classifier_keys = list( sex = c("female", "male"), tissue = c("Brodmann (1909) area 4", "adipose tissue", "adrenal gland", "adrenal tissue", "lung", "liver", "brain", "heart", "kidney", "stomach", "breast", # Add more tissues to reach realistic numbers paste0("tissue_", 1:275)), disease = c(" B-cell", " M3 (promyelocytic)", " M5 (monocytic)", " T-cell", "cancer", "diabetes", "normal", "inflammation", "gastrointestinal stromal tumor", # Add more diseases to reach realistic numbers paste0("disease_", 1:184)), cell_line = c("22Rv1", "A-375", "A-549", "A-673", "HeLa", "MCF-7", "HEK293", # Add more cell lines to reach 65 total paste0("cell_line_", 1:58)), cell_type = c("A-375", "A-549", "ASC", "B cell", "epithelial", "fibroblast", "stem cell", "immune cell", # Add more cell types to reach 595 total paste0("cell_type_", 1:587)), sample_type = c("cell line", "differentiated", "iPSC", "organoid", "primary tissue", "tissue", "blood", "serum"), perturbation_type = c("compound", "control", "crispr", "drug", "genetic", "knockout", "overexpression", "small_molecule", "viral", "chemical") ), gene_order = c("ENSG00000000003", "ENSG00000000005", "ENSG00000000419", "ENSG00000000457", "ENSG00000000460", "ENSG00000000938", # Add more genes to reach 44,592 total paste0("ENSG0000000", sprintf("%04d", 1:44586))), model_version = 1, outputs = list( classifier_probs = data.frame( cell_line = I(list( matrix(runif(5*65, 0, 1e-10), nrow = 5, ncol = 65), matrix(runif(5*65, 0, 1e-10), nrow = 5, ncol = 65) )), cell_type = I(list( matrix(runif(5*595, 0, 1e-8), nrow = 5, ncol = 595), matrix(runif(5*595, 0, 1e-8), nrow = 5, ncol = 595) )), disease = I(list( matrix(runif(5*193, 0, 1e-15), nrow = 5, ncol = 193), matrix(runif(5*193, 0, 1e-15), nrow = 5, ncol = 193) )), perturbation_type = I(list( matrix(runif(5*10, 0, 1e-12), nrow = 5, ncol = 10), matrix(c(rep(0.981, 5), runif(5*9, 0, 1e-10)), nrow = 5, ncol = 10) )), sample_type = I(list( matrix(c(rep(1, 5), runif(5*7, 0, 1e-10)), nrow = 5, ncol = 8), matrix(c(runif(5*1, 0, 1e-13), rep(1, 5*7)), nrow = 5, ncol = 8) )), sex = I(list( matrix(rep(1, 5*2), nrow = 5, ncol = 2), matrix(rep(1, 5*2), nrow = 5, ncol = 2) )), tissue = I(list( matrix(runif(5*286, 0, 1e-9), nrow = 5, ncol = 286), matrix(runif(5*286, 0, 1e-20), nrow = 5, ncol = 286) )) ), expression = I(list( matrix(as.integer(c(rep(1015, 5), rep(6, 5), runif(5*44590, 0, 2000))), nrow = 5, ncol = 44592), matrix(as.integer(c(rep(372, 5), rep(1, 5), runif(5*44590, 0, 2000))), nrow = 5, ncol = 44592) )), latents = I(list( list(), list() )), metadata = data.frame( cell_line = c("A-549", NA), perturbation = c("ABL1", NA), perturbation_type = c("crispr", NA), perturbation_time = c("96 hours", NA), sample_type = c("cell line", "primary tissue"), disease = c(NA, "gastrointestinal stromal tumor"), age = c(NA, "65 years"), sex = c(NA, "female"), tissue = c(NA, "stomach"), stringsAsFactors = FALSE ) ) ) # Tests for log_cpm function test_that("log_cpm transforms data correctly", { # Create sample raw counts raw_counts <- data.frame( sample_id = c("A", "B", "C"), gene1 = c(100, 200, 300), gene2 = c(50, 100, 150), gene3 = c(10, 20, 30) ) # Transform to log CPM result <- log_cpm(raw_counts) # Manually calculate expected values for first row row1_lib_size <- sum(raw_counts[1, -1 ]) # 100 + 50 + 10 = 160 expected_gene1_cpm <- (100 / 160) * 1e6 # 625000 expected_gene2_cpm <- (50 / 160) * 1e6 # 312500 expected_gene3_cpm <- (10 / 160) * 1e6 # 62500 # Log1p of expected values expected_gene1_log <- log1p(expected_gene1_cpm) expected_gene2_log <- log1p(expected_gene2_cpm) expected_gene3_log <- log1p(expected_gene3_cpm) # Check column names have _cpm suffix expect_true(all(grepl("_cpm$", colnames(result[-1])))) # Check values for first row (with tolerance for floating point differences) expect_equal(result$gene1_cpm[1], expected_gene1_log, tolerance = 1e-5) expect_equal(result$gene2_cpm[2], expected_gene2_log, tolerance = 1e-5) expect_equal(result$gene3_cpm[3], expected_gene3_log, tolerance = 1e-5) # Check dimensions expect_equal(nrow(result), nrow(raw_counts)) expect_equal(ncol(result), ncol(raw_counts)) }) test_that("log_cpm handles edge cases correctly", { # Test with matrix input matrix_input <- matrix(c(100, 200, 50, 100, 10, 20), nrow = 2) colnames(matrix_input) <- c("ENSG00001", "ENSG00002", "ENSG00003") expect_error(log_cpm(matrix_input), NA) # Should not error # Test with zero counts zero_counts <- data.frame( sample_id = c("A", "B", "C"), gene1 = c(0, 200, 300), gene2 = c(50, 0, 150), gene3 = c(10, 20, 0) ) result_zeros <- log_cpm(zero_counts) expect_false(any(is.na(result_zeros))) # Test with negative values (should be converted to 0) neg_counts <- data.frame( sample_id = c("A", "B", "C"), gene1 = c(-10, 200, 300), gene2 = c(50, -20, 150), gene3 = c(10, 20, -30) ) result_neg <- log_cpm(neg_counts) expect_false(any(is.na(result_neg))) }) test_that("log_cpm handles invalid inputs correctly", { # Test with non-data frame/matrix expect_error(log_cpm(list(a = 1:3, b = 4:6)), "Input must be a data frame or matrix") # Test with empty data frame expect_error(log_cpm(data.frame()), "Input must have at least one row and one column") # Test with data frame with no columns empty_df <- data.frame(x = integer(0))[, FALSE] expect_error(log_cpm(empty_df), "Input must have at least one row and one column") }) # Tests for extract_expression_data function test_that("extract_expression_data processes API response correctly", { # Test with as_counts = TRUE (default) result_counts <- extract_expression_data(mock_api_response) # Check structure expect_type(result_counts, "list") expect_named(result_counts, c("metadata", "expression")) # Check metadata expect_s3_class(result_counts$metadata, "data.frame") expect_equal(nrow(result_counts$metadata), 10) # Check expression data expect_s3_class(result_counts$expression, "data.frame") expect_equal(nrow(result_counts$expression), 10) expect_equal(colnames(result_counts$expression)[1:4], c("sample_id", "ENSG00000000003", "ENSG00000000005", "ENSG00000000419")) # Test with as_counts = FALSE (log CPM transformation) result_logcpm <- extract_expression_data(mock_api_response, as_counts = FALSE) # Check expression data has been transformed (no longer integers) expect_false(all(sapply(result_logcpm$expression, is.integer))) }) test_that("extract_expression_data correctly assigns sample IDs", { # Test sample ID generation result <- extract_expression_data(mock_api_response) # Check sample IDs match between metadata and expression expect_equal( nrow(result$metadata), nrow(result$expression)) # sample ids should match expect_equal(result$metadata$sample_id, result$expression$sample_id) })