ExampleDb <- file.path("..", "data-tests", "ExampleDb.gz") db <- readChangeoDb(ExampleDb) expect_warning( db_gg <- readChangeoDb(file.path("..", "data-tests", "db_test.tsv")), "db_test.tsv is not in the Change-O format" ) ### countGenes #### test_that("countGenes", { # Without copy numbers genes <- countGenes(db, gene = "V_CALL", groups = "SAMPLE", mode = "family") expect_equal(genes$seq_freq, c(0.76, 0.87, 0.24, 0.05, 0.05, 0.02, 0.01), tolerance = 0.001 ) genes <- countGenes(db, gene = "V_CALL", groups = "SAMPLE", mode = "gene") expect_equal(genes$seq_freq, c( 0.41, 0.35, 0.24, 0.33, 0.26, 0.11, 0.10, 0.05, 0.05, 0.04, 0.02, 0.02, 0.01, 0.01 ), tolerance = 0.001 ) genes <- countGenes(db, gene = "V_CALL", groups = "SAMPLE", mode = "allele") expect_equal(genes$seq_freq, c( 0.350, 0.325, 0.215, 0.330, 0.085, 0.150, 0.100, 0.100, 0.070, 0.050, 0.050, 0.025, 0.040, 0.030, 0.020, 0.020, 0.010, 0.010, 0.010, 0.010 ), tolerance = 0.01 ) # With copy numbers and multiple groups genes <- countGenes(db, gene = "V_CALL", groups = c("SAMPLE", "ISOTYPE"), copy = "DUPCOUNT", mode = "family" ) expect_equal(genes$seq_freq, c( 0.78, 0.95, 0.95, 0.22, 0.63, 0.37, 0.60, 0.20, 0.40, 0.67, 0.60, 0.80, 0.40, 0.33, 0.03, 0.05, 0.01, 0.01 ), tolerance = 0.01 ) genes <- countGenes(db, gene = "V_CALL", groups = c("SAMPLE", "ISOTYPE"), copy = "DUPCOUNT", mode = "gene" ) expect_equal(round(genes$seq_freq, 2)[1:12], c( 0.61, 0.75, 0.22, 0.54, 0.40, 0.37, 0.17, 0.20, 0.15, 0.20, 0.60, 0.20 ), tolerance = 0.01 ) genes <- countGenes(db, gene = "V_CALL", groups = c("SAMPLE", "ISOTYPE"), copy = "DUPCOUNT", mode = "allele" ) expect_equal(genes$seq_freq[1:12], c(0.61, 0.72, 0.16, 0.41, 0.40, 0.37, 0.06, 0.20, 0.08, 0.20, 0.50, 0.08), tolerance = 0.01 ) # Testing of fill # Without copy numbers genes <- countGenes(db, gene = "V_CALL", groups = "SAMPLE", mode = "family", fill = T) expect_equal(genes$seq_freq, c(0.02, 0.87, 0.05, 0.05, 0.01, 0, 0.76, 0, 0.24, 0), tolerance = 0.001 ) genes <- countGenes(db, gene = "V_CALL", groups = "SAMPLE", mode = "gene", fill = T) expect_equal(genes$seq_freq, c( 0.02, 0.04, 0.01, 0.1, 0.11, 0.26, 0.33, 0.02, 0, 0.05, 0.05, 0.01, 0, 0, 0, 0, 0.41, 0, 0, 0, 0.35, 0, 0.24, 0 ), tolerance = 0.001 ) genes <- countGenes(db, gene = "V_CALL", groups = "SAMPLE", mode = "allele", fill = T) expect_equal(genes$seq_freq, c( 0.02, 0.01, 0.03, 0.01, 0.1, 0.07, 0.04, 0.01, 0.1, 0.15, 0.33, 0.02, 0, 0.05, 0.05, 0, 0.01, 0, 0, 0, 0, 0, 0.085, 0.325, 0, 0, 0, 0, 0, 0.35, 0, 0.215, 0.025, 0 ), tolerance = 0.01 ) # With copy numbers and multiple groups genes <- countGenes(db, gene = "V_CALL", groups = c("SAMPLE", "ISOTYPE"), copy = "DUPCOUNT", mode = "family", fill = T ) expect_equal(round(genes$seq_freq, 2)[1:12], c(0, 0.8, 0, 0.2, 0, 0, 0.6, 0.4, 0, 0, 0, 0.6), tolerance = 0.01 ) genes <- countGenes(db, gene = "V_CALL", groups = c("SAMPLE", "ISOTYPE"), copy = "DUPCOUNT", mode = "gene", fill = T ) expect_equal(round(genes$seq_freq, 2)[1:12], c(0, 0, 0, 0.2, 0, 0, 0.6, 0, 0, 0, 0.2, 0), tolerance = 0.01 ) genes <- countGenes(db, gene = "V_CALL", groups = c("SAMPLE", "ISOTYPE"), copy = "DUPCOUNT", mode = "allele", fill = T ) expect_equal(genes$seq_freq[1:12], c(0, 0, 0, 0, 0.2, 0, 0, 0, 0, 0, 0.6, 0), tolerance = 0.01 ) # Test how NAs are handled db_some_na <- data.frame( sequence_id = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10), sample = c("S1", "S2", "S4", "S3", "S2", "S3", "S4", "S4", "S3", "S4"), v_call = c("IGHV1-1", NA, "IGHV1-1,IGHV1-2", "IGHV1-2,IGHV1-3", "IGHV1-2", "IGHV1-3", "IGHV1-3", "IGHV1-1,IGHV1-2", "IGHV1-2", NA), dupcount = c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1), clone = c(1, 2, 2, 3, 3, 3, 4, 4, 4, 4), stringsAsFactors = F ) expect_warning(genes <- countGenes(db_some_na, gene = "v_call", mode = "gene"), NULL) expect_equal(genes$seq_freq, c(0.375, 0.375, 0.25), tolerance = 0.001 ) genes <- countGenes(db_some_na, gene = "v_call", mode = "gene", remove_na = FALSE) # A locus cannot be determined if the gene is NA. When determining the denominator (locus_count), # these sequences will be considered separately as their own "NA" locus. # Example: # locus gene seq_count locus_count seq_freq # # 1 IGH IGHV1-1 3 8 0.375 # 2 IGH IGHV1-2 3 8 0.375 # 3 IGH IGHV1-3 2 8 0.25 # 4 NA NA 2 2 1 expect_equal(genes$seq_freq, c(0.375, 0.375, 0.25, 1), tolerance = 0.001 ) genes <- countGenes(db_some_na, gene = "v_call", mode = "gene", copy = "clone", remove_na = FALSE) expect_equal(genes$copy_count, c(10, 7, 7, 6), tolerance = 0.001 ) genes <- countGenes(db_some_na, gene = "v_call", mode = "gene", groups = "sample", fill = TRUE, remove_na = FALSE) expect_equal(genes$seq_count, c(1, 0, 0, 0, 0, 1, 0, 1, 0, 2, 1, 0, 2, 0, 1, 1), tolerance = 0.001 ) db_all_na <- data.frame( sequence_id = c(1, 2, 3), sample = c("S1", "S2", "S2"), v_call = c(NA, NA, NA), dupcount = c(1, 1, 1), clone = c(1, 1, 1), stringsAsFactors = F ) expect_warning(genes <- countGenes(db_all_na, gene = "v_call", mode = "gene"), "The column v_call contains no data") expect_equal(nrow(genes), 0, tolerance = 0.001 ) expect_warning(genes <- countGenes(db_all_na, gene = "v_call", mode = "gene", remove_na = FALSE), "The column v_call contains no data") expect_equal(genes$seq_count, 3, tolerance = 0.001 ) expect_warning( genes <- countGenes(db_all_na, gene = "v_call", mode = "gene", groups = "sample", copy = "dupcount", fill = TRUE, remove_na = FALSE), "The column v_call contains no data" ) expect_equal(genes$copy_count, c(1, 2), tolerance = 0.001 ) # testing addition of collapse and first flags genes <- countGenes(db, gene = "J_CALL", mode = "allele", first = FALSE) expect_equal(genes$seq_freq, c(0.283, 0.213, 0.183, 0.133, 0.123, 0.0233, 0.020, 0.010, 0.010), tolerance = 0.001 ) genes <- countGenes(db, gene = "V_CALL", mode = "gene", first = FALSE, collapse = FALSE) expect_equal(genes$seq_freq, c( 0.31, 0.233, 0.17, 0.11, 0.0867, 0.0333, 0.0167, 0.0133, 0.00667, 0.00667, 0.00667, 0.0033, 0.0033 ), tolerance = 0.001 ) # test bulk percentages now being by locus # need data with mixed loci db_hl <- data.frame( sequence_id = c(1:30), sample = c(rep("S1", 15), rep("S2", 15)), v_call = c( "IGHV1-2,IGHV1-3", "IGHV1-2", "IGHV1-2", "IGHV1-3", "IGHV1-3", "IGHV1-3", "IGHV1-3", "IGHV2-1,IGHV2-2", "IGHV2-2", "IGHV2-2", "IGKV1-1", "IGKV1-2", "IGKV1-1,IGKV1-2", "IGLV1-1", "IGLV2-1", "IGHV1-2", "IGHV1-2", "IGHV3-1", "IGHV3-1", "IGHV2-1,IGHV2-2", "IGHV2-1", "IGHV2-2", "IGHV2-2", "IGHV2-2", "IGHV3-1", "IGKV1-1", "IGKV1-2", "IGKV2-1", "IGLV1-1", "IGLV1-1" ), j_call = c( "IGHJ1", "IGHJ1", "IGHJ1", "IGHJ1", "IGHJ2", "IGHJ2", "IGHJ2", "IGHJ2", "IGHJ2", "IGHJ2", "IGKJ1", "IGKJ1", "IGKJ1", "IGLJ1", "IGLJ1", "IGHJ1", "IGHJ1", "IGHJ1", "IGHJ1", "IGHJ2", "IGHJ2", "IGHJ2", "IGHJ2", "IGHJ2", "IGHJ2", "IGKJ1", "IGKJ2", "IGKJ2", "IGLJ1", "IGLJ1" ), dupcount = c( 1, 1, 5, 1, 1, 1, 5, 1, 5, 1, 1, 4, 1, 4, 1, 1, 1, 1, 1, 1, 2, 2, 4, 4, 1, 1, 4, 1, 4, 1 ), clone = c( 1, 1, 1, 1, 2, 2, 2, 3, 3, 3, NA, NA, NA, NA, NA, 4, 4, 5, 5, 6, 6, 6, 6, 6, 7, NA, NA, NA, NA, NA ), stringsAsFactors = F ) genes <- countGenes(db_hl, gene = "v_call", groups = "sample", mode = "family") expect_equal(genes$seq_freq, c( 0.7, 0.5, 0.3, 1, 0.3, 0.2, 0.667, 1, 0.5, 0.5, 0.333 ), tolerance = 0.001 ) genes <- countGenes(db_hl, gene = "v_call", groups = "sample", mode = "gene") expect_equal(genes$seq_freq, c( 0.4, 0.3, 0.3, 0.3, 0.2, 0.667, 0.2, 0.2, 1, 0.1, 0.333, 0.5, 0.5, 0.333, 0.333, 0.333 ), tolerance = 0.001 ) genes <- countGenes(db_hl, gene = "j_call", groups = "sample", mode = "gene") expect_equal(genes$seq_freq, c( 0.6, 0.6, 0.4, 0.4, 1, 1, 0.667, 1, 0.333 ), tolerance = 0.001 ) genes <- countGenes(db_hl, gene = "v_call", groups = "sample", mode = "gene", copy = "dupcount") expect_equal(genes$copy_freq, c( 0.556, 0.364, 0.318, 0.273, 1, 0.667, 0.8, 0.667, 0.167, 0.167, 0.333, 0.111, 0.045, 0.2, 0.167, 0.167 ), tolerance = 0.001 ) genes <- countGenes(db_hl, gene = "j_call", groups = "sample", mode = "gene", copy = "dupcount") expect_equal(genes$copy_freq, c( 0.636, 0.778, 0.364, 1, 1, 0.833, 1, 0.222, 0.167 ), tolerance = 0.001 ) # case with all clone IDs provided genes <- countGenes(db_hl %>% dplyr::filter(!is.na(clone)), gene = "v_call", groups = "sample", mode = "gene", clone = "clone") expect_equal(genes$clone_freq, c( 0.5, 0.333, 0.333, 0.333, 0.25, 0.25 ), tolerance = 0.001 ) # case with some clone IDs provided expect_warning( genes <- countGenes(db_hl, gene = "v_call", groups = "sample", mode = "gene", clone = "clone"), "Found 10 sequences without clonal assignments. These sequences will be removed before counting clone frequencies." ) expect_equal(genes$clone_freq, c( 0.5, 0.333, 0.333, 0.333, 0.25, 0.25 ), tolerance = 0.001 ) expect_warning( genes <- countGenes(db_hl, gene = "j_call", groups = "sample", mode = "gene", clone = "clone"), "Found 10 sequences without clonal assignments. These sequences will be removed before counting clone frequencies." ) expect_equal(genes$clone_freq, c( 0.667, 0.5, 0.5, 0.333 ), tolerance = 0.001 ) # test if all NA db_hl$clone_na <- rep(NA, nrow(db_hl)) expect_error( genes <- countGenes(db_hl, gene = "v_call", groups = "sample", mode = "gene", clone = "clone_na"), "No clone IDs are present in the data." ) }) test_that("countGenes works for single-cell", { # Without copy numbers expect_warning(genes <- countGenes(db_gg, gene = "v_call", groups = "subject_id", mode = "family"), "Mixed bulk and single-cell data") expect_equal(round(genes$seq_freq, 2), c(0.83, 0.50, 0.11, 0.11, 0.06, 0.06), tolerance = 0.001 ) expect_warning(genes <- countGenes(db_gg, gene = "v_call", groups = "subject_id", mode = "gene"), "Mixed bulk and single-cell data") expect_equal(round(genes$seq_freq, 2), c(0.83, 0.50, 0.11, 0.11, 0.06, 0.06), tolerance = 0.001 ) added_cell <- data.frame( subject_id = c("S2", "S2", "S2"), v_call = c("IGHV1-1", "IGKV1-1", "IGKV1-1"), j_call = c("IGHJ1-1", "IGKJ1-1", "IGKJ1-1"), cell_id = c(30, 30, 30) ) # Specific use case for identical light V gene within 1 cell that should be counted only once db_gg_multilight <- db_gg %>% dplyr::select(c("subject_id", "v_call", "j_call", "cell_id")) %>% dplyr::bind_rows(db_gg, added_cell) expect_warning(genes <- countGenes(db_gg_multilight, gene = "v_call", groups = "subject_id", mode = "gene"), "Mixed bulk and single-cell data") expect_equal(genes$seq_freq, c(0.8, 0.5, 0.1, 0.1, 0.05, 0.05, 1, 1), tolerance = 0.001 ) }) ### getSegment #### test_that("getSegment", { # Light chain tests kappa_call <- c( "Homsap IGKV1D-39*01 F,Homsap IGKV1-39*02 F,Homsap IGKV1-39*01", "Homsap IGKJ5*01 F" ) expect_equal( getAllele(kappa_call), c("IGKV1-39*01", "IGKJ5*01") ) expect_equal( getAllele(kappa_call, first = FALSE), c("IGKV1-39*01,IGKV1-39*02", "IGKJ5*01") ) expect_equal( getAllele(kappa_call, first = FALSE, strip_d = FALSE), c("IGKV1D-39*01,IGKV1-39*02,IGKV1-39*01", "IGKJ5*01") ) expect_equal( getGene(kappa_call), c("IGKV1-39", "IGKJ5") ) expect_equal( getGene(kappa_call, first = FALSE), c("IGKV1-39", "IGKJ5") ) expect_equal( getGene(kappa_call, first = FALSE, strip_d = FALSE), c("IGKV1D-39,IGKV1-39", "IGKJ5") ) expect_equal( getGene(kappa_call, first = FALSE, strip_d = TRUE), c("IGKV1-39", "IGKJ5") ) expect_equal( getFamily(kappa_call), c("IGKV1", "IGKJ5") ) expect_equal( getFamily(kappa_call, first = FALSE), c("IGKV1", "IGKJ5") ) expect_equal( getFamily(kappa_call, first = FALSE, collapse = FALSE), c("IGKV1,IGKV1,IGKV1", "IGKJ5") ) expect_equal( getFamily(kappa_call, first = FALSE, strip_d = FALSE), c("IGKV1D,IGKV1", "IGKJ5") ) expect_equal( getLocus(kappa_call, first = FALSE, strip_d = FALSE, collapse = TRUE), c("IGK", "IGK") ) expect_equal( getLocus(kappa_call, first = FALSE, strip_d = FALSE, collapse = FALSE), c("IGK,IGK,IGK", "IGK") ) expect_equal( getChain(kappa_call, first = FALSE, strip_d = FALSE, collapse = TRUE), c("VL", "VL") ) expect_equal( getChain(kappa_call, first = FALSE, strip_d = FALSE, collapse = FALSE), c("VL,VL,VL", "VL") ) # Heavy chain tests heavy_call <- c( "Homsap IGHV1-69*01 F,Homsap IGHV1-69D*01 F", "Homsap IGHD1-1*01 F", "Homsap IGHJ1*01 F" ) expect_equal( getAllele(heavy_call, first = FALSE), c("IGHV1-69*01", "IGHD1-1*01", "IGHJ1*01") ) expect_equal( getAllele(heavy_call, first = FALSE, strip_d = FALSE), c("IGHV1-69*01,IGHV1-69D*01", "IGHD1-1*01", "IGHJ1*01") ) expect_equal( getGene(heavy_call, first = FALSE), c("IGHV1-69", "IGHD1-1", "IGHJ1") ) expect_equal( getGene(heavy_call, first = FALSE, strip_d = FALSE), c("IGHV1-69,IGHV1-69D", "IGHD1-1", "IGHJ1") ) expect_equal( getGene(heavy_call, first = FALSE, strip_d = TRUE), c("IGHV1-69", "IGHD1-1", "IGHJ1") ) expect_equal( getLocus(heavy_call, first = FALSE, strip_d = FALSE, collapse = TRUE), c("IGH", "IGH", "IGH") ) expect_equal( getLocus(heavy_call, first = FALSE, strip_d = FALSE, collapse = FALSE), c("IGH,IGH", "IGH", "IGH") ) expect_equal( getChain(heavy_call, first = FALSE, strip_d = FALSE, collapse = TRUE), c("VH", "VH", "VH") ) expect_equal( getChain(heavy_call, first = FALSE, strip_d = FALSE, collapse = FALSE), c("VH,VH", "VH", "VH") ) # Filtering non-localized genes nl_call <- c( "IGHV3-NL1*01,IGHV3-30-3*01,IGHV3-30*01", "Homosap IGHV2-30*01 F,Homsap IGHV3-NL1*01 F", "IGHV1-NL1*01" ) expect_equal( getAllele(nl_call, first = FALSE, omit_nl = TRUE), c("IGHV3-30-3*01,IGHV3-30*01", "IGHV2-30*01", "") ) expect_equal( getGene(nl_call, first = FALSE, omit_nl = TRUE), c("IGHV3-30-3,IGHV3-30", "IGHV2-30", "") ) expect_equal( getFamily(nl_call, first = FALSE, omit_nl = TRUE), c("IGHV3", "IGHV2", "") ) expect_equal( getLocus(nl_call, first = FALSE, omit_nl = TRUE), c("IGH", "IGH", "") ) expect_equal( getChain(nl_call, first = FALSE, omit_nl = TRUE), c("VH", "VH", "") ) # Test for issue found in TIgGER # If there's no allele info, # getAllele should return the same input value # Before the change in allele_regex used by getAllele, # getAllele("TRAV38-2/DV8") returned TRAV38-2 # which in TIgGER introduced NAs and an error, as TRAV38-2 was not a valid name # for the list of alleles, the valid name was TRAV38-2/DV8. # getAllele("IGHV1-01") returned IGHV1-01 getAllele(getGene("IGHV1-01*02")) == getGene("IGHV1-01*02") getAllele(getGene("TRAV38-2/DV8*01")) == getGene("TRAV38-2/DV8*01") # Test that getSegment works well with c_call and strip_d=TRUE # and it doesn't convert IGHD into IGH expect_equal(getGene("IGHD"), "IGHD") expect_equal(getAllele("IGHD"), "IGHD") gene <- countGenes(data.frame("c_call" = "IGHD", stringsAsFactors = F), gene = "c_call", mode = "gene") asis <- countGenes(data.frame("c_call" = "IGHD", stringsAsFactors = F), gene = "c_call", mode = "asis") expect_equal(gene[["gene"]], "IGHD") expect_equal(asis[["gene"]], "IGHD") # Test for IMGT temporary designation nomenclature tmp_call <- c("IGHV9S3*01", "IGKV10S12*01") expect_equal( getAllele(tmp_call), c("IGHV9S3*01", "IGKV10S12*01") ) expect_equal( getGene(tmp_call), c("IGHV9S3", "IGKV10S12") ) expect_equal( getFamily(tmp_call), c("IGHV9", "IGKV10") ) # Test Mus musculus # ADEGMC mm_alleles <- c( "IGHA*01", "IGHD*01", "IGHD5-1-1*01", "IGHE*01", "IGHG1*01", "IGHG2A*01", "IGHM*01", "IGKC*01" ) mm_genes <- c( "IGHA", "IGHD", "IGHD5-1-1", "IGHE", "IGHG1", "IGHG2A", "IGHM", "IGKC" ) mm_families <- c( "IGHA", "IGHD", "IGHD5", "IGHE", "IGHG1", "IGHG2A", "IGHM", "IGKC" ) expect_equal(getGene(mm_alleles), mm_genes) expect_equal(getFamily(mm_alleles), mm_families) # Test for TCR expect_equal( getGene(c("TRBV29-1*01", "TRAV12-1*01")), c("TRBV29-1", "TRAV12-1") ) expect_equal( getFamily(c("TRBV29-1*01", "TRAV12-1*01")), c("TRBV29", "TRAV12") ) }) ### sortGenes #### test_that("sortGenes", { genes <- c( "IGHV1-69D*01", "IGHV1-69*01", "IGHV4-38-2*01", "IGHV1-69-2*01", "IGHV2-5*01", "IGHV1-NL1*01", "IGHV1-2*01,IGHV1-2*05", "IGHV1-2", "IGHV1-2*02", "IGHV1-69*02" ) # Sort genes by name expected <- c( "IGHV1-2", "IGHV1-2*01,IGHV1-2*05", "IGHV1-2*02", "IGHV1-69*01", "IGHV1-69D*01", "IGHV1-69*02", "IGHV1-69-2*01", "IGHV1-NL1*01", "IGHV2-5*01", "IGHV4-38-2*01" ) sorted <- sortGenes(genes) expect_equal(sorted, expected) # Sort genes by position in the locus expected_locus <- c( "IGHV1-NL1*01", "IGHV1-69-2*01", "IGHV1-69*01", "IGHV1-69D*01", "IGHV1-69*02", "IGHV4-38-2*01", "IGHV2-5*01", "IGHV1-2", "IGHV1-2*01,IGHV1-2*05", "IGHV1-2*02" ) sorted_locus <- sortGenes(genes, method = "pos") expect_equal(sorted_locus, expected_locus) }) ### groupGenes #### test_that("groupGenes heavy only", { # make a data frame db <- data.frame( SEQUENCE_ID = c(1, 2, 3, 4, 5, 6, 7, 8), V_CALL = c("IGHV1-1", "IGHV1-1,IGHV1-2", "IGHV1-2,IGHV1-3", "IGHV1-2", "IGHV1-3", "IGHV1-3", "IGHV1-1,IGHV1-2", "IGHV1-2"), J_CALL = c("IGHJ1", "IGHJ2", "IGHJ1", "IGHJ1", "IGHJ1", "IGHJ2", "IGHJ1", "IGHJ2"), stringsAsFactors = F ) db$LOCUS <- getLocus(db$V_CALL) # group VJ genes db <- groupGenes(db, v_call = "V_CALL", j_call = "J_CALL", locus = "LOCUS", first = F ) # test # expected <- c(2, 3, 2, 2, 2, 1, 2, 3) # same underlying spirit as before expected <- c("G1", "G2", "G1", "G1", "G1", "G3", "G1", "G2") expect_equal(db$vj_group, expected) }) test_that("groupGenes, when 1 row ", { # Tests fix for rowSums when 1 row db <- structure(list( sample_id = "UW0343__S", sequence_id = "assemble12_0", subject_id = "S1", v_call = "IGHV3-30*02,IGHV3-30-5*02", j_call = "IGHJ6*03", junction = "TGTGCGAAAGTCCCCGTGGGGACTGCCTCTTACATGGACGCCTGG", cell_id = NA_character_, single_cell = FALSE, locus = "IGH", junction_length = 45L ), row.names = c(NA, -1L), class = c("tbl_df", "tbl", "data.frame")) ft <- groupGenes(db, junc_len = "junction_length", cell_id = "cell_id", locus = "locus", only_heavy = T, first = TRUE ) ff <- groupGenes(db, junc_len = "junction_length", cell_id = "cell_id", locus = "locus", only_heavy = T, first = FALSE ) expect_equal(ft, ff) }) test_that("groupGenes all cell_id values are NA", { db <- data.frame( subject_id = c("S1", "S1", "S1", "S1", "S1", "S1", "S1", "S1"), v_call = c("IGHV1-1*01", "IGHV1-1*01", "IGHV1-2*01", "IGHV1-1*01,IGHV1-2*01", "IGHV1-2*01", "IGKV1-1*01", "IGKV1-1*01","IGKV1-1*01"), j_call = c("IGHJ2*01", "IGHJ1*01", "IGHJ1*01", "IGHJ1*01", "IGHJ1*01", "IGKJ1*01", "IGKJ1*01", "IGKJ1*01"), junction = c("TGTAAAAAATGG", "TGTAAAAAATGG", "TGTAAAACCTGG", "TGTAAACCCTGG", "TGTAAACCCTGG", "TGTCCCCCCTGG", "TGTCCCCCCTGG", "TGTCCCCCCTGG"), locus = c("IGH", "IGH", "IGH", "IGH", "IGH", "IGK", "IGK", "IGK"), cell_id = c(NA, NA, NA, NA, NA, NA, NA, NA), junction_length = 12 ) expect_warning( groupGenes(db, v_call = "v_call", j_call = "j_call", junc_len = NULL, locus = "locus", only_heavy = TRUE, first = FALSE), "A cell_id column was found in the data, but was not specified. All values are NA." ) }) test_that("groupGenes, single-cell mode, heavy only", { # (in theory, should be 1 heavy per cell; but code-wise there is no restriction for groupGenes) load(file.path("..", "data-tests", "db_sc.rda")) # data_t1 # manual deduction # 1 # IGHV3-11 IGHJ4 93 # 2 # IGHV3-7 IGHJ1 24 # IGHV3-6 IGHJ1 24 (first=F) # IGHV1-4 IGHJ3 27 # IGHV1-4 IGHJ4 27 (first=F) # # 3 IGHV3-7 IGHJ5 90 # 4 IGHV3-7 IGHJ5 90 # 5 IGHV3-7 IGHJ5 90 # # 6 IGHV4-59 IGHJ1 84 # 7 IGHV4-59 IGHJ1 84 # 8 IGHV4-59 IGHJ1 84 # 9 IGHV4-59 IGHJ1 84 # # 10 IGHV4-59 IGHJ1 84 # 11 IGHV4-59 IGHJ1 84 # Cell with CELL_ID==B has 2 heavy and 2 light chains # expect error expect_error( singleCellValidation(data_t1, cell_id = "CELL_ID", locus = "LOCUS"), regexp = "Only one heavy chain is allowed per cell" ) expect_error( gg1 <- groupGenes(data_t1, v_call = "V_CALL", j_call = "J_CALL", junc_len = "LEN", cell_id = "CELL_ID", locus = "LOCUS", only_heavy = TRUE, first = FALSE ), regexp = "Only one heavy chain is allowed per cell" ) # 1 by itself, 2 by itself, 3-5 together, 6-11 together gg1_expect <- c( rep("G1", 2), # rep("G1", 4), # Cell B removed rep("G2", 2 + 2 + 2), rep("G3", 2 + 3 + 2 + 2 + 2 + 2) ) gg1 <- groupGenes(data_t1 %>% dplyr::filter(CELL_ID != "B"), v_call = "V_CALL", j_call = "J_CALL", junc_len = "LEN", cell_id = "CELL_ID", locus = "LOCUS", only_heavy = TRUE, first = FALSE ) expect_equal(gg1[["vj_group"]], gg1_expect) }) test_that("groupGenes, mixed bulk and single cell", { # TODO: add other sc cell with different L to test only_heavy T/F and fix tests db <- data.frame( subject_id = c("S1", "S1", "S1", "S1", "S1", "S1", "S1", "S1"), v_call = c("IGHV1-1*01", "IGHV1-1*01", "IGHV1-2*01", "IGHV1-1*01,IGHV1-2*01", "IGHV1-2*01", "IGKV1-1*01", "IGKV1-1*01","IGKV1-1*01"), j_call = c("IGHJ2*01", "IGHJ1*01", "IGHJ1*01", "IGHJ1*01", "IGHJ1*01", "IGKJ1*01", "IGKJ1*01", "IGKJ1*01"), junction = c("TGTAAAAAATGG", "TGTAAAAAATGG", "TGTAAAACCTGG", "TGTAAACCCTGG", "TGTAAACCCTGG", "TGTCCCCCCTGG", "TGTCCCCCCTGG", "TGTCCCCCCTGG"), locus = c("IGH", "IGH", "IGH", "IGH", "IGH", "IGK", "IGK", "IGK"), cell_id = c(1, 2, 3, NA, NA, 1, NA, 4), junction_length = 12 ) # subset to single-cell sequences # expect warning, one cell has only light chain and will be assigned group NA expect_warning( d <- groupGenes(db %>% dplyr::filter(!is.na(cell_id)), v_call = "v_call", j_call = "j_call", junc_len = NULL, cell_id = "cell_id", locus = "locus", only_heavy = TRUE, first = FALSE), pattern="NA(s) found in one or more of " ) expect_equal(d[["vj_group"]], c("G2", "G1", "G3", "G2", NA)) expect_warning( d_oh <- groupGenes(db %>% dplyr::filter(!is.na(cell_id)), v_call = "v_call", j_call = "j_call", junc_len = NULL, cell_id = "cell_id", locus = "locus", only_heavy = FALSE, first = FALSE ), "only_heavy = FALSE is deprecated" ) expect_warning( d_sl <- groupGenes(db %>% dplyr::filter(!is.na(cell_id)), v_call = "v_call", j_call = "j_call", junc_len = NULL, cell_id = "cell_id", locus = "locus", only_heavy = TRUE, first = FALSE, split_light = TRUE ), "split_light = TRUE is deprecated" ) expect_equal(d[["vj_group"]], d_oh[["vj_group"]], d_sl[["vj_group"]]) # bulk data d_bulk <- groupGenes(db %>% dplyr::filter(is.na(cell_id)), v_call = "v_call", j_call = "j_call", junc_len = NULL, cell_id = "cell_id", locus = "locus", only_heavy = TRUE, first = FALSE ) # mixed data expect_warning( d_mixed <- groupGenes(db, v_call = "v_call", j_call = "j_call", junc_len = NULL, cell_id = "cell_id", locus = "locus", only_heavy = TRUE, first = FALSE ), pattern="NA(s) found in one or more of " ) expect_equal(d_mixed[["vj_group"]], c("G2", "G1", "G1", "G1", "G1", "G2", NA, NA)) # TODO SSNN 7/16/2024 # Run tests with the same data used in scoper # Bulk mode cell_id=NULL ## first=T gg <- groupGenes(db_gg %>% select(-cell_id), cell_id = NULL, first = T) expect_equal( gg[["vj_group"]], gg[["expected_group_cell_id-null_first-T"]] ) ## first=F gg <- groupGenes(db_gg %>% select(-cell_id), cell_id = NULL, first = F) expect_equal( gg[["vj_group"]], gg[["expected_group_cell_id-null_first-F"]] ) }) #### AIRR-format migration tests #### test_that("countGenes, AIRR-format migration", { db_c <- readChangeoDb(file.path("..", "data-tests", "ExampleDb.gz")) expect_warning( db_a <- readChangeoDb( file.path("..", "data-tests", "ExampleDb_airr.gz") ), regexp = "airr::read_rearrangement" ) # 1 genes_c <- countGenes(db_c, gene = "V_CALL", groups = "SAMPLE", mode = "allele") genes_a <- countGenes(db_a, gene = "v_call", groups = "sample", mode = "allele") expect_true(all(genes_c == genes_a)) # 2 genes_c <- countGenes(db_c, gene = "V_CALL", groups = c("SAMPLE", "ISOTYPE"), copy = "DUPCOUNT", mode = "family" ) genes_a <- countGenes(db_a, gene = "v_call", groups = c("sample", "isotype"), copy = "duplicate_count", mode = "family" ) expect_true(all(genes_c == genes_a)) # 3 genes_c <- countGenes(db_c, gene = "V_CALL", groups = "SAMPLE", mode = "allele", fill = TRUE ) genes_a <- countGenes(db_a, gene = "v_call", groups = "sample", mode = "allele", fill = TRUE ) # genes_c uses "SAMPLE", genes_a uses "sample" # make colnames same case for comparison colnames(genes_c) <- tolower(colnames(genes_c)) expect_true(all.equal(genes_c, genes_a[rownames(genes_c), names(genes_c)])) }) test_that("groupGenes, AIRR-format migration", { # 1 db_c <- readChangeoDb(file.path("..", "data-tests", "ExampleDb.gz")) expect_warning( db_a <- readChangeoDb(file.path("..", "data-tests", "ExampleDb_airr.gz")), regexp = "airr::read_rearrangement" ) db_c$LOCUS <- getLocus(db_c$V_CALL) newDb_c <- groupGenes(db_c, v_call = "V_CALL", j_call = "J_CALL", locus = "LOCUS") newDb_a <- groupGenes(db_a, v_call = "v_call", j_call = "j_call") # newDb_c and newDb_a not directly comparable b/c diff ncol expect_true(all(newDb_c[["vj_group"]] == newDb_a[["vj_group"]])) rm(db_c, db_a, newDb_c, newDb_a) # 2 # Cell with CELL_ID==B has 2 heavy and 2 light chains load(file.path("..", "data-tests", "db_sc.rda")) # data_t1, data_t2 load(file.path("..", "data-tests", "db_sc_airr.rda")) # data_t1_airr, data_t2_airr newDb_c <- groupGenes(data_t1 %>% dplyr::filter(CELL_ID != "B"), v_call = "V_CALL", j_call = "J_CALL", junc_len = "LEN", cell_id = "CELL_ID", locus = "LOCUS", only_heavy = TRUE, first = FALSE ) newDb_a <- groupGenes(data_t1_airr %>% dplyr::filter(cell_id != "B"), v_call = "v_call", j_call = "j_call", junc_len = "len", cell_id = "cell_id", locus = "locus", only_heavy = TRUE, first = FALSE ) expect_true(all(newDb_c[["vj_group"]] == newDb_a[["vj_group"]])) })