# Disabled tests for downsample_immundata() ----
#
# Every test below is commented out, so none of them is executed.
# NOTE(review): the reason for disabling them is not recorded here -- confirm
# that the helpers they call are available before re-enabling.
#
# test_that("downsample_immundata downsamples single-cell repertoires and is deterministic with seed", {
#   output_dir <- create_test_output_dir()
#   on.exit(cleanup_output_dir(output_dir))
#
#   test_data <- data.frame(
#     cell_id = c("c1", "c2", "c3", "c4", "c5", "c6", "c7", "c8"),
#     sample_id = c("S1", "S1", "S1", "S1", "S2", "S2", "S2", "S2"),
#     v_call = c("IGHV1", "IGHV2", "IGHV3", "IGHV4", "IGHV1", "IGHV2", "IGHV3", "IGHV4"),
#     j_call = c("IGHJ1", "IGHJ2", "IGHJ3", "IGHJ4", "IGHJ1", "IGHJ2", "IGHJ3", "IGHJ4"),
#     junction_aa = c("CARA", "CARB", "CARC", "CARD", "CARE", "CARF", "CARG", "CARH"),
#     locus = "IGH",
#     umi_count = c(10, 11, 12, 13, 14, 15, 16, 17)
#   )
#
#   temp_file <- tempfile(fileext = ".tsv")
#   readr::write_tsv(test_data, temp_file)
#   on.exit(unlink(temp_file), add = TRUE)
#
#   idata <- read_repertoires(
#     path = temp_file,
#     schema = make_receptor_schema(
#       features = c("v_call", "j_call", "junction_aa"),
#       chains = "IGH"
#     ),
#     barcode_col = "cell_id",
#     locus_col = "locus",
#     umi_col = "umi_count",
#     output_folder = output_dir,
#     preprocess = NULL,
#     postprocess = NULL,
#     rename_columns = NULL
#   )
#   idata <- agg_repertoires_with_integrity(
#     idata,
#     schema = "sample_id",
#     context = "downsample single-cell deterministic"
#   )
#
#   ds1 <- downsample_immundata(idata, n = 2, seed = 100)
#   ds2 <- downsample_immundata(idata, n = 2, seed = 100)
#
#   reps <- ds1$repertoires |> collect()
#   expect_true(all(reps$n_barcodes == 2))
#
#   sampled1 <- ds1$annotations |>
#     select(sample_id, imd_barcode) |>
#     distinct() |>
#     arrange(sample_id, imd_barcode) |>
#     collect()
#
#   sampled2 <- ds2$annotations |>
#     select(sample_id, imd_barcode) |>
#     distinct() |>
#     arrange(sample_id, imd_barcode) |>
#     collect()
#
#   expect_equal(sampled1, sampled2)
# })
#
# test_that("downsample_immundata downsampled bulk repertoires by count", {
#   output_dir <- create_test_output_dir()
#   on.exit(cleanup_output_dir(output_dir))
#
#   test_data <- data.frame(
#     sample_id = c("S1", "S1", "S2", "S2"),
#     v_call = c("TRBV1", "TRBV2", "TRBV3", "TRBV4"),
#     j_call = c("TRBJ1", "TRBJ2", "TRBJ1", "TRBJ2"),
#     junction_aa = c("AAAA", "BBBB", "CCCC", "DDDD"),
#     clone_count = c(8, 7, 6, 9)
#   )
#
#   temp_file <- tempfile(fileext = ".tsv")
#   readr::write_tsv(test_data, temp_file)
#   on.exit(unlink(temp_file), add = TRUE)
#
#   idata <- read_repertoires(
#     path = temp_file,
#     schema = c("v_call", "j_call", "junction_aa"),
#     count_col = "clone_count",
#     output_folder = output_dir,
#     preprocess = NULL,
#     postprocess = NULL,
#     rename_columns = NULL
#   )
#   idata <- agg_repertoires_with_integrity(
#     idata,
#     schema = "sample_id",
#     context = "downsample bulk count mode"
#   )
#
#   ds <- downsample_immundata(idata, n = 5, seed = 42)
#   reps <- ds$repertoires |> collect()
#
#   expect_equal(sort(reps$n_barcodes), c(5, 5))
# })
#
# test_that("downsample_immundata keeps paired chains intact", {
#   output_dir <- create_test_output_dir()
#   on.exit(cleanup_output_dir(output_dir))
#
#   test_data <- data.frame(
#     cell_id = c(
#       "a1", "a1", "a2", "a2", "a3", "a3",
#       "b1", "b1", "b2", "b2", "b3", "b3"
#     ),
#     sample_id = c(
#       "S1", "S1", "S1", "S1", "S1", "S1",
#       "S2", "S2", "S2", "S2", "S2", "S2"
#     ),
#     v_call = c("IGHV1", "IGLV1", "IGHV2", "IGLV2", "IGHV3", "IGLV3", "IGHV1", "IGLV1", "IGHV2", "IGLV2", "IGHV3", "IGLV3"),
#     j_call = c("IGHJ1", "IGLJ1", "IGHJ2", "IGLJ2", "IGHJ3", "IGLJ3", "IGHJ1", "IGLJ1", "IGHJ2", "IGLJ2", "IGHJ3", "IGLJ3"),
#     junction_aa = c("CA1", "CL1", "CA2", "CL2", "CA3", "CL3", "CB1", "CK1", "CB2", "CK2", "CB3", "CK3"),
#     locus = c("IGH", "IGL", "IGH", "IGL", "IGH", "IGL", "IGH", "IGL", "IGH", "IGL", "IGH", "IGL"),
#     umi_count = rep(100, 12)
#   )
#
#   temp_file <- tempfile(fileext = ".tsv")
#   readr::write_tsv(test_data, temp_file)
#   on.exit(unlink(temp_file), add = TRUE)
#
#   idata <- read_repertoires(
#     path = temp_file,
#     schema = make_receptor_schema(
#       features = c("v_call", "j_call", "junction_aa"),
#       chains = c("IGH", "IGL")
#     ),
#     barcode_col = "cell_id",
#     locus_col = "locus",
#     umi_col = "umi_count",
#     output_folder = output_dir,
#     preprocess = NULL,
#     postprocess = NULL,
#     rename_columns = NULL
#   )
#   idata <- agg_repertoires_with_integrity(
#     idata,
#     schema = "sample_id",
#     context = "downsample paired-chain integrity"
#   )
#
#   ds <- downsample_immundata(idata, n = 2, seed = 7)
#   reps <- ds$repertoires |> collect()
#   expect_true(all(reps$n_barcodes == 2))
#
#   chain_stats <- ds$annotations |>
#     collect() |>
#     summarise(
#       .by = c(sample_id, imd_barcode),
#       n_loci = n_distinct(locus),
#       n_rows = n()
#     )
#
#   expect_true(all(chain_stats$n_loci == 2))
#   expect_true(all(chain_stats$n_rows == 2))
# })
#
# test_that("downsample_immundata works on IG test data with proportion n", {
#   output_dir <- create_test_output_dir()
#   on.exit(cleanup_output_dir(output_dir))
#
#   idata <- read_repertoires(
#     path = test_ig_data(),
#     schema = make_receptor_schema(
#       features = c("v_call", "j_call", "junction_aa"),
#       chains = c("IGH", "IGK|IGL")
#     ),
#     barcode_col = "cell_id",
#     locus_col = "locus",
#     umi_col = "duplicate_count",
#     output_folder = output_dir,
#     preprocess = NULL,
#     postprocess = NULL,
#     rename_columns = NULL
#   ) |>
#     mutate_immundata(cohort = "all")
#   idata <- agg_repertoires_with_integrity(
#     idata,
#     schema = "cohort",
#     context = "downsample IG proportion"
#   )
#
#   n_before <- idata$repertoires |>
#     collect() |>
#     pull(n_barcodes)
#
#   ds <- downsample_immundata(idata, n = 0.5, seed = 123)
#   n_after <- ds$repertoires |>
#     collect() |>
#     pull(n_barcodes)
#
#   expect_equal(n_after, floor(n_before * 0.5))
#
#   chain_stats <- ds$annotations |>
#     collect() |>
#     summarise(
#       .by = imd_barcode,
#       n_loci = n_distinct(locus)
#     )
#   expect_true(all(chain_stats$n_loci == 2))
# })
#
# test_that("downsample_immundata with n = 1 keeps one receptor per repertoire", {
#   output_dir <- create_test_output_dir()
#   on.exit(cleanup_output_dir(output_dir))
#
#   test_data <- data.frame(
#     cell_id = c("s1c1", "s1c2", "s1c3", "s1c4", "s2c1", "s2c2", "s2c3", "s2c4"),
#     sample_id = c("S1", "S1", "S1", "S1", "S2", "S2", "S2", "S2"),
#     v_call = c("IGHV1", "IGHV1", "IGHV2", "IGHV3", "IGHV4", "IGHV5", "IGHV5", "IGHV6"),
#     j_call = c("IGHJ1", "IGHJ1", "IGHJ2", "IGHJ3", "IGHJ4", "IGHJ5", "IGHJ5", "IGHJ6"),
#     junction_aa = c("A", "A", "B", "C", "D", "E", "E", "F"),
#     locus = "IGH",
#     umi_count = c(12, 11, 10, 9, 8, 7, 6, 5)
#   )
#
#   temp_file <- tempfile(fileext = ".tsv")
#   readr::write_tsv(test_data, temp_file)
#   on.exit(unlink(temp_file), add = TRUE)
#
#   idata <- read_repertoires(
#     path = temp_file,
#     schema = make_receptor_schema(
#       features = c("v_call", "j_call", "junction_aa"),
#       chains = "IGH"
#     ),
#     barcode_col = "cell_id",
#     locus_col = "locus",
#     umi_col = "umi_count",
#     output_folder = output_dir,
#     preprocess = NULL,
#     postprocess = NULL,
#     rename_columns = NULL
#   )
#   idata <- agg_repertoires_with_integrity(
#     idata,
#     schema = "sample_id",
#     context = "downsample n=1 receptor mode"
#   )
#
#   ds1 <- downsample_immundata(idata, n = 1, seed = 321)
#   ds2 <- downsample_immundata(idata, n = 1, seed = 321)
#
#   receptor_stats <- ds1$annotations |>
#     collect() |>
#     summarise(
#       .by = sample_id,
#       n_receptors = n_distinct(imd_receptor_id)
#     ) |>
#     arrange(sample_id)
#
#   expect_true(all(receptor_stats$n_receptors == 1))
#
#   reps <- ds1$repertoires |>
#     collect() |>
#     arrange(sample_id)
#   expect_true(all(reps$n_receptors == 1))
#
#   sampled1 <- ds1$annotations |>
#     select(sample_id, imd_receptor_id) |>
#     distinct() |>
#     arrange(sample_id, imd_receptor_id) |>
#     collect()
#
#   sampled2 <- ds2$annotations |>
#     select(sample_id, imd_receptor_id) |>
#     distinct() |>
#     arrange(sample_id, imd_receptor_id) |>
#     collect()
#
#   expect_equal(sampled1, sampled2)
# })
#
# test_that("downsample_immundata validates n and handles no-repertoire fallback", {
#   idata <- get_test_idata_tsv_no_metadata()
#
#   expect_error(
#     downsample_immundata(idata, n = 2.5),
#     "integer count"
#   )
#
#   n_before <- idata$annotations |>
#     distinct(imd_barcode) |>
#     collect() |>
#     nrow()
#
#   ds <- downsample_immundata(idata, n = 0.1, seed = 1)
#
#   n_after <- ds$annotations |>
#     distinct(imd_barcode) |>
#     collect() |>
#     nrow()
#
#   expect_lt(n_after, n_before)
#   expect_null(ds$repertoires)
# })
#
# test_that("downsample_immundata warns and keeps repertoire unchanged when n exceeds available units", {
#   output_dir <- create_test_output_dir()
#   on.exit(cleanup_output_dir(output_dir))
#
#   test_data <- data.frame(
#     cell_id = c("c1", "c2", "c3", "c4", "c5", "c6"),
#     sample_id = c("S1", "S1", "S1", "S2", "S2", "S2"),
#     v_call = c("IGHV1", "IGHV2", "IGHV3", "IGHV1", "IGHV2", "IGHV3"),
#     j_call = c("IGHJ1", "IGHJ2", "IGHJ3", "IGHJ1", "IGHJ2", "IGHJ3"),
#     junction_aa = c("A1", "A2", "A3", "B1", "B2", "B3"),
#     locus = "IGH",
#     umi_count = c(1, 1, 1, 1, 1, 1)
#   )
#
#   temp_file <- tempfile(fileext = ".tsv")
#   readr::write_tsv(test_data, temp_file)
#   on.exit(unlink(temp_file), add = TRUE)
#
#   idata <- read_repertoires(
#     path = temp_file,
#     schema = make_receptor_schema(features = c("v_call", "j_call", "junction_aa"), chains = "IGH"),
#     barcode_col = "cell_id",
#     locus_col = "locus",
#     umi_col = "umi_count",
#     output_folder = output_dir,
#     preprocess = NULL,
#     postprocess = NULL,
#     rename_columns = NULL
#   )
#   idata <- agg_repertoires_with_integrity(
#     idata,
#     schema = "sample_id",
#     context = "downsample n exceeds units"
#   )
#
#   reps_before <- idata$repertoires |>
#     collect() |>
#     arrange(sample_id)
#
#   ds <- NULL
#   expect_warning(
#     ds <- downsample_immundata(idata, n = 10, seed = 42),
#     "returned unchanged"
#   )
#
#   reps_after <- ds$repertoires |>
#     collect() |>
#     arrange(sample_id)
#
#   expect_equal(reps_after$n_barcodes, reps_before$n_barcodes)
# })
#
# test_that("downsample_immundata supports count-mode proportion downsampling", {
#   output_dir <- create_test_output_dir()
#   on.exit(cleanup_output_dir(output_dir))
#
#   test_data <- data.frame(
#     sample_id = c("S1", "S1", "S2", "S2"),
#     v_call = c("TRBV1", "TRBV2", "TRBV3", "TRBV4"),
#     j_call = c("TRBJ1", "TRBJ2", "TRBJ1", "TRBJ2"),
#     junction_aa = c("AAAA", "BBBB", "CCCC", "DDDD"),
#     clone_count = c(8, 7, 6, 9)
#   )
#
#   temp_file <- tempfile(fileext = ".tsv")
#   readr::write_tsv(test_data, temp_file)
#   on.exit(unlink(temp_file), add = TRUE)
#
#   idata <- read_repertoires(
#     path = temp_file,
#     schema = c("v_call", "j_call", "junction_aa"),
#     count_col = "clone_count",
#     output_folder = output_dir,
#     preprocess = NULL,
#     postprocess = NULL,
#     rename_columns = NULL
#   )
#   idata <- agg_repertoires_with_integrity(
#     idata,
#     schema = "sample_id",
#     context = "downsample count proportion"
#   )
#
#   ds <- downsample_immundata(idata, n = 0.5, seed = 101)
#   reps <- ds$repertoires |> collect()
#
#   expect_equal(sort(reps$n_barcodes), c(7, 7))
# })
#
# test_that("downsample_immundata is deterministic in count mode with seed", {
#   output_dir <- create_test_output_dir()
#   on.exit(cleanup_output_dir(output_dir))
#
#   test_data <- data.frame(
#     sample_id = c("S1", "S1", "S2", "S2"),
#     v_call = c("TRBV1", "TRBV2", "TRBV3", "TRBV4"),
#     j_call = c("TRBJ1", "TRBJ2", "TRBJ1", "TRBJ2"),
#     junction_aa = c("AAAA", "BBBB", "CCCC", "DDDD"),
#     clone_count = c(8, 7, 6, 9)
#   )
#
#   temp_file <- tempfile(fileext = ".tsv")
#   readr::write_tsv(test_data, temp_file)
#   on.exit(unlink(temp_file), add = TRUE)
#
#   idata <- read_repertoires(
#     path = temp_file,
#     schema = c("v_call", "j_call", "junction_aa"),
#     count_col = "clone_count",
#     output_folder = output_dir,
#     preprocess = NULL,
#     postprocess = NULL,
#     rename_columns = NULL
#   )
#   idata <- agg_repertoires_with_integrity(
#     idata,
#     schema = "sample_id",
#     context = "downsample count deterministic"
#   )
#
#   ds1 <- downsample_immundata(idata, n = 5, seed = 222)
#   ds2 <- downsample_immundata(idata, n = 5, seed = 222)
#
#   ann1 <- ds1$annotations |>
#     select(sample_id, imd_barcode, imd_n_chains) |>
#     arrange(sample_id, imd_barcode) |>
#     collect()
#   ann2 <- ds2$annotations |>
#     select(sample_id, imd_barcode, imd_n_chains) |>
#     arrange(sample_id, imd_barcode) |>
#     collect()
#
#   expect_equal(ann1, ann2)
# })
#
# test_that("downsample_immundata errors when proportion results in zero target", {
#   idata <- get_test_idata_tsv_no_metadata()
#
#   expect_error(
#     downsample_immundata(idata, n = 0.0001, seed = 1),
#     "Increase `n`"
#   )
# })
#
# test_that("downsample_immundata produces consistent imd_count and imd_proportion invariants", {
#   output_dir <- create_test_output_dir()
#   on.exit(cleanup_output_dir(output_dir))
#
#   test_data <- data.frame(
#     cell_id = c("c1", "c2", "c3", "c4", "c5", "c6"),
#     sample_id = c("S1", "S1", "S1", "S2", "S2", "S2"),
#     v_call = c("IGHV1", "IGHV1", "IGHV2", "IGHV1", "IGHV2", "IGHV2"),
#     j_call = c("IGHJ1", "IGHJ1", "IGHJ2", "IGHJ1", "IGHJ2", "IGHJ2"),
#     junction_aa = c("A1", "A1", "A2", "B1", "B2", "B2"),
#     locus = "IGH",
#     umi_count = c(10, 11, 12, 13, 14, 15)
#   )
#
#   temp_file <- tempfile(fileext = ".tsv")
#   readr::write_tsv(test_data, temp_file)
#   on.exit(unlink(temp_file), add = TRUE)
#
#   idata <- read_repertoires(
#     path = temp_file,
#     schema = make_receptor_schema(features = c("v_call", "j_call", "junction_aa"), chains = "IGH"),
#     barcode_col = "cell_id",
#     locus_col = "locus",
#     umi_col = "umi_count",
#     output_folder = output_dir,
#     preprocess = NULL,
#     postprocess = NULL,
#     rename_columns = NULL
#   )
#   idata <- agg_repertoires_with_integrity(
#     idata,
#     schema = "sample_id",
#     context = "downsample count/proportion invariants"
#   )
#
#   ds <- downsample_immundata(idata, n = 2, seed = 33)
#
#   reps <- ds$repertoires |> collect()
#   ann <- ds$annotations |> collect()
#
#   receptor_stats <- ann |>
#     select(imd_repertoire_id, imd_receptor_id, imd_count, imd_proportion) |>
#     distinct() |>
#     summarise(
#       .by = imd_repertoire_id,
#       sum_count = sum(imd_count),
#       sum_prop = sum(imd_proportion)
#     ) |>
#     arrange(imd_repertoire_id)
#
#   reps <- reps |> arrange(imd_repertoire_id)
#
#   expect_equal(receptor_stats$sum_count, reps$n_barcodes)
#   expect_true(all(abs(receptor_stats$sum_prop - 1) < 1e-8))
# })