test_that("downsample_immundata downsamples single-cell repertoires and is deterministic with seed", {
  output_dir <- create_test_output_dir()
  on.exit(cleanup_output_dir(output_dir))

  test_data <- data.frame(
    cell_id = c("c1", "c2", "c3", "c4", "c5", "c6", "c7", "c8"),
    sample_id = c("S1", "S1", "S1", "S1", "S2", "S2", "S2", "S2"),
    v_call = c("IGHV1", "IGHV2", "IGHV3", "IGHV4", "IGHV1", "IGHV2", "IGHV3", "IGHV4"),
    j_call = c("IGHJ1", "IGHJ2", "IGHJ3", "IGHJ4", "IGHJ1", "IGHJ2", "IGHJ3", "IGHJ4"),
    junction_aa = c("CARA", "CARB", "CARC", "CARD", "CARE", "CARF", "CARG", "CARH"),
    locus = "IGH",
    umi_count = c(10, 11, 12, 13, 14, 15, 16, 17)
  )

  temp_file <- tempfile(fileext = ".tsv")
  readr::write_tsv(test_data, temp_file)
  on.exit(unlink(temp_file), add = TRUE)

  idata <- read_repertoires(
    path = temp_file,
    schema = make_receptor_schema(
      features = c("v_call", "j_call", "junction_aa"),
      chains = "IGH"
    ),
    barcode_col = "cell_id",
    locus_col = "locus",
    umi_col = "umi_count",
    output_folder = output_dir,
    preprocess = NULL,
    postprocess = NULL,
    rename_columns = NULL
  )
  idata <- agg_repertoires_with_integrity(
    idata,
    schema = "sample_id",
    context = "downsample single-cell deterministic"
  )

  ds1 <- downsample_immundata(idata, n = 2, seed = 100)
  ds2 <- downsample_immundata(idata, n = 2, seed = 100)

  reps <- ds1$repertoires |> collect()
  expect_true(all(reps$n_barcodes == 2))

  sampled1 <- ds1$annotations |>
    select(sample_id, imd_barcode) |>
    distinct() |>
    arrange(sample_id, imd_barcode) |>
    collect()

  sampled2 <- ds2$annotations |>
    select(sample_id, imd_barcode) |>
    distinct() |>
    arrange(sample_id, imd_barcode) |>
    collect()

  expect_equal(sampled1, sampled2)
})

test_that("downsample_immundata downsampled bulk repertoires by count", {
  output_dir <- create_test_output_dir()
  on.exit(cleanup_output_dir(output_dir))

  test_data <- data.frame(
    sample_id = c("S1", "S1", "S2", "S2"),
    v_call = c("TRBV1", "TRBV2", "TRBV3", "TRBV4"),
    j_call = c("TRBJ1", "TRBJ2", "TRBJ1", "TRBJ2"),
    junction_aa = c("AAAA", "BBBB", "CCCC", "DDDD"),
    clone_count = c(8, 7, 6, 9)
  )

  temp_file <- tempfile(fileext = ".tsv")
  readr::write_tsv(test_data, temp_file)
  on.exit(unlink(temp_file), add = TRUE)

  idata <- read_repertoires(
    path = temp_file,
    schema = c("v_call", "j_call", "junction_aa"),
    count_col = "clone_count",
    output_folder = output_dir,
    preprocess = NULL,
    postprocess = NULL,
    rename_columns = NULL
  )
  idata <- agg_repertoires_with_integrity(
    idata,
    schema = "sample_id",
    context = "downsample bulk count mode"
  )

  ds <- downsample_immundata(idata, n = 5, seed = 42)
  reps <- ds$repertoires |> collect()

  expect_equal(sort(reps$n_barcodes), c(5, 5))
})

test_that("downsample_immundata keeps paired chains intact", {
  output_dir <- create_test_output_dir()
  on.exit(cleanup_output_dir(output_dir))

  test_data <- data.frame(
    cell_id = c(
      "a1", "a1", "a2", "a2", "a3", "a3",
      "b1", "b1", "b2", "b2", "b3", "b3"
    ),
    sample_id = c(
      "S1", "S1", "S1", "S1", "S1", "S1",
      "S2", "S2", "S2", "S2", "S2", "S2"
    ),
    v_call = c("IGHV1", "IGLV1", "IGHV2", "IGLV2", "IGHV3", "IGLV3", "IGHV1", "IGLV1", "IGHV2", "IGLV2", "IGHV3", "IGLV3"),
    j_call = c("IGHJ1", "IGLJ1", "IGHJ2", "IGLJ2", "IGHJ3", "IGLJ3", "IGHJ1", "IGLJ1", "IGHJ2", "IGLJ2", "IGHJ3", "IGLJ3"),
    junction_aa = c("CA1", "CL1", "CA2", "CL2", "CA3", "CL3", "CB1", "CK1", "CB2", "CK2", "CB3", "CK3"),
    locus = c("IGH", "IGL", "IGH", "IGL", "IGH", "IGL", "IGH", "IGL", "IGH", "IGL", "IGH", "IGL"),
    umi_count = rep(100, 12)
  )

  temp_file <- tempfile(fileext = ".tsv")
  readr::write_tsv(test_data, temp_file)
  on.exit(unlink(temp_file), add = TRUE)

  idata <- read_repertoires(
    path = temp_file,
    schema = make_receptor_schema(
      features = c("v_call", "j_call", "junction_aa"),
      chains = c("IGH", "IGL")
    ),
    barcode_col = "cell_id",
    locus_col = "locus",
    umi_col = "umi_count",
    output_folder = output_dir,
    preprocess = NULL,
    postprocess = NULL,
    rename_columns = NULL
  )
  idata <- agg_repertoires_with_integrity(
    idata,
    schema = "sample_id",
    context = "downsample paired-chain integrity"
  )

  ds <- downsample_immundata(idata, n = 2, seed = 7)
  reps <- ds$repertoires |> collect()
  expect_true(all(reps$n_barcodes == 2))

  chain_stats <- ds$annotations |>
    collect() |>
    summarise(
      .by = c(sample_id, imd_barcode),
      n_loci = n_distinct(locus),
      n_rows = n()
    )

  expect_true(all(chain_stats$n_loci == 2))
  expect_true(all(chain_stats$n_rows == 2))
})

test_that("downsample_immundata works on IG test data with proportion n", {
  output_dir <- create_test_output_dir()
  on.exit(cleanup_output_dir(output_dir))

  idata <- read_repertoires(
    path = test_ig_data(),
    schema = make_receptor_schema(
      features = c("v_call", "j_call", "junction_aa"),
      chains = c("IGH", "IGK|IGL")
    ),
    barcode_col = "cell_id",
    locus_col = "locus",
    umi_col = "duplicate_count",
    output_folder = output_dir,
    preprocess = NULL,
    postprocess = NULL,
    rename_columns = NULL
  ) |>
    mutate_immundata(cohort = "all")
  idata <- agg_repertoires_with_integrity(
    idata,
    schema = "cohort",
    context = "downsample IG proportion"
  )

  n_before <- idata$repertoires |> collect() |> pull(n_barcodes)

  ds <- downsample_immundata(idata, n = 0.5, seed = 123)
  n_after <- ds$repertoires |> collect() |> pull(n_barcodes)

  expect_equal(n_after, floor(n_before * 0.5))

  chain_stats <- ds$annotations |>
    collect() |>
    summarise(
      .by = imd_barcode,
      n_loci = n_distinct(locus)
    )
  expect_true(all(chain_stats$n_loci == 2))
})

test_that("downsample_immundata with n = 1 keeps one receptor per repertoire", {
  output_dir <- create_test_output_dir()
  on.exit(cleanup_output_dir(output_dir))

  test_data <- data.frame(
    cell_id = c("s1c1", "s1c2", "s1c3", "s1c4", "s2c1", "s2c2", "s2c3", "s2c4"),
    sample_id = c("S1", "S1", "S1", "S1", "S2", "S2", "S2", "S2"),
    v_call = c("IGHV1", "IGHV1", "IGHV2", "IGHV3", "IGHV4", "IGHV5", "IGHV5", "IGHV6"),
    j_call = c("IGHJ1", "IGHJ1", "IGHJ2", "IGHJ3", "IGHJ4", "IGHJ5", "IGHJ5", "IGHJ6"),
    junction_aa = c("A", "A", "B", "C", "D", "E", "E", "F"),
    locus = "IGH",
    umi_count = c(12, 11, 10, 9, 8, 7, 6, 5)
  )

  temp_file <- tempfile(fileext = ".tsv")
  readr::write_tsv(test_data, temp_file)
  on.exit(unlink(temp_file), add = TRUE)

  idata <- read_repertoires(
    path = temp_file,
    schema = make_receptor_schema(
      features = c("v_call", "j_call", "junction_aa"),
      chains = "IGH"
    ),
    barcode_col = "cell_id",
    locus_col = "locus",
    umi_col = "umi_count",
    output_folder = output_dir,
    preprocess = NULL,
    postprocess = NULL,
    rename_columns = NULL
  )
  idata <- agg_repertoires_with_integrity(
    idata,
    schema = "sample_id",
    context = "downsample n=1 receptor mode"
  )

  ds1 <- downsample_immundata(idata, n = 1, seed = 321)
  ds2 <- downsample_immundata(idata, n = 1, seed = 321)

  receptor_stats <- ds1$annotations |>
    collect() |>
    summarise(
      .by = sample_id,
      n_receptors = n_distinct(imd_receptor_id)
    ) |>
    arrange(sample_id)

  expect_true(all(receptor_stats$n_receptors == 1))

  reps <- ds1$repertoires |>
    collect() |>
    arrange(sample_id)
  expect_true(all(reps$n_receptors == 1))

  sampled1 <- ds1$annotations |>
    select(sample_id, imd_receptor_id) |>
    distinct() |>
    arrange(sample_id, imd_receptor_id) |>
    collect()

  sampled2 <- ds2$annotations |>
    select(sample_id, imd_receptor_id) |>
    distinct() |>
    arrange(sample_id, imd_receptor_id) |>
    collect()

  expect_equal(sampled1, sampled2)
})

test_that("downsample_immundata validates n and handles no-repertoire fallback", {
  idata <- get_test_idata_tsv_no_metadata()

  expect_error(
    downsample_immundata(idata, n = 2.5),
    "integer count"
  )

  n_before <- idata$annotations |>
    distinct(imd_barcode) |>
    collect() |>
    nrow()

  ds <- downsample_immundata(idata, n = 0.1, seed = 1)

  n_after <- ds$annotations |>
    distinct(imd_barcode) |>
    collect() |>
    nrow()

  expect_lt(n_after, n_before)
  expect_null(ds$repertoires)
})

test_that("downsample_immundata warns and keeps repertoire unchanged when n exceeds available units", {
  output_dir <- create_test_output_dir()
  on.exit(cleanup_output_dir(output_dir))

  test_data <- data.frame(
    cell_id = c("c1", "c2", "c3", "c4", "c5", "c6"),
    sample_id = c("S1", "S1", "S1", "S2", "S2", "S2"),
    v_call = c("IGHV1", "IGHV2", "IGHV3", "IGHV1", "IGHV2", "IGHV3"),
    j_call = c("IGHJ1", "IGHJ2", "IGHJ3", "IGHJ1", "IGHJ2", "IGHJ3"),
    junction_aa = c("A1", "A2", "A3", "B1", "B2", "B3"),
    locus = "IGH",
    umi_count = c(1, 1, 1, 1, 1, 1)
  )

  temp_file <- tempfile(fileext = ".tsv")
  readr::write_tsv(test_data, temp_file)
  on.exit(unlink(temp_file), add = TRUE)

  idata <- read_repertoires(
    path = temp_file,
    schema = make_receptor_schema(features = c("v_call", "j_call", "junction_aa"), chains = "IGH"),
    barcode_col = "cell_id",
    locus_col = "locus",
    umi_col = "umi_count",
    output_folder = output_dir,
    preprocess = NULL,
    postprocess = NULL,
    rename_columns = NULL
  )
  idata <- agg_repertoires_with_integrity(
    idata,
    schema = "sample_id",
    context = "downsample n exceeds units"
  )

  reps_before <- idata$repertoires |>
    collect() |>
    arrange(sample_id)

  ds <- NULL
  expect_warning(
    ds <- downsample_immundata(idata, n = 10, seed = 42),
    "returned unchanged"
  )

  reps_after <- ds$repertoires |>
    collect() |>
    arrange(sample_id)

  expect_equal(reps_after$n_barcodes, reps_before$n_barcodes)
})

test_that("downsample_immundata supports count-mode proportion downsampling", {
  output_dir <- create_test_output_dir()
  on.exit(cleanup_output_dir(output_dir))

  test_data <- data.frame(
    sample_id = c("S1", "S1", "S2", "S2"),
    v_call = c("TRBV1", "TRBV2", "TRBV3", "TRBV4"),
    j_call = c("TRBJ1", "TRBJ2", "TRBJ1", "TRBJ2"),
    junction_aa = c("AAAA", "BBBB", "CCCC", "DDDD"),
    clone_count = c(8, 7, 6, 9)
  )

  temp_file <- tempfile(fileext = ".tsv")
  readr::write_tsv(test_data, temp_file)
  on.exit(unlink(temp_file), add = TRUE)

  idata <- read_repertoires(
    path = temp_file,
    schema = c("v_call", "j_call", "junction_aa"),
    count_col = "clone_count",
    output_folder = output_dir,
    preprocess = NULL,
    postprocess = NULL,
    rename_columns = NULL
  )
  idata <- agg_repertoires_with_integrity(
    idata,
    schema = "sample_id",
    context = "downsample count proportion"
  )

  ds <- downsample_immundata(idata, n = 0.5, seed = 101)
  reps <- ds$repertoires |> collect()

  expect_equal(sort(reps$n_barcodes), c(7, 7))
})

test_that("downsample_immundata is deterministic in count mode with seed", {
  output_dir <- create_test_output_dir()
  on.exit(cleanup_output_dir(output_dir))

  test_data <- data.frame(
    sample_id = c("S1", "S1", "S2", "S2"),
    v_call = c("TRBV1", "TRBV2", "TRBV3", "TRBV4"),
    j_call = c("TRBJ1", "TRBJ2", "TRBJ1", "TRBJ2"),
    junction_aa = c("AAAA", "BBBB", "CCCC", "DDDD"),
    clone_count = c(8, 7, 6, 9)
  )

  temp_file <- tempfile(fileext = ".tsv")
  readr::write_tsv(test_data, temp_file)
  on.exit(unlink(temp_file), add = TRUE)

  idata <- read_repertoires(
    path = temp_file,
    schema = c("v_call", "j_call", "junction_aa"),
    count_col = "clone_count",
    output_folder = output_dir,
    preprocess = NULL,
    postprocess = NULL,
    rename_columns = NULL
  )
  idata <- agg_repertoires_with_integrity(
    idata,
    schema = "sample_id",
    context = "downsample count deterministic"
  )

  ds1 <- downsample_immundata(idata, n = 5, seed = 222)
  ds2 <- downsample_immundata(idata, n = 5, seed = 222)

  ann1 <- ds1$annotations |>
    select(sample_id, imd_barcode, imd_n_chains) |>
    arrange(sample_id, imd_barcode) |>
    collect()
  ann2 <- ds2$annotations |>
    select(sample_id, imd_barcode, imd_n_chains) |>
    arrange(sample_id, imd_barcode) |>
    collect()

  expect_equal(ann1, ann2)
})

test_that("downsample_immundata errors when proportion results in zero target", {
  idata <- get_test_idata_tsv_no_metadata()

  expect_error(
    downsample_immundata(idata, n = 0.0001, seed = 1),
    "Increase `n`"
  )
})

test_that("downsample_immundata produces consistent imd_count and imd_proportion invariants", {
  output_dir <- create_test_output_dir()
  on.exit(cleanup_output_dir(output_dir))

  test_data <- data.frame(
    cell_id = c("c1", "c2", "c3", "c4", "c5", "c6"),
    sample_id = c("S1", "S1", "S1", "S2", "S2", "S2"),
    v_call = c("IGHV1", "IGHV1", "IGHV2", "IGHV1", "IGHV2", "IGHV2"),
    j_call = c("IGHJ1", "IGHJ1", "IGHJ2", "IGHJ1", "IGHJ2", "IGHJ2"),
    junction_aa = c("A1", "A1", "A2", "B1", "B2", "B2"),
    locus = "IGH",
    umi_count = c(10, 11, 12, 13, 14, 15)
  )

  temp_file <- tempfile(fileext = ".tsv")
  readr::write_tsv(test_data, temp_file)
  on.exit(unlink(temp_file), add = TRUE)

  idata <- read_repertoires(
    path = temp_file,
    schema = make_receptor_schema(features = c("v_call", "j_call", "junction_aa"), chains = "IGH"),
    barcode_col = "cell_id",
    locus_col = "locus",
    umi_col = "umi_count",
    output_folder = output_dir,
    preprocess = NULL,
    postprocess = NULL,
    rename_columns = NULL
  )
  idata <- agg_repertoires_with_integrity(
    idata,
    schema = "sample_id",
    context = "downsample count/proportion invariants"
  )

  ds <- downsample_immundata(idata, n = 2, seed = 33)

  reps <- ds$repertoires |> collect()
  ann <- ds$annotations |> collect()

  receptor_stats <- ann |>
    select(imd_repertoire_id, imd_receptor_id, imd_count, imd_proportion) |>
    distinct() |>
    summarise(
      .by = imd_repertoire_id,
      sum_count = sum(imd_count),
      sum_prop = sum(imd_proportion)
    ) |>
    arrange(imd_repertoire_id)

  reps <- reps |> arrange(imd_repertoire_id)

  expect_equal(receptor_stats$sum_count, reps$n_barcodes)
  expect_true(all(abs(receptor_stats$sum_prop - 1) < 1e-8))
})