test_that("Case 3.2a: read_repertoires handles strict pairing correctly", {
  output_dir <- create_test_output_dir()
  on.exit(cleanup_output_dir(output_dir))

  schema_features <- c("v_call", "j_call", "junction_aa")

  sample_file <- test_ig_data()

  idata <- read_repertoires(
    path = sample_file,
    schema = make_receptor_schema(
      features = schema_features,
      chains = c("IGH", "IGK")
    ),
    barcode_col = "cell_id",
    locus_col = "locus",
    umi_col = "duplicate_count",
    output_folder = output_dir,
    preprocess = NULL,
    postprocess = NULL,
    rename_columns = NULL
  )

  annotations <- idata$annotations |> collect()
  receptors <- idata$receptors |> collect()

  # Sanity check
  expect_false(nrow(annotations) == 0)

  expect_setequal(colnames(idata$receptors), c(do.call(paste0, expand.grid(c(schema_features, "locus"), c(".x", ".y"))), imd_schema("receptor")))

  expect_equal(receptors |> select(-imd_receptor_id) |> distinct() |> nrow(), nrow(receptors))
  expect_equal(receptors |> select(imd_receptor_id) |> distinct() |> nrow(), nrow(receptors))

  chains_stats <- annotations |>
    summarise(n_chains = n(), .by = imd_barcode) |>
    summarise(n = n(), .by = n_chains)

  # We should see the receptors with two chains only, nothing more, nothing less
  expect_setequal(2, chains_stats$n_chains)

  # Each cell should have both IGH and IGK
  cell_loci <- annotations |>
    group_by(imd_barcode) |>
    summarise(
      loci = list(sort(unique(locus))),
      n_loci = n_distinct(locus),
      .groups = "drop"
    )

  # All cells should have exactly 2 loci
  expect_true(all(cell_loci$n_loci == 2))

  # All cells should have both IGH and IGK
  res <- cell_loci |>
    distinct(loci) |>
    pull(loci)
  expect_equal(res[[1]], c("IGH", "IGK"))
})

test_that("Case 3.2b: read_repertoires() handles relaxed pairing and excludes artifacts", {
  output_dir <- create_test_output_dir()
  on.exit(cleanup_output_dir(output_dir))

  # Test relaxed IGH + (IGK|IGL) pairing
  schema_features <- c("v_call", "j_call", "junction_aa")

  sample_file <- test_ig_data()

  idata <- read_repertoires(
    path = sample_file,
    schema = make_receptor_schema(
      features = schema_features,
      chains = c("IGH", "IGK|IGL") # Relaxed pairing syntax
    ),
    barcode_col = "cell_id",
    locus_col = "locus",
    umi_col = "duplicate_count",
    output_folder = output_dir,
    preprocess = NULL,
    postprocess = NULL,
    rename_columns = NULL
  )

  expect_setequal(colnames(idata$receptors), c(do.call(paste0, expand.grid(c(schema_features, "locus"), c(".x", ".y"))), imd_schema("receptor")))

  # Sanity check
  annotations <- idata$annotations |> collect()
  receptors <- idata$receptors |> collect()

  expect_false(nrow(annotations) == 0)

  expect_equal(receptors |> select(-imd_receptor_id) |> distinct() |> nrow(), nrow(receptors))
  expect_equal(receptors |> select(imd_receptor_id) |> distinct() |> nrow(), nrow(receptors))

  chains_stats <- annotations |>
    summarise(n_chains = n(), .by = imd_barcode) |>
    summarise(n = n(), .by = n_chains)

  # We should see the receptors with two chains only, nothing more, nothing less
  expect_setequal(2, chains_stats$n_chains)

  # Each cell should have both IGH and IGK
  cell_loci <- annotations |>
    group_by(imd_barcode) |>
    summarise(
      loci = list(sort(unique(locus))),
      n_loci = n_distinct(locus),
      .groups = "drop"
    )

  # All cells should have exactly 2 loci
  expect_true(all(cell_loci$n_loci == 2))

  # All cells should have both IGH and IGK
  res <- cell_loci |>
    distinct(loci) |>
    pull(loci)
  expect_setequal(res, list(c("IGH", "IGK"), c("IGH", "IGL")))

  # Identify artifact cells from original data (those with IGH + IGK + IGL)
  original_data <- readr::read_tsv(sample_file, show_col_types = FALSE)
  artifact_cells <- original_data |>
    group_by(cell_id) |>
    summarise(
      has_igh = "IGH" %in% locus,
      has_igk = "IGK" %in% locus,
      has_igl = "IGL" %in% locus,
      .groups = "drop"
    ) |>
    filter(!((has_igh & has_igk & !has_igl) | (has_igh & !has_igk & has_igl))) |>
    pull(cell_id)

  cells_in_result <- unique(annotations$imd_barcode)

  # Artifact cells should NOT be in the result
  expect_false(any(artifact_cells %in% cells_in_result))
})

test_that("read_repertoires handles duplicate chain entries correctly", {
  output_dir <- create_test_output_dir()
  on.exit(cleanup_output_dir(output_dir))

  test_data <- tibble(
    cell_id = c("cell1", "cell1", "cell1", "cell1"),
    v_call = c("IGHV1", "IGHV1", "IGLV1", "IGLV2"),
    j_call = c("IGHJ1", "IGHJ1", "IGLJ1", "IGLJ2"),
    junction_aa = c("CARW", "CARX", "CASW", "CATW"),
    locus = c("IGH", "IGH", "IGL", "IGL"), # Two IGH and two IGL chains
    umi_count = c(100, 150, 80, 60) # Different UMI counts
  )

  temp_file <- tempfile(fileext = ".tsv")
  readr::write_tsv(test_data, temp_file)
  on.exit(unlink(temp_file), add = TRUE)

  schema_features <- c("v_call", "j_call", "junction_aa")

  idata_strict <- read_repertoires(
    path = temp_file,
    schema = make_receptor_schema(features = schema_features, chains = c("IGH", "IGL")),
    barcode_col = "cell_id",
    locus_col = "locus",
    umi_col = "umi_count",
    output_folder = output_dir,
    preprocess = NULL,
    postprocess = NULL,
    rename_columns = NULL
  )

  annotations <- idata_strict$annotations |> collect()

  # Should select highest UMI count for each locus
  igh_chains <- annotations |> filter(locus == "IGH")
  igl_chains <- annotations |> filter(locus == "IGL")

  # Should have selected the chains with highest UMI
  expect_equal(igh_chains$junction_aa, "CARX") # 150 UMI
  expect_equal(igl_chains$junction_aa, "CASW") # 80 UMI
})

test_that("read_repertoires handles same max UMI count per barcode", {
  output_dir <- create_test_output_dir()
  on.exit(cleanup_output_dir(output_dir))

  test_data <- tibble(
    cell_id = c("cell1", "cell1", "cell1", "cell1"),
    v_call = c("IGHV1", "IGHV1", "IGLV1", "IGLV2"),
    j_call = c("IGHJ1", "IGHJ1", "IGLJ1", "IGLJ2"),
    junction_aa = c("CARW", "CARX", "CASW", "CATW"),
    locus = c("IGH", "IGH", "IGL", "IGL"), # Two IGH and two IGL chains
    umi_count = c(100, 100, 80, 80) # Different UMI counts
  )

  temp_file <- tempfile(fileext = ".tsv")
  readr::write_tsv(test_data, temp_file)
  on.exit(unlink(temp_file), add = TRUE)

  schema_features <- c("v_call", "j_call", "junction_aa")

  idata_strict <- read_repertoires(
    path = temp_file,
    schema = make_receptor_schema(features = schema_features, chains = c("IGH", "IGL")),
    barcode_col = "cell_id",
    locus_col = "locus",
    umi_col = "umi_count",
    output_folder = output_dir,
    preprocess = NULL,
    postprocess = NULL,
    rename_columns = NULL
  )

  annotations <- idata_strict$annotations |> collect()

  # Should select highest UMI count for each locus
  igh_chains <- annotations |> filter(locus == "IGH")
  igl_chains <- annotations |> filter(locus == "IGL")

  # Should have selected the chains with highest UMI
  expect_equal(igh_chains$junction_aa, "CARW") # first one
  expect_equal(igl_chains$junction_aa, "CASW") # first one as well
})