# =============================================================================
# test-extract.R — Unit tests for extract_ series (mock-based, no network)
# =============================================================================
.skip_if_no_mockery()

# Helper: build a fake .dx_run() result
.fake_dx <- function(stdout = "", stderr = "", status = 0) {
  list(stdout = stdout, stderr = stderr, status = status, success = status == 0)
}

# Fake fields data.frame (representative sample of all field types)
.fake_fields_df <- function() {
  data.frame(
    field_name = c(
      "participant.eid",
      "participant.p31",
      "participant.p53_i0", "participant.p53_i1",
      "participant.p53_i2", "participant.p53_i3",
      "participant.p22189",
      "participant.p20002_i0_a0", "participant.p20002_i0_a1",
      "participant.p22009_a1",    "participant.p22009_a2",
      "participant.p41202"
    ),
    title = c(
      "Participant ID",
      "Sex",
      "Date of attending assessment centre | Instance 0",
      "Date of attending assessment centre | Instance 1",
      "Date of attending assessment centre | Instance 2",
      "Date of attending assessment centre | Instance 3",
      "Townsend deprivation index at recruitment",
      "Non-cancer illness code, self-reported | Instance 0 | Array 0",
      "Non-cancer illness code, self-reported | Instance 0 | Array 1",
      "Genetic principal components | Array 1",
      "Genetic principal components | Array 2",
      "Diagnoses - main ICD10"
    ),
    stringsAsFactors = FALSE
  )
}

# Helper: populate per-dataset session cache with fake fields
.set_fake_cache <- function(dataset = "app12345.dataset") {
  if (is.null(.ukbflow_cache$fields)) .ukbflow_cache$fields <- list()
  .ukbflow_cache$fields[[dataset]] <- .fake_fields_df()
}

# Helper: clear session cache entirely
.clear_cache <- function() {
  .ukbflow_cache$fields <- NULL
}

# ===========================================================================
# .dx_parse_fields() — pure function, no mocking needed
# ===========================================================================

test_that(".dx_parse_fields() returns empty df for empty stdout", {
  result <- ukbflow:::.dx_parse_fields("")
  expect_s3_class(result, "data.frame")
  expect_equal(nrow(result), 0)
  expect_named(result, c("field_name", "title"))
})

test_that(".dx_parse_fields() parses tab-separated output correctly", {
  stdout <- "participant.p31\tSex\nparticipant.p53_i0\tDate of attending | Instance 0\n"
  result <- ukbflow:::.dx_parse_fields(stdout)
  expect_equal(nrow(result), 2)
  expect_equal(result$field_name[1], "participant.p31")
  expect_equal(result$title[1], "Sex")
})

test_that(".dx_parse_fields() handles missing title gracefully", {
  stdout <- "participant.p31\n"
  result <- ukbflow:::.dx_parse_fields(stdout)
  expect_equal(nrow(result), 1)
  expect_true(is.na(result$title[1]))
})

# ===========================================================================
# .dx_match_fields() — pure function, no mocking needed
# ===========================================================================

test_that(".dx_match_fields() matches simple field exactly", {
  result <- ukbflow:::.dx_match_fields(31L, .fake_fields_df())
  expect_length(result$matched, 1)
  expect_equal(result$matched[[1]]$field_id, 31L)
  expect_equal(result$matched[[1]]$n_cols, 1L)
  expect_length(result$unmatched, 0)
})

test_that(".dx_match_fields() expands all instances for p53", {
  result <- ukbflow:::.dx_match_fields(53L, .fake_fields_df())
  expect_equal(result$matched[[1]]$n_cols, 4L)
  expect_length(result$matched[[1]]$field_names, 4L)
})

test_that(".dx_match_fields() does NOT match p31 to p310xx fields", {
  # Add a decoy field to the fake df
  df <- .fake_fields_df()
  df <- rbind(df, data.frame(
    field_name = "participant.p31000_i0",
    title = "Some other field",
    stringsAsFactors = FALSE
  ))
  result <- ukbflow:::.dx_match_fields(31L, df)
  matched_names <- result$matched[[1]]$field_names
  expect_false(any(grepl("p31000", matched_names)))
  expect_equal(result$matched[[1]]$n_cols, 1L)
})

test_that(".dx_match_fields() returns unmatched for missing field", {
  result <- ukbflow:::.dx_match_fields(c(31L, 189L), .fake_fields_df())
  expect_length(result$matched, 1)
  expect_equal(result$unmatched, 189L)
})

test_that(".dx_match_fields() strips title instance suffix correctly", {
  result <- ukbflow:::.dx_match_fields(53L, .fake_fields_df())
  expect_equal(result$matched[[1]]$title, "Date of attending assessment centre")
})

# ===========================================================================
# extract_ls()
# ===========================================================================

test_that("extract_ls() fetches fields and caches on first call", {
  .clear_cache()
  on.exit(.clear_cache())

  mockery::stub(extract_ls, ".dx_find_dataset",
                function() "app12345.dataset")
  mockery::stub(extract_ls, ".dx_list_fields_raw",
                function(dataset, ...) .fake_dx(
                  stdout = "participant.p31\tSex\nparticipant.p22189\tTownsend\n"
                ))

  result <- suppressMessages(extract_ls())
  expect_false(is.null(.ukbflow_cache$fields[["app12345.dataset"]]))
  expect_equal(nrow(.ukbflow_cache$fields[["app12345.dataset"]]), 2)
})

test_that("extract_ls() returns from cache on second call", {
  .clear_cache()
  on.exit(.clear_cache())

  call_count <- 0L
  mockery::stub(extract_ls, ".dx_find_dataset", function() "app12345.dataset")
  mockery::stub(extract_ls, ".dx_list_fields_raw", function(...) {
    call_count <<- call_count + 1L
    .fake_dx(stdout = "participant.p31\tSex\n")
  })

  suppressMessages(extract_ls())   # first call: fetches and caches
  suppressMessages(extract_ls())   # second call: served from cache
  expect_equal(call_count, 1L)
})

test_that("extract_ls() refresh = TRUE re-fetches from network", {
  .set_fake_cache()
  on.exit(.clear_cache())

  mockery::stub(extract_ls, ".dx_list_fields_raw",
                function(...) .fake_dx(
                  stdout = "participant.p31\tSex\n"
                ))
  mockery::stub(extract_ls, ".dx_find_dataset", function() "app12345.dataset")

  suppressMessages(extract_ls(refresh = TRUE))
  expect_equal(nrow(.ukbflow_cache$fields[["app12345.dataset"]]), 1)
})

test_that("extract_ls() pattern filters results", {
  .set_fake_cache()
  on.exit(.clear_cache())

  result <- extract_ls(dataset = "app12345.dataset", pattern = "p31")
  expect_true(all(grepl("p31", result$field_name) |
                    grepl("p31", result$title, ignore.case = TRUE)))
})

test_that("extract_ls() pattern search is case-insensitive on title", {
  .set_fake_cache()
  on.exit(.clear_cache())

  result <- extract_ls(dataset = "app12345.dataset", pattern = "SEX")
  expect_gt(nrow(result), 0)
})

test_that("extract_ls() returns invisible and message when no pattern", {
  .set_fake_cache()
  on.exit(.clear_cache())

  expect_message(res <- extract_ls(dataset = "app12345.dataset"), "fields available")
  expect_s3_class(res, "data.frame")
})

test_that("extract_ls() caches per dataset — different datasets don't share cache", {
  .clear_cache()
  on.exit(.clear_cache())

  call_count <- 0L
  mockery::stub(extract_ls, ".dx_list_fields_raw", function(...) {
    call_count <<- call_count + 1L
    .fake_dx(stdout = "participant.p31\tSex\n")
  })

  suppressMessages(extract_ls(dataset = "app11111.dataset"))
  suppressMessages(extract_ls(dataset = "app22222.dataset"))
  expect_equal(call_count, 2L)   # each dataset triggers its own fetch
  suppressMessages(extract_ls(dataset = "app11111.dataset"))
  expect_equal(call_count, 2L)   # cache hit — no new fetch
})

test_that("extract_ls() throws error when .dx_list_fields_raw fails", {
  .clear_cache()
  on.exit(.clear_cache())

  mockery::stub(extract_ls, ".dx_find_dataset", function() "app12345.dataset")
  mockery::stub(extract_ls, ".dx_list_fields_raw",
                function(...) .fake_dx(stderr = "Not logged in", status = 1))

  expect_error(suppressMessages(extract_ls()), "Failed to list fields")
})

# ===========================================================================
# extract_pheno()
# ===========================================================================

test_that("extract_pheno() stops when not on RAP", {
  mockery::stub(extract_pheno, ".assert_on_rap", function() cli::cli_abort("This function must be run inside the RAP environment."))
  expect_error(extract_pheno(31), "RAP environment")
})

test_that("extract_pheno() throws error on empty field_id", {
  mockery::stub(extract_pheno, ".assert_on_rap", function() invisible(NULL))
  expect_error(extract_pheno(integer(0)), "non-empty numeric")
})

test_that("extract_pheno() throws error on non-numeric field_id", {
  mockery::stub(extract_pheno, ".assert_on_rap", function() invisible(NULL))
  expect_error(extract_pheno("p31"), "non-empty numeric")
})

test_that("extract_pheno() throws error on NA field_id", {
  mockery::stub(extract_pheno, ".assert_on_rap", function() invisible(NULL))
  expect_error(extract_pheno(c(31, NA)), "NA")
})

test_that("extract_pheno() throws error on Inf field_id", {
  mockery::stub(extract_pheno, ".assert_on_rap", function() invisible(NULL))
  expect_error(extract_pheno(c(31, Inf)), "Inf")
})

test_that("extract_pheno() throws error on decimal field_id", {
  mockery::stub(extract_pheno, ".assert_on_rap", function() invisible(NULL))
  expect_error(extract_pheno(31.7), "whole numbers")
})

test_that("extract_pheno() throws error when no fields match", {
  .set_fake_cache()
  on.exit(.clear_cache())

  mockery::stub(extract_pheno, ".assert_on_rap", function() invisible(NULL))
  mockery::stub(extract_pheno, ".dx_find_dataset", function() "app12345.dataset")
  expect_error(
    suppressMessages(extract_pheno(999999)),
    "No matching fields found"
  )
})

test_that("extract_pheno() warns on unmatched field_id", {
  .set_fake_cache()
  on.exit(.clear_cache())

  mockery::stub(extract_pheno, ".assert_on_rap", function() invisible(NULL))
  mockery::stub(extract_pheno, ".dx_find_dataset", function() "app12345.dataset")
  mockery::stub(extract_pheno, ".dx_extract_run", function(dataset, fields, dest, ...) {
    write.csv(data.frame(`participant.eid` = 1L, `participant.p31` = 1L,
                          check.names = FALSE), dest, row.names = FALSE)
    .fake_dx()
  })

  expect_warning(
    suppressMessages(extract_pheno(c(31, 189))),
    "not found"
  )
})

test_that("extract_pheno() always includes eid as first field", {
  .set_fake_cache()
  on.exit(.clear_cache())

  received_fields <- NULL
  mockery::stub(extract_pheno, ".assert_on_rap", function() invisible(NULL))
  mockery::stub(extract_pheno, ".dx_find_dataset", function() "app12345.dataset")
  mockery::stub(extract_pheno, ".dx_extract_run",
                function(dataset, fields, dest, ...) {
                  received_fields <<- fields
                  write.csv(data.frame(`participant.eid` = 1L,
                                        check.names = FALSE), dest, row.names = FALSE)
                  .fake_dx()
                })

  suppressMessages(suppressWarnings(extract_pheno(31)))
  expect_equal(received_fields[1], "participant.eid")
})

test_that("extract_pheno() returns a data.table", {
  .set_fake_cache()
  on.exit(.clear_cache())

  mockery::stub(extract_pheno, ".assert_on_rap", function() invisible(NULL))
  mockery::stub(extract_pheno, ".dx_find_dataset", function() "app12345.dataset")
  mockery::stub(extract_pheno, ".dx_extract_run", function(dataset, fields, dest, ...) {
    write.csv(data.frame(`participant.eid` = c(1L, 2L), `participant.p31` = c(1L, 0L),
                          check.names = FALSE), dest, row.names = FALSE)
    .fake_dx()
  })

  result <- suppressMessages(extract_pheno(31))
  expect_true(data.table::is.data.table(result))
})

# ===========================================================================
# extract_batch()
# ===========================================================================

test_that("extract_batch() throws error on invalid field_id", {
  expect_error(extract_batch(character(0)), "non-empty numeric")
})

test_that("extract_batch() throws error on NA field_id", {
  expect_error(extract_batch(c(31, NA)), "NA")
})

test_that("extract_batch() throws error on Inf field_id", {
  expect_error(extract_batch(c(31, Inf)), "Inf")
})

test_that("extract_batch() throws error on decimal field_id", {
  expect_error(extract_batch(31.7), "whole numbers")
})

test_that("extract_batch() strips participant. prefix for table-exporter", {
  .set_fake_cache()
  on.exit(.clear_cache())

  received_fields <- NULL
  captured_path   <- NULL
  mockery::stub(extract_batch, ".dx_find_dataset", function() "app12345.dataset")
  # Reason: capture local_path here so .dx_run_table_exporter stub can read
  # the fields file before on.exit() deletes it
  mockery::stub(extract_batch, ".dx_upload_file", function(local_path) {
    captured_path <<- local_path
    "file-XXXX"
  })
  mockery::stub(extract_batch, ".dx_run_table_exporter",
                function(dataset, file_id, output, instance_type, priority) {
                  received_fields <<- readLines(captured_path)
                  "job-XXXX"
                })

  suppressMessages(extract_batch(31))
  expect_true(any(grepl("^eid$", received_fields)))
  expect_true(any(grepl("^p31$", received_fields)))
  expect_false(any(grepl("^participant\\.", received_fields)))
})

test_that("extract_batch() auto-selects x8 for small extractions", {
  .set_fake_cache()
  on.exit(.clear_cache())

  received_instance <- NULL
  mockery::stub(extract_batch, ".dx_find_dataset", function() "app12345.dataset")
  mockery::stub(extract_batch, ".dx_upload_file", function(...) "file-XXXX")
  mockery::stub(extract_batch, ".dx_run_table_exporter",
                function(dataset, file_id, output, instance_type, priority) {
                  received_instance <<- instance_type
                  "job-XXXX"
                })

  suppressMessages(extract_batch(c(31, 22189)))   # 2 cols → x4
  expect_equal(received_instance, "mem1_ssd1_v2_x4")
})

test_that("extract_batch() passes priority to table-exporter", {
  .set_fake_cache()
  on.exit(.clear_cache())

  received_priority <- NULL
  mockery::stub(extract_batch, ".dx_find_dataset", function() "app12345.dataset")
  mockery::stub(extract_batch, ".dx_upload_file", function(...) "file-XXXX")
  mockery::stub(extract_batch, ".dx_run_table_exporter",
                function(dataset, file_id, output, instance_type, priority) {
                  received_priority <<- priority
                  "job-XXXX"
                })

  suppressMessages(extract_batch(31, priority = "high"))
  expect_equal(received_priority, "high")
})

test_that("extract_batch() uses custom file name when provided", {
  .set_fake_cache()
  on.exit(.clear_cache())

  received_output <- NULL
  mockery::stub(extract_batch, ".dx_find_dataset", function() "app12345.dataset")
  mockery::stub(extract_batch, ".dx_upload_file", function(...) "file-XXXX")
  mockery::stub(extract_batch, ".dx_run_table_exporter",
                function(dataset, file_id, output, instance_type, priority) {
                  received_output <<- output
                  "job-XXXX"
                })

  suppressMessages(extract_batch(31, file = "my_cohort"))
  expect_equal(received_output, "my_cohort")
})

test_that("extract_batch() returns job_id invisibly", {
  .set_fake_cache()
  on.exit(.clear_cache())

  mockery::stub(extract_batch, ".dx_find_dataset", function() "app12345.dataset")
  mockery::stub(extract_batch, ".dx_upload_file", function(...) "file-XXXX")
  mockery::stub(extract_batch, ".dx_run_table_exporter",
                function(...) "job-XXXX")

  expect_invisible(suppressMessages(extract_batch(31)))
  result <- suppressMessages(extract_batch(31))
  expect_equal(result, "job-XXXX")
})

test_that("extract_batch() throws error when no fields match", {
  .set_fake_cache()
  on.exit(.clear_cache())

  mockery::stub(extract_batch, ".dx_find_dataset", function() "app12345.dataset")
  expect_error(
    suppressMessages(extract_batch(999999)),
    "No matching fields found"
  )
})

test_that("extract_batch() warns on unmatched field_ids", {
  .set_fake_cache()
  on.exit(.clear_cache())

  mockery::stub(extract_batch, ".dx_find_dataset", function() "app12345.dataset")
  mockery::stub(extract_batch, ".dx_upload_file", function(...) "file-XXXX")
  mockery::stub(extract_batch, ".dx_run_table_exporter", function(...) "job-XXXX")

  expect_warning(
    suppressMessages(extract_batch(c(31, 189))),
    "not found"
  )
})

test_that("extract_batch() auto-selects x8 for 21-100 cols", {
  # p53 with 25 instances → 25 cols, which falls in the x8 tier (21-100)
  big_df <- data.frame(
    field_name = paste0("participant.p53_i", 0:24),
    title      = paste0("Field 53 | Instance ", 0:24),
    stringsAsFactors = FALSE
  )
  .ukbflow_cache$fields <- list("app12345.dataset" = big_df)
  on.exit(.clear_cache())

  received_instance <- NULL
  mockery::stub(extract_batch, ".dx_find_dataset", function() "app12345.dataset")
  mockery::stub(extract_batch, ".dx_upload_file", function(...) "file-XXXX")
  mockery::stub(extract_batch, ".dx_run_table_exporter",
                function(dataset, file_id, output, instance_type, priority) {
                  received_instance <<- instance_type
                  "job-XXXX"
                })

  suppressMessages(extract_batch(53))
  expect_equal(received_instance, "mem1_ssd1_v2_x8")
})

test_that("extract_batch() auto-selects x16 for 101-500 cols", {
  # p53 with 110 instances → 110 cols, which falls in the x16 tier (101-500)
  huge_df <- data.frame(
    field_name = paste0("participant.p53_i", 0:109),
    title      = paste0("Field 53 | Instance ", 0:109),
    stringsAsFactors = FALSE
  )
  .ukbflow_cache$fields <- list("app12345.dataset" = huge_df)
  on.exit(.clear_cache())

  received_instance <- NULL
  mockery::stub(extract_batch, ".dx_find_dataset", function() "app12345.dataset")
  mockery::stub(extract_batch, ".dx_upload_file", function(...) "file-XXXX")
  mockery::stub(extract_batch, ".dx_run_table_exporter",
                function(dataset, file_id, output, instance_type, priority) {
                  received_instance <<- instance_type
                  "job-XXXX"
                })

  suppressMessages(extract_batch(53))
  expect_equal(received_instance, "mem1_ssd1_v2_x16")
})

test_that("extract_batch() auto-selects x36 for >500 cols", {
  # p53 with 510 instances → 510 cols, which falls in the x36 tier (>500)
  giant_df <- data.frame(
    field_name = paste0("participant.p53_i", 0:509),
    title      = paste0("Field 53 | Instance ", 0:509),
    stringsAsFactors = FALSE
  )
  .ukbflow_cache$fields <- list("app12345.dataset" = giant_df)
  on.exit(.clear_cache())

  received_instance <- NULL
  mockery::stub(extract_batch, ".dx_find_dataset", function() "app12345.dataset")
  mockery::stub(extract_batch, ".dx_upload_file", function(...) "file-XXXX")
  mockery::stub(extract_batch, ".dx_run_table_exporter",
                function(dataset, file_id, output, instance_type, priority) {
                  received_instance <<- instance_type
                  "job-XXXX"
                })

  suppressMessages(extract_batch(53))
  expect_equal(received_instance, "mem1_ssd1_v2_x36")
})

test_that("extract_batch() respects custom instance_type", {
  .set_fake_cache()
  on.exit(.clear_cache())

  received_instance <- NULL
  mockery::stub(extract_batch, ".dx_find_dataset", function() "app12345.dataset")
  mockery::stub(extract_batch, ".dx_upload_file", function(...) "file-XXXX")
  mockery::stub(extract_batch, ".dx_run_table_exporter",
                function(dataset, file_id, output, instance_type, priority) {
                  received_instance <<- instance_type
                  "job-XXXX"
                })

  suppressMessages(extract_batch(31, instance_type = "mem2_ssd1_v2_x4"))
  expect_equal(received_instance, "mem2_ssd1_v2_x4")
})

# ===========================================================================
# extract_pheno() — additional edge cases
# ===========================================================================

test_that("extract_pheno() stops when extraction command fails", {
  .set_fake_cache()
  on.exit(.clear_cache())

  mockery::stub(extract_pheno, ".assert_on_rap", function() invisible(NULL))
  mockery::stub(extract_pheno, ".dx_find_dataset", function() "app12345.dataset")
  mockery::stub(extract_pheno, ".dx_extract_run",
                function(...) .fake_dx(stderr = "connection timeout", status = 1))

  expect_error(
    suppressMessages(extract_pheno(31)),
    "Extraction failed"
  )
})

test_that("extract_pheno() deduplicates repeated field_ids", {
  .set_fake_cache()
  on.exit(.clear_cache())

  received_fields <- NULL
  mockery::stub(extract_pheno, ".assert_on_rap", function() invisible(NULL))
  mockery::stub(extract_pheno, ".dx_find_dataset", function() "app12345.dataset")
  mockery::stub(extract_pheno, ".dx_extract_run",
                function(dataset, fields, dest, ...) {
                  received_fields <<- fields
                  write.csv(data.frame(`participant.eid` = 1L,
                                        check.names = FALSE), dest, row.names = FALSE)
                  .fake_dx()
                })

  suppressMessages(extract_pheno(c(31, 31, 31)))
  # participant.eid + participant.p31 — p31 must appear exactly once
  expect_equal(sum(received_fields == "participant.p31"), 1L)
})