## SETUP AND TEST FIXTURES -----------------------------------------------------

null <- NULL
na <- NA
boolean <- TRUE
number_random <- sample(1:1000, 1)
string_random <- paste0(sample(letters, 5), collapse = "")
vector_strings <- c("foo", "bar")
list_strings <- list("foo", "bar")
df <- mtcars
matrix <- as.matrix(mtcars)

# Invalid JSON file (not a valid Document AI response)
fill <- list("a" = 1, "b" = 2)
json <- jsonlite::toJSON(fill)
madeup_json_file <- tempfile(fileext = ".json")
write(json, madeup_json_file)

# Real JSON paths
sample_json <- testthat::test_path("examples", "output.json")
sample_json_blank <- testthat::test_path("examples", "output_blank.json")
sample_json_multipage <- testthat::test_path("examples", "sample3pg.json")
sample_json_labelme <- testthat::test_path("examples", "peshtigo_labelme.json")
sample_json_pesh <- testthat::test_path("examples", "peshtigo.json")

## HELPER FUNCTIONS FOR TESTING ------------------------------------------------

# Test invalid type parameter
test_invalid_type <- function(func, func_name) {
  test_that(paste0(func_name, " validates type parameter"), {
    expect_error(func(sample_json, type = "invalid"), "Invalid type parameter.")
    expect_error(func(sample_json, type = c("sync", "async")), "Invalid type parameter.")
    expect_error(func(sample_json, type = NULL), "Invalid type parameter.")
    expect_error(func(sample_json, type = 123), "Invalid type parameter.")
  })
}

# Test invalid object parameter for build functions
test_invalid_object <- function(func, func_name) {
  test_that(paste0(func_name, " errors with invalid object parameter"), {
    # For async type
    expect_error(func(null, type = "async"), "Invalid object")
    expect_error(func(na, type = "async"), "Invalid object")
    expect_error(func(boolean, type = "async"), "Invalid object")
    expect_error(func(number_random, type = "async"), "Invalid object")
    expect_error(func(string_random, type = "async"), "Invalid object")
    expect_error(func(vector_strings, type = "async"), "Invalid object")
    expect_error(func(list_strings, type = "async"), "Invalid object")
    expect_error(func(df, type = "async"), "Invalid object")
    expect_error(func(matrix, type = "async"), "Invalid object")
  })
}

# Test invalid JSON file parameter
test_invalid_json <- function(func, func_name) {
  test_that(paste0(func_name, " errors with invalid JSON file"), {
    expect_error(func(null), "Invalid json parameter: must be a single character string filepath.")
    expect_error(func(na), "Invalid json parameter: must be a single character string filepath.")
    expect_error(func(boolean), "Invalid json parameter: must be a single character string filepath.")
    expect_error(func(number_random), "Invalid json parameter: must be a single character string filepath.")
    expect_error(func(vector_strings), "Invalid json parameter: must be a single character string filepath.")
    expect_error(func(list_strings), "Invalid json parameter: must be a single character string filepath.")
    expect_error(func(df), "Invalid json parameter: must be a single character string filepath.")
    expect_error(func(matrix), "Invalid json parameter: must be a single character string filepath.")
    expect_error(func("wrongfile.csv"), "Invalid json parameter: file is not a \\.json file or does not exist.")
    expect_error(func("madeup.json"), "Invalid json parameter: file is not a \\.json file or does not exist.")
  })
}

# Test invalid dataframe parameter
test_invalid_dataframe <- function(func, param_name, func_name, additional_args = list()) {
  test_that(paste0(func_name, " errors with invalid ", param_name, " parameter"), {
    invalid_inputs <- list(null, na, boolean, number_random, string_random,
                           vector_strings, list_strings, matrix)
    for (input in invalid_inputs) {
      args <- c(setNames(list(input), param_name), additional_args)
      expect_error(do.call(func, args), paste0("Invalid ", param_name, " parameter"))
    }
  })
}

# Test invalid page parameter
test_invalid_page <- function(func, valid_args, func_name) {
  test_that(paste0(func_name, " errors with invalid page parameter"), {
    expect_error(do.call(func, c(valid_args, list(page = -1))), "Invalid page parameter.")
    expect_error(do.call(func, c(valid_args, list(page = 0))), "Invalid page parameter.")
    expect_error(do.call(func, c(valid_args, list(page = 1.5))), "Invalid page parameter.")
    expect_error(do.call(func, c(valid_args, list(page = "one"))), "Invalid page parameter.")
    expect_error(do.call(func, c(valid_args, list(page = c(1, 2)))), "Invalid page parameter.")
    expect_error(do.call(func, c(valid_args, list(page = 999))), "No such page number")
  })
}

# Test invalid block parameter
test_invalid_block <- function(func, valid_args, func_name) {
  test_that(paste0(func_name, " errors with invalid block parameter"), {
    expect_error(do.call(func, c(valid_args, list(block = -1))), "Invalid block parameter.")
    expect_error(do.call(func, c(valid_args, list(block = 0))), "Invalid block parameter.")
    expect_error(do.call(func, c(valid_args, list(block = 1.5))), "Invalid block parameter.")
    expect_error(do.call(func, c(valid_args, list(block = "one"))), "Invalid block parameter.")
    expect_error(do.call(func, c(valid_args, list(block = c(1, 2)))), "Invalid block parameter.")
    expect_error(do.call(func, c(valid_args, list(block = 999))), "No such block number")
  })
}

# Test invalid directory parameter
test_invalid_directory <- function(func, param_name, func_name, additional_args = list()) {
  test_that(paste0(func_name, " errors with invalid ", param_name, " parameter"), {
    # Non-character inputs
    args1 <- c(setNames(list(null), param_name), additional_args)
    expect_error(do.call(func, args1), paste0("Invalid ", param_name, " parameter"))
    args2 <- c(setNames(list(boolean), param_name), additional_args)
    expect_error(do.call(func, args2), paste0("Invalid ", param_name, " parameter"))
    args3 <- c(setNames(list(number_random), param_name), additional_args)
    expect_error(do.call(func, args3), paste0("Invalid ", param_name, " parameter"))
    args4 <- c(setNames(list(vector_strings), param_name), additional_args)
    expect_error(do.call(func, args4), paste0("Invalid ", param_name, " parameter"))
    # Non-existent directory
    args5 <- c(setNames(list("/fake/nonexistent/path"), param_name), additional_args)
    expect_error(do.call(func, args5), "does not exist")
  })
}

## MERGE_SHARDS ----------------------------------------------------------------

test_invalid_directory(merge_shards, "source_dir", "merge_shards()")

test_that("merge_shards() errors with invalid dest_dir parameter", {
  valid_source <- file.path(tempdir(), "valid_source")
  dir.create(valid_source, showWarnings = FALSE)
  writeLines("test", file.path(valid_source, "test-0.txt"))
  expect_error(merge_shards(source_dir = valid_source, dest_dir = null),
               "Invalid dest_dir parameter")
  expect_error(merge_shards(source_dir = valid_source, dest_dir = boolean),
               "Invalid dest_dir parameter")
  expect_error(merge_shards(source_dir = valid_source, dest_dir = vector_strings),
               "Invalid dest_dir parameter")
  expect_error(merge_shards(source_dir = valid_source, dest_dir = "/fake/path"),
               "does not exist")
  unlink(valid_source, recursive = TRUE)
})
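# For orientation, a minimal usage sketch of the shard layouts exercised below.
# The filename patterns are inferred from the fixtures in these tests, not from
# merge_shards() internals, so treat them as assumptions:
if (FALSE) {
  src <- file.path(tempdir(), "sketch_src")
  dir.create(src, showWarnings = FALSE)
  # v1-style shards: "<stem>-<shard>.txt"
  writeLines("part 1", file.path(src, "report-0.txt"))
  writeLines("part 2", file.path(src, "report-1.txt"))
  # v1beta2-style shards: "<stem>-page-<from>-to-<to>.txt"
  writeLines("pp. 1-5", file.path(src, "memo-page-1-to-5.txt"))
  writeLines("pp. 6-10", file.path(src, "memo-page-6-to-10.txt"))
  # Each stem merges into a single "<stem>.txt" in dest_dir
  merge_shards(source_dir = src, dest_dir = tempdir())
}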
test_that("merge_shards() errors when no txt files found", {
  empty_dir <- file.path(tempdir(), "empty_test_dir")
  dir.create(empty_dir, showWarnings = FALSE)
  expect_error(merge_shards(source_dir = empty_dir), "No .txt files found.")
  unlink(empty_dir, recursive = TRUE)
})

test_that("merge_shards() errors when txt files are incorrectly formatted", {
  bad_dir <- file.path(tempdir(), "bad_format_dir")
  dir.create(bad_dir, showWarnings = FALSE)
  writeLines("test content", file.path(bad_dir, "test.txt"))
  expect_error(merge_shards(source_dir = bad_dir), "incorrectly formatted")
  unlink(bad_dir, recursive = TRUE)
})

test_that("merge_shards() successfully merges v1 format shards", {
  test_dir <- file.path(tempdir(), "v1_test")
  dir.create(test_dir, showWarnings = FALSE)
  output_dir <- file.path(tempdir(), "v1_output")
  dir.create(output_dir, showWarnings = FALSE)
  writeLines("First shard content", file.path(test_dir, "document-0.txt"))
  writeLines("Second shard content", file.path(test_dir, "document-1.txt"))
  writeLines("Third shard content", file.path(test_dir, "document-2.txt"))
  expect_message(merge_shards(source_dir = test_dir, dest_dir = output_dir),
                 "Shards merged")
  expect_true(file.exists(file.path(output_dir, "document.txt")))
  merged_content <- readLines(file.path(output_dir, "document.txt"))
  expect_equal(length(merged_content), 3)
  expect_true(any(grepl("First shard", merged_content)))
  expect_true(any(grepl("Third shard", merged_content)))
  unlink(test_dir, recursive = TRUE)
  unlink(output_dir, recursive = TRUE)
})

test_that("merge_shards() successfully merges v1beta2 format shards", {
  test_dir <- file.path(tempdir(), "v1beta2_test")
  dir.create(test_dir, showWarnings = FALSE)
  output_dir <- file.path(tempdir(), "v1beta2_output")
  dir.create(output_dir, showWarnings = FALSE)
  writeLines("Pages 1-5 content", file.path(test_dir, "document-page-1-to-5.txt"))
  writeLines("Pages 6-10 content", file.path(test_dir, "document-page-6-to-10.txt"))
  expect_message(merge_shards(source_dir = test_dir, dest_dir = output_dir))
  expect_true(file.exists(file.path(output_dir, "document.txt")))
  merged_content <- readLines(file.path(output_dir, "document.txt"))
  expect_true(any(grepl("Pages 1-5", merged_content)))
  expect_true(any(grepl("Pages 6-10", merged_content)))
  unlink(test_dir, recursive = TRUE)
  unlink(output_dir, recursive = TRUE)
})

test_that("merge_shards() handles multiple document sets", {
  test_dir <- file.path(tempdir(), "multi_doc_test")
  dir.create(test_dir, showWarnings = FALSE)
  output_dir <- file.path(tempdir(), "multi_output")
  dir.create(output_dir, showWarnings = FALSE)
  writeLines("Doc1 Part1", file.path(test_dir, "doc1-0.txt"))
  writeLines("Doc1 Part2", file.path(test_dir, "doc1-1.txt"))
  writeLines("Doc2 Part1", file.path(test_dir, "doc2-0.txt"))
  writeLines("Doc2 Part2", file.path(test_dir, "doc2-1.txt"))
  merge_shards(source_dir = test_dir, dest_dir = output_dir)
  expect_true(file.exists(file.path(output_dir, "doc1.txt")))
  expect_true(file.exists(file.path(output_dir, "doc2.txt")))
  unlink(test_dir, recursive = TRUE)
  unlink(output_dir, recursive = TRUE)
})

test_that("merge_shards() handles directory paths with trailing slashes", {
  test_dir <- file.path(tempdir(), "trailing_slash_test")
  dir.create(test_dir, showWarnings = FALSE)
  output_dir <- file.path(tempdir(), "trailing_output")
  dir.create(output_dir, showWarnings = FALSE)
  writeLines("Content", file.path(test_dir, "doc-0.txt"))
  # Add trailing slashes
  source_with_slash <- paste0(test_dir, "/")
  dest_with_slash <- paste0(output_dir, "/")
  merge_shards(source_dir = source_with_slash, dest_dir = dest_with_slash)
  expect_true(file.exists(file.path(output_dir, "doc.txt")))
  unlink(test_dir, recursive = TRUE)
  unlink(output_dir, recursive = TRUE)
})

test_that("merge_shards() handles edge case: single shard", {
  test_dir <- file.path(tempdir(), "single_shard_test")
  dir.create(test_dir, showWarnings = FALSE)
  output_dir <- file.path(tempdir(), "single_output")
  dir.create(output_dir, showWarnings = FALSE)
  writeLines("Only shard", file.path(test_dir, "document-0.txt"))
  merge_shards(source_dir = test_dir, dest_dir = output_dir)
  expect_true(file.exists(file.path(output_dir, "document.txt")))
  content <- readLines(file.path(output_dir, "document.txt"))
  expect_equal(length(content), 1)
  expect_equal(content, "Only shard")
  unlink(test_dir, recursive = TRUE)
  unlink(output_dir, recursive = TRUE)
})

test_that("merge_shards() handles three-digit shard numbers", {
  test_dir <- file.path(tempdir(), "threedigit_test")
  dir.create(test_dir, showWarnings = FALSE)
  output_dir <- file.path(tempdir(), "threedigit_output")
  dir.create(output_dir, showWarnings = FALSE)
  writeLines("Shard 99", file.path(test_dir, "doc-99.txt"))
  writeLines("Shard 100", file.path(test_dir, "doc-100.txt"))
  merge_shards(source_dir = test_dir, dest_dir = output_dir)
  expect_true(file.exists(file.path(output_dir, "doc.txt")))
  unlink(test_dir, recursive = TRUE)
  unlink(output_dir, recursive = TRUE)
})

## BUILD_TOKEN_DF --------------------------------------------------------------

test_invalid_type(build_token_df, "build_token_df()")
test_invalid_object(build_token_df, "build_token_df()")

test_that("build_token_df() errors with files not containing tokens", {
  expect_error(build_token_df(madeup_json_file, type = "async"),
               "JSON not in right format")
  expect_error(build_token_df(sample_json_blank, type = "async"),
               "DAI found no tokens")
})

test_that("build_token_df() builds a token dataframe with correct structure", {
  skip_if_not(file.exists(sample_json))
  df <- build_token_df(sample_json, type = "async")
  expect_true(is.data.frame(df))
  expect_equal(ncol(df), 10)
  expect_setequal(colnames(df),
                  c("token", "start_ind", "end_ind", "conf", "left", "right",
                    "top", "bottom", "page", "block"))
  # Check column types
  expect_true(is.character(df$token))
  expect_true(is.integer(df$start_ind))
  expect_true(is.integer(df$end_ind))
  expect_true(is.numeric(df$conf))
  expect_true(is.numeric(df$left))
  expect_true(is.numeric(df$right))
  expect_true(is.numeric(df$top))
  expect_true(is.numeric(df$bottom))
  expect_true(is.integer(df$page))
})

test_that("build_token_df() produces valid coordinate values", {
  skip_if_not(file.exists(sample_json))
  df <- build_token_df(sample_json, type = "async")
  # Coordinates should be between 0 and 1 (normalized) - ignoring NAs
  expect_true(all(df$left >= 0 & df$left <= 1, na.rm = TRUE))
  expect_true(all(df$right >= 0 & df$right <= 1, na.rm = TRUE))
  expect_true(all(df$top >= 0 & df$top <= 1, na.rm = TRUE))
  expect_true(all(df$bottom >= 0 & df$bottom <= 1, na.rm = TRUE))
  # Left should be less than or equal to right, top less than or equal to bottom
  expect_true(all(df$left <= df$right, na.rm = TRUE))
  expect_true(all(df$top <= df$bottom, na.rm = TRUE))
  # Most tokens should have valid coordinates (allow some NAs but not too many)
  na_count <- sum(is.na(df$left) | is.na(df$right) | is.na(df$top) | is.na(df$bottom))
  expect_lt(na_count / nrow(df), 0.5)  # Less than 50% should have NAs
})
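# For orientation, the shape asserted above; the example row is made up for
# illustration only:
#
#   token start_ind end_ind conf left right  top bottom page block
#   "The"         1       3 0.98 0.10  0.14 0.05   0.07     1     1
#
# Coordinates are normalized page fractions (0.10 = 10% in from the left/top
# edge). build_block_df(), tested below, uses the same coordinate convention
# at block level, with columns page, block, conf, left, right, top, bottom.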
test_that("build_token_df() produces reasonable token counts", {
  skip_if_not(file.exists(sample_json))
  df <- build_token_df(sample_json, type = "async")
  parsed <- jsonlite::fromJSON(sample_json)
  text <- parsed$text
  words <- length(strsplit(text, "\\s+")[[1]])
  # Token count should be reasonably close to word count (within 50%)
  expect_lt(nrow(df), 1.5 * words)
  expect_gt(nrow(df), 0.5 * words)
})

test_that("build_token_df() produces tokens in correct order", {
  skip_if_not(file.exists(sample_json))
  df <- build_token_df(sample_json, type = "async")
  # Start indices should be sorted (tokens in reading order)
  expect_false(is.unsorted(df$start_ind))
})

test_that("build_token_df() handles multi-page documents", {
  skip_if_not(file.exists(sample_json_multipage))
  df <- build_token_df(sample_json_multipage, type = "async")
  parsed <- jsonlite::fromJSON(sample_json_multipage)
  expect_equal(max(df$page), nrow(parsed$pages))
  expect_true(all(1:max(df$page) %in% df$page))
})

test_that("build_token_df() handles pages with no tokens", {
  skip_if_not(file.exists(sample_json_multipage))
  df <- build_token_df(sample_json_multipage, type = "async")
  # Should not crash and should handle gracefully
  expect_true(is.data.frame(df))
  expect_true(nrow(df) > 0)
})

## BUILD_BLOCK_DF --------------------------------------------------------------

test_invalid_type(build_block_df, "build_block_df()")
test_invalid_object(build_block_df, "build_block_df()")

test_that("build_block_df() errors with files not containing blocks", {
  expect_error(build_block_df(madeup_json_file, type = "async"),
               "JSON not in right format")
  expect_error(build_block_df(sample_json_blank, type = "async"),
               "DAI found no blocks")
})

test_that("build_block_df() builds a block dataframe with correct structure", {
  skip_if_not(file.exists(sample_json))
  df <- build_block_df(sample_json, type = "async")
  expect_true(is.data.frame(df))
  expect_equal(ncol(df), 7)
  expect_setequal(colnames(df),
                  c("page", "block", "conf", "left", "right", "top", "bottom"))
  # Check column types
  expect_true(is.integer(df$page))
  expect_true(is.integer(df$block))
  expect_true(is.numeric(df$conf))
  expect_true(is.numeric(df$left))
  expect_true(is.numeric(df$right))
  expect_true(is.numeric(df$top))
  expect_true(is.numeric(df$bottom))
})

test_that("build_block_df() produces valid coordinate values", {
  skip_if_not(file.exists(sample_json))
  df <- build_block_df(sample_json, type = "async")
  # Coordinates should be between 0 and 1 (normalized) - ignoring NAs
  expect_true(all(df$left >= 0 & df$left <= 1, na.rm = TRUE))
  expect_true(all(df$right >= 0 & df$right <= 1, na.rm = TRUE))
  expect_true(all(df$top >= 0 & df$top <= 1, na.rm = TRUE))
  expect_true(all(df$bottom >= 0 & df$bottom <= 1, na.rm = TRUE))
  # Left should be less than right, top less than bottom
  expect_true(all(df$left <= df$right, na.rm = TRUE))
  expect_true(all(df$top <= df$bottom, na.rm = TRUE))
})

test_that("build_block_df() produces blocks in correct order", {
  skip_if_not(file.exists(sample_json))
  df <- build_block_df(sample_json, type = "async")
  # Block numbers should be sequential within each page
  for (pg in unique(df$page)) {
    page_blocks <- df$block[df$page == pg]
    expect_equal(page_blocks, seq_along(page_blocks))
  }
})

test_that("build_block_df() handles multi-page documents", {
  skip_if_not(file.exists(sample_json_multipage))
  df <- build_block_df(sample_json_multipage, type = "async")
  parsed <- jsonlite::fromJSON(sample_json_multipage)
  expect_equal(max(df$page), nrow(parsed$pages))
  expect_true(all(1:max(df$page) %in% df$page))
})
## SPLIT_BLOCK -----------------------------------------------------------------

test_that("split_block() setup creates test data", {
  skip_if_not(file.exists(sample_json))
  test_df <<- build_block_df(sample_json, type = "async")
  expect_true(is.data.frame(test_df))
})

test_invalid_dataframe(split_block, "block_df", "split_block()",
                       list(block = 1, cut_point = 50))
test_invalid_page(split_block,
                  list(block_df = test_df, block = 1, cut_point = 50),
                  "split_block()")
test_invalid_block(split_block,
                   list(block_df = test_df, page = 1, cut_point = 50),
                   "split_block()")

test_that("split_block() errors with invalid cut_point", {
  expect_error(split_block(block_df = test_df, block = 1, cut_point = "middle"),
               "Invalid cut point parameter")
  expect_error(split_block(block_df = test_df, block = 1, cut_point = 0),
               "Cut point out of range")
  expect_error(split_block(block_df = test_df, block = 1, cut_point = 100),
               "Cut point out of range")
  expect_error(split_block(block_df = test_df, block = 1, cut_point = 150),
               "Cut point out of range")
  expect_error(split_block(block_df = test_df, block = 1, cut_point = -1),
               "Cut point out of range")
  expect_error(split_block(block_df = test_df, block = 1, cut_point = 50.5),
               "Invalid cut point parameter")
})

test_that("split_block() errors with invalid direction", {
  expect_error(split_block(block_df = test_df, block = 1, cut_point = 50, direction = 1),
               "Invalid direction parameter")
  expect_error(split_block(block_df = test_df, block = 1, cut_point = 50, direction = c("v", "h")),
               "Invalid direction parameter")
  expect_error(split_block(block_df = test_df, block = 1, cut_point = 50, direction = "horizontal"),
               'Split direction must be either "h" or "v"')
  expect_error(split_block(block_df = test_df, block = 1, cut_point = 50, direction = "x"),
               'Split direction must be either "h" or "v"')
})

test_that("split_block() errors with wrong dataframe format", {
  wrong_df <- data.frame(x = 1:10, y = 1:10)
  expect_error(split_block(block_df = wrong_df, block = 1, cut_point = 50),
               "Dataframe not recognized")
})

test_that("split_block() returns a revised block dataframe - vertical split", {
  skip_if_not(file.exists(sample_json))
  df <- build_block_df(sample_json, type = "async")
  block_to_split <- min(df$block[df$page == 1])
  cut_point <- 50
  new_df <- split_block(df, page = 1, block = block_to_split,
                        cut_point = cut_point, direction = "v")
  expect_equal(nrow(new_df), nrow(df) + 1)
  expect_equal(max(new_df$block[new_df$page == 1]),
               max(df$block[df$page == 1]) + 1)
  expect_true(is.data.frame(new_df))
  expect_equal(colnames(new_df), colnames(df))
})

test_that("split_block() returns a revised block dataframe - horizontal split", {
  skip_if_not(file.exists(sample_json))
  df <- build_block_df(sample_json, type = "async")
  block_to_split <- min(df$block[df$page == 1])
  cut_point <- 50
  new_df <- split_block(df, page = 1, block = block_to_split,
                        cut_point = cut_point, direction = "h")
  expect_equal(nrow(new_df), nrow(df) + 1)
  expect_equal(max(new_df$block[new_df$page == 1]),
               max(df$block[df$page == 1]) + 1)
})
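# Geometry exercised in these tests, assuming cut_point is a percentage of the
# block's width (direction = "v") or height (direction = "h"); this reading is
# inferred from the coordinate assertions, not from split_block() internals:
#
#   vertical split of [left, right] at cut_point = 50:
#     old block keeps [left, left + 0.5 * (right - left)]
#     new block gets  [left + 0.5 * (right - left), right]
#
# top/bottom stay unchanged; horizontal splits do the same on [top, bottom].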
test_that("split_block() horizontal split creates correct coordinates", {
  skip_if_not(file.exists(sample_json))
  df <- build_block_df(sample_json, type = "async")
  old_block <- df[df$block == 1 & df$page == 1, ]
  new_df <- split_block(df, page = 1, block = 1, cut_point = 50, direction = "h")
  modified_block <- new_df[new_df$block == 1 & new_df$page == 1, ]
  new_block <- new_df[new_df$block == max(new_df$block[new_df$page == 1]) &
                        new_df$page == 1, ]
  # Old block should have smaller bottom boundary after horizontal split
  expect_lt(modified_block$bottom, old_block$bottom)
  expect_equal(modified_block$top, old_block$top)
  # New block should span from cut to original bottom
  expect_equal(new_block$top, modified_block$bottom)
  expect_equal(new_block$bottom, old_block$bottom)
  # Horizontal split: left and right should match original
  expect_equal(new_block$left, old_block$left)
  expect_equal(new_block$right, old_block$right)
})

test_that("split_block() handles multi-page documents correctly", {
  skip_if_not(file.exists(sample_json_multipage))
  df <- build_block_df(sample_json_multipage, type = "async")
  # Split a block on page 2
  df_page2 <- df[df$page == 2, ]
  if (nrow(df_page2) > 0) {
    block_to_split <- min(df_page2$block)
    cut_point <- 50
    new_df <- split_block(df, page = 2, block = block_to_split, cut_point = cut_point)
    expect_equal(nrow(new_df), nrow(df) + 1)
    # Other pages should be unchanged
    expect_equal(nrow(new_df[new_df$page == 1, ]), nrow(df[df$page == 1, ]))
  }
})

test_that("split_block() correctly modifies old block coordinates - vertical", {
  skip_if_not(file.exists(sample_json))
  df <- build_block_df(sample_json, type = "async")
  old_block <- df[df$block == 1 & df$page == 1, ]
  new_df <- split_block(df, page = 1, block = 1, cut_point = 50, direction = "v")
  modified_block <- new_df[new_df$block == 1 & new_df$page == 1, ]
  # Old block should have smaller right boundary after vertical split
  expect_lt(modified_block$right, old_block$right)
  expect_equal(modified_block$left, old_block$left)
})

test_that("split_block() accepts case-insensitive direction parameter", {
  skip_if_not(file.exists(sample_json))
  df <- build_block_df(sample_json, type = "async")
  expect_no_error(split_block(df, block = 1, cut_point = 50, direction = "V"))
  expect_no_error(split_block(df, block = 1, cut_point = 50, direction = "H"))
})

test_that("split_block() handles edge case split points", {
  skip_if_not(file.exists(sample_json))
  df <- build_block_df(sample_json, type = "async")
  # Very small cut (1%)
  new_df_1 <- split_block(df, block = 1, cut_point = 1)
  expect_equal(nrow(new_df_1), nrow(df) + 1)
  # Very large cut (99%)
  new_df_99 <- split_block(df, block = 1, cut_point = 99)
  expect_equal(nrow(new_df_99), nrow(df) + 1)
})

## REASSIGN_TOKENS -------------------------------------------------------------

test_that("reassign_tokens() setup creates test data", {
  skip_if_not(file.exists(sample_json))
  test_token_df <<- build_token_df(sample_json, type = "async")
  test_block_df <<- build_block_df(sample_json, type = "async")
  expect_true(is.data.frame(test_token_df))
  expect_true(is.data.frame(test_block_df))
})

test_invalid_dataframe(reassign_tokens, "token_df", "reassign_tokens()",
                       list(block_df = test_block_df))
test_invalid_dataframe(reassign_tokens, "block_df", "reassign_tokens()",
                       list(token_df = test_token_df))

test_that("reassign_tokens() errors with wrong dataframe format", {
  wrong_token_df <- data.frame(x = 1:10, y = 1:10)
  expect_error(reassign_tokens(wrong_token_df, test_block_df),
               "Token dataframe not recognized")
  wrong_block_df <- data.frame(x = 1:10, y = 1:10)
  expect_error(reassign_tokens(test_token_df, wrong_block_df),
               "Block dataframe not recognized")
})

test_that("reassign_tokens() returns a revised token dataframe", {
  skip_if_not(file.exists(sample_json))
  tdf_old <- build_token_df(sample_json, type = "async")
  bdf_old <- build_block_df(sample_json, type = "async")
  # Split a block
  bdf_new <- split_block(bdf_old, page = 1, block = 1, cut_point = 50)
  tdf_new <- reassign_tokens(tdf_old, bdf_new)
  expect_true(is.data.frame(tdf_new))
  expect_equal(colnames(tdf_new), colnames(tdf_old))
  expect_equal(nrow(tdf_new), nrow(tdf_old))
})
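# The split/reassign pipeline under test, as a minimal end-to-end sketch
# (the same calls the tests above and below make, shown in sequence):
if (FALSE) {
  tdf <- build_token_df(sample_json, type = "async")
  bdf <- build_block_df(sample_json, type = "async")
  bdf2 <- split_block(bdf, page = 1, block = 1, cut_point = 50, direction = "v")
  tdf2 <- reassign_tokens(tdf, bdf2)  # same tokens, updated block assignments
}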
test_that("reassign_tokens() maintains token order", {
  skip_if_not(file.exists(sample_json))
  tdf_old <- build_token_df(sample_json, type = "async")
  bdf_old <- build_block_df(sample_json, type = "async")
  bdf_new <- split_block(bdf_old, page = 1, block = 1, cut_point = 50)
  tdf_new <- reassign_tokens(tdf_old, bdf_new)
  # Start indices should still be sorted
  expect_false(is.unsorted(tdf_new$start_ind))
})

test_that("reassign_tokens() handles multi-page documents", {
  skip_if_not(file.exists(sample_json_multipage))
  tdf <- build_token_df(sample_json_multipage, type = "async")
  bdf <- build_block_df(sample_json_multipage, type = "async")
  bdf_new <- split_block(bdf, page = 2, block = 1, cut_point = 50)
  tdf_new <- reassign_tokens(tdf, bdf_new)
  expect_equal(nrow(tdf_new), nrow(tdf))
  expect_equal(max(tdf_new$page), max(tdf$page))
})

test_that("reassign_tokens() correctly assigns tokens to new blocks", {
  skip_if_not(file.exists(sample_json))
  tdf_old <- build_token_df(sample_json, type = "async")
  bdf_old <- build_block_df(sample_json, type = "async")
  # Find a block with multiple tokens to ensure split creates two non-empty blocks
  # Count tokens per block
  token_counts <- table(tdf_old$block[tdf_old$page == 1 & !is.na(tdf_old$block)])
  # Skip if no blocks have tokens
  skip_if(length(token_counts) == 0, "No blocks with tokens on page 1")
  # Find a block with at least 4 tokens (more likely to split evenly)
  blocks_with_tokens <- as.integer(names(token_counts[token_counts >= 4]))
  if (length(blocks_with_tokens) > 0) {
    # Use first block with multiple tokens
    block_to_split <- blocks_with_tokens[1]
  } else {
    # Fall back to any block with tokens
    block_to_split <- as.integer(names(token_counts[token_counts > 0])[1])
  }
  # Split the block vertically at 50%
  bdf_new <- split_block(bdf_old, page = 1, block = block_to_split,
                         cut_point = 50, direction = "v")
  tdf_new <- reassign_tokens(tdf_old, bdf_new)
  # Verify reassignment worked
  expect_equal(nrow(tdf_new), nrow(tdf_old))
  expect_equal(max(tdf_new$block, na.rm = TRUE), max(bdf_new$block, na.rm = TRUE))
  # Check that the new block exists in the block dataframe
  new_block_num <- max(bdf_new$block[bdf_new$page == 1])
  expect_true(new_block_num %in% bdf_new$block[bdf_new$page == 1])
  # Optionally check if new block has tokens
  tokens_in_new_block <- sum(tdf_new$block[tdf_new$page == 1] == new_block_num,
                             na.rm = TRUE)
  if (tokens_in_new_block > 0) {
    message(paste("New block", new_block_num, "has", tokens_in_new_block, "token(s)"))
  }
})

## REASSIGN_TOKENS2 ------------------------------------------------------------

test_that("reassign_tokens2() setup creates test data", {
  skip_if_not(file.exists(sample_json))
  test_block_single <<- test_block_df[1, ]
  expect_equal(nrow(test_block_single), 1)
})

test_invalid_dataframe(reassign_tokens2, "token_df", "reassign_tokens2()",
                       list(block = test_block_single))
test_invalid_dataframe(reassign_tokens2, "block", "reassign_tokens2()",
                       list(token_df = test_token_df))
test_invalid_page(reassign_tokens2,
                  list(token_df = test_token_df, block = test_block_single),
                  "reassign_tokens2()")

test_that("reassign_tokens2() errors with wrong dataframe format", {
  wrong_token_df <- data.frame(x = 1:10, y = 1:10)
  expect_error(reassign_tokens2(wrong_token_df, test_block_single),
               "Token dataframe not recognized")
  wrong_block_df <- data.frame(x = 1:10, y = 1:10)
  expect_error(reassign_tokens2(test_token_df, wrong_block_df),
               "Block dataframe format not recognized")
})
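# Unlike reassign_tokens(), reassign_tokens2() takes a single block as a
# one-row block dataframe (as the tests above set up via test_block_df[1, ]).
# A minimal sketch with a hand-built block; the column set mirrors the block
# dataframe, and the coordinate values are made up:
if (FALSE) {
  manual_block <- data.frame(page = 1, block = 1, conf = NA,
                             left = 0.1, right = 0.9, top = 0.1, bottom = 0.5)
  tdf <- build_token_df(sample_json, type = "async")
  tdf2 <- reassign_tokens2(tdf, manual_block)
}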
test_that("reassign_tokens2() returns a revised token dataframe", {
  skip_if_not(file.exists(sample_json_pesh) && file.exists(sample_json_labelme))
  tdf_old <- build_token_df(sample_json_pesh, type = "async")
  block <- from_labelme(sample_json_labelme)
  tdf_new <- reassign_tokens2(tdf_old, block)
  expect_true(is.data.frame(tdf_new))
  expect_equal(nrow(tdf_new), nrow(tdf_old))
  expect_equal(colnames(tdf_new), colnames(tdf_old))
})

test_that("reassign_tokens2() maintains structure", {
  skip_if_not(file.exists(sample_json_pesh) && file.exists(sample_json_labelme))
  tdf_old <- build_token_df(sample_json_pesh, type = "async")
  block <- from_labelme(sample_json_labelme)
  tdf_new <- reassign_tokens2(tdf_old, block)
  # Should maintain all original tokens
  expect_equal(nrow(tdf_new), nrow(tdf_old))
  # Start indices should still be sorted
  expect_false(is.unsorted(tdf_new$start_ind))
})

test_that("reassign_tokens2() correctly assigns tokens within block boundaries", {
  skip_if_not(file.exists(sample_json_pesh) && file.exists(sample_json_labelme))
  tdf_old <- build_token_df(sample_json_pesh, type = "async")
  block <- from_labelme(sample_json_labelme)
  tdf_new <- reassign_tokens2(tdf_old, block)
  # Tokens assigned to this block should be within its boundaries
  assigned_tokens <- tdf_new[tdf_new$block == block$block & !is.na(tdf_new$block), ]
  if (nrow(assigned_tokens) > 0) {
    expect_true(all(assigned_tokens$left >= block$left))
    expect_true(all(assigned_tokens$right <= block$right))
    expect_true(all(assigned_tokens$top >= block$top))
    expect_true(all(assigned_tokens$bottom <= block$bottom))
  }
})

## FROM_LABELME ----------------------------------------------------------------

test_invalid_json(from_labelme, "from_labelme()")

test_that("from_labelme() errors with invalid page parameter", {
  # from_labelme() only validates page type/format, not existence
  expect_error(from_labelme(sample_json_labelme, page = -1), "Invalid page parameter.")
  expect_error(from_labelme(sample_json_labelme, page = 0), "Invalid page parameter.")
  expect_error(from_labelme(sample_json_labelme, page = 1.5), "Invalid page parameter.")
  expect_error(from_labelme(sample_json_labelme, page = "one"), "Invalid page parameter.")
  expect_error(from_labelme(sample_json_labelme, page = c(1, 2)), "Invalid page parameter.")
  # Page 999 is valid - from_labelme just creates a dataframe with that page number
  expect_no_error(from_labelme(sample_json_labelme, page = 999))
})

test_that("from_labelme() produces a properly formatted dataframe", {
  skip_if_not(file.exists(sample_json_labelme))
  df <- from_labelme(sample_json_labelme)
  expect_true(is.data.frame(df))
  expect_equal(nrow(df), 1)
  expect_setequal(colnames(df),
                  c("page", "block", "conf", "left", "right", "top", "bottom"))
  # Check types
  expect_true(is.numeric(df$page))
  expect_true(is.numeric(df$block))
  expect_true(is.numeric(df$left))
  expect_true(is.numeric(df$right))
  expect_true(is.numeric(df$top))
  expect_true(is.numeric(df$bottom))
})

test_that("from_labelme() produces valid normalized coordinates", {
  skip_if_not(file.exists(sample_json_labelme))
  df <- from_labelme(sample_json_labelme)
  # Coordinates should be between 0 and 1 (normalized)
  expect_true(df$left >= 0 & df$left <= 1)
  expect_true(df$right >= 0 & df$right <= 1)
  expect_true(df$top >= 0 & df$top <= 1)
  expect_true(df$bottom >= 0 & df$bottom <= 1)
  # Left should be less than right, top less than bottom
  expect_true(df$left < df$right)
  expect_true(df$top < df$bottom)
})
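# As the tests above assert, from_labelme() turns a labelme annotation file
# into a one-row block dataframe with the same columns and normalized
# coordinates as build_block_df() output, so it can feed straight into
# reassign_tokens2(). A usage sketch with the fixtures defined at the top:
if (FALSE) {
  block <- from_labelme(sample_json_labelme, page = 1)
  tdf <- build_token_df(sample_json_pesh, type = "async")
  tdf_new <- reassign_tokens2(tdf, block)
}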
test_that("from_labelme() accepts custom page parameter", {
  skip_if_not(file.exists(sample_json_labelme))
  df_page1 <- from_labelme(sample_json_labelme, page = 1)
  df_page5 <- from_labelme(sample_json_labelme, page = 5)
  expect_equal(df_page1$page, 1)
  expect_equal(df_page5$page, 5)
})

## REDRAW_BLOCKS ---------------------------------------------------------------

test_that("redraw_blocks() errors with invalid json parameter", {
  expect_error(redraw_blocks(json = vector_strings, token_df = test_token_df),
               "Invalid json parameter")
  expect_error(redraw_blocks(json = number_random, token_df = test_token_df),
               "Invalid json parameter")
  expect_error(redraw_blocks(json = c(sample_json, sample_json), token_df = test_token_df),
               "not vectorised")
})

test_invalid_dataframe(redraw_blocks, "token_df", "redraw_blocks()",
                       list(json = sample_json))

test_that("redraw_blocks() errors with wrong dataframe format", {
  wrong_df <- data.frame(x = 1:10, y = 1:10)
  expect_error(redraw_blocks(json = sample_json, token_df = wrong_df),
               "Token dataframe format not recognized")
})

test_that("redraw_blocks() errors on invalid JSON structures", {
  # JSON without pages field
  json_no_pages <- tempfile(fileext = ".json")
  jsonlite::write_json(list(text = "test"), json_no_pages)
  expect_error(redraw_blocks(json = json_no_pages, token_df = test_token_df),
               "does not contain valid Document AI response with images")
  unlink(json_no_pages)
})

test_that("redraw_blocks() errors when JSON lacks images", {
  skip_if_not(file.exists(sample_json))
  token_df <- build_token_df(sample_json, type = "async")
  expect_error(redraw_blocks(json = sample_json, token_df = token_df, dir = tempdir()),
               "does not contain valid Document AI response with images")
})

test_that("redraw_blocks() generates annotated images when images present", {
  skip_on_cran()
  skip_on_ci()
  skip_if_not_installed("magick")
  # This test requires a JSON with embedded images
  # Skip if no suitable test file exists
  skip("Requires JSON file with base64-encoded images")
})
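# redraw_blocks() usage, as a sketch: it needs a JSON that still carries the
# base64-encoded page images, which none of the current fixtures do (hence the
# skips above). The filename is hypothetical; the signature matches the calls
# in these tests:
if (FALSE) {
  tdf <- build_token_df("scan_with_images.json", type = "async")
  redraw_blocks(json = "scan_with_images.json", token_df = tdf, dir = tempdir())
}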
## INTEGRATION TESTS -----------------------------------------------------------

test_that("build_token_df() and build_block_df() produce compatible dataframes", {
  skip_if_not(file.exists(sample_json))
  token_df <- build_token_df(sample_json, type = "async")
  block_df <- build_block_df(sample_json, type = "async")
  # Should have same number of pages
  expect_equal(max(token_df$page), max(block_df$page))
  # All tokens should be assigned to blocks that exist
  for (pg in unique(token_df$page)) {
    page_tokens <- token_df[token_df$page == pg, ]
    page_blocks <- block_df[block_df$page == pg, ]
    assigned_blocks <- unique(page_tokens$block[!is.na(page_tokens$block)])
    if (length(assigned_blocks) > 0) {
      expect_true(all(assigned_blocks %in% page_blocks$block))
    }
  }
})

test_that("Full workflow: build, split, reassign", {
  skip_if_not(file.exists(sample_json))
  # Build initial dataframes
  token_df <- build_token_df(sample_json, type = "async")
  block_df <- build_block_df(sample_json, type = "async")
  # Find a block with multiple tokens to ensure meaningful split
  token_counts <- table(token_df$block[token_df$page == 1 & !is.na(token_df$block)])
  if (length(token_counts[token_counts >= 4]) > 0) {
    # Use a block with multiple tokens
    block_to_split <- as.integer(names(token_counts[token_counts >= 4])[1])
  } else {
    # Fall back to any block with tokens
    block_to_split <- as.integer(names(token_counts[token_counts > 0])[1])
  }
  # Split the block
  new_block_df <- split_block(block_df, page = 1, block = block_to_split, cut_point = 50)
  # Reassign tokens
  new_token_df <- reassign_tokens(token_df, new_block_df)
  # Check results
  expect_equal(nrow(new_block_df), nrow(block_df) + 1)
  expect_equal(nrow(new_token_df), nrow(token_df))
  # The max block number in tokens should be <= max in blocks
  # (empty blocks may not appear among the tokens)
  expect_true(max(new_token_df$block, na.rm = TRUE) <= max(new_block_df$block))
  # All token blocks should exist in block dataframe
  token_blocks <- unique(new_token_df$block[!is.na(new_token_df$block)])
  expect_true(all(token_blocks %in% new_block_df$block))
})

test_that("Multiple splits on same document", {
  skip_if_not(file.exists(sample_json))
  token_df <- build_token_df(sample_json, type = "async")
  block_df <- build_block_df(sample_json, type = "async")
  # First split
  block_df_2 <- split_block(block_df, page = 1, block = 1, cut_point = 33)
  # Second split on different block
  if (max(block_df$block[block_df$page == 1]) > 1) {
    block_df_3 <- split_block(block_df_2, page = 1, block = 2, cut_point = 67)
    expect_equal(nrow(block_df_3), nrow(block_df) + 2)
    # Reassign tokens after multiple splits
    token_df_new <- reassign_tokens(token_df, block_df_3)
    expect_equal(nrow(token_df_new), nrow(token_df))
  }
})

## CLEANUP ---------------------------------------------------------------------

unlink(madeup_json_file, force = TRUE)