# Tests for Arrow/Parquet support in compare_datasets_from_yaml.
#
# Uses arrow::read_parquet(as_data_frame=FALSE) for Arrow Tables and
# arrow::open_dataset() for Arrow Datasets. Both are converted to
# DuckDB-backed lazy tables via arrow::to_duckdb() before pointblank
# validation, keeping the pipeline fully lazy.
#
# NOTE(review): the YAML fixtures in this file are written flush-left inside
# their string literals so that nesting is defined only by the indentation
# shown. The nesting was reconstructed from key order (version / defaults /
# row_validation / by_type as top-level keys) -- confirm against the rules
# schema read by read_rules().

# ---- helpers ----------------------------------------------------------------

# Round-trip a data.frame through a temporary Parquet file and read it back as
# an Arrow Table (as_data_frame = FALSE). Skips the calling test when arrow or
# duckdb is not installed.
as_arrow_table <- function(df) {
  skip_if_not_installed("arrow")
  skip_if_not_installed("duckdb")
  p <- tempfile(fileext = ".parquet")
  arrow::write_parquet(df, p)
  arrow::read_parquet(p, as_data_frame = FALSE)
}

# Write a data.frame as a single Parquet part file inside a fresh temporary
# directory and open that directory as an Arrow Dataset. Skips the calling
# test when arrow or duckdb is not installed.
as_arrow_dataset <- function(df) {
  skip_if_not_installed("arrow")
  skip_if_not_installed("duckdb")
  d <- tempfile()
  dir.create(d)
  arrow::write_parquet(df, file.path(d, "part-0.parquet"))
  arrow::open_dataset(d)
}

# Write `content` to a temporary .yaml file and return the file path.
yaml_path <- function(content) {
  p <- tempfile(fileext = ".yaml")
  writeLines(content, p)
  p
}

# ---- identical data (with key) ---------------------------------------------

test_that("arrow table: identical data pass (with key)", {
  ref <- data.frame(id = 1:5, value = c(1.0, 2.0, 3.0, 4.0, 5.0))
  cand <- data.frame(id = 1:5, value = c(1.0, 2.0, 3.0, 4.0, 5.0))
  result <- compare_datasets_from_yaml(
    as_arrow_table(ref), as_arrow_table(cand),
    key = "id"
  )
  expect_true(result$all_passed)
})

test_that("arrow dataset: identical data pass (with key)", {
  ref <- data.frame(id = 1:5, value = c(1.0, 2.0, 3.0, 4.0, 5.0))
  cand <- data.frame(id = 1:5, value = c(1.0, 2.0, 3.0, 4.0, 5.0))
  result <- compare_datasets_from_yaml(
    as_arrow_dataset(ref), as_arrow_dataset(cand),
    key = "id"
  )
  expect_true(result$all_passed)
})

# ---- tolerance abs (pass) ---------------------------------------------------

test_that("arrow table: tolerance abs pass", {
  ref <- data.frame(id = 1:3, value = c(1.0, 2.0, 3.0))
  cand <- data.frame(id = 1:3, value = c(1.005, 2.005, 3.005))
  p <- yaml_path("
version: 1
defaults:
  na_equal: yes
row_validation:
  check_count: no
by_type:
  numeric:
    abs: 0.01
")
  result <- compare_datasets_from_yaml(
    as_arrow_table(ref), as_arrow_table(cand),
    key = "id", path = p
  )
  expect_true(result$all_passed)
})

test_that("arrow dataset: tolerance abs pass", {
  ref <- data.frame(id = 1:3, value = c(1.0, 2.0, 3.0))
  cand <- data.frame(id = 1:3, value = c(1.005, 2.005, 3.005))
  p <- yaml_path("
version: 1
defaults:
  na_equal: yes
row_validation:
  check_count: no
by_type:
  numeric:
    abs: 0.01
")
  result <- compare_datasets_from_yaml(
    as_arrow_dataset(ref), as_arrow_dataset(cand),
    key = "id", path = p
  )
  expect_true(result$all_passed)
})

# ---- tolerance abs (fail) ---------------------------------------------------

test_that("arrow table: tolerance abs fail", {
  ref <- data.frame(id = 1:3, value = c(1.0, 2.0, 3.0))
  cand <- data.frame(id = 1:3, value = c(1.5, 2.5, 3.5))
  p <- yaml_path("
version: 1
defaults:
  na_equal: yes
row_validation:
  check_count: no
by_type:
  numeric:
    abs: 0.01
")
  result <- compare_datasets_from_yaml(
    as_arrow_table(ref), as_arrow_table(cand),
    key = "id", path = p
  )
  expect_false(result$all_passed)
})

test_that("arrow dataset: tolerance abs fail", {
  ref <- data.frame(id = 1:3, value = c(1.0, 2.0, 3.0))
  cand <- data.frame(id = 1:3, value = c(1.5, 2.5, 3.5))
  p <- yaml_path("
version: 1
defaults:
  na_equal: yes
row_validation:
  check_count: no
by_type:
  numeric:
    abs: 0.01
")
  result <- compare_datasets_from_yaml(
    as_arrow_dataset(ref), as_arrow_dataset(cand),
    key = "id", path = p
  )
  expect_false(result$all_passed)
})

# ---- tolerance relative (pass) ----------------------------------------------

test_that("arrow table: tolerance rel pass", {
  ref <- data.frame(id = 1:3, value = c(100.0, 200.0, 300.0))
  cand <- data.frame(id = 1:3, value = c(101.0, 202.0, 303.0))
  p <- yaml_path("
version: 1
defaults:
  na_equal: yes
row_validation:
  check_count: no
by_type:
  numeric:
    abs: 0
    rel: 0.02
")
  result <- compare_datasets_from_yaml(
    as_arrow_table(ref), as_arrow_table(cand),
    key = "id", path = p
  )
  expect_true(result$all_passed)
})

test_that("arrow dataset: tolerance rel pass", {
  ref <- data.frame(id = 1:3, value = c(100.0, 200.0, 300.0))
  cand <- data.frame(id = 1:3, value = c(101.0, 202.0, 303.0))
  p <- yaml_path("
version: 1
defaults:
  na_equal: yes
row_validation:
  check_count: no
by_type:
  numeric:
    abs: 0
    rel: 0.02
")
  result <- compare_datasets_from_yaml(
    as_arrow_dataset(ref), as_arrow_dataset(cand),
    key = "id", path = p
  )
  expect_true(result$all_passed)
})

# ---- tolerance relative (fail) ----------------------------------------------

test_that("arrow table: tolerance rel fail", {
  ref <- data.frame(id = 1:3, value = c(100.0, 200.0, 300.0))
  cand <- data.frame(id = 1:3, value = c(120.0, 240.0, 360.0))
  p <- yaml_path("
version: 1
defaults:
  na_equal: yes
row_validation:
  check_count: no
by_type:
  numeric:
    abs: 0
    rel: 0.02
")
  result <- compare_datasets_from_yaml(
    as_arrow_table(ref), as_arrow_table(cand),
    key = "id", path = p
  )
  expect_false(result$all_passed)
})

test_that("arrow dataset: tolerance rel fail", {
  ref <- data.frame(id = 1:3, value = c(100.0, 200.0, 300.0))
  cand <- data.frame(id = 1:3, value = c(120.0, 240.0, 360.0))
  p <- yaml_path("
version: 1
defaults:
  na_equal: yes
row_validation:
  check_count: no
by_type:
  numeric:
    abs: 0
    rel: 0.02
")
  result <- compare_datasets_from_yaml(
    as_arrow_dataset(ref), as_arrow_dataset(cand),
    key = "id", path = p
  )
  expect_false(result$all_passed)
})

# ---- positional comparison (no key) ----------------------------------------

test_that("arrow table: positional comparison (no key)", {
  ref <- data.frame(value = c(1.0, 2.0, 3.0), name = c("a", "b", "c"))
  cand <- data.frame(value = c(1.0, 2.0, 3.0), name = c("a", "b", "c"))
  result <- compare_datasets_from_yaml(
    as_arrow_table(ref), as_arrow_table(cand)
  )
  expect_true(result$all_passed)
})

test_that("arrow dataset: positional comparison (no key)", {
  ref <- data.frame(value = c(1.0, 2.0, 3.0), name = c("a", "b", "c"))
  cand <- data.frame(value = c(1.0, 2.0, 3.0), name = c("a", "b", "c"))
  result <- compare_datasets_from_yaml(
    as_arrow_dataset(ref), as_arrow_dataset(cand)
  )
  expect_true(result$all_passed)
})

# ---- NA with na_equal=TRUE (numeric) ----------------------------------------

test_that("arrow table: NA numeric na_equal=TRUE", {
  ref <- data.frame(id = 1:3, value = c(1.0, NA, 3.0))
  cand <- data.frame(id = 1:3, value = c(1.0, NA, 3.0))
  p <- yaml_path("
version: 1
defaults:
  na_equal: yes
row_validation:
  check_count: no
by_type:
  numeric:
    abs: 0
")
  result <- compare_datasets_from_yaml(
    as_arrow_table(ref), as_arrow_table(cand),
    key = "id", path = p
  )
  expect_true(result$all_passed)
})

test_that("arrow dataset: NA numeric na_equal=TRUE", {
  ref <- data.frame(id = 1:3, value = c(1.0, NA, 3.0))
  cand <- data.frame(id = 1:3, value = c(1.0, NA, 3.0))
  p <- yaml_path("
version: 1
defaults:
  na_equal: yes
row_validation:
  check_count: no
by_type:
  numeric:
    abs: 0
")
  result <- compare_datasets_from_yaml(
    as_arrow_dataset(ref), as_arrow_dataset(cand),
    key = "id", path = p
  )
  expect_true(result$all_passed)
})

# ---- NA with na_equal=TRUE (character) --------------------------------------

test_that("arrow table: NA character na_equal=TRUE", {
  ref <- data.frame(
    id = 1:3, name = c("a", NA, "c"),
    stringsAsFactors = FALSE
  )
  cand <- data.frame(
    id = 1:3, name = c("a", NA, "c"),
    stringsAsFactors = FALSE
  )
  p <- yaml_path("
version: 1
defaults:
  na_equal: yes
row_validation:
  check_count: no
")
  result <- compare_datasets_from_yaml(
    as_arrow_table(ref), as_arrow_table(cand),
    key = "id", path = p
  )
  expect_true(result$all_passed)
})

test_that("arrow dataset: NA character na_equal=TRUE", {
  ref <- data.frame(
    id = 1:3, name = c("a", NA, "c"),
    stringsAsFactors = FALSE
  )
  cand <- data.frame(
    id = 1:3, name = c("a", NA, "c"),
    stringsAsFactors = FALSE
  )
  p <- yaml_path("
version: 1
defaults:
  na_equal: yes
row_validation:
  check_count: no
")
  result <- compare_datasets_from_yaml(
    as_arrow_dataset(ref), as_arrow_dataset(cand),
    key = "id", path = p
  )
  expect_true(result$all_passed)
})

# ---- NA with na_equal=FALSE -------------------------------------------------

test_that("arrow table: NA na_equal=FALSE fails", {
  ref <- data.frame(id = 1:3, value = c(1.0, NA, 3.0))
  cand <- data.frame(id = 1:3, value = c(1.0, NA, 3.0))
  p <- yaml_path("
version: 1
defaults:
  na_equal: no
row_validation:
  check_count: no
by_type:
  numeric:
    abs: 0
")
  result <- compare_datasets_from_yaml(
    as_arrow_table(ref), as_arrow_table(cand),
    key = "id", path = p
  )
  expect_false(result$all_passed)
})

test_that("arrow dataset: NA na_equal=FALSE fails", {
  ref <- data.frame(id = 1:3, value = c(1.0, NA, 3.0))
  cand <- data.frame(id = 1:3, value = c(1.0, NA, 3.0))
  p <- yaml_path("
version: 1
defaults:
  na_equal: no
row_validation:
  check_count: no
by_type:
  numeric:
    abs: 0
")
  result <- compare_datasets_from_yaml(
    as_arrow_dataset(ref), as_arrow_dataset(cand),
    key = "id", path = p
  )
  expect_false(result$all_passed)
})

# ---- preprocessing case_insensitive ----------------------------------------

test_that("arrow table: case insensitive comparison", {
  ref <- data.frame(
    id = 1:3, name = c("Hello", "WORLD", "Foo"),
    stringsAsFactors = FALSE
  )
  cand <- data.frame(
    id = 1:3, name = c("hello", "world", "foo"),
    stringsAsFactors = FALSE
  )
  p <- yaml_path("
version: 1
defaults:
  na_equal: yes
row_validation:
  check_count: no
by_type:
  character:
    case_insensitive: yes
")
  result <- compare_datasets_from_yaml(
    as_arrow_table(ref), as_arrow_table(cand),
    key = "id", path = p
  )
  expect_true(result$all_passed)
})

test_that("arrow dataset: case insensitive comparison", {
  ref <- data.frame(
    id = 1:3, name = c("Hello", "WORLD", "Foo"),
    stringsAsFactors = FALSE
  )
  cand <- data.frame(
    id = 1:3, name = c("hello", "world", "foo"),
    stringsAsFactors = FALSE
  )
  p <- yaml_path("
version: 1
defaults:
  na_equal: yes
row_validation:
  check_count: no
by_type:
  character:
    case_insensitive: yes
")
  result <- compare_datasets_from_yaml(
    as_arrow_dataset(ref), as_arrow_dataset(cand),
    key = "id", path = p
  )
  expect_true(result$all_passed)
})

# ---- preprocessing trim -----------------------------------------------------

test_that("arrow table: trim whitespace comparison", {
  ref <- data.frame(
    id = 1:3, name = c(" a ", " b ", "c"),
    stringsAsFactors = FALSE
  )
  cand <- data.frame(
    id = 1:3, name = c("a", "b", "c"),
    stringsAsFactors = FALSE
  )
  p <- yaml_path("
version: 1
defaults:
  na_equal: yes
row_validation:
  check_count: no
by_type:
  character:
    trim: yes
")
  result <- compare_datasets_from_yaml(
    as_arrow_table(ref), as_arrow_table(cand),
    key = "id", path = p
  )
  expect_true(result$all_passed)
})

test_that("arrow dataset: trim whitespace comparison", {
  ref <- data.frame(
    id = 1:3, name = c(" a ", " b ", "c"),
    stringsAsFactors = FALSE
  )
  cand <- data.frame(
    id = 1:3, name = c("a", "b", "c"),
    stringsAsFactors = FALSE
  )
  p <- yaml_path("
version: 1
defaults:
  na_equal: yes
row_validation:
  check_count: no
by_type:
  character:
    trim: yes
")
  result <- compare_datasets_from_yaml(
    as_arrow_dataset(ref), as_arrow_dataset(cand),
    key = "id", path = p
  )
  expect_true(result$all_passed)
})

# ---- preprocessing trim + case combined ------------------------------------

test_that("arrow table: trim + case insensitive combined", {
  ref <- data.frame(
    id = 1:3, name = c(" Hello ", " WORLD ", "Foo"),
    stringsAsFactors = FALSE
  )
  cand <- data.frame(
    id = 1:3, name = c("hello", "world", "foo"),
    stringsAsFactors = FALSE
  )
  p <- yaml_path("
version: 1
defaults:
  na_equal: yes
row_validation:
  check_count: no
by_type:
  character:
    case_insensitive: yes
    trim: yes
")
  result <- compare_datasets_from_yaml(
    as_arrow_table(ref), as_arrow_table(cand),
    key = "id", path = p
  )
  expect_true(result$all_passed)
})

test_that("arrow dataset: trim + case insensitive combined", {
  ref <- data.frame(
    id = 1:3, name = c(" Hello ", " WORLD ", "Foo"),
    stringsAsFactors = FALSE
  )
  cand <- data.frame(
    id = 1:3, name = c("hello", "world", "foo"),
    stringsAsFactors = FALSE
  )
  p <- yaml_path("
version: 1
defaults:
  na_equal: yes
row_validation:
  check_count: no
by_type:
  character:
    case_insensitive: yes
    trim: yes
")
  result <- compare_datasets_from_yaml(
    as_arrow_dataset(ref), as_arrow_dataset(cand),
    key = "id", path = p
  )
  expect_true(result$all_passed)
})

# ---- mixed types (numeric, integer, char) -----------------------------------

test_that("arrow table: mixed types", {
  ref <- data.frame(
    id = 1:3, val = c(1.0, 2.0, 3.0), count = 10L:12L,
    label = c("a", "b", "c"),
    stringsAsFactors = FALSE
  )
  cand <- data.frame(
    id = 1:3, val = c(1.0, 2.0, 3.0), count = 10L:12L,
    label = c("a", "b", "c"),
    stringsAsFactors = FALSE
  )
  result <- compare_datasets_from_yaml(
    as_arrow_table(ref), as_arrow_table(cand),
    key = "id"
  )
  expect_true(result$all_passed)
})

test_that("arrow dataset: mixed types", {
  ref <- data.frame(
    id = 1:3, val = c(1.0, 2.0, 3.0), count = 10L:12L,
    label = c("a", "b", "c"),
    stringsAsFactors = FALSE
  )
  cand <- data.frame(
    id = 1:3, val = c(1.0, 2.0, 3.0), count = 10L:12L,
    label = c("a", "b", "c"),
    stringsAsFactors = FALSE
  )
  result <- compare_datasets_from_yaml(
    as_arrow_dataset(ref), as_arrow_dataset(cand),
    key = "id"
  )
  expect_true(result$all_passed)
})

# ---- composite keys ---------------------------------------------------------

test_that("arrow table: composite keys", {
  ref <- data.frame(grp = c("a", "a", "b"), sub = 1:3, value = c(10, 20, 30))
  cand <- data.frame(grp = c("a", "a", "b"), sub = 1:3, value = c(10, 20, 30))
  result <- compare_datasets_from_yaml(
    as_arrow_table(ref), as_arrow_table(cand),
    key = c("grp", "sub")
  )
  expect_true(result$all_passed)
})

test_that("arrow dataset: composite keys", {
  ref <- data.frame(grp = c("a", "a", "b"), sub = 1:3, value = c(10, 20, 30))
  cand <- data.frame(grp = c("a", "a", "b"), sub = 1:3, value = c(10, 20, 30))
  result <- compare_datasets_from_yaml(
    as_arrow_dataset(ref), as_arrow_dataset(cand),
    key = c("grp", "sub")
  )
  expect_true(result$all_passed)
})

# ---- missing column in candidate --------------------------------------------

test_that("arrow table: missing column in candidate", {
  ref <- data.frame(id = 1:3, value = c(1, 2, 3), extra = c(10, 20, 30))
  cand <- data.frame(id = 1:3, value = c(1, 2, 3))
  result <- compare_datasets_from_yaml(
    as_arrow_table(ref), as_arrow_table(cand),
    key = "id"
  )
  expect_false(result$all_passed)
  expect_true("extra" %in% result$missing_in_candidate)
})

test_that("arrow dataset: missing column in candidate", {
  ref <- data.frame(id = 1:3, value = c(1, 2, 3), extra = c(10, 20, 30))
  cand <- data.frame(id = 1:3, value = c(1, 2, 3))
  result <- compare_datasets_from_yaml(
    as_arrow_dataset(ref), as_arrow_dataset(cand),
    key = "id"
  )
  expect_false(result$all_passed)
  expect_true("extra" %in% result$missing_in_candidate)
})

# ---- row count mismatch -----------------------------------------------------

test_that("arrow table: row count mismatch detected", {
  ref <- data.frame(id = 1:5, value = 1:5 * 1.0)
  cand <- data.frame(id = 1:3, value = 1:3 * 1.0)
  p <- yaml_path("
version: 1
defaults:
  na_equal: yes
row_validation:
  check_count: yes
  tolerance: 0
")
  result <- compare_datasets_from_yaml(
    as_arrow_table(ref), as_arrow_table(cand),
    key = "id", path = p
  )
  expect_false(result$all_passed)
})

test_that("arrow dataset: row count mismatch detected", {
  ref <- data.frame(id = 1:5, value = 1:5 * 1.0)
  cand <- data.frame(id = 1:3, value = 1:3 * 1.0)
  p <- yaml_path("
version: 1
defaults:
  na_equal: yes
row_validation:
  check_count: yes
  tolerance: 0
")
  result <- compare_datasets_from_yaml(
    as_arrow_dataset(ref), as_arrow_dataset(cand),
    key = "id", path = p
  )
  expect_false(result$all_passed)
})

# ---- empty tables (0 rows) -------------------------------------------------

test_that("arrow table: empty tables (0 rows) pass", {
  ref <- data.frame(id = integer(0), value = numeric(0))
  cand <- data.frame(id = integer(0), value = numeric(0))
  result <- compare_datasets_from_yaml(
    as_arrow_table(ref), as_arrow_table(cand),
    key = "id"
  )
  expect_true(result$all_passed)
})

test_that("arrow dataset: empty tables (0 rows) pass", {
  ref <- data.frame(id = integer(0), value = numeric(0))
  cand <- data.frame(id = integer(0), value = numeric(0))
  result <- compare_datasets_from_yaml(
    as_arrow_dataset(ref), as_arrow_dataset(cand),
    key = "id"
  )
  expect_true(result$all_passed)
})

# ---- duplicate keys warning -------------------------------------------------

test_that("arrow table: duplicate keys warning", {
  ref <- data.frame(id = c(1, 1, 2), value = c(1, 2, 3))
  cand <- data.frame(id = c(1, 1, 2), value = c(1, 2, 3))
  expect_warning(
    compare_datasets_from_yaml(
      as_arrow_table(ref), as_arrow_table(cand),
      key = "id"
    ),
    "Duplicate keys"
  )
})

test_that("arrow dataset: duplicate keys warning", {
  ref <- data.frame(id = c(1, 1, 2), value = c(1, 2, 3))
  cand <- data.frame(id = c(1, 1, 2), value = c(1, 2, 3))
  expect_warning(
    compare_datasets_from_yaml(
      as_arrow_dataset(ref), as_arrow_dataset(cand),
      key = "id"
    ),
    "Duplicate keys"
  )
})

# ---- ignore_columns ---------------------------------------------------------

# NOTE(review): `ignore_columns` is nested under `defaults` here because it
# directly follows `na_equal` in the fixture; confirm that this is where the
# rules schema expects it (as opposed to a top-level key).

test_that("arrow table: ignore_columns", {
  ref <- data.frame(id = 1:3, value = c(1, 2, 3), noise = c(99, 98, 97))
  cand <- data.frame(id = 1:3, value = c(1, 2, 3), noise = c(0, 0, 0))
  p <- yaml_path("
version: 1
defaults:
  na_equal: yes
  ignore_columns:
    - noise
row_validation:
  check_count: no
")
  result <- compare_datasets_from_yaml(
    as_arrow_table(ref), as_arrow_table(cand),
    key = "id", path = p
  )
  expect_true(result$all_passed)
})

test_that("arrow dataset: ignore_columns", {
  ref <- data.frame(id = 1:3, value = c(1, 2, 3), noise = c(99, 98, 97))
  cand <- data.frame(id = 1:3, value = c(1, 2, 3), noise = c(0, 0, 0))
  p <- yaml_path("
version: 1
defaults:
  na_equal: yes
  ignore_columns:
    - noise
row_validation:
  check_count: no
")
  result <- compare_datasets_from_yaml(
    as_arrow_dataset(ref), as_arrow_dataset(cand),
    key = "id", path = p
  )
  expect_true(result$all_passed)
})

# ---- write_rules_template with Arrow ----------------------------------------

test_that("arrow table: write_rules_template works", {
  ref <- data.frame(
    id = 1:3, value = c(1.0, 2.0, 3.0), name = c("a", "b", "c"),
    stringsAsFactors = FALSE
  )
  tbl <- as_arrow_table(ref)
  p <- tempfile(fileext = ".yaml")
  write_rules_template(tbl, key = "id", path = p)
  rules <- read_rules(p)
  expect_equal(rules$version, 1)
  expect_equal(rules$defaults$keys, "id")
})

test_that("arrow dataset: write_rules_template works", {
  ref <- data.frame(
    id = 1:3, value = c(1.0, 2.0, 3.0), name = c("a", "b", "c"),
    stringsAsFactors = FALSE
  )
  ds <- as_arrow_dataset(ref)
  p <- tempfile(fileext = ".yaml")
  write_rules_template(ds, key = "id", path = p)
  rules <- read_rules(p)
  expect_equal(rules$version, 1)
  expect_equal(rules$defaults$keys, "id")
})