# =============================================================================
# Tests for Phase 3-6 Features
# =============================================================================

# --- Phase 3: join_repair() ---

test_that("join_repair trims whitespace", {
  # Keys carry leading, trailing, and surrounding whitespace respectively.
  x <- data.frame(
    id = c(" A", "B ", " C "),
    val = 1:3,
    stringsAsFactors = FALSE
  )

  result <- join_repair(x, by = "id", trim_whitespace = TRUE)

  expect_equal(result$id, c("A", "B", "C"))
})

test_that("join_repair standardizes case", {
  # Upper, mixed, and lower case keys should all end up lower case.
  x <- data.frame(
    id = c("ABC", "Def", "ghi"),
    val = 1:3,
    stringsAsFactors = FALSE
  )

  result <- join_repair(x, by = "id", standardize_case = "lower")

  expect_equal(result$id, c("abc", "def", "ghi"))
})

test_that("join_repair converts empty to NA", {
  # Only the second key is an empty string.
  x <- data.frame(
    id = c("A", "", "C"),
    val = 1:3,
    stringsAsFactors = FALSE
  )

  result <- join_repair(x, by = "id", empty_to_na = TRUE)

  expect_true(is.na(result$id[2]))
})

test_that("join_repair dry_run mode works", {
  x <- data.frame(
    id = c(" A", "B "),
    val = 1:2,
    stringsAsFactors = FALSE
  )

  result <- join_repair(x, by = "id", dry_run = TRUE)

  # Both keys need repair, so the dry run must report pending changes.
  expect_gt(result$total_changes, 0)
  # Original data unchanged in dry run.
  expect_identical(x$id, c(" A", "B "))
})

test_that("join_repair handles both x and y", {
  x <- data.frame(
    id = c(" A", "B"),
    val = 1:2,
    stringsAsFactors = FALSE
  )
  y <- data.frame(
    id = c("A", "B "),
    name = c("a", "b"),
    stringsAsFactors = FALSE
  )

  # No repair options are passed: the expectations below rely on the default
  # settings stripping the stray whitespace from both tables.
  result <- join_repair(x, y, by = "id")

  expect_equal(result$x$id, c("A", "B"))
  expect_equal(result$y$id, c("A", "B"))
})

# --- Phase 4: Sampling ---

test_that("join_spy sample parameter works", {
  # The non-key columns are random filler; the assertions only inspect the
  # sampling metadata attached to the report.
  x <- data.frame(id = 1:1000, val = rnorm(1000))
  y <- data.frame(id = 1:1000, name = sample(letters, 1000, replace = TRUE))

  report <- join_spy(x, y, by = "id", sample = 100)

  expect_false(is.null(report$sampling))
  expect_true(report$sampling$sampled)
  expect_equal(report$sampling$sample_size, 100)
})

test_that("join_spy includes memory estimates", {
  x <- data.frame(id = 1:100, val = 1:100)
  # Cycle through the alphabet; the modulo result (0..25) is shifted to 1..26
  # so it is a valid index into `letters`.
  y <- data.frame(id = 1:100, name = letters[(seq_len(100) %% 26) + 1])

  report <- join_spy(x, y, by = "id")

  expect_false(is.null(report$memory_estimate))
  expect_type(report$memory_estimate$inner, "character")
})

# --- Phase 5: Visualization ---

test_that("plot() runs without error", {
  # Keys 1:5 vs 3:7 -> 1, 2 only in x; 3, 4, 5 shared; 6, 7 only in y.
  x <- data.frame(id = 1:5, val = 1:5)
  y <- data.frame(id = 3:7, name = letters[3:7])
  report <- join_spy(x, y, by = "id")

  # Should not error
  result <- plot(report)

  expect_equal(result$left_only, 2)
  expect_equal(result$both, 3)
  expect_equal(result$right_only, 2)
})

test_that("summary() returns data frame with format options", {
  x <- data.frame(id = 1:5, val = 1:5)
  y <- data.frame(id = 3:7, name = letters[3:7])
  report <- join_spy(x, y, by = "id")

  result <- summary(report)

  expect_s3_class(result, "data.frame")
  expect_true("metric" %in% names(result))
  expect_true("value" %in% names(result))
})

# --- Phase 6: Advanced Patterns ---

test_that("check_cartesian detects explosion risk", {
  # Key 1 appears 3 times and key 2 twice on BOTH sides, so a join yields
  # 3 * 3 + 2 * 2 = 13 rows from 5 input rows per table.
  x <- data.frame(id = c(1, 1, 1, 2, 2), val_x = 1:5)
  y <- data.frame(id = c(1, 1, 1, 2, 2), val_y = 1:5)

  result <- check_cartesian(x, y, by = "id", threshold = 2)

  expect_true(result$has_explosion)
  expect_gt(result$expansion_factor, 2)
})

test_that("check_cartesian passes for safe joins", {
  # Unique keys on both sides: no row multiplication is possible.
  x <- data.frame(id = 1:5, val_x = 1:5)
  y <- data.frame(id = 1:5, val_y = 1:5)

  result <- check_cartesian(x, y, by = "id")

  expect_false(result$has_explosion)
})

test_that("detect_cardinality correctly identifies relationships", {
  # 1:1 - keys unique on both sides
  x1 <- data.frame(id = 1:3, val = 1:3)
  y1 <- data.frame(id = 1:3, name = c("A", "B", "C"))
  expect_equal(detect_cardinality(x1, y1, "id"), "1:1")

  # 1:m - key 1 duplicated on the right only
  x2 <- data.frame(id = 1:3, val = 1:3)
  y2 <- data.frame(id = c(1, 1, 2, 3), name = c("A1", "A2", "B", "C"))
  expect_equal(detect_cardinality(x2, y2, "id"), "1:m")

  # m:1 - key 1 duplicated on the left only
  x3 <- data.frame(id = c(1, 1, 2, 3), val = 1:4)
  y3 <- data.frame(id = 1:3, name = c("A", "B", "C"))
  expect_equal(detect_cardinality(x3, y3, "id"), "m:1")

  # m:m - key 1 duplicated on the left, key 2 duplicated on the right
  x4 <- data.frame(id = c(1, 1, 2), val = 1:3)
  y4 <- data.frame(id = c(1, 2, 2), name = c("A", "B1", "B2"))
  expect_equal(detect_cardinality(x4, y4, "id"), "m:m")
})

test_that("analyze_join_chain works with multiple tables", {
  orders <- data.frame(order_id = 1:3, customer_id = c(1, 2, 2))
  customers <- data.frame(customer_id = 1:3, region_id = c(1, 1, 2))
  regions <- data.frame(region_id = 1:2, name = c("North", "South"))

  result <- analyze_join_chain(
    tables = list(orders = orders, customers = customers, regions = regions),
    joins = list(
      list(left = "orders", right = "customers", by = "customer_id"),
      # NOTE(review): "result" is not a key of `tables`; presumably it refers
      # to the output of the previous join step - confirm against
      # analyze_join_chain().
      list(left = "result", right = "regions", by = "region_id")
    )
  )

  # One entry per join step, each carrying a join report.
  expect_length(result, 2)
  expect_true(is_join_report(result[[1]]$report))
})