# Tests for graph(), topological_sort() and find_descendants().
#
# Every test feeds graph() a hand-built `parse_data` list with the shape:
#   scripts   - named list (one entry per script) of inputs/outputs/externals
#   inputs    - character vector of all artifact files consumed by scripts
#   outputs   - character vector of all artifact files produced by scripts
#   externals - character vector of files that no script produces

# Helpers ----

# Evaluate `code` with the working directory set to the session temp dir and
# the fixture files in `files` (a named character vector: name = file name,
# value = file content) present there.
#
# The previous working directory is restored and the fixture files are removed
# even when `code` throws, so a failing test cannot leak state into later
# tests. Returns the value of `code`.
with_external_files <- function(files, code) {
  temp_dir <- tempdir()
  old_wd <- setwd(temp_dir)
  on.exit(
    {
      setwd(old_wd)
      unlink(file.path(temp_dir, names(files)))
    },
    add = TRUE
  )

  for (path in names(files)) {
    writeLines(files[[path]], path)
  }

  # `code` is a promise; forcing it here runs it while the temp dir is active.
  force(code)
}

# Stale flag(s) of the node(s) named `file` in `graph_obj$nodes`.
node_stale <- function(graph_obj, file) {
  graph_obj$nodes$stale[graph_obj$nodes$file == file]
}

# TRUE when `edges` contains at least one row going from `from` to `to`.
has_edge <- function(edges, from, to) {
  any(edges$from == from & edges$to == to)
}

# Graph structure ----

test_that("graph() creates correct DAG structure from parse output", {
  # Test data structure with new parse format
  parse_data <- list(
    scripts = list(
      "analysis.R" = list(
        inputs = character(0),
        outputs = c("monthly_sales.csv"),
        externals = c("sales.csv")
      ),
      "report_generation.R" = list(
        inputs = c("monthly_sales.csv"),
        outputs = c("quarterly_report.pdf"),
        externals = c("regions.csv")
      ),
      "data_cleaning.R" = list(
        inputs = character(0),
        outputs = c("cleaned_data.csv", "summary_stats.txt"),
        externals = c("raw_data.txt")
      )
    ),
    inputs = c("monthly_sales.csv"),
    outputs = c(
      "monthly_sales.csv", "quarterly_report.pdf",
      "cleaned_data.csv", "summary_stats.txt"
    ),
    externals = c("sales.csv", "regions.csv", "raw_data.txt")
  )

  # The external files are created on disk for the duration of the graph()
  # call (presumably graph() validates that externals exist - confirm).
  graph_obj <- with_external_files(
    c(
      "sales.csv" = "sales data",
      "regions.csv" = "regions data",
      "raw_data.txt" = "raw data"
    ),
    graph(parse_data)
  )

  # Should return a list with nodes and edges
  expect_type(graph_obj, "list")
  expect_true("nodes" %in% names(graph_obj))
  expect_true("edges" %in% names(graph_obj))

  # Nodes should include all files (scripts, inputs, outputs)
  nodes <- graph_obj$nodes
  expect_s3_class(nodes, "data.frame")
  expect_true("file" %in% names(nodes))
  expect_true("type" %in% names(nodes))
  expect_true("stale" %in% names(nodes))

  # Scripts should be present
  expect_true("analysis.R" %in% nodes$file)
  expect_true("report_generation.R" %in% nodes$file)
  expect_true("data_cleaning.R" %in% nodes$file)

  # Files should also be present as nodes
  expect_true("sales.csv" %in% nodes$file)
  expect_true("monthly_sales.csv" %in% nodes$file)
  expect_true("regions.csv" %in% nodes$file)
  expect_true("quarterly_report.pdf" %in% nodes$file)
  expect_true("raw_data.txt" %in% nodes$file)
  expect_true("cleaned_data.csv" %in% nodes$file)
  expect_true("summary_stats.txt" %in% nodes$file)

  # Check node types
  expect_equal(nodes$type[nodes$file == "analysis.R"], "script")
  expect_equal(nodes$type[nodes$file == "sales.csv"], "external")
  expect_equal(nodes$type[nodes$file == "regions.csv"], "external")
  expect_equal(nodes$type[nodes$file == "raw_data.txt"], "external")
  expect_equal(nodes$type[nodes$file == "monthly_sales.csv"], "artifact")
  expect_equal(nodes$type[nodes$file == "quarterly_report.pdf"], "artifact")
  expect_equal(nodes$type[nodes$file == "cleaned_data.csv"], "artifact")
  expect_equal(nodes$type[nodes$file == "summary_stats.txt"], "artifact")

  # Check edges connect files directly
  edges <- graph_obj$edges
  expect_s3_class(edges, "data.frame")
  expect_true("from" %in% names(edges))
  expect_true("to" %in% names(edges))

  # Check specific edges: input -> script -> output
  expect_true(has_edge(edges, "sales.csv", "analysis.R"))
  expect_true(has_edge(edges, "analysis.R", "monthly_sales.csv"))
  expect_true(has_edge(edges, "monthly_sales.csv", "report_generation.R"))
  expect_true(has_edge(edges, "regions.csv", "report_generation.R"))
  expect_true(has_edge(edges, "report_generation.R", "quarterly_report.pdf"))
})

test_that("graph() detects cyclic dependencies", {
  # Create cyclic dependency: A -> B -> C -> A
  parse_data <- list(
    scripts = list(
      "script_a.R" = list(
        inputs = c("file_c.csv"),
        outputs = c("file_a.csv"),
        externals = character(0)
      ),
      "script_b.R" = list(
        inputs = c("file_a.csv"),
        outputs = c("file_b.csv"),
        externals = character(0)
      ),
      "script_c.R" = list(
        inputs = c("file_b.csv"),
        outputs = c("file_c.csv"),
        externals = character(0)
      )
    ),
    inputs = c("file_c.csv", "file_a.csv", "file_b.csv"),
    outputs = c("file_a.csv", "file_b.csv", "file_c.csv"),
    externals = character(0)
  )

  expect_error(graph(parse_data), "Cycle detected")
})

test_that("graph() validates single producer per artifact", {
  # Two scripts producing the same output file
  parse_data <- list(
    scripts = list(
      "script1.R" = list(
        inputs = character(0),
        outputs = c("duplicate_output.csv"),
        externals = c("input.csv")
      ),
      "script2.R" = list(
        inputs = character(0),
        outputs = c("duplicate_output.csv"),
        externals = c("other_input.csv")
      )
    ),
    inputs = character(0),
    outputs = c("duplicate_output.csv"),
    externals = c("input.csv", "other_input.csv")
  )

  # capture.output() keeps anything graph() prints before failing out of the
  # test log; only the error message is asserted on.
  expect_error(
    invisible(capture.output(graph(parse_data))),
    "Pipeline validation failed.*duplicate_output.csv"
  )
})

test_that("graph() supports topological sorting", {
  parse_data <- list(
    scripts = list(
      "analysis.R" = list(
        inputs = character(0),
        outputs = c("monthly_sales.csv"),
        externals = c("sales.csv")
      ),
      "report_generation.R" = list(
        inputs = c("monthly_sales.csv"),
        outputs = c("quarterly_report.pdf"),
        externals = c("regions.csv")
      )
    ),
    inputs = c("monthly_sales.csv"),
    outputs = c("monthly_sales.csv", "quarterly_report.pdf"),
    externals = c("sales.csv", "regions.csv")
  )

  # Build and sort the graph while the external files exist on disk.
  topo_order <- with_external_files(
    c("sales.csv" = "sales", "regions.csv" = "regions"),
    {
      graph_obj <- graph(parse_data)
      topological_sort(graph_obj, scripts_only = TRUE)
    }
  )

  expect_type(topo_order, "character")

  # Scripts should be in correct order
  analysis_pos <- which(topo_order == "analysis.R")
  report_pos <- which(topo_order == "report_generation.R")
  expect_true(analysis_pos < report_pos)

  # Only scripts should be in the topological order
  expect_length(topo_order, 2)
  expect_true(all(grepl("\\.R$", topo_order)))
})

test_that("graph() finds descendants for stale marking", {
  parse_data <- list(
    scripts = list(
      "analysis.R" = list(
        inputs = character(0),
        outputs = c("monthly_sales.csv"),
        externals = c("sales.csv")
      ),
      "report_generation.R" = list(
        inputs = c("monthly_sales.csv"),
        outputs = c("quarterly_report.pdf"),
        externals = c("regions.csv")
      )
    ),
    inputs = c("monthly_sales.csv"),
    outputs = c("monthly_sales.csv", "quarterly_report.pdf"),
    externals = c("sales.csv", "regions.csv")
  )

  graph_obj <- graph(parse_data)

  # Find descendants of analysis.R (should include files and scripts, but not
  # the starting node itself)
  descendants <- find_descendants(graph_obj, "analysis.R")
  expect_true("monthly_sales.csv" %in% descendants)
  expect_true("report_generation.R" %in% descendants)
  expect_true("quarterly_report.pdf" %in% descendants)
  expect_false("analysis.R" %in% descendants)

  # Find descendants of report_generation.R
  descendants2 <- find_descendants(graph_obj, "report_generation.R")
  expect_true("quarterly_report.pdf" %in% descendants2)
})

test_that("graph() handles empty parse data", {
  parse_data <- list(
    scripts = list(),
    inputs = character(0),
    outputs = character(0)
  )

  graph_obj <- graph(parse_data)

  expect_type(graph_obj, "list")
  expect_equal(nrow(graph_obj$nodes), 0)
  expect_equal(nrow(graph_obj$edges), 0)
})

test_that("graph() handles scripts with no dependencies", {
  # Note: neither the scripts nor the top level carry an `externals` entry.
  parse_data <- list(
    scripts = list(
      "standalone.R" = list(
        inputs = character(0),
        outputs = character(0)
      ),
      "producer.R" = list(
        inputs = character(0),
        outputs = c("data.csv")
      )
    ),
    inputs = character(0),
    outputs = c("data.csv")
  )

  graph_obj <- graph(parse_data)
  edges <- graph_obj$edges

  expect_true("standalone.R" %in% graph_obj$nodes$file)
  expect_true("producer.R" %in% graph_obj$nodes$file)
  expect_true("data.csv" %in% graph_obj$nodes$file) # Files are now nodes

  # Check edges - standalone script should have no edges
  standalone_edges <- edges[
    edges$from == "standalone.R" | edges$to == "standalone.R",
  ]
  expect_equal(nrow(standalone_edges), 0)

  # Producer script should have exactly one edge: the one to data.csv
  producer_edges <- edges[
    edges$from == "producer.R" | edges$to == "producer.R",
  ]
  expect_equal(nrow(producer_edges), 1)
})

# Topological order ----

test_that("topological_sort() returns scripts in dependency order", {
  # Scripts are deliberately listed out of dependency order.
  parse_data <- list(
    scripts = list(
      "step3.R" = list(
        inputs = c("intermediate2.csv"),
        outputs = c("final.csv"),
        externals = character(0)
      ),
      "step1.R" = list(
        inputs = character(0),
        outputs = c("intermediate1.csv"),
        externals = c("raw.csv")
      ),
      "step2.R" = list(
        inputs = c("intermediate1.csv"),
        outputs = c("intermediate2.csv"),
        externals = character(0)
      )
    ),
    inputs = c("intermediate2.csv", "intermediate1.csv"),
    outputs = c("final.csv", "intermediate1.csv", "intermediate2.csv"),
    externals = c("raw.csv")
  )

  graph_obj <- graph(parse_data)
  topo_order <- topological_sort(graph_obj, scripts_only = TRUE)

  # All nodes should be scripts
  expect_true(all(grepl("\\.R$", topo_order)))
  expect_equal(topo_order, c("step1.R", "step2.R", "step3.R"))
})

# Staleness ----

test_that("graph() with state_obj marks nodes as stale correctly", {
  parse_data <- list(
    scripts = list(
      "script1.R" = list(
        inputs = character(0),
        outputs = c("intermediate.csv"),
        externals = c("input.csv")
      ),
      "script2.R" = list(
        inputs = c("intermediate.csv"),
        outputs = c("output.csv"),
        externals = character(0)
      ),
      "script3.R" = list(
        inputs = character(0),
        outputs = c("final.csv"),
        externals = c("other.csv")
      )
    ),
    inputs = c("intermediate.csv"),
    outputs = c("intermediate.csv", "output.csv", "final.csv"),
    externals = c("input.csv", "other.csv")
  )

  # Create state object with mixed fresh/stale files (new data frame format);
  # only input.csv is stale.
  state_obj <- data.frame(
    file = c(
      "script1.R", "input.csv", "intermediate.csv", "script2.R",
      "output.csv", "script3.R", "other.csv", "final.csv"
    ),
    stale = c(FALSE, TRUE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE),
    stringsAsFactors = FALSE
  )

  graph_obj <- graph(parse_data, state_obj)

  # Check that nodes data frame has stale information
  expect_true("nodes" %in% names(graph_obj))
  expect_s3_class(graph_obj$nodes, "data.frame")
  expect_true("stale" %in% names(graph_obj$nodes))

  # script1.R should be stale because its input (input.csv) is stale
  expect_true(node_stale(graph_obj, "script1.R"))

  # script2.R should be stale because script1.R (which produces its input)
  # is stale
  expect_true(node_stale(graph_obj, "script2.R"))

  # script3.R should be fresh because its dependencies are fresh
  expect_false(node_stale(graph_obj, "script3.R"))
})

test_that("graph() without state_obj works as before", {
  parse_data <- list(
    scripts = list(
      "script1.R" = list(
        inputs = character(0),
        outputs = c("output.csv"),
        externals = c("input.csv")
      )
    ),
    inputs = character(0),
    outputs = c("output.csv"),
    externals = c("input.csv")
  )

  # Should work without state_obj parameter
  graph_obj <- graph(parse_data)

  expect_true("nodes" %in% names(graph_obj))
  expect_true("edges" %in% names(graph_obj))
  expect_s3_class(graph_obj$nodes, "data.frame")
  expect_true("stale" %in% names(graph_obj$nodes))

  # Without state_obj, all nodes should be marked as stale
  expect_true(all(graph_obj$nodes$stale))
})

test_that("graph() marks all nodes as fresh when no stale files", {
  parse_data <- list(
    scripts = list(
      "script1.R" = list(
        inputs = character(0),
        outputs = c("output.csv"),
        externals = c("input.csv")
      ),
      "script2.R" = list(
        inputs = c("output.csv"),
        outputs = c("final.csv"),
        externals = character(0)
      )
    ),
    inputs = c("output.csv"),
    outputs = c("output.csv", "final.csv"),
    externals = c("input.csv")
  )

  # State object with no stale files (new data frame format)
  state_obj <- data.frame(
    file = c(
      "script1.R", "script2.R", "input.csv", "output.csv", "final.csv"
    ),
    stale = c(FALSE, FALSE, FALSE, FALSE, FALSE),
    stringsAsFactors = FALSE
  )

  graph_obj <- graph(parse_data, state_obj)

  # All nodes (scripts and files) should be fresh when no file is stale
  expect_true(all(!graph_obj$nodes$stale))
})

test_that("graph() marks nodes as stale when files not in state", {
  parse_data <- list(
    scripts = list(
      "script1.R" = list(
        inputs = character(0),
        outputs = c("output.csv"),
        externals = c("input.csv")
      ),
      "script2.R" = list(
        inputs = c("output.csv"),
        outputs = c("final.csv"),
        externals = character(0)
      )
    ),
    inputs = c("output.csv"),
    outputs = c("output.csv", "final.csv"),
    externals = c("input.csv")
  )

  # State object with some files marked as stale (new data frame format).
  # NOTE(review): the test title says "files not in state", yet every file of
  # the pipeline is listed here; confirm which scenario is intended.
  state_obj <- data.frame(
    file = c(
      "script1.R", "script2.R", "input.csv", "output.csv", "final.csv"
    ),
    stale = c(FALSE, TRUE, TRUE, TRUE, TRUE),
    stringsAsFactors = FALSE
  )

  graph_obj <- graph(parse_data, state_obj)

  # script1.R should be stale due to input.csv being stale, script2.R should
  # be stale
  expect_true(node_stale(graph_obj, "script1.R"))
  expect_true(node_stale(graph_obj, "script2.R"))
})

test_that("graph() propagates staleness correctly via DFS", {
  # Linear pipeline: script1 -> script2 -> script3
  parse_data <- list(
    scripts = list(
      "script1.R" = list(
        inputs = character(0),
        outputs = c("clean.csv"),
        externals = c("raw.csv")
      ),
      "script2.R" = list(
        inputs = c("clean.csv"),
        outputs = c("processed.csv"),
        externals = character(0)
      ),
      "script3.R" = list(
        inputs = c("processed.csv"),
        outputs = c("final.csv"),
        externals = character(0)
      )
    ),
    inputs = c("clean.csv", "processed.csv"),
    outputs = c("clean.csv", "processed.csv", "final.csv"),
    externals = c("raw.csv")
  )

  # State where only script1.R is stale (new data frame format)
  state_obj <- data.frame(
    file = c(
      "script1.R", "script2.R", "script3.R", "raw.csv",
      "clean.csv", "processed.csv", "final.csv"
    ),
    stale = c(TRUE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE),
    stringsAsFactors = FALSE
  )

  graph_obj <- graph(parse_data, state_obj)

  # All scripts should be stale due to propagation
  expect_true(node_stale(graph_obj, "script1.R"))
  expect_true(node_stale(graph_obj, "script2.R"))
  expect_true(node_stale(graph_obj, "script3.R"))
})

test_that("graph() handles disconnected components correctly", {
  # Two independent pipelines
  parse_data <- list(
    scripts = list(
      "pipeline1_step1.R" = list(
        inputs = character(0),
        outputs = c("result1.csv"),
        externals = c("data1.csv")
      ),
      "pipeline1_step2.R" = list(
        inputs = c("result1.csv"),
        outputs = c("final1.csv"),
        externals = character(0)
      ),
      "pipeline2_step1.R" = list(
        inputs = character(0),
        outputs = c("final2.csv"),
        externals = c("data2.csv")
      )
    ),
    inputs = c("result1.csv"),
    outputs = c("result1.csv", "final1.csv", "final2.csv"),
    externals = c("data1.csv", "data2.csv")
  )

  # Only pipeline1 has stale data (new data frame format): data1.csv
  state_obj <- data.frame(
    file = c(
      "pipeline1_step1.R", "pipeline1_step2.R", "pipeline2_step1.R",
      "data1.csv", "result1.csv", "final1.csv", "data2.csv", "final2.csv"
    ),
    stale = c(FALSE, FALSE, FALSE, TRUE, FALSE, FALSE, FALSE, FALSE),
    stringsAsFactors = FALSE
  )

  graph_obj <- graph(parse_data, state_obj)

  # Only pipeline1 scripts should be stale
  expect_true(node_stale(graph_obj, "pipeline1_step1.R"))
  expect_true(node_stale(graph_obj, "pipeline1_step2.R"))
  expect_false(node_stale(graph_obj, "pipeline2_step1.R"))
})