# helper.R — Factory functions for synthetic test data
# No external files or API calls needed.

# ---------------------------------------------------------------------------
# make_minimal_openalex_dataframe()
# 6-row tibble mimicking read_openalex() output with 2 citation chains
# ---------------------------------------------------------------------------
make_minimal_openalex_dataframe <- function() {
  # Takes no arguments; returns a 6-row tibble whose rows are works W1..W6.
  n_works <- 6L
  idx <- seq_len(n_works)
  work_ids <- sprintf("W%d", idx)

  # ";"-separated reference lists: W1 has none, W2 and W3 list W1,
  # W4 and W5 list W2 and W3, W6 lists W4 and W5.
  cited_refs <- c(NA_character_, "W1", "W1", "W2;W3", "W2;W3", "W4;W5")

  # "; "-separated keywords, one entry per work.
  keywords <- c(
    "keyword a; keyword b",
    "keyword a; keyword c",
    "keyword b; keyword d",
    "keyword c; keyword d; keyword e",
    "keyword a; keyword e",
    "keyword d; keyword e; keyword f"
  )

  tibble::tibble(
    id_short = work_ids,
    TI = sprintf("Title %d", idx),
    AU = sprintf("Author %s", LETTERS[idx]),
    PY = c(2018L, 2019L, 2019L, 2020L, 2020L, 2021L),
    DI = sprintf("10.1234/fake.%d", idx),
    CR = cited_refs,
    SO = rep("FAKE JOURNAL", n_works),
    DT = rep("article", n_works),
    DE = keywords,
    AB = sprintf("Abstract for paper %d", idx),
    C1 = rep("Country X", n_works),
    TC = c(10L, 8L, 6L, 4L, 3L, 1L),
    SC = rep("Computer Science", n_works),
    SR = work_ids,
    DB = rep("openalex_api", n_works)
  )
}

# ---------------------------------------------------------------------------
# make_test_tbl_graph(db_type = "wos")
# 20-node directed tbl_graph with 2 groups (c1g1, c1g2), all standard
# node attributes, WoS-format CR strings for CCT testing.
# ---------------------------------------------------------------------------
make_test_tbl_graph <- function(db_type = "wos") {
  # db_type: "wos" builds the WoS-style fixture; any other value builds the
  # OpenAlex-style fixture. Returns a directed tbl_graph with 20 nodes.
  if (db_type == "wos") {
    # Nodes 1-12: two papers per year for 2015-2020, alternating between
    # groups c1g1 and c1g2 (6 nodes each).
    years_a <- rep(2015:2020, each = 2)
    seq_a <- rep(1:2, 6)
    ids_a <- sprintf("101234FAKE%04d%03d", years_a, seq_a)
    nodes <- tibble::tibble(
      name = ids_a,
      SR = paste0("WOS:", sprintf("%012d", 1:12)),
      AU = paste("Author", LETTERS[1:12]),
      TI = paste("Title", 1:12),
      PY = years_a,
      DE = rep(c(
        "kw1; kw2", "kw2; kw3", "kw1; kw3; kw4",
        "kw4; kw5", "kw2; kw5; kw6", "kw1; kw6"
      ), each = 2),
      AB = paste("Abstract", 1:12),
      SO = rep("FAKE JOURNAL", 12),
      TC = 12:1,
      DI = sprintf("10.1234/fake.%04d.%03d", years_a, seq_a),
      DI2 = ids_a,
      CR = c(
        NA_character_,
        NA_character_,
        "Author A, 2015, FAKE JOURNAL, V1, P1, DOI 10.1234/fake.2015.001",
        "Author B, 2015, FAKE JOURNAL, V1, P2, DOI 10.1234/fake.2015.002",
        "Author C, 2016, FAKE JOURNAL, V2, P1, DOI 10.1234/fake.2016.001; Author D, 2016, FAKE JOURNAL, V2, P2, DOI 10.1234/fake.2016.002",
        "Author C, 2016, FAKE JOURNAL, V2, P1, DOI 10.1234/fake.2016.001",
        "Author E, 2017, FAKE JOURNAL, V3, P1, DOI 10.1234/fake.2017.001; Author F, 2017, FAKE JOURNAL, V3, P2, DOI 10.1234/fake.2017.002",
        "Author E, 2017, FAKE JOURNAL, V3, P1, DOI 10.1234/fake.2017.001",
        "Author G, 2018, FAKE JOURNAL, V4, P1, DOI 10.1234/fake.2018.001; Author H, 2018, FAKE JOURNAL, V4, P2, DOI 10.1234/fake.2018.002",
        "Author G, 2018, FAKE JOURNAL, V4, P1, DOI 10.1234/fake.2018.001",
        "Author I, 2019, FAKE JOURNAL, V5, P1, DOI 10.1234/fake.2019.001; Author J, 2019, FAKE JOURNAL, V5, P2, DOI 10.1234/fake.2019.002",
        "Author I, 2019, FAKE JOURNAL, V5, P1, DOI 10.1234/fake.2019.001"
      ),
      DB = rep("wos_bib_normalized_names", 12),
      NT = rep("direct-citation", 12),
      component = rep("c1", 12),
      group = rep(c("c1g1", "c1g2"), 6)
    )

    # Nodes 13-20: eight more papers (2016-2019), all in group c1g2, so the
    # final split is 6 nodes in c1g1 and 14 nodes in c1g2.
    years_b <- rep(2016:2019, each = 2)
    seq_b <- rep(3:4, 4)
    ids_b <- sprintf("101234FAKE%04d%03d", years_b, seq_b)
    nodes2 <- tibble::tibble(
      name = ids_b,
      SR = paste0("WOS:", sprintf("%012d", 13:20)),
      AU = paste("Author", LETTERS[13:20]),
      TI = paste("Title", 13:20),
      PY = years_b,
      DE = rep(
        c("kw7; kw8", "kw8; kw9", "kw7; kw9; kw10", "kw10; kw11"),
        each = 2
      ),
      AB = paste("Abstract", 13:20),
      SO = rep("FAKE JOURNAL", 8),
      TC = 8:1,
      DI = sprintf("10.1234/fake.%04d.%03d", years_b, seq_b),
      DI2 = ids_b,
      CR = c(
        NA_character_,
        NA_character_,
        "Author M, 2016, FAKE JOURNAL, V2, P3, DOI 10.1234/fake.2016.003; Author N, 2016, FAKE JOURNAL, V2, P4, DOI 10.1234/fake.2016.004",
        "Author M, 2016, FAKE JOURNAL, V2, P3, DOI 10.1234/fake.2016.003",
        "Author O, 2017, FAKE JOURNAL, V3, P3, DOI 10.1234/fake.2017.003; Author P, 2017, FAKE JOURNAL, V3, P4, DOI 10.1234/fake.2017.004",
        "Author O, 2017, FAKE JOURNAL, V3, P3, DOI 10.1234/fake.2017.003",
        "Author Q, 2018, FAKE JOURNAL, V4, P3, DOI 10.1234/fake.2018.003; Author R, 2018, FAKE JOURNAL, V4, P4, DOI 10.1234/fake.2018.004",
        "Author Q, 2018, FAKE JOURNAL, V4, P3, DOI 10.1234/fake.2018.003"
      ),
      DB = rep("wos_bib_normalized_names", 8),
      NT = rep("direct-citation", 8),
      component = rep("c1", 8),
      group = rep("c1g2", 8)
    )

    all_nodes <- dplyr::bind_rows(nodes, nodes2)

    # Citation edges (citing -> cited), given as row indices into all_nodes.
    # Every pair mirrors exactly one reference in the citing node's CR
    # string, so edges and CR cannot disagree. Each cited node has a strictly
    # earlier PY than its citing node: no self-loops, and the graph is a DAG.
    # Edges may cross groups (e.g. node 5 in c1g1 cites node 4 in c1g2).
    citing_a <- c(3, 4, 5, 5, 6, 7, 7, 8, 9, 9, 10, 11, 11, 12)
    cited_a <- c(1, 2, 3, 4, 3, 5, 6, 5, 7, 8, 7, 9, 10, 9)
    citing_b <- c(15, 15, 16, 17, 17, 18, 19, 19, 20)
    cited_b <- c(13, 14, 13, 15, 16, 15, 17, 18, 17)

    edges <- tibble::tibble(
      from = all_nodes$name[c(citing_a, citing_b)],
      to = all_nodes$name[c(cited_a, cited_b)]
    )

    g <- igraph::graph_from_data_frame(
      edges,
      directed = TRUE,
      vertices = all_nodes
    )
    tidygraph::as_tbl_graph(g)
  } else {
    # OpenAlex variant: W1..W20, four papers per year for 2018-2022, groups
    # alternating c1g1 / c1g2.
    nodes <- tibble::tibble(
      name = paste0("W", 1:20),
      SR = paste0("W", 1:20),
      AU = paste("Author", LETTERS[1:20]),
      TI = paste("Title", 1:20),
      PY = rep(2018:2022, each = 4),
      DE = rep(c("kw1; kw2", "kw2; kw3", "kw3; kw4", "kw1; kw4"), 5),
      AB = paste("Abstract", 1:20),
      SO = rep("FAKE JOURNAL", 20),
      TC = 20:1,
      DI = paste0("10.1234/fake.", 1:20),
      # W1-W4 have no references; W(k) for k >= 5 references W(k - 4).
      CR = c(rep(NA_character_, 4), paste0("W", 1:16)),
      DB = rep("openalex_api", 20),
      NT = rep("direct-citation", 20),
      component = rep("c1", 20),
      group = rep(c("c1g1", "c1g2"), 10)
    )

    # One edge per CR entry: W5..W20 each cite the node four positions back.
    edges <- tibble::tibble(
      from = nodes$name[5:20],
      to = nodes$name[1:16]
    )

    g <- igraph::graph_from_data_frame(
      edges,
      directed = TRUE,
      vertices = nodes
    )
    tidygraph::as_tbl_graph(g)
  }
}

# ---------------------------------------------------------------------------
# make_sniff_groups_output(db_type = "wos")
# list(aggregate, network, pubs_by_year) matching sniff_groups() output
# ---------------------------------------------------------------------------
make_sniff_groups_output <- function(db_type = "wos") {
  # db_type is forwarded unchanged to make_test_tbl_graph().
  # Returns a list with elements `aggregate`, `network`, `pubs_by_year`.
  net <- make_test_tbl_graph(db_type)
  node_data <- net |>
    tidygraph::activate(nodes) |>
    tibble::as_tibble()

  # Per-group paper count; `average_age` is the mean of PY.
  group_summary <- node_data |>
    dplyr::group_by(.data$group) |>
    dplyr::summarise(
      quantity_papers = dplyr::n(),
      average_age = mean(.data$PY, na.rm = TRUE),
      .groups = "drop"
    )

  # Publications per group and year. The column is renamed with a string
  # because `.data` inside tidyselect verbs such as rename() is deprecated.
  pubs_by_year <- node_data |>
    dplyr::group_by(.data$group, .data$PY) |>
    dplyr::tally(name = "publications") |>
    dplyr::rename(year = "PY") |>
    dplyr::ungroup() |>
    dplyr::arrange(.data$year, .data$group)

  # Add aggregate columns to network nodes
  net2 <- net |>
    tidygraph::activate(nodes) |>
    dplyr::left_join(group_summary, by = "group")

  list(
    aggregate = group_summary,
    network = net2,
    pubs_by_year = pubs_by_year
  )
}

# ---------------------------------------------------------------------------
# make_sniff_components_output(db_type = "wos")
# list(components, network) matching sniff_components() output
# ---------------------------------------------------------------------------
make_sniff_components_output <- function(db_type = "wos") {
  # db_type is forwarded unchanged to make_test_tbl_graph().
  # Returns a list with elements `components` and `network`.
  net <- make_test_tbl_graph(db_type)

  # Remove group column so sniff_groups() can add it fresh
  net_clean <- net |>
    tidygraph::activate(nodes) |>
    dplyr::select(-dplyr::any_of("group"))

  node_data <- net_clean |>
    tidygraph::activate(nodes) |>
    tibble::as_tibble()

  # Per-component publication count; `average_age` is the mean of PY.
  components <- node_data |>
    dplyr::group_by(.data$component) |>
    dplyr::summarise(
      quantity_publications = dplyr::n(),
      average_age = mean(.data$PY, na.rm = TRUE),
      .groups = "drop"
    )

  list(
    components = components,
    network = net_clean
  )
}

# ---------------------------------------------------------------------------
# make_simple_dag()
# 5-node linear chain DAG for deterministic key-route testing.
# A -> B -> C -> D -> E (directed, with PY increasing)
# ---------------------------------------------------------------------------
make_simple_dag <- function() {
  # Takes no arguments; returns a directed tbl_graph with nodes n1..n5.
  chain_len <- 5L
  idx <- seq_len(chain_len)
  node_ids <- sprintf("n%d", idx)

  # n1 has no reference; every later node's CR names its predecessor.
  refs <- c(
    NA_character_,
    "Author A, 2015, FAKE JOURNAL, V1, P1, DOI 10.1234/fake.1",
    "Author B, 2016, FAKE JOURNAL, V2, P1, DOI 10.1234/fake.2",
    "Author C, 2017, FAKE JOURNAL, V3, P1, DOI 10.1234/fake.3",
    "Author D, 2018, FAKE JOURNAL, V4, P1, DOI 10.1234/fake.4"
  )

  vertex_tbl <- tibble::tibble(
    name = node_ids,
    AU = sprintf("Author %s", LETTERS[idx]),
    TI = sprintf("Title %d", idx),
    PY = 2015:2019,
    DE = rep("kw1; kw2", chain_len),
    AB = sprintf("Abstract %d", idx),
    SO = rep("FAKE JOURNAL", chain_len),
    TC = c(10L, 8L, 6L, 4L, 2L),
    DI = sprintf("10.1234/fake.%d", idx),
    SR = paste0("WOS:", sprintf("%012d", idx)),
    DB = rep("wos_bib_normalized_names", chain_len),
    NT = rep("direct-citation", chain_len),
    group = rep("full_network", chain_len),
    component = rep("c1", chain_len),
    CR = refs
  )

  # Each node after the first points at the node just before it:
  # n2->n1, n3->n2, n4->n3, n5->n4 (newer papers cite older ones).
  edge_tbl <- tibble::tibble(
    from = node_ids[-1L],
    to = node_ids[-chain_len]
  )

  dag <- igraph::graph_from_data_frame(
    edge_tbl,
    directed = TRUE,
    vertices = vertex_tbl
  )
  tidygraph::as_tbl_graph(dag)
}

# ---------------------------------------------------------------------------
# make_tracked_cr_py(db_type = "wos")
# Pre-computed CR/CR_PY tibble to bypass API calls in CCT tests.
# ---------------------------------------------------------------------------
make_tracked_cr_py <- function(db_type = "wos") {
  # db_type: "wos" returns WoS-format reference strings; any other value
  # returns the W1..W16 variant. Result is a 16-row tibble with CR and CR_PY.
  if (db_type != "wos") {
    return(
      tibble::tibble(
        CR = sprintf("W%d", 1:16),
        CR_PY = rep(2018:2021, each = 4)
      )
    )
  }

  # References with DOI suffix .001 / .002 (years 2015-2019).
  refs_first <- c(
    "Author A, 2015, FAKE JOURNAL, V1, P1, DOI 10.1234/fake.2015.001",
    "Author B, 2015, FAKE JOURNAL, V1, P2, DOI 10.1234/fake.2015.002",
    "Author C, 2016, FAKE JOURNAL, V2, P1, DOI 10.1234/fake.2016.001",
    "Author D, 2016, FAKE JOURNAL, V2, P2, DOI 10.1234/fake.2016.002",
    "Author E, 2017, FAKE JOURNAL, V3, P1, DOI 10.1234/fake.2017.001",
    "Author F, 2017, FAKE JOURNAL, V3, P2, DOI 10.1234/fake.2017.002",
    "Author G, 2018, FAKE JOURNAL, V4, P1, DOI 10.1234/fake.2018.001",
    "Author H, 2018, FAKE JOURNAL, V4, P2, DOI 10.1234/fake.2018.002",
    "Author I, 2019, FAKE JOURNAL, V5, P1, DOI 10.1234/fake.2019.001",
    "Author J, 2019, FAKE JOURNAL, V5, P2, DOI 10.1234/fake.2019.002"
  )

  # References with DOI suffix .003 / .004 (years 2016-2018).
  refs_second <- c(
    "Author M, 2016, FAKE JOURNAL, V2, P3, DOI 10.1234/fake.2016.003",
    "Author N, 2016, FAKE JOURNAL, V2, P4, DOI 10.1234/fake.2016.004",
    "Author O, 2017, FAKE JOURNAL, V3, P3, DOI 10.1234/fake.2017.003",
    "Author P, 2017, FAKE JOURNAL, V3, P4, DOI 10.1234/fake.2017.004",
    "Author Q, 2018, FAKE JOURNAL, V4, P3, DOI 10.1234/fake.2018.003",
    "Author R, 2018, FAKE JOURNAL, V4, P4, DOI 10.1234/fake.2018.004"
  )

  # Two references per year, in the same order as the strings above.
  ref_years <- rep(c(2015:2019, 2016:2018), each = 2L)

  tibble::tibble(
    CR = c(refs_first, refs_second),
    CR_PY = ref_years
  )
}