# Test suite for citation_analysis.R functions
# This is the main integration test suite

library(testthat)
library(tibble)
library(dplyr)

# ============================================================================
# map_citations_to_segments() - HELPER FUNCTION
# ============================================================================

test_that("map_citations_to_segments handles auto sections detection", {
  cites <- tibble(
    citation_id = c("C1", "C2"),
    start_pos = c(10, 50),
    citation_text = c("(Smith, 2020)", "(Jones, 2021)")
  )
  # Two named sections of 30 filler characters each.
  sections <- list(
    Introduction = strrep("x", 30),
    Methods = strrep("x", 30)
  )

  mapped <- map_citations_to_segments(
    cites,
    sections,
    use_sections = "auto",
    n_segments = 5
  )

  expect_true("segment" %in% names(mapped))
  expect_true("segment_type" %in% names(mapped))
})

test_that("map_citations_to_segments creates equal-length segments", {
  cites <- tibble(
    citation_id = c("C1", "C2", "C3"),
    start_pos = c(10, 50, 100),
    citation_text = c("(A, 2020)", "(B, 2021)", "(C, 2022)")
  )
  flat_text <- paste(rep("word", 200), collapse = " ")

  mapped <- map_citations_to_segments(
    cites,
    flat_text,
    use_sections = FALSE,
    n_segments = 3
  )

  expect_equal(mapped$segment_type[1], "equal_length")
  expect_true(all(grepl("Segment", mapped$segment)))
})

test_that("map_citations_to_segments warns when sections unavailable", {
  single_cite <- tibble(
    citation_id = "C1",
    start_pos = 10,
    citation_text = "(Smith, 2020)"
  )
  flat_text <- "Simple string text"

  # Sections are requested explicitly, but the input is a plain string.
  expect_warning(
    mapped <- map_citations_to_segments(
      single_cite,
      flat_text,
      use_sections = TRUE,
      n_segments = 5
    ),
    "Sections requested but not available"
  )
})

# ============================================================================
# analyze_scientific_content() - BASIC INPUT VALIDATION
# ============================================================================

test_that("analyze_scientific_content requires text input", {
  expect_error(
    analyze_scientific_content(),
    'argument "text" is missing'
  )
})

test_that("analyze_scientific_content works with simple string", {
  skip_if_not_installed("tidytext")

  doc <- "This is a simple test document with some words."
  analysis <- analyze_scientific_content(doc)

  expect_type(analysis, "list")
  expect_s3_class(analysis, "enhanced_scientific_content_analysis")
})

test_that("analyze_scientific_content works with list input", {
  skip_if_not_installed("tidytext")

  doc_sections <- list(
    Introduction = "This is the introduction section.",
    Methods = "This describes the methods used."
  )
  analysis <- analyze_scientific_content(doc_sections)

  expect_type(analysis, "list")
  expect_s3_class(analysis, "enhanced_scientific_content_analysis")
})

# ============================================================================
# analyze_scientific_content() - TEXT ANALYTICS
# ============================================================================

test_that("analyze_scientific_content calculates basic text statistics", {
  skip_if_not_installed("tidytext")

  doc <- "This is a test. This has words."
  analysis <- analyze_scientific_content(doc)

  expect_false(is.null(analysis$text_analytics))
  expect_false(is.null(analysis$text_analytics$basic_stats))

  stat_names <- names(analysis$text_analytics$basic_stats)
  expect_true("total_characters" %in% stat_names)
  expect_true("total_words" %in% stat_names)
})

test_that("analyze_scientific_content extracts word frequencies", {
  skip_if_not_installed("tidytext")

  doc <- "test word test word test another"
  analysis <- analyze_scientific_content(
    doc,
    min_word_length = 3,
    remove_stopwords = FALSE
  )

  expect_false(is.null(analysis$word_frequencies))
  expect_s3_class(analysis$word_frequencies, "tbl_df")
  expect_true("word" %in% names(analysis$word_frequencies))
  expect_true("n" %in% names(analysis$word_frequencies))
})

test_that("analyze_scientific_content generates n-grams", {
  skip_if_not_installed("tidytext")

  doc <- "machine learning is important for data science"
  analysis <- analyze_scientific_content(
    doc,
    ngram_range = c(1, 2),
    remove_stopwords = FALSE
  )

  expect_false(is.null(analysis$ngrams))
  expect_true("1gram" %in% names(analysis$ngrams))
  expect_true("2gram" %in% names(analysis$ngrams))
})

test_that("analyze_scientific_content removes stopwords", {
  skip_if_not_installed("tidytext")

  doc <- "the quick brown fox jumps over the lazy dog"
  kept_stops <- analyze_scientific_content(doc, remove_stopwords = FALSE)
  dropped_stops <- analyze_scientific_content(doc, remove_stopwords = TRUE)

  # Keeping stopwords must leave strictly more distinct words in the table.
  expect_gt(
    nrow(kept_stops$word_frequencies),
    nrow(dropped_stops$word_frequencies)
  )
})

# ============================================================================
# analyze_scientific_content() - CITATION EXTRACTION
# ============================================================================

test_that("analyze_scientific_content extracts author-year citations", {
  skip_if_not_installed("tidytext")

  doc <- "Research by Smith (2020) shows that machine learning (Jones, 2021) is important."
  analysis <- analyze_scientific_content(doc)

  expect_false(is.null(analysis$citations))
  expect_s3_class(analysis$citations, "tbl_df")

  # Column checks only apply when at least one citation was found.
  if (nrow(analysis$citations) > 0) {
    expect_true("citation_text" %in% names(analysis$citations))
    expect_true("citation_type" %in% names(analysis$citations))
  }
})

test_that("analyze_scientific_content extracts numbered citations", {
  skip_if_not_installed("tidytext")

  doc <- "Previous work [1] showed results. Another study [2] confirmed this."
  analysis <- analyze_scientific_content(doc)

  expect_false(is.null(analysis$citations))

  if (nrow(analysis$citations) > 0) {
    numbered <- dplyr::filter(
      analysis$citations,
      grepl("numbered", citation_type)
    )
    expect_gte(nrow(numbered), 0)
  }
})

test_that("analyze_scientific_content extracts et al. citations", {
  skip_if_not_installed("tidytext")

  doc <- "Smith et al. (2020) demonstrated the approach."
  analysis <- analyze_scientific_content(doc)

  expect_false(is.null(analysis$citations))

  if (nrow(analysis$citations) > 0) {
    expect_true(any(grepl("et al", analysis$citations$citation_text)))
  }
})

test_that("analyze_scientific_content parses multiple citations", {
  skip_if_not_installed("tidytext")

  doc <- "Research (Smith, 2020; Jones, 2021; Brown, 2022) shows results."
  analysis <- analyze_scientific_content(
    doc,
    parse_multiple_citations = TRUE
  )

  expect_false(is.null(analysis$citations))

  if (nrow(analysis$citations) > 0) {
    split_cites <- dplyr::filter(
      analysis$citations,
      citation_type == "parsed_from_multiple"
    )
    expect_gte(nrow(split_cites), 0)
  }
})

# ============================================================================
# analyze_scientific_content() - CITATION CONTEXTS
# ============================================================================

test_that("analyze_scientific_content extracts citation contexts", {
  skip_if_not_installed("tidytext")

  doc <- "The important work by Smith (2020) demonstrates this approach effectively."
  analysis <- analyze_scientific_content(doc, window_size = 5)

  expect_false(is.null(analysis$citation_contexts))

  if (nrow(analysis$citation_contexts) > 0) {
    context_cols <- names(analysis$citation_contexts)
    expect_true("words_before" %in% context_cols)
    expect_true("words_after" %in% context_cols)
    expect_true("full_context" %in% context_cols)
  }
})

test_that("analyze_scientific_content respects window_size", {
  skip_if_not_installed("tidytext")

  doc <- "Word1 word2 word3 word4 word5 Smith (2020) word6 word7 word8 word9 word10"
  analysis <- analyze_scientific_content(doc, window_size = 3)

  contexts <- analysis$citation_contexts
  if (!is.null(contexts) && nrow(contexts) > 0) {
    # A per-context word count must be reported alongside each context.
    expect_false(is.null(contexts$context_word_count))
  }

  # Unconditional expectation: the test still passes when no citations
  # (and therefore no contexts) are found.
  expect_true(TRUE)
})

# ============================================================================
# analyze_scientific_content() - CITATION METRICS
# ============================================================================

test_that("analyze_scientific_content calculates citation metrics", {
  skip_if_not_installed("tidytext")

  doc <- "Smith (2020) and Jones (2021) show results. See also [1] and [2]."
  analysis <- analyze_scientific_content(doc)

  expect_false(is.null(analysis$citation_metrics))

  if (length(analysis$citation_metrics) > 0) {
    # At least one of the two metric groups has to be populated.
    expect_true(
      !is.null(analysis$citation_metrics$type_distribution) ||
        !is.null(analysis$citation_metrics$density)
    )
  }
})

test_that("analyze_scientific_content tracks narrative vs parenthetical", {
  skip_if_not_installed("tidytext")

  doc <- "Smith (2020) found that results (Jones, 2021) support the theory."
  analysis <- analyze_scientific_content(doc)

  ratio <- analysis$citation_metrics$narrative_ratio
  if (!is.null(ratio)) {
    expect_true("narrative_citations" %in% names(ratio))
    expect_true("parenthetical_citations" %in% names(ratio))
  }

  expect_true(TRUE)
})

# ============================================================================
# analyze_scientific_content() - SECTION MAPPING
# ============================================================================

test_that("analyze_scientific_content maps citations to sections", {
  skip_if_not_installed("tidytext")

  doc_sections <- list(
    Introduction = "Smith (2020) introduced the concept.",
    Methods = "We used the approach of Jones (2021).",
    Results = "The results [1] show improvement."
  )
  analysis <- analyze_scientific_content(
    doc_sections,
    use_sections_for_citations = TRUE
  )

  if (!is.null(analysis$citations) && nrow(analysis$citations) > 0) {
    expect_true("section" %in% names(analysis$citations))

    seen_sections <- unique(analysis$citations$section)
    expect_true(
      any(seen_sections %in% c("Introduction", "Methods", "Results"))
    )
  }

  expect_true(TRUE)
})

test_that("analyze_scientific_content creates segments when no sections", {
  skip_if_not_installed("tidytext")

  repeated_sentence <- rep("Research by Smith (2020) shows results.", 10)
  long_doc <- paste(repeated_sentence, collapse = " ")

  analysis <- analyze_scientific_content(
    long_doc,
    use_sections_for_citations = FALSE,
    n_segments_citations = 5
  )

  if (!is.null(analysis$citations) && nrow(analysis$citations) > 0) {
    expect_true("section" %in% names(analysis$citations))
  }

  expect_true(TRUE)
})

# ============================================================================
# analyze_scientific_content() - REFERENCE PARSING
# ============================================================================

test_that("analyze_scientific_content parses references from text", {
  skip_if_not_installed("tidytext")

  doc_sections <- list(
    Full_text = "Main text with citation (Smith, 2020).",
    References = "Smith, J. (2020). Title of paper. Journal of Science."
  )
  analysis <- analyze_scientific_content(doc_sections)

  refs <- analysis$parsed_references
  expect_false(is.null(refs))

  if (!is.null(refs)) {
    expect_s3_class(refs, "tbl_df")

    if (nrow(refs) > 0) {
      expect_true("ref_id" %in% names(refs))
      expect_true("ref_year" %in% names(refs))
    }
  }
})

test_that("analyze_scientific_content handles missing references section", {
  skip_if_not_installed("tidytext")

  doc <- "Text without references section."
  analysis <- analyze_scientific_content(doc)

  # The call itself must complete; references may be NULL or an empty table.
  expect_type(analysis, "list")
  expect_true(
    is.null(analysis$parsed_references) ||
      nrow(analysis$parsed_references) == 0
  )
})

# ============================================================================
# analyze_scientific_content() - CROSSREF INTEGRATION
# ============================================================================

test_that("analyze_scientific_content retrieves from CrossRef with DOI", {
  skip_if_not_installed("tidytext")
  skip_if_not_installed("httr2")

  # Replace the CrossRef lookup with a stub returning one fixed reference.
  local_mocked_bindings(
    get_crossref_references = function(doi, mailto) {
      tibble(
        key = "ref1",
        doi = "10.1234/test",
        article_title = "Test Article",
        author = "Smith J",
        year = "2020",
        journal = "Journal",
        volume = "10",
        first_page = "100"
      )
    },
    .package = "contentanalysis"
  )

  doc <- "Research text here."
  analysis <- analyze_scientific_content(
    doc,
    doi = "10.1234/test",
    mailto = "test@example.com"
  )

  expect_false(is.null(analysis$parsed_references))
})

test_that("analyze_scientific_content handles CrossRef failure gracefully", {
  skip_if_not_installed("tidytext")

  # Replace the CrossRef lookup with a stub that always errors.
  local_mocked_bindings(
    get_crossref_references = function(doi, mailto) {
      stop("API Error")
    },
    .package = "contentanalysis"
  )

  doc_sections <- list(
    Full_text = "Text here.",
    References = "Smith, J. (2020). Paper."
  )

  expect_warning(
    analysis <- analyze_scientific_content(
      doc_sections,
      doi = "10.1234/test"
    )
  )

  # A result must still come back after the lookup failure.
  expect_type(analysis, "list")
})

# ============================================================================
# analyze_scientific_content() - CITATION-REFERENCE MATCHING
# ============================================================================

test_that("analyze_scientific_content matches citations to references", {
  skip_if_not_installed("tidytext")

  doc_sections <- list(
    Full_text = "Research by Smith (2020) demonstrates the approach.",
    References = "Smith, J. (2020). Important paper. Journal of Research."
  )
  analysis <- analyze_scientific_content(doc_sections)

  mapping <- analysis$citation_references_mapping
  if (!is.null(mapping)) {
    expect_s3_class(mapping, "tbl_df")
    expect_true("matched_ref_id" %in% names(mapping))
    expect_true("match_confidence" %in% names(mapping))
  }

  expect_true(TRUE)
})

# ============================================================================
# analyze_scientific_content() - SUMMARY
# ============================================================================

test_that("analyze_scientific_content generates summary", {
  skip_if_not_installed("tidytext")

  doc <- "Test text with some words and content."
  analysis <- analyze_scientific_content(doc)

  expect_false(is.null(analysis$summary))
  expect_true("total_words_analyzed" %in% names(analysis$summary))
  expect_true("unique_words" %in% names(analysis$summary))
  expect_true("lexical_diversity" %in% names(analysis$summary))
})

# ============================================================================
# analyze_scientific_content() - OUTPUT STRUCTURE
# ============================================================================

test_that("analyze_scientific_content returns complete structure", {
  skip_if_not_installed("tidytext")

  doc <- "Sample scientific text for analysis."
  analysis <- analyze_scientific_content(doc)

  required_parts <- c(
    "text_analytics",
    "citations",
    "citation_contexts",
    "citation_metrics",
    "word_frequencies",
    "ngrams",
    "summary"
  )

  # One expectation per component, so a failure names the missing piece.
  for (part in required_parts) {
    expect_true(part %in% names(analysis))
  }
})

test_that("analyze_scientific_content has correct class", {
  skip_if_not_installed("tidytext")

  doc <- "Test text"
  analysis <- analyze_scientific_content(doc)

  expect_s3_class(analysis, "enhanced_scientific_content_analysis")
  expect_type(analysis, "list")
})

# ============================================================================
# analyze_scientific_content() - PARAMETERS
# ============================================================================

test_that("analyze_scientific_content respects min_word_length", {
  skip_if_not_installed("tidytext")

  doc <- "a ab abc abcd abcde"
  analysis <- analyze_scientific_content(
    doc,
    min_word_length = 4,
    remove_stopwords = FALSE
  )

  # Every surviving word must be at least 4 characters long.
  if (nrow(analysis$word_frequencies) > 0) {
    shortest <- min(nchar(analysis$word_frequencies$word))
    expect_gte(shortest, 4)
  }

  expect_true(TRUE)
})

test_that("analyze_scientific_content handles custom stopwords", {
  skip_if_not_installed("tidytext")

  doc <- "custom word test custom word test"
  analysis <- analyze_scientific_content(
    doc,
    remove_stopwords = TRUE,
    custom_stopwords = c("custom")
  )

  # The user-supplied stopword "custom" must not appear in the table.
  if (nrow(analysis$word_frequencies) > 0) {
    expect_false("custom" %in% analysis$word_frequencies$word)
  }

  expect_true(TRUE)
})

test_that("analyze_scientific_content handles different ngram_range", {
  skip_if_not_installed("tidytext")

  doc <- "machine learning is important"
  analysis <- analyze_scientific_content(
    doc,
    ngram_range = c(2, 3),
    remove_stopwords = FALSE
  )

  ngram_names <- names(analysis$ngrams)
  expect_true("2gram" %in% ngram_names)
  expect_true("3gram" %in% ngram_names)
  expect_false("1gram" %in% ngram_names)
})

# ============================================================================
# analyze_scientific_content() - EDGE CASES
# ============================================================================

test_that("analyze_scientific_content handles empty text", {
  skip_if_not_installed("tidytext")

  doc <- ""
  analysis <- analyze_scientific_content(doc)

  expect_type(analysis, "list")
  expect_s3_class(analysis, "enhanced_scientific_content_analysis")
})

test_that("analyze_scientific_content handles very short text", {
  skip_if_not_installed("tidytext")

  doc <- "Short"
  analysis <- analyze_scientific_content(doc)

  expect_type(analysis, "list")
})

test_that("analyze_scientific_content handles text with no citations", {
  skip_if_not_installed("tidytext")

  doc <- "This text has no citations at all, just regular content."
  analysis <- analyze_scientific_content(doc)

  expect_false(is.null(analysis$citations))
  expect_equal(nrow(analysis$citations), 0)
})

# ============================================================================
# analyze_scientific_content() - INTEGRATION TESTS
# ============================================================================

test_that("analyze_scientific_content complete workflow with sections", {
  skip_if_not_installed("tidytext")

  paper <- list(
    Introduction = "Machine learning Smith (2020) is a growing field of artificial intelligence.",
    Methods = "We applied the methodology described by Jones (2021) using neural networks.",
    Results = "Our results [1] demonstrate improved accuracy compared to baseline.",
    Discussion = "These findings support the work of Brown et al. (2022) in deep learning.",
    References = "Smith, J. (2020). ML Paper. Journal. Jones, A. (2021). Methods Paper. Science. Brown, K., Davis, M., Wilson, P. (2022). DL Research. Nature."
  )

  analysis <- analyze_scientific_content(
    paper,
    window_size = 5,
    ngram_range = c(1, 2),
    use_sections_for_citations = TRUE
  )

  # All major components are present.
  expect_s3_class(analysis, "enhanced_scientific_content_analysis")
  expect_false(is.null(analysis$text_analytics))
  expect_false(is.null(analysis$citations))
  expect_false(is.null(analysis$word_frequencies))
  expect_false(is.null(analysis$summary))

  # At least one citation was extracted.
  expect_gte(nrow(analysis$citations), 1)

  # Citations carry a section label.
  if (nrow(analysis$citations) > 0) {
    expect_true("section" %in% names(analysis$citations))
  }

  # The References section was parsed into at least one entry.
  expect_false(is.null(analysis$parsed_references))
  if (!is.null(analysis$parsed_references)) {
    expect_gte(nrow(analysis$parsed_references), 1)
  }
})

test_that("analyze_scientific_content works without References section", {
  skip_if_not_installed("tidytext")

  doc_sections <- list(
    Introduction = "Research by Smith (2020) shows results.",
    Methods = "We used standard approaches."
  )
  analysis <- analyze_scientific_content(doc_sections)

  expect_s3_class(analysis, "enhanced_scientific_content_analysis")
  expect_true(is.null(analysis$citation_references_mapping))
})

# ============================================================================
# print_matching_diagnostics() - TESTS
# ============================================================================

test_that("print_matching_diagnostics handles NULL mapping", {
  skip_if_not_installed("tidytext")

  doc <- "Simple text"
  analysis <- analyze_scientific_content(doc)

  expect_output(
    diag_summary <- print_matching_diagnostics(analysis),
    "No citation-reference mapping"
  )
  expect_null(diag_summary)
})

test_that("print_matching_diagnostics prints match statistics", {
  skip_if_not_installed("tidytext")

  # Input with both a citation and a matching reference entry.
  doc_sections <- list(
    Full_text = "Smith (2020) did research.",
    References = "Smith, J. (2020). Paper. Journal."
  )
  analysis <- analyze_scientific_content(doc_sections)

  if (!is.null(analysis$citation_references_mapping)) {
    expect_output(
      print_matching_diagnostics(analysis),
      "CITATION-REFERENCE MATCHING"
    )
  }

  expect_true(TRUE)
})

test_that("print_matching_diagnostics returns summary invisibly", {
  skip_if_not_installed("tidytext")

  doc_sections <- list(
    Full_text = "Smith (2020) research.",
    References = "Smith, J. (2020). Paper."
  )
  analysis <- analyze_scientific_content(doc_sections)

  diag_summary <- suppressMessages(print_matching_diagnostics(analysis))

  # The return value is either NULL or a data frame.
  expect_true(is.null(diag_summary) || is.data.frame(diag_summary))
})

# ============================================================================
# STRESS TESTS
# ============================================================================

test_that("analyze_scientific_content handles large text", {
  skip_if_not_installed("tidytext")
  skip_on_cran()

  # The same sentence repeated 100 times, joined by single spaces.
  sentence <- "Research by Smith (2020) shows that machine learning is important."
  big_doc <- paste(rep(sentence, 100), collapse = " ")

  analysis <- analyze_scientific_content(big_doc)

  expect_s3_class(analysis, "enhanced_scientific_content_analysis")
  expect_gt(analysis$summary$total_words_analyzed, 100)
})

test_that("analyze_scientific_content handles many citations", {
  skip_if_not_installed("tidytext")

  # Mix of citation formats to make sure enough of them are captured.
  doc <- paste(
    "Previous work [1] showed that (Smith, 2020) and (Jones, 2021) confirmed",
    "the findings [2][3][4]. Additional studies (Brown, 2022) and [5]",
    "demonstrated (Wilson, 2023) similar results [6][7][8][9][10]",
    "supporting the theory (Davis, 2024) and [11][12][13][14][15]."
  )

  analysis <- analyze_scientific_content(doc)

  expect_gte(nrow(analysis$citations), 5)
})