# Tests for alternative models implementation functions # Create a mock co-occurrence matrix for testing create_mock_comat <- function(n_terms = 50, with_entity_types = TRUE) { set.seed(123) # For reproducibility # Create term names a_terms <- paste0("A_Term_", 1:2) b_terms <- paste0("B_Term_", 1:10) c_terms <- paste0("C_Term_", 1:8) other_terms <- paste0("Term_", 1:30) all_terms <- c(a_terms, b_terms, c_terms, other_terms) all_terms <- unique(all_terms)[1:min(n_terms, length(unique(all_terms)))] # Create sparse co-occurrence matrix co_matrix <- matrix(0, nrow = length(all_terms), ncol = length(all_terms)) rownames(co_matrix) <- all_terms colnames(co_matrix) <- all_terms # Fill diagonal with term frequencies for (i in 1:length(all_terms)) { co_matrix[i, i] <- sample(10:50, 1) # Term frequency } # Fill co-occurrence values (making it symmetric) for (i in 1:(length(all_terms)-1)) { for (j in (i+1):length(all_terms)) { if (runif(1) < 0.3) { # 30% chance of co-occurrence co_value <- runif(1, 0.1, 0.9) co_matrix[i, j] <- co_value co_matrix[j, i] <- co_value # Make symmetric } } } # Add entity types if requested if (with_entity_types) { entity_types <- c("disease", "drug", "gene", "protein", "pathway", "biological_process", "chemical", "symptom") term_types <- sample(entity_types, length(all_terms), replace = TRUE) names(term_types) <- all_terms # Ensure A terms are diseases, B terms mix, and C terms are targets for (term in a_terms) { term_types[term] <- "disease" } for (term in b_terms) { term_types[term] <- sample(c("gene", "protein", "biological_process"), 1) } for (term in c_terms) { term_types[term] <- sample(c("drug", "chemical"), 1) } # Add entity types as attribute attr(co_matrix, "entity_types") <- term_types # Add entity frequencies as attribute entity_freq <- diag(co_matrix) attr(co_matrix, "entity_freq") <- entity_freq # Add metadata attribute attr(co_matrix, "metadata") <- list( n_docs = 1000, n_entities = length(all_terms), has_types = TRUE, normalization = "cosine" ) } return(co_matrix) } # Create a custom mock validation function that doesn't rely on NLP create_mock_validation_function <- function() { # Create a function that validates based on term patterns without NLP dependencies function(term, claimed_type = NULL) { # Simple term validation rules without NLP # If term starts with the prefixes (A_, B_, C_), it's valid if (grepl("^A_", term) || grepl("^B_", term) || grepl("^C_", term)) { return(TRUE) } # Validate medical terms in our test data medical_terms <- c( "disease", "patient", "treatment", "therapy", "drug", "symptom", "diagnosis", "clinical", "syndrome", "disorder", "medicine", "protein", "gene", "receptor", "enzyme", "pathway" ) if (tolower(term) %in% medical_terms) { return(TRUE) } # Specific term validation based on claimed_type if (!is.null(claimed_type)) { if (claimed_type == "disease" && grepl("disease|syndrome|disorder", tolower(term))) { return(TRUE) } if (claimed_type == "drug" && grepl("drug|medicine|therapy", tolower(term))) { return(TRUE) } if (claimed_type == "protein" && grepl("protein|receptor|enzyme", tolower(term))) { return(TRUE) } if (claimed_type == "gene" && grepl("gene", tolower(term))) { return(TRUE) } } # Default to false for other terms return(FALSE) } } # Test the anc_model function test_that("anc_model produces expected results", { # Skip tests if function is not available skip_if_not(exists("anc_model"), "anc_model function not available") # Create a test co-occurrence matrix co_matrix <- create_mock_comat(n_terms = 30) # Get an A term to use a_term <- grep("^A_Term_", rownames(co_matrix), value = TRUE)[1] # Create a mock validation function to avoid NLP dependencies mock_validation <- create_mock_validation_function() # Test with default parameters expect_no_error({ results <- anc_model(co_matrix, a_term, n_b_terms = 3, validation_function = mock_validation) }) # If results were produced, check their structure if(exists("results") && is.data.frame(results) && nrow(results) > 0) { expect_true(all(c("a_term", "b_terms", "c_term", "a_b_scores", "b_c_scores", "anc_score") %in% colnames(results))) expect_equal(results$a_term[1], a_term) } # Test with c_type constraint expect_no_error({ results_with_ctype <- anc_model(co_matrix, a_term, c_type = "drug", validation_function = mock_validation) }) # Test with disabled biomedical term filtering expect_no_error({ results_no_filter <- anc_model(co_matrix, a_term, enforce_biomedical_terms = FALSE) }) }) # Test the bitola_model function test_that("bitola_model produces expected results", { # Skip tests if function is not available skip_if_not(exists("bitola_model"), "bitola_model function not available") # Create a test co-occurrence matrix co_matrix <- create_mock_comat(n_terms = 30) # Get an A term to use a_term <- grep("^A_Term_", rownames(co_matrix), value = TRUE)[1] # Get entity types entity_types <- attr(co_matrix, "entity_types") a_semantic_type <- entity_types[a_term] # Find a semantic type for C terms c_types <- unique(entity_types[grep("^C_Term_", names(entity_types))]) c_semantic_type <- c_types[1] # Test with specific semantic types expect_no_error({ results <- bitola_model(co_matrix, a_term, a_semantic_type = a_semantic_type, c_semantic_type = c_semantic_type) }) # Check result structure if available if(exists("results") && is.data.frame(results) && nrow(results) > 0) { expect_true(all(c("a_term", "a_type", "c_term", "c_type", "support", "bitola_score", "b_terms", "ranking_score") %in% colnames(results))) expect_equal(results$a_term[1], a_term) # Test the types without using the indexing, which can cause issues with names expect_true(as.character(results$a_type[1]) == as.character(a_semantic_type)) expect_true(as.character(results$c_type[1]) == as.character(c_semantic_type)) } }) # Test error handling in BITOLA model test_that("bitola_model handles errors appropriately", { # Skip tests if function is not available skip_if_not(exists("bitola_model"), "bitola_model function not available") # Create a test co-occurrence matrix without entity types co_matrix_no_types <- create_mock_comat(n_terms = 10, with_entity_types = FALSE) # Create a test co-occurrence matrix with entity types co_matrix <- create_mock_comat(n_terms = 10) # Get an A term to use a_term <- grep("^A_Term_", rownames(co_matrix), value = TRUE)[1] # Missing semantic types should error expect_error(bitola_model(co_matrix, a_term), "Both A and C semantic types must be provided") # Matrix without entity types should error expect_error(bitola_model(co_matrix_no_types, a_term, a_semantic_type = "disease", c_semantic_type = "drug"), "Entity types must be available") # Non-existent A term should error expect_error(bitola_model(co_matrix, "NonExistentTerm", a_semantic_type = "disease", c_semantic_type = "drug"), "A-term .* not found") # Incorrect A term semantic type should error entity_types <- attr(co_matrix, "entity_types") a_semantic_type_incorrect <- "drug" # Different from actual type which is disease expect_error(bitola_model(co_matrix, a_term, a_semantic_type = a_semantic_type_incorrect, c_semantic_type = "drug"), "A-term .* is not of semantic type") }) # Create mock term-document matrix for LSI testing create_mock_tdm <- function(n_terms = 100, n_docs = 50) { set.seed(123) # For reproducibility # Create term names a_terms <- paste0("A_Term_", 1:2) b_terms <- paste0("B_Term_", 1:10) c_terms <- paste0("C_Term_", 1:8) medical_terms <- c("disease", "patient", "treatment", "therapy", "drug", "symptom", "diagnosis", "clinical", "syndrome", "disorder", "medicine", "protein", "gene", "receptor", "enzyme", "pathway") other_terms <- c(paste0("Term_", 1:70), medical_terms) all_terms <- c(a_terms, b_terms, c_terms, other_terms) all_terms <- unique(all_terms)[1:min(n_terms, length(unique(all_terms)))] # Create term-document matrix tdm <- matrix(0, nrow = length(all_terms), ncol = n_docs) rownames(tdm) <- all_terms # Fill matrix with random values for (i in 1:n_docs) { # Each document contains ~10% of terms n_terms_in_doc <- round(length(all_terms) * 0.1) term_indices <- sample(1:length(all_terms), n_terms_in_doc) tdm[term_indices, i] <- rpois(n_terms_in_doc, lambda = 3) # Poisson-distributed counts } return(tdm) } # Test the LSI model function test_that("lsi_model produces expected results", { # Skip tests if function is not available skip_if_not(exists("lsi_model"), "lsi_model function not available") # Skip tests if irlba package is not available skip_if_not(requireNamespace("irlba", quietly = TRUE), "irlba package not available") # Create a test term-document matrix tdm <- create_mock_tdm() # Get an A term to use a_term <- grep("^A_Term_", rownames(tdm), value = TRUE)[1] # Create a mock validation function to avoid NLP dependencies mock_validation <- create_mock_validation_function() # Test with default parameters and fewer factors for faster testing expect_no_error({ results <- lsi_model(tdm, a_term, n_factors = 10, n_results = 20, validation_function = mock_validation, use_nlp = FALSE) # Disable NLP to avoid the spaCy dependency }) # Check result structure if available if(exists("results") && is.data.frame(results) && nrow(results) > 0) { expect_true(all(c("a_term", "c_term", "lsi_similarity") %in% colnames(results))) expect_equal(results$a_term[1], a_term) # Instead of checking exact range, verify values are numerical and finite # LSI similarity can sometimes be outside [0,1] range depending on normalization expect_true(all(is.numeric(results$lsi_similarity))) expect_true(all(is.finite(results$lsi_similarity))) } # Test with enforce_biomedical_terms = FALSE expect_no_error({ results_no_filter <- lsi_model(tdm, a_term, n_factors = 10, enforce_biomedical_terms = FALSE) }) # If we have both results, no_filter should have more rows if(exists("results") && exists("results_no_filter")) { expect_true(nrow(results_no_filter) >= nrow(results)) } }) # Test the create_tdm function test_that("create_tdm correctly creates a term-document matrix", { # Skip tests if function is not available skip_if_not(exists("create_tdm"), "create_tdm function not available") # Create mock preprocessed data n_docs <- 5 mock_preprocessed <- data.frame(doc_id = paste0("doc", 1:n_docs)) # Create mock terms data frames mock_preprocessed$terms <- lapply(1:n_docs, function(i) { n_terms <- sample(5:10, 1) words <- paste0("term", sample(1:20, n_terms, replace = FALSE)) counts <- sample(1:5, n_terms, replace = TRUE) data.frame(word = words, count = counts, stringsAsFactors = FALSE) }) # Test create_tdm function expect_no_error({ tdm <- create_tdm(mock_preprocessed) }) # Check TDM structure if it was created if(exists("tdm")) { expect_true(is.matrix(tdm)) expect_true(nrow(tdm) > 0) expect_true(ncol(tdm) == n_docs) } # Test with missing terms column mock_preprocessed_bad <- data.frame(doc_id = paste0("doc", 1:n_docs)) expect_error(create_tdm(mock_preprocessed_bad), "Terms column not found") })