# devtools::load_all()

# Shared fixtures: built once at file scope and reused by the tests below.
# Each stage feeds the next one (raw XML -> ftext -> processed -> pruned).
test_pmcxml <- nlpembeds:::get_test_pmcxml()
ftext <- nlpembeds:::build_ftext_from_pmcxml(test_pmcxml)
processed_text <- nlpembeds:::process_pubmed(ftext)
pruned_text <- nlpembeds:::prune_text(processed_text)

# Fix the RNG seed before fitting: the expectations below pin exact values
# (matrix dimensions, neighbor names).
set.seed(1)
glove_fit <- nlpembeds:::str_to_vectors(
  pruned_text,
  n_iter = 1,
  verbose = FALSE
)


# Checks the structure of the parsed test corpus (`ftext`).
test_build_ftext_from_pmcxml <- function() {

  # 17 articles
  expect_length(ftext, 17)

  # first article has 187 sentences + keywords
  expect_equal(nrow(ftext[[1]]), 193)

  # check section names
  expect_equal(head(ftext[[1]]$section, 2), c('Title', 'Abstract'))

  # first article 27535 chars + keywords
  expect_equal(nchar(paste(ftext[[1]]$text, collapse = ' ')), 27738)
}
test_that('build_ftext_from_pmcxml', test_build_ftext_from_pmcxml())


# Checks that process_pubmed() flattens each article to a single string and
# drops the irrelevant sections.
test_process_pubmed <- function() {

  # now a character vector
  expect_type(processed_text, 'character')

  # still 17 articles
  expect_length(processed_text, 17)

  # removed ~500 chars
  expect_equal(nchar(processed_text[1]), 27024)

  # check that it removed irrelevant sections
  ## check that we had irrelevant sections
  expect_true('SUPPLEMENTARY DATA' %in% ftext[[13]]$section)
  expect_true(
    any(grepl('Supplementary Data are available', unlist(ftext[[13]]$text)))
  )

  ## number of article chars
  # NOTE(review): this repeats the article 1 check from
  # test_build_ftext_from_pmcxml; given the surrounding assertions it may have
  # been meant for article 13 - confirm the intended index and expected count.
  expect_equal(nchar(paste(ftext[[1]]$text, collapse = ' ')), 27738)

  ## removed ~600 chars
  expect_false(grepl('Supplementary Data are available', processed_text[13]))
  expect_equal(nchar(processed_text[13]), 25445)
}
test_that('process_pubmed', test_process_pubmed())


# Checks stop word removal and stemming, and that identifiers are NOT
# substituted when prune_text() is called without replace_by_cuis.
test_prune_text <- function() {

  # still a character vector
  expect_type(pruned_text, 'character')

  # still 17 articles
  expect_length(pruned_text, 17)

  # check we removed stop words
  expect_true(grepl(' the ', processed_text[1]))
  expect_false(grepl(' the ', pruned_text[1]))

  # check we performed stemming
  expect_true(grepl(' patients ', processed_text[1]))
  expect_false(grepl(' patients ', pruned_text[1]))

  # check replace_by_cuis
  ## healthcare is replaced by https://www.wikidata.org/wiki/Q1914636
  ## (only with replace_by_cuis = TRUE, so here the word must still be there)
  healthcare_id <- 'https://www.wikidata.org/wiki/Q1914636'
  expect_true(grepl(' healthcare ', pruned_text[1]))
  expect_false(
    grepl(
      ' https://www.wikidata.org/wiki/Q1914636 ',
      pruned_text[1],
      fixed = TRUE
    )
  )

  ### however in link, https://www.wikidata.org/wiki/Q1914636 is activity
  ### due to propagation ? but activity itself is
  ### http://purl.obolibrary.org/obo/ID_0000001
  ### may need to fix df_mapping
  dirpath <- system.file('data', package = 'nlpembeds')
  # load() returns the names of the restored objects; get() fetches the object
  df_mapping <- get(load(file.path(dirpath, 'df_cui_mapping.rda')))
  df_activity <- subset(df_mapping, id == healthcare_id)
  expect_equal(nrow(df_activity), 205)

  ## role is replaced by http://purl.obolibrary.org/obo/BFO_0000023
  ### this one is correct in link
  expect_true(grepl(' role ', pruned_text[2]))
  expect_false(
    grepl(
      ' http://purl.obolibrary.org/obo/BFO_0000023 ',
      pruned_text[2],
      fixed = TRUE
    )
  )
}
test_that('prune_text', test_prune_text())


# also known as str_to_vectors
# Checks the shape and row names of the fitted embedding matrix.
test_fit_embeds <- function() {

  # projection matrix
  expect_equal(dim(glove_fit), c(2431, 100))
  expect_true(any(grepl('patient', rownames(glove_fit))))

  # frequency removal
  expect_true(any(grepl(' #16c ', pruned_text)))
  expect_false(any(grepl('#16c', rownames(glove_fit))))

  ## 1 url in original data
  expect_length(grep('http', rownames(glove_fit)), 1)

  ## no cui in original data
  expect_length(grep('UMLS_CUI', rownames(glove_fit)), 0)
}
test_that('fit_embeds', test_fit_embeds())


# Checks the neighbors returned by vector_operation() on the fitted matrix.
test_vector_operation <- function() {

  neighbors <- nlpembeds:::vector_operation(glove_fit, 'patient')
  expect_equal(
    names(neighbors),
    c('patient', 'sars', 'online', 'evolutionary', 'ssc')
  )

  # n_closest
  neighbors <- nlpembeds:::vector_operation(
    glove_fit,
    'patient',
    n_closest = 50
  )
  expect_length(neighbors, 50)

  # sub_cols
  neighbors <- nlpembeds:::vector_operation(
    glove_fit,
    'patient',
    'sars',
    n_closest = 2
  )
  expect_equal(names(neighbors), c('patient', 'relation'))
}
test_that('vector_operation', test_vector_operation())


# this is deprecated since it's preferably done by nile
# Kept for reference; the test_that() call at the bottom stays commented out.
test_replace_by_cuis <- function() {

  pruned_text_cuis <- nlpembeds:::prune_text(
    processed_text,
    replace_by_cuis = TRUE
  )

  set.seed(1)
  glove_fit_cuis <- nlpembeds:::str_to_vectors(
    pruned_text_cuis,
    n_iter = 1,
    verbose = FALSE
  )

  expect_false(grepl(' healthcare ', pruned_text_cuis[1]))
  expect_true(
    grepl(
      ' https://www.wikidata.org/wiki/Q1914636 ',
      pruned_text_cuis[1],
      fixed = TRUE
    )
  )

  expect_false(grepl(' role ', pruned_text_cuis[2]))
  expect_true(
    grepl(
      ' http://purl.obolibrary.org/obo/BFO_0000023 ',
      pruned_text_cuis[2],
      fixed = TRUE
    )
  )

  # with replace_by_cuis
  role_id <- 'http://purl.obolibrary.org/obo/BFO_0000023'
  expect_true(any(grepl(role_id, rownames(glove_fit_cuis), fixed = TRUE)))

  ## 40 now
  expect_length(grep('http', rownames(glove_fit_cuis)), 40)

  ## 8 now
  expect_length(grep('UMLS_CUI', rownames(glove_fit_cuis)), 8)
}
#test_that('replace_by_cuis', test_replace_by_cuis())