context("testing entity functions") source("utils.R") test_that("spacy_extract_entity data.frame works", { skip_on_cran() # skip_on_appveyor() skip_on_os("solaris") try_spacy_initialize() expect_message(spacy_initialize(), "successfully|already") txt1 <- c(doc1 = "I would have accepted without question the information that Gatsby sprang from the swamps of Louisiana or from the lower East Side of New York.", doc2 = "I graduated from New Haven in 1915, just a quarter of a century after my father, and a little later I participated in that delayed Teutonic migration known as the Great War.") entities <- spacy_extract_entity(txt1, output = "data.frame") expect_equal( entities$text, c("Gatsby", "Louisiana", "East Side", "New York", "New Haven", "1915", "just a quarter of a century", "Teutonic", "the Great War")) skip("behaviour changed in spaCy") expect_equal( entities$ent_type, c("PERSON", "GPE", "LOC", "GPE", "GPE", "DATE", "CARDINAL", "NORP", "EVENT")) expect_silent(spacy_finalize()) }) test_that("spacy_extract_entity data.frame works properly when there is no noun-phrase", { skip_on_cran() # skip_on_appveyor() skip_on_os("solaris") try_spacy_initialize() expect_message(spacy_initialize(), "successfully|already") txt1 <- c(doc1 = "He told me all this very much later, but I've put it down here with the idea of exploding those wild rumors about his antecedents, which weren t even faintly true.") expect_message( spacy_extract_entity(txt1, output = "data.frame"), "No entity") expect_equivalent( spacy_extract_entity(txt1, output = "data.frame"), NULL) }) test_that("spacy_extract_entity list works", { skip_on_cran() # skip_on_appveyor() skip_on_os("solaris") try_spacy_initialize() expect_message(spacy_initialize(), "successfully|already") txt1 <- c(doc1 = "I would have accepted without question the information that Gatsby sprang from the swamps of Louisiana or from the lower East Side of New York.", doc2 = "I graduated from New Haven in 1915, just a quarter of a century after my father, and a little later I participated in that delayed Teutonic migration known as the Great War.") entities <- spacy_extract_entity(txt1, output = "list") expect_equal( entities, list(doc1 = c("Gatsby", "Louisiana", "East Side", "New York"), doc2 = c("New Haven", "1915", "just a quarter of a century", "Teutonic", "the Great War")) ) }) test_that("spacy_extract_entity data.frame and list returns the same entity", { skip_on_cran() # skip_on_appveyor() skip_on_os("solaris") try_spacy_initialize() expect_message(spacy_initialize(), "successfully|already") txt1 <- c(doc1 = "I would have accepted without question the information that Gatsby sprang from the swamps of Louisiana or from the lower East Side of New York.", doc2 = "It was a matter of chance that I should have rented a house in one of the strangest communities in North America.") entities_dataframe <- spacy_extract_entity(txt1, output = "data.frame") entities_list <- spacy_extract_entity(txt1, output = "list") expect_equal( entities_dataframe$text, unname(unlist(entities_list)) ) expect_identical( lengths(entities_list, use.names = FALSE), as.integer(table(entities_dataframe$doc_id)) ) }) test_that("spacy_extract_entity.data.frame() works", { skip_on_cran() # skip_on_appveyor() skip_on_os("solaris") try_spacy_initialize() expect_message(spacy_initialize(), "successfully|already") txt <- c(doc1 = "I would have accepted without question the information that Gatsby sprang from the swamps of Louisiana or from the lower East Side of New York.", doc2 = "It was a matter of chance that I should have rented a house in one of the strangest communities in North America.") txt_df <- data.frame(doc_id = paste0("doc", 1:2), text = txt, stringsAsFactors = FALSE) expect_equal( spacy_extract_entity(txt), spacy_extract_entity(txt_df) ) }) test_that("spacy_extract_entity type option works", { skip_on_cran() # skip_on_appveyor() skip_on_os("solaris") try_spacy_initialize() expect_message(spacy_initialize(), "successfully|already") txt1 <- c(doc1 = "I would have accepted without question the information that Gatsby sprang from the swamps of Louisiana or from the lower East Side of New York.", doc2 = "I graduated from New Haven in 1915, just a quarter of a century after my father, and a little later I participated in that delayed Teutonic migration known as the Great War.") expect_equal( nrow(spacy_extract_entity(txt1, output = "data.frame", type = "named")), 7 ) expect_equal( nrow(spacy_extract_entity(txt1, output = "data.frame", type = "extended")), 2 ) expect_equal( nrow(spacy_extract_entity(txt1, output = "data.frame", type = "all")), 9 ) expect_equal( unname(unlist(spacy_extract_entity(txt1, output = "list", type = "named"))), c("Gatsby", "Louisiana", "East Side", "New York", "New Haven", "Teutonic", "the Great War") ) expect_equal( unname(unlist(spacy_extract_entity(txt1, output = "list", type = "extended"))), c("1915", "just a quarter of a century") ) expect_equal( spacy_extract_entity(txt1, output = "data.frame", type = "named")$text, unname(unlist(spacy_extract_entity(txt1, output = "list", type = "named"))) ) }) test_that("getting named entities from spacy_parsed object works", { skip_on_cran() # skip_on_appveyor() skip_on_os("solaris") try_spacy_initialize() expect_message(spacy_initialize(), "successfully|already") txt1 <- c(doc1 = "The United States elected President Donald Trump, from New York.", doc2 = "New buildings on the New York skyline.") parsed <- spacy_parse(txt1, entity = TRUE) entities <- entity_extract(parsed, concatenator = " ") expect_equal( entities$entity, c("The United States", "Donald Trump", "New York", "New York") ) expect_equal( entities$entity_type, c("GPE", "PERSON", "GPE", "GPE") ) txt1 <- c(doc1 = "The United States elected President Donald Trump, from New York.", doc2 = "New buildings on the New York skyline appeared in January.") parsed <- spacy_parse(txt1, entity = TRUE) expect_equal( entity_extract(parsed, type = "extended")$entity_type, "DATE" ) expect_equal( entity_extract(parsed, type = "named")$entity_type, c("GPE", "PERSON", "GPE", "GPE") ) parsed <- spacy_parse(txt1, entity = FALSE) expect_error( entity_extract(parsed), "no entities in parsed object" ) expect_silent(spacy_finalize()) }) test_that("compare entity_extract(spacy_parse()) and spacy_extract_entity()", { skip_on_cran() # skip_on_appveyor() skip_on_os("solaris") try_spacy_initialize() expect_message(spacy_initialize(), "successfully|already") txt1 <- c(doc1 = "The history of natural language processing generally started in the 1950s, although work can be found from earlier periods.", doc2 = "In 1950, Alan Turing published an article titled Intelligence which proposed what is now called the Turing test as a criterion of intelligence.") parsed <- spacy_parse(txt1, entity = TRUE) entities_1 <- entity_extract(parsed, concatenator = " ", type = "all") entities_2 <- spacy_extract_entity(txt1, output = "data.frame") expect_equal( entities_1$entity, entities_2$text ) expect_silent(spacy_finalize()) }) test_that("entity consolidation works", { skip_on_cran() # skip_on_appveyor() skip_on_os("solaris") try(spacy_finalize(), silent = TRUE) expect_message(spacy_initialize(), "successfully") txt1 <- c(doc1 = "The United States elected President Donald Trump, from New York.", doc2 = "New buildings on the New York skyline appeared in January.") parsed <- spacy_parse(txt1, entity = TRUE) expect_equal( entity_consolidate(parsed)$token[c(1, 4)], c("The_United_States", "Donald_Trump") ) expect_equal( entity_consolidate(parsed, concatenator = " ")$token[c(1, 4)], c("The United States", "Donald Trump") ) expect_equal( entity_consolidate(parsed)$token_id, c(1:8, 1:10) ) parsed <- spacy_parse(txt1, entity = TRUE, nounphrase = TRUE) expect_equal( entity_consolidate(parsed)$token[c(1, 4)], c("The_United_States", "Donald_Trump") ) expect_true( !("nounphrase" %in% names(entity_consolidate(parsed))) ) parsed <- spacy_parse(txt1, entity = TRUE, pos = TRUE, tag = TRUE) expect_equal( entity_consolidate(parsed)$tag[c(1, 4, 17)], rep("ENTITY", 3) ) expect_equal( tolower(entity_consolidate(parsed)$lemma[c(1, 4, 16)]), # obviously en_model stop lowercasing lemmas. That's cool tolower(entity_consolidate(parsed)$token[c(1, 4, 16)]) ) parsed <- spacy_parse(txt1, entity = TRUE, dependency = TRUE) expect_true( !"dep_rel" %in% names(entity_consolidate(parsed)) ) expect_message( entity_consolidate(parsed), "Note: removing head_token_id, dep_rel" ) parsed <- spacy_parse(txt1, entity = FALSE) expect_error( entity_consolidate(parsed), "no entities in parsed object" ) expect_silent(spacy_finalize()) })