# devtools::load_all()

# Shared fixtures: the bundled PMC XML sample is pushed through each pipeline
# step once, and the tests below inspect the intermediate results.
test_pmcxml <- nlpembeds:::get_test_pmcxml()

# One element per article, then one processed string per article
ftext <- nlpembeds:::build_ftext_from_pmcxml(test_pmcxml)
processed_text <- nlpembeds:::process_pubmed(ftext)

# Pruned text, with default arguments
pruned_text <- nlpembeds:::prune_text(processed_text)

# Seed the RNG right before the fit; the tests below compare against exact
# values (presumably the fit is stochastic -- keep this call adjacent to it)
set.seed(1)

glove_fit <- nlpembeds:::str_to_vectors(pruned_text, n_iter = 1, verbose = FALSE)


# Checks the structure of `ftext`, the result of build_ftext_from_pmcxml()
# on the test XML: article count, then size and sections of the first article.
test_build_ftext_from_pmcxml <- function() {

  # one element per article, 17 in total
  n_articles <- length(ftext)
  expect_equal(n_articles, 17)

  first_article <- ftext[[1]]

  # the first article has 187 sentences, plus keywords
  expect_equal(nrow(first_article), 193)

  # the first two section names
  leading_sections <- head(first_article$section, 2)
  expect_equal(leading_sections, c('Title', 'Abstract'))

  # the first article has 27535 characters, plus keywords
  full_text <- paste(first_article$text, collapse = ' ')
  expect_equal(nchar(full_text), 27738)
}
test_that('build_ftext_from_pmcxml', test_build_ftext_from_pmcxml())


# Checks `processed_text`, the result of process_pubmed(): a character vector
# with one entry per article, from which irrelevant sections were removed.
test_process_pubmed <- function() {

  # now a character vector
  # (expect_type() replaces expect_is(), deprecated in testthat 3rd edition)
  expect_type(processed_text, 'character')
  # still 17 articles
  expect_equal(length(processed_text), 17)

  # first article is ~700 chars shorter than its raw text (27738 chars)
  expect_equal(nchar(processed_text[1]), 27024)

  # check that it removed irrelevant sections

  ## check that the raw article 13 had an irrelevant section
  expect_true('SUPPLEMENTARY DATA' %in% ftext[[13]]$section)
  expect_true(any(grepl('Supplementary Data are available', unlist(ftext[[13]]$text))))
  ## number of raw chars in article 1
  ## NOTE(review): the neighbouring checks are about article 13 -- confirm
  ## whether ftext[[13]] was intended here
  expect_equal(nchar(paste(ftext[[1]]$text, collapse = ' ')), 27738)

  ## the supplementary-data sentence is gone from the processed article 13
  expect_false(grepl('Supplementary Data are available', processed_text[13]))
  expect_equal(nchar(processed_text[13]), 25445)
}
test_that('process_pubmed', test_process_pubmed())

# Checks `pruned_text`, the result of prune_text() with default arguments:
# stop words are removed, words are stemmed, and terms are NOT replaced by
# concept identifiers (that only happens with replace_by_cuis = TRUE).
test_prune_text <- function() {

  # still a character vector
  # (expect_type() replaces expect_is(), deprecated in testthat 3rd edition)
  expect_type(pruned_text, 'character')
  # still 17 articles
  expect_equal(length(pruned_text), 17)

  # check we removed stop words
  expect_true(grepl(' the ', processed_text[1]))
  expect_false(grepl(' the ', pruned_text[1]))
  # check we performed stemming
  expect_true(grepl(' patients ', processed_text[1]))
  expect_false(grepl(' patients ', pruned_text[1]))


  # check that replace_by_cuis is off by default
  ## with replace_by_cuis = TRUE, healthcare is replaced by
  ## https://www.wikidata.org/wiki/Q1914636; here the word must be kept
  expect_true(grepl(' healthcare ', pruned_text[1]))
  expect_false(grepl(' https://www.wikidata.org/wiki/Q1914636 ', pruned_text[1], fixed = TRUE))

  ### however in link, https://www.wikidata.org/wiki/Q1914636 is activity
  ### due to propagation ? but activity itself is http://purl.obolibrary.org/obo/ID_0000001
  ### may need to fix df_mapping

  # load the mapping table shipped in the package's data directory and count
  # the rows attached to that identifier
  dirpath = system.file('data', package = 'nlpembeds')
  df_mapping = get(load(file.path(dirpath, 'df_cui_mapping.rda')))
  df_activity = subset(df_mapping, id == 'https://www.wikidata.org/wiki/Q1914636')
  expect_equal(nrow(df_activity), 205)

  ## with replace_by_cuis = TRUE, role is replaced by
  ## http://purl.obolibrary.org/obo/BFO_0000023; here the word must be kept
  ### this one is correct in link

  expect_true(grepl(' role ', pruned_text[2]))
  expect_false(grepl(' http://purl.obolibrary.org/obo/BFO_0000023 ', pruned_text[2], fixed = TRUE))

}
test_that('prune_text', test_prune_text())

# Checks `glove_fit`, the result of str_to_vectors() (also known as
# fit_embeds) on the pruned text.
test_fit_embeds <- function() {

  # projection matrix: 2431 rows, 100 columns
  expect_equal(dim(glove_fit), c(2431, 100))

  vocab <- rownames(glove_fit)
  expect_true(any(grepl('patient', vocab)))

  # frequency removal: '#16c' is in the pruned text but not in the row names
  in_text <- grepl(' #16c ', pruned_text)
  expect_true(any(in_text))
  in_vocab <- grepl('#16c', vocab)
  expect_false(any(in_vocab))

  ## 1 url in original data
  n_urls <- sum(grepl('http', vocab))
  expect_equal(n_urls, 1)

  ## no cui in original data
  n_cuis <- sum(grepl('UMLS_CUI', vocab))
  expect_equal(n_cuis, 0)
}
test_that('fit_embeds', test_fit_embeds())

# Checks vector_operation() on the fitted matrix: the default call, the
# n_closest argument, and a call with a second term.
test_vector_operation <- function() {

  # default call: names of the returned elements
  default_hits <- nlpembeds:::vector_operation(glove_fit, 'patient')
  expected_default <- c('patient', 'sars', 'online', 'evolutionary', 'ssc')
  expect_equal(names(default_hits), expected_default)

  # n_closest sets how many elements come back
  fifty_hits <- nlpembeds:::vector_operation(glove_fit, 'patient', n_closest = 50)
  expect_equal(length(fifty_hits), 50)

  # sub_cols: a second term passed as third positional argument
  pair_hits <- nlpembeds:::vector_operation(glove_fit, 'patient', 'sars', n_closest = 2)
  expect_equal(names(pair_hits), c('patient', 'relation'))
}
test_that('vector_operation', test_vector_operation())

# Deprecated: replacing terms by concept identifiers is preferably done by
# nile. Checks prune_text(replace_by_cuis = TRUE) and the matrix fitted on it.
test_replace_by_cuis <- function() {

  pruned_text_cuis <- nlpembeds:::prune_text(processed_text, replace_by_cuis = TRUE)
  # seed right before the fit, as for the shared fixture
  set.seed(1)
  glove_fit_cuis <- nlpembeds:::str_to_vectors(pruned_text_cuis, n_iter = 1, verbose = FALSE)

  # first article: 'healthcare' is gone, its identifier is present
  first_article <- pruned_text_cuis[1]
  expect_false(grepl(' healthcare ', first_article))
  expect_true(grepl(' https://www.wikidata.org/wiki/Q1914636 ', first_article, fixed = TRUE))

  # second article: 'role' is gone, its identifier is present
  second_article <- pruned_text_cuis[2]
  expect_false(grepl(' role ', second_article))
  expect_true(grepl(' http://purl.obolibrary.org/obo/BFO_0000023 ', second_article, fixed = TRUE))

  # with replace_by_cuis, identifiers show up in the row names of the fit
  vocab_cuis <- rownames(glove_fit_cuis)
  role_id <- 'http://purl.obolibrary.org/obo/BFO_0000023'
  expect_true(any(grepl(role_id, vocab_cuis, fixed = TRUE)))

  ## 40 now
  n_urls <- sum(grepl('http', vocab_cuis))
  expect_equal(n_urls, 40)

  ## 8 now
  n_cuis <- sum(grepl('UMLS_CUI', vocab_cuis))
  expect_equal(n_cuis, 8)
}
#test_that('replace_by_cuis', test_replace_by_cuis())