test_that("textstat_embedding works with simple dfm and embeddings model", { texts <- c("this says one thing", "and this one says another one") dfm_obj <- dfm(tokens(texts)) embeddings_model <- embeddings(runif(30), nrow = 6, dimnames = list(c("this", "says", "one", "thing", "and", "another"), paste0("dim_", 1:5))) result <- textstat_embedding(dfm_obj, embeddings_model) expect_s3_class(result, "tbl_df") expect_equal(nrow(result), ndoc(dfm_obj)) expect_true(all(c("doc_id", paste0("dim_", 1:5)) %in% names(result))) }) test_that("textstat_embedding returns embeddings object when output_embeddings = TRUE", { texts <- c("this says one thing", "and this one says another one") dfm_obj <- dfm(tokens(texts)) embeddings_model <- embeddings(runif(30), nrow = 6, dimnames = list(c("this", "says", "one", "thing", "and", "another"), paste0("dim_", 1:5))) result <- textstat_embedding(dfm_obj, embeddings_model, output_embeddings = TRUE) expect_true(is.embeddings(result)) expect_equal(nrow(result), ndoc(dfm_obj)) expect_equal(rownames(result), docnames(dfm_obj)) }) test_that("textstat_embedding works with method = 'sum'", { texts <- c("this says one thing", "and this one says another one") dfm_obj <- dfm(tokens(texts)) embeddings_model <- embeddings(runif(30), nrow = 6, dimnames = list(c("this", "says", "one", "thing", "and", "another"), paste0("dim_", 1:5))) result <- textstat_embedding(dfm_obj, embeddings_model, method = "sum") # Manually compute the expected sums expected <- matrix(0, nrow = ndoc(dfm_obj), ncol = ncol(embeddings_model)) colnames(expected) <- colnames(embeddings_model) for (doc in seq_len(ndoc(dfm_obj))) { expected[doc, ] <- colSums(embeddings_model * as.numeric(dfm_obj[doc,])) } expect_equal(as.matrix(result[,-1]), expected) }) test_that("textstat_embedding works with method = 'median'", { skip_if_not_installed("Gmedian") texts <- c("this says one thing", "and this says another") dfm_obj <- dfm(tokens(texts)) embeddings_model <- embeddings(runif(30), nrow = 6, dimnames = list(c("this", "says", "one", "thing", "and", "another"), paste0("dim_", 1:5))) result <- textstat_embedding(dfm_obj, embeddings_model, method = "median") expect_s3_class(result, "tbl_df") expect_equal(nrow(result), ndoc(dfm_obj)) }) test_that("textstat_embedding handles missing words in embeddings model", { texts <- c("this says one thing", "unknown word") dfm_obj <- dfm(tokens(texts)) embeddings_model <- embeddings(runif(30), nrow = 6, dimnames = list(c("this", "says", "one", "thing", "and", "another"), paste0("dim_", 1:5))) expect_warning(result <- textstat_embedding(dfm_obj, embeddings_model), "2 items in `newdata` are not present in the embeddings object.") expect_s3_class(result, "tbl_df") expect_equal(nrow(result), ndoc(dfm_obj)) # Check that the embedding for the second document is zero or NA second_doc_embedding <- as.numeric(result[2, -1]) expect_true(all(second_doc_embedding == 0)) }) test_that("textstat_embedding applies weighting correctly", { texts <- c("this says one thing", "and this says another") dfm_obj <- dfm(tokens(texts)) embeddings_model <- embeddings(runif(30), nrow = 6, dimnames = list(c("this", "says", "one", "thing", "and", "another"), paste0("dim_", 1:5))) # Assign weights to tokens weights <- c("this" = 2, "says" = 0.5, "one" = 1, "thing" = 1.5, "and" = 1, "another" = 1) result <- textstat_embedding(dfm_obj, embeddings_model, w = weights) expect_s3_class(result, "tbl_df") expect_equal(nrow(result), ndoc(dfm_obj)) }) test_that("textstat_embedding throws error with invalid method", { texts <- c("this says one thing", "and this says another") dfm_obj <- dfm(tokens(texts)) embeddings_model <- embeddings(runif(30), nrow = 6, dimnames = list(c("this", "says", "one", "thing", "and", "another"), paste0("dim_", 1:5))) expect_error( textstat_embedding(dfm_obj, embeddings_model, method = "invalid"), "'invalid' is not a recognized averaging method" ) }) test_that("embed_docs works with character vector and embeddings model", { texts <- c("this says one thing", "and this says another") embeddings_model <- embeddings(runif(30), nrow = 6, dimnames = list(c("this", "says", "one", "thing", "and", "another"), paste0("dim_", 1:5))) result <- embed_docs(texts, embeddings_model) expect_s3_class(result, "tbl_df") expect_equal(nrow(result), length(texts)) expect_true(all(c("doc_id", paste0("dim_", 1:5)) %in% names(result))) }) test_that("embed_docs returns embeddings object when output_embeddings = TRUE", { texts <- c("this says one thing", "and this says another") embeddings_model <- embeddings(runif(30), nrow = 6, dimnames = list(c("this", "says", "one", "thing", "and", "another"), paste0("dim_", 1:5))) result <- embed_docs(texts, embeddings_model, output_embeddings = TRUE) expect_true(is.embeddings(result)) expect_equal(nrow(result), length(texts)) }) test_that("embed_docs works with custom model function", { texts <- c("this says one thing", "and this one says another one") custom_model <- function(texts, ...) { # Simple example: return length of each text data.frame(length = nchar(texts)) } result <- embed_docs(texts, custom_model) expect_s3_class(result, "tbl_df") expect_equal(nrow(result), length(texts)) expect_equal(names(result), c("doc_id", "length")) }) test_that("embed_docs works with data frame input", { df <- data.frame( id = c("doc1", "doc2"), text = c("this says one thing", "and this says another"), stringsAsFactors = FALSE ) embeddings_model <- embeddings(runif(30), nrow = 6, dimnames = list(c("this", "says", "one", "thing", "and", "another"), paste0("dim_", 1:5))) result <- embed_docs(df, text_col = "text", model = embeddings_model, id_col = "id") expect_s3_class(result, "tbl_df") expect_equal(nrow(result), nrow(df)) expect_true(all(c("id", paste0("dim_", 1:5)) %in% names(result))) }) test_that("embed_docs keeps all columns when .keep_all = TRUE", { df <- data.frame( id = c("doc1", "doc2"), text = c("this says one thing", "and this says another"), category = c("A", "B"), stringsAsFactors = FALSE ) embeddings_model <- embeddings(runif(30), nrow = 6, dimnames = list(c("this", "says", "one", "thing", "and", "another"), paste0("dim_", 1:5))) result <- embed_docs(df, text_col = "text", model = embeddings_model, id_col = "id", .keep_all = TRUE) expect_s3_class(result, "tbl_df") expect_equal(nrow(result), nrow(df)) expect_true(all(c("id", "category", paste0("dim_", 1:5)) %in% names(result))) }) test_that("embed_docs returns embeddings object when output_embeddings = TRUE with data frame input", { df <- data.frame( id = c("doc1", "doc2"), text = c("this says one thing", "and this says another"), stringsAsFactors = FALSE ) embeddings_model <- embeddings(runif(30), nrow = 6, dimnames = list(c("this", "says", "one", "thing", "and", "another"), paste0("dim_", 1:5))) result <- embed_docs(df, text_col = "text", model = embeddings_model, id_col = "id", output_embeddings = TRUE) expect_true(is.embeddings(result)) expect_equal(nrow(result), nrow(df)) expect_equal(rownames(result), df$id) }) test_that("embed_docs respects tolower = FALSE", { texts <- c("This says one thing", "And this says another") embeddings_model <- embeddings(runif(30), nrow = 6, dimnames = list(c("this", "says", "one", "thing", "and", "another"), paste0("dim_", 1:5))) result_tolower <- embed_docs(texts, embeddings_model, tolower = TRUE) expect_warning(result_no_tolower <- embed_docs(texts, embeddings_model, tolower = FALSE), "2 items in `newdata` are not present in the embeddings object.") expect_false(identical(result_tolower, result_no_tolower)) }) test_that("embed_docs throws error when model function returns incorrect output", { texts <- c("this says one thing", "and this says another") custom_model <- function(texts, ...) { # Return incorrect number of rows data.frame(length = nchar(texts[1])) } expect_error( embed_docs(texts, custom_model), "`model` must be either an embeddings object or a function that outputs a dataframe with the same number of rows as `x`" ) }) test_that("embed_docs assigns default doc_id when id_col is missing", { df <- data.frame( text = c("this says one thing", "and this says another"), stringsAsFactors = FALSE ) embeddings_model <- embeddings(runif(30), nrow = 6, dimnames = list(c("this", "says", "one", "thing", "and", "another"), paste0("dim_", 1:5))) result <- embed_docs(df, text_col = "text", model = embeddings_model) expect_s3_class(result, "tbl_df") expect_equal(names(result)[1], "doc_id") }) test_that("embed_docs handles missing words in embeddings model", { texts <- c("this says one thing", "unknown word") embeddings_model <- embeddings(runif(30), nrow = 6, dimnames = list(c("this", "says", "one", "thing", "and", "another"), paste0("dim_", 1:5))) expect_warning(result <- embed_docs(texts, embeddings_model), "2 items in `newdata` are not present in the embeddings object.") # Check that the embedding for the second document is zero or NA second_doc_embedding <- as.numeric(result[2, -1]) expect_true(all(second_doc_embedding == 0)) }) test_that("textstat_embedding handles empty dfm", { dfm_obj <- dfm(tokens(character(0))) embeddings_model <- embeddings(runif(30), nrow = 6, dimnames = list(c("this", "says", "one", "thing", "and", "another"), paste0("dim_", 1:5))) result <- textstat_embedding(dfm_obj, embeddings_model) expect_s3_class(result, "tbl_df") expect_equal(nrow(result), 0) }) test_that("embed_docs handles empty character vector", { texts <- character(0) embeddings_model <- embeddings(runif(30), nrow = 6, dimnames = list(c("this", "says", "one", "thing", "and", "another"), paste0("dim_", 1:5))) result <- embed_docs(texts, embeddings_model) expect_s3_class(result, "tbl_df") expect_equal(nrow(result), 0) }) test_that("embed_docs throws error when text_col is missing in data frame", { df <- data.frame( id = c("doc1", "doc2"), content = c("this says one thing", "and this says another"), stringsAsFactors = FALSE ) embeddings_model <- embeddings(runif(30), nrow = 6, dimnames = list(c("this", "says", "one", "thing", "and", "another"), paste0("dim_", 1:5))) expect_error( embed_docs(df, text_col = "text", model = embeddings_model, id_col = "id"), "Can't extract columns that don't exist." ) }) test_that("textstat_embedding handles dfm with all tokens missing in embeddings model", { texts <- c("unknown word", "other unknown word") dfm_obj <- dfm(tokens(texts)) embeddings_model <- embeddings(runif(30), nrow = 6, dimnames = list(c("this", "says", "one", "thing", "and", "another"), paste0("dim_", 1:5))) expect_warning(result <- textstat_embedding(dfm_obj, embeddings_model), "3 items in `newdata` are not present in the embeddings object.") expect_s3_class(result, "tbl_df") expect_equal(nrow(result), ndoc(dfm_obj)) expect_true(all(rowSums(result[, -1]) == 0)) }) test_that("embed_docs handles custom model function and output_embeddings = FALSE", { texts <- c("this says one thing", "and this says another") custom_model <- function(texts, ...) { data.frame(length = nchar(texts)) } result <- embed_docs(texts, custom_model, output_embeddings = FALSE) expect_s3_class(result, "tbl_df") expect_equal(nrow(result), length(texts)) }) test_that("embed_docs throws error when model is not embeddings or function", { texts <- c("this says one thing", "and this says another") model <- list() # Invalid model expect_error( embed_docs(texts, model), "`model` must be either an embeddings object or a function that outputs a dataframe with the same number of rows as `x`" ) }) test_that("embed_docs assigns default doc_id when id_col is missing in data frame", { df <- data.frame( text = c("this says one thing", "and this says another"), stringsAsFactors = FALSE ) embeddings_model <- embeddings(runif(30), nrow = 6, dimnames = list(c("this", "says", "one", "thing", "and", "another"), paste0("dim_", 1:5))) result <- embed_docs(df, text_col = "text", model = embeddings_model) expect_s3_class(result, "tbl_df") expect_equal(names(result)[1], "doc_id") }) test_that("textstat_embedding handles embeddings with NA values", { texts <- c("this says one thing", "and this says another") dfm_obj <- dfm(tokens(texts)) embeddings_model <- embeddings(runif(30), nrow = 6, dimnames = list(c("this", "says", "one", "thing", "and", "another"), paste0("dim_", 1:5))) embeddings_model[1, 1] <- NA # Introduce NA value result <- textstat_embedding(dfm_obj, embeddings_model) expect_s3_class(result, "tbl_df") expect_equal(nrow(result), ndoc(dfm_obj)) }) test_that("embed_docs passes additional arguments to tokens()", { texts <- c("This says one thing!", "And this says another.") embeddings_model <- embeddings(runif(30), nrow = 6, dimnames = list(c("this", "says", "one", "thing", "and", "another"), paste0("dim_", 1:5))) # Remove punctuation result <- embed_docs(texts, embeddings_model, remove_punct = TRUE) expect_s3_class(result, "tbl_df") }) test_that("embed_docs passes additional arguments to custom model function", { texts <- c("this says one thing", "and this says another") custom_model <- function(texts, multiplier = 1, ...) { data.frame(length = nchar(texts) * multiplier) } result <- embed_docs(texts, custom_model, multiplier = 2) expect_s3_class(result, "tbl_df") expect_equal(result$length, nchar(texts) * 2) }) test_that("embed_docs assigns correct doc_id with non-default id_col", { df <- data.frame( doc_id = c("doc1", "doc2"), text = c("this says one thing", "and this says another"), stringsAsFactors = FALSE ) embeddings_model <- embeddings(runif(30), nrow = 6, dimnames = list(c("this", "says", "one", "thing", "and", "another"), paste0("dim_", 1:5))) result <- embed_docs(df, text_col = "text", model = embeddings_model, id_col = "doc_id") expect_s3_class(result, "tbl_df") expect_equal(result$doc_id, df$doc_id) }) test_that("embed_docs only keeps doc_id and embeddings when .keep_all = FALSE", { df <- data.frame( id = c("doc1", "doc2"), text = c("this says one thing", "and this says another"), category = c("A", "B"), stringsAsFactors = FALSE ) embeddings_model <- embeddings(runif(30), nrow = 6, dimnames = list(c("this", "says", "one", "thing", "and", "another"), paste0("dim_", 1:5))) result <- embed_docs(df, text_col = "text", model = embeddings_model, id_col = "id", .keep_all = FALSE) expect_s3_class(result, "tbl_df") expect_equal(names(result), c("id", paste0("dim_", 1:5))) })