test_that("embed_tokens works with character vector input and embeddings model", {
	texts <- c("this says one thing", "and this says another")

	embeddings_model <- embeddings(runif(30), nrow = 6, dimnames = list(c("this", "says", "one", "thing", "and", "another"), paste0("dim_", 1:5)))

	result <- embed_tokens(texts, embeddings_model)

	expect_s3_class(result, "tbl_df")
	expect_equal(names(result)[1:2], c("doc_id", "token"))
	expect_equal(nrow(result), sum(sapply(strsplit(texts, " "), length)))
	expect_true(all(paste0("dim_", 1:5) %in% names(result)))
})

test_that("embed_tokens returns list of embeddings when output_embeddings = TRUE", {
	texts <- c("this says one thing", "and this says another")

	embeddings_model <- embeddings(runif(30), nrow = 6, dimnames = list(c("this", "says", "one", "thing", "and", "another"), paste0("dim_", 1:5)))

	result <- embed_tokens(texts, embeddings_model, output_embeddings = TRUE)

	expect_true(is.list(result))
	expect_equal(length(result), length(texts))
	expect_true(all(sapply(result, is.embeddings)))
})

test_that("embed_tokens keeps missing tokens when .keep_missing = TRUE", {
	texts <- c("this says unknown_word", "and this says another")

	embeddings_model <- embeddings(runif(30), nrow = 6, dimnames = list(c("this", "says", "one", "thing", "and", "another"), paste0("dim_", 1:5)))

	expect_warning(result <- embed_tokens(texts, embeddings_model, .keep_missing = TRUE),
								 "1 tokens in `x` are not present in `model`.")

	expect_s3_class(result, "tbl_df")
	missing_tokens <- result$token[result$token == "unknown_word"]
	expect_equal(length(missing_tokens), 1)
	expect_true(all(is.na(result[result$token == "unknown_word", paste0("dim_", 1:5)])))
})

test_that("embed_tokens works with quanteda tokens object", {
	texts <- c("this says one thing", "and this says another")
	tokens_obj <- tokens(texts)

	embeddings_model <- embeddings(runif(35), nrow = 7, dimnames = list(c("this", "says", "one", "thing", "and", "another", "unknown_word"), paste0("dim_", 1:5)))

	result <- embed_tokens(tokens_obj, embeddings_model)

	expect_s3_class(result, "tbl_df")
	expect_equal(names(result)[1:2], c("doc_id", "token"))
})

test_that("embed_tokens works with data frame input", {
	df <- data.frame(
		id = c("doc1", "doc2"),
		text = c("this says one thing", "and this says another"),
		stringsAsFactors = FALSE
	)

	embeddings_model <- embeddings(runif(35), nrow = 7, dimnames = list(c("this", "says", "one", "thing", "and", "another", "unknown_word"), paste0("dim_", 1:5)))

	result <- embed_tokens(df, text_col = "text", model = embeddings_model, id_col = "id")

	expect_s3_class(result, "tbl_df")
	expect_equal(names(result)[1:2], c("id", "token"))
})

test_that("embed_tokens keeps all columns when .keep_all = TRUE", {
	df <- data.frame(
		id = c("doc1", "doc2"),
		text = c("this says one thing", "and this says another"),
		category = c("A", "B"),
		stringsAsFactors = FALSE
	)

	embeddings_model <- embeddings(runif(35), nrow = 7, dimnames = list(c("this", "says", "one", "thing", "and", "another", "unknown_word"), paste0("dim_", 1:5)))

	result <- embed_tokens(df, text_col = "text", model = embeddings_model,
												 id_col = "id", .keep_all = TRUE)

	expect_s3_class(result, "data.frame")
	expect_true(all(c("id", "category", "token", paste0("dim_", 1:5)) %in% names(result)))
})

test_that("embed_tokens respects tolower = FALSE", {
	texts <- c("This says one thing", "And this says another")

	embeddings_model <- embeddings(runif(35), nrow = 7, dimnames = list(c("this", "says", "one", "thing", "and", "another", "unknown_word"), paste0("dim_", 1:5)))

	result_tolower <- embed_tokens(texts, embeddings_model, tolower = TRUE)
	expect_warning(result_no_tolower <- embed_tokens(texts, embeddings_model, tolower = FALSE), "2 tokens in `x` are not present in `model`.")

	expect_false(identical(result_tolower, result_no_tolower))
})

test_that("embed_tokens passes additional arguments to tokens()", {
	texts <- c("this says one thing!", "and this says another.")

	embeddings_model <- embeddings(runif(35), nrow = 7, dimnames = list(c("this", "says", "one", "thing", "and", "another", "unknown_word"), paste0("dim_", 1:5)))

	# Remove punctuation
	result <- embed_tokens(texts, embeddings_model, remove_punct = TRUE)

	expect_s3_class(result, "tbl_df")
	expect_false(any(grepl("[[:punct:]]", result$token)))
})

test_that("embed_tokens returns embeddings with NA values for missing tokens when .keep_missing = TRUE", {
	texts <- c("this says unknown_word", "and this says another")

	embeddings_model <- embeddings(runif(30), nrow = 6, dimnames = list(c("this", "says", "one", "thing", "and", "another"), paste0("dim_", 1:5)))

	expect_warning(result <- embed_tokens(texts, embeddings_model, output_embeddings = TRUE, .keep_missing = TRUE),
								 "1 tokens in `x` are not present in `model`.")

	expect_true(is.list(result))
	expect_equal(length(result), length(texts))

	# Check that the embeddings contain NA for missing tokens
	embeddings_doc1 <- result[[1]]
	expect_true("unknown_word" %in% rownames(embeddings_doc1))
	expect_true(all(is.na(embeddings_doc1["unknown_word", ])))
})

test_that("embed_tokens handles empty input", {
	texts <- character(0)

	embeddings_model <- embeddings(runif(35), nrow = 7, dimnames = list(c("this", "says", "one", "thing", "and", "another", "unknown_word"), paste0("dim_", 1:5)))

	result <- embed_tokens(texts, embeddings_model)

	expect_s3_class(result, "tbl_df")
	expect_equal(nrow(result), 0)
})

test_that("embed_tokens throws error when model is invalid", {
	texts <- c("this says one thing", "and this says another")

	model <- list()  # Invalid model

	expect_error(embed_tokens(texts, model), "`model` must be an embeddings object.")
})

test_that("embed_tokens throws error when text_col is missing in data frame", {
	df <- data.frame(
		id = c("doc1", "doc2"),
		content = c("this says one thing", "and this says another"),
		stringsAsFactors = FALSE
	)

	embeddings_model <- embeddings(runif(35), nrow = 7, dimnames = list(c("this", "says", "one", "thing", "and", "another", "unknown_word"), paste0("dim_", 1:5)))

	expect_error(
		embed_tokens(df, text_col = "text", model = embeddings_model, id_col = "id"),
		"Can't extract columns that don't exist."
	)
})

test_that("embed_tokens ignores missing tokens when .keep_missing = FALSE", {
	texts <- c("this says unknown_word", "and this says another")

	embeddings_model <- embeddings(runif(30), nrow = 6, dimnames = list(c("this", "says", "one", "thing", "and", "another"), paste0("dim_", 1:5)))

	expect_warning(result <- embed_tokens(texts, embeddings_model, .keep_missing = FALSE), "1 tokens in `x` are not present in `model`.")

	expect_false(any(result$token == "unknown_word"))
})

test_that("embed_tokens handles embeddings with NA values", {
	texts <- c("this says one thing", "and this says another")
	embeddings_model <- embeddings(runif(35), nrow = 7, dimnames = list(c("this", "says", "one", "thing", "and", "another", "unknown_word"), paste0("dim_", 1:5)))
	embeddings_model[1, 1] <- NA  # Introduce NA value

	result <- embed_tokens(texts, embeddings_model)

	expect_s3_class(result, "tbl_df")
	expect_true(any(is.na(result)))
})

test_that("embed_tokens works with quanteda corpus", {
	texts <- c("this says one thing", "and this says another")
	corpus_obj <- corpus(texts)

	embeddings_model <- embeddings(runif(35), nrow = 7, dimnames = list(c("this", "says", "one", "thing", "and", "another", "unknown_word"), paste0("dim_", 1:5)))

	result <- embed_tokens(corpus_obj, embeddings_model)

	expect_s3_class(result, "tbl_df")
	expect_equal(names(result)[1:2], c("doc_id", "token"))
})

test_that("embed_tokens matches tokens case-sensitively when tolower = FALSE", {
	texts <- c("This says One thing", "And this says another")

	embeddings_model <- embeddings(runif(35), nrow = 7, dimnames = list(c("this", "says", "one", "thing", "and", "another", "unknown_word"), paste0("dim_", 1:5)))
	# Add uppercase versions to embeddings
	extra_tokens <- c("This", "One", "And")
	extra_embeddings <- matrix(runif(length(extra_tokens) * 5), nrow = length(extra_tokens),
														 dimnames = list(extra_tokens, paste0("dim_", 1:5)))
	embeddings_model <- rbind(embeddings_model, extra_embeddings)

	result <- embed_tokens(texts, embeddings_model, tolower = FALSE)

	expect_true(all(c("This", "One", "And") %in% result$token))
})

test_that("embed_tokens handles tokens with punctuation", {
	texts <- c("this says one thing!", "and this says another.")
	tokens_obj <- tokens(texts, what = "fasterword")

	embeddings_model <- embeddings(runif(30), nrow = 6, dimnames = list(c("this", "says", "one", "thing!", "and", "another."), paste0("dim_", 1:5)))

	result <- embed_tokens(tokens_obj, embeddings_model, remove_punct = FALSE)

	expect_true("thing!" %in% result$token)
	expect_true("another." %in% result$token)
})

test_that("embed_tokens assigns default doc_id when id_col is missing in data frame", {
	df <- data.frame(
		text = c("this says one thing", "and this says another"),
		stringsAsFactors = FALSE
	)

	embeddings_model <- embeddings(runif(35), nrow = 7, dimnames = list(c("this", "says", "one", "thing", "and", "another", "unknown_word"), paste0("dim_", 1:5)))

	result <- embed_tokens(df, text_col = "text", model = embeddings_model)

	expect_s3_class(result, "tbl_df")
	expect_equal(names(result)[1], "doc_id")
})

test_that("embed_tokens handles tokens that repeat in documents", {
	texts <- c("this this this", "and and")
	embeddings_model <- embeddings(runif(35), nrow = 7, dimnames = list(c("this", "says", "one", "thing", "and", "another", "unknown_word"), paste0("dim_", 1:5)))

	result <- embed_tokens(texts, embeddings_model)

	expect_equal(nrow(result), 5)  # 3 tokens in first doc, 2 in second
})

test_that("embed_tokens only keeps doc_id, token, and embeddings when .keep_all = FALSE", {
	df <- data.frame(
		id = c("doc1", "doc2"),
		text = c("this says one thing", "and this says another"),
		category = c("A", "B"),
		stringsAsFactors = FALSE
	)

	embeddings_model <- embeddings(runif(35), nrow = 7, dimnames = list(c("this", "says", "one", "thing", "and", "another", "unknown_word"), paste0("dim_", 1:5)))

	result <- embed_tokens(df, text_col = "text", model = embeddings_model,
												 id_col = "id", .keep_all = FALSE)

	expect_s3_class(result, "tbl_df")
	expect_equal(names(result)[1:2], c("id", "token"))
	expect_false("category" %in% names(result))
})

test_that("embed_tokens with tokens object respects tolower = FALSE", {
	texts <- c("This Says One Thing", "And This Says Another")
	tokens_obj <- tokens(texts)

	embeddings_model <- embeddings(runif(30), nrow = 6, dimnames = list(c("this", "says", "one", "thing", "and", "another"), paste0("dim_", 1:5)))
	# Add uppercase tokens
	extra_tokens <- c("This", "One", "And")
	extra_embeddings <- matrix(runif(length(extra_tokens) * 5), nrow = length(extra_tokens),
														 dimnames = list(extra_tokens, paste0("dim_", 1:5)))
	embeddings_model <- rbind(embeddings_model, extra_embeddings)

	expect_warning(result <- embed_tokens(tokens_obj, embeddings_model, tolower = FALSE), "3 tokens in `x` are not present in `model`.")

	expect_true(all(c("This", "One", "And") %in% result$token))
})

test_that("embed_tokens handles empty documents", {
	texts <- c("", "and this says another")

	embeddings_model <- embeddings(runif(30), nrow = 6, dimnames = list(c("this", "says", "one", "thing", "and", "another"), paste0("dim_", 1:5)))

	result <- embed_tokens(texts, embeddings_model)

	expect_equal(nrow(result), 4)  # Only tokens from second document
})