# Happy path: a small set of well-formed abstracts must yield a non-empty
# data frame that exposes at least the doc_id, stem and n columns.
test_that("sm_preprocess_text works with valid data", {
  abstracts <- c(
    "This is a test abstract about machine learning.",
    "Another abstract discussing deep learning algorithms.",
    "Sport science and data analysis methods."
  )
  corpus <- data.frame(abstract = abstracts, stringsAsFactors = FALSE)

  tokens <- sm_preprocess_text(corpus, text_col = "abstract")

  # Extra columns are tolerated; only the presence of these three is checked.
  required_cols <- c("doc_id", "stem", "n")

  expect_s3_class(tokens, "data.frame")
  expect_true(all(required_cols %in% names(tokens)))
  expect_gt(nrow(tokens), 0)
})

# Requesting a text column that is absent from the input must raise an error
# whose message matches the pattern below.
test_that("sm_preprocess_text handles missing text column", {
  # The only column is `content`, so asking for `abstract` cannot succeed.
  no_abstract <- data.frame(content = "Text here", stringsAsFactors = FALSE)

  expect_error(
    sm_preprocess_text(no_abstract, text_col = "abstract"),
    regexp = "Column 'abstract' not found"
  )
})

# Rows whose text is NA or the empty string must not contribute a document:
# of the four input rows, exactly two distinct doc_ids are expected back.
test_that("sm_preprocess_text filters NA values", {
  abstracts <- c("Valid text here.", NA, "", "More valid text.")
  corpus <- data.frame(abstract = abstracts, stringsAsFactors = FALSE)

  # NOTE(review): text_col is left at its default here -- presumably
  # "abstract"; confirm against the sm_preprocess_text signature.
  tokens <- sm_preprocess_text(corpus)

  n_docs <- length(unique(tokens$doc_id))
  expect_equal(n_docs, 2)
})

# A well-formed word-count table (three documents, five stems each) must be
# turned into a DocumentTermMatrix with at least one row and one column.
test_that("sm_create_dtm produces valid DTM", {
  doc_ids <- rep(c("doc1", "doc2", "doc3"), each = 5)
  stems <- c(
    "machin", "learn", "algorithm", "data", "scienc",
    "deep", "neural", "network", "model", "train",
    "sport", "athlet", "perform", "analys", "metric"
  )
  counts <- c(3, 2, 1, 4, 2, 5, 3, 2, 1, 3, 4, 2, 3, 5, 1)
  counts_tbl <- data.frame(doc_id = doc_ids, stem = stems, n = counts)

  # min_term_freq = 1 is the lowest count present in the fixture.
  # NOTE(review): presumably this keeps every stem; confirm in sm_create_dtm.
  dtm <- sm_create_dtm(counts_tbl, min_term_freq = 1)

  expect_s3_class(dtm, "DocumentTermMatrix")
  # Dimensions are read from the object's own nrow / ncol elements.
  expect_gt(dtm$nrow, 0)
  expect_gt(dtm$ncol, 0)
})

# Input lacking the doc_id / stem / n columns must be rejected with an error
# whose message matches the pattern below.
test_that("sm_create_dtm validates input columns", {
  # Deliberately mis-named columns: `id` and `word` are the only ones present.
  bad_input <- data.frame(
    id = c("doc1", "doc2"),
    word = c("test", "example")
  )

  expect_error(
    sm_create_dtm(bad_input),
    regexp = "word_counts must have columns: doc_id, stem, n"
  )
})