# Test data
mock_cluster_data <- data.frame(
  T1 = c("A", "B", "A", "C", "A", "B"),
  T2 = c("B", "A", "B", "A", "C", "A"),
  T3 = c("C", "C", "A", "B", "B", "C")
)

# Tests for cluster_data / cluster_sequences ----------------------------------

test_that("cluster_data works with PAM method", {
  result <- cluster_data(mock_cluster_data, k = 2, method = "pam")
  expect_s3_class(result, "tna_clustering")
  expect_equal(result$k, 2)
  expect_equal(result$method, "pam")
  expect_length(result$assignments, nrow(mock_cluster_data))
  expect_true(is.numeric(result$silhouette))
  expect_equal(sum(result$sizes), nrow(mock_cluster_data))
})

test_that("cluster_data works with hclust methods", {
  # Note: check_match lowercases methods, so only lowercase methods work with hclust
  methods <- c("complete", "average", "single", "mcquitty", "median", "centroid")
  for (m in methods) {
    result <- cluster_data(mock_cluster_data, k = 2, method = m)
    expect_s3_class(result, "tna_clustering")
    expect_equal(result$method, m)
    expect_length(result$assignments, nrow(mock_cluster_data))
  }
})

test_that("cluster_data returns correct structure", {
  result <- cluster_data(mock_cluster_data, k = 2)
  expect_named(
    result,
    c("data", "k", "assignments", "silhouette", "sizes", "method", "distance")
  )
  expect_equal(result$data, mock_cluster_data)
  expect_true(inherits(result$distance, "dist"))
})

test_that("cluster_data validates input arguments", {
  # Invalid data type
  expect_error(cluster_data("not a data frame", k = 2), "data.frame")

  # k out of range
  expect_error(cluster_data(mock_cluster_data, k = 0))
  expect_error(cluster_data(mock_cluster_data, k = 100))

  # Invalid dissimilarity

  expect_error(cluster_data(mock_cluster_data, k = 2, dissimilarity = "invalid"))

  # Invalid method
  expect_error(cluster_data(mock_cluster_data, k = 2, method = "invalid"))

  # Weighted only for hamming
  expect_error(
    cluster_data(mock_cluster_data, k = 2, dissimilarity = "lv", weighted = TRUE),
    "Hamming"
  )
})

test_that("cluster_data works with matrix input", {
  mat_data <- as.matrix(mock_cluster_data)
  result <- cluster_data(mat_data, k = 2)
  expect_s3_class(result, "tna_clustering")
})

test_that("cluster_data handles weighted hamming distance", {
  result <- cluster_data(
    mock_cluster_data,
    k = 2,
    dissimilarity = "hamming",
    weighted = TRUE,
    lambda = 0.5
  )
  expect_s3_class(result, "tna_clustering")
})

test_that("cluster_sequences is alias for cluster_data", {
  result1 <- cluster_data(mock_cluster_data, k = 2)
  result2 <- cluster_sequences(mock_cluster_data, k = 2)
  expect_equal(result1$k, result2$k)
  expect_equal(result1$method, result2$method)
})

# Tests for seq2chr -----------------------------------------------------------

test_that("seq2chr converts data to character matrix", {
  result <- tna:::seq2chr(mock_cluster_data, na_syms = character(0))
  expect_type(result, "list")
  expect_named(result, c("mat", "len"))
  expect_true(is.matrix(result$mat))
  expect_equal(nrow(result$mat), nrow(mock_cluster_data))
  expect_equal(ncol(result$mat), ncol(mock_cluster_data))
})

test_that("seq2chr handles NA symbols", {
  data_with_na <- mock_cluster_data
  data_with_na[1, 1] <- "*"
  data_with_na[2, 2] <- "%"
  result <- tna:::seq2chr(data_with_na, na_syms = c("*", "%"))
  expect_type(result, "list")
})

test_that("seq2chr computes correct sequence lengths", {
  # All complete sequences should have length equal to ncol
  result <- tna:::seq2chr(mock_cluster_data, na_syms = character(0))
  expect_equal(result$len, rep(ncol(mock_cluster_data), nrow(mock_cluster_data)))
})

# Tests for get_qgram ---------------------------------------------------------

test_that("get_qgram returns empty for short sequences", {
  result <- tna:::get_qgram(c("a", "b"), n = 2, q = 3)
  expect_length(result, 0)
})

test_that("get_qgram computes q-grams correctly", {
  result <- tna:::get_qgram(c("a", "b", "c", "d"), n = 4, q = 2)
  expect_type(result, "integer")
  expect_true(length(result) > 0)
  expect_true(all(names(result) %in% c("ab", "bc", "cd")))
})

test_that("get_qgram counts repeated q-grams", {
  result <- tna:::get_qgram(c("a", "b", "a", "b"), n = 4, q = 2)
  expect_equal(result["ab"], c(ab = 2L))
})

# Tests for dissimilarity_matrix ----------------------------------------------

test_that("dissimilarity_matrix works with all methods", {
  strings <- tna:::seq2chr(mock_cluster_data, na_syms = character(0))
  methods <- c("osa", "lv", "dl", "hamming", "lcs", "qgram", "cosine", "jaccard", "jw")
  for (d in methods) {
    result <- tna:::dissimilarity_matrix(strings, d, lambda = 0)
    expect_true(inherits(result, "dist"))
  }
})

test_that("dissimilarity_matrix handles weighted distances", {
  strings <- tna:::seq2chr(mock_cluster_data, na_syms = character(0))
  result <- tna:::dissimilarity_matrix(strings, "hamming", lambda = 1.0)
  expect_true(inherits(result, "dist"))
})

test_that("dissimilarity_matrix computes q-gram based distances", {
  strings <- tna:::seq2chr(mock_cluster_data, na_syms = character(0))
  # Test with custom q parameter
  result <- tna:::dissimilarity_matrix(strings, "qgram", lambda = 0, q = 2)
  expect_true(inherits(result, "dist"))
})

# Tests for individual distance functions -------------------------------------

test_that("osa_dist computes optimal string alignment distance", {
  x <- c("a", "b", "c")
  y <- c("a", "b", "d")

  # Function executes and returns numeric

  result_same <- tna:::osa_dist(x, x, 3, 3)
  expect_type(result_same, "integer")

  result_diff <- tna:::osa_dist(x, y, 3, 3)
  expect_type(result_diff, "integer")

  # Transposition case (adjacent swap)
  x2 <- c("a", "b", "c", "d")
  y2 <- c("a", "c", "b", "d")
  result_trans <- tna:::osa_dist(x2, y2, 4, 4)
  expect_type(result_trans, "integer")
})

test_that("levenshtein_dist computes edit distance", {
  x <- c("a", "b", "c")
  y <- c("a", "b", "d")

  # Function executes and returns numeric
  result_same <- tna:::levenshtein_dist(x, x, 3, 3)
  expect_type(result_same, "integer")

  result_diff <- tna:::levenshtein_dist(x, y, 3, 3)
  expect_type(result_diff, "integer")

  # Completely different strings
  z <- c("d", "e", "f")
  result_all_diff <- tna:::levenshtein_dist(x, z, 3, 3)
  expect_type(result_all_diff, "integer")
})

test_that("damerau_levenshtein_dist handles transpositions", {
  x <- c("a", "b", "c")
  y <- c("a", "b", "d")

  # Function executes and returns numeric
  result_same <- tna:::damerau_levenshtein_dist(x, x, 3, 3)
  expect_type(result_same, "integer")

  result_diff <- tna:::damerau_levenshtein_dist(x, y, 3, 3)
  expect_type(result_diff, "integer")

  # Transposition case
  x2 <- c("a", "b", "c", "d")
  y2 <- c("a", "c", "b", "d")
  result_trans <- tna:::damerau_levenshtein_dist(x2, y2, 4, 4)
  expect_type(result_trans, "integer")
})

test_that("hamming_dist computes hamming distance", {
  x <- c("a", "b", "c")
  y <- c("a", "b", "c")
  expect_equal(tna:::hamming_dist(x, y, 3, 3, weights = 1), 0)

  # One difference
  y <- c("a", "b", "d")
  expect_equal(tna:::hamming_dist(x, y, 3, 3, weights = 1), 1)

  # All different
  y <- c("d", "e", "f")
  expect_equal(tna:::hamming_dist(x, y, 3, 3, weights = 1), 3)

  # With weights
  weights <- c(1, 0.5, 0.25)
  y <- c("d", "e", "f")
  expect_equal(tna:::hamming_dist(x, y, 3, 3, weights = weights), sum(weights))
})

test_that("lcs_dist computes longest common subsequence distance", {
  x <- c("a", "b", "c")
  y <- c("a", "b", "c")
  expect_equal(tna:::lcs_dist(x, y, 3, 3), 0)

  # Partial match
  y <- c("a", "d", "c")
  result <- tna:::lcs_dist(x, y, 3, 3)
  expect_true(result >= 0)

  # No common subsequence
  y <- c("d", "e", "f")
  result <- tna:::lcs_dist(x, y, 3, 3)
  expect_equal(result, 3)
})

test_that("qgram_dist computes q-gram distance", {
  qx <- c(ab = 1L, bc = 1L)
  qy <- c(ab = 1L, bc = 1L)
  expect_equal(tna:::qgram_dist(NULL, NULL, qx, qy), 0)

  # Different q-grams
  qy <- c(ab = 2L, cd = 1L)
  result <- tna:::qgram_dist(NULL, NULL, qx, qy)
  expect_true(result > 0)
})

test_that("cosine_dist computes cosine distance", {
  qx <- c(ab = 1L, bc = 1L)
  qy <- c(ab = 1L, bc = 1L)
  expect_equal(tna:::cosine_dist(NULL, NULL, qx, qy), 0)

  # Orthogonal vectors
  qx <- c(ab = 1L)
  qy <- c(cd = 1L)
  expect_equal(tna:::cosine_dist(NULL, NULL, qx, qy), 1)

  # Empty vectors (edge case)
  qx <- integer(0)
  names(qx) <- character(0)
  qy <- integer(0)
  names(qy) <- character(0)
  expect_equal(tna:::cosine_dist(NULL, NULL, qx, qy), 1)
})

test_that("jaccard_dist computes jaccard distance", {
  qx <- c(ab = 1L, bc = 1L)
  qy <- c(ab = 1L, bc = 1L)
  expect_equal(tna:::jaccard_dist(qx, qy), 0)

  # No overlap
  qx <- c(ab = 1L)
  qy <- c(cd = 1L)
  expect_equal(tna:::jaccard_dist(qx, qy), 1)

  # Partial overlap
  qx <- c(ab = 1L, bc = 1L)
  qy <- c(ab = 1L, cd = 1L)
  expect_equal(tna:::jaccard_dist(qx, qy), 1 - 1/3)

  # Empty sets
  qx <- integer(0)
  names(qx) <- character(0)
  qy <- integer(0)
  names(qy) <- character(0)
  expect_equal(tna:::jaccard_dist(qx, qy), 0)
})

test_that("jaro_dist computes jaro distance", {
  x <- c("a", "b", "c")
  y <- c("a", "b", "c")
  expect_equal(tna:::jaro_dist(x, y, 3, 3), 0)

  # Completely different
  y <- c("d", "e", "f")
  expect_equal(tna:::jaro_dist(x, y, 3, 3), 1)

  # Empty strings
  expect_equal(tna:::jaro_dist(character(0), character(0), 0, 0), 0)
  expect_equal(tna:::jaro_dist(c("a"), character(0), 1, 0), 1)
  expect_equal(tna:::jaro_dist(character(0), c("a"), 0, 1), 1)
})

test_that("jaro_dist handles transpositions", {
  x <- c("a", "b", "c", "d")
  y <- c("a", "c", "b", "d")
  result <- tna:::jaro_dist(x, y, 4, 4)
  expect_true(result > 0 && result < 1)
})

test_that("jaro_winkler_dist computes jaro-winkler distance", {
  x <- c("a", "b", "c")
  y <- c("a", "b", "c")
  expect_equal(tna:::jaro_winkler_dist(x, y, 3, 3), 0)

  # Common prefix should reduce distance
  x <- c("a", "b", "c", "d")
  y <- c("a", "b", "x", "y")
  jaro <- tna:::jaro_dist(x, y, 4, 4)
  jw <- tna:::jaro_winkler_dist(x, y, 4, 4)
  # JW distance should be <= Jaro distance when there's a common prefix
  expect_true(jw <= jaro)
})

test_that("jaro_winkler_dist handles no common prefix", {
  x <- c("a", "b", "c")
  y <- c("d", "e", "f")
  result <- tna:::jaro_winkler_dist(x, y, 3, 3)
  expect_equal(result, 1)
})

# Edge cases ------------------------------------------------------------------

test_that("distance functions handle single element sequences", {
  x <- c("a")
  y <- c("a")
  # Functions execute on single element
  result_lev <- tna:::levenshtein_dist(x, y, 1, 1)
  expect_type(result_lev, "integer")

  result_ham <- tna:::hamming_dist(x, y, 1, 1, weights = 1)
  expect_type(result_ham, "double")
})

test_that("cluster_data works with minimum k", {
  result <- cluster_data(mock_cluster_data, k = 2)
  expect_equal(result$k, 2)
})

test_that("cluster_data handles all dissimilarity measures", {
  methods <- c("osa", "lv", "dl", "hamming", "lcs", "qgram", "cosine", "jaccard", "jw")
  for (d in methods) {
    result <- cluster_data(mock_cluster_data, k = 2, dissimilarity = d)
    expect_s3_class(result, "tna_clustering")
  }
})