R Under development (unstable) (2023-08-28 r85029 ucrt) -- "Unsuffered Consequences"
Copyright (C) 2023 The R Foundation for Statistical Computing
Platform: x86_64-w64-mingw32/x64

R is free software and comes with ABSOLUTELY NO WARRANTY.
You are welcome to redistribute it under certain conditions.
Type 'license()' or 'licence()' for distribution details.

R is a collaborative project with many contributors.
Type 'contributors()' for more information and
'citation()' on how to cite R or R packages in publications.

Type 'demo()' for some demos, 'help()' for on-line help, or
'help.start()' for an HTML browser interface to help.
Type 'q()' to quit R.

> # This test file tests the `dist_matrix` and `dist_pairwise` functions
> # These two functions are simple dynamic programming algorithms for computing pairwise distances and are themselves used to validate
> # the RadixTree imeplementation (see test_radix_tree.R)
> 
> library(seqtrie)
> library(stringdist)
> library(Biostrings)
Loading required package: BiocGenerics

Attaching package: 'BiocGenerics'

The following objects are masked from 'package:stats':

    IQR, mad, sd, var, xtabs

The following objects are masked from 'package:base':

    Filter, Find, Map, Position, Reduce, anyDuplicated, aperm, append,
    as.data.frame, basename, cbind, colnames, dirname, do.call,
    duplicated, eval, evalq, get, grep, grepl, intersect, is.unsorted,
    lapply, mapply, match, mget, order, paste, pmax, pmax.int, pmin,
    pmin.int, rank, rbind, rownames, sapply, setdiff, sort, table,
    tapply, union, unique, unsplit, which.max, which.min

Loading required package: S4Vectors
Loading required package: stats4

Attaching package: 'S4Vectors'

The following object is masked from 'package:utils':

    findMatches

The following objects are masked from 'package:base':

    I, expand.grid, unname

Loading required package: IRanges

Attaching package: 'IRanges'

The following object is masked from 'package:grDevices':

    windows

Loading required package: XVector
Loading required package: GenomeInfoDb

Attaching package: 'Biostrings'

The following object is masked from 'package:base':

    strsplit

> library(dplyr)

Attaching package: 'dplyr'

The following objects are masked from 'package:Biostrings':

    collapse, intersect, setdiff, setequal, union

The following object is masked from 'package:GenomeInfoDb':

    intersect

The following object is masked from 'package:XVector':

    slice

The following objects are masked from 'package:IRanges':

    collapse, desc, intersect, setdiff, slice, union

The following objects are masked from 'package:S4Vectors':

    first, intersect, rename, setdiff, setequal, union

The following objects are masked from 'package:BiocGenerics':

    combine, intersect, setdiff, union

The following objects are masked from 'package:stats':

    filter, lag

The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union

> 
> # Use 2 threads on github actions and CRAN, 4 threads locally
> IS_LOCAL  <- Sys.getenv("IS_LOCAL") != ""
> NTHREADS  <- ifelse(IS_LOCAL, 4, 2)
> NITER     <- ifelse(IS_LOCAL, 4, 2)
> NSEQS     <- 10000
> MAXSEQLEN <- 200
> CHARSET   <- "ACGT"
> 
> random_strings <- function(N, charset = "abcdefghijklmnopqrstuvwxyz") {
+   len <- sample(0:MAXSEQLEN, N, replace=TRUE)
+   result <- lapply(0:MAXSEQLEN, function(x) {
+     nx <- sum(len == x)
+     stringfish::random_strings(nx, x, charset = charset, vector_mode = "normal")
+   })
+   sample(unlist(result))
+ }
> 
> mutate_strings <- function(x, prob = 0.025, indel_prob = 0.025, charset = "abcdefghijklmnopqrstuvwxyz") {
+   charset <- unlist(strsplit(charset, ""))
+   xsplit <- strsplit(x, "")
+   sapply(xsplit, function(a) {
+     r <- runif(length(a)) < prob
+     a[r] <- sample(charset, sum(r), replace=TRUE)
+     ins <- runif(length(a)) < indel_prob
+     a[ins] <- paste0(sample(charset, sum(ins), replace=TRUE), sample(charset, sum(ins), replace=TRUE))
+     del <- runif(length(a)) < indel_prob
+     a[del] <- ""
+     paste0(a, collapse = "")
+   })
+ }
> 
> # Biostrings notes:
> # subject (target) must be of length 1 or equal to pattern (query)
> # To get a distance matrix, iterate over target and perform a column bind
> # special_zero_case -- if both query and target are empty, Biostrings fails with an error
> pairwiseAlignmentFix <- function(pattern, subject, ...) {
+     results <- rep(0, length(subject))
+     special_zero_case <- nchar(pattern) == 0 & nchar(subject) == 0
+     if(all(special_zero_case)) {
+       results
+     } else {
+       results[!special_zero_case] <- Biostrings::pairwiseAlignment(pattern=pattern[!special_zero_case], subject=subject[!special_zero_case], ...)
+       results
+     }
+ }
> 
> biostrings_matrix_global <- function(query, target, cost_matrix, gap_cost, gap_open_cost = 0) {
+   substitutionMatrix <- -cost_matrix
+   lapply(query, function(x) {
+     query2 <- rep(x, length(target))
+     -pairwiseAlignmentFix(pattern=query2, subject=target, substitutionMatrix = substitutionMatrix, gapOpening=gap_open_cost, gapExtension=gap_cost, scoreOnly=TRUE, type="global")
+   }) %>% do.call(rbind, .)
+ }
> 
> biostrings_pairwise_global <- function(query, target, cost_matrix, gap_cost, gap_open_cost = 0) {
+   substitutionMatrix <- -cost_matrix
+   -pairwiseAlignment(pattern=query, subject=target, substitutionMatrix = substitutionMatrix,gapOpening=gap_open_cost, gapExtension=gap_cost, scoreOnly=TRUE, type="global")
+ }
> 
> biostrings_matrix_anchored <- function(query, target, query_size, target_size, cost_matrix, gap_cost, gap_open_cost = 0) {
+   substitutionMatrix <- -cost_matrix
+   lapply(seq_along(query), function(i) {
+     query2 <- substring(query[i], 1, query_size[i,,drop=TRUE])
+     target2 <- substring(target, 1, target_size[i,,drop=TRUE])
+     -pairwiseAlignmentFix(pattern=query2, subject=target2, substitutionMatrix = substitutionMatrix, gapOpening=gap_open_cost, gapExtension=gap_cost, scoreOnly=TRUE, type="global")
+   }) %>% do.call(rbind, .)
+ }
> 
> biostrings_pairwise_anchored <- function(query, target, query_size, target_size, cost_matrix, gap_cost, gap_open_cost = 0) {
+   substitutionMatrix <- -cost_matrix
+   query2 <- substring(query, 1, query_size)
+   target2 <- substring(target, 1, target_size)
+   -pairwiseAlignmentFix(pattern=query2, subject=target2, substitutionMatrix = substitutionMatrix, gapOpening=gap_open_cost, gapExtension=gap_cost, scoreOnly=TRUE, type="global")
+ }
> 
> for(. in 1:NITER) {
+ 
+     print("Checking hamming search correctness")
+     local({
+       # Note: seqtrie returns `NA_integer_` for hamming distance when the lengths are different
+       # whereas stringdist returns `Inf`
+       # This is why we need to replace `NA_integer_` with `Inf` when comparing results
+ 
+       target <- c(random_strings(NSEQS, CHARSET),"") %>% unique
+       query <- sample(c(sample(target, NSEQS/1000), random_strings(NSEQS/1000, CHARSET)))
+       query <- c(mutate_strings(query, indel_prob=0, charset = CHARSET), "") %>% unique
+ 
+       # Check matrix results
+       results_seqtrie <- dist_matrix(query, target, mode = "hamming", nthreads=NTHREADS)
+       results_seqtrie[is.na(results_seqtrie)] <- Inf
+       results_stringdist <- stringdist::stringdistmatrix(query, target, method = "hamming", nthread=NTHREADS)
+       stopifnot(all(results_seqtrie == results_stringdist))
+ 
+       # Check pairwise results
+       query_pairwise <- mutate_strings(target, prob=0.025, indel_prob=0.05, charset = CHARSET)
+       results_seqtrie <- dist_pairwise(query_pairwise, target, mode = "hamming", nthreads=NTHREADS)
+       results_seqtrie[is.na(results_seqtrie)] <- Inf
+       results_stringdist <- stringdist::stringdist(query_pairwise, target, method = "hamming", nthread=NTHREADS)
+       stopifnot(all(results_seqtrie == results_stringdist))
+     })
+ 
+     print("Checking levenshtein search correctness")
+     local({
+       target <- c(random_strings(NSEQS, CHARSET),"") %>% unique
+       query <- sample(c(sample(target, NSEQS/1000), random_strings(NSEQS/1000, CHARSET)))
+       query <- c(mutate_strings(query, indel_prob=0, charset = CHARSET), "") %>% unique
+ 
+       # Check matrix results
+       results_seqtrie <- dist_matrix(query, target, mode = "levenshtein", nthreads=NTHREADS)
+       results_stringdist <- stringdist::stringdistmatrix(query, target, method = "lv", nthread=NTHREADS)
+       stopifnot(all(results_seqtrie == results_stringdist))
+ 
+       # Check pairwise results
+       query_pairwise <- mutate_strings(target, prob=0.025, indel_prob=0.05, charset = CHARSET)
+       results_seqtrie <- dist_pairwise(query_pairwise, target, mode = "levenshtein", nthreads=NTHREADS)
+       results_stringdist <- stringdist::stringdist(query_pairwise, target, method = "lv", nthread=NTHREADS)
+       stopifnot(all(results_seqtrie == results_stringdist))
+     })
+ 
+     print("Checking anchored search correctness")
+     local({
+       # There is no anchored search in stringdist (or elsewhere). To get the same results, we substring the query and target sequences
+       # By the results of the seqtrie anchored search and then compare the results
+ 
+       target <- c(random_strings(NSEQS, CHARSET),"") %>% unique
+       query <- sample(c(sample(target, NSEQS/1000), random_strings(NSEQS/1000, CHARSET)))
+       query <- c(mutate_strings(query, indel_prob=0, charset = CHARSET), "") %>% unique
+ 
+       # Check matrix results
+       results_seqtrie <- dist_matrix(query, target, mode = "anchored", nthreads=NTHREADS)
+       query_size <- attr(results_seqtrie, "query_size")
+       target_size <- attr(results_seqtrie, "target_size")
+       results_stringdist <- lapply(seq_along(query), function(i) {
+         query_size2 <- query_size[i,,drop=TRUE]
+         target_size2 <- target_size[i,,drop=TRUE]
+         query2 <- substring(query[i], 1, query_size2) # query[i] is recycled
+         target2 <- substring(target, 1, target_size2)
+         stringdist::stringdist(query2, target2, method = "lv", nthread=NTHREADS)
+       }) %>% do.call(rbind, .)
+       stopifnot(all(results_seqtrie == results_stringdist))
+ 
+       # Check pairwise results
+       query_pairwise <- mutate_strings(target, prob=0.025, indel_prob=0.05, charset = CHARSET)
+       results_seqtrie <- dist_pairwise(query_pairwise, target, mode = "anchored", nthreads=NTHREADS)
+       query_size <- attr(results_seqtrie, "query_size")
+       target_size <- attr(results_seqtrie, "target_size")
+       query2 <- substring(query_pairwise, 1, query_size)
+       target2 <- substring(target, 1, target_size)
+       results_stringdist <- stringdist::stringdist(query2, target2, method = "lv", nthread=NTHREADS)
+       stopifnot(all(results_seqtrie == results_stringdist))
+     })
+ 
+     print("Checking global search with linear gap for correctness")
+     local({
+       target <- c(random_strings(NSEQS, CHARSET),"") %>% unique
+       query <- sample(c(sample(target, NSEQS/1000), random_strings(NSEQS/1000, CHARSET)))
+       query <- c(mutate_strings(query, indel_prob=0, charset = CHARSET), "") %>% unique
+ 
+       # Check matrix results
+       cost_matrix <- matrix(sample(1:3, size = nchar(CHARSET)^2, replace=TRUE), nrow=nchar(CHARSET))
+       diag(cost_matrix) <- 0
+       colnames(cost_matrix) <- rownames(cost_matrix) <- strsplit(CHARSET, "")[[1]]
+       gap_cost <- sample(1:3, size = 1)
+       results_seqtrie <- dist_matrix(query, target, mode = "levenshtein", cost_matrix = cost_matrix, gap_cost = gap_cost, nthreads=NTHREADS)
+       results_biostrings <- biostrings_matrix_global(query, target, cost_matrix = cost_matrix, gap_cost = gap_cost)
+       stopifnot(all(results_seqtrie == results_biostrings))
+ 
+       # Check pairwise results
+       query_pairwise <- mutate_strings(target, prob=0.025, indel_prob=0.05, charset = CHARSET)
+       results_seqtrie <- dist_pairwise(query_pairwise, target, mode = "levenshtein", cost_matrix = cost_matrix, gap_cost = gap_cost, nthreads=NTHREADS)
+       results_biostrings <- biostrings_pairwise_global(query_pairwise, target, cost_matrix = cost_matrix, gap_cost = gap_cost)
+       stopifnot(all(results_seqtrie == results_biostrings))
+     })
+ 
+     print("Checking anchored search with linear gap for correctness")
+     local({
+       target <- c(random_strings(NSEQS, CHARSET),"") %>% unique
+       query <- sample(c(sample(target, NSEQS/1000), random_strings(NSEQS/1000, CHARSET)))
+       query <- c(mutate_strings(query, indel_prob=0, charset = CHARSET), "") %>% unique
+ 
+       # Check matrix results
+       cost_matrix <- matrix(sample(1:3, size = nchar(CHARSET)^2, replace=TRUE), nrow=nchar(CHARSET))
+       diag(cost_matrix) <- 0
+       colnames(cost_matrix) <- rownames(cost_matrix) <- strsplit(CHARSET, "")[[1]]
+       gap_cost <- sample(1:3, size = 1)
+       results_seqtrie <- dist_matrix(query, target, mode = "anchored", cost_matrix = cost_matrix, gap_cost = gap_cost, nthreads=NTHREADS)
+       query_size <- attr(results_seqtrie, "query_size")
+       target_size <- attr(results_seqtrie, "target_size")
+       results_biostrings <- biostrings_matrix_anchored(query, target, query_size, target_size, cost_matrix = cost_matrix, gap_cost = gap_cost)
+       stopifnot(all(results_seqtrie == results_biostrings))
+ 
+       # Check pairwise results
+       query_pairwise <- mutate_strings(target, prob=0.025, indel_prob=0.05, charset = CHARSET)
+       results_seqtrie <- dist_pairwise(query_pairwise, target, mode = "anchored", cost_matrix = cost_matrix, gap_cost = gap_cost, nthreads=NTHREADS)
+       query_size <- attr(results_seqtrie, "query_size")
+       target_size <- attr(results_seqtrie, "target_size")
+       results_biostrings <- biostrings_pairwise_anchored(query_pairwise, target, query_size, target_size, cost_matrix = cost_matrix, gap_cost = gap_cost)
+       stopifnot(all(results_seqtrie == results_biostrings))
+     })
+ 
+ 
+     print("Checking global search with affine gap for correctness")
+     local({
+       target <- c(random_strings(NSEQS, CHARSET),"") %>% unique
+       query <- sample(c(sample(target, NSEQS/1000), random_strings(NSEQS/1000, CHARSET)))
+       query <- c(mutate_strings(query, indel_prob=0, charset = CHARSET), "") %>% unique
+ 
+       # Check matrix results
+       cost_matrix <- matrix(sample(1:3, size = nchar(CHARSET)^2, replace=TRUE), nrow=nchar(CHARSET))
+       diag(cost_matrix) <- 0
+       colnames(cost_matrix) <- rownames(cost_matrix) <- strsplit(CHARSET, "")[[1]]
+       gap_cost <- sample(1:3, size = 1)
+       gap_open_cost <- sample(1:3, size = 1)
+       results_seqtrie <- dist_matrix(query, target, mode = "levenshtein", cost_matrix = cost_matrix, gap_cost = gap_cost, gap_open_cos=gap_open_cost, nthreads=NTHREADS)
+       results_biostrings <- biostrings_matrix_global(query, target, cost_matrix = cost_matrix, gap_cost = gap_cost, gap_open_cost=gap_open_cost)
+       stopifnot(all(results_seqtrie == results_biostrings))
+ 
+       # Check pairwise results
+       query_pairwise <- mutate_strings(target, prob=0.025, indel_prob=0.05, charset = CHARSET)
+       results_seqtrie <- dist_pairwise(query_pairwise, target, mode = "levenshtein", cost_matrix = cost_matrix, gap_cost = gap_cost, gap_open_cost=gap_open_cost, nthreads=NTHREADS)
+       results_biostrings <- biostrings_pairwise_global(query_pairwise, target, cost_matrix = cost_matrix, gap_cost = gap_cost, gap_open_cost=gap_open_cost)
+       stopifnot(all(results_seqtrie == results_biostrings))
+     })
+ 
+     print("Checking anchored search with affine gap for correctness")
+     local({
+       target <- c(random_strings(NSEQS, CHARSET),"") %>% unique
+       query <- sample(c(sample(target, NSEQS/1000), random_strings(NSEQS/1000, CHARSET)))
+       query <- c(mutate_strings(query, indel_prob=0, charset = CHARSET), "") %>% unique
+ 
+       # Check matrix results
+       cost_matrix <- matrix(sample(1:3, size = nchar(CHARSET)^2, replace=TRUE), nrow=nchar(CHARSET))
+       diag(cost_matrix) <- 0
+       colnames(cost_matrix) <- rownames(cost_matrix) <- strsplit(CHARSET, "")[[1]]
+       gap_cost <- sample(1:3, size = 1)
+       gap_open_cost <- sample(1:3, size = 1)
+       results_seqtrie <- dist_matrix(query, target, mode = "anchored", cost_matrix = cost_matrix, gap_cost = gap_cost, gap_open_cost=gap_open_cost, nthreads=NTHREADS)
+       query_size <- attr(results_seqtrie, "query_size")
+       target_size <- attr(results_seqtrie, "target_size")
+       results_biostrings <- biostrings_matrix_anchored(query, target, query_size, target_size, cost_matrix = cost_matrix, gap_cost = gap_cost, gap_open_cost=gap_open_cost)
+       stopifnot(all(results_seqtrie == results_biostrings))
+ 
+       # Check pairwise results
+       query_pairwise <- mutate_strings(target, prob=0.025, indel_prob=0.05, charset = CHARSET)
+       results_seqtrie <- dist_pairwise(query_pairwise, target, mode = "anchored", cost_matrix = cost_matrix, gap_cost = gap_cost, gap_open_cost=gap_open_cost, nthreads=NTHREADS)
+       query_size <- attr(results_seqtrie, "query_size")
+       target_size <- attr(results_seqtrie, "target_size")
+       results_biostrings <- biostrings_pairwise_anchored(query_pairwise, target, query_size, target_size, cost_matrix = cost_matrix, gap_cost = gap_cost, gap_open_cost=gap_open_cost)
+       stopifnot(all(results_seqtrie == results_biostrings))
+     })
+ }
[1] "Checking hamming search correctness"
[1] "Checking levenshtein search correctness"
[1] "Checking anchored search correctness"
[1] "Checking global search with linear gap for correctness"
[1] "Checking anchored search with linear gap for correctness"
[1] "Checking global search with affine gap for correctness"
[1] "Checking anchored search with affine gap for correctness"
[1] "Checking hamming search correctness"
[1] "Checking levenshtein search correctness"
[1] "Checking anchored search correctness"
[1] "Checking global search with linear gap for correctness"
[1] "Checking anchored search with linear gap for correctness"
[1] "Checking global search with affine gap for correctness"
[1] "Checking anchored search with affine gap for correctness"
> 
> proc.time()
   user  system elapsed 
 576.15    1.90  328.51