R version 4.6.0 RC (2026-04-20 r89921 ucrt) -- "Because it was There" Copyright (C) 2026 The R Foundation for Statistical Computing Platform: x86_64-w64-mingw32/x64 R is free software and comes with ABSOLUTELY NO WARRANTY. You are welcome to redistribute it under certain conditions. Type 'license()' or 'licence()' for distribution details. R is a collaborative project with many contributors. Type 'contributors()' for more information and 'citation()' on how to cite R or R packages in publications. Type 'demo()' for some demos, 'help()' for on-line help, or 'help.start()' for an HTML browser interface to help. Type 'q()' to quit R. > suppressPackageStartupMessages(library(stringfish)) > > regression_external_source <- ' + // [[Rcpp::depends(stringfish)]] + // [[Rcpp::plugins(cpp17)]] + #include + #include "sf_external.h" + using namespace Rcpp; + + // [[Rcpp::export]] + List sf_external_regressions(SEXP x, int nthreads = 1) { + return List::create( + _["nchar"] = sf_nchar(x, "chars", nthreads), + _["compare"] = sf_compare(x, x, nthreads), + _["substr"] = sf_substr(x, IntegerVector::create(1), IntegerVector::create(1), nthreads), + _["random"] = random_strings(4, IntegerVector::create(3), "ab", "normal") + ); + } + + // [[Rcpp::export]] + IntegerVector sf_external_encoding_regressions(SEXP x) { + if(TYPEOF(x) != STRSXP || Rf_xlength(x) != 1) { + stop("x must be a character vector of length 1"); + } + const char native_utf8[] = "\\xC3\\xA9"; + RStringIndexer direct_indexer(x); + + SEXP vec_bytes = PROTECT(sf_vector(1)); + sf_vec_data_ref(vec_bytes)[0] = sfstring("abc", 3, CE_BYTES); + RStringIndexer vec_bytes_indexer(vec_bytes); + + SEXP store_bytes = PROTECT(slice_store_create(1)); + slice_store_data_ref(store_bytes).assign(0, "abc", 3, CE_BYTES); + RStringIndexer store_bytes_indexer(store_bytes); + + SEXP vec_native = PROTECT(sf_vector(1)); + sf_vec_data_ref(vec_native)[0] = sfstring(native_utf8, 2, CE_NATIVE); + RStringIndexer vec_native_indexer(vec_native); + + SEXP store_native = PROTECT(slice_store_create(1)); + slice_store_data_ref(store_native).assign(0, native_utf8, 2, CE_NATIVE); + RStringIndexer store_native_indexer(store_native); + + IntegerVector out = IntegerVector::create( + _["reinterpret_ascii_bytes"] = static_cast(reinterpret_input_encoding("abc", 3, CE_BYTES)), + _["sfstring_ascii_bytes"] = static_cast(sfstring("abc", 3, CE_BYTES).encoding), + _["sf_vec_ascii_bytes"] = static_cast(vec_bytes_indexer.getCharLenCE(0).enc), + _["slice_store_ascii_bytes"] = static_cast(store_bytes_indexer.getCharLenCE(0).enc), + _["reinterpret_utf8_native"] = static_cast(reinterpret_input_encoding(native_utf8, 2, CE_NATIVE)), + _["sfstring_utf8_native"] = static_cast(sfstring(native_utf8, 2, CE_NATIVE).encoding), + _["sf_vec_utf8_native"] = static_cast(vec_native_indexer.getCharLenCE(0).enc), + _["slice_store_utf8_native"] = static_cast(store_native_indexer.getCharLenCE(0).enc), + _["direct_utf8_native"] = static_cast(direct_indexer.getCharLenCE(0).enc) + ); + UNPROTECT(4); + return out; + } + + // [[Rcpp::export]] + IntegerVector sf_external_get_encodings(SEXP x) { + RStringIndexer indexer(x); + const size_t len = indexer.size(); + IntegerVector out(len); + for(size_t i = 0; i < len; ++i) { + out[static_cast(i)] = static_cast(indexer.getCharLenCE(i).enc); + } + return out; + } + + // [[Rcpp::export]] + List sf_external_make_storage_vectors() { + const char utf8_e[] = "\\xC3\\xA9"; + const char latin1_e[] = "\\xE9"; + + SEXP vec = PROTECT(sf_vector(6)); + sf_vec_data & vec_data = sf_vec_data_ref(vec); + vec_data[0] = sfstring("abc", 3, cetype_t_ext::CE_ASCII); + vec_data[1] = sfstring(utf8_e, 2, cetype_t_ext::CE_ASCII_OR_UTF8); + vec_data[2] = sfstring(utf8_e, 2, cetype_t_ext::CE_NATIVE); + vec_data[3] = sfstring(latin1_e, 1, cetype_t_ext::CE_LATIN1); + vec_data[4] = sfstring("abc", 3, cetype_t_ext::CE_BYTES); + vec_data[5] = sfstring(NA_STRING); + + SEXP store = PROTECT(slice_store_create_with_size(6, 256)); + slice_store_data & store_data = slice_store_data_ref(store); + store_data.assign(0, "abc", 3, cetype_t_ext::CE_ASCII); + store_data.assign(1, utf8_e, 2, cetype_t_ext::CE_ASCII_OR_UTF8); + store_data.assign(2, utf8_e, 2, cetype_t_ext::CE_NATIVE); + store_data.assign(3, latin1_e, 1, cetype_t_ext::CE_LATIN1); + store_data.assign(4, "abc", 3, cetype_t_ext::CE_BYTES); + store_data.assign(5, nullptr, 0, cetype_t_ext::CE_NA); + + List out = List::create( + _["vec"] = vec, + _["slice"] = store + ); + UNPROTECT(2); + return out; + } + + // [[Rcpp::export]] + List sf_external_slice_store_state(SEXP x) { + slice_store_data & data = slice_store_data_ref(x); + return List::create( + _["initial_slice_size"] = data.initial_slice_size_override.has_value() ? + Rcpp::wrap(static_cast(*data.initial_slice_size_override)) : + Rcpp::wrap(NA_INTEGER), + _["slice_count"] = static_cast(data.slices.size()), + _["current_slice_capacity"] = static_cast(data.current_slice_capacity) + ); + } + ' > > is_solaris <- function() { + grepl("SunOS", Sys.info()[["sysname"]]) + } > > can_source_cpp <- !is_solaris() > > compile_regressions <- function() { + if (!can_source_cpp) { + return(invisible(NULL)) + } + if (exists("sf_external_regressions", mode = "function", inherits = FALSE)) { + return(invisible(NULL)) + } + + R_TESTS <- Sys.getenv("R_TESTS") + if (nzchar(R_TESTS)) { + R_TESTS_absolute <- normalizePath(R_TESTS) + Sys.setenv(R_TESTS = R_TESTS_absolute) + } + Rcpp::sourceCpp(code = regression_external_source) + if (nzchar(R_TESTS)) { + Sys.setenv(R_TESTS = R_TESTS) + } + invisible(NULL) + } > > encode_native_uint <- function(x, size) { + remaining <- as.numeric(x) + out <- as.raw(integer(size)) + for (i in seq_len(size)) { + byte <- as.integer(remaining %% 256) + out[[if (.Platform$endian == "little") i else size - i + 1L]] <- as.raw(byte) + remaining <- floor(remaining / 256) + } + out + } > > find_raw_subsequence <- function(haystack, needle) { + stopifnot(is.raw(haystack), is.raw(needle), length(needle) > 0L) + limit <- length(haystack) - length(needle) + 1L + if (limit < 1L) { + stop("raw subsequence longer than input") + } + for (i in seq_len(limit)) { + if (identical(haystack[i:(i + length(needle) - 1L)], needle)) { + return(i) + } + } + stop("raw subsequence not found") + } > > mutate_serialized_state <- function(x, expected_state, mutate) { + serialized <- serialize(x, NULL, xdr = FALSE) + start <- find_raw_subsequence(serialized, expected_state) + state_idx <- start:(start + length(expected_state) - 1L) + mutated_state <- mutate(serialized[state_idx]) + stopifnot(is.raw(mutated_state), length(mutated_state) == length(expected_state)) + serialized[state_idx] <- mutated_state + serialized + } > > run_regressions <- function(nthreads = 1L) { + x <- convert_to_sf_vector(c("abcdef", "ghijkl")) + y <- sf_substr(x, c(1L, 2L), 1L, nthreads = nthreads) + stopifnot(string_identical(y, c("a", ""))) + materialize(x) + y2 <- sf_substr(x, c(1L, 2L), 1L, nthreads = nthreads) + stopifnot(string_identical(y2, c("a", ""))) + + err <- tryCatch({ + stringfish:::sf_assign(sf_vector(2), 3, "x") + NULL + }, error = identity) + stopifnot(inherits(err, "error")) + + collapsed <- sf_collapse(c("a", NA_character_), ",") + stopifnot(is.character(collapsed), length(collapsed) == 1L, is.na(collapsed[[1]])) + stopifnot(identical(sf_paste(nthreads = nthreads), "")) + stopifnot(identical(sf_paste("a", "b", sep = NA_character_, nthreads = nthreads), NA_character_)) + stopifnot(identical(sf_collapse(c("a", "b"), NA_character_), NA_character_)) + + stopifnot(string_identical(sf_tolower(c("ABC", NA_character_)), c("abc", NA_character_))) + stopifnot(string_identical(sf_toupper(c("abc", NA_character_)), c("ABC", NA_character_))) + + stopifnot(identical(sf_compare("a", c("a", "b", NA_character_), nthreads = nthreads), c(TRUE, FALSE, NA))) + stopifnot(identical(sf_compare(c("a", "b", NA_character_), "a", nthreads = nthreads), c(TRUE, FALSE, NA))) + + latin1_word <- iconv("café", "UTF-8", "latin1") + Encoding(latin1_word) <- "latin1" + bytes_word <- latin1_word + Encoding(bytes_word) <- "bytes" + normalized <- convert_to_sf_vector(latin1_word) + stopifnot(identical(normalized, "café"), identical(Encoding(normalized), "UTF-8")) + stopifnot(identical(sf_compare(c("café", bytes_word, "café"), latin1_word, nthreads = nthreads), c(TRUE, TRUE, TRUE))) + + tmp_latin1 <- tempfile(fileext = ".txt") + con <- file(tmp_latin1, open = "wb") + writeBin(c(charToRaw(latin1_word), as.raw(0x0a)), con) + close(con) + on.exit(unlink(tmp_latin1), add = TRUE) + read_back <- sf_readLines(tmp_latin1, encoding = "latin1") + stopifnot(identical(read_back, "café"), identical(Encoding(read_back), "UTF-8")) + + err <- tryCatch({ + sf_grepl("a", c("a", "b"), nthreads = nthreads) + NULL + }, error = identity) + stopifnot(inherits(err, "error")) + err <- tryCatch({ + sf_split("a", c("a", "b"), nthreads = nthreads) + NULL + }, error = identity) + stopifnot(inherits(err, "error")) + err <- tryCatch({ + sf_gsub("a", "a", c("x", "y"), nthreads = nthreads) + NULL + }, error = identity) + stopifnot(inherits(err, "error")) + + subset_x <- convert_to_sf_vector(c("a", "b", "c")) + stopifnot(string_identical(subset_x[c(3L, 1L, NA_integer_, 4L)], c("c", "a", NA_character_, NA_character_))) + stopifnot(get_string_type(subset_x[c(3L, 1L)]) == "stringfish vector") + materialize(subset_x) + stopifnot(string_identical(subset_x[c(3L, 1L, NA_integer_, 4L)], c("c", "a", NA_character_, NA_character_))) + stopifnot(get_string_type(subset_x[c(3L, 1L)]) == "stringfish vector") + + stopifnot(identical( + materialize(sf_split("abc", "(?<=a)", nthreads = nthreads)[[1L]]), + strsplit("abc", "(?<=a)", perl = TRUE)[[1L]] + )) + stopifnot(identical( + materialize(sf_split("abc", "^", nthreads = nthreads)[[1L]]), + c("", "abc") + )) + stopifnot(identical( + materialize(sf_split("abc", "$", nthreads = nthreads)[[1L]]), + c("abc", "") + )) + + tmp_bytes <- tempfile(fileext = ".txt") + on.exit(unlink(tmp_bytes), add = TRUE) + bytes_word <- rawToChar(as.raw(0xE9)) + Encoding(bytes_word) <- "bytes" + sf_writeLines(bytes_word, tmp_bytes, encode_mode = "UTF-8") + stopifnot(identical(readBin(tmp_bytes, "raw", n = 2L), as.raw(c(0xE9, 0x0A)))) + + if (can_source_cpp) { + out <- sf_external_regressions(c("ab", NA_character_), nthreads) + stopifnot(identical(out$nchar, c(2L, NA_integer_))) + stopifnot(identical(out$compare, c(TRUE, NA))) + stopifnot(string_identical(out$substr, c("a", NA_character_))) + stopifnot(length(out$random) == 4L) + + native_word <- "é" + Encoding(native_word) <- "unknown" + stopifnot(identical(Encoding(native_word), "unknown")) + enc <- sf_external_encoding_regressions(native_word) + stopifnot(identical(unname(enc), c(3L, 3L, 3L, 3L, 0L, 0L, 0L, 0L, 0L))) + + sized_store <- stringfish:::slice_store_create_with_size(4L, 256L) + sized_state <- sf_external_slice_store_state(sized_store) + stopifnot(identical(sized_state$initial_slice_size, 256L)) + stopifnot(identical(sized_state$slice_count, 1L)) + stopifnot(identical(sized_state$current_slice_capacity, 256L)) + + stored <- sf_external_make_storage_vectors() + vec_before <- sf_external_get_encodings(stored$vec) + slice_before <- sf_external_get_encodings(stored$slice) + vec_after <- unserialize(serialize(stored$vec, NULL)) + slice_after <- unserialize(serialize(stored$slice, NULL)) + stopifnot(identical(sf_external_get_encodings(vec_after), vec_before)) + stopifnot(identical(sf_external_get_encodings(slice_after), slice_before)) + stopifnot(identical(get_string_type(vec_after), "stringfish vector")) + stopifnot(identical(get_string_type(slice_after), "stringfish slice store")) + + serialized_state_abc <- c( + encode_native_uint(1L, 8L), + encode_native_uint(3L, 4L), + as.raw(254L), + charToRaw("abc") + ) + truncated_serialized <- mutate_serialized_state( + convert_to_sf_vector("abc"), + serialized_state_abc, + function(state) { + state[[1L]] <- as.raw(0x02) + state + } + ) + invalid_encoding_serialized <- mutate_serialized_state( + convert_to_slice_store("abc"), + serialized_state_abc, + function(state) { + state[[13L]] <- as.raw(0x04) + state + } + ) + err <- tryCatch({ + unserialize(truncated_serialized) + NULL + }, error = identity) + stopifnot(inherits(err, "error")) + err <- tryCatch({ + unserialize(invalid_encoding_serialized) + NULL + }, error = identity) + stopifnot(inherits(err, "error")) + stopifnot(grepl("ALTREP", conditionMessage(err), fixed = TRUE)) + } + } > > compile_regressions() > run_regressions(1L) > if (stringfish:::is_tbb()) { + run_regressions(4L) + } > > proc.time() user system elapsed 0.75 0.14 9.17