# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

skip_if_not_available("utf8proc")
skip_if_not_available("acero")
# Skip these tests on CRAN due to build times > 10 mins
skip_on_cran()

library(dplyr, warn.conflicts = FALSE)
library(lubridate)
library(stringr)
library(stringi)

# NOTE(review): compare_dplyr_binding(), expect_arrow_eval_error(),
# call_binding(), example_data and verses are defined outside this file.
# compare_dplyr_binding() presumably evaluates the `.input` pipeline on both a
# plain R data frame and Arrow data and expects equal results -- confirm
# against the helper's definition.

tbl <- example_data
# Add some better string data
tbl$verses <- verses[[1]]
# c(" a ", " b ", " c ", ...) increasing padding
# nchar = 3 5 7 9 11 13 15 17 19 21
tbl$padded_strings <- stringr::str_pad(letters[1:10], width = 2 * (1:10) + 1, side = "both")
tbl$some_grouping <- rep(c(1, 2), 5)

test_that("paste, paste0, and str_c", {
  df <- tibble(
    v = c("A", "B", "C"),
    w = c("a", "b", "c"),
    x = c("d", NA_character_, "f"),
    y = c(NA_character_, "h", "i"),
    z = c(1.1, 2.2, NA)
  )
  x <- Expression$field_ref("x")
  y <- Expression$field_ref("y")

  # no NAs in data
  compare_dplyr_binding(
    .input %>%
      transmute(
        a = paste(v, w),
        a2 = base::paste(v, w)
      ) %>%
      collect(),
    df
  )
  compare_dplyr_binding(
    .input %>%
      transmute(paste(v, w, sep = "-")) %>%
      collect(),
    df
  )
  compare_dplyr_binding(
    .input %>%
      transmute(
        a = paste0(v, w),
        a2 = base::paste0(v, w)
      ) %>%
      collect(),
    df
  )
  compare_dplyr_binding(
    .input %>%
      transmute(
        a = str_c(v, w),
        a2 = stringr::str_c(v, w)
      ) %>%
      collect(),
    df
  )
  compare_dplyr_binding(
    .input %>%
      transmute(str_c(v, w, sep = "+")) %>%
      collect(),
    df
  )

  # NAs in data
  compare_dplyr_binding(
    .input %>%
      transmute(paste(x, y)) %>%
      collect(),
    df
  )
  compare_dplyr_binding(
    .input %>%
      transmute(paste(x, y, sep = "-")) %>%
      collect(),
    df
  )
  compare_dplyr_binding(
    .input %>%
      transmute(str_c(x, y)) %>%
      collect(),
    df
  )

  # non-character column in dots
  compare_dplyr_binding(
    .input %>%
      transmute(paste0(x, y, z)) %>%
      collect(),
    df
  )

  # literal string in dots
  compare_dplyr_binding(
    .input %>%
      transmute(paste(x, "foo", y)) %>%
      collect(),
    df
  )

  # literal NA in dots
  compare_dplyr_binding(
    .input %>%
      transmute(paste(x, NA, y)) %>%
      collect(),
    df
  )

  # expressions in dots
  compare_dplyr_binding(
    .input %>%
      transmute(paste0(x, toupper(y), as.character(z))) %>%
      collect(),
    df
  )

  # sep is literal NA
  # errors in paste() (consistent with base::paste())
  expect_error(
    call_binding("paste", x, y, sep = NA_character_),
    "Invalid separator"
  )
  # str_c() errors too; per the original note here, stringr itself does so as
  # of its late-2022 release
  expect_error(
    call_binding("str_c", x, y, sep = NA_character_),
    "`sep` must be a single string, not `NA`."
  )

  # sep passed in dots to paste0 (which doesn't take a sep argument)
  compare_dplyr_binding(
    .input %>%
      transmute(paste0(x, y, sep = "-")) %>%
      collect(),
    df
  )

  # known differences

  # arrow allows the separator to be an array
  expect_equal(
    df %>%
      Table$create() %>%
      transmute(result = paste(x, y, sep = w)) %>%
      collect(),
    df %>%
      transmute(result = paste0(x, w, y))
  )

  # expected errors

  # collapse argument not supported
  expect_arrow_eval_error(
    paste(chr, int, collapse = ""),
    "`collapse` argument not supported in Arrow",
    class = "arrow_not_supported"
  )
  expect_arrow_eval_error(
    paste0(chr, int, collapse = ""),
    "`collapse` argument not supported in Arrow",
    class = "arrow_not_supported"
  )
  expect_arrow_eval_error(
    str_c(chr, int, collapse = ""),
    "`collapse` argument not supported in Arrow",
    class = "arrow_not_supported"
  )

  # literal vectors of length != 1 not supported
  expect_arrow_eval_error(
    paste(chr, character(0), int),
    "Literal vectors of length != 1 in string concatenation not supported in Arrow",
    class = "arrow_not_supported"
  )
  expect_arrow_eval_error(
    paste(chr, c(",", ";"), int),
    "Literal vectors of length != 1 in string concatenation not supported in Arrow",
    class = "arrow_not_supported"
  )
})

test_that("grepl with ignore.case = FALSE and fixed = TRUE", {
  df <- tibble(x = c("Foo", "bar", NA_character_))
  compare_dplyr_binding(
    .input %>%
      filter(grepl("o", x, fixed = TRUE)) %>%
      collect(),
    df
  )
  compare_dplyr_binding(
    .input %>%
      mutate(x = grepl("o", x, fixed = TRUE)) %>%
      collect(),
    df
  )
})

test_that("sub and gsub with ignore.case = FALSE and fixed = TRUE", {
  df <- tibble(x = c("Foo", "bar"))
  compare_dplyr_binding(
    .input %>%
      transmute(x = sub("Foo", "baz", x, fixed = TRUE)) %>%
      collect(),
    df
  )
  compare_dplyr_binding(
    .input %>%
      transmute(x = gsub("o", "u", x, fixed = TRUE)) %>%
      collect(),
    df
  )
})

# many of the remainder of these tests require RE2
skip_if_not_available("re2")

test_that("grepl", {
  df <- tibble(x = c("Foo", "bar", NA_character_))

  # run every case with both literal (fixed) and regex matching
  for (fixed in c(TRUE, FALSE)) {
    compare_dplyr_binding(
      .input %>%
        filter(grepl("Foo", x, fixed = fixed)) %>%
        collect(),
      df
    )
    compare_dplyr_binding(
      .input %>%
        transmute(x = grepl("^B.+", x, ignore.case = FALSE, fixed = fixed)) %>%
        collect(),
      df
    )
    compare_dplyr_binding(
      .input %>%
        filter(grepl("Foo", x, ignore.case = FALSE, fixed = fixed)) %>%
        collect(),
      df
    )
    # with namespacing
    compare_dplyr_binding(
      .input %>%
        filter(base::grepl("Foo", x, fixed = fixed)) %>%
        collect(),
      df
    )
  }
})

test_that("grepl with ignore.case = TRUE and fixed = TRUE", {
  df <- tibble(x = c("Foo", "bar", NA_character_))

  # base::grepl() ignores ignore.case = TRUE with a warning when fixed = TRUE,
  # so we can't use compare_dplyr_binding() for these tests
  expect_equal(
    df %>%
      Table$create() %>%
      filter(grepl("O", x, ignore.case = TRUE, fixed = TRUE)) %>%
      collect(),
    tibble(x = "Foo")
  )
  expect_equal(
    df %>%
      Table$create() %>%
      filter(grepl("^B.+", x, ignore.case = TRUE, fixed = TRUE)) %>%
      collect(),
    tibble(x = character(0))
  )
  expect_equal(
    df %>%
      Table$create() %>%
      mutate(
        a = grepl("O", x, ignore.case = TRUE, fixed = TRUE)
      ) %>%
      collect(),
    tibble(
      x = c("Foo", "bar", NA_character_),
      a = c(TRUE, FALSE, FALSE)
    )
  )
})

test_that("str_detect", {
  df <- tibble(x = c("Foo", "bar", NA_character_))

  compare_dplyr_binding(
    .input %>%
      filter(str_detect(x, regex("^F"))) %>%
      collect(),
    df
  )

  # pattern supplied through a variable rather than a literal
  string <- "^F"
  compare_dplyr_binding(
    .input %>%
      filter(str_detect(x, regex(string))) %>%
      collect(),
    df
  )
  compare_dplyr_binding(
    .input %>%
      transmute(
        a = str_detect(x, regex("^f[A-Z]{2}", ignore_case = TRUE)),
        a2 = stringr::str_detect(x, regex("^f[A-Z]{2}", ignore_case = TRUE))
      ) %>%
      collect(),
    df
  )
  compare_dplyr_binding(
    .input %>%
      transmute(x = str_detect(x, regex("^f[A-Z]{2}", ignore_case = TRUE), negate = TRUE)) %>%
      collect(),
    df
  )
  compare_dplyr_binding(
    .input %>%
      filter(str_detect(x, fixed("o"))) %>%
      collect(),
    df
  )
  compare_dplyr_binding(
    .input %>%
      filter(str_detect(x, fixed("O"))) %>%
      collect(),
    df
  )
  compare_dplyr_binding(
    .input %>%
      filter(str_detect(x, fixed("O", ignore_case = TRUE))) %>%
      collect(),
    df
  )
  compare_dplyr_binding(
    .input %>%
      filter(str_detect(x, fixed("O", ignore_case = TRUE), negate = TRUE)) %>%
      collect(),
    df
  )
})

test_that("sub and gsub", {
  df <- tibble(x = c("Foo", "bar"))

  # run every case with both literal (fixed) and regex matching
  for (fixed in c(TRUE, FALSE)) {
    compare_dplyr_binding(
      .input %>%
        transmute(x = sub("Foo", "baz", x, fixed = fixed)) %>%
        collect(),
      df
    )
    compare_dplyr_binding(
      .input %>%
        transmute(x = sub("^B.+", "baz", x, ignore.case = FALSE, fixed = fixed)) %>%
        collect(),
      df
    )
    compare_dplyr_binding(
      .input %>%
        transmute(x = sub("Foo", "baz", x, ignore.case = FALSE, fixed = fixed)) %>%
        collect(),
      df
    )
    # exercise gsub() (replace-all) as well, so the test covers what its name
    # promises for both literal and regex matching
    compare_dplyr_binding(
      .input %>%
        transmute(x = gsub("o", "u", x, fixed = fixed)) %>%
        collect(),
      df
    )
  }
})

test_that("sub and gsub with ignore.case = TRUE and fixed = TRUE", {
  df <- tibble(x = c("Foo", "bar"))

  # base::sub() and base::gsub() ignore ignore.case = TRUE with a warning when
  # fixed = TRUE, so we can't use compare_dplyr_binding() for these tests
  expect_equal(
    df %>%
      Table$create() %>%
      transmute(x = sub("O", "u", x, ignore.case = TRUE, fixed = TRUE)) %>%
      collect(),
    tibble(x = c("Fuo", "bar"))
  )
  expect_equal(
    df %>%
      Table$create() %>%
      transmute(x = gsub("o", "u", x, ignore.case = TRUE, fixed = TRUE)) %>%
      collect(),
    tibble(x = c("Fuu", "bar"))
  )
  expect_equal(
    df %>%
      Table$create() %>%
      transmute(x = sub("^B.+", "baz", x, ignore.case = TRUE, fixed = TRUE)) %>%
      collect(),
    df # unchanged
  )
})

test_that("sub and gsub with namespacing", {
  compare_dplyr_binding(
    .input %>%
      mutate(verses_new = base::gsub("o", "u", verses, fixed = TRUE)) %>%
      collect(),
    tbl
  )
  compare_dplyr_binding(
    .input %>%
      mutate(verses_new = base::sub("o", "u", verses, fixed = TRUE)) %>%
      collect(),
    tbl
  )
})

test_that("str_replace and str_replace_all", {
  x <- Expression$field_ref("x")

  # pattern and replacement must each be a single string
  expect_error(
    call_binding("str_replace_all", x, c("F" = "_", "b" = "")),
    regexp = "`pattern` must be a length 1 character vector"
  )
  expect_error(
    call_binding("str_replace_all", x, c("F", "b"), c("_", "")),
    regexp = "`pattern` must be a length 1 character vector"
  )
  expect_error(
    call_binding("str_replace_all", x, c("F"), c("_", "")),
    regexp = "`replacement` must be a length 1 character vector"
  )

  df <- tibble(x = c("Foo", "bar"))

  compare_dplyr_binding(
    .input %>%
      transmute(x = str_replace_all(x, "^F", "baz")) %>%
      collect(),
    df
  )
  compare_dplyr_binding(
    .input %>%
      transmute(x = str_replace_all(x, regex("^F"), "baz")) %>%
      collect(),
    df
  )
  compare_dplyr_binding(
    .input %>%
      mutate(x = str_replace(x, "^F[a-z]{2}", "baz")) %>%
      collect(),
    df
  )
  compare_dplyr_binding(
    .input %>%
      transmute(x = str_replace(x, regex("^f[A-Z]{2}", ignore_case = TRUE), "baz")) %>%
      collect(),
    df
  )
  compare_dplyr_binding(
    .input %>%
      transmute(
        x = str_replace_all(x, fixed("o"), "u"),
        x2 = stringr::str_replace_all(x, fixed("o"), "u")
      ) %>%
      collect(),
    df
  )
  compare_dplyr_binding(
    .input %>%
      transmute(
        x = str_replace(x, fixed("O"), "u"),
        x2 = stringr::str_replace(x, fixed("O"), "u")
      ) %>%
      collect(),
    df
  )
  compare_dplyr_binding(
    .input %>%
      transmute(x = str_replace(x, fixed("O", ignore_case = TRUE), "u")) %>%
      collect(),
    df
  )
})

test_that("strsplit and str_split", {
  df <- tibble(x = c("Foo and bar", "baz and qux and quux"))

  compare_dplyr_binding(
    .input %>%
      mutate(x = strsplit(x, "and")) %>%
      collect(),
    df,
    # `ignore_attr = TRUE` because the vctr coming back from arrow (ListArray)
    # has type information in it, but it's just a bare list from R/dplyr.
    ignore_attr = TRUE
  )
  compare_dplyr_binding(
    .input %>%
      mutate(x = strsplit(x, "and.*", fixed = TRUE)) %>%
      collect(),
    df,
    ignore_attr = TRUE
  )
  compare_dplyr_binding(
    .input %>%
      mutate(
        a = strsplit(x, " +and +"),
        a2 = base::strsplit(x, " +and +")
      ) %>%
      collect(),
    df,
    ignore_attr = TRUE
  )
  compare_dplyr_binding(
    .input %>%
      mutate(
        a = str_split(x, "and"),
        a2 = stringr::str_split(x, "and")
      ) %>%
      collect(),
    df,
    ignore_attr = TRUE
  )
  compare_dplyr_binding(
    .input %>%
      mutate(x = str_split(x, "and", n = 2)) %>%
      collect(),
    df,
    ignore_attr = TRUE
  )
  compare_dplyr_binding(
    .input %>%
      mutate(x = str_split(x, fixed("and"), n = 2)) %>%
      collect(),
    df,
    ignore_attr = TRUE
  )
  compare_dplyr_binding(
    .input %>%
      mutate(x = str_split(x, regex("and"), n = 2)) %>%
      collect(),
    df,
    ignore_attr = TRUE
  )
  compare_dplyr_binding(
    .input %>%
      mutate(x = str_split(x, "Foo|bar", n = 2)) %>%
      collect(),
    df,
    ignore_attr = TRUE
  )
})

test_that("strrep and str_dup", {
  df <- tibble(x = c("foo1", " \tB a R\n", "!apACHe aRroW!"))

  # includes times = 0
  for (times in 0:8) {
    compare_dplyr_binding(
      .input %>%
        mutate(x = strrep(x, times)) %>%
        collect(),
      df
    )
    compare_dplyr_binding(
      .input %>%
        mutate(x = str_dup(x, times)) %>%
        collect(),
      df
    )
  }
})

test_that("str_to_lower, str_to_upper, and str_to_title", {
  df <- tibble(x = c("foo1", " \tB a R\n", "!apACHe aRroW!"))
  compare_dplyr_binding(
    .input %>%
      transmute(
        x_lower = str_to_lower(x),
        x_upper = str_to_upper(x),
        x_title = str_to_title(x),
        x_lower_nmspc = stringr::str_to_lower(x),
        x_upper_nmspc = stringr::str_to_upper(x),
        x_title_nmspc = stringr::str_to_title(x)
      ) %>%
      collect(),
    df
  )

  # Error checking a single function because they all use the same code path.
  expect_arrow_eval_error(
    str_to_lower("Apache Arrow", locale = "sp"),
    paste(
      "Providing a value for 'locale' other than the default ('en') not supported in Arrow",
      "> To change locale, use 'Sys.setlocale()'",
      sep = "\n"
    ),
    fixed = TRUE,
    class = "arrow_not_supported"
  )
})

test_that("arrow_*_split_whitespace functions", {
  # use only ASCII whitespace characters
  df_ascii <- tibble(x = c("Foo\nand bar", "baz\tand qux and quux"))

  # use only non-ASCII whitespace characters
  df_utf8 <- tibble(x = c("Foo\u00A0and\u2000bar", "baz\u2006and\u1680qux\u3000and\u2008quux"))

  df_split <- tibble(x = list(c("Foo", "and", "bar"), c("baz", "and", "qux", "and", "quux")))

  # use default option values
  expect_equal(
    df_ascii %>%
      Table$create() %>%
      mutate(x = arrow_ascii_split_whitespace(x)) %>%
      collect(),
    df_split,
    ignore_attr = TRUE
  )
  expect_equal(
    df_utf8 %>%
      Table$create() %>%
      mutate(x = arrow_utf8_split_whitespace(x)) %>%
      collect(),
    df_split,
    ignore_attr = TRUE
  )

  # specify non-default option values
  expect_equal(
    df_ascii %>%
      Table$create() %>%
      mutate(
        x = arrow_ascii_split_whitespace(x, options = list(max_splits = 1, reverse = TRUE))
      ) %>%
      collect(),
    tibble(x = list(c("Foo\nand", "bar"), c("baz\tand qux and", "quux"))),
    ignore_attr = TRUE
  )
  expect_equal(
    df_utf8 %>%
      Table$create() %>%
      mutate(
        x = arrow_utf8_split_whitespace(x, options = list(max_splits = 1, reverse = TRUE))
      ) %>%
      collect(),
    tibble(x = list(c("Foo\u00A0and", "bar"), c("baz\u2006and\u1680qux\u3000and", "quux"))),
    ignore_attr = TRUE
  )
})

test_that("errors and warnings in string splitting", {
  # These conditions generate an error, but abandon_ship() catches the error,
  # issues a warning, and pulls the data into R (if computing on InMemoryDataset)
  # Elsewhere we test that abandon_ship() works,
  # so here we can just call the functions directly

  x <- Expression$field_ref("x")
  expect_error(
    call_binding("str_split", x, fixed("and", ignore_case = TRUE)),
    "Case-insensitive string splitting not supported in Arrow"
  )
  expect_error(
    call_binding("str_split", x, coll("and.?")),
    "Pattern modifier `coll()` not supported in Arrow",
    fixed = TRUE
  )
  expect_error(
    call_binding("str_split", x, boundary(type = "word")),
    "Pattern modifier `boundary()` not supported in Arrow",
    fixed = TRUE
  )
  expect_error(
    call_binding("str_split", x, "and", n = 0),
    "Splitting strings into zero parts not supported in Arrow"
  )

  # This condition generates a warning
  expect_warning(
    call_binding("str_split", x, fixed("and"), simplify = TRUE),
    "Argument 'simplify = TRUE' will be ignored"
  )
})

test_that("errors and warnings in string detection and replacement", {
  x <- Expression$field_ref("x")

  expect_error(
    call_binding("str_detect", x, boundary(type = "character")),
    "Pattern modifier `boundary()` not supported in Arrow",
    fixed = TRUE
  )
  expect_error(
    call_binding("str_replace_all", x, coll("o", locale = "en"), "ó"),
    "Pattern modifier `coll()` not supported in Arrow",
    fixed = TRUE
  )

  # This condition generates a warning
  expect_warning(
    call_binding("str_replace_all", x, regex("o", multiline = TRUE), "u"),
    "Ignoring pattern modifier argument not supported in Arrow: \"multiline\""
  )
})

test_that("backreferences in pattern in string detection", {
  skip("RE2 does not support backreferences in pattern (https://github.com/google/re2/issues/101)")
  df <- tibble(x = c("Foo", "bar"))

  compare_dplyr_binding(
    .input %>%
      filter(str_detect(x, regex("F([aeiou])\\1"))) %>%
      collect(),
    df
  )
})

test_that("backreferences (substitutions) in string replacement", {
  df <- tibble(x = c("Foo", "bar"))

  compare_dplyr_binding(
    .input %>%
      transmute(desc = sub(
        "(?:https?|ftp)://([^/\r\n]+)(/[^\r\n]*)?",
        "path `\\2` on server `\\1`",
        url
      )) %>%
      collect(),
    tibble(url = "https://arrow.apache.org/docs/r/")
  )
  compare_dplyr_binding(
    .input %>%
      transmute(x = str_replace(x, "^(\\w)o(.*)", "\\1\\2p")) %>%
      collect(),
    df
  )
  # same substitution with a case-insensitive regex() modifier
  # (this assertion used to appear twice, byte-for-byte; one copy is enough)
  compare_dplyr_binding(
    .input %>%
      transmute(x = str_replace(x, regex("^(\\w)o(.*)", ignore_case = TRUE), "\\1\\2p")) %>%
      collect(),
    df
  )
})

test_that("edge cases in string detection and replacement", {
  # in case-insensitive fixed match/replace, test that "\\E" in the search
  # string and backslashes in the replacement string are interpreted literally.
  # this test does not use compare_dplyr_binding() because base::sub() and
  # base::grepl() do not support ignore.case = TRUE when fixed = TRUE.
  expect_equal(
    tibble(x = c("\\Q\\e\\D")) %>%
      Table$create() %>%
      filter(grepl("\\E", x, ignore.case = TRUE, fixed = TRUE)) %>%
      collect(),
    tibble(x = c("\\Q\\e\\D"))
  )
  expect_equal(
    tibble(x = c("\\Q\\e\\D")) %>%
      Table$create() %>%
      transmute(x = sub("\\E", "\\L", x, ignore.case = TRUE, fixed = TRUE)) %>%
      collect(),
    tibble(x = c("\\Q\\L\\D"))
  )

  # test that a user's "(?i)" prefix does not break the "(?i)" prefix that's
  # added in case-insensitive regex match/replace
  compare_dplyr_binding(
    .input %>%
      filter(grepl("(?i)^[abc]{3}$", x, ignore.case = TRUE, fixed = FALSE)) %>%
      collect(),
    tibble(x = c("ABC"))
  )
  compare_dplyr_binding(
    .input %>%
      transmute(x = sub("(?i)^[abc]{3}$", "123", x, ignore.case = TRUE, fixed = FALSE)) %>%
      collect(),
    tibble(x = c("ABC"))
  )
})

test_that("arrow_find_substring and arrow_find_substring_regex", {
  df <- tibble(x = c("Foo and Bar", "baz and qux and quux"))

  # expected values are 0-based positions, with -1 where nothing matches
  expect_equal(
    df %>%
      Table$create() %>%
      mutate(x = arrow_find_substring(x, options = list(pattern = "b"))) %>%
      collect(),
    tibble(x = c(-1, 0))
  )
  expect_equal(
    df %>%
      Table$create() %>%
      mutate(x = arrow_find_substring(
        x,
        options = list(pattern = "b", ignore_case = TRUE)
      )) %>%
      collect(),
    tibble(x = c(8, 0))
  )
  expect_equal(
    df %>%
      Table$create() %>%
      mutate(x = arrow_find_substring_regex(
        x,
        options = list(pattern = "^[fb]")
      )) %>%
      collect(),
    tibble(x = c(-1, 0))
  )
  expect_equal(
    df %>%
      Table$create() %>%
      mutate(x = arrow_find_substring_regex(
        x,
        options = list(pattern = "[AEIOU]", ignore_case = TRUE)
      )) %>%
      collect(),
    tibble(x = c(1, 1))
  )
})

test_that("stri_reverse and arrow_ascii_reverse functions", {
  df_ascii <- tibble(x = c("Foo\nand bar", "baz\tand qux and quux"))

  df_utf8 <- tibble(x = c("Foo\u00A0\u0061nd\u00A0bar", "\u0062az\u00A0and\u00A0qux\u3000and\u00A0quux"))

  compare_dplyr_binding(
    .input %>%
      mutate(x = stri_reverse(x)) %>%
      collect(),
    df_utf8
  )
  compare_dplyr_binding(
    .input %>%
      mutate(x = stri_reverse(x)) %>%
      collect(),
    df_ascii
  )
  expect_equal(
    df_ascii %>%
      Table$create() %>%
      mutate(x = arrow_ascii_reverse(x)) %>%
      collect(),
    tibble(x = c("rab dna\nooF", "xuuq dna xuq dna\tzab"))
  )

  # the ASCII-only kernel must reject non-ASCII input
  expect_error(
    df_utf8 %>%
      Table$create() %>%
      mutate(x = arrow_ascii_reverse(x)) %>%
      collect(),
    "Invalid: Non-ASCII sequence in input"
  )
})

test_that("str_like", {
  df <- tibble(x = c("Foo and bar", "baz and qux and quux"))

  # No match - entire string
  compare_dplyr_binding(
    .input %>%
      mutate(x = str_like(x, "baz")) %>%
      collect(),
    df
  )

  # with namespacing
  compare_dplyr_binding(
    .input %>%
      mutate(x = stringr::str_like(x, "baz")) %>%
      collect(),
    df
  )

  # Match - entire string
  compare_dplyr_binding(
    .input %>%
      mutate(x = str_like(x, "Foo and bar")) %>%
      collect(),
    df
  )

  # Wildcard, ignoring case
  compare_dplyr_binding(
    .input %>%
      mutate(x = str_like(x, "f%", ignore_case = TRUE)) %>%
      collect(),
    df
  )

  # Wildcard, case-sensitive
  compare_dplyr_binding(
    .input %>%
      mutate(x = str_like(x, "f%", ignore_case = FALSE)) %>%
      collect(),
    df
  )

  # Single character
  compare_dplyr_binding(
    .input %>%
      mutate(x = str_like(x, "_a%")) %>%
      collect(),
    df
  )

  # Wildcards on both sides
  compare_dplyr_binding(
    .input %>%
      mutate(x = str_like(x, "%baz%")) %>%
      collect(),
    df
  )
})

test_that("str_pad", {
  df <- tibble(x = c("Foo and bar", "baz and qux and quux"))

  compare_dplyr_binding(
    .input %>%
      mutate(x = str_pad(x, width = 31)) %>%
      collect(),
    df
  )
  compare_dplyr_binding(
    .input %>%
      mutate(x = str_pad(x, width = 30, side = "right")) %>%
      collect(),
    df
  )
  compare_dplyr_binding(
    .input %>%
      mutate(x = str_pad(x, width = 31, side = "left", pad = "+")) %>%
      collect(),
    df
  )
  # width shorter than both input strings
  compare_dplyr_binding(
    .input %>%
      mutate(x = str_pad(x, width = 10, side = "left", pad = "+")) %>%
      collect(),
    df
  )
  compare_dplyr_binding(
    .input %>%
      mutate(
        a = str_pad(x, width = 31, side = "both"),
        a2 = stringr::str_pad(x, width = 31, side = "both")
      ) %>%
      collect(),
    df
  )
})

test_that("substr with string()", {
  df <- tibble(x = "Apache Arrow")

  compare_dplyr_binding(
    .input %>%
      mutate(y = substr(x, 1, 6)) %>%
      collect(),
    df
  )
  compare_dplyr_binding(
    .input %>%
      mutate(y = substr(x, 0, 6)) %>%
      collect(),
    df
  )
  compare_dplyr_binding(
    .input %>%
      mutate(y = substr(x, -1, 6)) %>%
      collect(),
    df
  )
  compare_dplyr_binding(
    .input %>%
      mutate(y = substr(x, 6, 1)) %>%
      collect(),
    df
  )
  compare_dplyr_binding(
    .input %>%
      mutate(y = substr(x, -1, -2)) %>%
      collect(),
    df
  )
  compare_dplyr_binding(
    .input %>%
      mutate(y = substr(x, 9, 6)) %>%
      collect(),
    df
  )
  # (a second, byte-identical `substr(x, 1, 6)` assertion was removed here)
  compare_dplyr_binding(
    .input %>%
      mutate(y = substr(x, 8, 12)) %>%
      collect(),
    df
  )
  compare_dplyr_binding(
    .input %>%
      mutate(
        y = substr(x, -5, -1),
        y2 = base::substr(x, -5, -1)
      ) %>%
      collect(),
    df
  )

  # start and stop must each be length 1
  expect_arrow_eval_error(
    substr("Apache Arrow", c(1, 2), 3),
    "`start` must be length 1 - other lengths not supported in Arrow",
    class = "arrow_not_supported"
  )
  expect_arrow_eval_error(
    substr("Apache Arrow", 1, c(2, 3)),
    "`stop` must be length 1 - other lengths not supported in Arrow",
    class = "arrow_not_supported"
  )
})

test_that("substr with binary()", {
  batch <- record_batch(x = list(charToRaw("Apache Arrow")))

  # Check a field reference input
  expect_identical(
    batch %>%
      transmute(y = substr(x, 1, 3)) %>%
      collect() %>%
      # because of the arrow_binary class
      mutate(y = unclass(y)),
    tibble::tibble(y = list(charToRaw("Apa")))
  )

  # Check a Scalar input
  scalar <- Scalar$create(batch$x)
  expect_identical(
    batch %>%
      transmute(y = substr(scalar, 1, 3)) %>%
      collect() %>%
      # because of the arrow_binary class
      mutate(y = unclass(y)),
    tibble::tibble(y = list(charToRaw("Apa")))
  )
})
test_that("substring", {
  # binding for substring just calls call_binding("substr", ...),
  # tested extensively above
  df <- tibble(x = "Apache Arrow")

  compare_dplyr_binding(
    .input %>%
      mutate(
        y = substring(x, 1, 6),
        y2 = base::substring(x, 1, 6)
      ) %>%
      collect(),
    df
  )
})

test_that("str_sub", {
  df <- tibble(x = "Apache Arrow")

  compare_dplyr_binding(
    .input %>%
      mutate(y = str_sub(x, 1, 6)) %>%
      collect(),
    df
  )
  compare_dplyr_binding(
    .input %>%
      mutate(y = str_sub(x, 0, 6)) %>%
      collect(),
    df
  )
  compare_dplyr_binding(
    .input %>%
      mutate(y = str_sub(x, -1, 6)) %>%
      collect(),
    df
  )
  compare_dplyr_binding(
    .input %>%
      mutate(y = str_sub(x, 6, 1)) %>%
      collect(),
    df
  )
  compare_dplyr_binding(
    .input %>%
      mutate(y = str_sub(x, -1, -2)) %>%
      collect(),
    df
  )
  compare_dplyr_binding(
    .input %>%
      mutate(y = str_sub(x, -1, 3)) %>%
      collect(),
    df
  )
  compare_dplyr_binding(
    .input %>%
      mutate(y = str_sub(x, 9, 6)) %>%
      collect(),
    df
  )
  # (a second, byte-identical `str_sub(x, 1, 6)` assertion was removed here)
  compare_dplyr_binding(
    .input %>%
      mutate(y = str_sub(x, 8, 12)) %>%
      collect(),
    df
  )
  compare_dplyr_binding(
    .input %>%
      mutate(
        y = str_sub(x, -5, -1),
        y2 = stringr::str_sub(x, -5, -1)
      ) %>%
      collect(),
    df
  )
  compare_dplyr_binding(
    .input %>%
      mutate(
        y = str_sub(x, 1, -3),
        y2 = stringr::str_sub(x, 1, -3)
      ) %>%
      collect(),
    df
  )

  # start and end must each be length 1
  expect_arrow_eval_error(
    str_sub("Apache Arrow", c(1, 2), 3),
    "`start` must be length 1 - other lengths not supported in Arrow",
    class = "arrow_not_supported"
  )
  expect_arrow_eval_error(
    str_sub("Apache Arrow", 1, c(2, 3)),
    "`end` must be length 1 - other lengths not supported in Arrow",
    class = "arrow_not_supported"
  )
})

test_that("str_starts, str_ends, startsWith, endsWith", {
  df <- tibble(x = c("Foo", "bar", "baz", "qux", NA_character_))

  # str_starts(): regex pattern, negation, and fixed() patterns
  compare_dplyr_binding(
    .input %>%
      filter(str_starts(x, "b.*")) %>%
      collect(),
    df
  )
  compare_dplyr_binding(
    .input %>%
      filter(str_starts(x, "b.*", negate = TRUE)) %>%
      collect(),
    df
  )
  compare_dplyr_binding(
    .input %>%
      filter(str_starts(x, fixed("b.*"))) %>%
      collect(),
    df
  )
  compare_dplyr_binding(
    .input %>%
      filter(str_starts(x, fixed("b"))) %>%
      collect(),
    df
  )
  compare_dplyr_binding(
    .input %>%
      transmute(
        a = str_starts(x, "b.*"),
        a2 = stringr::str_starts(x, "b.*"),
        b = str_starts(x, "b.*", negate = TRUE),
        c = str_starts(x, fixed("b")),
        d = str_starts(x, fixed("b"), negate = TRUE)
      ) %>%
      collect(),
    df
  )

  # str_ends(): regex pattern, negation, and fixed() patterns
  compare_dplyr_binding(
    .input %>%
      filter(str_ends(x, "r")) %>%
      collect(),
    df
  )
  compare_dplyr_binding(
    .input %>%
      filter(str_ends(x, "r", negate = TRUE)) %>%
      collect(),
    df
  )
  compare_dplyr_binding(
    .input %>%
      filter(str_ends(x, fixed("r$"))) %>%
      collect(),
    df
  )
  compare_dplyr_binding(
    .input %>%
      filter(str_ends(x, fixed("r"))) %>%
      collect(),
    df
  )
  compare_dplyr_binding(
    .input %>%
      transmute(
        a = str_ends(x, "r"),
        a2 = stringr::str_ends(x, "r"),
        b = str_ends(x, "r", negate = TRUE),
        c = str_ends(x, fixed("r")),
        d = str_ends(x, fixed("r"), negate = TRUE)
      ) %>%
      collect(),
    df
  )

  # base startsWith() / endsWith()
  compare_dplyr_binding(
    .input %>%
      filter(startsWith(x, "b")) %>%
      collect(),
    df
  )
  compare_dplyr_binding(
    .input %>%
      filter(endsWith(x, "r")) %>%
      collect(),
    df
  )
  compare_dplyr_binding(
    .input %>%
      filter(startsWith(x, "b.*")) %>%
      collect(),
    df
  )
  compare_dplyr_binding(
    .input %>%
      filter(endsWith(x, "r$")) %>%
      collect(),
    df
  )
  compare_dplyr_binding(
    .input %>%
      transmute(
        a = startsWith(x, "b"),
        b = endsWith(x, "r"),
        a2 = base::startsWith(x, "b"),
        b2 = base::endsWith(x, "r")
      ) %>%
      collect(),
    df
  )
})

test_that("str_count", {
  df <- tibble(
    cities = c("Kolkata", "Dar es Salaam", "Tel Aviv", "San Antonio", "Cluj Napoca", "Bern", "Bogota"),
    dots = c("a.", "...", ".a.a", "a..a.", "ab...", "dse....", ".f..d..")
  )

  compare_dplyr_binding(
    .input %>%
      mutate(
        a_count = str_count(cities, pattern = "a"),
        a_count_nmspc = stringr::str_count(cities, pattern = "a")
      ) %>%
      collect(),
    df
  )
  compare_dplyr_binding(
    .input %>%
      mutate(p_count = str_count(cities, pattern = "d")) %>%
      collect(),
    df
  )
  compare_dplyr_binding(
    .input %>%
      mutate(p_count = str_count(cities,
        pattern = regex("d", ignore_case = TRUE)
      )) %>%
      collect(),
    df
  )
  compare_dplyr_binding(
    .input %>%
      mutate(e_count = str_count(cities, pattern = "u")) %>%
      collect(),
    df
  )

  # call_binding("str_count", ) is not vectorised over pattern
  compare_dplyr_binding(
    .input %>%
      mutate(let_count = str_count(cities, pattern = c("a", "b", "e", "g", "p", "n", "s"))) %>%
      collect(),
    df,
    warning = TRUE
  )

  # "." as a regex versus "." as a fixed() literal
  compare_dplyr_binding(
    .input %>%
      mutate(dots_count = str_count(dots, ".")) %>%
      collect(),
    df
  )
  compare_dplyr_binding(
    .input %>%
      mutate(dots_count = str_count(dots, fixed("."))) %>%
      collect(),
    df
  )
})

test_that("base::tolower and base::toupper", {
  compare_dplyr_binding(
    .input %>%
      mutate(
        verse_to_upper = toupper(verses),
        verse_to_lower = tolower(verses),
        verse_to_upper_nmspc = base::toupper(verses),
        verse_to_lower_nmspc = base::tolower(verses)
      ) %>%
      collect(),
    tbl
  )
})

test_that("namespaced unary and binary string functions", {
  # str_length and stringi::stri_reverse
  compare_dplyr_binding(
    .input %>%
      mutate(
        verse_length = stringr::str_length(verses),
        reverses_verse = stringi::stri_reverse(verses)
      ) %>%
      collect(),
    tbl
  )

  # stringr::str_dup and base::strrep
  df <- tibble(x = c("foo1", " \tB a R\n", "!apACHe aRroW!"))
  for (times in 0:8) {
    compare_dplyr_binding(
      .input %>%
        mutate(x = base::strrep(x, times)) %>%
        collect(),
      df
    )
    compare_dplyr_binding(
      .input %>%
        mutate(x = stringr::str_dup(x, times)) %>%
        collect(),
      df
    )
  }
})

test_that("nchar with namespacing", {
  compare_dplyr_binding(
    .input %>%
      mutate(verses_nchar = base::nchar(verses)) %>%
      collect(),
    tbl
  )
})

test_that("str_trim()", {
  compare_dplyr_binding(
    .input %>%
      mutate(
        left_trim_padded_string = str_trim(padded_strings, "left"),
        right_trim_padded_string = str_trim(padded_strings, "right"),
        both_trim_padded_string = str_trim(padded_strings, "both"),
        left_trim_padded_string_nmspc = stringr::str_trim(padded_strings, "left"),
        right_trim_padded_string_nmspc = stringr::str_trim(padded_strings, "right"),
        both_trim_padded_string_nmspc = stringr::str_trim(padded_strings, "both")
      ) %>%
      collect(),
    tbl
  )
})

test_that("str_remove and str_remove_all", {
  df <- tibble(x = c("Foo", "bar"))

  compare_dplyr_binding(
    .input %>%
      transmute(x = str_remove_all(x, "^F")) %>%
      collect(),
    df
  )
  compare_dplyr_binding(
    .input %>%
      transmute(x = str_remove_all(x, regex("^F"))) %>%
      collect(),
    df
  )
  compare_dplyr_binding(
    .input %>%
      mutate(x = str_remove(x, "^F[a-z]{2}")) %>%
      collect(),
    df
  )
  compare_dplyr_binding(
    .input %>%
      transmute(x = str_remove(x, regex("^f[A-Z]{2}", ignore_case = TRUE))) %>%
      collect(),
    df
  )
  compare_dplyr_binding(
    .input %>%
      transmute(
        x = str_remove_all(x, fixed("o")),
        x2 = stringr::str_remove_all(x, fixed("o"))
      ) %>%
      collect(),
    df
  )
  compare_dplyr_binding(
    .input %>%
      transmute(
        x = str_remove(x, fixed("O")),
        x2 = stringr::str_remove(x, fixed("O"))
      ) %>%
      collect(),
    df
  )
  compare_dplyr_binding(
    .input %>%
      transmute(x = str_remove(x, fixed("O", ignore_case = TRUE))) %>%
      collect(),
    df
  )
})

test_that("GH-36720: stringr modifier functions can be called with namespace prefix", {
  df <- tibble(x = c("Foo", "bar"))

  compare_dplyr_binding(
    .input %>%
      transmute(x = str_replace_all(x, stringr::regex("^f", ignore_case = TRUE), "baz")) %>%
      collect(),
    df
  )
  compare_dplyr_binding(
    .input %>%
      filter(str_detect(x, stringr::fixed("f", ignore_case = TRUE), negate = TRUE)) %>%
      collect(),
    df
  )

  # unsupported modifiers must still be rejected when namespace-qualified
  x <- Expression$field_ref("x")
  expect_error(
    call_binding("str_detect", x, stringr::boundary(type = "character")),
    "Pattern modifier `boundary()` not supported in Arrow",
    fixed = TRUE
  )
  expect_error(
    call_binding("str_replace_all", x, stringr::coll("o", locale = "en"), "ó"),
    "Pattern modifier `coll()` not supported in Arrow",
    fixed = TRUE
  )
})