# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

library(dplyr, warn.conflicts = FALSE)
library(stringr)

skip_if_not_available("acero")

tbl <- example_data
# Add some better string data
tbl$verses <- verses[[1]]
# c(" a ", " b ", " c ", ...) increasing padding
# nchar =  3  5  7  9 11 13 15 17 19 21
tbl$padded_strings <- stringr::str_pad(letters[1:10], width = 2 * (1:10) + 1, side = "both")
tbl$another_chr <- tail(letters, 10)

test_that("basic select/filter/collect", {
  batch <- record_batch(tbl)

  b2 <- batch %>%
    select(int, chr) %>%
    filter(int > 5)

  expect_s3_class(b2, "arrow_dplyr_query")
  t2 <- collect(b2)
  expect_equal(t2, tbl[tbl$int > 5 & !is.na(tbl$int), c("int", "chr")])

  # Test that the original object is not affected
  expect_identical(collect(batch), tbl)
})

test_that("dim() on query", {
  compare_dplyr_binding(
    .input %>%
      filter(int > 5) %>%
      select(int, chr) %>%
      dim(),
    tbl
  )
})

test_that("Print method", {
  expect_output(
    record_batch(tbl) %>%
      filter(dbl > 2, chr == "d" | chr == "f") %>%
      select(chr, int, lgl) %>%
      filter(int < 5) %>%
      select(int, chr) %>%
      print(),
    'RecordBatch (query)
int: int32
chr: string

* Filter: (((dbl > 2) and ((chr == "d") or (chr == "f"))) and (int < 5))
See $.data for the source Arrow object',
    fixed = TRUE
  )
})

test_that("pull", {
  compare_dplyr_binding(
    .input %>% pull() %>% as.vector(),
    tbl
  )
  compare_dplyr_binding(
    .input %>% pull(1) %>% as.vector(),
    tbl
  )
  compare_dplyr_binding(
    .input %>% pull(chr) %>% as.vector(),
    tbl
  )
  compare_dplyr_binding(
    .input %>%
      filter(int > 4) %>%
      rename(strng = chr) %>%
      group_by(dbl) %>%
      pull(strng) %>%
      as.vector(),
    tbl
  )
})

test_that("pull() shows a deprecation warning if the option isn't set", {
  expect_warning(
    vec <- tbl %>%
      arrow_table() %>%
      pull(as_vector = NULL),
    "Current behavior of returning an R vector is deprecated"
  )
  # And the default is the old behavior, an R vector
  expect_identical(vec, pull(tbl))
})

test_that("collect(as_data_frame=FALSE)", {
  batch <- record_batch(tbl)

  b1 <- batch %>% collect(as_data_frame = FALSE)
  expect_r6_class(b1, "RecordBatch")

  b2 <- batch %>%
    select(int, chr) %>%
    filter(int > 5) %>%
    collect(as_data_frame = FALSE)
  # collect(as_data_frame = FALSE) always returns Table now
  expect_r6_class(b2, "Table")
  expected <- tbl[tbl$int > 5 & !is.na(tbl$int), c("int", "chr")]
  expect_equal_data_frame(b2, expected)

  b3 <- batch %>%
    select(int, strng = chr) %>%
    filter(int > 5) %>%
    collect(as_data_frame = FALSE)
  expect_r6_class(b3, "Table")
  expect_equal_data_frame(b3, set_names(expected, c("int", "strng")))

  b4 <- batch %>%
    select(int, strng = chr) %>%
    filter(int > 5) %>%
    group_by(int) %>%
    collect(as_data_frame = FALSE)
  expect_r6_class(b4, "Table")
  expect_equal_data_frame(
    b4,
    expected %>%
      rename(strng = chr) %>%
      group_by(int)
  )
})

test_that("compute()", {
  batch <- record_batch(tbl)

  b1 <- batch %>% compute()
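  # With no query steps applied, compute() leaves the data as a RecordBatch;
  # once select()/filter() have been applied, compute() evaluates the query
  # and returns a Table, as the expectations below verify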
  expect_r6_class(b1, "RecordBatch")

  b2 <- batch %>%
    select(int, chr) %>%
    filter(int > 5) %>%
    compute()
  expect_r6_class(b2, "Table")
  expected <- tbl[tbl$int > 5 & !is.na(tbl$int), c("int", "chr")]
  expect_equal_data_frame(b2, expected)

  b3 <- batch %>%
    select(int, strng = chr) %>%
    filter(int > 5) %>%
    compute()
  expect_r6_class(b3, "Table")
  expect_equal_data_frame(b3, set_names(expected, c("int", "strng")))

  b4 <- batch %>%
    select(int, strng = chr) %>%
    filter(int > 5) %>%
    group_by(int) %>%
    compute()
  expect_r6_class(b4, "Table")
  expect_equal_data_frame(
    b4,
    expected %>%
      rename(strng = chr) %>%
      group_by(int)
  )
})

test_that("head", {
  compare_dplyr_binding(
    .input %>%
      select(int, strng = chr) %>%
      filter(int > 5) %>%
      group_by(int) %>%
      head(2) %>%
      collect(),
    tbl
  )
  # This would fail if we evaluated head() after filter()
  compare_dplyr_binding(
    .input %>%
      select(int, strng = chr) %>%
      arrange(int) %>%
      head(2) %>%
      filter(int > 5) %>%
      mutate(twice = int * 2) %>%
      collect(),
    tbl
  )
})

test_that("arrange then head returns the right data (ARROW-14162)", {
  compare_dplyr_binding(
    .input %>%
      # mpg has ties so we need to sort by two things to get deterministic order
      arrange(mpg, disp) %>%
      head(4) %>%
      collect(),
    tibble::as_tibble(mtcars)
  )
})

test_that("arrange then tail returns the right data", {
  compare_dplyr_binding(
    .input %>%
      # mpg has ties so we need to sort by two things to get deterministic order
      arrange(mpg, disp) %>%
      tail(4) %>%
      collect(),
    tibble::as_tibble(mtcars)
  )
})

test_that("tail", {
  # With sorting
  compare_dplyr_binding(
    .input %>%
      select(int, chr) %>%
      filter(int < 5) %>%
      arrange(int) %>%
      tail(2) %>%
      collect(),
    tbl
  )
  # Without sorting: table order is implicit, and we can compute the filter
  # row length, so the query can use Fetch with offset
  compare_dplyr_binding(
    .input %>%
      select(int, chr) %>%
      filter(int < 5) %>%
      tail(2) %>%
      collect(),
    tbl
  )
})

test_that("No duplicate field names are allowed in an arrow_dplyr_query", {
  expect_error(
    Table$create(tbl, tbl) %>%
      filter(int > 0),
    regexp = paste0(
      'The following field names were found more than once in the data: "int", "dbl", ',
      '"dbl2", "lgl", "false", "chr", "fct", "verses", "padded_strings"'
    )
  )
})

test_that("all_sources() finds all data sources in a query", {
  skip_if_not_available("dataset")
  tab <- Table$create(a = 1)
  ds <- InMemoryDataset$create(tab)
  expect_equal(all_sources(tab), list(tab))
  expect_equal(
    tab %>%
      filter(a > 0) %>%
      summarize(a = sum(a)) %>%
      arrange(desc(a)) %>%
      all_sources(),
    list(tab)
  )
  expect_equal(
    tab %>%
      filter(a > 0) %>%
      union_all(ds) %>%
      all_sources(),
    list(tab, ds)
  )
  expect_equal(
    tab %>%
      filter(a > 0) %>%
      union_all(ds) %>%
      left_join(tab) %>%
      all_sources(),
    list(tab, ds, tab)
  )
  expect_equal(
    tab %>%
      filter(a > 0) %>%
      union_all(left_join(ds, tab)) %>%
      left_join(tab) %>%
      all_sources(),
    list(tab, ds, tab, tab)
  )
})

test_that("query_on_dataset() looks at all data sources in a query", {
  skip_if_not_available("dataset")
  tab <- Table$create(a = 1)
  ds <- InMemoryDataset$create(tab)
  expect_false(query_on_dataset(tab))
  expect_true(query_on_dataset(ds))
  expect_false(
    tab %>%
      filter(a > 0) %>%
      summarize(a = sum(a)) %>%
      arrange(desc(a)) %>%
      query_on_dataset()
  )
  expect_true(
    tab %>%
      filter(a > 0) %>%
      union_all(ds) %>%
      query_on_dataset()
  )
  expect_true(
    tab %>%
      filter(a > 0) %>%
      union_all(left_join(ds, tab)) %>%
      left_join(tab) %>%
      query_on_dataset()
  )
  expect_false(
    tab %>%
      filter(a > 0) %>%
      union_all(left_join(tab, tab)) %>%
      left_join(tab) %>%
      query_on_dataset()
  )
})

test_that("query_can_stream()", {
  skip_if_not_available("dataset")
  tab <- Table$create(a = 1)
  ds <- InMemoryDataset$create(tab)
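  # As the expectations below show, sorting (arrange) or aggregation
  # (summarize) anywhere in the query prevents streaming, as does a Dataset
  # on the right side of a join; a Dataset as the query's source can stream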
  expect_true(query_can_stream(tab))
  expect_true(query_can_stream(ds))
  expect_true(query_can_stream(NULL))
  expect_true(
    ds %>%
      filter(a > 0) %>%
      query_can_stream()
  )
  expect_false(
    tab %>%
      filter(a > 0) %>%
      arrange(desc(a)) %>%
      query_can_stream()
  )
  expect_false(
    tab %>%
      filter(a > 0) %>%
      summarize(a = sum(a)) %>%
      query_can_stream()
  )
  expect_true(
    tab %>%
      filter(a > 0) %>%
      union_all(ds) %>%
      query_can_stream()
  )
  expect_false(
    tab %>%
      filter(a > 0) %>%
      union_all(summarize(ds, a = sum(a))) %>%
      query_can_stream()
  )
  expect_true(
    tab %>%
      filter(a > 0) %>%
      union_all(left_join(ds, tab)) %>%
      left_join(tab) %>%
      query_can_stream()
  )
  expect_true(
    tab %>%
      filter(a > 0) %>%
      union_all(left_join(tab, tab)) %>%
      left_join(tab) %>%
      query_can_stream()
  )
  expect_false(
    tab %>%
      filter(a > 0) %>%
      union_all(left_join(tab, tab)) %>%
      left_join(ds) %>%
      query_can_stream()
  )
  expect_false(
    tab %>%
      filter(a > 0) %>%
      arrange(a) %>%
      union_all(left_join(tab, tab)) %>%
      left_join(tab) %>%
      query_can_stream()
  )
})

test_that("show_exec_plan(), show_query() and explain()", {
  # show_query() and explain() are wrappers around show_exec_plan() and are not
  # tested separately

  # minimal test - this fails if we don't coerce the input to
  # `show_exec_plan()` to be an `arrow_dplyr_query`
  expect_output(
    mtcars %>%
      arrow_table() %>%
      show_exec_plan(),
    regexp = paste0(
      "ExecPlan with 2 nodes:.*", # boiler plate for ExecPlan
      "SinkNode.*", # output
      "TableSourceNode" # entry point
    )
  )

  # arrow_table and mutate
  expect_output(
    tbl %>%
      arrow_table() %>%
      filter(dbl > 2, chr != "e") %>%
      select(chr, int, lgl) %>%
      mutate(int_plus_ten = int + 10) %>%
      show_exec_plan(),
    regexp = paste0(
      "ExecPlan with .* nodes:.*", # boiler plate for ExecPlan
      "chr, int, lgl, \"int_plus_ten\".*", # selected columns
      "FilterNode.*", # filter node
      "(dbl > 2).*", # filter expressions
      "chr != \"e\".*",
      "TableSourceNode" # entry point
    )
  )

  # record_batch and mutate
  expect_output(
    tbl %>%
      record_batch() %>%
      filter(dbl > 2, chr != "e") %>%
      select(chr, int, lgl) %>%
      mutate(int_plus_ten = int + 10) %>%
      show_exec_plan(),
    regexp = paste0(
      "ExecPlan with .* nodes:.*", # boiler plate for ExecPlan
      "chr, int, lgl, \"int_plus_ten\".*", # selected columns
      "(dbl > 2).*", # the filter expressions
      "chr != \"e\".*",
      "TableSourceNode" # the entry point
    )
  )

  # test with group_by and summarise
  expect_output(
    tbl %>%
      arrow_table() %>%
      group_by(lgl) %>%
      summarise(avg = mean(dbl, na.rm = TRUE)) %>%
      show_exec_plan(),
    regexp = paste0(
      "ExecPlan with .* nodes:.*", # boiler plate for ExecPlan
      "GroupByNode.*", # the group_by statement
      "keys=.*lgl.*", # the key for the aggregations
      "aggregates=.*hash_mean.*avg.*", # the aggregations
      "ProjectNode.*", # the input columns
      "TableSourceNode" # the entry point
    )
  )

  # test with join
  expect_output(
    tbl %>%
      arrow_table() %>%
      left_join(
        example_data %>%
          arrow_table() %>%
          mutate(doubled_dbl = dbl * 2) %>%
          select(int, doubled_dbl),
        by = "int"
      ) %>%
      select(int, verses, doubled_dbl) %>%
      show_exec_plan(),
    regexp = paste0(
      "ExecPlan with .* nodes:.*", # boiler plate for ExecPlan
      "ProjectNode.*", # output columns
      "HashJoinNode.*", # the join
      "ProjectNode.*", # input columns for the second table
      "\"doubled_dbl\"\\: multiply_checked\\(dbl, 2\\).*", # mutate
      "TableSourceNode.*", # second table
      "TableSourceNode" # first table
    )
  )

  expect_output(
    mtcars %>%
      arrow_table() %>%
      filter(mpg > 20) %>%
      arrange(desc(wt)) %>%
      show_exec_plan(),
    regexp = paste0(
      "ExecPlan with .* nodes:.*", # boiler plate for ExecPlan
      "OrderBy.*wt.*DESC.*", # arrange goes via the OrderBy node
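      # (the plan prints sink-first, so the TableSourceNode entry point
      # always comes last in these patterns)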
      "FilterNode.*", # filter node
      "TableSourceNode.*" # entry point
    )
  )

  # head() is represented in the plan as a FetchNode, so the plan for a query
  # with head() can be printed without evaluating the query
  expect_output(
    mtcars %>%
      arrow_table() %>%
      filter(mpg > 20) %>%
      head(3) %>%
      show_exec_plan(),
    paste0(
      "ExecPlan with 4 nodes:.*",
      "3:SinkNode.*",
      "2:FetchNode.offset=0 count=3.*",
      "1:FilterNode.filter=.mpg > 20.*",
      "0:TableSourceNode.*"
    )
  )
})

test_that("needs_projection unit tests", {
  tab <- Table$create(tbl)
  # Wrapper to simplify tests
  query_needs_projection <- function(query) {
    needs_projection(query$selected_columns, tab$schema)
  }

  expect_false(query_needs_projection(as_adq(tab)))
  expect_false(query_needs_projection(
    tab %>%
      collapse() %>%
      collapse()
  ))
  expect_true(query_needs_projection(
    tab %>% mutate(int = int + 2)
  ))
  expect_true(query_needs_projection(
    tab %>% select(int, chr)
  ))
  expect_true(query_needs_projection(
    tab %>% rename(int2 = int)
  ))
  expect_true(query_needs_projection(
    tab %>% relocate(lgl)
  ))
})

test_that("compute() on a grouped query returns a Table with groups in metadata", {
  tab1 <- tbl %>%
    arrow_table() %>%
    group_by(int) %>%
    compute()
  expect_r6_class(tab1, "Table")
  expect_equal_data_frame(
    tab1,
    tbl %>% group_by(int)
  )
  expect_equal(
    collect(tab1),
    tbl %>% group_by(int)
  )
})

test_that("collect() is identical to compute() %>% collect()", {
  tab1 <- tbl %>% arrow_table()
  adq1 <- tab1 %>% group_by(int)
  expect_equal(
    tab1 %>% compute() %>% collect(),
    tab1 %>% collect()
  )
  expect_equal(
    adq1 %>% compute() %>% collect(),
    adq1 %>% collect()
  )
})

test_that("Scalars in expressions match the type of the field, if possible", {
  tbl_with_datetime <- tbl
  tbl_with_datetime$dates <- as.Date("2022-08-28") + 1:10
  tbl_with_datetime$times <- lubridate::ymd_hms("2018-10-07 19:04:05") + 1:10
  tab <- Table$create(tbl_with_datetime)

  # 5 is double in R but is properly interpreted as int, no cast is added
  expect_output(
    tab %>%
      filter(int == 5) %>%
      show_exec_plan(),
    "int == 5"
  )

  # Because 5.2 can't cast to int32 without truncation, we pass it as is
  # and Acero will cast int to float64
  expect_output(
    tab %>%
      filter(int == 5.2) %>%
      show_exec_plan(),
    "filter=(cast(int, {to_type=double",
    fixed = TRUE
  )
  expect_equal(
    tab %>%
      filter(int == 5.2) %>%
      nrow(),
    0
  )

  # int == string, errors starting in dplyr 1.1.0
  expect_snapshot_warning(
    tab %>% filter(int == "5")
  )

  # Strings automatically parsed to date/timestamp
  expect_output(
    tab %>%
      filter(dates > "2022-09-01") %>%
      show_exec_plan(),
    "dates > 2022-09-01"
  )
  compare_dplyr_binding(
    .input %>%
      filter(dates > "2022-09-01") %>%
      collect(),
    tbl_with_datetime
  )

  # ARROW-18401: These will error if the system timezone is not valid. A PR was
  # submitted to fix this docker image upstream; this skip can be removed after
  # it merges.
  # https://github.com/r-hub/rhub-linux-builders/pull/65
  skip_if(identical(Sys.timezone(), "/UTC"))
  expect_output(
    tab %>%
      filter(times > "2018-10-07 19:04:05") %>%
      show_exec_plan(),
    "times > 2018-10-0. ..:..:05"
  )
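  # The dots in the pattern above are regex wildcards: the timestamp scalar is
  # displayed in the session timezone, so the day, hour, and minute shown can
  # vary by environment, while the seconds (:05) do not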
  compare_dplyr_binding(
    .input %>%
      filter(times > "2018-10-07 19:04:05") %>%
      collect(),
    tbl_with_datetime
  )

  tab_with_decimal <- tab %>%
    mutate(dec = cast(dbl, decimal(15, 2))) %>%
    compute()
  # This reproduces the issue on ARROW-17601, found in the TPC-H query 1
  # In ARROW-17462, we chose not to auto-cast to decimal to avoid that issue
  result <- tab_with_decimal %>%
    summarize(
      tpc_h_1 = sum(dec * (1 - dec) * (1 + dec), na.rm = TRUE),
      as_dbl = sum(dbl * (1 - dbl) * (1 + dbl), na.rm = TRUE)
    ) %>%
    collect()
  expect_equal(result$tpc_h_1, result$as_dbl)
})

test_that("Can use nested field refs", {
  nested_data <- tibble(int = 1:5, df_col = tibble(a = 6:10, b = 11:15))
  compare_dplyr_binding(
    .input %>%
      mutate(
        nested = df_col$a,
        times2 = df_col$a * 2
      ) %>%
      filter(nested > 7) %>%
      collect(),
    nested_data
  )
  compare_dplyr_binding(
    .input %>%
      mutate(
        nested = df_col$a,
        times2 = df_col$a * 2
      ) %>%
      filter(nested > 7) %>%
      summarize(sum(times2)) %>%
      collect(),
    nested_data
  )
})

test_that("Can use nested field refs with Dataset", {
  skip_if_not_available("dataset")
  # Now with Dataset: make sure column pushdown in ScanNode works
  nested_data <- tibble(int = 1:5, df_col = tibble(a = 6:10, b = 11:15))
  tf <- tempfile()
  dir.create(tf)
  write_dataset(nested_data, tf)
  ds <- open_dataset(tf)
  expect_equal(
    ds %>%
      mutate(
        nested = df_col$a,
        times2 = df_col$a * 2
      ) %>%
      filter(nested > 7) %>%
      collect(),
    nested_data %>%
      mutate(
        nested = df_col$a,
        times2 = df_col$a * 2
      ) %>%
      filter(nested > 7)
  )

  # Issue #34519: error when projecting same name, but only on file dataset
  expect_equal(
    ds %>%
      mutate(int = as.numeric(int)) %>%
      collect(),
    nested_data %>%
      mutate(int = as.numeric(int)) %>%
      collect()
  )
})

test_that("Use struct_field for $ on non-field-ref", {
  compare_dplyr_binding(
    .input %>%
      mutate(
        df_col = tibble(i = int, d = dbl)
      ) %>%
      transmute(
        int2 = df_col$i,
        dbl2 = df_col$d
      ) %>%
      collect(),
    example_data
  )
})

test_that("nested field ref error handling", {
  expect_error(
    example_data %>%
      arrow_table() %>%
      mutate(x = int$nested) %>%
      compute(),
    "No match"
  )
})