# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.

#' Run standard suite of integration tests for a filesystem
#'
#' The tests are order-dependent: later tests read files that earlier tests
#' wrote to the filesystem (e.g. `test.feather`, `test.parquet`, `hive_dir`).
#'
#' @param name Name of filesystem to be printed in test name
#' @param fs A `FileSystem` instance to test with
#' @param path_formatter A function that takes a sequence of path segments and
#' returns an absolute path.
#' @param uri_formatter A function that takes a sequence of path segments and
#' returns a URI containing the filesystem scheme (e.g. 's3://', 'gs://'), the
#' absolute path, and any necessary connection options as URL query parameters.
#' @return Called for its side effects (it declares `test_that()` blocks).
test_filesystem <- function(name, fs, path_formatter, uri_formatter) {
  # NOTE: it's important that we label these tests with name of filesystem so
  # that we can differentiate the different calls to these test in the output.
  test_that(sprintf("read/write Feather on %s using URIs", name), {
    write_feather(example_data, uri_formatter("test.feather"))
    expect_identical(read_feather(uri_formatter("test.feather")), example_data)
  })

  test_that(sprintf("read/write Feather on %s using Filesystem", name), {
    write_feather(example_data, fs$path(path_formatter("test2.feather")))
    expect_identical(
      read_feather(fs$path(path_formatter("test2.feather"))),
      example_data
    )
  })

  # Everything below relies on dplyr being attached (pipes and verbs such as
  # arrange(), collect() and select() are called unqualified).
  if (!("package:dplyr" %in% search())) {
    abort("library(dplyr) required for test_filesystem()")
  }

  test_that(sprintf("read/write compressed csv on %s using FileSystem", name), {
    skip_if_not_available("gzip")
    dat <- tibble(x = seq(1, 10, by = 0.2))
    write_csv_arrow(dat, fs$path(path_formatter("test.csv.gz")))
    expect_identical(
      read_csv_arrow(fs$path(path_formatter("test.csv.gz"))),
      dat
    )
  })

  test_that(sprintf("read/write csv on %s using FileSystem", name), {
    # No gzip skip here: this round trip uses an uncompressed file.
    dat <- tibble(x = seq(1, 10, by = 0.2))
    write_csv_arrow(dat, fs$path(path_formatter("test.csv")))
    expect_identical(
      read_csv_arrow(fs$path(path_formatter("test.csv"))),
      dat
    )
  })

  test_that(sprintf("read/write IPC stream on %s", name), {
    write_ipc_stream(example_data, fs$path(path_formatter("test3.ipc")))
    expect_identical(
      read_ipc_stream(fs$path(path_formatter("test3.ipc"))),
      example_data
    )
  })

  test_that(sprintf("read/write Parquet on %s", name), {
    skip_if_not_available("parquet")
    # Written through the FileSystem object, read back through the URI.
    write_parquet(example_data, fs$path(path_formatter("test.parquet")))
    expect_identical(read_parquet(uri_formatter("test.parquet")), example_data)
  })

  if (arrow_with_dataset()) {
    # Create a fresh local directory and return its normalized path.
    make_temp_dir <- function() {
      path <- tempfile()
      dir.create(path)
      normalizePath(path, winslash = "/")
    }

    test_that(sprintf("open_dataset with an %s file (not directory) URI", name), {
      skip_if_not_available("parquet")
      expect_identical(
        open_dataset(uri_formatter("test.parquet")) %>%
          collect() %>%
          arrange(int),
        example_data %>% arrange(int)
      )
    })

    test_that(sprintf("open_dataset with vector of %s file URIs", name), {
      expect_identical(
        open_dataset(
          c(uri_formatter("test.feather"), uri_formatter("test2.feather")),
          format = "feather"
        ) %>%
          arrange(int) %>%
          collect(),
        rbind(example_data, example_data) %>% arrange(int)
      )
    })

    test_that(sprintf("open_dataset errors if passed URIs mixing %s and local fs", name), {
      td <- make_temp_dir()
      expect_error(
        open_dataset(
          c(
            uri_formatter("test.feather"),
            paste0("file://", file.path(td, "fake.feather"))
          ),
          format = "feather"
        ),
        "Vectors of URIs for different file systems are not supported"
      )
    })

    # Dataset test setup, cf. test-dataset.R
    first_date <- lubridate::ymd_hms("2015-04-29 03:12:39")
    df1 <- tibble(
      int = 1:10,
      dbl = as.numeric(1:10),
      lgl = rep(c(TRUE, FALSE, NA, TRUE, FALSE), 2),
      chr = letters[1:10],
      fct = factor(LETTERS[1:10]),
      ts = first_date + lubridate::days(1:10)
    )

    second_date <- lubridate::ymd_hms("2017-03-09 07:01:02")
    df2 <- tibble(
      int = 101:110,
      dbl = as.numeric(51:60),
      lgl = rep(c(TRUE, FALSE, NA, TRUE, FALSE), 2),
      chr = letters[10:1],
      fct = factor(LETTERS[10:1]),
      ts = second_date + lubridate::days(10:1)
    )

    # Expected result of reading the int/dbl/lgl columns of both partitions
    # back, sorted by `int`; shared by the dataset tests below.
    hive_cols <- c("int", "dbl", "lgl")
    expected_hive <- rbind(df1[, hive_cols], df2[, hive_cols]) %>%
      arrange(int)

    # This is also to set up the dataset tests
    test_that(sprintf("write_parquet with %s filesystem arg", name), {
      skip_if_not_available("parquet")
      fs$CreateDir(path_formatter("hive_dir", "group=1", "other=xxx"))
      fs$CreateDir(path_formatter("hive_dir", "group=2", "other=yyy"))
      expect_length(fs$ls(path_formatter("hive_dir")), 2)
      write_parquet(
        df1,
        fs$path(path_formatter("hive_dir", "group=1", "other=xxx", "file1.parquet"))
      )
      write_parquet(
        df2,
        fs$path(path_formatter("hive_dir", "group=2", "other=yyy", "file2.parquet"))
      )
      expect_identical(
        read_parquet(
          fs$path(path_formatter("hive_dir", "group=1", "other=xxx", "file1.parquet"))
        ),
        df1
      )
    })

    test_that(sprintf("open_dataset with %s", name), {
      # hive_dir is only populated by the Parquet setup test above.
      skip_if_not_available("parquet")
      ds <- open_dataset(fs$path(path_formatter("hive_dir")))
      expect_identical(
        ds %>% select(int, dbl, lgl) %>% collect() %>% arrange(int),
        expected_hive
      )
    })

    test_that(sprintf("write_dataset with %s", name), {
      # Reads the Parquet files in hive_dir, so it needs the same skip.
      skip_if_not_available("parquet")
      ds <- open_dataset(fs$path(path_formatter("hive_dir")))
      write_dataset(ds, fs$path(path_formatter("new_dataset_dir")))
      expect_length(fs$ls(path_formatter("new_dataset_dir")), 1)
    })

    test_that(sprintf("copy files with %s", name), {
      # Copies and re-opens the Parquet files in hive_dir.
      skip_if_not_available("parquet")
      td <- make_temp_dir()
      copy_files(uri_formatter("hive_dir"), td)
      expect_length(dir(td), 2)
      ds <- open_dataset(td)
      expect_identical(
        ds %>% select(int, dbl, lgl) %>% collect() %>% arrange(int),
        expected_hive
      )

      # Let's copy the other way and use a SubTreeFileSystem rather than URI
      copy_files(td, fs$path(path_formatter("hive_dir2")))
      ds2 <- open_dataset(fs$path(path_formatter("hive_dir2")))
      expect_identical(
        ds2 %>% select(int, dbl, lgl) %>% collect() %>% arrange(int),
        expected_hive
      )
    })
  } # if(arrow_with_dataset())
}