# Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file # distributed with this work for additional information # regarding copyright ownership. The ASF licenses this file # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. # In 3.4 the lack of tzone attribute causes spurious failures skip_on_r_older_than("3.5") library(lubridate, warn.conflicts = FALSE) library(dplyr, warn.conflicts = FALSE) skip_if_not_available("acero") # Skip these tests on CRAN due to build times > 10 mins skip_on_cran() # base::strptime() defaults to local timezone # but arrow's strptime defaults to UTC. # So that tests are consistent, set the local timezone to UTC # TODO: consider reevaluating now that ARROW-12980 has merged withr::local_timezone("UTC") if (tolower(Sys.info()[["sysname"]]) == "windows") { withr::local_locale(LC_TIME = "C") } test_date <- as.POSIXct("2017-01-01 00:00:11.3456789", tz = "Pacific/Marquesas") strptime_test_df <- tibble( string_a = c("2023-12-30-Sat", NA), string_A = c("2023-12-30-Saturday", NA), string_b = c("2023-12-30-Dec", NA), string_B = c("2023-12-30-December", NA), string_H = c("2023-12-30-01", NA), string_I = c("2023-12-30-01", NA), string_j = c("2023-12-30-364", NA), string_M = c("2023-12-30-45", NA), string_p = c("2023-12-30-AM", NA), string_q = c("2023.3", NA), string_S = c("2023-12-30-56", NA), string_OS = c("2023-12-30-12.345678", NA), string_U = c("2023-12-30-52", NA), string_w = c("2023-12-30-6", NA), string_W = c("2023-12-30-52", NA), string_y = c("23-12-30", NA), string_Y = c("2023-12-30", NA), string_m = c("2023-12-30", NA), string_r = c("2023-12-30-01", NA), string_R = c("2023-12-30-01:23", NA), string_T = c("2023-12-30-01:23:45", NA), string_z = c("2023-12-30-01:23:45z", NA) ) test_df <- tibble::tibble( # test_date + 1 turns the tzone = "" to NULL, which is functionally equivalent # so we can run some tests on Windows, but this skirts around ARROW-13588. # That issue is tough because in C++, "" is the "no timezone" value # due to static typing, so we can't distinguish a literal "" from NULL datetime = c(test_date, NA) + 1, date = c(as.Date("2021-09-09"), NA), integer = 1:2 ) test_that("strptime", { t_string <- tibble(x = c("2018-10-07 19:04:05", NA)) # lubridate defaults to "UTC" as timezone => t_stamp is in "UTC" t_stamp_with_utc_tz <- tibble(x = c(lubridate::ymd_hms("2018-10-07 19:04:05"), NA)) t_stamp_with_pm_tz <- tibble( x = c(lubridate::ymd_hms("2018-10-07 19:04:05", tz = "Pacific/Marquesas"), NA) ) # base::strptime returns a POSIXlt (a list) => we cannot use compare_dplyr_binding # => we use expect_equal for the tests below withr::with_timezone("Pacific/Marquesas", { # the default value for strptime's `tz` argument is "", which is interpreted # as the current timezone. we test here if the strptime binding picks up # correctly the current timezone (similarly to the base R version) expect_equal( t_string %>% record_batch() %>% mutate( x = strptime(x, format = "%Y-%m-%d %H:%M:%S") ) %>% collect(), t_stamp_with_pm_tz ) expect_equal( t_string %>% record_batch() %>% mutate( x = base::strptime(x, format = "%Y-%m-%d %H:%M:%S") ) %>% collect(), t_stamp_with_pm_tz ) }) # adding a timezone to a timezone-naive timestamp works # and since our TZ when running the test is (typically) Pacific/Marquesas # this also tests that assigning a TZ different from the current session one # works as expected expect_equal( t_string %>% arrow_table() %>% mutate( x = strptime(x, format = "%Y-%m-%d %H:%M:%S", tz = "Pacific/Marquesas") ) %>% collect(), t_stamp_with_pm_tz ) expect_equal( t_string %>% Table$create() %>% mutate( x = strptime(x, tz = "UTC") ) %>% collect(), t_stamp_with_utc_tz ) expect_equal( t_string %>% Table$create() %>% mutate( x = strptime(x, format = "%Y-%m-%d %H:%M:%S", tz = "UTC") ) %>% collect(), t_stamp_with_utc_tz ) expect_equal( t_string %>% Table$create() %>% mutate( x = strptime(x, format = "%Y-%m-%d %H:%M:%S", unit = "ns", tz = "UTC") ) %>% collect(), t_stamp_with_utc_tz ) expect_equal( t_string %>% Table$create() %>% mutate( x = strptime(x, format = "%Y-%m-%d %H:%M:%S", unit = "s", tz = "UTC") ) %>% collect(), t_stamp_with_utc_tz ) tstring <- tibble(x = c("08-05-2008", NA)) tstamp <- strptime(c("08-05-2008", NA), format = "%m-%d-%Y") expect_equal( tstring %>% Table$create() %>% mutate( x = strptime(x, format = "%m-%d-%Y") ) %>% pull() %>% as.vector(), # R's strptime returns POSIXlt (list type) as.POSIXct(tstamp), ignore_attr = "tzone" ) # these functions' internals use some string processing which requires the # RE2 library skip_if_not_available("re2") compare_dplyr_binding( .input %>% mutate( parsed_date_ymd = parse_date_time(string_1, orders = "Y-%m-d-%T") ) %>% collect(), tibble::tibble(string_1 = c("2022-02-11-12:23:45", NA)) ) }) test_that("strptime works for individual formats", { # strptime format support is not consistent across platforms skip_on_cran() # these functions' internals use some string processing which requires the # RE2 library skip_if_not_available("re2") expect_equal( strptime_test_df %>% arrow_table() %>% mutate( parsed_H = strptime(string_H, format = "%Y-%m-%d-%H"), parsed_I = strptime(string_I, format = "%Y-%m-%d-%I"), parsed_j = strptime(string_j, format = "%Y-%m-%d-%j"), parsed_M = strptime(string_M, format = "%Y-%m-%d-%M"), parsed_S = strptime(string_S, format = "%Y-%m-%d-%S"), parsed_U = strptime(string_U, format = "%Y-%m-%d-%U"), parsed_w = strptime(string_w, format = "%Y-%m-%d-%w"), parsed_W = strptime(string_W, format = "%Y-%m-%d-%W"), parsed_y = strptime(string_y, format = "%y-%m-%d"), parsed_Y = strptime(string_Y, format = "%Y-%m-%d"), parsed_R = strptime(string_R, format = "%Y-%m-%d-%R"), parsed_T = strptime(string_T, format = "%Y-%m-%d-%T") ) %>% collect(), strptime_test_df %>% mutate( parsed_H = as.POSIXct(strptime(string_H, format = "%Y-%m-%d-%H")), parsed_I = as.POSIXct(strptime(string_I, format = "%Y-%m-%d-%I")), parsed_j = as.POSIXct(strptime(string_j, format = "%Y-%m-%d-%j")), parsed_M = as.POSIXct(strptime(string_M, format = "%Y-%m-%d-%M")), parsed_S = as.POSIXct(strptime(string_S, format = "%Y-%m-%d-%S")), parsed_U = as.POSIXct(strptime(string_U, format = "%Y-%m-%d-%U")), parsed_w = as.POSIXct(strptime(string_w, format = "%Y-%m-%d-%w")), parsed_W = as.POSIXct(strptime(string_W, format = "%Y-%m-%d-%W")), parsed_y = as.POSIXct(strptime(string_y, format = "%y-%m-%d")), parsed_Y = as.POSIXct(strptime(string_Y, format = "%Y-%m-%d")), parsed_R = as.POSIXct(strptime(string_R, format = "%Y-%m-%d-%R")), parsed_T = as.POSIXct(strptime(string_T, format = "%Y-%m-%d-%T")) ) %>% collect() ) # Some formats are not supported on Windows skip_on_os("windows") expect_equal( strptime_test_df %>% arrow_table() %>% mutate( parsed_a = strptime(string_a, format = "%Y-%m-%d-%a"), parsed_A = strptime(string_A, format = "%Y-%m-%d-%A"), parsed_b = strptime(string_b, format = "%Y-%m-%d-%b"), parsed_B = strptime(string_B, format = "%Y-%m-%d-%B"), parsed_p = strptime(string_p, format = "%Y-%m-%d-%p"), parsed_r = strptime(string_r, format = "%Y-%m-%d-%r") ) %>% collect(), strptime_test_df %>% mutate( parsed_a = as.POSIXct(strptime(string_a, format = "%Y-%m-%d-%a")), parsed_A = as.POSIXct(strptime(string_A, format = "%Y-%m-%d-%A")), parsed_b = as.POSIXct(strptime(string_b, format = "%Y-%m-%d-%b")), parsed_B = as.POSIXct(strptime(string_B, format = "%Y-%m-%d-%B")), parsed_p = as.POSIXct(strptime(string_p, format = "%Y-%m-%d-%p")), parsed_r = as.POSIXct(strptime(string_r, format = "%Y-%m-%d-%r")) ) %>% collect() ) }) test_that("timestamp round trip correctly via strftime and strptime", { # strptime format support is not consistent across platforms skip_on_cran() # these functions' internals use some string processing which requires the # RE2 library skip_if_not_available("re2") tz <- "Pacific/Marquesas" set.seed(42) times <- seq(as.POSIXct("1999-02-07", tz = tz), as.POSIXct("2000-01-01", tz = tz), by = "sec") times <- sample(times, 100) # Op format is currently not supported by strptime formats <- c( "%d", "%H", "%j", "%m", "%T", "%S", "%q", "%M", "%U", "%w", "%W", "%y", "%Y", "%R", "%T" ) formats2 <- c( "a", "A", "b", "B", "d", "H", "j", "m", "T", "OS", "Ip", "S", "q", "M", "U", "w", "W", "y", "Y", "r", "R", "Tz" ) base_format <- "%Y-%m-%d" base_format2 <- "ymd" # Some formats are not supported on Windows if (!tolower(Sys.info()[["sysname"]]) == "windows") { # "%r" could also be here, though it is only valid in some locales (those # that use 12 hour formats, so skip for now) formats <- c(formats, "%a", "%A", "%b", "%B", "%OS", "%I%p", "%T%z") } for (fmt in formats) { fmt <- paste(base_format, fmt) test_df <- tibble::tibble(x = strftime(times, format = fmt)) expect_equal( test_df %>% arrow_table() %>% mutate(!!fmt := strptime(x, format = fmt)) %>% collect(), test_df %>% mutate(!!fmt := as.POSIXct(strptime(x, format = fmt))) %>% collect() ) } for (fmt in formats2) { fmt2 <- paste(base_format2, fmt) fmt <- paste(base_format, paste0("%", fmt)) test_df <- tibble::tibble(x = strftime(times, format = fmt)) expect_equal_data_frame( test_df %>% arrow_table() %>% mutate(!!fmt := strptime(x, format = fmt2)) %>% collect(), test_df %>% mutate(!!fmt := as.POSIXct(strptime(x, format = fmt2))) %>% collect() ) } }) test_that("strptime returns NA when format doesn't match the data", { df <- tibble( str_date = c("2022-02-07", "2012/02-07", "1975/01-02", "1981/01-07", NA) ) # base::strptime() returns a POSIXlt object (a list), while the Arrow binding # returns a POSIXct (double) vector => we cannot use compare_dplyr_binding() expect_equal( df %>% arrow_table() %>% mutate( r_obj_parsed_date = strptime("03-27/2022", format = "%m-%d/%Y"), r_obj_parsed_na = strptime("03-27/2022", format = "Y%-%m-%d") ) %>% collect(), df %>% mutate( r_obj_parsed_date = as.POSIXct(strptime("03-27/2022", format = "%m-%d/%Y")), r_obj_parsed_na = as.POSIXct(strptime("03-27/2022", format = "Y%-%m-%d")) ), ignore_attr = "tzone" ) expect_equal( df %>% record_batch() %>% mutate(parsed_date = strptime(str_date, format = "%Y-%m-%d")) %>% collect(), df %>% mutate(parsed_date = as.POSIXct(strptime(str_date, format = "%Y-%m-%d"))), ignore_attr = "tzone" ) expect_equal( df %>% arrow_table() %>% mutate(parsed_date = strptime(str_date, format = "%Y/%m-%d")) %>% collect(), df %>% mutate(parsed_date = as.POSIXct(strptime(str_date, format = "%Y/%m-%d"))), ignore_attr = "tzone" ) }) test_that("strftime", { times <- tibble( datetime = c(lubridate::ymd_hms("2018-10-07 19:04:05", tz = "Etc/GMT+6"), NA), date = c(as.Date("2021-01-01"), NA) ) formats <- "%a %A %w %d %b %B %m %y %Y %H %I %p %M %z %Z %j %U %W %x %X %% %G %V %u" formats_date <- "%a %A %w %d %b %B %m %y %Y %H %I %p %M %j %U %W %x %X %% %G %V %u" compare_dplyr_binding( .input %>% mutate( x = strftime(datetime, format = formats), x2 = base::strftime(datetime, format = formats) ) %>% collect(), times ) compare_dplyr_binding( .input %>% mutate(x = strftime(date, format = formats_date)) %>% collect(), times ) compare_dplyr_binding( .input %>% mutate(x = strftime(datetime, format = formats, tz = "Pacific/Marquesas")) %>% collect(), times ) compare_dplyr_binding( .input %>% mutate(x = strftime(datetime, format = formats, tz = "EST", usetz = TRUE)) %>% collect(), times ) withr::with_timezone( "Pacific/Marquesas", { compare_dplyr_binding( .input %>% mutate( x = strftime(datetime, format = formats, tz = "EST"), x_date = strftime(date, format = formats_date, tz = "EST") ) %>% collect(), times ) compare_dplyr_binding( .input %>% mutate( x = strftime(datetime, format = formats), x_date = strftime(date, format = formats_date) ) %>% collect(), times ) } ) # This check is due to differences in the way %c currently works in Arrow and R's strftime. # We can revisit after https://github.com/HowardHinnant/date/issues/704 is resolved. if (Sys.getlocale("LC_TIME") != "C") { expect_error( times %>% Table$create() %>% mutate(x = strftime(datetime, format = "%c")) %>% collect(), "%c flag is not supported in non-C locales." ) } # Output precision of %S depends on the input timestamp precision. # Timestamps with second precision are represented as integers while # milliseconds, microsecond and nanoseconds are represented as fixed floating # point numbers with 3, 6 and 9 decimal places respectively. compare_dplyr_binding( .input %>% mutate(x = strftime(datetime, format = "%S")) %>% transmute(as.double(substr(x, 1, 2))) %>% collect(), times, tolerance = 1e-6 ) }) test_that("format_ISO8601", { # https://issues.apache.org/jira/projects/ARROW/issues/ARROW-15266 skip_if_not_available("re2") # A change in R altered the behavior of lubridate::format_ISO8601: # https://github.com/wch/r-source/commit/f6fd993f8a2f799a56dbecbd8238f155191fc31b # Fixed in lubridate here: # https://github.com/tidyverse/lubridate/pull/1068 skip_if_not(packageVersion("lubridate") > "1.8") times <- tibble(x = c(lubridate::ymd_hms("2018-10-07 19:04:05", tz = "Etc/GMT+6"), NA)) compare_dplyr_binding( .input %>% mutate( a = format_ISO8601(x, precision = "ymd", usetz = FALSE), a2 = lubridate::format_ISO8601(x, precision = "ymd", usetz = FALSE) ) %>% collect(), times ) if (getRversion() < "3.5") { # before 3.5, times$x will have no timezone attribute, so Arrow faithfully # errors that there is no timezone to format: expect_error( times %>% Table$create() %>% mutate(x = format_ISO8601(x, precision = "ymd", usetz = TRUE)) %>% collect(), "Timezone not present, cannot convert to string with timezone: %Y-%m-%d%z" ) # See comment regarding %S flag in strftime tests expect_error( times %>% Table$create() %>% mutate(x = format_ISO8601(x, precision = "ymdhms", usetz = TRUE)) %>% mutate(x = gsub("\\.0*", "", x)) %>% collect(), "Timezone not present, cannot convert to string with timezone: %Y-%m-%dT%H:%M:%S%z" ) } else { compare_dplyr_binding( .input %>% mutate(x = format_ISO8601(x, precision = "ymd", usetz = TRUE)) %>% collect(), times ) # See comment regarding %S flag in strftime tests compare_dplyr_binding( .input %>% mutate(x = format_ISO8601(x, precision = "ymdhms", usetz = TRUE)) %>% mutate(x = gsub("\\.0*", "", x)) %>% collect(), times ) } # See comment regarding %S flag in strftime tests compare_dplyr_binding( .input %>% mutate(x = format_ISO8601(x, precision = "ymdhms", usetz = FALSE)) %>% mutate(x = gsub("\\.0*", "", x)) %>% collect(), times ) }) # These tests test detection of dates and times test_that("is.* functions from lubridate", { # make sure all true and at least one false value is considered compare_dplyr_binding( .input %>% mutate( x = is.POSIXct(datetime), y = is.POSIXct(integer), x2 = lubridate::is.POSIXct(datetime) ) %>% collect(), test_df ) compare_dplyr_binding( .input %>% mutate( x = is.Date(date), y = is.Date(integer), x2 = lubridate::is.Date(date) ) %>% collect(), test_df ) compare_dplyr_binding( .input %>% mutate( x = is.instant(datetime), y = is.instant(date), z = is.instant(integer) ) %>% collect(), test_df ) compare_dplyr_binding( .input %>% mutate( x = is.timepoint(datetime), y = is.instant(date), z = is.timepoint(integer), x2 = lubridate::is.timepoint(datetime), y2 = lubridate::is.instant(date), z2 = lubridate::is.timepoint(integer) ) %>% collect(), test_df ) }) # These tests test component extraction from timestamp objects test_that("extract year from timestamp", { compare_dplyr_binding( .input %>% mutate(x = year(datetime)) %>% collect(), test_df ) }) test_that("extract isoyear from timestamp", { compare_dplyr_binding( .input %>% mutate(x = isoyear(datetime)) %>% collect(), test_df ) }) test_that("extract epiyear from timestamp", { compare_dplyr_binding( .input %>% mutate( x = epiyear(datetime), x2 = lubridate::epiyear(datetime) ) %>% collect(), test_df ) }) test_that("extract quarter from timestamp", { compare_dplyr_binding( .input %>% mutate(x = quarter(datetime)) %>% collect(), test_df ) }) test_that("extract month from timestamp", { compare_dplyr_binding( .input %>% mutate( x = month(datetime), x2 = lubridate::month(datetime) ) %>% collect(), test_df ) compare_dplyr_binding( .input %>% # R returns ordered factor whereas Arrow returns character mutate(x = as.character(month(datetime, label = TRUE))) %>% collect(), test_df, ignore_attr = TRUE ) compare_dplyr_binding( .input %>% mutate(x = as.character(month(datetime, label = TRUE, abbr = TRUE))) %>% collect(), test_df, ignore_attr = TRUE ) }) test_that("extract isoweek from timestamp", { compare_dplyr_binding( .input %>% mutate( x = isoweek(datetime), x2 = lubridate::isoweek(datetime) ) %>% collect(), test_df ) }) test_that("extract epiweek from timestamp", { compare_dplyr_binding( .input %>% mutate(x = epiweek(datetime)) %>% collect(), test_df ) }) test_that("extract week from timestamp", { compare_dplyr_binding( .input %>% mutate( x = week(datetime), x2 = lubridate::week(datetime) ) %>% collect(), test_df ) }) test_that("extract day from timestamp", { compare_dplyr_binding( .input %>% mutate(x = day(datetime)) %>% collect(), test_df ) }) test_that("extract wday from timestamp", { compare_dplyr_binding( .input %>% mutate(x = wday(datetime)) %>% collect(), test_df ) compare_dplyr_binding( .input %>% mutate(x = wday(date, week_start = 3)) %>% collect(), test_df ) compare_dplyr_binding( .input %>% mutate(x = wday(date, week_start = 1)) %>% collect(), test_df ) compare_dplyr_binding( .input %>% mutate(x = wday(date, label = TRUE)) %>% mutate(x = as.character(x)) %>% collect(), test_df ) compare_dplyr_binding( .input %>% mutate(x = wday(datetime, label = TRUE, abbr = TRUE)) %>% mutate(x = as.character(x)) %>% collect(), test_df ) }) test_that("extract mday from timestamp", { compare_dplyr_binding( .input %>% mutate(x = mday(datetime)) %>% collect(), test_df ) }) test_that("extract yday from timestamp", { compare_dplyr_binding( .input %>% mutate( x = yday(datetime), x2 = lubridate::yday(datetime) ) %>% collect(), test_df ) }) test_that("extract qday from timestamp", { test_df <- tibble::tibble( time = as.POSIXct(seq(as.Date("1999-12-31", tz = "UTC"), as.Date("2001-01-01", tz = "UTC"), by = "day")) ) compare_dplyr_binding( .input %>% transmute(x = qday(time)) %>% collect(), test_df ) compare_dplyr_binding( .input %>% transmute(x = qday(as.POSIXct("2022-06-29 12:35"))) %>% collect(), test_df ) }) test_that("extract hour from timestamp", { compare_dplyr_binding( .input %>% mutate( x = hour(datetime), x2 = lubridate::hour(datetime) ) %>% collect(), test_df ) }) test_that("extract minute from timestamp", { compare_dplyr_binding( .input %>% mutate( x = minute(datetime), x2 = lubridate::minute(datetime) ) %>% collect(), test_df ) }) test_that("extract second from timestamp", { compare_dplyr_binding( .input %>% mutate( x = second(datetime), x2 = lubridate::second(datetime) ) %>% collect(), test_df, # arrow supports nanosecond resolution but lubridate does not tolerance = 1e-6 ) }) # These tests test extraction of components from date32 objects test_that("extract year from date", { compare_dplyr_binding( .input %>% mutate( x = year(date), x2 = lubridate::year(date) ) %>% collect(), test_df ) }) test_that("extract isoyear from date", { compare_dplyr_binding( .input %>% mutate( x = isoyear(date), x2 = lubridate::isoyear(date) ) %>% collect(), test_df ) }) test_that("extract epiyear from date", { compare_dplyr_binding( .input %>% mutate(x = epiyear(date)) %>% collect(), test_df ) }) test_that("extract quarter from date", { compare_dplyr_binding( .input %>% mutate( x = quarter(date), x2 = lubridate::quarter(date) ) %>% collect(), test_df ) }) test_that("extract isoweek from date", { compare_dplyr_binding( .input %>% mutate(x = isoweek(date)) %>% collect(), test_df ) }) test_that("extract epiweek from date", { compare_dplyr_binding( .input %>% mutate( x = epiweek(date), x2 = lubridate::epiweek(date) ) %>% collect(), test_df ) }) test_that("extract week from date", { compare_dplyr_binding( .input %>% mutate(x = week(date)) %>% collect(), test_df ) }) test_that("extract month from date", { compare_dplyr_binding( .input %>% mutate(x = month(date)) %>% collect(), test_df ) compare_dplyr_binding( .input %>% # R returns ordered factor whereas Arrow returns character mutate(x = as.character(month(date, label = TRUE))) %>% collect(), test_df, ignore_attr = TRUE ) compare_dplyr_binding( .input %>% mutate(x = as.character(month(date, label = TRUE, abbr = TRUE))) %>% collect(), test_df, ignore_attr = TRUE ) }) test_that("extract day from date", { compare_dplyr_binding( .input %>% mutate( x = day(date), x2 = lubridate::day(date) ) %>% collect(), test_df ) }) test_that("extract wday from date", { compare_dplyr_binding( .input %>% mutate(x = wday(date)) %>% collect(), test_df ) compare_dplyr_binding( .input %>% mutate( x = wday(date, week_start = 3), x2 = lubridate::wday(date, week_start = 3) ) %>% collect(), test_df ) compare_dplyr_binding( .input %>% mutate(x = wday(date, week_start = 1)) %>% collect(), test_df ) compare_dplyr_binding( .input %>% mutate(x = wday(date, label = TRUE, abbr = TRUE)) %>% mutate(x = as.character(x)) %>% collect(), test_df ) compare_dplyr_binding( .input %>% mutate(x = wday(date, label = TRUE)) %>% mutate(x = as.character(x)) %>% collect(), test_df ) }) test_that("extract mday from date", { compare_dplyr_binding( .input %>% mutate( x = mday(date), x2 = lubridate::mday(date) ) %>% collect(), test_df ) }) test_that("extract yday from date", { compare_dplyr_binding( .input %>% mutate(x = yday(date)) %>% collect(), test_df ) }) test_that("extract qday from date", { test_df <- tibble::tibble( date = seq(as.Date("1999-12-31"), as.Date("2001-01-01"), by = "day") ) compare_dplyr_binding( .input %>% mutate(x = qday(date)) %>% collect(), test_df ) compare_dplyr_binding( .input %>% mutate(y = qday(as.Date("2022-06-29"))) %>% collect(), test_df ) }) test_that("leap_year mirror lubridate", { compare_dplyr_binding( .input %>% mutate( x = leap_year(date), x2 = lubridate::leap_year(date) ) %>% collect(), test_df ) compare_dplyr_binding( .input %>% mutate(x = leap_year(datetime)) %>% collect(), test_df ) compare_dplyr_binding( .input %>% mutate(x = leap_year(test_year)) %>% collect(), tibble::tibble( test_year = as.Date(c( "1998-01-01", # not leap year "1996-01-01", # leap year (divide by 4 rule) "1900-01-01", # not leap year (divide by 100 rule) "2000-01-01" # leap year (divide by 400 rule) )) ) ) }) test_that("am/pm mirror lubridate", { compare_dplyr_binding( .input %>% mutate( am = am(test_time), pm = pm(test_time), am2 = lubridate::am(test_time), pm2 = lubridate::pm(test_time) ) %>% # can't use collect() here due to how tibbles store datetimes # TODO: add better explanation above as.data.frame(), data.frame( test_time = strptime( x = c( "2022-01-25 11:50:59", "2022-01-25 12:00:00", "2022-01-25 00:00:00" ), format = "%Y-%m-%d %H:%M:%S" ) ) ) }) test_that("extract tz", { df <- tibble( posixct_date = as.POSIXct(c("2022-02-07", "2022-02-10"), tz = "Pacific/Marquesas"), ) compare_dplyr_binding( .input %>% mutate( timezone_posixct_date = tz(posixct_date), timezone_posixct_date2 = lubridate::tz(posixct_date) ) %>% collect(), df ) # test a few types directly from R objects expect_error( call_binding("tz", "2020-10-01"), "timezone extraction for objects of class `string` not supported in Arrow" ) expect_error( call_binding("tz", as.Date("2020-10-01")), "timezone extraction for objects of class `date32[day]` not supported in Arrow", fixed = TRUE ) expect_error( call_binding("tz", 1L), "timezone extraction for objects of class `int32` not supported in Arrow" ) expect_error( call_binding("tz", 1.1), "timezone extraction for objects of class `double` not supported in Arrow" ) # Test one expression expect_error( call_binding("tz", Expression$scalar("2020-10-01")), "timezone extraction for objects of class `string` not supported in Arrow" ) }) test_that("semester works with temporal types and integers", { test_df <- tibble( month_as_int = c(1:12, NA), month_as_char_pad = sprintf("%02i", month_as_int), dates = as.Date(paste0("2021-", month_as_char_pad, "-15")) ) # semester extraction from dates compare_dplyr_binding( .input %>% mutate( sem_wo_year = semester(dates), sem_wo_year2 = lubridate::semester(dates), sem_w_year = semester(dates, with_year = TRUE) ) %>% collect(), test_df ) compare_dplyr_binding( .input %>% mutate(sem_month_as_int = semester(month_as_int)) %>% collect(), test_df ) expect_error( test_df %>% arrow_table() %>% mutate(sem_month_as_char_pad = semester(month_as_char_pad)) %>% collect(), regexp = "NotImplemented: Function 'month' has no kernel matching input types (string)", fixed = TRUE ) }) test_that("dst extracts daylight savings time correctly", { test_df <- tibble( dates = as.POSIXct(c("2021-02-20", "2021-07-31", "2021-10-31", "2021-01-31"), tz = "Europe/London") ) compare_dplyr_binding( .input %>% mutate( dst = dst(dates), dst2 = lubridate::dst(dates) ) %>% collect(), test_df ) }) test_that("month() supports integer input", { test_df_month <- tibble( month_as_int = c(1:12, NA) ) compare_dplyr_binding( .input %>% mutate(month_int_input = month(month_as_int)) %>% collect(), test_df_month ) compare_dplyr_binding( .input %>% # R returns ordered factor whereas Arrow returns character mutate( month_int_input = as.character(month(month_as_int, label = TRUE)) ) %>% collect(), test_df_month ) compare_dplyr_binding( .input %>% # R returns ordered factor whereas Arrow returns character mutate( month_int_input = as.character( month(month_as_int, label = TRUE, abbr = FALSE) ) ) %>% collect(), test_df_month ) }) test_that("month() errors with double input and returns NA with int outside 1:12", { test_df_month <- tibble( month_as_int = c(-1L, 1L, 13L, NA), month_as_double = month_as_int + 0.1 ) expect_equal( test_df_month %>% arrow_table() %>% select(month_as_int) %>% mutate(month_int_input = month(month_as_int)) %>% collect(), tibble( month_as_int = c(-1L, 1L, 13L, NA), month_int_input = c(NA, 1L, NA, NA) ) ) expect_error( test_df_month %>% arrow_table() %>% mutate(month_dbl_input = month(month_as_double)) %>% collect(), regexp = "Function 'month' has no kernel matching input types (double)", fixed = TRUE ) expect_error( test_df_month %>% record_batch() %>% mutate(month_dbl_input = month(month_as_double)) %>% collect(), regexp = "Function 'month' has no kernel matching input types (double)", fixed = TRUE ) }) test_that("date works in arrow", { # this date is specific since lubridate::date() is different from base::as.Date() # since as.Date returns the UTC date and date() doesn't test_df <- tibble( posixct_date = as.POSIXct(c("2012-03-26 23:12:13", NA), tz = "America/New_York"), posixct_fractional_second = as_datetime(c("2012-03-26 23:12:13.676632", NA)), integer_var = c(32L, NA) ) r_date_object <- lubridate::ymd_hms("2012-03-26 23:12:13") compare_dplyr_binding( .input %>% mutate(a_date = lubridate::date(posixct_date)) %>% collect(), test_df ) compare_dplyr_binding( .input %>% mutate(a_date_base = as.Date(posixct_date)) %>% collect(), test_df ) compare_dplyr_binding( .input %>% mutate(a_date_base = as.Date(posixct_fractional_second)) %>% collect(), test_df ) compare_dplyr_binding( .input %>% mutate(date_from_r_object = lubridate::date(r_date_object)) %>% collect(), test_df ) compare_dplyr_binding( .input %>% mutate(as_date_from_r_object = as.Date(r_date_object)) %>% collect(), test_df ) # date from integer supported in arrow (similar to base::as.Date()), but in # Arrow it assumes a fixed origin "1970-01-01". However this is not supported # by lubridate. lubridate::date(integer_var) errors without an `origin` expect_equal( test_df %>% arrow_table() %>% select(integer_var) %>% mutate(date_int = date(integer_var)) %>% collect(), tibble( integer_var = c(32L, NA), date_int = as.Date(c("1970-02-02", NA)) ) ) }) test_that("date() errors with unsupported inputs", { # Use InMemoryDataset here so that abandon_ship() errors instead of warns. # The lubridate version errors too. skip_if_not_available("dataset") expect_error( example_data %>% InMemoryDataset$create() %>% mutate(date_bool = lubridate::date(TRUE)) %>% collect(), regexp = "Unsupported cast from bool to date32 using function cast_date32" ) }) test_that("make_date & make_datetime", { test_df <- expand.grid( year = c(1999, 1969, 2069, NA), month = c(1, 2, 7, 12, NA), day = c(1, 9, 13, 28, NA), hour = c(0, 7, 23, NA), min = c(0, 59, NA), sec = c(0, 59, NA) ) %>% tibble() compare_dplyr_binding( .input %>% mutate( composed_date = make_date(year, month, day), composed_date2 = lubridate::make_date(year, month, day) ) %>% collect(), test_df ) compare_dplyr_binding( .input %>% mutate(composed_date_r_obj = make_date(1999, 12, 31)) %>% collect(), test_df ) compare_dplyr_binding( .input %>% mutate( composed_datetime = make_datetime(year, month, day, hour, min, sec), composed_datetime2 = lubridate::make_datetime(year, month, day, hour, min, sec) ) %>% collect(), test_df, # the make_datetime binding uses strptime which does not support tz, hence # a mismatch in tzone attribute (ARROW-12820) ignore_attr = TRUE ) compare_dplyr_binding( .input %>% mutate( composed_datetime_r_obj = make_datetime(1999, 12, 31, 14, 15, 16) ) %>% collect(), test_df, # the make_datetime binding uses strptime which does not support tz, hence # a mismatch in tzone attribute (ARROW-12820) ignore_attr = TRUE ) }) test_that("ISO_datetime & ISOdate", { test_df <- expand.grid( year = c(1999, 1969, 2069, NA), month = c(1, 2, 7, 12, NA), day = c(1, 9, 13, 28, NA), hour = c(0, 7, 23, NA), min = c(0, 59, NA), sec = c(0, 59, NA) ) %>% tibble() compare_dplyr_binding( .input %>% mutate( composed_date = ISOdate(year, month, day), composed_date2 = base::ISOdate(year, month, day) ) %>% collect(), test_df, # the make_datetime binding uses strptime which does not support tz, hence # a mismatch in tzone attribute (ARROW-12820) ignore_attr = TRUE ) compare_dplyr_binding( .input %>% mutate(composed_date_r_obj = ISOdate(1999, 12, 31)) %>% collect(), test_df, # the make_datetime binding uses strptime which does not support tz, hence # a mismatch in tzone attribute (ARROW-12820) ignore_attr = TRUE ) # the default `tz` for base::ISOdatetime is "", but in Arrow it's "UTC" compare_dplyr_binding( .input %>% mutate( composed_datetime = ISOdatetime(year, month, day, hour, min, sec, tz = "UTC"), composed_datetime2 = base::ISOdatetime(year, month, day, hour, min, sec, tz = "UTC") ) %>% collect(), test_df, # the make_datetime binding uses strptime which does not support tz, hence # a mismatch in tzone attribute (ARROW-12820) ignore_attr = TRUE ) compare_dplyr_binding( .input %>% mutate( composed_datetime_r_obj = ISOdatetime(1999, 12, 31, 14, 15, 16) ) %>% collect(), test_df, # the make_datetime binding uses strptime which does not support tz, hence # a mismatch in tzone attribute (ARROW-12820) ignore_attr = TRUE ) }) test_that("difftime()", { test_df <- tibble( time1 = as.POSIXct( c("2021-02-20", "2021-07-31 0:0:0", "2021-10-30", "2021-01-31 0:0:0") ), time2 = as.POSIXct( c("2021-02-20 00:02:01", "2021-07-31 00:03:54", "2021-10-30 00:05:45", "2021-01-31 00:07:36") ), secs = c(121L, 234L, 345L, 456L) ) compare_dplyr_binding( .input %>% mutate( secs = difftime(time1, time2, units = "secs"), secs2 = base::difftime(time1, time2, units = "secs") ) %>% collect(), test_df, ignore_attr = TRUE ) # units other than "secs" not supported in arrow compare_dplyr_binding( .input %>% mutate( mins = difftime(time1, time2, units = "mins") ) %>% collect(), test_df, warning = TRUE, ignore_attr = TRUE ) test_df_with_tz <- tibble( time1 = as.POSIXct( c("2021-02-20", "2021-07-31", "2021-10-30", "2021-01-31"), tz = "Pacific/Marquesas" ), time2 = as.POSIXct( c("2021-02-20 00:02:01", "2021-07-31 00:03:54", "2021-10-30 00:05:45", "2021-01-31 00:07:36"), tz = "Asia/Kathmandu" ), secs = c(121L, 234L, 345L, 456L) ) compare_dplyr_binding( .input %>% mutate(secs2 = difftime(time2, time1, units = "secs")) %>% collect(), test_df_with_tz ) compare_dplyr_binding( .input %>% mutate( secs2 = difftime( as.POSIXct("2022-03-07", tz = "Pacific/Marquesas"), time1, units = "secs" ) ) %>% collect(), test_df_with_tz ) # `tz` is effectively ignored both in R (used only if inputs are POSIXlt) and Arrow compare_dplyr_binding( .input %>% mutate(secs2 = difftime(time2, time1, units = "secs", tz = "Pacific/Marquesas")) %>% collect(), test_df_with_tz, warning = "`tz` argument is not supported in Arrow, so it will be ignored" ) }) test_that("as.difftime()", { test_df <- tibble( hms_string = c("0:7:45", "12:34:56"), hm_string = c("7:45", "12:34"), int = c(30L, 75L), integerish_dbl = c(31, 76), dbl = c(31.2, 76.4) ) compare_dplyr_binding( .input %>% mutate( hms_difftime = as.difftime(hms_string, units = "secs"), hms_difftime2 = base::as.difftime(hms_string, units = "secs") ) %>% collect(), test_df ) # TODO add test with `format` mismatch returning NA once # https://issues.apache.org/jira/browse/ARROW-15659 is solved # for example: as.difftime("07:", format = "%H:%M") should return NA compare_dplyr_binding( .input %>% mutate(hm_difftime = as.difftime(hm_string, units = "secs", format = "%H:%M")) %>% collect(), test_df ) compare_dplyr_binding( .input %>% mutate(int_difftime = as.difftime(int, units = "secs")) %>% collect(), test_df ) compare_dplyr_binding( .input %>% mutate(integerish_dbl_difftime = as.difftime(integerish_dbl, units = "secs")) %>% collect(), test_df ) # "mins" or other values for units cannot be handled in Arrow compare_dplyr_binding( .input %>% mutate(int_difftime = as.difftime(int, units = "mins")) %>% collect(), test_df, warning = TRUE ) # only integer (or integer-like) -> duration conversion supported in Arrow. # double -> duration not supported. We aren't testing the content of the # error message as it is being generated in the C++ code and it might change, # but we want to make sure that this error is raised in our binding implementation expect_error( test_df %>% arrow_table() %>% mutate(dbl_difftime = as.difftime(dbl, units = "secs")) %>% collect() ) }) test_that("`decimal_date()` and `date_decimal()`", { test_df <- tibble( a = c( 2007.38998954347, 1970.77732069883, 2020.96061799722, 2009.43465948477, 1975.71251467871, NA ), b = as.POSIXct( c( "2007-05-23 08:18:30", "1970-10-11 17:19:45", "2020-12-17 14:04:06", "2009-06-08 15:37:01", "1975-09-18 01:37:42", NA ) ), c = as.Date( c("2007-05-23", "1970-10-11", "2020-12-17", "2009-06-08", "1975-09-18", NA) ) ) compare_dplyr_binding( .input %>% mutate( decimal_date_from_POSIXct = decimal_date(b), decimal_date_from_POSIXct2 = lubridate::decimal_date(b), decimal_date_from_r_POSIXct_obj = decimal_date(as.POSIXct("2022-03-25 15:37:01")), decimal_date_from_r_date_obj = decimal_date(as.Date("2022-03-25")), decimal_date_from_date = decimal_date(c), date_from_decimal = date_decimal(a), date_from_decimal2 = lubridate::date_decimal(a), date_from_decimal_r_obj = date_decimal(2022.178) ) %>% collect(), test_df, ignore_attr = "tzone" ) }) test_that("dminutes, dhours, ddays, dweeks, dmonths, dyears", { example_d <- tibble(x = c(1:10, NA)) date_to_add <- ymd("2009-08-03", tz = "Pacific/Marquesas") # When comparing results we use ignore_attr = TRUE because of the diff in: # attribute 'package' (absent vs. 'lubridate') # class (difftime vs Duration) # attribute 'units' (character vector ('secs') vs. absent) compare_dplyr_binding( .input %>% mutate( dminutes = dminutes(x), dhours = dhours(x), ddays = ddays(x), dweeks = dweeks(x), dmonths = dmonths(x), dyears = dyears(x) ) %>% collect(), example_d, ignore_attr = TRUE ) compare_dplyr_binding( .input %>% mutate( dhours = dhours(x), ddays = ddays(x), new_date_1 = date_to_add + ddays, new_date_2 = date_to_add + ddays - dhours(3), new_duration = dhours - ddays ) %>% collect(), example_d, ignore_attr = TRUE ) compare_dplyr_binding( .input %>% mutate( r_obj_dminutes = dminutes(1), r_obj_dhours = dhours(2), r_obj_ddays = ddays(3), r_obj_dweeks = dweeks(4), r_obj_dmonths = dmonths(5), r_obj_dyears = dyears(6), r_obj_dminutes2 = lubridate::dminutes(1), r_obj_dhours2 = lubridate::dhours(2), r_obj_ddays2 = lubridate::ddays(3), r_obj_dweeks2 = lubridate::dweeks(4), r_obj_dmonths2 = lubridate::dmonths(5), r_obj_dyears2 = lubridate::dyears(6) ) %>% collect(), tibble(), ignore_attr = TRUE ) # double -> duration not supported in Arrow. # With a scalar, cast to int64 error in mutate() -> abandon_ship warning expect_warning( test_df %>% arrow_table() %>% mutate(r_obj_dminutes = dminutes(1.12345)), "not supported in Arrow" ) # When operating on a column, it doesn't happen until collect() expect_error( arrow_table(dbl = 1.948230) %>% mutate(r_obj_dminutes = dminutes(dbl)) %>% collect(), "truncated converting to int64" ) }) test_that("dseconds, dmilliseconds, dmicroseconds, dnanoseconds, dpicoseconds", { example_d <- tibble(x = c(1:10, NA)) date_to_add <- ymd("2009-08-03", tz = "Pacific/Marquesas") # When comparing results we use ignore_attr = TRUE because of the diff in: # attribute 'package' (absent vs. 'lubridate') # class (difftime vs Duration) # attribute 'units' (character vector ('secs') vs. absent) compare_dplyr_binding( .input %>% mutate( dseconds = dseconds(x), dmilliseconds = dmilliseconds(x), dmicroseconds = dmicroseconds(x), dnanoseconds = dnanoseconds(x), dseconds2 = lubridate::dseconds(x), dmilliseconds2 = lubridate::dmilliseconds(x), dmicroseconds2 = lubridate::dmicroseconds(x), dnanoseconds2 = lubridate::dnanoseconds(x), ) %>% collect(), example_d, ignore_attr = TRUE ) compare_dplyr_binding( .input %>% mutate( dseconds = dseconds(x), dmicroseconds = dmicroseconds(x), new_date_1 = date_to_add + dseconds, new_date_2 = date_to_add + dseconds - dmicroseconds, new_duration = dseconds - dmicroseconds ) %>% collect(), example_d, ignore_attr = TRUE ) compare_dplyr_binding( .input %>% mutate( r_obj_dseconds = dseconds(1), r_obj_dmilliseconds = dmilliseconds(2), r_obj_dmicroseconds = dmicroseconds(3), r_obj_dnanoseconds = dnanoseconds(4) ) %>% collect(), tibble(), ignore_attr = TRUE ) expect_error( call_binding("dpicoseconds"), "Duration in picoseconds not supported in Arrow" ) expect_error( call_binding("lubridate::dpicoseconds"), "Duration in picoseconds not supported in Arrow" ) }) test_that("make_difftime()", { test_df <- tibble( seconds = c(3, 4, 5, 6), minutes = c(1.5, 2.3, 4.5, 6.7), hours = c(2, 3, 4, 5), days = c(6, 7, 8, 9), weeks = c(1, 3, 5, NA), number = 10:13 ) compare_dplyr_binding( .input %>% mutate( duration_from_parts = make_difftime( second = seconds, minute = minutes, hour = hours, day = days, week = weeks, units = "secs" ), duration_from_num = make_difftime( num = number, units = "secs" ), duration_from_r_num = make_difftime( num = 154, units = "secs" ), duration_from_r_parts = make_difftime( minute = 45, day = 2, week = 4, units = "secs" ), duration_from_parts2 = lubridate::make_difftime( second = seconds, minute = minutes, hour = hours, day = days, week = weeks, units = "secs" ) ) %>% collect(), test_df ) # named difftime parts other than `second`, `minute`, `hour`, `day` and `week` # are not supported expect_error( expect_warning( test_df %>% arrow_table() %>% mutate( err_difftime = make_difftime(month = 2) ) %>% collect(), paste0( "named `difftime` units other than: `second`, `minute`, `hour`,", " `day`, and `week` not supported in Arrow." ) ) ) # units other than "secs" not supported since they are the only ones in common # between R and Arrow compare_dplyr_binding( .input %>% mutate(error_difftime = make_difftime(num = number, units = "mins")) %>% collect(), test_df, warning = TRUE ) # constructing a difftime from both `num` and parts passed through `...` while # possible with the lubridate function (resulting in a concatenation of the 2 # resulting objects), it errors in a dplyr context expect_error( expect_warning( test_df %>% arrow_table() %>% mutate( duration_from_num_and_parts = make_difftime( num = number, second = seconds, minute = minutes, hour = hours, day = days, week = weeks, units = "secs" ) ) %>% collect(), "with both `num` and `...` not supported in Arrow" ) ) }) test_that("`as.Date()` and `as_date()`", { test_df <- tibble::tibble( posixct_var = as.POSIXct(c("2022-02-25 00:00:01", "1987-11-24 12:34:56", NA), tz = "Pacific/Marquesas"), dt_europe = ymd_hms("2010-08-03 00:50:50", "1987-11-24 12:34:56", NA, tz = "Europe/London"), dt_utc = ymd_hms("2010-08-03 00:50:50", "1987-11-24 12:34:56", NA), date_var = as.Date(c("2022-02-25", "1987-11-24", NA)), difference_date = ymd_hms("2010-08-03 00:50:50", "1987-11-24 12:34:56", NA, tz = "Pacific/Marquesas"), try_formats_string = c(NA, "2022-01-01", "2022/01/01"), character_ymd_hms_var = c("2022-02-25 00:00:01", "1987-11-24 12:34:56", NA), character_ydm_hms_var = c("2022/25/02 00:00:01", "1987/24/11 12:34:56", NA), character_ymd_var = c("2022-02-25", "1987-11-24", NA), character_ydm_var = c("2022/25/02", "1987/24/11", NA), integer_var = c(21L, 32L, NA), integerish_var = c(21, 32, NA), double_var = c(12.34, 56.78, NA) ) compare_dplyr_binding( .input %>% mutate( date_dv1 = as.Date(date_var), date_dv1_nmspc = base::as.Date(date_var), date_pv1 = as.Date(posixct_var), date_pv_tz1 = as.Date(posixct_var, tz = "Pacific/Marquesas"), date_utc1 = as.Date(dt_utc), date_europe1 = as.Date(dt_europe), date_char_ymd_hms1 = as.Date(character_ymd_hms_var, format = "%Y-%m-%d %H:%M:%S"), date_char_ydm_hms1 = as.Date(character_ydm_hms_var, format = "%Y/%d/%m %H:%M:%S"), date_int1 = as.Date(integer_var, origin = "1970-01-01"), date_int_origin1 = as.Date(integer_var, origin = "1970-01-03"), date_integerish1 = as.Date(integerish_var, origin = "1970-01-01"), date_dv2 = as_date(date_var), date_dv2_nmspc = lubridate::as_date(date_var), date_pv2 = as_date(posixct_var), date_pv_tz2 = as_date(posixct_var, tz = "Pacific/Marquesas"), date_utc2 = as_date(dt_utc), date_europe2 = as_date(dt_europe), date_char_ymd2 = as_date(character_ymd_hms_var, format = "%Y-%m-%d %H:%M:%S"), date_char_ydm2 = as_date(character_ydm_hms_var, format = "%Y/%d/%m %H:%M:%S"), date_int2 = as_date(integer_var, origin = "1970-01-01"), date_int_origin2 = as_date(integer_var, origin = "1970-01-03"), date_integerish2 = as_date(integerish_var, origin = "1970-01-01") ) %>% collect(), test_df ) # strptime does not support a partial format - Arrow returns NA, while # lubridate parses correctly # TODO: revisit after ARROW-15813 expect_error( expect_equal( test_df %>% arrow_table() %>% mutate(date_char_ymd_hms = as_date(character_ymd_hms_var)) %>% collect(), test_df %>% mutate(date_char_ymd_hms = as_date(character_ymd_hms_var)) %>% collect() ) ) # same as above expect_error( expect_equal( test_df %>% arrow_table() %>% mutate(date_char_ymd_hms = as.Date(character_ymd_hms_var)) %>% collect(), test_df %>% mutate(date_char_ymd_hms = as.Date(character_ymd_hms_var)) %>% collect() ) ) # we do not support as.Date() with double/ float (error surfaced from C++) # TODO: revisit after ARROW-15798 expect_error( test_df %>% arrow_table() %>% mutate(date_double = as.Date(double_var, origin = "1970-01-01")) %>% collect() ) expect_error( test_df %>% arrow_table() %>% mutate(date_double = as_date(double_var, origin = "1970-01-01")) %>% collect() ) # difference between as.Date() and as_date(): # `as.Date()` ignores the `tzone` attribute and uses the value of the `tz` arg # to `as.Date()` # `as_date()` does the opposite: uses the tzone attribute of the POSIXct object # passed if`tz` is NULL compare_dplyr_binding( .input %>% transmute( date_diff_lubridate = as_date(difference_date), date_diff_base = as.Date(difference_date) ) %>% collect(), test_df ) skip_if_not_available("dataset") # we do not support multiple tryFormats # Use a dataset to test the alternative suggestion message expect_snapshot( test_df %>% InMemoryDataset$create() %>% transmute( date_char_ymd = as.Date( character_ymd_var, tryFormats = c("%Y-%m-%d", "%Y/%m/%d") ) ) %>% collect(), error = TRUE ) }) test_that("`as_date()` and `as.Date()` work with R objects", { compare_dplyr_binding( .input %>% mutate( date1 = as.Date("2022-05-10"), date2 = as.Date(12, origin = "2022-05-01"), date3 = as.Date("2022-10-03", tryFormats = "%Y-%m-%d"), date4 = as_date("2022-05-10"), date5 = as_date(12, origin = "2022-05-01"), date6 = as_date("2022-10-03") ) %>% collect(), tibble( a = 1 ) ) }) test_that("`as_datetime()`", { test_df <- tibble( date = as.Date(c("2022-03-22", "2021-07-30", NA)), char_date = c("2022-03-22", "2021-07-30 14:32:47", NA), char_date_subsec = c("1970-01-01T00:00:59.123456789", "2000-02-29T23:23:23.999999999", NA), char_date_non_iso = c("2022-22-03 12:34:56", "2021-30-07 14:32:47", NA), int_date = c(10L, 25L, NA), integerish_date = c(10, 25, NA), double_date = c(10.1, 25.2, NA) ) compare_dplyr_binding( .input %>% mutate( ddate = as_datetime(date), ddate2 = lubridate::as_datetime(date), dchar_date_no_tz = as_datetime(char_date), dchar_date_with_tz = as_datetime(char_date, tz = "Pacific/Marquesas"), dchar_date_subsec_no_tz = as_datetime(char_date_subsec), dchar_date_subsec_with_tz = as_datetime(char_date_subsec, tz = "Pacific/Marquesas"), dint_date = as_datetime(int_date, origin = "1970-01-02"), dintegerish_date = as_datetime(integerish_date, origin = "1970-01-02"), dintegerish_date2 = as_datetime(integerish_date, origin = "1970-01-01"), ddouble_date = as_datetime(double_date) ) %>% collect(), test_df ) expect_identical( test_df %>% arrow_table() %>% mutate( x = cast(as_datetime(double_date, unit = "ns"), int64()), y = cast(as_datetime(double_date, unit = "us"), int64()), z = cast(as_datetime(double_date, unit = "ms"), int64()), .keep = "none" ) %>% collect(), tibble( x = bit64::as.integer64(c(10100000000, 25200000000, NA)), y = as.integer(c(10100000, 25200000, NA)), z = as.integer(c(10100, 25200, NA)) ) ) }) test_that("as_datetime() works with other functions", { test_df <- tibble( char_date = c("2022-03-22", "2021-07-30 14:32:47", "1970-01-01 00:00:59.123456789", NA) ) compare_dplyr_binding( .input %>% transmute( ddchar_date = as_datetime(char_date), ddchar_date_date32_1 = as.Date(ddchar_date), ddchar_date_date32_2 = as_date(ddchar_date), ddchar_date_floored = floor_date(ddchar_date, unit = "days") ) %>% collect(), test_df ) # ARROW-17428 - Arrow does not support conversion of timestamp to int32 expect_error( test_df %>% arrow_table() %>% mutate( dchar_date = as_datetime(char_date), dchar_date_int = as.integer(dchar_date) ) %>% collect() ) # ARROW-17428 - Arrow does not support conversion of timestamp to double expect_error( test_df %>% arrow_table() %>% mutate( dchar_date = as_datetime(char_date), dchar_date_num = as.numeric(dchar_date) ) %>% collect() ) }) test_that("parse_date_time() works with year, month, and date components", { # these functions' internals use some string processing which requires the # RE2 library skip_if_not_available("re2") compare_dplyr_binding( .input %>% mutate( parsed_date_ymd = parse_date_time(string_ymd, orders = "ymd"), parsed_date_ymd2 = lubridate::parse_date_time(string_ymd, orders = "ymd"), parsed_date_dmy = parse_date_time(string_dmy, orders = "dmy"), parsed_date_mdy = parse_date_time(string_mdy, orders = "mdy") ) %>% collect(), tibble::tibble( string_ymd = c( "2021-09-1", "2021/09///2", "2021.09.03", "2021,09,4", "2021:09::5", "2021 09 6", "21-09-07", "21/09/08", "21.09.9", "21,09,10", "21:09:11", "20210912", "210913", NA ), string_dmy = c( "1-09-2021", "2/09//2021", "03.09.2021", "04,09,2021", "5:::09:2021", "6 09 2021", "07-09-21", "08/09/21", "9.09.21", "10,09,21", "11:09:21", "12092021", "130921", NA ), string_mdy = c( "09-01-2021", "09/2/2021", "09.3.2021", "09,04,2021", "09:05:2021", "09 6 2021", "09-7-21", "09/08/21", "09.9.21", "09,10,21", "09:11:21", "09122021", "091321", NA ) ) ) # TODO(ARROW-16443): locale (affecting "%b% and "%B") does not work on Windows skip_on_os("windows") compare_dplyr_binding( .input %>% mutate( parsed_date_ymd = parse_date_time(string_ymd, orders = "ymd"), parsed_date_dmy = parse_date_time(string_dmy, orders = "dmy"), parsed_date_mdy = parse_date_time(string_mdy, orders = "mdy") ) %>% collect(), tibble::tibble( string_ymd = c( "2021 Sep 12", "2021 September 13", "21 Sep 14", "21 September 15", "2021Sep16", NA ), string_dmy = c( "12 Sep 2021", "13 September 2021", "14 Sep 21", "15 September 21", "16Sep2021", NA ), string_mdy = c( "Sep 12 2021", "September 13 2021", "Sep 14 21", "September 15 21", "Sep1621", NA ) ) ) }) test_that("parse_date_time() works with a mix of formats and orders", { # these functions' internals use some string processing which requires the # RE2 library skip_if_not_available("re2") test_df <- tibble( string_combi = c("2021-09-1", "2/09//2021", "09.3.2021") ) compare_dplyr_binding( .input %>% mutate( date_from_string = parse_date_time( string_combi, orders = c("ymd", "%d/%m//%Y", "%m.%d.%Y") ) ) %>% collect(), test_df ) }) test_that("year, month, day date/time parsers", { test_df <- tibble::tibble( ymd_string = c("2022-05-11", "2022/05/12", "22.05-13"), ydm_string = c("2022-11-05", "2022/12/05", "22.13-05"), mdy_string = c("05-11-2022", "05/12/2022", "05.13-22"), myd_string = c("05-2022-11", "05/2022/12", "05.22-14"), dmy_string = c("11-05-2022", "12/05/2022", "13.05-22"), dym_string = c("11-2022-05", "12/2022/05", "13.22-05") ) # these functions' internals use some string processing which requires the # RE2 library skip_if_not_available("re2") compare_dplyr_binding( .input %>% mutate( ymd_date = ymd(ymd_string), ydm_date = ydm(ydm_string), mdy_date = mdy(mdy_string), myd_date = myd(myd_string), dmy_date = dmy(dmy_string), dym_date = dym(dym_string), ymd_date2 = lubridate::ymd(ymd_string), ydm_date2 = lubridate::ydm(ydm_string), mdy_date2 = lubridate::mdy(mdy_string), myd_date2 = lubridate::myd(myd_string), dmy_date2 = lubridate::dmy(dmy_string), dym_date2 = lubridate::dym(dym_string) ) %>% collect(), test_df ) compare_dplyr_binding( .input %>% mutate( ymd_date = ymd(ymd_string, tz = "Pacific/Marquesas"), ydm_date = ydm(ydm_string, tz = "Pacific/Marquesas"), mdy_date = mdy(mdy_string, tz = "Pacific/Marquesas"), myd_date = myd(myd_string, tz = "Pacific/Marquesas"), dmy_date = dmy(dmy_string, tz = "Pacific/Marquesas"), dym_date = dym(dym_string, tz = "Pacific/Marquesas") ) %>% collect(), test_df ) }) test_that("ym, my & yq parsers", { test_df <- tibble::tibble( ym_string = c("2022-05", "2022/02", "22.3", "1979//12", "88.09", NA), my_string = c("05-2022", "02/2022", "03.22", "12//1979", "09.88", NA), Ym_string = c("2022-05", "2022/02", "2022.03", "1979//12", "1988.09", NA), mY_string = c("05-2022", "02/2022", "03.2022", "12//1979", "09.1988", NA), yq_string = c("2007.3", "1971.2", "2021.1", "2009.4", "1975.1", NA), yq_numeric = c(2007.3, 1971.2, 2021.1, 2009.4, 1975.1, NA), yq_space = c("2007 3", "1970 2", "2020 1", "2009 4", "1975 1", NA), qy_string = c("3.2007", "2.1971", "1.2020", "4.2009", "1.1975", NA), qy_numeric = c(3.2007, 2.1971, 1.2021, 4.2009, 1.1975, NA), qy_space = c("3 2007", "2 1971", "1 2021", "4 2009", "1 1975", NA) ) # these functions' internals use some string processing which requires the # RE2 library skip_if_not_available("re2") compare_dplyr_binding( .input %>% mutate( ym_date = ym(ym_string), ym_date2 = lubridate::ym(ym_string), ym_datetime = ym(ym_string, tz = "Pacific/Marquesas"), Ym_date = ym(Ym_string), Ym_datetime = ym(Ym_string, tz = "Pacific/Marquesas"), my_date = my(my_string), my_date2 = lubridate::my(my_string), my_datetime = my(my_string, tz = "Pacific/Marquesas"), mY_date = my(mY_string), mY_datetime = my(mY_string, tz = "Pacific/Marquesas"), yq_date_from_string = yq(yq_string), yq_date_from_string2 = lubridate::yq(yq_string), yq_datetime_from_string = yq(yq_string, tz = "Pacific/Marquesas"), yq_date_from_numeric = yq(yq_numeric), yq_datetime_from_numeric = yq(yq_numeric, tz = "Pacific/Marquesas"), yq_date_from_string_with_space = yq(yq_space), yq_datetime_from_string_with_space = yq(yq_space, tz = "Pacific/Marquesas"), ym_date2 = parse_date_time(ym_string, orders = c("ym", "ymd")), my_date2 = parse_date_time(my_string, orders = c("my", "myd")), Ym_date2 = parse_date_time(Ym_string, orders = c("Ym", "ymd")), mY_date2 = parse_date_time(mY_string, orders = c("mY", "myd")), yq_date_from_string2 = parse_date_time(yq_string, orders = "yq"), yq_date_from_numeric2 = parse_date_time(yq_numeric, orders = "yq"), yq_date_from_string_with_space2 = parse_date_time(yq_space, orders = "yq"), # testing with Yq yq_date_from_string3 = parse_date_time(yq_string, orders = "Yq"), yq_date_from_numeric3 = parse_date_time(yq_numeric, orders = "Yq"), yq_date_from_string_with_space3 = parse_date_time(yq_space, orders = "Yq"), # testing with qy qy_date_from_string = parse_date_time(qy_string, orders = "qy"), qy_date_from_numeric = parse_date_time(qy_numeric, orders = "qy"), qy_date_from_string_with_space = parse_date_time(qy_space, orders = "qy"), # testing with qY qy_date_from_string2 = parse_date_time(qy_string, orders = "qY"), qy_date_from_numeric2 = parse_date_time(qy_numeric, orders = "qY"), qy_date_from_string_with_space2 = parse_date_time(qy_space, orders = "qY") ) %>% collect(), test_df ) }) test_that("parse_date_time's other formats", { # these functions' internals use some string processing which requires the # RE2 library skip_if_not_available("re2") compare_dplyr_binding( .input %>% mutate( parsed_H = parse_date_time(string_H, orders = "%Y-%m-%d-%H"), parsed_I = parse_date_time(string_I, orders = "%Y-%m-%d-%I"), parsed_j = parse_date_time(string_j, orders = "%Y-%m-%d-%j"), parsed_M = parse_date_time(string_M, orders = "%Y-%m-%d-%M"), parsed_S = parse_date_time(string_S, orders = "%Y-%m-%d-%S"), parsed_U = parse_date_time(string_U, orders = "%Y-%m-%d-%U"), parsed_w = parse_date_time(string_w, orders = "%Y-%m-%d-%w"), parsed_W = parse_date_time(string_W, orders = "%Y-%m-%d-%W"), parsed_y = parse_date_time(string_y, orders = "%y-%m-%d"), parsed_Y = parse_date_time(string_Y, orders = "%Y-%m-%d"), parsed_R = parse_date_time(string_R, orders = "%Y-%m-%d-%R"), parsed_T = parse_date_time(string_T, orders = "%Y-%m-%d-%T") ) %>% collect(), strptime_test_df ) compare_dplyr_binding( .input %>% mutate( parsed_H = parse_date_time(string_H, orders = "ymdH"), parsed_I = parse_date_time(string_I, orders = "ymdI"), parsed_j = parse_date_time(string_j, orders = "ymdj"), parsed_M = parse_date_time(string_M, orders = "ymdM"), parsed_S = parse_date_time(string_S, orders = "ymdS"), parsed_U = parse_date_time(string_U, orders = "ymdU"), parsed_w = parse_date_time(string_w, orders = "ymdw"), parsed_W = parse_date_time(string_W, orders = "ymdW"), parsed_y = parse_date_time(string_y, orders = "ymd"), parsed_Y = parse_date_time(string_Y, orders = "Ymd"), parsed_R = parse_date_time(string_R, orders = "ymdR"), parsed_T = parse_date_time(string_T, orders = "ymdT") ) %>% collect(), strptime_test_df ) # Some formats are not supported on Windows if (!tolower(Sys.info()[["sysname"]]) == "windows") { compare_dplyr_binding( .input %>% mutate( parsed_a = parse_date_time(string_a, orders = "%Y-%m-%d-%a"), parsed_A = parse_date_time(string_A, orders = "%Y-%m-%d-%A"), parsed_b = parse_date_time(string_b, orders = "%Y-%m-%d-%b"), parsed_B = parse_date_time(string_B, orders = "%Y-%m-%d-%B"), parsed_p = parse_date_time(string_p, orders = "%Y-%m-%d-%p"), parsed_r = parse_date_time(string_r, orders = "%Y-%m-%d-%r") ) %>% collect(), strptime_test_df ) compare_dplyr_binding( .input %>% mutate( parsed_a = parse_date_time(string_a, orders = "ymda"), parsed_A = parse_date_time(string_A, orders = "ymdA"), parsed_b = parse_date_time(string_b, orders = "ymdb"), parsed_B = parse_date_time(string_B, orders = "ymdB"), parsed_p = parse_date_time(string_p, orders = "ymdp"), parsed_r = parse_date_time(string_r, orders = "ymdr") ) %>% collect(), strptime_test_df ) compare_dplyr_binding( .input %>% mutate( parsed_date_ymd = parse_date_time(string_1, orders = "Y-%b-d-%T") ) %>% collect(), tibble::tibble(string_1 = c("2022-Feb-11-12:23:45", NA)) ) } }) test_that("lubridate's fast_strptime", { compare_dplyr_binding( .input %>% mutate( y = fast_strptime(x, format = "%Y-%m-%d %H:%M:%S", lt = FALSE), y2 = lubridate::fast_strptime(x, format = "%Y-%m-%d %H:%M:%S", lt = FALSE) ) %>% collect(), tibble( x = c("2018-10-07 19:04:05", "2022-05-17 21:23:45", NA) ) ) # R object compare_dplyr_binding( .input %>% mutate( y = fast_strptime( "68-10-07 19:04:05", format = "%y-%m-%d %H:%M:%S", lt = FALSE ) ) %>% collect(), tibble( x = c("2018-10-07 19:04:05", NA) ) ) compare_dplyr_binding( .input %>% mutate( date_multi_formats = fast_strptime( x, format = c("%Y-%m-%d %H:%M:%S", "%m-%d-%Y %H:%M:%S"), lt = FALSE ) ) %>% collect(), tibble( x = c("2018-10-07 19:04:05", "10-07-1968 19:04:05") ) ) # these functions' internals use some string processing which requires the # RE2 library skip_if_not_available("re2") compare_dplyr_binding( .input %>% mutate( dttm_with_tz = fast_strptime( dttm_as_string, format = "%Y-%m-%d %H:%M:%S", tz = "Pacific/Marquesas", lt = FALSE ) ) %>% collect(), tibble( dttm_as_string = c("2018-10-07 19:04:05", "1969-10-07 19:04:05", NA) ) ) # fast_strptime()'s `cutoff_2000` argument is not supported, but its value is # implicitly set to 68L both in lubridate and in Arrow compare_dplyr_binding( .input %>% mutate( date_short_year = fast_strptime( x, format = "%y-%m-%d %H:%M:%S", lt = FALSE ) ) %>% collect(), tibble( x = c("68-10-07 19:04:05", "69-10-07 19:04:05", NA) ) ) # the arrow binding errors for a value different from 68L for `cutoff_2000` compare_dplyr_binding( .input %>% mutate( date_short_year = fast_strptime( x, format = "%y-%m-%d %H:%M:%S", lt = FALSE, cutoff_2000 = 69L ) ) %>% collect(), tibble( x = c("68-10-07 19:04:05", "69-10-07 19:04:05", NA) ), warning = TRUE ) # compare_dplyr_binding would not work here since lt = TRUE returns a list # and it also errors in regular dplyr pipelines expect_warning( tibble( x = c("68-10-07 19:04:05", "69-10-07 19:04:05", NA) ) %>% arrow_table() %>% mutate( date_short_year = fast_strptime( x, format = "%y-%m-%d %H:%M:%S", lt = TRUE ) ) %>% collect() ) }) test_that("parse_date_time with hours, minutes and seconds components", { test_dates_times <- tibble( ymd_hms_string = c("67-01-09 12:34:56", "1970-05-22 20:13:59", "870822201359", NA), ymd_hm_string = c("67-01-09 12:34", "1970-05-22 20:13", "8708222013", NA), ymd_h_string = c("67-01-09 12", "1970-05-22 20", "87082220", NA), dmy_hms_string = c("09-01-67 12:34:56", "22-05-1970 20:13:59", "220887201359", NA), dmy_hm_string = c("09-01-67 12:34", "22-05-1970 20:13", "2208872013", NA), dmy_h_string = c("09-01-67 12", "22-05-1970 20", "22088720", NA), mdy_hms_string = c("01-09-67 12:34:56", "05-22-1970 20:13:59", "082287201359", NA), mdy_hm_string = c("01-09-67 12:34", "05-22-1970 20:13", "0822872013", NA), mdy_h_string = c("01-09-67 12", "05-22-1970 20", "08228720", NA), ydm_hms_string = c("67-09-01 12:34:56", "1970-22-05 20:13:59", "872208201359", NA), ydm_hm_string = c("67-09-01 12:34", "1970-22-05 20:13", "8722082013", NA), ydm_h_string = c("67-09-01 12", "1970-22-05 20", "87220820", NA) ) # the unseparated strings are versions of "1987-08-22 20:13:59" (with %y) # these functions' internals use some string processing which requires the # RE2 library skip_if_not_available("re2") compare_dplyr_binding( .input %>% mutate( ymd_hms_dttm = parse_date_time(ymd_hms_string, orders = "ymd_HMS"), ymd_hm_dttm = parse_date_time(ymd_hm_string, orders = "ymd_HM"), ymd_h_dttm = parse_date_time(ymd_h_string, orders = "ymd_H"), dmy_hms_dttm = parse_date_time(dmy_hms_string, orders = "dmy_HMS"), dmy_hm_dttm = parse_date_time(dmy_hm_string, orders = "dmy_HM"), dmy_h_dttm = parse_date_time(dmy_h_string, orders = "dmy_H"), mdy_hms_dttm = parse_date_time(mdy_hms_string, orders = "mdy_HMS"), mdy_hm_dttm = parse_date_time(mdy_hm_string, orders = "mdy_HM"), mdy_h_dttm = parse_date_time(mdy_h_string, orders = "mdy_H"), ydm_hms_dttm = parse_date_time(ydm_hms_string, orders = "ydm_HMS"), ydm_hm_dttm = parse_date_time(ydm_hm_string, orders = "ydmHM"), ydm_h_dttm = parse_date_time(ydm_h_string, orders = "ydmH") ) %>% collect(), test_dates_times ) compare_dplyr_binding( .input %>% mutate( ymd_hms_dttm = ymd_hms(ymd_hms_string), ymd_hm_dttm = ymd_hm(ymd_hm_string), ymd_h_dttm = ymd_h(ymd_h_string), dmy_hms_dttm = dmy_hms(dmy_hms_string), dmy_hm_dttm = dmy_hm(dmy_hm_string), dmy_h_dttm = dmy_h(dmy_h_string), mdy_hms_dttm = mdy_hms(mdy_hms_string), mdy_hm_dttm = mdy_hm(mdy_hm_string), mdy_h_dttm = mdy_h(mdy_h_string), ydm_hms_dttm = ydm_hms(ydm_hms_string), ydm_hm_dttm = ydm_hm(ydm_hm_string), ydm_h_dttm = ydm_h(ydm_h_string) ) %>% collect(), test_dates_times ) # parse_date_time with timezone pm_tz <- "Pacific/Marquesas" compare_dplyr_binding( .input %>% mutate( ymd_hms_dttm = parse_date_time(ymd_hms_string, orders = "ymd_HMS", tz = pm_tz), ymd_hm_dttm = parse_date_time(ymd_hm_string, orders = "ymd_HM", tz = pm_tz), ymd_h_dttm = parse_date_time(ymd_h_string, orders = "ymd_H", tz = pm_tz), dmy_hms_dttm = parse_date_time(dmy_hms_string, orders = "dmy_HMS", tz = pm_tz), dmy_hm_dttm = parse_date_time(dmy_hm_string, orders = "dmy_HM", tz = pm_tz), dmy_h_dttm = parse_date_time(dmy_h_string, orders = "dmy_H", tz = pm_tz), mdy_hms_dttm = parse_date_time(mdy_hms_string, orders = "mdy_HMS", tz = pm_tz), mdy_hm_dttm = parse_date_time(mdy_hm_string, orders = "mdy_HM", tz = pm_tz), mdy_h_dttm = parse_date_time(mdy_h_string, orders = "mdy_H", tz = pm_tz), ydm_hms_dttm = parse_date_time(ydm_hms_string, orders = "ydm_HMS", tz = pm_tz), ydm_hm_dttm = parse_date_time(ydm_hm_string, orders = "ydm_HM", tz = pm_tz), ydm_h_dttm = parse_date_time(ydm_h_string, orders = "ydm_H", tz = pm_tz) ) %>% collect(), test_dates_times ) compare_dplyr_binding( .input %>% mutate( ymd_hms_dttm = ymd_hms(ymd_hms_string, tz = pm_tz), ymd_hm_dttm = ymd_hm(ymd_hm_string, tz = pm_tz), ymd_h_dttm = ymd_h(ymd_h_string, tz = pm_tz), dmy_hms_dttm = dmy_hms(dmy_hms_string, tz = pm_tz), dmy_hm_dttm = dmy_hm(dmy_hm_string, tz = pm_tz), dmy_h_dttm = dmy_h(dmy_h_string, tz = pm_tz), mdy_hms_dttm = mdy_hms(mdy_hms_string, tz = pm_tz), mdy_hm_dttm = mdy_hm(mdy_hm_string, tz = pm_tz), mdy_h_dttm = mdy_h(mdy_h_string, tz = pm_tz), ydm_hms_dttm = ydm_hms(ydm_hms_string, tz = pm_tz), ydm_hm_dttm = ydm_hm(ydm_hm_string, tz = pm_tz), ydm_h_dttm = ydm_h(ydm_h_string, tz = pm_tz), ) %>% collect(), test_dates_times ) compare_dplyr_binding( .input %>% mutate( ymd_hms_dttm = ymd_hms("2022-07-19 20:24:43"), ymd_hm_dttm = ymd_hm("2022-07-19 20:24"), ymd_h_dttm = ymd_h("2022-07-19 20"), dmy_hms_dttm = dmy_hms("19-07-2022 20:24:43"), dmy_hm_dttm = dmy_hm("19-07-2022 20:24"), dmy_h_dttm = dmy_h("19-07-2022 20"), mdy_hms_dttm = mdy_hms("07-19-2022 20:24:43"), mdy_hm_dttm = mdy_hm("07-19-2022 20:24"), mdy_h_dttm = mdy_h("07-19-2022 20"), ydm_hms_dttm = ydm_hms("2022-19-07 20:24:43"), ydm_hm_dttm = ydm_hm("2022-19-07 20:24"), ydm_h_dttm = ydm_h("2022-19-07 20") ) %>% collect(), test_dates_times ) # test ymd_ims compare_dplyr_binding( .input %>% mutate( ymd_ims_dttm = parse_date_time( ymd_ims_string, orders = "ymd_IMS", # lubridate is chatty and will warn 1 format failed to parse quiet = TRUE ) ) %>% collect(), tibble( ymd_ims_string = c("67-01-09 9:34:56", "1970-05-22 10:13:59", "19870822171359", NA) ) ) }) test_that("parse_date_time with month names and HMS", { # TODO(ARROW-16443): locale (affecting "%b% and "%B") does not work on Windows skip_on_os("windows") # these functions' internals use some string processing which requires the # RE2 library (not available in the minimal nightly builds) skip_if_not_available("re2") test_dates_times2 <- tibble( ymd_hms_string = c("67-Jan-09 12:34:56", "1970-June-22 20:13:59", "87Aug22201359", NA), ymd_hm_string = c("67-Jan-09 12:34", "1970-June-22 20:13", "87Aug222013", NA), ymd_h_string = c("67-Jan-09 12", "1970-June-22 20", "87Aug2220", NA), dmy_hms_string = c("09-Jan-67 12:34:56", "22-June-1970 20:13:59", "22Aug87201359", NA), dmy_hm_string = c("09-Jan-67 12:34", "22-June-1970 20:13", "22Aug872013", NA), dmy_h_string = c("09-Jan-67 12", "22-June-1970 20", "22Aug8720", NA), mdy_hms_string = c("Jan-09-67 12:34:56", "June-22-1970 20:13:59", "Aug2287201359", NA), mdy_hm_string = c("Jan-09-67 12:34", "June-22-1970 20:13", "Aug22872013", NA), mdy_h_string = c("Jan-09-67 12", "June-22-1970 20", "Aug228720", NA), ydm_hms_string = c("67-09-Jan 12:34:56", "1970-22-June 20:13:59", "8722Aug201359", NA), ydm_hm_string = c("67-09-Jan 12:34", "1970-22-June 20:13", "8722Aug2013", NA), ydm_h_string = c("67-09-Jan 12", "1970-22-June 20", "8722Aug20", NA) ) # the un-separated strings are versions of "1987-08-22 20:13:59" (with %y) compare_dplyr_binding( .input %>% mutate( ymd_hms_dttm = parse_date_time(ymd_hms_string, orders = "ymd_HMS"), ymd_hm_dttm = parse_date_time(ymd_hm_string, orders = "ymdHM"), ymd_h_dttm = parse_date_time(ymd_h_string, orders = "ymd_H"), dmy_hms_dttm = parse_date_time(dmy_hms_string, orders = "dmy_HMS"), dmy_hm_dttm = parse_date_time(dmy_hm_string, orders = "dmyHM"), dmy_h_dttm = parse_date_time(dmy_h_string, orders = "dmy_H"), mdy_hms_dttm = parse_date_time(mdy_hms_string, orders = "mdy_HMS"), mdy_hm_dttm = parse_date_time(mdy_hm_string, orders = "mdyHM"), mdy_h_dttm = parse_date_time(mdy_h_string, orders = "mdy_H"), ydm_hms_dttm = parse_date_time(ydm_hms_string, orders = "ydm_HMS"), ydm_hm_dttm = parse_date_time(ydm_hm_string, orders = "ydmHM"), ydm_h_dttm = parse_date_time(ydm_h_string, orders = "ydm_H") ) %>% collect(), test_dates_times2 ) compare_dplyr_binding( .input %>% mutate( ymd_hms_dttm = ymd_hms(ymd_hms_string), ymd_hm_dttm = ymd_hm(ymd_hm_string), ymd_h_dttm = ymd_h(ymd_h_string), dmy_hms_dttm = dmy_hms(dmy_hms_string), dmy_hm_dttm = dmy_hm(dmy_hm_string), dmy_h_dttm = dmy_h(dmy_h_string), mdy_hms_dttm = mdy_hms(mdy_hms_string), mdy_hm_dttm = mdy_hm(mdy_hm_string), mdy_h_dttm = mdy_h(mdy_h_string), ydm_hms_dttm = ydm_hms(ydm_hms_string), ydm_hm_dttm = ydm_hm(ydm_hm_string), ydm_h_dttm = ydm_h(ydm_h_string) ) %>% collect(), test_dates_times2 ) compare_dplyr_binding( .input %>% mutate( ymd_hms_dttm = ymd_hms("2022-June-19 20:24:43"), ymd_hm_dttm = ymd_hm("2022-June-19 20:24"), ymd_h_dttm = ymd_h("2022-June-19 20"), dmy_hms_dttm = dmy_hms("19-June-2022 20:24:43"), dmy_hm_dttm = dmy_hm("19-June-2022 20:24"), dmy_h_dttm = dmy_h("19-June-2022 20"), mdy_hms_dttm = mdy_hms("June-19-2022 20:24:43"), mdy_hm_dttm = mdy_hm("June-19-2022 20:24"), mdy_h_dttm = mdy_h("June-19-2022 20"), ydm_hms_dttm = ydm_hms("2022-19-June 20:24:43"), ydm_hm_dttm = ydm_hm("2022-19-June 20:24"), ydm_h_dttm = ydm_h("2022-19-June 20") ) %>% collect(), test_dates_times2 ) }) test_that("parse_date_time with `quiet = FALSE` not supported", { # we need expect_warning twice as both the arrow pipeline (because quiet = # FALSE is not supported) and the fallback dplyr/lubridate one throw # warnings (the lubridate one because quiet is FALSE) # https://issues.apache.org/jira/browse/ARROW-17146 # these functions' internals use some string processing which requires the # RE2 library (not available in the minimal nightly builds) skip_if_not_available("re2") expect_warning( expect_warning( tibble(x = c("2022-05-19 13:46:51")) %>% arrow_table() %>% mutate( x_dttm = parse_date_time(x, orders = "dmy_HMS", quiet = FALSE) ) %>% collect(), "`quiet = FALSE` not supported in Arrow" ), "All formats failed to parse" ) expect_warning( tibble(x = c("2022-05-19 13:46:51")) %>% arrow_table() %>% mutate( x_dttm = ymd_hms(x, quiet = FALSE) ) %>% collect(), "`quiet = FALSE` not supported in Arrow" ) }) test_that("parse_date_time with truncated formats", { # these functions' internals use some string processing which requires the # RE2 library skip_if_not_available("re2") test_truncation_df <- tibble( truncated_ymd_string = c( "2022-05-19 13:46:51", "2022-05-18 13:46", "2022-05-17 13", "2022-05-16" ) ) compare_dplyr_binding( .input %>% mutate( dttm = parse_date_time( truncated_ymd_string, orders = "ymd_HMS", truncated = 3 ), dttm2 = ymd_hms( truncated_ymd_string, truncated = 3 ) ) %>% collect(), test_truncation_df ) # values for truncated greater than nchar(orders) - 3 not supported in Arrow compare_dplyr_binding( .input %>% mutate( dttm = parse_date_time( truncated_ymd_string, orders = "ymd_HMS", truncated = 5 ) ) %>% collect(), test_truncation_df, warning = "a value for `truncated` > 4 not supported in Arrow" ) # values for truncated greater than nchar(orders) - 3 not supported in Arrow compare_dplyr_binding( .input %>% mutate( dttm = ymd_hms( truncated_ymd_string, truncated = 5 ) ) %>% collect(), test_truncation_df, warning = "a value for `truncated` > 4 not supported in Arrow" ) }) test_that("parse_date_time with `locale != NULL` not supported", { # parse_date_time currently doesn't take locale parameter which will be # addressed in https://issues.apache.org/jira/browse/ARROW-17147 skip_if_not_available("re2") expect_warning( tibble(x = c("2022-05-19 13:46:51")) %>% arrow_table() %>% mutate( x_dttm = ymd_hms(x, locale = "C") ) %>% collect(), "`locale` not supported in Arrow" ) }) test_that("parse_date_time with `exact = TRUE`, and with regular R objects", { test_df <- tibble( x = c("2022-12-31 12:59:59", "2022-01-01 12:11", "2022-01-01 12", "2022-01-01", NA), y = c("11/23/1998 07:00:00", "6/18/1952 0135", "2/25/1974 0523", "9/07/1985 01", NA) ) # these functions' internals use some string processing which requires the # RE2 library skip_if_not_available("re2") compare_dplyr_binding( .input %>% mutate( parsed_x = parse_date_time( x, c("%Y-%m-%d %H:%M:%S", "%Y-%m-%d %H:%M", "%Y-%m-%d %H", "%Y-%m-%d"), exact = TRUE ), parsed_y = parse_date_time( y, c("%m/%d/%Y %I:%M:%S", "%m/%d/%Y %H%M", "%m/%d/%Y %H"), exact = TRUE ) ) %>% collect(), test_df ) compare_dplyr_binding( .input %>% mutate( b = parse_date_time("2022-12-31 12:59:59", orders = "ymd_HMS") ) %>% collect(), tibble( a = 1 ) ) }) test_that("build_formats() and build_format_from_order()", { ymd_formats <- c( "%y-%m-%d", "%Y-%m-%d", "%y-%B-%d", "%Y-%B-%d", "%y-%b-%d", "%Y-%b-%d", "%y%m%d", "%Y%m%d", "%y%B%d", "%Y%B%d", "%y%b%d", "%Y%b%d" ) ymd_hms_formats <- c( "%y-%m-%d-%H-%M-%S", "%Y-%m-%d-%H-%M-%S", "%y-%B-%d-%H-%M-%S", "%Y-%B-%d-%H-%M-%S", "%y-%b-%d-%H-%M-%S", "%Y-%b-%d-%H-%M-%S", "%y%m%d%H%M%S", "%Y%m%d%H%M%S", "%y%B%d%H%M%S", "%Y%B%d%H%M%S", "%y%b%d%H%M%S", "%Y%b%d%H%M%S" ) expect_equal( build_formats(c("ym", "myd", "%Y-%d-%m")), c( # formats from "ym" order "%y-%m-%d", "%Y-%m-%d", "%y-%B-%d", "%Y-%B-%d", "%y-%b-%d", "%Y-%b-%d", "%y%m%d", "%Y%m%d", "%y%B%d", "%Y%B%d", "%y%b%d", "%Y%b%d", # formats from "myd" order "%m-%y-%d", "%B-%y-%d", "%b-%y-%d", "%m-%Y-%d", "%B-%Y-%d", "%b-%Y-%d", "%m%y%d", "%B%y%d", "%b%y%d", "%m%Y%d", "%B%Y%d", "%b%Y%d", # formats from "%Y-%d-%m" format "%y-%d-%m", "%Y-%d-%m", "%y-%d-%B", "%Y-%d-%B", "%y-%d-%b", "%Y-%d-%b", "%y%d%m", "%Y%d%m", "%y%d%B", "%Y%d%B", "%y%d%b", "%Y%d%b" ) ) expect_equal( build_formats("ymd_HMS"), ymd_hms_formats ) # when order is one of "yq", "qy", "ym" or "my" the data is augmented to "ymd" # or "ydm" and the formats are built accordingly expect_equal( build_formats("yq"), ymd_formats ) expect_equal( build_formats("ym"), ymd_formats ) expect_equal( build_formats("qy"), ymd_formats ) # build formats will output unique formats expect_equal( build_formats(c("yq", "ym", "qy")), ymd_formats ) expect_equal( build_formats("my"), c( "%m-%y-%d", "%B-%y-%d", "%b-%y-%d", "%m-%Y-%d", "%B-%Y-%d", "%b-%Y-%d", "%m%y%d", "%B%y%d", "%b%y%d", "%m%Y%d", "%B%Y%d", "%b%Y%d" ) ) expect_equal( build_format_from_order("abp"), c( "%a-%m-%p", "%A-%m-%p", "%a-%B-%p", "%A-%B-%p", "%a-%b-%p", "%A-%b-%p", "%a%m%p", "%A%m%p", "%a%B%p", "%A%B%p", "%a%b%p", "%A%b%p" ) ) expect_error( build_formats(c("vu", "ymd")), '"vu" `orders` not supported in Arrow' ) expect_error( build_formats(c("abc")), '"abc" `orders` not supported in Arrow' ) expect_equal( build_formats("wIpz"), c("%w-%I-%p-%z", "%w%I%p%z") ) expect_equal( build_formats("yOmd"), ymd_formats ) expect_equal( build_format_from_order("ymd"), ymd_formats ) expect_equal( build_format_from_order("ymdHMS"), ymd_hms_formats ) expect_equal( build_format_from_order("ymdHM"), c( "%y-%m-%d-%H-%M", "%Y-%m-%d-%H-%M", "%y-%B-%d-%H-%M", "%Y-%B-%d-%H-%M", "%y-%b-%d-%H-%M", "%Y-%b-%d-%H-%M", "%y%m%d%H%M", "%Y%m%d%H%M", "%y%B%d%H%M", "%Y%B%d%H%M", "%y%b%d%H%M", "%Y%b%d%H%M" ) ) expect_equal( build_format_from_order("ymdH"), c( "%y-%m-%d-%H", "%Y-%m-%d-%H", "%y-%B-%d-%H", "%Y-%B-%d-%H", "%y-%b-%d-%H", "%Y-%b-%d-%H", "%y%m%d%H", "%Y%m%d%H", "%y%B%d%H", "%Y%B%d%H", "%y%b%d%H", "%Y%b%d%H" ) ) expect_equal( build_formats("y-%b-d-%T"), c( "%y-%m-%d-%I-%M-%S-%p", "%Y-%m-%d-%I-%M-%S-%p", "%y-%B-%d-%I-%M-%S-%p", "%Y-%B-%d-%I-%M-%S-%p", "%y-%b-%d-%I-%M-%S-%p", "%Y-%b-%d-%I-%M-%S-%p", "%y-%m-%d-%H-%M-%S", "%Y-%m-%d-%H-%M-%S", "%y-%B-%d-%H-%M-%S", "%Y-%B-%d-%H-%M-%S", "%y-%b-%d-%H-%M-%S", "%Y-%b-%d-%H-%M-%S", "%y-%m-%d-%H-%M-%OS", "%Y-%m-%d-%H-%M-%OS", "%y-%B-%d-%H-%M-%OS", "%Y-%B-%d-%H-%M-%OS", "%y-%b-%d-%H-%M-%OS", "%Y-%b-%d-%H-%M-%OS", "%y%m%d%I%M%S%p", "%Y%m%d%I%M%S%p", "%y%B%d%I%M%S%p", "%Y%B%d%I%M%S%p", "%y%b%d%I%M%S%p", "%Y%b%d%I%M%S%p", "%y%m%d%H%M%S", "%Y%m%d%H%M%S", "%y%B%d%H%M%S", "%Y%B%d%H%M%S", "%y%b%d%H%M%S", "%Y%b%d%H%M%S", "%y%m%d%H%M%OS", "%Y%m%d%H%M%OS", "%y%B%d%H%M%OS", "%Y%B%d%H%M%OS", "%y%b%d%H%M%OS", "%Y%b%d%H%M%OS" ) ) expect_equal( build_formats("%YdmH%p"), c( "%y-%d-%m-%H-%p", "%Y-%d-%m-%H-%p", "%y-%d-%B-%H-%p", "%Y-%d-%B-%H-%p", "%y-%d-%b-%H-%p", "%Y-%d-%b-%H-%p", "%y%d%m%H%p", "%Y%d%m%H%p", "%y%d%B%H%p", "%Y%d%B%H%p", "%y%d%b%H%p", "%Y%d%b%H%p" ) ) }) # tests for datetime rounding --------------------------------------------- # an "easy" date to avoid conflating tests of different things (i.e., it's # UTC time, and not one of the edge cases on or extremely close to the # rounding boundary) easy_date <- as.POSIXct("2022-10-11 12:00:00", tz = "UTC") easy_df <- tibble::tibble(datetime = easy_date) # dates near month boundaries over the course of 1 year month_boundaries <- c( "2021-01-01 00:01:00", "2021-02-01 00:01:00", "2021-03-01 00:01:00", "2021-04-01 00:01:00", "2021-05-01 00:01:00", "2021-06-01 00:01:00", "2021-07-01 00:01:00", "2021-08-01 00:01:00", "2021-09-01 00:01:00", "2021-10-01 00:01:00", "2021-11-01 00:01:00", "2021-12-01 00:01:00", "2021-01-31 23:59:00", "2021-02-28 23:59:00", "2021-03-31 23:59:00", "2021-04-30 23:59:00", "2021-05-31 23:59:00", "2021-06-30 23:59:00", "2021-07-31 23:59:00", "2021-08-31 23:59:00", "2021-09-30 23:59:00", "2021-10-31 23:59:00", "2021-11-30 23:59:00", "2021-12-31 23:59:00" ) year_of_dates <- tibble::tibble( datetime = as.POSIXct(month_boundaries, tz = "UTC"), date = as.Date(datetime) ) # test case used to check we catch week boundaries for all week_start values fortnight <- tibble::tibble( date = seq( from = as.Date("2022-04-04"), to = as.Date("2022-04-17"), by = "day" ), datetime = as.POSIXct(date) ) # test case to check we catch interval lower boundaries for ceiling_date boundary_times <- tibble::tibble( datetime = as.POSIXct(strptime(c( "2022-05-10 00:00:00", # boundary for week when week_start = 7 (Sunday) "2022-05-11 00:00:00", # boundary for week when week_start = 1 (Monday) "2022-05-12 00:00:00", # boundary for week when week_start = 2 (Tuesday) "2022-03-10 00:00:00", # boundary for day, hour, minute, second, millisecond "2022-03-10 00:00:01", # boundary for second, millisecond "2022-03-10 00:01:00", # boundary for second, millisecond, minute "2022-03-10 01:00:00", # boundary for second, millisecond, minute, hour "2022-01-01 00:00:00" # boundary for year ), tz = "UTC", format = "%F %T")), date = as.Date(datetime) ) # test case to check rounding takes place in local time datestrings <- c( "1970-01-01 00:00:59.123456789", "2000-02-29 23:23:23.999999999", "1899-01-01 00:59:20.001001001", "2033-05-18 03:33:20.000000000", "2020-01-01 01:05:05.001", "2019-12-31 02:10:10.002", "2019-12-30 03:15:15.003", "2009-12-31 04:20:20.004132", "2010-01-01 05:25:25.005321", "2010-01-03 06:30:30.006163", "2010-01-04 07:35:35", "2006-01-01 08:40:40", "2005-12-31 09:45:45", "2008-12-28 00:00:00", "2008-12-29 00:00:00", "2012-01-01 01:02:03" ) tz_times <- tibble::tibble( utc_time = as.POSIXct(datestrings, tz = "UTC"), syd_time = as.POSIXct(datestrings, tz = "Australia/Sydney"), # UTC +10 (UTC +11 with DST) adl_time = as.POSIXct(datestrings, tz = "Australia/Adelaide"), # UTC +9:30 (UTC +10:30 with DST) mar_time = as.POSIXct(datestrings, tz = "Pacific/Marquesas"), # UTC -9:30 (no DST) kat_time = as.POSIXct(datestrings, tz = "Asia/Kathmandu") # UTC +5:45 (no DST) ) test_that("timestamp round/floor/ceiling works for a minimal test", { compare_dplyr_binding( .input %>% mutate( round_datetime = round_date(datetime), floor_datetime = floor_date(datetime), ceiling_datetime = ceiling_date(datetime, change_on_boundary = FALSE) ) %>% collect(), test_df ) }) test_that("timestamp round/floor/ceiling accepts period unit abbreviation", { # test helper to ensure standard abbreviations of period names # are understood by arrow and mirror the lubridate behaviour check_period_abbreviation <- function(unit, synonyms) { # check arrow against lubridate compare_dplyr_binding( .input %>% mutate(out_1 = round_date(datetime, unit)) %>% collect(), easy_df ) # check synonyms base <- call_binding("round_date", Expression$scalar(easy_date), unit) for (syn in synonyms) { expect_equal( call_binding("round_date", Expression$scalar(easy_date), syn), base ) } } check_period_abbreviation("minute", synonyms = c("minutes", "min", "mins")) check_period_abbreviation("second", synonyms = c("seconds", "sec", "secs")) check_period_abbreviation("month", synonyms = c("months", "mon", "mons")) }) test_that("temporal round/floor/ceiling accepts periods with multiple units", { check_multiple_unit_period <- function(unit, multiplier) { unit_string <- paste(multiplier, unit) compare_dplyr_binding( .input %>% mutate( round_datetime = round_date(datetime, unit_string), floor_datetime = floor_date(datetime, unit_string), ceiling_datetime = ceiling_date(datetime, unit_string) ) %>% collect(), easy_df ) } for (multiplier in c(1, 2, 10)) { for (unit in c("second", "minute", "day", "year")) { check_multiple_unit_period(unit, multiplier) } } }) # Test helper functions for checking equivalence of outputs regardless of # the unit specified. The lubridate_unit argument allows for cases where # arrow supports a unit name (e.g., nanosecond) that lubridate doesn't. Also # note that in the check_date_rounding helper the lubridate output is coerced # to ensure type stable output (arrow output should be type stable without this) check_date_rounding <- function(data, unit, lubridate_unit = unit, ...) { expect_equal( data %>% arrow_table() %>% mutate( date_rounded = round_date(date, unit), date_floored = floor_date(date, unit), date_ceiling = ceiling_date(date, unit) ) %>% collect(), data %>% mutate( date_rounded = as.Date(round_date(date, lubridate_unit)), date_floored = as.Date(floor_date(date, lubridate_unit)), date_ceiling = as.Date(ceiling_date(date, lubridate_unit)) ), ... ) } check_timestamp_rounding <- function(data, unit, lubridate_unit = unit, ...) { expect_equal( data %>% arrow_table() %>% mutate( datetime_rounded = round_date(datetime, unit), datetime_floored = floor_date(datetime, unit), datetime_ceiling = ceiling_date(datetime, unit) ) %>% collect(), data %>% mutate( datetime_rounded = round_date(datetime, lubridate_unit), datetime_floored = floor_date(datetime, lubridate_unit), datetime_ceiling = ceiling_date(datetime, lubridate_unit) ), ... ) } test_that("date round/floor/ceil works for units of 1 day or less", { test_df %>% check_date_rounding("1 millisecond", lubridate_unit = ".001 second") test_df %>% check_date_rounding("1 second") test_df %>% check_date_rounding("1 hour") skip("floor_date(as.Date(NA), '1 day') is no longer NA on latest R-devel") # Possibly https://github.com/wch/r-source/commit/4f70ce0d79eeda7464cf97448e515275cbef754b test_df %>% check_date_rounding("1 day") }) test_that("timestamp round/floor/ceil works for units of 1 day or less", { test_df %>% check_timestamp_rounding("second") test_df %>% check_timestamp_rounding("minute") test_df %>% check_timestamp_rounding("hour") test_df %>% check_timestamp_rounding("day") test_df %>% check_timestamp_rounding(".01 second") test_df %>% check_timestamp_rounding(".001 second") test_df %>% check_timestamp_rounding(".00001 second") test_df %>% check_timestamp_rounding("1 millisecond", lubridate_unit = ".001 second") test_df %>% check_timestamp_rounding("1 microsecond", lubridate_unit = ".000001 second") test_df %>% check_timestamp_rounding("1 nanosecond", lubridate_unit = ".000000001 second") }) test_that("timestamp round/floor/ceil works for units: month/quarter/year", { year_of_dates %>% check_timestamp_rounding("month", ignore_attr = TRUE) year_of_dates %>% check_timestamp_rounding("quarter", ignore_attr = TRUE) year_of_dates %>% check_timestamp_rounding("year", ignore_attr = TRUE) }) # check helper invoked when we need to avoid the lubridate rounding bug check_date_rounding_1051_bypass <- function(data, unit, ignore_attr = TRUE, ...) { # directly compare arrow to lubridate for floor and ceiling compare_dplyr_binding( .input %>% mutate( date_floored = floor_date(date, unit), date_ceiling = ceiling_date(date, unit) ) %>% collect(), data, ignore_attr = ignore_attr, ... ) # The rounding tests for dates is run against Arrow timestamp behaviour # because of a lubridate bug specific to Date objects with week and # higher-unit rounding (see lubridate issue 1051) # https://github.com/tidyverse/lubridate/issues/1051 out <- data %>% arrow_table() %>% mutate( out_date = date %>% round_date(unit), # Date out_time = datetime %>% round_date(unit) # POSIXct ) %>% collect() expect_equal( out$out_date, as.Date(out$out_time) ) } test_that("date round/floor/ceil works for units: month/quarter/year", { # these test cases are affected by lubridate issue 1051 so we bypass # lubridate::round_date() for Date objects with large rounding units # https://github.com/tidyverse/lubridate/issues/1051 check_date_rounding_1051_bypass(year_of_dates, "month", ignore_attr = TRUE) check_date_rounding_1051_bypass(year_of_dates, "quarter", ignore_attr = TRUE) check_date_rounding_1051_bypass(year_of_dates, "year", ignore_attr = TRUE) }) check_date_week_rounding <- function(data, week_start, ignore_attr = TRUE, ...) { expect_equal( data %>% arrow_table() %>% mutate( date_rounded = round_date(date, unit), date_floored = floor_date(date, unit), date_ceiling = ceiling_date(date, unit) ) %>% collect(), data %>% mutate( date_rounded = as.Date(round_date(date, lubridate_unit)), date_floored = as.Date(floor_date(date, lubridate_unit)), date_ceiling = as.Date(ceiling_date(date, lubridate_unit)) ), ignore_attr = ignore_attr, ... ) } check_timestamp_week_rounding <- function(data, week_start, ignore_attr = TRUE, ...) { compare_dplyr_binding( .input %>% mutate( datetime_rounded = round_date(datetime, "week", week_start = week_start), datetime_floored = floor_date(datetime, "week", week_start = week_start), datetime_ceiling = ceiling_date(datetime, "week", week_start = week_start) ) %>% collect(), data, ignore_attr = ignore_attr, ... ) } test_that("timestamp round/floor/ceil works for week units (standard week_start)", { fortnight %>% check_timestamp_week_rounding(week_start = 1) # Monday fortnight %>% check_timestamp_week_rounding(week_start = 7) # Sunday }) test_that("timestamp round/floor/ceil works for week units (non-standard week_start)", { fortnight %>% check_timestamp_week_rounding(week_start = 2) # Tuesday fortnight %>% check_timestamp_week_rounding(week_start = 3) # Wednesday fortnight %>% check_timestamp_week_rounding(week_start = 4) # Thursday fortnight %>% check_timestamp_week_rounding(week_start = 5) # Friday fortnight %>% check_timestamp_week_rounding(week_start = 6) # Saturday }) check_date_week_rounding <- function(data, week_start, ignore_attr = TRUE, ...) { # directly compare arrow to lubridate for floor and ceiling compare_dplyr_binding( .input %>% mutate( date_floored = floor_date(date, "week", week_start = week_start), date_ceiling = ceiling_date(date, "week", week_start = week_start) ) %>% collect(), data, ignore_attr = ignore_attr, ... ) # use the bypass method to avoid the lubridate-1051 bug for week units # https://github.com/tidyverse/lubridate/issues/1051 out <- data %>% arrow_table() %>% mutate( out_date = date %>% round_date("week", week_start = week_start), # Date out_time = datetime %>% round_date("week", week_start = week_start) # POSIXct ) %>% collect() expect_equal( out$out_date, as.Date(out$out_time) ) } test_that("date round/floor/ceil works for week units (standard week_start)", { check_date_week_rounding(fortnight, week_start = 1) # Monday check_date_week_rounding(fortnight, week_start = 7) # Sunday }) test_that("date round/floor/ceil works for week units (non-standard week_start)", { check_date_week_rounding(fortnight, week_start = 2) # Tuesday check_date_week_rounding(fortnight, week_start = 3) # Wednesday check_date_week_rounding(fortnight, week_start = 4) # Thursday check_date_week_rounding(fortnight, week_start = 5) # Friday check_date_week_rounding(fortnight, week_start = 6) # Saturday }) # Test helper used to check that the change_on_boundary argument to # ceiling_date behaves identically to the lubridate version. It takes # unit as an argument to run tests separately for different rounding units check_boundary_with_unit <- function(unit, ...) { # timestamps compare_dplyr_binding( .input %>% mutate( cob_null = ceiling_date(datetime, unit, change_on_boundary = NULL), cob_true = ceiling_date(datetime, unit, change_on_boundary = TRUE), cob_false = ceiling_date(datetime, unit, change_on_boundary = FALSE) ) %>% collect(), boundary_times, ... ) # dates expect_equal( boundary_times %>% arrow_table() %>% mutate( cob_null = ceiling_date(date, unit, change_on_boundary = NULL), cob_true = ceiling_date(date, unit, change_on_boundary = TRUE), cob_false = ceiling_date(date, unit, change_on_boundary = FALSE) ) %>% collect(), boundary_times %>% mutate( cob_null = as.Date(ceiling_date(date, unit, change_on_boundary = NULL)), cob_true = as.Date(ceiling_date(date, unit, change_on_boundary = TRUE)), cob_false = as.Date(ceiling_date(date, unit, change_on_boundary = FALSE)) ), ... ) } test_that("ceiling_date() applies change_on_boundary correctly", { check_boundary_with_unit(".001 second") check_boundary_with_unit("second") check_boundary_with_unit("minute", tolerance = 0.001) # floating point issue? check_boundary_with_unit("hour") check_boundary_with_unit("day") }) # In lubridate, an error is thrown when 60 sec/60 min/24 hour thresholds are # exceeded. Checks that arrow mimics this behaviour and throws an identically # worded error message test_that("temporal round/floor/ceil period unit maxima are enforced", { expect_error( call_binding("round_date", Expression$scalar(Sys.time()), "61 seconds"), "Rounding with second > 60 is not supported" ) expect_error( call_binding("round_date", Expression$scalar(Sys.time()), "61 minutes"), "Rounding with minute > 60 is not supported" ) expect_error( call_binding("round_date", Expression$scalar(Sys.time()), "25 hours"), "Rounding with hour > 24 is not supported" ) expect_error( call_binding("round_date", Expression$scalar(Sys.Date()), "25 hours"), "Rounding with hour > 24 is not supported" ) }) # one method to test that temporal rounding takes place in local time is to # use lubridate as a ground truth and compare arrow results to lubridate # results. this test helper runs that test, skipping cases where lubridate # produces incorrect answers check_timezone_rounding_vs_lubridate <- function(data, unit) { # esoteric lubridate bug: on windows and macOS (not linux), lubridate returns # incorrect ceiling/floor for timezoned POSIXct times (syd, adl, kat zones, # but not mar) but not utc, and not for round, and only for these two # timestamps where high-precision timing is relevant to the outcome if (unit %in% c(".001 second", "second", "minute")) { if (tolower(Sys.info()[["sysname"]]) %in% c("windows", "darwin")) { data <- data[-c(1, 3), ] } } # external validity check: compare lubridate to arrow compare_dplyr_binding( .input %>% mutate( utc_floored = floor_date(utc_time, unit = unit), utc_rounded = round_date(utc_time, unit = unit), utc_ceiling = ceiling_date(utc_time, unit = unit), syd_floored = floor_date(syd_time, unit = unit), syd_rounded = round_date(syd_time, unit = unit), syd_ceiling = ceiling_date(syd_time, unit = unit), adl_floored = floor_date(adl_time, unit = unit), adl_rounded = round_date(adl_time, unit = unit), adl_ceiling = ceiling_date(adl_time, unit = unit), mar_floored = floor_date(mar_time, unit = unit), mar_rounded = round_date(mar_time, unit = unit), mar_ceiling = ceiling_date(mar_time, unit = unit), kat_floored = floor_date(kat_time, unit = unit), kat_rounded = round_date(kat_time, unit = unit), kat_ceiling = ceiling_date(kat_time, unit = unit) ) %>% collect(), data ) } # another method to check that temporal rounding takes place in local # time is to test the internal consistency of the YMD HMS values returned # by temporal rounding functions: these should be the same regardless of # timezone and should always be identical to the equivalent result calculated # for UTC test. this test isn't useful for subsecond resolution but avoids # dependency on lubridate check_timezone_rounding_for_consistency <- function(data, unit) { shifted_times <- data %>% arrow_table() %>% mutate( utc_floored = floor_date(utc_time, unit = unit), utc_rounded = round_date(utc_time, unit = unit), utc_ceiling = ceiling_date(utc_time, unit = unit), syd_floored = floor_date(syd_time, unit = unit), syd_rounded = round_date(syd_time, unit = unit), syd_ceiling = ceiling_date(syd_time, unit = unit), adl_floored = floor_date(adl_time, unit = unit), adl_rounded = round_date(adl_time, unit = unit), adl_ceiling = ceiling_date(adl_time, unit = unit), mar_floored = floor_date(mar_time, unit = unit), mar_rounded = round_date(mar_time, unit = unit), mar_ceiling = ceiling_date(mar_time, unit = unit), kat_floored = floor_date(kat_time, unit = unit), kat_rounded = round_date(kat_time, unit = unit), kat_ceiling = ceiling_date(kat_time, unit = unit) ) %>% collect() compare_local_times <- function(time1, time2) { all(year(time1) == year(time1) & month(time1) == month(time2) & day(time1) == day(time2) & hour(time1) == hour(time2) & minute(time1) == minute(time2) & second(time1) == second(time1)) } base <- shifted_times$utc_rounded expect_true(compare_local_times(shifted_times$syd_rounded, base)) expect_true(compare_local_times(shifted_times$adl_rounded, base)) expect_true(compare_local_times(shifted_times$mar_rounded, base)) expect_true(compare_local_times(shifted_times$kat_rounded, base)) base <- shifted_times$utc_floored expect_true(compare_local_times(shifted_times$syd_floored, base)) expect_true(compare_local_times(shifted_times$adl_floored, base)) expect_true(compare_local_times(shifted_times$mar_floored, base)) expect_true(compare_local_times(shifted_times$kat_floored, base)) base <- shifted_times$utc_ceiling expect_true(compare_local_times(shifted_times$syd_ceiling, base)) expect_true(compare_local_times(shifted_times$adl_ceiling, base)) expect_true(compare_local_times(shifted_times$mar_ceiling, base)) expect_true(compare_local_times(shifted_times$kat_ceiling, base)) } test_that("timestamp rounding takes place in local time", { tz_times %>% check_timezone_rounding_vs_lubridate(".001 second") tz_times %>% check_timezone_rounding_vs_lubridate("second") tz_times %>% check_timezone_rounding_vs_lubridate("minute") tz_times %>% check_timezone_rounding_vs_lubridate("hour") tz_times %>% check_timezone_rounding_vs_lubridate("day") tz_times %>% check_timezone_rounding_vs_lubridate("week") tz_times %>% check_timezone_rounding_vs_lubridate("month") tz_times %>% check_timezone_rounding_vs_lubridate("quarter") tz_times %>% check_timezone_rounding_vs_lubridate("year") tz_times %>% check_timezone_rounding_for_consistency("second") tz_times %>% check_timezone_rounding_for_consistency("minute") tz_times %>% check_timezone_rounding_for_consistency("hour") tz_times %>% check_timezone_rounding_for_consistency("day") tz_times %>% check_timezone_rounding_for_consistency("week") tz_times %>% check_timezone_rounding_for_consistency("month") tz_times %>% check_timezone_rounding_for_consistency("quarter") tz_times %>% check_timezone_rounding_for_consistency("year") tz_times %>% check_timezone_rounding_for_consistency("7 seconds") tz_times %>% check_timezone_rounding_for_consistency("7 minutes") tz_times %>% check_timezone_rounding_for_consistency("7 hours") tz_times %>% check_timezone_rounding_for_consistency("7 months") tz_times %>% check_timezone_rounding_for_consistency("7 years") tz_times %>% check_timezone_rounding_for_consistency("13 seconds") tz_times %>% check_timezone_rounding_for_consistency("13 minutes") tz_times %>% check_timezone_rounding_for_consistency("13 hours") tz_times %>% check_timezone_rounding_for_consistency("13 months") tz_times %>% check_timezone_rounding_for_consistency("13 years") }) test_that("with_tz() and force_tz() works", { timestamps <- as_datetime(c( "1970-01-01T00:00:59.123456789", "2000-02-29T23:23:23.999999999", "2033-05-18T03:33:20.000000000", "2020-01-01T01:05:05.001", "2019-12-31T02:10:10.002", "2019-12-30T03:15:15.003", "2009-12-31T04:20:20.004132", "2010-01-01T05:25:25.005321", "2010-01-03T06:30:30.006163", "2010-01-04T07:35:35", "2006-01-01T08:40:40", "2005-12-31T09:45:45", "2008-12-28", "2008-12-29", "2012-01-01 01:02:03" ), tz = "UTC") timestamps_non_utc <- force_tz(timestamps, "America/Chicago") nonexistent <- as_datetime(c( "2015-03-29 02:30:00", "2015-03-29 03:30:00" ), tz = "UTC") ambiguous <- as_datetime(c( "2015-10-25 02:30:00", "2015-10-25 03:30:00" ), tz = "UTC") compare_dplyr_binding( .input %>% mutate( timestamps_with_tz_1 = with_tz(timestamps, "UTC"), timestamps_with_tz_2 = with_tz(timestamps, "America/Chicago"), timestamps_with_tz_3 = with_tz(timestamps, "Asia/Kolkata"), timestamps_force_tz_1 = force_tz(timestamps, "UTC"), timestamps_force_tz_2 = force_tz(timestamps, "America/Chicago"), timestamps_force_tz_3 = force_tz(timestamps, "Asia/Kolkata") ) %>% collect(), tibble::tibble(timestamps = timestamps) ) compare_dplyr_binding( .input %>% mutate( timestamps_with_tz_1 = with_tz(timestamps, "UTC"), timestamps_with_tz_2 = with_tz(timestamps, "America/Chicago"), timestamps_with_tz_3 = with_tz(timestamps, "Asia/Kolkata") ) %>% collect(), tibble::tibble(timestamps = timestamps_non_utc) ) # We can match some roll_dst behaviour for nonexistent times compare_dplyr_binding( .input %>% mutate( timestamps_with_tz_1 = force_tz( timestamps, "Europe/Brussels", roll_dst = c("boundary", "post") ) ) %>% collect(), tibble::tibble(timestamps = nonexistent) ) # We can match all roll_dst behaviour for ambiguous times compare_dplyr_binding( .input %>% mutate( # The difference is easier to see if we transform back to UTC # because both pre and post will display as 02:30 otherwise timestamps_with_tz_pre = with_tz( force_tz( timestamps, "Europe/Brussels", roll_dst = c("boundary", "pre") ), "UTC" ), timestamps_with_tz_post = with_tz( force_tz( timestamps, "Europe/Brussels", roll_dst = c("boundary", "post") ), "UTC" ) ) %>% collect(), tibble::tibble(timestamps = ambiguous) ) # non-UTC timezone to other timezone is not supported in arrow's force_tz() expect_warning( tibble::tibble(timestamps = timestamps_non_utc) %>% arrow_table() %>% mutate(timestamps = force_tz(timestamps, "UTC")) %>% collect(), "`time` with a non-UTC timezone not supported in Arrow" ) # We only support some roll_dst values expect_warning( tibble::tibble(timestamps = nonexistent) %>% arrow_table() %>% mutate(timestamps = force_tz( timestamps, "Europe/Brussels", roll_dst = "post" )) %>% collect(), "roll_dst` value must be 'error' or 'boundary' for nonexistent times" ) expect_warning( tibble::tibble(timestamps = nonexistent) %>% arrow_table() %>% mutate(timestamps = force_tz( timestamps, "Europe/Brussels", roll_dst = c("boundary", "NA") )) %>% collect(), "`roll_dst` value must be 'error', 'pre', or 'post' for nonexistent times" ) # Raise error when the timezone falls into the DST-break expect_error( record_batch(timestamps = nonexistent) %>% mutate(nonexistent_roll_false = force_tz(timestamps, "Europe/Brussels")) %>% collect(), "Timestamp doesn't exist in timezone 'Europe/Brussels'" ) }) test_that("with_tz() and force_tz() can add timezone to timestamp without timezone", { timestamps <- Array$create(1L:10L, int64())$cast(timestamp("s")) expect_equal( arrow_table(timestamps = timestamps) %>% mutate(timestamps = with_tz(timestamps, "America/Chicago")) %>% compute(), arrow_table(timestamps = timestamps$cast(timestamp("s", "America/Chicago"))) ) expect_equal( arrow_table(timestamps = timestamps) %>% mutate(timestamps = force_tz(timestamps, "America/Chicago")) %>% compute(), arrow_table( timestamps = call_function("assume_timezone", timestamps, options = list(timezone = "America/Chicago")) ) ) })