djnavarro commented on code in PR #12154: URL: https://github.com/apache/arrow/pull/12154#discussion_r912592684
########## r/tests/testthat/test-dplyr-funcs-datetime.R: ########## @@ -2309,3 +2308,535 @@ test_that("build_formats() and build_format_from_order()", { "%y%b%d%H%M%S", "%Y%b%d%H%M%S") ) }) + + + +# tests for datetime rounding --------------------------------------------- + +# an easy date to avoid conflating tests of different things +easy_date <- as.POSIXct("2022-10-11 12:00:00", tz = "UTC") +easy_df <- tibble::tibble(datetime = easy_date) + +# dates near month boundaries over the course of 1 year +month_boundaries <- c( + "2021-01-01 00:01:00", "2021-02-01 00:01:00", "2021-03-01 00:01:00", + "2021-04-01 00:01:00", "2021-05-01 00:01:00", "2021-06-01 00:01:00", + "2021-07-01 00:01:00", "2021-08-01 00:01:00", "2021-09-01 00:01:00", + "2021-10-01 00:01:00", "2021-11-01 00:01:00", "2021-12-01 00:01:00", + "2021-01-31 23:59:00", "2021-02-28 23:59:00", "2021-03-31 23:59:00", + "2021-04-30 23:59:00", "2021-05-31 23:59:00", "2021-06-30 23:59:00", + "2021-07-31 23:59:00", "2021-08-31 23:59:00", "2021-09-30 23:59:00", + "2021-10-31 23:59:00", "2021-11-30 23:59:00", "2021-12-31 23:59:00" +) +year_of_dates <- tibble::tibble( + datetime = as.POSIXct(month_boundaries, tz = "UTC"), + date = as.Date(datetime) +) + +# test case used to check we catch week boundaries for all week_start values +fortnight <- tibble::tibble( + date = as.Date(c( + "2022-04-04", # Monday + "2022-04-05", # Tuesday + "2022-04-06", # Wednesday + "2022-04-07", # Thursday + "2022-04-08", # Friday + "2022-04-09", # Saturday + "2022-04-10", # Sunday + "2022-04-11", # Monday + "2022-04-12", # Tuesday + "2022-04-13", # Wednesday + "2022-04-14", # Thursday + "2022-04-15", # Friday + "2022-04-16", # Saturday + "2022-04-17" # Sunday + )), + datetime = as.POSIXct(date) +) + +# test case to check we catch interval lower boundaries for ceiling_date +boundary_times <- tibble::tibble( + datetime = as.POSIXct(strptime(c( + "2022-05-10 00:00:00", # boundary for week when week_start = 7 (Sunday) + "2022-05-11 00:00:00", # boundary for week when week_start = 1 (Monday) + "2022-05-12 00:00:00", # boundary for week when week_start = 2 (Tuesday) + "2022-03-10 00:00:00", # boundary for day, hour, minute, second, millisecond + "2022-03-10 00:00:01", # boundary for second, millisecond + "2022-03-10 00:01:00", # boundary for second, millisecond, minute + "2022-03-10 01:00:00", # boundary for second, millisecond, minute, hour + "2022-01-01 00:00:00" # boundary for year + ), tz = "UTC", format = "%F %T")), + date = as.Date(datetime) +) + +# test case to check rounding takes place in local time +datestrings <- c( + "1970-01-01T00:00:59.123456789", "2000-02-29T23:23:23.999999999", + "1899-01-01T00:59:20.001001001", "2033-05-18T03:33:20.000000000", + "2020-01-01T01:05:05.001", "2019-12-31T02:10:10.002", + "2019-12-30T03:15:15.003", "2009-12-31T04:20:20.004132", + "2010-01-01T05:25:25.005321", "2010-01-03T06:30:30.006163", + "2010-01-04T07:35:35", "2006-01-01T08:40:40", "2005-12-31T09:45:45", + "2008-12-28T00:00:00", "2008-12-29T00:00:00", "2012-01-01 01:02:03" +) +tz_times <- tibble::tibble( + utc_time = as.POSIXct(datestrings, tz = "UTC"), + syd_time = as.POSIXct(datestrings, tz = "Australia/Sydney"), # UTC +10 (UTC +11 with DST) + adl_time = as.POSIXct(datestrings, tz = "Australia/Adelaide"), # UTC +9:30 (UTC +10:30 with DST) + mar_time = as.POSIXct(datestrings, tz = "Pacific/Marquesas"), # UTC -9:30 (no DST) + kat_time = as.POSIXct(datestrings, tz = "Asia/Kathmandu") # UTC +5:45 (no DST) +) + + +test_that("timestamp round/floor/ceiling works for a minimal test", { + + compare_dplyr_binding( + .input %>% + mutate( + round_datetime = round_date(datetime), + floor_datetime = floor_date(datetime), + ceiling_datetime = ceiling_date(datetime, change_on_boundary = FALSE) + ) %>% + collect(), + test_df + ) +}) + +test_that("timestamp round/floor/ceiling accepts period unit abbreviation", { + + # test helper to ensure standard abbreviations of period names + # are understood by arrow and mirror the lubridate behaviour + check_period_abbreviation <- function(unit, synonyms) { + + # check arrow against lubridate + compare_dplyr_binding( + .input %>% + mutate(out_1 = round_date(datetime, unit)) %>% + collect(), + easy_df + ) + + # check synonyms + base <- call_binding("round_date", Expression$scalar(easy_date), unit) + for (syn in synonyms) { + expect_equal( + call_binding("round_date", Expression$scalar(easy_date), syn), + base + ) + } + } + + check_period_abbreviation("minute", synonyms = c("minutes", "min", "mins")) + check_period_abbreviation("second", synonyms = c("seconds", "sec", "secs")) + check_period_abbreviation("month", synonyms = c("months", "mon", "mons")) +}) + + +test_that("temporal round/floor/ceiling accepts periods with multiple units", { + + check_multiple_unit_period <- function(unit, multiplier) { + unit_string <- paste(multiplier, unit) + compare_dplyr_binding( + .input %>% + mutate( + round_datetime = round_date(datetime, unit_string), + floor_datetime = floor_date(datetime, unit_string), + ceiling_datetime = ceiling_date(datetime, unit_string) + ) %>% + collect(), + easy_df + ) + } + + for (multiplier in c(1, 2, 10)) { + for (unit in c("second", "minute", "day", "year")) { + check_multiple_unit_period(unit, multiplier) + } + } +}) + + +# Test helper functions for checking equivalence of outputs regardless of +# the unit specified. The lubridate_unit argument allows for cases where +# arrow supports a unit name (e.g., nanosecond) that lubridate doesn't. Also +# note that in the check_date_rounding helper the lubridate output is coerced +# to ensure type stable output (arrow output should be type stable without this) + +check_date_rounding <- function(data, unit, lubridate_unit = unit, ...) { + expect_equal( + data %>% + arrow_table() %>% + mutate( + date_rounded = round_date(date, unit), + date_floored = floor_date(date, unit), + date_ceiling = ceiling_date(date, unit) + ) %>% + collect(), + data %>% + mutate( + date_rounded = as.Date(round_date(date, lubridate_unit)), + date_floored = as.Date(floor_date(date, lubridate_unit)), + date_ceiling = as.Date(ceiling_date(date, lubridate_unit)) + ), + ... + ) +} + +check_timestamp_rounding <- function(data, unit, lubridate_unit = unit, ...) { + + expect_equal( + data %>% + arrow_table() %>% + mutate( + datetime_rounded = round_date(datetime, unit), + datetime_floored = floor_date(datetime, unit), + datetime_ceiling = ceiling_date(datetime, unit) + ) %>% + collect(), + data %>% + mutate( + datetime_rounded = round_date(datetime, lubridate_unit), + datetime_floored = floor_date(datetime, lubridate_unit), + datetime_ceiling = ceiling_date(datetime, lubridate_unit) + ), + ... + ) +} + + +test_that("date round/floor/ceil works for units of 1 day or less", { + + test_df %>% check_date_rounding("1 millisecond", lubridate_unit = ".001 second") + test_df %>% check_date_rounding("1 day") + test_df %>% check_date_rounding("1 second") + test_df %>% check_date_rounding("1 hour") + +}) + +test_that("timestamp round/floor/ceil works for units of 1 day or less", { + + test_df %>% check_timestamp_rounding("second") + test_df %>% check_timestamp_rounding("minute") + test_df %>% check_timestamp_rounding("hour") + test_df %>% check_timestamp_rounding("day") + + test_df %>% check_timestamp_rounding(".01 second") + test_df %>% check_timestamp_rounding(".001 second") + test_df %>% check_timestamp_rounding(".00001 second") + + test_df %>% check_timestamp_rounding("1 millisecond", lubridate_unit = ".001 second") + test_df %>% check_timestamp_rounding("1 microsecond", lubridate_unit = ".000001 second") + test_df %>% check_timestamp_rounding("1 nanosecond", lubridate_unit = ".000000001 second") + +}) + +test_that("timestamp round/floor/ceil works for units: month/quarter/year", { + + year_of_dates %>% check_timestamp_rounding("month", ignore_attr = TRUE) + year_of_dates %>% check_timestamp_rounding("quarter", ignore_attr = TRUE) + year_of_dates %>% check_timestamp_rounding("year", ignore_attr = TRUE) + +}) + +# check helper invoked when we need to avoid the lubridate rounding bug +check_date_rounding_1051_bypass <- function(data, unit, ignore_attr = TRUE, ...) { + + # directly compare arrow to lubridate for floor and ceiling + compare_dplyr_binding( + .input %>% + mutate( + date_floored = floor_date(date, unit), + date_ceiling = ceiling_date(date, unit) + ) %>% + collect(), + data, + ignore_attr = ignore_attr, + ... + ) + + # The rounding tests for dates is run against Arrow timestamp behaviour + # because of a lubridate bug specific to Date objects with week and + # higher-unit rounding (see lubridate issue 1051) + # https://github.com/tidyverse/lubridate/issues/1051 + out <- data %>% + arrow_table() %>% + mutate( + out_date = date %>% round_date(unit), # Date + out_time = datetime %>% round_date(unit) # POSIXct + ) %>% + collect() + + expect_equal( + out$out_date, + as.Date(out$out_time) + ) +} + + +test_that("date round/floor/ceil works for units: month/quarter/year", { + + # these test cases are affected by lubridate issue 1051 so we bypass + # lubridate::round_date() for Date objects with large rounding units + # https://github.com/tidyverse/lubridate/issues/1051 + + # these tests are run one row at a time to avoid ARROW-16412 (see note) + for (r in nrow(year_of_dates)) { + year_of_dates[r, ] %>% check_date_rounding_1051_bypass("month", ignore_attr = TRUE) + year_of_dates[r, ] %>% check_date_rounding_1051_bypass("quarter", ignore_attr = TRUE) + year_of_dates[r, ] %>% check_date_rounding_1051_bypass("year", ignore_attr = TRUE) + } + +}) + +# ARROW-16142 note: Until 16142 is resolved, there are a few cases where the +# tests need to be written in a way that avoids the "32-bit temporal array +# misinterpreted as 64-bit temporal array" bug (ARROW-16142). The easiest +# solution is to never use an arrow array of length greater than 1. +# https://issues.apache.org/jira/browse/ARROW-16142 + + +check_date_week_rounding <- function(data, week_start, ignore_attr = TRUE, ...) { + expect_equal( + data %>% + arrow_table() %>% + mutate( + date_rounded = round_date(date, unit), + date_floored = floor_date(date, unit), + date_ceiling = ceiling_date(date, unit) + ) %>% + collect(), + data %>% + mutate( + date_rounded = as.Date(round_date(date, lubridate_unit)), + date_floored = as.Date(floor_date(date, lubridate_unit)), + date_ceiling = as.Date(ceiling_date(date, lubridate_unit)) + ), + ignore_attr = ignore_attr, + ... + ) +} + +check_timestamp_week_rounding <- function(data, week_start, ignore_attr = TRUE, ...) { + + compare_dplyr_binding( + .input %>% + mutate( + datetime_rounded = round_date(datetime, "week", week_start = week_start), + datetime_floored = floor_date(datetime, "week", week_start = week_start), + datetime_ceiling = ceiling_date(datetime, "week", week_start = week_start) + ) %>% + collect(), + data, + ignore_attr = ignore_attr, + ... + ) +} + +test_that("timestamp round/floor/ceil works for week units (standard week_start)", { + + fortnight %>% check_timestamp_week_rounding(week_start = 1) # Monday + fortnight %>% check_timestamp_week_rounding(week_start = 7) # Sunday + +}) + +test_that("timestamp round/floor/ceil works for week units (non-standard week_start)", { + + fortnight %>% check_timestamp_week_rounding(week_start = 1) # Tuesday + fortnight %>% check_timestamp_week_rounding(week_start = 2) # Wednedsday + fortnight %>% check_timestamp_week_rounding(week_start = 3) # Thursday + fortnight %>% check_timestamp_week_rounding(week_start = 4) # Friday + fortnight %>% check_timestamp_week_rounding(week_start = 5) # Saturday + +}) + + +check_date_week_rounding <- function(data, week_start, ignore_attr = TRUE, ...) { + + # directly compare arrow to lubridate for floor and ceiling + compare_dplyr_binding( + .input %>% + mutate( + date_floored = floor_date(date, "week", week_start = week_start), + date_ceiling = ceiling_date(date, "week", week_start = week_start) + ) %>% + collect(), + data, + ignore_attr = ignore_attr, + ... + ) + + # use the bypass method to avoid the lubridate-1051 bug for week units + # https://github.com/tidyverse/lubridate/issues/1051 + out <- data %>% + arrow_table() %>% + mutate( + out_date = date %>% round_date("week", week_start = week_start), # Date + out_time = datetime %>% round_date("week", week_start = week_start) # POSIXct + ) %>% + collect() + + expect_equal( + out$out_date, + as.Date(out$out_time) + ) +} + +test_that("date round/floor/ceil works for week units (standard week_start)", { + + # these tests are run one row at a time to avoid ARROW-16412 (see note) + for (r in seq_len(nrow(fortnight))) { + fortnight[r, ] %>% check_date_week_rounding(week_start = 1) # Monday + fortnight[r, ] %>% check_date_week_rounding(week_start = 7) # Sunday + } +}) + +test_that("date round/floor/ceil works for week units (non-standard week_start)", { + + # these tests are run one row at a time to avoid ARROW-16412 (see note) + for (r in seq_len(nrow(fortnight))) { + fortnight[r, ] %>% check_date_week_rounding(week_start = 1) # Tuesday + fortnight[r, ] %>% check_date_week_rounding(week_start = 2) # Wednedsday + fortnight[r, ] %>% check_date_week_rounding(week_start = 3) # Thursday + fortnight[r, ] %>% check_date_week_rounding(week_start = 4) # Friday + fortnight[r, ] %>% check_date_week_rounding(week_start = 5) # Saturday + } +}) + + + +# Test helper used to check that the change_on_boundary argument to +# ceiling_date behaves identically to the lubridate version. It takes +# unit as an argument to run tests separately for different rounding units +check_boundary_with_unit <- function(unit, ...) { + + # timestamps + compare_dplyr_binding( + .input %>% + mutate( + cob_null = ceiling_date(datetime, unit, change_on_boundary = NULL), + cob_true = ceiling_date(datetime, unit, change_on_boundary = TRUE), + cob_false = ceiling_date(datetime, unit, change_on_boundary = FALSE) + ) %>% + collect(), + boundary_times, + ... + ) + + # dates + # these tests are run one row at a time to avoid ARROW-16412 (see note) + for (r in seq_len(nrow(boundary_times))) { + expect_equal( + boundary_times[r, ] %>% + arrow_table() %>% + mutate( + cob_null = ceiling_date(date, unit, change_on_boundary = NULL), + cob_true = ceiling_date(date, unit, change_on_boundary = TRUE), + cob_false = ceiling_date(date, unit, change_on_boundary = FALSE) + ) %>% + collect(), + boundary_times[r, ] %>% + mutate( + cob_null = as.Date(ceiling_date(date, unit, change_on_boundary = NULL)), + cob_true = as.Date(ceiling_date(date, unit, change_on_boundary = TRUE)), + cob_false = as.Date(ceiling_date(date, unit, change_on_boundary = FALSE)) + ), + ... + ) + } + + +} + +test_that("ceiling_date() applies change_on_boundary correctly", { + check_boundary_with_unit(".001 second") + check_boundary_with_unit("second") + check_boundary_with_unit("minute", tolerance = .001) # floating point issue? + check_boundary_with_unit("hour") + check_boundary_with_unit("day") +}) + + +# In lubridate, an error is thrown when 60 sec/60 min/24 hour thresholds are +# exceeded. Checks that arrow mimics this behaviour and throws an identically +# worded error message +test_that("temporal round/floor/ceil period unit maxima are enforced", { + + expect_error( + call_binding("round_date", Expression$scalar(Sys.time()), "61 seconds"), + "Rounding with second > 60 is not supported" + ) + expect_error( + call_binding("round_date", Expression$scalar(Sys.time()), "61 minutes"), + "Rounding with minute > 60 is not supported" + ) + expect_error( + call_binding("round_date", Expression$scalar(Sys.time()), "25 hours"), + "Rounding with hour > 24 is not supported" + ) + expect_error( + call_binding("round_date", Expression$scalar(Sys.Date()), "25 hours"), + "Rounding with hour > 24 is not supported" + ) + +}) + +check_timezone_rounding <- function(data, unit) { + compare_dplyr_binding( + .input %>% + mutate( + utc_floored = floor_date(utc_time, unit = unit), + utc_rounded = round_date(utc_time, unit = unit), + utc_ceiling = ceiling_date(utc_time, unit = unit), + syd_floored = floor_date(syd_time, unit = unit), + syd_rounded = round_date(syd_time, unit = unit), + syd_ceiling = ceiling_date(syd_time, unit = unit), + adl_floored = floor_date(adl_time, unit = unit), + adl_rounded = round_date(adl_time, unit = unit), + adl_ceiling = ceiling_date(adl_time, unit = unit), + mar_floored = floor_date(mar_time, unit = unit), + mar_rounded = round_date(mar_time, unit = unit), + mar_ceiling = ceiling_date(mar_time, unit = unit), + kat_floored = floor_date(kat_time, unit = unit), + kat_rounded = round_date(kat_time, unit = unit), + kat_ceiling = ceiling_date(kat_time, unit = unit) + ) %>% + collect(), + data + ) +} + +test_that("timestamp rounding takes place in local time", { + + # lubridate does not always return the correct results + # for some of our test cases in tz_times, esp on windows Review Comment: It's not us: it's either lubridate or something lower-level that lubridate relies on. The bug only occurs in very specific case where sub-second timing is relevant to the result of the rounding, only on windows, it's specific to zoned times (but not all timezones!) and it's lubridate that returns inconsistent answers, not arrow. As a paranoia check, I've added a second batch of tests that check that the arrow results give the same answer in local time for the zoned times as it does for the UTC times. I think we're in the clear -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: github-unsubscr...@arrow.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org