jonkeane commented on a change in pull request #12154: URL: https://github.com/apache/arrow/pull/12154#discussion_r786804593
########## File path: r/tests/testthat/test-dplyr-funcs-datetime.R ########## @@ -616,3 +616,292 @@ test_that("extract yday from date", { test_df ) }) + +test_that("round/floor/ceiling on datetime (to nearest second)", { + compare_dplyr_binding( + .input %>% + mutate( + out_1 = round_date(datetime), + out_2 = floor_date(datetime), + out_3 = ceiling_date(datetime), + ) %>% + collect(), + test_df + ) +}) + +test_that("period unit abbreviation", { + compare_dplyr_binding( + .input %>% + mutate( + out_1 = round_date(datetime, "minute"), + out_2 = round_date(datetime, "minutes"), + out_3 = round_date(datetime, "mins"), + ) %>% + collect(), + test_df + ) +}) + +test_that("period unit extracts integer multiples", { + compare_dplyr_binding( + .input %>% + mutate( + out_1 = round_date(datetime, "1 minute"), + out_2 = round_date(datetime, "2 minutes"), + out_3 = round_date(datetime, "10 minutes") + ) %>% + collect(), + test_df + ) +}) + +# lubridate errors when 60 sec/60 min/24 hour thresholds exceeded. +# this test checks that arrow does too. +test_that("period unit maxima are enforced", { + + expect_error(suppressWarnings( # <- hack + test_df %>% + arrow_table() %>% + mutate(out = round_date(datetime, "61 seconds")) %>% + collect() + )) + + expect_error(suppressWarnings( + test_df %>% + arrow_table() %>% + mutate(out = round_date(datetime, "61 minutes")) %>% + collect() + )) + + expect_error(suppressWarnings( + test_df %>% + arrow_table() %>% + mutate(out = round_date(datetime, "25 hours")) %>% + collect() + )) + +}) + +test_that("datetime rounding between 1sec and 1day", { + compare_dplyr_binding( + .input %>% + mutate( + out_1 = round_date(datetime, "second"), + out_2 = round_date(datetime, "minute"), + out_3 = round_date(datetime, "hour"), + out_4 = round_date(datetime, "day") + ) %>% + collect(), + test_df + ) +}) + +# lubridate doesn't accept millisecond, microsecond or nanosecond descriptors: +# instead it supports corresponding fractions of 1 second. these tests added to +# that arrow verify that fractional second inputs to arrow mirror lubridate + +test_that("datetime rounding below 1sec", { + + expect_equal( + test_df %>% + arrow_table() %>% + mutate(out = round_date(datetime, ".001 second")) %>% + collect(), + + test_df %>% + arrow_table() %>% + mutate(out = round_date(datetime, "1 millisecond")) %>% + collect() + ) + + expect_equal( + test_df %>% + arrow_table() %>% + mutate(out = round_date(datetime, ".000001 second")) %>% + collect(), + + test_df %>% + arrow_table() %>% + mutate(out = round_date(datetime, "1 microsecond")) %>% + collect() + ) + + expect_equal( + test_df %>% + arrow_table() %>% + mutate(out = round_date(datetime, ".000000001 second")) %>% + collect(), + + test_df %>% + arrow_table() %>% + mutate(out = round_date(datetime, "1 nanosecond")) %>% + collect() + ) + + compare_dplyr_binding( + .input %>% + mutate( + out_1 = round_date(datetime, ".01 second"), + out_2 = round_date(datetime, ".001 second"), + out_3 = round_date(datetime, ".00001 second") + ) %>% + collect(), + test_df + ) +}) + + +# a simplified test case using UTC timezone only +test_df_v2 <- tibble::tibble( + datetime = c(as.POSIXct("2017-01-01 00:00:11.3456789", tz = "UTC"), NA), + date = c(as.Date("2021-09-09"), NA), + integer = 1:2 +) + +test_that("datetime round/floor/ceil to month/quarter/year", { + + compare_dplyr_binding( + .input %>% + mutate( + out_1 = round_date(datetime, "month"), + out_2 = round_date(datetime, "quarter"), + out_3 = round_date(datetime, "year"), + ) %>% + collect(), + test_df_v2 + ) + + compare_dplyr_binding( + .input %>% + mutate( + out_1 = floor_date(datetime, "month"), + out_2 = floor_date(datetime, "quarter"), + out_3 = floor_date(datetime, "year"), + ) %>% + collect(), + test_df_v2 + ) + + compare_dplyr_binding( + .input %>% + mutate( + out_1 = ceiling_date(datetime, "month"), + out_2 = ceiling_date(datetime, "quarter"), + out_3 = ceiling_date(datetime, "year"), + ) %>% + collect(), + test_df_v2 + ) +}) + +# NOTE: the hard coding of week_starts = 4 needs to be fixed. round_temporal() +# treats 1970-01-01 as the beginning of week 1, i.e., week_starts on a Thursday + +# NOTE: arrow dplyr binding for ceiling_date() does not force dates up to the +# next date. the logic mirrors lubridate prior to v1.6.0 (change_on_boundary = FALSE). +# I'm not 100% sold on this implementation, but it's not obviously terrible + +test_that("datetime round/floor/ceil to week", { + + expect_equal( + test_df_v2 %>% + arrow_table() %>% + mutate(out = round_date(datetime, "1 week")) %>% + collect(), + test_df_v2 %>% + mutate(out = round_date(datetime, "1 week", week_start = 4)) + ) + + expect_equal( + test_df_v2 %>% + arrow_table() %>% + mutate(out = ceiling_date(datetime, "1 week")) %>% + collect(), + test_df_v2 %>% + mutate(out = ceiling_date(datetime, "1 week", week_start = 4, change_on_boundary = FALSE)) + ) + + expect_equal( + test_df_v2 %>% + arrow_table() %>% + mutate(out = floor_date(datetime, "1 week")) %>% + collect(), + test_df_v2 %>% + mutate(out = floor_date(datetime, "1 week", week_start = 4)) + ) +}) + +# NOTE: lubridate::round_date() sometimes coerces output from Date to POSIXct. +# this is not the default for the round_temporal() function in libarrow, which +# is type stable: timestamps stay timestamps, and date32 stays date32. the +# current implementation preserves the type stability property. consequently +# there are edge cases where the arrow dplyr binding will not precisely mirror +# the lubridate original. with that in mind, all tests for date32 rounding coerce +# the lubridate equivalent back to Date Review comment: Yup, I agree here. It might be nice to add a small comment in those test cases about this (so future-us remembers why we have `as.Date()` at the end there -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: github-unsubscr...@arrow.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org