This is an automated email from the ASF dual-hosted git repository.
paleolimbot pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new 3c7a0cad0e ARROW-16395: [R] Implement lubridate's parsers with year,
month, and day, hour, minute, and second components (#13627)
3c7a0cad0e is described below
commit 3c7a0cad0e25ed66e4c555d9da49f320f803573c
Author: Rok Mihevc <[email protected]>
AuthorDate: Fri Jul 22 21:02:53 2022 +0200
ARROW-16395: [R] Implement lubridate's parsers with year, month, and day,
hour, minute, and second components (#13627)
This is to resolve
[ARROW-16395](https://issues.apache.org/jira/browse/ARROW-16395).
Authored-by: Rok <[email protected]>
Signed-off-by: Dewey Dunnington <[email protected]>
---
r/R/dplyr-funcs-datetime.R | 25 ++--
r/tests/testthat/test-dplyr-funcs-datetime.R | 172 +++++++++++++++++++++++++++
2 files changed, 190 insertions(+), 7 deletions(-)
diff --git a/r/R/dplyr-funcs-datetime.R b/r/R/dplyr-funcs-datetime.R
index fd705e4578..9a010452b8 100644
--- a/r/R/dplyr-funcs-datetime.R
+++ b/r/R/dplyr-funcs-datetime.R
@@ -580,12 +580,23 @@ register_bindings_datetime_parsers <- function() {
}
})
- ymd_parser_vec <- c("ymd", "ydm", "mdy", "myd", "dmy", "dym", "ym", "my",
"yq")
+ parser_vec <- c(
+ "ymd", "ydm", "mdy", "myd", "dmy", "dym", "ym", "my", "yq",
+ "ymd_HMS", "ymd_HM", "ymd_H", "dmy_HMS", "dmy_HM", "dmy_H",
+ "mdy_HMS", "mdy_HM", "mdy_H", "ydm_HMS", "ydm_HM", "ydm_H"
+ )
- ymd_parser_map_factory <- function(order) {
+ parser_map_factory <- function(order) {
force(order)
- function(x, tz = NULL) {
- parse_x <- call_binding("parse_date_time", x, order, tz)
+ function(x, quiet = TRUE, tz = NULL, locale = NULL, truncated = 0) {
+ if (!is.null(locale)) {
+ arrow_not_supported("`locale`")
+ }
+ # Parsers returning datetimes return UTC by default and never return
dates.
+ if (is.null(tz) && nchar(order) > 3) {
+ tz <- "UTC"
+ }
+ parse_x <- call_binding("parse_date_time", x, order, tz, truncated,
quiet)
if (is.null(tz)) {
# we cast so we can mimic the behaviour of the `tz` argument in
lubridate
# "If NULL (default), a Date object is returned. Otherwise a POSIXct
with
@@ -596,10 +607,10 @@ register_bindings_datetime_parsers <- function() {
}
}
- for (ymd_order in ymd_parser_vec) {
+ for (order in parser_vec) {
register_binding(
- paste0("lubridate::", ymd_order),
- ymd_parser_map_factory(ymd_order)
+ paste0("lubridate::", tolower(order)),
+ parser_map_factory(order)
)
}
diff --git a/r/tests/testthat/test-dplyr-funcs-datetime.R
b/r/tests/testthat/test-dplyr-funcs-datetime.R
index 1c6b73d7af..0c655e48bb 100644
--- a/r/tests/testthat/test-dplyr-funcs-datetime.R
+++ b/r/tests/testthat/test-dplyr-funcs-datetime.R
@@ -2222,6 +2222,26 @@ test_that("parse_date_time with hours, minutes and
seconds components", {
test_dates_times
)
+ compare_dplyr_binding(
+ .input %>%
+ mutate(
+ ymd_hms_dttm = ymd_hms(ymd_hms_string),
+ ymd_hm_dttm = ymd_hm(ymd_hm_string),
+ ymd_h_dttm = ymd_h(ymd_h_string),
+ dmy_hms_dttm = dmy_hms(dmy_hms_string),
+ dmy_hm_dttm = dmy_hm(dmy_hm_string),
+ dmy_h_dttm = dmy_h(dmy_h_string),
+ mdy_hms_dttm = mdy_hms(mdy_hms_string),
+ mdy_hm_dttm = mdy_hm(mdy_hm_string),
+ mdy_h_dttm = mdy_h(mdy_h_string),
+ ydm_hms_dttm = ydm_hms(ydm_hms_string),
+ ydm_hm_dttm = ydm_hm(ydm_hm_string),
+ ydm_h_dttm = ydm_h(ydm_h_string)
+ ) %>%
+ collect(),
+ test_dates_times
+ )
+
# parse_date_time with timezone
pm_tz <- "Pacific/Marquesas"
compare_dplyr_binding(
@@ -2244,6 +2264,46 @@ test_that("parse_date_time with hours, minutes and
seconds components", {
test_dates_times
)
+ compare_dplyr_binding(
+ .input %>%
+ mutate(
+ ymd_hms_dttm = ymd_hms(ymd_hms_string, tz = pm_tz),
+ ymd_hm_dttm = ymd_hm(ymd_hm_string, tz = pm_tz),
+ ymd_h_dttm = ymd_h(ymd_h_string, tz = pm_tz),
+ dmy_hms_dttm = dmy_hms(dmy_hms_string, tz = pm_tz),
+ dmy_hm_dttm = dmy_hm(dmy_hm_string, tz = pm_tz),
+ dmy_h_dttm = dmy_h(dmy_h_string, tz = pm_tz),
+ mdy_hms_dttm = mdy_hms(mdy_hms_string, tz = pm_tz),
+ mdy_hm_dttm = mdy_hm(mdy_hm_string, tz = pm_tz),
+ mdy_h_dttm = mdy_h(mdy_h_string, tz = pm_tz),
+ ydm_hms_dttm = ydm_hms(ydm_hms_string, tz = pm_tz),
+ ydm_hm_dttm = ydm_hm(ydm_hm_string, tz = pm_tz),
+ ydm_h_dttm = ydm_h(ydm_h_string, tz = pm_tz),
+ ) %>%
+ collect(),
+ test_dates_times
+ )
+
+ compare_dplyr_binding(
+ .input %>%
+ mutate(
+ ymd_hms_dttm = ymd_hms("2022-07-19 20:24:43"),
+ ymd_hm_dttm = ymd_hm("2022-07-19 20:24"),
+ ymd_h_dttm = ymd_h("2022-07-19 20"),
+ dmy_hms_dttm = dmy_hms("19-07-2022 20:24:43"),
+ dmy_hm_dttm = dmy_hm("19-07-2022 20:24"),
+ dmy_h_dttm = dmy_h("19-07-2022 20"),
+ mdy_hms_dttm = mdy_hms("07-19-2022 20:24:43"),
+ mdy_hm_dttm = mdy_hm("07-19-2022 20:24"),
+ mdy_h_dttm = mdy_h("07-19-2022 20"),
+ ydm_hms_dttm = ydm_hms("2022-19-07 20:24:43"),
+ ydm_hm_dttm = ydm_hm("2022-19-07 20:24"),
+ ydm_h_dttm = ydm_h("2022-19-07 20")
+ ) %>%
+ collect(),
+ test_dates_times
+ )
+
# test ymd_ims
compare_dplyr_binding(
.input %>%
@@ -2319,12 +2379,58 @@ test_that("parse_date_time with month names and HMS", {
collect(),
test_dates_times2
)
+
+ compare_dplyr_binding(
+ .input %>%
+ mutate(
+ ymd_hms_dttm = ymd_hms(ymd_hms_string),
+ ymd_hm_dttm = ymd_hm(ymd_hm_string),
+ ymd_h_dttm = ymd_h(ymd_h_string),
+ dmy_hms_dttm = dmy_hms(dmy_hms_string),
+ dmy_hm_dttm = dmy_hm(dmy_hm_string),
+ dmy_h_dttm = dmy_h(dmy_h_string),
+ mdy_hms_dttm = mdy_hms(mdy_hms_string),
+ mdy_hm_dttm = mdy_hm(mdy_hm_string),
+ mdy_h_dttm = mdy_h(mdy_h_string),
+ ydm_hms_dttm = ydm_hms(ydm_hms_string),
+ ydm_hm_dttm = ydm_hm(ydm_hm_string),
+ ydm_h_dttm = ydm_h(ydm_h_string)
+ ) %>%
+ collect(),
+ test_dates_times2
+ )
+
+ compare_dplyr_binding(
+ .input %>%
+ mutate(
+ ymd_hms_dttm = ymd_hms("2022-June-19 20:24:43"),
+ ymd_hm_dttm = ymd_hm("2022-June-19 20:24"),
+ ymd_h_dttm = ymd_h("2022-June-19 20"),
+ dmy_hms_dttm = dmy_hms("19-June-2022 20:24:43"),
+ dmy_hm_dttm = dmy_hm("19-June-2022 20:24"),
+ dmy_h_dttm = dmy_h("19-June-2022 20"),
+ mdy_hms_dttm = mdy_hms("June-19-2022 20:24:43"),
+ mdy_hm_dttm = mdy_hm("June-19-2022 20:24"),
+ mdy_h_dttm = mdy_h("June-19-2022 20"),
+ ydm_hms_dttm = ydm_hms("2022-19-June 20:24:43"),
+ ydm_hm_dttm = ydm_hm("2022-19-June 20:24"),
+ ydm_h_dttm = ydm_h("2022-19-June 20")
+ ) %>%
+ collect(),
+ test_dates_times2
+ )
})
test_that("parse_date_time with `quiet = FALSE` not supported", {
# we need expect_warning twice as both the arrow pipeline (because quiet =
# FALSE is not supported) and the fallback dplyr/lubridate one throw
# warnings (the lubridate one because quiet is FALSE)
+ # https://issues.apache.org/jira/browse/ARROW-17146
+
+ # these functions' internals use some string processing which requires the
+ # RE2 library (not available on Windows with R 3.6 & the minimal nightly
builds)
+ skip_if_not_available("re2")
+
expect_warning(
expect_warning(
tibble(x = c("2022-05-19 13:46:51")) %>%
@@ -2337,6 +2443,16 @@ test_that("parse_date_time with `quiet = FALSE` not
supported", {
),
"All formats failed to parse"
)
+
+ expect_warning(
+ tibble(x = c("2022-05-19 13:46:51")) %>%
+ arrow_table() %>%
+ mutate(
+ x_dttm = ymd_hms(x, quiet = FALSE)
+ ) %>%
+ collect(),
+ "`quiet = FALSE` not supported in Arrow"
+ )
})
test_that("parse_date_time with truncated formats", {
@@ -2362,6 +2478,11 @@ test_that("parse_date_time with truncated formats", {
truncated_ymd_string,
orders = "ymd_HMS",
truncated = 3
+ ),
+ dttm2 =
+ ymd_hms(
+ truncated_ymd_string,
+ truncated = 3
)
) %>%
collect(),
@@ -2383,6 +2504,37 @@ test_that("parse_date_time with truncated formats", {
test_truncation_df,
warning = "a value for `truncated` > 4 not supported in Arrow"
)
+
+ # values for truncated greater than nchar(orders) - 3 not supported in Arrow
+ compare_dplyr_binding(
+ .input %>%
+ mutate(
+ dttm =
+ ymd_hms(
+ truncated_ymd_string,
+ truncated = 5
+ )
+ ) %>%
+ collect(),
+ test_truncation_df,
+ warning = "a value for `truncated` > 4 not supported in Arrow"
+ )
+})
+
+test_that("parse_date_time with `locale != NULL` not supported", {
+ # parse_date_time currently doesn't take locale paramete which will be
+ # addressed in https://issues.apache.org/jira/browse/ARROW-17147
+ skip_if_not_available("re2")
+
+ expect_warning(
+ tibble(x = c("2022-05-19 13:46:51")) %>%
+ arrow_table() %>%
+ mutate(
+ x_dttm = ymd_hms(x, locale = "C")
+ ) %>%
+ collect(),
+ "`locale` not supported in Arrow"
+ )
})
test_that("parse_date_time with `exact = TRUE`, and with regular R objects", {
@@ -2514,6 +2666,26 @@ test_that("build_formats() and
build_format_from_order()", {
"%y%b%d%H%M%S", "%Y%b%d%H%M%S"
)
)
+
+ expect_equal(
+ build_format_from_order("ymdHM"),
+ c(
+ "%y-%m-%d-%H-%M", "%Y-%m-%d-%H-%M", "%y-%B-%d-%H-%M",
+ "%Y-%B-%d-%H-%M", "%y-%b-%d-%H-%M", "%Y-%b-%d-%H-%M",
+ "%y%m%d%H%M", "%Y%m%d%H%M", "%y%B%d%H%M", "%Y%B%d%H%M",
+ "%y%b%d%H%M", "%Y%b%d%H%M"
+ )
+ )
+
+ expect_equal(
+ build_format_from_order("ymdH"),
+ c(
+ "%y-%m-%d-%H", "%Y-%m-%d-%H", "%y-%B-%d-%H",
+ "%Y-%B-%d-%H", "%y-%b-%d-%H", "%Y-%b-%d-%H",
+ "%y%m%d%H", "%Y%m%d%H", "%y%B%d%H", "%Y%B%d%H",
+ "%y%b%d%H", "%Y%b%d%H"
+ )
+ )
})