thisisnic commented on a change in pull request #10624: URL: https://github.com/apache/arrow/pull/10624#discussion_r667773505
########## File path: r/R/dplyr-functions.R ########## @@ -286,8 +286,8 @@ nse_funcs$substr <- function(string, start, stop) { msg = "Start of length != 1 not supported in Arrow" ) assert_that( - length(end) == 1, - msg = "End of length != 1 not supported in Arrow" + length(stop) == 1, + msg = "Stop of length != 1 not supported in Arrow" Review comment: ```suggestion msg = "`stop` must be length 1 - other lengths are not supported in Arrow" ``` ########## File path: r/R/dplyr-functions.R ########## @@ -313,10 +313,33 @@ nse_funcs$substring <- function(text, first, last = 1000000L) { } nse_funcs$str_sub <- function(string, start = 1L, end = -1L) { - if (start < 0 || end < 0) { - warning("Negative counts not yet implemented for strings") + if(end <= -1) end <- .Machine$integer.max + + assert_that( + length(start) == 1, + msg = "Start of length != 1 not supported in Arrow" Review comment: ```suggestion msg = "`start` must be length 1 - other lengths are not supported in Arrow" ``` ########## File path: r/R/dplyr-functions.R ########## @@ -313,10 +313,33 @@ nse_funcs$substring <- function(text, first, last = 1000000L) { } nse_funcs$str_sub <- function(string, start = 1L, end = -1L) { - if (start < 0 || end < 0) { - warning("Negative counts not yet implemented for strings") + if(end <= -1) end <- .Machine$integer.max + + assert_that( + length(start) == 1, + msg = "Start of length != 1 not supported in Arrow" + ) + assert_that( + length(end) == 1, + msg = "end of length != 1 not supported in Arrow" Review comment: ```suggestion msg = "`end` must be length 1 - other lengths are not supported in Arrow" ``` ########## File path: r/src/compute.cpp ########## @@ -316,6 +316,19 @@ std::shared_ptr<arrow::compute::FunctionOptions> make_compute_options( return std::make_shared<Options>(max_splits, reverse); } + if (func_name == "utf8_slice_codeunits") { + using Options = arrow::compute::SliceOptions; + int64_t start = 1; + int64_t stop = -1; Review comment: I think this still may need updating to reflect the default value of stop supplied by C++ ########## File path: r/tests/testthat/test-dplyr-string-functions.R ########## @@ -868,42 +868,76 @@ test_that("str_like", { }) test_that("str_pad", { - + df <- tibble(x = c("Foo and bar", "baz and qux and quux")) - + expect_dplyr_equal( input %>% mutate(x = str_pad(x, width = 31)) %>% collect(), df ) - + expect_dplyr_equal( input %>% mutate(x = str_pad(x, width = 30, side = "right")) %>% collect(), df ) - + expect_dplyr_equal( input %>% mutate(x = str_pad(x, width = 31, side = "left", pad = "+")) %>% collect(), df ) - + expect_dplyr_equal( input %>% mutate(x = str_pad(x, width = 10, side = "left", pad = "+")) %>% collect(), df ) - + expect_dplyr_equal( input %>% mutate(x = str_pad(x, width = 31, side = "both")) %>% collect(), df ) - +}) + +test_that("substr, substring, str_sub", { + + df <- tibble(x1 = "Apache Arrow") + + expect_dplyr_equal( + input %>% + mutate( + x2 = substr(x1, 1, 6), # Apache + x3 = substr(x1, 8, 12) # Arrow + ) %>% + collect(), + df + ) + + expect_dplyr_equal( + input %>% + mutate( + x2 = substring(x1, 1, 6), # Apache + x3 = substring(x1, 8, 12) # Arrow + ) %>% + collect(), + df + ) + + expect_dplyr_equal( + input %>% + mutate( + x2 = str_sub(x1, 1, 6), # Apache + x3 = str_sub(x1, 8, 12) # Arrow + ) %>% + collect(), + df + ) }) Review comment: The tests are now way shorter than they were before and don't include any of those awkward test cases like negative numbers that we were discussing on the ticket - maybe we should add them back in? ########## File path: r/src/compute.cpp ########## @@ -316,6 +316,19 @@ std::shared_ptr<arrow::compute::FunctionOptions> make_compute_options( return std::make_shared<Options>(max_splits, reverse); } + if (func_name == "utf8_slice_codeunits") { + using Options = arrow::compute::SliceOptions; + int64_t start = 1; + int64_t stop = -1; Review comment: See the final comment here for more info but happy to explain if that doesn't make sense: https://issues.apache.org/jira/browse/ARROW-13259 -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: github-unsubscr...@arrow.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org