nealrichardson commented on a change in pull request #9521:
URL: https://github.com/apache/arrow/pull/9521#discussion_r581458393



##########
File path: r/R/dplyr.R
##########
@@ -423,26 +482,115 @@ ungroup.arrow_dplyr_query <- function(x, ...) {
 }
 ungroup.Dataset <- ungroup.ArrowTabular <- force
 
-mutate.arrow_dplyr_query <- function(.data, ...) {
+mutate.arrow_dplyr_query <- function(.data,
+                                     ...,
+                                     .keep = c("all", "used", "unused", 
"none"),
+                                     .before = NULL,
+                                     .after = NULL) {
+  call <- match.call()
+  exprs <- quos(...)
+  if (length(exprs) == 0) {
+    # Nothing to do
+    return(.data)
+  }
+
   .data <- arrow_dplyr_query(.data)
   if (query_on_dataset(.data)) {
     not_implemented_for_dataset("mutate()")
   }
-  # TODO: see if we can defer evaluating the expressions and not collect here.
-  # It's different from filters (as currently implemented) because the basic
-  # vector transformation functions aren't yet implemented in Arrow C++.
-  dplyr::mutate(dplyr::collect(.data), ...)
+
+  .keep <- match.arg(.keep)
+  .before <- enquo(.before)
+  .after <- enquo(.after)
+  # Restrict the cases we support for now
+  if (!quo_is_null(.before) || !quo_is_null(.after)) {
+    # TODO(ARROW-11701)
+    return(abandon_ship(call, .data, '.before and .after arguments are not 
supported in Arrow'))
+  } else if (length(group_vars(.data)) > 0) {
+    # mutate() on a grouped dataset does calculations within groups
+    # This doesn't matter on scalar ops (arithmetic etc.) but it does
+    # for things with aggregations (e.g. subtracting the mean)
+    return(abandon_ship(call, .data, 'mutate() on grouped data not supported 
in Arrow'))
+  } else if (!all(nzchar(names(exprs)))) {
+    # This is either user error or a function that returns a data.frame
+    # e.g. across() that dplyr::mutate() will autosplice
+    # TODO(ARROW-16999)
+    msg <- 'all ... expressions must be named: autosplicing multi-column 
results not supported in Arrow'
+    return(abandon_ship(call, .data, msg))

Review comment:
       I could also let it go through; dplyr::mutate() will create names by 
deparsing the expression and maybe that can work for us too, and 
dplyr::across() will fail and presumably say "function not implemented, pulling 
data into R". And if/when across() is implemented, the 
Table/RecordBatch$create() methods actually do autosplicing. 




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


Reply via email to