paleolimbot commented on code in PR #12751:
URL: https://github.com/apache/arrow/pull/12751#discussion_r848349565
##########
r/R/table.R:
##########
@@ -149,6 +149,75 @@ Table$create <- function(..., schema = NULL) {
#' @export
names.Table <- function(x) x$ColumnNames()
+#' @export
+rbind.Table <- function(...) {
+ tables <- list(...)
+
+ # assert they have same schema
+ schema <- tables[[1]]$schema
+ unequal_schema_idx <- which.min(lapply(tables, function(x) x$schema ==
schema))
+ if (unequal_schema_idx != 1) {
+ abort(c(
+ sprintf("Schema at index %i does not match the first schema",
unequal_schema_idx),
+ i = paste0("Schema 1:\n", schema$ToString()),
+ i = paste0(sprintf("Schema %d:\n", unequal_schema_idx),
+ tables[[unequal_schema_idx]]$schema$ToString())
+ ))
+ }
+
+ # create chunked array for each column
+ columns <- map(seq_len(tables[[1]]$num_columns), function(i) {
+ do.call(c, map(tables, ~ .[[i]]))
+ })
+
+ Table$create(!!!set_names(columns, names(schema)), schema = schema)
+}
+
+#' @export
+cbind.Table <- function(..., call = caller_env()) {
Review Comment:
I think you want to move `call <- caller_env()` to inside the
function...it's not clear to me that anybody would ever pass it as an argument.
If you do need it as an argument, use `.call` instead of `call` for the name
(in the off chance anybody uses this to add a column named `call`...).
##########
r/R/record-batch.R:
##########
@@ -189,3 +189,61 @@ record_batch <- RecordBatch$create
#' @export
names.RecordBatch <- function(x) x$names()
+
+#' @export
+rbind.RecordBatch <- function(...) {
+ abort("Use `Table$create()` to combine record batches")
+}
+
+cbind_check_length <- function(target_length, length, idx) {
+ if (length != target_length) {
+ abort(
+ sprintf(
+ "Non-scalar inputs must have an equal number of rows. ..1 has %d, ..%d
has %d",
Review Comment:
Not at all necessary, but you make good use of the rlang `c(..., "i" = ...)`
trick below and the second sentence here might look pretty as an `"i"` bullet.
##########
r/R/record-batch.R:
##########
@@ -189,3 +189,52 @@ record_batch <- RecordBatch$create
#' @export
names.RecordBatch <- function(x) x$names()
+
+#' @export
+rbind.RecordBatch <- function(...) {
+ abort("Use Table$create to combine record batches")
+}
+
+cbind_check_length <- function(target_length, length, idx) {
+ if (length != target_length) {
+ abort(
+ sprintf(
+ "Non-scalar inputs must have an equal number of rows. ..1 has %i, ..%i
has %i",
+ target_length,
+ idx,
+ length
+ )
+ )
+ }
+}
+
+#' @export
+cbind.RecordBatch <- function(...) {
+ inputs <- list(...)
+ num_rows <- inputs[[1]]$num_rows
+
+ batches <- imap(inputs, function(input, idx) {
+ if (inherits(input, "RecordBatch")) {
+ cbind_check_length(num_rows, input$num_rows, idx)
+ input
+ } else if (is.atomic(input) && length(input) == 1) {
+ RecordBatch$create("{idx}" := rep(input, num_rows))
+ } else {
+ tryCatch(
+ {
+ cbind_check_length(num_rows, length(input), idx)
+ RecordBatch$create("{idx}" := input)
+ },
+ error = function(err) {
+ abort(sprintf("Input ..%i cannot be converted to an Arrow Array:
%s", idx, err))
+ }
+ )
+ }
Review Comment:
Nice!
##########
r/R/record-batch.R:
##########
@@ -189,3 +189,61 @@ record_batch <- RecordBatch$create
#' @export
names.RecordBatch <- function(x) x$names()
+
+#' @export
+rbind.RecordBatch <- function(...) {
+ abort("Use `Table$create()` to combine record batches")
+}
+
+cbind_check_length <- function(target_length, length, idx) {
+ if (length != target_length) {
+ abort(
+ sprintf(
+ "Non-scalar inputs must have an equal number of rows. ..1 has %d, ..%d
has %d",
+ target_length,
+ idx,
+ length
+ )
+ )
+ }
+}
+
+#' @export
+cbind.RecordBatch <- function(..., call = caller_env()) {
+ inputs <- list(...)
+ num_rows <- inputs[[1]]$num_rows
+
+ batches <- map(seq_along(inputs), function(i) {
+ input <- inputs[[i]]
+ name <- names(inputs)[i]
+ name <- ifelse(name == "", paste0("X", as.character(i)), name)
Review Comment:
The "name repair" operation is actually pretty complicated and I think you
probably want to avoid it (maybe error for unnamed Array/R vector?). If you do
want to support unnamed array-like things, you could use `inputs <-
rlang::enquos(..., .named = TRUE)`, `base::make.names()`, or
`vctrs::vec_as_names()` to avoid rolling your own logic.
##########
r/R/table.R:
##########
@@ -149,6 +149,75 @@ Table$create <- function(..., schema = NULL) {
#' @export
names.Table <- function(x) x$ColumnNames()
+#' @export
+rbind.Table <- function(...) {
+ tables <- list(...)
+
+ # assert they have same schema
+ schema <- tables[[1]]$schema
+ unequal_schema_idx <- which.min(lapply(tables, function(x) x$schema ==
schema))
+ if (unequal_schema_idx != 1) {
+ abort(c(
+ sprintf("Schema at index %i does not match the first schema",
unequal_schema_idx),
+ i = paste0("Schema 1:\n", schema$ToString()),
+ i = paste0(sprintf("Schema %d:\n", unequal_schema_idx),
+ tables[[unequal_schema_idx]]$schema$ToString())
+ ))
+ }
+
+ # create chunked array for each column
+ columns <- map(seq_len(tables[[1]]$num_columns), function(i) {
+ do.call(c, map(tables, ~ .[[i]]))
+ })
+
+ Table$create(!!!set_names(columns, names(schema)), schema = schema)
+}
+
+#' @export
+cbind.Table <- function(..., call = caller_env()) {
Review Comment:
Actually, on re-reading where you use this, I think you can avoid it
entirely (I *think* rlang is smart enough to jump over the tryCatch when
signaling an error condition with a parent but definitely check). If you do
need it, I think you want `sys.call(1)` rather than `caller_env()`.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]