nealrichardson commented on code in PR #12751:
URL: https://github.com/apache/arrow/pull/12751#discussion_r852119995


##########
r/R/record-batch.R:
##########
@@ -189,3 +189,62 @@ record_batch <- RecordBatch$create
 
 #' @export
 names.RecordBatch <- function(x) x$names()
+
+#' @export
+rbind.RecordBatch <- function(...) {
+  abort("Use `Table$create()` to combine record batches")
+}
+
+cbind_check_length <- function(target_length, length, idx, call = caller_env()) {

Review Comment:
   Do you need this function? I'm pretty sure the RecordBatch constructor in 
C++ will check lengths (again). 



##########
r/R/record-batch.R:
##########
@@ -189,3 +189,62 @@ record_batch <- RecordBatch$create
 
 #' @export
 names.RecordBatch <- function(x) x$names()
+
+#' @export
+rbind.RecordBatch <- function(...) {
+  abort("Use `Table$create()` to combine record batches")
+}
+
+cbind_check_length <- function(target_length, length, idx, call = caller_env()) {
+  if (length != target_length) {
+    abort(
+      c("Non-scalar inputs must have an equal number of rows.",
+        i = sprintf("..1 has %d, ..%d has %d", target_length, idx, length)),
+      call = call
+    )
+  }
+}
+
+#' @export
+cbind.RecordBatch <- function(...) {
+  call <- sys.call()
+  inputs <- list(...)
+  num_rows <- inputs[[1]]$num_rows
+
+  # These names are only used for scalar or arrays
+  arg_names <- if (is.null(names(inputs))) character(length(inputs)) else names(inputs)
+  arg_names <- make.names(arg_names, unique = TRUE)
+
+  batches <- map(seq_along(inputs), function(i) {
+    input <- inputs[[i]]
+    name <- arg_names[i]
+
+    if (inherits(input, "RecordBatch")) {
+      cbind_check_length(num_rows, input$num_rows, i, call)
+      input
+    } else if (inherits(input, "data.frame")) {
+      cbind_check_length(num_rows, nrow(input), i, call)
+      RecordBatch$create(input)
+    } else if (length(input) == 1) {
+      RecordBatch$create("{name}" := repeat_value_as_array(input, num_rows))

Review Comment:
   Also just confirming: do you need this here or will the C++ constructor do 
this for you?



##########
r/tests/testthat/test-RecordBatch.R:
##########
@@ -513,6 +513,76 @@ test_that("record_batch() with different length arrays", {
   expect_error(record_batch(a = 1:5, b = 1:6), msg)
 })
 
+test_that("RecordBatch doesn't support rbind", {
+  expect_snapshot_error(
+    rbind(
+      record_batch(a = 1:10),
+      record_batch(a = 2:4)
+    )
+  )
+})
+
+test_that("RecordBatch supports cbind", {
+  expect_snapshot_error(
+    cbind(
+      record_batch(a = 1:10),
+      record_batch(a = c("a", "b"))
+    )
+  )
+
+  actual <- cbind(
+    record_batch(a = c(1, 2), b = c("a", "b")),
+    record_batch(a = c("d", "c")),
+    record_batch(c = c(2, 3))
+  )
+  expected <- record_batch(
+    a = c(1, 2),
+    b = c("a", "b"),
+    a = c("d", "c"),
+    c = c(2, 3)
+  )
+  expect_equal(actual, expected)
+
+  # Handles arrays
+  expect_equal(
+    cbind(record_batch(a = 1:2), b = Array$create(4:5)),
+    record_batch(a = 1:2, b = 4:5)
+  )
+
+  # Handles data.frames on R 4.0 or greater
+  if (getRversion() >= "4.0.0") {
+    # Prior to R 4.0, cbind would short-circuit to the data.frame implementation
+    # if **any** of the arguments are a data.frame.
+    expect_equal(
+      cbind(record_batch(a = 1:2), data.frame(b = 4:5)),
+      record_batch(a = 1:2, b = 4:5)
+    )
+  }
+
+
+  # Handles base factors
+  expect_equal(
+    cbind(record_batch(a = 1:2), b = factor(c("a", "b"))),
+    record_batch(a = 1:2, b = factor(c("a", "b")))
+  )
+
+  # Handles base scalars
+  expect_equal(
+    cbind(record_batch(a = 1:2), b = 1L),
+    record_batch(a = 1:2, b = rep(1L, 2))
+  )
+
+  # Handles unnamed arrays, even in cases where no named arguments are passed

Review Comment:
   Is this desirable? FWIW we don't accept this directly in `record_batch()` 
(though the error message is not awesome):
   
   ```
   > record_batch(a=1:2, 3:4)
   Error: Unknown: only data frames are allowed as unnamed arguments to be auto spliced
   ```



##########
r/R/table.R:
##########
@@ -149,6 +149,77 @@ Table$create <- function(..., schema = NULL) {
 #' @export
 names.Table <- function(x) x$ColumnNames()
 
+#' @export
+rbind.Table <- function(...) {

Review Comment:
   There is a ConcatenateTables C++ function in table.h. Any reason not to use 
it?



##########
r/tests/testthat/test-RecordBatch.R:
##########
@@ -513,6 +513,76 @@ test_that("record_batch() with different length arrays", {
   expect_error(record_batch(a = 1:5, b = 1:6), msg)
 })
 
+test_that("RecordBatch doesn't support rbind", {
+  expect_snapshot_error(

Review Comment:
   Not a critique of this PR, just an observation: I find these 
`expect_snapshot_error` tests difficult as a reader. It's really not clear what 
they're doing, you have to look aside to another file and apparently count the 
number of errors in the file to see what message this particular call threw. 



##########
r/tests/testthat/test-Table.R:
##########
@@ -518,6 +518,95 @@ test_that("Table$create() no recycling with tibbles", {
   )
 })
 
+test_that("Table supports rbind", {
+  expect_error(
+    rbind(
+      Table$create(a = 1:10, b = Scalar$create(5)),
+      Table$create(a = c("a", "b"), b = Scalar$create(5))
+    ),
+    regexp = "Schema at index 2 does not match the first schema"
+  )
+
+  tables <- list(
+    Table$create(a = 1:10, b = Scalar$create("x")),
+    Table$create(a = 2:42, b = Scalar$create("y")),
+    Table$create(a = 8:10, b = Scalar$create("z"))
+  )
+  expected <- Table$create(do.call(rbind, lapply(tables, function(table) as.data.frame(table))))

Review Comment:
   ```suggestion
     expected <- Table$create(do.call(rbind, lapply(tables, as.data.frame)))
   ```



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to