nealrichardson commented on code in PR #12751:
URL: https://github.com/apache/arrow/pull/12751#discussion_r852119995
##########
r/R/record-batch.R:
##########
@@ -189,3 +189,62 @@ record_batch <- RecordBatch$create
#' @export
names.RecordBatch <- function(x) x$names()
+
+#' @export
+rbind.RecordBatch <- function(...) {
+ abort("Use `Table$create()` to combine record batches")
+}
+
+cbind_check_length <- function(target_length, length, idx, call =
caller_env()) {
Review Comment:
Do you need this function? I'm pretty sure the RecordBatch constructor in
C++ will check lengths (again).
##########
r/R/record-batch.R:
##########
@@ -189,3 +189,62 @@ record_batch <- RecordBatch$create
#' @export
names.RecordBatch <- function(x) x$names()
+
+#' @export
+rbind.RecordBatch <- function(...) {
+ abort("Use `Table$create()` to combine record batches")
+}
+
+cbind_check_length <- function(target_length, length, idx, call =
caller_env()) {
+ if (length != target_length) {
+ abort(
+ c("Non-scalar inputs must have an equal number of rows.",
+ i = sprintf("..1 has %d, ..%d has %d", target_length, idx, length)),
+ call = call
+ )
+ }
+}
+
+#' @export
+cbind.RecordBatch <- function(...) {
+ call <- sys.call()
+ inputs <- list(...)
+ num_rows <- inputs[[1]]$num_rows
+
+ # These names are only used for scalar or arrays
+ arg_names <- if (is.null(names(inputs))) character(length(inputs)) else
names(inputs)
+ arg_names <- make.names(arg_names, unique = TRUE)
+
+ batches <- map(seq_along(inputs), function(i) {
+ input <- inputs[[i]]
+ name <- arg_names[i]
+
+ if (inherits(input, "RecordBatch")) {
+ cbind_check_length(num_rows, input$num_rows, i, call)
+ input
+ } else if (inherits(input, "data.frame")) {
+ cbind_check_length(num_rows, nrow(input), i, call)
+ RecordBatch$create(input)
+ } else if (length(input) == 1) {
+ RecordBatch$create("{name}" := repeat_value_as_array(input, num_rows))
Review Comment:
Also just confirming: do you need this here or will the C++ constructor do
this for you?
##########
r/tests/testthat/test-RecordBatch.R:
##########
@@ -513,6 +513,76 @@ test_that("record_batch() with different length arrays", {
expect_error(record_batch(a = 1:5, b = 1:6), msg)
})
+test_that("RecordBatch doesn't support rbind", {
+ expect_snapshot_error(
+ rbind(
+ record_batch(a = 1:10),
+ record_batch(a = 2:4)
+ )
+ )
+})
+
+test_that("RecordBatch supports cbind", {
+ expect_snapshot_error(
+ cbind(
+ record_batch(a = 1:10),
+ record_batch(a = c("a", "b"))
+ )
+ )
+
+ actual <- cbind(
+ record_batch(a = c(1, 2), b = c("a", "b")),
+ record_batch(a = c("d", "c")),
+ record_batch(c = c(2, 3))
+ )
+ expected <- record_batch(
+ a = c(1, 2),
+ b = c("a", "b"),
+ a = c("d", "c"),
+ c = c(2, 3)
+ )
+ expect_equal(actual, expected)
+
+ # Handles arrays
+ expect_equal(
+ cbind(record_batch(a = 1:2), b = Array$create(4:5)),
+ record_batch(a = 1:2, b = 4:5)
+ )
+
+ # Handles data.frames on R 4.0 or greater
+ if (getRversion() >= "4.0.0") {
+ # Prior to R 4.0, cbind would short-circuit to the data.frame
implementation
+ # if **any** of the arguments are a data.frame.
+ expect_equal(
+ cbind(record_batch(a = 1:2), data.frame(b = 4:5)),
+ record_batch(a = 1:2, b = 4:5)
+ )
+ }
+
+
+ # Handles base factors
+ expect_equal(
+ cbind(record_batch(a = 1:2), b = factor(c("a", "b"))),
+ record_batch(a = 1:2, b = factor(c("a", "b")))
+ )
+
+ # Handles base scalars
+ expect_equal(
+ cbind(record_batch(a = 1:2), b = 1L),
+ record_batch(a = 1:2, b = rep(1L, 2))
+ )
+
+ # Handles unnamed arrays, even in cases where no named arguments are passed
Review Comment:
Is this desirable? FWIW we don't accept this directly in `record_batch()`
(though the error message is not awesome):
```
> record_batch(a=1:2, 3:4)
Error: Unknown: only data frames are allowed as unnamed arguments to be auto spliced
```
##########
r/R/table.R:
##########
@@ -149,6 +149,77 @@ Table$create <- function(..., schema = NULL) {
#' @export
names.Table <- function(x) x$ColumnNames()
+#' @export
+rbind.Table <- function(...) {
Review Comment:
There is a ConcatenateTables C++ function in table.h. Any reason not to use
it?
##########
r/tests/testthat/test-RecordBatch.R:
##########
@@ -513,6 +513,76 @@ test_that("record_batch() with different length arrays", {
expect_error(record_batch(a = 1:5, b = 1:6), msg)
})
+test_that("RecordBatch doesn't support rbind", {
+ expect_snapshot_error(
Review Comment:
Not a critique of this PR, just an observation: I find these
`expect_snapshot_error` tests difficult as a reader. It's really not clear what
they're doing; you have to look aside to another file and apparently count the
number of errors in the file to see what message this particular call threw.
##########
r/tests/testthat/test-Table.R:
##########
@@ -518,6 +518,95 @@ test_that("Table$create() no recycling with tibbles", {
)
})
+test_that("Table supports rbind", {
+ expect_error(
+ rbind(
+ Table$create(a = 1:10, b = Scalar$create(5)),
+ Table$create(a = c("a", "b"), b = Scalar$create(5))
+ ),
+ regexp = "Schema at index 2 does not match the first schema"
+ )
+
+ tables <- list(
+ Table$create(a = 1:10, b = Scalar$create("x")),
+ Table$create(a = 2:42, b = Scalar$create("y")),
+ Table$create(a = 8:10, b = Scalar$create("z"))
+ )
+ expected <- Table$create(do.call(rbind, lapply(tables, function(table)
as.data.frame(table))))
Review Comment:
```suggestion
expected <- Table$create(do.call(rbind, lapply(tables, as.data.frame)))
```
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]