This is an automated email from the ASF dual-hosted git repository.
thisisnic pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/main by this push:
new c16a8b22d1 GH-40640: [R] to_arrow() loses group_by() (#49713)
c16a8b22d1 is described below
commit c16a8b22d144d6685ef97c3ef676091dd9c86f1f
Author: Nic Crane <[email protected]>
AuthorDate: Sun Apr 12 09:40:27 2026 +0100
GH-40640: [R] to_arrow() loses group_by() (#49713)
### Rationale for this change
A roundtrip through duckdb (`to_duckdb()` followed by `to_arrow()`) loses any grouping previously set with `group_by()`.
### What changes are included in this PR?
Capture the dplyr groups before running the query and reapply them to the result returned by `to_arrow()`.
### Are these changes tested?
Yes — a test verifying that grouping survives the duckdb roundtrip is added to `test-duckdb.R`.
### Are there any user-facing changes?
No API changes; the only user-visible effect is the bug fix itself (grouping is now preserved through `to_arrow()`).
### AI usage
Done with Codex, but I went through it myself and am happy with it.
* GitHub Issue: #40640
Authored-by: Nic Crane <[email protected]>
Signed-off-by: Nic Crane <[email protected]>
---
r/R/duckdb.R | 10 +++++++++-
r/tests/testthat/test-duckdb.R | 12 ++++++++++++
2 files changed, 21 insertions(+), 1 deletion(-)
diff --git a/r/R/duckdb.R b/r/R/duckdb.R
index 5e5ad1497e..00266e9af6 100644
--- a/r/R/duckdb.R
+++ b/r/R/duckdb.R
@@ -183,9 +183,17 @@ to_arrow <- function(.data) {
)
}
+ groups <- dplyr::groups(.data)
+
# Run the query
res <- DBI::dbSendQuery(dbplyr::remote_con(.data),
dbplyr::remote_query(.data), arrow = TRUE)
reader <- duckdb::duckdb_fetch_record_batch(res)
- MakeSafeRecordBatchReader(reader)
+ out <- MakeSafeRecordBatchReader(reader)
+
+ if (length(groups)) {
+ out <- dplyr::group_by(out, !!!groups)
+ }
+
+ out
}
diff --git a/r/tests/testthat/test-duckdb.R b/r/tests/testthat/test-duckdb.R
index 4bc3e642c4..8b572268e3 100644
--- a/r/tests/testthat/test-duckdb.R
+++ b/r/tests/testthat/test-duckdb.R
@@ -190,6 +190,18 @@ test_that("to_arrow roundtrip, with dataset (without wrapping)", {
expect_r6_class(out, "RecordBatchReader")
})
+test_that("to_arrow preserves grouping from duckdb tables", {
+ ds <- InMemoryDataset$create(example_data)
+
+ out <- ds |>
+ to_duckdb() |>
+ group_by(lgl) |>
+ to_arrow()
+
+ expect_s3_class(out, "arrow_dplyr_query")
+ expect_equal(dplyr::group_vars(out), "lgl")
+})
+
# The next set of tests use an already-extant connection to test features of
# persistence and querying against the table without using the `tbl` itself, so
# we need to create a connection separate from the ephemeral one that is made