This is an automated email from the ASF dual-hosted git repository.
thisisnic pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/main by this push:
new 1a428c902f GH-50163: [R] Bug: Partial matching on $metadata$r causes
errors with schema metadata keys starting with "r" (#50178)
1a428c902f is described below
commit 1a428c902f0d1d6b9924d6fd55fad2e242fc5b93
Author: Nic Crane <[email protected]>
AuthorDate: Wed Jun 17 10:40:44 2026 +0100
GH-50163: [R] Bug: Partial matching on $metadata$r causes errors with
schema metadata keys starting with "r" (#50178)
### Rationale for this change
R's `$` operator does partial matching on names, so `$metadata$r` could pick up a schema metadata key that merely begins with "r" (e.g. `rachel`), which triggered errors
### What changes are included in this PR?
Access fields via `[[` not `$`
### Are these changes tested?
Yes
### Are there any user-facing changes?
No
* GitHub Issue: #50163
Authored-by: Nic Crane <[email protected]>
Signed-off-by: Nic Crane <[email protected]>
---
r/R/arrow-tabular.R | 6 +++---
r/R/arrowExports.R | 16 +++++++--------
r/R/dataset-write.R | 2 +-
r/R/dplyr-collect.R | 2 +-
r/R/dplyr-group-by.R | 8 ++++----
r/R/feather.R | 2 +-
r/R/metadata.R | 6 +++---
r/R/schema.R | 10 ++++-----
r/extra-tests/test-read-files.R | 2 +-
r/man/Schema-class.Rd | 2 +-
r/tests/testthat/test-metadata.R | 44 ++++++++++++++++++++++++----------------
r/vignettes/metadata.Rmd | 2 +-
12 files changed, 55 insertions(+), 47 deletions(-)
diff --git a/r/R/arrow-tabular.R b/r/R/arrow-tabular.R
index b15a25075f..a83226a728 100644
--- a/r/R/arrow-tabular.R
+++ b/r/R/arrow-tabular.R
@@ -82,10 +82,10 @@ ArrowTabular <- R6Class(
# Helper for the R metadata that handles the serialization
# See also method on Schema
if (missing(new)) {
- self$metadata$r
+ self$metadata[["r"]]
} else {
# Set the R metadata
- self$metadata$r <- new
+ self$metadata[["r"]] <- new
self
}
}
@@ -95,7 +95,7 @@ ArrowTabular <- R6Class(
#' @export
as.data.frame.ArrowTabular <- function(x, row.names = NULL, optional = FALSE,
...) {
df <- x$to_data_frame()
- out <- apply_arrow_r_metadata(df, x$metadata$r)
+ out <- apply_arrow_r_metadata(df, x$metadata[["r"]])
as.data.frame(out, row.names = row.names, optional = optional, ...)
}
diff --git a/r/R/arrowExports.R b/r/R/arrowExports.R
index 52274d29f0..22e66e2243 100644
--- a/r/R/arrowExports.R
+++ b/r/R/arrowExports.R
@@ -1248,14 +1248,6 @@ Field__Equals <- function(field, other, check_metadata) {
.Call(`_arrow_Field__Equals`, field, other, check_metadata)
}
-Field__nullable <- function(field) {
- .Call(`_arrow_Field__nullable`, field)
-}
-
-Field__type <- function(field) {
- .Call(`_arrow_Field__type`, field)
-}
-
Field__HasMetadata <- function(field) {
.Call(`_arrow_Field__HasMetadata`, field)
}
@@ -1272,6 +1264,14 @@ Field__RemoveMetadata <- function(field) {
.Call(`_arrow_Field__RemoveMetadata`, field)
}
+Field__nullable <- function(field) {
+ .Call(`_arrow_Field__nullable`, field)
+}
+
+Field__type <- function(field) {
+ .Call(`_arrow_Field__type`, field)
+}
+
fs___FileInfo__type <- function(x) {
.Call(`_arrow_fs___FileInfo__type`, x)
}
diff --git a/r/R/dataset-write.R b/r/R/dataset-write.R
index dd675b40d0..dac3ee8798 100644
--- a/r/R/dataset-write.R
+++ b/r/R/dataset-write.R
@@ -154,7 +154,7 @@ write_dataset <- function(
if (inherits(dataset, "grouped_df")) {
force(partitioning)
# Drop the grouping metadata before writing; we've already consumed it
- # now to construct `partitioning` and don't want it in the metadata$r
+ # now to construct `partitioning` and don't want it in the metadata[["r"]]
dataset <- dplyr::ungroup(dataset)
}
dataset <- as_adq(dataset)
diff --git a/r/R/dplyr-collect.R b/r/R/dplyr-collect.R
index d109b15a62..b9461df20d 100644
--- a/r/R/dplyr-collect.R
+++ b/r/R/dplyr-collect.R
@@ -24,7 +24,7 @@ collect.arrow_dplyr_query <- function(x, as_data_frame = TRUE, ...) {
collect.ArrowTabular <- function(x, as_data_frame = TRUE, ...) {
if (as_data_frame) {
df <- x$to_data_frame()
- apply_arrow_r_metadata(df, x$metadata$r)
+ apply_arrow_r_metadata(df, x$metadata[["r"]])
} else {
x
}
diff --git a/r/R/dplyr-group-by.R b/r/R/dplyr-group-by.R
index 268bf064af..52ea38f123 100644
--- a/r/R/dplyr-group-by.R
+++ b/r/R/dplyr-group-by.R
@@ -53,7 +53,7 @@ group_vars.arrow_dplyr_query <- function(x) x$group_by_vars
group_vars.Dataset <- function(x) character()
group_vars.RecordBatchReader <- function(x) character()
group_vars.ArrowTabular <- function(x) {
- x$metadata$r$attributes$.group_vars %||% character()
+ x$metadata[["r"]]$attributes$.group_vars %||% character()
}
# the logical literal in the two functions below controls the default value of
@@ -62,7 +62,7 @@ group_by_drop_default.arrow_dplyr_query <- function(.tbl) {
.tbl$drop_empty_groups %||% TRUE
}
group_by_drop_default.ArrowTabular <- function(.tbl) {
- .tbl$metadata$r$attributes$.group_by_drop %||% TRUE
+ .tbl$metadata[["r"]]$attributes$.group_by_drop %||% TRUE
}
group_by_drop_default.Dataset <- group_by_drop_default.RecordBatchReader <-
function(.tbl) TRUE
@@ -84,11 +84,11 @@ set_group_attributes <- function(tab, group_vars, .drop) {
# so passing NULL means unset (ungroup)
if (is.null(group_vars) || length(group_vars)) {
# Since accessing schema metadata does some work, only overwrite if needed
- new_atts <- old_atts <- tab$metadata$r$attributes %||% list()
+ new_atts <- old_atts <- tab$metadata[["r"]]$attributes %||% list()
new_atts[[".group_vars"]] <- group_vars
new_atts[[".group_by_drop"]] <- .drop
if (!identical(new_atts, old_atts)) {
- tab$metadata$r$attributes <- new_atts
+ tab$metadata[["r"]]$attributes <- new_atts
}
}
tab
diff --git a/r/R/feather.R b/r/R/feather.R
index 4008041106..23d5e4ed20 100644
--- a/r/R/feather.R
+++ b/r/R/feather.R
@@ -205,7 +205,7 @@ read_feather <- function(file, col_select = NULL, as_data_frame = TRUE, mmap = T
if (isTRUE(as_data_frame)) {
df <- out$to_data_frame()
- out <- apply_arrow_r_metadata(df, out$metadata$r)
+ out <- apply_arrow_r_metadata(df, out$metadata[["r"]])
}
out
}
diff --git a/r/R/metadata.R b/r/R/metadata.R
index dbc8c6b639..b7d7609e23 100644
--- a/r/R/metadata.R
+++ b/r/R/metadata.R
@@ -48,7 +48,7 @@
if (getOption("arrow.debug", FALSE)) {
print(conditionMessage(e))
}
- warning("Invalid metadata$r", call. = FALSE)
+ warning('Invalid metadata$[["r"]]', call. = FALSE)
NULL
})
}
@@ -228,7 +228,7 @@ apply_arrow_r_metadata <- function(x, r_metadata) {
}
},
error = function(e) {
- warning("Invalid metadata$r", call. = FALSE)
+ warning('Invalid metadata$[["r"]]', call. = FALSE)
}
)
x
@@ -323,7 +323,7 @@ arrow_attributes <- function(x, only_top_level = FALSE) {
get_r_metadata_from_old_schema <- function(new_schema, old_schema) {
# TODO: do we care about other (non-R) metadata preservation?
# How would we know if it were meaningful?
- r_meta <- old_schema$metadata$r
+ r_meta <- old_schema$metadata[["r"]]
if (!is.null(r_meta)) {
# Filter r_metadata$columns on columns with name _and_ type match
common_names <- intersect(names(r_meta$columns), names(new_schema))
diff --git a/r/R/schema.R b/r/R/schema.R
index 7a6e38c95e..a02753fac5 100644
--- a/r/R/schema.R
+++ b/r/R/schema.R
@@ -65,7 +65,7 @@
#' schema (column names and types) which is compatible with other Arrow
#' clients. The R metadata is only read by R and is ignored by other clients
#' (e.g. Pandas has its own custom metadata). This metadata is stored in
-#' `$metadata$r`.
+#' `$metadata[["r"]]`.
#'
#' Since Schema metadata keys and values must be strings, this metadata is
#' saved by serializing R's attribute list structure to a string. If the
@@ -138,8 +138,8 @@ Schema <- R6Class(
renamed_schema <- Schema__WithNames(self, names)
# if we have R metadata containing column names, update names there too
- if (!is.null(existing_metadata$r$columns)) {
- names(existing_metadata$r$columns) <- names
+ if (!is.null(existing_metadata[["r"]]$columns)) {
+ names(existing_metadata[["r"]]$columns) <- names
}
renamed_schema$WithMetadata(existing_metadata)
}
@@ -176,10 +176,10 @@ Schema <- R6Class(
# Helper for the R metadata that handles the serialization
# See also method on ArrowTabular
if (missing(new)) {
- self$metadata$r
+ self$metadata[["r"]]
} else {
# Set the R metadata
- self$metadata$r <- new
+ self$metadata[["r"]] <- new
self
}
}
diff --git a/r/extra-tests/test-read-files.R b/r/extra-tests/test-read-files.R
index 4d4ecf85fa..ced366d2f5 100644
--- a/r/extra-tests/test-read-files.R
+++ b/r/extra-tests/test-read-files.R
@@ -183,7 +183,7 @@ test_that("Can see the extra metadata (parquet)", {
if (if_version_less_than("3.0.0")) {
expect_warning(
df <- read_parquet(pq_file),
- "Invalid metadata$r",
+ "Invalid metadata",
fixed = TRUE
)
expect_s3_class(df, "tbl")
diff --git a/r/man/Schema-class.Rd b/r/man/Schema-class.Rd
index ecd216af07..df4d7d7f75 100644
--- a/r/man/Schema-class.Rd
+++ b/r/man/Schema-class.Rd
@@ -56,7 +56,7 @@ re-create them when pulled back into R. This metadata is separate from the
schema (column names and types) which is compatible with other Arrow
clients. The R metadata is only read by R and is ignored by other clients
(e.g. Pandas has its own custom metadata). This metadata is stored in
-\verb{$metadata$r}.
+\verb{$metadata[["r"]]}.
Since Schema metadata keys and values must be strings, this metadata is
saved by serializing R's attribute list structure to a string. If the
diff --git a/r/tests/testthat/test-metadata.R b/r/tests/testthat/test-metadata.R
index 90b9f599ec..fea4578635 100644
--- a/r/tests/testthat/test-metadata.R
+++ b/r/tests/testthat/test-metadata.R
@@ -62,16 +62,16 @@ test_that("Table R metadata", {
test_that("R metadata is not stored for types that map to Arrow types (factor, Date, etc.)", {
tab <- Table$create(example_data[1:6])
- expect_null(tab$metadata$r)
+ expect_null(tab$metadata[["r"]])
- expect_null(Table$create(example_with_times[1:3])$metadata$r)
+ expect_null(Table$create(example_with_times[1:3])$metadata[["r"]])
})
test_that("R metadata is not stored for ExtensionType columns", {
tab <- Table$create(
x = vctrs::new_vctr(1:5, class = "special_integer")
)
- expect_null(tab$metadata$r)
+ expect_null(tab$metadata[["r"]])
})
test_that("classes are not stored for arrow_binary/arrow_large_binary/arrow_fixed_size_binary (ARROW-14140)", {
@@ -81,13 +81,13 @@ test_that("classes are not stored for arrow_binary/arrow_large_binary/arrow_fixe
large_binary <- Array$create(list(raws), large_binary())
fixed_size_binary <- Array$create(list(raws), fixed_size_binary(7L))
- expect_null(RecordBatch$create(b = binary)$metadata$r)
- expect_null(RecordBatch$create(b = large_binary)$metadata$r)
- expect_null(RecordBatch$create(b = fixed_size_binary)$metadata$r)
+ expect_null(RecordBatch$create(b = binary)$metadata[["r"]])
+ expect_null(RecordBatch$create(b = large_binary)$metadata[["r"]])
+ expect_null(RecordBatch$create(b = fixed_size_binary)$metadata[["r"]])
- expect_null(Table$create(b = binary)$metadata$r)
- expect_null(Table$create(b = large_binary)$metadata$r)
- expect_null(Table$create(b = fixed_size_binary)$metadata$r)
+ expect_null(Table$create(b = binary)$metadata[["r"]])
+ expect_null(Table$create(b = large_binary)$metadata[["r"]])
+ expect_null(Table$create(b = fixed_size_binary)$metadata[["r"]])
})
test_that("Garbage R metadata doesn't break things", {
@@ -95,7 +95,7 @@ test_that("Garbage R metadata doesn't break things", {
tab$metadata$r <- "garbage"
expect_warning(
as.data.frame(tab),
- "Invalid metadata$r",
+ 'Invalid metadata$[["r"]]',
fixed = TRUE
)
# serialize data like .serialize_arrow_r_metadata does, but don't call that
@@ -104,7 +104,7 @@ test_that("Garbage R metadata doesn't break things", {
tab$metadata$r <- rawToChar(serialize("garbage", NULL, ascii = TRUE))
expect_warning(
as.data.frame(tab),
- "Invalid metadata$r",
+ 'Invalid metadata$[["r"]]',
fixed = TRUE
)
@@ -113,7 +113,7 @@ test_that("Garbage R metadata doesn't break things", {
tab$metadata <- list(r = rawToChar(serialize(bad, NULL, ascii = TRUE)))
expect_warning(
as.data.frame(tab),
- "Invalid metadata$r",
+ 'Invalid metadata$[["r"]]',
fixed = TRUE
)
@@ -144,7 +144,7 @@ arbitrary\040code\040was\040just\040executed
expect_message(
expect_warning(
as.data.frame(tab),
- "Invalid metadata$r",
+ 'Invalid metadata$[["r"]]',
fixed = TRUE
),
NA
@@ -465,7 +465,7 @@ test_that("grouped_df metadata is recorded (efficiently)", {
expect_s3_class(grouped, "grouped_df")
grouped_tab <- Table$create(grouped)
expect_r6_class(grouped_tab, "Table")
- expect_equal(grouped_tab$metadata$r$attributes$.group_vars, "a")
+ expect_equal(grouped_tab$metadata[["r"]]$attributes$.group_vars, "a")
})
test_that("grouped_df non-arrow metadata is preserved", {
@@ -496,10 +496,10 @@ test_that("apply_arrow_r_metadata doesn't add in metadata from plain data.frame
plain_df <- data.frame(x = 1:5)
plain_df_arrow <- arrow_table(plain_df)
- expect_equal(plain_df_arrow$metadata$r$columns, list(x = NULL))
+ expect_equal(plain_df_arrow$metadata[["r"]]$columns, list(x = NULL))
plain_df_no_metadata <- plain_df_arrow$to_data_frame()
- plain_df_with_metadata <- apply_arrow_r_metadata(plain_df_no_metadata, plain_df_arrow$metadata$r)
+ plain_df_with_metadata <- apply_arrow_r_metadata(plain_df_no_metadata, plain_df_arrow$metadata[["r"]])
expect_identical(plain_df_no_metadata, plain_df_with_metadata)
@@ -507,13 +507,21 @@ test_that("apply_arrow_r_metadata doesn't add in metadata from plain data.frame
spicy_df_arrow <- arrow_table(haven_data)
expect_equal(
- spicy_df_arrow$metadata$r$columns,
+ spicy_df_arrow$metadata[["r"]]$columns,
list(num = list(attributes = list(format.spss = "F8.2"), columns = NULL),
cat_int = NULL, cat_chr = NULL)
)
spicy_df_no_metadata <- spicy_df_arrow$to_data_frame()
- spicy_df_with_metadata <- apply_arrow_r_metadata(spicy_df_no_metadata, spicy_df_arrow$metadata$r)
+ spicy_df_with_metadata <- apply_arrow_r_metadata(spicy_df_no_metadata, spicy_df_arrow$metadata[["r"]])
expect_null(attr(spicy_df_no_metadata$num, "format.spss"))
expect_equal(attr(spicy_df_with_metadata$num, "format.spss"), "F8.2")
})
+
+test_that("metadata keys starting with 'r' don't cause partial matching - GH-50163", {
+ tbl <- arrow_table(x = 1:3)
+ tbl <- tbl$cast(tbl$schema$WithMetadata(list(rachel = "some_value")))
+
+ expect_no_warning(as.data.frame(tbl))
+ expect_no_warning(collect.ArrowTabular(tbl))
+})
diff --git a/r/vignettes/metadata.Rmd b/r/vignettes/metadata.Rmd
index 813b1075f2..3c1cd7315b 100644
--- a/r/vignettes/metadata.Rmd
+++ b/r/vignettes/metadata.Rmd
@@ -74,7 +74,7 @@ tb$metadata$new_key <- "new value"
Metadata attached to a Schema is preserved when writing the Table to Arrow/Feather or Parquet formats. When reading those files into R, or when calling `as.data.frame()` on a Table or RecordBatch, the column attributes are restored to the columns of the resulting `data.frame`. This means that custom data types, including `haven::labelled`, `vctrs` annotations, and others, are preserved when doing a round-trip through Arrow.
-Note that the attributes stored in `$metadata$r` are only understood by R. If you write a `data.frame` with `haven` columns to a Feather file and read that in Pandas, the `haven` metadata won't be recognized there. Similarly, Pandas writes its own custom metadata, which the R package does not consume. You are free, however, to define custom metadata conventions for your application and assign any (string) values you want to other metadata keys.
+Note that the attributes stored in `$metadata[["r"]]` are only understood by R. If you write a `data.frame` with `haven` columns to a Feather file and read that in Pandas, the `haven` metadata won't be recognized there. Similarly, Pandas writes its own custom metadata, which the R package does not consume. You are free, however, to define custom metadata conventions for your application and assign any (string) values you want to other metadata keys.
## Further reading