This is an automated email from the ASF dual-hosted git repository.
paleolimbot pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-nanoarrow.git
The following commit(s) were added to refs/heads/main by this push:
new 05cdc1b5 refactor(r): Use JSON in experimental R vctrs extension type
(#533)
05cdc1b5 is described below
commit 05cdc1b5ae46bbad7c603ba33149f12305bfda59
Author: Dewey Dunnington <[email protected]>
AuthorDate: Thu Jun 20 16:51:16 2024 +0000
refactor(r): Use JSON in experimental R vctrs extension type (#533)
This PR updates the "vctrs extension type" to use JSON as its metadata
serialization. JSON is use for most other extension types and using it
here provides some workaround for non-R consumers that encounter this
type elsewhere.
The serializer and deserializer use the same format as
`jsonlite::serializeJSON()`, but restrict the types of objects that it
is willing to serialize or deserialize.
---
r/DESCRIPTION | 1 +
r/R/extension-vctrs.R | 105 +++++++++++++++++++++++++++++---
r/man/na_vctrs.Rd | 3 +-
r/tests/testthat/test-extension-vctrs.R | 82 ++++++++++++++++++++-----
r/tests/testthat/test-extension.R | 35 +++++++----
5 files changed, 189 insertions(+), 37 deletions(-)
diff --git a/r/DESCRIPTION b/r/DESCRIPTION
index 753ee341..d20eec34 100644
--- a/r/DESCRIPTION
+++ b/r/DESCRIPTION
@@ -27,6 +27,7 @@ Suggests:
bit64,
blob,
hms,
+ jsonlite,
rlang,
testthat (>= 3.0.0),
tibble,
diff --git a/r/R/extension-vctrs.R b/r/R/extension-vctrs.R
index bf910f3f..415bc242 100644
--- a/r/R/extension-vctrs.R
+++ b/r/R/extension-vctrs.R
@@ -31,7 +31,7 @@
#' @return A [nanoarrow_schema][as_nanoarrow_schema].
#' @export
#'
-#' @examples
+#' @examplesIf requireNamespace("jsonlite", quietly = TRUE)
#' vctr <- as.POSIXlt("2000-01-02 03:45", tz = "UTC")
#' array <- as_nanoarrow_array(vctr, schema = na_vctrs(vctr))
#' infer_nanoarrow_ptype(array)
@@ -44,12 +44,7 @@ na_vctrs <- function(ptype, storage_type = NULL) {
storage_type <- infer_nanoarrow_schema(vctrs::vec_data(ptype))
}
- # Note: a potential replacement for this is the JSON generated by the cereal
- # package; however, as of this writing that JSON doesn't handle arbitrary
nesting.
- # The arrow package currently uses the non-ASCII version; however, it
generally
- # makes life easier if the metadata is valid UTF-8. The deserializer works
with
- # either.
- na_extension(storage_type, "arrow.r.vctrs", serialize(ptype, NULL, ascii =
TRUE))
+ na_extension(storage_type, "arrow.r.vctrs", serialize_ptype(ptype))
}
register_vctrs_extension <- function() {
@@ -62,7 +57,7 @@ register_vctrs_extension <- function() {
#' @export
infer_nanoarrow_ptype_extension.nanoarrow_extension_spec_vctrs <-
function(extension_spec, x, ...) {
parsed <- .Call(nanoarrow_c_schema_parse, x)
- unserialize(parsed$extension_metadata)
+ unserialize_ptype(parsed$extension_metadata)
}
#' @export
@@ -102,3 +97,97 @@ as_nanoarrow_array_extension.nanoarrow_extension_spec_vctrs
<- function(
schema$metadata[["ARROW:extension:metadata"]]
)
}
+
+# The logic for serializing and deserializing prototypes is a subset of
+# the implementation in jsonlite. Unlike jsonlite, we don't need to handle
+# arbitrary attributes because vector prototypes typically do not contain
+# complex information like expression/language objects and environments.
+serialize_ptype <- function(x) {
+ type <- typeof(x)
+ type_serialized <- sprintf('"type":"%s"', type)
+
+ attrs <- attributes(x)
+ attributes(x) <- NULL
+ if (!is.null(attrs)) {
+ attr_names_serialized <- paste0('"', gsub('"', '\\"', names(attrs)), '"')
+ attr_values_serialized <- lapply(unname(attrs), serialize_ptype)
+ attrs_serialized <- sprintf(
+ '"attributes":{%s}',
+ paste0(attr_names_serialized, ":", attr_values_serialized, collapse =
",")
+ )
+ } else {
+ attrs_serialized <- NULL
+ }
+
+ if (identical(type, "NULL")) {
+ values_serialized <- NULL
+ } else if (identical(type, "raw")) {
+ values_serialized <- sprintf('"value":"%s"', jsonlite::base64_enc(x))
+ } else if(length(x) == 0) {
+ values_serialized <- '"value":[]'
+ } else {
+ values <- switch(
+ type,
+ character = paste0('"', gsub('"', '\\"', x), '"'),
+ complex = paste0('"', format(x, digits = 16, justify = "none", na.encode
= FALSE), '"'),
+ logical = tolower(as.character(x)),
+ integer = ,
+ double = format(x, digits = 16, justify = "none", na.encode = FALSE),
+ list = lapply(x, serialize_ptype),
+ stop(sprintf("storage '%s' is not supported by serialize_ptype", type))
+ )
+
+ values[is.na(x)] <- "null"
+ values_serialized <- sprintf(
+ '"value":[%s]',
+ paste(values, collapse = ",")
+ )
+ }
+
+ sprintf(
+ "{%s}",
+ paste(
+ c(type_serialized, attrs_serialized, values_serialized),
+ collapse = ","
+ )
+ )
+}
+
+unserialize_ptype <- function(x) {
+ if (is.raw(x)) {
+ x <- rawToChar(x)
+ }
+
+ unserialize_ptype_impl(jsonlite::fromJSON(x, simplifyVector = FALSE))
+}
+
+unserialize_ptype_impl <- function(x) {
+ if (identical(x$type, "NULL")) {
+ return(NULL)
+ } else if(identical(x$type, "raw")) {
+ return(jsonlite::base64_dec(x$value))
+ }
+
+ sanitizer <- switch(
+ x$type,
+ raw = as.raw,
+ complex = as.complex,
+ character = as.character,
+ logical = as.logical,
+ integer = as.integer,
+ double = as.double,
+ list = function(x) list(unserialize_ptype_impl(x)),
+ stop(sprintf("storage '%s' is not supported by unserialize_ptype", x$type))
+ )
+
+ na <- vector(x$type)[1]
+ x$value[vapply(x$value, is.null, logical(1))] <- na
+ x$value[vapply(x$value, identical, logical(1), "NA")] <- na
+ out <- vapply(x$value, sanitizer, na)
+
+ if (!is.null(x$attributes)) {
+ attributes(out) <- lapply(x$attributes, unserialize_ptype_impl)
+ }
+
+ out
+}
diff --git a/r/man/na_vctrs.Rd b/r/man/na_vctrs.Rd
index f1fbd244..c87dad6d 100644
--- a/r/man/na_vctrs.Rd
+++ b/r/man/na_vctrs.Rd
@@ -24,9 +24,10 @@ through Arrow memory. The vctrs extension type uses
\code{\link[vctrs:vec_data]{
\code{\link[=as_nanoarrow_array]{as_nanoarrow_array()}} and
\code{\link[=convert_array]{convert_array()}} to ensure roundtrip fidelity.
}
\examples{
+\dontshow{if (requireNamespace("jsonlite", quietly = TRUE)) (if (getRversion()
>= "3.4") withAutoprint else force)(\{ # examplesIf}
vctr <- as.POSIXlt("2000-01-02 03:45", tz = "UTC")
array <- as_nanoarrow_array(vctr, schema = na_vctrs(vctr))
infer_nanoarrow_ptype(array)
convert_array(array)
-
+\dontshow{\}) # examplesIf}
}
diff --git a/r/tests/testthat/test-extension-vctrs.R
b/r/tests/testthat/test-extension-vctrs.R
index cb6d840b..dabbff84 100644
--- a/r/tests/testthat/test-extension-vctrs.R
+++ b/r/tests/testthat/test-extension-vctrs.R
@@ -17,8 +17,8 @@
test_that("vctrs extension type can roundtrip built-in vector types", {
skip_if_not_installed("tibble")
+ skip_if_not_installed("jsonlite")
- # Arrow tibbleifies everything, so we do here too
# Lists aren't automatically handled in nanoarrow conversion, so they
# aren't listed here yet.
vectors <- list(
@@ -30,7 +30,7 @@ test_that("vctrs extension type can roundtrip built-in vector
types", {
posixlt = as.POSIXlt("2000-01-01 12:23", tz = "UTC"),
date = as.Date("2000-01-01"),
difftime = as.difftime(123, units = "secs"),
- data_frame_simple = tibble::tibble(x = 1:5),
+ data_frame_simple = data.frame(x = 1:5),
data_frame_nested = tibble::tibble(x = 1:5, y = tibble::tibble(z =
letters[1:5]))
)
@@ -54,20 +54,6 @@ test_that("vctrs extension type can roundtrip built-in
vector types", {
# Roundtrip with multiple chunks
stream <- basic_array_stream(list(array, array))
expect_identical(convert_array_stream(stream), vctrs::vec_rep(vctr, 2))
-
- if (requireNamespace("arrow", quietly = TRUE)) {
- # Roundtrip from nanoarrow -> arrow -> R
- arrow_array <- arrow::as_arrow_array(array)
- expect_s3_class(arrow_array, "ExtensionArray")
- expect_identical(arrow_array$type$ptype(), ptype)
- expect_identical(arrow_array$as_vector(), vctr)
-
- # Roundtrip from arrow -> nanoarrow -> R
- arrow_array <- arrow::vctrs_extension_array(vctr)
- array <- as_nanoarrow_array(vctr, schema = schema)
- expect_identical(infer_nanoarrow_ptype(array), ptype)
- expect_identical(convert_array(array), vctr)
- }
}
})
@@ -83,3 +69,67 @@ test_that("vctrs extension type respects `to` in
convert_array()", {
vctrs::vec_cast(vctr, as.POSIXct(character()))
)
})
+
+test_that("serialize_ptype() can roundtrip R objects", {
+ skip_if_not_installed("jsonlite")
+
+ vectors <- list(
+ null = NULL,
+ raw = as.raw(c(0x00, 0x01, 0x02)),
+ lgl = c(FALSE, TRUE, NA),
+ int = c(0L, 1L, NA_integer_),
+ dbl = c(0, 1, pi, NA_real_),
+ chr = c("a", NA_character_),
+ cmplx = c(complex(real = 1:3, imaginary = 3:1), NA_complex_),
+ list = list(1, 2, x = 3, NULL),
+
+ raw0 = raw(),
+ lgl0 = logical(),
+ int0 = integer(),
+ dbl0 = double(),
+ chr0 = character(),
+ cmplx0 = complex(),
+ list0 = list(),
+
+ posixct = as.POSIXct("2000-01-01 12:23", tz = "UTC"),
+ posixlt = as.POSIXlt("2000-01-01 12:23", tz = "UTC"),
+ date = as.Date("2000-01-01"),
+ difftime = as.difftime(123, units = "secs"),
+ data_frame_simple = data.frame(x = 1:5),
+ data_frame_nested = tibble::tibble(x = 1:5, y = tibble::tibble(z =
letters[1:5]))
+ )
+
+ for (obj in vectors) {
+ # Check that our serializer/deserializer can roundtrip
+ expect_identical(
+ unserialize_ptype(serialize_ptype(obj)),
+ obj
+ )
+
+ # Check that our generated JSON is compatible with jsonlite's serde
+ expect_identical(
+ jsonlite::unserializeJSON(serialize_ptype(obj)),
+ obj
+ )
+
+ expect_identical(
+ unserialize_ptype(jsonlite::serializeJSON(obj, digits = 16)),
+ obj
+ )
+ }
+})
+
+test_that("serialize_ptype() errors for unsupported R objects", {
+ skip_if_not_installed("jsonlite")
+
+ expect_error(
+ serialize_ptype(quote(cat("I will eat you"))),
+ "storage 'language' is not supported by serialize_ptype"
+ )
+
+ expect_error(
+ unserialize_ptype(jsonlite::serializeJSON(quote(cat("I will eat you")))),
+ "storage 'language' is not supported by unserialize_ptype"
+ )
+
+})
diff --git a/r/tests/testthat/test-extension.R
b/r/tests/testthat/test-extension.R
index 5935394c..b57c24f1 100644
--- a/r/tests/testthat/test-extension.R
+++ b/r/tests/testthat/test-extension.R
@@ -26,18 +26,18 @@ test_that("extension types can be registered and
unregistered", {
test_that("infer_nanoarrow_ptype() dispatches on registered extension spec", {
register_nanoarrow_extension(
"some_ext",
- nanoarrow_extension_spec(subclass = "some_spec_class")
+ nanoarrow_extension_spec(subclass = "some_spec_class0")
)
on.exit(unregister_nanoarrow_extension("some_ext"))
- infer_nanoarrow_ptype_extension.some_spec_class <- function(spec, x, ...) {
+ infer_nanoarrow_ptype_extension.some_spec_class0 <- function(spec, x, ...) {
infer_nanoarrow_ptype_extension(NULL, x, ..., warn_unregistered = FALSE)
}
s3_register(
"nanoarrow::infer_nanoarrow_ptype_extension",
- "some_spec_class",
- infer_nanoarrow_ptype_extension.some_spec_class
+ "some_spec_class0",
+ infer_nanoarrow_ptype_extension.some_spec_class0
)
expect_identical(
@@ -51,18 +51,29 @@ test_that("infer_nanoarrow_ptype() dispatches on registered
extension spec", {
test_that("convert_array() dispatches on registered extension spec", {
register_nanoarrow_extension(
"some_ext",
- nanoarrow_extension_spec(subclass = "some_spec_class")
+ nanoarrow_extension_spec(subclass = "some_spec_class1")
)
on.exit(unregister_nanoarrow_extension("some_ext"))
- convert_array_extension.some_spec_class <- function(spec, array, to, ...) {
+ # Use unique spec class names to avoid interdependency between tests
+ convert_array_extension.some_spec_class1 <- function(spec, array, to, ...) {
convert_array_extension(NULL, array, to, ..., warn_unregistered = FALSE)
}
+ infer_nanoarrow_ptype_extension.some_spec_class1 <- function(spec, x, ...) {
+ infer_nanoarrow_ptype_extension(NULL, x, ..., warn_unregistered = FALSE)
+ }
+
s3_register(
"nanoarrow::convert_array_extension",
- "some_spec_class",
- convert_array_extension.some_spec_class
+ "some_spec_class1",
+ convert_array_extension.some_spec_class1
+ )
+
+ s3_register(
+ "nanoarrow::infer_nanoarrow_ptype_extension",
+ "some_spec_class1",
+ infer_nanoarrow_ptype_extension.some_spec_class1
)
expect_identical(
@@ -76,7 +87,7 @@ test_that("convert_array() dispatches on registered extension
spec", {
test_that("as_nanoarrow_array() dispatches on registered extension spec", {
register_nanoarrow_extension(
"some_ext",
- nanoarrow_extension_spec(subclass = "some_spec_class")
+ nanoarrow_extension_spec(subclass = "some_spec_class2")
)
on.exit(unregister_nanoarrow_extension("some_ext"))
@@ -91,14 +102,14 @@ test_that("as_nanoarrow_array() dispatches on registered
extension spec", {
"not implemented for extension"
)
- as_nanoarrow_array_extension.some_spec_class <- function(spec, x, ...,
schema = NULL) {
+ as_nanoarrow_array_extension.some_spec_class2 <- function(spec, x, ...,
schema = NULL) {
nanoarrow_extension_array(x, "some_ext")
}
s3_register(
"nanoarrow::as_nanoarrow_array_extension",
- "some_spec_class",
- as_nanoarrow_array_extension.some_spec_class
+ "some_spec_class2",
+ as_nanoarrow_array_extension.some_spec_class2
)
ext_array <- as_nanoarrow_array(