This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/master by this push:
new 6450527c4f0 feat: encode FixedSizeBinary in JSON as hex string (#5622)
6450527c4f0 is described below
commit 6450527c4f07fd5aa81282701bb380fdc18d6458
Author: Trevor Hilton <[email protected]>
AuthorDate: Fri Apr 12 13:16:02 2024 -0400
feat: encode FixedSizeBinary in JSON as hex string (#5622)
* feat: encode FixedSizeBinary in JSON as hex
Adds encoding support to the JSON writer for the FixedSizeBinary DataType;
values are written as hex strings. A test was added as well.
* fix: properly encode fixed size binary as string
The fixed size binary values were not being encoded with surrounding
double quotes. This fixes that, and updates the added test to actually
parse the written JSON as JSON, using serde_json, and make assertions
against that.
* chore: remove unused hex dep in arrow-json
* refactor: check for null serialization of fixedsizebinary in JSON
* refactor: extend explicit nulls to the FixedSizeBinaryEncoder
Have the FixedSizeBinaryEncoder for the JSON writer handle explicit null
values, based on the Writer's EncoderOptions.
* refactor: borrow array in JSON fixed size binary encoder
Changed the FixedSizeBinaryEncoder for the JSON writer to use a borrow
of the FixedSizeBinaryArray being encoded, to follow other Encoder
implementations, and to remove the use of clone.
* refactor: remove need for clone in JSON encoder types
BooleanEncoder and StringEncoder were changed to use borrows of their
respective Array types, to avoid cloning.
* refactor: remove null handling in JSON FixedSizeBinaryEncoder
The FixedSizeBinaryEncoder does not need to handle nulls, as that will
be handled by a parent encoder, i.e., list/map.
---
arrow-json/src/writer.rs | 80 +++++++++++++++++++++++++++++++++++++++-
arrow-json/src/writer/encoder.rs | 38 +++++++++++++++----
2 files changed, 110 insertions(+), 8 deletions(-)
diff --git a/arrow-json/src/writer.rs b/arrow-json/src/writer.rs
index 9f63b811d74..85a81d98e1b 100644
--- a/arrow-json/src/writer.rs
+++ b/arrow-json/src/writer.rs
@@ -833,7 +833,9 @@ mod tests {
use serde_json::json;
- use arrow_array::builder::{Int32Builder, Int64Builder, MapBuilder, StringBuilder};
+ use arrow_array::builder::{
+ FixedSizeBinaryBuilder, Int32Builder, Int64Builder, MapBuilder, StringBuilder,
+ };
use arrow_buffer::{Buffer, NullBuffer, OffsetBuffer, ToByteSlice};
use arrow_data::ArrayData;
@@ -2137,4 +2139,80 @@ mod tests {
Ok(())
}
+
+ #[test]
+ fn test_writer_fixed_size_binary() { // FixedSizeBinary columns must be written to JSON as hex strings
+ // set up schema: a single column named "bytes"
+ let size = 11; // both sample values below are exactly 11 bytes long
+ let schema = SchemaRef::new(Schema::new(vec![Field::new(
+ "bytes",
+ DataType::FixedSizeBinary(size),
+ true, // nullable, because the batch below contains a null row
+ )]));
+
+ // build record batch: three rows (value, null, value)
+ let mut builder = FixedSizeBinaryBuilder::new(size);
+ let values = [Some(b"hello world"), None, Some(b"summer rain")];
+ for value in values {
+ match value {
+ Some(v) => builder.append_value(v).unwrap(),
+ None => builder.append_null(),
+ }
+ }
+ let array = Arc::new(builder.finish()) as ArrayRef;
+ let batch = RecordBatch::try_new(schema, vec![array]).unwrap();
+
+ // encode and check JSON with explicit nulls:
+ {
+ let mut buf = Vec::new();
+ let json_value: Value = {
+ let mut writer = WriterBuilder::new()
+ .with_explicit_nulls(true)
+ .build::<_, JsonArray>(&mut buf);
+ writer.write(&batch).unwrap();
+ writer.close().unwrap();
+ serde_json::from_slice(&buf).unwrap() // parse the output back, so the check is on real JSON
+ };
+
+ assert_eq!(
+ json!([
+ {
+ "bytes": "68656c6c6f20776f726c64" // lowercase hex of the bytes of: hello world
+ },
+ {
+ "bytes": null // the explicit null
+ },
+ {
+ "bytes": "73756d6d6572207261696e" // lowercase hex of the bytes of: summer rain
+ }
+ ]),
+ json_value,
+ );
+ }
+ // encode and check JSON with no explicit nulls:
+ {
+ let mut buf = Vec::new();
+ let json_value: Value = {
+ // explicit nulls are off by default, so we don't need
+ // to set that when creating the writer:
+ let mut writer = ArrayWriter::new(&mut buf);
+ writer.write(&batch).unwrap();
+ writer.close().unwrap();
+ serde_json::from_slice(&buf).unwrap() // parse the output back, so the check is on real JSON
+ };
+
+ assert_eq!(
+ json!([
+ {
+ "bytes": "68656c6c6f20776f726c64" // lowercase hex of the bytes of: hello world
+ },
+ {}, // empty because nulls are omitted
+ {
+ "bytes": "73756d6d6572207261696e" // lowercase hex of the bytes of: summer rain
+ }
+ ]),
+ json_value,
+ );
+ }
+ }
}
diff --git a/arrow-json/src/writer/encoder.rs b/arrow-json/src/writer/encoder.rs
index 87efcb9f39a..113dc5dfc75 100644
--- a/arrow-json/src/writer/encoder.rs
+++ b/arrow-json/src/writer/encoder.rs
@@ -69,16 +69,16 @@ fn make_encoder_impl<'a>(
DataType::Float64 => primitive_helper!(Float64Type),
DataType::Boolean => {
let array = array.as_boolean();
- (Box::new(BooleanEncoder(array.clone())), array.nulls().cloned())
+ (Box::new(BooleanEncoder(array)), array.nulls().cloned())
}
DataType::Null => (Box::new(NullEncoder), array.logical_nulls()),
DataType::Utf8 => {
let array = array.as_string::<i32>();
- (Box::new(StringEncoder(array.clone())) as _, array.nulls().cloned())
+ (Box::new(StringEncoder(array)) as _, array.nulls().cloned())
}
DataType::LargeUtf8 => {
let array = array.as_string::<i64>();
- (Box::new(StringEncoder(array.clone())) as _, array.nulls().cloned())
+ (Box::new(StringEncoder(array)) as _, array.nulls().cloned())
}
DataType::List(_) => {
let array = array.as_list::<i32>();
@@ -99,6 +99,11 @@ fn make_encoder_impl<'a>(
(Box::new(MapEncoder::try_new(array, options)?) as _, array.nulls().cloned())
}
+ DataType::FixedSizeBinary(_) => {
+ let array = array.as_any().downcast_ref::<FixedSizeBinaryArray>().unwrap();
+ (Box::new(FixedSizeBinaryEncoder::new(array)) as _, array.nulls().cloned())
+ }
+
DataType::Struct(fields) => {
let array = array.as_struct();
let encoders = fields.iter().zip(array.columns()).map(|(field, array)| {
@@ -259,9 +264,9 @@ impl<N: PrimitiveEncode> Encoder for PrimitiveEncoder<N> {
}
}
-struct BooleanEncoder(BooleanArray);
+struct BooleanEncoder<'a>(&'a BooleanArray);
-impl Encoder for BooleanEncoder {
+impl<'a> Encoder for BooleanEncoder<'a> {
fn encode(&mut self, idx: usize, out: &mut Vec<u8>) {
match self.0.value(idx) {
true => out.extend_from_slice(b"true"),
@@ -270,9 +275,9 @@ impl Encoder for BooleanEncoder {
}
}
-struct StringEncoder<O: OffsetSizeTrait>(GenericStringArray<O>);
+struct StringEncoder<'a, O: OffsetSizeTrait>(&'a GenericStringArray<O>);
-impl<O: OffsetSizeTrait> Encoder for StringEncoder<O> {
+impl<'a, O: OffsetSizeTrait> Encoder for StringEncoder<'a, O> {
fn encode(&mut self, idx: usize, out: &mut Vec<u8>) {
encode_string(self.0.value(idx), out);
}
@@ -443,3 +448,22 @@ impl<'a> Encoder for MapEncoder<'a> {
out.push(b'}');
}
}
+
+struct FixedSizeBinaryEncoder<'a>(&'a FixedSizeBinaryArray); // JSON encoder holding a borrow of the array it reads from
+
+impl<'a> FixedSizeBinaryEncoder<'a> {
+ fn new(array: &'a FixedSizeBinaryArray) -> Self { // stores the reference only; the array is not cloned
+ Self(array)
+ }
+}
+
+impl<'a> Encoder for FixedSizeBinaryEncoder<'a> {
+ fn encode(&mut self, idx: usize, out: &mut Vec<u8>) { // appends the value at `idx` to `out` as a double-quoted hex string; no null check is done here
+ out.push(b'"'); // opening quote of the JSON string
+ for byte in self.0.value(idx) {
+ // two lowercase hex digits per byte; this write is infallible because `out` is an in-memory Vec<u8>
+ write!(out, "{byte:02x}").unwrap();
+ }
+ out.push(b'"'); // closing quote of the JSON string
+ }
+}