This is an automated email from the ASF dual-hosted git repository.
liurenjie1024 pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/iceberg-rust.git
The following commit(s) were added to refs/heads/main by this push:
new 08c46fb remove binary serialize in literal (#456)
08c46fb is described below
commit 08c46fbb98c55ae284b31977e6307f40ef5f381e
Author: ZENOTME <[email protected]>
AuthorDate: Mon Jul 15 20:47:29 2024 +0800
remove binary serialize in literal (#456)
Co-authored-by: ZENOTME <[email protected]>
---
crates/iceberg/src/spec/manifest.rs | 17 +--
crates/iceberg/src/spec/manifest_list.rs | 6 +-
crates/iceberg/src/spec/values.rs | 172 +++++++++----------------------
3 files changed, 57 insertions(+), 138 deletions(-)
diff --git a/crates/iceberg/src/spec/manifest.rs
b/crates/iceberg/src/spec/manifest.rs
index f5a5984..dd14823 100644
--- a/crates/iceberg/src/spec/manifest.rs
+++ b/crates/iceberg/src/spec/manifest.rs
@@ -1203,13 +1203,11 @@ impl std::fmt::Display for DataFileFormat {
mod _serde {
use std::collections::HashMap;
- use serde_bytes::ByteBuf;
use serde_derive::{Deserialize, Serialize};
use serde_with::serde_as;
use crate::spec::Datum;
use crate::spec::Literal;
- use crate::spec::PrimitiveLiteral;
use crate::spec::RawLiteral;
use crate::spec::Schema;
use crate::spec::Struct;
@@ -1333,12 +1331,8 @@ mod _serde {
value_counts: Some(to_i64_entry(value.value_counts)?),
null_value_counts:
Some(to_i64_entry(value.null_value_counts)?),
nan_value_counts: Some(to_i64_entry(value.nan_value_counts)?),
- lower_bounds: Some(to_bytes_entry(
- value.lower_bounds.into_iter().map(|(k, v)| (k, v.into())),
- )),
- upper_bounds: Some(to_bytes_entry(
- value.upper_bounds.into_iter().map(|(k, v)| (k, v.into())),
- )),
+ lower_bounds: Some(to_bytes_entry(value.lower_bounds)),
+ upper_bounds: Some(to_bytes_entry(value.upper_bounds)),
key_metadata:
Some(serde_bytes::ByteBuf::from(value.key_metadata)),
split_offsets: Some(value.split_offsets),
equality_ids: Some(value.equality_ids),
@@ -1442,11 +1436,11 @@ mod _serde {
Ok(m)
}
- fn to_bytes_entry(v: impl IntoIterator<Item = (i32, PrimitiveLiteral)>) ->
Vec<BytesEntry> {
+ fn to_bytes_entry(v: impl IntoIterator<Item = (i32, Datum)>) ->
Vec<BytesEntry> {
v.into_iter()
.map(|e| BytesEntry {
key: e.0,
- value: Into::<ByteBuf>::into(e.1),
+ value: e.1.to_bytes(),
})
.collect()
}
@@ -1906,8 +1900,7 @@ mod tests {
partition: Struct::from_iter(
vec![
Some(
- Literal::try_from_bytes(&[120],
&Type::Primitive(PrimitiveType::String))
- .unwrap()
+ Literal::string("x"),
),
]
.into_iter()
diff --git a/crates/iceberg/src/spec/manifest_list.rs
b/crates/iceberg/src/spec/manifest_list.rs
index ec7e2d8..688bdef 100644
--- a/crates/iceberg/src/spec/manifest_list.rs
+++ b/crates/iceberg/src/spec/manifest_list.rs
@@ -675,7 +675,7 @@ pub struct FieldSummary {
/// [ManifestFileV1] and [ManifestFileV2] are internal struct that are only
used for serialization and deserialization.
pub(super) mod _serde {
use crate::{
- spec::{Datum, PrimitiveLiteral, PrimitiveType, StructType},
+ spec::{Datum, PrimitiveType, StructType},
Error,
};
pub use serde_bytes::ByteBuf;
@@ -965,8 +965,8 @@ pub(super) mod _serde {
.map(|v| FieldSummary {
contains_null: v.contains_null,
contains_nan: v.contains_nan,
- lower_bound: v.lower_bound.map(|v|
PrimitiveLiteral::from(v).into()),
- upper_bound: v.upper_bound.map(|v|
PrimitiveLiteral::from(v).into()),
+ lower_bound: v.lower_bound.map(|v| v.to_bytes()),
+ upper_bound: v.upper_bound.map(|v| v.to_bytes()),
})
.collect(),
)
diff --git a/crates/iceberg/src/spec/values.rs
b/crates/iceberg/src/spec/values.rs
index a905903..310bec1 100644
--- a/crates/iceberg/src/spec/values.rs
+++ b/crates/iceberg/src/spec/values.rs
@@ -382,7 +382,9 @@ impl Datum {
Datum { r#type, literal }
}
- /// Create iceberg value from bytes
+ /// Create iceberg value from bytes.
+ ///
+ /// See [this
spec](https://iceberg.apache.org/spec/#binary-single-value-serialization) for
reference.
pub fn try_from_bytes(bytes: &[u8], data_type: PrimitiveType) ->
Result<Self> {
let literal = match data_type {
PrimitiveType::Boolean => {
@@ -424,6 +426,34 @@ impl Datum {
Ok(Datum::new(data_type, literal))
}
+ /// Convert the value to bytes
+ ///
+ /// See [this
spec](https://iceberg.apache.org/spec/#binary-single-value-serialization) for
reference.
+ pub fn to_bytes(&self) -> ByteBuf {
+ match &self.literal {
+ PrimitiveLiteral::Boolean(val) => {
+ if *val {
+ ByteBuf::from([1u8])
+ } else {
+ ByteBuf::from([0u8])
+ }
+ }
+ PrimitiveLiteral::Int(val) => ByteBuf::from(val.to_le_bytes()),
+ PrimitiveLiteral::Long(val) => ByteBuf::from(val.to_le_bytes()),
+ PrimitiveLiteral::Float(val) => ByteBuf::from(val.to_le_bytes()),
+ PrimitiveLiteral::Double(val) => ByteBuf::from(val.to_le_bytes()),
+ PrimitiveLiteral::Date(val) => ByteBuf::from(val.to_le_bytes()),
+ PrimitiveLiteral::Time(val) => ByteBuf::from(val.to_le_bytes()),
+ PrimitiveLiteral::Timestamp(val) =>
ByteBuf::from(val.to_le_bytes()),
+ PrimitiveLiteral::Timestamptz(val) =>
ByteBuf::from(val.to_le_bytes()),
+ PrimitiveLiteral::String(val) => ByteBuf::from(val.as_bytes()),
+ PrimitiveLiteral::UUID(val) =>
ByteBuf::from(val.as_u128().to_be_bytes()),
+ PrimitiveLiteral::Fixed(val) => ByteBuf::from(val.as_slice()),
+ PrimitiveLiteral::Binary(val) => ByteBuf::from(val.as_slice()),
+ PrimitiveLiteral::Decimal(_) => todo!(),
+ }
+ }
+
/// Creates a boolean value.
///
/// Example:
@@ -1448,78 +1478,6 @@ impl Literal {
}
}
-impl From<PrimitiveLiteral> for ByteBuf {
- fn from(value: PrimitiveLiteral) -> Self {
- match value {
- PrimitiveLiteral::Boolean(val) => {
- if val {
- ByteBuf::from([1u8])
- } else {
- ByteBuf::from([0u8])
- }
- }
- PrimitiveLiteral::Int(val) => ByteBuf::from(val.to_le_bytes()),
- PrimitiveLiteral::Long(val) => ByteBuf::from(val.to_le_bytes()),
- PrimitiveLiteral::Float(val) => ByteBuf::from(val.to_le_bytes()),
- PrimitiveLiteral::Double(val) => ByteBuf::from(val.to_le_bytes()),
- PrimitiveLiteral::Date(val) => ByteBuf::from(val.to_le_bytes()),
- PrimitiveLiteral::Time(val) => ByteBuf::from(val.to_le_bytes()),
- PrimitiveLiteral::Timestamp(val) =>
ByteBuf::from(val.to_le_bytes()),
- PrimitiveLiteral::Timestamptz(val) =>
ByteBuf::from(val.to_le_bytes()),
- PrimitiveLiteral::String(val) => ByteBuf::from(val.as_bytes()),
- PrimitiveLiteral::UUID(val) =>
ByteBuf::from(val.as_u128().to_be_bytes()),
- PrimitiveLiteral::Fixed(val) => ByteBuf::from(val),
- PrimitiveLiteral::Binary(val) => ByteBuf::from(val),
- PrimitiveLiteral::Decimal(_) => todo!(),
- }
- }
-}
-
-impl From<Literal> for ByteBuf {
- fn from(value: Literal) -> Self {
- match value {
- Literal::Primitive(val) => val.into(),
- _ => unimplemented!(),
- }
- }
-}
-
-impl From<PrimitiveLiteral> for Vec<u8> {
- fn from(value: PrimitiveLiteral) -> Self {
- match value {
- PrimitiveLiteral::Boolean(val) => {
- if val {
- Vec::from([1u8])
- } else {
- Vec::from([0u8])
- }
- }
- PrimitiveLiteral::Int(val) => Vec::from(val.to_le_bytes()),
- PrimitiveLiteral::Long(val) => Vec::from(val.to_le_bytes()),
- PrimitiveLiteral::Float(val) => Vec::from(val.to_le_bytes()),
- PrimitiveLiteral::Double(val) => Vec::from(val.to_le_bytes()),
- PrimitiveLiteral::Date(val) => Vec::from(val.to_le_bytes()),
- PrimitiveLiteral::Time(val) => Vec::from(val.to_le_bytes()),
- PrimitiveLiteral::Timestamp(val) => Vec::from(val.to_le_bytes()),
- PrimitiveLiteral::Timestamptz(val) => Vec::from(val.to_le_bytes()),
- PrimitiveLiteral::String(val) => Vec::from(val.as_bytes()),
- PrimitiveLiteral::UUID(val) =>
Vec::from(val.as_u128().to_be_bytes()),
- PrimitiveLiteral::Fixed(val) => val,
- PrimitiveLiteral::Binary(val) => val,
- PrimitiveLiteral::Decimal(_) => todo!(),
- }
- }
-}
-
-impl From<Literal> for Vec<u8> {
- fn from(value: Literal) -> Self {
- match value {
- Literal::Primitive(val) => val.into(),
- _ => unimplemented!(),
- }
- }
-}
-
/// The partition struct stores the tuple of partition values for each file.
/// Its type is derived from the partition fields of the partition spec used
to write the manifest file.
/// In v2, the partition struct’s field ids must match the ids from the
partition spec.
@@ -1622,21 +1580,9 @@ impl FromIterator<Option<Literal>> for Struct {
}
impl Literal {
- /// Create iceberg value from bytes
- pub fn try_from_bytes(bytes: &[u8], data_type: &Type) -> Result<Self> {
- match data_type {
- Type::Primitive(primitive_type) => {
- let datum = Datum::try_from_bytes(bytes,
primitive_type.clone())?;
- Ok(Literal::Primitive(datum.literal))
- }
- _ => Err(Error::new(
- crate::ErrorKind::DataInvalid,
- "Converting bytes to non-primitive types is not supported.",
- )),
- }
- }
-
/// Create iceberg value from a json value
+ ///
+ /// See [this
spec](https://iceberg.apache.org/spec/#json-single-value-serialization) for
reference.
pub fn try_from_json(value: JsonValue, data_type: &Type) ->
Result<Option<Self>> {
match data_type {
Type::Primitive(primitive) => match (primitive, value) {
@@ -2714,23 +2660,27 @@ mod tests {
assert_eq!(parsed_json_value, raw_json_value);
}
- fn check_avro_bytes_serde(input: Vec<u8>, expected_literal: Literal,
expected_type: &Type) {
+ fn check_avro_bytes_serde(
+ input: Vec<u8>,
+ expected_datum: Datum,
+ expected_type: &PrimitiveType,
+ ) {
let raw_schema = r#""bytes""#;
let schema = apache_avro::Schema::parse_str(raw_schema).unwrap();
let bytes = ByteBuf::from(input);
- let literal = Literal::try_from_bytes(&bytes, expected_type).unwrap();
- assert_eq!(literal, expected_literal);
+ let datum = Datum::try_from_bytes(&bytes,
expected_type.clone()).unwrap();
+ assert_eq!(datum, expected_datum);
let mut writer = apache_avro::Writer::new(&schema, Vec::new());
- writer.append_ser(ByteBuf::from(literal)).unwrap();
+ writer.append_ser(datum.to_bytes()).unwrap();
let encoded = writer.into_inner().unwrap();
let reader = apache_avro::Reader::with_schema(&schema,
&*encoded).unwrap();
for record in reader {
let result =
apache_avro::from_value::<ByteBuf>(&record.unwrap()).unwrap();
- let desered_literal = Literal::try_from_bytes(&result,
expected_type).unwrap();
- assert_eq!(desered_literal, expected_literal);
+ let desered_datum = Datum::try_from_bytes(&result,
expected_type.clone()).unwrap();
+ assert_eq!(desered_datum, expected_datum);
}
}
@@ -2999,66 +2949,42 @@ mod tests {
fn avro_bytes_boolean() {
let bytes = vec![1u8];
- check_avro_bytes_serde(
- bytes,
- Literal::Primitive(PrimitiveLiteral::Boolean(true)),
- &Type::Primitive(PrimitiveType::Boolean),
- );
+ check_avro_bytes_serde(bytes, Datum::bool(true),
&PrimitiveType::Boolean);
}
#[test]
fn avro_bytes_int() {
let bytes = vec![32u8, 0u8, 0u8, 0u8];
- check_avro_bytes_serde(
- bytes,
- Literal::Primitive(PrimitiveLiteral::Int(32)),
- &Type::Primitive(PrimitiveType::Int),
- );
+ check_avro_bytes_serde(bytes, Datum::int(32), &PrimitiveType::Int);
}
#[test]
fn avro_bytes_long() {
let bytes = vec![32u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8];
- check_avro_bytes_serde(
- bytes,
- Literal::Primitive(PrimitiveLiteral::Long(32)),
- &Type::Primitive(PrimitiveType::Long),
- );
+ check_avro_bytes_serde(bytes, Datum::long(32), &PrimitiveType::Long);
}
#[test]
fn avro_bytes_float() {
let bytes = vec![0u8, 0u8, 128u8, 63u8];
- check_avro_bytes_serde(
- bytes,
- Literal::Primitive(PrimitiveLiteral::Float(OrderedFloat(1.0))),
- &Type::Primitive(PrimitiveType::Float),
- );
+ check_avro_bytes_serde(bytes, Datum::float(1.0),
&PrimitiveType::Float);
}
#[test]
fn avro_bytes_double() {
let bytes = vec![0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 240u8, 63u8];
- check_avro_bytes_serde(
- bytes,
- Literal::Primitive(PrimitiveLiteral::Double(OrderedFloat(1.0))),
- &Type::Primitive(PrimitiveType::Double),
- );
+ check_avro_bytes_serde(bytes, Datum::double(1.0),
&PrimitiveType::Double);
}
#[test]
fn avro_bytes_string() {
let bytes = vec![105u8, 99u8, 101u8, 98u8, 101u8, 114u8, 103u8];
- check_avro_bytes_serde(
- bytes,
-
Literal::Primitive(PrimitiveLiteral::String("iceberg".to_string())),
- &Type::Primitive(PrimitiveType::String),
- );
+ check_avro_bytes_serde(bytes, Datum::string("iceberg"),
&PrimitiveType::String);
}
#[test]