alamb commented on code in PR #7740:
URL: https://github.com/apache/arrow-rs/pull/7740#discussion_r2164497957
##########
parquet-variant/src/builder.rs:
##########
@@ -420,100 +472,152 @@ impl Default for VariantBuilder {
///
/// See the examples on [`VariantBuilder`] for usage.
pub struct ListBuilder<'a> {
- parent: &'a mut VariantBuilder,
- start_pos: usize,
+ parent_buffer: &'a mut VariantBuffer,
+ field_metadata_dictionary: &'a mut FieldMetadataDictionary,
offsets: Vec<usize>,
+
+ buffer: VariantBuffer,
+ pending: bool,
}
impl<'a> ListBuilder<'a> {
- fn new(parent: &'a mut VariantBuilder) -> Self {
- let start_pos = parent.offset();
+ fn new(
+ parent_buffer: &'a mut VariantBuffer,
+ field_metadata_dictionary: &'a mut FieldMetadataDictionary,
+ ) -> Self {
Self {
- parent,
- start_pos,
+ parent_buffer,
+ field_metadata_dictionary,
offsets: vec![0],
+ buffer: VariantBuffer::default(),
+ pending: false,
+ }
+ }
+
+ fn check_new_offset(&mut self) {
+ if !self.pending {
+ return;
}
+
+ let element_end = self.buffer.offset();
+ self.offsets.push(element_end);
+
+ self.pending = false;
+ }
+
+ pub fn new_object(&mut self) -> ObjectBuilder {
+ self.check_new_offset();
+
+ let obj_builder = ObjectBuilder::new(&mut self.buffer,
self.field_metadata_dictionary);
+ self.pending = true;
+
+ obj_builder
+ }
+
+ pub fn new_list(&mut self) -> ListBuilder {
+ self.check_new_offset();
+
+ let list_builder = ListBuilder::new(&mut self.buffer,
self.field_metadata_dictionary);
+ self.pending = true;
+
+ list_builder
}
pub fn append_value<'m, 'd, T: Into<Variant<'m, 'd>>>(&mut self, value: T)
{
- self.parent.append_value(value);
- let element_end = self.parent.offset() - self.start_pos;
+ self.check_new_offset();
+
+ self.buffer.append_value(value);
+ let element_end = self.buffer.offset();
self.offsets.push(element_end);
}
- pub fn finish(self) {
- let data_size = self.parent.offset() - self.start_pos;
+ pub fn finish(mut self) {
+ self.check_new_offset();
+
+ let data_size = self.buffer.offset();
let num_elements = self.offsets.len() - 1;
let is_large = num_elements > u8::MAX as usize;
let size_bytes = if is_large { 4 } else { 1 };
let offset_size = int_size(data_size);
let header_size = 1 + size_bytes + (num_elements + 1) * offset_size as
usize;
- make_room_for_header(&mut self.parent.buffer, self.start_pos,
header_size);
+ let parent_start_pos = self.parent_buffer.offset();
+
+ make_room_for_header(&mut self.parent_buffer.0, parent_start_pos,
header_size);
// Write header
- let mut pos = self.start_pos;
- self.parent.buffer[pos] = array_header(is_large, offset_size);
+ let mut pos = parent_start_pos;
+ self.parent_buffer.0[pos] = array_header(is_large, offset_size);
Review Comment:
I don't understand how this can be writing to anything other than the end of
parent_buffer and thus it seems like the code to handle other positions is
unecesssary
I also see you just refactored this here rather than introducing it, so no
need to change it in this PR
##########
parquet-variant/src/builder.rs:
##########
@@ -72,7 +72,177 @@ fn make_room_for_header(buffer: &mut Vec<u8>, start_pos:
usize, header_size: usi
buffer.copy_within(src_start..src_end, dst_start);
}
-/// Builder for [`Variant`] values
+#[derive(Default)]
+struct VariantBuffer(Vec<u8>);
+
+impl VariantBuffer {
+ fn append_null(&mut self) {
+ self.0.push(primitive_header(VariantPrimitiveType::Null));
+ }
+
+ fn append_bool(&mut self, value: bool) {
+ let primitive_type = if value {
+ VariantPrimitiveType::BooleanTrue
+ } else {
+ VariantPrimitiveType::BooleanFalse
+ };
+ self.0.push(primitive_header(primitive_type));
+ }
+
+ fn append_int8(&mut self, value: i8) {
+ self.0.push(primitive_header(VariantPrimitiveType::Int8));
+ self.0.push(value as u8);
+ }
+
+ fn append_int16(&mut self, value: i16) {
+ self.0.push(primitive_header(VariantPrimitiveType::Int16));
+ self.0.extend_from_slice(&value.to_le_bytes());
+ }
+
+ fn append_int32(&mut self, value: i32) {
+ self.0.push(primitive_header(VariantPrimitiveType::Int32));
+ self.0.extend_from_slice(&value.to_le_bytes());
+ }
+
+ fn append_int64(&mut self, value: i64) {
+ self.0.push(primitive_header(VariantPrimitiveType::Int64));
+ self.0.extend_from_slice(&value.to_le_bytes());
+ }
+
+ fn append_float(&mut self, value: f32) {
+ self.0.push(primitive_header(VariantPrimitiveType::Float));
+ self.0.extend_from_slice(&value.to_le_bytes());
+ }
+
+ fn append_double(&mut self, value: f64) {
+ self.0.push(primitive_header(VariantPrimitiveType::Double));
+ self.0.extend_from_slice(&value.to_le_bytes());
+ }
+
+ fn append_date(&mut self, value: chrono::NaiveDate) {
+ self.0.push(primitive_header(VariantPrimitiveType::Date));
+ let days_since_epoch =
value.signed_duration_since(UNIX_EPOCH_DATE).num_days() as i32;
+ self.0.extend_from_slice(&days_since_epoch.to_le_bytes());
+ }
+
+ fn append_timestamp_micros(&mut self, value:
chrono::DateTime<chrono::Utc>) {
+ self.0
+ .push(primitive_header(VariantPrimitiveType::TimestampMicros));
+ let micros = value.timestamp_micros();
+ self.0.extend_from_slice(µs.to_le_bytes());
+ }
+
+ fn append_timestamp_ntz_micros(&mut self, value: chrono::NaiveDateTime) {
+ self.0
+ .push(primitive_header(VariantPrimitiveType::TimestampNtzMicros));
+ let micros = value.and_utc().timestamp_micros();
+ self.0.extend_from_slice(µs.to_le_bytes());
+ }
+
+ fn append_decimal4(&mut self, integer: i32, scale: u8) {
+ self.0
+ .push(primitive_header(VariantPrimitiveType::Decimal4));
+ self.0.push(scale);
+ self.0.extend_from_slice(&integer.to_le_bytes());
+ }
+
+ fn append_decimal8(&mut self, integer: i64, scale: u8) {
+ self.0
+ .push(primitive_header(VariantPrimitiveType::Decimal8));
+ self.0.push(scale);
+ self.0.extend_from_slice(&integer.to_le_bytes());
+ }
+
+ fn append_decimal16(&mut self, integer: i128, scale: u8) {
+ self.0
+ .push(primitive_header(VariantPrimitiveType::Decimal16));
+ self.0.push(scale);
+ self.0.extend_from_slice(&integer.to_le_bytes());
+ }
+
+ fn append_binary(&mut self, value: &[u8]) {
+ self.0.push(primitive_header(VariantPrimitiveType::Binary));
+ self.0
+ .extend_from_slice(&(value.len() as u32).to_le_bytes());
+ self.0.extend_from_slice(value);
+ }
+
+ fn append_short_string(&mut self, value: ShortString) {
+ let inner = value.0;
+ self.0.push(short_string_header(inner.len()));
+ self.0.extend_from_slice(inner.as_bytes());
+ }
+
+ fn append_string(&mut self, value: &str) {
+ self.0.push(primitive_header(VariantPrimitiveType::String));
+ self.0
+ .extend_from_slice(&(value.len() as u32).to_le_bytes());
+ self.0.extend_from_slice(value.as_bytes());
+ }
+
+ fn offset(&self) -> usize {
+ self.0.len()
+ }
+
+ fn append_value<'m, 'd, T: Into<Variant<'m, 'd>>>(&mut self, value: T) {
+ let variant = value.into();
+ match variant {
+ Variant::Null => self.append_null(),
+ Variant::BooleanTrue => self.append_bool(true),
+ Variant::BooleanFalse => self.append_bool(false),
+ Variant::Int8(v) => self.append_int8(v),
+ Variant::Int16(v) => self.append_int16(v),
+ Variant::Int32(v) => self.append_int32(v),
+ Variant::Int64(v) => self.append_int64(v),
+ Variant::Date(v) => self.append_date(v),
+ Variant::TimestampMicros(v) => self.append_timestamp_micros(v),
+ Variant::TimestampNtzMicros(v) =>
self.append_timestamp_ntz_micros(v),
+ Variant::Decimal4 { integer, scale } =>
self.append_decimal4(integer, scale),
+ Variant::Decimal8 { integer, scale } =>
self.append_decimal8(integer, scale),
+ Variant::Decimal16 { integer, scale } =>
self.append_decimal16(integer, scale),
+ Variant::Float(v) => self.append_float(v),
+ Variant::Double(v) => self.append_double(v),
+ Variant::Binary(v) => self.append_binary(v),
+ Variant::String(s) => self.append_string(s),
+ Variant::ShortString(s) => self.append_short_string(s),
+ Variant::Object(_) | Variant::List(_) => {
+ todo!("How does this work with the redesign?");
+ }
+ }
+ }
+}
+
+#[derive(Default)]
+struct FieldMetadataDictionary {
Review Comment:
I suggest renaming this struct `MetadataBuilder` to align with the fact it
is used to create the Metadata of the variant
##########
parquet-variant/src/builder.rs:
##########
@@ -72,7 +72,177 @@ fn make_room_for_header(buffer: &mut Vec<u8>, start_pos:
usize, header_size: usi
buffer.copy_within(src_start..src_end, dst_start);
}
-/// Builder for [`Variant`] values
+#[derive(Default)]
+struct VariantBuffer(Vec<u8>);
Review Comment:
I Perhaps this could be called `ValueBuilder` as it is largely responsible
for building the value portion of the variant
##########
parquet-variant/src/builder.rs:
##########
@@ -420,100 +472,152 @@ impl Default for VariantBuilder {
///
/// See the examples on [`VariantBuilder`] for usage.
pub struct ListBuilder<'a> {
- parent: &'a mut VariantBuilder,
- start_pos: usize,
+ parent_buffer: &'a mut VariantBuffer,
+ field_metadata_dictionary: &'a mut FieldMetadataDictionary,
offsets: Vec<usize>,
+
+ buffer: VariantBuffer,
+ pending: bool,
}
impl<'a> ListBuilder<'a> {
- fn new(parent: &'a mut VariantBuilder) -> Self {
- let start_pos = parent.offset();
+ fn new(
+ parent_buffer: &'a mut VariantBuffer,
+ field_metadata_dictionary: &'a mut FieldMetadataDictionary,
+ ) -> Self {
Self {
- parent,
- start_pos,
+ parent_buffer,
+ field_metadata_dictionary,
offsets: vec![0],
+ buffer: VariantBuffer::default(),
+ pending: false,
+ }
+ }
+
+ fn check_new_offset(&mut self) {
+ if !self.pending {
+ return;
}
+
+ let element_end = self.buffer.offset();
+ self.offsets.push(element_end);
+
+ self.pending = false;
+ }
+
+ pub fn new_object(&mut self) -> ObjectBuilder {
+ self.check_new_offset();
+
+ let obj_builder = ObjectBuilder::new(&mut self.buffer,
self.field_metadata_dictionary);
+ self.pending = true;
+
+ obj_builder
+ }
+
+ pub fn new_list(&mut self) -> ListBuilder {
+ self.check_new_offset();
+
+ let list_builder = ListBuilder::new(&mut self.buffer,
self.field_metadata_dictionary);
+ self.pending = true;
+
+ list_builder
}
pub fn append_value<'m, 'd, T: Into<Variant<'m, 'd>>>(&mut self, value: T)
{
- self.parent.append_value(value);
- let element_end = self.parent.offset() - self.start_pos;
+ self.check_new_offset();
+
+ self.buffer.append_value(value);
+ let element_end = self.buffer.offset();
self.offsets.push(element_end);
}
- pub fn finish(self) {
- let data_size = self.parent.offset() - self.start_pos;
+ pub fn finish(mut self) {
+ self.check_new_offset();
+
+ let data_size = self.buffer.offset();
let num_elements = self.offsets.len() - 1;
let is_large = num_elements > u8::MAX as usize;
let size_bytes = if is_large { 4 } else { 1 };
let offset_size = int_size(data_size);
let header_size = 1 + size_bytes + (num_elements + 1) * offset_size as
usize;
- make_room_for_header(&mut self.parent.buffer, self.start_pos,
header_size);
+ let parent_start_pos = self.parent_buffer.offset();
Review Comment:
parent_buffet.offset() seems to just return the length.
I think this code is doing the equivalent of
`self.parent_buffer.reserve(header_size)` to reserver enough space (I am not
sure what `make_room_for_header` is doing)
##########
parquet-variant/src/builder.rs:
##########
@@ -420,100 +472,152 @@ impl Default for VariantBuilder {
///
/// See the examples on [`VariantBuilder`] for usage.
pub struct ListBuilder<'a> {
- parent: &'a mut VariantBuilder,
- start_pos: usize,
+ parent_buffer: &'a mut VariantBuffer,
+ field_metadata_dictionary: &'a mut FieldMetadataDictionary,
offsets: Vec<usize>,
+
+ buffer: VariantBuffer,
Review Comment:
As a follow on it would be really nice to avoid having to copy all the
values back to the parent buffer.
One problem is that you don't know beforehand how many elements are in the
list (so we don't know the offset size and therefore we don't know how many
bytes to leave empty for the list length). However, I think we could do
something fancy like assume it will be a short list (and leave 1 byte for the
length) and then if we end up with more we could go back and copy over the
values
We could also add a nice api like `make_large()` or something on the
ListBuilder
I am just speculating, there is no need to do this here in this PR or
without benchmarks
##########
parquet-variant/src/builder.rs:
##########
@@ -824,39 +932,250 @@ mod tests {
}
#[test]
- fn test_append_object() {
- let (object_metadata, object_value) = {
- let mut builder = VariantBuilder::new();
- let mut obj = builder.new_object();
- obj.append_value("name", "John");
- obj.finish();
- builder.finish()
- };
- let object_variant = Variant::try_new(&object_metadata,
&object_value).unwrap();
+ fn test_nested_list() {
+ let mut builder = VariantBuilder::new();
+
+ let mut outer_list_builder = builder.new_list();
+
+ {
+ let mut inner_list_builder = outer_list_builder.new_list();
+
+ inner_list_builder.append_value("a");
+ inner_list_builder.append_value("b");
+ inner_list_builder.append_value("c");
+ inner_list_builder.append_value("d");
+
+ inner_list_builder.finish();
+ }
+
+ outer_list_builder.finish();
+
+ let (metadata, value) = builder.finish();
+
+ let variant = Variant::try_new(&metadata, &value).unwrap();
+ let outer_list = variant.as_list().unwrap();
+
+ assert_eq!(outer_list.len(), 1);
+
+ let inner_variant = outer_list.get(0).unwrap();
+ let inner_list = inner_variant.as_list().unwrap();
+
+ assert_eq!(
+ vec![
+ Variant::from("a"),
+ Variant::from("b"),
+ Variant::from("c"),
+ Variant::from("d"),
+ ],
+ inner_list.iter().collect::<Vec<_>>()
+ );
+ }
+
+ #[test]
+ fn test_super_nested_list() {
+ /*
+ [[[[[1]]]]]
+ */
let mut builder = VariantBuilder::new();
- builder.append_value(object_variant.clone());
+ {
+ let mut list_builder1 = builder.new_list();
+ {
+ let mut list_builder2 = list_builder1.new_list();
+ {
+ let mut list_builder3 = list_builder2.new_list();
+ {
+ let mut list_builder4 = list_builder3.new_list();
+ {
+ let mut list_builder5 = list_builder4.new_list();
+ list_builder5.append_value(1);
+ list_builder5.finish();
+ }
+ list_builder4.finish();
+ }
+ list_builder3.finish();
+ }
+ list_builder2.finish();
+ }
+ list_builder1.finish();
+ }
+
let (metadata, value) = builder.finish();
+
let variant = Variant::try_new(&metadata, &value).unwrap();
- assert_eq!(variant, object_variant);
+ let list1 = variant.as_list().unwrap();
+ assert_eq!(list1.len(), 1);
+
+ let list2_variant = list1.get(0).unwrap();
+ let list2 = list2_variant.as_list().unwrap();
+ assert_eq!(list2.len(), 1);
+
+ let list3_variant = list2.get(0).unwrap();
+ let list3 = list3_variant.as_list().unwrap();
+ assert_eq!(list3.len(), 1);
+
+ let list4_variant = list3.get(0).unwrap();
+ let list4 = list4_variant.as_list().unwrap();
+ assert_eq!(list4.len(), 1);
+
+ let list5_variant = list4.get(0).unwrap();
+ let list5 = list5_variant.as_list().unwrap();
+ assert_eq!(list5.len(), 1);
+
+ assert_eq!(list5.len(), 1);
+
+ assert_eq!(list5.get(0).unwrap(), Variant::from(1));
}
#[test]
- fn test_append_list() {
- let (list_metadata, list_value) = {
- let mut builder = VariantBuilder::new();
- let mut list = builder.new_list();
- list.append_value(1i8);
- list.append_value(2i8);
- list.finish();
- builder.finish()
- };
- let list_variant = Variant::try_new(&list_metadata,
&list_value).unwrap();
+ fn test_object_list() {
+ let mut builder = VariantBuilder::new();
+
+ let mut list_builder = builder.new_list();
+
+ {
+ let mut object_builder = list_builder.new_object();
+ object_builder.append_value("id", 1);
+ object_builder.append_value("type", "Cauliflower");
+ object_builder.finish();
+ }
+
+ {
+ let mut object_builder = list_builder.new_object();
+ object_builder.append_value("id", 2);
+ object_builder.append_value("type", "Beets");
+ object_builder.finish();
+ }
+
+ list_builder.finish();
+
+ let (metadata, value) = builder.finish();
+
+ let variant = Variant::try_new(&metadata, &value).unwrap();
+ let list = variant.as_list().unwrap();
+ assert_eq!(list.len(), 2);
+
+ let obj1_variant = list.get(0).unwrap();
+ let obj1 = obj1_variant.as_object().unwrap();
+
+ assert_eq!(
+ vec![
+ ("id", Variant::from(1)),
+ ("type", Variant::from("Cauliflower")),
+ ],
+ obj1.iter().collect::<Vec<_>>()
+ );
+
+ let obj2_variant = list.get(1).unwrap();
+ let obj2 = obj2_variant.as_object().unwrap();
+
+ assert_eq!(
+ vec![("id", Variant::from(2)), ("type", Variant::from("Beets")),],
+ obj2.iter().collect::<Vec<_>>()
+ );
+ }
+
+ #[test]
+ fn test_object_list2() {
let mut builder = VariantBuilder::new();
- builder.append_value(list_variant.clone());
+
+ let mut list_builder = builder.new_list();
+
+ {
+ let mut object_builder = list_builder.new_object();
+ object_builder.append_value("a", 1);
+ object_builder.finish();
+ }
+
+ {
+ let mut object_builder = list_builder.new_object();
+ object_builder.append_value("b", 2);
+ object_builder.finish();
+ }
+
+ list_builder.finish();
+
let (metadata, value) = builder.finish();
+
let variant = Variant::try_new(&metadata, &value).unwrap();
- assert_eq!(variant, list_variant);
+ let list = variant.as_list().unwrap();
+ assert_eq!(list.len(), 2);
+
+ let obj1_variant = list.get(0).unwrap();
+ let obj1 = obj1_variant.as_object().unwrap();
+ assert_eq!(
+ vec![("a", Variant::from(1)),],
+ obj1.iter().collect::<Vec<_>>()
+ );
+
+ let obj2_variant = list.get(1).unwrap();
+ let obj2 = obj2_variant.as_object().unwrap();
+ assert_eq!(
+ vec![("b", Variant::from(2)),],
+ obj2.iter().collect::<Vec<_>>()
+ );
+ }
+
+ #[test]
+ fn test_hetergenous_list() {
Review Comment:
💯
##########
parquet-variant/src/builder.rs:
##########
@@ -769,8 +877,8 @@ mod tests {
// dict is ordered by field names
// NOTE: when we support nested objects, we'll want to perform a
filter by fields_map field ids
let dict_metadata = obj
- .parent
- .dict
+ .field_metadata_dictionary
Review Comment:
do we need to update the comment above about nested objects?
##########
parquet-variant/src/builder.rs:
##########
@@ -148,189 +314,108 @@ fn make_room_for_header(buffer: &mut Vec<u8>,
start_pos: usize, header_size: usi
///
/// # Example: [`Variant::List`] of [`Variant::Object`]s
///
-/// THis example shows how to create an list of objects:
+/// This example shows how to create an list of objects:
/// ```json
/// [
-/// {
-/// "first_name": "Jiaying",
-/// "last_name": "Li"
-/// },
/// {
-/// "first_name": "Malthe",
-/// "last_name": "Karbo"
-/// }
+/// "id": 1,
+/// "type": "Cauliflower"
+/// },
+/// {
+/// "id": 2,
+/// "type": "Beets"
+/// }
/// ]
/// ```
+/// ```
+/// use parquet_variant::{Variant, VariantBuilder};
+/// let mut builder = VariantBuilder::new();
+///
+/// // Create a builder that will write elements to the list
+/// let mut list_builder = builder.new_list();
+///
+/// {
+/// let mut object_builder = list_builder.new_object();
+/// object_builder.append_value("id", 1);
+/// object_builder.append_value("type", "Cauliflower");
+/// object_builder.finish();
+/// }
+///
+/// {
+/// let mut object_builder = list_builder.new_object();
+/// object_builder.append_value("id", 2);
+/// object_builder.append_value("type", "Beets");
+/// object_builder.finish();
+/// }
+///
+/// list_builder.finish();
+/// // Finish the builder to get the metadata and value
+/// let (metadata, value) = builder.finish();
+/// // use the Variant API to verify the result
+/// let variant = Variant::try_new(&metadata, &value).unwrap();
+/// let variant_list = variant.as_list().unwrap();
+///
+///
+/// let obj1_variant = variant_list.get(0).unwrap();
+/// let obj1 = obj1_variant.as_object().unwrap();
+/// assert_eq!(
+/// obj1.field_by_name("id").unwrap(),
+/// Some(Variant::from(1))
+/// );
+/// assert_eq!(
+/// obj1.field_by_name("type").unwrap(),
+/// Some(Variant::from("Cauliflower"))
+/// );
///
-/// TODO
+/// let obj2_variant = variant_list.get(1).unwrap();
+/// let obj2 = obj2_variant.as_object().unwrap();
+///
+/// assert_eq!(
+/// obj2.field_by_name("id").unwrap(),
+/// Some(Variant::from(2))
+/// );
+/// assert_eq!(
+/// obj2.field_by_name("type").unwrap(),
+/// Some(Variant::from("Beets"))
+/// );
///
+/// ```
pub struct VariantBuilder {
- buffer: Vec<u8>,
- dict: BTreeMap<String, u32>,
- dict_keys: Vec<String>,
+ buffer: VariantBuffer,
+ field_metadata_dictionary: FieldMetadataDictionary,
}
impl VariantBuilder {
pub fn new() -> Self {
Self {
- buffer: Vec::new(),
- dict: BTreeMap::new(),
- dict_keys: Vec::new(),
- }
- }
-
- fn append_null(&mut self) {
- self.buffer
- .push(primitive_header(VariantPrimitiveType::Null));
- }
-
- fn append_bool(&mut self, value: bool) {
- let primitive_type = if value {
- VariantPrimitiveType::BooleanTrue
- } else {
- VariantPrimitiveType::BooleanFalse
- };
- self.buffer.push(primitive_header(primitive_type));
- }
-
- fn append_int8(&mut self, value: i8) {
- self.buffer
- .push(primitive_header(VariantPrimitiveType::Int8));
- self.buffer.push(value as u8);
- }
-
- fn append_int16(&mut self, value: i16) {
- self.buffer
- .push(primitive_header(VariantPrimitiveType::Int16));
- self.buffer.extend_from_slice(&value.to_le_bytes());
- }
-
- fn append_int32(&mut self, value: i32) {
- self.buffer
- .push(primitive_header(VariantPrimitiveType::Int32));
- self.buffer.extend_from_slice(&value.to_le_bytes());
- }
-
- fn append_int64(&mut self, value: i64) {
- self.buffer
- .push(primitive_header(VariantPrimitiveType::Int64));
- self.buffer.extend_from_slice(&value.to_le_bytes());
- }
-
- fn append_float(&mut self, value: f32) {
- self.buffer
- .push(primitive_header(VariantPrimitiveType::Float));
- self.buffer.extend_from_slice(&value.to_le_bytes());
- }
-
- fn append_double(&mut self, value: f64) {
- self.buffer
- .push(primitive_header(VariantPrimitiveType::Double));
- self.buffer.extend_from_slice(&value.to_le_bytes());
- }
-
- fn append_date(&mut self, value: chrono::NaiveDate) {
- self.buffer
- .push(primitive_header(VariantPrimitiveType::Date));
- let days_since_epoch =
value.signed_duration_since(UNIX_EPOCH_DATE).num_days() as i32;
- self.buffer
- .extend_from_slice(&days_since_epoch.to_le_bytes());
- }
-
- fn append_timestamp_micros(&mut self, value:
chrono::DateTime<chrono::Utc>) {
- self.buffer
- .push(primitive_header(VariantPrimitiveType::TimestampMicros));
- let micros = value.timestamp_micros();
- self.buffer.extend_from_slice(µs.to_le_bytes());
- }
-
- fn append_timestamp_ntz_micros(&mut self, value: chrono::NaiveDateTime) {
- self.buffer
- .push(primitive_header(VariantPrimitiveType::TimestampNtzMicros));
- let micros = value.and_utc().timestamp_micros();
- self.buffer.extend_from_slice(µs.to_le_bytes());
- }
-
- fn append_decimal4(&mut self, integer: i32, scale: u8) {
- self.buffer
- .push(primitive_header(VariantPrimitiveType::Decimal4));
- self.buffer.push(scale);
- self.buffer.extend_from_slice(&integer.to_le_bytes());
- }
-
- fn append_decimal8(&mut self, integer: i64, scale: u8) {
- self.buffer
- .push(primitive_header(VariantPrimitiveType::Decimal8));
- self.buffer.push(scale);
- self.buffer.extend_from_slice(&integer.to_le_bytes());
- }
-
- fn append_decimal16(&mut self, integer: i128, scale: u8) {
- self.buffer
- .push(primitive_header(VariantPrimitiveType::Decimal16));
- self.buffer.push(scale);
- self.buffer.extend_from_slice(&integer.to_le_bytes());
- }
-
- fn append_binary(&mut self, value: &[u8]) {
- self.buffer
- .push(primitive_header(VariantPrimitiveType::Binary));
- self.buffer
- .extend_from_slice(&(value.len() as u32).to_le_bytes());
- self.buffer.extend_from_slice(value);
- }
-
- fn append_short_string(&mut self, value: ShortString) {
- let inner = value.0;
- self.buffer.push(short_string_header(inner.len()));
- self.buffer.extend_from_slice(inner.as_bytes());
- }
-
- fn append_string(&mut self, value: &str) {
- self.buffer
- .push(primitive_header(VariantPrimitiveType::String));
- self.buffer
- .extend_from_slice(&(value.len() as u32).to_le_bytes());
- self.buffer.extend_from_slice(value.as_bytes());
- }
-
- /// Add key to dictionary, return its ID
- fn add_key(&mut self, key: &str) -> u32 {
- use std::collections::btree_map::Entry;
- match self.dict.entry(key.to_string()) {
- Entry::Occupied(entry) => *entry.get(),
- Entry::Vacant(entry) => {
- let id = self.dict_keys.len() as u32;
- entry.insert(id);
- self.dict_keys.push(key.to_string());
- id
- }
+ buffer: VariantBuffer::default(),
+ field_metadata_dictionary: FieldMetadataDictionary::default(),
}
}
- fn offset(&self) -> usize {
- self.buffer.len()
- }
-
/// Create an [`ListBuilder`] for creating [`Variant::List`] values.
///
/// See the examples on [`VariantBuilder`] for usage.
pub fn new_list(&mut self) -> ListBuilder {
- ListBuilder::new(self)
+ ListBuilder::new(&mut self.buffer, &mut self.field_metadata_dictionary)
}
/// Create an [`ObjectBuilder`] for creating [`Variant::Object`] values.
///
/// See the examples on [`VariantBuilder`] for usage.
pub fn new_object(&mut self) -> ObjectBuilder {
- ObjectBuilder::new(self)
+ ObjectBuilder::new(&mut self.buffer, &mut
self.field_metadata_dictionary)
+ }
+
+ pub fn append_value<'m, 'd, T: Into<Variant<'m, 'd>>>(&mut self, value: T)
{
+ self.buffer.append_value(value);
}
pub fn finish(self) -> (Vec<u8>, Vec<u8>) {
- let nkeys = self.dict_keys.len();
+ let nkeys = self.field_metadata_dictionary.num_field_names();
Review Comment:
A minor thought would be to move the construction of the metadata /
dictionary into the `field_metadata_dictionary` itself
Something like
```rust
let metadata = self.field_metadata_dictionary.build()
```
I am thinking the more encapsulated building the metadata dictionary is, the
easier it will be to optimize it later (for example to avoid allocating and
copying so many strings)
Doesn't have to be for this PR, I just noticed it during review
##########
parquet-variant/src/builder.rs:
##########
@@ -148,189 +314,108 @@ fn make_room_for_header(buffer: &mut Vec<u8>,
start_pos: usize, header_size: usi
///
/// # Example: [`Variant::List`] of [`Variant::Object`]s
///
-/// THis example shows how to create an list of objects:
+/// This example shows how to create an list of objects:
/// ```json
/// [
-/// {
-/// "first_name": "Jiaying",
-/// "last_name": "Li"
-/// },
/// {
-/// "first_name": "Malthe",
-/// "last_name": "Karbo"
-/// }
+/// "id": 1,
+/// "type": "Cauliflower"
+/// },
+/// {
+/// "id": 2,
+/// "type": "Beets"
+/// }
/// ]
/// ```
+/// ```
+/// use parquet_variant::{Variant, VariantBuilder};
+/// let mut builder = VariantBuilder::new();
+///
+/// // Create a builder that will write elements to the list
+/// let mut list_builder = builder.new_list();
+///
+/// {
+/// let mut object_builder = list_builder.new_object();
+/// object_builder.append_value("id", 1);
+/// object_builder.append_value("type", "Cauliflower");
+/// object_builder.finish();
+/// }
+///
+/// {
+/// let mut object_builder = list_builder.new_object();
+/// object_builder.append_value("id", 2);
+/// object_builder.append_value("type", "Beets");
+/// object_builder.finish();
+/// }
+///
+/// list_builder.finish();
+/// // Finish the builder to get the metadata and value
+/// let (metadata, value) = builder.finish();
+/// // use the Variant API to verify the result
+/// let variant = Variant::try_new(&metadata, &value).unwrap();
+/// let variant_list = variant.as_list().unwrap();
+///
+///
+/// let obj1_variant = variant_list.get(0).unwrap();
+/// let obj1 = obj1_variant.as_object().unwrap();
+/// assert_eq!(
+/// obj1.field_by_name("id").unwrap(),
+/// Some(Variant::from(1))
+/// );
+/// assert_eq!(
+/// obj1.field_by_name("type").unwrap(),
+/// Some(Variant::from("Cauliflower"))
+/// );
///
-/// TODO
+/// let obj2_variant = variant_list.get(1).unwrap();
+/// let obj2 = obj2_variant.as_object().unwrap();
+///
+/// assert_eq!(
+/// obj2.field_by_name("id").unwrap(),
+/// Some(Variant::from(2))
+/// );
+/// assert_eq!(
+/// obj2.field_by_name("type").unwrap(),
+/// Some(Variant::from("Beets"))
+/// );
///
+/// ```
pub struct VariantBuilder {
- buffer: Vec<u8>,
- dict: BTreeMap<String, u32>,
- dict_keys: Vec<String>,
+ buffer: VariantBuffer,
+ field_metadata_dictionary: FieldMetadataDictionary,
}
impl VariantBuilder {
pub fn new() -> Self {
Self {
- buffer: Vec::new(),
- dict: BTreeMap::new(),
- dict_keys: Vec::new(),
- }
- }
-
- fn append_null(&mut self) {
- self.buffer
- .push(primitive_header(VariantPrimitiveType::Null));
- }
-
- fn append_bool(&mut self, value: bool) {
- let primitive_type = if value {
- VariantPrimitiveType::BooleanTrue
- } else {
- VariantPrimitiveType::BooleanFalse
- };
- self.buffer.push(primitive_header(primitive_type));
- }
-
- fn append_int8(&mut self, value: i8) {
- self.buffer
- .push(primitive_header(VariantPrimitiveType::Int8));
- self.buffer.push(value as u8);
- }
-
- fn append_int16(&mut self, value: i16) {
- self.buffer
- .push(primitive_header(VariantPrimitiveType::Int16));
- self.buffer.extend_from_slice(&value.to_le_bytes());
- }
-
- fn append_int32(&mut self, value: i32) {
- self.buffer
- .push(primitive_header(VariantPrimitiveType::Int32));
- self.buffer.extend_from_slice(&value.to_le_bytes());
- }
-
- fn append_int64(&mut self, value: i64) {
- self.buffer
- .push(primitive_header(VariantPrimitiveType::Int64));
- self.buffer.extend_from_slice(&value.to_le_bytes());
- }
-
- fn append_float(&mut self, value: f32) {
- self.buffer
- .push(primitive_header(VariantPrimitiveType::Float));
- self.buffer.extend_from_slice(&value.to_le_bytes());
- }
-
- fn append_double(&mut self, value: f64) {
- self.buffer
- .push(primitive_header(VariantPrimitiveType::Double));
- self.buffer.extend_from_slice(&value.to_le_bytes());
- }
-
- fn append_date(&mut self, value: chrono::NaiveDate) {
- self.buffer
- .push(primitive_header(VariantPrimitiveType::Date));
- let days_since_epoch =
value.signed_duration_since(UNIX_EPOCH_DATE).num_days() as i32;
- self.buffer
- .extend_from_slice(&days_since_epoch.to_le_bytes());
- }
-
- fn append_timestamp_micros(&mut self, value:
chrono::DateTime<chrono::Utc>) {
- self.buffer
- .push(primitive_header(VariantPrimitiveType::TimestampMicros));
- let micros = value.timestamp_micros();
- self.buffer.extend_from_slice(µs.to_le_bytes());
- }
-
- fn append_timestamp_ntz_micros(&mut self, value: chrono::NaiveDateTime) {
- self.buffer
- .push(primitive_header(VariantPrimitiveType::TimestampNtzMicros));
- let micros = value.and_utc().timestamp_micros();
- self.buffer.extend_from_slice(µs.to_le_bytes());
- }
-
- fn append_decimal4(&mut self, integer: i32, scale: u8) {
- self.buffer
- .push(primitive_header(VariantPrimitiveType::Decimal4));
- self.buffer.push(scale);
- self.buffer.extend_from_slice(&integer.to_le_bytes());
- }
-
- fn append_decimal8(&mut self, integer: i64, scale: u8) {
- self.buffer
- .push(primitive_header(VariantPrimitiveType::Decimal8));
- self.buffer.push(scale);
- self.buffer.extend_from_slice(&integer.to_le_bytes());
- }
-
- fn append_decimal16(&mut self, integer: i128, scale: u8) {
- self.buffer
- .push(primitive_header(VariantPrimitiveType::Decimal16));
- self.buffer.push(scale);
- self.buffer.extend_from_slice(&integer.to_le_bytes());
- }
-
- fn append_binary(&mut self, value: &[u8]) {
- self.buffer
- .push(primitive_header(VariantPrimitiveType::Binary));
- self.buffer
- .extend_from_slice(&(value.len() as u32).to_le_bytes());
- self.buffer.extend_from_slice(value);
- }
-
- fn append_short_string(&mut self, value: ShortString) {
- let inner = value.0;
- self.buffer.push(short_string_header(inner.len()));
- self.buffer.extend_from_slice(inner.as_bytes());
- }
-
- fn append_string(&mut self, value: &str) {
- self.buffer
- .push(primitive_header(VariantPrimitiveType::String));
- self.buffer
- .extend_from_slice(&(value.len() as u32).to_le_bytes());
- self.buffer.extend_from_slice(value.as_bytes());
- }
-
- /// Add key to dictionary, return its ID
- fn add_key(&mut self, key: &str) -> u32 {
- use std::collections::btree_map::Entry;
- match self.dict.entry(key.to_string()) {
- Entry::Occupied(entry) => *entry.get(),
- Entry::Vacant(entry) => {
- let id = self.dict_keys.len() as u32;
- entry.insert(id);
- self.dict_keys.push(key.to_string());
- id
- }
+ buffer: VariantBuffer::default(),
+ field_metadata_dictionary: FieldMetadataDictionary::default(),
}
}
- fn offset(&self) -> usize {
- self.buffer.len()
- }
-
/// Create an [`ListBuilder`] for creating [`Variant::List`] values.
///
/// See the examples on [`VariantBuilder`] for usage.
pub fn new_list(&mut self) -> ListBuilder {
- ListBuilder::new(self)
+ ListBuilder::new(&mut self.buffer, &mut self.field_metadata_dictionary)
Review Comment:
This is a very nice pattern
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]