alamb commented on code in PR #7935:
URL: https://github.com/apache/arrow-rs/pull/7935#discussion_r2213460119
##########
parquet-variant/src/builder.rs:
##########
@@ -999,8 +1000,17 @@ impl<'a> ListBuilder<'a> {
         let offset_size = int_size(data_size);

         // Get parent's buffer
+        let offset_shift = match &self.parent_state {

Review Comment:
   It might be nice if this were a function on ParentState, something like
   ```suggestion
        let offset_shift = self.parent_state.object_start_offset();
   ```

##########
parquet-variant/src/builder.rs:
##########
@@ -1028,18 +1038,28 @@ impl Drop for ListBuilder<'_> {
 pub struct ObjectBuilder<'a> {
     parent_state: ParentState<'a>,
     fields: IndexMap<u32, usize>, // (field_id, offset)
-    buffer: ValueBuffer,
+    /// the starting offset in the parent's buffer where this object starts
+    object_start_offset: usize,
+    /// whether the object has been finished, the written content of the current object
+    /// will be truncated in `drop` if `has_been_finished` is false
+    has_been_finished: bool,
     validate_unique_fields: bool,
     /// Set of duplicate fields to report for errors
     duplicate_fields: HashSet<u32>,
 }

 impl<'a> ObjectBuilder<'a> {
     fn new(parent_state: ParentState<'a>, validate_unique_fields: bool) -> Self {
+        let start_offset = match &parent_state {
+            ParentState::Variant { buffer, .. } => buffer.offset(),
+            ParentState::List { buffer, .. } => buffer.offset(),
+            ParentState::Object { buffer, .. } => buffer.offset(),
+        };

Review Comment:
   ```suggestion
        let start_offset = parent_state.buffer().offset();
   ```

##########
parquet-variant/src/builder.rs:
##########
@@ -1134,38 +1230,96 @@ impl<'a> ObjectBuilder<'a> {
             )));
         }

-        let data_size = self.buffer.offset();
-        let num_fields = self.fields.len();
-        let is_large = num_fields > u8::MAX as usize;
+        let metadata_builder = match &self.parent_state {
+            ParentState::Variant {
+                metadata_builder, ..
+            } => metadata_builder,
+            ParentState::List {
+                metadata_builder, ..
+            } => metadata_builder,
+            ParentState::Object {
+                metadata_builder, ..
+            } => metadata_builder,
+        };

         self.fields.sort_by(|&field_a_id, _, &field_b_id, _| {
-            let key_a = &metadata_builder.field_name(field_a_id as usize);
-            let key_b = &metadata_builder.field_name(field_b_id as usize);
-            key_a.cmp(key_b)
+            let field_a_name = metadata_builder.field_name(field_a_id as usize);
+            let field_b_name = metadata_builder.field_name(field_b_id as usize);
+            field_a_name.cmp(field_b_name)
         });

-        let max_id = self.fields.iter().map(|(i, _)| *i).max().unwrap_or(0);
-        let id_size = int_size(max_id as usize);
-        let offset_size = int_size(data_size);
-        // Get parent's buffer
         let parent_buffer = self.parent_state.buffer();
-        let starting_offset = parent_buffer.offset();
+        let current_offset = parent_buffer.offset();
+        // current object starts from `object_start_offset`
+        let data_size = current_offset - self.object_start_offset;
+        let offset_size = int_size(data_size);

-        // Write header
+        let num_fields = self.fields.len();
+        let is_large = num_fields > u8::MAX as usize;
+
+        let header_size = 1 + // header byte
+            (if is_large { 4 } else { 1 }) + // num_fields
+            (num_fields * id_size as usize) + // field IDs
+            ((num_fields + 1) * offset_size as usize); // field offsets + data_size
+
+        let starting_offset = self.object_start_offset;
+
+        // Shift existing data to make room for the header

Review Comment:
   As a follow-on PR we can consider avoiding this extra splice somehow (by preallocating the size or something).
   Future work, though.
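For context on why that splice is extra work, here is a small standalone illustration (not code from the PR) of what `Vec::splice` with an empty range does: it inserts the header bytes at the start offset and shifts every value byte already written after that point.

```rust
fn main() {
    // Pretend these four bytes are the already-encoded field values of the object.
    let mut buffer: Vec<u8> = vec![0xAA, 0xBB, 0xCC, 0xDD];
    let header_size = 2; // hypothetical header size for this toy example
    let starting_offset = 0; // the object starts at the front here, for simplicity

    // Same call shape as the diff above: insert `header_size` zero bytes at the
    // start offset. Everything after the insertion point is shifted right, which
    // is the copy the comment hopes a follow-on PR can avoid.
    buffer.splice(starting_offset..starting_offset, vec![0u8; header_size]);

    assert_eq!(buffer, vec![0x00, 0x00, 0xAA, 0xBB, 0xCC, 0xDD]);
    println!("{buffer:02X?}");
}
```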
##########
parquet-variant/src/builder.rs:
##########
@@ -1064,20 +1084,58 @@ impl<'a> ObjectBuilder<'a> {
         key: &str,
         value: T,
     ) -> Result<(), ArrowError> {
-        // Get metadata_builder from parent state
-        let metadata_builder = self.parent_state.metadata_builder();
+        match &mut self.parent_state {

Review Comment:
   Here is a proposal for how to avoid the duplication:
   - https://github.com/klion26/arrow-rs/pull/1

##########
parquet-variant/src/builder.rs:
##########
@@ -955,13 +967,48 @@ impl<'a> ObjectBuilder<'a> {
     // Returns validate_unique_fields because we can no longer reference self once this method returns.
     fn parent_state<'b>(&'b mut self, key: &'b str) -> (ParentState<'b>, bool) {
-        let state = ParentState::Object {
-            buffer: &mut self.buffer,
-            metadata_builder: self.parent_state.metadata_builder(),
-            fields: &mut self.fields,
-            field_name: key,
-        };
-        (state, self.validate_unique_fields)
+        let validate_unique_fields = self.validate_unique_fields;
+
+        match &mut self.parent_state {

Review Comment:
   I think we can use the pattern in this PR: https://github.com/klion26/arrow-rs/pull/1

##########
parquet-variant/src/builder.rs:
##########
@@ -1860,10 +2046,22 @@ mod tests {
         let inner_object_variant = outer_object.field(2).unwrap();
         let inner_object = inner_object_variant.as_object().unwrap();

-        assert_eq!(inner_object.len(), 1);
+        assert_eq!(inner_object.len(), 3);
         assert_eq!(inner_object.field_name(0).unwrap(), "b");
         assert_eq!(inner_object.field(0).unwrap(), Variant::from("a"));

+        let inner_iner_object_variant_c = inner_object.field(1).unwrap();
+        let inner_inner_object_c = inner_iner_object_variant_c.as_object().unwrap();
+        assert_eq!(inner_inner_object_c.len(), 1);
+        assert_eq!(inner_inner_object_c.field_name(0).unwrap(), "aa");
+        assert_eq!(inner_inner_object_c.field(0).unwrap(), Variant::from("bb"));
+
+        let inner_iner_object_variant_d = inner_object.field(2).unwrap();
+        let inner_inner_object_d = inner_iner_object_variant_d.as_object().unwrap();
+        assert_eq!(inner_inner_object_d.len(), 1);
+        assert_eq!(inner_inner_object_d.field_name(0).unwrap(), "cc");
+        assert_eq!(inner_inner_object_d.field(0).unwrap(), Variant::from("dd"));
+
         assert_eq!(outer_object.field_name(1).unwrap(), "b");
         assert_eq!(outer_object.field(1).unwrap(), Variant::from(true));
     }

Review Comment:
   I think we should also add tests for the rollback behavior (as in starting an ObjectBuilder but not calling finish).
   Similarly, we should test a list builder rollback too.

##########
parquet-variant/src/builder.rs:
##########
@@ -1134,38 +1230,96 @@ impl<'a> ObjectBuilder<'a> {
             )));
         }

-        let data_size = self.buffer.offset();
-        let num_fields = self.fields.len();
-        let is_large = num_fields > u8::MAX as usize;
+        let metadata_builder = match &self.parent_state {

Review Comment:
   It would be nice to put this into a method as well, rather than an inline match statement.
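To make the accessor suggestion concrete, here is a minimal, self-contained sketch of the pattern being proposed. The types are simplified stand-ins (the real `ParentState` variants hold mutable borrows and extra fields such as `fields` and `field_name`), and `metadata_builder`/`object_start_offset` are just the method names floated in the comments above; the actual signatures would be whatever the PR settles on.

```rust
// Simplified stand-ins for the PR's types, only to show the shape of the accessors.
struct MetadataBuilder;
struct ValueBuffer(Vec<u8>);

impl ValueBuffer {
    fn offset(&self) -> usize {
        self.0.len()
    }
}

#[allow(dead_code)]
enum ParentState {
    Variant { buffer: ValueBuffer, metadata_builder: MetadataBuilder },
    List { buffer: ValueBuffer, metadata_builder: MetadataBuilder },
    Object { buffer: ValueBuffer, metadata_builder: MetadataBuilder },
}

impl ParentState {
    // One three-armed match here replaces the inline matches at every call site.
    fn metadata_builder(&mut self) -> &mut MetadataBuilder {
        match self {
            ParentState::Variant { metadata_builder, .. }
            | ParentState::List { metadata_builder, .. }
            | ParentState::Object { metadata_builder, .. } => metadata_builder,
        }
    }

    // Offset in the parent's buffer where a nested object/list starts,
    // i.e. the `object_start_offset()` helper suggested in the first comment.
    fn object_start_offset(&self) -> usize {
        match self {
            ParentState::Variant { buffer, .. }
            | ParentState::List { buffer, .. }
            | ParentState::Object { buffer, .. } => buffer.offset(),
        }
    }
}

fn main() {
    let mut state = ParentState::Object {
        buffer: ValueBuffer(vec![1, 2, 3]),
        metadata_builder: MetadataBuilder,
    };
    let _mb = state.metadata_builder();
    assert_eq!(state.object_start_offset(), 3);
}
```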
##########
parquet-variant/src/builder.rs:
##########
@@ -1134,38 +1230,96 @@ impl<'a> ObjectBuilder<'a> {
             )));
         }

-        let data_size = self.buffer.offset();
-        let num_fields = self.fields.len();
-        let is_large = num_fields > u8::MAX as usize;
+        let metadata_builder = match &self.parent_state {
+            ParentState::Variant {
+                metadata_builder, ..
+            } => metadata_builder,
+            ParentState::List {
+                metadata_builder, ..
+            } => metadata_builder,
+            ParentState::Object {
+                metadata_builder, ..
+            } => metadata_builder,
+        };

         self.fields.sort_by(|&field_a_id, _, &field_b_id, _| {
-            let key_a = &metadata_builder.field_name(field_a_id as usize);
-            let key_b = &metadata_builder.field_name(field_b_id as usize);
-            key_a.cmp(key_b)
+            let field_a_name = metadata_builder.field_name(field_a_id as usize);
+            let field_b_name = metadata_builder.field_name(field_b_id as usize);
+            field_a_name.cmp(field_b_name)
         });

-        let max_id = self.fields.iter().map(|(i, _)| *i).max().unwrap_or(0);
-        let id_size = int_size(max_id as usize);
-        let offset_size = int_size(data_size);
-        // Get parent's buffer
         let parent_buffer = self.parent_state.buffer();
-        let starting_offset = parent_buffer.offset();
+        let current_offset = parent_buffer.offset();
+        // current object starts from `object_start_offset`
+        let data_size = current_offset - self.object_start_offset;
+        let offset_size = int_size(data_size);

-        // Write header
+        let num_fields = self.fields.len();
+        let is_large = num_fields > u8::MAX as usize;
+
+        let header_size = 1 + // header byte
+            (if is_large { 4 } else { 1 }) + // num_fields
+            (num_fields * id_size as usize) + // field IDs
+            ((num_fields + 1) * offset_size as usize); // field offsets + data_size
+
+        let starting_offset = self.object_start_offset;
+
+        // Shift existing data to make room for the header
+        let buffer = parent_buffer.inner_mut();
+        buffer.splice(starting_offset..starting_offset, vec![0u8; header_size]);
+
+        // Write header at the original start position
+        let mut header_pos = starting_offset;
+
+        // Write header byte
         let header = object_header(is_large, id_size, offset_size);
-        parent_buffer.append_header(header, is_large, num_fields);
+        buffer[header_pos] = header;
+        header_pos += 1;
+
+        // Write number of fields
+        if is_large {
+            buffer[header_pos..header_pos + 4].copy_from_slice(&(num_fields as u32).to_le_bytes());
+            header_pos += 4;
+        } else {
+            buffer[header_pos] = num_fields as u8;
+            header_pos += 1;
+        }

-        // Write field IDs (sorted order)
-        let ids = self.fields.keys().map(|id| *id as usize);
-        parent_buffer.append_offset_array(ids, None, id_size);
+        // Write field IDs
+        for (&field_id, _) in &self.fields {
+            let id_bytes = (field_id as usize).to_le_bytes();
+            buffer[header_pos..header_pos + id_size as usize]
+                .copy_from_slice(&id_bytes[..id_size as usize]);
+            header_pos += id_size as usize;
+        }

-        // Write the field offset array, followed by the value bytes
-        let offsets = std::mem::take(&mut self.fields).into_values();
-        parent_buffer.append_offset_array(offsets, Some(data_size), offset_size);
-        parent_buffer.append_slice(self.buffer.inner());
-        self.parent_state.finish(starting_offset);
+        // Write field offsets (adjusted for header)
+        for (_, &relative_offset) in &self.fields {
+            let offset_bytes = relative_offset.to_le_bytes();
+            buffer[header_pos..header_pos + offset_size as usize]
+                .copy_from_slice(&offset_bytes[..offset_size as usize]);
+            header_pos += offset_size as usize;
+        }
+
+        // Write data_size
+        let data_size_bytes = data_size.to_le_bytes();
+        buffer[header_pos..header_pos + offset_size as usize]
+            .copy_from_slice(&data_size_bytes[..offset_size as usize]);
+        let start_offset_shift = match &self.parent_state {

Review Comment:
   Here is another place we could use the method and avoid an inline match.
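Picking up the earlier comment about rollback coverage, here is a rough sketch of the kind of test that could be added. It is not from the PR; the builder entry points (`VariantBuilder::new`, `new_object`, `insert`, `finish`, `Variant::try_new`) are written from memory of the parquet-variant API at the time and should be checked against the crate before use.

```rust
use parquet_variant::{Variant, VariantBuilder};

#[test]
fn test_object_builder_rollback_on_drop() {
    let mut builder = VariantBuilder::new();
    let mut outer = builder.new_object();
    outer.insert("a", 1i32);

    {
        // Start a nested object but never call `finish()` on it; dropping it here
        // should roll back (truncate) whatever it wrote to the parent buffer.
        let mut abandoned = outer.new_object("nested");
        abandoned.insert("x", "should not appear");
    }

    outer.insert("b", true);
    outer.finish().unwrap();

    let (metadata, value) = builder.finish();
    let variant = Variant::try_new(&metadata, &value).unwrap();
    let obj = variant.as_object().unwrap();

    // Only the two finished fields survive; the abandoned nested object is gone.
    assert_eq!(obj.len(), 2);
    assert_eq!(obj.field_name(0).unwrap(), "a");
    assert_eq!(obj.field_name(1).unwrap(), "b");
}
```

A companion test could do the same with a ListBuilder: append a couple of values, drop the builder without finishing, and assert the parent output contains no trace of the abandoned list.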
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscr...@arrow.apache.org
For queries about this service, please contact Infrastructure at: us...@infra.apache.org