scovich commented on code in PR #7740:
URL: https://github.com/apache/arrow-rs/pull/7740#discussion_r2161422331
##########
parquet-variant/src/builder.rs:
##########
@@ -72,7 +74,182 @@ fn make_room_for_header(buffer: &mut Vec<u8>, start_pos:
usize, header_size: usi
buffer.copy_within(src_start..src_end, dst_start);
}
-/// Builder for [`Variant`] values
+#[derive(Default)]
+struct VariantBuffer(Vec<u8>);
+
+impl VariantBuffer {
+ fn append_null(&mut self) {
+ self.0.push(primitive_header(VariantPrimitiveType::Null));
+ }
+
+ fn append_bool(&mut self, value: bool) {
+ let primitive_type = if value {
+ VariantPrimitiveType::BooleanTrue
+ } else {
+ VariantPrimitiveType::BooleanFalse
+ };
+ self.0.push(primitive_header(primitive_type));
+ }
+
+ fn append_int8(&mut self, value: i8) {
+ self.0.push(primitive_header(VariantPrimitiveType::Int8));
+ self.0.push(value as u8);
+ }
+
+ fn append_int16(&mut self, value: i16) {
+ self.0.push(primitive_header(VariantPrimitiveType::Int16));
+ self.0.extend_from_slice(&value.to_le_bytes());
+ }
+
+ fn append_int32(&mut self, value: i32) {
+ self.0.push(primitive_header(VariantPrimitiveType::Int32));
+ self.0.extend_from_slice(&value.to_le_bytes());
+ }
+
+ fn append_int64(&mut self, value: i64) {
+ self.0.push(primitive_header(VariantPrimitiveType::Int64));
+ self.0.extend_from_slice(&value.to_le_bytes());
+ }
+
+ fn append_float(&mut self, value: f32) {
+ self.0.push(primitive_header(VariantPrimitiveType::Float));
+ self.0.extend_from_slice(&value.to_le_bytes());
+ }
+
+ fn append_double(&mut self, value: f64) {
+ self.0.push(primitive_header(VariantPrimitiveType::Double));
+ self.0.extend_from_slice(&value.to_le_bytes());
+ }
+
+ fn append_date(&mut self, value: chrono::NaiveDate) {
+ self.0.push(primitive_header(VariantPrimitiveType::Date));
+ let days_since_epoch =
value.signed_duration_since(UNIX_EPOCH_DATE).num_days() as i32;
+ self.0.extend_from_slice(&days_since_epoch.to_le_bytes());
+ }
+
+ fn append_timestamp_micros(&mut self, value:
chrono::DateTime<chrono::Utc>) {
+ self.0
+ .push(primitive_header(VariantPrimitiveType::TimestampMicros));
Review Comment:
nit: perhaps worth factoring out an `append_primitive_header` helper?
```suggestion
self.append_primitive_header(VariantPrimitiveType::TimestampMicros);
```
(similar for `append_from_slice`)
##########
parquet-variant/src/builder.rs:
##########
@@ -72,7 +72,177 @@ fn make_room_for_header(buffer: &mut Vec<u8>, start_pos:
usize, header_size: usi
buffer.copy_within(src_start..src_end, dst_start);
}
-/// Builder for [`Variant`] values
+#[derive(Default)]
+struct VariantBuffer(Vec<u8>);
+
+impl VariantBuffer {
+ fn append_null(&mut self) {
+ self.0.push(primitive_header(VariantPrimitiveType::Null));
+ }
+
+ fn append_bool(&mut self, value: bool) {
+ let primitive_type = if value {
+ VariantPrimitiveType::BooleanTrue
+ } else {
+ VariantPrimitiveType::BooleanFalse
+ };
+ self.0.push(primitive_header(primitive_type));
+ }
+
+ fn append_int8(&mut self, value: i8) {
+ self.0.push(primitive_header(VariantPrimitiveType::Int8));
+ self.0.push(value as u8);
+ }
+
+ fn append_int16(&mut self, value: i16) {
+ self.0.push(primitive_header(VariantPrimitiveType::Int16));
+ self.0.extend_from_slice(&value.to_le_bytes());
+ }
+
+ fn append_int32(&mut self, value: i32) {
+ self.0.push(primitive_header(VariantPrimitiveType::Int32));
+ self.0.extend_from_slice(&value.to_le_bytes());
+ }
+
+ fn append_int64(&mut self, value: i64) {
+ self.0.push(primitive_header(VariantPrimitiveType::Int64));
+ self.0.extend_from_slice(&value.to_le_bytes());
+ }
+
+ fn append_float(&mut self, value: f32) {
+ self.0.push(primitive_header(VariantPrimitiveType::Float));
+ self.0.extend_from_slice(&value.to_le_bytes());
+ }
+
+ fn append_double(&mut self, value: f64) {
+ self.0.push(primitive_header(VariantPrimitiveType::Double));
+ self.0.extend_from_slice(&value.to_le_bytes());
+ }
+
+ fn append_date(&mut self, value: chrono::NaiveDate) {
+ self.0.push(primitive_header(VariantPrimitiveType::Date));
+ let days_since_epoch =
value.signed_duration_since(UNIX_EPOCH_DATE).num_days() as i32;
+ self.0.extend_from_slice(&days_since_epoch.to_le_bytes());
+ }
+
+ fn append_timestamp_micros(&mut self, value:
chrono::DateTime<chrono::Utc>) {
+ self.0
+ .push(primitive_header(VariantPrimitiveType::TimestampMicros));
+ let micros = value.timestamp_micros();
+ self.0.extend_from_slice(µs.to_le_bytes());
+ }
+
+ fn append_timestamp_ntz_micros(&mut self, value: chrono::NaiveDateTime) {
+ self.0
+ .push(primitive_header(VariantPrimitiveType::TimestampNtzMicros));
+ let micros = value.and_utc().timestamp_micros();
+ self.0.extend_from_slice(µs.to_le_bytes());
+ }
+
+ fn append_decimal4(&mut self, integer: i32, scale: u8) {
+ self.0
+ .push(primitive_header(VariantPrimitiveType::Decimal4));
+ self.0.push(scale);
+ self.0.extend_from_slice(&integer.to_le_bytes());
+ }
+
+ fn append_decimal8(&mut self, integer: i64, scale: u8) {
+ self.0
+ .push(primitive_header(VariantPrimitiveType::Decimal8));
+ self.0.push(scale);
+ self.0.extend_from_slice(&integer.to_le_bytes());
+ }
+
+ fn append_decimal16(&mut self, integer: i128, scale: u8) {
+ self.0
+ .push(primitive_header(VariantPrimitiveType::Decimal16));
+ self.0.push(scale);
+ self.0.extend_from_slice(&integer.to_le_bytes());
+ }
+
+ fn append_binary(&mut self, value: &[u8]) {
+ self.0.push(primitive_header(VariantPrimitiveType::Binary));
+ self.0
+ .extend_from_slice(&(value.len() as u32).to_le_bytes());
+ self.0.extend_from_slice(value);
+ }
+
+ fn append_short_string(&mut self, value: ShortString) {
+ let inner = value.0;
+ self.0.push(short_string_header(inner.len()));
+ self.0.extend_from_slice(inner.as_bytes());
+ }
+
+ fn append_string(&mut self, value: &str) {
+ self.0.push(primitive_header(VariantPrimitiveType::String));
+ self.0
+ .extend_from_slice(&(value.len() as u32).to_le_bytes());
+ self.0.extend_from_slice(value.as_bytes());
+ }
+
+ fn offset(&self) -> usize {
+ self.0.len()
+ }
+
+ fn append_value<'m, 'd, T: Into<Variant<'m, 'd>>>(&mut self, value: T) {
+ let variant = value.into();
+ match variant {
+ Variant::Null => self.append_null(),
+ Variant::BooleanTrue => self.append_bool(true),
+ Variant::BooleanFalse => self.append_bool(false),
+ Variant::Int8(v) => self.append_int8(v),
+ Variant::Int16(v) => self.append_int16(v),
+ Variant::Int32(v) => self.append_int32(v),
+ Variant::Int64(v) => self.append_int64(v),
+ Variant::Date(v) => self.append_date(v),
+ Variant::TimestampMicros(v) => self.append_timestamp_micros(v),
+ Variant::TimestampNtzMicros(v) =>
self.append_timestamp_ntz_micros(v),
+ Variant::Decimal4 { integer, scale } =>
self.append_decimal4(integer, scale),
+ Variant::Decimal8 { integer, scale } =>
self.append_decimal8(integer, scale),
+ Variant::Decimal16 { integer, scale } =>
self.append_decimal16(integer, scale),
+ Variant::Float(v) => self.append_float(v),
+ Variant::Double(v) => self.append_double(v),
+ Variant::Binary(v) => self.append_binary(v),
+ Variant::String(s) => self.append_string(s),
+ Variant::ShortString(s) => self.append_short_string(s),
+ Variant::Object(_) | Variant::List(_) => {
+ unreachable!("Object and List variants cannot be created
through Into<Variant>")
+ }
+ }
+ }
+}
+
+#[derive(Default)]
+struct FieldMetadataDictionary {
+ field_name_to_id: BTreeMap<String, u32>,
+ field_names: Vec<String>,
Review Comment:
What purpose does this vec of strings serve? Seems like the map is all we
need?
(it looks like all the immediately obvious uses of `field_names` can be
replaced by similar operations over the map instead)
##########
parquet-variant/src/builder.rs:
##########
@@ -72,7 +72,177 @@ fn make_room_for_header(buffer: &mut Vec<u8>, start_pos:
usize, header_size: usi
buffer.copy_within(src_start..src_end, dst_start);
}
-/// Builder for [`Variant`] values
+#[derive(Default)]
+struct VariantBuffer(Vec<u8>);
+
+impl VariantBuffer {
+ fn append_null(&mut self) {
+ self.0.push(primitive_header(VariantPrimitiveType::Null));
+ }
+
+ fn append_bool(&mut self, value: bool) {
+ let primitive_type = if value {
+ VariantPrimitiveType::BooleanTrue
+ } else {
+ VariantPrimitiveType::BooleanFalse
+ };
+ self.0.push(primitive_header(primitive_type));
+ }
+
+ fn append_int8(&mut self, value: i8) {
+ self.0.push(primitive_header(VariantPrimitiveType::Int8));
+ self.0.push(value as u8);
+ }
+
+ fn append_int16(&mut self, value: i16) {
+ self.0.push(primitive_header(VariantPrimitiveType::Int16));
+ self.0.extend_from_slice(&value.to_le_bytes());
+ }
+
+ fn append_int32(&mut self, value: i32) {
+ self.0.push(primitive_header(VariantPrimitiveType::Int32));
+ self.0.extend_from_slice(&value.to_le_bytes());
+ }
+
+ fn append_int64(&mut self, value: i64) {
+ self.0.push(primitive_header(VariantPrimitiveType::Int64));
+ self.0.extend_from_slice(&value.to_le_bytes());
+ }
+
+ fn append_float(&mut self, value: f32) {
+ self.0.push(primitive_header(VariantPrimitiveType::Float));
+ self.0.extend_from_slice(&value.to_le_bytes());
+ }
+
+ fn append_double(&mut self, value: f64) {
+ self.0.push(primitive_header(VariantPrimitiveType::Double));
+ self.0.extend_from_slice(&value.to_le_bytes());
+ }
+
+ fn append_date(&mut self, value: chrono::NaiveDate) {
+ self.0.push(primitive_header(VariantPrimitiveType::Date));
+ let days_since_epoch =
value.signed_duration_since(UNIX_EPOCH_DATE).num_days() as i32;
+ self.0.extend_from_slice(&days_since_epoch.to_le_bytes());
+ }
+
+ fn append_timestamp_micros(&mut self, value:
chrono::DateTime<chrono::Utc>) {
+ self.0
+ .push(primitive_header(VariantPrimitiveType::TimestampMicros));
+ let micros = value.timestamp_micros();
+ self.0.extend_from_slice(µs.to_le_bytes());
+ }
+
+ fn append_timestamp_ntz_micros(&mut self, value: chrono::NaiveDateTime) {
+ self.0
+ .push(primitive_header(VariantPrimitiveType::TimestampNtzMicros));
+ let micros = value.and_utc().timestamp_micros();
+ self.0.extend_from_slice(µs.to_le_bytes());
+ }
+
+ fn append_decimal4(&mut self, integer: i32, scale: u8) {
+ self.0
+ .push(primitive_header(VariantPrimitiveType::Decimal4));
+ self.0.push(scale);
+ self.0.extend_from_slice(&integer.to_le_bytes());
+ }
+
+ fn append_decimal8(&mut self, integer: i64, scale: u8) {
+ self.0
+ .push(primitive_header(VariantPrimitiveType::Decimal8));
+ self.0.push(scale);
+ self.0.extend_from_slice(&integer.to_le_bytes());
+ }
+
+ fn append_decimal16(&mut self, integer: i128, scale: u8) {
+ self.0
+ .push(primitive_header(VariantPrimitiveType::Decimal16));
+ self.0.push(scale);
+ self.0.extend_from_slice(&integer.to_le_bytes());
+ }
+
+ fn append_binary(&mut self, value: &[u8]) {
+ self.0.push(primitive_header(VariantPrimitiveType::Binary));
+ self.0
+ .extend_from_slice(&(value.len() as u32).to_le_bytes());
+ self.0.extend_from_slice(value);
+ }
+
+ fn append_short_string(&mut self, value: ShortString) {
+ let inner = value.0;
+ self.0.push(short_string_header(inner.len()));
+ self.0.extend_from_slice(inner.as_bytes());
+ }
+
+ fn append_string(&mut self, value: &str) {
+ self.0.push(primitive_header(VariantPrimitiveType::String));
+ self.0
+ .extend_from_slice(&(value.len() as u32).to_le_bytes());
+ self.0.extend_from_slice(value.as_bytes());
+ }
+
+ fn offset(&self) -> usize {
+ self.0.len()
+ }
+
+ fn append_value<'m, 'd, T: Into<Variant<'m, 'd>>>(&mut self, value: T) {
+ let variant = value.into();
+ match variant {
+ Variant::Null => self.append_null(),
+ Variant::BooleanTrue => self.append_bool(true),
+ Variant::BooleanFalse => self.append_bool(false),
+ Variant::Int8(v) => self.append_int8(v),
+ Variant::Int16(v) => self.append_int16(v),
+ Variant::Int32(v) => self.append_int32(v),
+ Variant::Int64(v) => self.append_int64(v),
+ Variant::Date(v) => self.append_date(v),
+ Variant::TimestampMicros(v) => self.append_timestamp_micros(v),
+ Variant::TimestampNtzMicros(v) =>
self.append_timestamp_ntz_micros(v),
+ Variant::Decimal4 { integer, scale } =>
self.append_decimal4(integer, scale),
+ Variant::Decimal8 { integer, scale } =>
self.append_decimal8(integer, scale),
+ Variant::Decimal16 { integer, scale } =>
self.append_decimal16(integer, scale),
+ Variant::Float(v) => self.append_float(v),
+ Variant::Double(v) => self.append_double(v),
+ Variant::Binary(v) => self.append_binary(v),
+ Variant::String(s) => self.append_string(s),
+ Variant::ShortString(s) => self.append_short_string(s),
+ Variant::Object(_) | Variant::List(_) => {
+ unreachable!("Object and List variants cannot be created
through Into<Variant>")
+ }
+ }
+ }
+}
+
+#[derive(Default)]
+struct FieldMetadataDictionary {
+ field_name_to_id: BTreeMap<String, u32>,
+ field_names: Vec<String>,
+}
+
+impl FieldMetadataDictionary {
+ /// Add field name to dictionary, return its ID
+ fn add_field_name(&mut self, field_name: &str) -> u32 {
Review Comment:
Not really "add"? More of an "upsert" or "ensure" ?
##########
parquet-variant/src/builder.rs:
##########
@@ -72,7 +72,177 @@ fn make_room_for_header(buffer: &mut Vec<u8>, start_pos:
usize, header_size: usi
buffer.copy_within(src_start..src_end, dst_start);
}
-/// Builder for [`Variant`] values
+#[derive(Default)]
+struct VariantBuffer(Vec<u8>);
+
+impl VariantBuffer {
+ fn append_null(&mut self) {
+ self.0.push(primitive_header(VariantPrimitiveType::Null));
+ }
+
+ fn append_bool(&mut self, value: bool) {
+ let primitive_type = if value {
+ VariantPrimitiveType::BooleanTrue
+ } else {
+ VariantPrimitiveType::BooleanFalse
+ };
+ self.0.push(primitive_header(primitive_type));
+ }
+
+ fn append_int8(&mut self, value: i8) {
+ self.0.push(primitive_header(VariantPrimitiveType::Int8));
+ self.0.push(value as u8);
+ }
+
+ fn append_int16(&mut self, value: i16) {
+ self.0.push(primitive_header(VariantPrimitiveType::Int16));
+ self.0.extend_from_slice(&value.to_le_bytes());
+ }
+
+ fn append_int32(&mut self, value: i32) {
+ self.0.push(primitive_header(VariantPrimitiveType::Int32));
+ self.0.extend_from_slice(&value.to_le_bytes());
+ }
+
+ fn append_int64(&mut self, value: i64) {
+ self.0.push(primitive_header(VariantPrimitiveType::Int64));
+ self.0.extend_from_slice(&value.to_le_bytes());
+ }
+
+ fn append_float(&mut self, value: f32) {
+ self.0.push(primitive_header(VariantPrimitiveType::Float));
+ self.0.extend_from_slice(&value.to_le_bytes());
+ }
+
+ fn append_double(&mut self, value: f64) {
+ self.0.push(primitive_header(VariantPrimitiveType::Double));
+ self.0.extend_from_slice(&value.to_le_bytes());
+ }
+
+ fn append_date(&mut self, value: chrono::NaiveDate) {
+ self.0.push(primitive_header(VariantPrimitiveType::Date));
+ let days_since_epoch =
value.signed_duration_since(UNIX_EPOCH_DATE).num_days() as i32;
+ self.0.extend_from_slice(&days_since_epoch.to_le_bytes());
+ }
+
+ fn append_timestamp_micros(&mut self, value:
chrono::DateTime<chrono::Utc>) {
+ self.0
+ .push(primitive_header(VariantPrimitiveType::TimestampMicros));
+ let micros = value.timestamp_micros();
+ self.0.extend_from_slice(µs.to_le_bytes());
+ }
+
+ fn append_timestamp_ntz_micros(&mut self, value: chrono::NaiveDateTime) {
+ self.0
+ .push(primitive_header(VariantPrimitiveType::TimestampNtzMicros));
+ let micros = value.and_utc().timestamp_micros();
+ self.0.extend_from_slice(µs.to_le_bytes());
+ }
+
+ fn append_decimal4(&mut self, integer: i32, scale: u8) {
+ self.0
+ .push(primitive_header(VariantPrimitiveType::Decimal4));
+ self.0.push(scale);
+ self.0.extend_from_slice(&integer.to_le_bytes());
+ }
+
+ fn append_decimal8(&mut self, integer: i64, scale: u8) {
+ self.0
+ .push(primitive_header(VariantPrimitiveType::Decimal8));
+ self.0.push(scale);
+ self.0.extend_from_slice(&integer.to_le_bytes());
+ }
+
+ fn append_decimal16(&mut self, integer: i128, scale: u8) {
+ self.0
+ .push(primitive_header(VariantPrimitiveType::Decimal16));
+ self.0.push(scale);
+ self.0.extend_from_slice(&integer.to_le_bytes());
+ }
+
+ fn append_binary(&mut self, value: &[u8]) {
+ self.0.push(primitive_header(VariantPrimitiveType::Binary));
+ self.0
+ .extend_from_slice(&(value.len() as u32).to_le_bytes());
+ self.0.extend_from_slice(value);
+ }
+
+ fn append_short_string(&mut self, value: ShortString) {
+ let inner = value.0;
+ self.0.push(short_string_header(inner.len()));
+ self.0.extend_from_slice(inner.as_bytes());
+ }
+
+ fn append_string(&mut self, value: &str) {
+ self.0.push(primitive_header(VariantPrimitiveType::String));
+ self.0
+ .extend_from_slice(&(value.len() as u32).to_le_bytes());
+ self.0.extend_from_slice(value.as_bytes());
+ }
+
+ fn offset(&self) -> usize {
+ self.0.len()
+ }
+
+ fn append_value<'m, 'd, T: Into<Variant<'m, 'd>>>(&mut self, value: T) {
+ let variant = value.into();
+ match variant {
+ Variant::Null => self.append_null(),
+ Variant::BooleanTrue => self.append_bool(true),
+ Variant::BooleanFalse => self.append_bool(false),
+ Variant::Int8(v) => self.append_int8(v),
+ Variant::Int16(v) => self.append_int16(v),
+ Variant::Int32(v) => self.append_int32(v),
+ Variant::Int64(v) => self.append_int64(v),
+ Variant::Date(v) => self.append_date(v),
+ Variant::TimestampMicros(v) => self.append_timestamp_micros(v),
+ Variant::TimestampNtzMicros(v) =>
self.append_timestamp_ntz_micros(v),
+ Variant::Decimal4 { integer, scale } =>
self.append_decimal4(integer, scale),
+ Variant::Decimal8 { integer, scale } =>
self.append_decimal8(integer, scale),
+ Variant::Decimal16 { integer, scale } =>
self.append_decimal16(integer, scale),
+ Variant::Float(v) => self.append_float(v),
+ Variant::Double(v) => self.append_double(v),
+ Variant::Binary(v) => self.append_binary(v),
+ Variant::String(s) => self.append_string(s),
+ Variant::ShortString(s) => self.append_short_string(s),
+ Variant::Object(_) | Variant::List(_) => {
+ unreachable!("Object and List variants cannot be created
through Into<Variant>")
+ }
+ }
+ }
+}
+
+#[derive(Default)]
+struct FieldMetadataDictionary {
+ field_name_to_id: BTreeMap<String, u32>,
+ field_names: Vec<String>,
+}
+
+impl FieldMetadataDictionary {
+ /// Add field name to dictionary, return its ID
+ fn add_field_name(&mut self, field_name: &str) -> u32 {
+ use std::collections::btree_map::Entry;
+ match self.field_name_to_id.entry(field_name.to_string()) {
+ Entry::Occupied(entry) => *entry.get(),
+ Entry::Vacant(entry) => {
+ let id = self.field_names.len() as u32;
+ entry.insert(id);
+ self.field_names.push(field_name.to_string());
+ id
+ }
+ }
+ }
Review Comment:
If we get rid of `field_names`, then I believe this simplifies to just
```suggestion
*self
.field_name_to_id
.entry(field_name.to_string())
.or_insert(self.field_names.len() as u32)
```
##########
parquet-variant/src/builder.rs:
##########
@@ -72,7 +72,183 @@ fn make_room_for_header(buffer: &mut Vec<u8>, start_pos:
usize, header_size: usi
buffer.copy_within(src_start..src_end, dst_start);
}
-/// Builder for [`Variant`] values
+#[derive(Default)]
+struct ValueBuffer(Vec<u8>);
Review Comment:
Adding `impl Deref` and `impl DerefMut` to the underlying vec might simplify
a lot of code -- if it's ok to expose it that way?
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]