Jefffrey commented on code in PR #8069:
URL: https://github.com/apache/arrow-rs/pull/8069#discussion_r2678607159
##########
parquet/src/arrow/arrow_writer/mod.rs:
##########
@@ -1122,6 +1122,17 @@ impl ArrowColumnWriterFactory {
                ArrowDataType::FixedSizeBinary(_) => out.push(bytes(leaves.next().unwrap())?),
                _ => out.push(col(leaves.next().unwrap())?),
            },
+            ArrowDataType::RunEndEncoded(_, value_type) => match value_type.data_type() {
+                ArrowDataType::Utf8
+                | ArrowDataType::LargeUtf8
+                | ArrowDataType::Binary
+                | ArrowDataType::LargeBinary => out.push(bytes(leaves.next().unwrap())?),
+                ArrowDataType::Utf8View | ArrowDataType::BinaryView => {
+                    out.push(bytes(leaves.next().unwrap())?)
+                }
+                ArrowDataType::FixedSizeBinary(_) => out.push(bytes(leaves.next().unwrap())?),
Review Comment:
```suggestion
                | ArrowDataType::LargeBinary
                | ArrowDataType::Utf8View
                | ArrowDataType::BinaryView
                | ArrowDataType::FixedSizeBinary(_) => out.push(bytes(leaves.next().unwrap())?),
```
##########
parquet/src/arrow/arrow_writer/mod.rs:
##########
@@ -4481,4 +4624,408 @@ mod tests {
assert_eq!(get_dict_page_size(col0_meta), 1024 * 1024);
assert_eq!(get_dict_page_size(col1_meta), 1024 * 1024 * 4);
}
+
+ #[test]
+ fn arrow_writer_run_end_encoded_string() {
Review Comment:
These tests have lots of duplicated code; it would be nice to pull the common setup into a helper so it's easier to see where the tests differ.
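For example, the write/read round-trip below could live in a small helper along these lines (the name and signature are illustrative, not part of the PR), leaving each test to build its own run array and make its own assertions:

```rust
// Hypothetical test helper: write the batch to an in-memory parquet buffer
// and hand back the reader builder so each test can inspect the metadata and
// schema and decode the batches itself.
fn roundtrip_run_array(batch: RecordBatch) -> ParquetRecordBatchReaderBuilder<Bytes> {
    let mut parquet_bytes: Vec<u8> = Vec::new();
    let mut writer = ArrowWriter::try_new(&mut parquet_bytes, batch.schema(), None).unwrap();
    writer.write(&batch).unwrap();
    writer.close().unwrap();
    ParquetRecordBatchReaderBuilder::try_new(Bytes::from(parquet_bytes)).unwrap()
}
```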
##########
parquet/src/arrow/arrow_writer/byte_array.rs:
##########
@@ -92,6 +114,26 @@ macro_rules! downcast_op {
            }
            d => unreachable!("cannot downcast {} dictionary value to byte array", d),
        },
+        DataType::RunEndEncoded(run_end, value) => match value.data_type() {
+            DataType::Utf8 => downcast_ree_op!(run_end, StringArray, $array, $op$(, $arg)*),
+            DataType::LargeUtf8 => {
+                downcast_ree_op!(run_end, LargeStringArray, $array, $op$(, $arg)*)
+            }
+            DataType::Utf8View => {
Review Comment:
Probably not for this PR, but I notice the dictionary arms above don't have
string/binary views 🤔
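For reference, a rough sketch of what those arms could look like, assuming the existing `Dictionary` arms dispatch through a `downcast_dict_op!` macro analogous to `downcast_ree_op!` (illustrative only, not something this PR needs to add):

```rust
// Hypothetical extra arms for the Dictionary branch of downcast_op!
// (assumes a downcast_dict_op! macro mirroring downcast_ree_op!):
DataType::Utf8View => downcast_dict_op!(key, StringViewArray, $array, $op$(, $arg)*),
DataType::BinaryView => downcast_dict_op!(key, BinaryViewArray, $array, $op$(, $arg)*),
```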
##########
parquet/src/arrow/arrow_writer/mod.rs:
##########
@@ -4481,4 +4624,408 @@ mod tests {
assert_eq!(get_dict_page_size(col0_meta), 1024 * 1024);
assert_eq!(get_dict_page_size(col1_meta), 1024 * 1024 * 4);
}
+
+ #[test]
+ fn arrow_writer_run_end_encoded_string() {
+ // Create a run array of strings
+ let mut builder = StringRunBuilder::<Int32Type>::new();
+ builder.extend(
+ vec![Some("alpha"); 100000]
+ .into_iter()
+ .chain(vec![Some("beta"); 100000]),
+ );
+ let run_array: RunArray<Int32Type> = builder.finish();
+ let schema = Arc::new(Schema::new(vec![Field::new(
+ "ree",
+ run_array.data_type().clone(),
+ run_array.is_nullable(),
+ )]));
+
+ // Write to parquet
+ let mut parquet_bytes: Vec<u8> = Vec::new();
+        let mut writer = ArrowWriter::try_new(&mut parquet_bytes, schema.clone(), None).unwrap();
+        let batch = RecordBatch::try_new(schema.clone(), vec![Arc::new(run_array)]).unwrap();
+ writer.write(&batch).unwrap();
+ writer.close().unwrap();
+
+ // Read back and verify
+ let bytes = Bytes::from(parquet_bytes);
+ let reader = ParquetRecordBatchReaderBuilder::try_new(bytes).unwrap();
+
+ // Check if dictionary was used by examining the metadata
+ let metadata = reader.metadata();
+ let row_group = &metadata.row_groups()[0];
+ let col_meta = &row_group.columns()[0];
+
+ // If dictionary encoding worked, we should see RLE_DICTIONARY encoding
+ // and have a dictionary page offset
+        let has_dict_encoding = col_meta.encodings().any(|e| e == Encoding::RLE_DICTIONARY);
+ let has_dict_page = col_meta.dictionary_page_offset().is_some();
+
+ // Verify the schema is REE encoded when we read it back
+        let expected_schema = Arc::new(Schema::new(vec![Field::new(
+            "ree",
+            DataType::RunEndEncoded(
+                Arc::new(Field::new("run_ends", arrow_schema::DataType::Int32, false)),
+                Arc::new(Field::new("values", arrow_schema::DataType::Utf8, true)),
+            ),
+            false,
+        )]));
+ assert_eq!(&expected_schema, reader.schema());
+
+ // Read the data back
+ let batches: Vec<_> = reader
+ .build()
+ .unwrap()
+ .collect::<ArrowResult<Vec<_>>>()
+ .unwrap();
+ assert_eq!(batches.len(), 196);
Review Comment:
Personally I'd omit the batches len check, since I don't think it's particularly relevant to what we're testing here.
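If some size check is still wanted, asserting on the total row count instead would keep the test independent of how the reader splits the output into batches (a sketch, using the 2 × 100,000 rows built above):

```rust
// Illustrative alternative: the total row count is stable regardless of the
// reader's batch size (100_000 "alpha" + 100_000 "beta" rows written above).
let total_rows: usize = batches.iter().map(|b| b.num_rows()).sum();
assert_eq!(total_rows, 200_000);
```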
##########
parquet/src/arrow/arrow_writer/mod.rs:
##########
@@ -1371,6 +1445,75 @@ fn write_leaf(writer: &mut ColumnWriter<'_>, levels: &ArrayLevels) -> Result<usi
                    let array = column.as_primitive::<Float16Type>();
                    get_float_16_array_slice(array, indices)
                }
+                ArrowDataType::RunEndEncoded(_run_ends, values_field) => {
+                    match values_field.data_type() {
+                        ArrowDataType::Decimal32(_, _) => {
+                            let array = arrow_cast::cast(column, values_field.data_type())?;
+                            let array = array.as_primitive::<Decimal32Type>();
+                            get_decimal_32_array_slice(array, indices)
+                        }
+                        ArrowDataType::Decimal64(_, _) => {
+                            let array = arrow_cast::cast(column, values_field.data_type())?;
+                            let array = array.as_primitive::<Decimal64Type>();
+                            get_decimal_64_array_slice(array, indices)
+                        }
+                        ArrowDataType::Decimal128(_, _) => {
+                            let array = arrow_cast::cast(column, values_field.data_type())?;
+                            let array = array.as_primitive::<Decimal128Type>();
+                            get_decimal_128_array_slice(array, indices)
+                        }
+                        ArrowDataType::Decimal256(_, _) => {
+                            let array = arrow_cast::cast(column, values_field.data_type())?;
+                            let array = array
+                                .as_any()
+                                .downcast_ref::<arrow_array::Decimal256Array>()
+                                .unwrap();
Review Comment:
```suggestion
                            let array = array.as_primitive::<Decimal256Type>();
```
Just some small simplifications we can do (also keeps it consistent with
32/64/128 above)
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]