This is an automated email from the ASF dual-hosted git repository.
tustvold pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/master by this push:
new 0268bba4c Fix page size on dictionary fallback (#2854)
0268bba4c is described below
commit 0268bba4c01c2b83986c023258ad4405c29cabff
Author: Dan Harris <[email protected]>
AuthorDate: Mon Oct 10 04:25:30 2022 -0400
Fix page size on dictionary fallback (#2854)
* Fix page size on dictionary fallback
* Make test deterministic
* Comments and improve test
---
parquet/src/arrow/arrow_writer/byte_array.rs | 5 ++-
parquet/src/arrow/arrow_writer/mod.rs | 66 ++++++++++++++++++++++++++++
2 files changed, 70 insertions(+), 1 deletion(-)
diff --git a/parquet/src/arrow/arrow_writer/byte_array.rs
b/parquet/src/arrow/arrow_writer/byte_array.rs
index a25bd8d5c..9ea3767a2 100644
--- a/parquet/src/arrow/arrow_writer/byte_array.rs
+++ b/parquet/src/arrow/arrow_writer/byte_array.rs
@@ -551,7 +551,10 @@ where
match &mut encoder.dict_encoder {
Some(dict_encoder) => dict_encoder.encode(values, indices),
- None => encoder.fallback.encode(values, indices),
+ None => {
+ encoder.num_values += indices.len();
+ encoder.fallback.encode(values, indices)
+ }
}
}
diff --git a/parquet/src/arrow/arrow_writer/mod.rs
b/parquet/src/arrow/arrow_writer/mod.rs
index 2c3d498bc..b5c0b5012 100644
--- a/parquet/src/arrow/arrow_writer/mod.rs
+++ b/parquet/src/arrow/arrow_writer/mod.rs
@@ -624,6 +624,7 @@ mod tests {
use crate::basic::Encoding;
use crate::file::metadata::ParquetMetaData;
+ use crate::file::page_index::index_reader::read_pages_locations;
use crate::file::properties::WriterVersion;
use crate::file::{
reader::{FileReader, SerializedFileReader},
@@ -1108,6 +1109,71 @@ mod tests {
roundtrip(batch, Some(SMALL_SIZE / 2));
}
+ #[test]
+ fn arrow_writer_page_size() {
+ let schema =
+ Arc::new(Schema::new(vec![Field::new("col", DataType::Utf8,
false)]));
+
+ let mut builder = StringBuilder::with_capacity(100, 329 * 10_000);
+
+ // Generate an array of 10 unique 10-character strings
+ for i in 0..10 {
+ let value = i
+ .to_string()
+ .repeat(10)
+ .chars()
+ .take(10)
+ .collect::<String>();
+
+ builder.append_value(value);
+ }
+
+ let array = Arc::new(builder.finish());
+
+ let batch = RecordBatch::try_new(schema, vec![array]).unwrap();
+
+ let file = tempfile::tempfile().unwrap();
+
+ // Set everything very low so we fall back to PLAIN encoding after the
first row
+ let props = WriterProperties::builder()
+ .set_data_pagesize_limit(1)
+ .set_dictionary_pagesize_limit(1)
+ .set_write_batch_size(1)
+ .build();
+
+ let mut writer =
+ ArrowWriter::try_new(file.try_clone().unwrap(), batch.schema(),
Some(props))
+ .expect("Unable to write file");
+ writer.write(&batch).unwrap();
+ writer.close().unwrap();
+
+ let reader =
SerializedFileReader::new(file.try_clone().unwrap()).unwrap();
+
+ let column = reader.metadata().row_group(0).columns();
+
+ assert_eq!(column.len(), 1);
+
+ // We should write one row before falling back to PLAIN encoding so
there should still be a
+ // dictionary page.
+ assert!(
+ column[0].dictionary_page_offset().is_some(),
+ "Expected a dictionary page"
+ );
+
+ let page_locations = read_pages_locations(&file, column).unwrap();
+
+ let offset_index = page_locations[0].clone();
+
+ // We should fall back to PLAIN encoding after the first row and our
max page size is 1 byte
+ // so we expect one dictionary encoded page and then a page per row
thereafter.
+ assert_eq!(
+ offset_index.len(),
+ 10,
+ "Expected 10 pages but got {:#?}",
+ offset_index
+ );
+ }
+
const SMALL_SIZE: usize = 7;
fn roundtrip(