This is an automated email from the ASF dual-hosted git repository.

thinkharderdev pushed a commit to branch issue-2853
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git

commit a85568bb395edb13dca02a80ea382643125cbb5f
Author: Dan Harris <[email protected]>
AuthorDate: Sun Oct 9 09:42:07 2022 -0400

    Fix page size on dictionary fallback
---
 parquet/src/arrow/arrow_writer/byte_array.rs |  5 ++-
 parquet/src/arrow/arrow_writer/mod.rs        | 51 ++++++++++++++++++++++++++++
 2 files changed, 55 insertions(+), 1 deletion(-)

diff --git a/parquet/src/arrow/arrow_writer/byte_array.rs b/parquet/src/arrow/arrow_writer/byte_array.rs
index a25bd8d5c..9ea3767a2 100644
--- a/parquet/src/arrow/arrow_writer/byte_array.rs
+++ b/parquet/src/arrow/arrow_writer/byte_array.rs
@@ -551,7 +551,10 @@ where
 
     match &mut encoder.dict_encoder {
         Some(dict_encoder) => dict_encoder.encode(values, indices),
-        None => encoder.fallback.encode(values, indices),
+        None => {
+            encoder.num_values += indices.len();
+            encoder.fallback.encode(values, indices)
+        }
     }
 }
 
diff --git a/parquet/src/arrow/arrow_writer/mod.rs b/parquet/src/arrow/arrow_writer/mod.rs
index 2c3d498bc..a872c5f85 100644
--- a/parquet/src/arrow/arrow_writer/mod.rs
+++ b/parquet/src/arrow/arrow_writer/mod.rs
@@ -609,6 +609,7 @@ mod tests {
     use super::*;
 
     use bytes::Bytes;
+    use rand::{thread_rng, Rng};
     use std::fs::File;
     use std::sync::Arc;
 
@@ -624,6 +625,7 @@ mod tests {
 
     use crate::basic::Encoding;
     use crate::file::metadata::ParquetMetaData;
+    use crate::file::page_index::index_reader::read_pages_locations;
     use crate::file::properties::WriterVersion;
     use crate::file::{
         reader::{FileReader, SerializedFileReader},
@@ -1108,6 +1110,55 @@ mod tests {
         roundtrip(batch, Some(SMALL_SIZE / 2));
     }
 
+    #[test]
+    fn arrow_writer_page_size() {
+        let mut rng = thread_rng();
+        let schema =
+            Arc::new(Schema::new(vec![Field::new("col", DataType::Utf8, false)]));
+
+        let mut builder = StringBuilder::with_capacity(1_000, 2 * 1_000);
+
+        for _ in 0..10_000 {
+            let value = (0..200)
+                .map(|_| rng.gen_range(b'a'..=b'z') as char)
+                .collect::<String>();
+
+            builder.append_value(value);
+        }
+
+        let array = Arc::new(builder.finish());
+
+        let batch = RecordBatch::try_new(schema, vec![array]).unwrap();
+
+        let file = tempfile::tempfile().unwrap();
+
+        let props = WriterProperties::builder()
+            .set_max_row_group_size(usize::MAX)
+            .set_data_pagesize_limit(256)
+            .build();
+
+        let mut writer =
+            ArrowWriter::try_new(file.try_clone().unwrap(), batch.schema(), Some(props))
+                .expect("Unable to write file");
+        writer.write(&batch).unwrap();
+        writer.close().unwrap();
+
+        let reader = SerializedFileReader::new(file.try_clone().unwrap()).unwrap();
+
+        let column = reader.metadata().row_group(0).columns();
+
+        let page_locations = read_pages_locations(&file, column).unwrap();
+
+        let offset_index = page_locations[0].clone();
+
+        assert_eq!(
+            offset_index.len(),
+            5,
+            "Expected 5 pages but got {:#?}",
+            offset_index
+        );
+    }
+
     const SMALL_SIZE: usize = 7;
 
     fn roundtrip(

Reply via email to