tustvold commented on code in PR #1995:
URL: https://github.com/apache/arrow-rs/pull/1995#discussion_r912420266


##########
parquet/src/column/reader.rs:
##########
@@ -205,94 +197,74 @@ where
 
         // Read exhaustively all pages until we read all batch_size 
values/levels
         // or there are no more values/levels to read.
-        while max(values_read, levels_read) < batch_size {
+        while levels_read < batch_size {
             if !self.has_next()? {
                 break;
             }
 
             // Batch size for the current iteration
-            let iter_batch_size = {
-                // Compute approximate value based on values decoded so far
-                let mut adjusted_size = min(
-                    batch_size,
-                    (self.num_buffered_values - self.num_decoded_values) as 
usize,
-                );
-
-                // Adjust batch size by taking into account how much data there
-                // to read. As batch_size is also smaller than value and level
-                // slices (if available), this ensures that available space is 
not
-                // exceeded.
-                adjusted_size = min(adjusted_size, batch_size - values_read);
-                adjusted_size = min(adjusted_size, batch_size - levels_read);
-
-                adjusted_size
-            };
+            let iter_batch_size = (batch_size - levels_read)
+                .min((self.num_buffered_values - self.num_decoded_values) as 
usize);
 
             // If the field is required and non-repeated, there are no 
definition levels
-            let (num_def_levels, null_count) = match def_levels.as_mut() {
-                Some(levels) if self.descr.max_def_level() > 0 => {
+            let null_count = match self.descr.max_def_level() > 0 {
+                true => {
+                    let levels = def_levels
+                        .as_mut()
+                        .ok_or_else(|| general_err!("must specify definition 
levels"))?;
+
                     let num_def_levels = self
                         .def_level_decoder
                         .as_mut()
                         .expect("def_level_decoder be set")
-                        .read(*levels, levels_read..levels_read + 
iter_batch_size)?;
+                        .read(levels, levels_read..levels_read + 
iter_batch_size)?;
+
+                    if num_def_levels != iter_batch_size {
+                        return Err(general_err!("insufficient definition 
levels read from column - expected {}, got {}", iter_batch_size, 
num_def_levels));
+                    }
 
-                    let null_count = levels.count_nulls(
+                    levels.count_nulls(
                         levels_read..levels_read + num_def_levels,
                         self.descr.max_def_level(),
-                    );
-                    (num_def_levels, null_count)
+                    )
                 }
-                _ => (0, 0),
+                false => 0,
             };
 
-            let num_rep_levels = match rep_levels.as_mut() {
-                Some(levels) if self.descr.max_rep_level() > 0 => self
+            if self.descr.max_rep_level() > 0 {
+                let levels = rep_levels
+                    .as_mut()
+                    .ok_or_else(|| general_err!("must specify repetition 
levels"))?;
+
+                let rep_levels = self
                     .rep_level_decoder
                     .as_mut()
                     .expect("rep_level_decoder be set")
-                    .read(levels, levels_read..levels_read + iter_batch_size)?,
-                _ => 0,
-            };
+                    .read(levels, levels_read..levels_read + iter_batch_size)?;
 
-            // At this point we have read values, definition and repetition 
levels.
-            // If both definition and repetition levels are defined, their 
counts
-            // should be equal. Values count is always less or equal to 
definition levels.
-            if num_def_levels != 0
-                && num_rep_levels != 0

Review Comment:
   This is the source of #1997 - the value of 0 is used as a sentinel for no 
levels present, which prevents detecting the case of no actual values left



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to