scovich commented on code in PR #7934:
URL: https://github.com/apache/arrow-rs/pull/7934#discussion_r2214111555


##########
parquet-variant/src/variant/metadata.rs:
##########
@@ -234,44 +234,58 @@ impl<'m> VariantMetadata<'m> {
                 self.header.first_offset_byte() as _..self.first_value_byte as 
_,
             )?;
 
-            let offsets =
-                map_bytes_to_offsets(offset_bytes, 
self.header.offset_size).collect::<Vec<_>>();
-
             // Verify the string values in the dictionary are UTF-8 encoded 
strings.
             let value_buffer =
                 string_from_slice(self.bytes, 0, self.first_value_byte as 
_..self.bytes.len())?;
 
+            let mut offsets_iter = map_bytes_to_offsets(offset_bytes, 
self.header.offset_size);
+            let mut current_offset = offsets_iter.next().unwrap_or(0);
+
             if self.header.is_sorted {
                 // Validate the dictionary values are unique and 
lexicographically sorted
                 //
                 // Since we use the offsets to access dictionary values, this 
also validates
                 // offsets are in-bounds and monotonically increasing
-                let are_dictionary_values_unique_and_sorted = 
(1..offsets.len())
-                    .map(|i| {
-                        let field_range = offsets[i - 1]..offsets[i];
-                        value_buffer.get(field_range)
-                    })
-                    .is_sorted_by(|a, b| match (a, b) {
-                        (Some(a), Some(b)) => a < b,
-                        _ => false,
-                    });
-
-                if !are_dictionary_values_unique_and_sorted {
-                    return Err(ArrowError::InvalidArgumentError(
-                        "dictionary values are not unique and 
ordered".to_string(),
-                    ));
+                let mut prev_value: Option<&str> = None;
+
+                for next_offset in offsets_iter {
+                    if next_offset <= current_offset {
+                        return Err(ArrowError::InvalidArgumentError(
+                            "offsets not monotonically increasing".to_string(),
+                        ));
+                    }
+
+                    let current_value =
+                        value_buffer
+                            .get(current_offset..next_offset)
+                            .ok_or_else(|| {
+                                ArrowError::InvalidArgumentError("offset out 
of bounds".to_string())
+                            })?;
+
+                    if let Some(prev_val) = prev_value {
+                        if current_value <= prev_val {
+                            return Err(ArrowError::InvalidArgumentError(
+                                "dictionary values are not unique and 
ordered".to_string(),
+                            ));
+                        }
+                    }
+
+                    prev_value = Some(current_value);
+                    current_offset = next_offset;
                 }
             } else {
                 // Validate offsets are in-bounds and monotonically increasing
                 //
                 // Since shallow validation ensures the first and last offsets 
are in bounds,
                 // we can also verify all offsets are in-bounds by checking if
                 // offsets are monotonically increasing
-                let are_offsets_monotonic = offsets.is_sorted_by(|a, b| a < b);
-                if !are_offsets_monotonic {
-                    return Err(ArrowError::InvalidArgumentError(
-                        "offsets not monotonically increasing".to_string(),
-                    ));
+                for next_offset in offsets_iter {
+                    if next_offset <= current_offset {
+                        return Err(ArrowError::InvalidArgumentError(
+                            "offsets not monotonically increasing".to_string(),
+                        ));
+                    }
+                    current_offset = next_offset;

Review Comment:
   Oh! There's also a bug: Neither variant nor JSON forbids the empty string as 
a field name. So we have to allow empty ranges. In case the dictionary is 
sorted, `""` compares less-than every other string, and so the sortedness check 
will catch any misplaced empty strings.
   
   I'll throw up a quick PR for this.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscr...@arrow.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org

Reply via email to