Re: [PR] Improve parquet BinaryView / StringView decoder performance (up to -20%) [arrow-rs]

via GitHub Wed, 21 Jan 2026 07:18:12 -0800


alamb commented on code in PR #9236:
URL: https://github.com/apache/arrow-rs/pull/9236#discussion_r2713036911



##########
parquet/src/arrow/array_reader/byte_view_array.rs:
##########
@@ -469,32 +480,32 @@ impl ByteViewArrayDecoderDictionary {
         // then the base_buffer_idx is 5 - 2 = 3
         let base_buffer_idx = output.buffers.len() as u32 - dict.buffers.len() 
as u32;
 
-        self.decoder.read(len, |keys| {
-            for k in keys {
-                let view = dict
-                    .views
-                    .get(*k as usize)
-                    .ok_or_else(|| general_err!("invalid key={} for 
dictionary", *k))?;
-                let len = *view as u32;
-                if len <= 12 {
-                    // directly append the view if it is inlined
-                    // Safety: the view is from the dictionary, so it is valid
-                    unsafe {
-                        output.append_raw_view_unchecked(view);
+        let mut error = None;
+        let read = self.decoder.read(len, |keys| {
+            output
+                .views
+                .extend(keys.iter().map(|k| match dict.views.get(*k as usize) {
+                    Some(&view) => {
+                        let len = view as u32;
+                        if len <= 12 {
+                            view
+                        } else {
+                            let mut view = ByteView::from(view);
+                            view.buffer_index += base_buffer_idx;
+                            view.into()
+                        }
                     }
-                } else {
-                    // correct the buffer index and append the view
-                    let mut view = ByteView::from(*view);
-                    view.buffer_index += base_buffer_idx;
-                    // Safety: the view is from the dictionary,
-                    // we corrected the index value to point it to output 
buffer, so it is valid
-                    unsafe {
-                        output.append_raw_view_unchecked(&view.into());
+                    None => {
+                        error = Some(general_err!("invalid key={} for 
dictionary", *k));

Review Comment:
   In theory this is different because
   1. It will decode more of the dictionary before returning the error 
   2. It will return the last error rather than the first encountered
   
   I think both are fine, but it might be worthwhile adding a check here to 
only set the error if it is not already set so the first error is returned



##########
parquet/src/arrow/array_reader/byte_view_array.rs:
##########
@@ -330,28 +342,26 @@ impl ByteViewArrayDecoderPlain {
         let to_read = len.min(self.max_remaining_values);
 
         let buf: &[u8] = self.buf.as_ref();
-        let mut read = 0;
+        let buf_len = buf.len();
+        let mut offset = self.offset;
+
         output.views.reserve(to_read);
 
-        let mut utf8_validation_begin = self.offset;
-        while self.offset < self.buf.len() && read != to_read {
-            if self.offset + 4 > self.buf.len() {
+        let mut utf8_validation_begin = offset;
+        for _ in 0..to_read {
+            if offset + 4 > buf_len {
                 return Err(ParquetError::EOF("eof decoding byte 
array".into()));
             }
-            let len_bytes: [u8; 4] = unsafe {
-                buf.get_unchecked(self.offset..self.offset + 4)
-                    .try_into()
-                    .unwrap()
-            };
-            let len = u32::from_le_bytes(len_bytes);
-
-            let start_offset = self.offset + 4;
+            let len = u32::from_le_bytes(unsafe { *(buf.as_ptr().add(offset) 
as *const [u8; 4]) });

Review Comment:
   The idea here is to save a bounds check for the length, right?



##########
parquet/src/arrow/array_reader/byte_view_array.rs:
##########
@@ -373,32 +383,33 @@ impl ByteViewArrayDecoderPlain {
                 // The implementation keeps a water mark 
`utf8_validation_begin` to track the beginning of the buffer that is not 
validated.
                 // If the length is smaller than 128, then we continue to next 
string.
                 // If the length is larger than 128, then we validate the 
buffer before the length bytes, and move the water mark to the beginning of 
next string.
-                if len < 128 {
-                    // fast path, move to next string.
-                    // the len bytes are valid utf8.
-                } else {
+                if len >= 128 {
                     // unfortunately, the len bytes may not be valid utf8, we 
need to wrap up and validate everything before it.
-                    check_valid_utf8(unsafe {
-                        buf.get_unchecked(utf8_validation_begin..self.offset)
-                    })?;
+                    check_valid_utf8(unsafe { 
buf.get_unchecked(utf8_validation_begin..offset) })?;
                     // move the cursor to skip the len bytes.
                     utf8_validation_begin = start_offset;
                 }
             }
 
+            let view = make_view(

Review Comment:
   This looks like a manual inlining of `append_view_unchecked` -- is there any 
particular reason to do this rather than keep a `#inline` function? It is fine, 
I am just curious



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Re: [PR] Improve parquet BinaryView / StringView decoder performance (up to -20%) [arrow-rs]

Reply via email to