Dandandan commented on code in PR #9236:
URL: https://github.com/apache/arrow-rs/pull/9236#discussion_r2718210597
##########
parquet/src/arrow/array_reader/byte_view_array.rs:
##########
@@ -373,32 +394,39 @@ impl ByteViewArrayDecoderPlain {
// The implementation keeps a water mark
`utf8_validation_begin` to track the beginning of the buffer that is not
validated.
// If the length is smaller than 128, then we continue to next
string.
// If the length is larger than 128, then we validate the
buffer before the length bytes, and move the water mark to the beginning of
next string.
- if len < 128 {
- // fast path, move to next string.
- // the len bytes are valid utf8.
- } else {
+ if len >= 128 {
// unfortunately, the len bytes may not be valid utf8, we
need to wrap up and validate everything before it.
check_valid_utf8(unsafe {
- buf.get_unchecked(utf8_validation_begin..self.offset)
+ buf.get_unchecked(utf8_validation_begin..end_offset)
})?;
// move the cursor to skip the len bytes.
utf8_validation_begin = start_offset;
}
}
+ let view = make_view(
+ unsafe { buf.get_unchecked(start_offset..end_offset) },
+ block_id,
+ start_offset as u32,
+ );
+ // Safety: views_ptr is valid for writes, and we have reserved
enough space.
unsafe {
- output.append_view_unchecked(block_id, start_offset as u32,
len);
+ views_ptr.add(i).write(view);
}
- self.offset = end_offset;
- read += 1;
}
- // validate the last part of the buffer
- if self.validate_utf8 {
- check_valid_utf8(unsafe {
buf.get_unchecked(utf8_validation_begin..self.offset) })?;
+ // Safety: we have written `to_read` views to `views_ptr`
+ unsafe {
+ output.views.set_len(output.views.len() + to_read);
+ }
+ if VALIDATE_UTF8 {
+ // validate the last part of the buffer
+ check_valid_utf8(unsafe {
buf.get_unchecked(utf8_validation_begin..end_offset) })?;
Review Comment:
Wow, great catch - I missed updating this when trying to remove a variable.
I pushed a fix that uses `start_offset - 4` (i.e. everything from
`utf8_validation_begin` up to the current string, excluding its 4-byte length
prefix) instead of the updated `end_offset`.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]