tyrelr commented on a change in pull request #9215:
URL: https://github.com/apache/arrow/pull/9215#discussion_r563168268
##########
File path: rust/arrow/src/array/array_string.rs
##########
@@ -50,49 +50,65 @@ pub struct GenericStringArray<OffsetSize: StringOffsetSizeTrait> {
}
impl<OffsetSize: StringOffsetSizeTrait> GenericStringArray<OffsetSize> {
- /// Returns the offset for the element at index `i`.
- ///
- /// Note this doesn't do any bound checking, for performance reason.
- #[inline]
- pub fn value_offset(&self, i: usize) -> OffsetSize {
- self.value_offset_at(self.data.offset() + i)
- }
-
/// Returns the length for the element at index `i`.
- ///
- /// Note this doesn't do any bound checking, for performance reason.
#[inline]
- pub fn value_length(&self, mut i: usize) -> OffsetSize {
- i += self.data.offset();
- self.value_offset_at(i + 1) - self.value_offset_at(i)
+ pub fn value_length(&self, i: usize) -> OffsetSize {
+ let offsets = self.value_offsets();
+ offsets[i + 1] - offsets[i]
}
- /// Returns a clone of the value offset buffer
- pub fn value_offsets(&self) -> Buffer {
- self.data.buffers()[0].clone()
+ /// Returns the offset values in the offsets buffer
+ #[inline]
+ pub fn value_offsets(&self) -> &[OffsetSize] {
+ // Soundness
+ // pointer alignment & location is ensured by RawPtrBox
+ // buffer bounds/offset is ensured by the ArrayData instance.
+ unsafe {
+ std::slice::from_raw_parts(
+ self.value_offsets.as_ptr().add(self.data.offset()),
+ self.len() + 1,
+ )
+ }
}
/// Returns a clone of the value data buffer
pub fn value_data(&self) -> Buffer {
self.data.buffers()[1].clone()
}
- #[inline]
- fn value_offset_at(&self, i: usize) -> OffsetSize {
- unsafe { *self.value_offsets.as_ptr().add(i) }
+ /// Returns the element at index
+ /// # Safety
+ /// caller is responsible for ensuring that index is within the array bounds
+ pub unsafe fn value_unchecked(&self, i: usize) -> &str {
+ let end = self.value_offsets().get_unchecked(i + 1);
+ let start = self.value_offsets().get_unchecked(i);
+
+ // Soundness
+ // pointer alignment & location is ensured by RawPtrBox
+ // buffer bounds/offset is ensured by the value_offset invariants
+ // ISSUE: utf-8 well formedness is not checked
+ let slice = std::slice::from_raw_parts(
+ self.value_data.as_ptr().offset(start.to_isize()),
+ (*end - *start).to_usize().unwrap(),
+ );
+ std::str::from_utf8_unchecked(slice)
}
/// Returns the element at index `i` as &str
pub fn value(&self, i: usize) -> &str {
assert!(i < self.data.len(), "StringArray out of bounds access");
- let offset = i.checked_add(self.data.offset()).unwrap();
+ let end = self.value_offsets()[i + 1];
+ let start = self.value_offsets()[i];
Review comment:
Switching to unsafe access here would help the string benchmarks.
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
[email protected]