Jefffrey commented on code in PR #9176:
URL: https://github.com/apache/arrow-rs/pull/9176#discussion_r2698727405
##########
arrow-row/src/list.rs:
##########
@@ -323,3 +316,180 @@ pub unsafe fn decode_fixed_size_list(
builder.build_unchecked()
}))
}
+
+/// Computes the encoded length for a single list element given its child rows.
+///
+/// This is used by list types (List, LargeList, ListView, LargeListView) to
determine
+/// the encoded length of a list element. For null elements, returns 1 (null
sentinel only).
+/// For valid elements, returns 1 + the sum of padded lengths for each child
row.
+#[inline]
+fn list_element_encoded_len(rows: &Rows, range: Option<Range<usize>>) -> usize
{
+ match range {
+ None => 1,
+ Some(range) => {
+ 1 + range
+ .map(|i|
super::variable::padded_length(Some(rows.row(i).as_ref().len())))
+ .sum::<usize>()
+ }
+ }
+}
+
+/// Computes the encoded lengths for a `GenericListViewArray`
+///
+/// `rows` should contain the encoded child elements
+pub fn compute_lengths_list_view<O: OffsetSizeTrait>(
+ lengths: &mut [usize],
+ rows: &Rows,
+ array: &GenericListViewArray<O>,
+ shift: usize,
+) {
+ let offsets = array.value_offsets();
+ let sizes = array.value_sizes();
+
+ lengths.iter_mut().enumerate().for_each(|(idx, length)| {
+ let start = offsets[idx].as_usize() - shift;
+ let size = sizes[idx].as_usize();
+ let range = array.is_valid(idx).then_some(start..start + size);
+ *length += list_element_encoded_len(rows, range);
+ });
+}
+
+/// Encodes the provided `GenericListViewArray` to `out` with the provided
`SortOptions`
+///
+/// `rows` should contain the encoded child elements
+pub fn encode_list_view<O: OffsetSizeTrait>(
+ data: &mut [u8],
+ out_offsets: &mut [usize],
+ rows: &Rows,
+ opts: SortOptions,
+ array: &GenericListViewArray<O>,
+ shift: usize,
+) {
+ let offsets = array.value_offsets();
+ let sizes = array.value_sizes();
+
+ out_offsets
+ .iter_mut()
+ .skip(1)
+ .enumerate()
+ .for_each(|(idx, offset)| {
+ let start = offsets[idx].as_usize() - shift;
+ let size = sizes[idx].as_usize();
+ let range = array.is_valid(idx).then_some(start..start + size);
+ let out = &mut data[*offset..];
+ *offset += encode_one(out, rows, range, opts)
+ });
+}
+
+/// Decodes a `GenericListViewArray` from `rows` with the provided `options`
+///
+/// # Safety
+///
+/// `rows` must contain valid data for the provided `converter`
+pub unsafe fn decode_list_view<O: OffsetSizeTrait>(
+ converter: &RowConverter,
+ rows: &mut [&[u8]],
+ field: &SortField,
+ validate_utf8: bool,
+) -> Result<GenericListViewArray<O>, ArrowError> {
+ let opts = field.options;
+
+ let mut values_bytes = 0;
+
+ let mut child_count = 0usize;
+ let mut list_sizes: Vec<O> = Vec::with_capacity(rows.len());
+
+ // First pass: count children and compute sizes
+ for row in rows.iter_mut() {
+ let mut row_offset = 0;
+ let mut list_size = 0usize;
+ loop {
+ let decoded = super::variable::decode_blocks(&row[row_offset..],
opts, |x| {
+ values_bytes += x.len();
+ });
+ if decoded <= 1 {
+ list_sizes.push(O::usize_as(list_size));
+ break;
+ }
+ row_offset += decoded;
+ child_count += 1;
+ list_size += 1;
+ }
+ }
+ O::from_usize(child_count).expect("overflow");
Review Comment:
Is this to force a panic?
##########
arrow-row/src/list.rs:
##########
@@ -323,3 +316,180 @@ pub unsafe fn decode_fixed_size_list(
builder.build_unchecked()
}))
}
+
+/// Computes the encoded length for a single list element given its child rows.
+///
+/// This is used by list types (List, LargeList, ListView, LargeListView) to
determine
+/// the encoded length of a list element. For null elements, returns 1 (null
sentinel only).
+/// For valid elements, returns 1 + the sum of padded lengths for each child
row.
+#[inline]
+fn list_element_encoded_len(rows: &Rows, range: Option<Range<usize>>) -> usize
{
+ match range {
+ None => 1,
+ Some(range) => {
+ 1 + range
+ .map(|i|
super::variable::padded_length(Some(rows.row(i).as_ref().len())))
+ .sum::<usize>()
+ }
+ }
+}
+
+/// Computes the encoded lengths for a `GenericListViewArray`
+///
+/// `rows` should contain the encoded child elements
+pub fn compute_lengths_list_view<O: OffsetSizeTrait>(
+ lengths: &mut [usize],
+ rows: &Rows,
+ array: &GenericListViewArray<O>,
+ shift: usize,
+) {
+ let offsets = array.value_offsets();
+ let sizes = array.value_sizes();
+
+ lengths.iter_mut().enumerate().for_each(|(idx, length)| {
+ let start = offsets[idx].as_usize() - shift;
+ let size = sizes[idx].as_usize();
+ let range = array.is_valid(idx).then_some(start..start + size);
+ *length += list_element_encoded_len(rows, range);
+ });
+}
+
+/// Encodes the provided `GenericListViewArray` to `out` with the provided
`SortOptions`
+///
+/// `rows` should contain the encoded child elements
+pub fn encode_list_view<O: OffsetSizeTrait>(
+ data: &mut [u8],
+ out_offsets: &mut [usize],
+ rows: &Rows,
+ opts: SortOptions,
+ array: &GenericListViewArray<O>,
+ shift: usize,
+) {
+ let offsets = array.value_offsets();
+ let sizes = array.value_sizes();
+
+ out_offsets
+ .iter_mut()
+ .skip(1)
+ .enumerate()
+ .for_each(|(idx, offset)| {
+ let start = offsets[idx].as_usize() - shift;
+ let size = sizes[idx].as_usize();
+ let range = array.is_valid(idx).then_some(start..start + size);
+ let out = &mut data[*offset..];
+ *offset += encode_one(out, rows, range, opts)
+ });
+}
+
+/// Decodes a `GenericListViewArray` from `rows` with the provided `options`
+///
+/// # Safety
+///
+/// `rows` must contain valid data for the provided `converter`
+pub unsafe fn decode_list_view<O: OffsetSizeTrait>(
+ converter: &RowConverter,
+ rows: &mut [&[u8]],
+ field: &SortField,
+ validate_utf8: bool,
+) -> Result<GenericListViewArray<O>, ArrowError> {
+ let opts = field.options;
+
+ let mut values_bytes = 0;
+
+ let mut child_count = 0usize;
+ let mut list_sizes: Vec<O> = Vec::with_capacity(rows.len());
+
+ // First pass: count children and compute sizes
+ for row in rows.iter_mut() {
+ let mut row_offset = 0;
+ let mut list_size = 0usize;
+ loop {
+ let decoded = super::variable::decode_blocks(&row[row_offset..],
opts, |x| {
+ values_bytes += x.len();
+ });
+ if decoded <= 1 {
+ list_sizes.push(O::usize_as(list_size));
+ break;
+ }
+ row_offset += decoded;
+ child_count += 1;
+ list_size += 1;
+ }
+ }
+ O::from_usize(child_count).expect("overflow");
+
+ let mut null_count = 0;
+ let nulls = MutableBuffer::collect_bool(rows.len(), |x| {
+ let valid = rows[x][0] != null_sentinel(opts);
+ null_count += !valid as usize;
+ valid
+ });
+
+ let mut values_offsets_vec = Vec::with_capacity(child_count);
+ let mut values_bytes = Vec::with_capacity(values_bytes);
+ for row in rows.iter_mut() {
+ let mut row_offset = 0;
+ loop {
+ let decoded = super::variable::decode_blocks(&row[row_offset..],
opts, |x| {
+ values_bytes.extend_from_slice(x)
+ });
+ row_offset += decoded;
+ if decoded <= 1 {
+ break;
+ }
+ values_offsets_vec.push(values_bytes.len());
+ }
+ *row = &row[row_offset..];
+ }
+
+ if opts.descending {
+ values_bytes.iter_mut().for_each(|o| *o = !*o);
+ }
+
+ let mut last_value_offset = 0;
+ let mut child_rows: Vec<_> = values_offsets_vec
+ .into_iter()
+ .map(|offset| {
+ let v = &values_bytes[last_value_offset..offset];
+ last_value_offset = offset;
+ v
+ })
+ .collect();
+
+ let child = unsafe { converter.convert_raw(&mut child_rows, validate_utf8)
}?;
+ assert_eq!(child.len(), 1);
Review Comment:
Should we return an error here, since the function returns a `Result` anyway?
##########
arrow-row/src/lib.rs:
##########
@@ -179,6 +179,39 @@ use crate::variable::{decode_binary, decode_string};
use arrow_array::types::{Int16Type, Int32Type, Int64Type};
mod fixed;
+
+/// Computes the minimum offset and maximum end (offset + size) for a ListView
array.
+/// Returns (min_offset, max_end) which can be used to slice the values array.
+fn compute_list_view_bounds<O: OffsetSizeTrait>(array:
&GenericListViewArray<O>) -> (usize, usize) {
Review Comment:
This function seems oddly placed; should it be lower down instead of in the
middle of the `mod` declarations?
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]