HaoYang670 commented on code in PR #1684: URL: https://github.com/apache/arrow-rs/pull/1684#discussion_r869955221
########## arrow/src/array/data.rs: ########## @@ -1033,76 +1035,64 @@ impl ArrayData { } /// Calls the `validate(item_index, range)` function for each of - /// the ranges specified in the arrow offset buffer of type + /// the ranges specified in the arrow offsets buffer of type /// `T`. Also validates that each offset is smaller than - /// `max_offset` + /// `offset_limit` + /// + /// For an empty array, the offsets buffer can either be empty + /// or contain a single `0`. /// - /// For example, the offset buffer contained `[1, 2, 4]`, this + /// For example, the offsets buffer contained `[1, 2, 4]`, this /// function would call `validate([1,2])`, and `validate([2,4])` fn validate_each_offset<T, V>( &self, - offset_buffer: &Buffer, + offsets_buffer: &Buffer, offset_limit: usize, validate: V, ) -> Result<()> where T: ArrowNativeType + std::convert::TryInto<usize> + num::Num + std::fmt::Display, V: Fn(usize, Range<usize>) -> Result<()>, { - // An empty binary-like array can have 0 offsets - if self.len == 0 && offset_buffer.is_empty() { - return Ok(()); - } - - let offsets = self.typed_offsets::<T>(offset_buffer)?; - - offsets + self.typed_offsets::<T>(offsets_buffer)? .iter() - .zip(offsets.iter().skip(1)) .enumerate() - .map(|(i, (&start_offset, &end_offset))| { - let start_offset: usize = start_offset - .try_into() - .map_err(|_| { - ArrowError::InvalidArgumentError(format!( - "Offset invariant failure: could not convert start_offset {} to usize in slot {}", - start_offset, i)) - })?; - let end_offset: usize = end_offset - .try_into() - .map_err(|_| { - ArrowError::InvalidArgumentError(format!( - "Offset invariant failure: Could not convert end_offset {} to usize in slot {}", - end_offset, i+1)) - })?; - - if start_offset > offset_limit { - return Err(ArrowError::InvalidArgumentError(format!( - "Offset invariant failure: offset for slot {} out of bounds: {} > {}", - i, start_offset, offset_limit)) - ); - } - - if end_offset > offset_limit { - return Err(ArrowError::InvalidArgumentError(format!( - "Offset invariant failure: offset for slot {} out of bounds: {} > {}", - i, end_offset, offset_limit)) + .map(|(i, x)| { + // check if the offset can be converted to usize + let r = x.to_usize().ok_or_else(|| { + ArrowError::InvalidArgumentError(format!( + "Offset invariant failure: Could not convert offset {} to usize at position {}", + x, i))} ); + // check if the offset exceeds the limit + match r { + Ok(n) if n <= offset_limit => Ok((i, n)), + Ok(_) => Err(ArrowError::InvalidArgumentError(format!( + "Offset invariant failure: offset at position {} out of bounds: {} > {}", + i, x, offset_limit)) + ), + Err(e) => Err(e), } - - // check range actually is low -> high - if start_offset > end_offset { - return Err(ArrowError::InvalidArgumentError(format!( + }) + .scan(0_usize, |start, end| { + // check offsets are monotonically increasing + match end { + Ok((i, end)) if *start <= end => { + let range = Some(Ok((i, *start..end))); + *start = end; + range + } + Ok((i, end)) => Some(Err(ArrowError::InvalidArgumentError(format!( "Offset invariant failure: non-monotonic offset at slot {}: {} > {}", - i, start_offset, end_offset)) - ); + i - 1, start, end)) Review Comment: The `i - 1` here is a little ugly. I try to find a more elegant way -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: github-unsubscr...@arrow.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org