alamb commented on code in PR #9019: URL: https://github.com/apache/arrow-rs/pull/9019#discussion_r2636161090
########## arrow-buffer/src/buffer/run.rs: ########## @@ -18,75 +18,107 @@ use crate::ArrowNativeType; use crate::buffer::ScalarBuffer; -/// A slice-able buffer of monotonically increasing, positive integers used to store run-ends +/// A slice-able buffer of monotonically increasing, positive integers used to +/// store run-ends. /// -/// # Logical vs Physical -/// -/// A [`RunEndBuffer`] is used to encode runs of the same value, the index of each run is -/// called the physical index. The logical index is then the corresponding index in the logical -/// run-encoded array, i.e. a single run of length `3`, would have the logical indices `0..3`. +/// Used to compactly represent runs of the same value. Values being represented +/// are stored in a separate buffer from this struct. /// -/// Each value in [`RunEndBuffer::values`] is the cumulative length of all runs in the -/// logical array, up to that physical index. +/// # Logical vs Physical /// -/// Consider a [`RunEndBuffer`] containing `[3, 4, 6]`. The maximum physical index is `2`, -/// as there are `3` values, and the maximum logical index is `5`, as the maximum run end -/// is `6`. The physical indices are therefore `[0, 0, 0, 1, 2, 2]` +/// Physically, each value in the `run_ends` buffer is the cumulative length of +/// all runs in the logical representation, up to that physical index. Consider +/// the following example: /// /// ```text -/// ┌─────────┐ ┌─────────┐ ┌─────────┐ -/// │ 3 │ │ 0 │ ─┬──────▶ │ 0 │ -/// ├─────────┤ ├─────────┤ │ ├─────────┤ -/// │ 4 │ │ 1 │ ─┤ ┌────▶ │ 1 │ -/// ├─────────┤ ├─────────┤ │ │ ├─────────┤ -/// │ 6 │ │ 2 │ ─┘ │ ┌──▶ │ 2 │ -/// └─────────┘ ├─────────┤ │ │ └─────────┘ -/// run ends │ 3 │ ───┘ │ physical indices -/// ├─────────┤ │ -/// │ 4 │ ─────┤ -/// ├─────────┤ │ -/// │ 5 │ ─────┘ -/// └─────────┘ -/// logical indices +/// physical logical +/// ┌─────────┬─────────┐ ┌─────────┬─────────┐ +/// │ 3 │ 0 │ ◄──────┬─ │ A │ 0 │ +/// ├─────────┼─────────┤ │ ├─────────┼─────────┤ +/// │ 4 │ 1 │ ◄────┐ ├─ │ A │ 1 │ +/// ├─────────┼─────────┤ │ │ ├─────────┼─────────┤ +/// │ 6 │ 2 │ ◄──┐ │ └─ │ A │ 2 │ +/// └─────────┴─────────┘ │ │ ├─────────┼─────────┤ +/// run-ends index │ └─── │ B │ 3 │ +/// │ ├─────────┼─────────┤ +/// logical_offset = 0 ├───── │ C │ 4 │ +/// logical_length = 6 │ ├─────────┼─────────┤ +/// └───── │ C │ 5 │ +/// └─────────┴─────────┘ +/// values index /// ``` /// +/// A [`RunEndBuffer`] is physically the buffer and offset with length on the left. +/// In this case, the offset and length represent the whole buffer, so it is essentially +/// unsliced. See the section below on slicing for more details on how this buffer +/// handles slicing. +/// +/// We can see how logically the values are represented by the same physical index, +/// how multiple logical indices map to the same physical index. So the [`RunEndBuffer`] +/// containing `[3, 4, 6]` is essentially the physical indices `[0, 0, 0, 1, 2, 2]`, +/// and having a separately stored buffer of values such as `[A, B, C]` can turn +/// this into a representation of `[A, A, A, B, C, C]`. +/// /// # Slicing /// -/// In order to provide zero-copy slicing, this container stores a separate offset and length +/// In order to provide zero-copy slicing, this struct stores a separate **logical** +/// offset and length. Consider the following example: +/// +/// ```text +/// physical logical +/// ┌─────────┬─────────┐ ┌ ─ ─ ─ ─ ┬ ─ ─ ─ ─ ┐ +/// │ 3 │ 0 │ ◄──────┐ A 0 +/// ├─────────┼─────────┤ │ ├── ─ ─ ─ ┼ ─ ─ ─ ─ ┤ +/// │ 4 │ 1 │ ◄────┐ │ A 1 +/// ├─────────┼─────────┤ │ │ ├─────────┼─────────┤ +/// │ 6 │ 2 │ ◄──┐ │ └─ │ A │ 2 │◄─── logical_offset +/// └─────────┴─────────┘ │ │ ├─────────┼─────────┤ +/// run-ends index │ └─── │ B │ 3 │ +/// │ ├─────────┼─────────┤ +/// logical_offset = 2 └───── │ C │ 4 │ +/// logical_length = 3 ├─────────┼─────────┤ +/// C 5 ◄─── logical_offset + logical_length +/// └ ─ ─ ─ ─ ┴ ─ ─ ─ ─ ┘ +/// values index +/// ``` /// -/// For example, a [`RunEndBuffer`] containing values `[3, 6, 8]` with offset and length `4` would -/// describe the physical indices `1, 1, 2, 2` +/// The physical `run_ends` [`ScalarBuffer`] remains unchanged, in order to facilitate +/// zero-copy. However, we now offset into the **logical** representation with an +/// accompanying length. This allows us to represent values `[A, B, C]` using physical +/// indices `0, 1, 2` with the same underlying physical buffer, at the cost of two +/// extra `usize`s to represent the logical slice that was taken. /// -/// For example, a [`RunEndBuffer`] containing values `[6, 8, 9]` with offset `2` and length `5` -/// would describe the physical indices `0, 0, 0, 0, 1` +/// (A [`RunEndBuffer`] is considered unsliced when `logical_offset` is `0` and +/// `logical_length` is equal to the last value in `run_ends`) /// /// [Run-End encoded layout]: https://arrow.apache.org/docs/format/Columnar.html#run-end-encoded-layout #[derive(Debug, Clone)] pub struct RunEndBuffer<E: ArrowNativeType> { run_ends: ScalarBuffer<E>, - len: usize, - offset: usize, + logical_length: usize, Review Comment: 😍 -- I like these names much better ########## arrow-array/src/array/run_array.rs: ########## @@ -161,22 +170,31 @@ impl<R: RunEndIndexType> RunArray<R> { }) } - /// Returns index to the physical array for the given index to the logical array. - /// This function adjusts the input logical index based on `ArrayData::offset` - /// Performs a binary search on the run_ends array for the input index. + /// Calls [`RunEndBuffer::get_physical_index`]. /// /// The result is arbitrary if `logical_index >= self.len()` pub fn get_physical_index(&self, logical_index: usize) -> usize { self.run_ends.get_physical_index(logical_index) } - /// Returns the physical indices of the input logical indices. Returns error if any of the logical - /// index cannot be converted to physical index. The logical indices are sorted and iterated along - /// with run_ends array to find matching physical index. The approach used here was chosen over - /// finding physical index for each logical index using binary search using the function - /// `get_physical_index`. Running benchmarks on both approaches showed that the approach used here + /// Given the input `logical_indices`, return the corresponding physical index + /// for each, according to the underlying [`RunEndBuffer`], taking into account + /// any slicing that has occurred. + /// + /// Returns an error if any of the provided logical indices is out of range. + /// + /// # Implementation + /// + /// The logical indices are sorted and iterated along with the `run_ends` buffer + /// to find the matching physical index. The approach used here was chosen over + /// finding the physical index for each logical index using binary search via + /// the function [`RunEndBuffer::get_physical_index`]. + /// + /// Running benchmarks on both approaches showed that the approach used here /// scaled well for larger inputs. + /// /// See <https://github.com/apache/arrow-rs/pull/3622#issuecomment-1407753727> for more details. + // TODO: this technically should be a method on RunEndBuffer Review Comment: yeah, I think a thin wrapper would be good -- as a follow on PR perhaps ########## arrow-buffer/src/buffer/run.rs: ########## @@ -18,75 +18,107 @@ use crate::ArrowNativeType; use crate::buffer::ScalarBuffer; -/// A slice-able buffer of monotonically increasing, positive integers used to store run-ends +/// A slice-able buffer of monotonically increasing, positive integers used to +/// store run-ends. /// -/// # Logical vs Physical -/// -/// A [`RunEndBuffer`] is used to encode runs of the same value, the index of each run is -/// called the physical index. The logical index is then the corresponding index in the logical -/// run-encoded array, i.e. a single run of length `3`, would have the logical indices `0..3`. +/// Used to compactly represent runs of the same value. Values being represented +/// are stored in a separate buffer from this struct. /// -/// Each value in [`RunEndBuffer::values`] is the cumulative length of all runs in the -/// logical array, up to that physical index. +/// # Logical vs Physical /// -/// Consider a [`RunEndBuffer`] containing `[3, 4, 6]`. The maximum physical index is `2`, -/// as there are `3` values, and the maximum logical index is `5`, as the maximum run end -/// is `6`. The physical indices are therefore `[0, 0, 0, 1, 2, 2]` +/// Physically, each value in the `run_ends` buffer is the cumulative length of +/// all runs in the logical representation, up to that physical index. Consider +/// the following example: /// /// ```text -/// ┌─────────┐ ┌─────────┐ ┌─────────┐ -/// │ 3 │ │ 0 │ ─┬──────▶ │ 0 │ -/// ├─────────┤ ├─────────┤ │ ├─────────┤ -/// │ 4 │ │ 1 │ ─┤ ┌────▶ │ 1 │ -/// ├─────────┤ ├─────────┤ │ │ ├─────────┤ -/// │ 6 │ │ 2 │ ─┘ │ ┌──▶ │ 2 │ -/// └─────────┘ ├─────────┤ │ │ └─────────┘ -/// run ends │ 3 │ ───┘ │ physical indices -/// ├─────────┤ │ -/// │ 4 │ ─────┤ -/// ├─────────┤ │ -/// │ 5 │ ─────┘ -/// └─────────┘ -/// logical indices +/// physical logical +/// ┌─────────┬─────────┐ ┌─────────┬─────────┐ +/// │ 3 │ 0 │ ◄──────┬─ │ A │ 0 │ +/// ├─────────┼─────────┤ │ ├─────────┼─────────┤ +/// │ 4 │ 1 │ ◄────┐ ├─ │ A │ 1 │ +/// ├─────────┼─────────┤ │ │ ├─────────┼─────────┤ +/// │ 6 │ 2 │ ◄──┐ │ └─ │ A │ 2 │ +/// └─────────┴─────────┘ │ │ ├─────────┼─────────┤ +/// run-ends index │ └─── │ B │ 3 │ +/// │ ├─────────┼─────────┤ +/// logical_offset = 0 ├───── │ C │ 4 │ +/// logical_length = 6 │ ├─────────┼─────────┤ +/// └───── │ C │ 5 │ +/// └─────────┴─────────┘ +/// values index /// ``` /// +/// A [`RunEndBuffer`] is physically the buffer and offset with length on the left. +/// In this case, the offset and length represent the whole buffer, so it is essentially +/// unsliced. See the section below on slicing for more details on how this buffer +/// handles slicing. +/// +/// We can see how logically the values are represented by the same physical index, +/// how multiple logical indices map to the same physical index. So the [`RunEndBuffer`] Review Comment: This sentence seems a bit cut off. Maybe something like this would be clearer: ```suggestion /// This means that multiple logical values are represented in the same physical index, /// and multiple logical indices map to the same physical index. The [`RunEndBuffer`] ``` ########## arrow-buffer/src/buffer/run.rs: ########## @@ -18,75 +18,107 @@ use crate::ArrowNativeType; use crate::buffer::ScalarBuffer; -/// A slice-able buffer of monotonically increasing, positive integers used to store run-ends +/// A slice-able buffer of monotonically increasing, positive integers used to +/// store run-ends. /// -/// # Logical vs Physical -/// -/// A [`RunEndBuffer`] is used to encode runs of the same value, the index of each run is -/// called the physical index. The logical index is then the corresponding index in the logical -/// run-encoded array, i.e. a single run of length `3`, would have the logical indices `0..3`. +/// Used to compactly represent runs of the same value. Values being represented Review Comment: Maybe we could add a link to `RunArray` docs. I think you might have to directly link to the docs.rs page to avoid circular dependencies: https://docs.rs/arrow/latest/arrow/array/struct.RunArray.html ########## arrow-buffer/src/buffer/run.rs: ########## @@ -18,75 +18,107 @@ use crate::ArrowNativeType; use crate::buffer::ScalarBuffer; -/// A slice-able buffer of monotonically increasing, positive integers used to store run-ends +/// A slice-able buffer of monotonically increasing, positive integers used to Review Comment: I found the reference to `sliceable` somewhat confusing - it seems too specific (though I see it wasn't added in this PR) ########## arrow-array/src/array/run_array.rs: ########## @@ -117,25 +123,29 @@ impl<R: RunEndIndexType> RunArray<R> { Ok(array_data.into()) } - /// Returns a reference to [`RunEndBuffer`] + /// Returns a reference to the [`RunEndBuffer`]. pub fn run_ends(&self) -> &RunEndBuffer<R::Native> { &self.run_ends } - /// Returns a reference to values array + /// Returns a reference to the values array. /// - /// Note: any slicing of this [`RunArray`] array is not applied to the returned array - /// and must be handled separately + /// Any slicing of this [`RunArray`] array is **not** applied to the returned Review Comment: this is the same as how ListArray works, which is definitely tricky to use correctly, as @rluvaton has noted ########## arrow-buffer/src/buffer/run.rs: ########## @@ -18,75 +18,107 @@ use crate::ArrowNativeType; use crate::buffer::ScalarBuffer; -/// A slice-able buffer of monotonically increasing, positive integers used to store run-ends +/// A slice-able buffer of monotonically increasing, positive integers used to +/// store run-ends. /// -/// # Logical vs Physical -/// -/// A [`RunEndBuffer`] is used to encode runs of the same value, the index of each run is -/// called the physical index. The logical index is then the corresponding index in the logical -/// run-encoded array, i.e. a single run of length `3`, would have the logical indices `0..3`. +/// Used to compactly represent runs of the same value. Values being represented +/// are stored in a separate buffer from this struct. /// -/// Each value in [`RunEndBuffer::values`] is the cumulative length of all runs in the -/// logical array, up to that physical index. +/// # Logical vs Physical /// -/// Consider a [`RunEndBuffer`] containing `[3, 4, 6]`. The maximum physical index is `2`, -/// as there are `3` values, and the maximum logical index is `5`, as the maximum run end -/// is `6`. The physical indices are therefore `[0, 0, 0, 1, 2, 2]` +/// Physically, each value in the `run_ends` buffer is the cumulative length of +/// all runs in the logical representation, up to that physical index. Consider +/// the following example: /// /// ```text -/// ┌─────────┐ ┌─────────┐ ┌─────────┐ -/// │ 3 │ │ 0 │ ─┬──────▶ │ 0 │ -/// ├─────────┤ ├─────────┤ │ ├─────────┤ -/// │ 4 │ │ 1 │ ─┤ ┌────▶ │ 1 │ -/// ├─────────┤ ├─────────┤ │ │ ├─────────┤ -/// │ 6 │ │ 2 │ ─┘ │ ┌──▶ │ 2 │ -/// └─────────┘ ├─────────┤ │ │ └─────────┘ -/// run ends │ 3 │ ───┘ │ physical indices -/// ├─────────┤ │ -/// │ 4 │ ─────┤ -/// ├─────────┤ │ -/// │ 5 │ ─────┘ -/// └─────────┘ -/// logical indices +/// physical logical +/// ┌─────────┬─────────┐ ┌─────────┬─────────┐ Review Comment: this is a nice improvement (to show values and indexes) -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected]
