Re: [PR] arrow-row: Add support for REE [arrow-rs]
alamb commented on code in PR #7649:
URL: https://github.com/apache/arrow-rs/pull/7649#discussion_r2153018217
##
arrow-row/src/run.rs:
##
@@ -0,0 +1,695 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use crate::{variable, RowConverter, Rows, SortField};
+use arrow_array::types::RunEndIndexType;
+use arrow_array::{PrimitiveArray, RunArray};
+use arrow_buffer::{ArrowNativeType, ScalarBuffer};
+use arrow_schema::{ArrowError, SortOptions};
+
+/// Computes the lengths of each row for a RunEndEncodedArray
+pub fn compute_lengths(
+lengths: &mut [usize],
+rows: &Rows,
+array: &RunArray,
+) {
+let run_ends = array.run_ends().values();
+let mut logical_start = 0;
+
+// Iterate over each run and apply the same length to all logical
positions in the run
+for (physical_idx, &run_end) in run_ends.iter().enumerate() {
+let logical_end = run_end.as_usize();
+let row = rows.row(physical_idx);
+let encoded_len = variable::encoded_len(Some(row.data));
+
+// Add the same length for all logical positions in this run
+for length in &mut lengths[logical_start..logical_end] {
+*length += encoded_len;
+}
+
+logical_start = logical_end;
+}
+}
+
+/// Encodes the provided `RunEndEncodedArray` to `out` with the provided
`SortOptions`
+///
+/// `rows` should contain the encoded values
+pub fn encode(
+data: &mut [u8],
+offsets: &mut [usize],
+rows: &Rows,
+opts: SortOptions,
+array: &RunArray,
+) {
+let run_ends = array.run_ends();
+
+let mut logical_idx = 0;
+let mut offset_idx = 1; // Skip first offset
+
+// Iterate over each run
+for physical_idx in 0..run_ends.values().len() {
+let run_end = run_ends.values()[physical_idx].as_usize();
+
+// Process all elements in this run
+while logical_idx < run_end && offset_idx < offsets.len() {
+let offset = &mut offsets[offset_idx];
+let out = &mut data[*offset..];
+
+// Use variable-length encoding to make the data self-describing
+let row = rows.row(physical_idx);
Review Comment:
Tracking with
- https://github.com/apache/arrow-rs/issues/7693
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
Re: [PR] arrow-row: Add support for REE [arrow-rs]
alamb commented on code in PR #7649:
URL: https://github.com/apache/arrow-rs/pull/7649#discussion_r2153013293
##
arrow-row/src/run.rs:
##
@@ -0,0 +1,695 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use crate::{variable, RowConverter, Rows, SortField};
+use arrow_array::types::RunEndIndexType;
+use arrow_array::{PrimitiveArray, RunArray};
+use arrow_buffer::{ArrowNativeType, ScalarBuffer};
+use arrow_schema::{ArrowError, SortOptions};
+
+/// Computes the lengths of each row for a RunEndEncodedArray
+pub fn compute_lengths(
+lengths: &mut [usize],
+rows: &Rows,
+array: &RunArray,
+) {
+let run_ends = array.run_ends().values();
+let mut logical_start = 0;
+
+// Iterate over each run and apply the same length to all logical
positions in the run
+for (physical_idx, &run_end) in run_ends.iter().enumerate() {
+let logical_end = run_end.as_usize();
+let row = rows.row(physical_idx);
+let encoded_len = variable::encoded_len(Some(row.data));
+
+// Add the same length for all logical positions in this run
+for length in &mut lengths[logical_start..logical_end] {
+*length += encoded_len;
+}
+
+logical_start = logical_end;
+}
+}
+
+/// Encodes the provided `RunEndEncodedArray` to `out` with the provided
`SortOptions`
+///
+/// `rows` should contain the encoded values
+pub fn encode(
+data: &mut [u8],
+offsets: &mut [usize],
+rows: &Rows,
+opts: SortOptions,
+array: &RunArray,
+) {
+let run_ends = array.run_ends();
+
+let mut logical_idx = 0;
+let mut offset_idx = 1; // Skip first offset
+
+// Iterate over each run
+for physical_idx in 0..run_ends.values().len() {
+let run_end = run_ends.values()[physical_idx].as_usize();
+
+// Process all elements in this run
+while logical_idx < run_end && offset_idx < offsets.len() {
+let offset = &mut offsets[offset_idx];
+let out = &mut data[*offset..];
+
+// Use variable-length encoding to make the data self-describing
+let row = rows.row(physical_idx);
+let bytes_written = variable::encode_one(out, Some(row.data),
opts);
+*offset += bytes_written;
+
+logical_idx += 1;
+offset_idx += 1;
+}
+
+// Break if we've processed all offsets
+if offset_idx >= offsets.len() {
+break;
+}
+}
+}
+
+/// Decodes a RunEndEncodedArray from `rows` with the provided `options`
+///
+/// # Safety
+///
+/// `rows` must contain valid data for the provided `converter`
+pub unsafe fn decode(
+converter: &RowConverter,
+rows: &mut [&[u8]],
+field: &SortField,
+validate_utf8: bool,
+) -> Result, ArrowError> {
+if rows.is_empty() {
+let values = converter.convert_raw(&mut [], validate_utf8)?;
+let run_ends_array =
PrimitiveArraynew(ScalarBuffer::from(vec![]), None);
+return RunArraytry_new(&run_ends_array, &values[0]);
+}
+
+// Decode each row's REE data and collect the decoded values
+let mut decoded_values = Vec::new();
+let mut run_ends = Vec::new();
+let mut unique_row_indices = Vec::new();
+
+// Process each row to extract its REE data (following decode_binary
pattern)
+let mut decoded_data = Vec::new();
+for (idx, row) in rows.iter_mut().enumerate() {
+decoded_data.clear();
+// Extract the decoded value data from this row
+let consumed = variable::decode_blocks(row, field.options, |block| {
+decoded_data.extend_from_slice(block);
+});
+
+// Handle bit inversion for descending sort (following decode_binary
pattern)
+if field.options.descending {
+decoded_data.iter_mut().for_each(|b| *b = !*b);
+}
+
+// Update the row to point past the consumed REE data
+*row = &row[consumed..];
+
+// Check if this decoded value is the same as the previous one to
identify runs
+let is_new_run =
+idx == 0 || decoded_data !=
decoded_values[*unique_row_indices.last().unwrap()];
+
+if is_new_run {
+// This is
Re: [PR] arrow-row: Add support for REE [arrow-rs]
alamb commented on code in PR #7649:
URL: https://github.com/apache/arrow-rs/pull/7649#discussion_r2153011523
##
arrow-row/src/run.rs:
##
@@ -0,0 +1,695 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use crate::{variable, RowConverter, Rows, SortField};
+use arrow_array::types::RunEndIndexType;
+use arrow_array::{PrimitiveArray, RunArray};
+use arrow_buffer::{ArrowNativeType, ScalarBuffer};
+use arrow_schema::{ArrowError, SortOptions};
+
+/// Computes the lengths of each row for a RunEndEncodedArray
+pub fn compute_lengths(
+lengths: &mut [usize],
+rows: &Rows,
+array: &RunArray,
+) {
+let run_ends = array.run_ends().values();
+let mut logical_start = 0;
+
+// Iterate over each run and apply the same length to all logical
positions in the run
+for (physical_idx, &run_end) in run_ends.iter().enumerate() {
+let logical_end = run_end.as_usize();
+let row = rows.row(physical_idx);
+let encoded_len = variable::encoded_len(Some(row.data));
+
+// Add the same length for all logical positions in this run
+for length in &mut lengths[logical_start..logical_end] {
+*length += encoded_len;
+}
+
+logical_start = logical_end;
+}
+}
+
+/// Encodes the provided `RunEndEncodedArray` to `out` with the provided
`SortOptions`
+///
+/// `rows` should contain the encoded values
+pub fn encode(
+data: &mut [u8],
+offsets: &mut [usize],
+rows: &Rows,
+opts: SortOptions,
+array: &RunArray,
+) {
+let run_ends = array.run_ends();
+
+let mut logical_idx = 0;
+let mut offset_idx = 1; // Skip first offset
+
+// Iterate over each run
+for physical_idx in 0..run_ends.values().len() {
+let run_end = run_ends.values()[physical_idx].as_usize();
+
+// Process all elements in this run
+while logical_idx < run_end && offset_idx < offsets.len() {
+let offset = &mut offsets[offset_idx];
+let out = &mut data[*offset..];
+
+// Use variable-length encoding to make the data self-describing
+let row = rows.row(physical_idx);
+let bytes_written = variable::encode_one(out, Some(row.data),
opts);
+*offset += bytes_written;
+
+logical_idx += 1;
+offset_idx += 1;
+}
+
+// Break if we've processed all offsets
+if offset_idx >= offsets.len() {
+break;
+}
+}
+}
+
+/// Decodes a RunEndEncodedArray from `rows` with the provided `options`
+///
+/// # Safety
+///
+/// `rows` must contain valid data for the provided `converter`
+pub unsafe fn decode(
+converter: &RowConverter,
+rows: &mut [&[u8]],
+field: &SortField,
+validate_utf8: bool,
+) -> Result, ArrowError> {
+if rows.is_empty() {
+let values = converter.convert_raw(&mut [], validate_utf8)?;
+let run_ends_array =
PrimitiveArraynew(ScalarBuffer::from(vec![]), None);
+return RunArraytry_new(&run_ends_array, &values[0]);
+}
+
+// Decode each row's REE data and collect the decoded values
+let mut decoded_values = Vec::new();
+let mut run_ends = Vec::new();
+let mut unique_row_indices = Vec::new();
+
+// Process each row to extract its REE data (following decode_binary
pattern)
+let mut decoded_data = Vec::new();
+for (idx, row) in rows.iter_mut().enumerate() {
+decoded_data.clear();
+// Extract the decoded value data from this row
+let consumed = variable::decode_blocks(row, field.options, |block| {
+decoded_data.extend_from_slice(block);
+});
+
+// Handle bit inversion for descending sort (following decode_binary
pattern)
+if field.options.descending {
+decoded_data.iter_mut().for_each(|b| *b = !*b);
+}
+
+// Update the row to point past the consumed REE data
+*row = &row[consumed..];
+
+// Check if this decoded value is the same as the previous one to
identify runs
+let is_new_run =
+idx == 0 || decoded_data !=
decoded_values[*unique_row_indices.last().unwrap()];
+
+if is_new_run {
+// This is
Re: [PR] arrow-row: Add support for REE [arrow-rs]
alamb commented on PR #7649: URL: https://github.com/apache/arrow-rs/pull/7649#issuecomment-2981535190 Thanks again @brancz -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected]
Re: [PR] arrow-row: Add support for REE [arrow-rs]
alamb merged PR #7649: URL: https://github.com/apache/arrow-rs/pull/7649 -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected]
Re: [PR] arrow-row: Add support for REE [arrow-rs]
alamb commented on PR #7649: URL: https://github.com/apache/arrow-rs/pull/7649#issuecomment-2981534648 I plan to merge this PR in and then file follow on issues for items found during review 1) adding partial eq to REE 2) refactor the round trip tests a bit to deduplicate code 3) only decode each physical value once -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected]
Re: [PR] arrow-row: Add support for REE [arrow-rs]
alamb commented on PR #7649: URL: https://github.com/apache/arrow-rs/pull/7649#issuecomment-2978072151 > I think we need test coverage for the following cases (I'll make a PR) Here is a PR: - https://github.com/apache/arrow-rs/pull/7680 -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected]
Re: [PR] arrow-row: Add support for REE [arrow-rs]
alamb commented on code in PR #7649:
URL: https://github.com/apache/arrow-rs/pull/7649#discussion_r2150765924
##
arrow-row/src/run.rs:
##
@@ -0,0 +1,695 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use crate::{variable, RowConverter, Rows, SortField};
+use arrow_array::types::RunEndIndexType;
+use arrow_array::{PrimitiveArray, RunArray};
+use arrow_buffer::{ArrowNativeType, ScalarBuffer};
+use arrow_schema::{ArrowError, SortOptions};
+
+/// Computes the lengths of each row for a RunEndEncodedArray
+pub fn compute_lengths(
+lengths: &mut [usize],
+rows: &Rows,
+array: &RunArray,
+) {
+let run_ends = array.run_ends().values();
+let mut logical_start = 0;
+
+// Iterate over each run and apply the same length to all logical
positions in the run
+for (physical_idx, &run_end) in run_ends.iter().enumerate() {
+let logical_end = run_end.as_usize();
+let row = rows.row(physical_idx);
+let encoded_len = variable::encoded_len(Some(row.data));
+
+// Add the same length for all logical positions in this run
+for length in &mut lengths[logical_start..logical_end] {
+*length += encoded_len;
+}
+
+logical_start = logical_end;
+}
+}
+
+/// Encodes the provided `RunEndEncodedArray` to `out` with the provided
`SortOptions`
+///
+/// `rows` should contain the encoded values
+pub fn encode(
+data: &mut [u8],
+offsets: &mut [usize],
+rows: &Rows,
+opts: SortOptions,
+array: &RunArray,
+) {
+let run_ends = array.run_ends();
+
+let mut logical_idx = 0;
+let mut offset_idx = 1; // Skip first offset
+
+// Iterate over each run
+for physical_idx in 0..run_ends.values().len() {
+let run_end = run_ends.values()[physical_idx].as_usize();
+
+// Process all elements in this run
+while logical_idx < run_end && offset_idx < offsets.len() {
+let offset = &mut offsets[offset_idx];
+let out = &mut data[*offset..];
+
+// Use variable-length encoding to make the data self-describing
+let row = rows.row(physical_idx);
Review Comment:
Some random performance optimization thoughts (for some future PR):
1. You could hoist this out of the inner loop so it was executed once per
physical value rather than once per logical value
2. You could potentially encode row once and then simply copy the encoded
bytes for all remaining rows. This is probably significantly faster than
re-encoding the same value over and over again.
##
arrow-row/src/run.rs:
##
@@ -0,0 +1,695 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use crate::{variable, RowConverter, Rows, SortField};
+use arrow_array::types::RunEndIndexType;
+use arrow_array::{PrimitiveArray, RunArray};
+use arrow_buffer::{ArrowNativeType, ScalarBuffer};
+use arrow_schema::{ArrowError, SortOptions};
+
+/// Computes the lengths of each row for a RunEndEncodedArray
+pub fn compute_lengths(
+lengths: &mut [usize],
+rows: &Rows,
+array: &RunArray,
+) {
+let run_ends = array.run_ends().values();
+let mut logical_start = 0;
+
+// Iterate over each run and apply the same length to all logical
positions in the run
+for (physical_idx, &run_end) in run_ends.iter().enumerate() {
+let logical_end = run_end.as_usize();
+let row = rows.row(physical_idx);
+let encoded_len = vari
Re: [PR] arrow-row: Add support for REE [arrow-rs]
brancz commented on PR #7649: URL: https://github.com/apache/arrow-rs/pull/7649#issuecomment-2970703802 @alamb you can ignore all intermediate commits, the last commit pretty much re-writes everything and I think is also way more readable and understandable (and most of all correct). -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected]
Re: [PR] arrow-row: Add support for REE [arrow-rs]
brancz commented on code in PR #7649:
URL: https://github.com/apache/arrow-rs/pull/7649#discussion_r2145291279
##
arrow-row/src/run_test.rs:
##
@@ -0,0 +1,55 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#[cfg(test)]
+mod tests {
+use crate::{RowConverter, SortField};
+use arrow_array::types::Int32Type;
+use arrow_array::RunArray;
+use arrow_schema::DataType;
+use std::sync::Arc;
+
+#[test]
+fn test_run_end_encoded_supports_datatype() {
+// Test that the RowConverter correctly supports run-end encoded arrays
+assert!(RowConverter::supports_datatype(&DataType::RunEndEncoded(
+Arc::new(arrow_schema::Field::new("run_ends", DataType::Int32,
false)),
+Arc::new(arrow_schema::Field::new("values", DataType::Utf8, true)),
+)));
+}
+
+#[test]
+fn test_run_end_encoded_sorting() {
+// Create two run arrays with different values
+let run_array1: RunArray = vec!["b", "b",
"a"].into_iter().collect();
+let run_array2: RunArray = vec!["a", "a",
"b"].into_iter().collect();
+
+let converter =
RowConverter::new(vec![SortField::new(DataType::RunEndEncoded(
+Arc::new(arrow_schema::Field::new("run_ends", DataType::Int32,
false)),
+Arc::new(arrow_schema::Field::new("values", DataType::Utf8, true)),
+))])
+.unwrap();
+
+let rows1 =
converter.convert_columns(&[Arc::new(run_array1)]).unwrap();
+let rows2 =
converter.convert_columns(&[Arc::new(run_array2)]).unwrap();
+
+// Compare rows - the second row should be less than the first
+assert!(rows2.row(0) < rows1.row(0));
+assert!(rows2.row(1) < rows1.row(0));
+assert!(rows1.row(2) < rows2.row(2));
+}
Review Comment:
Good call, there were *a lot* of problems. Refactoring a bit then pushing
the latest changes.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
Re: [PR] arrow-row: Add support for REE [arrow-rs]
brancz commented on code in PR #7649:
URL: https://github.com/apache/arrow-rs/pull/7649#discussion_r2145292078
##
arrow-row/src/run.rs:
##
@@ -0,0 +1,164 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use crate::{null_sentinel, RowConverter, Rows, SortField};
+use arrow_array::types::RunEndIndexType;
+use arrow_array::{Array, RunArray};
+use arrow_buffer::{ArrowNativeType, Buffer};
+use arrow_schema::{ArrowError, SortOptions};
+
+/// Computes the lengths of each row for a RunEndEncodedArray
+pub fn compute_lengths(
+lengths: &mut [usize],
+rows: &Rows,
+array: &RunArray,
+) {
+// We don't need to use run_ends directly, just access through the array
+for (idx, length) in lengths.iter_mut().enumerate() {
+let physical_idx = array.get_physical_index(idx);
+let is_valid = array.values().is_valid(physical_idx);
+if is_valid {
+let row = rows.row(physical_idx);
+*length += row.data.len() + 1; // 1 for the validity byte
+} else {
+*length += 1; // just the null sentinel
+}
+}
+}
+
+/// Encodes the provided `RunEndEncodedArray` to `out` with the provided
`SortOptions`
+///
+/// `rows` should contain the encoded values
+pub fn encode(
+data: &mut [u8],
+offsets: &mut [usize],
+rows: &Rows,
+opts: SortOptions,
+array: &RunArray,
+) {
+for (idx, offset) in offsets.iter_mut().skip(1).enumerate() {
+let physical_idx = array.get_physical_index(idx);
+let is_valid = array.values().is_valid(physical_idx);
+
+let out = &mut data[*offset..];
+if is_valid {
+let row = rows.row(physical_idx);
+out[0] = 1; // valid
+out[1..1 + row.data.len()].copy_from_slice(row.data);
+*offset += row.data.len() + 1;
+} else {
+out[0] = null_sentinel(opts);
+*offset += 1;
+}
+}
+}
+
+/// Decodes a RunEndEncodedArray from `rows` with the provided `options`
+///
+/// # Safety
+///
+/// `rows` must contain valid data for the provided `converter`
+pub unsafe fn decode(
+converter: &RowConverter,
+rows: &mut [&[u8]],
+field: &SortField,
+validate_utf8: bool,
+) -> Result, ArrowError> {
+let opts = field.options;
+
+// Track null values and collect row data to avoid borrow issues
+let mut valid_flags = Vec::with_capacity(rows.len());
+let mut row_data = Vec::with_capacity(rows.len());
+
+// First pass: collect valid flags and data for each row
+for row in rows.iter() {
+let is_valid = row[0] != null_sentinel(opts);
+valid_flags.push(is_valid);
+if is_valid {
+row_data.push(&row[1..]);
+} else {
+row_data.push(&[][..]);
+}
+}
+
+// Now build run ends and values
+let mut run_ends = Vec::new();
+let mut values_data = Vec::new();
+let mut current_value_idx = 0;
+let mut current_run_end = 0;
+
+for (idx, is_valid) in valid_flags.iter().enumerate() {
+current_run_end += 1;
+
+if idx == 0 {
+// Add the first row data
+values_data.push(row_data[idx]);
+run_ends.push(R::Native::usize_as(current_run_end));
+continue;
+}
+
+// Check if this row is different from the previous one
+let value_changed = if !is_valid {
+// Null value - check if previous was null
+!valid_flags[idx - 1]
+} else if !valid_flags[idx - 1] {
+// Previous was null, this is not
+true
+} else {
+// Both are valid, compare data
+let prev_data = row_data[idx - 1];
+let curr_data = row_data[idx];
+prev_data != curr_data
+};
+
+if value_changed {
+// Start a new run
+current_value_idx += 1;
+values_data.push(row_data[idx]);
+run_ends.push(R::Native::usize_as(current_run_end));
+} else {
+// Update the current run end
+run_ends[current_value_idx] = R::Native::usize_as(current_run_end);
+}
+}
+
+// Convert collected values to arrays
+
Re: [PR] arrow-row: Add support for REE [arrow-rs]
brancz commented on code in PR #7649: URL: https://github.com/apache/arrow-rs/pull/7649#discussion_r2144567160 ## arrow-row/src/run_test.rs: ## @@ -0,0 +1,55 @@ +// Licensed to the Apache Software Foundation (ASF) under one Review Comment: done in [4f9b8f3](https://github.com/apache/arrow-rs/pull/7649/commits/4f9b8f38d9138790933213d18b08801f9a4e9fd8) -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected]
Re: [PR] arrow-row: Add support for REE [arrow-rs]
brancz commented on code in PR #7649:
URL: https://github.com/apache/arrow-rs/pull/7649#discussion_r2144565407
##
arrow-row/src/run.rs:
##
@@ -0,0 +1,164 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use crate::{null_sentinel, RowConverter, Rows, SortField};
+use arrow_array::types::RunEndIndexType;
+use arrow_array::{Array, RunArray};
+use arrow_buffer::{ArrowNativeType, Buffer};
+use arrow_schema::{ArrowError, SortOptions};
+
+/// Computes the lengths of each row for a RunEndEncodedArray
+pub fn compute_lengths(
+lengths: &mut [usize],
+rows: &Rows,
+array: &RunArray,
+) {
+// We don't need to use run_ends directly, just access through the array
+for (idx, length) in lengths.iter_mut().enumerate() {
+let physical_idx = array.get_physical_index(idx);
+let is_valid = array.values().is_valid(physical_idx);
+if is_valid {
+let row = rows.row(physical_idx);
+*length += row.data.len() + 1; // 1 for the validity byte
+} else {
+*length += 1; // just the null sentinel
+}
+}
+}
+
+/// Encodes the provided `RunEndEncodedArray` to `out` with the provided
`SortOptions`
+///
+/// `rows` should contain the encoded values
+pub fn encode(
+data: &mut [u8],
+offsets: &mut [usize],
+rows: &Rows,
+opts: SortOptions,
+array: &RunArray,
+) {
+for (idx, offset) in offsets.iter_mut().skip(1).enumerate() {
+let physical_idx = array.get_physical_index(idx);
+let is_valid = array.values().is_valid(physical_idx);
+
+let out = &mut data[*offset..];
+if is_valid {
+let row = rows.row(physical_idx);
+out[0] = 1; // valid
+out[1..1 + row.data.len()].copy_from_slice(row.data);
+*offset += row.data.len() + 1;
+} else {
+out[0] = null_sentinel(opts);
+*offset += 1;
+}
+}
+}
+
+/// Decodes a RunEndEncodedArray from `rows` with the provided `options`
+///
+/// # Safety
+///
+/// `rows` must contain valid data for the provided `converter`
+pub unsafe fn decode(
+converter: &RowConverter,
+rows: &mut [&[u8]],
+field: &SortField,
+validate_utf8: bool,
+) -> Result, ArrowError> {
+let opts = field.options;
+
+// Track null values and collect row data to avoid borrow issues
+let mut valid_flags = Vec::with_capacity(rows.len());
+let mut row_data = Vec::with_capacity(rows.len());
+
+// First pass: collect valid flags and data for each row
+for row in rows.iter() {
+let is_valid = row[0] != null_sentinel(opts);
+valid_flags.push(is_valid);
+if is_valid {
+row_data.push(&row[1..]);
+} else {
+row_data.push(&[][..]);
+}
+}
+
+// Now build run ends and values
+let mut run_ends = Vec::new();
+let mut values_data = Vec::new();
+let mut current_value_idx = 0;
+let mut current_run_end = 0;
+
+for (idx, is_valid) in valid_flags.iter().enumerate() {
+current_run_end += 1;
+
+if idx == 0 {
+// Add the first row data
+values_data.push(row_data[idx]);
+run_ends.push(R::Native::usize_as(current_run_end));
+continue;
+}
+
+// Check if this row is different from the previous one
+let value_changed = if !is_valid {
+// Null value - check if previous was null
+!valid_flags[idx - 1]
+} else if !valid_flags[idx - 1] {
+// Previous was null, this is not
+true
+} else {
+// Both are valid, compare data
+let prev_data = row_data[idx - 1];
+let curr_data = row_data[idx];
+prev_data != curr_data
+};
+
+if value_changed {
+// Start a new run
+current_value_idx += 1;
+values_data.push(row_data[idx]);
+run_ends.push(R::Native::usize_as(current_run_end));
+} else {
+// Update the current run end
+run_ends[current_value_idx] = R::Native::usize_as(current_run_end);
+}
+}
+
+// Convert collected values to arrays
+
Re: [PR] arrow-row: Add support for REE [arrow-rs]
brancz commented on code in PR #7649:
URL: https://github.com/apache/arrow-rs/pull/7649#discussion_r2144564722
##
arrow-row/src/run.rs:
##
@@ -0,0 +1,164 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use crate::{null_sentinel, RowConverter, Rows, SortField};
+use arrow_array::types::RunEndIndexType;
+use arrow_array::{Array, RunArray};
+use arrow_buffer::{ArrowNativeType, Buffer};
+use arrow_schema::{ArrowError, SortOptions};
+
+/// Computes the lengths of each row for a RunEndEncodedArray
+pub fn compute_lengths(
+lengths: &mut [usize],
+rows: &Rows,
+array: &RunArray,
+) {
+// We don't need to use run_ends directly, just access through the array
+for (idx, length) in lengths.iter_mut().enumerate() {
+let physical_idx = array.get_physical_index(idx);
+let is_valid = array.values().is_valid(physical_idx);
+if is_valid {
+let row = rows.row(physical_idx);
+*length += row.data.len() + 1; // 1 for the validity byte
+} else {
+*length += 1; // just the null sentinel
+}
+}
+}
+
+/// Encodes the provided `RunEndEncodedArray` to `out` with the provided
`SortOptions`
+///
+/// `rows` should contain the encoded values
+pub fn encode(
+data: &mut [u8],
+offsets: &mut [usize],
+rows: &Rows,
+opts: SortOptions,
+array: &RunArray,
+) {
+for (idx, offset) in offsets.iter_mut().skip(1).enumerate() {
+let physical_idx = array.get_physical_index(idx);
+let is_valid = array.values().is_valid(physical_idx);
+
+let out = &mut data[*offset..];
+if is_valid {
+let row = rows.row(physical_idx);
+out[0] = 1; // valid
+out[1..1 + row.data.len()].copy_from_slice(row.data);
+*offset += row.data.len() + 1;
+} else {
+out[0] = null_sentinel(opts);
+*offset += 1;
+}
+}
+}
+
+/// Decodes a RunEndEncodedArray from `rows` with the provided `options`
+///
+/// # Safety
+///
+/// `rows` must contain valid data for the provided `converter`
+pub unsafe fn decode(
+converter: &RowConverter,
+rows: &mut [&[u8]],
+field: &SortField,
+validate_utf8: bool,
+) -> Result, ArrowError> {
+let opts = field.options;
+
+// Track null values and collect row data to avoid borrow issues
+let mut valid_flags = Vec::with_capacity(rows.len());
Review Comment:
done in
[e39428f](https://github.com/apache/arrow-rs/pull/7649/commits/e39428f66fc577fde867923b6047ac6bd4d5a9d6)
##
arrow-row/src/run.rs:
##
@@ -0,0 +1,164 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use crate::{null_sentinel, RowConverter, Rows, SortField};
+use arrow_array::types::RunEndIndexType;
+use arrow_array::{Array, RunArray};
+use arrow_buffer::{ArrowNativeType, Buffer};
+use arrow_schema::{ArrowError, SortOptions};
+
+/// Computes the lengths of each row for a RunEndEncodedArray
+pub fn compute_lengths(
+lengths: &mut [usize],
+rows: &Rows,
+array: &RunArray,
+) {
+// We don't need to use run_ends directly, just access through the array
+for (idx, length) in lengths.iter_mut().enumerate() {
+let physical_idx = array.get_physical_index(idx);
+let is_valid = array.values().is_valid(physical_idx);
+if is_valid {
+let row = rows.row(physical_idx);
+*length += row.data.len() + 1; // 1 for the
Re: [PR] arrow-row: Add support for REE [arrow-rs]
brancz commented on code in PR #7649:
URL: https://github.com/apache/arrow-rs/pull/7649#discussion_r2144564295
##
arrow-row/src/run.rs:
##
@@ -0,0 +1,164 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use crate::{null_sentinel, RowConverter, Rows, SortField};
+use arrow_array::types::RunEndIndexType;
+use arrow_array::{Array, RunArray};
+use arrow_buffer::{ArrowNativeType, Buffer};
+use arrow_schema::{ArrowError, SortOptions};
+
+/// Computes the lengths of each row for a RunEndEncodedArray
+pub fn compute_lengths(
+lengths: &mut [usize],
+rows: &Rows,
+array: &RunArray,
+) {
+// We don't need to use run_ends directly, just access through the array
+for (idx, length) in lengths.iter_mut().enumerate() {
+let physical_idx = array.get_physical_index(idx);
+let is_valid = array.values().is_valid(physical_idx);
+if is_valid {
+let row = rows.row(physical_idx);
+*length += row.data.len() + 1; // 1 for the validity byte
+} else {
+*length += 1; // just the null sentinel
+}
+}
+}
+
+/// Encodes the provided `RunEndEncodedArray` to `out` with the provided
`SortOptions`
+///
+/// `rows` should contain the encoded values
+pub fn encode(
+data: &mut [u8],
+offsets: &mut [usize],
+rows: &Rows,
+opts: SortOptions,
+array: &RunArray,
+) {
+for (idx, offset) in offsets.iter_mut().skip(1).enumerate() {
+let physical_idx = array.get_physical_index(idx);
Review Comment:
done in
[9c61fe9](https://github.com/apache/arrow-rs/pull/7649/commits/9c61fe99c36174555e2a43076d786bcf4079a30c)
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
Re: [PR] arrow-row: Add support for REE [arrow-rs]
brancz commented on code in PR #7649: URL: https://github.com/apache/arrow-rs/pull/7649#discussion_r2144563113 ## arrow-row/src/run_test.rs: ## @@ -0,0 +1,55 @@ +// Licensed to the Apache Software Foundation (ASF) under one Review Comment: no good reason, moved them into `run.rs` -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected]
Re: [PR] arrow-row: Add support for REE [arrow-rs]
brancz commented on code in PR #7649:
URL: https://github.com/apache/arrow-rs/pull/7649#discussion_r2144557604
##
arrow-row/src/run.rs:
##
@@ -0,0 +1,164 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use crate::{null_sentinel, RowConverter, Rows, SortField};
+use arrow_array::types::RunEndIndexType;
+use arrow_array::{Array, RunArray};
+use arrow_buffer::{ArrowNativeType, Buffer};
+use arrow_schema::{ArrowError, SortOptions};
+
+/// Computes the lengths of each row for a RunEndEncodedArray
+pub fn compute_lengths(
+lengths: &mut [usize],
+rows: &Rows,
+array: &RunArray,
+) {
+// We don't need to use run_ends directly, just access through the array
+for (idx, length) in lengths.iter_mut().enumerate() {
+let physical_idx = array.get_physical_index(idx);
+let is_valid = array.values().is_valid(physical_idx);
+if is_valid {
+let row = rows.row(physical_idx);
+*length += row.data.len() + 1; // 1 for the validity byte
+} else {
+*length += 1; // just the null sentinel
+}
+}
+}
+
+/// Encodes the provided `RunEndEncodedArray` to `out` with the provided
`SortOptions`
+///
+/// `rows` should contain the encoded values
+pub fn encode(
+data: &mut [u8],
+offsets: &mut [usize],
+rows: &Rows,
+opts: SortOptions,
+array: &RunArray,
+) {
+for (idx, offset) in offsets.iter_mut().skip(1).enumerate() {
+let physical_idx = array.get_physical_index(idx);
+let is_valid = array.values().is_valid(physical_idx);
+
+let out = &mut data[*offset..];
+if is_valid {
+let row = rows.row(physical_idx);
+out[0] = 1; // valid
+out[1..1 + row.data.len()].copy_from_slice(row.data);
+*offset += row.data.len() + 1;
+} else {
+out[0] = null_sentinel(opts);
+*offset += 1;
+}
+}
+}
+
+/// Decodes a RunEndEncodedArray from `rows` with the provided `options`
+///
+/// # Safety
+///
+/// `rows` must contain valid data for the provided `converter`
+pub unsafe fn decode(
+converter: &RowConverter,
+rows: &mut [&[u8]],
+field: &SortField,
+validate_utf8: bool,
+) -> Result, ArrowError> {
+let opts = field.options;
+
+// Track null values and collect row data to avoid borrow issues
+let mut valid_flags = Vec::with_capacity(rows.len());
Review Comment:
Makes sense!
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
Re: [PR] arrow-row: Add support for REE [arrow-rs]
brancz commented on code in PR #7649:
URL: https://github.com/apache/arrow-rs/pull/7649#discussion_r2144552733
##
arrow-row/src/run.rs:
##
@@ -0,0 +1,164 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use crate::{null_sentinel, RowConverter, Rows, SortField};
+use arrow_array::types::RunEndIndexType;
+use arrow_array::{Array, RunArray};
+use arrow_buffer::{ArrowNativeType, Buffer};
+use arrow_schema::{ArrowError, SortOptions};
+
+/// Computes the lengths of each row for a RunEndEncodedArray
+pub fn compute_lengths(
+lengths: &mut [usize],
+rows: &Rows,
+array: &RunArray,
+) {
+// We don't need to use run_ends directly, just access through the array
+for (idx, length) in lengths.iter_mut().enumerate() {
+let physical_idx = array.get_physical_index(idx);
+let is_valid = array.values().is_valid(physical_idx);
+if is_valid {
+let row = rows.row(physical_idx);
+*length += row.data.len() + 1; // 1 for the validity byte
+} else {
+*length += 1; // just the null sentinel
+}
+}
+}
+
+/// Encodes the provided `RunEndEncodedArray` to `out` with the provided
`SortOptions`
+///
+/// `rows` should contain the encoded values
+pub fn encode(
+data: &mut [u8],
+offsets: &mut [usize],
+rows: &Rows,
+opts: SortOptions,
+array: &RunArray,
+) {
+for (idx, offset) in offsets.iter_mut().skip(1).enumerate() {
+let physical_idx = array.get_physical_index(idx);
+let is_valid = array.values().is_valid(physical_idx);
+
+let out = &mut data[*offset..];
+if is_valid {
+let row = rows.row(physical_idx);
+out[0] = 1; // valid
+out[1..1 + row.data.len()].copy_from_slice(row.data);
+*offset += row.data.len() + 1;
+} else {
+out[0] = null_sentinel(opts);
+*offset += 1;
+}
+}
+}
+
+/// Decodes a RunEndEncodedArray from `rows` with the provided `options`
+///
+/// # Safety
+///
+/// `rows` must contain valid data for the provided `converter`
+pub unsafe fn decode(
+converter: &RowConverter,
+rows: &mut [&[u8]],
+field: &SortField,
+validate_utf8: bool,
+) -> Result, ArrowError> {
+let opts = field.options;
+
+// Track null values and collect row data to avoid borrow issues
+let mut valid_flags = Vec::with_capacity(rows.len());
+let mut row_data = Vec::with_capacity(rows.len());
+
+// First pass: collect valid flags and data for each row
+for row in rows.iter() {
+let is_valid = row[0] != null_sentinel(opts);
+valid_flags.push(is_valid);
+if is_valid {
+row_data.push(&row[1..]);
+} else {
+row_data.push(&[][..]);
+}
+}
+
+// Now build run ends and values
+let mut run_ends = Vec::new();
+let mut values_data = Vec::new();
+let mut current_value_idx = 0;
+let mut current_run_end = 0;
+
+for (idx, is_valid) in valid_flags.iter().enumerate() {
+current_run_end += 1;
+
+if idx == 0 {
+// Add the first row data
+values_data.push(row_data[idx]);
+run_ends.push(R::Native::usize_as(current_run_end));
+continue;
+}
+
+// Check if this row is different from the previous one
+let value_changed = if !is_valid {
+// Null value - check if previous was null
+!valid_flags[idx - 1]
+} else if !valid_flags[idx - 1] {
+// Previous was null, this is not
+true
+} else {
+// Both are valid, compare data
+let prev_data = row_data[idx - 1];
+let curr_data = row_data[idx];
+prev_data != curr_data
+};
+
+if value_changed {
+// Start a new run
+current_value_idx += 1;
+values_data.push(row_data[idx]);
+run_ends.push(R::Native::usize_as(current_run_end));
+} else {
+// Update the current run end
+run_ends[current_value_idx] = R::Native::usize_as(current_run_end);
+}
+}
+
+// Convert collected values to arrays
+
Re: [PR] arrow-row: Add support for REE [arrow-rs]
brancz commented on code in PR #7649:
URL: https://github.com/apache/arrow-rs/pull/7649#discussion_r2144548240
##
arrow-row/src/run.rs:
##
@@ -0,0 +1,164 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use crate::{null_sentinel, RowConverter, Rows, SortField};
+use arrow_array::types::RunEndIndexType;
+use arrow_array::{Array, RunArray};
+use arrow_buffer::{ArrowNativeType, Buffer};
+use arrow_schema::{ArrowError, SortOptions};
+
+/// Computes the lengths of each row for a RunEndEncodedArray
+pub fn compute_lengths(
+lengths: &mut [usize],
+rows: &Rows,
+array: &RunArray,
+) {
+// We don't need to use run_ends directly, just access through the array
+for (idx, length) in lengths.iter_mut().enumerate() {
+let physical_idx = array.get_physical_index(idx);
+let is_valid = array.values().is_valid(physical_idx);
+if is_valid {
+let row = rows.row(physical_idx);
+*length += row.data.len() + 1; // 1 for the validity byte
+} else {
+*length += 1; // just the null sentinel
+}
+}
+}
+
+/// Encodes the provided `RunEndEncodedArray` to `out` with the provided
`SortOptions`
+///
+/// `rows` should contain the encoded values
+pub fn encode(
+data: &mut [u8],
+offsets: &mut [usize],
+rows: &Rows,
+opts: SortOptions,
+array: &RunArray,
+) {
+for (idx, offset) in offsets.iter_mut().skip(1).enumerate() {
+let physical_idx = array.get_physical_index(idx);
+let is_valid = array.values().is_valid(physical_idx);
+
+let out = &mut data[*offset..];
+if is_valid {
+let row = rows.row(physical_idx);
+out[0] = 1; // valid
+out[1..1 + row.data.len()].copy_from_slice(row.data);
+*offset += row.data.len() + 1;
+} else {
+out[0] = null_sentinel(opts);
+*offset += 1;
+}
+}
+}
+
+/// Decodes a RunEndEncodedArray from `rows` with the provided `options`
+///
+/// # Safety
+///
+/// `rows` must contain valid data for the provided `converter`
+pub unsafe fn decode(
+converter: &RowConverter,
+rows: &mut [&[u8]],
+field: &SortField,
+validate_utf8: bool,
+) -> Result, ArrowError> {
+let opts = field.options;
+
+// Track null values and collect row data to avoid borrow issues
+let mut valid_flags = Vec::with_capacity(rows.len());
+let mut row_data = Vec::with_capacity(rows.len());
+
+// First pass: collect valid flags and data for each row
+for row in rows.iter() {
+let is_valid = row[0] != null_sentinel(opts);
+valid_flags.push(is_valid);
+if is_valid {
+row_data.push(&row[1..]);
+} else {
+row_data.push(&[][..]);
+}
+}
+
+// Now build run ends and values
+let mut run_ends = Vec::new();
+let mut values_data = Vec::new();
+let mut current_value_idx = 0;
+let mut current_run_end = 0;
+
+for (idx, is_valid) in valid_flags.iter().enumerate() {
+current_run_end += 1;
+
+if idx == 0 {
+// Add the first row data
+values_data.push(row_data[idx]);
+run_ends.push(R::Native::usize_as(current_run_end));
+continue;
+}
+
+// Check if this row is different from the previous one
+let value_changed = if !is_valid {
+// Null value - check if previous was null
+!valid_flags[idx - 1]
+} else if !valid_flags[idx - 1] {
+// Previous was null, this is not
+true
+} else {
+// Both are valid, compare data
+let prev_data = row_data[idx - 1];
+let curr_data = row_data[idx];
+prev_data != curr_data
+};
+
+if value_changed {
+// Start a new run
+current_value_idx += 1;
+values_data.push(row_data[idx]);
+run_ends.push(R::Native::usize_as(current_run_end));
+} else {
+// Update the current run end
+run_ends[current_value_idx] = R::Native::usize_as(current_run_end);
+}
+}
+
+// Convert collected values to arrays
+
Re: [PR] arrow-row: Add support for REE [arrow-rs]
brancz commented on code in PR #7649:
URL: https://github.com/apache/arrow-rs/pull/7649#discussion_r2144549751
##
arrow-row/src/run.rs:
##
@@ -0,0 +1,164 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use crate::{null_sentinel, RowConverter, Rows, SortField};
+use arrow_array::types::RunEndIndexType;
+use arrow_array::{Array, RunArray};
+use arrow_buffer::{ArrowNativeType, Buffer};
+use arrow_schema::{ArrowError, SortOptions};
+
+/// Computes the lengths of each row for a RunEndEncodedArray
+pub fn compute_lengths(
+lengths: &mut [usize],
+rows: &Rows,
+array: &RunArray,
+) {
+// We don't need to use run_ends directly, just access through the array
+for (idx, length) in lengths.iter_mut().enumerate() {
+let physical_idx = array.get_physical_index(idx);
+let is_valid = array.values().is_valid(physical_idx);
+if is_valid {
+let row = rows.row(physical_idx);
+*length += row.data.len() + 1; // 1 for the validity byte
+} else {
+*length += 1; // just the null sentinel
+}
+}
+}
+
+/// Encodes the provided `RunEndEncodedArray` to `out` with the provided
`SortOptions`
+///
+/// `rows` should contain the encoded values
+pub fn encode(
+data: &mut [u8],
+offsets: &mut [usize],
+rows: &Rows,
+opts: SortOptions,
+array: &RunArray,
+) {
+for (idx, offset) in offsets.iter_mut().skip(1).enumerate() {
+let physical_idx = array.get_physical_index(idx);
+let is_valid = array.values().is_valid(physical_idx);
+
+let out = &mut data[*offset..];
+if is_valid {
+let row = rows.row(physical_idx);
+out[0] = 1; // valid
+out[1..1 + row.data.len()].copy_from_slice(row.data);
+*offset += row.data.len() + 1;
+} else {
+out[0] = null_sentinel(opts);
+*offset += 1;
+}
+}
+}
+
+/// Decodes a RunEndEncodedArray from `rows` with the provided `options`
+///
+/// # Safety
+///
+/// `rows` must contain valid data for the provided `converter`
+pub unsafe fn decode(
+converter: &RowConverter,
+rows: &mut [&[u8]],
+field: &SortField,
+validate_utf8: bool,
+) -> Result, ArrowError> {
+let opts = field.options;
+
+// Track null values and collect row data to avoid borrow issues
+let mut valid_flags = Vec::with_capacity(rows.len());
+let mut row_data = Vec::with_capacity(rows.len());
+
+// First pass: collect valid flags and data for each row
+for row in rows.iter() {
+let is_valid = row[0] != null_sentinel(opts);
+valid_flags.push(is_valid);
+if is_valid {
+row_data.push(&row[1..]);
+} else {
+row_data.push(&[][..]);
+}
+}
+
+// Now build run ends and values
+let mut run_ends = Vec::new();
+let mut values_data = Vec::new();
+let mut current_value_idx = 0;
+let mut current_run_end = 0;
+
+for (idx, is_valid) in valid_flags.iter().enumerate() {
+current_run_end += 1;
+
+if idx == 0 {
+// Add the first row data
+values_data.push(row_data[idx]);
+run_ends.push(R::Native::usize_as(current_run_end));
+continue;
+}
+
+// Check if this row is different from the previous one
+let value_changed = if !is_valid {
+// Null value - check if previous was null
+!valid_flags[idx - 1]
+} else if !valid_flags[idx - 1] {
+// Previous was null, this is not
+true
+} else {
+// Both are valid, compare data
+let prev_data = row_data[idx - 1];
+let curr_data = row_data[idx];
+prev_data != curr_data
+};
+
+if value_changed {
+// Start a new run
+current_value_idx += 1;
+values_data.push(row_data[idx]);
+run_ends.push(R::Native::usize_as(current_run_end));
+} else {
+// Update the current run end
+run_ends[current_value_idx] = R::Native::usize_as(current_run_end);
+}
+}
+
+// Convert collected values to arrays
+
Re: [PR] arrow-row: Add support for REE [arrow-rs]
brancz commented on code in PR #7649:
URL: https://github.com/apache/arrow-rs/pull/7649#discussion_r2144519339
##
arrow-row/src/run.rs:
##
@@ -0,0 +1,164 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use crate::{null_sentinel, RowConverter, Rows, SortField};
+use arrow_array::types::RunEndIndexType;
+use arrow_array::{Array, RunArray};
+use arrow_buffer::{ArrowNativeType, Buffer};
+use arrow_schema::{ArrowError, SortOptions};
+
+/// Computes the lengths of each row for a RunEndEncodedArray
+pub fn compute_lengths(
+lengths: &mut [usize],
+rows: &Rows,
+array: &RunArray,
+) {
+// We don't need to use run_ends directly, just access through the array
+for (idx, length) in lengths.iter_mut().enumerate() {
+let physical_idx = array.get_physical_index(idx);
+let is_valid = array.values().is_valid(physical_idx);
+if is_valid {
+let row = rows.row(physical_idx);
+*length += row.data.len() + 1; // 1 for the validity byte
+} else {
+*length += 1; // just the null sentinel
+}
+}
+}
+
+/// Encodes the provided `RunEndEncodedArray` to `out` with the provided
`SortOptions`
+///
+/// `rows` should contain the encoded values
+pub fn encode(
+data: &mut [u8],
+offsets: &mut [usize],
+rows: &Rows,
+opts: SortOptions,
+array: &RunArray,
+) {
+for (idx, offset) in offsets.iter_mut().skip(1).enumerate() {
+let physical_idx = array.get_physical_index(idx);
Review Comment:
turns out it's actually also more readable that way
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
Re: [PR] arrow-row: Add support for REE [arrow-rs]
brancz commented on code in PR #7649:
URL: https://github.com/apache/arrow-rs/pull/7649#discussion_r2144508333
##
arrow-row/src/run.rs:
##
@@ -0,0 +1,164 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use crate::{null_sentinel, RowConverter, Rows, SortField};
+use arrow_array::types::RunEndIndexType;
+use arrow_array::{Array, RunArray};
+use arrow_buffer::{ArrowNativeType, Buffer};
+use arrow_schema::{ArrowError, SortOptions};
+
+/// Computes the lengths of each row for a RunEndEncodedArray
+pub fn compute_lengths(
+lengths: &mut [usize],
+rows: &Rows,
+array: &RunArray,
+) {
+// We don't need to use run_ends directly, just access through the array
+for (idx, length) in lengths.iter_mut().enumerate() {
+let physical_idx = array.get_physical_index(idx);
+let is_valid = array.values().is_valid(physical_idx);
+if is_valid {
+let row = rows.row(physical_idx);
+*length += row.data.len() + 1; // 1 for the validity byte
+} else {
+*length += 1; // just the null sentinel
+}
+}
+}
+
+/// Encodes the provided `RunEndEncodedArray` to `out` with the provided
`SortOptions`
+///
+/// `rows` should contain the encoded values
+pub fn encode(
+data: &mut [u8],
+offsets: &mut [usize],
+rows: &Rows,
+opts: SortOptions,
+array: &RunArray,
+) {
+for (idx, offset) in offsets.iter_mut().skip(1).enumerate() {
+let physical_idx = array.get_physical_index(idx);
Review Comment:
good catch, I don't know why I wasn't thinking of that
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
Re: [PR] arrow-row: Add support for REE [arrow-rs]
alamb commented on code in PR #7649:
URL: https://github.com/apache/arrow-rs/pull/7649#discussion_r2143666108
##
arrow-row/src/lib.rs:
##
@@ -145,9 +145,11 @@ use variable::{decode_binary_view, decode_string_view};
use crate::fixed::{decode_bool, decode_fixed_size_binary, decode_primitive};
use crate::variable::{decode_binary, decode_string};
+use arrow_array::types::{Int16Type, Int32Type, Int64Type};
mod fixed;
mod list;
+mod run;
Review Comment:
I double checked and `run` is consistent with the naming of REEArray
elsewhere in the crate 👍
##
arrow-row/src/run.rs:
##
@@ -0,0 +1,164 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use crate::{null_sentinel, RowConverter, Rows, SortField};
+use arrow_array::types::RunEndIndexType;
+use arrow_array::{Array, RunArray};
+use arrow_buffer::{ArrowNativeType, Buffer};
+use arrow_schema::{ArrowError, SortOptions};
+
+/// Computes the lengths of each row for a RunEndEncodedArray
+pub fn compute_lengths(
+lengths: &mut [usize],
+rows: &Rows,
+array: &RunArray,
+) {
+// We don't need to use run_ends directly, just access through the array
+for (idx, length) in lengths.iter_mut().enumerate() {
+let physical_idx = array.get_physical_index(idx);
+let is_valid = array.values().is_valid(physical_idx);
+if is_valid {
+let row = rows.row(physical_idx);
+*length += row.data.len() + 1; // 1 for the validity byte
+} else {
+*length += 1; // just the null sentinel
+}
+}
+}
+
+/// Encodes the provided `RunEndEncodedArray` to `out` with the provided
`SortOptions`
+///
+/// `rows` should contain the encoded values
+pub fn encode(
+data: &mut [u8],
+offsets: &mut [usize],
+rows: &Rows,
+opts: SortOptions,
+array: &RunArray,
+) {
+for (idx, offset) in offsets.iter_mut().skip(1).enumerate() {
+let physical_idx = array.get_physical_index(idx);
+let is_valid = array.values().is_valid(physical_idx);
+
+let out = &mut data[*offset..];
+if is_valid {
+let row = rows.row(physical_idx);
+out[0] = 1; // valid
+out[1..1 + row.data.len()].copy_from_slice(row.data);
+*offset += row.data.len() + 1;
+} else {
+out[0] = null_sentinel(opts);
+*offset += 1;
+}
+}
+}
+
+/// Decodes a RunEndEncodedArray from `rows` with the provided `options`
+///
+/// # Safety
+///
+/// `rows` must contain valid data for the provided `converter`
+pub unsafe fn decode(
+converter: &RowConverter,
+rows: &mut [&[u8]],
+field: &SortField,
+validate_utf8: bool,
+) -> Result, ArrowError> {
+let opts = field.options;
+
+// Track null values and collect row data to avoid borrow issues
+let mut valid_flags = Vec::with_capacity(rows.len());
Review Comment:
I think you could use `BooleanBufferBuilder` here which might be more
efficient
##
arrow-row/src/lib.rs:
##
@@ -400,6 +404,17 @@ impl Codec {
};
Ok(Self::Dictionary(converter, owned))
}
+DataType::RunEndEncoded(_, values) => {
+// Similar to List implementation
Review Comment:
Maybe we can pull the transformation into a documented helper function (not
needed, I just was confused for a bit until I read the comments in the
List/LargeList implementation
##
arrow-row/src/run_test.rs:
##
@@ -0,0 +1,55 @@
+// Licensed to the Apache Software Foundation (ASF) under one
Review Comment:
I think it is more standard in this repo to put unit tests like this in the
same module (aka I would expect this to be in `arrow-row/src/run.rs`
Is there any reason it is in a different module?
##
arrow-row/src/run.rs:
##
@@ -0,0 +1,164 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use
