Re: [PR] arrow-row: Add support for REE [arrow-rs]

2025-06-17 Thread via GitHub


alamb commented on code in PR #7649:
URL: https://github.com/apache/arrow-rs/pull/7649#discussion_r2153018217


##
arrow-row/src/run.rs:
##
@@ -0,0 +1,695 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use crate::{variable, RowConverter, Rows, SortField};
+use arrow_array::types::RunEndIndexType;
+use arrow_array::{PrimitiveArray, RunArray};
+use arrow_buffer::{ArrowNativeType, ScalarBuffer};
+use arrow_schema::{ArrowError, SortOptions};
+
+/// Computes the lengths of each row for a RunEndEncodedArray
+pub fn compute_lengths(
+lengths: &mut [usize],
+rows: &Rows,
+array: &RunArray,
+) {
+let run_ends = array.run_ends().values();
+let mut logical_start = 0;
+
+// Iterate over each run and apply the same length to all logical 
positions in the run
+for (physical_idx, &run_end) in run_ends.iter().enumerate() {
+let logical_end = run_end.as_usize();
+let row = rows.row(physical_idx);
+let encoded_len = variable::encoded_len(Some(row.data));
+
+// Add the same length for all logical positions in this run
+for length in &mut lengths[logical_start..logical_end] {
+*length += encoded_len;
+}
+
+logical_start = logical_end;
+}
+}
+
+/// Encodes the provided `RunEndEncodedArray` to `out` with the provided 
`SortOptions`
+///
+/// `rows` should contain the encoded values
+pub fn encode(
+data: &mut [u8],
+offsets: &mut [usize],
+rows: &Rows,
+opts: SortOptions,
+array: &RunArray,
+) {
+let run_ends = array.run_ends();
+
+let mut logical_idx = 0;
+let mut offset_idx = 1; // Skip first offset
+
+// Iterate over each run
+for physical_idx in 0..run_ends.values().len() {
+let run_end = run_ends.values()[physical_idx].as_usize();
+
+// Process all elements in this run
+while logical_idx < run_end && offset_idx < offsets.len() {
+let offset = &mut offsets[offset_idx];
+let out = &mut data[*offset..];
+
+// Use variable-length encoding to make the data self-describing
+let row = rows.row(physical_idx);

Review Comment:
   Tracking with
   - https://github.com/apache/arrow-rs/issues/7693



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]



Re: [PR] arrow-row: Add support for REE [arrow-rs]

2025-06-17 Thread via GitHub


alamb commented on code in PR #7649:
URL: https://github.com/apache/arrow-rs/pull/7649#discussion_r2153013293


##
arrow-row/src/run.rs:
##
@@ -0,0 +1,695 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use crate::{variable, RowConverter, Rows, SortField};
+use arrow_array::types::RunEndIndexType;
+use arrow_array::{PrimitiveArray, RunArray};
+use arrow_buffer::{ArrowNativeType, ScalarBuffer};
+use arrow_schema::{ArrowError, SortOptions};
+
+/// Computes the lengths of each row for a RunEndEncodedArray
+pub fn compute_lengths(
+lengths: &mut [usize],
+rows: &Rows,
+array: &RunArray,
+) {
+let run_ends = array.run_ends().values();
+let mut logical_start = 0;
+
+// Iterate over each run and apply the same length to all logical 
positions in the run
+for (physical_idx, &run_end) in run_ends.iter().enumerate() {
+let logical_end = run_end.as_usize();
+let row = rows.row(physical_idx);
+let encoded_len = variable::encoded_len(Some(row.data));
+
+// Add the same length for all logical positions in this run
+for length in &mut lengths[logical_start..logical_end] {
+*length += encoded_len;
+}
+
+logical_start = logical_end;
+}
+}
+
+/// Encodes the provided `RunEndEncodedArray` to `out` with the provided 
`SortOptions`
+///
+/// `rows` should contain the encoded values
+pub fn encode(
+data: &mut [u8],
+offsets: &mut [usize],
+rows: &Rows,
+opts: SortOptions,
+array: &RunArray,
+) {
+let run_ends = array.run_ends();
+
+let mut logical_idx = 0;
+let mut offset_idx = 1; // Skip first offset
+
+// Iterate over each run
+for physical_idx in 0..run_ends.values().len() {
+let run_end = run_ends.values()[physical_idx].as_usize();
+
+// Process all elements in this run
+while logical_idx < run_end && offset_idx < offsets.len() {
+let offset = &mut offsets[offset_idx];
+let out = &mut data[*offset..];
+
+// Use variable-length encoding to make the data self-describing
+let row = rows.row(physical_idx);
+let bytes_written = variable::encode_one(out, Some(row.data), 
opts);
+*offset += bytes_written;
+
+logical_idx += 1;
+offset_idx += 1;
+}
+
+// Break if we've processed all offsets
+if offset_idx >= offsets.len() {
+break;
+}
+}
+}
+
+/// Decodes a RunEndEncodedArray from `rows` with the provided `options`
+///
+/// # Safety
+///
+/// `rows` must contain valid data for the provided `converter`
+pub unsafe fn decode(
+converter: &RowConverter,
+rows: &mut [&[u8]],
+field: &SortField,
+validate_utf8: bool,
+) -> Result, ArrowError> {
+if rows.is_empty() {
+let values = converter.convert_raw(&mut [], validate_utf8)?;
+let run_ends_array = 
PrimitiveArraynew(ScalarBuffer::from(vec![]), None);
+return RunArraytry_new(&run_ends_array, &values[0]);
+}
+
+// Decode each row's REE data and collect the decoded values
+let mut decoded_values = Vec::new();
+let mut run_ends = Vec::new();
+let mut unique_row_indices = Vec::new();
+
+// Process each row to extract its REE data (following decode_binary 
pattern)
+let mut decoded_data = Vec::new();
+for (idx, row) in rows.iter_mut().enumerate() {
+decoded_data.clear();
+// Extract the decoded value data from this row
+let consumed = variable::decode_blocks(row, field.options, |block| {
+decoded_data.extend_from_slice(block);
+});
+
+// Handle bit inversion for descending sort (following decode_binary 
pattern)
+if field.options.descending {
+decoded_data.iter_mut().for_each(|b| *b = !*b);
+}
+
+// Update the row to point past the consumed REE data
+*row = &row[consumed..];
+
+// Check if this decoded value is the same as the previous one to 
identify runs
+let is_new_run =
+idx == 0 || decoded_data != 
decoded_values[*unique_row_indices.last().unwrap()];
+
+if is_new_run {
+// This is 

Re: [PR] arrow-row: Add support for REE [arrow-rs]

2025-06-17 Thread via GitHub


alamb commented on code in PR #7649:
URL: https://github.com/apache/arrow-rs/pull/7649#discussion_r2153011523


##
arrow-row/src/run.rs:
##
@@ -0,0 +1,695 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use crate::{variable, RowConverter, Rows, SortField};
+use arrow_array::types::RunEndIndexType;
+use arrow_array::{PrimitiveArray, RunArray};
+use arrow_buffer::{ArrowNativeType, ScalarBuffer};
+use arrow_schema::{ArrowError, SortOptions};
+
+/// Computes the lengths of each row for a RunEndEncodedArray
+pub fn compute_lengths(
+lengths: &mut [usize],
+rows: &Rows,
+array: &RunArray,
+) {
+let run_ends = array.run_ends().values();
+let mut logical_start = 0;
+
+// Iterate over each run and apply the same length to all logical 
positions in the run
+for (physical_idx, &run_end) in run_ends.iter().enumerate() {
+let logical_end = run_end.as_usize();
+let row = rows.row(physical_idx);
+let encoded_len = variable::encoded_len(Some(row.data));
+
+// Add the same length for all logical positions in this run
+for length in &mut lengths[logical_start..logical_end] {
+*length += encoded_len;
+}
+
+logical_start = logical_end;
+}
+}
+
+/// Encodes the provided `RunEndEncodedArray` to `out` with the provided 
`SortOptions`
+///
+/// `rows` should contain the encoded values
+pub fn encode(
+data: &mut [u8],
+offsets: &mut [usize],
+rows: &Rows,
+opts: SortOptions,
+array: &RunArray,
+) {
+let run_ends = array.run_ends();
+
+let mut logical_idx = 0;
+let mut offset_idx = 1; // Skip first offset
+
+// Iterate over each run
+for physical_idx in 0..run_ends.values().len() {
+let run_end = run_ends.values()[physical_idx].as_usize();
+
+// Process all elements in this run
+while logical_idx < run_end && offset_idx < offsets.len() {
+let offset = &mut offsets[offset_idx];
+let out = &mut data[*offset..];
+
+// Use variable-length encoding to make the data self-describing
+let row = rows.row(physical_idx);
+let bytes_written = variable::encode_one(out, Some(row.data), 
opts);
+*offset += bytes_written;
+
+logical_idx += 1;
+offset_idx += 1;
+}
+
+// Break if we've processed all offsets
+if offset_idx >= offsets.len() {
+break;
+}
+}
+}
+
+/// Decodes a RunEndEncodedArray from `rows` with the provided `options`
+///
+/// # Safety
+///
+/// `rows` must contain valid data for the provided `converter`
+pub unsafe fn decode(
+converter: &RowConverter,
+rows: &mut [&[u8]],
+field: &SortField,
+validate_utf8: bool,
+) -> Result, ArrowError> {
+if rows.is_empty() {
+let values = converter.convert_raw(&mut [], validate_utf8)?;
+let run_ends_array = 
PrimitiveArraynew(ScalarBuffer::from(vec![]), None);
+return RunArraytry_new(&run_ends_array, &values[0]);
+}
+
+// Decode each row's REE data and collect the decoded values
+let mut decoded_values = Vec::new();
+let mut run_ends = Vec::new();
+let mut unique_row_indices = Vec::new();
+
+// Process each row to extract its REE data (following decode_binary 
pattern)
+let mut decoded_data = Vec::new();
+for (idx, row) in rows.iter_mut().enumerate() {
+decoded_data.clear();
+// Extract the decoded value data from this row
+let consumed = variable::decode_blocks(row, field.options, |block| {
+decoded_data.extend_from_slice(block);
+});
+
+// Handle bit inversion for descending sort (following decode_binary 
pattern)
+if field.options.descending {
+decoded_data.iter_mut().for_each(|b| *b = !*b);
+}
+
+// Update the row to point past the consumed REE data
+*row = &row[consumed..];
+
+// Check if this decoded value is the same as the previous one to 
identify runs
+let is_new_run =
+idx == 0 || decoded_data != 
decoded_values[*unique_row_indices.last().unwrap()];
+
+if is_new_run {
+// This is 

Re: [PR] arrow-row: Add support for REE [arrow-rs]

2025-06-17 Thread via GitHub


alamb commented on PR #7649:
URL: https://github.com/apache/arrow-rs/pull/7649#issuecomment-2981535190

   Thanks again @brancz 


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]



Re: [PR] arrow-row: Add support for REE [arrow-rs]

2025-06-17 Thread via GitHub


alamb merged PR #7649:
URL: https://github.com/apache/arrow-rs/pull/7649


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]



Re: [PR] arrow-row: Add support for REE [arrow-rs]

2025-06-17 Thread via GitHub


alamb commented on PR #7649:
URL: https://github.com/apache/arrow-rs/pull/7649#issuecomment-2981534648

   I plan to merge this PR in and then file follow on issues for items found 
during review
   1) adding partial eq to REE
   2) refactor the round trip tests a bit to deduplicate code
   3) only decode each physical value once


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]



Re: [PR] arrow-row: Add support for REE [arrow-rs]

2025-06-16 Thread via GitHub


alamb commented on PR #7649:
URL: https://github.com/apache/arrow-rs/pull/7649#issuecomment-2978072151

   > I think we need test coverage for the following cases (I'll make a PR)
   
   Here is a PR: 
   - https://github.com/apache/arrow-rs/pull/7680


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]



Re: [PR] arrow-row: Add support for REE [arrow-rs]

2025-06-16 Thread via GitHub


alamb commented on code in PR #7649:
URL: https://github.com/apache/arrow-rs/pull/7649#discussion_r2150765924


##
arrow-row/src/run.rs:
##
@@ -0,0 +1,695 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use crate::{variable, RowConverter, Rows, SortField};
+use arrow_array::types::RunEndIndexType;
+use arrow_array::{PrimitiveArray, RunArray};
+use arrow_buffer::{ArrowNativeType, ScalarBuffer};
+use arrow_schema::{ArrowError, SortOptions};
+
+/// Computes the lengths of each row for a RunEndEncodedArray
+pub fn compute_lengths(
+lengths: &mut [usize],
+rows: &Rows,
+array: &RunArray,
+) {
+let run_ends = array.run_ends().values();
+let mut logical_start = 0;
+
+// Iterate over each run and apply the same length to all logical 
positions in the run
+for (physical_idx, &run_end) in run_ends.iter().enumerate() {
+let logical_end = run_end.as_usize();
+let row = rows.row(physical_idx);
+let encoded_len = variable::encoded_len(Some(row.data));
+
+// Add the same length for all logical positions in this run
+for length in &mut lengths[logical_start..logical_end] {
+*length += encoded_len;
+}
+
+logical_start = logical_end;
+}
+}
+
+/// Encodes the provided `RunEndEncodedArray` to `out` with the provided 
`SortOptions`
+///
+/// `rows` should contain the encoded values
+pub fn encode(
+data: &mut [u8],
+offsets: &mut [usize],
+rows: &Rows,
+opts: SortOptions,
+array: &RunArray,
+) {
+let run_ends = array.run_ends();
+
+let mut logical_idx = 0;
+let mut offset_idx = 1; // Skip first offset
+
+// Iterate over each run
+for physical_idx in 0..run_ends.values().len() {
+let run_end = run_ends.values()[physical_idx].as_usize();
+
+// Process all elements in this run
+while logical_idx < run_end && offset_idx < offsets.len() {
+let offset = &mut offsets[offset_idx];
+let out = &mut data[*offset..];
+
+// Use variable-length encoding to make the data self-describing
+let row = rows.row(physical_idx);

Review Comment:
   Some random performance optimization thoughts (for some future PR):
   
   1. You could hoist this out of the inner loop so it was executed once per 
physical value rather than once per logical value
   2. You could potentially encode row once and then simply copy the encoded 
bytes for all remaining rows. This is probably significantly faster than 
re-encoding the same value over and over again.
   



##
arrow-row/src/run.rs:
##
@@ -0,0 +1,695 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use crate::{variable, RowConverter, Rows, SortField};
+use arrow_array::types::RunEndIndexType;
+use arrow_array::{PrimitiveArray, RunArray};
+use arrow_buffer::{ArrowNativeType, ScalarBuffer};
+use arrow_schema::{ArrowError, SortOptions};
+
+/// Computes the lengths of each row for a RunEndEncodedArray
+pub fn compute_lengths(
+lengths: &mut [usize],
+rows: &Rows,
+array: &RunArray,
+) {
+let run_ends = array.run_ends().values();
+let mut logical_start = 0;
+
+// Iterate over each run and apply the same length to all logical 
positions in the run
+for (physical_idx, &run_end) in run_ends.iter().enumerate() {
+let logical_end = run_end.as_usize();
+let row = rows.row(physical_idx);
+let encoded_len = vari

Re: [PR] arrow-row: Add support for REE [arrow-rs]

2025-06-13 Thread via GitHub


brancz commented on PR #7649:
URL: https://github.com/apache/arrow-rs/pull/7649#issuecomment-2970703802

   @alamb you can ignore all intermediate commits, the last commit pretty much 
re-writes everything and I think is also way more readable and understandable 
(and most of all correct).


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]



Re: [PR] arrow-row: Add support for REE [arrow-rs]

2025-06-13 Thread via GitHub


brancz commented on code in PR #7649:
URL: https://github.com/apache/arrow-rs/pull/7649#discussion_r2145291279


##
arrow-row/src/run_test.rs:
##
@@ -0,0 +1,55 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#[cfg(test)]
+mod tests {
+use crate::{RowConverter, SortField};
+use arrow_array::types::Int32Type;
+use arrow_array::RunArray;
+use arrow_schema::DataType;
+use std::sync::Arc;
+
+#[test]
+fn test_run_end_encoded_supports_datatype() {
+// Test that the RowConverter correctly supports run-end encoded arrays
+assert!(RowConverter::supports_datatype(&DataType::RunEndEncoded(
+Arc::new(arrow_schema::Field::new("run_ends", DataType::Int32, 
false)),
+Arc::new(arrow_schema::Field::new("values", DataType::Utf8, true)),
+)));
+}
+
+#[test]
+fn test_run_end_encoded_sorting() {
+// Create two run arrays with different values
+let run_array1: RunArray = vec!["b", "b", 
"a"].into_iter().collect();
+let run_array2: RunArray = vec!["a", "a", 
"b"].into_iter().collect();
+
+let converter = 
RowConverter::new(vec![SortField::new(DataType::RunEndEncoded(
+Arc::new(arrow_schema::Field::new("run_ends", DataType::Int32, 
false)),
+Arc::new(arrow_schema::Field::new("values", DataType::Utf8, true)),
+))])
+.unwrap();
+
+let rows1 = 
converter.convert_columns(&[Arc::new(run_array1)]).unwrap();
+let rows2 = 
converter.convert_columns(&[Arc::new(run_array2)]).unwrap();
+
+// Compare rows - the second row should be less than the first
+assert!(rows2.row(0) < rows1.row(0));
+assert!(rows2.row(1) < rows1.row(0));
+assert!(rows1.row(2) < rows2.row(2));
+}

Review Comment:
   Good call, there were *a lot* of problems. Refactoring a bit then pushing 
the latest changes.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]



Re: [PR] arrow-row: Add support for REE [arrow-rs]

2025-06-13 Thread via GitHub


brancz commented on code in PR #7649:
URL: https://github.com/apache/arrow-rs/pull/7649#discussion_r2145292078


##
arrow-row/src/run.rs:
##
@@ -0,0 +1,164 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use crate::{null_sentinel, RowConverter, Rows, SortField};
+use arrow_array::types::RunEndIndexType;
+use arrow_array::{Array, RunArray};
+use arrow_buffer::{ArrowNativeType, Buffer};
+use arrow_schema::{ArrowError, SortOptions};
+
+/// Computes the lengths of each row for a RunEndEncodedArray
+pub fn compute_lengths(
+lengths: &mut [usize],
+rows: &Rows,
+array: &RunArray,
+) {
+// We don't need to use run_ends directly, just access through the array
+for (idx, length) in lengths.iter_mut().enumerate() {
+let physical_idx = array.get_physical_index(idx);
+let is_valid = array.values().is_valid(physical_idx);
+if is_valid {
+let row = rows.row(physical_idx);
+*length += row.data.len() + 1; // 1 for the validity byte
+} else {
+*length += 1; // just the null sentinel
+}
+}
+}
+
+/// Encodes the provided `RunEndEncodedArray` to `out` with the provided 
`SortOptions`
+///
+/// `rows` should contain the encoded values
+pub fn encode(
+data: &mut [u8],
+offsets: &mut [usize],
+rows: &Rows,
+opts: SortOptions,
+array: &RunArray,
+) {
+for (idx, offset) in offsets.iter_mut().skip(1).enumerate() {
+let physical_idx = array.get_physical_index(idx);
+let is_valid = array.values().is_valid(physical_idx);
+
+let out = &mut data[*offset..];
+if is_valid {
+let row = rows.row(physical_idx);
+out[0] = 1; // valid
+out[1..1 + row.data.len()].copy_from_slice(row.data);
+*offset += row.data.len() + 1;
+} else {
+out[0] = null_sentinel(opts);
+*offset += 1;
+}
+}
+}
+
+/// Decodes a RunEndEncodedArray from `rows` with the provided `options`
+///
+/// # Safety
+///
+/// `rows` must contain valid data for the provided `converter`
+pub unsafe fn decode(
+converter: &RowConverter,
+rows: &mut [&[u8]],
+field: &SortField,
+validate_utf8: bool,
+) -> Result, ArrowError> {
+let opts = field.options;
+
+// Track null values and collect row data to avoid borrow issues
+let mut valid_flags = Vec::with_capacity(rows.len());
+let mut row_data = Vec::with_capacity(rows.len());
+
+// First pass: collect valid flags and data for each row
+for row in rows.iter() {
+let is_valid = row[0] != null_sentinel(opts);
+valid_flags.push(is_valid);
+if is_valid {
+row_data.push(&row[1..]);
+} else {
+row_data.push(&[][..]);
+}
+}
+
+// Now build run ends and values
+let mut run_ends = Vec::new();
+let mut values_data = Vec::new();
+let mut current_value_idx = 0;
+let mut current_run_end = 0;
+
+for (idx, is_valid) in valid_flags.iter().enumerate() {
+current_run_end += 1;
+
+if idx == 0 {
+// Add the first row data
+values_data.push(row_data[idx]);
+run_ends.push(R::Native::usize_as(current_run_end));
+continue;
+}
+
+// Check if this row is different from the previous one
+let value_changed = if !is_valid {
+// Null value - check if previous was null
+!valid_flags[idx - 1]
+} else if !valid_flags[idx - 1] {
+// Previous was null, this is not
+true
+} else {
+// Both are valid, compare data
+let prev_data = row_data[idx - 1];
+let curr_data = row_data[idx];
+prev_data != curr_data
+};
+
+if value_changed {
+// Start a new run
+current_value_idx += 1;
+values_data.push(row_data[idx]);
+run_ends.push(R::Native::usize_as(current_run_end));
+} else {
+// Update the current run end
+run_ends[current_value_idx] = R::Native::usize_as(current_run_end);
+}
+}
+
+// Convert collected values to arrays
+  

Re: [PR] arrow-row: Add support for REE [arrow-rs]

2025-06-13 Thread via GitHub


brancz commented on code in PR #7649:
URL: https://github.com/apache/arrow-rs/pull/7649#discussion_r2144567160


##
arrow-row/src/run_test.rs:
##
@@ -0,0 +1,55 @@
+// Licensed to the Apache Software Foundation (ASF) under one

Review Comment:
   done in 
[4f9b8f3](https://github.com/apache/arrow-rs/pull/7649/commits/4f9b8f38d9138790933213d18b08801f9a4e9fd8)



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]



Re: [PR] arrow-row: Add support for REE [arrow-rs]

2025-06-13 Thread via GitHub


brancz commented on code in PR #7649:
URL: https://github.com/apache/arrow-rs/pull/7649#discussion_r2144565407


##
arrow-row/src/run.rs:
##
@@ -0,0 +1,164 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use crate::{null_sentinel, RowConverter, Rows, SortField};
+use arrow_array::types::RunEndIndexType;
+use arrow_array::{Array, RunArray};
+use arrow_buffer::{ArrowNativeType, Buffer};
+use arrow_schema::{ArrowError, SortOptions};
+
+/// Computes the lengths of each row for a RunEndEncodedArray
+pub fn compute_lengths(
+lengths: &mut [usize],
+rows: &Rows,
+array: &RunArray,
+) {
+// We don't need to use run_ends directly, just access through the array
+for (idx, length) in lengths.iter_mut().enumerate() {
+let physical_idx = array.get_physical_index(idx);
+let is_valid = array.values().is_valid(physical_idx);
+if is_valid {
+let row = rows.row(physical_idx);
+*length += row.data.len() + 1; // 1 for the validity byte
+} else {
+*length += 1; // just the null sentinel
+}
+}
+}
+
+/// Encodes the provided `RunEndEncodedArray` to `out` with the provided 
`SortOptions`
+///
+/// `rows` should contain the encoded values
+pub fn encode(
+data: &mut [u8],
+offsets: &mut [usize],
+rows: &Rows,
+opts: SortOptions,
+array: &RunArray,
+) {
+for (idx, offset) in offsets.iter_mut().skip(1).enumerate() {
+let physical_idx = array.get_physical_index(idx);
+let is_valid = array.values().is_valid(physical_idx);
+
+let out = &mut data[*offset..];
+if is_valid {
+let row = rows.row(physical_idx);
+out[0] = 1; // valid
+out[1..1 + row.data.len()].copy_from_slice(row.data);
+*offset += row.data.len() + 1;
+} else {
+out[0] = null_sentinel(opts);
+*offset += 1;
+}
+}
+}
+
+/// Decodes a RunEndEncodedArray from `rows` with the provided `options`
+///
+/// # Safety
+///
+/// `rows` must contain valid data for the provided `converter`
+pub unsafe fn decode(
+converter: &RowConverter,
+rows: &mut [&[u8]],
+field: &SortField,
+validate_utf8: bool,
+) -> Result, ArrowError> {
+let opts = field.options;
+
+// Track null values and collect row data to avoid borrow issues
+let mut valid_flags = Vec::with_capacity(rows.len());
+let mut row_data = Vec::with_capacity(rows.len());
+
+// First pass: collect valid flags and data for each row
+for row in rows.iter() {
+let is_valid = row[0] != null_sentinel(opts);
+valid_flags.push(is_valid);
+if is_valid {
+row_data.push(&row[1..]);
+} else {
+row_data.push(&[][..]);
+}
+}
+
+// Now build run ends and values
+let mut run_ends = Vec::new();
+let mut values_data = Vec::new();
+let mut current_value_idx = 0;
+let mut current_run_end = 0;
+
+for (idx, is_valid) in valid_flags.iter().enumerate() {
+current_run_end += 1;
+
+if idx == 0 {
+// Add the first row data
+values_data.push(row_data[idx]);
+run_ends.push(R::Native::usize_as(current_run_end));
+continue;
+}
+
+// Check if this row is different from the previous one
+let value_changed = if !is_valid {
+// Null value - check if previous was null
+!valid_flags[idx - 1]
+} else if !valid_flags[idx - 1] {
+// Previous was null, this is not
+true
+} else {
+// Both are valid, compare data
+let prev_data = row_data[idx - 1];
+let curr_data = row_data[idx];
+prev_data != curr_data
+};
+
+if value_changed {
+// Start a new run
+current_value_idx += 1;
+values_data.push(row_data[idx]);
+run_ends.push(R::Native::usize_as(current_run_end));
+} else {
+// Update the current run end
+run_ends[current_value_idx] = R::Native::usize_as(current_run_end);
+}
+}
+
+// Convert collected values to arrays
+  

Re: [PR] arrow-row: Add support for REE [arrow-rs]

2025-06-13 Thread via GitHub


brancz commented on code in PR #7649:
URL: https://github.com/apache/arrow-rs/pull/7649#discussion_r2144564722


##
arrow-row/src/run.rs:
##
@@ -0,0 +1,164 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use crate::{null_sentinel, RowConverter, Rows, SortField};
+use arrow_array::types::RunEndIndexType;
+use arrow_array::{Array, RunArray};
+use arrow_buffer::{ArrowNativeType, Buffer};
+use arrow_schema::{ArrowError, SortOptions};
+
+/// Computes the lengths of each row for a RunEndEncodedArray
+pub fn compute_lengths(
+lengths: &mut [usize],
+rows: &Rows,
+array: &RunArray,
+) {
+// We don't need to use run_ends directly, just access through the array
+for (idx, length) in lengths.iter_mut().enumerate() {
+let physical_idx = array.get_physical_index(idx);
+let is_valid = array.values().is_valid(physical_idx);
+if is_valid {
+let row = rows.row(physical_idx);
+*length += row.data.len() + 1; // 1 for the validity byte
+} else {
+*length += 1; // just the null sentinel
+}
+}
+}
+
+/// Encodes the provided `RunEndEncodedArray` to `out` with the provided 
`SortOptions`
+///
+/// `rows` should contain the encoded values
+pub fn encode(
+data: &mut [u8],
+offsets: &mut [usize],
+rows: &Rows,
+opts: SortOptions,
+array: &RunArray,
+) {
+for (idx, offset) in offsets.iter_mut().skip(1).enumerate() {
+let physical_idx = array.get_physical_index(idx);
+let is_valid = array.values().is_valid(physical_idx);
+
+let out = &mut data[*offset..];
+if is_valid {
+let row = rows.row(physical_idx);
+out[0] = 1; // valid
+out[1..1 + row.data.len()].copy_from_slice(row.data);
+*offset += row.data.len() + 1;
+} else {
+out[0] = null_sentinel(opts);
+*offset += 1;
+}
+}
+}
+
+/// Decodes a RunEndEncodedArray from `rows` with the provided `options`
+///
+/// # Safety
+///
+/// `rows` must contain valid data for the provided `converter`
+pub unsafe fn decode(
+converter: &RowConverter,
+rows: &mut [&[u8]],
+field: &SortField,
+validate_utf8: bool,
+) -> Result, ArrowError> {
+let opts = field.options;
+
+// Track null values and collect row data to avoid borrow issues
+let mut valid_flags = Vec::with_capacity(rows.len());

Review Comment:
   done in 
[e39428f](https://github.com/apache/arrow-rs/pull/7649/commits/e39428f66fc577fde867923b6047ac6bd4d5a9d6)



##
arrow-row/src/run.rs:
##
@@ -0,0 +1,164 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use crate::{null_sentinel, RowConverter, Rows, SortField};
+use arrow_array::types::RunEndIndexType;
+use arrow_array::{Array, RunArray};
+use arrow_buffer::{ArrowNativeType, Buffer};
+use arrow_schema::{ArrowError, SortOptions};
+
+/// Computes the lengths of each row for a RunEndEncodedArray
+pub fn compute_lengths(
+lengths: &mut [usize],
+rows: &Rows,
+array: &RunArray,
+) {
+// We don't need to use run_ends directly, just access through the array
+for (idx, length) in lengths.iter_mut().enumerate() {
+let physical_idx = array.get_physical_index(idx);
+let is_valid = array.values().is_valid(physical_idx);
+if is_valid {
+let row = rows.row(physical_idx);
+*length += row.data.len() + 1; // 1 for the

Re: [PR] arrow-row: Add support for REE [arrow-rs]

2025-06-13 Thread via GitHub


brancz commented on code in PR #7649:
URL: https://github.com/apache/arrow-rs/pull/7649#discussion_r2144564295


##
arrow-row/src/run.rs:
##
@@ -0,0 +1,164 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use crate::{null_sentinel, RowConverter, Rows, SortField};
+use arrow_array::types::RunEndIndexType;
+use arrow_array::{Array, RunArray};
+use arrow_buffer::{ArrowNativeType, Buffer};
+use arrow_schema::{ArrowError, SortOptions};
+
+/// Computes the lengths of each row for a RunEndEncodedArray
+pub fn compute_lengths(
+lengths: &mut [usize],
+rows: &Rows,
+array: &RunArray,
+) {
+// We don't need to use run_ends directly, just access through the array
+for (idx, length) in lengths.iter_mut().enumerate() {
+let physical_idx = array.get_physical_index(idx);
+let is_valid = array.values().is_valid(physical_idx);
+if is_valid {
+let row = rows.row(physical_idx);
+*length += row.data.len() + 1; // 1 for the validity byte
+} else {
+*length += 1; // just the null sentinel
+}
+}
+}
+
+/// Encodes the provided `RunEndEncodedArray` to `out` with the provided 
`SortOptions`
+///
+/// `rows` should contain the encoded values
+pub fn encode(
+data: &mut [u8],
+offsets: &mut [usize],
+rows: &Rows,
+opts: SortOptions,
+array: &RunArray,
+) {
+for (idx, offset) in offsets.iter_mut().skip(1).enumerate() {
+let physical_idx = array.get_physical_index(idx);

Review Comment:
   done in 
[9c61fe9](https://github.com/apache/arrow-rs/pull/7649/commits/9c61fe99c36174555e2a43076d786bcf4079a30c)



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]



Re: [PR] arrow-row: Add support for REE [arrow-rs]

2025-06-13 Thread via GitHub


brancz commented on code in PR #7649:
URL: https://github.com/apache/arrow-rs/pull/7649#discussion_r2144563113


##
arrow-row/src/run_test.rs:
##
@@ -0,0 +1,55 @@
+// Licensed to the Apache Software Foundation (ASF) under one

Review Comment:
   no good reason, moved them into `run.rs`



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]



Re: [PR] arrow-row: Add support for REE [arrow-rs]

2025-06-13 Thread via GitHub


brancz commented on code in PR #7649:
URL: https://github.com/apache/arrow-rs/pull/7649#discussion_r2144557604


##
arrow-row/src/run.rs:
##
@@ -0,0 +1,164 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use crate::{null_sentinel, RowConverter, Rows, SortField};
+use arrow_array::types::RunEndIndexType;
+use arrow_array::{Array, RunArray};
+use arrow_buffer::{ArrowNativeType, Buffer};
+use arrow_schema::{ArrowError, SortOptions};
+
+/// Computes the lengths of each row for a RunEndEncodedArray
+pub fn compute_lengths(
+lengths: &mut [usize],
+rows: &Rows,
+array: &RunArray,
+) {
+// We don't need to use run_ends directly, just access through the array
+for (idx, length) in lengths.iter_mut().enumerate() {
+let physical_idx = array.get_physical_index(idx);
+let is_valid = array.values().is_valid(physical_idx);
+if is_valid {
+let row = rows.row(physical_idx);
+*length += row.data.len() + 1; // 1 for the validity byte
+} else {
+*length += 1; // just the null sentinel
+}
+}
+}
+
+/// Encodes the provided `RunEndEncodedArray` to `out` with the provided 
`SortOptions`
+///
+/// `rows` should contain the encoded values
+pub fn encode(
+data: &mut [u8],
+offsets: &mut [usize],
+rows: &Rows,
+opts: SortOptions,
+array: &RunArray,
+) {
+for (idx, offset) in offsets.iter_mut().skip(1).enumerate() {
+let physical_idx = array.get_physical_index(idx);
+let is_valid = array.values().is_valid(physical_idx);
+
+let out = &mut data[*offset..];
+if is_valid {
+let row = rows.row(physical_idx);
+out[0] = 1; // valid
+out[1..1 + row.data.len()].copy_from_slice(row.data);
+*offset += row.data.len() + 1;
+} else {
+out[0] = null_sentinel(opts);
+*offset += 1;
+}
+}
+}
+
+/// Decodes a RunEndEncodedArray from `rows` with the provided `options`
+///
+/// # Safety
+///
+/// `rows` must contain valid data for the provided `converter`
+pub unsafe fn decode(
+converter: &RowConverter,
+rows: &mut [&[u8]],
+field: &SortField,
+validate_utf8: bool,
+) -> Result, ArrowError> {
+let opts = field.options;
+
+// Track null values and collect row data to avoid borrow issues
+let mut valid_flags = Vec::with_capacity(rows.len());

Review Comment:
   Makes sense!



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]



Re: [PR] arrow-row: Add support for REE [arrow-rs]

2025-06-13 Thread via GitHub


brancz commented on code in PR #7649:
URL: https://github.com/apache/arrow-rs/pull/7649#discussion_r2144552733


##
arrow-row/src/run.rs:
##
@@ -0,0 +1,164 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use crate::{null_sentinel, RowConverter, Rows, SortField};
+use arrow_array::types::RunEndIndexType;
+use arrow_array::{Array, RunArray};
+use arrow_buffer::{ArrowNativeType, Buffer};
+use arrow_schema::{ArrowError, SortOptions};
+
+/// Computes the lengths of each row for a RunEndEncodedArray
+pub fn compute_lengths(
+lengths: &mut [usize],
+rows: &Rows,
+array: &RunArray,
+) {
+// We don't need to use run_ends directly, just access through the array
+for (idx, length) in lengths.iter_mut().enumerate() {
+let physical_idx = array.get_physical_index(idx);
+let is_valid = array.values().is_valid(physical_idx);
+if is_valid {
+let row = rows.row(physical_idx);
+*length += row.data.len() + 1; // 1 for the validity byte
+} else {
+*length += 1; // just the null sentinel
+}
+}
+}
+
+/// Encodes the provided `RunEndEncodedArray` to `out` with the provided 
`SortOptions`
+///
+/// `rows` should contain the encoded values
+pub fn encode(
+data: &mut [u8],
+offsets: &mut [usize],
+rows: &Rows,
+opts: SortOptions,
+array: &RunArray,
+) {
+for (idx, offset) in offsets.iter_mut().skip(1).enumerate() {
+let physical_idx = array.get_physical_index(idx);
+let is_valid = array.values().is_valid(physical_idx);
+
+let out = &mut data[*offset..];
+if is_valid {
+let row = rows.row(physical_idx);
+out[0] = 1; // valid
+out[1..1 + row.data.len()].copy_from_slice(row.data);
+*offset += row.data.len() + 1;
+} else {
+out[0] = null_sentinel(opts);
+*offset += 1;
+}
+}
+}
+
+/// Decodes a RunEndEncodedArray from `rows` with the provided `options`
+///
+/// # Safety
+///
+/// `rows` must contain valid data for the provided `converter`
+pub unsafe fn decode(
+converter: &RowConverter,
+rows: &mut [&[u8]],
+field: &SortField,
+validate_utf8: bool,
+) -> Result, ArrowError> {
+let opts = field.options;
+
+// Track null values and collect row data to avoid borrow issues
+let mut valid_flags = Vec::with_capacity(rows.len());
+let mut row_data = Vec::with_capacity(rows.len());
+
+// First pass: collect valid flags and data for each row
+for row in rows.iter() {
+let is_valid = row[0] != null_sentinel(opts);
+valid_flags.push(is_valid);
+if is_valid {
+row_data.push(&row[1..]);
+} else {
+row_data.push(&[][..]);
+}
+}
+
+// Now build run ends and values
+let mut run_ends = Vec::new();
+let mut values_data = Vec::new();
+let mut current_value_idx = 0;
+let mut current_run_end = 0;
+
+for (idx, is_valid) in valid_flags.iter().enumerate() {
+current_run_end += 1;
+
+if idx == 0 {
+// Add the first row data
+values_data.push(row_data[idx]);
+run_ends.push(R::Native::usize_as(current_run_end));
+continue;
+}
+
+// Check if this row is different from the previous one
+let value_changed = if !is_valid {
+// Null value - check if previous was null
+!valid_flags[idx - 1]
+} else if !valid_flags[idx - 1] {
+// Previous was null, this is not
+true
+} else {
+// Both are valid, compare data
+let prev_data = row_data[idx - 1];
+let curr_data = row_data[idx];
+prev_data != curr_data
+};
+
+if value_changed {
+// Start a new run
+current_value_idx += 1;
+values_data.push(row_data[idx]);
+run_ends.push(R::Native::usize_as(current_run_end));
+} else {
+// Update the current run end
+run_ends[current_value_idx] = R::Native::usize_as(current_run_end);
+}
+}
+
+// Convert collected values to arrays
+  

Re: [PR] arrow-row: Add support for REE [arrow-rs]

2025-06-13 Thread via GitHub


brancz commented on code in PR #7649:
URL: https://github.com/apache/arrow-rs/pull/7649#discussion_r2144548240


##
arrow-row/src/run.rs:
##
@@ -0,0 +1,164 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use crate::{null_sentinel, RowConverter, Rows, SortField};
+use arrow_array::types::RunEndIndexType;
+use arrow_array::{Array, RunArray};
+use arrow_buffer::{ArrowNativeType, Buffer};
+use arrow_schema::{ArrowError, SortOptions};
+
+/// Computes the lengths of each row for a RunEndEncodedArray
+pub fn compute_lengths(
+lengths: &mut [usize],
+rows: &Rows,
+array: &RunArray,
+) {
+// We don't need to use run_ends directly, just access through the array
+for (idx, length) in lengths.iter_mut().enumerate() {
+let physical_idx = array.get_physical_index(idx);
+let is_valid = array.values().is_valid(physical_idx);
+if is_valid {
+let row = rows.row(physical_idx);
+*length += row.data.len() + 1; // 1 for the validity byte
+} else {
+*length += 1; // just the null sentinel
+}
+}
+}
+
+/// Encodes the provided `RunEndEncodedArray` to `out` with the provided 
`SortOptions`
+///
+/// `rows` should contain the encoded values
+pub fn encode(
+data: &mut [u8],
+offsets: &mut [usize],
+rows: &Rows,
+opts: SortOptions,
+array: &RunArray,
+) {
+for (idx, offset) in offsets.iter_mut().skip(1).enumerate() {
+let physical_idx = array.get_physical_index(idx);
+let is_valid = array.values().is_valid(physical_idx);
+
+let out = &mut data[*offset..];
+if is_valid {
+let row = rows.row(physical_idx);
+out[0] = 1; // valid
+out[1..1 + row.data.len()].copy_from_slice(row.data);
+*offset += row.data.len() + 1;
+} else {
+out[0] = null_sentinel(opts);
+*offset += 1;
+}
+}
+}
+
+/// Decodes a RunEndEncodedArray from `rows` with the provided `options`
+///
+/// # Safety
+///
+/// `rows` must contain valid data for the provided `converter`
+pub unsafe fn decode(
+converter: &RowConverter,
+rows: &mut [&[u8]],
+field: &SortField,
+validate_utf8: bool,
+) -> Result, ArrowError> {
+let opts = field.options;
+
+// Track null values and collect row data to avoid borrow issues
+let mut valid_flags = Vec::with_capacity(rows.len());
+let mut row_data = Vec::with_capacity(rows.len());
+
+// First pass: collect valid flags and data for each row
+for row in rows.iter() {
+let is_valid = row[0] != null_sentinel(opts);
+valid_flags.push(is_valid);
+if is_valid {
+row_data.push(&row[1..]);
+} else {
+row_data.push(&[][..]);
+}
+}
+
+// Now build run ends and values
+let mut run_ends = Vec::new();
+let mut values_data = Vec::new();
+let mut current_value_idx = 0;
+let mut current_run_end = 0;
+
+for (idx, is_valid) in valid_flags.iter().enumerate() {
+current_run_end += 1;
+
+if idx == 0 {
+// Add the first row data
+values_data.push(row_data[idx]);
+run_ends.push(R::Native::usize_as(current_run_end));
+continue;
+}
+
+// Check if this row is different from the previous one
+let value_changed = if !is_valid {
+// Null value - check if previous was null
+!valid_flags[idx - 1]
+} else if !valid_flags[idx - 1] {
+// Previous was null, this is not
+true
+} else {
+// Both are valid, compare data
+let prev_data = row_data[idx - 1];
+let curr_data = row_data[idx];
+prev_data != curr_data
+};
+
+if value_changed {
+// Start a new run
+current_value_idx += 1;
+values_data.push(row_data[idx]);
+run_ends.push(R::Native::usize_as(current_run_end));
+} else {
+// Update the current run end
+run_ends[current_value_idx] = R::Native::usize_as(current_run_end);
+}
+}
+
+// Convert collected values to arrays
+  

Re: [PR] arrow-row: Add support for REE [arrow-rs]

2025-06-13 Thread via GitHub


brancz commented on code in PR #7649:
URL: https://github.com/apache/arrow-rs/pull/7649#discussion_r2144549751


##
arrow-row/src/run.rs:
##
@@ -0,0 +1,164 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use crate::{null_sentinel, RowConverter, Rows, SortField};
+use arrow_array::types::RunEndIndexType;
+use arrow_array::{Array, RunArray};
+use arrow_buffer::{ArrowNativeType, Buffer};
+use arrow_schema::{ArrowError, SortOptions};
+
+/// Computes the lengths of each row for a RunEndEncodedArray
+pub fn compute_lengths(
+lengths: &mut [usize],
+rows: &Rows,
+array: &RunArray,
+) {
+// We don't need to use run_ends directly, just access through the array
+for (idx, length) in lengths.iter_mut().enumerate() {
+let physical_idx = array.get_physical_index(idx);
+let is_valid = array.values().is_valid(physical_idx);
+if is_valid {
+let row = rows.row(physical_idx);
+*length += row.data.len() + 1; // 1 for the validity byte
+} else {
+*length += 1; // just the null sentinel
+}
+}
+}
+
+/// Encodes the provided `RunEndEncodedArray` to `out` with the provided 
`SortOptions`
+///
+/// `rows` should contain the encoded values
+pub fn encode(
+data: &mut [u8],
+offsets: &mut [usize],
+rows: &Rows,
+opts: SortOptions,
+array: &RunArray,
+) {
+for (idx, offset) in offsets.iter_mut().skip(1).enumerate() {
+let physical_idx = array.get_physical_index(idx);
+let is_valid = array.values().is_valid(physical_idx);
+
+let out = &mut data[*offset..];
+if is_valid {
+let row = rows.row(physical_idx);
+out[0] = 1; // valid
+out[1..1 + row.data.len()].copy_from_slice(row.data);
+*offset += row.data.len() + 1;
+} else {
+out[0] = null_sentinel(opts);
+*offset += 1;
+}
+}
+}
+
+/// Decodes a RunEndEncodedArray from `rows` with the provided `options`
+///
+/// # Safety
+///
+/// `rows` must contain valid data for the provided `converter`
+pub unsafe fn decode(
+converter: &RowConverter,
+rows: &mut [&[u8]],
+field: &SortField,
+validate_utf8: bool,
+) -> Result, ArrowError> {
+let opts = field.options;
+
+// Track null values and collect row data to avoid borrow issues
+let mut valid_flags = Vec::with_capacity(rows.len());
+let mut row_data = Vec::with_capacity(rows.len());
+
+// First pass: collect valid flags and data for each row
+for row in rows.iter() {
+let is_valid = row[0] != null_sentinel(opts);
+valid_flags.push(is_valid);
+if is_valid {
+row_data.push(&row[1..]);
+} else {
+row_data.push(&[][..]);
+}
+}
+
+// Now build run ends and values
+let mut run_ends = Vec::new();
+let mut values_data = Vec::new();
+let mut current_value_idx = 0;
+let mut current_run_end = 0;
+
+for (idx, is_valid) in valid_flags.iter().enumerate() {
+current_run_end += 1;
+
+if idx == 0 {
+// Add the first row data
+values_data.push(row_data[idx]);
+run_ends.push(R::Native::usize_as(current_run_end));
+continue;
+}
+
+// Check if this row is different from the previous one
+let value_changed = if !is_valid {
+// Null value - check if previous was null
+!valid_flags[idx - 1]
+} else if !valid_flags[idx - 1] {
+// Previous was null, this is not
+true
+} else {
+// Both are valid, compare data
+let prev_data = row_data[idx - 1];
+let curr_data = row_data[idx];
+prev_data != curr_data
+};
+
+if value_changed {
+// Start a new run
+current_value_idx += 1;
+values_data.push(row_data[idx]);
+run_ends.push(R::Native::usize_as(current_run_end));
+} else {
+// Update the current run end
+run_ends[current_value_idx] = R::Native::usize_as(current_run_end);
+}
+}
+
+// Convert collected values to arrays
+  

Re: [PR] arrow-row: Add support for REE [arrow-rs]

2025-06-13 Thread via GitHub


brancz commented on code in PR #7649:
URL: https://github.com/apache/arrow-rs/pull/7649#discussion_r2144519339


##
arrow-row/src/run.rs:
##
@@ -0,0 +1,164 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use crate::{null_sentinel, RowConverter, Rows, SortField};
+use arrow_array::types::RunEndIndexType;
+use arrow_array::{Array, RunArray};
+use arrow_buffer::{ArrowNativeType, Buffer};
+use arrow_schema::{ArrowError, SortOptions};
+
+/// Computes the lengths of each row for a RunEndEncodedArray
+pub fn compute_lengths(
+lengths: &mut [usize],
+rows: &Rows,
+array: &RunArray,
+) {
+// We don't need to use run_ends directly, just access through the array
+for (idx, length) in lengths.iter_mut().enumerate() {
+let physical_idx = array.get_physical_index(idx);
+let is_valid = array.values().is_valid(physical_idx);
+if is_valid {
+let row = rows.row(physical_idx);
+*length += row.data.len() + 1; // 1 for the validity byte
+} else {
+*length += 1; // just the null sentinel
+}
+}
+}
+
+/// Encodes the provided `RunEndEncodedArray` to `out` with the provided 
`SortOptions`
+///
+/// `rows` should contain the encoded values
+pub fn encode(
+data: &mut [u8],
+offsets: &mut [usize],
+rows: &Rows,
+opts: SortOptions,
+array: &RunArray,
+) {
+for (idx, offset) in offsets.iter_mut().skip(1).enumerate() {
+let physical_idx = array.get_physical_index(idx);

Review Comment:
   turns out it's actually also more readable that way



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]



Re: [PR] arrow-row: Add support for REE [arrow-rs]

2025-06-13 Thread via GitHub


brancz commented on code in PR #7649:
URL: https://github.com/apache/arrow-rs/pull/7649#discussion_r2144508333


##
arrow-row/src/run.rs:
##
@@ -0,0 +1,164 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use crate::{null_sentinel, RowConverter, Rows, SortField};
+use arrow_array::types::RunEndIndexType;
+use arrow_array::{Array, RunArray};
+use arrow_buffer::{ArrowNativeType, Buffer};
+use arrow_schema::{ArrowError, SortOptions};
+
+/// Computes the lengths of each row for a RunEndEncodedArray
+pub fn compute_lengths(
+lengths: &mut [usize],
+rows: &Rows,
+array: &RunArray,
+) {
+// We don't need to use run_ends directly, just access through the array
+for (idx, length) in lengths.iter_mut().enumerate() {
+let physical_idx = array.get_physical_index(idx);
+let is_valid = array.values().is_valid(physical_idx);
+if is_valid {
+let row = rows.row(physical_idx);
+*length += row.data.len() + 1; // 1 for the validity byte
+} else {
+*length += 1; // just the null sentinel
+}
+}
+}
+
+/// Encodes the provided `RunEndEncodedArray` to `out` with the provided 
`SortOptions`
+///
+/// `rows` should contain the encoded values
+pub fn encode(
+data: &mut [u8],
+offsets: &mut [usize],
+rows: &Rows,
+opts: SortOptions,
+array: &RunArray,
+) {
+for (idx, offset) in offsets.iter_mut().skip(1).enumerate() {
+let physical_idx = array.get_physical_index(idx);

Review Comment:
   good catch, I don't know why I wasn't thinking of that



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]



Re: [PR] arrow-row: Add support for REE [arrow-rs]

2025-06-12 Thread via GitHub


alamb commented on code in PR #7649:
URL: https://github.com/apache/arrow-rs/pull/7649#discussion_r2143666108


##
arrow-row/src/lib.rs:
##
@@ -145,9 +145,11 @@ use variable::{decode_binary_view, decode_string_view};
 
 use crate::fixed::{decode_bool, decode_fixed_size_binary, decode_primitive};
 use crate::variable::{decode_binary, decode_string};
+use arrow_array::types::{Int16Type, Int32Type, Int64Type};
 
 mod fixed;
 mod list;
+mod run;

Review Comment:
   I double checked and `run` is consistent with the naming of REEArray 
elsewhere in the crate 👍 



##
arrow-row/src/run.rs:
##
@@ -0,0 +1,164 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use crate::{null_sentinel, RowConverter, Rows, SortField};
+use arrow_array::types::RunEndIndexType;
+use arrow_array::{Array, RunArray};
+use arrow_buffer::{ArrowNativeType, Buffer};
+use arrow_schema::{ArrowError, SortOptions};
+
+/// Computes the lengths of each row for a RunEndEncodedArray
+pub fn compute_lengths(
+lengths: &mut [usize],
+rows: &Rows,
+array: &RunArray,
+) {
+// We don't need to use run_ends directly, just access through the array
+for (idx, length) in lengths.iter_mut().enumerate() {
+let physical_idx = array.get_physical_index(idx);
+let is_valid = array.values().is_valid(physical_idx);
+if is_valid {
+let row = rows.row(physical_idx);
+*length += row.data.len() + 1; // 1 for the validity byte
+} else {
+*length += 1; // just the null sentinel
+}
+}
+}
+
+/// Encodes the provided `RunEndEncodedArray` to `out` with the provided 
`SortOptions`
+///
+/// `rows` should contain the encoded values
+pub fn encode(
+data: &mut [u8],
+offsets: &mut [usize],
+rows: &Rows,
+opts: SortOptions,
+array: &RunArray,
+) {
+for (idx, offset) in offsets.iter_mut().skip(1).enumerate() {
+let physical_idx = array.get_physical_index(idx);
+let is_valid = array.values().is_valid(physical_idx);
+
+let out = &mut data[*offset..];
+if is_valid {
+let row = rows.row(physical_idx);
+out[0] = 1; // valid
+out[1..1 + row.data.len()].copy_from_slice(row.data);
+*offset += row.data.len() + 1;
+} else {
+out[0] = null_sentinel(opts);
+*offset += 1;
+}
+}
+}
+
+/// Decodes a RunEndEncodedArray from `rows` with the provided `options`
+///
+/// # Safety
+///
+/// `rows` must contain valid data for the provided `converter`
+pub unsafe fn decode(
+converter: &RowConverter,
+rows: &mut [&[u8]],
+field: &SortField,
+validate_utf8: bool,
+) -> Result, ArrowError> {
+let opts = field.options;
+
+// Track null values and collect row data to avoid borrow issues
+let mut valid_flags = Vec::with_capacity(rows.len());

Review Comment:
I think you could use `BooleanBufferBuilder` here which might be more 
efficient



##
arrow-row/src/lib.rs:
##
@@ -400,6 +404,17 @@ impl Codec {
 };
 Ok(Self::Dictionary(converter, owned))
 }
+DataType::RunEndEncoded(_, values) => {
+// Similar to List implementation

Review Comment:
   Maybe we can pull the transformation into a documented helper function (not 
needed, I just was confused for a bit until I read the comments in the 
List/LargeList implementation



##
arrow-row/src/run_test.rs:
##
@@ -0,0 +1,55 @@
+// Licensed to the Apache Software Foundation (ASF) under one

Review Comment:
   I think it is more standard in this repo to put unit tests like this in the 
same module (aka I would expect this to be in `arrow-row/src/run.rs`
   
   Is there any reason it is in a different module?



##
arrow-row/src/run.rs:
##
@@ -0,0 +1,164 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use