This is an automated email from the ASF dual-hosted git repository.
tustvold pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/master by this push:
new 25e10ddf0 Add ArrayAccessor, Iterator, Extend and benchmarks for
RunArray (#3603)
25e10ddf0 is described below
commit 25e10ddf0b8b65f0d73142717e7c22ab150d6870
Author: askoa <[email protected]>
AuthorDate: Sat Feb 4 04:24:56 2023 -0500
Add ArrayAccessor, Iterator, Extend and benchmarks for RunArray (#3603)
* Add ArrayAccessor, Iterator, Extend and benchmarks for RunArray
* fix clippy issues
* minor fix
* incorporate pr suggestions
* formatting fix
* fix clippy issues
---------
Co-authored-by: ask <ask@local>
---
arrow-array/src/array/run_array.rs | 226 ++++++++++++++++-
.../src/builder/generic_byte_run_builder.rs | 53 +++-
arrow-array/src/builder/primitive_run_builder.rs | 31 +++
arrow-array/src/lib.rs | 1 +
arrow-array/src/run_iterator.rs | 273 +++++++++++++++++++++
arrow/Cargo.toml | 4 +
arrow/benches/string_run_builder.rs | 80 ++++++
7 files changed, 656 insertions(+), 12 deletions(-)
diff --git a/arrow-array/src/array/run_array.rs
b/arrow-array/src/array/run_array.rs
index 48c4896b6..8cc1f676b 100644
--- a/arrow-array/src/array/run_array.rs
+++ b/arrow-array/src/array/run_array.rs
@@ -24,8 +24,9 @@ use arrow_schema::{ArrowError, DataType, Field};
use crate::{
builder::StringRunBuilder,
make_array,
+ run_iterator::RunArrayIter,
types::{Int16Type, Int32Type, Int64Type, RunEndIndexType},
- Array, ArrayRef, PrimitiveArray,
+ Array, ArrayAccessor, ArrayRef, PrimitiveArray,
};
///
@@ -121,6 +122,27 @@ impl<R: RunEndIndexType> RunArray<R> {
pub fn values(&self) -> &ArrayRef {
&self.values
}
+
+ /// Downcast this [`RunArray`] to a [`TypedRunArray`]
+ ///
+ /// ```
+ /// use arrow_array::{Array, ArrayAccessor, RunArray, StringArray,
types::Int32Type};
+ ///
+ /// let orig = [Some("a"), Some("b"), None];
+ /// let run_array = RunArray::<Int32Type>::from_iter(orig);
+ /// let typed = run_array.downcast::<StringArray>().unwrap();
+ /// assert_eq!(typed.value(0), "a");
+ /// assert_eq!(typed.value(1), "b");
+ /// assert!(typed.values().is_null(2));
+ /// ```
+ ///
+ pub fn downcast<V: 'static>(&self) -> Option<TypedRunArray<'_, R, V>> {
+ let values = self.values.as_any().downcast_ref()?;
+ Some(TypedRunArray {
+ run_array: self,
+ values,
+ })
+ }
}
impl<R: RunEndIndexType> From<ArrayData> for RunArray<R> {
@@ -274,15 +296,195 @@ pub type Int32RunArray = RunArray<Int32Type>;
/// ```
pub type Int64RunArray = RunArray<Int64Type>;
+/// A strongly-typed wrapper around a [`RunArray`] that implements
[`ArrayAccessor`]
+/// and [`IntoIterator`] allowing fast access to its elements
+///
+/// ```
+/// use arrow_array::{RunArray, StringArray, types::Int32Type};
+///
+/// let orig = ["a", "b", "a", "b"];
+/// let ree_array = RunArray::<Int32Type>::from_iter(orig);
+///
+/// // `TypedRunArray` allows you to access the values directly
+/// let typed = ree_array.downcast::<StringArray>().unwrap();
+///
+/// for (maybe_val, orig) in typed.into_iter().zip(orig) {
+/// assert_eq!(maybe_val.unwrap(), orig)
+/// }
+/// ```
+pub struct TypedRunArray<'a, R: RunEndIndexType, V> {
+ /// The run array
+ run_array: &'a RunArray<R>,
+
+ /// The values of the run_array
+ values: &'a V,
+}
+
+// Manually implement `Clone` to avoid `V: Clone` type constraint
+impl<'a, R: RunEndIndexType, V> Clone for TypedRunArray<'a, R, V> {
+ fn clone(&self) -> Self {
+ Self {
+ run_array: self.run_array,
+ values: self.values,
+ }
+ }
+}
+
+impl<'a, R: RunEndIndexType, V> Copy for TypedRunArray<'a, R, V> {}
+
+impl<'a, R: RunEndIndexType, V> std::fmt::Debug for TypedRunArray<'a, R, V> {
+ fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
+ writeln!(f, "TypedRunArray({:?})", self.run_array)
+ }
+}
+
+impl<'a, R: RunEndIndexType, V> TypedRunArray<'a, R, V> {
+ /// Returns the run_ends of this [`TypedRunArray`]
+ pub fn run_ends(&self) -> &'a PrimitiveArray<R> {
+ self.run_array.run_ends()
+ }
+
+ /// Returns the values of this [`TypedRunArray`]
+ pub fn values(&self) -> &'a V {
+ self.values
+ }
+
+ /// Returns index to the physcial array for the given index to the logical
array.
+ /// Performs a binary search on the run_ends array for the input index.
+ #[inline]
+ pub fn get_physical_index(&self, logical_index: usize) -> Option<usize> {
+ if logical_index >= self.run_array.len() {
+ return None;
+ }
+ let mut st: usize = 0;
+ let mut en: usize = self.run_ends().len();
+ while st + 1 < en {
+ let mid: usize = (st + en) / 2;
+ if logical_index
+ < unsafe {
+ // Safety:
+ // The value of mid will always be between 1 and len - 1,
+ // where len is length of run ends array.
+ // This is based on the fact that `st` starts with 0 and
+ // `en` starts with len. The condition `st + 1 < en`
ensures
+ // `st` and `en` differs atleast by two. So the value of
`mid`
+ // will never be either `st` or `en`
+ self.run_ends().value_unchecked(mid - 1).as_usize()
+ }
+ {
+ en = mid
+ } else {
+ st = mid
+ }
+ }
+ Some(st)
+ }
+}
+
+impl<'a, R: RunEndIndexType, V: Sync> Array for TypedRunArray<'a, R, V> {
+ fn as_any(&self) -> &dyn Any {
+ self.run_array
+ }
+
+ fn data(&self) -> &ArrayData {
+ &self.run_array.data
+ }
+
+ fn into_data(self) -> ArrayData {
+ self.run_array.into_data()
+ }
+}
+
+// Array accessor converts the index of logical array to the index of the
physical array
+// using binary search. The time complexity is O(log N) where N is number of
runs.
+impl<'a, R, V> ArrayAccessor for TypedRunArray<'a, R, V>
+where
+ R: RunEndIndexType,
+ V: Sync + Send,
+ &'a V: ArrayAccessor,
+ <&'a V as ArrayAccessor>::Item: Default,
+{
+ type Item = <&'a V as ArrayAccessor>::Item;
+
+ fn value(&self, logical_index: usize) -> Self::Item {
+ assert!(
+ logical_index < self.len(),
+ "Trying to access an element at index {} from a TypedRunArray of
length {}",
+ logical_index,
+ self.len()
+ );
+ unsafe { self.value_unchecked(logical_index) }
+ }
+
+ unsafe fn value_unchecked(&self, logical_index: usize) -> Self::Item {
+ let physical_index = self.get_physical_index(logical_index).unwrap();
+ self.values().value_unchecked(physical_index)
+ }
+}
+
+impl<'a, R, V> IntoIterator for TypedRunArray<'a, R, V>
+where
+ R: RunEndIndexType,
+ V: Sync + Send,
+ &'a V: ArrayAccessor,
+ <&'a V as ArrayAccessor>::Item: Default,
+{
+ type Item = Option<<&'a V as ArrayAccessor>::Item>;
+ type IntoIter = RunArrayIter<'a, R, V>;
+
+ fn into_iter(self) -> Self::IntoIter {
+ RunArrayIter::new(self)
+ }
+}
+
#[cfg(test)]
mod tests {
use std::sync::Arc;
+ use rand::seq::SliceRandom;
+ use rand::thread_rng;
+ use rand::Rng;
+
use super::*;
use crate::builder::PrimitiveRunBuilder;
use crate::types::{Int16Type, Int32Type, Int8Type, UInt32Type};
use crate::{Array, Int16Array, Int32Array, StringArray};
+ fn build_input_array(approx_size: usize) -> Vec<Option<i32>> {
+ // The input array is created by shuffling and repeating
+ // the seed values random number of times.
+ let mut seed: Vec<Option<i32>> = vec![
+ None,
+ None,
+ Some(1),
+ Some(2),
+ Some(3),
+ Some(4),
+ Some(5),
+ Some(6),
+ ];
+ let mut ix = 0;
+ let mut result: Vec<Option<i32>> = Vec::with_capacity(approx_size);
+ let mut rng = thread_rng();
+ while result.len() < approx_size {
+ // shuffle the seed array if all the values are iterated.
+ if ix == 0 {
+ seed.shuffle(&mut rng);
+ }
+ // repeat the items between 1 and 7 times.
+ let num = rand::thread_rng().gen_range(1..8);
+ for _ in 0..num {
+ result.push(seed[ix]);
+ }
+ ix += 1;
+ if ix == 8 {
+ ix = 0
+ }
+ }
+ println!("Size of input array: {}", result.len());
+ result
+ }
+
#[test]
fn test_run_array() {
// Construct a value array
@@ -504,4 +706,26 @@ mod tests {
let a = RunArray::<Int32Type>::from_iter(["32"]);
let _ = RunArray::<Int64Type>::from(a.into_data());
}
+
+ #[test]
+ fn test_ree_array_accessor() {
+ let input_array = build_input_array(256);
+
+ // Encode the input_array to ree_array
+ let mut builder =
+ PrimitiveRunBuilder::<Int16Type,
Int32Type>::with_capacity(input_array.len());
+ builder.extend(input_array.iter().copied());
+ let run_array = builder.finish();
+ let typed = run_array.downcast::<PrimitiveArray<Int32Type>>().unwrap();
+
+ for (i, inp_val) in input_array.iter().enumerate() {
+ if let Some(val) = inp_val {
+ let actual = typed.value(i);
+ assert_eq!(*val, actual)
+ } else {
+ let physical_ix = typed.get_physical_index(i).unwrap();
+ assert!(typed.values().is_null(physical_ix));
+ };
+ }
+ }
}
diff --git a/arrow-array/src/builder/generic_byte_run_builder.rs
b/arrow-array/src/builder/generic_byte_run_builder.rs
index c1ecbcb5d..c6dbb82ff 100644
--- a/arrow-array/src/builder/generic_byte_run_builder.rs
+++ b/arrow-array/src/builder/generic_byte_run_builder.rs
@@ -44,15 +44,14 @@ use arrow_buffer::ArrowNativeType;
///
/// let mut builder =
/// GenericByteRunBuilder::<Int16Type, BinaryType>::new();
-/// builder.append_value(b"abc");
-/// builder.append_value(b"abc");
-/// builder.append_null();
+/// builder.extend([Some(b"abc"), Some(b"abc"), None,
Some(b"def")].into_iter());
/// builder.append_value(b"def");
+/// builder.append_null();
/// let array = builder.finish();
///
/// assert_eq!(
/// array.run_ends(),
-/// &Int16Array::from(vec![Some(2), Some(3), Some(4)])
+/// &Int16Array::from(vec![Some(2), Some(3), Some(5), Some(6)])
/// );
///
/// let av = array.values();
@@ -60,6 +59,7 @@ use arrow_buffer::ArrowNativeType;
/// assert!(!av.is_null(0));
/// assert!(av.is_null(1));
/// assert!(!av.is_null(2));
+/// assert!(av.is_null(3));
///
/// // Values are polymorphic and so require a downcast.
/// let ava: &BinaryArray = as_generic_binary_array(av.as_ref());
@@ -299,6 +299,19 @@ where
}
}
+impl<R, V, S> Extend<Option<S>> for GenericByteRunBuilder<R, V>
+where
+ R: RunEndIndexType,
+ V: ByteArrayType,
+ S: AsRef<V::Native>,
+{
+ fn extend<T: IntoIterator<Item = Option<S>>>(&mut self, iter: T) {
+ for elem in iter {
+ self.append_option(elem);
+ }
+ }
+}
+
/// Array builder for [`RunArray`] that encodes strings ([`Utf8Type`]).
///
/// ```
@@ -315,9 +328,7 @@ where
/// // The builder builds the dictionary value by value
/// builder.append_value("abc");
/// builder.append_null();
-/// builder.append_value("def");
-/// builder.append_value("def");
-/// builder.append_value("abc");
+/// builder.extend([Some("def"), Some("def"), Some("abc")]);
/// let array = builder.finish();
///
/// assert_eq!(
@@ -356,9 +367,7 @@ pub type LargeStringRunBuilder<K> =
GenericByteRunBuilder<K, LargeUtf8Type>;
/// // The builder builds the dictionary value by value
/// builder.append_value(b"abc");
/// builder.append_null();
-/// builder.append_value(b"def");
-/// builder.append_value(b"def");
-/// builder.append_value(b"abc");
+/// builder.extend([Some(b"def"), Some(b"def"), Some(b"abc")]);
/// let array = builder.finish();
///
/// assert_eq!(
@@ -387,7 +396,9 @@ mod tests {
use super::*;
use crate::array::Array;
- use crate::types::Int16Type;
+ use crate::cast::as_primitive_array;
+ use crate::cast::as_string_array;
+ use crate::types::{Int16Type, Int32Type};
use crate::GenericByteArray;
use crate::Int16Array;
use crate::Int16RunArray;
@@ -516,4 +527,24 @@ mod tests {
fn test_binary_run_buider_finish_cloned() {
test_bytes_run_buider_finish_cloned::<BinaryType>(vec![b"abc", b"def",
b"ghi"]);
}
+
+ #[test]
+ fn test_extend() {
+ let mut builder = StringRunBuilder::<Int32Type>::new();
+ builder.extend(["a", "a", "a", "", "", "b",
"b"].into_iter().map(Some));
+ builder.extend(["b", "cupcakes", "cupcakes"].into_iter().map(Some));
+ let array = builder.finish();
+
+ assert_eq!(array.len(), 10);
+ assert_eq!(
+ as_primitive_array::<Int32Type>(array.run_ends()).values(),
+ &[3, 5, 8, 10]
+ );
+
+ let str_array = as_string_array(array.values().as_ref());
+ assert_eq!(str_array.value(0), "a");
+ assert_eq!(str_array.value(1), "");
+ assert_eq!(str_array.value(2), "b");
+ assert_eq!(str_array.value(3), "cupcakes");
+ }
}
diff --git a/arrow-array/src/builder/primitive_run_builder.rs
b/arrow-array/src/builder/primitive_run_builder.rs
index 82c46abfa..410662283 100644
--- a/arrow-array/src/builder/primitive_run_builder.rs
+++ b/arrow-array/src/builder/primitive_run_builder.rs
@@ -253,6 +253,18 @@ where
}
}
+impl<R, V> Extend<Option<V::Native>> for PrimitiveRunBuilder<R, V>
+where
+ R: RunEndIndexType,
+ V: ArrowPrimitiveType,
+{
+ fn extend<T: IntoIterator<Item = Option<V::Native>>>(&mut self, iter: T) {
+ for elem in iter {
+ self.append_option(elem);
+ }
+ }
+}
+
#[cfg(test)]
mod tests {
use crate::builder::PrimitiveRunBuilder;
@@ -291,4 +303,23 @@ mod tests {
assert_eq!(ava, &UInt32Array::from(vec![Some(1234), None,
Some(5678)]));
}
+
+ #[test]
+ fn test_extend() {
+ let mut builder = PrimitiveRunBuilder::<Int16Type, Int16Type>::new();
+ builder.extend([1, 2, 2, 5, 5, 4, 4].into_iter().map(Some));
+ builder.extend([4, 4, 6, 2].into_iter().map(Some));
+ let array = builder.finish();
+
+ assert_eq!(array.len(), 11);
+ assert_eq!(array.null_count(), 0);
+ assert_eq!(
+ as_primitive_array::<Int16Type>(array.run_ends()).values(),
+ &[1, 3, 5, 9, 10, 11]
+ );
+ assert_eq!(
+ as_primitive_array::<Int16Type>(array.values().as_ref()).values(),
+ &[1, 2, 5, 4, 6, 2]
+ );
+ }
}
diff --git a/arrow-array/src/lib.rs b/arrow-array/src/lib.rs
index d6a9ab30b..d8dc6efe2 100644
--- a/arrow-array/src/lib.rs
+++ b/arrow-array/src/lib.rs
@@ -178,6 +178,7 @@ pub mod cast;
mod delta;
pub mod iterator;
mod raw_pointer;
+pub mod run_iterator;
pub mod temporal_conversions;
pub mod timezone;
mod trusted_len;
diff --git a/arrow-array/src/run_iterator.rs b/arrow-array/src/run_iterator.rs
new file mode 100644
index 000000000..6a7b785fe
--- /dev/null
+++ b/arrow-array/src/run_iterator.rs
@@ -0,0 +1,273 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! Idiomatic iterator for [`RunArray`](crate::Array)
+
+use arrow_buffer::ArrowNativeType;
+
+use crate::{array::ArrayAccessor, types::RunEndIndexType, Array,
TypedRunArray};
+
+/// The [`RunArrayIter`] provides an idiomatic way to iterate over the run
array.
+/// It returns Some(T) if there is a value or None if the value is null.
+///
+/// The iterator comes with a cost as it has to iterate over three arrays to
determine
+/// the value to be returned. The run_ends array is used to determine the
index of the value.
+/// The nulls array is used to determine if the value is null and the values
array is used to
+/// get the value.
+///
+/// Unlike other iterators in this crate, [`RunArrayIter`] does not use
[`ArrayAccessor`]
+/// because the run array accessor does binary search to access each value
which is too slow.
+/// The run array iterator can determine the next value in constant time.
+///
+#[derive(Debug)]
+pub struct RunArrayIter<'a, R, V>
+where
+ R: RunEndIndexType,
+ V: Sync + Send,
+ &'a V: ArrayAccessor,
+ <&'a V as ArrayAccessor>::Item: Default,
+{
+ array: TypedRunArray<'a, R, V>,
+ current_logical: usize,
+ current_physical: usize,
+ current_end_logical: usize,
+ current_end_physical: usize,
+}
+
+impl<'a, R, V> RunArrayIter<'a, R, V>
+where
+ R: RunEndIndexType,
+ V: Sync + Send,
+ &'a V: ArrayAccessor,
+ <&'a V as ArrayAccessor>::Item: Default,
+{
+ /// create a new iterator
+ pub fn new(array: TypedRunArray<'a, R, V>) -> Self {
+ let logical_len = array.len();
+ let physical_len: usize = array.values().len();
+ RunArrayIter {
+ array,
+ current_logical: 0,
+ current_physical: 0,
+ current_end_logical: logical_len,
+ current_end_physical: physical_len,
+ }
+ }
+}
+
+impl<'a, R, V> Iterator for RunArrayIter<'a, R, V>
+where
+ R: RunEndIndexType,
+ V: Sync + Send,
+ &'a V: ArrayAccessor,
+ <&'a V as ArrayAccessor>::Item: Default,
+{
+ type Item = Option<<&'a V as ArrayAccessor>::Item>;
+
+ #[inline]
+ fn next(&mut self) -> Option<Self::Item> {
+ if self.current_logical == self.current_end_logical {
+ return None;
+ }
+ // If current logical index is greater than current run end index then
increment
+ // the physical index.
+ if self.current_logical
+ >= self
+ .array
+ .run_ends()
+ .value(self.current_physical)
+ .as_usize()
+ {
+ // As the run_ends is expected to be strictly increasing, there
+ // should be at least one logical entry in one physical entry.
Because of this
+ // reason the next value can be accessed by incrementing physical
index once.
+ self.current_physical += 1;
+ }
+ if self.array.values().is_null(self.current_physical) {
+ self.current_logical += 1;
+ Some(None)
+ } else {
+ self.current_logical += 1;
+ // Safety:
+ // The self.current_physical is kept within bounds of
self.current_logical.
+ // The self.current_logical will not go out of bounds because of
the check
+ // `self.current_logical = self.current_end_logical` above.
+ unsafe {
+ Some(Some(
+ self.array.values().value_unchecked(self.current_physical),
+ ))
+ }
+ }
+ }
+
+ fn size_hint(&self) -> (usize, Option<usize>) {
+ (
+ self.current_end_logical - self.current_logical,
+ Some(self.current_end_logical - self.current_logical),
+ )
+ }
+}
+
+impl<'a, R, V> DoubleEndedIterator for RunArrayIter<'a, R, V>
+where
+ R: RunEndIndexType,
+ V: Sync + Send,
+ &'a V: ArrayAccessor,
+ <&'a V as ArrayAccessor>::Item: Default,
+{
+ fn next_back(&mut self) -> Option<Self::Item> {
+ if self.current_end_logical == self.current_logical {
+ return None;
+ }
+
+ self.current_end_logical -= 1;
+
+ if self.current_end_physical > 0
+ && self.current_end_logical
+ < self
+ .array
+ .run_ends()
+ .value(self.current_end_physical - 1)
+ .as_usize()
+ {
+ // As the run_ends is expected to be strictly increasing, there
+ // should be at least one logical entry in one physical entry.
Because of this
+ // reason the next value can be accessed by decrementing physical
index once.
+ self.current_end_physical -= 1;
+ }
+
+ Some(if self.array.values().is_null(self.current_end_physical) {
+ None
+ } else {
+ // Safety:
+ // The check `self.current_end_physical > 0` ensures the value
will not underflow.
+ // Also self.current_end_physical starts with array.len() and
+ // decrements based on the bounds of self.current_end_logical.
+ unsafe {
+ Some(
+ self.array
+ .values()
+ .value_unchecked(self.current_end_physical),
+ )
+ }
+ })
+ }
+}
+
+/// all arrays have known size.
+impl<'a, R, V> ExactSizeIterator for RunArrayIter<'a, R, V>
+where
+ R: RunEndIndexType,
+ V: Sync + Send,
+ &'a V: ArrayAccessor,
+ <&'a V as ArrayAccessor>::Item: Default,
+{
+}
+
+#[cfg(test)]
+mod tests {
+ use crate::{
+ array::{Int32Array, StringArray},
+ builder::PrimitiveRunBuilder,
+ types::Int32Type,
+ Int64RunArray,
+ };
+
+ #[test]
+ fn test_primitive_array_iter_round_trip() {
+ let mut input_vec = vec![
+ Some(32),
+ Some(32),
+ None,
+ Some(64),
+ Some(64),
+ Some(64),
+ Some(72),
+ ];
+ let mut builder = PrimitiveRunBuilder::<Int32Type, Int32Type>::new();
+ builder.extend(input_vec.clone().into_iter());
+ let ree_array = builder.finish();
+ let ree_array = ree_array.downcast::<Int32Array>().unwrap();
+
+ let output_vec: Vec<Option<i32>> = ree_array.into_iter().collect();
+ assert_eq!(input_vec, output_vec);
+
+ let rev_output_vec: Vec<Option<i32>> =
ree_array.into_iter().rev().collect();
+ input_vec.reverse();
+ assert_eq!(input_vec, rev_output_vec);
+ }
+
+ #[test]
+ fn test_double_ended() {
+ let input_vec = vec![
+ Some(32),
+ Some(32),
+ None,
+ Some(64),
+ Some(64),
+ Some(64),
+ Some(72),
+ ];
+ let mut builder = PrimitiveRunBuilder::<Int32Type, Int32Type>::new();
+ builder.extend(input_vec.into_iter());
+ let ree_array = builder.finish();
+ let ree_array = ree_array.downcast::<Int32Array>().unwrap();
+
+ let mut iter = ree_array.into_iter();
+ assert_eq!(Some(Some(32)), iter.next());
+ assert_eq!(Some(Some(72)), iter.next_back());
+ assert_eq!(Some(Some(32)), iter.next());
+ assert_eq!(Some(Some(64)), iter.next_back());
+ assert_eq!(Some(None), iter.next());
+ assert_eq!(Some(Some(64)), iter.next_back());
+ assert_eq!(Some(Some(64)), iter.next());
+ assert_eq!(None, iter.next_back());
+ assert_eq!(None, iter.next());
+ }
+
+ #[test]
+ fn test_string_array_iter_round_trip() {
+ let input_vec = vec!["ab", "ab", "ba", "cc", "cc"];
+ let input_ree_array: Int64RunArray = input_vec.into_iter().collect();
+ let string_ree_array =
input_ree_array.downcast::<StringArray>().unwrap();
+
+ // to and from iter, with a +1
+ let result: Vec<Option<String>> = string_ree_array
+ .into_iter()
+ .map(|e| {
+ e.map(|e| {
+ let mut a = e.to_string();
+ a.push('b');
+ a
+ })
+ })
+ .collect();
+
+ let result_asref: Vec<Option<&str>> =
+ result.iter().map(|f| f.as_deref()).collect();
+
+ let expected_vec = vec![
+ Some("abb"),
+ Some("abb"),
+ Some("bab"),
+ Some("ccb"),
+ Some("ccb"),
+ ];
+
+ assert_eq!(expected_vec, result_asref);
+ }
+}
diff --git a/arrow/Cargo.toml b/arrow/Cargo.toml
index 6de513df3..57e3907a2 100644
--- a/arrow/Cargo.toml
+++ b/arrow/Cargo.toml
@@ -237,6 +237,10 @@ required-features = ["test_utils"]
name = "string_dictionary_builder"
harness = false
+[[bench]]
+name = "string_run_builder"
+harness = false
+
[[bench]]
name = "substring_kernels"
harness = false
diff --git a/arrow/benches/string_run_builder.rs
b/arrow/benches/string_run_builder.rs
new file mode 100644
index 000000000..2f0401bbe
--- /dev/null
+++ b/arrow/benches/string_run_builder.rs
@@ -0,0 +1,80 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use arrow::array::StringRunBuilder;
+use arrow::datatypes::Int32Type;
+use criterion::{criterion_group, criterion_main, Criterion};
+use rand::{thread_rng, Rng};
+
+fn build_strings(
+ physical_array_len: usize,
+ logical_array_len: usize,
+ string_len: usize,
+) -> Vec<String> {
+ let mut rng = thread_rng();
+ let run_len = logical_array_len / physical_array_len;
+ let mut values: Vec<String> = (0..physical_array_len)
+ .map(|_| (0..string_len).map(|_| rng.gen::<char>()).collect())
+ .flat_map(|s| std::iter::repeat(s).take(run_len))
+ .collect();
+ while values.len() < logical_array_len {
+ let last_val = values[values.len() - 1].clone();
+ values.push(last_val);
+ }
+ values
+}
+
+fn criterion_benchmark(c: &mut Criterion) {
+ let mut group = c.benchmark_group("string_run_builder");
+
+ let mut do_bench = |physical_array_len: usize,
+ logical_array_len: usize,
+ string_len: usize| {
+ group.bench_function(
+ format!(
+ "(run_array_len:{logical_array_len},
physical_array_len:{physical_array_len}, string_len: {string_len})",
+ ),
+ |b| {
+ let strings =
+ build_strings(physical_array_len, logical_array_len,
string_len);
+ b.iter(|| {
+ let mut builder =
StringRunBuilder::<Int32Type>::with_capacity(
+ physical_array_len,
+ (string_len + 1) * physical_array_len,
+ );
+
+ for val in &strings {
+ builder.append_value(val);
+ }
+
+ builder.finish();
+ })
+ },
+ );
+ };
+
+ do_bench(20, 1000, 5);
+ do_bench(100, 1000, 5);
+ do_bench(100, 1000, 10);
+ do_bench(100, 10000, 10);
+ do_bench(100, 10000, 100);
+
+ group.finish();
+}
+
+criterion_group!(benches, criterion_benchmark);
+criterion_main!(benches);