This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/master by this push:
new 4bd737dab Support casting between BinaryView <--> Utf8 and LargeUtf8
(#6180)
4bd737dab is described below
commit 4bd737dab2aa17aca200259347909d48ed793ba1
Author: Xin Li <[email protected]>
AuthorDate: Thu Aug 8 07:20:33 2024 -0700
Support casting between BinaryView <--> Utf8 and LargeUtf8 (#6180)
* support cast between binaryview and string
* update impl. and add benchmark
* Add unit tests for views
* Apply comments
---
arrow-cast/src/cast/mod.rs | 102 ++++++++++++++++++++++++++++++++++++------
arrow/benches/cast_kernels.rs | 39 ++++++++++++++++
2 files changed, 127 insertions(+), 14 deletions(-)
diff --git a/arrow-cast/src/cast/mod.rs b/arrow-cast/src/cast/mod.rs
index f6103cb84..9f552ec72 100644
--- a/arrow-cast/src/cast/mod.rs
+++ b/arrow-cast/src/cast/mod.rs
@@ -225,10 +225,11 @@ pub fn can_cast_types(from_type: &DataType, to_type:
&DataType) -> bool {
| Timestamp(Millisecond, _)
| Timestamp(Microsecond, _)
| Timestamp(Nanosecond, _)
- | Interval(_),
+ | Interval(_)
+ | BinaryView,
) => true,
(Utf8 | LargeUtf8, Utf8View) => true,
- (BinaryView, Binary | LargeBinary) => true,
+ (BinaryView, Binary | LargeBinary | Utf8 | LargeUtf8 | Utf8View ) =>
true,
(Utf8 | LargeUtf8, _) => to_type.is_numeric() && to_type != &Float16,
(_, Utf8 | LargeUtf8) => from_type.is_primitive(),
@@ -1229,6 +1230,9 @@ pub fn cast_with_options(
cast_byte_container::<BinaryType, LargeBinaryType>(&binary)
}
Utf8View =>
Ok(Arc::new(StringViewArray::from(array.as_string::<i32>()))),
+ BinaryView => Ok(Arc::new(
+
StringViewArray::from(array.as_string::<i32>()).to_binary_view(),
+ )),
LargeUtf8 => cast_byte_container::<Utf8Type, LargeUtf8Type>(array),
Time32(TimeUnit::Second) => parse_string::<Time32SecondType,
i32>(array, cast_options),
Time32(TimeUnit::Millisecond) => {
@@ -1282,6 +1286,7 @@ pub fn cast_with_options(
Date64 => parse_string_view::<Date64Type>(array, cast_options),
Binary => cast_view_to_byte::<StringViewType,
GenericBinaryType<i32>>(array),
LargeBinary => cast_view_to_byte::<StringViewType,
GenericBinaryType<i64>>(array),
+ BinaryView =>
Ok(Arc::new(array.as_string_view().clone().to_binary_view())),
Utf8 => cast_view_to_byte::<StringViewType,
GenericStringType<i32>>(array),
LargeUtf8 => cast_view_to_byte::<StringViewType,
GenericStringType<i64>>(array),
Time32(TimeUnit::Second) =>
parse_string_view::<Time32SecondType>(array, cast_options),
@@ -1339,6 +1344,13 @@ pub fn cast_with_options(
array.as_string::<i64>().clone(),
))),
Utf8View =>
Ok(Arc::new(StringViewArray::from(array.as_string::<i64>()))),
+ BinaryView => Ok(Arc::new(BinaryViewArray::from(
+ array
+ .as_string::<i64>()
+ .into_iter()
+ .map(|x| x.map(|x| x.as_bytes()))
+ .collect::<Vec<_>>(),
+ ))),
Time32(TimeUnit::Second) => parse_string::<Time32SecondType,
i64>(array, cast_options),
Time32(TimeUnit::Millisecond) => {
parse_string::<Time32MillisecondType, i64>(array, cast_options)
@@ -1417,6 +1429,20 @@ pub fn cast_with_options(
(BinaryView, LargeBinary) => {
cast_view_to_byte::<BinaryViewType, GenericBinaryType<i64>>(array)
}
+ (BinaryView, Utf8) => {
+ let binary_arr = cast_view_to_byte::<BinaryViewType,
GenericBinaryType<i32>>(array)?;
+ cast_binary_to_string::<i32>(&binary_arr, cast_options)
+ }
+ (BinaryView, LargeUtf8) => {
+ let binary_arr = cast_view_to_byte::<BinaryViewType,
GenericBinaryType<i64>>(array)?;
+ cast_binary_to_string::<i64>(&binary_arr, cast_options)
+ }
+ (BinaryView, Utf8View) => {
+ Ok(Arc::new(array.as_binary_view().clone().to_string_view()?) as
ArrayRef)
+ }
+ (BinaryView, _) => Err(ArrowError::CastError(format!(
+ "Casting from {from_type:?} to {to_type:?} not supported",
+ ))),
(from_type, LargeUtf8) if from_type.is_primitive() => {
value_to_string::<i64>(array, cast_options)
}
@@ -2008,7 +2034,6 @@ pub fn cast_with_options(
})?,
))
}
-
(Date64, Timestamp(TimeUnit::Second, None)) => Ok(Arc::new(
array
.as_primitive::<Date64Type>()
@@ -5256,12 +5281,6 @@ mod tests {
}
}
- #[test]
- fn test_string_to_view() {
- _test_string_to_view::<i32>();
- _test_string_to_view::<i64>();
- }
-
const VIEW_TEST_DATA: [Option<&str>; 5] = [
Some("hello"),
Some("repeated"),
@@ -5270,6 +5289,44 @@ mod tests {
Some("repeated"),
];
+ #[test]
+ fn test_string_view_to_binary_view() {
+ let string_view_array = StringViewArray::from_iter(VIEW_TEST_DATA);
+
+ assert!(can_cast_types(
+ string_view_array.data_type(),
+ &DataType::BinaryView
+ ));
+
+ let binary_view_array = cast(&string_view_array,
&DataType::BinaryView).unwrap();
+ assert_eq!(binary_view_array.data_type(), &DataType::BinaryView);
+
+ let expect_binary_view_array =
BinaryViewArray::from_iter(VIEW_TEST_DATA);
+ assert_eq!(binary_view_array.as_ref(), &expect_binary_view_array);
+ }
+
+ #[test]
+ fn test_binary_view_to_string_view() {
+ let binary_view_array = BinaryViewArray::from_iter(VIEW_TEST_DATA);
+
+ assert!(can_cast_types(
+ binary_view_array.data_type(),
+ &DataType::Utf8View
+ ));
+
+ let string_view_array = cast(&binary_view_array,
&DataType::Utf8View).unwrap();
+ assert_eq!(string_view_array.data_type(), &DataType::Utf8View);
+
+ let expect_string_view_array =
StringViewArray::from_iter(VIEW_TEST_DATA);
+ assert_eq!(string_view_array.as_ref(), &expect_string_view_array);
+ }
+
+ #[test]
+ fn test_string_to_view() {
+ _test_string_to_view::<i32>();
+ _test_string_to_view::<i64>();
+ }
+
fn _test_string_to_view<O>()
where
O: OffsetSizeTrait,
@@ -5281,11 +5338,22 @@ mod tests {
&DataType::Utf8View
));
+ assert!(can_cast_types(
+ string_array.data_type(),
+ &DataType::BinaryView
+ ));
+
let string_view_array = cast(&string_array,
&DataType::Utf8View).unwrap();
assert_eq!(string_view_array.data_type(), &DataType::Utf8View);
+ let binary_view_array = cast(&string_array,
&DataType::BinaryView).unwrap();
+ assert_eq!(binary_view_array.data_type(), &DataType::BinaryView);
+
let expect_string_view_array =
StringViewArray::from_iter(VIEW_TEST_DATA);
assert_eq!(string_view_array.as_ref(), &expect_string_view_array);
+
+ let expect_binary_view_array =
BinaryViewArray::from_iter(VIEW_TEST_DATA);
+ assert_eq!(binary_view_array.as_ref(), &expect_binary_view_array);
}
#[test]
@@ -5380,7 +5448,7 @@ mod tests {
where
O: OffsetSizeTrait,
{
- let view_array = {
+ let string_view_array = {
let mut builder =
StringViewBuilder::new().with_fixed_block_size(8); // multiple buffers.
for s in VIEW_TEST_DATA.iter() {
builder.append_option(*s);
@@ -5388,15 +5456,21 @@ mod tests {
builder.finish()
};
+ let binary_view_array = BinaryViewArray::from_iter(VIEW_TEST_DATA);
+
let expected_string_array =
GenericStringArray::<O>::from_iter(VIEW_TEST_DATA);
let expected_type = expected_string_array.data_type();
- assert!(can_cast_types(view_array.data_type(), expected_type));
+ assert!(can_cast_types(string_view_array.data_type(), expected_type));
+ assert!(can_cast_types(binary_view_array.data_type(), expected_type));
- let string_array = cast(&view_array, expected_type).unwrap();
- assert_eq!(string_array.data_type(), expected_type);
+ let string_view_casted_array = cast(&string_view_array,
expected_type).unwrap();
+ assert_eq!(string_view_casted_array.data_type(), expected_type);
+ assert_eq!(string_view_casted_array.as_ref(), &expected_string_array);
- assert_eq!(string_array.as_ref(), &expected_string_array);
+ let binary_view_casted_array = cast(&binary_view_array,
expected_type).unwrap();
+ assert_eq!(binary_view_casted_array.data_type(), expected_type);
+ assert_eq!(binary_view_casted_array.as_ref(), &expected_string_array);
}
#[test]
diff --git a/arrow/benches/cast_kernels.rs b/arrow/benches/cast_kernels.rs
index 8803e8eea..ec7990d3d 100644
--- a/arrow/benches/cast_kernels.rs
+++ b/arrow/benches/cast_kernels.rs
@@ -114,6 +114,18 @@ fn build_decimal256_array(size: usize, precision: u8,
scale: i8) -> ArrayRef {
)
}
+fn build_string_array(size: usize) -> ArrayRef {
+ let mut builder = StringBuilder::new();
+ for v in 0..size {
+ match v % 3 {
+ 0 => builder.append_value("small"),
+ 1 => builder.append_value("larger string more than 12 bytes"),
+ _ => builder.append_null(),
+ }
+ }
+ Arc::new(builder.finish())
+}
+
fn build_dict_array(size: usize) -> ArrayRef {
let values = StringArray::from_iter([
Some("small"),
@@ -148,9 +160,12 @@ fn add_benchmark(c: &mut Criterion) {
let decimal128_array = build_decimal128_array(512, 10, 3);
let decimal256_array = build_decimal256_array(512, 50, 3);
+ let string_array = build_string_array(512);
+ let wide_string_array = cast(&string_array, &DataType::LargeUtf8).unwrap();
let dict_array = build_dict_array(10_000);
let string_view_array = cast(&dict_array, &DataType::Utf8View).unwrap();
+ let binary_view_array = cast(&string_view_array,
&DataType::BinaryView).unwrap();
c.bench_function("cast int32 to int32 512", |b| {
b.iter(|| cast_array(&i32_array, DataType::Int32))
@@ -262,6 +277,30 @@ fn add_benchmark(c: &mut Criterion) {
)
})
});
+ c.bench_function("cast string view to string", |b| {
+ b.iter(|| cast_array(&string_view_array, DataType::Utf8))
+ });
+ c.bench_function("cast string view to wide string", |b| {
+ b.iter(|| cast_array(&string_view_array, DataType::LargeUtf8))
+ });
+ c.bench_function("cast binary view to string", |b| {
+ b.iter(|| cast_array(&binary_view_array, DataType::Utf8))
+ });
+ c.bench_function("cast binary view to wide string", |b| {
+ b.iter(|| cast_array(&binary_view_array, DataType::LargeUtf8))
+ });
+ c.bench_function("cast string to binary view 512", |b| {
+ b.iter(|| cast_array(&string_array, DataType::BinaryView))
+ });
+ c.bench_function("cast wide string to binary view 512", |b| {
+ b.iter(|| cast_array(&wide_string_array, DataType::BinaryView))
+ });
+ c.bench_function("cast string view to binary view", |b| {
+ b.iter(|| cast_array(&string_view_array, DataType::BinaryView))
+ });
+ c.bench_function("cast binary view to string view", |b| {
+ b.iter(|| cast_array(&binary_view_array, DataType::Utf8View))
+ });
}
criterion_group!(benches, add_benchmark);