This is an automated email from the ASF dual-hosted git repository.

alamb pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git


The following commit(s) were added to refs/heads/master by this push:
     new 4bd737dab Support casting between BinaryView <--> Utf8 and LargeUtf8 (#6180)
4bd737dab is described below

commit 4bd737dab2aa17aca200259347909d48ed793ba1
Author: Xin Li <[email protected]>
AuthorDate: Thu Aug 8 07:20:33 2024 -0700

    Support casting between BinaryView <--> Utf8 and LargeUtf8 (#6180)
    
    * support cast between binaryview and string
    
    * update impl. and add benchmark
    
    * Add ut for views
    
    * Apply comments
---
 arrow-cast/src/cast/mod.rs    | 102 ++++++++++++++++++++++++++++++++++++------
 arrow/benches/cast_kernels.rs |  39 ++++++++++++++++
 2 files changed, 127 insertions(+), 14 deletions(-)

diff --git a/arrow-cast/src/cast/mod.rs b/arrow-cast/src/cast/mod.rs
index f6103cb84..9f552ec72 100644
--- a/arrow-cast/src/cast/mod.rs
+++ b/arrow-cast/src/cast/mod.rs
@@ -225,10 +225,11 @@ pub fn can_cast_types(from_type: &DataType, to_type: &DataType) -> bool {
             | Timestamp(Millisecond, _)
             | Timestamp(Microsecond, _)
             | Timestamp(Nanosecond, _)
-            | Interval(_),
+            | Interval(_)
+            | BinaryView,
         ) => true,
         (Utf8 | LargeUtf8, Utf8View) => true,
-        (BinaryView, Binary | LargeBinary) => true,
+        (BinaryView, Binary | LargeBinary | Utf8 | LargeUtf8 | Utf8View ) => true,
         (Utf8 | LargeUtf8, _) => to_type.is_numeric() && to_type != &Float16,
         (_, Utf8 | LargeUtf8) => from_type.is_primitive(),
 
@@ -1229,6 +1230,9 @@ pub fn cast_with_options(
                 cast_byte_container::<BinaryType, LargeBinaryType>(&binary)
             }
             Utf8View => Ok(Arc::new(StringViewArray::from(array.as_string::<i32>()))),
+            BinaryView => Ok(Arc::new(
+                StringViewArray::from(array.as_string::<i32>()).to_binary_view(),
+            )),
             LargeUtf8 => cast_byte_container::<Utf8Type, LargeUtf8Type>(array),
             Time32(TimeUnit::Second) => parse_string::<Time32SecondType, i32>(array, cast_options),
             Time32(TimeUnit::Millisecond) => {
@@ -1282,6 +1286,7 @@ pub fn cast_with_options(
             Date64 => parse_string_view::<Date64Type>(array, cast_options),
             Binary => cast_view_to_byte::<StringViewType, GenericBinaryType<i32>>(array),
             LargeBinary => cast_view_to_byte::<StringViewType, GenericBinaryType<i64>>(array),
+            BinaryView => Ok(Arc::new(array.as_string_view().clone().to_binary_view())),
             Utf8 => cast_view_to_byte::<StringViewType, GenericStringType<i32>>(array),
             LargeUtf8 => cast_view_to_byte::<StringViewType, GenericStringType<i64>>(array),
             Time32(TimeUnit::Second) => parse_string_view::<Time32SecondType>(array, cast_options),
@@ -1339,6 +1344,13 @@ pub fn cast_with_options(
                 array.as_string::<i64>().clone(),
             ))),
             Utf8View => Ok(Arc::new(StringViewArray::from(array.as_string::<i64>()))),
+            BinaryView => Ok(Arc::new(BinaryViewArray::from(
+                array
+                    .as_string::<i64>()
+                    .into_iter()
+                    .map(|x| x.map(|x| x.as_bytes()))
+                    .collect::<Vec<_>>(),
+            ))),
             Time32(TimeUnit::Second) => parse_string::<Time32SecondType, i64>(array, cast_options),
             Time32(TimeUnit::Millisecond) => {
                 parse_string::<Time32MillisecondType, i64>(array, cast_options)
@@ -1417,6 +1429,20 @@ pub fn cast_with_options(
         (BinaryView, LargeBinary) => {
             cast_view_to_byte::<BinaryViewType, GenericBinaryType<i64>>(array)
         }
+        (BinaryView, Utf8) => {
+            let binary_arr = cast_view_to_byte::<BinaryViewType, GenericBinaryType<i32>>(array)?;
+            cast_binary_to_string::<i32>(&binary_arr, cast_options)
+        }
+        (BinaryView, LargeUtf8) => {
+            let binary_arr = cast_view_to_byte::<BinaryViewType, GenericBinaryType<i64>>(array)?;
+            cast_binary_to_string::<i64>(&binary_arr, cast_options)
+        }
+        (BinaryView, Utf8View) => {
+            Ok(Arc::new(array.as_binary_view().clone().to_string_view()?) as ArrayRef)
+        }
+        (BinaryView, _) => Err(ArrowError::CastError(format!(
+            "Casting from {from_type:?} to {to_type:?} not supported",
+        ))),
         (from_type, LargeUtf8) if from_type.is_primitive() => {
             value_to_string::<i64>(array, cast_options)
         }
@@ -2008,7 +2034,6 @@ pub fn cast_with_options(
                     })?,
             ))
         }
-
         (Date64, Timestamp(TimeUnit::Second, None)) => Ok(Arc::new(
             array
                 .as_primitive::<Date64Type>()
@@ -5256,12 +5281,6 @@ mod tests {
         }
     }
 
-    #[test]
-    fn test_string_to_view() {
-        _test_string_to_view::<i32>();
-        _test_string_to_view::<i64>();
-    }
-
     const VIEW_TEST_DATA: [Option<&str>; 5] = [
         Some("hello"),
         Some("repeated"),
@@ -5270,6 +5289,44 @@ mod tests {
         Some("repeated"),
     ];
 
+    #[test]
+    fn test_string_view_to_binary_view() {
+        let string_view_array = StringViewArray::from_iter(VIEW_TEST_DATA);
+
+        assert!(can_cast_types(
+            string_view_array.data_type(),
+            &DataType::BinaryView
+        ));
+
+        let binary_view_array = cast(&string_view_array, &DataType::BinaryView).unwrap();
+        assert_eq!(binary_view_array.data_type(), &DataType::BinaryView);
+
+        let expect_binary_view_array = BinaryViewArray::from_iter(VIEW_TEST_DATA);
+        assert_eq!(binary_view_array.as_ref(), &expect_binary_view_array);
+    }
+
+    #[test]
+    fn test_binary_view_to_string_view() {
+        let binary_view_array = BinaryViewArray::from_iter(VIEW_TEST_DATA);
+
+        assert!(can_cast_types(
+            binary_view_array.data_type(),
+            &DataType::Utf8View
+        ));
+
+        let string_view_array = cast(&binary_view_array, &DataType::Utf8View).unwrap();
+        assert_eq!(string_view_array.data_type(), &DataType::Utf8View);
+
+        let expect_string_view_array = StringViewArray::from_iter(VIEW_TEST_DATA);
+        assert_eq!(string_view_array.as_ref(), &expect_string_view_array);
+    }
+
+    #[test]
+    fn test_string_to_view() {
+        _test_string_to_view::<i32>();
+        _test_string_to_view::<i64>();
+    }
+
     fn _test_string_to_view<O>()
     where
         O: OffsetSizeTrait,
@@ -5281,11 +5338,22 @@ mod tests {
             &DataType::Utf8View
         ));
 
+        assert!(can_cast_types(
+            string_array.data_type(),
+            &DataType::BinaryView
+        ));
+
         let string_view_array = cast(&string_array, &DataType::Utf8View).unwrap();
         assert_eq!(string_view_array.data_type(), &DataType::Utf8View);
 
+        let binary_view_array = cast(&string_array, &DataType::BinaryView).unwrap();
+        assert_eq!(binary_view_array.data_type(), &DataType::BinaryView);
+
         let expect_string_view_array = StringViewArray::from_iter(VIEW_TEST_DATA);
         assert_eq!(string_view_array.as_ref(), &expect_string_view_array);
+
+        let expect_binary_view_array = BinaryViewArray::from_iter(VIEW_TEST_DATA);
+        assert_eq!(binary_view_array.as_ref(), &expect_binary_view_array);
     }
 
     #[test]
@@ -5380,7 +5448,7 @@ mod tests {
     where
         O: OffsetSizeTrait,
     {
-        let view_array = {
+        let string_view_array = {
             let mut builder = StringViewBuilder::new().with_fixed_block_size(8); // multiple buffers.
             for s in VIEW_TEST_DATA.iter() {
                 builder.append_option(*s);
@@ -5388,15 +5456,21 @@ mod tests {
             builder.finish()
         };
 
+        let binary_view_array = BinaryViewArray::from_iter(VIEW_TEST_DATA);
+
         let expected_string_array = GenericStringArray::<O>::from_iter(VIEW_TEST_DATA);
         let expected_type = expected_string_array.data_type();
 
-        assert!(can_cast_types(view_array.data_type(), expected_type));
+        assert!(can_cast_types(string_view_array.data_type(), expected_type));
+        assert!(can_cast_types(binary_view_array.data_type(), expected_type));
 
-        let string_array = cast(&view_array, expected_type).unwrap();
-        assert_eq!(string_array.data_type(), expected_type);
+        let string_view_casted_array = cast(&string_view_array, expected_type).unwrap();
+        assert_eq!(string_view_casted_array.data_type(), expected_type);
+        assert_eq!(string_view_casted_array.as_ref(), &expected_string_array);
 
-        assert_eq!(string_array.as_ref(), &expected_string_array);
+        let binary_view_casted_array = cast(&binary_view_array, expected_type).unwrap();
+        assert_eq!(binary_view_casted_array.data_type(), expected_type);
+        assert_eq!(binary_view_casted_array.as_ref(), &expected_string_array);
     }
 
     #[test]
diff --git a/arrow/benches/cast_kernels.rs b/arrow/benches/cast_kernels.rs
index 8803e8eea..ec7990d3d 100644
--- a/arrow/benches/cast_kernels.rs
+++ b/arrow/benches/cast_kernels.rs
@@ -114,6 +114,18 @@ fn build_decimal256_array(size: usize, precision: u8, scale: i8) -> ArrayRef {
     )
 }
 
+fn build_string_array(size: usize) -> ArrayRef {
+    let mut builder = StringBuilder::new();
+    for v in 0..size {
+        match v % 3 {
+            0 => builder.append_value("small"),
+            1 => builder.append_value("larger string more than 12 bytes"),
+            _ => builder.append_null(),
+        }
+    }
+    Arc::new(builder.finish())
+}
+
 fn build_dict_array(size: usize) -> ArrayRef {
     let values = StringArray::from_iter([
         Some("small"),
@@ -148,9 +160,12 @@ fn add_benchmark(c: &mut Criterion) {
 
     let decimal128_array = build_decimal128_array(512, 10, 3);
     let decimal256_array = build_decimal256_array(512, 50, 3);
+    let string_array = build_string_array(512);
+    let wide_string_array = cast(&string_array, &DataType::LargeUtf8).unwrap();
 
     let dict_array = build_dict_array(10_000);
     let string_view_array = cast(&dict_array, &DataType::Utf8View).unwrap();
+    let binary_view_array = cast(&string_view_array, &DataType::BinaryView).unwrap();
 
     c.bench_function("cast int32 to int32 512", |b| {
         b.iter(|| cast_array(&i32_array, DataType::Int32))
@@ -262,6 +277,30 @@ fn add_benchmark(c: &mut Criterion) {
             )
         })
     });
+    c.bench_function("cast string view to string", |b| {
+        b.iter(|| cast_array(&string_view_array, DataType::Utf8))
+    });
+    c.bench_function("cast string view to wide string", |b| {
+        b.iter(|| cast_array(&string_view_array, DataType::LargeUtf8))
+    });
+    c.bench_function("cast binary view to string", |b| {
+        b.iter(|| cast_array(&binary_view_array, DataType::Utf8))
+    });
+    c.bench_function("cast binary view to wide string", |b| {
+        b.iter(|| cast_array(&binary_view_array, DataType::LargeUtf8))
+    });
+    c.bench_function("cast string to binary view 512", |b| {
+        b.iter(|| cast_array(&string_array, DataType::BinaryView))
+    });
+    c.bench_function("cast wide string to binary view 512", |b| {
+        b.iter(|| cast_array(&wide_string_array, DataType::BinaryView))
+    });
+    c.bench_function("cast string view to binary view", |b| {
+        b.iter(|| cast_array(&string_view_array, DataType::BinaryView))
+    });
+    c.bench_function("cast binary view to string view", |b| {
+        b.iter(|| cast_array(&binary_view_array, DataType::Utf8View))
+    });
 }
 
 criterion_group!(benches, add_benchmark);

Reply via email to