This is an automated email from the ASF dual-hosted git repository.

alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/datafusion.git


The following commit(s) were added to refs/heads/main by this push:
     new 0ce6d1687e Update ASCII scalar function to support Utf8View #11834 
(#11884)
0ce6d1687e is described below

commit 0ce6d1687e3fccc4dd50c5759919f2a9baecb549
Author: Dmitry Bugakov <[email protected]>
AuthorDate: Thu Aug 8 18:10:30 2024 +0200

    Update ASCII scalar function to support Utf8View #11834 (#11884)
---
 datafusion/functions/src/string/ascii.rs           | 118 ++++++++++++++++-----
 datafusion/sqllogictest/test_files/string_view.slt |  99 +++++++++++++++++
 2 files changed, 189 insertions(+), 28 deletions(-)

diff --git a/datafusion/functions/src/string/ascii.rs 
b/datafusion/functions/src/string/ascii.rs
index 9e1e6b81b6..68ba3f5ff1 100644
--- a/datafusion/functions/src/string/ascii.rs
+++ b/datafusion/functions/src/string/ascii.rs
@@ -16,33 +16,15 @@
 // under the License.
 
 use crate::utils::make_scalar_function;
-use arrow::array::Int32Array;
-use arrow::array::{ArrayRef, OffsetSizeTrait};
+use arrow::array::{ArrayAccessor, ArrayIter, ArrayRef, AsArray, Int32Array};
 use arrow::datatypes::DataType;
-use datafusion_common::{cast::as_generic_string_array, internal_err, Result};
+use arrow::error::ArrowError;
+use datafusion_common::{internal_err, Result};
 use datafusion_expr::ColumnarValue;
 use datafusion_expr::{ScalarUDFImpl, Signature, Volatility};
 use std::any::Any;
 use std::sync::Arc;
 
-/// Returns the numeric code of the first character of the argument.
-/// ascii('x') = 120
-pub fn ascii<T: OffsetSizeTrait>(args: &[ArrayRef]) -> Result<ArrayRef> {
-    let string_array = as_generic_string_array::<T>(&args[0])?;
-
-    let result = string_array
-        .iter()
-        .map(|string| {
-            string.map(|string: &str| {
-                let mut chars = string.chars();
-                chars.next().map_or(0, |v| v as i32)
-            })
-        })
-        .collect::<Int32Array>();
-
-    Ok(Arc::new(result) as ArrayRef)
-}
-
 #[derive(Debug)]
 pub struct AsciiFunc {
     signature: Signature,
@@ -60,7 +42,7 @@ impl AsciiFunc {
         Self {
             signature: Signature::uniform(
                 1,
-                vec![Utf8, LargeUtf8],
+                vec![Utf8, LargeUtf8, Utf8View],
                 Volatility::Immutable,
             ),
         }
@@ -87,12 +69,92 @@ impl ScalarUDFImpl for AsciiFunc {
     }
 
     fn invoke(&self, args: &[ColumnarValue]) -> Result<ColumnarValue> {
-        match args[0].data_type() {
-            DataType::Utf8 => make_scalar_function(ascii::<i32>, vec![])(args),
-            DataType::LargeUtf8 => {
-                return make_scalar_function(ascii::<i64>, vec![])(args);
-            }
-            _ => internal_err!("Unsupported data type"),
+        make_scalar_function(ascii, vec![])(args)
+    }
+}
+
+fn calculate_ascii<'a, V>(array: V) -> Result<ArrayRef, ArrowError>
+where
+    V: ArrayAccessor<Item = &'a str>,
+{
+    let iter = ArrayIter::new(array);
+    let result = iter
+        .map(|string| {
+            string.map(|s| {
+                let mut chars = s.chars();
+                chars.next().map_or(0, |v| v as i32)
+            })
+        })
+        .collect::<Int32Array>();
+
+    Ok(Arc::new(result) as ArrayRef)
+}
+
+/// Returns the numeric code of the first character of the argument.
+pub fn ascii(args: &[ArrayRef]) -> Result<ArrayRef> {
+    match args[0].data_type() {
+        DataType::Utf8 => {
+            let string_array = args[0].as_string::<i32>();
+            Ok(calculate_ascii(string_array)?)
+        }
+        DataType::LargeUtf8 => {
+            let string_array = args[0].as_string::<i64>();
+            Ok(calculate_ascii(string_array)?)
+        }
+        DataType::Utf8View => {
+            let string_array = args[0].as_string_view();
+            Ok(calculate_ascii(string_array)?)
         }
+        _ => internal_err!("Unsupported data type"),
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use crate::string::ascii::AsciiFunc;
+    use crate::utils::test::test_function;
+    use arrow::array::{Array, Int32Array};
+    use arrow::datatypes::DataType::Int32;
+    use datafusion_common::{Result, ScalarValue};
+    use datafusion_expr::{ColumnarValue, ScalarUDFImpl};
+
+    macro_rules! test_ascii {
+        ($INPUT:expr, $EXPECTED:expr) => {
+            test_function!(
+                AsciiFunc::new(),
+                &[ColumnarValue::Scalar(ScalarValue::Utf8($INPUT))],
+                $EXPECTED,
+                i32,
+                Int32,
+                Int32Array
+            );
+
+            test_function!(
+                AsciiFunc::new(),
+                &[ColumnarValue::Scalar(ScalarValue::LargeUtf8($INPUT))],
+                $EXPECTED,
+                i32,
+                Int32,
+                Int32Array
+            );
+
+            test_function!(
+                AsciiFunc::new(),
+                &[ColumnarValue::Scalar(ScalarValue::Utf8View($INPUT))],
+                $EXPECTED,
+                i32,
+                Int32,
+                Int32Array
+            );
+        };
+    }
+
+    #[test]
+    fn test_functions() -> Result<()> {
+        test_ascii!(Some(String::from("x")), Ok(Some(120)));
+        test_ascii!(Some(String::from("a")), Ok(Some(97)));
+        test_ascii!(Some(String::from("")), Ok(Some(0)));
+        test_ascii!(None, Ok(None));
+        Ok(())
     }
 }
diff --git a/datafusion/sqllogictest/test_files/string_view.slt 
b/datafusion/sqllogictest/test_files/string_view.slt
index 4d3f72b1e8..fc10a34256 100644
--- a/datafusion/sqllogictest/test_files/string_view.slt
+++ b/datafusion/sqllogictest/test_files/string_view.slt
@@ -500,3 +500,102 @@ select column2|| ' ' ||column3 from temp;
 ----
 rust fast
 datafusion cool
+
+### ASCII
+# Setup the initial test data
+statement ok
+create table test_source as values
+    ('Andrew', 'X'),
+    ('Xiangpeng', 'Xiangpeng'),
+    ('Raphael', 'R'),
+    (NULL, 'R');
+
+# Table with the different combination of column types
+statement ok
+create table test as
+SELECT
+  arrow_cast(column1, 'Utf8') as column1_utf8,
+  arrow_cast(column2, 'Utf8') as column2_utf8,
+  arrow_cast(column1, 'LargeUtf8') as column1_large_utf8,
+  arrow_cast(column2, 'LargeUtf8') as column2_large_utf8,
+  arrow_cast(column1, 'Utf8View') as column1_utf8view,
+  arrow_cast(column2, 'Utf8View') as column2_utf8view
+FROM test_source;
+
+# Test ASCII with utf8view against utf8view, utf8, and largeutf8
+# (should be no casts)
+query TT
+EXPLAIN SELECT
+  ASCII(column1_utf8view) as c1,
+  ASCII(column2_utf8) as c2,
+  ASCII(column2_large_utf8) as c3
+FROM test;
+----
+logical_plan
+01)Projection: ascii(test.column1_utf8view) AS c1, ascii(test.column2_utf8) AS 
c2, ascii(test.column2_large_utf8) AS c3
+02)--TableScan: test projection=[column2_utf8, column2_large_utf8, 
column1_utf8view]
+
+query III
+SELECT
+  ASCII(column1_utf8view) as c1,
+  ASCII(column2_utf8) as c2,
+  ASCII(column2_large_utf8) as c3
+FROM test;
+----
+65 88 88
+88 88 88
+82 82 82
+NULL 82 82
+
+query TT
+EXPLAIN SELECT
+  ASCII(column1_utf8) as c1,
+  ASCII(column1_large_utf8) as c2,
+  ASCII(column2_utf8view) as c3,
+  ASCII('hello') as c4,
+  ASCII(arrow_cast('world', 'Utf8View')) as c5
+FROM test;
+----
+logical_plan
+01)Projection: ascii(test.column1_utf8) AS c1, ascii(test.column1_large_utf8) 
AS c2, ascii(test.column2_utf8view) AS c3, Int32(104) AS c4, Int32(119) AS c5
+02)--TableScan: test projection=[column1_utf8, column1_large_utf8, 
column2_utf8view]
+
+query IIIII
+SELECT
+  ASCII(column1_utf8) as c1,
+  ASCII(column1_large_utf8) as c2,
+  ASCII(column2_utf8view) as c3,
+  ASCII('hello') as c4,
+  ASCII(arrow_cast('world', 'Utf8View')) as c5
+FROM test;
+----
+65 65 88 104 119
+88 88 88 104 119
+82 82 82 104 119
+NULL NULL 82 104 119
+
+# Test ASCII with literals cast to Utf8View
+query TT
+EXPLAIN SELECT
+  ASCII(arrow_cast('äöüß', 'Utf8View')) as c1,
+  ASCII(arrow_cast('', 'Utf8View')) as c2,
+  ASCII(arrow_cast(NULL, 'Utf8View')) as c3
+FROM test;
+----
+logical_plan
+01)Projection: Int32(228) AS c1, Int32(0) AS c2, Int32(NULL) AS c3
+02)--TableScan: test projection=[]
+
+query III
+SELECT
+  ASCII(arrow_cast('äöüß', 'Utf8View')) as c1,
+  ASCII(arrow_cast('', 'Utf8View')) as c2,
+  ASCII(arrow_cast(NULL, 'Utf8View')) as c3
+----
+228 0 NULL
+
+statement ok
+drop table test;
+
+statement ok
+drop table test_source;


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to