This is an automated email from the ASF dual-hosted git repository.
github-bot pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/datafusion.git
The following commit(s) were added to refs/heads/main by this push:
new 6746007826 feat(spark): Implement Spark functions `url_encode`, `url_decode` and `try_url_decode` (#17399)
6746007826 is described below
commit 6746007826ebd3fcb5614bf87183674435bbb134
Author: Quoc Anh <[email protected]>
AuthorDate: Sat Dec 6 18:22:53 2025 +1100
feat(spark): Implement Spark functions `url_encode`, `url_decode` and `try_url_decode` (#17399)
## Which issue does this PR close?
- Part of #15914
## Rationale for this change
## What changes are included in this PR?
Implement Spark functions `url_encode`, `url_decode` and `try_url_decode`
## Are these changes tested?
Yes
## Are there any user-facing changes?
Yes
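Illustrative usage sketch of the new functions (results taken from the slt tests added below; exact error text shown abbreviated):
SELECT url_encode('https://spark.apache.org');           -- https%3A%2F%2Fspark.apache.org
SELECT url_decode('https%3A%2F%2Fspark.apache.org');     -- https://spark.apache.org
SELECT url_decode('https%3%2F%2Fspark.apache.org');      -- Execution error: Invalid percent-encoding
SELECT try_url_decode('https%3%2F%2Fspark.apache.org');  -- NULL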
Co-authored-by: Jeffrey Vo <[email protected]>
---
Cargo.lock | 1 +
datafusion/spark/Cargo.toml | 1 +
datafusion/spark/src/function/url/mod.rs | 17 +-
.../spark/src/function/url/try_url_decode.rs | 109 +++++++++
datafusion/spark/src/function/url/url_decode.rs | 259 +++++++++++++++++++++
datafusion/spark/src/function/url/url_encode.rs | 131 +++++++++++
.../test_files/spark/url/try_url_decode.slt | 69 ++++++
.../test_files/spark/url/url_decode.slt | 60 ++++-
.../test_files/spark/url/url_encode.slt | 24 +-
9 files changed, 651 insertions(+), 20 deletions(-)
diff --git a/Cargo.lock b/Cargo.lock
index 6d54d234e0..08198cc49b 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -2666,6 +2666,7 @@ dependencies = [
"datafusion-functions",
"datafusion-functions-nested",
"log",
+ "percent-encoding",
"rand 0.9.2",
"sha1",
"url",
diff --git a/datafusion/spark/Cargo.toml b/datafusion/spark/Cargo.toml
index f39b6992c3..09959db41f 100644
--- a/datafusion/spark/Cargo.toml
+++ b/datafusion/spark/Cargo.toml
@@ -50,6 +50,7 @@ datafusion-expr = { workspace = true }
datafusion-functions = { workspace = true, features = ["crypto_expressions"] }
datafusion-functions-nested = { workspace = true }
log = { workspace = true }
+percent-encoding = "2.3.2"
rand = { workspace = true }
sha1 = "0.10"
url = { workspace = true }
diff --git a/datafusion/spark/src/function/url/mod.rs b/datafusion/spark/src/function/url/mod.rs
index 82bf8a9e09..657655429e 100644
--- a/datafusion/spark/src/function/url/mod.rs
+++ b/datafusion/spark/src/function/url/mod.rs
@@ -21,9 +21,15 @@ use std::sync::Arc;
pub mod parse_url;
pub mod try_parse_url;
+pub mod try_url_decode;
+pub mod url_decode;
+pub mod url_encode;
make_udf_function!(parse_url::ParseUrl, parse_url);
make_udf_function!(try_parse_url::TryParseUrl, try_parse_url);
+make_udf_function!(try_url_decode::TryUrlDecode, try_url_decode);
+make_udf_function!(url_decode::UrlDecode, url_decode);
+make_udf_function!(url_encode::UrlEncode, url_encode);
pub mod expr_fn {
use datafusion_functions::export_functions;
@@ -38,8 +44,17 @@ pub mod expr_fn {
"Same as parse_url but returns NULL if an invalid URL is provided.",
args
));
+ export_functions!((url_decode, "Decodes a URL-encoded string in ‘application/x-www-form-urlencoded’ format to its original format.", args));
+ export_functions!((try_url_decode, "Same as url_decode but returns NULL if an invalid URL-encoded string is provided.", args));
+ export_functions!((url_encode, "Encodes a string into a URL-encoded string in ‘application/x-www-form-urlencoded’ format.", args));
}
pub fn functions() -> Vec<Arc<ScalarUDF>> {
- vec![parse_url(), try_parse_url()]
+ vec![
+ parse_url(),
+ try_parse_url(),
+ try_url_decode(),
+ url_decode(),
+ url_encode(),
+ ]
}
diff --git a/datafusion/spark/src/function/url/try_url_decode.rs b/datafusion/spark/src/function/url/try_url_decode.rs
new file mode 100644
index 0000000000..61440e7ff0
--- /dev/null
+++ b/datafusion/spark/src/function/url/try_url_decode.rs
@@ -0,0 +1,109 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use std::any::Any;
+
+use arrow::array::ArrayRef;
+use arrow::datatypes::DataType;
+
+use datafusion_common::Result;
+use datafusion_expr::{
+ ColumnarValue, ScalarFunctionArgs, ScalarUDFImpl, Signature, Volatility,
+};
+use datafusion_functions::utils::make_scalar_function;
+
+use crate::function::url::url_decode::{spark_handled_url_decode, UrlDecode};
+
+#[derive(Debug, PartialEq, Eq, Hash)]
+pub struct TryUrlDecode {
+ signature: Signature,
+ url_decoder: UrlDecode,
+}
+
+impl Default for TryUrlDecode {
+ fn default() -> Self {
+ Self::new()
+ }
+}
+
+impl TryUrlDecode {
+ pub fn new() -> Self {
+ Self {
+ signature: Signature::string(1, Volatility::Immutable),
+ url_decoder: UrlDecode::new(),
+ }
+ }
+}
+
+impl ScalarUDFImpl for TryUrlDecode {
+ fn as_any(&self) -> &dyn Any {
+ self
+ }
+
+ fn name(&self) -> &str {
+ "try_url_decode"
+ }
+
+ fn signature(&self) -> &Signature {
+ &self.signature
+ }
+
+ fn return_type(&self, arg_types: &[DataType]) -> Result<DataType> {
+ self.url_decoder.return_type(arg_types)
+ }
+
+ fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> {
+ let ScalarFunctionArgs { args, .. } = args;
+ make_scalar_function(spark_try_url_decode, vec![])(&args)
+ }
+}
+
+fn spark_try_url_decode(args: &[ArrayRef]) -> Result<ArrayRef> {
+ spark_handled_url_decode(args, |x| match x {
+ Err(_) => Ok(None),
+ result => result,
+ })
+}
+
+#[cfg(test)]
+mod tests {
+ use std::sync::Arc;
+
+ use arrow::array::StringArray;
+ use datafusion_common::{cast::as_string_array, Result};
+
+ use super::*;
+
+ #[test]
+ fn test_try_decode_error_handled() -> Result<()> {
+ let input = Arc::new(StringArray::from(vec![
+ Some("http%3A%2F%2spark.apache.org"), // '%2s' is not a valid
percent encoded character
+ // Valid cases
+ Some("https%3A%2F%2Fspark.apache.org"),
+ None,
+ ]));
+
+ let expected =
+ StringArray::from(vec![None, Some("https://spark.apache.org"), None]);
+
+ let result = spark_try_url_decode(&[input as ArrayRef])?;
+ let result = as_string_array(&result)?;
+
+ assert_eq!(&expected, result);
+ Ok(())
+ }
+}
diff --git a/datafusion/spark/src/function/url/url_decode.rs b/datafusion/spark/src/function/url/url_decode.rs
new file mode 100644
index 0000000000..520588bc19
--- /dev/null
+++ b/datafusion/spark/src/function/url/url_decode.rs
@@ -0,0 +1,259 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use std::any::Any;
+use std::borrow::Cow;
+use std::sync::Arc;
+
+use arrow::array::{ArrayRef, LargeStringArray, StringArray, StringViewArray};
+use arrow::datatypes::DataType;
+use datafusion_common::cast::{
+ as_large_string_array, as_string_array, as_string_view_array,
+};
+use datafusion_common::{exec_datafusion_err, exec_err, plan_err, Result};
+use datafusion_expr::{
+ ColumnarValue, ScalarFunctionArgs, ScalarUDFImpl, Signature, Volatility,
+};
+use datafusion_functions::utils::make_scalar_function;
+use percent_encoding::percent_decode;
+
+#[derive(Debug, PartialEq, Eq, Hash)]
+pub struct UrlDecode {
+ signature: Signature,
+}
+
+impl Default for UrlDecode {
+ fn default() -> Self {
+ Self::new()
+ }
+}
+
+impl UrlDecode {
+ pub fn new() -> Self {
+ Self {
+ signature: Signature::string(1, Volatility::Immutable),
+ }
+ }
+
+ /// Decodes a URL-encoded string from application/x-www-form-urlencoded format.
+ /// Although `url::form_urlencoded` supports decoding, it does not return an error when the string is malformed.
+ /// For example, "%2s" is not valid percent-encoding, but the `decode` function from `url::form_urlencoded`
+ /// silently ignores it instead of returning an error.
+ /// This function reproduces the same decoding process, plus an extra validation step.
+ /// See <https://github.com/servo/rust-url/blob/b06048d70d4cc9cf4ffb277f06cfcebd53b2141e/form_urlencoded/src/lib.rs#L70-L76>
+ ///
+ /// # Arguments
+ ///
+ /// * `value` - The URL-encoded string to decode
+ ///
+ /// # Returns
+ ///
+ /// * `Ok(String)` - The decoded string
+ /// * `Err(DataFusionError)` - If the input is malformed or contains invalid UTF-8
+ ///
+ fn decode(value: &str) -> Result<String> {
+ // Check if the string has valid percent encoding
+ Self::validate_percent_encoding(value)?;
+
+ let replaced = Self::replace_plus(value.as_bytes());
+ percent_decode(&replaced)
+ .decode_utf8()
+ .map_err(|e| exec_datafusion_err!("Invalid UTF-8 sequence: {e}"))
+ .map(|parsed| parsed.into_owned())
+ }
+
+ /// Replace b'+' with b' '
+ /// See: <https://github.com/servo/rust-url/blob/dbd526178ed9276176602dd039022eba89e8fc93/form_urlencoded/src/lib.rs#L79-L93>
+ fn replace_plus(input: &[u8]) -> Cow<'_, [u8]> {
+ match input.iter().position(|&b| b == b'+') {
+ None => Cow::Borrowed(input),
+ Some(first_position) => {
+ let mut replaced = input.to_owned();
+ replaced[first_position] = b' ';
+ for byte in &mut replaced[first_position + 1..] {
+ if *byte == b'+' {
+ *byte = b' ';
+ }
+ }
+ Cow::Owned(replaced)
+ }
+ }
+ }
+
+ /// Validate percent-encoding of the string
+ fn validate_percent_encoding(value: &str) -> Result<()> {
+ let bytes = value.as_bytes();
+ let mut i = 0;
+
+ while i < bytes.len() {
+ if bytes[i] == b'%' {
+ // Check if we have at least 2 more characters
+ if i + 2 >= bytes.len() {
+ return exec_err!(
+ "Invalid percent-encoding: incomplete sequence at
position {}",
+ i
+ );
+ }
+
+ let hex1 = bytes[i + 1];
+ let hex2 = bytes[i + 2];
+
+ if !hex1.is_ascii_hexdigit() || !hex2.is_ascii_hexdigit() {
+ return exec_err!(
+ "Invalid percent-encoding: invalid hex sequence
'%{}{}' at position {}",
+ hex1 as char,
+ hex2 as char,
+ i
+ );
+ }
+ i += 3;
+ } else {
+ i += 1;
+ }
+ }
+ Ok(())
+ }
+}
+
+impl ScalarUDFImpl for UrlDecode {
+ fn as_any(&self) -> &dyn Any {
+ self
+ }
+
+ fn name(&self) -> &str {
+ "url_decode"
+ }
+
+ fn signature(&self) -> &Signature {
+ &self.signature
+ }
+
+ fn return_type(&self, arg_types: &[DataType]) -> Result<DataType> {
+ if arg_types.len() != 1 {
+ return plan_err!(
+ "{} expects 1 argument, but got {}",
+ self.name(),
+ arg_types.len()
+ );
+ }
+ // As the type signature is already checked, we can safely return the type of the first argument
+ Ok(arg_types[0].clone())
+ }
+
+ fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> {
+ let ScalarFunctionArgs { args, .. } = args;
+ make_scalar_function(spark_url_decode, vec![])(&args)
+ }
+}
+
+/// Core implementation of URL decoding function.
+///
+/// # Arguments
+///
+ /// * `args` - A slice containing exactly one ArrayRef with the URL-encoded strings to decode
+///
+/// # Returns
+///
+/// * `Ok(ArrayRef)` - A new array of the same type containing decoded strings
+ /// * `Err(DataFusionError)` - If validation fails or invalid arguments are provided
+///
+fn spark_url_decode(args: &[ArrayRef]) -> Result<ArrayRef> {
+ spark_handled_url_decode(args, |x| x)
+}
+
+pub fn spark_handled_url_decode(
+ args: &[ArrayRef],
+ err_handle_fn: impl Fn(Result<Option<String>>) -> Result<Option<String>>,
+) -> Result<ArrayRef> {
+ if args.len() != 1 {
+ return exec_err!("`url_decode` expects 1 argument");
+ }
+
+ match &args[0].data_type() {
+ DataType::Utf8 => as_string_array(&args[0])?
+ .iter()
+ .map(|x| x.map(UrlDecode::decode).transpose())
+ .map(&err_handle_fn)
+ .collect::<Result<StringArray>>()
+ .map(|array| Arc::new(array) as ArrayRef),
+ DataType::LargeUtf8 => as_large_string_array(&args[0])?
+ .iter()
+ .map(|x| x.map(UrlDecode::decode).transpose())
+ .map(&err_handle_fn)
+ .collect::<Result<LargeStringArray>>()
+ .map(|array| Arc::new(array) as ArrayRef),
+ DataType::Utf8View => as_string_view_array(&args[0])?
+ .iter()
+ .map(|x| x.map(UrlDecode::decode).transpose())
+ .map(&err_handle_fn)
+ .collect::<Result<StringViewArray>>()
+ .map(|array| Arc::new(array) as ArrayRef),
+ other => exec_err!("`url_decode`: Expr must be STRING, got {other:?}"),
+ }
+}
+
+#[cfg(test)]
+mod tests {
+ use arrow::array::StringArray;
+ use datafusion_common::Result;
+
+ use super::*;
+
+ #[test]
+ fn test_decode() -> Result<()> {
+ let input = Arc::new(StringArray::from(vec![
+ Some("https%3A%2F%2Fspark.apache.org"),
+ Some("inva+lid://user:pass@host/file\\;param?query\\;p2"),
+ Some("inva lid://user:pass@host/file\\;param?query\\;p2"),
+ Some("%7E%21%40%23%24%25%5E%26%2A%28%29%5F%2B"),
+ Some("%E4%BD%A0%E5%A5%BD"),
+ Some(""),
+ None,
+ ]));
+ let expected = StringArray::from(vec![
+ Some("https://spark.apache.org"),
+ Some("inva lid://user:pass@host/file\\;param?query\\;p2"),
+ Some("inva lid://user:pass@host/file\\;param?query\\;p2"),
+ Some("~!@#$%^&*()_+"),
+ Some("你好"),
+ Some(""),
+ None,
+ ]);
+
+ let result = spark_url_decode(&[input as ArrayRef])?;
+ let result = as_string_array(&result)?;
+
+ assert_eq!(&expected, result);
+
+ Ok(())
+ }
+
+ #[test]
+ fn test_decode_error() -> Result<()> {
+ let input = Arc::new(StringArray::from(vec![
+ Some("http%3A%2F%2spark.apache.org"), // '%2s' is not a valid
percent encoded character
+ // Valid cases
+ Some("https%3A%2F%2Fspark.apache.org"),
+ None,
+ ]));
+
+ let result = spark_url_decode(&[input]);
+ assert!(result.is_err_and(|e| e.to_string().contains("Invalid
percent-encoding")));
+
+ Ok(())
+ }
+}
diff --git a/datafusion/spark/src/function/url/url_encode.rs b/datafusion/spark/src/function/url/url_encode.rs
new file mode 100644
index 0000000000..9b37f0ac6a
--- /dev/null
+++ b/datafusion/spark/src/function/url/url_encode.rs
@@ -0,0 +1,131 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use std::any::Any;
+use std::sync::Arc;
+
+use arrow::array::{ArrayRef, LargeStringArray, StringArray, StringViewArray};
+use arrow::datatypes::DataType;
+use datafusion_common::cast::{
+ as_large_string_array, as_string_array, as_string_view_array,
+};
+use datafusion_common::{exec_err, plan_err, Result};
+use datafusion_expr::{
+ ColumnarValue, ScalarFunctionArgs, ScalarUDFImpl, Signature, Volatility,
+};
+use datafusion_functions::utils::make_scalar_function;
+use url::form_urlencoded::byte_serialize;
+
+#[derive(Debug, PartialEq, Eq, Hash)]
+pub struct UrlEncode {
+ signature: Signature,
+}
+
+impl Default for UrlEncode {
+ fn default() -> Self {
+ Self::new()
+ }
+}
+
+impl UrlEncode {
+ pub fn new() -> Self {
+ Self {
+ signature: Signature::string(1, Volatility::Immutable),
+ }
+ }
+
+ /// Encode a string to application/x-www-form-urlencoded format.
+ ///
+ /// # Arguments
+ ///
+ /// * `value` - The string to encode
+ ///
+ /// # Returns
+ ///
+ /// * `Ok(String)` - The encoded string
+ ///
+ fn encode(value: &str) -> Result<String> {
+ Ok(byte_serialize(value.as_bytes()).collect::<String>())
+ }
+}
+
+impl ScalarUDFImpl for UrlEncode {
+ fn as_any(&self) -> &dyn Any {
+ self
+ }
+
+ fn name(&self) -> &str {
+ "url_encode"
+ }
+
+ fn signature(&self) -> &Signature {
+ &self.signature
+ }
+
+ fn return_type(&self, arg_types: &[DataType]) -> Result<DataType> {
+ if arg_types.len() != 1 {
+ return plan_err!(
+ "{} expects 1 argument, but got {}",
+ self.name(),
+ arg_types.len()
+ );
+ }
+ // As the type signature is already checked, we can safely return the type of the first argument
+ Ok(arg_types[0].clone())
+ }
+
+ fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> {
+ let ScalarFunctionArgs { args, .. } = args;
+ make_scalar_function(spark_url_encode, vec![])(&args)
+ }
+}
+
+/// Core implementation of URL encoding function.
+///
+/// # Arguments
+///
+ /// * `args` - A slice containing exactly one ArrayRef with the strings to encode
+///
+/// # Returns
+///
+/// * `Ok(ArrayRef)` - A new array of the same type containing encoded strings
+/// * `Err(DataFusionError)` - If invalid arguments are provided
+///
+fn spark_url_encode(args: &[ArrayRef]) -> Result<ArrayRef> {
+ if args.len() != 1 {
+ return exec_err!("`url_encode` expects 1 argument");
+ }
+
+ match &args[0].data_type() {
+ DataType::Utf8 => as_string_array(&args[0])?
+ .iter()
+ .map(|x| x.map(UrlEncode::encode).transpose())
+ .collect::<Result<StringArray>>()
+ .map(|array| Arc::new(array) as ArrayRef),
+ DataType::LargeUtf8 => as_large_string_array(&args[0])?
+ .iter()
+ .map(|x| x.map(UrlEncode::encode).transpose())
+ .collect::<Result<LargeStringArray>>()
+ .map(|array| Arc::new(array) as ArrayRef),
+ DataType::Utf8View => as_string_view_array(&args[0])?
+ .iter()
+ .map(|x| x.map(UrlEncode::encode).transpose())
+ .collect::<Result<StringViewArray>>()
+ .map(|array| Arc::new(array) as ArrayRef),
+ other => exec_err!("`url_encode`: Expr must be STRING, got {other:?}"),
+ }
+}
diff --git a/datafusion/sqllogictest/test_files/spark/url/try_url_decode.slt b/datafusion/sqllogictest/test_files/spark/url/try_url_decode.slt
new file mode 100644
index 0000000000..559c77af97
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/spark/url/try_url_decode.slt
@@ -0,0 +1,69 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+
+# http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+query T
+SELECT try_url_decode('https%3A%2F%2Fspark.apache.org');
+----
+https://spark.apache.org
+
+# Test with LargeUtf8
+query T
+SELECT try_url_decode(arrow_cast('https%3A%2F%2Fspark.apache.org', 'LargeUtf8'));
+----
+https://spark.apache.org
+
+# Test with Utf8View
+query T
+SELECT try_url_decode(arrow_cast('https%3A%2F%2Fspark.apache.org', 'Utf8View'));
+----
+https://spark.apache.org
+
+# Non-ASCII string
+query T
+SELECT try_url_decode('%E4%BD%A0%E5%A5%BD')
+----
+你好
+
+# Empty string
+query T
+SELECT try_url_decode('');
+----
+(empty)
+
+# Null value
+query T
+SELECT try_url_decode(NULL::string);
+----
+NULL
+
+# Roundtrip with url_encode
+query T
+SELECT try_url_decode(url_encode('Spark SQL ~!@#$%^&*()'));
+----
+Spark SQL ~!@#$%^&*()
+
+# Plus replacement
+query T
+SELECT try_url_decode('Spark+SQL%21');
+----
+Spark SQL!
+
+# Handled invalid percent encoding error
+query T
+SELECT try_url_decode('https%3%2F%2Fspark.apache.org'::string);
+----
+NULL
diff --git a/datafusion/sqllogictest/test_files/spark/url/url_decode.slt b/datafusion/sqllogictest/test_files/spark/url/url_decode.slt
index fa5028b647..61399aa0ef 100644
--- a/datafusion/sqllogictest/test_files/spark/url/url_decode.slt
+++ b/datafusion/sqllogictest/test_files/spark/url/url_decode.slt
@@ -15,13 +15,53 @@
# specific language governing permissions and limitations
# under the License.
-# This file was originally created by a porting script from:
-# https://github.com/lakehq/sail/tree/43b6ed8221de5c4c4adbedbb267ae1351158b43c/crates/sail-spark-connect/tests/gold_data/function
-# This file is part of the implementation of the datafusion-spark function library.
-# For more information, please see:
-# https://github.com/apache/datafusion/issues/15914
-
-## Original Query: SELECT url_decode('https%3A%2F%2Fspark.apache.org');
-## PySpark 3.5.5 Result: {'url_decode(https%3A%2F%2Fspark.apache.org)': 'https://spark.apache.org', 'typeof(url_decode(https%3A%2F%2Fspark.apache.org))': 'string', 'typeof(https%3A%2F%2Fspark.apache.org)': 'string'}
-#query
-#SELECT url_decode('https%3A%2F%2Fspark.apache.org'::string);
+query T
+SELECT url_decode('https%3A%2F%2Fspark.apache.org');
+----
+https://spark.apache.org
+
+# Test with LargeUtf8
+query T
+SELECT url_decode(arrow_cast('https%3A%2F%2Fspark.apache.org', 'LargeUtf8'));
+----
+https://spark.apache.org
+
+# Test with Utf8View
+query T
+SELECT url_decode(arrow_cast('https%3A%2F%2Fspark.apache.org', 'Utf8View'));
+----
+https://spark.apache.org
+
+# Non-ASCII string
+query T
+SELECT url_decode('%E4%BD%A0%E5%A5%BD')
+----
+你好
+
+# Empty string
+query T
+SELECT url_decode('');
+----
+(empty)
+
+# Null value
+query T
+SELECT url_decode(NULL::string);
+----
+NULL
+
+# Roundtrip with url_encode
+query T
+SELECT url_decode(url_encode('Spark SQL ~!@#$%^&*()'));
+----
+Spark SQL ~!@#$%^&*()
+
+# Plus replacement
+query T
+SELECT url_decode('Spark+SQL%21');
+----
+Spark SQL!
+
+# Invalid percent encoding case
+query error DataFusion error: Execution error: Invalid percent\-encoding: invalid hex sequence '%3%' at position 5
+SELECT url_decode('https%3%2F%2Fspark.apache.org'::string);
diff --git a/datafusion/sqllogictest/test_files/spark/url/url_encode.slt b/datafusion/sqllogictest/test_files/spark/url/url_encode.slt
index 6aef87dcb4..3d7a42f193 100644
--- a/datafusion/sqllogictest/test_files/spark/url/url_encode.slt
+++ b/datafusion/sqllogictest/test_files/spark/url/url_encode.slt
@@ -15,13 +15,19 @@
# specific language governing permissions and limitations
# under the License.
-# This file was originally created by a porting script from:
-# https://github.com/lakehq/sail/tree/43b6ed8221de5c4c4adbedbb267ae1351158b43c/crates/sail-spark-connect/tests/gold_data/function
-# This file is part of the implementation of the datafusion-spark function library.
-# For more information, please see:
-# https://github.com/apache/datafusion/issues/15914
+query T
+SELECT url_encode('https://spark.apache.org');
+----
+https%3A%2F%2Fspark.apache.org
-## Original Query: SELECT url_encode('https://spark.apache.org');
-## PySpark 3.5.5 Result: {'url_encode(https://spark.apache.org)': 'https%3A%2F%2Fspark.apache.org', 'typeof(url_encode(https://spark.apache.org))': 'string', 'typeof(https://spark.apache.org)': 'string'}
-#query
-#SELECT url_encode('https://spark.apache.org'::string);
+# Test with LargeUtf8
+query T
+SELECT url_encode(arrow_cast('https://spark.apache.org', 'LargeUtf8'));
+----
+https%3A%2F%2Fspark.apache.org
+
+# Test with Utf8View
+query T
+SELECT url_encode(arrow_cast('https://spark.apache.org', 'Utf8View'));
+----
+https%3A%2F%2Fspark.apache.org
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]