This is an automated email from the ASF dual-hosted git repository.

exmy pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/incubator-gluten.git


The following commit(s) were added to refs/heads/main by this push:
     new 6a0a279735 [GLUTEN-11012][CH] Support arabic-indic digits in 
unix_timestamp (#11013)
6a0a279735 is described below

commit 6a0a279735b00781764df3719b74f68be5bf149b
Author: lgbo <[email protected]>
AuthorDate: Mon Nov 10 10:16:42 2025 +0800

    [GLUTEN-11012][CH] Support arabic-indic digits in unix_timestamp (#11013)
---
 .../execution/GlutenFunctionValidateSuite.scala    |  35 ++++
 .../Functions/ArabicIndicToAsciiDigitForDate.cpp   | 208 +++++++++++++++++++++
 .../Parser/scalar_function_parser/getTimestamp.h   |   8 +-
 .../scalar_function_parser/unixTimestamp.cpp       |   1 -
 4 files changed, 250 insertions(+), 2 deletions(-)

diff --git 
a/backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenFunctionValidateSuite.scala
 
b/backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenFunctionValidateSuite.scala
index d2429a2192..fc6ea354e0 100644
--- 
a/backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenFunctionValidateSuite.scala
+++ 
b/backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenFunctionValidateSuite.scala
@@ -1390,4 +1390,39 @@ class GlutenFunctionValidateSuite extends 
GlutenClickHouseWholeStageTransformerS
     val sql = "select map(string_field1, int_field1) from json_test where 
string_field1 is not null"
     compareResultsAgainstVanillaSpark(sql, true, { _ => })
   }
+
+  // Checks that unix_timestamp() accepts dates written with Arabic-Indic digits
+  // (U+0660..U+0669) and yields the same result as vanilla Spark.
+  test("arabic_indic digit date") {
+    // ConstantFolding and NullPropagation are excluded from the optimizer and the
+    // legacy time parser policy is enabled for the duration of this test.
+    // NOTE(review): presumably the rule exclusion keeps the constant-input query below
+    // from being pre-evaluated by Spark's optimizer - confirm.
+    withSQLConf(
+      SQLConf.OPTIMIZER_EXCLUDED_RULES.key ->
+        (ConstantFolding.ruleName + "," + NullPropagation.ruleName),
+      ("spark.sql.legacy.timeParserPolicy", "LEGACY")) {
+      sql("create table tb_arabic_date(d string) using parquet")
+      // The values are base64 so the source file stays ASCII-only:
+      //   2aLZoNmi2aQt2aDZpi3ZoNmh -> "2024-06-01" written with Arabic-Indic digits
+      //   2aLZoNmi2aQt2aHZoi3Zo9mh -> "2024-12-31" written with Arabic-Indic digits (inserted twice)
+      sql("""
+            |insert into tb_arabic_date values
+            |'2aLZoNmi2aQt2aDZpi3ZoNmh',
+            |'2aLZoNmi2aQt2aHZoi3Zo9mh',
+            |'2aLZoNmi2aQt2aHZoi3Zo9mh'
+            |""".stripMargin)
+      // Case 1: the Arabic-Indic date string comes from a table column.
+      var query_sql = """
+                        |select
+                        |from_unixtime(unix_timestamp(cast(unbase64(d) as 
string), 'yyyy-MM-dd'))
+                        |from tb_arabic_date
+                        |""".stripMargin
+      compareResultsAgainstVanillaSpark(query_sql, true, { _ => })
+
+      // Case 2: the Arabic-Indic date string is a constant expression.
+      query_sql = """
+                    |select from_unixtime(
+                    | unix_timestamp(cast(unbase64('2aLZoNmi2aQt2aDZpi3ZoNmh') 
as string),
+                    | 'yyyy-MM-dd'))
+                    |""".stripMargin
+      compareResultsAgainstVanillaSpark(query_sql, true, { _ => })
+
+      // Case 3: a plain ASCII date must keep working unchanged.
+      query_sql = """
+                    |select from_unixtime(unix_timestamp('2020-01-01', 
'yyyy-MM-dd'))
+                    |""".stripMargin
+      compareResultsAgainstVanillaSpark(query_sql, true, { _ => })
+
+      sql("drop table tb_arabic_date")
+    }
+  }
 }
diff --git a/cpp-ch/local-engine/Functions/ArabicIndicToAsciiDigitForDate.cpp 
b/cpp-ch/local-engine/Functions/ArabicIndicToAsciiDigitForDate.cpp
new file mode 100644
index 0000000000..94332c3a5f
--- /dev/null
+++ b/cpp-ch/local-engine/Functions/ArabicIndicToAsciiDigitForDate.cpp
@@ -0,0 +1,208 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+#include <Columns/IColumn.h>
+#include <Columns/ColumnNullable.h>
+#include <Columns/ColumnConst.h>
+#include <Columns/ColumnString.h>
+#include <DataTypes/DataTypeNullable.h>
+#include <DataTypes/IDataType.h>
+#include <Functions/FunctionFactory.h>
+#include <Functions/FunctionHelpers.h>
+#include <Functions/IFunction.h>
+#include <Common/Exception.h>
+#include <Common/logger_useful.h>
+
+namespace DB
+{
+namespace ErrorCodes
+{
+extern const int ILLEGAL_TYPE_OF_ARGUMENT;
+}
+}
+
+namespace local_engine
+{
+// Since Spark 3.3, unix_timestamp supports Arabic-Indic digits in its input, e.g. "٢٠٢١-٠٧-٠١ ١٢:٠٠:٠٠".
+// This function translates the Arabic-Indic digits (U+0660..U+0669) of a String column into
+// ASCII digits; every other byte of the input is passed through unchanged.
+class ArabicIndicToAsciiDigitForDateFunction : public DB::IFunction
+{
+public:
+    static constexpr auto name = "arabic_indic_to_ascii_digit_for_date";
+
+    static DB::FunctionPtr create(DB::ContextPtr) { return std::make_shared<ArabicIndicToAsciiDigitForDateFunction>(); }
+
+    String getName() const override { return name; }
+
+    bool isSuitableForShortCircuitArgumentsExecution(const DB::DataTypesWithConstInfo & /*arguments*/) const override { return false; }
+    size_t getNumberOfArguments() const override { return 1; }
+
+    /// The result type is the argument type itself (String or Nullable(String)).
+    /// Throws ILLEGAL_TYPE_OF_ARGUMENT for any other argument type.
+    DB::DataTypePtr getReturnTypeImpl(const DB::DataTypes & arguments) const override
+    {
+        auto nested_type = DB::removeNullable(arguments[0]);
+        if (!DB::WhichDataType(nested_type).isString())
+            throw DB::Exception(DB::ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Argument for function {} must be String, but got {}", getName(), arguments[0]->getName());
+        return arguments[0];
+    }
+
+    /// Converts the single String / Nullable(String) argument.
+    /// - Constant argument: the constant is converted once and re-wrapped as a constant column.
+    /// - Full column: if no row passes the hasArabicIndicDigit() pre-check the input column is
+    ///   returned as is; otherwise the rows before the first hit are copied verbatim and all
+    ///   remaining non-NULL rows are converted.
+    /// Throws ILLEGAL_TYPE_OF_ARGUMENT if the (nested) column is not a ColumnString.
+    DB::ColumnPtr executeImpl(const DB::ColumnsWithTypeAndName & arguments, const DB::DataTypePtr &, size_t input_rows_count) const override
+    {
+        auto data_col = arguments[0].column;
+        const DB::NullMap * null_map = nullptr;
+
+        if (data_col->isConst())
+        {
+            // A constant NULL has nothing to convert.
+            if (data_col->isNullAt(0))
+                return data_col;
+
+            // arguments[0].column keeps the constant column alive while data_col is re-pointed at its payload.
+            const DB::ColumnConst * col_const = DB::checkAndGetColumn<DB::ColumnConst>(data_col.get());
+            data_col = col_const->getDataColumnPtr();
+            const DB::ColumnString * const_str = extractStringColumn(data_col.get(), null_map);
+            if (!const_str)
+                throw DB::Exception(DB::ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Argument for function {} must be String, but got {}", getName(), data_col->getName());
+
+            const auto new_str = convertArabicIndicDigit(const_str->getDataAt(0));
+            auto new_data_col = data_col->cloneEmpty();
+            new_data_col->insertData(new_str.c_str(), new_str.size());
+            return DB::ColumnConst::create(std::move(new_data_col), input_rows_count);
+        }
+
+        const DB::ColumnString * col_str = extractStringColumn(data_col.get(), null_map);
+        if (!col_str)
+            throw DB::Exception(DB::ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Argument for function {} must be String, but got {}", getName(), data_col->getName());
+
+        // Find the first non-NULL row that may need conversion.
+        size_t row_index = 0;
+        for (; row_index < input_rows_count; ++row_index)
+        {
+            if (null_map && (*null_map)[row_index])
+                continue;
+            if (hasArabicIndicDigit(col_str->getDataAt(row_index)))
+                break;
+        }
+
+        // No candidate row found, return the original column untouched.
+        if (row_index == input_rows_count)
+            return arguments[0].column;
+
+        auto res_col = data_col->cloneEmpty();
+        // Copy the leading rows [0, row_index) verbatim. This must be insertRangeFrom:
+        // insertManyFrom would append row 0 `row_index` times instead of the row range.
+        if (row_index)
+            res_col->insertRangeFrom(*data_col, 0, row_index);
+
+        for (; row_index < input_rows_count; ++row_index)
+        {
+            if (null_map && (*null_map)[row_index])
+            {
+                res_col->insertDefault();
+                continue;
+            }
+            const auto converted = convertArabicIndicDigit(col_str->getDataAt(row_index));
+            res_col->insertData(converted.c_str(), converted.size());
+        }
+        return res_col;
+    }
+
+private:
+    // UTF-8 encodes U+0660..U+0669 as the lead byte 0xD9 followed by 0xA0..0xA9.
+    static constexpr unsigned char ARABIC_INDIC_LEAD_BYTE = 0xD9;
+    static constexpr unsigned char ARABIC_INDIC_ZERO_TRAIL_BYTE = 0xA0;
+    static constexpr unsigned char ARABIC_INDIC_NINE_TRAIL_BYTE = 0xA9;
+
+    /// Returns the ColumnString behind `column`, looking through one Nullable layer.
+    /// `null_map` is set to the null map of that layer, or to nullptr when `column` is not Nullable.
+    /// Returns nullptr when the (nested) column is not a ColumnString.
+    static const DB::ColumnString * extractStringColumn(const DB::IColumn * column, const DB::NullMap *& null_map)
+    {
+        if (column->isNullable())
+        {
+            const DB::ColumnNullable * col_nullable = DB::checkAndGetColumn<DB::ColumnNullable>(column);
+            null_map = &(col_nullable->getNullMapData());
+            return DB::checkAndGetColumn<DB::ColumnString>(&(col_nullable->getNestedColumn()));
+        }
+        null_map = nullptr;
+        return DB::checkAndGetColumn<DB::ColumnString>(column);
+    }
+
+    /// Cheap pre-check, not an exact test: a value whose first byte is an ASCII digit is assumed
+    /// to be a plain ASCII date, anything else is treated as a conversion candidate.
+    /// Empty values are never candidates (and their first byte is not read).
+    bool hasArabicIndicDigit(StringRef str) const
+    {
+        if (str.size == 0)
+            return false;
+        const char first = str.data[0];
+        return !('0' <= first && first <= '9');
+    }
+
+    /// Returns a copy of `str` in which every UTF-8 encoded Arabic-Indic digit is replaced by
+    /// the matching ASCII digit. The scan is byte-wise: 0xD9 can only be the lead byte of a
+    /// two-byte sequence in valid UTF-8, so matching the byte pair is exact, and all other
+    /// bytes (including other multi-byte characters and malformed input) are copied unchanged.
+    /// The loop always advances, so malformed input cannot stall it or be read out of bounds.
+    String convertArabicIndicDigit(const StringRef & str) const
+    {
+        String result;
+        result.reserve(str.size);
+        for (size_t i = 0; i < str.size;)
+        {
+            const auto lead = static_cast<unsigned char>(str.data[i]);
+            if (lead == ARABIC_INDIC_LEAD_BYTE && i + 1 < str.size)
+            {
+                const auto trail = static_cast<unsigned char>(str.data[i + 1]);
+                if (trail >= ARABIC_INDIC_ZERO_TRAIL_BYTE && trail <= ARABIC_INDIC_NINE_TRAIL_BYTE)
+                {
+                    result.push_back(static_cast<char>('0' + (trail - ARABIC_INDIC_ZERO_TRAIL_BYTE)));
+                    i += 2;
+                    continue;
+                }
+            }
+            result.push_back(str.data[i]);
+            ++i;
+        }
+        return result;
+    }
+};
+
+// Brings the DB namespace into scope for the registration macro below.
+// NOTE(review): presumably REGISTER_FUNCTION expands to code that names DB types
+// (e.g. the factory) unqualified - confirm against the macro definition.
+using namespace DB;
+// Registers ArabicIndicToAsciiDigitForDateFunction with the function factory.
+REGISTER_FUNCTION(ArabicIndicToAsciiDigitForDate)
+{
+    factory.registerFunction<ArabicIndicToAsciiDigitForDateFunction>();
+}
+}
diff --git a/cpp-ch/local-engine/Parser/scalar_function_parser/getTimestamp.h 
b/cpp-ch/local-engine/Parser/scalar_function_parser/getTimestamp.h
index cbe83e5acc..53e6a0e6e6 100644
--- a/cpp-ch/local-engine/Parser/scalar_function_parser/getTimestamp.h
+++ b/cpp-ch/local-engine/Parser/scalar_function_parser/getTimestamp.h
@@ -61,7 +61,7 @@ public:
         auto parsed_args = parseFunctionArguments(substrait_func, actions_dag);
         if (parsed_args.size() != 2)
             throw 
DB::Exception(DB::ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, "Function {} 
requires exactly two arguments", getName());
-        const auto * expr_arg = parsed_args[0];
+        const auto * expr_arg = convertArabicIndicDigit(actions_dag, 
parsed_args[0]);
         const auto * fmt_arg = parsed_args[1];
         
         const auto & args = substrait_func.arguments();
@@ -129,5 +129,11 @@ private:
             return std::regex_match(fmt, fmtPattern);
         }
     }
+
+    /// Adds a call of `arabic_indic_to_ascii_digit_for_date` on `node` to `actions_dag`
+    /// and returns the node of that function call.
+    const DB::ActionsDAG::Node * convertArabicIndicDigit(DB::ActionsDAG & actions_dag, const DB::ActionsDAG::Node * node) const
+    {
+        return toFunctionNode(actions_dag, "arabic_indic_to_ascii_digit_for_date", {node});
+    }
 };
 }
diff --git 
a/cpp-ch/local-engine/Parser/scalar_function_parser/unixTimestamp.cpp 
b/cpp-ch/local-engine/Parser/scalar_function_parser/unixTimestamp.cpp
index 3ac0babf58..37f309a2d6 100644
--- a/cpp-ch/local-engine/Parser/scalar_function_parser/unixTimestamp.cpp
+++ b/cpp-ch/local-engine/Parser/scalar_function_parser/unixTimestamp.cpp
@@ -59,7 +59,6 @@ public:
             throw 
DB::Exception(DB::ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, "Function {} 
requires exactly two arguments", getName());
 
         const auto * expr_arg = parsed_args[0];
-        const auto * fmt_arg = parsed_args[1];
         auto expr_type = removeNullable(expr_arg->result_type);
         if (isString(expr_type))
             return FunctionParserGetTimestamp::parse(substrait_func, 
actions_dag);


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to