Re: [PR] [feat](func) Add levenshtein, damerau_levenshtein, jaro_winkler, jaccard_similarity built-in scalar functions [doris]

via GitHub Tue, 24 Feb 2026 22:07:17 -0800


Copilot commented on code in PR #60799:
URL: https://github.com/apache/doris/pull/60799#discussion_r2851043611



##########
be/src/vec/functions/function_string.cpp:
##########
@@ -290,6 +290,249 @@ struct FindInSetOp {
     }
 };
 
+struct NameLevenshtein {
+    static constexpr auto name = "levenshtein";
+};
+
+struct LevenshteinOp {
+    using ResultDataType = DataTypeInt32;
+    using ResultPaddedPODArray = PaddedPODArray<Int32>;
+
+    static void execute(const std::string_view& s, const std::string_view& t, 
int32_t& res) {
+        const size_t m = s.size();
+        const size_t n = t.size();

Review Comment:
   `LevenshteinOp` is O(m×n) time and can become prohibitively expensive on 
large VARCHAR/STRING inputs (e.g., tens of thousands of bytes). Consider adding 
a length/cost guard (similar to other expensive functions) to prevent runaway 
CPU usage or query timeouts when users pass very long strings.
   ```suggestion
           const size_t n = t.size();
           // Guard against excessively expensive O(m * n) computation on very large inputs.
           // If the estimated cost exceeds the threshold, return the maximal possible
           // Levenshtein distance for these lengths (max(m, n)) without running DP.
           constexpr size_t MAX_LEVENSHTEIN_COST = 4 * 1024 * 1024; // tunable upper bound
           if (static_cast<unsigned long long>(m) * static_cast<unsigned long long>(n) >
               static_cast<unsigned long long>(MAX_LEVENSHTEIN_COST)) {
               res = static_cast<int32_t>(std::max(m, n));
               return;
           }
   ```



##########
be/src/vec/functions/function_string.cpp:
##########
@@ -290,6 +290,249 @@ struct FindInSetOp {
     }
 };
 
+struct NameLevenshtein {
+    static constexpr auto name = "levenshtein";
+};
+
+struct LevenshteinOp {
+    using ResultDataType = DataTypeInt32;
+    using ResultPaddedPODArray = PaddedPODArray<Int32>;
+
+    static void execute(const std::string_view& s, const std::string_view& t, 
int32_t& res) {
+        const size_t m = s.size();
+        const size_t n = t.size();
+        if (m == 0) {
+            res = static_cast<int32_t>(n);
+            return;
+        }
+        if (n == 0) {
+            res = static_cast<int32_t>(m);
+            return;
+        }
+        if (s == t) {
+            res = 0;
+            return;
+        }
+        constexpr size_t STACK_MAX = 512;
+        int32_t prev_stk[STACK_MAX + 1], curr_stk[STACK_MAX + 1];
+        std::vector<int32_t> prev_heap, curr_heap;
+        int32_t* prev_row;
+        int32_t* curr_row;
+        if (n <= STACK_MAX) {
+            prev_row = prev_stk;
+            curr_row = curr_stk;
+        } else {
+            prev_heap.resize(n + 1);
+            curr_heap.resize(n + 1);
+            prev_row = prev_heap.data();
+            curr_row = curr_heap.data();
+        }
+        for (size_t j = 0; j <= n; ++j) prev_row[j] = static_cast<int32_t>(j);
+        for (size_t i = 1; i <= m; ++i) {
+            curr_row[0] = static_cast<int32_t>(i);
+            for (size_t j = 1; j <= n; ++j) {
+                if (s[i - 1] == t[j - 1]) {
+                    curr_row[j] = prev_row[j - 1];
+                } else {
+                    curr_row[j] = 1 + std::min({prev_row[j - 1], prev_row[j], 
curr_row[j - 1]});
+                }
+            }
+            std::swap(prev_row, curr_row);
+        }
+        res = prev_row[n];
+    }
+};
+
+struct NameDamerauLevenshtein {
+    static constexpr auto name = "damerau_levenshtein";
+};
+
+struct DamerauLevenshteinOp {
+    using ResultDataType = DataTypeInt32;
+    using ResultPaddedPODArray = PaddedPODArray<Int32>;
+
+    static void execute(const std::string_view& s, const std::string_view& t, 
int32_t& res) {
+        const size_t m = s.size(), n = t.size();
+        if (m == 0) {
+            res = static_cast<int32_t>(n);
+            return;
+        }
+        if (n == 0) {
+            res = static_cast<int32_t>(m);
+            return;
+        }
+        const size_t stride = n + 2;
+        const int32_t max_dist = static_cast<int32_t>(m + n);
+        std::vector<int32_t> d((m + 2) * stride, 0);
+        d[0] = max_dist;
+        for (size_t i = 0; i <= m; ++i) {
+            d[(i + 1) * stride + 0] = max_dist;
+            d[(i + 1) * stride + 1] = static_cast<int32_t>(i);
+        }
+        for (size_t j = 0; j <= n; ++j) {
+            d[0 * stride + (j + 1)] = max_dist;
+            d[1 * stride + (j + 1)] = static_cast<int32_t>(j);
+        }
+        int32_t da[256] = {};
+        for (size_t i = 1; i <= m; ++i) {
+            int32_t db = 0;
+            for (size_t j = 1; j <= n; ++j) {
+                const int32_t k = da[(uint8_t)t[j - 1]];
+                const int32_t l = db;
+                int32_t cost;
+                if (s[i - 1] == t[j - 1]) {
+                    cost = 0;
+                    db = static_cast<int32_t>(j);
+                } else {
+                    cost = 1;
+                }
+                const int32_t trans = d[static_cast<size_t>(k) * stride + 
static_cast<size_t>(l)] +
+                                      (static_cast<int32_t>(i) - k - 1) + 1 +
+                                      (static_cast<int32_t>(j) - l - 1);
+                d[(i + 1) * stride + (j + 1)] =
+                        std::min({d[i * stride + j] + cost, d[(i + 1) * stride 
+ j] + 1,
+                                  d[i * stride + (j + 1)] + 1, trans});
+            }
+            da[(uint8_t)s[i - 1]] = static_cast<int32_t>(i);
+        }
+        res = d[(m + 1) * stride + (n + 1)];
+    }
+};
+
+struct NameJaroWinkler {
+    static constexpr auto name = "jaro_winkler";
+};
+
+struct JaroWinklerOp {
+    using ResultDataType = DataTypeFloat64;
+    using ResultPaddedPODArray = PaddedPODArray<Float64>;
+
+    static void execute(const std::string_view& s, const std::string_view& t, 
double& res) {
+        if (s == t) {
+            res = 1.0;
+            return;
+        }
+        const size_t m = s.size(), n = t.size();
+        if (m == 0 || n == 0) {
+            res = 0.0;
+            return;
+        }
+        const size_t match_dist = std::max(m, n) / 2 - (std::max(m, n) >= 2 ? 
1 : 0);
+        constexpr size_t STACK_MAX = 512;
+        uint8_t s_stk[STACK_MAX] = {};
+        uint8_t t_stk[STACK_MAX] = {};
+        std::vector<uint8_t> s_heap, t_heap;
+        uint8_t* sm;
+        uint8_t* tm;
+        if (m <= STACK_MAX && n <= STACK_MAX) {
+            sm = s_stk;
+            tm = t_stk;
+        } else {
+            s_heap.assign(m, 0);
+            t_heap.assign(n, 0);
+            sm = s_heap.data();
+            tm = t_heap.data();
+        }
+        double matches = 0;
+        for (size_t i = 0; i < m; ++i) {
+            const size_t start = (i >= match_dist) ? i - match_dist : 0;
+            const size_t end = std::min(i + match_dist + 1, n);
+            for (size_t j = start; j < end; ++j) {
+                if (!tm[j] && s[i] == t[j]) {
+                    sm[i] = tm[j] = 1;
+                    ++matches;
+                    break;
+                }
+            }
+        }
+        if (matches == 0.0) {
+            res = 0.0;
+            return;
+        }
+        double transpositions = 0;
+        size_t k = 0;
+        for (size_t i = 0; i < m; ++i) {
+            if (sm[i]) {
+                while (!tm[k]) ++k;
+                if (s[i] != t[k]) ++transpositions;
+                ++k;
+            }
+        }
+        const double jaro = (matches / static_cast<double>(m) + matches / 
static_cast<double>(n) +
+                             (matches - transpositions / 2.0) / matches) /
+                            3.0;
+        size_t prefix = 0;
+        const size_t max_prefix = std::min(static_cast<size_t>(4), std::min(m, 
n));
+        while (prefix < max_prefix && s[prefix] == t[prefix]) ++prefix;
+        res = jaro + static_cast<double>(prefix) * 0.1 * (1.0 - jaro);
+    }
+};
+
+struct NameJaccardSimilarity {
+    static constexpr auto name = "jaccard_similarity";
+};
+
+struct JaccardSimilarityOp {
+    using ResultDataType = DataTypeFloat64;
+    using ResultPaddedPODArray = PaddedPODArray<Float64>;
+
+    static void execute(const std::string_view& s, const std::string_view& t, 
double& res) {
+        if (s == t) {
+            res = 1.0;
+            return;
+        }
+        if (s.size() < 2 && t.size() < 2) {
+            res = 0.0;
+            return;
+        }

Review Comment:
   In `JaccardSimilarityOp`, the two consecutive checks for `size() < 2` both 
return 0.0 (except for the earlier `s == t` fast path), so one of them is 
redundant. Simplifying this branch would make the edge-case behavior easier to 
follow and slightly reduce branching.
   ```suggestion
   
   ```



##########
fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/scalar/JaccardSimilarity.java:
##########
@@ -0,0 +1,78 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+package org.apache.doris.nereids.trees.expressions.functions.scalar;
+
+import org.apache.doris.catalog.FunctionSignature;
+import org.apache.doris.nereids.trees.expressions.Expression;
+import 
org.apache.doris.nereids.trees.expressions.functions.ExplicitlyCastableSignature;
+import org.apache.doris.nereids.trees.expressions.functions.PropagateNullable;
+import org.apache.doris.nereids.trees.expressions.shape.BinaryExpression;
+import org.apache.doris.nereids.trees.expressions.visitor.ExpressionVisitor;
+import org.apache.doris.nereids.types.DoubleType;
+import org.apache.doris.nereids.types.StringType;
+import org.apache.doris.nereids.types.VarcharType;
+
+import com.google.common.base.Preconditions;
+import com.google.common.collect.ImmutableList;
+
+import java.util.List;
+
+/**
+ * ScalarFunction 'jaccard_similarity'. Returns Jaccard similarity (0.0-1.0) 
based on character bigrams.

Review Comment:
   The Javadoc says Jaccard similarity is based on “character bigrams”, but the 
implementation in BE builds bigrams from raw bytes. Please align the 
documentation with the actual semantics (e.g., “byte bigrams”) to avoid 
confusion for UTF-8 input.
   ```suggestion
    * ScalarFunction 'jaccard_similarity'. Returns Jaccard similarity (0.0-1.0) based on byte bigrams of the input strings.
   ```



##########
be/src/vec/functions/function_string.cpp:
##########
@@ -290,6 +290,249 @@ struct FindInSetOp {
     }
 };
 
+struct NameLevenshtein {
+    static constexpr auto name = "levenshtein";
+};
+
+struct LevenshteinOp {
+    using ResultDataType = DataTypeInt32;
+    using ResultPaddedPODArray = PaddedPODArray<Int32>;
+
+    static void execute(const std::string_view& s, const std::string_view& t, 
int32_t& res) {
+        const size_t m = s.size();
+        const size_t n = t.size();
+        if (m == 0) {
+            res = static_cast<int32_t>(n);
+            return;
+        }
+        if (n == 0) {
+            res = static_cast<int32_t>(m);
+            return;
+        }
+        if (s == t) {
+            res = 0;
+            return;
+        }
+        constexpr size_t STACK_MAX = 512;
+        int32_t prev_stk[STACK_MAX + 1], curr_stk[STACK_MAX + 1];
+        std::vector<int32_t> prev_heap, curr_heap;
+        int32_t* prev_row;
+        int32_t* curr_row;
+        if (n <= STACK_MAX) {
+            prev_row = prev_stk;
+            curr_row = curr_stk;
+        } else {
+            prev_heap.resize(n + 1);
+            curr_heap.resize(n + 1);
+            prev_row = prev_heap.data();
+            curr_row = curr_heap.data();
+        }
+        for (size_t j = 0; j <= n; ++j) prev_row[j] = static_cast<int32_t>(j);
+        for (size_t i = 1; i <= m; ++i) {
+            curr_row[0] = static_cast<int32_t>(i);
+            for (size_t j = 1; j <= n; ++j) {
+                if (s[i - 1] == t[j - 1]) {
+                    curr_row[j] = prev_row[j - 1];
+                } else {
+                    curr_row[j] = 1 + std::min({prev_row[j - 1], prev_row[j], 
curr_row[j - 1]});
+                }
+            }
+            std::swap(prev_row, curr_row);
+        }
+        res = prev_row[n];
+    }
+};
+
+struct NameDamerauLevenshtein {
+    static constexpr auto name = "damerau_levenshtein";
+};
+
+struct DamerauLevenshteinOp {
+    using ResultDataType = DataTypeInt32;
+    using ResultPaddedPODArray = PaddedPODArray<Int32>;
+
+    static void execute(const std::string_view& s, const std::string_view& t, 
int32_t& res) {
+        const size_t m = s.size(), n = t.size();
+        if (m == 0) {
+            res = static_cast<int32_t>(n);
+            return;
+        }
+        if (n == 0) {
+            res = static_cast<int32_t>(m);
+            return;
+        }
+        const size_t stride = n + 2;
+        const int32_t max_dist = static_cast<int32_t>(m + n);
+        std::vector<int32_t> d((m + 2) * stride, 0);
+        d[0] = max_dist;

Review Comment:
   `DamerauLevenshteinOp` allocates a full (m+2)×(n+2) `int32_t` matrix based 
directly on input lengths. With VARCHAR potentially up to 65533 bytes, this can 
attempt multi‑GB allocations (or overflow size computations) and lead to 
OOM/instability. Please add an explicit guard (max input length and/or max 
matrix cells with overflow-safe multiplication) and fail gracefully (e.g., 
throw a controlled exception or return NULL) before allocating.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Re: [PR] [feat](func) Add levenshtein, damerau_levenshtein, jaro_winkler, jaccard_similarity built-in scalar functions [doris]

Reply via email to