linrrzqqq commented on code in PR #60799:
URL: https://github.com/apache/doris/pull/60799#discussion_r2899003759
##########
be/src/vec/functions/function_string.cpp:
##########
@@ -290,6 +290,275 @@ struct FindInSetOp {
}
};
+struct NameLevenshtein {
+ static constexpr auto name = "levenshtein";
+};
+
+struct LevenshteinOp {
+ using ResultDataType = DataTypeInt32;
+ using ResultPaddedPODArray = PaddedPODArray<Int32>;
+
+ static void execute(const std::string_view& s, const std::string_view& t,
int32_t& res) {
+ const size_t m = s.size();
+ const size_t n = t.size();
+ if (m == 0) {
+ res = static_cast<int32_t>(n);
+ return;
+ }
+ if (n == 0) {
+ res = static_cast<int32_t>(m);
+ return;
+ }
+ if (s == t) {
+ res = 0;
+ return;
+ }
+ // Guard against excessive O(m*n) computation on very large inputs.
+ // Limit each string to 65535 bytes (max VARCHAR length).
+ constexpr size_t MAX_INPUT_LEN = 65535;
+ if (m > MAX_INPUT_LEN || n > MAX_INPUT_LEN) {
+ throw doris::Exception(ErrorCode::INVALID_ARGUMENT,
+ "Input string too long for levenshtein, max
{} bytes",
+ MAX_INPUT_LEN);
+ }
+ constexpr size_t STACK_MAX = 512;
+ int32_t prev_stk[STACK_MAX + 1], curr_stk[STACK_MAX + 1];
+ std::vector<int32_t> prev_heap, curr_heap;
+ int32_t* prev_row;
+ int32_t* curr_row;
+ if (n <= STACK_MAX) {
+ prev_row = prev_stk;
+ curr_row = curr_stk;
+ } else {
+ prev_heap.resize(n + 1);
+ curr_heap.resize(n + 1);
+ prev_row = prev_heap.data();
+ curr_row = curr_heap.data();
+ }
+ for (size_t j = 0; j <= n; ++j) prev_row[j] = static_cast<int32_t>(j);
+ for (size_t i = 1; i <= m; ++i) {
+ curr_row[0] = static_cast<int32_t>(i);
+ for (size_t j = 1; j <= n; ++j) {
+ if (s[i - 1] == t[j - 1]) {
+ curr_row[j] = prev_row[j - 1];
+ } else {
+ curr_row[j] = 1 + std::min({prev_row[j - 1], prev_row[j],
curr_row[j - 1]});
+ }
+ }
+ std::swap(prev_row, curr_row);
+ }
+ res = prev_row[n];
+ }
+};
+
+struct NameDamerauLevenshtein {
+ static constexpr auto name = "damerau_levenshtein";
+};
+
+struct DamerauLevenshteinOp {
+ using ResultDataType = DataTypeInt32;
+ using ResultPaddedPODArray = PaddedPODArray<Int32>;
+
+ static void execute(const std::string_view& s, const std::string_view& t,
int32_t& res) {
+ const size_t m = s.size(), n = t.size();
+ if (m == 0) {
+ res = static_cast<int32_t>(n);
+ return;
+ }
+ if (n == 0) {
+ res = static_cast<int32_t>(m);
+ return;
+ }
+ // Guard against OOM: the DP matrix allocates
(m+2)*(n+2)*sizeof(int32_t) bytes.
+ // At 10000 chars each that is ~400 MB; cap well below VARCHAR max
(65535).
+ constexpr size_t MAX_INPUT_LEN = 10000;
+ if (m > MAX_INPUT_LEN || n > MAX_INPUT_LEN) {
+ throw doris::Exception(ErrorCode::INVALID_ARGUMENT,
+ "Input string too long for
damerau_levenshtein, max {} bytes",
+ MAX_INPUT_LEN);
+ }
+ const size_t stride = n + 2;
+ const int32_t max_dist = static_cast<int32_t>(m + n);
+ std::vector<int32_t> d((m + 2) * stride, 0);
+ d[0] = max_dist;
+ for (size_t i = 0; i <= m; ++i) {
+ d[(i + 1) * stride + 0] = max_dist;
+ d[(i + 1) * stride + 1] = static_cast<int32_t>(i);
+ }
+ for (size_t j = 0; j <= n; ++j) {
+ d[0 * stride + (j + 1)] = max_dist;
+ d[1 * stride + (j + 1)] = static_cast<int32_t>(j);
+ }
+ int32_t da[256] = {};
+ for (size_t i = 1; i <= m; ++i) {
+ int32_t db = 0;
+ for (size_t j = 1; j <= n; ++j) {
+ const int32_t k = da[(uint8_t)t[j - 1]];
+ const int32_t l = db;
+ int32_t cost;
+ if (s[i - 1] == t[j - 1]) {
+ cost = 0;
+ db = static_cast<int32_t>(j);
+ } else {
+ cost = 1;
+ }
+ const int32_t trans = d[static_cast<size_t>(k) * stride +
static_cast<size_t>(l)] +
+ (static_cast<int32_t>(i) - k - 1) + 1 +
+ (static_cast<int32_t>(j) - l - 1);
+ d[(i + 1) * stride + (j + 1)] =
+ std::min({d[i * stride + j] + cost, d[(i + 1) * stride
+ j] + 1,
+ d[i * stride + (j + 1)] + 1, trans});
+ }
+ da[(uint8_t)s[i - 1]] = static_cast<int32_t>(i);
+ }
+ res = d[(m + 1) * stride + (n + 1)];
+ }
+};
+
+struct NameJaroWinkler {
+ static constexpr auto name = "jaro_winkler";
+};
+
+struct JaroWinklerOp {
Review Comment:
ditto in other function
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]