This is an automated email from the ASF dual-hosted git repository.
zhangstar333 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push:
new 8843efc1a3d [feature](function)support url domain functions (#42488)
8843efc1a3d is described below
commit 8843efc1a3d28518e76b1681ca54a0715e491213
Author: zhangstar333 <[email protected]>
AuthorDate: Wed Oct 30 17:33:39 2024 +0800
[feature](function)support url domain functions (#42488)
## Proposed changes
Support the URL domain functions top_level_domain, first_significant_subdomain
and cut_to_first_significant_subdomain.
doc: https://github.com/apache/doris-website/pull/1230
<!--Describe your changes.-->
---
be/src/vec/functions/url/domain.h | 127 +++++-
be/src/vec/functions/url/find_symbols.h | 481 +++++++++++++++++++++
be/src/vec/functions/url/function_url.cpp | 23 +
be/src/vec/functions/url/functions_url.h | 11 -
be/src/vec/functions/url/tldLookup.generated.cpp | 140 ++++++
be/src/vec/functions/url/tldLookup.h | 34 ++
.../doris/catalog/BuiltinScalarFunctions.java | 6 +
.../scalar/CutToFirstSignificantSubdomain.java | 68 +++
.../scalar/FirstSignificantSubdomain.java | 68 +++
.../functions/scalar/TopLevelDomain.java | 68 +++
.../expressions/visitor/ScalarFunctionVisitor.java | 16 +
gensrc/script/doris_builtins_functions.py | 5 +-
.../string_functions/test_url_functions.out | 121 ++++++
.../string_functions/test_url_functions.groovy | 79 ++++
14 files changed, 1234 insertions(+), 13 deletions(-)
diff --git a/be/src/vec/functions/url/domain.h
b/be/src/vec/functions/url/domain.h
index 54361134eff..b2ec5e0c9d9 100644
--- a/be/src/vec/functions/url/domain.h
+++ b/be/src/vec/functions/url/domain.h
@@ -20,11 +20,12 @@
#pragma once
-// #include <base/find_symbols.h>
#include <cstring>
#include "vec/common/string_utils/string_utils.h"
+#include "vec/functions/url/find_symbols.h"
#include "vec/functions/url/protocol.h"
+#include "vec/functions/url/tldLookup.h"
namespace doris::vectorized {
@@ -144,4 +145,128 @@ struct ExtractDomain {
}
};
+/// Extracts the top-level domain of a URL: the part of the host after its last dot.
+struct ExtractTopLevelDomain {
+    static size_t get_reserve_length_for_element() { return 5; }
+
+    /// On return [res_data, res_data + res_size) is the selected substring.
+    /// The result is empty (res_data == data, res_size == 0) when get_url_host() yields an
+    /// empty host, when the host contains no dot, or when the character after the last dot
+    /// is an ASCII digit.
+    static void execute(const char* data, size_t size, const char*& res_data, size_t& res_size) {
+        res_data = data;
+        res_size = 0;
+
+        StringRef host = get_url_host(data, size);
+        if (host.size == 0) {
+            return;
+        }
+
+        /// One trailing dot ("example.com.") does not belong to the top-level domain.
+        auto trimmed = host.to_string_view();
+        if (trimmed.back() == '.') {
+            trimmed.remove_suffix(1);
+        }
+
+        const char* const trimmed_begin = trimmed.data();
+        const char* const trimmed_end = trimmed_begin + trimmed.size();
+        const char* const dot = find_last_symbols_or_null<'.'>(trimmed_begin, trimmed_end);
+        if (dot == nullptr) {
+            return;
+        }
+
+        /// For IPv4 addresses select nothing.
+        ///
+        /// NOTE: reading dot[1] stays inside the host. A dot can only be the last character
+        /// of `trimmed` when a trailing dot was cut off above, and in that case dot[1] is
+        /// exactly that cut-off dot.
+        if (is_numeric_ascii(dot[1])) {
+            return;
+        }
+
+        res_data = dot + 1;
+        res_size = trimmed_end - res_data;
+    }
+};
+
+/// Extracts the "first significant subdomain" from the domain of a URL.
+///
+/// With D being the domain reported by ExtractDomain<true> (one trailing dot removed):
+///  - D has no dot: the whole of D is selected;
+///  - D has one dot: the label before that dot is selected;
+///  - otherwise, if the text after the second-to-last dot is found by tldLookup::is_valid,
+///    the label before that text is selected; if not, the label between the last two dots.
+struct ExtractFirstSignificantSubdomain {
+    static size_t get_reserve_length_for_element() { return 10; }
+
+    /// @param data, size      the URL.
+    /// @param res_data        receives the start of the selected label (data if nothing found).
+    /// @param res_size        receives the length of the selected label (0 if nothing found).
+    /// @param out_domain_end  optional; when the domain is not empty, receives the end of the
+    ///                        domain as reported by ExtractDomain (trailing dot included).
+    static void execute(const Pos data, const size_t size, Pos& res_data, size_t& res_size,
+                        Pos* out_domain_end = nullptr) {
+        res_data = data;
+        res_size = 0;
+
+        Pos tmp = data;
+        size_t domain_length = 0;
+        ExtractDomain<true>::execute(data, size, tmp, domain_length);
+
+        if (domain_length == 0) {
+            return;
+        }
+        if (out_domain_end) {
+            *out_domain_end = tmp + domain_length;
+        }
+
+        /// cut useless dot
+        if (tmp[domain_length - 1] == '.') {
+            --domain_length;
+        }
+
+        res_data = tmp;
+        res_size = domain_length;
+
+        const auto* begin = tmp;
+        const auto* end = begin + domain_length;
+
+        /// last_periods[0] is the last dot seen so far, [1] the one before it, [2] the one
+        /// before that; nullptr means "no such dot".
+        std::array<const char*, 3> last_periods {};
+
+        const auto* pos = find_first_symbols<'.'>(begin, end);
+        while (pos < end) {
+            last_periods[2] = last_periods[1];
+            last_periods[1] = last_periods[0];
+            last_periods[0] = pos;
+            pos = find_first_symbols<'.'>(pos + 1, end);
+        }
+
+        if (!last_periods[0]) {
+            return;
+        }
+
+        if (!last_periods[1]) {
+            res_size = last_periods[0] - begin;
+            return;
+        }
+
+        /// find_first_symbols returns `end` when there is no '/', so no null check is needed.
+        const auto* end_of_level_domain = find_first_symbols<'/'>(last_periods[0], end);
+
+        const auto* suffix_begin = last_periods[1] + 1;
+        const auto suffix_len = static_cast<size_t>(end_of_level_domain - suffix_begin);
+        if (tldLookup::is_valid(suffix_begin, suffix_len)) {
+            /// The label before the suffix starts after the third-to-last dot, or at the
+            /// beginning of the domain when there are only two dots. Computed without
+            /// forming the out-of-range pointer `begin - 1`.
+            const auto* label_begin = last_periods[2] ? last_periods[2] + 1 : begin;
+            res_data = label_begin;
+            res_size = last_periods[1] - label_begin;
+        } else {
+            res_data = suffix_begin;
+            res_size = last_periods[0] - suffix_begin;
+        }
+    }
+};
+
+/// Selects the part of the domain that starts at the first significant subdomain (as chosen
+/// by ExtractFirstSignificantSubdomain) and runs to the end of the domain.
+struct CutToFirstSignificantSubdomain {
+    static size_t get_reserve_length_for_element() { return 15; }
+
+    /// On return [res_data, res_data + res_size) is the selected substring; it is empty
+    /// (res_data == data, res_size == 0) when no significant subdomain was found.
+    static void execute(const Pos data, const size_t size, Pos& res_data, size_t& res_size) {
+        res_data = data;
+        res_size = 0;
+
+        Pos tmp_data = data;
+        /// Initialised here so the check below never depends on the callee having written it.
+        size_t tmp_length = 0;
+        Pos domain_end = data;
+        ExtractFirstSignificantSubdomain::execute(data, size, tmp_data, tmp_length, &domain_end);
+
+        if (tmp_length == 0) {
+            return;
+        }
+        res_data = tmp_data;
+        res_size = domain_end - tmp_data;
+    }
+};
} // namespace doris::vectorized
diff --git a/be/src/vec/functions/url/find_symbols.h
b/be/src/vec/functions/url/find_symbols.h
new file mode 100644
index 00000000000..7af95ce06bd
--- /dev/null
+++ b/be/src/vec/functions/url/find_symbols.h
@@ -0,0 +1,481 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+// This file is copied from
+//
https://github.com/ClickHouse/ClickHouse/blob/master/base/base/find_symbols.h
+// and modified by Doris
+
+#pragma once
+
+#include <array>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+#include <stdexcept>
+#include <string>
+#include <string_view>
+#include <utility>
+
+#if defined(__SSE2__)
+#include <emmintrin.h>
+#endif
+#if defined(__SSE4_2__)
+#include <nmmintrin.h>
+#endif
+
+/** find_first_symbols<c1, c2, ...>(begin, end):
+ *
+ * Searches a memory range for the next character that belongs to the set 'symbols...'.
+ * It is similar to 'strpbrk', 'strcspn' (and 'strchr', 'memchr' in the case of one symbol and '\0'),
+ * but with the following differences:
+ * - works with any memory range, including ranges that contain zero bytes;
+ * - doesn't require a terminating zero byte: the end of the memory range is passed explicitly;
+ * - if nothing is found, returns a pointer to end instead of nullptr;
+ * - the maximum number of symbols to search for is 16.
+ *
+ * Uses SSE 2 when the number of symbols to search for is small and SSE 4.2 when it is large,
+ * which gives more than a 2x performance advantage over a trivial loop
+ * when parsing a tab-separated dump with (probably escaped) string fields.
+ * When parsing a tab-separated dump with short strings, there is no performance degradation
+ * compared to a trivial loop.
+ *
+ * Note: the optimal threshold to choose between SSE 2 and SSE 4.2 may depend on the CPU model.
+ *
+ * find_last_symbols_or_null<c1, c2, ...>(begin, end):
+ *
+ * Searches for the last matching character in a memory range.
+ * If there is no such character, returns nullptr.
+ */
+
+/// A set of characters to search for that is chosen at run time.
+///
+/// `str` holds the characters. When SSE 4.2 is available, `simd_vector` additionally holds
+/// the same characters loaded into a 16-byte register (zero padded), and the constructor
+/// rejects sets larger than BUFFER_SIZE.
+struct SearchSymbols {
+    static constexpr auto BUFFER_SIZE = 16;
+
+    SearchSymbols() = default;
+
+    /// @throws std::runtime_error when SSE 4.2 is enabled and `in` has more than
+    ///         BUFFER_SIZE characters.
+    explicit SearchSymbols(std::string in) : str(std::move(in)) {
+#if defined(__SSE4_2__)
+        if (str.size() > BUFFER_SIZE) {
+            throw std::runtime_error("SearchSymbols can contain at most " +
+                                     std::to_string(BUFFER_SIZE) + " symbols and " +
+                                     std::to_string(str.size()) + " was provided\n");
+        }
+
+        /// Copy through a zeroed 16-byte buffer so the unaligned load never reads past `str`.
+        char tmp_safety_buffer[BUFFER_SIZE] = {0};
+
+        memcpy(tmp_safety_buffer, str.data(), str.size());
+
+        simd_vector = _mm_loadu_si128(reinterpret_cast<const __m128i*>(tmp_safety_buffer));
+#endif
+    }
+
+#if defined(__SSE4_2__)
+    /// Zero-initialised so that a default-constructed object can be copied safely.
+    __m128i simd_vector = _mm_setzero_si128();
+#endif
+    std::string str;
+};
+
+namespace detail {
+/// Returns true when `x` equals one of the template characters (false for an empty pack).
+template <char... chars>
+constexpr bool is_in(char x) {
+    /// The suppression has to sit on the line of the fold expression to take effect.
+    return ((x == chars) || ...); // NOLINT(misc-redundant-expression)
+}
+
+/// Returns true when `c` equals one of the first `num_chars` characters of `symbols`.
+/// Declared `inline` (not `static`) because this is a header: all translation units share
+/// one definition and units that do not call it get no unused-function warning.
+inline bool is_in(char c, const char* symbols, size_t num_chars) {
+    for (size_t i = 0U; i < num_chars; ++i) {
+        if (c == symbols[i]) {
+            return true;
+        }
+    }
+
+    return false;
+}
+
+#if defined(__SSE2__)
+template <char s0>
+inline __m128i mm_is_in(__m128i bytes) {
+ __m128i eq0 = _mm_cmpeq_epi8(bytes, _mm_set1_epi8(s0));
+ return eq0;
+}
+
+template <char s0, char s1, char... tail>
+inline __m128i mm_is_in(__m128i bytes) {
+ __m128i eq0 = _mm_cmpeq_epi8(bytes, _mm_set1_epi8(s0));
+ __m128i eq = mm_is_in<s1, tail...>(bytes);
+ return _mm_or_si128(eq0, eq);
+}
+
+inline __m128i mm_is_in(__m128i bytes, const char* symbols, size_t num_chars) {
+ __m128i accumulator = _mm_setzero_si128();
+ for (size_t i = 0; i < num_chars; ++i) {
+ __m128i eq = _mm_cmpeq_epi8(bytes, _mm_set1_epi8(symbols[i]));
+ accumulator = _mm_or_si128(accumulator, eq);
+ }
+
+ return accumulator;
+}
+
+inline std::array<__m128i, 16u> mm_is_in_prepare(const char* symbols, size_t
num_chars) {
+ std::array<__m128i, 16u> result {};
+
+ for (size_t i = 0; i < num_chars; ++i) {
+ result[i] = _mm_set1_epi8(symbols[i]);
+ }
+
+ return result;
+}
+
+inline __m128i mm_is_in_execute(__m128i bytes, const std::array<__m128i, 16u>&
needles) {
+ __m128i accumulator = _mm_setzero_si128();
+
+ for (const auto& needle : needles) {
+ __m128i eq = _mm_cmpeq_epi8(bytes, needle);
+ accumulator = _mm_or_si128(accumulator, eq);
+ }
+
+ return accumulator;
+}
+#endif
+
+template <bool positive>
+constexpr bool maybe_negate(bool x) {
+ return x == positive;
+}
+
+template <bool positive>
+constexpr uint16_t maybe_negate(uint16_t x) {
+ if constexpr (positive)
+ return x;
+ else
+ return ~x;
+}
+
+enum class ReturnMode : uint8_t {
+ End,
+ Nullptr,
+};
+
+/// Scans [begin, end) forward for the first character that is in `symbols...`
+/// (positive == true) or is not in it (positive == false).
+/// Returns its position; on a miss returns `end` or nullptr depending on `return_mode`.
+template <bool positive, ReturnMode return_mode, char... symbols>
+inline const char* find_first_symbols_sse2(const char* const begin, const char* const end) {
+    const char* pos = begin;
+
+#if defined(__SSE2__)
+    /// Process 16 bytes at a time while a full vector remains. The remaining length is
+    /// compared instead of `pos + 15 < end`, so no pointer beyond `end` is ever formed.
+    for (; end - pos >= 16; pos += 16) {
+        __m128i bytes = _mm_loadu_si128(reinterpret_cast<const __m128i*>(pos));
+
+        __m128i eq = mm_is_in<symbols...>(bytes);
+
+        /// Bit i of the mask is set when byte i qualifies; the lowest set bit is the answer.
+        uint16_t bit_mask = maybe_negate<positive>(uint16_t(_mm_movemask_epi8(eq)));
+        if (bit_mask) {
+            return pos + __builtin_ctz(bit_mask);
+        }
+    }
+#endif
+
+    /// Scalar tail (and the whole range when SSE 2 is not available).
+    for (; pos < end; ++pos) {
+        if (maybe_negate<positive>(is_in<symbols...>(*pos))) {
+            return pos;
+        }
+    }
+
+    return return_mode == ReturnMode::End ? end : nullptr;
+}
+
+/// Scans [begin, end) forward for the first character that is among the first `num_chars`
+/// characters of `symbols` (positive == true) or is not among them (positive == false).
+/// Returns its position; on a miss returns `end` or nullptr depending on `return_mode`.
+template <bool positive, ReturnMode return_mode>
+inline const char* find_first_symbols_sse2(const char* const begin, const char* const end,
+                                           const char* symbols, size_t num_chars) {
+    const char* pos = begin;
+
+#if defined(__SSE2__)
+    /// Process 16 bytes at a time while a full vector remains.
+    for (; end - pos >= 16; pos += 16) {
+        __m128i bytes = _mm_loadu_si128(reinterpret_cast<const __m128i*>(pos));
+
+        /// Compare against exactly `num_chars` symbols. A fixed array of 16 needles would
+        /// leave the unused ones zero, which makes NUL bytes of the input match here but
+        /// not in the scalar tail below, and would overflow for num_chars > 16.
+        __m128i eq = mm_is_in(bytes, symbols, num_chars);
+
+        uint16_t bit_mask = maybe_negate<positive>(uint16_t(_mm_movemask_epi8(eq)));
+        if (bit_mask) {
+            return pos + __builtin_ctz(bit_mask);
+        }
+    }
+#endif
+
+    /// Scalar tail (and the whole range when SSE 2 is not available).
+    for (; pos < end; ++pos) {
+        if (maybe_negate<positive>(is_in(*pos, symbols, num_chars))) {
+            return pos;
+        }
+    }
+
+    return return_mode == ReturnMode::End ? end : nullptr;
+}
+
+/// Scans [begin, end) backward for the last character that is in `symbols...`
+/// (positive == true) or is not in it (positive == false).
+/// Returns its position; on a miss returns `end` or nullptr depending on `return_mode`.
+template <bool positive, ReturnMode return_mode, char... symbols>
+inline const char* find_last_symbols_sse2(const char* const begin, const char* const end) {
+    const char* pos = end;
+
+#if defined(__SSE2__)
+    /// Process 16 bytes at a time while a full vector remains. The remaining length is
+    /// compared instead of `pos - 16 >= begin`, so no pointer before `begin` is formed.
+    for (; pos - begin >= 16; pos -= 16) {
+        __m128i bytes = _mm_loadu_si128(reinterpret_cast<const __m128i*>(pos - 16));
+
+        __m128i eq = mm_is_in<symbols...>(bytes);
+
+        uint16_t bit_mask = maybe_negate<positive>(uint16_t(_mm_movemask_epi8(eq)));
+        if (bit_mask) {
+            /// The highest set bit is the last qualifying byte. __builtin_clz works on
+            /// the mask as a 32-bit value, hence the "- 16".
+            return pos - 1 - (__builtin_clz(bit_mask) - 16);
+        }
+    }
+#endif
+
+    /// Scalar tail; decrementing inside the loop keeps `pos` within [begin, end].
+    while (pos > begin) {
+        --pos;
+        if (maybe_negate<positive>(is_in<symbols...>(*pos))) {
+            return pos;
+        }
+    }
+
+    return return_mode == ReturnMode::End ? end : nullptr;
+}
+
+/// Scans [begin, end) forward for the first character that is among the first `num_chars`
+/// template characters c01..c16 (positive == true) or is not among them (positive == false).
+/// Returns its position; on a miss returns `end` or nullptr depending on `return_mode`.
+template <bool positive, ReturnMode return_mode, size_t num_chars, char c01, char c02 = 0,
+          char c03 = 0, char c04 = 0, char c05 = 0, char c06 = 0, char c07 = 0, char c08 = 0,
+          char c09 = 0, char c10 = 0, char c11 = 0, char c12 = 0, char c13 = 0, char c14 = 0,
+          char c15 = 0, char c16 = 0>
+inline const char* find_first_symbols_sse42(const char* const begin, const char* const end) {
+    static_assert(num_chars >= 1 && num_chars <= 16,
+                  "find_first_symbols_sse42 supports between 1 and 16 symbols");
+
+    const char* pos = begin;
+
+#if defined(__SSE4_2__)
+    constexpr int mode = _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ANY | _SIDD_LEAST_SIGNIFICANT;
+
+    __m128i set = _mm_setr_epi8(c01, c02, c03, c04, c05, c06, c07, c08, c09, c10, c11, c12, c13,
+                                c14, c15, c16);
+
+    /// Process 16 bytes at a time while a full vector remains; only the first `num_chars`
+    /// bytes of `set` take part in the comparison.
+    for (; end - pos >= 16; pos += 16) {
+        __m128i bytes = _mm_loadu_si128(reinterpret_cast<const __m128i*>(pos));
+
+        if constexpr (positive) {
+            if (_mm_cmpestrc(set, num_chars, bytes, 16, mode)) {
+                return pos + _mm_cmpestri(set, num_chars, bytes, 16, mode);
+            }
+        } else {
+            if (_mm_cmpestrc(set, num_chars, bytes, 16, mode | _SIDD_NEGATIVE_POLARITY)) {
+                return pos +
+                       _mm_cmpestri(set, num_chars, bytes, 16, mode | _SIDD_NEGATIVE_POLARITY);
+            }
+        }
+    }
+#endif
+
+    /// Scalar tail (and the whole range when SSE 4.2 is not available): test each byte
+    /// against the first `num_chars` symbols only, so the zero defaults of the unused
+    /// template parameters never match.
+    constexpr char needles[16] = {c01, c02, c03, c04, c05, c06, c07, c08,
+                                  c09, c10, c11, c12, c13, c14, c15, c16};
+    for (; pos < end; ++pos) {
+        bool found = false;
+        for (size_t i = 0; i < num_chars; ++i) {
+            if (*pos == needles[i]) {
+                found = true;
+                break;
+            }
+        }
+        if (found == positive) {
+            return pos;
+        }
+    }
+
+    return return_mode == ReturnMode::End ? end : nullptr;
+}
+
+template <bool positive, ReturnMode return_mode>
+inline const char* find_first_symbols_sse42(const char* const begin, const
char* const end,
+ const SearchSymbols& symbols) {
+ const char* pos = begin;
+
+ const auto num_chars = symbols.str.size();
+
+#if defined(__SSE4_2__)
+ constexpr int mode = _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ANY |
_SIDD_LEAST_SIGNIFICANT;
+
+ const __m128i set = symbols.simd_vector;
+
+ for (; pos + 15 < end; pos += 16) {
+ __m128i bytes = _mm_loadu_si128(reinterpret_cast<const __m128i*>(pos));
+
+ if constexpr (positive) {
+ if (_mm_cmpestrc(set, num_chars, bytes, 16, mode))
+ return pos + _mm_cmpestri(set, num_chars, bytes, 16, mode);
+ } else {
+ if (_mm_cmpestrc(set, num_chars, bytes, 16, mode |
_SIDD_NEGATIVE_POLARITY))
+ return pos +
+ _mm_cmpestri(set, num_chars, bytes, 16, mode |
_SIDD_NEGATIVE_POLARITY);
+ }
+ }
+#endif
+
+ for (; pos < end; ++pos)
+ if (maybe_negate<positive>(is_in(*pos, symbols.str.data(),
num_chars))) return pos;
+
+ return return_mode == ReturnMode::End ? end : nullptr;
+}
+
+/// NOTE No SSE 4.2 implementation for find_last_symbols_or_null. Not worth to
do.
+
+/// Picks the implementation for a compile-time symbol set: the SSE 4.2 variant for five or
+/// more symbols when SSE 4.2 is available, the SSE 2 variant otherwise.
+/// `if constexpr` keeps the variant that is not chosen from being instantiated.
+template <bool positive, ReturnMode return_mode, char... symbols>
+inline const char* find_first_symbols_dispatch(const char* begin, const char* end)
+    requires(sizeof...(symbols) <= 16)
+{
+#if defined(__SSE4_2__)
+    if constexpr (sizeof...(symbols) >= 5) {
+        return find_first_symbols_sse42<positive, return_mode, sizeof...(symbols), symbols...>(
+                begin, end);
+    } else
+#endif
+    {
+        return find_first_symbols_sse2<positive, return_mode, symbols...>(begin, end);
+    }
+}
+
+/// Picks the implementation for a run-time symbol set: the SSE 4.2 variant for five or more
+/// symbols when SSE 4.2 is available, the SSE 2 variant otherwise.
+template <bool positive, ReturnMode return_mode>
+inline const char* find_first_symbols_dispatch(const std::string_view haystack,
+                                               const SearchSymbols& symbols) {
+    /// Use data()/size() rather than begin()/end(): string_view iterators are not
+    /// guaranteed to be plain `const char*`.
+    const char* const haystack_begin = haystack.data();
+    const char* const haystack_end = haystack_begin + haystack.size();
+
+#if defined(__SSE4_2__)
+    if (symbols.str.size() >= 5) {
+        return find_first_symbols_sse42<positive, return_mode>(haystack_begin, haystack_end,
+                                                               symbols);
+    }
+#endif
+    return find_first_symbols_sse2<positive, return_mode>(
+            haystack_begin, haystack_end, symbols.str.data(), symbols.str.size());
+}
+
+} // namespace detail
+
+template <char... symbols>
+inline const char* find_first_symbols(const char* begin, const char* end) {
+ return detail::find_first_symbols_dispatch<true, detail::ReturnMode::End,
symbols...>(begin,
+
end);
+}
+
+/// Returning non const result for non const arguments.
+/// It is convenient when you are using this function to iterate through
non-const buffer.
+template <char... symbols>
+inline char* find_first_symbols(char* begin, char* end) {
+ return const_cast<char*>(
+ detail::find_first_symbols_dispatch<true, detail::ReturnMode::End,
symbols...>(begin,
+
end));
+}
+
+inline const char* find_first_symbols(std::string_view haystack, const
SearchSymbols& symbols) {
+ return detail::find_first_symbols_dispatch<true,
detail::ReturnMode::End>(haystack, symbols);
+}
+
+template <char... symbols>
+inline const char* find_first_not_symbols(const char* begin, const char* end) {
+ return detail::find_first_symbols_dispatch<false, detail::ReturnMode::End,
symbols...>(begin,
+
end);
+}
+
+template <char... symbols>
+inline char* find_first_not_symbols(char* begin, char* end) {
+ return const_cast<char*>(
+ detail::find_first_symbols_dispatch<false,
detail::ReturnMode::End, symbols...>(begin,
+
end));
+}
+
+inline const char* find_first_not_symbols(std::string_view haystack, const
SearchSymbols& symbols) {
+ return detail::find_first_symbols_dispatch<false,
detail::ReturnMode::End>(haystack, symbols);
+}
+
+template <char... symbols>
+inline const char* find_first_symbols_or_null(const char* begin, const char*
end) {
+ return detail::find_first_symbols_dispatch<true,
detail::ReturnMode::Nullptr, symbols...>(begin,
+
end);
+}
+
+template <char... symbols>
+inline char* find_first_symbols_or_null(char* begin, char* end) {
+ return const_cast<char*>(
+ detail::find_first_symbols_dispatch<true,
detail::ReturnMode::Nullptr, symbols...>(
+ begin, end));
+}
+
+inline const char* find_first_symbols_or_null(std::string_view haystack,
+ const SearchSymbols& symbols) {
+ return detail::find_first_symbols_dispatch<true,
detail::ReturnMode::Nullptr>(haystack,
+
symbols);
+}
+
+template <char... symbols>
+inline const char* find_first_not_symbols_or_null(const char* begin, const
char* end) {
+ return detail::find_first_symbols_dispatch<false,
detail::ReturnMode::Nullptr, symbols...>(
+ begin, end);
+}
+
+template <char... symbols>
+inline char* find_first_not_symbols_or_null(char* begin, char* end) {
+ return const_cast<char*>(
+ detail::find_first_symbols_dispatch<false,
detail::ReturnMode::Nullptr, symbols...>(
+ begin, end));
+}
+
+inline const char* find_first_not_symbols_or_null(std::string_view haystack,
+ const SearchSymbols&
symbols) {
+ return detail::find_first_symbols_dispatch<false,
detail::ReturnMode::Nullptr>(haystack,
+
symbols);
+}
+
+template <char... symbols>
+inline const char* find_last_symbols_or_null(const char* begin, const char*
end) {
+ return detail::find_last_symbols_sse2<true, detail::ReturnMode::Nullptr,
symbols...>(begin,
+
end);
+}
+
+template <char... symbols>
+inline char* find_last_symbols_or_null(char* begin, char* end) {
+ return const_cast<char*>(
+ detail::find_last_symbols_sse2<true, detail::ReturnMode::Nullptr,
symbols...>(begin,
+
end));
+}
+
+template <char... symbols>
+inline const char* find_last_not_symbols_or_null(const char* begin, const
char* end) {
+ return detail::find_last_symbols_sse2<false, detail::ReturnMode::Nullptr,
symbols...>(begin,
+
end);
+}
+
+template <char... symbols>
+inline char* find_last_not_symbols_or_null(char* begin, char* end) {
+ return const_cast<char*>(
+ detail::find_last_symbols_sse2<false, detail::ReturnMode::Nullptr,
symbols...>(begin,
+
end));
+}
+
+/// Slightly resembles boost::split. The drawback of boost::split is that it
fires a false positive in clang static analyzer.
+/// See https://github.com/boostorg/algorithm/issues/63
+/// And https://bugs.llvm.org/show_bug.cgi?id=41141
+/// Splits `what` at every character from `symbols...` and appends each piece to `to` with
+/// emplace_back(pointer, length). With `token_compress` set, empty pieces are skipped.
+/// Returns `to`.
+template <char... symbols, typename To>
+inline To& splitInto(To& to, std::string_view what, bool token_compress = false) {
+    const char* const stop = what.data() + what.size();
+
+    for (const char* cursor = what.data(); cursor < stop;) {
+        const char* const piece_end = find_first_symbols<symbols...>(cursor, stop);
+
+        const bool piece_is_empty = cursor == piece_end;
+        if (!(token_compress && piece_is_empty)) {
+            to.emplace_back(cursor, piece_end - cursor);
+        }
+
+        /// Step over the delimiter, if one was found.
+        cursor = piece_end < stop ? piece_end + 1 : piece_end;
+    }
+
+    return to;
+}
diff --git a/be/src/vec/functions/url/function_url.cpp
b/be/src/vec/functions/url/function_url.cpp
index e25af6f7f27..47afe076b74 100644
--- a/be/src/vec/functions/url/function_url.cpp
+++ b/be/src/vec/functions/url/function_url.cpp
@@ -46,10 +46,33 @@ struct NameProtocol {
using FunctionProtocol =
FunctionStringToString<ExtractSubstringImpl<ExtractProtocol>,
NameProtocol>;
+struct NameTopLevelDomain {
+ static constexpr auto name = "top_level_domain";
+};
+using FunctionTopLevelDomain =
+ FunctionStringToString<ExtractSubstringImpl<ExtractTopLevelDomain>,
NameTopLevelDomain>;
+
+struct NameFirstSignificantSubdomain {
+ static constexpr auto name = "first_significant_subdomain";
+};
+using FunctionFirstSignificantSubdomain =
+
FunctionStringToString<ExtractSubstringImpl<ExtractFirstSignificantSubdomain>,
+ NameFirstSignificantSubdomain>;
+
+struct NameCutToFirstSignificantSubdomain {
+ static constexpr auto name = "cut_to_first_significant_subdomain";
+};
+using FunctionCutToFirstSignificantSubdomain =
+
FunctionStringToString<ExtractSubstringImpl<CutToFirstSignificantSubdomain>,
+ NameCutToFirstSignificantSubdomain>;
+
void register_function_url(SimpleFunctionFactory& factory) {
factory.register_function<FunctionDomain>();
factory.register_function<FunctionDomainWithoutWWW>();
factory.register_function<FunctionProtocol>();
+ factory.register_function<FunctionTopLevelDomain>();
+ factory.register_function<FunctionFirstSignificantSubdomain>();
+ factory.register_function<FunctionCutToFirstSignificantSubdomain>();
}
} // namespace doris::vectorized
diff --git a/be/src/vec/functions/url/functions_url.h
b/be/src/vec/functions/url/functions_url.h
index f9f02a17a66..b6736496d24 100644
--- a/be/src/vec/functions/url/functions_url.h
+++ b/be/src/vec/functions/url/functions_url.h
@@ -89,7 +89,6 @@ struct ExtractSubstringImpl {
for (size_t i = 0; i < size; ++i) {
Extractor::execute(reinterpret_cast<const
char*>(&data[prev_offset]),
offsets[i] - prev_offset, start, length);
-
res_data.resize(res_data.size() + length);
memcpy_small_allow_read_write_overflow15(&res_data[res_offset],
start, length);
res_offset += length;
@@ -105,11 +104,6 @@ struct ExtractSubstringImpl {
Extractor::execute(data.data(), data.size(), start, length);
res_data.assign(start, length);
}
-
- // static void vector_fixed(const ColumnString::Chars &, size_t,
ColumnString::Chars &)
- // {
- // throw Exception("Column of type FixedString is not supported by URL
functions", ErrorCodes::ILLEGAL_COLUMN);
- // }
};
/** Delete part of string using the Extractor.
@@ -155,11 +149,6 @@ struct CutSubstringImpl {
res_data.append(data.data(), start);
res_data.append(start + length, data.data() + data.size());
}
-
- // static void vector_fixed(const ColumnString::Chars &, size_t,
ColumnString::Chars &)
- // {
- // throw Exception("Column of type FixedString is not supported by URL
functions", ErrorCodes::ILLEGAL_COLUMN);
- // }
};
} // namespace doris::vectorized
diff --git a/be/src/vec/functions/url/tldLookup.generated.cpp
b/be/src/vec/functions/url/tldLookup.generated.cpp
new file mode 100644
index 00000000000..9b9471c094d
--- /dev/null
+++ b/be/src/vec/functions/url/tldLookup.generated.cpp
@@ -0,0 +1,140 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+// This file is copied from
+//
https://github.com/ClickHouse/ClickHouse/blob/master/src/Functions/URL/tldLookup.generated.cpp
+// and modified by Doris
+
+// clang-format off
+/* C++ code produced by gperf version 3.1 */
+/* Command-line: /usr/bin/gperf --output-file=tldLookup.generated.cpp
tldLookup.gperf */
+/* Computed positions: -k'1-11,13-14,17,$' */
+
+#if !( \
+ (' ' == 32) && ('!' == 33) && ('"' == 34) && ('#' == 35) && ('%' ==
37) && ('&' == 38) && ('\'' == 39) && ('(' == 40) && (')' == 41) && ('*' == 42)
&& ('+' == 43) && (',' == 44) && ('-' == 45) && ('.' == 46) && ('/' == 47) &&
('0' == 48) && ('1' == 49) && ('2' == 50) && ('3' == 51) && ('4' == 52) && ('5'
== 53) && ('6' == 54) && ('7' == 55) && ('8' == 56) && ('9' == 57) && (':' ==
58) && (';' == 59) && ('<' == 60) && ('=' == 61) && ('>' == 62) && ('?' == 63)
&& ('A' == 65) && ('B [...]
+/* The character set is not based on ISO-646. */
+#error "gperf generated tables don't work with this execution character set.
Please report a bug to <[email protected]>."
+#endif
+
+#line 7 "tldLookup.gperf"
+
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wimplicit-fallthrough"
+#pragma GCC diagnostic ignored "-Wzero-as-null-pointer-constant"
+#pragma GCC diagnostic ignored "-Wunused-macros"
+#include <cstring>
+
+#define TOTAL_KEYWORDS 5045
+#define MIN_WORD_LENGTH 4
+#define MAX_WORD_LENGTH 34
+#define MIN_HASH_VALUE 75
+#define MAX_HASH_VALUE 110600
+/* maximum key range = 110526, duplicates = 0 */
+
+class TopLevelDomainLookupHash {
+private:
+ static inline unsigned int hash(const char* str, size_t len);
+
+public:
+ static const char* is_valid(const char* str, size_t len);
+};
+
+inline unsigned int TopLevelDomainLookupHash::hash(const char* str, size_t
len) {
+ static const unsigned int asso_values[] = {110601, 110601, 110601, 110601,
110601, 110601, 110601, 110601, 110601, 110601, 110601, 110601, 110601, 110601,
110601, 110601,
+ 110601, 110601, 110601, 110601,
110601, 110601, 110601, 110601, 110601, 110601, 110601, 110601, 110601, 110601,
110601, 110601,
+ 110601, 110601, 110601, 110601,
110601, 110601, 110601, 110601, 110601, 110601, 110601, 110601, 110601, 3905,
0, 5,
+ 11617, 15312, 10, 5, 25, 0, 25,
0, 5, 0, 0, 110601, 110601, 110601, 5, 110601,
+ 110601, 110601, 110601, 110601,
30, 20, 5, 15, 10, 65, 45, 80, 70, 55, 110601, 110601,
+ 110601, 110601, 110601, 110601,
110601, 110601, 110601, 110601, 110601, 110601, 110601, 110601, 110601, 110601,
110601, 110601,
+ 110601, 2570, 9477, 1350, 15,
130, 5915, 1830, 4360, 2210, 5405, 63, 3190, 20, 1165, 5,
+ 6120, 5863, 470, 2315, 175, 0,
815, 40, 13577, 115, 5680, 1030, 11798, 23179, 345, 1097,
+ 28079, 13839, 245, 25674,
31874, 75, 31774, 7351, 27474, 190, 16044, 8040, 50, 25, 35, 55,
+ 0, 0, 30, 0, 10, 0, 0, 0, 35,
0, 55, 10, 5, 65, 0, 60,
+ 0, 25, 5, 30, 0, 5, 10, 0, 20,
5, 5, 35, 5, 0, 0, 0,
+ 0, 0, 15, 0, 5, 5, 0, 5, 5, 5,
0, 0, 0, 0, 0, 15,
+ 5, 110601, 110601, 5, 10, 45,
5, 110601, 0, 110601, 110601, 110601, 110601, 110601, 110601, 110601,
+ 0, 0, 0, 0, 110601, 110601,
110601, 45, 0, 0, 0, 0, 110601, 110601, 110601, 110601,
+ 0, 0, 110601, 0, 0, 0, 0, 5, 0,
5, 30, 0, 0, 110601, 110601, 110601,
+ 110601, 110601, 110601, 110601,
0, 110601, 110601, 110601, 0, 0, 5, 0, 20, 40, 110601, 110601,
+ 110601, 110601, 110601, 110601,
110601, 110601, 110601, 110601, 110601, 110601, 110601, 110601, 110601, 110601,
110601, 110601,
+ 110601, 110601, 110601, 110601};
+ unsigned int hval = len;
+
+ switch (hval) {
+ default:
+ hval += asso_values[static_cast<unsigned char>(str[16])];
+ /*FALLTHROUGH*/
+ case 16:
+ case 15:
+ case 14:
+ hval += asso_values[static_cast<unsigned char>(str[13] + 1)];
+ /*FALLTHROUGH*/
+ case 13:
+ hval += asso_values[static_cast<unsigned char>(str[12])];
+ /*FALLTHROUGH*/
+ case 12:
+ case 11:
+ hval += asso_values[static_cast<unsigned char>(str[10])];
+ /*FALLTHROUGH*/
+ case 10:
+ hval += asso_values[static_cast<unsigned char>(str[9])];
+ /*FALLTHROUGH*/
+ case 9:
+ hval += asso_values[static_cast<unsigned char>(str[8] + 1)];
+ /*FALLTHROUGH*/
+ case 8:
+ hval += asso_values[static_cast<unsigned char>(str[7])];
+ /*FALLTHROUGH*/
+ case 7:
+ hval += asso_values[static_cast<unsigned char>(str[6] + 3)];
+ /*FALLTHROUGH*/
+ case 6:
+ hval += asso_values[static_cast<unsigned char>(str[5])];
+ /*FALLTHROUGH*/
+ case 5:
+ hval += asso_values[static_cast<unsigned char>(str[4] + 2)];
+ /*FALLTHROUGH*/
+ case 4:
+ hval += asso_values[static_cast<unsigned char>(str[3] + 1)];
+ /*FALLTHROUGH*/
+ case 3:
+ hval += asso_values[static_cast<unsigned char>(str[2])];
+ /*FALLTHROUGH*/
+ case 2:
+ hval += asso_values[static_cast<unsigned char>(str[1])];
+ /*FALLTHROUGH*/
+ case 1:
+ hval += asso_values[static_cast<unsigned char>(str[0] + 20)];
+ break;
+ }
+ return hval + asso_values[static_cast<unsigned char>(str[len - 1])];
+}
+
+const char* TopLevelDomainLookupHash::is_valid(const char* str, size_t len) {
+ static const char* const wordlist[] =
{"","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","co.tm","","\340\270\227\340\270\253\340\270\262\340\270\243.\340\271\204\340\270\227\340\270\242","","","","com.mu","","","","","com.so","","\340\270\243\340\270\261\340\270\220\340\270\232\340\270\262\340\270\245.\340\271\
[...]
+ if (len <= MAX_WORD_LENGTH && len >= MIN_WORD_LENGTH) {
+ unsigned int key = hash(str, len);
+
+ if (key <= MAX_HASH_VALUE) {
+ const char* s = wordlist[key];
+
+ if (*str == *s && !strncmp(str + 1, s + 1, len - 1) && s[len] ==
'\0')
+ return s;
+ }
+ }
+ return nullptr;
+}
+#line 5060 "tldLookup.gperf"
\ No newline at end of file
diff --git a/be/src/vec/functions/url/tldLookup.h
b/be/src/vec/functions/url/tldLookup.h
new file mode 100644
index 00000000000..9be88890c14
--- /dev/null
+++ b/be/src/vec/functions/url/tldLookup.h
@@ -0,0 +1,34 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+// This file is copied from
+//
https://github.com/ClickHouse/ClickHouse/blob/master/src/Functions/URL/tldLookup.h
+// and modified by Doris
+
+#pragma once
+
+#include <cstdlib>
+
+// Declaration of the lookup class generated by gperf from gperf/tldLookup.gperf; members are defined in tldLookup.generated.cpp
+class TopLevelDomainLookupHash {
+private:
+ static inline unsigned int hash(const char* str, size_t len); // key that is_valid() uses to index its word list
+
+public:
+ static const char* is_valid(const char* str, size_t len); // matching word-list entry, or nullptr if str[0..len) is not listed
+};
+
+using tldLookup = TopLevelDomainLookupHash;
\ No newline at end of file
diff --git
a/fe/fe-core/src/main/java/org/apache/doris/catalog/BuiltinScalarFunctions.java
b/fe/fe-core/src/main/java/org/apache/doris/catalog/BuiltinScalarFunctions.java
index 8dea4eeb8d2..ed3f2895cc8 100644
---
a/fe/fe-core/src/main/java/org/apache/doris/catalog/BuiltinScalarFunctions.java
+++
b/fe/fe-core/src/main/java/org/apache/doris/catalog/BuiltinScalarFunctions.java
@@ -139,6 +139,7 @@ import
org.apache.doris.nereids.trees.expressions.functions.scalar.CurrentDate;
import org.apache.doris.nereids.trees.expressions.functions.scalar.CurrentTime;
import org.apache.doris.nereids.trees.expressions.functions.scalar.CurrentUser;
import org.apache.doris.nereids.trees.expressions.functions.scalar.CutIpv6;
+import
org.apache.doris.nereids.trees.expressions.functions.scalar.CutToFirstSignificantSubdomain;
import org.apache.doris.nereids.trees.expressions.functions.scalar.Database;
import org.apache.doris.nereids.trees.expressions.functions.scalar.Date;
import org.apache.doris.nereids.trees.expressions.functions.scalar.DateDiff;
@@ -180,6 +181,7 @@ import
org.apache.doris.nereids.trees.expressions.functions.scalar.Exp;
import
org.apache.doris.nereids.trees.expressions.functions.scalar.ExtractUrlParameter;
import org.apache.doris.nereids.trees.expressions.functions.scalar.Field;
import org.apache.doris.nereids.trees.expressions.functions.scalar.FindInSet;
+import
org.apache.doris.nereids.trees.expressions.functions.scalar.FirstSignificantSubdomain;
import org.apache.doris.nereids.trees.expressions.functions.scalar.Floor;
import org.apache.doris.nereids.trees.expressions.functions.scalar.Fmod;
import org.apache.doris.nereids.trees.expressions.functions.scalar.Fpow;
@@ -440,6 +442,7 @@ import
org.apache.doris.nereids.trees.expressions.functions.scalar.ToIso8601;
import org.apache.doris.nereids.trees.expressions.functions.scalar.ToMonday;
import
org.apache.doris.nereids.trees.expressions.functions.scalar.ToQuantileState;
import org.apache.doris.nereids.trees.expressions.functions.scalar.Tokenize;
+import
org.apache.doris.nereids.trees.expressions.functions.scalar.TopLevelDomain;
import org.apache.doris.nereids.trees.expressions.functions.scalar.Translate;
import org.apache.doris.nereids.trees.expressions.functions.scalar.Trim;
import org.apache.doris.nereids.trees.expressions.functions.scalar.TrimIn;
@@ -606,6 +609,7 @@ public class BuiltinScalarFunctions implements
FunctionHelper {
scalar(CurrentTime.class, "curtime", "current_time"),
scalar(CurrentUser.class, "current_user"),
scalar(CutIpv6.class, "cut_ipv6"),
+ scalar(CutToFirstSignificantSubdomain.class,
"cut_to_first_significant_subdomain"),
scalar(Database.class, "database", "schema"),
scalar(Date.class, "date"),
scalar(DateDiff.class, "datediff"),
@@ -647,6 +651,7 @@ public class BuiltinScalarFunctions implements
FunctionHelper {
scalar(ExtractUrlParameter.class, "extract_url_parameter"),
scalar(Field.class, "field"),
scalar(FindInSet.class, "find_in_set"),
+ scalar(FirstSignificantSubdomain.class,
"first_significant_subdomain"),
scalar(Floor.class, "floor"),
scalar(Fmod.class, "fmod"),
scalar(Fpow.class, "fpow"),
@@ -926,6 +931,7 @@ public class BuiltinScalarFunctions implements
FunctionHelper {
scalar(ToIso8601.class, "to_iso8601"),
scalar(Tokenize.class, "tokenize"),
scalar(ToMonday.class, "to_monday"),
+ scalar(TopLevelDomain.class, "top_level_domain"),
scalar(ToQuantileState.class, "to_quantile_state"),
scalar(Translate.class, "translate"),
scalar(Trim.class, "trim"),
diff --git
a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/scalar/CutToFirstSignificantSubdomain.java
b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/scalar/CutToFirstSignificantSubdomain.java
new file mode 100644
index 00000000000..a2e77531e43
--- /dev/null
+++
b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/scalar/CutToFirstSignificantSubdomain.java
@@ -0,0 +1,68 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+package org.apache.doris.nereids.trees.expressions.functions.scalar;
+
+import org.apache.doris.catalog.FunctionSignature;
+import org.apache.doris.nereids.trees.expressions.Expression;
+import
org.apache.doris.nereids.trees.expressions.functions.ExplicitlyCastableSignature;
+import org.apache.doris.nereids.trees.expressions.functions.PropagateNullable;
+import org.apache.doris.nereids.trees.expressions.shape.UnaryExpression;
+import org.apache.doris.nereids.trees.expressions.visitor.ExpressionVisitor;
+import org.apache.doris.nereids.types.StringType;
+
+import com.google.common.base.Preconditions;
+import com.google.common.collect.ImmutableList;
+
+import java.util.List;
+
+/**
+ * ScalarFunction 'CutToFirstSignificantSubdomain' (SQL name cut_to_first_significant_subdomain): takes one
STRING argument and returns STRING. This class is generated by GenerateFunction.
+ */
+public class CutToFirstSignificantSubdomain extends ScalarFunction
+ implements UnaryExpression, ExplicitlyCastableSignature,
PropagateNullable {
+
+ public static final List<FunctionSignature> SIGNATURES = ImmutableList.of( // single signature: STRING -> STRING
+
FunctionSignature.ret(StringType.INSTANCE).args(StringType.INSTANCE)
+ );
+
+ /**
+ * Constructor with 1 argument: the expression the function is applied to.
+ */
+ public CutToFirstSignificantSubdomain(Expression arg) {
+ super("cut_to_first_significant_subdomain", arg);
+ }
+
+ /**
+ * withChildren: returns a new instance built from the single child; the precondition fails unless exactly one child is given.
+ */
+ @Override
+ public CutToFirstSignificantSubdomain withChildren(List<Expression>
children) {
+ Preconditions.checkArgument(children.size() == 1);
+ return new CutToFirstSignificantSubdomain(children.get(0));
+ }
+
+ @Override
+ public <R, C> R accept(ExpressionVisitor<R, C> visitor, C context) {
+ return visitor.visitCutToFirstSignificantSubdomain(this, context);
+ }
+
+ @Override
+ public List<FunctionSignature> getSignatures() {
+ return SIGNATURES;
+ }
+}
diff --git
a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/scalar/FirstSignificantSubdomain.java
b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/scalar/FirstSignificantSubdomain.java
new file mode 100644
index 00000000000..1af4dd96e6d
--- /dev/null
+++
b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/scalar/FirstSignificantSubdomain.java
@@ -0,0 +1,68 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+package org.apache.doris.nereids.trees.expressions.functions.scalar;
+
+import org.apache.doris.catalog.FunctionSignature;
+import org.apache.doris.nereids.trees.expressions.Expression;
+import
org.apache.doris.nereids.trees.expressions.functions.ExplicitlyCastableSignature;
+import org.apache.doris.nereids.trees.expressions.functions.PropagateNullable;
+import org.apache.doris.nereids.trees.expressions.shape.UnaryExpression;
+import org.apache.doris.nereids.trees.expressions.visitor.ExpressionVisitor;
+import org.apache.doris.nereids.types.StringType;
+
+import com.google.common.base.Preconditions;
+import com.google.common.collect.ImmutableList;
+
+import java.util.List;
+
+/**
+ * ScalarFunction 'FirstSignificantSubdomain' (SQL name first_significant_subdomain): takes one STRING
argument and returns STRING. This class is generated by GenerateFunction.
+ */
+public class FirstSignificantSubdomain extends ScalarFunction
+ implements UnaryExpression, ExplicitlyCastableSignature,
PropagateNullable {
+
+ public static final List<FunctionSignature> SIGNATURES = ImmutableList.of( // single signature: STRING -> STRING
+
FunctionSignature.ret(StringType.INSTANCE).args(StringType.INSTANCE)
+ );
+
+ /**
+ * Constructor with 1 argument: the expression the function is applied to.
+ */
+ public FirstSignificantSubdomain(Expression arg) {
+ super("first_significant_subdomain", arg);
+ }
+
+ /**
+ * withChildren: returns a new instance built from the single child; the precondition fails unless exactly one child is given.
+ */
+ @Override
+ public FirstSignificantSubdomain withChildren(List<Expression> children) {
+ Preconditions.checkArgument(children.size() == 1);
+ return new FirstSignificantSubdomain(children.get(0));
+ }
+
+ @Override
+ public <R, C> R accept(ExpressionVisitor<R, C> visitor, C context) {
+ return visitor.visitFirstSignificantSubdomain(this, context);
+ }
+
+ @Override
+ public List<FunctionSignature> getSignatures() {
+ return SIGNATURES;
+ }
+}
diff --git
a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/scalar/TopLevelDomain.java
b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/scalar/TopLevelDomain.java
new file mode 100644
index 00000000000..05997659a2e
--- /dev/null
+++
b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/scalar/TopLevelDomain.java
@@ -0,0 +1,68 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+package org.apache.doris.nereids.trees.expressions.functions.scalar;
+
+import org.apache.doris.catalog.FunctionSignature;
+import org.apache.doris.nereids.trees.expressions.Expression;
+import
org.apache.doris.nereids.trees.expressions.functions.ExplicitlyCastableSignature;
+import org.apache.doris.nereids.trees.expressions.functions.PropagateNullable;
+import org.apache.doris.nereids.trees.expressions.shape.UnaryExpression;
+import org.apache.doris.nereids.trees.expressions.visitor.ExpressionVisitor;
+import org.apache.doris.nereids.types.StringType;
+
+import com.google.common.base.Preconditions;
+import com.google.common.collect.ImmutableList;
+
+import java.util.List;
+
+/**
+ * ScalarFunction 'TopLevelDomain' (SQL name top_level_domain): takes one STRING argument and returns STRING. This class is generated by
GenerateFunction.
+ */
+public class TopLevelDomain extends ScalarFunction
+ implements UnaryExpression, ExplicitlyCastableSignature,
PropagateNullable {
+
+ public static final List<FunctionSignature> SIGNATURES = ImmutableList.of( // single signature: STRING -> STRING
+
FunctionSignature.ret(StringType.INSTANCE).args(StringType.INSTANCE)
+ );
+
+ /**
+ * Constructor with 1 argument: the expression the function is applied to.
+ */
+ public TopLevelDomain(Expression arg) {
+ super("top_level_domain", arg);
+ }
+
+ /**
+ * withChildren: returns a new instance built from the single child; the precondition fails unless exactly one child is given.
+ */
+ @Override
+ public TopLevelDomain withChildren(List<Expression> children) {
+ Preconditions.checkArgument(children.size() == 1);
+ return new TopLevelDomain(children.get(0));
+ }
+
+ @Override
+ public <R, C> R accept(ExpressionVisitor<R, C> visitor, C context) {
+ return visitor.visitTopLevelDomain(this, context);
+ }
+
+ @Override
+ public List<FunctionSignature> getSignatures() {
+ return SIGNATURES;
+ }
+}
diff --git
a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/visitor/ScalarFunctionVisitor.java
b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/visitor/ScalarFunctionVisitor.java
index c5e9688d3c1..2619731cfc8 100644
---
a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/visitor/ScalarFunctionVisitor.java
+++
b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/visitor/ScalarFunctionVisitor.java
@@ -146,6 +146,7 @@ import
org.apache.doris.nereids.trees.expressions.functions.scalar.CurrentDate;
import org.apache.doris.nereids.trees.expressions.functions.scalar.CurrentTime;
import org.apache.doris.nereids.trees.expressions.functions.scalar.CurrentUser;
import org.apache.doris.nereids.trees.expressions.functions.scalar.CutIpv6;
+import
org.apache.doris.nereids.trees.expressions.functions.scalar.CutToFirstSignificantSubdomain;
import org.apache.doris.nereids.trees.expressions.functions.scalar.Database;
import org.apache.doris.nereids.trees.expressions.functions.scalar.Date;
import org.apache.doris.nereids.trees.expressions.functions.scalar.DateDiff;
@@ -188,6 +189,7 @@ import
org.apache.doris.nereids.trees.expressions.functions.scalar.Exp;
import
org.apache.doris.nereids.trees.expressions.functions.scalar.ExtractUrlParameter;
import org.apache.doris.nereids.trees.expressions.functions.scalar.Field;
import org.apache.doris.nereids.trees.expressions.functions.scalar.FindInSet;
+import
org.apache.doris.nereids.trees.expressions.functions.scalar.FirstSignificantSubdomain;
import org.apache.doris.nereids.trees.expressions.functions.scalar.Floor;
import org.apache.doris.nereids.trees.expressions.functions.scalar.Fmod;
import org.apache.doris.nereids.trees.expressions.functions.scalar.Fpow;
@@ -437,6 +439,7 @@ import
org.apache.doris.nereids.trees.expressions.functions.scalar.ToIso8601;
import org.apache.doris.nereids.trees.expressions.functions.scalar.ToMonday;
import
org.apache.doris.nereids.trees.expressions.functions.scalar.ToQuantileState;
import org.apache.doris.nereids.trees.expressions.functions.scalar.Tokenize;
+import
org.apache.doris.nereids.trees.expressions.functions.scalar.TopLevelDomain;
import org.apache.doris.nereids.trees.expressions.functions.scalar.Translate;
import org.apache.doris.nereids.trees.expressions.functions.scalar.Trim;
import org.apache.doris.nereids.trees.expressions.functions.scalar.TrimIn;
@@ -903,6 +906,11 @@ public interface ScalarFunctionVisitor<R, C> {
return visitScalarFunction(charFunc, context);
}
+ default R
visitCutToFirstSignificantSubdomain(CutToFirstSignificantSubdomain
cutToFirstSignificantSubdomain,
+ C context) {
+ return visitScalarFunction(cutToFirstSignificantSubdomain, context); // default: delegate to visitScalarFunction
+ }
+
default R visitEncodeAsSmallInt(EncodeAsSmallInt encode, C context) {
return visitScalarFunction(encode, context);
}
@@ -1187,6 +1195,10 @@ public interface ScalarFunctionVisitor<R, C> {
return visitScalarFunction(findInSet, context);
}
+ default R visitFirstSignificantSubdomain(FirstSignificantSubdomain
firstSignificantSubdomain, C context) {
+ return visitScalarFunction(firstSignificantSubdomain, context); // default: delegate to visitScalarFunction
+ }
+
default R visitFloor(Floor floor, C context) {
return visitScalarFunction(floor, context);
}
@@ -2111,6 +2123,10 @@ public interface ScalarFunctionVisitor<R, C> {
return visitScalarFunction(tokenize, context);
}
+ default R visitTopLevelDomain(TopLevelDomain topLevelDomain, C context) {
+ return visitScalarFunction(topLevelDomain, context); // default: delegate to visitScalarFunction
+ }
+
default R visitToQuantileState(ToQuantileState toQuantileState, C context)
{
return visitScalarFunction(toQuantileState, context);
}
diff --git a/gensrc/script/doris_builtins_functions.py
b/gensrc/script/doris_builtins_functions.py
index 73e68badcda..31b02f9b979 100644
--- a/gensrc/script/doris_builtins_functions.py
+++ b/gensrc/script/doris_builtins_functions.py
@@ -2077,7 +2077,10 @@ visible_functions = {
"Url": [
[['domain'], 'STRING', ['STRING'], ''],
[['domain_without_www'], 'STRING', ['STRING'], ''],
- [['protocol'], 'STRING', ['STRING'], '']
+ [['protocol'], 'STRING', ['STRING'], ''],
+ [['top_level_domain'], 'STRING', ['STRING'], ''],
+ [['cut_to_first_significant_subdomain'], 'STRING', ['STRING'], ''],
+ [['first_significant_subdomain'], 'STRING', ['STRING'], '']
],
# search functions
diff --git
a/regression-test/data/query_p0/sql_functions/string_functions/test_url_functions.out
b/regression-test/data/query_p0/sql_functions/string_functions/test_url_functions.out
new file mode 100644
index 00000000000..ce1ef717975
--- /dev/null
+++
b/regression-test/data/query_p0/sql_functions/string_functions/test_url_functions.out
@@ -0,0 +1,121 @@
+-- This file is automatically generated. You should know what you did if you
want to edit this
+-- !empty_nullable1 --
+
+-- !empty_nullable2 --
+
+-- !empty_nullable3 --
+
+-- !empty_not_nullable1 --
+
+-- !empty_not_nullable2 --
+
+-- !empty_not_nullable3 --
+
+-- !empty_null1 --
+\N
+
+-- !empty_null2 --
+\N
+
+-- !empty_null3 --
+\N
+
+-- !empty_const1 --
+com
+
+-- !empty_const2 --
+baidu
+
+-- !empty_const3 --
+baidu.com
+
+-- !empty_const4 --
+cn
+
+-- !empty_const5 --
+google
+
+-- !empty_const6 --
+google.com.cn
+
+-- !empty_const7 --
+
+
+-- !empty_const8 --
+
+
+-- !empty_const9 --
+
+
+-- !nullable1 --
+1 www.baidu.com com
+10 https://news.clickhouse.com.tr/ tr
+2 www.google.com.cn cn
+3 invalid url
+4
+5
+6 \N \N
+7 xxxxxxxx
+8 http://www.example.com/a/b/c?a=b com
+9 https://news.clickhouse.com/ com
+
+-- !nullable2 --
+1 www.baidu.com baidu
+10 https://news.clickhouse.com.tr/ clickhouse
+2 www.google.com.cn google
+3 invalid url
+4
+5
+6 \N \N
+7 xxxxxxxx
+8 http://www.example.com/a/b/c?a=b example
+9 https://news.clickhouse.com/ clickhouse
+
+-- !nullable3 --
+1 www.baidu.com baidu.com
+10 https://news.clickhouse.com.tr/ clickhouse.com.tr
+2 www.google.com.cn google.com.cn
+3 invalid url
+4
+5
+6 \N \N
+7 xxxxxxxx
+8 http://www.example.com/a/b/c?a=b example.com
+9 https://news.clickhouse.com/ clickhouse.com
+
+-- !not_nullable1 --
+1 www.baidu.com com
+10 https://news.clickhouse.com.tr/ tr
+2 www.google.com.cn cn
+3 invalid url
+4
+5
+6
+7 xxxxxxxx
+8 http://www.example.com/a/b/c?a=b com
+9 https://news.clickhouse.com/ com
+
+-- !not_nullable2 --
+1 www.baidu.com baidu
+10 https://news.clickhouse.com.tr/ clickhouse
+2 www.google.com.cn google
+3 invalid url
+4
+5
+6
+7 xxxxxxxx
+8 http://www.example.com/a/b/c?a=b example
+9 https://news.clickhouse.com/ clickhouse
+
+-- !not_nullable3 --
+1 www.baidu.com baidu.com
+10 https://news.clickhouse.com.tr/ clickhouse.com.tr
+2 www.google.com.cn google.com.cn
+3 invalid url
+4
+5
+6
+7 xxxxxxxx
+8 http://www.example.com/a/b/c?a=b example.com
+9 https://news.clickhouse.com/ clickhouse.com
+
diff --git
a/regression-test/suites/query_p0/sql_functions/string_functions/test_url_functions.groovy
b/regression-test/suites/query_p0/sql_functions/string_functions/test_url_functions.groovy
new file mode 100644
index 00000000000..389020b63e2
--- /dev/null
+++
b/regression-test/suites/query_p0/sql_functions/string_functions/test_url_functions.groovy
@@ -0,0 +1,79 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+suite("test_url_functions") {
+ sql " drop table if exists test_url_functions"
+ sql """
+ create table test_url_functions (
+ id int,
+ s1 string not null,
+ s2 string null
+ )
+ DISTRIBUTED BY HASH(id)
+ PROPERTIES
+ (
+ "replication_num" = "1"
+ );
+ """
+
+ //empty table
+ order_qt_empty_nullable1 "select top_level_domain(s2) from
test_url_functions"
+ order_qt_empty_nullable2 "select first_significant_subdomain(s2) from
test_url_functions"
+ order_qt_empty_nullable3 "select cut_to_first_significant_subdomain(s2)
from test_url_functions"
+ order_qt_empty_not_nullable1 "select top_level_domain(s1) from
test_url_functions"
+ order_qt_empty_not_nullable2 "select first_significant_subdomain(s1) from
test_url_functions"
+ order_qt_empty_not_nullable3 "select
cut_to_first_significant_subdomain(s1) from test_url_functions"
+
+ //null / const
+ order_qt_empty_null1 "select top_level_domain(NULL)"
+ order_qt_empty_null2 "select first_significant_subdomain(NULL)"
+ order_qt_empty_null3 "select cut_to_first_significant_subdomain(NULL)"
+
+ //valid url
+ order_qt_empty_const1 "select top_level_domain('www.baidu.com')"
+ order_qt_empty_const2 "select first_significant_subdomain('www.baidu.com')"
+ order_qt_empty_const3 "select
cut_to_first_significant_subdomain('www.baidu.com')"
+ order_qt_empty_const4 "select top_level_domain('www.google.com.cn')"
+ order_qt_empty_const5 "select
first_significant_subdomain('www.google.com.cn')"
+ order_qt_empty_const6 "select
cut_to_first_significant_subdomain('www.google.com.cn')"
+
+ //invalid url
+ order_qt_empty_const7 "select top_level_domain('I am invaild url')"
+ order_qt_empty_const8 "select first_significant_subdomain('I am invaild
url')"
+ order_qt_empty_const9 "select cut_to_first_significant_subdomain('I am
invaild url')"
+
+
+ sql """ insert into test_url_functions values (1, 'www.baidu.com',
'www.baidu.com'); """
+ sql """ insert into test_url_functions values (2, 'www.google.com.cn',
'www.google.com.cn'); """
+ sql """ insert into test_url_functions values (3, 'invalid url', 'invalid
url'); """
+ sql """ insert into test_url_functions values (4, '', ''); """
+ sql """ insert into test_url_functions values (5, ' ', ' '); """
+ sql """ insert into test_url_functions values (6, ' ', NULL); """
+ sql """ insert into test_url_functions values (7, 'xxxxxxxx', 'xxxxxxxx');
"""
+ sql """ insert into test_url_functions values (8,
'http://www.example.com/a/b/c?a=b', 'http://www.example.com/a/b/c?a=b'); """
+ sql """ insert into test_url_functions values (9,
'https://news.clickhouse.com/', 'https://news.clickhouse.com/'); """
+ sql """ insert into test_url_functions values (10,
'https://news.clickhouse.com.tr/', 'https://news.clickhouse.com.tr/'); """
+
+ order_qt_nullable1 "select id,s2,top_level_domain(s2) from
test_url_functions order by id"
+ order_qt_nullable2 "select id,s2,first_significant_subdomain(s2) from
test_url_functions order by id"
+ order_qt_nullable3 "select id,s2,cut_to_first_significant_subdomain(s2)
from test_url_functions order by id"
+
+ order_qt_not_nullable1 "select id,s1,top_level_domain(s1) from
test_url_functions order by id"
+ order_qt_not_nullable2 "select id,s1,first_significant_subdomain(s1) from
test_url_functions order by id"
+ order_qt_not_nullable3 "select
id,s1,cut_to_first_significant_subdomain(s1) from test_url_functions order by
id"
+
+}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]