kou commented on code in PR #49421: URL: https://github.com/apache/arrow/pull/49421#discussion_r2922797151
##########
cpp/src/gandiva/gdv_string_function_stubs.cc:
##########
@@ -81,91 +83,125 @@ const char* gdv_fn_regexp_extract_utf8_utf8_int32(int64_t
ptr, int64_t holder_pt
return (*holder)(context, data, data_len, extract_index, out_length);
}
-#define GDV_FN_CAST_VARLEN_TYPE_FROM_TYPE(IN_TYPE, CAST_NAME, ARROW_TYPE)
\
- GANDIVA_EXPORT
\
- const char* gdv_fn_cast##CAST_NAME##_##IN_TYPE##_int64(
\
- int64_t context, gdv_##IN_TYPE value, int64_t len, int32_t * out_len) {
\
- if (len < 0) {
\
- gdv_fn_context_set_error_msg(context, "Buffer length cannot be
negative"); \
- *out_len = 0;
\
- return "";
\
- }
\
- if (len == 0) {
\
- *out_len = 0;
\
- return "";
\
- }
\
- arrow::internal::StringFormatter<arrow::ARROW_TYPE> formatter;
\
- char* ret = reinterpret_cast<char*>(
\
- gdv_fn_context_arena_malloc(context, static_cast<int32_t>(len)));
\
- if (ret == nullptr) {
\
- gdv_fn_context_set_error_msg(context, "Could not allocate memory");
\
- *out_len = 0;
\
- return "";
\
- }
\
- arrow::Status status = formatter(value, [&](std::string_view v) {
\
- int64_t size = static_cast<int64_t>(v.size());
\
- *out_len = static_cast<int32_t>(len < size ? len : size);
\
- memcpy(ret, v.data(), *out_len);
\
- return arrow::Status::OK();
\
- });
\
- if (!status.ok()) {
\
- std::string err = "Could not cast " + std::to_string(value) + " to
string"; \
- gdv_fn_context_set_error_msg(context, err.c_str());
\
- *out_len = 0;
\
- return "";
\
- }
\
- return ret;
\
+// The following castVARCHAR macros are optimized to allocate only the actual
+// string size instead of the maximum buffer length (which can be 65536+
bytes).
+
+// Helper: arena allocation + null check
+#define GDV_FN_CAST_VARLEN_ALLOC(SIZE)
\
+ char* ret = reinterpret_cast<char*>(gdv_fn_context_arena_malloc(context,
SIZE)); \
+ if (ret == nullptr) {
\
+ gdv_fn_context_set_error_msg(context, "Could not allocate memory");
\
+ *out_len = 0;
\
+ return "";
\
+ }
+
+// Helper: function signature + len validation
+#define GDV_FN_CAST_VARLEN_PREFIX(IN_TYPE, CAST_NAME)
\
+ GANDIVA_EXPORT
\
+ const char* gdv_fn_cast##CAST_NAME##_##IN_TYPE##_int64(
\
+ int64_t context, gdv_##IN_TYPE value, int64_t len, int32_t * out_len) {
\
+ if (len < 0) {
\
+ gdv_fn_context_set_error_msg(context, "Buffer length cannot be
negative"); \
+ *out_len = 0;
\
+ return "";
\
+ }
\
+ if (len == 0) {
\
+ *out_len = 0;
\
+ return "";
\
+ }
+
+// Macro for integer types (int32/int64). Uses optimized digit-pair conversion
+// directly into a small arena allocation (11 bytes for int32, 20 for int64).
+#define GDV_FN_CAST_VARLEN_TYPE_FROM_INTEGER(IN_TYPE, CAST_NAME, ARROW_TYPE)
\
+ GDV_FN_CAST_VARLEN_PREFIX(IN_TYPE, CAST_NAME)
\
+ constexpr int32_t max_len = std::numeric_limits<gdv_##IN_TYPE>::digits10 +
2; \
+ GDV_FN_CAST_VARLEN_ALLOC(max_len)
\
+ char* end = ret + max_len;
\
+ char* p = end;
\
+ auto uval = value < 0
\
+ ? static_cast<std::make_unsigned_t<gdv_##IN_TYPE>>(
\
+
~static_cast<std::make_unsigned_t<gdv_##IN_TYPE>>(value) + 1) \
+ : static_cast<std::make_unsigned_t<gdv_##IN_TYPE>>(value);
\
+ const char* pairs = arrow::internal::detail::digit_pairs;
\
+ while (uval >= 100) {
\
+ auto idx = (uval % 100) * 2;
\
+ uval /= 100;
\
+ *--p = pairs[idx + 1];
\
+ *--p = pairs[idx];
\
+ }
\
+ if (uval >= 10) {
\
+ auto idx = uval * 2;
\
+ *--p = pairs[idx + 1];
\
+ *--p = pairs[idx];
\
+ } else {
\
+ *--p = '0' + static_cast<char>(uval);
\
+ }
\
+ if (value < 0) {
\
+ *--p = '-';
\
+ }
\
Review Comment:
Why do we need to re-implement
https://github.com/apache/arrow/blob/669830165a26eaa8169b3f5bef07137aec6f0711/cpp/src/arrow/util/formatting.h#L151-L176
?
Can we use `FormatAllDigits()` instead?
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
