mrhhsg commented on code in PR #63309:
URL: https://github.com/apache/doris/pull/63309#discussion_r3274781463


##########
be/src/util/jsonb_parser_simd.h:
##########
@@ -281,54 +289,151 @@ struct JsonbParser {
         case simdjson::ondemand::number_type::signed_integer:
         case simdjson::ondemand::number_type::unsigned_integer: {
             int128_t val = num.is_int64() ? (int128_t)num.get_int64() : 
(int128_t)num.get_uint64();
-            bool success = false;
-            if (val >= std::numeric_limits<int8_t>::min() &&
-                val <= std::numeric_limits<int8_t>::max()) {
-                success = writer.writeInt8((int8_t)val);
-            } else if (val >= std::numeric_limits<int16_t>::min() &&
-                       val <= std::numeric_limits<int16_t>::max()) {
-                success = writer.writeInt16((int16_t)val);
-            } else if (val >= std::numeric_limits<int32_t>::min() &&
-                       val <= std::numeric_limits<int32_t>::max()) {
-                success = writer.writeInt32((int32_t)val);
-            } else if (val >= std::numeric_limits<int64_t>::min() &&
-                       val <= std::numeric_limits<int64_t>::max()) {
-                success = writer.writeInt64((int64_t)val);
-            } else { // INT128
-                success = writer.writeInt128(val);
+            RETURN_IF_ERROR(write_int128(val, writer));
+            break;
+        }
+        case simdjson::ondemand::number_type::big_integer: {
+            RETURN_IF_ERROR(write_number_from_raw_json(raw_string, writer));
+            break;
+        }
+        }
+        return Status::OK();
+    }
+
+    static bool is_json_number_space(char c) {
+        return c == ' ' || c == '\t' || c == '\n' || c == '\r';
+    }
+
+    static std::string_view trim_json_number(std::string_view raw_number) {
+        while (!raw_number.empty() && 
is_json_number_space(raw_number.front())) {
+            raw_number.remove_prefix(1);
+        }
+        while (!raw_number.empty() && is_json_number_space(raw_number.back())) 
{
+            raw_number.remove_suffix(1);
+        }
+        return raw_number;
+    }
+
+    static bool is_json_number_digit(char c) { return c >= '0' && c <= '9'; }
+
+    static Status validate_json_number(std::string_view raw_number, bool& 
is_integer) {
+        if (raw_number.empty()) {
+            return Status::InvalidArgument("empty number");
+        }
+
+        size_t pos = 0;
+        if (raw_number[pos] == '-') {
+            ++pos;
+            if (pos == raw_number.size()) {
+                return Status::InvalidArgument("invalid number, raw string is: 
" +
+                                               std::string(raw_number));
             }
+        }
 
-            if (!success) {
-                return Status::InvalidArgument("writeInt failed");
+        if (raw_number[pos] == '0') {
+            ++pos;
+        } else if (raw_number[pos] >= '1' && raw_number[pos] <= '9') {
+            while (pos < raw_number.size() && 
is_json_number_digit(raw_number[pos])) {
+                ++pos;
             }
-            break;
+        } else {
+            return Status::InvalidArgument("invalid number, raw string is: " +
+                                           std::string(raw_number));
         }
-        case simdjson::ondemand::number_type::big_integer: {
-            StringParser::ParseResult result;
-            auto val = 
StringParser::string_to_int<int128_t>(raw_string.data(), raw_string.size(),
+
+        bool has_fraction = false;
+        if (pos < raw_number.size() && raw_number[pos] == '.') {
+            has_fraction = true;
+            ++pos;
+            if (pos == raw_number.size() || 
!is_json_number_digit(raw_number[pos])) {
+                return Status::InvalidArgument("invalid number, raw string is: 
" +
+                                               std::string(raw_number));
+            }
+            while (pos < raw_number.size() && 
is_json_number_digit(raw_number[pos])) {
+                ++pos;
+            }
+        }
+
+        bool has_exponent = false;
+        if (pos < raw_number.size() && (raw_number[pos] == 'e' || 
raw_number[pos] == 'E')) {
+            has_exponent = true;
+            ++pos;
+            if (pos < raw_number.size() && (raw_number[pos] == '+' || 
raw_number[pos] == '-')) {
+                ++pos;
+            }
+            if (pos == raw_number.size() || 
!is_json_number_digit(raw_number[pos])) {
+                return Status::InvalidArgument("invalid number, raw string is: 
" +
+                                               std::string(raw_number));
+            }
+            while (pos < raw_number.size() && 
is_json_number_digit(raw_number[pos])) {
+                ++pos;
+            }
+        }
+
+        if (pos != raw_number.size()) {
+            return Status::InvalidArgument("simdjson parse exception: trailing 
content");
+        }
+        is_integer = !has_fraction && !has_exponent;
+        return Status::OK();
+    }
+
+    static Status write_int128(int128_t val, JsonbWriter& writer) {
+        bool success = false;
+        if (val >= std::numeric_limits<int8_t>::min() &&
+            val <= std::numeric_limits<int8_t>::max()) {
+            success = writer.writeInt8((int8_t)val);
+        } else if (val >= std::numeric_limits<int16_t>::min() &&
+                   val <= std::numeric_limits<int16_t>::max()) {
+            success = writer.writeInt16((int16_t)val);
+        } else if (val >= std::numeric_limits<int32_t>::min() &&
+                   val <= std::numeric_limits<int32_t>::max()) {
+            success = writer.writeInt32((int32_t)val);
+        } else if (val >= std::numeric_limits<int64_t>::min() &&
+                   val <= std::numeric_limits<int64_t>::max()) {
+            success = writer.writeInt64((int64_t)val);
+        } else { // INT128
+            success = writer.writeInt128(val);
+        }
+
+        if (!success) {
+            return Status::InvalidArgument("writeInt failed");
+        }
+        return Status::OK();
+    }
+
+    static Status write_number_from_raw_json(const char* pch, size_t len, 
JsonbWriter& writer) {
+        return write_number_from_raw_json(std::string_view(pch, len), writer);
+    }
+
+    // According to https://github.com/simdjson/simdjson/pull/2139
+    // For numbers larger than 64 bits, we can obtain the raw_json_token and 
parse it ourselves.
+    // This allows handling numbers larger than 64 bits, such as int128.
+    static Status write_number_from_raw_json(std::string_view raw_number, 
JsonbWriter& writer) {
+        raw_number = trim_json_number(raw_number);
+        bool is_integer = false;
+        RETURN_IF_ERROR(validate_json_number(raw_number, is_integer));
+
+        StringParser::ParseResult result;
+        if (is_integer) {
+            auto val = 
StringParser::string_to_int<int128_t>(raw_number.data(), raw_number.size(),
                                                              &result);
-            if (result != StringParser::PARSE_SUCCESS) {
-                // If the string exceeds the range of int128_t, it will 
attempt to convert it to double.
-                // This may result in loss of precision, but for JSON, 
exchanging data as plain text between different systems may inherently cause 
precision loss.
-                // try parse as double
-                double double_val = StringParser::string_to_float<double>(
-                        raw_string.data(), raw_string.size(), &result);
-                if (result != StringParser::PARSE_SUCCESS) {
-                    // if both parse failed, return error
-                    return Status::InvalidArgument("invalid number, raw string 
is: " +
-                                                   std::string(raw_string));
-                }
-                if (!writer.writeDouble(double_val)) {
-                    return Status::InvalidArgument("writeDouble failed");
-                }
-            } else {
-                // as int128_t
-                if (!writer.writeInt128(val)) {
-                    return Status::InvalidArgument("writeInt128 failed");
-                }
+            if (result == StringParser::PARSE_SUCCESS) {
+                RETURN_IF_ERROR(write_int128(val, writer));
+                return Status::OK();
             }
-            break;
         }
+
+        // If the string exceeds the range of int128_t, it will attempt to 
convert it to double.
+        // This may result in loss of precision, but for JSON, exchanging data 
as plain text
+        // between different systems may inherently cause precision loss.
+        double double_val = 
StringParser::string_to_float<double>(raw_number.data(),
+                                                                  
raw_number.size(), &result);
+        if (result != StringParser::PARSE_SUCCESS) {
+            return Status::InvalidArgument("invalid number, raw string is: " +

Review Comment:
   Adding a `!std::isfinite(double_val)` check to this condition would ensure the parsed double value is finite (i.e. reject infinity and NaN).



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to