This is an automated email from the ASF dual-hosted git repository. nickva pushed a commit to tag 2.0.0 in repository https://gitbox.apache.org/repos/asf/couchdb-jiffy.git
commit 514d5485efdcdda5d499d25eead78b3af2324f3a Author: Nick Vatamaniuc <[email protected]> AuthorDate: Wed Apr 22 00:45:59 2026 -0400 Simplify and speed up number parsing With the ffc_parse_json_number [1] patch, and consolidating overflow atoms [2] in jiffy, we can now have our cake and eat it, too; that is we simplify and speed-up number decoding at the same time. [1] https://github.com/kolemannix/ffc.h/pull/22 [2] https://github.com/davisp/jiffy/pull/284 --- c_src/decoder.c | 305 +++----------------------------- c_src/ffc.h | 86 ++++++++- test/cases/leading_zero_in_number.eterm | 2 +- 3 files changed, 105 insertions(+), 288 deletions(-) diff --git a/c_src/decoder.c b/c_src/decoder.c index 3617b41..4915241 100644 --- a/c_src/decoder.c +++ b/c_src/decoder.c @@ -25,18 +25,6 @@ enum { st_invalid } JsonState; -enum { - nst_init=0, - nst_sign, - nst_mantissa, - nst_frac0, - nst_frac1, - nst_frac, - nst_esign, - nst_echeck, - nst_edigit -} JsonNumState; - typedef struct { ErlNifEnv* env; jiffy_st* atoms; @@ -371,292 +359,43 @@ parse: static int dec_number(Decoder* d, ERL_NIF_TERM* value) { - ERL_NIF_TERM num_type = d->atoms->atom_error; - char state = nst_init; - int is_real = 0; - double dval; - int64_t lval; - - // Use the same trick as did for dec_string. The restrict qualifier hints - // to the compiler p won't alias any other pointers so it can optimize - // access to it. Also avoid writing back do d->i on every increment, - // instead increment a local variable (hopefully in a register) then update - // d->i once at the end. Also, when parsing looping states (mantissa, frac, - // edigit) scan-ahead quickly looking for strings of digits only. The wins - // will not be as big as we have for strings as most numbers are not that - // long, but it shouldn't hurt either. 
+ // ffc validates, parses, and picks int-vs-double in a single call const unsigned char* JIFFY_RESTRICT p = d->p; - const size_t len = d->len; const size_t start = d->i; - size_t idx = start; - while(idx < len) { - switch(state) { - case nst_init: - switch(p[idx]) { - case '-': - state = nst_sign; - idx++; - break; - case '0': - state = nst_frac0; - idx++; - break; - case '1': - case '2': - case '3': - case '4': - case '5': - case '6': - case '7': - case '8': - case '9': - state = nst_mantissa; - idx++; - break; - default: - assert(0 && "this state should be unreachable"); // LCOV_EXCL_LINE - } - break; - - case nst_sign: - switch(p[idx]) { - case '0': - state = nst_frac0; - idx++; - break; - case '1': - case '2': - case '3': - case '4': - case '5': - case '6': - case '7': - case '8': - case '9': - state = nst_mantissa; - idx++; - break; - default: - goto error; - } - break; - - case nst_mantissa: - switch(p[idx]) { - case '.': - state = nst_frac1; - idx++; - break; - case 'e': - case 'E': - state = nst_esign; - idx++; - break; - case '0': - case '1': - case '2': - case '3': - case '4': - case '5': - case '6': - case '7': - case '8': - case '9': - while(idx < len && p[idx] >= '0' && p[idx] <= '9') { - idx++; - } - break; - default: - goto parse; - } - break; - - case nst_frac0: - switch(p[idx]) { - case '.': - state = nst_frac1; - idx++; - break; - case 'e': - case 'E': - state = nst_esign; - idx++; - break; - default: - goto parse; - } - break; - - case nst_frac1: - is_real = 1; - switch(p[idx]) { - case '0': - case '1': - case '2': - case '3': - case '4': - case '5': - case '6': - case '7': - case '8': - case '9': - state = nst_frac; - idx++; - break; - default: - goto parse; - } - break; - - case nst_frac: - switch(p[idx]) { - case 'e': - case 'E': - state = nst_esign; - idx++; - break; - case '0': - case '1': - case '2': - case '3': - case '4': - case '5': - case '6': - case '7': - case '8': - case '9': - while(idx < len && p[idx] >= '0' && p[idx] <= '9') { - 
idx++; - } - break; - default: - goto parse; - } - break; - - case nst_esign: - is_real = 1; - switch(p[idx]) { - case '-': - case '+': - state = nst_echeck; - idx++; - break; - case '0': - case '1': - case '2': - case '3': - case '4': - case '5': - case '6': - case '7': - case '8': - case '9': - state = nst_edigit; - idx++; - break; - default: - goto error; - } - break; - - case nst_echeck: - switch(p[idx]) { - case '0': - case '1': - case '2': - case '3': - case '4': - case '5': - case '6': - case '7': - case '8': - case '9': - state = nst_edigit; - while(idx < len && p[idx] >= '0' && p[idx] <= '9') { - idx++; - } - break; - default: - goto parse; - } - break; + const char* nstart = (const char*)&p[start]; + const char* nend_max = (const char*)&p[d->len]; - case nst_edigit: - switch(p[idx]) { - case '0': - case '1': - case '2': - case '3': - case '4': - case '5': - case '6': - case '7': - case '8': - case '9': - while(idx < len && p[idx] >= '0' && p[idx] <= '9') { - idx++; - } - break; - default: - goto parse; - } - break; + ffc_json_number jn; + ffc_result r = ffc_parse_json_number(nstart, nend_max, &jn); - default: - goto error; - } - } + // After parsing r.ptr point to where parsing stops: + // OK - first byte past the number + // OUT_OF_RANGE - first byte past the number (same span, just doesn't fit) + // INVALID - the offending byte + d->i = start + (size_t)(r.ptr - nstart); -parse: - d->i = idx; - - switch(state) { - case nst_init: - case nst_sign: - case nst_frac1: - case nst_esign: - case nst_echeck: - return 0; - default: - break; + if(r.outcome == FFC_OUTCOME_INVALID_INPUT) { + return 0; } - // Use ffc.h to parse numbers. It parses direclty from the stream no need - // to allocate a separate buffer as with strtod and strtol. The state - // machine already validated the syntax, so parse-level errors shouldn't - // occur here, only FFC_OUTCOME_OUT_OF_RANGE. If the range erro happens we - // fall back to Erlang with handle big numbers. 
- const char* nstart = (const char*)&p[start]; - const char* nend = (const char*)&p[d->i]; - const size_t num_len = d->i - start; - - if(is_real) { - ffc_parse_options opts = {FFC_PRESET_JSON, '.'}; - ffc_result res = ffc_from_chars_double_options(nstart, nend, &dval, opts); - if(res.outcome == FFC_OUTCOME_OK) { - *value = enif_make_double(d->env, dval); - return 1; - } - } else { - ffc_result res = ffc_parse_i64(num_len, nstart, 10, &lval); - if(res.outcome == FFC_OUTCOME_OK) { - *value = enif_make_int64(d->env, lval); - return 1; + if(r.outcome == FFC_OUTCOME_OK) { + if(jn.kind == FFC_JSON_NUM_KIND_INT64) { + *value = enif_make_int64(d->env, jn.value.i64); + } else { + *value = enif_make_double(d->env, jn.value.f64); } + return 1; } - // Let Erlang handle out-of-range cases - num_type = is_real ? d->atoms->atom_bigdbl : d->atoms->atom_bignum; + ERL_NIF_TERM num_type = (jn.kind == FFC_JSON_NUM_KIND_INT64) + ? d->atoms->atom_bignum + : d->atoms->atom_bigdbl; d->is_partial = 1; + const size_t num_len = (size_t)(r.ptr - nstart); *value = enif_make_sub_binary(d->env, d->arg, start, num_len); *value = enif_make_tuple2(d->env, num_type, *value); return 1; - -error: - d->i = idx; - return 0; } static ERL_NIF_TERM diff --git a/c_src/ffc.h b/c_src/ffc.h index 8d40697..056b6f9 100644 --- a/c_src/ffc.h +++ b/c_src/ffc.h @@ -253,6 +253,32 @@ uint64_t ffc_parse_u64_simple(size_t len, const char *input, int base, ffc_outco int32_t ffc_parse_i32_simple(size_t len, const char *input, int base, ffc_outcome *outcome); uint32_t ffc_parse_u32_simple(size_t len, const char *input, int base, ffc_outcome *outcome); +/** + * Parse a JSON number from the range [start, end) and return an int64_t or a double + * + * If the outcome is FFC_OUTCOME_OK + * If kind == FFC_JSON_NUM_KIND_INT64, value will be an int64 + * If kind == FFC_JSON_NUM_KIND_DOUBLE, value will be a double + * + * The returned ffc_result's ptr points at the byte where parsing stopped + */ + +typedef uint32_t 
ffc_json_number_kind; +enum ffc_json_number_kind_bits { + FFC_JSON_NUM_KIND_INT64 = 0, + FFC_JSON_NUM_KIND_DOUBLE = 1, +}; + +typedef struct ffc_json_number { + ffc_json_number_kind kind; + union { + int64_t i64; + double f64; + } value; +} ffc_json_number; + +ffc_result ffc_parse_json_number(const char *start, const char *end, ffc_json_number *out); + #endif // FFC_API #ifdef FFC_IMPL @@ -1271,13 +1297,16 @@ ffc_parsed ffc_parse_number_string( ++p; } if ((p == pend) || !ffc_is_integer(*p)) { - if (!(uint64_t)(fmt & FFC_FORMAT_FLAG_FIXED)) { + if (basic_json_fmt || !(uint64_t)(fmt & FFC_FORMAT_FLAG_FIXED)) { // The exponential part is invalid for scientific notation, so it must // be a trailing token for fixed notation. However, fixed notation is - // disabled, so report a scientific notation error. + // disabled, so report a scientific notation error. JSON mode is strict + // for the scientific form (exp = e [ minus / plus ] 1*DIGIT in RFC + // 8259) so we also report the error, even though FIXED is part of + // FFC_PRESET_JSON. return ffc_report_parse_error(p, FFC_PARSE_OUTCOME_MISSING_EXPONENTIAL_PART); } - // Otherwise, we will be ignoring the 'e'. + // Otherwise (fixed-tolerant, non-JSON), we will be ignoring the 'e'. 
p = location_of_e; } else { while ((p != pend) && ffc_is_integer(*p)) { @@ -3200,7 +3229,56 @@ uint32_t ffc_parse_u32_simple(size_t len, const char *input, int base, ffc_outco return out; } -#undef FFC_DOUBLE_SMALLEST_POWER_OF_10 +ffc_result ffc_parse_json_number(const char *start, const char *end, + ffc_json_number *out) { + ffc_result answer; + + if (start == end) { + answer.ptr = (char *)start; + answer.outcome = FFC_OUTCOME_INVALID_INPUT; + return answer; + } + + ffc_parse_options opts; + opts.format = FFC_PRESET_JSON; + opts.decimal_point = '.'; + + ffc_parsed pns = ffc_parse_number_string(start, end, opts, true); + + if (!pns.valid) { + answer.ptr = (char *)pns.lastmatch; + answer.outcome = FFC_OUTCOME_INVALID_INPUT; + return answer; + } + + // INT64 or DOUBLE? + // For an integer bytes consumed past the sign should be just digits + // If we see '.' then `fractional_part_start` is not NULL + // If we see e/E then consumed span is > int_part_len (e + $digit) + // If both above are true then we have a DOUBLE + size_t consumed = (size_t)(pns.lastmatch - start) - (pns.negative ? 
1 : 0); + bool is_integer = (pns.fraction_part_start == NULL) && (consumed == pns.int_part_len); + + ffc_result r; + if (is_integer) { + ffc_int_value v = {0}; + r = ffc_parse_int_string(start, end, &v, FFC_INT_KIND_S64, opts, 10); + out->kind = FFC_JSON_NUM_KIND_INT64; + if (r.outcome == FFC_OUTCOME_OK) { + out->value.i64 = v.s64; + } + } else { + ffc_value v = {0}; + r = ffc_from_chars_advanced(pns, &v, FFC_VALUE_KIND_DOUBLE); + out->kind = FFC_JSON_NUM_KIND_DOUBLE; + if (r.outcome == FFC_OUTCOME_OK) { + out->value.f64 = v.d; + } + } + return r; +} + +#undef FFC_DOUBLE_SMALLEST_POWER_OF_10 #undef FFC_DOUBLE_LARGEST_POWER_OF_10 #undef FFC_DOUBLE_SIGN_INDEX #undef FFC_DOUBLE_INFINITE_POWER diff --git a/test/cases/leading_zero_in_number.eterm b/test/cases/leading_zero_in_number.eterm index 5bc5d8c..e184037 100644 --- a/test/cases/leading_zero_in_number.eterm +++ b/test/cases/leading_zero_in_number.eterm @@ -1 +1 @@ -{error,{17,invalid_json}}. +{error,{16,invalid_number}}.
