(couchdb-jiffy) 02/09: Simplify and speed up number parsing

vatamane Sat, 25 Apr 2026 09:22:41 -0700

This is an automated email from the ASF dual-hosted git repository.

nickva pushed a commit to tag 2.0.0
in repository https://gitbox.apache.org/repos/asf/couchdb-jiffy.git


commit 514d5485efdcdda5d499d25eead78b3af2324f3a
Author: Nick Vatamaniuc <[email protected]>
AuthorDate: Wed Apr 22 00:45:59 2026 -0400

    Simplify and speed up number parsing
    
    With the ffc_parse_json_number [1] patch, and consolidating overflow atoms
    [2] in jiffy, we can now have our cake and eat it, too; that is, we simplify
    and speed up number decoding at the same time.
    
    [1] https://github.com/kolemannix/ffc.h/pull/22
    [2] https://github.com/davisp/jiffy/pull/284
---
 c_src/decoder.c                         | 305 +++-----------------------------
 c_src/ffc.h                             |  86 ++++++++-
 test/cases/leading_zero_in_number.eterm |   2 +-
 3 files changed, 105 insertions(+), 288 deletions(-)

diff --git a/c_src/decoder.c b/c_src/decoder.c
index 3617b41..4915241 100644
--- a/c_src/decoder.c
+++ b/c_src/decoder.c
@@ -25,18 +25,6 @@ enum {
     st_invalid
 } JsonState;
 
-enum {
-    nst_init=0,
-    nst_sign,
-    nst_mantissa,
-    nst_frac0,
-    nst_frac1,
-    nst_frac,
-    nst_esign,
-    nst_echeck,
-    nst_edigit
-} JsonNumState;
-
 typedef struct {
     ErlNifEnv*      env;
     jiffy_st*       atoms;
@@ -371,292 +359,43 @@ parse:
 static int
 dec_number(Decoder* d, ERL_NIF_TERM* value)
 {
-    ERL_NIF_TERM num_type = d->atoms->atom_error;
-    char state = nst_init;
-    int is_real = 0;
-    double dval;
-    int64_t lval;
-
-    // Use the same trick as did for dec_string. The restrict qualifier hints
-    // to the compiler p won't alias any other pointers so it can optimize
-    // access to it. Also avoid writing back do d->i on every increment,
-    // instead increment a local variable (hopefully in a register) then update
-    // d->i once at the end. Also, when parsing looping states (mantissa, frac,
-    // edigit) scan-ahead quickly looking for strings of digits only. The wins
-    // will not be as big as we have for strings as most numbers are not that
-    // long, but it shouldn't hurt either.
+    // ffc validates, parses, and picks int-vs-double in a single call
     const unsigned char* JIFFY_RESTRICT p = d->p;
-    const size_t len = d->len;
     const size_t start = d->i;
-    size_t idx = start;
-    while(idx < len) {
-        switch(state) {
-            case nst_init:
-                switch(p[idx]) {
-                    case '-':
-                        state = nst_sign;
-                        idx++;
-                        break;
-                    case '0':
-                        state = nst_frac0;
-                        idx++;
-                        break;
-                    case '1':
-                    case '2':
-                    case '3':
-                    case '4':
-                    case '5':
-                    case '6':
-                    case '7':
-                    case '8':
-                    case '9':
-                        state = nst_mantissa;
-                        idx++;
-                        break;
-                    default:
-                        assert(0 && "this state should be unreachable"); // 
LCOV_EXCL_LINE
-                }
-                break;
-
-            case nst_sign:
-                switch(p[idx]) {
-                    case '0':
-                        state = nst_frac0;
-                        idx++;
-                        break;
-                    case '1':
-                    case '2':
-                    case '3':
-                    case '4':
-                    case '5':
-                    case '6':
-                    case '7':
-                    case '8':
-                    case '9':
-                        state = nst_mantissa;
-                        idx++;
-                        break;
-                    default:
-                      goto error;
-                }
-                break;
-
-            case nst_mantissa:
-                switch(p[idx]) {
-                    case '.':
-                        state = nst_frac1;
-                        idx++;
-                        break;
-                    case 'e':
-                    case 'E':
-                        state = nst_esign;
-                        idx++;
-                        break;
-                    case '0':
-                    case '1':
-                    case '2':
-                    case '3':
-                    case '4':
-                    case '5':
-                    case '6':
-                    case '7':
-                    case '8':
-                    case '9':
-                        while(idx < len && p[idx] >= '0' && p[idx] <= '9') {
-                            idx++;
-                        }
-                        break;
-                    default:
-                        goto parse;
-                }
-                break;
-
-            case nst_frac0:
-                switch(p[idx]) {
-                    case '.':
-                        state = nst_frac1;
-                        idx++;
-                        break;
-                    case 'e':
-                    case 'E':
-                        state = nst_esign;
-                        idx++;
-                        break;
-                    default:
-                        goto parse;
-                }
-                break;
-
-            case nst_frac1:
-                is_real = 1;
-                switch(p[idx]) {
-                    case '0':
-                    case '1':
-                    case '2':
-                    case '3':
-                    case '4':
-                    case '5':
-                    case '6':
-                    case '7':
-                    case '8':
-                    case '9':
-                        state = nst_frac;
-                        idx++;
-                        break;
-                    default:
-                        goto parse;
-                }
-                break;
-
-            case nst_frac:
-                switch(p[idx]) {
-                    case 'e':
-                    case 'E':
-                        state = nst_esign;
-                        idx++;
-                        break;
-                    case '0':
-                    case '1':
-                    case '2':
-                    case '3':
-                    case '4':
-                    case '5':
-                    case '6':
-                    case '7':
-                    case '8':
-                    case '9':
-                        while(idx < len && p[idx] >= '0' && p[idx] <= '9') {
-                            idx++;
-                        }
-                        break;
-                    default:
-                        goto parse;
-                }
-                break;
-
-            case nst_esign:
-                is_real = 1;
-                switch(p[idx]) {
-                    case '-':
-                    case '+':
-                        state = nst_echeck;
-                        idx++;
-                        break;
-                    case '0':
-                    case '1':
-                    case '2':
-                    case '3':
-                    case '4':
-                    case '5':
-                    case '6':
-                    case '7':
-                    case '8':
-                    case '9':
-                        state = nst_edigit;
-                        idx++;
-                        break;
-                    default:
-                        goto error;
-                }
-                break;
-
-             case nst_echeck:
-                switch(p[idx]) {
-                    case '0':
-                    case '1':
-                    case '2':
-                    case '3':
-                    case '4':
-                    case '5':
-                    case '6':
-                    case '7':
-                    case '8':
-                    case '9':
-                        state = nst_edigit;
-                        while(idx < len && p[idx] >= '0' && p[idx] <= '9') {
-                            idx++;
-                        }
-                        break;
-                    default:
-                        goto parse;
-                }
-                break;
+    const char* nstart = (const char*)&p[start];
+    const char* nend_max = (const char*)&p[d->len];
 
-            case nst_edigit:
-                switch(p[idx]) {
-                    case '0':
-                    case '1':
-                    case '2':
-                    case '3':
-                    case '4':
-                    case '5':
-                    case '6':
-                    case '7':
-                    case '8':
-                    case '9':
-                        while(idx < len && p[idx] >= '0' && p[idx] <= '9') {
-                            idx++;
-                        }
-                        break;
-                    default:
-                        goto parse;
-                }
-                break;
+    ffc_json_number jn;
+    ffc_result r = ffc_parse_json_number(nstart, nend_max, &jn);
 
-            default:
-                goto error;
-        }
-    }
+    // After parsing, r.ptr points to where parsing stopped:
+    //   OK - first byte past the number
+    //   OUT_OF_RANGE - first byte past the number (same span, just doesn't fit)
+    //   INVALID - the offending byte
+    d->i = start + (size_t)(r.ptr - nstart);
 
-parse:
-    d->i = idx;
-
-    switch(state) {
-        case nst_init:
-        case nst_sign:
-        case nst_frac1:
-        case nst_esign:
-        case nst_echeck:
-            return 0;
-        default:
-            break;
+    if(r.outcome == FFC_OUTCOME_INVALID_INPUT) {
+        return 0;
     }
 
-    // Use ffc.h to parse numbers. It parses direclty from the stream no need
-    // to allocate a separate buffer as with strtod and strtol. The state
-    // machine already validated the syntax, so parse-level errors shouldn't
-    // occur here, only FFC_OUTCOME_OUT_OF_RANGE. If the range erro happens we
-    // fall back to Erlang with handle big numbers.
-    const char* nstart = (const char*)&p[start];
-    const char* nend = (const char*)&p[d->i];
-    const size_t num_len = d->i - start;
-
-    if(is_real) {
-        ffc_parse_options opts = {FFC_PRESET_JSON, '.'};
-        ffc_result res = ffc_from_chars_double_options(nstart, nend, &dval, 
opts);
-        if(res.outcome == FFC_OUTCOME_OK) {
-            *value = enif_make_double(d->env, dval);
-            return 1;
-        }
-    } else {
-        ffc_result res = ffc_parse_i64(num_len, nstart, 10, &lval);
-        if(res.outcome == FFC_OUTCOME_OK) {
-            *value = enif_make_int64(d->env, lval);
-            return 1;
+    if(r.outcome == FFC_OUTCOME_OK) {
+        if(jn.kind == FFC_JSON_NUM_KIND_INT64) {
+            *value = enif_make_int64(d->env, jn.value.i64);
+        } else {
+            *value = enif_make_double(d->env, jn.value.f64);
         }
+        return 1;
     }
 
-    // Let Erlang handle out-of-range cases
-    num_type = is_real ? d->atoms->atom_bigdbl : d->atoms->atom_bignum;
+    ERL_NIF_TERM num_type = (jn.kind == FFC_JSON_NUM_KIND_INT64)
+        ? d->atoms->atom_bignum
+        : d->atoms->atom_bigdbl;
 
     d->is_partial = 1;
+    const size_t num_len = (size_t)(r.ptr - nstart);
     *value = enif_make_sub_binary(d->env, d->arg, start, num_len);
     *value = enif_make_tuple2(d->env, num_type, *value);
     return 1;
-
-error:
-    d->i = idx;
-    return 0;
 }
 
 static ERL_NIF_TERM
diff --git a/c_src/ffc.h b/c_src/ffc.h
index 8d40697..056b6f9 100644
--- a/c_src/ffc.h
+++ b/c_src/ffc.h
@@ -253,6 +253,32 @@ uint64_t ffc_parse_u64_simple(size_t len, const char 
*input, int base, ffc_outco
 int32_t  ffc_parse_i32_simple(size_t len, const char *input, int base, 
ffc_outcome *outcome);
 uint32_t ffc_parse_u32_simple(size_t len, const char *input, int base, 
ffc_outcome *outcome);
 
+/**
+ * Parse a JSON number from the range [start, end) and return an int64_t or a double
+ *
+ * If the outcome is FFC_OUTCOME_OK
+ *  If kind == FFC_JSON_NUM_KIND_INT64, value will be an int64
+ *  If kind == FFC_JSON_NUM_KIND_DOUBLE, value will be a double
+ *
+ * The returned ffc_result's ptr points at the byte where parsing stopped
+ */
+
+typedef uint32_t ffc_json_number_kind;
+enum ffc_json_number_kind_bits {
+  FFC_JSON_NUM_KIND_INT64  = 0,
+  FFC_JSON_NUM_KIND_DOUBLE = 1,
+};
+
+typedef struct ffc_json_number {
+  ffc_json_number_kind kind;
+  union {
+    int64_t i64;
+    double  f64;
+  } value;
+} ffc_json_number;
+
+ffc_result ffc_parse_json_number(const char *start, const char *end, 
ffc_json_number *out);
+
 #endif // FFC_API
 
 #ifdef FFC_IMPL
@@ -1271,13 +1297,16 @@ ffc_parsed ffc_parse_number_string(
       ++p;
     }
     if ((p == pend) || !ffc_is_integer(*p)) {
-      if (!(uint64_t)(fmt & FFC_FORMAT_FLAG_FIXED)) {
+      if (basic_json_fmt || !(uint64_t)(fmt & FFC_FORMAT_FLAG_FIXED)) {
         // The exponential part is invalid for scientific notation, so it must
         // be a trailing token for fixed notation. However, fixed notation is
-        // disabled, so report a scientific notation error.
+        // disabled, so report a scientific notation error. JSON mode is strict
+        // for the scientific form (exp = e [ minus / plus ] 1*DIGIT in RFC
+        // 8259) so we also report the error, even though FIXED is part of
+        // FFC_PRESET_JSON.
         return ffc_report_parse_error(p, 
FFC_PARSE_OUTCOME_MISSING_EXPONENTIAL_PART);
       }
-      // Otherwise, we will be ignoring the 'e'.
+      // Otherwise (fixed-tolerant, non-JSON), we will be ignoring the 'e'.
       p = location_of_e;
     } else {
       while ((p != pend) && ffc_is_integer(*p)) {
@@ -3200,7 +3229,56 @@ uint32_t ffc_parse_u32_simple(size_t len, const char 
*input, int base, ffc_outco
   return out;
 }
 
-#undef FFC_DOUBLE_SMALLEST_POWER_OF_10        
+ffc_result ffc_parse_json_number(const char *start, const char *end,
+                                 ffc_json_number *out) {
+  ffc_result answer;
+
+  if (start == end) {
+    answer.ptr = (char *)start;
+    answer.outcome = FFC_OUTCOME_INVALID_INPUT;
+    return answer;
+  }
+
+  ffc_parse_options opts;
+  opts.format = FFC_PRESET_JSON;
+  opts.decimal_point = '.';
+
+  ffc_parsed pns = ffc_parse_number_string(start, end, opts, true);
+
+  if (!pns.valid) {
+    answer.ptr = (char *)pns.lastmatch;
+    answer.outcome = FFC_OUTCOME_INVALID_INPUT;
+    return answer;
+  }
+
+  // INT64 or DOUBLE?
+  // For an integer, the bytes consumed past the sign should be just digits
+  // If we see '.' then `fraction_part_start` is not NULL
+  // If we see e/E then the consumed span is > int_part_len (e + $digit)
+  // If either of the above is true then we have a DOUBLE
+  size_t consumed = (size_t)(pns.lastmatch - start) - (pns.negative ? 1 : 0);
+  bool is_integer = (pns.fraction_part_start == NULL) && (consumed == 
pns.int_part_len);
+
+  ffc_result r;
+  if (is_integer) {
+    ffc_int_value v = {0};
+    r = ffc_parse_int_string(start, end, &v, FFC_INT_KIND_S64, opts, 10);
+    out->kind = FFC_JSON_NUM_KIND_INT64;
+    if (r.outcome == FFC_OUTCOME_OK) {
+      out->value.i64 = v.s64;
+    }
+  } else {
+    ffc_value v = {0};
+    r = ffc_from_chars_advanced(pns, &v, FFC_VALUE_KIND_DOUBLE);
+    out->kind = FFC_JSON_NUM_KIND_DOUBLE;
+    if (r.outcome == FFC_OUTCOME_OK) {
+      out->value.f64 = v.d;
+    }
+  }
+  return r;
+}
+
+#undef FFC_DOUBLE_SMALLEST_POWER_OF_10
 #undef FFC_DOUBLE_LARGEST_POWER_OF_10         
 #undef FFC_DOUBLE_SIGN_INDEX                  
 #undef FFC_DOUBLE_INFINITE_POWER              
diff --git a/test/cases/leading_zero_in_number.eterm 
b/test/cases/leading_zero_in_number.eterm
index 5bc5d8c..e184037 100644
--- a/test/cases/leading_zero_in_number.eterm
+++ b/test/cases/leading_zero_in_number.eterm
@@ -1 +1 @@
-{error,{17,invalid_json}}.
+{error,{16,invalid_number}}.

(couchdb-jiffy) 02/09: Simplify and speed up number parsing

Reply via email to