(couchdb-jiffy) 08/09: Skip-ahead for UTF8 encoding

vatamane Sat, 25 Apr 2026 09:22:47 -0700

This is an automated email from the ASF dual-hosted git repository.

nickva pushed a commit to tag 2.0.0
in repository https://gitbox.apache.org/repos/asf/couchdb-jiffy.git


commit 81f1918f680dde7b1274decdd17023112b02e577
Author: Nick Vatamaniuc <[email protected]>
AuthorDate: Thu Apr 23 14:22:57 2026 -0400

    Skip-ahead for UTF8 encoding
    
    Just like we do for ASCII-only input, this should help Unicode-heavy input.
---
 c_src/decoder.c    |  2 +-
 c_src/encoder.c    | 51 +++++++++++++++++++++++++++++++++++----------------
 c_src/jiffy_simd.h | 27 ++++++++++++++++++++++++++-
 c_src/jiffy_utf8.h | 20 ++++++++++++++++++++
 4 files changed, 82 insertions(+), 18 deletions(-)

diff --git a/c_src/decoder.c b/c_src/decoder.c
index 4915241..05524f9 100644
--- a/c_src/decoder.c
+++ b/c_src/decoder.c
@@ -260,7 +260,7 @@ dec_string(Decoder* d, ERL_NIF_TERM* value)
         } else if(d->p[d->i] < 0x80) {
             // Scan ahead plain ASCII as an optimization. The first
             // byte has already been checked, so start at i+1.
-            d->i = jiffy_scan_string_body(d->p, d->len, d->i + 1);
+            d->i = jiffy_scan_ascii_string_body(d->p, d->len, d->i + 1);
         } else {
             ulen = utf8_validate(&(d->p[d->i]), d->len - d->i);
             if(ulen == 0) {
diff --git a/c_src/encoder.c b/c_src/encoder.c
index a9644fd..79ea956 100644
--- a/c_src/encoder.c
+++ b/c_src/encoder.c
@@ -409,7 +409,7 @@ enc_quoted(Encoder* e,
                     i++;
                 }
             } else {
-                i = jiffy_scan_string_body(data, size, i);
+                i = jiffy_scan_ascii_string_body(data, size, i);
             }
             size_t run = i - start;
             if(!enc_ensure(e, run)) {
@@ -424,27 +424,46 @@ enc_quoted(Encoder* e,
                 e->i += unicode_to_utf8((int)data[i], &(e->p[e->i]));
             }
             i++;
-        } else {
-            // UTF-8 2/3/4-byte sequence: validate, then copy as is or
-            // or uencode as \uXXXX
+        } else if(JIFFY_UNLIKELY(e->uescape)) {
             ulen = utf8_validate((unsigned char*)&(data[i]), size - i);
             if(JIFFY_UNLIKELY(ulen == 0)) {
                 return 0;
-            } else if(JIFFY_UNLIKELY(e->uescape)) {
-                uval = utf8_to_unicode((unsigned char*)&(data[i]), size - i);
-                if(uval < 0) {
-                    return 0;
-                }
-                esc_len = unicode_uescape(uval, &(e->p[e->i]));
-                if(esc_len < 0) {
-                    return 0;
+            }
+            uval = utf8_to_unicode((unsigned char*)&(data[i]), size - i);
+            if(uval < 0) {
+                return 0;
+            }
+            esc_len = unicode_uescape(uval, &(e->p[e->i]));
+            if(esc_len < 0) {
+                return 0;
+            }
+            e->i += esc_len;
+            i += ulen;
+        } else {
+            // Non-ASCII UTF-8. Scan through the run first and then validate
+            // the whole thing, kinda how we do it for ASCII only.
+            start = i;
+            i++;
+            if(e->escape_forward_slashes) {
+                while(i < size
+                        && data[i] >= 0x20
+                        && data[i] != '\"'
+                        && data[i] != '\\'
+                        && data[i] != '/') {
+                    i++;
                 }
-                e->i += esc_len;
             } else {
-                memcpy(&e->p[e->i], &data[i], ulen);
-                e->i += ulen;
+                i = jiffy_scan_utf8_string_body(data, size, i);
             }
-            i += ulen;
+            size_t run = i - start;
+            if(JIFFY_UNLIKELY(!utf8_validate_range(&data[start], run))) {
+                return 0;
+            }
+            if(JIFFY_UNLIKELY(!enc_ensure(e, run))) {
+                return 0;
+            }
+            memcpy(&(e->p[e->i]), &data[start], run);
+            e->i += run;
         }
     }
 
diff --git a/c_src/jiffy_simd.h b/c_src/jiffy_simd.h
index 44ad5d8..6f3eb76 100644
--- a/c_src/jiffy_simd.h
+++ b/c_src/jiffy_simd.h
@@ -50,7 +50,7 @@ jiffy_block_has_stop(const unsigned char* JIFFY_RESTRICT p)
 }
 
 static inline size_t
-jiffy_scan_string_body(const unsigned char* JIFFY_RESTRICT p, size_t len, size_t i)
+jiffy_scan_ascii_string_body(const unsigned char* JIFFY_RESTRICT p, size_t len, size_t i)
 {
     while (i + JIFFY_SIMD_BLOCK_SIZE <= len && !jiffy_block_has_stop(p + i)) {
         i += JIFFY_SIMD_BLOCK_SIZE;
@@ -61,4 +61,29 @@ jiffy_scan_string_body(const unsigned char* JIFFY_RESTRICT p, size_t len, size_t
     return i;
 }
 
+// Variant of the scan which lets UTF-8 multi-byte sequences pass through. This
+// is so we can scan a whole block and then validate it as a block later.
+static inline unsigned int
+jiffy_block_has_utf8_stop(const unsigned char* JIFFY_RESTRICT p)
+{
+    unsigned int bad = 0;
+    for (int i = 0; i < JIFFY_SIMD_BLOCK_SIZE; i++) {
+        unsigned char c = p[i];
+        bad |= (unsigned)(c < 0x20) | (unsigned)(c == '"') | (unsigned)(c == '\\');
+    }
+    return bad;
+}
+
+static inline size_t
+jiffy_scan_utf8_string_body(const unsigned char* JIFFY_RESTRICT p, size_t len, size_t i)
+{
+    while (i + JIFFY_SIMD_BLOCK_SIZE <= len && !jiffy_block_has_utf8_stop(p + i)) {
+        i += JIFFY_SIMD_BLOCK_SIZE;
+    }
+    while (i < len && p[i] >= 0x20 && p[i] != '"' && p[i] != '\\') {
+        i++;
+    }
+    return i;
+}
+
 #endif
diff --git a/c_src/jiffy_utf8.h b/c_src/jiffy_utf8.h
index f6891e9..95b333a 100644
--- a/c_src/jiffy_utf8.h
+++ b/c_src/jiffy_utf8.h
@@ -236,6 +236,26 @@ utf8_validate(const unsigned char* JIFFY_RESTRICT data, size_t size)
     return 4;
 }
 
+// Validate a whole range of UTF-8 code points
+static inline int
+utf8_validate_range(const unsigned char* data, size_t size)
+{
+    size_t i = 0;
+    while(i < size) {
+        if(data[i] < 0x80) {
+            i++;
+            // ASCII skip-through
+            continue;
+        }
+        size_t ulen = utf8_validate((unsigned char*)&data[i], size - i);
+        if(ulen == 0) {
+            return 0;
+        }
+        i += ulen;
+    }
+    return 1;
+}
+
 static inline int
 unicode_to_utf8(int c, unsigned char* buf)
 {

(couchdb-jiffy) 08/09: Skip-ahead for UTF8 encoding

Reply via email to