This is an automated email from the ASF dual-hosted git repository. nickva pushed a commit to tag 2.0.0 in repository https://gitbox.apache.org/repos/asf/couchdb-jiffy.git
commit 81f1918f680dde7b1274decdd17023112b02e577 Author: Nick Vatamaniuc <[email protected]> AuthorDate: Thu Apr 23 14:22:57 2026 -0400 Skip-ahead for UTF8 encoding Just like we do it for ASCII-only input, this should help unicode-heavy input. --- c_src/decoder.c | 2 +- c_src/encoder.c | 51 +++++++++++++++++++++++++++++++++++---------------- c_src/jiffy_simd.h | 27 ++++++++++++++++++++++++++- c_src/jiffy_utf8.h | 20 ++++++++++++++++++++ 4 files changed, 82 insertions(+), 18 deletions(-) diff --git a/c_src/decoder.c b/c_src/decoder.c index 4915241..05524f9 100644 --- a/c_src/decoder.c +++ b/c_src/decoder.c @@ -260,7 +260,7 @@ dec_string(Decoder* d, ERL_NIF_TERM* value) } else if(d->p[d->i] < 0x80) { // Scan ahead plain ASCII as an optimization. The first // byte has already been checked, so start at i+1. - d->i = jiffy_scan_string_body(d->p, d->len, d->i + 1); + d->i = jiffy_scan_ascii_string_body(d->p, d->len, d->i + 1); } else { ulen = utf8_validate(&(d->p[d->i]), d->len - d->i); if(ulen == 0) { diff --git a/c_src/encoder.c b/c_src/encoder.c index a9644fd..79ea956 100644 --- a/c_src/encoder.c +++ b/c_src/encoder.c @@ -409,7 +409,7 @@ enc_quoted(Encoder* e, i++; } } else { - i = jiffy_scan_string_body(data, size, i); + i = jiffy_scan_ascii_string_body(data, size, i); } size_t run = i - start; if(!enc_ensure(e, run)) { @@ -424,27 +424,46 @@ enc_quoted(Encoder* e, e->i += unicode_to_utf8((int)data[i], &(e->p[e->i])); } i++; - } else { - // UTF-8 2/3/4-byte sequence: validate, then copy as is or - // or uencode as \uXXXX + } else if(JIFFY_UNLIKELY(e->uescape)) { ulen = utf8_validate((unsigned char*)&(data[i]), size - i); if(JIFFY_UNLIKELY(ulen == 0)) { return 0; - } else if(JIFFY_UNLIKELY(e->uescape)) { - uval = utf8_to_unicode((unsigned char*)&(data[i]), size - i); - if(uval < 0) { - return 0; - } - esc_len = unicode_uescape(uval, &(e->p[e->i])); - if(esc_len < 0) { - return 0; + } + uval = utf8_to_unicode((unsigned char*)&(data[i]), size - i); + if(uval < 0) { + return 0; + } + esc_len = unicode_uescape(uval, &(e->p[e->i])); + if(esc_len < 0) { + return 0; + } + e->i += esc_len; + i += ulen; + } else { + // Non-ASCII UTF-8 . Scan through the run first and then validate + // the whole thing, kinda how we do it for ASCII only. + start = i; + i++; + if(e->escape_forward_slashes) { + while(i < size + && data[i] >= 0x20 + && data[i] != '\"' + && data[i] != '\\' + && data[i] != '/') { + i++; } - e->i += esc_len; } else { - memcpy(&e->p[e->i], &data[i], ulen); - e->i += ulen; + i = jiffy_scan_utf8_string_body(data, size, i); } - i += ulen; + size_t run = i - start; + if(JIFFY_UNLIKELY(!utf8_validate_range(&data[start], run))) { + return 0; + } + if(JIFFY_UNLIKELY(!enc_ensure(e, run))) { + return 0; + } + memcpy(&(e->p[e->i]), &data[start], run); + e->i += run; } } diff --git a/c_src/jiffy_simd.h b/c_src/jiffy_simd.h index 44ad5d8..6f3eb76 100644 --- a/c_src/jiffy_simd.h +++ b/c_src/jiffy_simd.h @@ -50,7 +50,7 @@ jiffy_block_has_stop(const unsigned char* JIFFY_RESTRICT p) } static inline size_t -jiffy_scan_string_body(const unsigned char* JIFFY_RESTRICT p, size_t len, size_t i) +jiffy_scan_ascii_string_body(const unsigned char* JIFFY_RESTRICT p, size_t len, size_t i) { while (i + JIFFY_SIMD_BLOCK_SIZE <= len && !jiffy_block_has_stop(p + i)) { i += JIFFY_SIMD_BLOCK_SIZE; @@ -61,4 +61,29 @@ jiffy_scan_string_body(const unsigned char* JIFFY_RESTRICT p, size_t len, size_t return i; } +// Variant of the scan which lets UTF-8 multi-byte sequences pass through. This +// so we can scan a whole block and then validate it as a block later. +static inline unsigned int +jiffy_block_has_utf8_stop(const unsigned char* JIFFY_RESTRICT p) +{ + unsigned int bad = 0; + for (int i = 0; i < JIFFY_SIMD_BLOCK_SIZE; i++) { + unsigned char c = p[i]; + bad |= (unsigned)(c < 0x20) | (unsigned)(c == '"') | (unsigned)(c == '\\'); + } + return bad; +} + +static inline size_t +jiffy_scan_utf8_string_body(const unsigned char* JIFFY_RESTRICT p, size_t len, size_t i) +{ + while (i + JIFFY_SIMD_BLOCK_SIZE <= len && !jiffy_block_has_utf8_stop(p + i)) { + i += JIFFY_SIMD_BLOCK_SIZE; + } + while (i < len && p[i] >= 0x20 && p[i] != '"' && p[i] != '\\') { + i++; + } + return i; +} + #endif diff --git a/c_src/jiffy_utf8.h b/c_src/jiffy_utf8.h index f6891e9..95b333a 100644 --- a/c_src/jiffy_utf8.h +++ b/c_src/jiffy_utf8.h @@ -236,6 +236,26 @@ utf8_validate(const unsigned char* JIFFY_RESTRICT data, size_t size) return 4; } +// Validate a whole range UTF-8 codepoints +static inline int +utf8_validate_range(const unsigned char* data, size_t size) +{ + size_t i = 0; + while(i < size) { + if(data[i] < 0x80) { + i++; + // ASCII skip-through + continue; + } + size_t ulen = utf8_validate((unsigned char*)&data[i], size - i); + if(ulen == 0) { + return 0; + } + i += ulen; + } + return 1; +} + static inline int unicode_to_utf8(int c, unsigned char* buf) {
