(couchdb-jiffy) 03/09: Encode UTF8 atoms (OTP 26+ only)

vatamane Sat, 25 Apr 2026 09:22:47 -0700

This is an automated email from the ASF dual-hosted git repository.

nickva pushed a commit to tag 2.0.0
in repository https://gitbox.apache.org/repos/asf/couchdb-jiffy.git


commit 25ad8b52b3f1a09d7a96541b85f2641e0a33ca41
Author: Nick Vatamaniuc <[email protected]>
AuthorDate: Wed Apr 22 15:11:16 2026 -0400

    Encode UTF8 atoms (OTP 26+ only)
    
    As a nice side benefit, we get to de-duplicate string encoding between
    atoms and strings. The only snag is that this is an OTP 26+ only feature:
    before OTP 26, the ERL NIF interface cannot return atoms in UTF-8 format.
    
    Fix https://github.com/davisp/jiffy/issues/231
---
 c_src/encoder.c                | 168 +++++++++++++++--------------------------
 test/jiffy_04_string_tests.erl |  46 ++++++++++-
 2 files changed, 107 insertions(+), 107 deletions(-)

diff --git a/c_src/encoder.c b/c_src/encoder.c
index 196ecf8..731b78b 100644
--- a/c_src/encoder.c
+++ b/c_src/encoder.c
@@ -362,30 +362,32 @@ enc_special_character(Encoder* e, int val) {
     }
 }
 
-static int
-enc_atom(Encoder* e, ERL_NIF_TERM val)
+// ERL_NIF_UTF8 was added in NIF 2.17 (OTP 26). We detect it to know
+// if we can pass it to enif_get_atom()
+#if ERL_NIF_MAJOR_VERSION > 2 \
+        || (ERL_NIF_MAJOR_VERSION == 2 && ERL_NIF_MINOR_VERSION >= 17)
+#define JIFFY_ENIF_HAS_UTF8 1
+#endif
+
+static inline int
+enc_quoted(Encoder* e,
+           const unsigned char* JIFFY_RESTRICT data,
+           size_t size,
+           int latin1_only)
 {
     static const int MAX_ESCAPE_LEN = 12;
-    unsigned char data[512];
-
-    size_t size;
+    size_t i = 0;
     size_t start;
-    size_t i;
-
-    if(!enif_get_atom(e->env, val, (char*)data, 512, ERL_NIF_LATIN1)) {
-        return 0;
-    }
-
-    size = strlen((const char*)data);
+    size_t ulen;
+    int uval;
+    int esc_len;
 
-    /* Reserve space for the first quotation mark and most of the output. */
     if(!enc_ensure(e, size + MAX_ESCAPE_LEN + 1)) {
         return 0;
     }
 
     e->p[e->i++] = '\"';
 
-    i = 0;
     while(i < size) {
         if(!enc_ensure(e, MAX_ESCAPE_LEN)) {
             return 0;
@@ -394,9 +396,7 @@ enc_atom(Encoder* e, ERL_NIF_TERM val)
         if(JIFFY_UNLIKELY(enc_special_character(e, data[i]))) {
             i++;
         } else if(JIFFY_LIKELY(data[i] < 0x80)) {
-            // Scan ahead for plain ASCII chars which don't need escaping.
-            // Since optionally users could escape forward slashes, too, we
-            // stop on them as well
+            // Scan ahead for plain ASCII chars that don't need escaping.
             start = i;
             i++;
             if(e->escape_forward_slashes) {
@@ -417,111 +417,33 @@ enc_atom(Encoder* e, ERL_NIF_TERM val)
             }
             memcpy(&(e->p[e->i]), &data[start], run);
             e->i += run;
-        } else if(data[i] >= 0x80) {
-            /* The atom encoding is latin1, so we don't need validation
-             * as all latin1 characters are valid Unicode codepoints. */
-            if (!e->uescape) {
-                e->i += unicode_to_utf8(data[i], &e->p[e->i]);
+        } else if(latin1_only) {
+            if(JIFFY_UNLIKELY(e->uescape)) {
+                e->i += unicode_uescape((int)data[i], &(e->p[e->i]));
             } else {
-                e->i += unicode_uescape(data[i], &e->p[e->i]);
+                e->i += unicode_to_utf8((int)data[i], &(e->p[e->i]));
             }
-
-            i++;
-        }
-    }
-
-    if(!enc_ensure(e, 1)) {
-        return 0;
-    }
-
-    e->p[e->i++] = '\"';
-    e->count++;
-
-    return 1;
-}
-
-static int
-enc_string(Encoder* e, ERL_NIF_TERM val)
-{
-    static const int MAX_ESCAPE_LEN = 12;
-    ErlNifBinary bin;
-
-    unsigned char* JIFFY_RESTRICT data;
-    size_t size;
-    int esc_len;
-    size_t ulen;
-    int uval;
-    size_t start;
-    size_t i;
-
-    if(!enif_inspect_binary(e->env, val, &bin)) {
-        return 0;
-    }
-
-    data = bin.data;
-    size = bin.size;
-
-    /* Reserve space for the first quotation mark and most of the output. */
-    if(!enc_ensure(e, size + MAX_ESCAPE_LEN + 1)) {
-        return 0;
-    }
-
-    e->p[e->i++] = '\"';
-
-    i = 0;
-    while(i < size) {
-        if(!enc_ensure(e, MAX_ESCAPE_LEN)) {
-            return 0;
-        }
-
-        if(enc_special_character(e, data[i])) {
-            i++;
-        } else if(data[i] < 0x80) {
-            // Scan ahead for plain ASCII char and memcpy them. Stop at quotes,
-            // backslashes, and forward slashes, since users can optionally
-            // choose to escape them too.
-            start = i;
             i++;
-            if(e->escape_forward_slashes) {
-                while(i < size
-                        && data[i] >= 0x20
-                        && data[i] < 0x80
-                        && data[i] != '\"'
-                        && data[i] != '\\'
-                        && data[i] != '/') {
-                    i++;
-                }
-            } else {
-                i = jiffy_scan_string_body(data, size, i);
-            }
-            size_t run = i - start;
-            if(!enc_ensure(e, run)) {
-                return 0;
-            }
-            memcpy(&(e->p[e->i]), &data[start], run);
-            e->i += run;
-        } else if(JIFFY_UNLIKELY(data[i] >= 0x80)) {
-            ulen = utf8_validate(&(data[i]), size - i);
-
-            if (JIFFY_UNLIKELY(ulen == 0)) {
+        } else {
+            // UTF-8 2/3/4-byte sequence: validate, then copy as is or
+            // uescape as \uXXXX
+            ulen = utf8_validate((unsigned char*)&(data[i]), size - i);
+            if(JIFFY_UNLIKELY(ulen == 0)) {
                 return 0;
-            } else if (JIFFY_UNLIKELY(e->uescape)) {
-                uval = utf8_to_unicode(&(data[i]), size-i);
+            } else if(JIFFY_UNLIKELY(e->uescape)) {
+                uval = utf8_to_unicode((unsigned char*)&(data[i]), size - i);
                 if(uval < 0) {
                     return 0;
                 }
-
                 esc_len = unicode_uescape(uval, &(e->p[e->i]));
                 if(esc_len < 0) {
                     return 0;
                 }
-
                 e->i += esc_len;
             } else {
                 memcpy(&e->p[e->i], &data[i], ulen);
                 e->i += ulen;
             }
-
             i += ulen;
         }
     }
@@ -536,6 +458,40 @@ enc_string(Encoder* e, ERL_NIF_TERM val)
     return 1;
 }
 
+static int
+enc_atom(Encoder* e, ERL_NIF_TERM val)
+{
+    // 255 code points * max 4 UTF-8 bytes + NUL fits in 1024.
+    unsigned char data[1024];
+    int n;
+
+#ifdef JIFFY_ENIF_HAS_UTF8
+    n = enif_get_atom(e->env, val, (char*)data, sizeof(data), ERL_NIF_UTF8);
+    if(n <= 0) {
+        return 0;
+    }
+    return enc_quoted(e, data, (size_t)n - 1, 0);
+#else
+    n = enif_get_atom(e->env, val, (char*)data, sizeof(data), ERL_NIF_LATIN1);
+    if(n <= 0) {
+        return 0;
+    }
+    return enc_quoted(e, data, (size_t)n - 1, 1);
+#endif
+}
+
+static int
+enc_string(Encoder* e, ERL_NIF_TERM val)
+{
+    ErlNifBinary bin;
+
+    if(!enif_inspect_binary(e->env, val, &bin)) {
+        return 0;
+    }
+
+    return enc_quoted(e, bin.data, bin.size, 0);
+}
+
 static inline int
 enc_object_key(ErlNifEnv *env, Encoder* e, ERL_NIF_TERM val)
 {
diff --git a/test/jiffy_04_string_tests.erl b/test/jiffy_04_string_tests.erl
index c96e014..88276df 100644
--- a/test/jiffy_04_string_tests.erl
+++ b/test/jiffy_04_string_tests.erl
@@ -11,7 +11,51 @@
 latin1_atom_test_() ->
     Key = list_to_atom([228]), %% `ä`
     Expected = <<"{\"", 195, 164, "\":\"bar\"}">>,
-    ?_assertEqual(Expected, jiffy:encode({[{Key, <<"bar">>}]})).
+    ?_assertEqual(Expected, enc({[{Key, <<"bar">>}]})).
+
+% These are slightly sneaky and contain a NUL
+latin1_nul_atom_test_() ->
+    Bad = binary_to_atom(<<0, 1, 255, 255, 255, 255>>, latin1),
+    [
+        ?_assertEqual(<<"\"\\u0000\\u0001ÿÿÿÿ\""/utf8>>, enc(Bad)),
+        ?_assertEqual(<<"\"\\u0000\\u0001\\u00FF\\u00FF\\u00FF\\u00FF\"">>,
+                      enc(Bad, [uescape]))
+    ].
+
+% From issue https://github.com/davisp/jiffy/issues/231
+% ERL_NIF_UTF8 was added in NIF 2.17 (OTP 26) though
+-if(?OTP_RELEASE >= 26).
+utf8_atom_test_() ->
+    % 2-byte UTF8
+    Satas = binary_to_atom(<<"ŝatas"/utf8>>, utf8),
+    % 3-byte UTF8 (Google translated this as "Hello")
+    Hello = binary_to_atom(<<"你好"/utf8>>, utf8),
+    % 4-byte UTF8 (Rocket)
+    Rocket = binary_to_atom(<<"🚀"/utf8>>, utf8),
+    [
+        ?_assertEqual(<<"\"", "ŝatas"/utf8, "\"">>, enc(Satas)),
+        ?_assertEqual(<<"\"", "你好"/utf8, "\"">>, enc(Hello)),
+        ?_assertEqual(<<"\"", "🚀"/utf8, "\"">>, enc(Rocket)),
+        ?_assertEqual(<<"\"\\u015Datas\"">>, enc(Satas, [uescape])),
+        ?_assertEqual(<<"\"\\u4F60\\u597D\"">>, enc(Hello, [uescape])),
+        ?_assertEqual(<<"\"\\uD83D\\uDE80\"">>,  enc(Rocket, [uescape])),
+        ?_assertEqual(<<"{\"", "ŝatas"/utf8, "\":\"v\"}">>, enc(#{Satas => <<"v">>})),
+        ?_assertEqual(atom_to_binary(Satas, utf8), dec(enc(Satas))),
+        ?_assertEqual(atom_to_binary(Hello, utf8), dec(enc(Hello))),
+        ?_assertEqual(atom_to_binary(Rocket, utf8), dec(enc(Rocket)))
+    ].
+-else.
+utf8_atom_test_() ->
+    % ERL_NIF_UTF8 isn't available so these atoms can't be extracted.
+    Satas = binary_to_atom(<<"ŝatas"/utf8>>, utf8),
+    Hello = binary_to_atom(<<"你好"/utf8>>, utf8),
+    Rocket = binary_to_atom(<<"🚀"/utf8>>, utf8),
+    [
+        ?_assertError({invalid_string, _}, enc(Satas)),
+        ?_assertError({invalid_string, _}, enc(Hello)),
+        ?_assertError({invalid_string, _}, enc(Rocket))
+    ].
+-endif.
 
 atom_key_test_() ->
     [

(couchdb-jiffy) 03/09: Encode UTF8 atoms (OTP 26+ only)

Reply via email to