This is an automated email from the ASF dual-hosted git repository. nickva pushed a commit to tag 2.0.0 in repository https://gitbox.apache.org/repos/asf/couchdb-jiffy.git
commit 25ad8b52b3f1a09d7a96541b85f2641e0a33ca41 Author: Nick Vatamaniuc <[email protected]> AuthorDate: Wed Apr 22 15:11:16 2026 -0400 Encode UTF8 atoms (OTP 26+ only) As a nice side-benefit we get to de-duplicate string encoding between atoms and strings. The only snag is this is an OTP 26+ only feature. Below that, the ERL NIF interface doesn't return atoms in UTF8 format. Fix https://github.com/davisp/jiffy/issues/231 --- c_src/encoder.c | 168 +++++++++++++++-------------------------- test/jiffy_04_string_tests.erl | 46 ++++++++++- 2 files changed, 107 insertions(+), 107 deletions(-) diff --git a/c_src/encoder.c b/c_src/encoder.c index 196ecf8..731b78b 100644 --- a/c_src/encoder.c +++ b/c_src/encoder.c @@ -362,30 +362,32 @@ enc_special_character(Encoder* e, int val) { } } -static int -enc_atom(Encoder* e, ERL_NIF_TERM val) +// ERL_NIF_UTF8 was added in NIF 2.17 (OTP 26). We detect it to know +// if we can pass it to enif_get_atom() +#if ERL_NIF_MAJOR_VERSION > 2 \ + || (ERL_NIF_MAJOR_VERSION == 2 && ERL_NIF_MINOR_VERSION >= 17) +#define JIFFY_ENIF_HAS_UTF8 1 +#endif + +static inline int +enc_quoted(Encoder* e, + const unsigned char* JIFFY_RESTRICT data, + size_t size, + int latin1_only) { static const int MAX_ESCAPE_LEN = 12; - unsigned char data[512]; - - size_t size; + size_t i = 0; size_t start; - size_t i; - - if(!enif_get_atom(e->env, val, (char*)data, 512, ERL_NIF_LATIN1)) { - return 0; - } - - size = strlen((const char*)data); + size_t ulen; + int uval; + int esc_len; - /* Reserve space for the first quotation mark and most of the output. */ if(!enc_ensure(e, size + MAX_ESCAPE_LEN + 1)) { return 0; } e->p[e->i++] = '\"'; - i = 0; while(i < size) { if(!enc_ensure(e, MAX_ESCAPE_LEN)) { return 0; @@ -394,9 +396,7 @@ enc_atom(Encoder* e, ERL_NIF_TERM val) if(JIFFY_UNLIKELY(enc_special_character(e, data[i]))) { i++; } else if(JIFFY_LIKELY(data[i] < 0x80)) { - // Scan ahead for plain ASCII chars which don't need escaping. 
- // Since optionally users could escape forward slashes, too, we - // stop on them as well + // Scan ahead for plain ASCII chars that don't need escaping. start = i; i++; if(e->escape_forward_slashes) { @@ -417,111 +417,33 @@ enc_atom(Encoder* e, ERL_NIF_TERM val) } memcpy(&(e->p[e->i]), &data[start], run); e->i += run; - } else if(data[i] >= 0x80) { - /* The atom encoding is latin1, so we don't need validation - * as all latin1 characters are valid Unicode codepoints. */ - if (!e->uescape) { - e->i += unicode_to_utf8(data[i], &e->p[e->i]); + } else if(latin1_only) { + if(JIFFY_UNLIKELY(e->uescape)) { + e->i += unicode_uescape((int)data[i], &(e->p[e->i])); } else { - e->i += unicode_uescape(data[i], &e->p[e->i]); + e->i += unicode_to_utf8((int)data[i], &(e->p[e->i])); } - - i++; - } - } - - if(!enc_ensure(e, 1)) { - return 0; - } - - e->p[e->i++] = '\"'; - e->count++; - - return 1; -} - -static int -enc_string(Encoder* e, ERL_NIF_TERM val) -{ - static const int MAX_ESCAPE_LEN = 12; - ErlNifBinary bin; - - unsigned char* JIFFY_RESTRICT data; - size_t size; - int esc_len; - size_t ulen; - int uval; - size_t start; - size_t i; - - if(!enif_inspect_binary(e->env, val, &bin)) { - return 0; - } - - data = bin.data; - size = bin.size; - - /* Reserve space for the first quotation mark and most of the output. */ - if(!enc_ensure(e, size + MAX_ESCAPE_LEN + 1)) { - return 0; - } - - e->p[e->i++] = '\"'; - - i = 0; - while(i < size) { - if(!enc_ensure(e, MAX_ESCAPE_LEN)) { - return 0; - } - - if(enc_special_character(e, data[i])) { - i++; - } else if(data[i] < 0x80) { - // Scan ahead for plain ASCII char and memcpy them. Stop at quotes, - // backslashes, and forward slashes, since users can optionally - // choose to escape them too. 
- start = i; i++; - if(e->escape_forward_slashes) { - while(i < size - && data[i] >= 0x20 - && data[i] < 0x80 - && data[i] != '\"' - && data[i] != '\\' - && data[i] != '/') { - i++; - } - } else { - i = jiffy_scan_string_body(data, size, i); - } - size_t run = i - start; - if(!enc_ensure(e, run)) { - return 0; - } - memcpy(&(e->p[e->i]), &data[start], run); - e->i += run; - } else if(JIFFY_UNLIKELY(data[i] >= 0x80)) { - ulen = utf8_validate(&(data[i]), size - i); - - if (JIFFY_UNLIKELY(ulen == 0)) { + } else { + // UTF-8 2/3/4-byte sequence: validate, then copy as is or + // or uencode as \uXXXX + ulen = utf8_validate((unsigned char*)&(data[i]), size - i); + if(JIFFY_UNLIKELY(ulen == 0)) { return 0; - } else if (JIFFY_UNLIKELY(e->uescape)) { - uval = utf8_to_unicode(&(data[i]), size-i); + } else if(JIFFY_UNLIKELY(e->uescape)) { + uval = utf8_to_unicode((unsigned char*)&(data[i]), size - i); if(uval < 0) { return 0; } - esc_len = unicode_uescape(uval, &(e->p[e->i])); if(esc_len < 0) { return 0; } - e->i += esc_len; } else { memcpy(&e->p[e->i], &data[i], ulen); e->i += ulen; } - i += ulen; } } @@ -536,6 +458,40 @@ enc_string(Encoder* e, ERL_NIF_TERM val) return 1; } +static int +enc_atom(Encoder* e, ERL_NIF_TERM val) +{ + // 255 code points * max 4 UTF-8 bytes + NUL fits in 1024. 
+ unsigned char data[1024]; + int n; + +#ifdef JIFFY_ENIF_HAS_UTF8 + n = enif_get_atom(e->env, val, (char*)data, sizeof(data), ERL_NIF_UTF8); + if(n <= 0) { + return 0; + } + return enc_quoted(e, data, (size_t)n - 1, 0); +#else + n = enif_get_atom(e->env, val, (char*)data, sizeof(data), ERL_NIF_LATIN1); + if(n <= 0) { + return 0; + } + return enc_quoted(e, data, (size_t)n - 1, 1); +#endif +} + +static int +enc_string(Encoder* e, ERL_NIF_TERM val) +{ + ErlNifBinary bin; + + if(!enif_inspect_binary(e->env, val, &bin)) { + return 0; + } + + return enc_quoted(e, bin.data, bin.size, 0); +} + static inline int enc_object_key(ErlNifEnv *env, Encoder* e, ERL_NIF_TERM val) { diff --git a/test/jiffy_04_string_tests.erl b/test/jiffy_04_string_tests.erl index c96e014..88276df 100644 --- a/test/jiffy_04_string_tests.erl +++ b/test/jiffy_04_string_tests.erl @@ -11,7 +11,51 @@ latin1_atom_test_() -> Key = list_to_atom([228]), %% `ä` Expected = <<"{\"", 195, 164, "\":\"bar\"}">>, - ?_assertEqual(Expected, jiffy:encode({[{Key, <<"bar">>}]})). + ?_assertEqual(Expected, enc({[{Key, <<"bar">>}]})). + +% These are slightly sneaky and contain a NUL +latin1_nul_atom_test_() -> + Bad = binary_to_atom(<<0, 1, 255, 255, 255, 255>>, latin1), + [ + ?_assertEqual(<<"\"\\u0000\\u0001ÿÿÿÿ\""/utf8>>, enc(Bad)), + ?_assertEqual(<<"\"\\u0000\\u0001\\u00FF\\u00FF\\u00FF\\u00FF\"">>, + enc(Bad, [uescape])) + ]. + +% From issue https://github.com/davisp/jiffy/issues/231 +% ERL_NIF_UTF8 was added in NIF 2.17 (OTP 26) though +-if(?OTP_RELEASE >= 26). 
+utf8_atom_test_() -> + % 2-byte UTF8 + Satas = binary_to_atom(<<"ŝatas"/utf8>>, utf8), + % 3-byte UTF8 (Google translated this as "Hello") + Hello = binary_to_atom(<<"你好"/utf8>>, utf8), + % 4-byte UTF8 (Rocket) + Rocket = binary_to_atom(<<"🚀"/utf8>>, utf8), + [ + ?_assertEqual(<<"\"", "ŝatas"/utf8, "\"">>, enc(Satas)), + ?_assertEqual(<<"\"", "你好"/utf8, "\"">>, enc(Hello)), + ?_assertEqual(<<"\"", "🚀"/utf8, "\"">>, enc(Rocket)), + ?_assertEqual(<<"\"\\u015Datas\"">>, enc(Satas, [uescape])), + ?_assertEqual(<<"\"\\u4F60\\u597D\"">>, enc(Hello, [uescape])), + ?_assertEqual(<<"\"\\uD83D\\uDE80\"">>, enc(Rocket, [uescape])), + ?_assertEqual(<<"{\"", "ŝatas"/utf8, "\":\"v\"}">>, enc(#{Satas => <<"v">>})), + ?_assertEqual(atom_to_binary(Satas, utf8), dec(enc(Satas))), + ?_assertEqual(atom_to_binary(Hello, utf8), dec(enc(Hello))), + ?_assertEqual(atom_to_binary(Rocket, utf8), dec(enc(Rocket))) + ]. +-else. +utf8_atom_test_() -> + % ERL_NIF_UTF8 isn't available so these atoms can't be extracted. + Satas = binary_to_atom(<<"ŝatas"/utf8>>, utf8), + Hello = binary_to_atom(<<"你好"/utf8>>, utf8), + Rocket = binary_to_atom(<<"🚀"/utf8>>, utf8), + [ + ?_assertError({invalid_string, _}, enc(Satas)), + ?_assertError({invalid_string, _}, enc(Hello)), + ?_assertError({invalid_string, _}, enc(Rocket)) + ]. +-endif. atom_key_test_() -> [
