mochijson2: fix utf8 edge case in #168, use unicode module rather than xmerl_ucs, 100% test coverage
Project: http://git-wip-us.apache.org/repos/asf/couchdb-mochiweb/repo Commit: http://git-wip-us.apache.org/repos/asf/couchdb-mochiweb/commit/57a890aa Tree: http://git-wip-us.apache.org/repos/asf/couchdb-mochiweb/tree/57a890aa Diff: http://git-wip-us.apache.org/repos/asf/couchdb-mochiweb/diff/57a890aa Branch: refs/heads/master Commit: 57a890aa2886d703c643bf05c99d3cf0e1ab28a0 Parents: 54ea016 Author: Bob Ippolito <[email protected]> Authored: Fri Mar 18 11:27:15 2016 -0700 Committer: Bob Ippolito <[email protected]> Committed: Fri Mar 18 14:50:28 2016 -0400 ---------------------------------------------------------------------- src/mochijson2.erl | 89 ++++++++++++++++++++++++++++++++++--------------- 1 file changed, 62 insertions(+), 27 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/couchdb-mochiweb/blob/57a890aa/src/mochijson2.erl ---------------------------------------------------------------------- diff --git a/src/mochijson2.erl b/src/mochijson2.erl index 3d880db..255398d 100644 --- a/src/mochijson2.erl +++ b/src/mochijson2.erl @@ -203,19 +203,13 @@ json_encode_proplist(Props, State) -> lists:reverse([$\} | Acc1]). json_encode_string(A, State) when is_atom(A) -> - L = atom_to_list(A), - case json_string_is_safe(L) of - true -> - [?Q, L, ?Q]; - false -> - json_encode_string_unicode(xmerl_ucs:from_utf8(L), State, [?Q]) - end; + json_encode_string(atom_to_binary(A, latin1), State); json_encode_string(B, State) when is_binary(B) -> case json_bin_is_safe(B) of true -> [?Q, B, ?Q]; false -> - json_encode_string_unicode(xmerl_ucs:from_utf8(B), State, [?Q]) + json_encode_string_unicode(unicode:characters_to_list(B), State, [?Q]) end; json_encode_string(I, _State) when is_integer(I) -> [?Q, integer_to_list(I), ?Q]; @@ -250,7 +244,7 @@ json_string_is_safe([C | Rest]) -> C when C < 16#7f -> json_string_is_safe(Rest); _ -> - false + exit({json_encode, {bad_char, C}}) end. json_bin_is_safe(<<>>) -> @@ -308,12 +302,13 @@ json_encode_string_unicode([C | Cs], State, Acc) -> C when C >= 0, C < $\s -> [unihex(C) | Acc]; C when C >= 16#7f, C =< 16#10FFFF, State#encoder.utf8 -> - [xmerl_ucs:to_utf8(C) | Acc]; + [unicode:characters_to_binary([C]) | Acc]; C when C >= 16#7f, C =< 16#10FFFF, not State#encoder.utf8 -> [unihex(C) | Acc]; C when C < 16#7f -> [C | Acc]; _ -> + %% json_string_is_safe guarantees that this branch is dead exit({json_encode, {bad_char, C}}) end, json_encode_string_unicode(Cs, State, Acc1). @@ -468,12 +463,14 @@ tokenize_string(B, S=#decoder{offset=O}, Acc) -> %% coalesce UTF-16 surrogate pair <<"\\u", D3, D2, D1, D0, _/binary>> = Rest, D = erlang:list_to_integer([D3,D2,D1,D0], 16), - [CodePoint] = xmerl_ucs:from_utf16be(<<C:16/big-unsigned-integer, - D:16/big-unsigned-integer>>), - Acc1 = lists:reverse(xmerl_ucs:to_utf8(CodePoint), Acc), + Acc1 = [unicode:characters_to_binary( + <<C:16/big-unsigned-integer, + D:16/big-unsigned-integer>>, + utf16) + | Acc], tokenize_string(B, ?ADV_COL(S, 12), Acc1); true -> - Acc1 = lists:reverse(xmerl_ucs:to_utf8(C), Acc), + Acc1 = [unicode:characters_to_binary([C]) | Acc], tokenize_string(B, ?ADV_COL(S, 6), Acc1) end; <<_:O/binary, C1, _/binary>> when C1 < 128 -> @@ -709,13 +706,13 @@ e2j_test_vec(utf8) -> %% test utf8 encoding encoder_utf8_test() -> %% safe conversion case (default) - [34,"\\u0001","\\u0442","\\u0435","\\u0441","\\u0442",34] = - encode(<<1,"\321\202\320\265\321\201\321\202">>), + <<"\"\\u0001\\u0442\\u0435\\u0441\\u0442\"">> = + iolist_to_binary(encode(<<1,"\321\202\320\265\321\201\321\202">>)), %% raw utf8 output (optional) Enc = mochijson2:encoder([{utf8, true}]), - [34,"\\u0001",[209,130],[208,181],[209,129],[209,130],34] = - Enc(<<1,"\321\202\320\265\321\201\321\202">>). + <<34,"\\u0001",209,130,208,181,209,129,209,130,34>> = + iolist_to_binary(Enc(<<1,"\321\202\320\265\321\201\321\202">>)). input_validation_test() -> Good = [ @@ -724,7 +721,7 @@ input_validation_test() -> {16#10196, <<?Q, 16#F0, 16#90, 16#86, 16#96, ?Q>>} %% denarius ], lists:foreach(fun({CodePoint, UTF8}) -> - Expect = list_to_binary(xmerl_ucs:to_utf8(CodePoint)), + Expect = unicode:characters_to_binary([CodePoint]), Expect = decode(UTF8) end, Good), @@ -759,7 +756,7 @@ inline_json_test() -> ok. big_unicode_test() -> - UTF8Seq = list_to_binary(xmerl_ucs:to_utf8(16#0001d120)), + UTF8Seq = unicode:characters_to_binary([16#0001d120]), ?assertEqual( <<"\"\\ud834\\udd20\"">>, iolist_to_binary(encode(UTF8Seq))), @@ -791,7 +788,10 @@ atom_test() -> iolist_to_binary(encode(foo))), ?assertEqual( <<"\"\\ud834\\udd20\"">>, - iolist_to_binary(encode(list_to_atom(xmerl_ucs:to_utf8(16#0001d120))))), + iolist_to_binary( + encode( + binary_to_atom( + unicode:characters_to_binary([16#0001d120]), latin1)))), ok. key_encode_test() -> @@ -836,18 +836,21 @@ unsafe_chars_test() -> json_string_is_safe([16#0001d120])), ?assertEqual( false, - json_bin_is_safe(list_to_binary(xmerl_ucs:to_utf8(16#0001d120)))), + json_bin_is_safe(unicode:characters_to_binary([16#0001d120]))), ?assertEqual( [16#0001d120], - xmerl_ucs:from_utf8( - binary_to_list( - decode(encode(list_to_atom(xmerl_ucs:to_utf8(16#0001d120))))))), + unicode:characters_to_list( + decode( + encode( + binary_to_atom( + unicode:characters_to_binary([16#0001d120]), + latin1))))), ?assertEqual( false, - json_string_is_safe([16#110000])), + json_string_is_safe([16#10ffff])), ?assertEqual( false, - json_bin_is_safe(list_to_binary(xmerl_ucs:to_utf8([16#110000])))), + json_bin_is_safe(unicode:characters_to_binary([16#10ffff]))), %% solidus can be escaped but isn't unsafe by default ?assertEqual( <<"/">>, @@ -904,4 +907,36 @@ format_test_() -> {eep18, {P}}, {proplist, P}]]. +array_test() -> + A = [<<"hello">>], + ?assertEqual(A, decode(encode({array, A}))). + +bad_char_test() -> + ?assertEqual( + {'EXIT', {json_encode, {bad_char, 16#110000}}}, + catch json_string_is_safe([16#110000])). + +utf8_roundtrip_test_() -> + %% These are the boundary cases for UTF8 encoding + Codepoints = [%% 7 bits -> 1 byte + 16#00, 16#7f, + %% 11 bits -> 2 bytes + 16#080, 16#07ff, + %% 16 bits -> 3 bytes + 16#0800, 16#ffff, + 16#d7ff, 16#e000, + %% 21 bits -> 4 bytes + 16#010000, 16#10ffff], + UTF8 = unicode:characters_to_binary(Codepoints), + Encode = encoder([{utf8, true}]), + [{"roundtrip escaped", + ?_assertEqual(UTF8, decode(encode(UTF8)))}, + {"roundtrip utf8", + ?_assertEqual(UTF8, decode(Encode(UTF8)))}]. + +utf8_non_character_test_() -> + S = unicode:characters_to_binary([16#ffff, 16#fffe]), + [{"roundtrip escaped", ?_assertEqual(S, decode(encode(S)))}, + {"roundtrip utf8", ?_assertEqual(S, decode((encoder([{utf8, true}]))(S)))}]. + -endif.
