master to 999f464

wohali Sun, 18 Feb 2018 09:30:11 -0800

mochijson2: fix utf8 edge case in #168, use unicode module rather than 
xmerl_ucs, 100% test coverage



Project: http://git-wip-us.apache.org/repos/asf/couchdb-mochiweb/repo
Commit: http://git-wip-us.apache.org/repos/asf/couchdb-mochiweb/commit/57a890aa
Tree: http://git-wip-us.apache.org/repos/asf/couchdb-mochiweb/tree/57a890aa
Diff: http://git-wip-us.apache.org/repos/asf/couchdb-mochiweb/diff/57a890aa

Branch: refs/heads/master
Commit: 57a890aa2886d703c643bf05c99d3cf0e1ab28a0
Parents: 54ea016
Author: Bob Ippolito <[email protected]>
Authored: Fri Mar 18 11:27:15 2016 -0700
Committer: Bob Ippolito <[email protected]>
Committed: Fri Mar 18 14:50:28 2016 -0400

----------------------------------------------------------------------
 src/mochijson2.erl | 89 ++++++++++++++++++++++++++++++++++---------------
 1 file changed, 62 insertions(+), 27 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/couchdb-mochiweb/blob/57a890aa/src/mochijson2.erl
----------------------------------------------------------------------
diff --git a/src/mochijson2.erl b/src/mochijson2.erl
index 3d880db..255398d 100644
--- a/src/mochijson2.erl
+++ b/src/mochijson2.erl
@@ -203,19 +203,13 @@ json_encode_proplist(Props, State) ->
     lists:reverse([$\} | Acc1]).
 
 json_encode_string(A, State) when is_atom(A) ->
-    L = atom_to_list(A),
-    case json_string_is_safe(L) of
-        true ->
-            [?Q, L, ?Q];
-        false ->
-            json_encode_string_unicode(xmerl_ucs:from_utf8(L), State, [?Q])
-    end;
+    json_encode_string(atom_to_binary(A, latin1), State);
 json_encode_string(B, State) when is_binary(B) ->
     case json_bin_is_safe(B) of
         true ->
             [?Q, B, ?Q];
         false ->
-            json_encode_string_unicode(xmerl_ucs:from_utf8(B), State, [?Q])
+            json_encode_string_unicode(unicode:characters_to_list(B), State, [?Q])
     end;
 json_encode_string(I, _State) when is_integer(I) ->
     [?Q, integer_to_list(I), ?Q];
@@ -250,7 +244,7 @@ json_string_is_safe([C | Rest]) ->
         C when C < 16#7f ->
             json_string_is_safe(Rest);
         _ ->
-            false
+            exit({json_encode, {bad_char, C}})
     end.
 
 json_bin_is_safe(<<>>) ->
@@ -308,12 +302,13 @@ json_encode_string_unicode([C | Cs], State, Acc) ->
                C when C >= 0, C < $\s ->
                    [unihex(C) | Acc];
                C when C >= 16#7f, C =< 16#10FFFF, State#encoder.utf8 ->
-                   [xmerl_ucs:to_utf8(C) | Acc];
+                   [unicode:characters_to_binary([C]) | Acc];
                C when  C >= 16#7f, C =< 16#10FFFF, not State#encoder.utf8 ->
                    [unihex(C) | Acc];
                C when C < 16#7f ->
                    [C | Acc];
                _ ->
+                   %% json_string_is_safe guarantees that this branch is dead
                    exit({json_encode, {bad_char, C}})
            end,
     json_encode_string_unicode(Cs, State, Acc1).
@@ -468,12 +463,14 @@ tokenize_string(B, S=#decoder{offset=O}, Acc) ->
                 %% coalesce UTF-16 surrogate pair
                 <<"\\u", D3, D2, D1, D0, _/binary>> = Rest,
                 D = erlang:list_to_integer([D3,D2,D1,D0], 16),
-                [CodePoint] = xmerl_ucs:from_utf16be(<<C:16/big-unsigned-integer,
-                    D:16/big-unsigned-integer>>),
-                Acc1 = lists:reverse(xmerl_ucs:to_utf8(CodePoint), Acc),
+                Acc1 = [unicode:characters_to_binary(
+                            <<C:16/big-unsigned-integer,
+                              D:16/big-unsigned-integer>>,
+                            utf16)
+                       | Acc],
                 tokenize_string(B, ?ADV_COL(S, 12), Acc1);
             true ->
-                Acc1 = lists:reverse(xmerl_ucs:to_utf8(C), Acc),
+                Acc1 = [unicode:characters_to_binary([C]) | Acc],
                 tokenize_string(B, ?ADV_COL(S, 6), Acc1)
             end;
         <<_:O/binary, C1, _/binary>> when C1 < 128 ->
@@ -709,13 +706,13 @@ e2j_test_vec(utf8) ->
 %% test utf8 encoding
 encoder_utf8_test() ->
     %% safe conversion case (default)
-    [34,"\\u0001","\\u0442","\\u0435","\\u0441","\\u0442",34] =
-        encode(<<1,"\321\202\320\265\321\201\321\202">>),
+    <<"\"\\u0001\\u0442\\u0435\\u0441\\u0442\"">> =
+        iolist_to_binary(encode(<<1,"\321\202\320\265\321\201\321\202">>)),
 
     %% raw utf8 output (optional)
     Enc = mochijson2:encoder([{utf8, true}]),
-    [34,"\\u0001",[209,130],[208,181],[209,129],[209,130],34] =
-        Enc(<<1,"\321\202\320\265\321\201\321\202">>).
+    <<34,"\\u0001",209,130,208,181,209,129,209,130,34>> =
+        iolist_to_binary(Enc(<<1,"\321\202\320\265\321\201\321\202">>)).
 
 input_validation_test() ->
     Good = [
@@ -724,7 +721,7 @@ input_validation_test() ->
         {16#10196, <<?Q, 16#F0, 16#90, 16#86, 16#96, ?Q>>} %% denarius
     ],
     lists:foreach(fun({CodePoint, UTF8}) ->
-        Expect = list_to_binary(xmerl_ucs:to_utf8(CodePoint)),
+        Expect = unicode:characters_to_binary([CodePoint]),
         Expect = decode(UTF8)
     end, Good),
 
@@ -759,7 +756,7 @@ inline_json_test() ->
     ok.
 
 big_unicode_test() ->
-    UTF8Seq = list_to_binary(xmerl_ucs:to_utf8(16#0001d120)),
+    UTF8Seq = unicode:characters_to_binary([16#0001d120]),
     ?assertEqual(
        <<"\"\\ud834\\udd20\"">>,
        iolist_to_binary(encode(UTF8Seq))),
@@ -791,7 +788,10 @@ atom_test() ->
        iolist_to_binary(encode(foo))),
     ?assertEqual(
        <<"\"\\ud834\\udd20\"">>,
-       iolist_to_binary(encode(list_to_atom(xmerl_ucs:to_utf8(16#0001d120))))),
+       iolist_to_binary(
+         encode(
+           binary_to_atom(
+             unicode:characters_to_binary([16#0001d120]), latin1)))),
     ok.
 
 key_encode_test() ->
@@ -836,18 +836,21 @@ unsafe_chars_test() ->
        json_string_is_safe([16#0001d120])),
     ?assertEqual(
        false,
-       json_bin_is_safe(list_to_binary(xmerl_ucs:to_utf8(16#0001d120)))),
+       json_bin_is_safe(unicode:characters_to_binary([16#0001d120]))),
     ?assertEqual(
        [16#0001d120],
-       xmerl_ucs:from_utf8(
-         binary_to_list(
-           decode(encode(list_to_atom(xmerl_ucs:to_utf8(16#0001d120))))))),
+       unicode:characters_to_list(
+         decode(
+           encode(
+             binary_to_atom(
+               unicode:characters_to_binary([16#0001d120]),
+               latin1))))),
     ?assertEqual(
        false,
-       json_string_is_safe([16#110000])),
+       json_string_is_safe([16#10ffff])),
     ?assertEqual(
        false,
-       json_bin_is_safe(list_to_binary(xmerl_ucs:to_utf8([16#110000])))),
+       json_bin_is_safe(unicode:characters_to_binary([16#10ffff]))),
     %% solidus can be escaped but isn't unsafe by default
     ?assertEqual(
        <<"/">>,
@@ -904,4 +907,36 @@ format_test_() ->
                    {eep18, {P}},
                    {proplist, P}]].
 
+array_test() ->
+    A = [<<"hello">>],
+    ?assertEqual(A, decode(encode({array, A}))).
+
+bad_char_test() ->
+    ?assertEqual(
+       {'EXIT', {json_encode, {bad_char, 16#110000}}},
+       catch json_string_is_safe([16#110000])).
+
+utf8_roundtrip_test_() ->
+    %% These are the boundary cases for UTF8 encoding
+    Codepoints = [%% 7 bits  -> 1 byte
+                  16#00, 16#7f,
+                  %% 11 bits -> 2 bytes
+                  16#080, 16#07ff,
+                  %% 16 bits -> 3 bytes
+                  16#0800, 16#ffff,
+                  16#d7ff, 16#e000,
+                  %% 21 bits -> 4 bytes
+                  16#010000, 16#10ffff],
+    UTF8 = unicode:characters_to_binary(Codepoints),
+    Encode = encoder([{utf8, true}]),
+    [{"roundtrip escaped",
+      ?_assertEqual(UTF8, decode(encode(UTF8)))},
+     {"roundtrip utf8",
+      ?_assertEqual(UTF8, decode(Encode(UTF8)))}].
+
+utf8_non_character_test_() ->
+    S = unicode:characters_to_binary([16#ffff, 16#fffe]),
+    [{"roundtrip escaped", ?_assertEqual(S, decode(encode(S)))},
+     {"roundtrip utf8", ?_assertEqual(S, decode((encoder([{utf8, true}]))(S)))}].
+
 -endif.

[22/50] mochiweb commit: updated refs/heads/master to 999f464

Reply via email to