[ https://issues.apache.org/jira/browse/MESOS-2013?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=14189416#comment-14189416 ]
Benjamin Mahler commented on MESOS-2013: ---------------------------------------- Thanks for the report, here is the relevant TODO from some time ago: https://github.com/apache/mesos/blob/0.20.1/3rdparty/libprocess/3rdparty/stout/include/stout/json.hpp#L321 {code} inline std::ostream& operator << (std::ostream& out, const String& string) { // TODO(benh): This escaping DOES NOT handle unicode, it encodes as ASCII. // See RFC4627 for the JSON string specificiation. out << "\""; foreach (unsigned char c, string.value) { switch (c) { case '"': out << "\\\""; break; case '\\': out << "\\\\"; break; case '/': out << "\\/"; break; case '\b': out << "\\b"; break; case '\f': out << "\\f"; break; case '\n': out << "\\n"; break; case '\r': out << "\\r"; break; case '\t': out << "\\t"; break; default: // See RFC4627 for these ranges. if ((c >= 0x20 && c <= 0x21) || (c >= 0x23 && c <= 0x5B) || (c >= 0x5D && c < 0x7F)) { out << c; } else { // NOTE: We also escape all bytes > 0x7F since they imply more than // 1 byte in UTF-8. This is why we don't escape UTF-8 properly. // See RFC4627 for the escaping format: \uXXXX (X is a hex digit). // Each byte here will be of the form: \u00XX (this is why we need // setw and the cast to unsigned int). 
out << "\\u" << std::setfill('0') << std::setw(4) << std::hex << std::uppercase << (unsigned int) c; } break; } } out << "\""; return out; } {code} I was hoping we could leverage picojson's serialization now that we pull it in as a library, but it doesn't look like they're doing it correctly at first glance: https://github.com/kazuho/picojson/blob/fa3498702cdf1fa48e334ff6c7b5599a2902674d/picojson.h#L406 {code} template <typename Iter> void serialize_str(const std::string& s, Iter oi) { *oi++ = '"'; for (std::string::const_iterator i = s.begin(); i != s.end(); ++i) { switch (*i) { #define MAP(val, sym) case val: copy(sym, oi); break MAP('"', "\\\""); MAP('\\', "\\\\"); MAP('/', "\\/"); MAP('\b', "\\b"); MAP('\f', "\\f"); MAP('\n', "\\n"); MAP('\r', "\\r"); MAP('\t', "\\t"); #undef MAP default: if (static_cast<unsigned char>(*i) < 0x20 || *i == 0x7f) { char buf[7]; SNPRINTF(buf, sizeof(buf), "\\u%04x", *i & 0xff); copy(buf, buf + 6, oi); } else { *oi++ = *i; } break; } } *oi++ = '"'; } {code} > Slave read endpoint doesn't encode non-ascii characters correctly > ----------------------------------------------------------------- > > Key: MESOS-2013 > URL: https://issues.apache.org/jira/browse/MESOS-2013 > Project: Mesos > Issue Type: Bug > Components: json api > Reporter: Whitney Sorenson > > Create a file in a sandbox with a non-ascii character, like this one: > http://www.fileformat.info/info/unicode/char/2018/index.htm > Hit the read endpoint for that file. > The response will have something like: > data: "\u00E2\u0080\u0098" > It should actually be: > data: "\u2018" > If you put either into JSON.parse() in the browser you will see the first > does not render correctly but the second does. -- This message was sent by Atlassian JIRA (v6.3.4#6332)