Title: [206648] trunk
Revision
206648
Author
achristen...@apple.com
Date
2016-09-30 11:46:28 -0700 (Fri, 30 Sep 2016)

Log Message

URLParser: handle syntax violations in non-UTF-8 encoded queries
https://bugs.webkit.org/show_bug.cgi?id=162770

Reviewed by Tim Horton.

Source/WebCore:

There is a fast path for queries of URLs that use UTF-8 encoding, which are quite common.
For non-UTF-8 encoded queries, which are less common, we put the code points in a Vector<UChar>
and encode them all at once.  If there is a syntax violation in the query, we need to copy the
syntax-violation-free string up to the beginning of the query, then encode the query.

Covered by new API tests.

* platform/URLParser.cpp:
(WebCore::URLParser::percentEncodeByte):
(WebCore::URLParser::encodeQuery):
(WebCore::URLParser::parse):
* platform/URLParser.h:

Tools:

* TestWebKitAPI/Tests/WebCore/URLParser.cpp:
(TestWebKitAPI::checkURL):
(TestWebKitAPI::TEST_F):
Tests with emoji change behavior when we insert a tab between the surrogates, so don't do the
insert-tab-at-each-location verification that syntax violations are handled correctly.

Modified Paths

Diff

Modified: trunk/Source/WebCore/ChangeLog (206647 => 206648)


--- trunk/Source/WebCore/ChangeLog	2016-09-30 18:44:16 UTC (rev 206647)
+++ trunk/Source/WebCore/ChangeLog	2016-09-30 18:46:28 UTC (rev 206648)
@@ -1,3 +1,23 @@
+2016-09-30  Alex Christensen  <achristen...@webkit.org>
+
+        URLParser: handle syntax violations in non-UTF-8 encoded queries
+        https://bugs.webkit.org/show_bug.cgi?id=162770
+
+        Reviewed by Tim Horton.
+
+        There is a fast path for queries of URLs that use UTF-8 encoding, which are quite common.
+        For non-UTF-8 encoded queries, which are less common, we put the code points in a Vector<UChar>
+        and encode them all at once.  If there is a syntax violation in the query, we need to copy the
+        syntax-violation-free string up to the beginning of the query, then encode the query.
+
+        Covered by new API tests.
+
+        * platform/URLParser.cpp:
+        (WebCore::URLParser::percentEncodeByte):
+        (WebCore::URLParser::encodeQuery):
+        (WebCore::URLParser::parse):
+        * platform/URLParser.h:
+
 2016-09-27  Anders Carlsson  <ander...@apple.com>
 
         Remove a couple of unused members from PlatformKeyboardEvent

Modified: trunk/Source/WebCore/platform/URLParser.cpp (206647 => 206648)


--- trunk/Source/WebCore/platform/URLParser.cpp	2016-09-30 18:44:16 UTC (rev 206647)
+++ trunk/Source/WebCore/platform/URLParser.cpp	2016-09-30 18:46:28 UTC (rev 206648)
@@ -494,6 +494,7 @@
 
 void URLParser::percentEncodeByte(uint8_t byte)
 {
+    ASSERT(m_didSeeSyntaxViolation);
     appendToASCIIBuffer('%');
     appendToASCIIBuffer(upperNibbleToASCIIHexDigit(byte));
     appendToASCIIBuffer(lowerNibbleToASCIIHexDigit(byte));
@@ -562,15 +563,39 @@
             appendToASCIIBuffer(byte);
     }
 }
-    
-void URLParser::encodeQuery(const Vector<UChar>& source, const TextEncoding& encoding)
+
+template<typename CharacterType>
+void URLParser::encodeQuery(const Vector<UChar>& source, const TextEncoding& encoding, CodePointIterator<CharacterType> iterator)
 {
     // FIXME: It is unclear in the spec what to do when encoding fails. The behavior should be specified and tested.
     CString encoded = encoding.encode(StringView(source.data(), source.size()), URLEncodedEntitiesForUnencodables);
     const char* data = ""
     size_t length = encoded.length();
-    for (size_t i = 0; i < length; ++i) {
+    
+    if (!length == !iterator.atEnd()) {
+        syntaxViolation(iterator);
+        return;
+    }
+    
+    size_t i = 0;
+    for (; i < length; ++i) {
+        ASSERT(!iterator.atEnd());
         uint8_t byte = data[i];
+        if (UNLIKELY(byte != *iterator)) {
+            syntaxViolation(iterator);
+            break;
+        }
+        if (UNLIKELY(shouldPercentEncodeQueryByte(byte))) {
+            syntaxViolation(iterator);
+            break;
+        }
+        appendToASCIIBuffer(byte);
+        ++iterator;
+    }
+    ASSERT((i == length) == iterator.atEnd());
+    for (; i < length; ++i) {
+        ASSERT(m_didSeeSyntaxViolation);
+        uint8_t byte = data[i];
         if (shouldPercentEncodeQueryByte(byte))
             percentEncodeByte(byte);
         else
@@ -1103,6 +1128,7 @@
     }
     CodePointIterator<CharacterType> c(input, input + endIndex);
     CodePointIterator<CharacterType> authorityOrHostBegin;
+    CodePointIterator<CharacterType> queryBegin;
     while (UNLIKELY(!c.atEnd() && isC0ControlOrSpace(*c))) {
         syntaxViolation(c);
         ++c;
@@ -1127,7 +1153,8 @@
         PathStart,
         Path,
         CannotBeABaseURLPath,
-        Query,
+        UTF8Query,
+        NonUTF8Query,
         Fragment,
     };
 
@@ -1288,8 +1315,13 @@
             case '?':
                 copyURLPartsUntil(base, URLPart::PathEnd, c);
                 appendToASCIIBuffer('?');
-                state = State::Query;
                 ++c;
+                if (isUTF8Encoding)
+                    state = State::UTF8Query;
+                else {
+                    queryBegin = c;
+                    state = State::NonUTF8Query;
+                }
                 break;
             case '#':
                 copyURLPartsUntil(base, URLPart::QueryEnd, c);
@@ -1431,6 +1463,13 @@
                 if (base.isValid() && base.protocolIs("file"))
                     copyURLPartsUntil(base, URLPart::PathEnd, c);
                 appendToASCIIBuffer("///?", 4);
+                ++c;
+                if (isUTF8Encoding)
+                    state = State::UTF8Query;
+                else {
+                    queryBegin = c;
+                    state = State::NonUTF8Query;
+                }
                 m_url.m_userStart = currentPosition(c) - 2;
                 m_url.m_userEnd = m_url.m_userStart;
                 m_url.m_passwordEnd = m_url.m_userStart;
@@ -1438,8 +1477,6 @@
                 m_url.m_portEnd = m_url.m_userStart;
                 m_url.m_pathAfterLastSlash = m_url.m_userStart + 1;
                 m_url.m_pathEnd = m_url.m_pathAfterLastSlash;
-                state = State::Query;
-                ++c;
                 break;
             case '#':
                 syntaxViolation(c);
@@ -1533,9 +1570,14 @@
                             syntaxViolation(c);
                             appendToASCIIBuffer("/?", 2);
                             ++c;
+                            if (isUTF8Encoding)
+                                state = State::UTF8Query;
+                            else {
+                                queryBegin = c;
+                                state = State::NonUTF8Query;
+                            }
                             m_url.m_pathAfterLastSlash = currentPosition(c) - 1;
                             m_url.m_pathEnd = m_url.m_pathAfterLastSlash;
-                            state = State::Query;
                             break;
                         }
                         if (UNLIKELY(*c == '#')) {
@@ -1601,7 +1643,14 @@
             }
             if (*c == '?') {
                 m_url.m_pathEnd = currentPosition(c);
-                state = State::Query;
+                appendToASCIIBuffer('?');
+                ++c;
+                if (isUTF8Encoding)
+                    state = State::UTF8Query;
+                else {
+                    queryBegin = c;
+                    state = State::NonUTF8Query;
+                }
                 break;
             }
             if (*c == '#') {
@@ -1628,7 +1677,14 @@
             LOG_STATE("CannotBeABaseURLPath");
             if (*c == '?') {
                 m_url.m_pathEnd = currentPosition(c);
-                state = State::Query;
+                appendToASCIIBuffer('?');
+                ++c;
+                if (isUTF8Encoding)
+                    state = State::UTF8Query;
+                else {
+                    queryBegin = c;
+                    state = State::NonUTF8Query;
+                }
             } else if (*c == '#') {
                 m_url.m_pathEnd = currentPosition(c);
                 m_url.m_queryEnd = m_url.m_pathEnd;
@@ -1642,11 +1698,10 @@
                 ++c;
             }
             break;
-        case State::Query:
-            LOG_STATE("Query");
+        case State::UTF8Query:
+            LOG_STATE("UTF8Query");
+            ASSERT(queryBegin == CodePointIterator<CharacterType>());
             if (*c == '#') {
-                if (!isUTF8Encoding)
-                    encodeQuery(queryBuffer, encoding);
                 m_url.m_queryEnd = currentPosition(c);
                 state = State::Fragment;
                 break;
@@ -1657,6 +1712,20 @@
                 appendCodePoint(queryBuffer, *c);
             ++c;
             break;
+        case State::NonUTF8Query:
+            do {
+                LOG_STATE("NonUTF8Query");
+                ASSERT(queryBegin != CodePointIterator<CharacterType>());
+                if (*c == '#') {
+                    encodeQuery(queryBuffer, encoding, CodePointIterator<CharacterType>(queryBegin, c));
+                    m_url.m_queryEnd = currentPosition(c);
+                    state = State::Fragment;
+                    break;
+                }
+                appendCodePoint(queryBuffer, *c);
+                advance(c, queryBegin);
+            } while (!c.atEnd());
+            break;
         case State::Fragment:
             do {
                 LOG(URLParser, "State Fragment");
@@ -1859,13 +1928,19 @@
         m_url.m_queryEnd = m_url.m_pathEnd;
         m_url.m_fragmentEnd = m_url.m_pathEnd;
         break;
-    case State::Query:
-        LOG_FINAL_STATE("Query");
-        if (!isUTF8Encoding)
-            encodeQuery(queryBuffer, encoding);
+    case State::UTF8Query:
+        LOG_FINAL_STATE("UTF8Query");
+        ASSERT(queryBegin == CodePointIterator<CharacterType>());
         m_url.m_queryEnd = currentPosition(c);
         m_url.m_fragmentEnd = m_url.m_queryEnd;
         break;
+    case State::NonUTF8Query:
+        LOG_FINAL_STATE("NonUTF8Query");
+        ASSERT(queryBegin != CodePointIterator<CharacterType>());
+        encodeQuery(queryBuffer, encoding, CodePointIterator<CharacterType>(queryBegin, c));
+        m_url.m_queryEnd = currentPosition(c);
+        m_url.m_fragmentEnd = m_url.m_queryEnd;
+        break;
     case State::Fragment:
         {
             LOG_FINAL_STATE("Fragment");

Modified: trunk/Source/WebCore/platform/URLParser.h (206647 => 206648)


--- trunk/Source/WebCore/platform/URLParser.h	2016-09-30 18:44:16 UTC (rev 206647)
+++ trunk/Source/WebCore/platform/URLParser.h	2016-09-30 18:46:28 UTC (rev 206648)
@@ -92,7 +92,7 @@
     void appendToASCIIBuffer(UChar32);
     void appendToASCIIBuffer(const char*, size_t);
     void appendToASCIIBuffer(const LChar* characters, size_t size) { appendToASCIIBuffer(reinterpret_cast<const char*>(characters), size); }
-    void encodeQuery(const Vector<UChar>& source, const TextEncoding&);
+    template<typename CharacterType> void encodeQuery(const Vector<UChar>& source, const TextEncoding&, CodePointIterator<CharacterType>);
     void copyASCIIStringUntil(const String&, size_t lengthIf8Bit, size_t lengthIf16Bit);
     StringView parsedDataView(size_t start, size_t length);
 

Modified: trunk/Tools/ChangeLog (206647 => 206648)


--- trunk/Tools/ChangeLog	2016-09-30 18:44:16 UTC (rev 206647)
+++ trunk/Tools/ChangeLog	2016-09-30 18:46:28 UTC (rev 206648)
@@ -1,3 +1,16 @@
+2016-09-30  Alex Christensen  <achristen...@webkit.org>
+
+        URLParser: handle syntax violations in non-UTF-8 encoded queries
+        https://bugs.webkit.org/show_bug.cgi?id=162770
+
+        Reviewed by Tim Horton.
+
+        * TestWebKitAPI/Tests/WebCore/URLParser.cpp:
+        (TestWebKitAPI::checkURL):
+        (TestWebKitAPI::TEST_F):
+        Tests with emoji change behavior when we insert a tab between the surrogates, so don't do the
+        insert-tab-at-each-location verification that syntax violations are handled correctly.
+
 2016-09-30  Megan Gardner  <megan_gard...@apple.com>
 
         Make it possible to test web-related user-interface features

Modified: trunk/Tools/TestWebKitAPI/Tests/WebCore/URLParser.cpp (206647 => 206648)


--- trunk/Tools/TestWebKitAPI/Tests/WebCore/URLParser.cpp	2016-09-30 18:44:16 UTC (rev 206647)
+++ trunk/Tools/TestWebKitAPI/Tests/WebCore/URLParser.cpp	2016-09-30 18:46:28 UTC (rev 206648)
@@ -961,7 +961,7 @@
         {"http", "", "", "w", 0, "/", "%ED%A0%80", "", "http://w/?%ED%A0%80"});
 }
 
-static void checkURL(const String& urlString, const TextEncoding& encoding, const ExpectedParts& parts)
+static void checkURL(const String& urlString, const TextEncoding& encoding, const ExpectedParts& parts, bool checkTabs = true)
 {
     URLParser parser(urlString, { }, encoding);
     auto url = ""
@@ -975,13 +975,32 @@
     EXPECT_TRUE(eq(parts.fragment, url.fragmentIdentifier()));
     EXPECT_TRUE(eq(parts.string, url.string()));
 
-    // FIXME: check tabs here like we do for checkURL and checkRelativeURL.
+    if (checkTabs) {
+        for (size_t i = 0; i < urlString.length(); ++i) {
+            String urlStringWithTab = makeString(urlString.substring(0, i), "\t", urlString.substring(i));
+            ExpectedParts invalidPartsWithTab = {"", "", "", "", 0, "" , "", "", urlStringWithTab};
+            checkURL(urlStringWithTab, encoding, parts.isInvalid() ? invalidPartsWithTab : parts, false);
+        }
+    }
 }
 
 TEST_F(URLParserTest, QueryEncoding)
 {
-    checkURL(utf16String(u"http://host?ß😍#ß😍"), UTF8Encoding(), {"http", "", "", "host", 0, "/", "%C3%9F%F0%9F%98%8D", utf16String(u"ß😍"), utf16String(u"http://host/?%C3%9F%F0%9F%98%8D#ß😍")});
-    // FIXME: Add tests with other encodings.
+    checkURL(utf16String(u"http://host?ß😍#ß😍"), UTF8Encoding(), {"http", "", "", "host", 0, "/", "%C3%9F%F0%9F%98%8D", utf16String(u"ß😍"), utf16String(u"http://host/?%C3%9F%F0%9F%98%8D#ß😍")}, false);
+    checkURL(utf16String(u"http://host?ß😍#ß😍"), UTF8Encoding(), {"http", "", "", "host", 0, "/", "%C3%9F%F0%9F%98%8D", utf16String(u"ß😍"), utf16String(u"http://host/?%C3%9F%F0%9F%98%8D#ß😍")}, false);
+
+    TextEncoding latin1(String("latin1"));
+    checkURL("http://host/?query with%20spaces", latin1, {"http", "", "", "host", 0, "/", "query%20with%20spaces", "", "http://host/?query%20with%20spaces"});
+    checkURL("http://host/?query", latin1, {"http", "", "", "host", 0, "/", "query", "", "http://host/?query"});
+    checkURL("http://host/?\tquery", latin1, {"http", "", "", "host", 0, "/", "query", "", "http://host/?query"});
+    checkURL("http://host/?q\tuery", latin1, {"http", "", "", "host", 0, "/", "query", "", "http://host/?query"});
+    checkURL("http://host/?query with SpAcEs#fragment", latin1, {"http", "", "", "host", 0, "/", "query%20with%20SpAcEs", "fragment", "http://host/?query%20with%20SpAcEs#fragment"});
+
+    TextEncoding unrecognized(String("unrecognized invalid encoding name"));
+    checkURL("http://host/?query", unrecognized, {"http", "", "", "host", 0, "/", "", "", "http://host/?"});
+    checkURL("http://host/?", unrecognized, {"http", "", "", "host", 0, "/", "", "", "http://host/?"});
+
+    // FIXME: Add more tests with other encodings and things like non-ascii characters, emoji and unmatched surrogate pairs.
 }
 
 } // namespace TestWebKitAPI
_______________________________________________
webkit-changes mailing list
webkit-changes@lists.webkit.org
https://lists.webkit.org/mailman/listinfo/webkit-changes

Reply via email to