Title: [207152] trunk
Revision
207152
Author
[email protected]
Date
2016-10-11 11:28:50 -0700 (Tue, 11 Oct 2016)

Log Message

URLParser should percent-encode non-ASCII and non-printable characters in fragment
https://bugs.webkit.org/show_bug.cgi?id=163287

Reviewed by Brady Eidson.

Source/WebCore:

Based on discussion in https://github.com/whatwg/url/issues/150
If that discussion decides to keep the spec as-is (which keeps non-ASCII characters in the fragment
to match IE and Edge's behavior, which Chrome has followed for special schemes) then we can revert
this change later after enabling the URL parser.  Making this change keeps behavior matching Safari
and Firefox, as well as Chrome's handling of non-special schemes, such as data URLs.

Covered by updated API tests.

* platform/URLParser.cpp:
(WebCore::URLParser::appendToASCIIBuffer):
(WebCore::URLParser::copyURLPartsUntil):
(WebCore::URLParser::syntaxViolation):
(WebCore::URLParser::currentPosition):
(WebCore::URLParser::parse):
(WebCore::URLParser::fragmentSyntaxViolation): Deleted.
* platform/URLParser.h:
No more non-ASCII characters in canonicalized URLs.

Tools:

* TestWebKitAPI/Tests/WebCore/URLParser.cpp:
(TestWebKitAPI::TEST_F):

Modified Paths

Diff

Modified: trunk/Source/WebCore/ChangeLog (207151 => 207152)


--- trunk/Source/WebCore/ChangeLog	2016-10-11 18:20:23 UTC (rev 207151)
+++ trunk/Source/WebCore/ChangeLog	2016-10-11 18:28:50 UTC (rev 207152)
@@ -1,5 +1,30 @@
 2016-10-11  Alex Christensen  <[email protected]>
 
+        URLParser should percent-encode non-ASCII and non-printable characters in fragment
+        https://bugs.webkit.org/show_bug.cgi?id=163287
+
+        Reviewed by Brady Eidson.
+
+        Based on discussion in https://github.com/whatwg/url/issues/150
+        If that discussion decides to keep the spec as-is (which keeps non-ASCII characters in the fragment
+        to match IE and Edge's behavior, which Chrome has followed for special schemes) then we can revert
+        this change later after enabling the URL parser.  Making this change keeps behavior matching Safari
+        and Firefox, as well as Chrome's handling of non-special schemes, such as data URLs.
+
+        Covered by updated API tests.
+
+        * platform/URLParser.cpp:
+        (WebCore::URLParser::appendToASCIIBuffer):
+        (WebCore::URLParser::copyURLPartsUntil):
+        (WebCore::URLParser::syntaxViolation):
+        (WebCore::URLParser::currentPosition):
+        (WebCore::URLParser::parse):
+        (WebCore::URLParser::fragmentSyntaxViolation): Deleted.
+        * platform/URLParser.h:
+        No more non-ASCII characters in canonicalized URLs.
+
+2016-10-11  Alex Christensen  <[email protected]>
+
         Remove dead networking code
         https://bugs.webkit.org/show_bug.cgi?id=163263
 

Modified: trunk/Source/WebCore/platform/URLParser.cpp (207151 => 207152)


--- trunk/Source/WebCore/platform/URLParser.cpp	2016-10-11 18:20:23 UTC (rev 207151)
+++ trunk/Source/WebCore/platform/URLParser.cpp	2016-10-11 18:28:50 UTC (rev 207152)
@@ -462,7 +462,6 @@
 
 ALWAYS_INLINE void URLParser::appendToASCIIBuffer(UChar32 codePoint)
 {
-    ASSERT(m_unicodeFragmentBuffer.isEmpty());
     ASSERT(isASCII(codePoint));
     if (UNLIKELY(m_didSeeSyntaxViolation))
         m_asciiBuffer.append(codePoint);
@@ -470,7 +469,6 @@
 
 ALWAYS_INLINE void URLParser::appendToASCIIBuffer(const char* characters, size_t length)
 {
-    ASSERT(m_unicodeFragmentBuffer.isEmpty());
     if (UNLIKELY(m_didSeeSyntaxViolation))
         m_asciiBuffer.append(characters, length);
 }
@@ -829,7 +827,6 @@
     syntaxViolation(iterator);
 
     m_asciiBuffer.clear();
-    m_unicodeFragmentBuffer.clear();
     copyASCIIStringUntil(base.m_string, urlLengthUntilPart(base, part));
     switch (part) {
     case URLPart::FragmentEnd:
@@ -1018,7 +1015,6 @@
     m_didSeeSyntaxViolation = true;
     
     ASSERT(m_asciiBuffer.isEmpty());
-    ASSERT(m_unicodeFragmentBuffer.isEmpty());
     size_t codeUnitsToCopy = iterator.codeUnitsSince(reinterpret_cast<const CharacterType*>(m_inputBegin));
     RELEASE_ASSERT(codeUnitsToCopy <= m_inputString.length());
     m_asciiBuffer.reserveCapacity(m_inputString.length());
@@ -1028,30 +1024,6 @@
     }
 }
 
-template<typename CharacterType>
-void URLParser::fragmentSyntaxViolation(const CodePointIterator<CharacterType>& iterator)
-{
-    ASSERT(m_didSeeUnicodeFragmentCodePoint);
-    if (m_didSeeSyntaxViolation)
-        return;
-    m_didSeeSyntaxViolation = true;
-
-    ASSERT(m_asciiBuffer.isEmpty());
-    ASSERT(m_unicodeFragmentBuffer.isEmpty());
-    size_t codeUnitsToCopy = iterator.codeUnitsSince(reinterpret_cast<const CharacterType*>(m_inputBegin));
-    size_t asciiCodeUnitsToCopy = m_url.m_queryEnd;
-    size_t unicodeCodeUnitsToCopy = codeUnitsToCopy - asciiCodeUnitsToCopy;
-    RELEASE_ASSERT(codeUnitsToCopy <= m_inputString.length());
-    m_asciiBuffer.reserveCapacity(asciiCodeUnitsToCopy);
-    for (size_t i = 0; i < asciiCodeUnitsToCopy; ++i) {
-        ASSERT(isASCII(m_inputString[i]));
-        m_asciiBuffer.uncheckedAppend(m_inputString[i]);
-    }
-    m_unicodeFragmentBuffer.reserveCapacity(m_inputString.length() - asciiCodeUnitsToCopy);
-    for (size_t i = asciiCodeUnitsToCopy; i < asciiCodeUnitsToCopy + unicodeCodeUnitsToCopy; ++i)
-        m_unicodeFragmentBuffer.uncheckedAppend(m_inputString[i]);
-}
-
 void URLParser::failure()
 {
     m_url.invalidate();
@@ -1111,10 +1083,8 @@
 template<typename CharacterType>
 ALWAYS_INLINE size_t URLParser::currentPosition(const CodePointIterator<CharacterType>& iterator)
 {
-    if (UNLIKELY(m_didSeeSyntaxViolation)) {
-        ASSERT(m_unicodeFragmentBuffer.isEmpty());
+    if (UNLIKELY(m_didSeeSyntaxViolation))
         return m_asciiBuffer.size();
-    }
     
     return iterator.codeUnitsSince(reinterpret_cast<const CharacterType*>(m_inputBegin));
 }
@@ -1160,7 +1130,6 @@
     URL_PARSER_LOG("Parsing URL <%s> base <%s> encoding <%s>", String(input, length).utf8().data(), base.string().utf8().data(), encoding.name());
     m_url = { };
     ASSERT(m_asciiBuffer.isEmpty());
-    ASSERT(m_unicodeFragmentBuffer.isEmpty());
     
     bool isUTF8Encoding = encoding == UTF8Encoding();
     Vector<UChar> queryBuffer;
@@ -1811,35 +1780,9 @@
             } while (!c.atEnd());
             break;
         case State::Fragment:
-            do {
-                URL_PARSER_LOG("State Fragment");
-                if (!c.atEnd() && isTabOrNewline(*c)) {
-                    if (m_didSeeUnicodeFragmentCodePoint)
-                        fragmentSyntaxViolation(c);
-                    else
-                        syntaxViolation(c);
-                    ++c;
-                    continue;
-                }
-                if (!m_didSeeUnicodeFragmentCodePoint && isASCII(*c)) {
-                    if (m_urlIsSpecial)
-                        appendToASCIIBuffer(*c);
-                    else
-                        utf8PercentEncode<isInSimpleEncodeSet>(c);
-                } else {
-                    if (m_urlIsSpecial) {
-                        m_didSeeUnicodeFragmentCodePoint = true;
-                        if (UNLIKELY(m_didSeeSyntaxViolation))
-                            appendCodePoint(m_unicodeFragmentBuffer, *c);
-                        else {
-                            ASSERT(m_asciiBuffer.isEmpty());
-                            ASSERT(m_unicodeFragmentBuffer.isEmpty());
-                        }
-                    } else
-                        utf8PercentEncode<isInSimpleEncodeSet>(c);
-                }
-                ++c;
-            } while (!c.atEnd());
+            URL_PARSER_LOG("State Fragment");
+            utf8PercentEncode<isInSimpleEncodeSet>(c);
+            ++c;
             break;
         }
     }
@@ -2043,28 +1986,16 @@
         m_url.m_fragmentEnd = m_url.m_queryEnd;
         break;
     case State::Fragment:
-        {
-            LOG_FINAL_STATE("Fragment");
-            size_t length = m_didSeeSyntaxViolation ? m_asciiBuffer.size() + m_unicodeFragmentBuffer.size() : c.codeUnitsSince(reinterpret_cast<const CharacterType*>(m_inputBegin));
-            m_url.m_fragmentEnd = length;
-            break;
-        }
+        LOG_FINAL_STATE("Fragment");
+        m_url.m_fragmentEnd = currentPosition(c);
+        break;
     }
 
     if (LIKELY(!m_didSeeSyntaxViolation)) {
         m_url.m_string = m_inputString;
         ASSERT(m_asciiBuffer.isEmpty());
-        ASSERT(m_unicodeFragmentBuffer.isEmpty());
-    } else if (!m_didSeeUnicodeFragmentCodePoint) {
-        ASSERT(m_unicodeFragmentBuffer.isEmpty());
+    } else
         m_url.m_string = String::adopt(WTFMove(m_asciiBuffer));
-    } else {
-        Vector<UChar> buffer;
-        buffer.reserveInitialCapacity(m_asciiBuffer.size() + m_unicodeFragmentBuffer.size());
-        buffer.appendVector(m_asciiBuffer);
-        buffer.appendVector(m_unicodeFragmentBuffer);
-        m_url.m_string = String::adopt(WTFMove(buffer));
-    }
     m_url.m_isValid = true;
     URL_PARSER_LOG("Parsed URL <%s>", m_url.m_string.utf8().data());
 }

Modified: trunk/Source/WebCore/platform/URLParser.h (207151 => 207152)


--- trunk/Source/WebCore/platform/URLParser.h	2016-10-11 18:20:23 UTC (rev 207151)
+++ trunk/Source/WebCore/platform/URLParser.h	2016-10-11 18:28:50 UTC (rev 207152)
@@ -51,8 +51,6 @@
 private:
     URL m_url;
     Vector<LChar> m_asciiBuffer;
-    Vector<UChar> m_unicodeFragmentBuffer;
-    bool m_didSeeUnicodeFragmentCodePoint { false };
     bool m_urlIsSpecial { false };
     bool m_hostHasPercentOrNonASCII { false };
     String m_inputString;
@@ -73,7 +71,6 @@
     void advance(CodePointIterator<CharacterType>&, const CodePointIterator<CharacterType>& iteratorForSyntaxViolationPosition);
     template<typename CharacterType> bool takesTwoAdvancesUntilEnd(CodePointIterator<CharacterType>);
     template<typename CharacterType> void syntaxViolation(const CodePointIterator<CharacterType>&);
-    template<typename CharacterType> void fragmentSyntaxViolation(const CodePointIterator<CharacterType>&);
     template<typename CharacterType> bool isPercentEncodedDot(CodePointIterator<CharacterType>);
     template<typename CharacterType> bool isWindowsDriveLetter(CodePointIterator<CharacterType>);
     template<typename CharacterType> bool isSingleDotPathSegment(CodePointIterator<CharacterType>);

Modified: trunk/Tools/ChangeLog (207151 => 207152)


--- trunk/Tools/ChangeLog	2016-10-11 18:20:23 UTC (rev 207151)
+++ trunk/Tools/ChangeLog	2016-10-11 18:28:50 UTC (rev 207152)
@@ -1,5 +1,15 @@
 2016-10-11  Alex Christensen  <[email protected]>
 
+        URLParser should percent-encode non-ASCII and non-printable characters in fragment
+        https://bugs.webkit.org/show_bug.cgi?id=163287
+
+        Reviewed by Brady Eidson.
+
+        * TestWebKitAPI/Tests/WebCore/URLParser.cpp:
+        (TestWebKitAPI::TEST_F):
+
+2016-10-11  Alex Christensen  <[email protected]>
+
         Remove dead networking code
         https://bugs.webkit.org/show_bug.cgi?id=163263
 

Modified: trunk/Tools/TestWebKitAPI/Tests/WebCore/URLParser.cpp (207151 => 207152)


--- trunk/Tools/TestWebKitAPI/Tests/WebCore/URLParser.cpp	2016-10-11 18:20:23 UTC (rev 207151)
+++ trunk/Tools/TestWebKitAPI/Tests/WebCore/URLParser.cpp	2016-10-11 18:28:50 UTC (rev 207152)
@@ -331,6 +331,10 @@
     checkURL("aA://", {"aa", "", "", "", 0, "//", "", "", "aa://"});
     checkURL(utf16String(u"foo://host/#ПП\u0007 a</"), {"foo", "", "", "host", 0, "/", "", "%D0%9F%D0%9F%07 a</", "foo://host/#%D0%9F%D0%9F%07 a</"});
     checkURL(utf16String(u"foo://host/#\u0007 a</"), {"foo", "", "", "host", 0, "/", "", "%07 a</", "foo://host/#%07 a</"});
+    checkURL(utf16String(u"http://host?ĆŸšŸ˜#ĆŸšŸ˜"), {"http", "", "", "host", 0, "/", "%C3%9F%F0%9F%98%8D", "%C3%9F%F0%9F%98%8D", "http://host/?%C3%9F%F0%9F%98%8D#%C3%9F%F0%9F%98%8D"}, testTabsValueForSurrogatePairs);
+    checkURL(utf16String(u"http://host/path#šŸ’©\tšŸ’©"), {"http", "", "", "host", 0, "/path", "", "%F0%9F%92%A9%F0%9F%92%A9", "http://host/path#%F0%9F%92%A9%F0%9F%92%A9"}, testTabsValueForSurrogatePairs);
+    checkURL(utf16String(u"http://host/#ПП\u0007 a</"), {"http", "", "", "host", 0, "/", "", "%D0%9F%D0%9F%07 a</", "http://host/#%D0%9F%D0%9F%07 a</"});
+    checkURL(utf16String(u"http://host/#\u0007 a</"), {"http", "", "", "host", 0, "/", "", "%07 a</", "http://host/#%07 a</"});
 
     // This disagrees with the web platform test for http://:@www.example.com but agrees with Chrome and URL::parse,
     // and Firefox fails the web platform test differently. Maybe the web platform test ought to be changed.
@@ -456,6 +460,7 @@
     checkRelativeURL("  ", "http://host/path?query#fra#gment", {"http", "", "", "host", 0, "/path", "query", "", "http://host/path?query"});
     checkRelativeURL(" \a ", "http://host/#fragment", {"http", "", "", "host", 0, "/", "", "", "http://host/"});
     checkRelativeURL("foo://", "http://example.org/foo/bar", {"foo", "", "", "", 0, "//", "", "", "foo://"});
+    checkRelativeURL(utf16String(u"#β"), "http://example.org/foo/bar", {"http", "", "", "example.org", 0, "/foo/bar", "", "%CE%B2", "http://example.org/foo/bar#%CE%B2"});
 
     // The checking of slashes in SpecialAuthoritySlashes needed to get this to pass contradicts what is in the spec,
     // but it is included in the web platform tests.
@@ -630,9 +635,6 @@
     checkURLDifferences("file://[0:a:0:0:b:c:0:0]/path",
         {"file", "", "", "[0:a::b:c:0:0]", 0, "/path", "", "", "file://[0:a::b:c:0:0]/path"},
         {"file", "", "", "[0:a:0:0:b:c:0:0]", 0, "/path", "", "", "file://[0:a:0:0:b:c:0:0]/path"});
-    checkRelativeURLDifferences(utf16String(u"#β"), "http://example.org/foo/bar",
-        {"http", "", "", "example.org", 0, "/foo/bar", "", utf16String(u"β"), utf16String(u"http://example.org/foo/bar#β")},
-        {"http", "", "", "example.org", 0, "/foo/bar", "", "%CE%B2", "http://example.org/foo/bar#%CE%B2"});
     checkURLDifferences("http://",
         {"", "", "", "", 0, "", "", "", "http://"},
         {"http", "", "", "", 0, "/", "", "", "http:/"});
@@ -769,12 +771,6 @@
     checkURLDifferences("notspecial://@test@test@example:800\\path@end",
         {"notspecial", "@test@test@example", "800\\path", "end", 0, "/", "", "", "notspecial://%40test%40test%40example:800%5Cpath@end/"},
         {"", "", "", "", 0, "", "", "", "notspecial://@test@test@example:800\\path@end"});
-    checkURLDifferences(utf16String(u"http://host?ĆŸšŸ˜#ĆŸšŸ˜"),
-        {"http", "", "", "host", 0, "/", "%C3%9F%F0%9F%98%8D", utf16String(u"ĆŸšŸ˜"), utf16String(u"http://host/?%C3%9F%F0%9F%98%8D#ĆŸšŸ˜")},
-        {"http", "", "", "host", 0, "/", "%C3%9F%F0%9F%98%8D", "%C3%9F%F0%9F%98%8D", "http://host/?%C3%9F%F0%9F%98%8D#%C3%9F%F0%9F%98%8D"}, testTabsValueForSurrogatePairs);
-    checkURLDifferences(utf16String(u"http://host/path#šŸ’©\tšŸ’©"),
-        {"http", "", "", "host", 0, "/path", "", utf16String(u"šŸ’©šŸ’©"), utf16String(u"http://host/path#šŸ’©šŸ’©")},
-        {"http", "", "", "host", 0, "/path", "", "%F0%9F%92%A9%F0%9F%92%A9", "http://host/path#%F0%9F%92%A9%F0%9F%92%A9"});
     checkURLDifferences("http://%48OsT",
         {"http", "", "", "host", 0, "/", "", "", "http://host/"},
         {"http", "", "", "%48ost", 0, "/", "", "", "http://%48ost/"});
@@ -1060,12 +1056,6 @@
     checkURLDifferences("file://:0/path",
         {"", "", "", "", 0, "", "", "", "file://:0/path"},
         {"file", "", "", "", 0, "/path", "", "", "file://:0/path"});
-    checkURLDifferences(utf16String(u"http://host/#ПП\u0007 a</"),
-        {"http", "", "", "host", 0, "/", "", utf16String(u"ПП\u0007 a</"), utf16String(u"http://host/#ПП\u0007 a</")},
-        {"http", "", "", "host", 0, "/", "", "%D0%9F%D0%9F%07 a</", "http://host/#%D0%9F%D0%9F%07 a</"});
-    checkURLDifferences(utf16String(u"http://host/#\u0007 a</"),
-        {"http", "", "", "host", 0, "/", "", "\a a</", "http://host/#\a a</"},
-        {"http", "", "", "host", 0, "/", "", "%07 a</", "http://host/#%07 a</"});
 }
     
 static void shouldFail(const String& urlString)
@@ -1224,8 +1214,7 @@
 
 TEST_F(URLParserTest, QueryEncoding)
 {
-    checkURL(utf16String(u"http://host?ĆŸšŸ˜#ĆŸšŸ˜"), UTF8Encoding(), {"http", "", "", "host", 0, "/", "%C3%9F%F0%9F%98%8D", utf16String(u"ĆŸšŸ˜"), utf16String(u"http://host/?%C3%9F%F0%9F%98%8D#ĆŸšŸ˜")}, testTabsValueForSurrogatePairs);
-    checkURL(utf16String(u"http://host?ĆŸšŸ˜#ĆŸšŸ˜"), UTF8Encoding(), {"http", "", "", "host", 0, "/", "%C3%9F%F0%9F%98%8D", utf16String(u"ĆŸšŸ˜"), utf16String(u"http://host/?%C3%9F%F0%9F%98%8D#ĆŸšŸ˜")}, testTabsValueForSurrogatePairs);
+    checkURL(utf16String(u"http://host?ĆŸšŸ˜#ĆŸšŸ˜"), UTF8Encoding(), {"http", "", "", "host", 0, "/", "%C3%9F%F0%9F%98%8D", "%C3%9F%F0%9F%98%8D", utf16String(u"http://host/?%C3%9F%F0%9F%98%8D#%C3%9F%F0%9F%98%8D")}, testTabsValueForSurrogatePairs);
 
     TextEncoding latin1(String("latin1"));
     checkURL("http://host/?query with%20spaces", latin1, {"http", "", "", "host", 0, "/", "query%20with%20spaces", "", "http://host/?query%20with%20spaces"});
_______________________________________________
webkit-changes mailing list
[email protected]
https://lists.webkit.org/mailman/listinfo/webkit-changes

Reply via email to