Skip to site navigation (Press enter)

[webkit-changes] [206177] trunk

achristensen Tue, 20 Sep 2016 14:51:10 -0700

Title: [206177] trunk

Revision: 206177
Author: achristen...@apple.com
Date: 2016-09-20 14:50:30 -0700 (Tue, 20 Sep 2016)

Log Message

Reduce allocations in URLParser
https://bugs.webkit.org/show_bug.cgi?id=162241


Reviewed by Chris Dumez.

Source/WebCore:

Use Vectors instead of StringBuilders.  This allows us to use the inline capacity on the stack
for short URLs (<2KB) and also allows us to skip branches because we know whether the
contained type is UChar or LChar at compile time.  It also allows us to use uncheckedAppend.

Added new API tests for parts that were less tested, but there is
no change in behavior except for a performance improvement.

* platform/URLParser.cpp:
(WebCore::appendCodePoint):
(WebCore::encodeQuery):
(WebCore::URLParser::failure):
(WebCore::URLParser::parse):
(WebCore::percentDecode):
(WebCore::domainToASCII):
(WebCore::hasInvalidDomainCharacter):
(WebCore::URLParser::parseHost):
(WebCore::formURLDecode):
(WebCore::isC0Control): Deleted.
* platform/URLParser.h:

Tools:

* TestWebKitAPI/Tests/WebCore/URLParser.cpp:
(TestWebKitAPI::TEST_F):
(TestWebKitAPI::checkURL):

Modified Paths

trunk/Source/WebCore/ChangeLog
trunk/Source/WebCore/platform/URLParser.cpp
trunk/Source/WebCore/platform/URLParser.h
trunk/Tools/ChangeLog
trunk/Tools/TestWebKitAPI/Tests/WebCore/URLParser.cpp

Diff

Modified: trunk/Source/WebCore/ChangeLog (206176 => 206177)


--- trunk/Source/WebCore/ChangeLog	2016-09-20 21:43:26 UTC (rev 206176)
+++ trunk/Source/WebCore/ChangeLog	2016-09-20 21:50:30 UTC (rev 206177)
@@ -1,3 +1,30 @@
+2016-09-20  Alex Christensen  <achristen...@webkit.org>
+
+        Reduce allocations in URLParser
+        https://bugs.webkit.org/show_bug.cgi?id=162241
+
+        Reviewed by Chris Dumez.
+
+        Use Vectors instead of StringBuilders.  This allows us to use the inline capacity on the stack
+        for short URLs (<2KB) and also allows us to skip branches because we know whether the
+        contained type is UChar or LChar at compile time.  It also allows us to use uncheckedAppend.
+
+        Added new API tests for parts that were less tested, but there is
+        no change in behavior except for a performance improvement.
+
+        * platform/URLParser.cpp:
+        (WebCore::appendCodePoint):
+        (WebCore::encodeQuery):
+        (WebCore::URLParser::failure):
+        (WebCore::URLParser::parse):
+        (WebCore::percentDecode):
+        (WebCore::domainToASCII):
+        (WebCore::hasInvalidDomainCharacter):
+        (WebCore::URLParser::parseHost):
+        (WebCore::formURLDecode):
+        (WebCore::isC0Control): Deleted.
+        * platform/URLParser.h:
+
 2016-09-20  Nan Wang  <n_w...@apple.com>
 
         AX: voiceover does not read contents of input role="spinbutton"

Modified: trunk/Source/WebCore/platform/URLParser.cpp (206176 => 206177)


--- trunk/Source/WebCore/platform/URLParser.cpp	2016-09-20 21:43:26 UTC (rev 206176)
+++ trunk/Source/WebCore/platform/URLParser.cpp	2016-09-20 21:50:30 UTC (rev 206177)
@@ -30,10 +30,6 @@
 #include <array>
 #include <unicode/uidna.h>
 #include <unicode/utypes.h>
-#include <wtf/HashMap.h>
-#include <wtf/NeverDestroyed.h>
-#include <wtf/text/StringBuilder.h>
-#include <wtf/text/StringHash.h>
 
 namespace WebCore {
 
@@ -115,6 +111,17 @@
     m_begin += i;
     return *this;
 }
+    
+static void appendCodePoint(Vector<UChar>& destination, UChar32 codePoint)
+{
+    if (U_IS_BMP(codePoint)) {
+        destination.append(static_cast<UChar>(codePoint));
+        return;
+    }
+    destination.reserveCapacity(destination.size() + 2);
+    destination.uncheckedAppend(U16_LEAD(codePoint));
+    destination.uncheckedAppend(U16_TRAIL(codePoint));
+}
 
 enum URLCharacterClass {
     UserInfo = 0x1,
@@ -504,10 +511,10 @@
     }
 }
     
-inline static void encodeQuery(const StringBuilder& source, Vector<LChar>& destination, const TextEncoding& encoding)
+inline static void encodeQuery(const Vector<UChar>& source, Vector<LChar>& destination, const TextEncoding& encoding)
 {
     // FIXME: It is unclear in the spec what to do when encoding fails. The behavior should be specified and tested.
-    CString encoded = encoding.encode(source.toStringPreserveCapacity(), URLEncodedEntitiesForUnencodables);
+    CString encoded = encoding.encode(StringView(source.data(), source.size()), URLEncodedEntitiesForUnencodables);
     const char* data = encoded.data();
     size_t length = encoded.length();
     for (size_t i = 0; i < length; ++i) {
@@ -912,7 +919,7 @@
         return parse<serialized>(input.characters8(), input.length(), { }, UTF8Encoding());
     return parse<serialized>(input.characters16(), input.length(), { }, UTF8Encoding());
 }
-    
+
 template<bool serialized, typename CharacterType>
 URL URLParser::parse(const CharacterType* input, const unsigned length, const URL& base, const TextEncoding& encoding)
 {
@@ -923,7 +930,7 @@
     m_asciiBuffer.reserveCapacity(length);
     
     bool isUTF8Encoding = encoding == UTF8Encoding();
-    StringBuilder queryBuffer;
+    Vector<UChar> queryBuffer;
 
     unsigned endIndex = length;
     while (endIndex && isC0ControlOrSpace(input[endIndex - 1]))
@@ -1408,7 +1415,7 @@
             if (isUTF8Encoding)
                 utf8PercentEncodeQuery<serialized>(*c, m_asciiBuffer);
             else
-                queryBuffer.append(*c);
+                appendCodePoint(queryBuffer, *c);
             ++c;
             break;
         case State::Fragment:
@@ -1416,7 +1423,7 @@
             if (m_unicodeFragmentBuffer.isEmpty() && isASCII(*c))
                 m_asciiBuffer.append(*c);
             else
-                m_unicodeFragmentBuffer.append(*c);
+                appendCodePoint(m_unicodeFragmentBuffer, *c);
             ++c;
             break;
         }
@@ -1926,25 +1933,27 @@
     return address;
 }
 
-// FIXME: This should return a CString.
-inline static String percentDecode(const LChar* input, size_t length)
+const size_t defaultInlineBufferSize = 2048;
+
+inline static Vector<LChar, defaultInlineBufferSize> percentDecode(const LChar* input, size_t length)
 {
-    StringBuilder output;
+    Vector<LChar, defaultInlineBufferSize> output;
+    output.reserveInitialCapacity(length);
     
     for (size_t i = 0; i < length; ++i) {
         uint8_t byte = input[i];
         if (byte != '%')
-            output.append(byte);
+            output.uncheckedAppend(byte);
         else if (i < length - 2) {
             if (isASCIIHexDigit(input[i + 1]) && isASCIIHexDigit(input[i + 2])) {
-                output.append(toASCIIHexValue(input[i + 1], input[i + 2]));
+                output.uncheckedAppend(toASCIIHexValue(input[i + 1], input[i + 2]));
                 i += 2;
             } else
-                output.append(byte);
+                output.uncheckedAppend(byte);
         } else
-            output.append(byte);
+            output.uncheckedAppend(byte);
     }
-    return output.toStringPreserveCapacity();
+    return output;
 }
 
 inline static bool containsOnlyASCII(const String& string)
@@ -1954,22 +1963,26 @@
     return charactersAreAllASCII(string.characters16(), string.length());
 }
 
-inline static Optional<String> domainToASCII(const String& domain)
+inline static Optional<Vector<LChar, defaultInlineBufferSize>> domainToASCII(const String& domain)
 {
-    const unsigned hostnameBufferLength = 2048;
-
+    Vector<LChar, defaultInlineBufferSize> ascii;
     if (containsOnlyASCII(domain)) {
-        if (domain.is8Bit())
-            return domain.convertToASCIILowercase();
-        Vector<LChar, hostnameBufferLength> buffer;
         size_t length = domain.length();
-        buffer.reserveInitialCapacity(length);
-        for (size_t i = 0; i < length; ++i)
-            buffer.append(toASCIILower(domain[i]));
-        return String(buffer.data(), length);
+        if (domain.is8Bit()) {
+            const LChar* characters = domain.characters8();
+            ascii.reserveInitialCapacity(length);
+            for (size_t i = 0; i < length; ++i)
+                ascii.uncheckedAppend(toASCIILower(characters[i]));
+        } else {
+            const UChar* characters = domain.characters16();
+            ascii.reserveInitialCapacity(length);
+            for (size_t i = 0; i < length; ++i)
+                ascii.uncheckedAppend(toASCIILower(characters[i]));
+        }
+        return ascii;
     }
     
-    UChar hostnameBuffer[hostnameBufferLength];
+    UChar hostnameBuffer[defaultInlineBufferSize];
     UErrorCode error = U_ZERO_ERROR;
 
 #if COMPILER(GCC) || COMPILER(CLANG)
@@ -1977,18 +1990,19 @@
 #pragma GCC diagnostic ignored "-Wdeprecated-declarations"
 #endif
     // FIXME: This should use uidna_openUTS46 / uidna_close instead
-    int32_t numCharactersConverted = uidna_IDNToASCII(StringView(domain).upconvertedCharacters(), domain.length(), hostnameBuffer, hostnameBufferLength, UIDNA_ALLOW_UNASSIGNED, nullptr, &error);
+    int32_t numCharactersConverted = uidna_IDNToASCII(StringView(domain).upconvertedCharacters(), domain.length(), hostnameBuffer, defaultInlineBufferSize, UIDNA_ALLOW_UNASSIGNED, nullptr, &error);
 #if COMPILER(GCC) || COMPILER(CLANG)
 #pragma GCC diagnostic pop
 #endif
+    ASSERT(numCharactersConverted <= static_cast<int32_t>(defaultInlineBufferSize));
 
     if (error == U_ZERO_ERROR) {
-        LChar buffer[hostnameBufferLength];
         for (int32_t i = 0; i < numCharactersConverted; ++i) {
             ASSERT(isASCII(hostnameBuffer[i]));
-            buffer[i] = hostnameBuffer[i];
+            ASSERT(!isASCIIUpper(hostnameBuffer[i]));
         }
-        return String(buffer, numCharactersConverted);
+        ascii.append(hostnameBuffer, numCharactersConverted);
+        return ascii;
     }
 
     // FIXME: Check for U_BUFFER_OVERFLOW_ERROR and retry with an allocated buffer.
@@ -1995,12 +2009,10 @@
     return Nullopt;
 }
 
-inline static bool hasInvalidDomainCharacter(const String& asciiDomain)
+inline static bool hasInvalidDomainCharacter(const Vector<LChar, defaultInlineBufferSize>& asciiDomain)
 {
-    RELEASE_ASSERT(asciiDomain.is8Bit());
-    const LChar* characters = asciiDomain.characters8();
-    for (size_t i = 0; i < asciiDomain.length(); ++i) {
-        if (isInvalidDomainCharacter(characters[i]))
+    for (size_t i = 0; i < asciiDomain.size(); ++i) {
+        if (isInvalidDomainCharacter(asciiDomain[i]))
             return true;
     }
     return false;
@@ -2095,9 +2107,8 @@
         m_url.m_portEnd = m_asciiBuffer.size();
         return true;
     }
-
-    // FIXME: We probably don't need to make so many buffers and String copies.
-    StringBuilder utf8Encoded;
+    
+    Vector<LChar, defaultInlineBufferSize> utf8Encoded;
     for (; !iterator.atEnd(); ++iterator) {
         if (!serialized && isTabOrNewline(*iterator))
             continue;
@@ -2111,18 +2122,15 @@
         // FIXME: Check error.
         utf8Encoded.append(buffer, offset);
     }
-    RELEASE_ASSERT(utf8Encoded.is8Bit());
-    String percentDecoded = percentDecode(utf8Encoded.characters8(), utf8Encoded.length());
-    RELEASE_ASSERT(percentDecoded.is8Bit());
-    String domain = String::fromUTF8(percentDecoded.characters8(), percentDecoded.length());
+    Vector<LChar, defaultInlineBufferSize> percentDecoded = percentDecode(utf8Encoded.data(), utf8Encoded.size());
+    String domain = String::fromUTF8(percentDecoded.data(), percentDecoded.size());
     auto asciiDomain = domainToASCII(domain);
     if (!asciiDomain || hasInvalidDomainCharacter(asciiDomain.value()))
         return false;
-    String& asciiDomainValue = asciiDomain.value();
-    RELEASE_ASSERT(asciiDomainValue.is8Bit());
-    const LChar* asciiDomainCharacters = asciiDomainValue.characters8();
+    Vector<LChar, defaultInlineBufferSize>& asciiDomainValue = asciiDomain.value();
+    const LChar* asciiDomainCharacters = asciiDomainValue.data();
 
-    if (auto address = parseIPv4Host(CodePointIterator<LChar>(asciiDomainCharacters, asciiDomainCharacters + asciiDomainValue.length()))) {
+    if (auto address = parseIPv4Host(CodePointIterator<LChar>(asciiDomainValue.begin(), asciiDomainValue.end()))) {
         serializeIPv4(address.value(), m_asciiBuffer);
         m_url.m_hostEnd = m_asciiBuffer.size();
         if (iterator.atEnd()) {
@@ -2133,7 +2141,7 @@
         return parsePort<serialized>(iterator);
     }
 
-    m_asciiBuffer.append(asciiDomainCharacters, asciiDomainValue.length());
+    m_asciiBuffer.append(asciiDomainCharacters, asciiDomainValue.size());
     m_url.m_hostEnd = m_asciiBuffer.size();
     if (!iterator.atEnd()) {
         ASSERT(*iterator == ':');
@@ -2150,8 +2158,7 @@
     if (utf8.isNull())
         return Nullopt;
     auto percentDecoded = percentDecode(reinterpret_cast<const LChar*>(utf8.data()), utf8.length());
-    RELEASE_ASSERT(percentDecoded.is8Bit());
-    return String::fromUTF8(percentDecoded.characters8(), percentDecoded.length());
+    return String::fromUTF8(percentDecoded.data(), percentDecoded.size());
 }
 
 auto URLParser::parseURLEncodedForm(StringView input) -> URLEncodedForm

Modified: trunk/Source/WebCore/platform/URLParser.h (206176 => 206177)


--- trunk/Source/WebCore/platform/URLParser.h	2016-09-20 21:43:26 UTC (rev 206176)
+++ trunk/Source/WebCore/platform/URLParser.h	2016-09-20 21:50:30 UTC (rev 206177)
@@ -52,7 +52,7 @@
 private:
     URL m_url;
     Vector<LChar> m_asciiBuffer;
-    Vector<UChar32> m_unicodeFragmentBuffer;
+    Vector<UChar> m_unicodeFragmentBuffer;
     bool m_urlIsSpecial { false };
     bool m_hostHasPercentOrNonASCII { false };

Modified: trunk/Tools/ChangeLog (206176 => 206177)


--- trunk/Tools/ChangeLog	2016-09-20 21:43:26 UTC (rev 206176)
+++ trunk/Tools/ChangeLog	2016-09-20 21:50:30 UTC (rev 206177)
@@ -1,5 +1,16 @@
 2016-09-20  Alex Christensen  <achristen...@webkit.org>
 
+        Reduce allocations in URLParser
+        https://bugs.webkit.org/show_bug.cgi?id=162241
+
+        Reviewed by Chris Dumez.
+
+        * TestWebKitAPI/Tests/WebCore/URLParser.cpp:
+        (TestWebKitAPI::TEST_F):
+        (TestWebKitAPI::checkURL):
+
+2016-09-20  Alex Christensen  <achristen...@webkit.org>
+
         Align URLParser with web platform tests when parsing non-special relative URLs ending in AuthorityOrHost state
         https://bugs.webkit.org/show_bug.cgi?id=162251

Modified: trunk/Tools/TestWebKitAPI/Tests/WebCore/URLParser.cpp (206176 => 206177)


--- trunk/Tools/TestWebKitAPI/Tests/WebCore/URLParser.cpp	2016-09-20 21:43:26 UTC (rev 206176)
+++ trunk/Tools/TestWebKitAPI/Tests/WebCore/URLParser.cpp	2016-09-20 21:50:30 UTC (rev 206177)
@@ -26,6 +26,7 @@
 #include "config.h"
 #include <WebCore/URLParser.h>
 #include <wtf/MainThread.h>
+#include <wtf/text/StringBuilder.h>
 
 using namespace WebCore;
 
@@ -558,6 +559,9 @@
     checkRelativeURLDifferences("foo://", "http://example.org/foo/bar",
         {"foo", "", "", "", 0, "/", "", "", "foo:///"},
         {"foo", "", "", "", 0, "//", "", "", "foo://"});
+    checkURLDifferences(wideString(L"http://host?ß😍#ß😍"),
+        {"http", "", "", "host", 0, "/", "%C3%9F%F0%9F%98%8D", wideString(L"ß😍"), wideString(L"http://host/?%C3%9F%F0%9F%98%8D#ß😍")},
+        {"http", "", "", "host", 0, "/", "%C3%9F%F0%9F%98%8D", "%C3%9F%F0%9F%98%8D", "http://host/?%C3%9F%F0%9F%98%8D#%C3%9F%F0%9F%98%8D"});
 
     // This matches the spec and web platform tests, but not Chrome, Firefox, or URL::parse.
     checkRelativeURLDifferences("notspecial:/", "about:blank",
@@ -649,6 +653,10 @@
     checkURLDifferences("unknown://host:81",
         {"unknown", "", "", "host", 81, "/", "", "", "unknown://host:81/"},
         {"unknown", "", "", "host", 81, "", "", "", "unknown://host:81"});
+    checkURLDifferences("http://%48OsT",
+        {"http", "", "", "host", 0, "/", "", "", "http://host/"},
+        {"http", "", "", "%48ost", 0, "/", "", "", "http://%48ost/"});
+
 }
     
 static void shouldFail(const String& urlString)
@@ -713,4 +721,25 @@
     checkRelativeURL("notspecial:", "http://example.org/foo/bar", {"notspecial", "", "", "", 0, "", "", "", "notspecial:"});
 }
 
+static void checkURL(const String& urlString, const TextEncoding& encoding, const ExpectedParts& parts)
+{
+    URLParser parser;
+    auto url = parser.parse(urlString, { }, encoding);
+    EXPECT_TRUE(eq(parts.protocol, url.protocol()));
+    EXPECT_TRUE(eq(parts.user, url.user()));
+    EXPECT_TRUE(eq(parts.password, url.pass()));
+    EXPECT_TRUE(eq(parts.host, url.host()));
+    EXPECT_EQ(parts.port, url.port());
+    EXPECT_TRUE(eq(parts.path, url.path()));
+    EXPECT_TRUE(eq(parts.query, url.query()));
+    EXPECT_TRUE(eq(parts.fragment, url.fragmentIdentifier()));
+    EXPECT_TRUE(eq(parts.string, url.string()));
+}
+
+TEST_F(URLParserTest, QueryEncoding)
+{
+    checkURL(wideString(L"http://host?ß😍#ß😍"), UTF8Encoding(), {"http", "", "", "host", 0, "/", "%C3%9F%F0%9F%98%8D", wideString(L"ß😍"), wideString(L"http://host/?%C3%9F%F0%9F%98%8D#ß😍")});
+    // FIXME: Add tests with other encodings.
+}
+
 } // namespace TestWebKitAPI

_______________________________________________
webkit-changes mailing list
webkit-changes@lists.webkit.org
https://lists.webkit.org/mailman/listinfo/webkit-changes