Title: [100405] trunk/Source/JavaScriptCore
Revision
100405
Author
msab...@apple.com
Date
2011-11-15 22:06:21 -0800 (Tue, 15 Nov 2011)

Log Message

Towards 8 bit Strings - Update utf8() and ascii() methods for 8 bit strings
https://bugs.webkit.org/show_bug.cgi?id=72323

Added 8 bit optimized paths for String and UString ascii() and utf8() methods.

Added String::characters8(), characters16() and is8Bit() helper methods.

Added a new Unicode::convertLatin1ToUTF8() method that works on
LChar (8 bit) strings that is a stripped down version of convertUTF16ToUTF8().

Reviewed by Geoff Garen.

* JavaScriptCore.vcproj/JavaScriptCore/JavaScriptCore.def:
* runtime/UString.cpp:
(JSC::UString::utf8):
* wtf/text/WTFString.cpp:
(WTF::String::ascii):
(WTF::String::utf8):
* wtf/text/WTFString.h:
(WTF::String::characters8):
(WTF::String::characters16):
(WTF::String::is8Bit):
(WTF::LChar):
(WTF::UChar):
* wtf/unicode/UTF8.cpp:
(WTF::Unicode::convertLatin1ToUTF8):
* wtf/unicode/UTF8.h:
* wtf/unicode/Unicode.h:

Modified Paths

Diff

Modified: trunk/Source/JavaScriptCore/ChangeLog (100404 => 100405)


--- trunk/Source/JavaScriptCore/ChangeLog	2011-11-16 05:46:55 UTC (rev 100404)
+++ trunk/Source/JavaScriptCore/ChangeLog	2011-11-16 06:06:21 UTC (rev 100405)
@@ -1,3 +1,34 @@
+2011-11-15  Michael Saboff  <msab...@apple.com>
+
+        Towards 8 bit Strings - Update utf8() and ascii() methods for 8 bit strings
+        https://bugs.webkit.org/show_bug.cgi?id=72323
+
+        Added 8 bit optimized paths for String and UString ascii() and utf8() methods.
+
+        Added String::characters8(), characters16() and is8Bit() helper methods.
+
+        Added a new Unicode::convertLatin1ToUTF8() method that works on
+        LChar (8 bit) strings that is a stripped down version of convertUTF16ToUTF8().
+
+        Reviewed by Geoff Garen.
+
+        * JavaScriptCore.vcproj/JavaScriptCore/JavaScriptCore.def:
+        * runtime/UString.cpp:
+        (JSC::UString::utf8):
+        * wtf/text/WTFString.cpp:
+        (WTF::String::ascii):
+        (WTF::String::utf8):
+        * wtf/text/WTFString.h:
+        (WTF::String::characters8):
+        (WTF::String::characters16):
+        (WTF::String::is8Bit):
+        (WTF::LChar):
+        (WTF::UChar):
+        * wtf/unicode/UTF8.cpp:
+        (WTF::Unicode::convertLatin1ToUTF8):
+        * wtf/unicode/UTF8.h:
+        * wtf/unicode/Unicode.h:
+
 2011-11-15  Darin Adler  <da...@apple.com>
 
         REGRESSION (r98887): ParserArena and Keywords leaking

Modified: trunk/Source/JavaScriptCore/JavaScriptCore.vcproj/JavaScriptCore/JavaScriptCore.def (100404 => 100405)


--- trunk/Source/JavaScriptCore/JavaScriptCore.vcproj/JavaScriptCore/JavaScriptCore.def	2011-11-16 05:46:55 UTC (rev 100404)
+++ trunk/Source/JavaScriptCore/JavaScriptCore.vcproj/JavaScriptCore/JavaScriptCore.def	2011-11-16 06:06:21 UTC (rev 100405)
@@ -99,6 +99,7 @@
     ?constructEmptyArray@JSC@@YAPAVJSArray@1@PAVExecState@1@@Z
     ?constructEmptyObject@JSC@@YAPAVJSObject@1@PAVExecState@1@@Z
     ?constructFunctionSkippingEvalEnabledCheck@JSC@@YAPAVJSObject@1@PAVExecState@1@PAVJSGlobalObject@1@ABVArgList@1@ABVIdentifier@1@ABVUString@1@ABVTextPosition@WTF@@@Z
+    ?convertLatin1ToUTF8@Unicode@WTF@@YA?AW4ConversionResult@12@PAPBEPBEPAPADPAD@Z
     ?convertUTF16ToUTF8@Unicode@WTF@@YA?AW4ConversionResult@12@PAPB_WPB_WPAPADPAD_N@Z
     ?convertUTF8ToUTF16@Unicode@WTF@@YA?AW4ConversionResult@12@PAPBDPBDPAPA_WPA_W_N@Z
     ?create@ByteArray@WTF@@SA?AV?$PassRefPtr@VByteArray@WTF@@@2@I@Z

Modified: trunk/Source/JavaScriptCore/runtime/UString.cpp (100404 => 100405)


--- trunk/Source/JavaScriptCore/runtime/UString.cpp	2011-11-16 05:46:55 UTC (rev 100404)
+++ trunk/Source/JavaScriptCore/runtime/UString.cpp	2011-11-16 06:06:21 UTC (rev 100405)
@@ -399,8 +399,8 @@
 {
     unsigned length = this->length();
 
-    if (is8Bit())
-        return CString(reinterpret_cast<const char*>(characters8()), length);
+    if (!length)
+        return CString("", 0);
 
     // Allocate a buffer big enough to hold all the characters
     // (an individual UTF-16 UChar can only expand to 3 UTF-8 bytes).
@@ -415,30 +415,38 @@
     if (length > numeric_limits<unsigned>::max() / 3)
         return CString();
 
-    const UChar* characters = this->characters16();
     Vector<char, 1024> bufferVector(length * 3);
-
     char* buffer = bufferVector.data();
-    ConversionResult result = convertUTF16ToUTF8(&characters, characters + length, &buffer, buffer + bufferVector.size(), strict);
-    ASSERT(result != targetExhausted); // (length * 3) should be sufficient for any conversion
 
-    // Only produced from strict conversion.
-    if (result == sourceIllegal)
-        return CString();
+    if (is8Bit()) {
+        const LChar* characters = this->characters8();
 
-    // Check for an unconverted high surrogate.
-    if (result == sourceExhausted) {
-        if (strict)
+        ConversionResult result = convertLatin1ToUTF8(&characters, characters + length, &buffer, buffer + bufferVector.size());
+        ASSERT_UNUSED(result, result != targetExhausted); // (length * 3) should be sufficient for any conversion
+    } else {
+        const UChar* characters = this->characters16();
+
+        ConversionResult result = convertUTF16ToUTF8(&characters, characters + length, &buffer, buffer + bufferVector.size(), strict);
+        ASSERT(result != targetExhausted); // (length * 3) should be sufficient for any conversion
+
+        // Only produced from strict conversion.
+        if (result == sourceIllegal)
             return CString();
-        // This should be one unpaired high surrogate. Treat it the same
-        // was as an unpaired high surrogate would have been handled in
-        // the middle of a string with non-strict conversion - which is
-        // to say, simply encode it to UTF-8.
-        ASSERT((characters + 1) == (this->characters() + length));
-        ASSERT((*characters >= 0xD800) && (*characters <= 0xDBFF));
-        // There should be room left, since one UChar hasn't been converted.
-        ASSERT((buffer + 3) <= (buffer + bufferVector.size()));
-        putUTF8Triple(buffer, *characters);
+
+        // Check for an unconverted high surrogate.
+        if (result == sourceExhausted) {
+            if (strict)
+                return CString();
+            // This should be one unpaired high surrogate. Treat it the same
+            // was as an unpaired high surrogate would have been handled in
+            // the middle of a string with non-strict conversion - which is
+            // to say, simply encode it to UTF-8.
+            ASSERT((characters + 1) == (this->characters() + length));
+            ASSERT((*characters >= 0xD800) && (*characters <= 0xDBFF));
+            // There should be room left, since one UChar hasn't been converted.
+            ASSERT((buffer + 3) <= (buffer + bufferVector.size()));
+            putUTF8Triple(buffer, *characters);
+        }
     }
 
     return CString(bufferVector.data(), buffer - bufferVector.data());

Modified: trunk/Source/JavaScriptCore/wtf/text/WTFString.cpp (100404 => 100405)


--- trunk/Source/JavaScriptCore/wtf/text/WTFString.cpp	2011-11-16 05:46:55 UTC (rev 100404)
+++ trunk/Source/JavaScriptCore/wtf/text/WTFString.cpp	2011-11-16 06:06:21 UTC (rev 100405)
@@ -641,8 +641,28 @@
     // preserved, characters outside of this range are converted to '?'.
 
     unsigned length = this->length();
-    const UChar* characters = this->characters();
 
+    if (!length) {
+        char* characterBuffer;
+        return CString::newUninitialized(length, characterBuffer);
+    }
+
+    if (this->is8Bit()) {
+        const LChar* characters = this->characters8();
+
+        char* characterBuffer;
+        CString result = CString::newUninitialized(length, characterBuffer);
+
+        for (unsigned i = 0; i < length; ++i) {
+            LChar ch = characters[i];
+            characterBuffer[i] = ch && (ch < 0x20 || ch > 0x7f) ? '?' : ch;
+        }
+
+        return result;        
+    }
+
+    const UChar* characters = this->characters16();
+
     char* characterBuffer;
     CString result = CString::newUninitialized(length, characterBuffer);
 
@@ -685,8 +705,10 @@
 CString String::utf8(bool strict) const
 {
     unsigned length = this->length();
-    const UChar* characters = this->characters();
 
+    if (!length)
+        return CString("", 0);
+
     // Allocate a buffer big enough to hold all the characters
     // (an individual UTF-16 UChar can only expand to 3 UTF-8 bytes).
     // Optimization ideas, if we find this function is hot:
@@ -702,26 +724,36 @@
     Vector<char, 1024> bufferVector(length * 3);
 
     char* buffer = bufferVector.data();
-    ConversionResult result = convertUTF16ToUTF8(&characters, characters + length, &buffer, buffer + bufferVector.size(), strict);
-    ASSERT(result != targetExhausted); // (length * 3) should be sufficient for any conversion
 
-    // Only produced from strict conversion.
-    if (result == sourceIllegal)
-        return CString();
+    if (is8Bit()) {
+        const LChar* characters = this->characters8();
 
-    // Check for an unconverted high surrogate.
-    if (result == sourceExhausted) {
-        if (strict)
+        ConversionResult result = convertLatin1ToUTF8(&characters, characters + length, &buffer, buffer + bufferVector.size());
+        ASSERT_UNUSED(result, result != targetExhausted); // (length * 3) should be sufficient for any conversion
+    } else {
+        const UChar* characters = this->characters16();
+
+        ConversionResult result = convertUTF16ToUTF8(&characters, characters + length, &buffer, buffer + bufferVector.size(), strict);
+        ASSERT(result != targetExhausted); // (length * 3) should be sufficient for any conversion
+
+        // Only produced from strict conversion.
+        if (result == sourceIllegal)
             return CString();
-        // This should be one unpaired high surrogate. Treat it the same
-        // was as an unpaired high surrogate would have been handled in
-        // the middle of a string with non-strict conversion - which is
-        // to say, simply encode it to UTF-8.
-        ASSERT((characters + 1) == (this->characters() + length));
-        ASSERT((*characters >= 0xD800) && (*characters <= 0xDBFF));
-        // There should be room left, since one UChar hasn't been converted.
-        ASSERT((buffer + 3) <= (buffer + bufferVector.size()));
-        putUTF8Triple(buffer, *characters);
+
+        // Check for an unconverted high surrogate.
+        if (result == sourceExhausted) {
+            if (strict)
+                return CString();
+            // This should be one unpaired high surrogate. Treat it the same
+            // was as an unpaired high surrogate would have been handled in
+            // the middle of a string with non-strict conversion - which is
+            // to say, simply encode it to UTF-8.
+            ASSERT((characters + 1) == (this->characters() + length));
+            ASSERT((*characters >= 0xD800) && (*characters <= 0xDBFF));
+            // There should be room left, since one UChar hasn't been converted.
+            ASSERT((buffer + 3) <= (buffer + bufferVector.size()));
+            putUTF8Triple(buffer, *characters);
+        }
     }
 
     return CString(bufferVector.data(), buffer - bufferVector.data());

Modified: trunk/Source/JavaScriptCore/wtf/text/WTFString.h (100404 => 100405)


--- trunk/Source/JavaScriptCore/wtf/text/WTFString.h	2011-11-16 05:46:55 UTC (rev 100404)
+++ trunk/Source/JavaScriptCore/wtf/text/WTFString.h	2011-11-16 06:06:21 UTC (rev 100405)
@@ -140,7 +140,28 @@
             return 0;
         return m_impl->characters();
     }
+    
+    const LChar* characters8() const
+    {
+        if (!m_impl)
+            return 0;
+        ASSERT(m_impl->is8Bit());
+        return m_impl->characters8();
+    }
 
+    const UChar* characters16() const
+    {
+        if (!m_impl)
+            return 0;
+        ASSERT(!m_impl->is8Bit());
+        return m_impl->characters16();
+    }
+
+    template <typename CharType>
+    inline const CharType* getCharacters() const;
+
+    bool is8Bit() const { return m_impl->is8Bit(); }
+
     WTF_EXPORT_PRIVATE CString ascii() const;
     WTF_EXPORT_PRIVATE CString latin1() const;
     WTF_EXPORT_PRIVATE CString utf8(bool strict = false) const;
@@ -396,6 +417,21 @@
 {
 }
 
+template<>
+inline const LChar* String::getCharacters<LChar>() const
+{
+    ASSERT(is8Bit());
+    return characters8();
+}
+
+template<>
+inline const UChar* String::getCharacters<UChar>() const
+{
+    ASSERT(!is8Bit());
+    return characters16();
+}
+
+
 #ifdef __OBJC__
 // This is for situations in WebKit where the long standing behavior has been
 // "nil if empty", so we try to maintain longstanding behavior for the sake of

Modified: trunk/Source/JavaScriptCore/wtf/unicode/UTF8.cpp (100404 => 100405)


--- trunk/Source/JavaScriptCore/wtf/unicode/UTF8.cpp	2011-11-16 05:46:55 UTC (rev 100404)
+++ trunk/Source/JavaScriptCore/wtf/unicode/UTF8.cpp	2011-11-16 06:06:21 UTC (rev 100405)
@@ -125,6 +125,48 @@
 // for *legal* UTF-8 will be 4 or fewer bytes total.
 static const unsigned char firstByteMark[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
 
+ConversionResult convertLatin1ToUTF8(
+                                     const LChar** sourceStart, const LChar* sourceEnd, 
+                                     char** targetStart, char* targetEnd)
+{
+    ConversionResult result = conversionOK;
+    const LChar* source = *sourceStart;
+    char* target = *targetStart;
+    while (source < sourceEnd) {
+        UChar32 ch;
+        unsigned short bytesToWrite = 0;
+        const UChar32 byteMask = 0xBF;
+        const UChar32 byteMark = 0x80; 
+        const LChar* oldSource = source; // In case we have to back up because of target overflow.
+        ch = static_cast<unsigned short>(*source++);
+
+        // Figure out how many bytes the result will require
+        if (ch < (UChar32)0x80)
+            bytesToWrite = 1;
+        else
+            bytesToWrite = 2;
+
+        target += bytesToWrite;
+        if (target > targetEnd) {
+            source = oldSource; // Back up source pointer!
+            target -= bytesToWrite;
+            result = targetExhausted;
+            break;
+        }
+        switch (bytesToWrite) { // note: everything falls through.
+        case 2:
+            *--target = (char)((ch | byteMark) & byteMask);
+            ch >>= 6;
+        case 1:
+            *--target =  (char)(ch | firstByteMark[bytesToWrite]);
+        }
+        target += bytesToWrite;
+    }
+    *sourceStart = source;
+    *targetStart = target;
+    return result;
+}
+
 ConversionResult convertUTF16ToUTF8(
     const UChar** sourceStart, const UChar* sourceEnd, 
     char** targetStart, char* targetEnd, bool strict)

Modified: trunk/Source/JavaScriptCore/wtf/unicode/UTF8.h (100404 => 100405)


--- trunk/Source/JavaScriptCore/wtf/unicode/UTF8.h	2011-11-16 05:46:55 UTC (rev 100404)
+++ trunk/Source/JavaScriptCore/wtf/unicode/UTF8.h	2011-11-16 06:06:21 UTC (rev 100405)
@@ -66,6 +66,10 @@
                     const char** sourceStart, const char* sourceEnd, 
                     UChar** targetStart, UChar* targetEnd, bool strict = true);
 
+    ConversionResult convertLatin1ToUTF8(
+                    const LChar** sourceStart, const LChar* sourceEnd, 
+                    char** targetStart, char* targetEnd);
+
     ConversionResult convertUTF16ToUTF8(
                     const UChar** sourceStart, const UChar* sourceEnd, 
                     char** targetStart, char* targetEnd, bool strict = true);

Modified: trunk/Source/JavaScriptCore/wtf/unicode/Unicode.h (100404 => 100405)


--- trunk/Source/JavaScriptCore/wtf/unicode/Unicode.h	2011-11-16 05:46:55 UTC (rev 100404)
+++ trunk/Source/JavaScriptCore/wtf/unicode/Unicode.h	2011-11-16 06:06:21 UTC (rev 100405)
@@ -25,6 +25,9 @@
 
 #include <wtf/Assertions.h>
 
+// Define platform neutral 8 bit character type (L is for Latin-1).
+typedef unsigned char LChar;
+
 #if USE(QT4_UNICODE)
 #include "qt4/UnicodeQt4.h"
 #elif USE(ICU_UNICODE)
@@ -39,7 +42,4 @@
 
 COMPILE_ASSERT(sizeof(UChar) == 2, UCharIsTwoBytes);
 
-// Define platform neutral 8 bit character type (L is for Latin-1).
-typedef unsigned char LChar;
-
 #endif // WTF_UNICODE_H
_______________________________________________
webkit-changes mailing list
webkit-changes@lists.webkit.org
http://lists.webkit.org/mailman/listinfo.cgi/webkit-changes

Reply via email to