Diff
Modified: trunk/Source/JavaScriptCore/ChangeLog (100404 => 100405)
--- trunk/Source/JavaScriptCore/ChangeLog 2011-11-16 05:46:55 UTC (rev 100404)
+++ trunk/Source/JavaScriptCore/ChangeLog 2011-11-16 06:06:21 UTC (rev 100405)
@@ -1,3 +1,34 @@
+2011-11-15 Michael Saboff <msab...@apple.com>
+
+ Towards 8 bit Strings - Update utf8() and ascii() methods for 8 bit strings
+ https://bugs.webkit.org/show_bug.cgi?id=72323
+
+ Added 8 bit optimized paths for String and UString ascii() and utf8() methods.
+
+ Added String::characters8(), characters16() and is8Bit() helper methods.
+
+ Added a new Unicode::convertLatin1ToUTF8() method that works on
+ LChar (8 bit) strings; it is a stripped down version of convertUTF16ToUTF8().
+
+ Reviewed by Geoff Garen.
+
+ * JavaScriptCore.vcproj/JavaScriptCore/JavaScriptCore.def:
+ * runtime/UString.cpp:
+ (JSC::UString::utf8):
+ * wtf/text/WTFString.cpp:
+ (WTF::String::ascii):
+ (WTF::String::utf8):
+ * wtf/text/WTFString.h:
+ (WTF::String::characters8):
+ (WTF::String::characters16):
+ (WTF::String::is8Bit):
+ (WTF::LChar):
+ (WTF::UChar):
+ * wtf/unicode/UTF8.cpp:
+ (WTF::Unicode::convertLatin1ToUTF8):
+ * wtf/unicode/UTF8.h:
+ * wtf/unicode/Unicode.h:
+
2011-11-15 Darin Adler <da...@apple.com>
REGRESSION (r98887): ParserArena and Keywords leaking
Modified: trunk/Source/JavaScriptCore/JavaScriptCore.vcproj/JavaScriptCore/JavaScriptCore.def (100404 => 100405)
--- trunk/Source/JavaScriptCore/JavaScriptCore.vcproj/JavaScriptCore/JavaScriptCore.def 2011-11-16 05:46:55 UTC (rev 100404)
+++ trunk/Source/JavaScriptCore/JavaScriptCore.vcproj/JavaScriptCore/JavaScriptCore.def 2011-11-16 06:06:21 UTC (rev 100405)
@@ -99,6 +99,7 @@
?constructEmptyArray@JSC@@YAPAVJSArray@1@PAVExecState@1@@Z
?constructEmptyObject@JSC@@YAPAVJSObject@1@PAVExecState@1@@Z
?constructFunctionSkippingEvalEnabledCheck@JSC@@YAPAVJSObject@1@PAVExecState@1@PAVJSGlobalObject@1@ABVArgList@1@ABVIdentifier@1@ABVUString@1@ABVTextPosition@WTF@@@Z
+ ?convertLatin1ToUTF8@Unicode@WTF@@YA?AW4ConversionResult@12@PAPBEPBEPAPADPAD@Z
?convertUTF16ToUTF8@Unicode@WTF@@YA?AW4ConversionResult@12@PAPB_WPB_WPAPADPAD_N@Z
?convertUTF8ToUTF16@Unicode@WTF@@YA?AW4ConversionResult@12@PAPBDPBDPAPA_WPA_W_N@Z
?create@ByteArray@WTF@@SA?AV?$PassRefPtr@VByteArray@WTF@@@2@I@Z
Modified: trunk/Source/JavaScriptCore/runtime/UString.cpp (100404 => 100405)
--- trunk/Source/JavaScriptCore/runtime/UString.cpp 2011-11-16 05:46:55 UTC (rev 100404)
+++ trunk/Source/JavaScriptCore/runtime/UString.cpp 2011-11-16 06:06:21 UTC (rev 100405)
@@ -399,8 +399,8 @@
{
unsigned length = this->length();
- if (is8Bit())
- return CString(reinterpret_cast<const char*>(characters8()), length);
+ if (!length)
+ return CString("", 0);
// Allocate a buffer big enough to hold all the characters
// (an individual UTF-16 UChar can only expand to 3 UTF-8 bytes).
@@ -415,30 +415,38 @@
if (length > numeric_limits<unsigned>::max() / 3)
return CString();
- const UChar* characters = this->characters16();
Vector<char, 1024> bufferVector(length * 3);
-
char* buffer = bufferVector.data();
- ConversionResult result = convertUTF16ToUTF8(&characters, characters + length, &buffer, buffer + bufferVector.size(), strict);
- ASSERT(result != targetExhausted); // (length * 3) should be sufficient for any conversion
- // Only produced from strict conversion.
- if (result == sourceIllegal)
- return CString();
+ if (is8Bit()) {
+ const LChar* characters = this->characters8();
- // Check for an unconverted high surrogate.
- if (result == sourceExhausted) {
- if (strict)
+ ConversionResult result = convertLatin1ToUTF8(&characters, characters + length, &buffer, buffer + bufferVector.size());
+ ASSERT_UNUSED(result, result != targetExhausted); // (length * 3) should be sufficient for any conversion
+ } else {
+ const UChar* characters = this->characters16();
+
+ ConversionResult result = convertUTF16ToUTF8(&characters, characters + length, &buffer, buffer + bufferVector.size(), strict);
+ ASSERT(result != targetExhausted); // (length * 3) should be sufficient for any conversion
+
+ // Only produced from strict conversion.
+ if (result == sourceIllegal)
return CString();
- // This should be one unpaired high surrogate. Treat it the same
- // was as an unpaired high surrogate would have been handled in
- // the middle of a string with non-strict conversion - which is
- // to say, simply encode it to UTF-8.
- ASSERT((characters + 1) == (this->characters() + length));
- ASSERT((*characters >= 0xD800) && (*characters <= 0xDBFF));
- // There should be room left, since one UChar hasn't been converted.
- ASSERT((buffer + 3) <= (buffer + bufferVector.size()));
- putUTF8Triple(buffer, *characters);
+
+ // Check for an unconverted high surrogate.
+ if (result == sourceExhausted) {
+ if (strict)
+ return CString();
+ // This should be one unpaired high surrogate. Treat it the same
+ // way as an unpaired high surrogate would have been handled in
+ // the middle of a string with non-strict conversion - which is
+ // to say, simply encode it to UTF-8.
+ ASSERT((characters + 1) == (this->characters() + length));
+ ASSERT((*characters >= 0xD800) && (*characters <= 0xDBFF));
+ // There should be room left, since one UChar hasn't been converted.
+ ASSERT((buffer + 3) <= (buffer + bufferVector.size()));
+ putUTF8Triple(buffer, *characters);
+ }
}
return CString(bufferVector.data(), buffer - bufferVector.data());
Modified: trunk/Source/JavaScriptCore/wtf/text/WTFString.cpp (100404 => 100405)
--- trunk/Source/JavaScriptCore/wtf/text/WTFString.cpp 2011-11-16 05:46:55 UTC (rev 100404)
+++ trunk/Source/JavaScriptCore/wtf/text/WTFString.cpp 2011-11-16 06:06:21 UTC (rev 100405)
@@ -641,8 +641,28 @@
// preserved, characters outside of this range are converted to '?'.
unsigned length = this->length();
- const UChar* characters = this->characters();
+ if (!length) {
+ char* characterBuffer;
+ return CString::newUninitialized(length, characterBuffer);
+ }
+
+ if (this->is8Bit()) {
+ const LChar* characters = this->characters8();
+
+ char* characterBuffer;
+ CString result = CString::newUninitialized(length, characterBuffer);
+
+ for (unsigned i = 0; i < length; ++i) {
+ LChar ch = characters[i];
+ characterBuffer[i] = ch && (ch < 0x20 || ch > 0x7f) ? '?' : ch;
+ }
+
+ return result;
+ }
+
+ const UChar* characters = this->characters16();
+
char* characterBuffer;
CString result = CString::newUninitialized(length, characterBuffer);
@@ -685,8 +705,10 @@
CString String::utf8(bool strict) const
{
unsigned length = this->length();
- const UChar* characters = this->characters();
+ if (!length)
+ return CString("", 0);
+
// Allocate a buffer big enough to hold all the characters
// (an individual UTF-16 UChar can only expand to 3 UTF-8 bytes).
// Optimization ideas, if we find this function is hot:
@@ -702,26 +724,36 @@
Vector<char, 1024> bufferVector(length * 3);
char* buffer = bufferVector.data();
- ConversionResult result = convertUTF16ToUTF8(&characters, characters + length, &buffer, buffer + bufferVector.size(), strict);
- ASSERT(result != targetExhausted); // (length * 3) should be sufficient for any conversion
- // Only produced from strict conversion.
- if (result == sourceIllegal)
- return CString();
+ if (is8Bit()) {
+ const LChar* characters = this->characters8();
- // Check for an unconverted high surrogate.
- if (result == sourceExhausted) {
- if (strict)
+ ConversionResult result = convertLatin1ToUTF8(&characters, characters + length, &buffer, buffer + bufferVector.size());
+ ASSERT_UNUSED(result, result != targetExhausted); // (length * 3) should be sufficient for any conversion
+ } else {
+ const UChar* characters = this->characters16();
+
+ ConversionResult result = convertUTF16ToUTF8(&characters, characters + length, &buffer, buffer + bufferVector.size(), strict);
+ ASSERT(result != targetExhausted); // (length * 3) should be sufficient for any conversion
+
+ // Only produced from strict conversion.
+ if (result == sourceIllegal)
return CString();
- // This should be one unpaired high surrogate. Treat it the same
- // was as an unpaired high surrogate would have been handled in
- // the middle of a string with non-strict conversion - which is
- // to say, simply encode it to UTF-8.
- ASSERT((characters + 1) == (this->characters() + length));
- ASSERT((*characters >= 0xD800) && (*characters <= 0xDBFF));
- // There should be room left, since one UChar hasn't been converted.
- ASSERT((buffer + 3) <= (buffer + bufferVector.size()));
- putUTF8Triple(buffer, *characters);
+
+ // Check for an unconverted high surrogate.
+ if (result == sourceExhausted) {
+ if (strict)
+ return CString();
+ // This should be one unpaired high surrogate. Treat it the same
+ // way as an unpaired high surrogate would have been handled in
+ // the middle of a string with non-strict conversion - which is
+ // to say, simply encode it to UTF-8.
+ ASSERT((characters + 1) == (this->characters() + length));
+ ASSERT((*characters >= 0xD800) && (*characters <= 0xDBFF));
+ // There should be room left, since one UChar hasn't been converted.
+ ASSERT((buffer + 3) <= (buffer + bufferVector.size()));
+ putUTF8Triple(buffer, *characters);
+ }
}
return CString(bufferVector.data(), buffer - bufferVector.data());
Modified: trunk/Source/JavaScriptCore/wtf/text/WTFString.h (100404 => 100405)
--- trunk/Source/JavaScriptCore/wtf/text/WTFString.h 2011-11-16 05:46:55 UTC (rev 100404)
+++ trunk/Source/JavaScriptCore/wtf/text/WTFString.h 2011-11-16 06:06:21 UTC (rev 100405)
@@ -140,7 +140,28 @@
return 0;
return m_impl->characters();
}
+
+ const LChar* characters8() const
+ {
+ if (!m_impl)
+ return 0;
+ ASSERT(m_impl->is8Bit());
+ return m_impl->characters8();
+ }
+ const UChar* characters16() const
+ {
+ if (!m_impl)
+ return 0;
+ ASSERT(!m_impl->is8Bit());
+ return m_impl->characters16();
+ }
+
+ template <typename CharType>
+ inline const CharType* getCharacters() const;
+
+ bool is8Bit() const { return m_impl->is8Bit(); }
+
WTF_EXPORT_PRIVATE CString ascii() const;
WTF_EXPORT_PRIVATE CString latin1() const;
WTF_EXPORT_PRIVATE CString utf8(bool strict = false) const;
@@ -396,6 +417,21 @@
{
}
+template<>
+inline const LChar* String::getCharacters<LChar>() const
+{
+ ASSERT(is8Bit());
+ return characters8();
+}
+
+template<>
+inline const UChar* String::getCharacters<UChar>() const
+{
+ ASSERT(!is8Bit());
+ return characters16();
+}
+
+
#ifdef __OBJC__
// This is for situations in WebKit where the long standing behavior has been
// "nil if empty", so we try to maintain longstanding behavior for the sake of
Modified: trunk/Source/JavaScriptCore/wtf/unicode/UTF8.cpp (100404 => 100405)
--- trunk/Source/JavaScriptCore/wtf/unicode/UTF8.cpp 2011-11-16 05:46:55 UTC (rev 100404)
+++ trunk/Source/JavaScriptCore/wtf/unicode/UTF8.cpp 2011-11-16 06:06:21 UTC (rev 100405)
@@ -125,6 +125,48 @@
// for *legal* UTF-8 will be 4 or fewer bytes total.
static const unsigned char firstByteMark[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
+ConversionResult convertLatin1ToUTF8(
+ const LChar** sourceStart, const LChar* sourceEnd,
+ char** targetStart, char* targetEnd)
+{
+ ConversionResult result = conversionOK;
+ const LChar* source = *sourceStart;
+ char* target = *targetStart;
+ while (source < sourceEnd) {
+ UChar32 ch;
+ unsigned short bytesToWrite = 0;
+ const UChar32 byteMask = 0xBF;
+ const UChar32 byteMark = 0x80;
+ const LChar* oldSource = source; // In case we have to back up because of target overflow.
+ ch = static_cast<unsigned short>(*source++);
+
+ // Figure out how many bytes the result will require
+ if (ch < (UChar32)0x80)
+ bytesToWrite = 1;
+ else
+ bytesToWrite = 2;
+
+ target += bytesToWrite;
+ if (target > targetEnd) {
+ source = oldSource; // Back up source pointer!
+ target -= bytesToWrite;
+ result = targetExhausted;
+ break;
+ }
+ switch (bytesToWrite) { // note: everything falls through.
+ case 2:
+ *--target = (char)((ch | byteMark) & byteMask);
+ ch >>= 6;
+ case 1:
+ *--target = (char)(ch | firstByteMark[bytesToWrite]);
+ }
+ target += bytesToWrite;
+ }
+ *sourceStart = source;
+ *targetStart = target;
+ return result;
+}
+
ConversionResult convertUTF16ToUTF8(
const UChar** sourceStart, const UChar* sourceEnd,
char** targetStart, char* targetEnd, bool strict)
Modified: trunk/Source/JavaScriptCore/wtf/unicode/UTF8.h (100404 => 100405)
--- trunk/Source/JavaScriptCore/wtf/unicode/UTF8.h 2011-11-16 05:46:55 UTC (rev 100404)
+++ trunk/Source/JavaScriptCore/wtf/unicode/UTF8.h 2011-11-16 06:06:21 UTC (rev 100405)
@@ -66,6 +66,10 @@
const char** sourceStart, const char* sourceEnd,
UChar** targetStart, UChar* targetEnd, bool strict = true);
+ ConversionResult convertLatin1ToUTF8(
+ const LChar** sourceStart, const LChar* sourceEnd,
+ char** targetStart, char* targetEnd);
+
ConversionResult convertUTF16ToUTF8(
const UChar** sourceStart, const UChar* sourceEnd,
char** targetStart, char* targetEnd, bool strict = true);
Modified: trunk/Source/JavaScriptCore/wtf/unicode/Unicode.h (100404 => 100405)
--- trunk/Source/JavaScriptCore/wtf/unicode/Unicode.h 2011-11-16 05:46:55 UTC (rev 100404)
+++ trunk/Source/JavaScriptCore/wtf/unicode/Unicode.h 2011-11-16 06:06:21 UTC (rev 100405)
@@ -25,6 +25,9 @@
#include <wtf/Assertions.h>
+// Define platform neutral 8 bit character type (L is for Latin-1).
+typedef unsigned char LChar;
+
#if USE(QT4_UNICODE)
#include "qt4/UnicodeQt4.h"
#elif USE(ICU_UNICODE)
@@ -39,7 +42,4 @@
COMPILE_ASSERT(sizeof(UChar) == 2, UCharIsTwoBytes);
-// Define platform neutral 8 bit character type (L is for Latin-1).
-typedef unsigned char LChar;
-
#endif // WTF_UNICODE_H