Diff
Modified: trunk/Source/WebCore/ChangeLog (223328 => 223329)
--- trunk/Source/WebCore/ChangeLog 2017-10-15 01:45:26 UTC (rev 223328)
+++ trunk/Source/WebCore/ChangeLog 2017-10-15 20:13:51 UTC (rev 223329)
@@ -1,3 +1,36 @@
+2017-10-15 Darin Adler <da...@apple.com>
+
+ UTF-8 decoding produces one replacement character per byte; Encoding standard requires one replacement character per illegal sequence instead
+ https://bugs.webkit.org/show_bug.cgi?id=178207
+
+ Reviewed by Sam Weinig.
+
+ * platform/text/TextCodecUTF8.cpp:
+ (WebCore::TextCodecUTF8::create): Deleted. Use a lambda instead.
+ (WebCore::TextCodecUTF8::registerCodecs): Use a lambda.
+ (WebCore::nonASCIISequenceLength): Changed to return 0 instead of 2 for the range 80-C1 since
+ none of those are valid sequence leading characters.
+ (WebCore::decodeNonASCIISequence): Changed the length argument to be in/out so the caller
+ knows how much of the sequence we decoded for failure cases. Simplified the length 2 section.
+ (WebCore::TextCodecUTF8::handleError): Deleted.
+ (WebCore::TextCodecUTF8::handlePartialSequence): Changed this into a pair of plain functions
+ rather than two template function specializations since the two functions are rather different.
+ For the one-byte version, got rid of the unused arguments. For the two-byte version, got rid
+ of the ignored return value, stopped using the handleError function since each error case
+ needs to be handled differently. In each error case consume the entire incorrect sequence
+ instead of just one byte.
+ (WebCore::TextCodecUTF8::decode): Updated for the above change, and changed the non-partial
+ incorrect sequence to consume the entire incorrect sequence instead of just one byte. Also
+ use WTF prefixes explicitly so we don't have to do "using namespace".
+ (WebCore::TextCodecUTF8::encode): Got rid of unneeded type punning, and added some inline
+ capacity to save one memory allocation when encoding shorter strings.
+
+ * platform/text/TextCodecUTF8.h: Use pragma once. Initialize m_partialSequenceSize where it
+ is defined and let the compiler generate the constructor. Updated for the changes above.
+
+ * platform/text/TextEncoding.h: Export a constructor now used by a unit test.
+ * platform/text/TextEncodingRegistry.h: Export newTextCodec, now used by a unit test.
+
2017-10-14 Antoine Quint <grao...@apple.com>
Remove all Web Animations code
Modified: trunk/Source/WebCore/platform/text/TextCodecUTF8.cpp (223328 => 223329)
--- trunk/Source/WebCore/platform/text/TextCodecUTF8.cpp 2017-10-15 01:45:26 UTC (rev 223328)
+++ trunk/Source/WebCore/platform/text/TextCodecUTF8.cpp 2017-10-15 20:13:51 UTC (rev 223329)
@@ -1,5 +1,5 @@
/*
- * Copyright (C) 2004, 2006, 2008, 2011 Apple Inc. All rights reserved.
+ * Copyright (C) 2004-2017 Apple Inc. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
@@ -31,18 +31,10 @@
#include <wtf/text/StringBuffer.h>
#include <wtf/unicode/CharacterNames.h>
-using namespace WTF;
-using namespace WTF::Unicode;
-
namespace WebCore {
const int nonCharacter = -1;
-std::unique_ptr<TextCodec> TextCodecUTF8::create(const TextEncoding&, const void*)
-{
- return std::make_unique<TextCodecUTF8>();
-}
-
void TextCodecUTF8::registerEncodingNames(EncodingNameRegistrar registrar)
{
// From https://encoding.spec.whatwg.org.
@@ -61,7 +53,9 @@
void TextCodecUTF8::registerCodecs(TextCodecRegistrar registrar)
{
- registrar("UTF-8", create, 0);
+ registrar("UTF-8", [] (const TextEncoding&, const void*) -> std::unique_ptr<TextCodec> {
+ return std::make_unique<TextCodecUTF8>();
+ }, nullptr);
}
static inline int nonASCIISequenceLength(uint8_t firstByte)
@@ -75,12 +69,12 @@
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
- 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
- 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
- 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
- 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
- 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
@@ -87,55 +81,76 @@
return lengths[firstByte];
}
-static inline int decodeNonASCIISequence(const uint8_t* sequence, unsigned length)
+static inline int decodeNonASCIISequence(const uint8_t* sequence, int& length)
{
ASSERT(!isASCII(sequence[0]));
if (length == 2) {
+ ASSERT(sequence[0] >= 0xC2);
ASSERT(sequence[0] <= 0xDF);
- if (sequence[0] < 0xC2)
+ if (sequence[1] < 0x80 || sequence[1] > 0xBF) {
+ length = 1;
return nonCharacter;
- if (sequence[1] < 0x80 || sequence[1] > 0xBF)
- return nonCharacter;
+ }
return ((sequence[0] << 6) + sequence[1]) - 0x00003080;
}
if (length == 3) {
- ASSERT(sequence[0] >= 0xE0 && sequence[0] <= 0xEF);
+ ASSERT(sequence[0] >= 0xE0);
+ ASSERT(sequence[0] <= 0xEF);
switch (sequence[0]) {
case 0xE0:
- if (sequence[1] < 0xA0 || sequence[1] > 0xBF)
+ if (sequence[1] < 0xA0 || sequence[1] > 0xBF) {
+ length = 1;
return nonCharacter;
+ }
break;
case 0xED:
- if (sequence[1] < 0x80 || sequence[1] > 0x9F)
+ if (sequence[1] < 0x80 || sequence[1] > 0x9F) {
+ length = 1;
return nonCharacter;
+ }
break;
default:
- if (sequence[1] < 0x80 || sequence[1] > 0xBF)
+ if (sequence[1] < 0x80 || sequence[1] > 0xBF) {
+ length = 1;
return nonCharacter;
+ }
}
- if (sequence[2] < 0x80 || sequence[2] > 0xBF)
+ if (sequence[2] < 0x80 || sequence[2] > 0xBF) {
+ length = 2;
return nonCharacter;
+ }
return ((sequence[0] << 12) + (sequence[1] << 6) + sequence[2]) - 0x000E2080;
}
ASSERT(length == 4);
- ASSERT(sequence[0] >= 0xF0 && sequence[0] <= 0xF4);
+ ASSERT(sequence[0] >= 0xF0);
+ ASSERT(sequence[0] <= 0xF4);
switch (sequence[0]) {
case 0xF0:
- if (sequence[1] < 0x90 || sequence[1] > 0xBF)
+ if (sequence[1] < 0x90 || sequence[1] > 0xBF) {
+ length = 1;
return nonCharacter;
+ }
break;
case 0xF4:
- if (sequence[1] < 0x80 || sequence[1] > 0x8F)
+ if (sequence[1] < 0x80 || sequence[1] > 0x8F) {
+ length = 1;
return nonCharacter;
+ }
break;
default:
- if (sequence[1] < 0x80 || sequence[1] > 0xBF)
+ if (sequence[1] < 0x80 || sequence[1] > 0xBF) {
+ length = 1;
return nonCharacter;
+ }
}
- if (sequence[2] < 0x80 || sequence[2] > 0xBF)
+ if (sequence[2] < 0x80 || sequence[2] > 0xBF) {
+ length = 2;
return nonCharacter;
- if (sequence[3] < 0x80 || sequence[3] > 0xBF)
+ }
+ if (sequence[3] < 0x80 || sequence[3] > 0xBF) {
+ length = 3;
return nonCharacter;
+ }
return ((sequence[0] << 18) + (sequence[1] << 12) + (sequence[2] << 6) + sequence[3]) - 0x03C82080;
}
@@ -158,19 +173,8 @@
memmove(m_partialSequence, m_partialSequence + 1, m_partialSequenceSize);
}
-void TextCodecUTF8::handleError(UChar*& destination, bool stopOnError, bool& sawError)
+bool TextCodecUTF8::handlePartialSequence(LChar*& destination, const uint8_t*& source, const uint8_t* end, bool flush)
{
- sawError = true;
- if (stopOnError)
- return;
- // Each error generates a replacement character and consumes one byte.
- *destination++ = replacementCharacter;
- consumePartialSequenceByte();
-}
-
-template <>
-bool TextCodecUTF8::handlePartialSequence<LChar>(LChar*& destination, const uint8_t*& source, const uint8_t* end, bool flush, bool, bool&)
-{
ASSERT(m_partialSequenceSize);
do {
if (isASCII(m_partialSequence[0])) {
@@ -201,7 +205,7 @@
m_partialSequenceSize = count;
}
int character = decodeNonASCIISequence(m_partialSequence, count);
- if ((character == nonCharacter) || (character > 0xff))
+ if (character == nonCharacter || character > 0xFF)
return true;
m_partialSequenceSize -= count;
@@ -211,8 +215,7 @@
return false;
}
-template <>
-bool TextCodecUTF8::handlePartialSequence<UChar>(UChar*& destination, const uint8_t*& source, const uint8_t* end, bool flush, bool stopOnError, bool& sawError)
+void TextCodecUTF8::handlePartialSequence(UChar*& destination, const uint8_t*& source, const uint8_t* end, bool flush, bool stopOnError, bool& sawError)
{
ASSERT(m_partialSequenceSize);
do {
@@ -223,9 +226,11 @@
}
int count = nonASCIISequenceLength(m_partialSequence[0]);
if (!count) {
- handleError(destination, stopOnError, sawError);
+ sawError = true;
if (stopOnError)
- return false;
+ return;
+ *destination++ = replacementCharacter;
+ consumePartialSequenceByte();
continue;
}
if (count > m_partialSequenceSize) {
@@ -235,12 +240,15 @@
// add it to the existing partial sequence.
memcpy(m_partialSequence + m_partialSequenceSize, source, end - source);
m_partialSequenceSize += end - source;
- return false;
+ return;
}
// An incomplete partial sequence at the end is an error.
- handleError(destination, stopOnError, sawError);
+ sawError = true;
if (stopOnError)
- return false;
+ return;
+ *destination++ = replacementCharacter;
+ m_partialSequenceSize = 0;
+ source = end;
continue;
}
memcpy(m_partialSequence + m_partialSequenceSize, source, count - m_partialSequenceSize);
@@ -249,9 +257,12 @@
}
int character = decodeNonASCIISequence(m_partialSequence, count);
if (character == nonCharacter) {
- handleError(destination, stopOnError, sawError);
+ sawError = true;
if (stopOnError)
- return false;
+ return;
+ *destination++ = replacementCharacter;
+ m_partialSequenceSize -= count;
+ memmove(m_partialSequence, m_partialSequence + count, m_partialSequenceSize);
continue;
}
@@ -258,8 +269,6 @@
m_partialSequenceSize -= count;
destination = appendCharacter(destination, character);
} while (m_partialSequenceSize);
-
- return false;
}
String TextCodecUTF8::decode(const char* bytes, size_t length, bool flush, bool stopOnError, bool& sawError)
@@ -271,7 +280,7 @@
const uint8_t* source = reinterpret_cast<const uint8_t*>(bytes);
const uint8_t* end = source + length;
- const uint8_t* alignedEnd = alignToMachineWord(end);
+ const uint8_t* alignedEnd = WTF::alignToMachineWord(end);
LChar* destination = buffer.characters();
do {
@@ -281,7 +290,7 @@
// in some compilers.
LChar* destinationForHandlePartialSequence = destination;
const uint8_t* sourceForHandlePartialSequence = source;
- if (handlePartialSequence(destinationForHandlePartialSequence, sourceForHandlePartialSequence, end, flush, stopOnError, sawError)) {
+ if (handlePartialSequence(destinationForHandlePartialSequence, sourceForHandlePartialSequence, end, flush)) {
source = sourceForHandlePartialSequence;
goto upConvertTo16Bit;
}
@@ -294,14 +303,14 @@
while (source < end) {
if (isASCII(*source)) {
// Fast path for ASCII. Most UTF-8 text will be ASCII.
- if (isAlignedToMachineWord(source)) {
+ if (WTF::isAlignedToMachineWord(source)) {
while (source < alignedEnd) {
- MachineWord chunk = *reinterpret_cast_ptr<const MachineWord*>(source);
- if (!isAllASCII<LChar>(chunk))
+ auto chunk = *reinterpret_cast_ptr<const WTF::MachineWord*>(source);
+ if (!WTF::isAllASCII<LChar>(chunk))
break;
copyASCIIMachineWord(destination, source);
- source += sizeof(MachineWord);
- destination += sizeof(MachineWord);
+ source += sizeof(WTF::MachineWord);
+ destination += sizeof(WTF::MachineWord);
}
if (source == end)
break;
@@ -330,10 +339,10 @@
sawError = true;
if (stopOnError)
break;
-
+
goto upConvertTo16Bit;
}
- if (character > 0xff)
+ if (character > 0xFF)
goto upConvertTo16Bit;
source += count;
@@ -371,14 +380,14 @@
while (source < end) {
if (isASCII(*source)) {
// Fast path for ASCII. Most UTF-8 text will be ASCII.
- if (isAlignedToMachineWord(source)) {
+ if (WTF::isAlignedToMachineWord(source)) {
while (source < alignedEnd) {
- MachineWord chunk = *reinterpret_cast_ptr<const MachineWord*>(source);
- if (!isAllASCII<LChar>(chunk))
+ auto chunk = *reinterpret_cast_ptr<const WTF::MachineWord*>(source);
+ if (!WTF::isAllASCII<LChar>(chunk))
break;
copyASCIIMachineWord(destination16, source);
- source += sizeof(MachineWord);
- destination16 += sizeof(MachineWord);
+ source += sizeof(WTF::MachineWord);
+ destination16 += sizeof(WTF::MachineWord);
}
if (source == end)
break;
@@ -407,9 +416,8 @@
sawError = true;
if (stopOnError)
break;
- // Each error generates a replacement character and consumes one byte.
*destination16++ = replacementCharacter;
- ++source;
+ source += count ? count : 1;
continue;
}
source += count;
@@ -429,17 +437,15 @@
// Non-BMP characters take two UTF-16 code units and can take up to 4 bytes (2x).
if (length > std::numeric_limits<size_t>::max() / 3)
CRASH();
- Vector<uint8_t> bytes(length * 3);
- size_t i = 0;
+ Vector<char, 3000> bytes(length * 3);
size_t bytesWritten = 0;
- while (i < length) {
+ for (size_t i = 0; i < length; ) {
UChar32 character;
U16_NEXT(characters, i, length, character);
U8_APPEND_UNSAFE(bytes.data(), bytesWritten, character);
}
-
- return CString(reinterpret_cast<char*>(bytes.data()), bytesWritten);
+ return CString { bytes.data(), bytesWritten };
}
} // namespace WebCore
Modified: trunk/Source/WebCore/platform/text/TextCodecUTF8.h (223328 => 223329)
--- trunk/Source/WebCore/platform/text/TextCodecUTF8.h 2017-10-15 01:45:26 UTC (rev 223328)
+++ trunk/Source/WebCore/platform/text/TextCodecUTF8.h 2017-10-15 20:13:51 UTC (rev 223329)
@@ -1,5 +1,5 @@
/*
- * Copyright (C) 2011 Apple Inc. All rights reserved.
+ * Copyright (C) 2011-2017 Apple Inc. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
@@ -23,8 +23,7 @@
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
-#ifndef TextCodecUTF8_h
-#define TextCodecUTF8_h
+#pragma once
#include "TextCodec.h"
@@ -32,12 +31,6 @@
class TextCodecUTF8 : public TextCodec {
public:
- static std::unique_ptr<TextCodec> create(const TextEncoding&, const void*);
- TextCodecUTF8()
- : m_partialSequenceSize(0)
- {
- }
-
static void registerEncodingNames(EncodingNameRegistrar);
static void registerCodecs(TextCodecRegistrar);
@@ -45,12 +38,11 @@
String decode(const char*, size_t length, bool flush, bool stopOnError, bool& sawError) override;
CString encode(const UChar*, size_t length, UnencodableHandling) override;
- template <typename CharType>
- bool handlePartialSequence(CharType*& destination, const uint8_t*& source, const uint8_t* end, bool flush, bool stopOnError, bool& sawError);
- void handleError(UChar*& destination, bool stopOnError, bool& sawError);
+ bool handlePartialSequence(LChar*& destination, const uint8_t*& source, const uint8_t* end, bool flush);
+ void handlePartialSequence(UChar*& destination, const uint8_t*& source, const uint8_t* end, bool flush, bool stopOnError, bool& sawError);
void consumePartialSequenceByte();
- int m_partialSequenceSize;
+ int m_partialSequenceSize { 0 };
uint8_t m_partialSequence[U8_MAX_LENGTH];
};
@@ -57,4 +49,3 @@
} // namespace WebCore
-#endif // TextCodecUTF8_h
Modified: trunk/Source/WebCore/platform/text/TextEncoding.h (223328 => 223329)
--- trunk/Source/WebCore/platform/text/TextEncoding.h 2017-10-15 01:45:26 UTC (rev 223328)
+++ trunk/Source/WebCore/platform/text/TextEncoding.h 2017-10-15 20:13:51 UTC (rev 223329)
@@ -33,7 +33,7 @@
class TextEncoding {
public:
TextEncoding() = default;
- TextEncoding(const char* name);
+ WEBCORE_EXPORT TextEncoding(const char* name);
WEBCORE_EXPORT TextEncoding(const String& name);
bool isValid() const { return m_name; }
Modified: trunk/Source/WebCore/platform/text/TextEncodingRegistry.h (223328 => 223329)
--- trunk/Source/WebCore/platform/text/TextEncodingRegistry.h 2017-10-15 01:45:26 UTC (rev 223328)
+++ trunk/Source/WebCore/platform/text/TextEncodingRegistry.h 2017-10-15 20:13:51 UTC (rev 223329)
@@ -35,7 +35,7 @@
// Use TextResourceDecoder::decode to decode resources, since it handles BOMs.
// Use TextEncoding::encode to encode, since it takes care of normalization.
-std::unique_ptr<TextCodec> newTextCodec(const TextEncoding&);
+WEBCORE_EXPORT std::unique_ptr<TextCodec> newTextCodec(const TextEncoding&);
// Only TextEncoding should use the following functions directly.
const char* atomicCanonicalTextEncodingName(const char* alias);
Modified: trunk/Tools/ChangeLog (223328 => 223329)
--- trunk/Tools/ChangeLog 2017-10-15 01:45:26 UTC (rev 223328)
+++ trunk/Tools/ChangeLog 2017-10-15 20:13:51 UTC (rev 223329)
@@ -1,3 +1,18 @@
+2017-10-15 Darin Adler <da...@apple.com>
+
+ UTF-8 decoding produces one replacement character per byte; Encoding standard requires one replacement character per illegal sequence instead
+ https://bugs.webkit.org/show_bug.cgi?id=178207
+
+ Reviewed by Sam Weinig.
+
+ * TestWebKitAPI/TestWebKitAPI.xcodeproj/project.pbxproj: Added test.
+ * TestWebKitAPI/Tests/WebCore/TextCodec.cpp: Added.
+ (TestWebKitAPI::decodeHexTestBytes): Decodes a string so we can write readable tests.
+ (TestWebKitAPI::escapeNonASCIIPrintableCharacters): Encodes a string so we can write readable tests.
+ (TestWebKitAPI::TEST): Added some UTF-8 tests and UTF-8 invalid sequences tests.
+ Would be smart to add more tests for other cases, exercising the fast ASCII loop for example, and
+ other encodings.
+
2017-10-14 Adrian Perez de Castro <ape...@igalia.com>
[WPE] JHBuild build directory DependenciesWPE/Build is not removed by update-webkit-libs-jhbuild
Modified: trunk/Tools/TestWebKitAPI/TestWebKitAPI.xcodeproj/project.pbxproj (223328 => 223329)
--- trunk/Tools/TestWebKitAPI/TestWebKitAPI.xcodeproj/project.pbxproj 2017-10-15 01:45:26 UTC (rev 223328)
+++ trunk/Tools/TestWebKitAPI/TestWebKitAPI.xcodeproj/project.pbxproj 2017-10-15 20:13:51 UTC (rev 223329)
@@ -530,6 +530,7 @@
93AF4ED11506F130007FD57E /* lots-of-images.html in Copy Resources */ = {isa = PBXBuildFile; fileRef = 93AF4ECF1506F123007FD57E /* lots-of-images.html */; };
93CFA8671CEB9E38000565A8 /* autofocused-text-input.html in Copy Resources */ = {isa = PBXBuildFile; fileRef = 93CFA8661CEB9DE1000565A8 /* autofocused-text-input.html */; };
93E2D2761ED7D53200FA76F6 /* offscreen-iframe-of-media-document.html in Copy Resources */ = {isa = PBXBuildFile; fileRef = 93E2D2751ED7D51700FA76F6 /* offscreen-iframe-of-media-document.html */; };
+ 93E6193B1F931B3A00AF245E /* TextCodec.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 93A258981F92FF15003E510C /* TextCodec.cpp */; };
93F1DB3414DA20870024C362 /* NewFirstVisuallyNonEmptyLayout_Bundle.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 93F1DB3314DA20870024C362 /* NewFirstVisuallyNonEmptyLayout_Bundle.cpp */; };
93F1DB5714DB1B840024C362 /* NewFirstVisuallyNonEmptyLayoutFails_Bundle.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 93F1DB5614DB1B840024C362 /* NewFirstVisuallyNonEmptyLayoutFails_Bundle.cpp */; };
93F56DA71E5F9174003EDE84 /* libicucore.dylib in Frameworks */ = {isa = PBXBuildFile; fileRef = 7C83E0331D0A5F2700FEBCF3 /* libicucore.dylib */; };
@@ -1476,6 +1477,7 @@
936F727F1CD7D9D00068A0FB /* large-video-with-audio.mp4 */ = {isa = PBXFileReference; lastKnownFileType = file; path = "large-video-with-audio.mp4"; sourceTree = "<group>"; };
939BA91614103412001A01BD /* DeviceScaleFactorOnBack.mm */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.objcpp; path = DeviceScaleFactorOnBack.mm; sourceTree = "<group>"; };
939BFE3918E5548900883275 /* StringTruncator.mm */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.objcpp; path = StringTruncator.mm; sourceTree = "<group>"; };
+ 93A258981F92FF15003E510C /* TextCodec.cpp */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.cpp; path = TextCodec.cpp; sourceTree = "<group>"; };
93A427A8180D9B0700CD24D7 /* RefPtr.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = RefPtr.cpp; sourceTree = "<group>"; };
93A427AA180DA26400CD24D7 /* Ref.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = Ref.cpp; sourceTree = "<group>"; };
93A427AC180DA60F00CD24D7 /* MoveOnly.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = MoveOnly.h; sourceTree = "<group>"; };
@@ -2152,6 +2154,7 @@
A179918A1E1CA24100A505ED /* SharedBufferTest.h */,
ECA680CD1E68CC0900731D20 /* StringUtilities.mm */,
CE4D5DE51F6743BA0072CFC6 /* StringWithDirection.cpp */,
+ 93A258981F92FF15003E510C /* TextCodec.cpp */,
CDC2C7141797089D00E627FB /* TimeRanges.cpp */,
7AD3FE8D1D75FB8D00B169A4 /* TransformationMatrix.cpp */,
440A1D3814A0103A008A66F2 /* URL.cpp */,
@@ -3453,6 +3456,7 @@
A14FC5901B8AE36F00D107EB /* TestProtocol.mm in Sources */,
7CCE7EAE1A411A3400447C4C /* TestsController.cpp in Sources */,
2EFF06D41D8AEDBB0004BB30 /* TestWKWebView.mm in Sources */,
+ 93E6193B1F931B3A00AF245E /* TextCodec.cpp in Sources */,
CE3524F91B1441C40028A7C5 /* TextFieldDidBeginAndEndEditing.cpp in Sources */,
7CCE7EDD1A411A9200447C4C /* TimeRanges.cpp in Sources */,
7CCE7ED31A411A7E00447C4C /* TypingStyleCrash.mm in Sources */,
Added: trunk/Tools/TestWebKitAPI/Tests/WebCore/TextCodec.cpp (0 => 223329)
--- trunk/Tools/TestWebKitAPI/Tests/WebCore/TextCodec.cpp (rev 0)
+++ trunk/Tools/TestWebKitAPI/Tests/WebCore/TextCodec.cpp 2017-10-15 20:13:51 UTC (rev 223329)
@@ -0,0 +1,236 @@
+/*
+ * Copyright (C) 2017 Apple Inc. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY APPLE INC. AND ITS CONTRIBUTORS ``AS IS''
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
+ * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR ITS CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "config.h"
+
+#include <WebCore/TextEncoding.h>
+#include <WebCore/TextEncodingRegistry.h>
+#include <wtf/text/StringBuilder.h>
+
+namespace TestWebKitAPI {
+
+// Expects hex bytes with optional spaces between them.
+// Returns an empty vector if it encounters non-hex-digit characters.
+static Vector<char> decodeHexTestBytes(const char* input)
+{
+ Vector<char> result;
+ for (size_t i = 0; input[i]; ) {
+ if (!isASCIIHexDigit(input[i]))
+ return { };
+ if (!isASCIIHexDigit(input[i + 1]))
+ return { };
+ result.append(toASCIIHexValue(input[i], input[i + 1]));
+ i += 2;
+ if (input[i] == ' ')
+ ++i;
+ }
+ return result;
+}
+
+// The word "escape" here just means a format easy to read in test results.
+// It doesn't have to be a format suitable for other users or even one
+// that is completely unambiguous.
+static const char* escapeNonASCIIPrintableCharacters(StringView string)
+{
+ static char resultBuffer[100];
+ size_t i = 0;
+ auto append = [&i] (char character) {
+ if (i < sizeof(resultBuffer))
+ resultBuffer[i++] = character;
+ };
+ auto appendNibble = [append] (char nibble) {
+ if (nibble)
+ append(lowerNibbleToASCIIHexDigit(nibble));
+ };
+ auto appendLastNibble = [append] (char nibble) {
+ append(lowerNibbleToASCIIHexDigit(nibble));
+ };
+ for (auto character : string.codePoints()) {
+ if (isASCIIPrintable(character))
+ append(character);
+ else {
+ append('{');
+ for (unsigned i = 32 - 4; i; i -= 4)
+ appendNibble(character >> i);
+ appendLastNibble(character);
+ append('}');
+ }
+ }
+ if (i == sizeof(resultBuffer))
+ return "";
+ resultBuffer[i] = '\0';
+ return resultBuffer;
+}
+
+static const char* testDecode(const char* encodingName, std::initializer_list<const char*> inputs)
+{
+ StringBuilder resultBuilder;
+ auto codec = newTextCodec(WebCore::TextEncoding { encodingName });
+ size_t size = inputs.size();
+ for (size_t i = 0; i < size; ++i) {
+ auto vector = decodeHexTestBytes(inputs.begin()[i]);
+ bool last = i == size - 1;
+ resultBuilder.append(escapeNonASCIIPrintableCharacters(codec->decode(vector.data(), vector.size(), last)));
+ }
+ return escapeNonASCIIPrintableCharacters(resultBuilder.toString());
+}
+
+TEST(TextCodec, UTF8)
+{
+ EXPECT_STREQ("", testDecode("UTF-8", { "" }));
+
+ EXPECT_STREQ("{0}", testDecode("UTF-8", { "00" }));
+
+ EXPECT_STREQ("a", testDecode("UTF-8", { "61" }));
+ EXPECT_STREQ("a", testDecode("UTF-8", { "", "61" }));
+ EXPECT_STREQ("a", testDecode("UTF-8", { "61", "" }));
+ EXPECT_STREQ("a", testDecode("UTF-8", { "", "61", "" }));
+
+ EXPECT_STREQ("{B6}", testDecode("UTF-8", { "C2 B6" }));
+ EXPECT_STREQ("{B6}", testDecode("UTF-8", { "C2", "B6" }));
+ EXPECT_STREQ("{B6}", testDecode("UTF-8", { "", "C2", "", "B6", "" }));
+ EXPECT_STREQ("x{B6}", testDecode("UTF-8", { "78 C2 B6" }));
+ EXPECT_STREQ("{B6}x", testDecode("UTF-8", { "C2 B6 78" }));
+
+ EXPECT_STREQ("{2603}", testDecode("UTF-8", { "E2 98 83" }));
+ EXPECT_STREQ("{2603}", testDecode("UTF-8", { "E2", "98", "83" }));
+ EXPECT_STREQ("{2603}", testDecode("UTF-8", { "", "E2", "", "98", "83" }));
+ EXPECT_STREQ("{2603}", testDecode("UTF-8", { "E2 98", "83" }));
+ EXPECT_STREQ("{2603}", testDecode("UTF-8", { "", "E2 98", "", "83", "" }));
+ EXPECT_STREQ("{2603}", testDecode("UTF-8", { "E2", "9883" }));
+ EXPECT_STREQ("{2603}", testDecode("UTF-8", { "", "E2", "", "9883", "" }));
+ EXPECT_STREQ("x{2603}", testDecode("UTF-8", { "78 E2 98 83" }));
+ EXPECT_STREQ("{2603}x", testDecode("UTF-8", { "E2 98 83 78" }));
+
+ EXPECT_STREQ("{1F4A9}", testDecode("UTF-8", { "F0 9F 92 A9" }));
+ EXPECT_STREQ("{1F4A9}", testDecode("UTF-8", { "F0", "9F", "92", "A9" }));
+ EXPECT_STREQ("{1F4A9}", testDecode("UTF-8", { "", "F0", "", "9F", "", "92", "" , "A9", "" }));
+ EXPECT_STREQ("{1F4A9}", testDecode("UTF-8", { "F09F92", "A9" }));
+ EXPECT_STREQ("x{1F4A9}", testDecode("UTF-8", { "78 F0 9F 92 A9" }));
+ EXPECT_STREQ("{1F4A9}x", testDecode("UTF-8", { "F0 9F 92 A9 78" }));
+}
+
+TEST(TextCodec, UTF8InvalidSequences)
+{
+ EXPECT_STREQ("{FFFD}?", testDecode("UTF-8", { "E0 A5 3F" }));
+ EXPECT_STREQ("{FFFD}?", testDecode("UTF-8", { "E0 A5", "3F" }));
+ EXPECT_STREQ("{FFFD}?", testDecode("UTF-8", { "E0", "A5 3F" }));
+ EXPECT_STREQ("{FFFD}?", testDecode("UTF-8", { "E0", "A5", "3F" }));
+ EXPECT_STREQ("{FFFD}?", testDecode("UTF-8", { "E0", "", "A5", "", "3F" }));
+ EXPECT_STREQ("{FFFD}?", testDecode("UTF-8", { "E0", "", "A5", "", "3F", "" }));
+ EXPECT_STREQ("{FFFD}?", testDecode("UTF-8", { "", "E0", "", "A5", "", "3F", "" }));
+
+ EXPECT_STREQ("a{FFFD}?", testDecode("UTF-8", { "61 E0 A5 3F" }));
+ EXPECT_STREQ("a{FFFD}?", testDecode("UTF-8", { "61 E0 A5", "3F" }));
+ EXPECT_STREQ("a{FFFD}?", testDecode("UTF-8", { "61 E0", "A5 3F" }));
+ EXPECT_STREQ("a{FFFD}?", testDecode("UTF-8", { "61 E0", "A5", "3F" }));
+
+ EXPECT_STREQ("{B6}{FFFD}?", testDecode("UTF-8", { "C2 B6 E0 A5 3F" }));
+ EXPECT_STREQ("{B6}{FFFD}?", testDecode("UTF-8", { "C2 B6 E0 A5", "3F" }));
+ EXPECT_STREQ("{B6}{FFFD}?", testDecode("UTF-8", { "C2 B6 E0", "A5 3F" }));
+ EXPECT_STREQ("{B6}{FFFD}?", testDecode("UTF-8", { "C2 B6 E0", "A5", "3F" }));
+
+ EXPECT_STREQ("{FFFD}", testDecode("UTF-8", { "C2" }));
+ EXPECT_STREQ("{FFFD}", testDecode("UTF-8", { "E2" }));
+ EXPECT_STREQ("{FFFD}", testDecode("UTF-8", { "E2 98" }));
+ EXPECT_STREQ("{FFFD}", testDecode("UTF-8", { "F0" }));
+ EXPECT_STREQ("{FFFD}", testDecode("UTF-8", { "F0 9F" }));
+ EXPECT_STREQ("{FFFD}", testDecode("UTF-8", { "F0 9F 92" }));
+
+ EXPECT_STREQ("{FFFD}", testDecode("UTF-8", { "E2", "98" }));
+ EXPECT_STREQ("{FFFD}", testDecode("UTF-8", { "F0", "9F" }));
+ EXPECT_STREQ("{FFFD}", testDecode("UTF-8", { "F0 9F", "92" }));
+ EXPECT_STREQ("{FFFD}", testDecode("UTF-8", { "F0", "9F92" }));
+ EXPECT_STREQ("{FFFD}", testDecode("UTF-8", { "F0", "9F", "92" }));
+
+ EXPECT_STREQ("{FFFD}{FFFD}", testDecode("UTF-8", { "C0 80" }));
+ EXPECT_STREQ("{FFFD}{FFFD}{FFFD}", testDecode("UTF-8", { "E0 80 80" }));
+ EXPECT_STREQ("{FFFD}{FFFD}{FFFD}{FFFD}", testDecode("UTF-8", { "F0 80 80 80" }));
+ EXPECT_STREQ("{FFFD}{FFFD}{FFFD}{FFFD}{FFFD}", testDecode("UTF-8", { "F8 80 80 80 80" }));
+ EXPECT_STREQ("{FFFD}{FFFD}{FFFD}{FFFD}{FFFD}{FFFD}", testDecode("UTF-8", { "FC 80 80 80 80 80" }));
+
+ EXPECT_STREQ("{FFFD}{FFFD}", testDecode("UTF-8", { "C1 BF" }));
+ EXPECT_STREQ("{FFFD}{FFFD}{FFFD}", testDecode("UTF-8", { "E0 81 BF" }));
+ EXPECT_STREQ("{FFFD}{FFFD}{FFFD}{FFFD}", testDecode("UTF-8", { "F0 80 81 BF" }));
+ EXPECT_STREQ("{FFFD}{FFFD}{FFFD}{FFFD}{FFFD}", testDecode("UTF-8", { "F8 80 80 81 BF" }));
+ EXPECT_STREQ("{FFFD}{FFFD}{FFFD}{FFFD}{FFFD}{FFFD}", testDecode("UTF-8", { "FC 80 80 80 81 BF" }));
+
+ EXPECT_STREQ("{FFFD}{FFFD}{FFFD}", testDecode("UTF-8", { "E0 82 80" }));
+ EXPECT_STREQ("{FFFD}{FFFD}{FFFD}{FFFD}", testDecode("UTF-8", { "F0 80 82 80" }));
+ EXPECT_STREQ("{FFFD}{FFFD}{FFFD}{FFFD}{FFFD}", testDecode("UTF-8", { "F8 80 80 82 80" }));
+ EXPECT_STREQ("{FFFD}{FFFD}{FFFD}{FFFD}{FFFD}{FFFD}", testDecode("UTF-8", { "FC 80 80 80 82 80" }));
+
+ EXPECT_STREQ("{FFFD}{FFFD}{FFFD}", testDecode("UTF-8", { "E0 9F BF" }));
+ EXPECT_STREQ("{FFFD}{FFFD}{FFFD}{FFFD}", testDecode("UTF-8", { "F0 80 9F BF" }));
+ EXPECT_STREQ("{FFFD}{FFFD}{FFFD}{FFFD}{FFFD}", testDecode("UTF-8", { "F8 80 80 9F BF" }));
+ EXPECT_STREQ("{FFFD}{FFFD}{FFFD}{FFFD}{FFFD}{FFFD}", testDecode("UTF-8", { "FC 80 80 80 9F BF" }));
+
+ EXPECT_STREQ("{FFFD}{FFFD}{FFFD}{FFFD}", testDecode("UTF-8", { "F0 80 A0 80" }));
+ EXPECT_STREQ("{FFFD}{FFFD}{FFFD}{FFFD}{FFFD}", testDecode("UTF-8", { "F8 80 80 A0 80" }));
+ EXPECT_STREQ("{FFFD}{FFFD}{FFFD}{FFFD}{FFFD}{FFFD}", testDecode("UTF-8", { "FC 80 80 80 A0 80" }));
+
+ EXPECT_STREQ("{FFFD}{FFFD}{FFFD}{FFFD}", testDecode("UTF-8", { "F0 8F BF BF" }));
+ EXPECT_STREQ("{FFFD}{FFFD}{FFFD}{FFFD}{FFFD}", testDecode("UTF-8", { "F8 80 8F BF BF" }));
+ EXPECT_STREQ("{FFFD}{FFFD}{FFFD}{FFFD}{FFFD}{FFFD}", testDecode("UTF-8", { "FC 80 80 8F BF BF" }));
+
+ EXPECT_STREQ("{FFFD}{FFFD}{FFFD}{FFFD}{FFFD}", testDecode("UTF-8", { "F8 80 90 80 80" }));
+ EXPECT_STREQ("{FFFD}{FFFD}{FFFD}{FFFD}{FFFD}{FFFD}", testDecode("UTF-8", { "FC 80 80 90 80 80" }));
+
+ EXPECT_STREQ("{FFFD}{FFFD}{FFFD}{FFFD}{FFFD}", testDecode("UTF-8", { "F8 84 8F BF BF" }));
+ EXPECT_STREQ("{FFFD}{FFFD}{FFFD}{FFFD}{FFFD}{FFFD}", testDecode("UTF-8", { "FC 80 84 8F BF BF" }));
+
+ EXPECT_STREQ("{FFFD}{FFFD}{FFFD}{FFFD}", testDecode("UTF-8", { "F4 90 80 80" }));
+ EXPECT_STREQ("{FFFD}{FFFD}{FFFD}{FFFD}{FFFD}", testDecode("UTF-8", { "FB BF BF BF BF" }));
+ EXPECT_STREQ("{FFFD}{FFFD}{FFFD}{FFFD}{FFFD}{FFFD}", testDecode("UTF-8", { "FD BF BF BF BF BF" }));
+ EXPECT_STREQ("{FFFD}{FFFD}{FFFD}", testDecode("UTF-8", { "ED A0 80" }));
+ EXPECT_STREQ("{FFFD}{FFFD}{FFFD}", testDecode("UTF-8", { "ED BF BF" }));
+ EXPECT_STREQ("{FFFD}{FFFD}{FFFD}{FFFD}{FFFD}{FFFD}", testDecode("UTF-8", { "ED A0 BD ED B2 A9" }));
+
+ EXPECT_STREQ("{FFFD}{FFFD}{FFFD}{FFFD}{FFFD}", testDecode("UTF-8", { "F8 84 90 80 80" }));
+ EXPECT_STREQ("{FFFD}{FFFD}{FFFD}{FFFD}{FFFD}{FFFD}", testDecode("UTF-8", { "FC 80 84 90 80 80" }));
+ EXPECT_STREQ("{FFFD}{FFFD}{FFFD}{FFFD}", testDecode("UTF-8", { "F0 8D A0 80" }));
+ EXPECT_STREQ("{FFFD}{FFFD}{FFFD}{FFFD}", testDecode("UTF-8", { "F0 8D BF BF" }));
+ EXPECT_STREQ("{FFFD}{FFFD}{FFFD}{FFFD}{FFFD}{FFFD}{FFFD}{FFFD}", testDecode("UTF-8", { "F0 8D A0 BD F0 8D B2 A9" }));
+
+ EXPECT_STREQ("{FFFD}", testDecode("UTF-8", { "80" }));
+ EXPECT_STREQ("{FFFD}{FFFD}", testDecode("UTF-8", { "80 80" }));
+ EXPECT_STREQ("{FFFD}{FFFD}{FFFD}", testDecode("UTF-8", { "80 80 80" }));
+ EXPECT_STREQ("{FFFD}{FFFD}{FFFD}{FFFD}", testDecode("UTF-8", { "80 80 80 80" }));
+ EXPECT_STREQ("{FFFD}{FFFD}{FFFD}{FFFD}{FFFD}", testDecode("UTF-8", { "80 80 80 80 80" }));
+ EXPECT_STREQ("{FFFD}{FFFD}{FFFD}{FFFD}{FFFD}{FFFD}", testDecode("UTF-8", { "80 80 80 80 80 80" }));
+ EXPECT_STREQ("{FFFD}{FFFD}{FFFD}{FFFD}{FFFD}{FFFD}{FFFD}", testDecode("UTF-8", { "80 80 80 80 80 80 80" }));
+ EXPECT_STREQ("{B6}{FFFD}", testDecode("UTF-8", { "C2 B6 80" }));
+ EXPECT_STREQ("{2603}{FFFD}", testDecode("UTF-8", { "E2 98 83 80" }));
+ EXPECT_STREQ("{1F4A9}{FFFD}", testDecode("UTF-8", { "F0 9F 92 A9 80" }));
+ EXPECT_STREQ("{FFFD}{FFFD}{FFFD}{FFFD}{FFFD}{FFFD}", testDecode("UTF-8", { "FB BF BF BF BF 80" }));
+ EXPECT_STREQ("{FFFD}{FFFD}{FFFD}{FFFD}{FFFD}{FFFD}{FFFD}", testDecode("UTF-8", { "FD BF BF BF BF BF 80" }));
+
+ EXPECT_STREQ("{FFFD}", testDecode("UTF-8", { "FE" }));
+ EXPECT_STREQ("{FFFD}{FFFD}", testDecode("UTF-8", { "FE 80" }));
+ EXPECT_STREQ("{FFFD}", testDecode("UTF-8", { "FF" }));
+ EXPECT_STREQ("{FFFD}{FFFD}", testDecode("UTF-8", { "FF 80" }));
+}
+
+}