Title: [237714] trunk/Source
Revision
237714
Author
hironori.fu...@sony.com
Date
2018-11-01 18:13:19 -0700 (Thu, 01 Nov 2018)

Log Message

Rename <wtf/unicode/UTF8.h> to <wtf/unicode/UTF8Conversion.h> in order to avoid conflicting with ICU's unicode/utf8.h
https://bugs.webkit.org/show_bug.cgi?id=189693

Reviewed by Yusuke Suzuki.

Source/JavaScriptCore:

* API/JSClassRef.cpp: Replaced <wtf/unicode/UTF8.h> with <wtf/unicode/UTF8Conversion.h>.
* API/JSStringRef.cpp: Ditto.
* runtime/JSGlobalObjectFunctions.cpp: Ditto.
* wasm/WasmParser.h: Ditto.

Source/WebCore:

No new tests because there are no behavior changes.

* platform/SharedBuffer.cpp: Replaced <wtf/unicode/UTF8.h> with <wtf/unicode/UTF8Conversion.h>.
* xml/XSLTProcessorLibxslt.cpp: Ditto.
* xml/parser/XMLDocumentParserLibxml2.cpp: Ditto.

Source/WebKit:

* Shared/API/APIString.h: Replaced <wtf/unicode/UTF8.h> with <wtf/unicode/UTF8Conversion.h>.

Source/WTF:

* WTF.xcodeproj/project.pbxproj: Replaced unicode/UTF8.{cpp,h} with unicode/UTF8Conversion.{cpp,h}.
* wtf/CMakeLists.txt: Ditto.
* wtf/text/AtomicStringImpl.cpp: Replaced <wtf/unicode/UTF8.h> with <wtf/unicode/UTF8Conversion.h>.
* wtf/text/StringImpl.cpp: Ditto.
* wtf/text/StringView.cpp: Ditto.
* wtf/text/WTFString.cpp: Ditto.
* wtf/unicode/UTF8Conversion.cpp: Renamed from Source/WTF/wtf/unicode/UTF8.cpp.
* wtf/unicode/UTF8Conversion.h: Renamed from Source/WTF/wtf/unicode/UTF8.h.

Modified Paths

Added Paths

Removed Paths

Diff

Modified: trunk/Source/JavaScriptCore/API/JSClassRef.cpp (237713 => 237714)


--- trunk/Source/JavaScriptCore/API/JSClassRef.cpp	2018-11-02 00:58:05 UTC (rev 237713)
+++ trunk/Source/JavaScriptCore/API/JSClassRef.cpp	2018-11-02 01:13:19 UTC (rev 237714)
@@ -35,7 +35,7 @@
 #include "ObjectPrototype.h"
 #include "JSCInlines.h"
 #include <wtf/text/StringHash.h>
-#include <wtf/unicode/UTF8.h>
+#include <wtf/unicode/UTF8Conversion.h>
 
 using namespace JSC;
 using namespace WTF::Unicode;

Modified: trunk/Source/JavaScriptCore/API/JSStringRef.cpp (237713 => 237714)


--- trunk/Source/JavaScriptCore/API/JSStringRef.cpp	2018-11-02 00:58:05 UTC (rev 237713)
+++ trunk/Source/JavaScriptCore/API/JSStringRef.cpp	2018-11-02 01:13:19 UTC (rev 237714)
@@ -29,7 +29,7 @@
 
 #include "InitializeThreading.h"
 #include "OpaqueJSString.h"
-#include <wtf/unicode/UTF8.h>
+#include <wtf/unicode/UTF8Conversion.h>
 
 using namespace JSC;
 using namespace WTF::Unicode;

Modified: trunk/Source/JavaScriptCore/ChangeLog (237713 => 237714)


--- trunk/Source/JavaScriptCore/ChangeLog	2018-11-02 00:58:05 UTC (rev 237713)
+++ trunk/Source/JavaScriptCore/ChangeLog	2018-11-02 01:13:19 UTC (rev 237714)
@@ -1,3 +1,15 @@
+2018-11-01  Fujii Hironori  <hironori.fu...@sony.com>
+
+        Rename <wtf/unicode/UTF8.h> to <wtf/unicode/UTF8Conversion.h> in order to avoid conflicting with ICU's unicode/utf8.h
+        https://bugs.webkit.org/show_bug.cgi?id=189693
+
+        Reviewed by Yusuke Suzuki.
+
+        * API/JSClassRef.cpp: Replaced <wtf/unicode/UTF8.h> with <wtf/unicode/UTF8Conversion.h>.
+        * API/JSStringRef.cpp: Ditto.
+        * runtime/JSGlobalObjectFunctions.cpp: Ditto.
+        * wasm/WasmParser.h: Ditto.
+
 2018-11-01  Keith Miller  <keith_mil...@apple.com>
 
         Unreviewed, JavaScriptCore should only guarantee to produce a

Modified: trunk/Source/JavaScriptCore/runtime/JSGlobalObjectFunctions.cpp (237713 => 237714)


--- trunk/Source/JavaScriptCore/runtime/JSGlobalObjectFunctions.cpp	2018-11-02 00:58:05 UTC (rev 237713)
+++ trunk/Source/JavaScriptCore/runtime/JSGlobalObjectFunctions.cpp	2018-11-02 01:13:19 UTC (rev 237714)
@@ -55,7 +55,7 @@
 #include <wtf/MathExtras.h>
 #include <wtf/dtoa.h>
 #include <wtf/text/StringBuilder.h>
-#include <wtf/unicode/UTF8.h>
+#include <wtf/unicode/UTF8Conversion.h>
 
 using namespace WTF;
 using namespace Unicode;

Modified: trunk/Source/JavaScriptCore/wasm/WasmParser.h (237713 => 237714)


--- trunk/Source/JavaScriptCore/wasm/WasmParser.h	2018-11-02 00:58:05 UTC (rev 237713)
+++ trunk/Source/JavaScriptCore/wasm/WasmParser.h	2018-11-02 01:13:19 UTC (rev 237714)
@@ -39,7 +39,7 @@
 #include <wtf/LEBDecoder.h>
 #include <wtf/StdLibExtras.h>
 #include <wtf/text/WTFString.h>
-#include <wtf/unicode/UTF8.h>
+#include <wtf/unicode/UTF8Conversion.h>
 
 namespace JSC { namespace Wasm {
 

Modified: trunk/Source/WTF/ChangeLog (237713 => 237714)


--- trunk/Source/WTF/ChangeLog	2018-11-02 00:58:05 UTC (rev 237713)
+++ trunk/Source/WTF/ChangeLog	2018-11-02 01:13:19 UTC (rev 237714)
@@ -1,3 +1,19 @@
+2018-11-01  Fujii Hironori  <hironori.fu...@sony.com>
+
+        Rename <wtf/unicode/UTF8.h> to <wtf/unicode/UTF8Conversion.h> in order to avoid conflicting with ICU's unicode/utf8.h
+        https://bugs.webkit.org/show_bug.cgi?id=189693
+
+        Reviewed by Yusuke Suzuki.
+
+        * WTF.xcodeproj/project.pbxproj: Replaced unicode/UTF8.{cpp,h} with unicode/UTF8Conversion.{cpp,h}.
+        * wtf/CMakeLists.txt: Ditto.
+        * wtf/text/AtomicStringImpl.cpp: Replaced <wtf/unicode/UTF8.h> with <wtf/unicode/UTF8Conversion.h>.
+        * wtf/text/StringImpl.cpp: Ditto.
+        * wtf/text/StringView.cpp: Ditto.
+        * wtf/text/WTFString.cpp: Ditto.
+        * wtf/unicode/UTF8Conversion.cpp: Renamed from Source/WTF/wtf/unicode/UTF8.cpp.
+        * wtf/unicode/UTF8Conversion.h: Renamed from Source/WTF/wtf/unicode/UTF8.h.
+
 2018-10-30  Don Olmstead  <don.olmst...@sony.com>
 
         [PlayStation] Enable JavaScriptCore

Modified: trunk/Source/WTF/WTF.xcodeproj/project.pbxproj (237713 => 237714)


--- trunk/Source/WTF/WTF.xcodeproj/project.pbxproj	2018-11-02 00:58:05 UTC (rev 237713)
+++ trunk/Source/WTF/WTF.xcodeproj/project.pbxproj	2018-11-02 01:13:19 UTC (rev 237714)
@@ -136,7 +136,7 @@
 		A8A47451151A825B004123FF /* BinarySemaphore.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A8A4733A151A825B004123FF /* BinarySemaphore.cpp */; };
 		A8A47460151A825B004123FF /* CollatorDefault.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A8A4734B151A825B004123FF /* CollatorDefault.cpp */; };
 		A8A47463151A825B004123FF /* CollatorICU.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A8A47350151A825B004123FF /* CollatorICU.cpp */; };
-		A8A47469151A825B004123FF /* UTF8.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A8A47357151A825B004123FF /* UTF8.cpp */; };
+		A8A47469151A825B004123FF /* UTF8Conversion.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A8A47357151A825B004123FF /* UTF8Conversion.cpp */; };
 		AD89B6B71E6415080090707F /* MemoryPressureHandler.cpp in Sources */ = {isa = PBXBuildFile; fileRef = AD89B6B51E6415080090707F /* MemoryPressureHandler.cpp */; };
 		AD89B6BA1E64150F0090707F /* MemoryPressureHandlerCocoa.mm in Sources */ = {isa = PBXBuildFile; fileRef = AD89B6B91E64150F0090707F /* MemoryPressureHandlerCocoa.mm */; };
 		ADF2CE671E39F106006889DB /* MemoryFootprintCocoa.cpp in Sources */ = {isa = PBXBuildFile; fileRef = ADF2CE651E39F106006889DB /* MemoryFootprintCocoa.cpp */; };
@@ -585,8 +585,8 @@
 		A8A4734A151A825B004123FF /* Collator.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = Collator.h; sourceTree = "<group>"; };
 		A8A4734B151A825B004123FF /* CollatorDefault.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = CollatorDefault.cpp; sourceTree = "<group>"; };
 		A8A47350151A825B004123FF /* CollatorICU.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = CollatorICU.cpp; sourceTree = "<group>"; };
-		A8A47357151A825B004123FF /* UTF8.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = UTF8.cpp; sourceTree = "<group>"; };
-		A8A47358151A825B004123FF /* UTF8.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = UTF8.h; sourceTree = "<group>"; };
+		A8A47357151A825B004123FF /* UTF8Conversion.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = UTF8Conversion.cpp; sourceTree = "<group>"; };
+		A8A47358151A825B004123FF /* UTF8Conversion.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = UTF8Conversion.h; sourceTree = "<group>"; };
 		A8A4735C151A825B004123FF /* UnionFind.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = UnionFind.h; sourceTree = "<group>"; };
 		A8A4736F151A825B004123FF /* ValueCheck.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = ValueCheck.h; sourceTree = "<group>"; };
 		A8A47370151A825B004123FF /* Vector.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = Vector.h; sourceTree = "<group>"; };
@@ -1277,8 +1277,8 @@
 				A8A47349151A825B004123FF /* CharacterNames.h */,
 				A8A4734A151A825B004123FF /* Collator.h */,
 				A8A4734B151A825B004123FF /* CollatorDefault.cpp */,
-				A8A47357151A825B004123FF /* UTF8.cpp */,
-				A8A47358151A825B004123FF /* UTF8.h */,
+				A8A47357151A825B004123FF /* UTF8Conversion.cpp */,
+				A8A47358151A825B004123FF /* UTF8Conversion.h */,
 			);
 			path = unicode;
 			sourceTree = "<group>";
@@ -1592,7 +1592,7 @@
 				1C181C8F1D307AB800F5FA16 /* UTextProvider.cpp in Sources */,
 				1C181C911D307AB800F5FA16 /* UTextProviderLatin1.cpp in Sources */,
 				1C181C931D307AB800F5FA16 /* UTextProviderUTF16.cpp in Sources */,
-				A8A47469151A825B004123FF /* UTF8.cpp in Sources */,
+				A8A47469151A825B004123FF /* UTF8Conversion.cpp in Sources */,
 				7AFEC6B11EB22B5900DADE36 /* UUID.cpp in Sources */,
 				0F66B2921DC97BAB004A1D3F /* WallTime.cpp in Sources */,
 				1FA47C8A152502DA00568D1B /* WebCoreThread.cpp in Sources */,

Modified: trunk/Source/WTF/wtf/CMakeLists.txt (237713 => 237714)


--- trunk/Source/WTF/wtf/CMakeLists.txt	2018-11-02 00:58:05 UTC (rev 237713)
+++ trunk/Source/WTF/wtf/CMakeLists.txt	2018-11-02 01:13:19 UTC (rev 237714)
@@ -327,7 +327,7 @@
 
     unicode/CharacterNames.h
     unicode/Collator.h
-    unicode/UTF8.h
+    unicode/UTF8Conversion.h
 )
 
 set(WTF_SOURCES
@@ -440,7 +440,7 @@
     threads/BinarySemaphore.cpp
     threads/Signals.cpp
 
-    unicode/UTF8.cpp
+    unicode/UTF8Conversion.cpp
 )
 
 set(WTF_INCLUDE_DIRECTORIES

Modified: trunk/Source/WTF/wtf/text/AtomicStringImpl.cpp (237713 => 237714)


--- trunk/Source/WTF/wtf/text/AtomicStringImpl.cpp	2018-11-02 00:58:05 UTC (rev 237713)
+++ trunk/Source/WTF/wtf/text/AtomicStringImpl.cpp	2018-11-02 01:13:19 UTC (rev 237714)
@@ -32,7 +32,7 @@
 #include <wtf/text/AtomicStringTable.h>
 #include <wtf/text/IntegerToStringConversion.h>
 #include <wtf/text/StringHash.h>
-#include <wtf/unicode/UTF8.h>
+#include <wtf/unicode/UTF8Conversion.h>
 
 #if USE(WEB_THREAD)
 #include <wtf/Lock.h>

Modified: trunk/Source/WTF/wtf/text/StringImpl.cpp (237713 => 237714)


--- trunk/Source/WTF/wtf/text/StringImpl.cpp	2018-11-02 00:58:05 UTC (rev 237713)
+++ trunk/Source/WTF/wtf/text/StringImpl.cpp	2018-11-02 01:13:19 UTC (rev 237714)
@@ -36,7 +36,7 @@
 #include <wtf/text/SymbolImpl.h>
 #include <wtf/text/SymbolRegistry.h>
 #include <wtf/unicode/CharacterNames.h>
-#include <wtf/unicode/UTF8.h>
+#include <wtf/unicode/UTF8Conversion.h>
 
 #if STRING_STATS
 #include <unistd.h>

Modified: trunk/Source/WTF/wtf/text/StringView.cpp (237713 => 237714)


--- trunk/Source/WTF/wtf/text/StringView.cpp	2018-11-02 00:58:05 UTC (rev 237713)
+++ trunk/Source/WTF/wtf/text/StringView.cpp	2018-11-02 01:13:19 UTC (rev 237714)
@@ -35,7 +35,7 @@
 #include <wtf/Optional.h>
 #include <wtf/text/StringBuffer.h>
 #include <wtf/text/TextBreakIterator.h>
-#include <wtf/unicode/UTF8.h>
+#include <wtf/unicode/UTF8Conversion.h>
 
 namespace WTF {
 

Modified: trunk/Source/WTF/wtf/text/WTFString.cpp (237713 => 237714)


--- trunk/Source/WTF/wtf/text/WTFString.cpp	2018-11-02 00:58:05 UTC (rev 237713)
+++ trunk/Source/WTF/wtf/text/WTFString.cpp	2018-11-02 01:13:19 UTC (rev 237714)
@@ -34,7 +34,7 @@
 #include <wtf/text/IntegerToStringConversion.h>
 #include <wtf/text/StringToIntegerConversion.h>
 #include <wtf/unicode/CharacterNames.h>
-#include <wtf/unicode/UTF8.h>
+#include <wtf/unicode/UTF8Conversion.h>
 
 namespace WTF {
 

Deleted: trunk/Source/WTF/wtf/unicode/UTF8.cpp (237713 => 237714)


--- trunk/Source/WTF/wtf/unicode/UTF8.cpp	2018-11-02 00:58:05 UTC (rev 237713)
+++ trunk/Source/WTF/wtf/unicode/UTF8.cpp	2018-11-02 01:13:19 UTC (rev 237714)
@@ -1,488 +0,0 @@
-/*
- * Copyright (C) 2007, 2014 Apple Inc. All rights reserved.
- * Copyright (C) 2010 Patrick Gansterer <par...@paroga.com>
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in the
- *    documentation and/or other materials provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
- * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
- * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE INC. OR
- * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
- * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
- * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
- * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
- * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- */
-
-#include "config.h"
-#include <wtf/unicode/UTF8.h>
-
-#include <wtf/ASCIICType.h>
-#include <wtf/text/StringHasher.h>
-#include <wtf/unicode/CharacterNames.h>
-
-namespace WTF {
-namespace Unicode {
-
-inline int inlineUTF8SequenceLengthNonASCII(char b0)
-{
-    if ((b0 & 0xC0) != 0xC0)
-        return 0;
-    if ((b0 & 0xE0) == 0xC0)
-        return 2;
-    if ((b0 & 0xF0) == 0xE0)
-        return 3;
-    if ((b0 & 0xF8) == 0xF0)
-        return 4;
-    return 0;
-}
-
-inline int inlineUTF8SequenceLength(char b0)
-{
-    return isASCII(b0) ? 1 : inlineUTF8SequenceLengthNonASCII(b0);
-}
-
-int UTF8SequenceLength(char b0)
-{
-    return isASCII(b0) ? 1 : inlineUTF8SequenceLengthNonASCII(b0);
-}
-
-int decodeUTF8Sequence(const char* sequence)
-{
-    // Handle 0-byte sequences (never valid).
-    const unsigned char b0 = sequence[0];
-    const int length = inlineUTF8SequenceLength(b0);
-    if (length == 0)
-        return -1;
-
-    // Handle 1-byte sequences (plain ASCII).
-    const unsigned char b1 = sequence[1];
-    if (length == 1) {
-        if (b1)
-            return -1;
-        return b0;
-    }
-
-    // Handle 2-byte sequences.
-    if ((b1 & 0xC0) != 0x80)
-        return -1;
-    const unsigned char b2 = sequence[2];
-    if (length == 2) {
-        if (b2)
-            return -1;
-        const int c = ((b0 & 0x1F) << 6) | (b1 & 0x3F);
-        if (c < 0x80)
-            return -1;
-        return c;
-    }
-
-    // Handle 3-byte sequences.
-    if ((b2 & 0xC0) != 0x80)
-        return -1;
-    const unsigned char b3 = sequence[3];
-    if (length == 3) {
-        if (b3)
-            return -1;
-        const int c = ((b0 & 0xF) << 12) | ((b1 & 0x3F) << 6) | (b2 & 0x3F);
-        if (c < 0x800)
-            return -1;
-        // UTF-16 surrogates should never appear in UTF-8 data.
-        if (c >= 0xD800 && c <= 0xDFFF)
-            return -1;
-        return c;
-    }
-
-    // Handle 4-byte sequences.
-    if ((b3 & 0xC0) != 0x80)
-        return -1;
-    const unsigned char b4 = sequence[4];
-    if (length == 4) {
-        if (b4)
-            return -1;
-        const int c = ((b0 & 0x7) << 18) | ((b1 & 0x3F) << 12) | ((b2 & 0x3F) << 6) | (b3 & 0x3F);
-        if (c < 0x10000 || c > 0x10FFFF)
-            return -1;
-        return c;
-    }
-
-    return -1;
-}
-
-// Once the bits are split out into bytes of UTF-8, this is a mask OR-ed
-// into the first byte, depending on how many bytes follow.  There are
-// as many entries in this table as there are UTF-8 sequence types.
-// (I.e., one byte sequence, two byte... etc.). Remember that sequencs
-// for *legal* UTF-8 will be 4 or fewer bytes total.
-static const unsigned char firstByteMark[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
-
-ConversionResult convertLatin1ToUTF8(
-    const LChar** sourceStart, const LChar* sourceEnd, 
-    char** targetStart, char* targetEnd)
-{
-    ConversionResult result = conversionOK;
-    const LChar* source = *sourceStart;
-    char* target = *targetStart;
-    while (source < sourceEnd) {
-        UChar32 ch;
-        unsigned short bytesToWrite = 0;
-        const UChar32 byteMask = 0xBF;
-        const UChar32 byteMark = 0x80; 
-        const LChar* oldSource = source; // In case we have to back up because of target overflow.
-        ch = static_cast<unsigned short>(*source++);
-
-        // Figure out how many bytes the result will require
-        if (ch < (UChar32)0x80)
-            bytesToWrite = 1;
-        else
-            bytesToWrite = 2;
-
-        target += bytesToWrite;
-        if (target > targetEnd) {
-            source = oldSource; // Back up source pointer!
-            target -= bytesToWrite;
-            result = targetExhausted;
-            break;
-        }
-        switch (bytesToWrite) { // note: everything falls through.
-        case 2:
-            *--target = (char)((ch | byteMark) & byteMask);
-            ch >>= 6;
-            FALLTHROUGH;
-        case 1:
-            *--target =  (char)(ch | firstByteMark[bytesToWrite]);
-        }
-        target += bytesToWrite;
-    }
-    *sourceStart = source;
-    *targetStart = target;
-    return result;
-}
-
-ConversionResult convertUTF16ToUTF8(
-    const UChar** sourceStart, const UChar* sourceEnd, 
-    char** targetStart, char* targetEnd, bool strict)
-{
-    ConversionResult result = conversionOK;
-    const UChar* source = *sourceStart;
-    char* target = *targetStart;
-    while (source < sourceEnd) {
-        UChar32 ch;
-        unsigned short bytesToWrite = 0;
-        const UChar32 byteMask = 0xBF;
-        const UChar32 byteMark = 0x80; 
-        const UChar* oldSource = source; // In case we have to back up because of target overflow.
-        ch = static_cast<unsigned short>(*source++);
-        // If we have a surrogate pair, convert to UChar32 first.
-        if (ch >= 0xD800 && ch <= 0xDBFF) {
-            // If the 16 bits following the high surrogate are in the source buffer...
-            if (source < sourceEnd) {
-                UChar32 ch2 = static_cast<unsigned short>(*source);
-                // If it's a low surrogate, convert to UChar32.
-                if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
-                    ch = ((ch - 0xD800) << 10) + (ch2 - 0xDC00) + 0x0010000;
-                    ++source;
-                } else if (strict) { // it's an unpaired high surrogate
-                    --source; // return to the illegal value itself
-                    result = sourceIllegal;
-                    break;
-                }
-            } else { // We don't have the 16 bits following the high surrogate.
-                --source; // return to the high surrogate
-                result = sourceExhausted;
-                break;
-            }
-        } else if (strict) {
-            // UTF-16 surrogate values are illegal in UTF-32
-            if (ch >= 0xDC00 && ch <= 0xDFFF) {
-                --source; // return to the illegal value itself
-                result = sourceIllegal;
-                break;
-            }
-        }
-        // Figure out how many bytes the result will require
-        if (ch < (UChar32)0x80) {
-            bytesToWrite = 1;
-        } else if (ch < (UChar32)0x800) {
-            bytesToWrite = 2;
-        } else if (ch < (UChar32)0x10000) {
-            bytesToWrite = 3;
-        } else if (ch < (UChar32)0x110000) {
-            bytesToWrite = 4;
-        } else {
-            bytesToWrite = 3;
-            ch = replacementCharacter;
-        }
-
-        target += bytesToWrite;
-        if (target > targetEnd) {
-            source = oldSource; // Back up source pointer!
-            target -= bytesToWrite;
-            result = targetExhausted;
-            break;
-        }
-        switch (bytesToWrite) { // note: everything falls through.
-            case 4: *--target = (char)((ch | byteMark) & byteMask); ch >>= 6; FALLTHROUGH;
-            case 3: *--target = (char)((ch | byteMark) & byteMask); ch >>= 6; FALLTHROUGH;
-            case 2: *--target = (char)((ch | byteMark) & byteMask); ch >>= 6; FALLTHROUGH;
-            case 1: *--target =  (char)(ch | firstByteMark[bytesToWrite]);
-        }
-        target += bytesToWrite;
-    }
-    *sourceStart = source;
-    *targetStart = target;
-    return result;
-}
-
-// This must be called with the length pre-determined by the first byte.
-// If presented with a length > 4, this returns false.  The Unicode
-// definition of UTF-8 goes up to 4-byte sequences.
-static bool isLegalUTF8(const unsigned char* source, int length)
-{
-    unsigned char a;
-    const unsigned char* srcptr = source + length;
-    switch (length) {
-        default: return false;
-        // Everything else falls through when "true"...
-        case 4: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false; FALLTHROUGH;
-        case 3: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false; FALLTHROUGH;
-        case 2: if ((a = (*--srcptr)) > 0xBF) return false;
-
-        switch (*source) {
-            // no fall-through in this inner switch
-            case 0xE0: if (a < 0xA0) return false; break;
-            case 0xED: if (a > 0x9F) return false; break;
-            case 0xF0: if (a < 0x90) return false; break;
-            case 0xF4: if (a > 0x8F) return false; break;
-            default:   if (a < 0x80) return false;
-        }
-        FALLTHROUGH;
-
-        case 1: if (*source >= 0x80 && *source < 0xC2) return false;
-    }
-    if (*source > 0xF4)
-        return false;
-    return true;
-}
-
-// Magic values subtracted from a buffer value during UTF8 conversion.
-// This table contains as many values as there might be trailing bytes
-// in a UTF-8 sequence.
-static const UChar32 offsetsFromUTF8[6] = { 0x00000000UL, 0x00003080UL, 0x000E2080UL, 0x03C82080UL, static_cast<UChar32>(0xFA082080UL), static_cast<UChar32>(0x82082080UL) };
-
-static inline UChar32 readUTF8Sequence(const char*& sequence, unsigned length)
-{
-    UChar32 character = 0;
-
-    // The cases all fall through.
-    switch (length) {
-        case 6: character += static_cast<unsigned char>(*sequence++); character <<= 6; FALLTHROUGH;
-        case 5: character += static_cast<unsigned char>(*sequence++); character <<= 6; FALLTHROUGH;
-        case 4: character += static_cast<unsigned char>(*sequence++); character <<= 6; FALLTHROUGH;
-        case 3: character += static_cast<unsigned char>(*sequence++); character <<= 6; FALLTHROUGH;
-        case 2: character += static_cast<unsigned char>(*sequence++); character <<= 6; FALLTHROUGH;
-        case 1: character += static_cast<unsigned char>(*sequence++);
-    }
-
-    return character - offsetsFromUTF8[length - 1];
-}
-
-ConversionResult convertUTF8ToUTF16(
-    const char** sourceStart, const char* sourceEnd, 
-    UChar** targetStart, UChar* targetEnd, bool* sourceAllASCII, bool strict)
-{
-    ConversionResult result = conversionOK;
-    const char* source = *sourceStart;
-    UChar* target = *targetStart;
-    UChar orAllData = 0;
-    while (source < sourceEnd) {
-        int utf8SequenceLength = inlineUTF8SequenceLength(*source);
-        if (sourceEnd - source < utf8SequenceLength)  {
-            result = sourceExhausted;
-            break;
-        }
-        // Do this check whether lenient or strict
-        if (!isLegalUTF8(reinterpret_cast<const unsigned char*>(source), utf8SequenceLength)) {
-            result = sourceIllegal;
-            break;
-        }
-
-        UChar32 character = readUTF8Sequence(source, utf8SequenceLength);
-
-        if (target >= targetEnd) {
-            source -= utf8SequenceLength; // Back up source pointer!
-            result = targetExhausted;
-            break;
-        }
-
-        if (U_IS_BMP(character)) {
-            // UTF-16 surrogate values are illegal in UTF-32
-            if (U_IS_SURROGATE(character)) {
-                if (strict) {
-                    source -= utf8SequenceLength; // return to the illegal value itself
-                    result = sourceIllegal;
-                    break;
-                } else {
-                    *target++ = replacementCharacter;
-                    orAllData |= replacementCharacter;
-                }
-            } else {
-                *target++ = character; // normal case
-                orAllData |= character;
-            }
-        } else if (U_IS_SUPPLEMENTARY(character)) {
-            // target is a character in range 0xFFFF - 0x10FFFF
-            if (target + 1 >= targetEnd) {
-                source -= utf8SequenceLength; // Back up source pointer!
-                result = targetExhausted;
-                break;
-            }
-            *target++ = U16_LEAD(character);
-            *target++ = U16_TRAIL(character);
-            orAllData = 0xffff;
-        } else {
-            if (strict) {
-                source -= utf8SequenceLength; // return to the start
-                result = sourceIllegal;
-                break; // Bail out; shouldn't continue
-            } else {
-                *target++ = replacementCharacter;
-                orAllData |= replacementCharacter;
-            }
-        }
-    }
-    *sourceStart = source;
-    *targetStart = target;
-
-    if (sourceAllASCII)
-        *sourceAllASCII = !(orAllData & ~0x7f);
-
-    return result;
-}
-
-unsigned calculateStringHashAndLengthFromUTF8MaskingTop8Bits(const char* data, const char* dataEnd, unsigned& dataLength, unsigned& utf16Length)
-{
-    if (!data)
-        return 0;
-
-    StringHasher stringHasher;
-    dataLength = 0;
-    utf16Length = 0;
-
-    while (data < dataEnd || (!dataEnd && *data)) {
-        if (isASCII(*data)) {
-            stringHasher.addCharacter(*data++);
-            dataLength++;
-            utf16Length++;
-            continue;
-        }
-
-        int utf8SequenceLength = inlineUTF8SequenceLengthNonASCII(*data);
-        dataLength += utf8SequenceLength;
-
-        if (!dataEnd) {
-            for (int i = 1; i < utf8SequenceLength; ++i) {
-                if (!data[i])
-                    return 0;
-            }
-        } else if (dataEnd - data < utf8SequenceLength)
-            return 0;
-
-        if (!isLegalUTF8(reinterpret_cast<const unsigned char*>(data), utf8SequenceLength))
-            return 0;
-
-        UChar32 character = readUTF8Sequence(data, utf8SequenceLength);
-        ASSERT(!isASCII(character));
-
-        if (U_IS_BMP(character)) {
-            // UTF-16 surrogate values are illegal in UTF-32
-            if (U_IS_SURROGATE(character))
-                return 0;
-            stringHasher.addCharacter(static_cast<UChar>(character)); // normal case
-            utf16Length++;
-        } else if (U_IS_SUPPLEMENTARY(character)) {
-            stringHasher.addCharacters(static_cast<UChar>(U16_LEAD(character)),
-                                       static_cast<UChar>(U16_TRAIL(character)));
-            utf16Length += 2;
-        } else
-            return 0;
-    }
-
-    return stringHasher.hashWithTop8BitsMasked();
-}
-
-bool equalUTF16WithUTF8(const UChar* a, const char* b, const char* bEnd)
-{
-    while (b < bEnd) {
-        if (isASCII(*a) || isASCII(*b)) {
-            if (*a++ != *b++)
-                return false;
-            continue;
-        }
-
-        int utf8SequenceLength = inlineUTF8SequenceLengthNonASCII(*b);
-
-        if (bEnd - b < utf8SequenceLength)
-            return false;
-
-        if (!isLegalUTF8(reinterpret_cast<const unsigned char*>(b), utf8SequenceLength))
-            return false;
-
-        UChar32 character = readUTF8Sequence(b, utf8SequenceLength);
-        ASSERT(!isASCII(character));
-
-        if (U_IS_BMP(character)) {
-            // UTF-16 surrogate values are illegal in UTF-32
-            if (U_IS_SURROGATE(character))
-                return false;
-            if (*a++ != character)
-                return false;
-        } else if (U_IS_SUPPLEMENTARY(character)) {
-            if (*a++ != U16_LEAD(character))
-                return false;
-            if (*a++ != U16_TRAIL(character))
-                return false;
-        } else
-            return false;
-    }
-
-    return true;
-}
-
-bool equalLatin1WithUTF8(const LChar* a, const char* b, const char* bEnd)
-{
-    while (b < bEnd) {
-        if (isASCII(*a) || isASCII(*b)) {
-            if (*a++ != *b++)
-                return false;
-            continue;
-        }
-
-        if (b + 1 == bEnd)
-            return false;
-
-        if ((b[0] & 0xE0) != 0xC0 || (b[1] & 0xC0) != 0x80)
-            return false;
-
-        LChar character = ((b[0] & 0x1F) << 6) | (b[1] & 0x3F);
-
-        b += 2;
-
-        if (*a++ != character)
-            return false;
-    }
-
-    return true;
-}
-
-} // namespace Unicode
-} // namespace WTF

Deleted: trunk/Source/WTF/wtf/unicode/UTF8.h (237713 => 237714)


--- trunk/Source/WTF/wtf/unicode/UTF8.h	2018-11-02 00:58:05 UTC (rev 237713)
+++ trunk/Source/WTF/wtf/unicode/UTF8.h	2018-11-02 01:13:19 UTC (rev 237714)
@@ -1,84 +0,0 @@
-/*
- * Copyright (C) 2007 Apple Inc.  All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in the
- *    documentation and/or other materials provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
- * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
- * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE INC. OR
- * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
- * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
- * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
- * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
- * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- */
-
-#pragma once
-
-#include <unicode/utypes.h>
-#include <wtf/text/LChar.h>
-
-namespace WTF {
-namespace Unicode {
-
-    // Given a first byte, gives the length of the UTF-8 sequence it begins.
-    // Returns 0 for bytes that are not legal starts of UTF-8 sequences.
-    // Only allows sequences of up to 4 bytes, since that works for all Unicode characters (U-00000000 to U-0010FFFF).
-    WTF_EXPORT_PRIVATE int UTF8SequenceLength(char);
-
-    // Takes a null-terminated C-style string with a UTF-8 sequence in it and converts it to a character.
-    // Only allows Unicode characters (U-00000000 to U-0010FFFF).
-    // Returns -1 if the sequence is not valid (including presence of extra bytes).
-    WTF_EXPORT_PRIVATE int decodeUTF8Sequence(const char*);
-
-    typedef enum {
-            conversionOK,       // conversion successful
-            sourceExhausted,    // partial character in source, but hit end
-            targetExhausted,    // insuff. room in target for conversion
-            sourceIllegal       // source sequence is illegal/malformed
-    } ConversionResult;
-
-    // These conversion functions take a "strict" argument. When this
-    // flag is set to strict, both irregular sequences and isolated surrogates
-    // will cause an error.  When the flag is set to lenient, both irregular
-    // sequences and isolated surrogates are converted.
-    // 
-    // Whether the flag is strict or lenient, all illegal sequences will cause
-    // an error return. This includes sequences such as: <F4 90 80 80>, <C0 80>,
-    // or <A0> in UTF-8, and values above 0x10FFFF in UTF-32. Conformant code
-    // must check for illegal sequences.
-    // 
-    // When the flag is set to lenient, characters over 0x10FFFF are converted
-    // to the replacement character; otherwise (when the flag is set to strict)
-    // they constitute an error.
-
-    WTF_EXPORT_PRIVATE ConversionResult convertUTF8ToUTF16(
-                    const char** sourceStart, const char* sourceEnd, 
-                    UChar** targetStart, UChar* targetEnd, bool* isSourceAllASCII = 0, bool strict = true);
-
-    WTF_EXPORT_PRIVATE ConversionResult convertLatin1ToUTF8(
-                    const LChar** sourceStart, const LChar* sourceEnd, 
-                    char** targetStart, char* targetEnd);
-
-    WTF_EXPORT_PRIVATE ConversionResult convertUTF16ToUTF8(
-                    const UChar** sourceStart, const UChar* sourceEnd, 
-                    char** targetStart, char* targetEnd, bool strict = true);
-
-    WTF_EXPORT_PRIVATE unsigned calculateStringHashAndLengthFromUTF8MaskingTop8Bits(const char* data, const char* dataEnd, unsigned& dataLength, unsigned& utf16Length);
-
-    // The caller of these functions already knows that the lengths are the same, so we omit an end argument for UTF-16 and Latin-1.
-    bool equalUTF16WithUTF8(const UChar* stringInUTF16, const char* stringInUTF8, const char* stringInUTF8End);
-    bool equalLatin1WithUTF8(const LChar* stringInLatin1, const char* stringInUTF8, const char* stringInUTF8End);
-
-} // namespace Unicode
-} // namespace WTF

Copied: trunk/Source/WTF/wtf/unicode/UTF8Conversion.cpp (from rev 237711, trunk/Source/WTF/wtf/unicode/UTF8.cpp) (0 => 237714)


--- trunk/Source/WTF/wtf/unicode/UTF8Conversion.cpp	                        (rev 0)
+++ trunk/Source/WTF/wtf/unicode/UTF8Conversion.cpp	2018-11-02 01:13:19 UTC (rev 237714)
@@ -0,0 +1,488 @@
+/*
+ * Copyright (C) 2007, 2014 Apple Inc. All rights reserved.
+ * Copyright (C) 2010 Patrick Gansterer <par...@paroga.com>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE INC. OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */
+
+#include "config.h"
+#include <wtf/unicode/UTF8Conversion.h>
+
+#include <wtf/ASCIICType.h>
+#include <wtf/text/StringHasher.h>
+#include <wtf/unicode/CharacterNames.h>
+
+namespace WTF {
+namespace Unicode {
+
+inline int inlineUTF8SequenceLengthNonASCII(char b0)
+{
+    if ((b0 & 0xC0) != 0xC0)
+        return 0;
+    if ((b0 & 0xE0) == 0xC0)
+        return 2;
+    if ((b0 & 0xF0) == 0xE0)
+        return 3;
+    if ((b0 & 0xF8) == 0xF0)
+        return 4;
+    return 0;
+}
+
+inline int inlineUTF8SequenceLength(char b0)
+{
+    return isASCII(b0) ? 1 : inlineUTF8SequenceLengthNonASCII(b0);
+}
+
+int UTF8SequenceLength(char b0)
+{
+    return isASCII(b0) ? 1 : inlineUTF8SequenceLengthNonASCII(b0);
+}
+
+int decodeUTF8Sequence(const char* sequence)
+{
+    // Handle 0-byte sequences (never valid).
+    const unsigned char b0 = sequence[0];
+    const int length = inlineUTF8SequenceLength(b0);
+    if (length == 0)
+        return -1;
+
+    // Handle 1-byte sequences (plain ASCII).
+    const unsigned char b1 = sequence[1];
+    if (length == 1) {
+        if (b1)
+            return -1;
+        return b0;
+    }
+
+    // Handle 2-byte sequences.
+    if ((b1 & 0xC0) != 0x80)
+        return -1;
+    const unsigned char b2 = sequence[2];
+    if (length == 2) {
+        if (b2)
+            return -1;
+        const int c = ((b0 & 0x1F) << 6) | (b1 & 0x3F);
+        if (c < 0x80)
+            return -1;
+        return c;
+    }
+
+    // Handle 3-byte sequences.
+    if ((b2 & 0xC0) != 0x80)
+        return -1;
+    const unsigned char b3 = sequence[3];
+    if (length == 3) {
+        if (b3)
+            return -1;
+        const int c = ((b0 & 0xF) << 12) | ((b1 & 0x3F) << 6) | (b2 & 0x3F);
+        if (c < 0x800)
+            return -1;
+        // UTF-16 surrogates should never appear in UTF-8 data.
+        if (c >= 0xD800 && c <= 0xDFFF)
+            return -1;
+        return c;
+    }
+
+    // Handle 4-byte sequences.
+    if ((b3 & 0xC0) != 0x80)
+        return -1;
+    const unsigned char b4 = sequence[4];
+    if (length == 4) {
+        if (b4)
+            return -1;
+        const int c = ((b0 & 0x7) << 18) | ((b1 & 0x3F) << 12) | ((b2 & 0x3F) << 6) | (b3 & 0x3F);
+        if (c < 0x10000 || c > 0x10FFFF)
+            return -1;
+        return c;
+    }
+
+    return -1;
+}
+
+// Once the bits are split out into bytes of UTF-8, this is a mask OR-ed
+// into the first byte, depending on how many bytes follow.  There are
+// as many entries in this table as there are UTF-8 sequence types.
+// (I.e., one byte sequence, two byte... etc.). Remember that sequences
+// for *legal* UTF-8 will be 4 or fewer bytes total.
+static const unsigned char firstByteMark[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
+
+ConversionResult convertLatin1ToUTF8(
+    const LChar** sourceStart, const LChar* sourceEnd, 
+    char** targetStart, char* targetEnd)
+{
+    ConversionResult result = conversionOK;
+    const LChar* source = *sourceStart;
+    char* target = *targetStart;
+    while (source < sourceEnd) {
+        UChar32 ch;
+        unsigned short bytesToWrite = 0;
+        const UChar32 byteMask = 0xBF;
+        const UChar32 byteMark = 0x80; 
+        const LChar* oldSource = source; // In case we have to back up because of target overflow.
+        ch = static_cast<unsigned short>(*source++);
+
+        // Figure out how many bytes the result will require
+        if (ch < (UChar32)0x80)
+            bytesToWrite = 1;
+        else
+            bytesToWrite = 2;
+
+        target += bytesToWrite;
+        if (target > targetEnd) {
+            source = oldSource; // Back up source pointer!
+            target -= bytesToWrite;
+            result = targetExhausted;
+            break;
+        }
+        switch (bytesToWrite) { // note: everything falls through.
+        case 2:
+            *--target = (char)((ch | byteMark) & byteMask);
+            ch >>= 6;
+            FALLTHROUGH;
+        case 1:
+            *--target =  (char)(ch | firstByteMark[bytesToWrite]);
+        }
+        target += bytesToWrite;
+    }
+    *sourceStart = source;
+    *targetStart = target;
+    return result;
+}
+
+ConversionResult convertUTF16ToUTF8(
+    const UChar** sourceStart, const UChar* sourceEnd, 
+    char** targetStart, char* targetEnd, bool strict)
+{
+    ConversionResult result = conversionOK;
+    const UChar* source = *sourceStart;
+    char* target = *targetStart;
+    while (source < sourceEnd) {
+        UChar32 ch;
+        unsigned short bytesToWrite = 0;
+        const UChar32 byteMask = 0xBF;
+        const UChar32 byteMark = 0x80; 
+        const UChar* oldSource = source; // In case we have to back up because of target overflow.
+        ch = static_cast<unsigned short>(*source++);
+        // If we have a surrogate pair, convert to UChar32 first.
+        if (ch >= 0xD800 && ch <= 0xDBFF) {
+            // If the 16 bits following the high surrogate are in the source buffer...
+            if (source < sourceEnd) {
+                UChar32 ch2 = static_cast<unsigned short>(*source);
+                // If it's a low surrogate, convert to UChar32.
+                if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
+                    ch = ((ch - 0xD800) << 10) + (ch2 - 0xDC00) + 0x0010000;
+                    ++source;
+                } else if (strict) { // it's an unpaired high surrogate
+                    --source; // return to the illegal value itself
+                    result = sourceIllegal;
+                    break;
+                }
+            } else { // We don't have the 16 bits following the high surrogate.
+                --source; // return to the high surrogate
+                result = sourceExhausted;
+                break;
+            }
+        } else if (strict) {
+            // UTF-16 surrogate values are illegal in UTF-32
+            if (ch >= 0xDC00 && ch <= 0xDFFF) {
+                --source; // return to the illegal value itself
+                result = sourceIllegal;
+                break;
+            }
+        }
+        // Figure out how many bytes the result will require
+        if (ch < (UChar32)0x80) {
+            bytesToWrite = 1;
+        } else if (ch < (UChar32)0x800) {
+            bytesToWrite = 2;
+        } else if (ch < (UChar32)0x10000) {
+            bytesToWrite = 3;
+        } else if (ch < (UChar32)0x110000) {
+            bytesToWrite = 4;
+        } else {
+            bytesToWrite = 3;
+            ch = replacementCharacter;
+        }
+
+        target += bytesToWrite;
+        if (target > targetEnd) {
+            source = oldSource; // Back up source pointer!
+            target -= bytesToWrite;
+            result = targetExhausted;
+            break;
+        }
+        switch (bytesToWrite) { // note: everything falls through.
+            case 4: *--target = (char)((ch | byteMark) & byteMask); ch >>= 6; FALLTHROUGH;
+            case 3: *--target = (char)((ch | byteMark) & byteMask); ch >>= 6; FALLTHROUGH;
+            case 2: *--target = (char)((ch | byteMark) & byteMask); ch >>= 6; FALLTHROUGH;
+            case 1: *--target =  (char)(ch | firstByteMark[bytesToWrite]);
+        }
+        target += bytesToWrite;
+    }
+    *sourceStart = source;
+    *targetStart = target;
+    return result;
+}
+
+// This must be called with the length pre-determined by the first byte.
+// If presented with a length > 4, this returns false.  The Unicode
+// definition of UTF-8 goes up to 4-byte sequences.
+static bool isLegalUTF8(const unsigned char* source, int length)
+{
+    unsigned char a;
+    const unsigned char* srcptr = source + length;
+    switch (length) {
+        default: return false;
+        // Everything else falls through when "true"...
+        case 4: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false; FALLTHROUGH;
+        case 3: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false; FALLTHROUGH;
+        case 2: if ((a = (*--srcptr)) > 0xBF) return false;
+
+        switch (*source) {
+            // no fall-through in this inner switch
+            case 0xE0: if (a < 0xA0) return false; break;
+            case 0xED: if (a > 0x9F) return false; break;
+            case 0xF0: if (a < 0x90) return false; break;
+            case 0xF4: if (a > 0x8F) return false; break;
+            default:   if (a < 0x80) return false;
+        }
+        FALLTHROUGH;
+
+        case 1: if (*source >= 0x80 && *source < 0xC2) return false;
+    }
+    if (*source > 0xF4)
+        return false;
+    return true;
+}
+
+// Magic values subtracted from a buffer value during UTF8 conversion.
+// This table contains as many values as there might be trailing bytes
+// in a UTF-8 sequence.
+static const UChar32 offsetsFromUTF8[6] = { 0x00000000UL, 0x00003080UL, 0x000E2080UL, 0x03C82080UL, static_cast<UChar32>(0xFA082080UL), static_cast<UChar32>(0x82082080UL) };
+
+static inline UChar32 readUTF8Sequence(const char*& sequence, unsigned length)
+{
+    UChar32 character = 0;
+
+    // The cases all fall through.
+    switch (length) {
+        case 6: character += static_cast<unsigned char>(*sequence++); character <<= 6; FALLTHROUGH;
+        case 5: character += static_cast<unsigned char>(*sequence++); character <<= 6; FALLTHROUGH;
+        case 4: character += static_cast<unsigned char>(*sequence++); character <<= 6; FALLTHROUGH;
+        case 3: character += static_cast<unsigned char>(*sequence++); character <<= 6; FALLTHROUGH;
+        case 2: character += static_cast<unsigned char>(*sequence++); character <<= 6; FALLTHROUGH;
+        case 1: character += static_cast<unsigned char>(*sequence++);
+    }
+
+    return character - offsetsFromUTF8[length - 1];
+}
+
+ConversionResult convertUTF8ToUTF16(
+    const char** sourceStart, const char* sourceEnd, 
+    UChar** targetStart, UChar* targetEnd, bool* sourceAllASCII, bool strict)
+{
+    ConversionResult result = conversionOK;
+    const char* source = *sourceStart;
+    UChar* target = *targetStart;
+    UChar orAllData = 0;
+    while (source < sourceEnd) {
+        int utf8SequenceLength = inlineUTF8SequenceLength(*source);
+        if (sourceEnd - source < utf8SequenceLength)  {
+            result = sourceExhausted;
+            break;
+        }
+        // Do this check whether lenient or strict
+        if (!isLegalUTF8(reinterpret_cast<const unsigned char*>(source), utf8SequenceLength)) {
+            result = sourceIllegal;
+            break;
+        }
+
+        UChar32 character = readUTF8Sequence(source, utf8SequenceLength);
+
+        if (target >= targetEnd) {
+            source -= utf8SequenceLength; // Back up source pointer!
+            result = targetExhausted;
+            break;
+        }
+
+        if (U_IS_BMP(character)) {
+            // UTF-16 surrogate values are illegal in UTF-32
+            if (U_IS_SURROGATE(character)) {
+                if (strict) {
+                    source -= utf8SequenceLength; // return to the illegal value itself
+                    result = sourceIllegal;
+                    break;
+                } else {
+                    *target++ = replacementCharacter;
+                    orAllData |= replacementCharacter;
+                }
+            } else {
+                *target++ = character; // normal case
+                orAllData |= character;
+            }
+        } else if (U_IS_SUPPLEMENTARY(character)) {
+            // character is supplementary (0x10000 - 0x10FFFF), so target needs room for a surrogate pair
+            if (target + 1 >= targetEnd) {
+                source -= utf8SequenceLength; // Back up source pointer!
+                result = targetExhausted;
+                break;
+            }
+            *target++ = U16_LEAD(character);
+            *target++ = U16_TRAIL(character);
+            orAllData = 0xffff;
+        } else {
+            if (strict) {
+                source -= utf8SequenceLength; // return to the start
+                result = sourceIllegal;
+                break; // Bail out; shouldn't continue
+            } else {
+                *target++ = replacementCharacter;
+                orAllData |= replacementCharacter;
+            }
+        }
+    }
+    *sourceStart = source;
+    *targetStart = target;
+
+    if (sourceAllASCII)
+        *sourceAllASCII = !(orAllData & ~0x7f);
+
+    return result;
+}
+
+unsigned calculateStringHashAndLengthFromUTF8MaskingTop8Bits(const char* data, const char* dataEnd, unsigned& dataLength, unsigned& utf16Length)
+{
+    if (!data)
+        return 0;
+
+    StringHasher stringHasher;
+    dataLength = 0;
+    utf16Length = 0;
+
+    while (data < dataEnd || (!dataEnd && *data)) {
+        if (isASCII(*data)) {
+            stringHasher.addCharacter(*data++);
+            dataLength++;
+            utf16Length++;
+            continue;
+        }
+
+        int utf8SequenceLength = inlineUTF8SequenceLengthNonASCII(*data);
+        dataLength += utf8SequenceLength;
+
+        if (!dataEnd) {
+            for (int i = 1; i < utf8SequenceLength; ++i) {
+                if (!data[i])
+                    return 0;
+            }
+        } else if (dataEnd - data < utf8SequenceLength)
+            return 0;
+
+        if (!isLegalUTF8(reinterpret_cast<const unsigned char*>(data), utf8SequenceLength))
+            return 0;
+
+        UChar32 character = readUTF8Sequence(data, utf8SequenceLength);
+        ASSERT(!isASCII(character));
+
+        if (U_IS_BMP(character)) {
+            // UTF-16 surrogate values are illegal in UTF-32
+            if (U_IS_SURROGATE(character))
+                return 0;
+            stringHasher.addCharacter(static_cast<UChar>(character)); // normal case
+            utf16Length++;
+        } else if (U_IS_SUPPLEMENTARY(character)) {
+            stringHasher.addCharacters(static_cast<UChar>(U16_LEAD(character)),
+                                       static_cast<UChar>(U16_TRAIL(character)));
+            utf16Length += 2;
+        } else
+            return 0;
+    }
+
+    return stringHasher.hashWithTop8BitsMasked();
+}
+
+bool equalUTF16WithUTF8(const UChar* a, const char* b, const char* bEnd)
+{
+    while (b < bEnd) {
+        if (isASCII(*a) || isASCII(*b)) {
+            if (*a++ != *b++)
+                return false;
+            continue;
+        }
+
+        int utf8SequenceLength = inlineUTF8SequenceLengthNonASCII(*b);
+
+        if (bEnd - b < utf8SequenceLength)
+            return false;
+
+        if (!isLegalUTF8(reinterpret_cast<const unsigned char*>(b), utf8SequenceLength))
+            return false;
+
+        UChar32 character = readUTF8Sequence(b, utf8SequenceLength);
+        ASSERT(!isASCII(character));
+
+        if (U_IS_BMP(character)) {
+            // UTF-16 surrogate values are illegal in UTF-32
+            if (U_IS_SURROGATE(character))
+                return false;
+            if (*a++ != character)
+                return false;
+        } else if (U_IS_SUPPLEMENTARY(character)) {
+            if (*a++ != U16_LEAD(character))
+                return false;
+            if (*a++ != U16_TRAIL(character))
+                return false;
+        } else
+            return false;
+    }
+
+    return true;
+}
+
+bool equalLatin1WithUTF8(const LChar* a, const char* b, const char* bEnd)
+{
+    while (b < bEnd) {
+        if (isASCII(*a) || isASCII(*b)) {
+            if (*a++ != *b++)
+                return false;
+            continue;
+        }
+
+        if (b + 1 == bEnd)
+            return false;
+
+        if ((b[0] & 0xE0) != 0xC0 || (b[1] & 0xC0) != 0x80)
+            return false;
+
+        LChar character = ((b[0] & 0x1F) << 6) | (b[1] & 0x3F);
+
+        b += 2;
+
+        if (*a++ != character)
+            return false;
+    }
+
+    return true;
+}
+
+} // namespace Unicode
+} // namespace WTF

Copied: trunk/Source/WTF/wtf/unicode/UTF8Conversion.h (from rev 237711, trunk/Source/WTF/wtf/unicode/UTF8.h) (0 => 237714)


--- trunk/Source/WTF/wtf/unicode/UTF8Conversion.h	                        (rev 0)
+++ trunk/Source/WTF/wtf/unicode/UTF8Conversion.h	2018-11-02 01:13:19 UTC (rev 237714)
@@ -0,0 +1,84 @@
+/*
+ * Copyright (C) 2007 Apple Inc.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE INC. OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */
+
+#pragma once
+
+#include <unicode/utypes.h>
+#include <wtf/text/LChar.h>
+
+namespace WTF {
+namespace Unicode {
+
+    // Given a first byte, gives the length of the UTF-8 sequence it begins.
+    // Returns 0 for bytes that are not legal starts of UTF-8 sequences.
+    // Only allows sequences of up to 4 bytes, since that works for all Unicode characters (U-00000000 to U-0010FFFF).
+    WTF_EXPORT_PRIVATE int UTF8SequenceLength(char);
+
+    // Takes a null-terminated C-style string with a UTF-8 sequence in it and converts it to a character.
+    // Only allows Unicode characters (U-00000000 to U-0010FFFF).
+    // Returns -1 if the sequence is not valid (including presence of extra bytes).
+    WTF_EXPORT_PRIVATE int decodeUTF8Sequence(const char*);
+
+    typedef enum {
+            conversionOK,       // conversion successful
+            sourceExhausted,    // partial character in source, but hit end
+            targetExhausted,    // insuff. room in target for conversion
+            sourceIllegal       // source sequence is illegal/malformed
+    } ConversionResult;
+
+    // These conversion functions take a "strict" argument. When this
+    // flag is set to strict, both irregular sequences and isolated surrogates
+    // will cause an error.  When the flag is set to lenient, both irregular
+    // sequences and isolated surrogates are converted.
+    // 
+    // Whether the flag is strict or lenient, all illegal sequences will cause
+    // an error return. This includes sequences such as: <F4 90 80 80>, <C0 80>,
+    // or <A0> in UTF-8, and values above 0x10FFFF in UTF-32. Conformant code
+    // must check for illegal sequences.
+    // 
+    // When the flag is set to lenient, characters over 0x10FFFF are converted
+    // to the replacement character; otherwise (when the flag is set to strict)
+    // they constitute an error.
+
+    WTF_EXPORT_PRIVATE ConversionResult convertUTF8ToUTF16(
+                    const char** sourceStart, const char* sourceEnd, 
+                    UChar** targetStart, UChar* targetEnd, bool* isSourceAllASCII = 0, bool strict = true);
+
+    WTF_EXPORT_PRIVATE ConversionResult convertLatin1ToUTF8(
+                    const LChar** sourceStart, const LChar* sourceEnd, 
+                    char** targetStart, char* targetEnd);
+
+    WTF_EXPORT_PRIVATE ConversionResult convertUTF16ToUTF8(
+                    const UChar** sourceStart, const UChar* sourceEnd, 
+                    char** targetStart, char* targetEnd, bool strict = true);
+
+    WTF_EXPORT_PRIVATE unsigned calculateStringHashAndLengthFromUTF8MaskingTop8Bits(const char* data, const char* dataEnd, unsigned& dataLength, unsigned& utf16Length);
+
+    // The caller of these functions already knows that the lengths are the same, so we omit an end argument for UTF-16 and Latin-1.
+    bool equalUTF16WithUTF8(const UChar* stringInUTF16, const char* stringInUTF8, const char* stringInUTF8End);
+    bool equalLatin1WithUTF8(const LChar* stringInLatin1, const char* stringInUTF8, const char* stringInUTF8End);
+
+} // namespace Unicode
+} // namespace WTF

Modified: trunk/Source/WebCore/ChangeLog (237713 => 237714)


--- trunk/Source/WebCore/ChangeLog	2018-11-02 00:58:05 UTC (rev 237713)
+++ trunk/Source/WebCore/ChangeLog	2018-11-02 01:13:19 UTC (rev 237714)
@@ -1,3 +1,16 @@
+2018-11-01  Fujii Hironori  <hironori.fu...@sony.com>
+
+        Rename <wtf/unicode/UTF8.h> to <wtf/unicode/UTF8Conversion.h> in order to avoid conflicting with ICU's unicode/utf8.h
+        https://bugs.webkit.org/show_bug.cgi?id=189693
+
+        Reviewed by Yusuke Suzuki.
+
+        No new tests because there are no behavior changes.
+
+        * platform/SharedBuffer.cpp: Replaced <wtf/unicode/UTF8.h> with <wtf/unicode/UTF8Conversion.h>.
+        * xml/XSLTProcessorLibxslt.cpp: Ditto.
+        * xml/parser/XMLDocumentParserLibxml2.cpp: Ditto.
+
 2018-11-01  John Wilander  <wilan...@apple.com>
 
         In WebCore::ResourceLoadObserver, use document.sessionID().isEphemeral() when possible and check for page existence when not

Modified: trunk/Source/WebCore/platform/SharedBuffer.cpp (237713 => 237714)


--- trunk/Source/WebCore/platform/SharedBuffer.cpp	2018-11-02 00:58:05 UTC (rev 237713)
+++ trunk/Source/WebCore/platform/SharedBuffer.cpp	2018-11-02 01:13:19 UTC (rev 237714)
@@ -29,7 +29,7 @@
 #include "SharedBuffer.h"
 
 #include <algorithm>
-#include <wtf/unicode/UTF8.h>
+#include <wtf/unicode/UTF8Conversion.h>
 
 namespace WebCore {
 

Modified: trunk/Source/WebCore/xml/XSLTProcessorLibxslt.cpp (237713 => 237714)


--- trunk/Source/WebCore/xml/XSLTProcessorLibxslt.cpp	2018-11-02 00:58:05 UTC (rev 237713)
+++ trunk/Source/WebCore/xml/XSLTProcessorLibxslt.cpp	2018-11-02 01:13:19 UTC (rev 237714)
@@ -49,7 +49,7 @@
 #include <libxslt/xsltutils.h>
 #include <wtf/Assertions.h>
 #include <wtf/text/StringBuffer.h>
-#include <wtf/unicode/UTF8.h>
+#include <wtf/unicode/UTF8Conversion.h>
 
 #if OS(DARWIN) && !PLATFORM(GTK)
 #include "SoftLinkLibxslt.h"

Modified: trunk/Source/WebCore/xml/parser/XMLDocumentParserLibxml2.cpp (237713 => 237714)


--- trunk/Source/WebCore/xml/parser/XMLDocumentParserLibxml2.cpp	2018-11-02 00:58:05 UTC (rev 237713)
+++ trunk/Source/WebCore/xml/parser/XMLDocumentParserLibxml2.cpp	2018-11-02 01:13:19 UTC (rev 237714)
@@ -53,7 +53,7 @@
 #include "XMLNSNames.h"
 #include "XMLDocumentParserScope.h"
 #include <libxml/parserInternals.h>
-#include <wtf/unicode/UTF8.h>
+#include <wtf/unicode/UTF8Conversion.h>
 
 #if ENABLE(XSLT)
 #include "XMLTreeViewer.h"

Modified: trunk/Source/WebKit/ChangeLog (237713 => 237714)


--- trunk/Source/WebKit/ChangeLog	2018-11-02 00:58:05 UTC (rev 237713)
+++ trunk/Source/WebKit/ChangeLog	2018-11-02 01:13:19 UTC (rev 237714)
@@ -1,3 +1,12 @@
+2018-11-01  Fujii Hironori  <hironori.fu...@sony.com>
+
+        Rename <wtf/unicode/UTF8.h> to <wtf/unicode/UTF8Conversion.h> in order to avoid conflicting with ICU's unicode/utf8.h
+        https://bugs.webkit.org/show_bug.cgi?id=189693
+
+        Reviewed by Yusuke Suzuki.
+
+        * Shared/API/APIString.h: Replaced <wtf/unicode/UTF8.h> with <wtf/unicode/UTF8Conversion.h>.
+
 2018-11-01  Daniel Bates  <daba...@apple.com>
 
         Cleanup: Extraneous platform guarding of -_setUpSQLiteDatabaseTrackerClient

Modified: trunk/Source/WebKit/Shared/API/APIString.h (237713 => 237714)


--- trunk/Source/WebKit/Shared/API/APIString.h	2018-11-02 00:58:05 UTC (rev 237713)
+++ trunk/Source/WebKit/Shared/API/APIString.h	2018-11-02 01:13:19 UTC (rev 237714)
@@ -30,7 +30,7 @@
 #include <wtf/Ref.h>
 #include <wtf/text/StringView.h>
 #include <wtf/text/WTFString.h>
-#include <wtf/unicode/UTF8.h>
+#include <wtf/unicode/UTF8Conversion.h>
 
 namespace API {
 
_______________________________________________
webkit-changes mailing list
webkit-changes@lists.webkit.org
https://lists.webkit.org/mailman/listinfo/webkit-changes

Reply via email to