Diff
Modified: trunk/JSTests/ChangeLog (258530 => 258531)
--- trunk/JSTests/ChangeLog 2020-03-17 00:03:57 UTC (rev 258530)
+++ trunk/JSTests/ChangeLog 2020-03-17 00:12:17 UTC (rev 258531)
@@ -1,3 +1,15 @@
+2020-03-16 Keith Miller <keith_mil...@apple.com>
+
+ JavaScript identifier grammar supports unescaped astral symbols, but JSC doesn’t
+ https://bugs.webkit.org/show_bug.cgi?id=208998
+
+ Reviewed by Michael Saboff.
+
+ * stress/unicode-identifiers-with-surrogate-pairs.js: Added.
+ (let.c.of.chars.eval.foo):
+ (throwsSyntaxError):
+ (let.c.of.continueChars.throwsSyntaxError.foo):
+
2020-03-13 Saam Barati <sbar...@apple.com>
skip wasm/function-tests/grow-memory-cause-gc.js on memory limited devices
Added: trunk/JSTests/stress/unicode-identifiers-with-surrogate-pairs.js (0 => 258531)
--- trunk/JSTests/stress/unicode-identifiers-with-surrogate-pairs.js (rev 0)
+++ trunk/JSTests/stress/unicode-identifiers-with-surrogate-pairs.js 2020-03-17 00:12:17 UTC (rev 258531)
@@ -0,0 +1,30 @@
+
+let chars = ["鴬", "𐊧", "Ϊ"];
+let continueChars = [unescape("\u0311"), String.fromCharCode(...[0xDB40, 0xDD96])];
+
+let o = { };
+for (let c of chars) {
+ eval(`var ${c};`);
+ eval(`function foo() { var ${c} }`);
+ eval(`o.${c}`);
+}
+
+function throwsSyntaxError(string) {
+ try {
+ eval(string);
+ } catch (e) {
+ if (!(e instanceof SyntaxError))
+ throw new Error(string);
+ return;
+ }
+ throw new Error(string);
+}
+
+for (let c of continueChars) {
+ throwsSyntaxError(`var ${c}`);
+ throwsSyntaxError(`function foo() { var ${c} }`);
+ throwsSyntaxError(`o.${c}`);
+ eval(`var ${("a" + c)}`);
+ eval(`o.${"a" + c}`);
+
+}
Modified: trunk/LayoutTests/ChangeLog (258530 => 258531)
--- trunk/LayoutTests/ChangeLog 2020-03-17 00:03:57 UTC (rev 258530)
+++ trunk/LayoutTests/ChangeLog 2020-03-17 00:12:17 UTC (rev 258531)
@@ -1,3 +1,16 @@
+2020-03-16 Keith Miller <keith_mil...@apple.com>
+
+ JavaScript identifier grammar supports unescaped astral symbols, but JSC doesn’t
+ https://bugs.webkit.org/show_bug.cgi?id=208998
+
+ Reviewed by Michael Saboff.
+
+ Fix broken test that asserted a non-ID_START codepoint was a start codepoint and
+ an ID_START codepoint was not a valid codepoint...
+
+ * js/script-tests/unicode-escape-sequences.js:
+ * js/unicode-escape-sequences-expected.txt:
+
2020-03-16 Jason Lawrence <lawrenc...@apple.com>
[ Mac wk1 ] http/tests/security/clipboard/copy-paste-html-cross-origin-iframe-across-origin.html is flaky failing.
Modified: trunk/LayoutTests/js/script-tests/unicode-escape-sequences.js (258530 => 258531)
--- trunk/LayoutTests/js/script-tests/unicode-escape-sequences.js 2020-03-17 00:03:57 UTC (rev 258530)
+++ trunk/LayoutTests/js/script-tests/unicode-escape-sequences.js 2020-03-17 00:12:17 UTC (rev 258531)
@@ -74,8 +74,8 @@
testIdentifierStartUnicodeEscapeSequence("{41}", "0041");
testIdentifierStartUnicodeEscapeSequence("{102C0}", "D800,DEC0");
testIdentifierStartUnicodeEscapeSequence("{102c0}", "D800,DEC0");
-testIdentifierStartUnicodeEscapeSequence("{1D306}", "D834,DF06");
-testIdentifierStartUnicodeEscapeSequence("{1d306}", "D834,DF06");
+testIdentifierStartUnicodeEscapeSequence("{10000}", "D800,DC00");
+testIdentifierStartUnicodeEscapeSequence("{10001}", "D800,DC01");
testInvalidIdentifierStartUnicodeEscapeSequence("");
testInvalidIdentifierStartUnicodeEscapeSequence("{0}");
@@ -85,8 +85,6 @@
testInvalidIdentifierStartUnicodeEscapeSequence("{dc00}");
testInvalidIdentifierStartUnicodeEscapeSequence("{FFFF}");
testInvalidIdentifierStartUnicodeEscapeSequence("{ffff}");
-testInvalidIdentifierStartUnicodeEscapeSequence("{10000}");
-testInvalidIdentifierStartUnicodeEscapeSequence("{10001}");
testInvalidIdentifierStartUnicodeEscapeSequence("{10FFFE}");
testInvalidIdentifierStartUnicodeEscapeSequence("{10fffe}");
testInvalidIdentifierStartUnicodeEscapeSequence("{10FFFF}");
@@ -93,6 +91,8 @@
testInvalidIdentifierStartUnicodeEscapeSequence("{10ffff}");
testInvalidIdentifierStartUnicodeEscapeSequence("{00000000000000000000000010FFFF}");
testInvalidIdentifierStartUnicodeEscapeSequence("{00000000000000000000000010ffff}");
+testInvalidIdentifierStartUnicodeEscapeSequence("{1D306}");
+testInvalidIdentifierStartUnicodeEscapeSequence("{1d306}");
testInvalidIdentifierStartUnicodeEscapeSequence("x");
testInvalidIdentifierStartUnicodeEscapeSequence("{");
Modified: trunk/LayoutTests/js/unicode-escape-sequences-expected.txt (258530 => 258531)
--- trunk/LayoutTests/js/unicode-escape-sequences-expected.txt 2020-03-17 00:03:57 UTC (rev 258530)
+++ trunk/LayoutTests/js/unicode-escape-sequences-expected.txt 2020-03-17 00:12:17 UTC (rev 258531)
@@ -35,8 +35,8 @@
PASS codeUnits(function \u{41}(){}.name) is "0041"
PASS codeUnits(function \u{102C0}(){}.name) is "D800,DEC0"
PASS codeUnits(function \u{102c0}(){}.name) is "D800,DEC0"
-PASS codeUnits(function \u{1D306}(){}.name) is "D834,DF06"
-PASS codeUnits(function \u{1d306}(){}.name) is "D834,DF06"
+PASS codeUnits(function \u{10000}(){}.name) is "D800,DC00"
+PASS codeUnits(function \u{10001}(){}.name) is "D800,DC01"
PASS codeUnits(function \u(){}.name) threw exception SyntaxError: Invalid unicode escape in identifier: '\u'.
PASS codeUnits(function \u{0}(){}.name) threw exception SyntaxError: Invalid unicode escape in identifier: '\u{0}'.
PASS codeUnits(function \u{D800}(){}.name) threw exception SyntaxError: Invalid unicode escape in identifier: '\u{D800}'.
@@ -45,8 +45,6 @@
PASS codeUnits(function \u{dc00}(){}.name) threw exception SyntaxError: Invalid unicode escape in identifier: '\u{dc00}'.
PASS codeUnits(function \u{FFFF}(){}.name) threw exception SyntaxError: Invalid unicode escape in identifier: '\u{FFFF}'.
PASS codeUnits(function \u{ffff}(){}.name) threw exception SyntaxError: Invalid unicode escape in identifier: '\u{ffff}'.
-PASS codeUnits(function \u{10000}(){}.name) threw exception SyntaxError: Invalid unicode escape in identifier: '\u{10000}'.
-PASS codeUnits(function \u{10001}(){}.name) threw exception SyntaxError: Invalid unicode escape in identifier: '\u{10001}'.
PASS codeUnits(function \u{10FFFE}(){}.name) threw exception SyntaxError: Invalid unicode escape in identifier: '\u{10FFFE}'.
PASS codeUnits(function \u{10fffe}(){}.name) threw exception SyntaxError: Invalid unicode escape in identifier: '\u{10fffe}'.
PASS codeUnits(function \u{10FFFF}(){}.name) threw exception SyntaxError: Invalid unicode escape in identifier: '\u{10FFFF}'.
@@ -53,6 +51,8 @@
PASS codeUnits(function \u{10ffff}(){}.name) threw exception SyntaxError: Invalid unicode escape in identifier: '\u{10ffff}'.
PASS codeUnits(function \u{00000000000000000000000010FFFF}(){}.name) threw exception SyntaxError: Invalid unicode escape in identifier: '\u{00000000000000000000000010FFFF}'.
PASS codeUnits(function \u{00000000000000000000000010ffff}(){}.name) threw exception SyntaxError: Invalid unicode escape in identifier: '\u{00000000000000000000000010ffff}'.
+PASS codeUnits(function \u{1D306}(){}.name) threw exception SyntaxError: Invalid unicode escape in identifier: '\u{1D306}'.
+PASS codeUnits(function \u{1d306}(){}.name) threw exception SyntaxError: Invalid unicode escape in identifier: '\u{1d306}'.
PASS codeUnits(function \ux(){}.name) threw exception SyntaxError: Invalid unicode escape in identifier: '\u'.
PASS codeUnits(function \u{(){}.name) threw exception SyntaxError: Invalid unicode escape in identifier: '\u{'.
PASS codeUnits(function \u{}(){}.name) threw exception SyntaxError: Invalid unicode escape in identifier: '\u{'.
Modified: trunk/Source/JavaScriptCore/ChangeLog (258530 => 258531)
--- trunk/Source/JavaScriptCore/ChangeLog 2020-03-17 00:03:57 UTC (rev 258530)
+++ trunk/Source/JavaScriptCore/ChangeLog 2020-03-17 00:12:17 UTC (rev 258531)
@@ -1,3 +1,48 @@
+2020-03-16 Keith Miller <keith_mil...@apple.com>
+
+ JavaScript identifier grammar supports unescaped astral symbols, but JSC doesn’t
+ https://bugs.webkit.org/show_bug.cgi?id=208998
+
+ Reviewed by Michael Saboff.
+
+ This patch fixes a bug in the parser so that surrogate pairs are allowed when parsing identifiers.
+ It also makes a few other changes to the parser:
+
+ 1) When looking for keywords we just need to check that the subsequent
+ character cannot be an identifier part or an escape start.
+
+ 2) The only time we call parseIdentifierSlowCase is when we hit an
+ escape start or a surrogate pair so we can optimize that to just
+ copy everything up to the slow character into our buffer.
+
+ 3) We shouldn't allow for asking if a UChar is an identifier start/part.
+
+ * KeywordLookupGenerator.py:
+ (Trie.printSubTreeAsC):
+ (Trie.printAsC):
+ * parser/Lexer.cpp:
+ (JSC::isNonLatin1IdentStart):
+ (JSC::isIdentStart):
+ (JSC::isSingleCharacterIdentStart):
+ (JSC::cannotBeIdentStart):
+ (JSC::isIdentPart):
+ (JSC::isSingleCharacterIdentPart):
+ (JSC::cannotBeIdentPartOrEscapeStart):
+ (JSC::Lexer<LChar>::currentCodePoint const):
+ (JSC::Lexer<UChar>::currentCodePoint const):
+ (JSC::Lexer<LChar>::parseIdentifier):
+ (JSC::Lexer<UChar>::parseIdentifier):
+ (JSC::Lexer<CharacterType>::parseIdentifierSlowCase):
+ (JSC::Lexer<T>::lexWithoutClearingLineTerminator):
+ (JSC::Lexer<T>::scanRegExp):
+ (JSC::isIdentPartIncludingEscapeTemplate): Deleted.
+ (JSC::isIdentPartIncludingEscape): Deleted.
+ * parser/Lexer.h:
+ (JSC::Lexer::setOffsetFromSourcePtr): Deleted.
+ * parser/Parser.cpp:
+ (JSC::Parser<LexerType>::printUnexpectedTokenText):
+ * parser/ParserTokens.h:
+
2020-03-13 Sergio Villar Senin <svil...@igalia.com>
[WebXR] IDLs, stubs and build configuration for WPE
Modified: trunk/Source/JavaScriptCore/KeywordLookupGenerator.py (258530 => 258531)
--- trunk/Source/JavaScriptCore/KeywordLookupGenerator.py 2020-03-17 00:03:57 UTC (rev 258530)
+++ trunk/Source/JavaScriptCore/KeywordLookupGenerator.py 2020-03-17 00:12:17 UTC (rev 258531)
@@ -141,7 +141,7 @@
str = makePadding(indent)
if self.value != None:
- print(str + "if (!isIdentPartIncludingEscape(code+%d, m_codeEnd)) {" % (len(self.fullPrefix)))
+ print(str + "if (LIKELY(cannotBeIdentPartOrEscapeStart(code[%d]))) {" % (len(self.fullPrefix)))
print(str + " internalShift<%d>();" % len(self.fullPrefix))
print(str + " if (shouldCreateIdentifier)")
print(str + (" data->ident = &m_vm.propertyNames->%sKeyword;" % self.fullPrefix))
@@ -184,8 +184,8 @@
def printAsC(self):
print("namespace JSC {")
print("")
- print("static ALWAYS_INLINE bool isIdentPartIncludingEscape(const LChar* code, const LChar* codeEnd);")
- print("static ALWAYS_INLINE bool isIdentPartIncludingEscape(const UChar* code, const UChar* codeEnd);")
+ print("static ALWAYS_INLINE bool cannotBeIdentPartOrEscapeStart(LChar);")
+ print("static ALWAYS_INLINE bool cannotBeIdentPartOrEscapeStart(UChar);")
# max length + 1 so we don't need to do any bounds checking at all
print("static constexpr int maxTokenLength = %d;" % (self.maxLength() + 1))
print("")
Modified: trunk/Source/JavaScriptCore/parser/Lexer.cpp (258530 => 258531)
--- trunk/Source/JavaScriptCore/parser/Lexer.cpp 2020-03-17 00:03:57 UTC (rev 258530)
+++ trunk/Source/JavaScriptCore/parser/Lexer.cpp 2020-03-17 00:12:17 UTC (rev 258531)
@@ -732,87 +732,98 @@
shift();
}
-static NEVER_INLINE bool isNonLatin1IdentStart(UChar c)
+static bool isNonLatin1IdentStart(UChar32 c)
{
return u_hasBinaryProperty(c, UCHAR_ID_START);
}
-static inline bool isIdentStart(LChar c)
+template<typename CharacterType>
+static ALWAYS_INLINE bool isIdentStart(CharacterType c)
{
- return typesOfLatin1Characters[c] == CharacterIdentifierStart;
+ static_assert(std::is_same_v<CharacterType, LChar> || std::is_same_v<CharacterType, UChar32>, "Call isSingleCharacterIdentStart for UChars that don't need to check for surrogate pairs");
+ if (!isLatin1(c))
+ return isNonLatin1IdentStart(c);
+ return typesOfLatin1Characters[static_cast<LChar>(c)] == CharacterIdentifierStart;
}
-static inline bool isIdentStart(UChar32 c)
+static ALWAYS_INLINE bool isSingleCharacterIdentStart(UChar c)
{
- return isLatin1(c) ? isIdentStart(static_cast<LChar>(c)) : isNonLatin1IdentStart(c);
+ if (LIKELY(isLatin1(c)))
+ return isIdentStart(static_cast<LChar>(c));
+ return !U16_IS_SURROGATE(c) && isIdentStart(static_cast<UChar32>(c));
}
+static ALWAYS_INLINE bool cannotBeIdentStart(LChar c)
+{
+ return !isIdentStart(c) && c != '\\';
+}
+
+static ALWAYS_INLINE bool cannotBeIdentStart(UChar c)
+{
+ if (LIKELY(isLatin1(c)))
+ return cannotBeIdentStart(static_cast<LChar>(c));
+ return Lexer<UChar>::isWhiteSpace(c) || Lexer<UChar>::isLineTerminator(c);
+}
+
static NEVER_INLINE bool isNonLatin1IdentPart(UChar32 c)
{
return u_hasBinaryProperty(c, UCHAR_ID_CONTINUE) || c == 0x200C || c == 0x200D;
}
-static ALWAYS_INLINE bool isIdentPart(LChar c)
+template<typename CharacterType>
+static ALWAYS_INLINE bool isIdentPart(CharacterType c)
{
+ static_assert(std::is_same_v<CharacterType, LChar> || std::is_same_v<CharacterType, UChar32>, "Call isSingleCharacterIdentPart for UChars that don't need to check for surrogate pairs");
+ if (!isLatin1(c))
+ return isNonLatin1IdentPart(c);
+
// Character types are divided into two groups depending on whether they can be part of an
// identifier or not. Those whose type value is less or equal than CharacterOtherIdentifierPart can be
// part of an identifier. (See the CharacterType definition for more details.)
- return typesOfLatin1Characters[c] <= CharacterOtherIdentifierPart;
+ return typesOfLatin1Characters[static_cast<LChar>(c)] <= CharacterOtherIdentifierPart;
}
-static ALWAYS_INLINE bool isIdentPart(UChar32 c)
+static ALWAYS_INLINE bool isSingleCharacterIdentPart(UChar c)
{
- return isLatin1(c) ? isIdentPart(static_cast<LChar>(c)) : isNonLatin1IdentPart(c);
+ if (LIKELY(isLatin1(c)))
+ return isIdentPart(static_cast<LChar>(c));
+ return !U16_IS_SURROGATE(c) && isIdentPart(static_cast<UChar32>(c));
}
-static ALWAYS_INLINE bool isIdentPart(UChar c)
+static ALWAYS_INLINE bool cannotBeIdentPartOrEscapeStart(LChar c)
{
- return isIdentPart(static_cast<UChar32>(c));
+ return !isIdentPart(c) && c != '\\';
}
-template<typename CharacterType> ALWAYS_INLINE bool isIdentPartIncludingEscapeTemplate(const CharacterType* code, const CharacterType* codeEnd)
+// NOTE: This may give false negatives (for non-ascii) but won't give false positives.
+// This means it can be used to detect the end of a keyword (all keywords are ascii)
+static ALWAYS_INLINE bool cannotBeIdentPartOrEscapeStart(UChar c)
{
- if (isIdentPart(code[0]))
- return true;
+ if (LIKELY(isLatin1(c)))
+ return cannotBeIdentPartOrEscapeStart(static_cast<LChar>(c));
+ return Lexer<UChar>::isWhiteSpace(c) || Lexer<UChar>::isLineTerminator(c);
+}
- // Shortest sequence handled below is \u{0}, which is 5 characters.
- if (!(code[0] == '\\' && codeEnd - code >= 5 && code[1] == 'u'))
- return false;
- if (code[2] == '{') {
- UChar32 codePoint = 0;
- const CharacterType* pointer;
- for (pointer = &code[3]; pointer < codeEnd; ++pointer) {
- auto digit = *pointer;
- if (!isASCIIHexDigit(digit))
- break;
- codePoint = (codePoint << 4) | toASCIIHexValue(digit);
- if (codePoint > UCHAR_MAX_VALUE)
- return false;
- }
- return isIdentPart(codePoint) && pointer < codeEnd && *pointer == '}';
- }
-
- // Shortest sequence handled below is \uXXXX, which is 6 characters.
- if (codeEnd - code < 6)
- return false;
-
- auto character1 = code[2];
- auto character2 = code[3];
- auto character3 = code[4];
- auto character4 = code[5];
- return isASCIIHexDigit(character1) && isASCIIHexDigit(character2) && isASCIIHexDigit(character3) && isASCIIHexDigit(character4)
- && isIdentPart(Lexer<LChar>::convertUnicode(character1, character2, character3, character4));
+template<>
+ALWAYS_INLINE UChar32 Lexer<LChar>::currentCodePoint() const
+{
+ return m_current;
}
-static ALWAYS_INLINE bool isIdentPartIncludingEscape(const LChar* code, const LChar* codeEnd)
+template<>
+ALWAYS_INLINE UChar32 Lexer<UChar>::currentCodePoint() const
{
- return isIdentPartIncludingEscapeTemplate(code, codeEnd);
-}
+ ASSERT_WITH_MESSAGE(!isIdentStart(static_cast<UChar32>(U_SENTINEL)), "error values shouldn't appear as a valid identifier start code point");
+ if (!U16_IS_SURROGATE(m_current))
+ return m_current;
-static ALWAYS_INLINE bool isIdentPartIncludingEscape(const UChar* code, const UChar* codeEnd)
-{
- return isIdentPartIncludingEscapeTemplate(code, codeEnd);
+ UChar trail = peek(1);
+ if (UNLIKELY(!U16_IS_LEAD(m_current) || !U16_IS_SURROGATE_TRAIL(trail)))
+ return U_SENTINEL;
+
+ UChar32 codePoint = U16_GET_SUPPLEMENTARY(m_current, trail);
+ return codePoint;
}
template<typename CharacterType>
@@ -952,15 +963,12 @@
}
const LChar* identifierStart = currentSourcePtr();
- unsigned identifierLineStart = currentLineStartOffset();
-
+ ASSERT(isIdentStart(m_current) || m_current == '\\');
while (isIdentPart(m_current))
shift();
- if (UNLIKELY(m_current == '\\')) {
- setOffsetFromSourcePtr(identifierStart, identifierLineStart);
- return parseIdentifierSlowCase<shouldCreateIdentifier>(tokenData, lexerFlags, strictMode);
- }
+ if (UNLIKELY(m_current == '\\'))
+ return parseIdentifierSlowCase<shouldCreateIdentifier>(tokenData, lexerFlags, strictMode, identifierStart);
const Identifier* ident = nullptr;
@@ -1007,6 +1015,7 @@
template <>
template <bool shouldCreateIdentifier> ALWAYS_INLINE JSTokenType Lexer<UChar>::parseIdentifier(JSTokenData* tokenData, OptionSet<LexerFlags> lexerFlags, bool strictMode)
{
+ ASSERT(!m_parsingBuiltinFunction);
tokenData->escaped = false;
const ptrdiff_t remaining = m_codeEnd - m_code;
if ((remaining >= maxTokenLength) && !lexerFlags.contains(LexerFlags::IgnoreReservedWords)) {
@@ -1016,70 +1025,32 @@
return keyword == RESERVED_IF_STRICT && !strictMode ? IDENT : keyword;
}
}
-
- bool isPrivateName = m_current == '@' && m_parsingBuiltinFunction;
- bool isWellKnownSymbol = false;
- if (isPrivateName) {
- ASSERT(m_parsingBuiltinFunction);
- shift();
- if (m_current == '@') {
- isWellKnownSymbol = true;
- shift();
- }
- }
-
const UChar* identifierStart = currentSourcePtr();
- int identifierLineStart = currentLineStartOffset();
-
UChar orAllChars = 0;
-
- while (isIdentPart(m_current)) {
+ ASSERT(isSingleCharacterIdentStart(m_current) || U16_IS_SURROGATE(m_current) || m_current == '\\');
+ while (isSingleCharacterIdentPart(m_current)) {
orAllChars |= m_current;
shift();
}
- if (UNLIKELY(m_current == '\\')) {
- ASSERT(!isPrivateName);
- setOffsetFromSourcePtr(identifierStart, identifierLineStart);
- return parseIdentifierSlowCase<shouldCreateIdentifier>(tokenData, lexerFlags, strictMode);
- }
+ if (UNLIKELY(U16_IS_SURROGATE(m_current) || m_current == '\\'))
+ return parseIdentifierSlowCase<shouldCreateIdentifier>(tokenData, lexerFlags, strictMode, identifierStart);
- bool isAll8Bit = false;
-
- if (!(orAllChars & ~0xff))
- isAll8Bit = true;
-
+ bool isAll8Bit = !(orAllChars & ~0xff);
const Identifier* ident = nullptr;
- if (shouldCreateIdentifier || m_parsingBuiltinFunction) {
+ if (shouldCreateIdentifier) {
int identifierLength = currentSourcePtr() - identifierStart;
- if (m_parsingBuiltinFunction && isPrivateName) {
- if (isWellKnownSymbol)
- ident = &m_arena->makeIdentifier(m_vm, m_vm.propertyNames->builtinNames().lookUpWellKnownSymbol(identifierStart, identifierLength));
- else
- ident = &m_arena->makeIdentifier(m_vm, m_vm.propertyNames->builtinNames().lookUpPrivateName(identifierStart, identifierLength));
- if (!ident)
- return INVALID_PRIVATE_NAME_ERRORTOK;
- } else {
- if (isAll8Bit)
- ident = makeIdentifierLCharFromUChar(identifierStart, identifierLength);
- else
- ident = makeIdentifier(identifierStart, identifierLength);
- if (m_parsingBuiltinFunction) {
- if (!isSafeBuiltinIdentifier(m_vm, ident)) {
- m_lexErrorMessage = makeString("The use of '", ident->string(), "' is disallowed in builtin functions.");
- return ERRORTOK;
- }
- if (*ident == m_vm.propertyNames->undefinedKeyword)
- tokenData->ident = &m_vm.propertyNames->undefinedPrivateName;
- }
- }
+ if (isAll8Bit)
+ ident = makeIdentifierLCharFromUChar(identifierStart, identifierLength);
+ else
+ ident = makeIdentifier(identifierStart, identifierLength);
tokenData->ident = ident;
} else
tokenData->ident = nullptr;
- if (UNLIKELY((remaining < maxTokenLength) && !lexerFlags.contains(LexerFlags::IgnoreReservedWords)) && !isPrivateName) {
+ if (UNLIKELY((remaining < maxTokenLength) && !lexerFlags.contains(LexerFlags::IgnoreReservedWords))) {
ASSERT(shouldCreateIdentifier);
if (remaining < maxTokenLength) {
const HashTableValue* entry = JSC::mainTable.entry(*ident);
@@ -1095,49 +1066,74 @@
return IDENT;
}
-template<typename CharacterType> template<bool shouldCreateIdentifier> JSTokenType Lexer<CharacterType>::parseIdentifierSlowCase(JSTokenData* tokenData, OptionSet<LexerFlags> lexerFlags, bool strictMode)
+template<typename CharacterType>
+template<bool shouldCreateIdentifier>
+JSTokenType Lexer<CharacterType>::parseIdentifierSlowCase(JSTokenData* tokenData, OptionSet<LexerFlags> lexerFlags, bool strictMode, const CharacterType* identifierStart)
{
- tokenData->escaped = true;
- auto identifierStart = currentSourcePtr();
- bool bufferRequired = false;
+ ASSERT(U16_IS_SURROGATE(m_current) || m_current == '\\');
+ ASSERT(m_buffer16.isEmpty());
+ ASSERT(!tokenData->escaped);
+ auto fillBuffer = [&] (bool isStart = false) {
+ // \uXXXX unicode characters or Surrogate pairs.
+ if (identifierStart != currentSourcePtr())
+ m_buffer16.append(identifierStart, currentSourcePtr() - identifierStart);
+
+ if (m_current == '\\') {
+ tokenData->escaped = true;
+ shift();
+ if (UNLIKELY(m_current != 'u'))
+ return atEnd() ? UNTERMINATED_IDENTIFIER_ESCAPE_ERRORTOK : INVALID_IDENTIFIER_ESCAPE_ERRORTOK;
+ shift();
+ auto character = parseUnicodeEscape();
+ if (UNLIKELY(!character.isValid()))
+ return character.isIncomplete() ? UNTERMINATED_IDENTIFIER_UNICODE_ESCAPE_ERRORTOK : INVALID_IDENTIFIER_UNICODE_ESCAPE_ERRORTOK;
+ if (UNLIKELY(isStart ? !isIdentStart(character.value()) : !isIdentPart(character.value())))
+ return INVALID_IDENTIFIER_UNICODE_ESCAPE_ERRORTOK;
+ if (shouldCreateIdentifier)
+ recordUnicodeCodePoint(character.value());
+ identifierStart = currentSourcePtr();
+ return IDENT;
+ }
+
+ ASSERT(U16_IS_SURROGATE(m_current));
+ if (UNLIKELY(!U16_IS_SURROGATE_LEAD(m_current)))
+ return INVALID_UNICODE_ENCODING_ERRORTOK;
+
+ UChar32 codePoint = currentCodePoint();
+ if (UNLIKELY(codePoint == U_SENTINEL))
+ return INVALID_UNICODE_ENCODING_ERRORTOK;
+ if (UNLIKELY(isStart ? !isNonLatin1IdentStart(codePoint) : !isNonLatin1IdentPart(codePoint)))
+ return INVALID_IDENTIFIER_UNICODE_ERRORTOK;
+ append16(m_code, 2);
+ shift();
+ shift();
+ identifierStart = currentSourcePtr();
+ return IDENT;
+ };
+
+ JSTokenType type = fillBuffer(identifierStart == currentSourcePtr());
+ if (UNLIKELY(type & ErrorTokenFlag))
+ return type;
+
while (true) {
- if (LIKELY(isIdentPart(m_current))) {
+ if (LIKELY(isSingleCharacterIdentPart(m_current))) {
shift();
continue;
}
- if (LIKELY(m_current != '\\'))
+ if (!U16_IS_SURROGATE(m_current) && m_current != '\\')
break;
- // \uXXXX unicode characters.
- bufferRequired = true;
- if (identifierStart != currentSourcePtr())
- m_buffer16.append(identifierStart, currentSourcePtr() - identifierStart);
- shift();
- if (UNLIKELY(m_current != 'u'))
- return atEnd() ? UNTERMINATED_IDENTIFIER_ESCAPE_ERRORTOK : INVALID_IDENTIFIER_ESCAPE_ERRORTOK;
- shift();
- auto character = parseUnicodeEscape();
- if (UNLIKELY(!character.isValid()))
- return character.isIncomplete() ? UNTERMINATED_IDENTIFIER_UNICODE_ESCAPE_ERRORTOK : INVALID_IDENTIFIER_UNICODE_ESCAPE_ERRORTOK;
- if (UNLIKELY(m_buffer16.size() ? !isIdentPart(character.value()) : !isIdentStart(character.value())))
- return INVALID_IDENTIFIER_UNICODE_ESCAPE_ERRORTOK;
- if (shouldCreateIdentifier)
- recordUnicodeCodePoint(character.value());
- identifierStart = currentSourcePtr();
+ type = fillBuffer();
+ if (UNLIKELY(type & ErrorTokenFlag))
+ return type;
}
- int identifierLength;
const Identifier* ident = nullptr;
if (shouldCreateIdentifier) {
- if (!bufferRequired) {
- identifierLength = currentSourcePtr() - identifierStart;
- ident = makeIdentifier(identifierStart, identifierLength);
- } else {
- if (identifierStart != currentSourcePtr())
- m_buffer16.append(identifierStart, currentSourcePtr() - identifierStart);
- ident = makeIdentifier(m_buffer16.data(), m_buffer16.size());
- }
+ if (identifierStart != currentSourcePtr())
+ m_buffer16.append(identifierStart, currentSourcePtr() - identifierStart);
+ ident = makeIdentifier(m_buffer16.data(), m_buffer16.size());
tokenData->ident = ident;
} else
@@ -1152,7 +1148,7 @@
return IDENT;
JSTokenType token = static_cast<JSTokenType>(entry->lexerValue());
if ((token != RESERVED_IF_STRICT) || strictMode)
- return bufferRequired ? UNEXPECTED_ESCAPE_ERRORTOK : token;
+ return UNEXPECTED_ESCAPE_ERRORTOK;
}
return IDENT;
@@ -1912,12 +1908,16 @@
CharacterType type;
if (LIKELY(isLatin1(m_current)))
type = static_cast<CharacterType>(typesOfLatin1Characters[m_current]);
- else if (isNonLatin1IdentStart(m_current))
- type = CharacterIdentifierStart;
- else if (isLineTerminator(m_current))
- type = CharacterLineTerminator;
- else
- type = CharacterInvalid;
+ else {
+ UChar32 codePoint;
+ U16_GET(m_code, 0, 0, m_codeEnd - m_code, codePoint);
+ if (isNonLatin1IdentStart(codePoint))
+ type = CharacterIdentifierStart;
+ else if (isLineTerminator(m_current))
+ type = CharacterLineTerminator;
+ else
+ type = CharacterInvalid;
+ }
switch (type) {
case CharacterGreater:
@@ -2231,7 +2231,12 @@
if (token == INTEGER)
token = tokenTypeForIntegerLikeToken(tokenData->doubleValue);
- if (UNLIKELY(isIdentStart(m_current))) {
+ if (LIKELY(cannotBeIdentStart(m_current))) {
+ m_buffer8.shrink(0);
+ break;
+ }
+
+ if (UNLIKELY(isIdentStart(currentCodePoint()))) {
m_lexErrorMessage = "No identifiers allowed directly after numeric literal"_s;
token = atEnd() ? UNTERMINATED_NUMERIC_LITERAL_ERRORTOK : INVALID_NUMERIC_LITERAL_ERRORTOK;
goto returnError;
@@ -2262,7 +2267,14 @@
tokenData->radix = 16;
}
- if (UNLIKELY(isIdentStart(m_current))) {
+ if (LIKELY(cannotBeIdentStart(m_current))) {
+ if (LIKELY(token != BIGINT))
+ token = tokenTypeForIntegerLikeToken(tokenData->doubleValue);
+ m_buffer8.shrink(0);
+ break;
+ }
+
+ if (UNLIKELY(isIdentStart(currentCodePoint()))) {
m_lexErrorMessage = "No space between hexadecimal literal and identifier"_s;
token = UNTERMINATED_HEX_NUMBER_ERRORTOK;
goto returnError;
@@ -2294,7 +2306,14 @@
tokenData->radix = 2;
}
- if (UNLIKELY(isIdentStart(m_current))) {
+ if (LIKELY(cannotBeIdentStart(m_current))) {
+ if (LIKELY(token != BIGINT))
+ token = tokenTypeForIntegerLikeToken(tokenData->doubleValue);
+ m_buffer8.shrink(0);
+ break;
+ }
+
+ if (UNLIKELY(isIdentStart(currentCodePoint()))) {
m_lexErrorMessage = "No space between binary literal and identifier"_s;
token = UNTERMINATED_BINARY_NUMBER_ERRORTOK;
goto returnError;
@@ -2327,7 +2346,14 @@
tokenData->radix = 8;
}
- if (UNLIKELY(isIdentStart(m_current))) {
+ if (LIKELY(cannotBeIdentStart(m_current))) {
+ if (LIKELY(token != BIGINT))
+ token = tokenTypeForIntegerLikeToken(tokenData->doubleValue);
+ m_buffer8.shrink(0);
+ break;
+ }
+
+ if (UNLIKELY(isIdentStart(currentCodePoint()))) {
m_lexErrorMessage = "No space between octal literal and identifier"_s;
token = UNTERMINATED_OCTAL_NUMBER_ERRORTOK;
goto returnError;
@@ -2394,7 +2420,12 @@
}
}
- if (UNLIKELY(isIdentStart(m_current))) {
+ if (LIKELY(cannotBeIdentStart(m_current))) {
+ m_buffer8.shrink(0);
+ break;
+ }
+
+ if (UNLIKELY(isIdentStart(currentCodePoint()))) {
m_lexErrorMessage = "No identifiers allowed directly after numeric literal"_s;
token = atEnd() ? UNTERMINATED_NUMERIC_LITERAL_ERRORTOK : INVALID_NUMERIC_LITERAL_ERRORTOK;
goto returnError;
@@ -2416,9 +2447,14 @@
token = STRING;
break;
}
- case CharacterIdentifierStart:
- ASSERT(isIdentStart(m_current));
+ case CharacterIdentifierStart: {
+ if constexpr (ASSERT_ENABLED) {
+ UChar32 codePoint;
+ U16_GET(m_code, 0, 0, m_codeEnd - m_code, codePoint);
+ ASSERT(isIdentStart(codePoint));
+ }
FALLTHROUGH;
+ }
case CharacterBackSlash:
parseIdent:
if (lexerFlags.contains(LexerFlags::DontBuildKeywords))
@@ -2578,19 +2614,30 @@
}
tokenData->pattern = makeRightSizedIdentifier(m_buffer16.data(), m_buffer16.size(), charactersOredTogether);
-
m_buffer16.shrink(0);
- charactersOredTogether = 0;
- while (isIdentPart(m_current)) {
- record16(m_current);
- orCharacter<T>(charactersOredTogether, m_current);
+ ASSERT(m_buffer8.isEmpty());
+ while (LIKELY(isLatin1(m_current)) && isIdentPart(static_cast<LChar>(m_current))) {
+ record8(static_cast<LChar>(m_current));
shift();
}
- tokenData->flags = makeRightSizedIdentifier(m_buffer16.data(), m_buffer16.size(), charactersOredTogether);
- m_buffer16.shrink(0);
+ // Normally this would not be a lex error but dealing with surrogate pairs here is annoying and it's going to be an error anyway...
+ if (UNLIKELY(!isLatin1(m_current))) {
+ m_buffer8.shrink(0);
+ JSTokenType token = INVALID_IDENTIFIER_UNICODE_ERRORTOK;
+ fillTokenInfo(tokenRecord, token, m_lineNumber, currentOffset(), currentLineStartOffset(), currentPosition());
+ m_error = true;
+ String codePoint = String::fromCodePoint(currentCodePoint());
+ if (!codePoint)
+ codePoint = "`invalid unicode character`";
+ m_lexErrorMessage = makeString("Invalid non-latin character in RexExp literal's flags '", getToken(*tokenRecord), codePoint, "'");
+ return token;
+ }
+ tokenData->flags = makeIdentifier(m_buffer8.data(), m_buffer8.size());
+ m_buffer8.shrink(0);
+
// Since RegExp always ends with /, m_atLineStart always becomes false.
m_atLineStart = false;
Modified: trunk/Source/JavaScriptCore/parser/Lexer.h (258530 => 258531)
--- trunk/Source/JavaScriptCore/parser/Lexer.h 2020-03-17 00:03:57 UTC (rev 258530)
+++ trunk/Source/JavaScriptCore/parser/Lexer.h 2020-03-17 00:12:17 UTC (rev 258531)
@@ -135,6 +135,7 @@
void append16(const LChar*, size_t);
void append16(const UChar* characters, size_t length) { m_buffer16.append(characters, length); }
+ UChar32 currentCodePoint() const;
ALWAYS_INLINE void shift();
ALWAYS_INLINE bool atEnd() const;
ALWAYS_INLINE T peek(int offset) const;
@@ -147,7 +148,6 @@
String invalidCharacterMessage() const;
ALWAYS_INLINE const T* currentSourcePtr() const;
- ALWAYS_INLINE void setOffsetFromSourcePtr(const T* sourcePtr, unsigned lineStartOffset) { setOffset(offsetFromSourcePtr(sourcePtr), lineStartOffset); }
ALWAYS_INLINE void setCodeStart(const StringView&);
@@ -166,7 +166,7 @@
template <int shiftAmount> void internalShift();
template <bool shouldCreateIdentifier> ALWAYS_INLINE JSTokenType parseKeyword(JSTokenData*);
template <bool shouldBuildIdentifiers> ALWAYS_INLINE JSTokenType parseIdentifier(JSTokenData*, OptionSet<LexerFlags>, bool strictMode);
- template <bool shouldBuildIdentifiers> NEVER_INLINE JSTokenType parseIdentifierSlowCase(JSTokenData*, OptionSet<LexerFlags>, bool strictMode);
+ template <bool shouldBuildIdentifiers> NEVER_INLINE JSTokenType parseIdentifierSlowCase(JSTokenData*, OptionSet<LexerFlags>, bool strictMode, const T* identifierStart);
enum StringParseResult {
StringParsedSuccessfully,
StringUnterminated,
Modified: trunk/Source/JavaScriptCore/parser/Parser.cpp (258530 => 258531)
--- trunk/Source/JavaScriptCore/parser/Parser.cpp 2020-03-17 00:03:57 UTC (rev 258530)
+++ trunk/Source/JavaScriptCore/parser/Parser.cpp 2020-03-17 00:12:17 UTC (rev 258531)
@@ -5220,6 +5220,12 @@
case INVALID_STRING_LITERAL_ERRORTOK:
out.print("Invalid string literal: '", getToken(), "'");
return;
+ case INVALID_UNICODE_ENCODING_ERRORTOK:
+ out.print("Invalid unicode encoding: '", getToken(), "'");
+ return;
+ case INVALID_IDENTIFIER_UNICODE_ERRORTOK:
+ out.print("Invalid unicode code point in identifier: '", getToken(), "'");
+ return;
case ERRORTOK:
out.print("Unrecognized token '", getToken(), "'");
return;
Modified: trunk/Source/JavaScriptCore/parser/ParserTokens.h (258530 => 258531)
--- trunk/Source/JavaScriptCore/parser/ParserTokens.h 2020-03-17 00:03:57 UTC (rev 258530)
+++ trunk/Source/JavaScriptCore/parser/ParserTokens.h 2020-03-17 00:12:17 UTC (rev 258531)
@@ -33,7 +33,7 @@
class Identifier;
enum {
- // Token Bitfield: 0b000000000RTE000IIIIPPPPKUXXXXXXX
+ // Token Bitfield: 0b000000000RTE00IIIIPPPPKUXXXXXXXX
// R = right-associative bit
// T = unterminated error flag
// E = error flag
@@ -43,12 +43,12 @@
// U = unary operator flag
//
// We must keep the upper 8bit (1byte) region empty. JSTokenType must be 24bits.
- UnaryOpTokenFlag = 128,
- KeywordTokenFlag = 256,
- BinaryOpTokenPrecedenceShift = 9,
+ UnaryOpTokenFlag = 1 << 8,
+ KeywordTokenFlag = 1 << 9,
+ BinaryOpTokenPrecedenceShift = 10,
BinaryOpTokenAllowsInPrecedenceAdditionalShift = 4,
BinaryOpTokenPrecedenceMask = 15 << BinaryOpTokenPrecedenceShift,
- ErrorTokenFlag = 1 << (BinaryOpTokenAllowsInPrecedenceAdditionalShift + BinaryOpTokenPrecedenceShift + 7),
+ ErrorTokenFlag = 1 << (BinaryOpTokenAllowsInPrecedenceAdditionalShift + BinaryOpTokenPrecedenceShift + 6),
UnterminatedErrorTokenFlag = ErrorTokenFlag << 1,
RightAssociativeBinaryOpTokenFlag = UnterminatedErrorTokenFlag << 1
};
@@ -192,6 +192,8 @@
UNTERMINATED_REGEXP_LITERAL_ERRORTOK = 14 | ErrorTokenFlag | UnterminatedErrorTokenFlag,
INVALID_TEMPLATE_LITERAL_ERRORTOK = 15 | ErrorTokenFlag,
UNEXPECTED_ESCAPE_ERRORTOK = 16 | ErrorTokenFlag,
+ INVALID_UNICODE_ENCODING_ERRORTOK = 17 | ErrorTokenFlag,
+ INVALID_IDENTIFIER_UNICODE_ERRORTOK = 18 | ErrorTokenFlag,
};
static_assert(static_cast<unsigned>(POW) <= 0x00ffffffU, "JSTokenType must be 24bits.");
Modified: trunk/Source/WTF/ChangeLog (258530 => 258531)
--- trunk/Source/WTF/ChangeLog 2020-03-17 00:03:57 UTC (rev 258530)
+++ trunk/Source/WTF/ChangeLog 2020-03-17 00:12:17 UTC (rev 258531)
@@ -1,3 +1,14 @@
+2020-03-16 Keith Miller <keith_mil...@apple.com>
+
+ JavaScript identifier grammar supports unescaped astral symbols, but JSC doesn’t
+ https://bugs.webkit.org/show_bug.cgi?id=208998
+
+ Reviewed by Michael Saboff.
+
+ * wtf/text/WTFString.cpp:
+ (WTF::String::fromCodePoint):
+ * wtf/text/WTFString.h:
+
2020-03-15 Yusuke Suzuki <ysuz...@apple.com>
Should not use variable-length-array (VLA)
Modified: trunk/Source/WTF/wtf/text/WTFString.cpp (258530 => 258531)
--- trunk/Source/WTF/wtf/text/WTFString.cpp 2020-03-17 00:03:57 UTC (rev 258530)
+++ trunk/Source/WTF/wtf/text/WTFString.cpp 2020-03-17 00:12:17 UTC (rev 258531)
@@ -888,6 +888,15 @@
return utf8;
}
+String String::fromCodePoint(UChar32 codePoint)
+{
+ UChar buffer[2];
+ uint8_t length = 0;
+ UBool error = false;
+ U16_APPEND(buffer, length, 2, codePoint, error);
+ return error ? String() : String(buffer, length);
+}
+
// String Operations
template<typename CharacterType>
static unsigned lengthOfCharactersAsInteger(const CharacterType* data, size_t length)
Modified: trunk/Source/WTF/wtf/text/WTFString.h (258530 => 258531)
--- trunk/Source/WTF/wtf/text/WTFString.h 2020-03-17 00:03:57 UTC (rev 258530)
+++ trunk/Source/WTF/wtf/text/WTFString.h 2020-03-17 00:12:17 UTC (rev 258531)
@@ -356,6 +356,8 @@
WTF_EXPORT_PRIVATE static String fromUTF8WithLatin1Fallback(const LChar*, size_t);
static String fromUTF8WithLatin1Fallback(const char* characters, size_t length) { return fromUTF8WithLatin1Fallback(reinterpret_cast<const LChar*>(characters), length); };
+ WTF_EXPORT_PRIVATE static String fromCodePoint(UChar32 codePoint);
+
// Determines the writing direction using the Unicode Bidi Algorithm rules P2 and P3.
UCharDirection defaultWritingDirection(bool* hasStrongDirectionality = nullptr) const;