https://github.com/hnakamura5 updated https://github.com/llvm/llvm-project/pull/78571
>From b472c08735b3ce3b6f7b81e499a2ef16c3faad4a Mon Sep 17 00:00:00 2001 From: hnakamura5 <hnakamu...@outlook.com> Date: Thu, 18 Jan 2024 21:49:06 +0900 Subject: [PATCH 1/2] [clang-format] Support of TableGen identifiers beginning with a number. --- clang/lib/Format/FormatTokenLexer.cpp | 44 ++++++++++++++++++- clang/lib/Format/FormatTokenLexer.h | 4 ++ clang/unittests/Format/TokenAnnotatorTest.cpp | 21 +++++++++ 3 files changed, 68 insertions(+), 1 deletion(-) diff --git a/clang/lib/Format/FormatTokenLexer.cpp b/clang/lib/Format/FormatTokenLexer.cpp index 25ac9be57c81a9a..f1982533f112c75 100644 --- a/clang/lib/Format/FormatTokenLexer.cpp +++ b/clang/lib/Format/FormatTokenLexer.cpp @@ -93,8 +93,10 @@ ArrayRef<FormatToken *> FormatTokenLexer::lex() { // string literals are correctly identified. handleCSharpVerbatimAndInterpolatedStrings(); } - if (Style.isTableGen()) + if (Style.isTableGen()) { handleTableGenMultilineString(); + handleTableGenNumericLikeIdentifier(); + } if (Tokens.back()->NewlinesBefore > 0 || Tokens.back()->IsMultiline) FirstInLineIndex = Tokens.size() - 1; } while (Tokens.back()->isNot(tok::eof)); @@ -804,6 +806,46 @@ void FormatTokenLexer::handleTableGenMultilineString() { FirstLineText, MultiLineString->OriginalColumn, Style.TabWidth, Encoding); } +void FormatTokenLexer::handleTableGenNumericLikeIdentifier() { + FormatToken *Tok = Tokens.back(); + // TableGen identifiers can begin with digits. Such tokens are lexed as + // numeric_constant now. + if (Tok->isNot(tok::numeric_constant)) + return; + StringRef Text = Tok->TokenText; + // Identifiers cannot begin with + or -. + if (Text.size() < 1 || Text[0] == '+' || Text[0] == '-') + return; + // The following check is based on llvm::TGLexer::LexToken. + if (isdigit(Text[0])) { + size_t I = 0; + char NextChar = (char)0; + // Identifiers in TalbleGen may begin with digits. Skip to first non-digit. 
+ do { + NextChar = Text[I++]; + } while (I < Text.size() && isdigit(NextChar)); + // All the characters are digits. + if (I >= Text.size()) + return; + // Base character. But it does not check the first 0 and that the base is + // the second character. + if (NextChar == 'x' || NextChar == 'b') { + char NextNextChar = Text[I]; + // This is regarded as binary number. + if (isxdigit(NextNextChar)) { + if (NextChar == 'b' && (NextNextChar == '0' || NextNextChar == '1')) + return; + // Regarded as hex number or decimal number. + if (NextChar == 'x' || isdigit(NextNextChar)) + return; + } + } + } + // Otherwise, this is actually a identifier. + Tok->Tok.setKind(tok::identifier); + Tok->Tok.setIdentifierInfo(nullptr); +} + void FormatTokenLexer::handleTemplateStrings() { FormatToken *BacktickToken = Tokens.back(); diff --git a/clang/lib/Format/FormatTokenLexer.h b/clang/lib/Format/FormatTokenLexer.h index 1dec6bbc41514cd..65dd733bd53352a 100644 --- a/clang/lib/Format/FormatTokenLexer.h +++ b/clang/lib/Format/FormatTokenLexer.h @@ -97,6 +97,10 @@ class FormatTokenLexer { // Handles TableGen multiline strings. It has the form [{ ... }]. void handleTableGenMultilineString(); + // Handles TableGen numeric like identifiers. + // They have a forms of [0-9]*[_a-zA-Z]([_a-zA-Z0-9]*). But limited to the + // case it is not lexed as an integer. + void handleTableGenNumericLikeIdentifier(); void tryParsePythonComment(); diff --git a/clang/unittests/Format/TokenAnnotatorTest.cpp b/clang/unittests/Format/TokenAnnotatorTest.cpp index 117d8fe8f7dc12e..753e749befa57e9 100644 --- a/clang/unittests/Format/TokenAnnotatorTest.cpp +++ b/clang/unittests/Format/TokenAnnotatorTest.cpp @@ -2209,6 +2209,27 @@ TEST_F(TokenAnnotatorTest, UnderstandTableGenTokens) { EXPECT_EQ(Tokens[0]->ColumnWidth, sizeof("[{ It can break\n") - 1); EXPECT_TRUE(Tokens[0]->IsMultiline); EXPECT_EQ(Tokens[0]->LastLineColumnWidth, sizeof(" the string. }]") - 1); + + // Identifier tokens. 
In TableGen, identifiers can begin with a number. + // In ambiguous cases, the lexer tries to lex it as a number. + // Even if the try fails, it does not fall back to identifier lexing and + // regards it as an error. + // The ambiguity is not documented. The result of those tests is based on the + // implementation of llvm::TGLexer::LexToken. + Tokens = Annotate("1234"); + EXPECT_TOKEN(Tokens[0], tok::numeric_constant, TT_Unknown); + Tokens = Annotate("0x1abC"); + EXPECT_TOKEN(Tokens[0], tok::numeric_constant, TT_Unknown); + // This is invalid syntax of number, but not an identifier. + Tokens = Annotate("0x1234x"); + EXPECT_TOKEN(Tokens[0], tok::numeric_constant, TT_Unknown); + Tokens = Annotate("identifier"); + EXPECT_TOKEN(Tokens[0], tok::identifier, TT_Unknown); + // Identifier beginning with a number. + Tokens = Annotate("2dVector"); + EXPECT_TOKEN(Tokens[0], tok::identifier, TT_Unknown); + Tokens = Annotate("01234Vector"); + EXPECT_TOKEN(Tokens[0], tok::identifier, TT_Unknown); } TEST_F(TokenAnnotatorTest, UnderstandConstructors) { >From 70e6c7cf3f41571697ce5eb37ed1bc20d161b247 Mon Sep 17 00:00:00 2001 From: hnakamura5 <hnakamu...@outlook.com> Date: Sat, 20 Jan 2024 00:30:59 +0900 Subject: [PATCH 2/2] Changed the algorithm of handleTableGenNumericLikeIdentifier --- clang/lib/Format/FormatTokenLexer.cpp | 52 +++++++++---------- clang/unittests/Format/TokenAnnotatorTest.cpp | 2 + 2 files changed, 27 insertions(+), 27 deletions(-) diff --git a/clang/lib/Format/FormatTokenLexer.cpp b/clang/lib/Format/FormatTokenLexer.cpp index f1982533f112c75..52a55ea23b5f2f7 100644 --- a/clang/lib/Format/FormatTokenLexer.cpp +++ b/clang/lib/Format/FormatTokenLexer.cpp @@ -813,37 +813,35 @@ void FormatTokenLexer::handleTableGenNumericLikeIdentifier() { if (Tok->isNot(tok::numeric_constant)) return; StringRef Text = Tok->TokenText; - // Identifiers cannot begin with + or -. + // The following check is based on llvm::TGLexer::LexToken. 
+ // That lexes the token as a number if any of the following holds: + // 1. It starts with '+', '-'. + // 2. All the characters are digits. + // 3. The first non-digit character is 'b', and the next is '0' or '1'. + // 4. The first non-digit character is 'x', and the next is a hex digit. + // Note that in cases 3 and 4, if the next character does not exist in + // this token, the token is an identifier. if (Text.size() < 1 || Text[0] == '+' || Text[0] == '-') return; - // The following check is based on llvm::TGLexer::LexToken. - if (isdigit(Text[0])) { - size_t I = 0; - char NextChar = (char)0; - // Identifiers in TalbleGen may begin with digits. Skip to first non-digit. - do { - NextChar = Text[I++]; - } while (I < Text.size() && isdigit(NextChar)); - // All the characters are digits. - if (I >= Text.size()) + const auto NonDigitPos = Text.find_if([](char C) { return !isdigit(C); }); + // All the characters are digits + if (NonDigitPos == StringRef::npos) + return; + char FirstNonDigit = Text[NonDigitPos]; + if (NonDigitPos < Text.size() - 1) { + char TheNext = Text[NonDigitPos + 1]; + // Regarded as a binary number. + if (FirstNonDigit == 'b' && (TheNext == '0' || TheNext == '1')) return; - // Base character. But it does not check the first 0 and that the base is - // the second character. - if (NextChar == 'x' || NextChar == 'b') { - char NextNextChar = Text[I]; - // This is regarded as binary number. - if (isxdigit(NextNextChar)) { - if (NextChar == 'b' && (NextNextChar == '0' || NextNextChar == '1')) - return; - // Regarded as hex number or decimal number. - if (NextChar == 'x' || isdigit(NextNextChar)) - return; - } - } + // Regarded as hex number. + if (FirstNonDigit == 'x' && isxdigit(TheNext)) + return; + } + if (isalpha(FirstNonDigit) || FirstNonDigit == '_') { + // This is actually an identifier in TableGen. + Tok->Tok.setKind(tok::identifier); + Tok->Tok.setIdentifierInfo(nullptr); } - // Otherwise, this is actually a identifier. 
- Tok->Tok.setKind(tok::identifier); - Tok->Tok.setIdentifierInfo(nullptr); } void FormatTokenLexer::handleTemplateStrings() { diff --git a/clang/unittests/Format/TokenAnnotatorTest.cpp b/clang/unittests/Format/TokenAnnotatorTest.cpp index 753e749befa57e9..64b2abac5cce531 100644 --- a/clang/unittests/Format/TokenAnnotatorTest.cpp +++ b/clang/unittests/Format/TokenAnnotatorTest.cpp @@ -2226,6 +2226,8 @@ TEST_F(TokenAnnotatorTest, UnderstandTableGenTokens) { Tokens = Annotate("identifier"); EXPECT_TOKEN(Tokens[0], tok::identifier, TT_Unknown); // Identifier beginning with a number. + Tokens = Annotate("0x"); + EXPECT_TOKEN(Tokens[0], tok::identifier, TT_Unknown); Tokens = Annotate("2dVector"); EXPECT_TOKEN(Tokens[0], tok::identifier, TT_Unknown); Tokens = Annotate("01234Vector"); _______________________________________________ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits