sstwcw updated this revision to Diff 429530.
sstwcw added a comment.
- address review comments
Repository:
rG LLVM Github Monorepo
CHANGES SINCE LAST ACTION
https://reviews.llvm.org/D124748/new/
https://reviews.llvm.org/D124748
Files:
clang/lib/Format/FormatTokenLexer.cpp
clang/lib/Format/FormatTokenLexer.h
Index: clang/lib/Format/FormatTokenLexer.h
===================================================================
--- clang/lib/Format/FormatTokenLexer.h
+++ clang/lib/Format/FormatTokenLexer.h
@@ -92,6 +92,8 @@
bool tryMergeConflictMarkers();
+ void resizeToken(size_t NewLen);
+
FormatToken *getStashedToken();
FormatToken *getNextToken();
Index: clang/lib/Format/FormatTokenLexer.cpp
===================================================================
--- clang/lib/Format/FormatTokenLexer.cpp
+++ clang/lib/Format/FormatTokenLexer.cpp
@@ -836,6 +836,51 @@
return FormatTok;
}
+/// Resize the current token to the new length and make the lexer continue from
+/// the end of the resized token.
+void FormatTokenLexer::resizeToken(size_t NewLen) {
+ resetLexer(SourceMgr.getFileOffset(Lex->getSourceLocation(
+ Lex->getBufferLocation() - FormatTok->TokenText.size() + NewLen)));
+ FormatTok->TokenText = FormatTok->TokenText.substr(0, NewLen);
+ FormatTok->ColumnWidth = encoding::columnWidthWithTabs(
+ FormatTok->TokenText, FormatTok->OriginalColumn, Style.TabWidth,
+ Encoding);
+ FormatTok->Tok.setLength(NewLen);
+}
+
+/// Count the length of leading whitespace in a token.
+static size_t countLeadingWhitespace(StringRef Text) {
+ // Basically counting the length matched by this regex.
+ // "^([\n\r\f\v \t]|(\\\\|\\?\\?/)[\n\r])+"
+ // Directly using the regex turned out to be slow. With the regex
+ // version formatting all files in this directory took about 1.25
+ // seconds. This version took about 0.5 seconds.
+ const char *Cur = Text.begin();
+ while (Cur < Text.end()) {
+ if (isspace(Cur[0])) {
+ ++Cur;
+ } else if (Cur[0] == '\\' && (Cur[1] == '\n' || Cur[1] == '\r')) {
+ // A '\' followed by a newline always escapes the newline, regardless
+ // of whether there is another '\' before it.
+ // The source has a null byte at the end. So the end of the entire input
+ // isn't reached yet. Also the lexer doesn't break apart an escaped
+ // newline.
+ assert(Text.end() - Cur >= 2);
+ Cur += 2;
+ } else if (Cur[0] == '?' && Cur[1] == '?' && Cur[2] == '/' &&
+ (Cur[3] == '\n' || Cur[3] == '\r')) {
+ // Newlines can also be escaped by a '?' '?' '/' trigraph. By the way, the
+ // characters are quoted individually in this comment because if we write
+ // them together some compilers warn that we have a trigraph in the code.
+ assert(Text.end() - Cur >= 4);
+ Cur += 4;
+ } else {
+ break;
+ }
+ }
+ return Cur - Text.begin();
+}
+
FormatToken *FormatTokenLexer::getNextToken() {
if (StateStack.top() == LexerState::TOKEN_STASHED) {
StateStack.pop();
@@ -850,34 +895,33 @@
IsFirstToken = false;
// Consume and record whitespace until we find a significant token.
+ // Some tok::unknown tokens are not just whitespace, e.g. whitespace
+ // followed by a symbol such as backtick. Those symbols may be
+ // significant in other languages.
unsigned WhitespaceLength = TrailingWhitespace;
- while (FormatTok->is(tok::unknown)) {
+ while (FormatTok->isNot(tok::eof)) {
+ auto LeadingWhitespace = countLeadingWhitespace(FormatTok->TokenText);
+ if (LeadingWhitespace == 0)
+ break;
+ if (LeadingWhitespace < FormatTok->TokenText.size())
+ resizeToken(LeadingWhitespace);
StringRef Text = FormatTok->TokenText;
- auto EscapesNewline = [&](int pos) {
- // A '\r' here is just part of '\r\n'. Skip it.
- if (pos >= 0 && Text[pos] == '\r')
- --pos;
- // See whether there is an odd number of '\' before this.
- // FIXME: This is wrong. A '\' followed by a newline is always removed,
- // regardless of whether there is another '\' before it.
- // FIXME: Newlines can also be escaped by a '?' '?' '/' trigraph.
- unsigned count = 0;
- for (; pos >= 0; --pos, ++count)
- if (Text[pos] != '\\')
- break;
- return count & 1;
- };
- // FIXME: This miscounts tok:unknown tokens that are not just
- // whitespace, e.g. a '`' character.
+ bool InEscape = false;
for (int i = 0, e = Text.size(); i != e; ++i) {
switch (Text[i]) {
+ case '\r':
+ // If this is a CRLF sequence, break here and the LF will be handled on
+ // the next loop iteration. Otherwise, this is a single Mac CR, treat it
+ // the same as a single LF.
+ if (i + 1 < e && Text[i + 1] == '\n')
+ break;
+ LLVM_FALLTHROUGH;
case '\n':
++FormatTok->NewlinesBefore;
- FormatTok->HasUnescapedNewline = !EscapesNewline(i - 1);
- FormatTok->LastNewlineOffset = WhitespaceLength + i + 1;
- Column = 0;
- break;
- case '\r':
+ if (!InEscape)
+ FormatTok->HasUnescapedNewline = true;
+ else
+ InEscape = false;
FormatTok->LastNewlineOffset = WhitespaceLength + i + 1;
Column = 0;
break;
@@ -893,24 +937,32 @@
Style.TabWidth - (Style.TabWidth ? Column % Style.TabWidth : 0);
break;
case '\\':
- if (i + 1 == e || (Text[i + 1] != '\r' && Text[i + 1] != '\n'))
- FormatTok->setType(TT_ImplicitStringLiteral);
+ case '?':
+ case '/':
+ // The text was entirely whitespace when this loop was entered. Thus
+ // this has to be an escape sequence.
+ assert(Text.substr(i, 2) == "\\\r" || Text.substr(i, 2) == "\\\n" ||
+ Text.substr(i, 4) == "\?\?/\r" ||
+ Text.substr(i, 4) == "\?\?/\n" ||
+ (i >= 1 && (Text.substr(i - 1, 4) == "\?\?/\r" ||
+ Text.substr(i - 1, 4) == "\?\?/\n")) ||
+ (i >= 2 && (Text.substr(i - 2, 4) == "\?\?/\r" ||
+ Text.substr(i - 2, 4) == "\?\?/\n")));
+ InEscape = true;
break;
default:
- FormatTok->setType(TT_ImplicitStringLiteral);
+ // This shouldn't happen.
+ assert(false);
break;
}
- if (FormatTok->getType() == TT_ImplicitStringLiteral)
- break;
}
-
- if (FormatTok->is(TT_ImplicitStringLiteral))
- break;
- WhitespaceLength += FormatTok->Tok.getLength();
-
+ WhitespaceLength += Text.size();
readRawToken(*FormatTok);
}
+ if (FormatTok->is(tok::unknown))
+ FormatTok->setType(TT_ImplicitStringLiteral);
+
// JavaScript and Java do not allow to escape the end of the line with a
// backslash. Backslashes are syntax errors in plain source, but can occur in
// comments. When a single line comment ends with a \, it'll cause the next
@@ -924,41 +976,13 @@
while (BackslashPos != StringRef::npos) {
if (BackslashPos + 1 < FormatTok->TokenText.size() &&
FormatTok->TokenText[BackslashPos + 1] == '\n') {
- const char *Offset = Lex->getBufferLocation();
- Offset -= FormatTok->TokenText.size();
- Offset += BackslashPos + 1;
- resetLexer(SourceMgr.getFileOffset(Lex->getSourceLocation(Offset)));
- FormatTok->TokenText = FormatTok->TokenText.substr(0, BackslashPos + 1);
- FormatTok->ColumnWidth = encoding::columnWidthWithTabs(
- FormatTok->TokenText, FormatTok->OriginalColumn, Style.TabWidth,
- Encoding);
+ resizeToken(BackslashPos + 1);
break;
}
BackslashPos = FormatTok->TokenText.find('\\', BackslashPos + 1);
}
}
- // In case the token starts with escaped newlines, we want to
- // take them into account as whitespace - this pattern is quite frequent
- // in macro definitions.
- // FIXME: Add a more explicit test.
- while (FormatTok->TokenText.size() > 1 && FormatTok->TokenText[0] == '\\') {
- unsigned SkippedWhitespace = 0;
- if (FormatTok->TokenText.size() > 2 &&
- (FormatTok->TokenText[1] == '\r' && FormatTok->TokenText[2] == '\n'))
- SkippedWhitespace = 3;
- else if (FormatTok->TokenText[1] == '\n')
- SkippedWhitespace = 2;
- else
- break;
-
- ++FormatTok->NewlinesBefore;
- WhitespaceLength += SkippedWhitespace;
- FormatTok->LastNewlineOffset = SkippedWhitespace;
- Column = 0;
- FormatTok->TokenText = FormatTok->TokenText.substr(SkippedWhitespace);
- }
-
FormatTok->WhitespaceRange = SourceRange(
WhitespaceStart, WhitespaceStart.getLocWithOffset(WhitespaceLength));
_______________________________________________
cfe-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits