sammccall created this revision. sammccall added a reviewer: hokein. Herald added a project: All. sammccall requested review of this revision. Herald added subscribers: cfe-commits, alextsao1999. Herald added a project: clang-tools-extra.
It turns out clang::expandUCNs only works on tokens that contain valid UCNs and no other random escapes, and clang only uses it on raw_identifiers. Currently we can hit an assertion by creating tokens with stray non-valid-UCN backslashes in them. Fortunately, expanding UCNs in raw_identifiers is actually all we need. Most tokens (keywords, punctuation) can't have them. UCNs in literals can be treated as escape sequences like \n even though this isn't the standard's interpretation. This more or less matches how clang works. (See https://isocpp.org/files/papers/P2194R0.pdf which points out that the standard's description of how UCNs work is misaligned with real implementations) Repository: rG LLVM Github Monorepo https://reviews.llvm.org/D125049 Files: clang-tools-extra/pseudo/include/clang-pseudo/Token.h clang-tools-extra/pseudo/lib/Lex.cpp clang-tools-extra/pseudo/test/crash/crash.c clang-tools-extra/pseudo/tool/ClangPseudo.cpp Index: clang-tools-extra/pseudo/tool/ClangPseudo.cpp =================================================================== --- clang-tools-extra/pseudo/tool/ClangPseudo.cpp +++ clang-tools-extra/pseudo/tool/ClangPseudo.cpp @@ -17,6 +17,7 @@ #include "llvm/Support/CommandLine.h" #include "llvm/Support/FormatVariadic.h" #include "llvm/Support/MemoryBuffer.h" +#include "llvm/Support/Signals.h" using clang::pseudo::Grammar; using llvm::cl::desc; @@ -52,6 +53,7 @@ int main(int argc, char *argv[]) { llvm::cl::ParseCommandLineOptions(argc, argv, ""); + llvm::sys::PrintStackTraceOnErrorSignal(argv[0]); clang::LangOptions LangOpts = clang::pseudo::genericLangOpts(); std::string SourceText; Index: clang-tools-extra/pseudo/test/crash/crash.c =================================================================== --- /dev/null +++ clang-tools-extra/pseudo/test/crash/crash.c @@ -0,0 +1,3 @@ +// RUN: clang-pseudo -source=%s +\ +\ x Index: clang-tools-extra/pseudo/lib/Lex.cpp =================================================================== --- 
clang-tools-extra/pseudo/lib/Lex.cpp +++ clang-tools-extra/pseudo/lib/Lex.cpp @@ -90,12 +90,23 @@ assert(CharSize != 0 && "no progress!"); Pos += CharSize; } - // Remove universal character names (UCN). + llvm::StringRef Text = CleanBuffer; llvm::SmallString<64> UCNBuffer; - clang::expandUCNs(UCNBuffer, CleanBuffer); + // A surface reading of the standard suggests UCNs might appear anywhere. + // But we need only decode them in raw_identifiers. + // - they cannot appear in punctuation/keyword tokens, because UCNs + // cannot encode basic characters outside of literals [lex.charset] + // - they can appear in literals, but we need not unescape them now. + // We treat them as escape sequences when evaluating the literal. + // - comments are handled similarly to literals + // This is good fortune, because expandUCNs requires its input to be a + // reasonably valid identifier (e.g. without stray backslashes). + if (Tok.Kind == tok::raw_identifier) { + clang::expandUCNs(UCNBuffer, CleanBuffer); + Text = UCNBuffer; + } - llvm::StringRef Text = llvm::StringRef(UCNBuffer).copy(*CleanedStorage); - Tok.Data = Text.data(); + Tok.Data = Text.copy(*CleanedStorage).data(); Tok.Length = Text.size(); Tok.Flags &= ~static_cast<decltype(Tok.Flags)>(LexFlags::NeedsCleaning); } Index: clang-tools-extra/pseudo/include/clang-pseudo/Token.h =================================================================== --- clang-tools-extra/pseudo/include/clang-pseudo/Token.h +++ clang-tools-extra/pseudo/include/clang-pseudo/Token.h @@ -199,12 +199,15 @@ clang::Language = clang::Language::CXX, clang::LangStandard::Kind = clang::LangStandard::lang_unspecified); -/// Derives a token stream by decoding escapes, interpreting raw_identifiers and -/// splitting the greatergreater token. +/// Decoding raw tokens written in the source code, returning a derived stream. /// -/// Tokens containing UCNs, escaped newlines, trigraphs etc are decoded and -/// their backing data is owned by the returned stream. 
-/// raw_identifier tokens are assigned specific types (identifier, keyword etc). +/// - escaped newlines within tokens are removed +/// - trigraphs are replaced with the characters they encode +/// - UCNs within raw_identifiers are replaced by the characters they encode +/// (UCNs within strings, comments etc are not translated) +/// - raw_identifier tokens are assigned their correct keyword type +/// - the >> token is split into separate > > tokens +/// (we use a modified grammar where >> is a nonterminal, not a token) /// /// The StartsPPLine flag is preserved. ///
Index: clang-tools-extra/pseudo/tool/ClangPseudo.cpp =================================================================== --- clang-tools-extra/pseudo/tool/ClangPseudo.cpp +++ clang-tools-extra/pseudo/tool/ClangPseudo.cpp @@ -17,6 +17,7 @@ #include "llvm/Support/CommandLine.h" #include "llvm/Support/FormatVariadic.h" #include "llvm/Support/MemoryBuffer.h" +#include "llvm/Support/Signals.h" using clang::pseudo::Grammar; using llvm::cl::desc; @@ -52,6 +53,7 @@ int main(int argc, char *argv[]) { llvm::cl::ParseCommandLineOptions(argc, argv, ""); + llvm::sys::PrintStackTraceOnErrorSignal(argv[0]); clang::LangOptions LangOpts = clang::pseudo::genericLangOpts(); std::string SourceText; Index: clang-tools-extra/pseudo/test/crash/crash.c =================================================================== --- /dev/null +++ clang-tools-extra/pseudo/test/crash/crash.c @@ -0,0 +1,3 @@ +// RUN: clang-pseudo -source=%s +\ +\ x Index: clang-tools-extra/pseudo/lib/Lex.cpp =================================================================== --- clang-tools-extra/pseudo/lib/Lex.cpp +++ clang-tools-extra/pseudo/lib/Lex.cpp @@ -90,12 +90,23 @@ assert(CharSize != 0 && "no progress!"); Pos += CharSize; } - // Remove universal character names (UCN). + llvm::StringRef Text = CleanBuffer; llvm::SmallString<64> UCNBuffer; - clang::expandUCNs(UCNBuffer, CleanBuffer); + // A surface reading of the standard suggests UCNs might appear anywhere. + // But we need only decode them in raw_identifiers. + // - they cannot appear in punctuation/keyword tokens, because UCNs + // cannot encode basic characters outside of literals [lex.charset] + // - they can appear in literals, but we need not unescape them now. + // We treat them as escape sequences when evaluating the literal. + // - comments are handled similarly to literals + // This is good fortune, because expandUCNs requires its input to be a + // reasonably valid identifier (e.g. without stray backslashes). 
+ if (Tok.Kind == tok::raw_identifier) { + clang::expandUCNs(UCNBuffer, CleanBuffer); + Text = UCNBuffer; + } - llvm::StringRef Text = llvm::StringRef(UCNBuffer).copy(*CleanedStorage); - Tok.Data = Text.data(); + Tok.Data = Text.copy(*CleanedStorage).data(); Tok.Length = Text.size(); Tok.Flags &= ~static_cast<decltype(Tok.Flags)>(LexFlags::NeedsCleaning); } Index: clang-tools-extra/pseudo/include/clang-pseudo/Token.h =================================================================== --- clang-tools-extra/pseudo/include/clang-pseudo/Token.h +++ clang-tools-extra/pseudo/include/clang-pseudo/Token.h @@ -199,12 +199,15 @@ clang::Language = clang::Language::CXX, clang::LangStandard::Kind = clang::LangStandard::lang_unspecified); -/// Derives a token stream by decoding escapes, interpreting raw_identifiers and -/// splitting the greatergreater token. +/// Decoding raw tokens written in the source code, returning a derived stream. /// -/// Tokens containing UCNs, escaped newlines, trigraphs etc are decoded and -/// their backing data is owned by the returned stream. -/// raw_identifier tokens are assigned specific types (identifier, keyword etc). +/// - escaped newlines within tokens are removed +/// - trigraphs are replaced with the characters they encode +/// - UCNs within raw_identifiers are replaced by the characters they encode +/// (UCNs within strings, comments etc are not translated) +/// - raw_identifier tokens are assigned their correct keyword type +/// - the >> token is split into separate > > tokens +/// (we use a modified grammar where >> is a nonterminal, not a token) /// /// The StartsPPLine flag is preserved. ///
_______________________________________________ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits