ilya-biryukov updated this revision to Diff 192661.
ilya-biryukov added a comment.

- s/macroMacroInvocation/something else...


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D59887/new/

https://reviews.llvm.org/D59887

Files:
  clang/include/clang/Tooling/Syntax/TokenBuffer.h
  clang/lib/Tooling/CMakeLists.txt
  clang/lib/Tooling/Syntax/CMakeLists.txt
  clang/lib/Tooling/Syntax/TokenBuffer.cpp
  clang/unittests/Tooling/CMakeLists.txt
  clang/unittests/Tooling/Syntax/CMakeLists.txt
  clang/unittests/Tooling/Syntax/TokenBufferTest.cpp

Index: clang/unittests/Tooling/Syntax/TokenBufferTest.cpp
===================================================================
--- /dev/null
+++ clang/unittests/Tooling/Syntax/TokenBufferTest.cpp
@@ -0,0 +1,470 @@
+//===- TokenBufferTest.cpp ------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "clang/Tooling/Syntax/TokenBuffer.h"
+#include "clang/AST/ASTConsumer.h"
+#include "clang/AST/Expr.h"
+#include "clang/Basic/Diagnostic.h"
+#include "clang/Basic/DiagnosticIDs.h"
+#include "clang/Basic/DiagnosticOptions.h"
+#include "clang/Basic/FileManager.h"
+#include "clang/Basic/FileSystemOptions.h"
+#include "clang/Basic/LLVM.h"
+#include "clang/Basic/LangOptions.h"
+#include "clang/Basic/SourceLocation.h"
+#include "clang/Basic/SourceManager.h"
+#include "clang/Basic/TokenKinds.def"
+#include "clang/Basic/TokenKinds.h"
+#include "clang/Frontend/CompilerInstance.h"
+#include "clang/Frontend/FrontendAction.h"
+#include "clang/Frontend/Utils.h"
+#include "clang/Lex/Lexer.h"
+#include "clang/Lex/PreprocessorOptions.h"
+#include "clang/Lex/Token.h"
+#include "clang/Tooling/Tooling.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/IntrusiveRefCntPtr.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Support/FormatVariadic.h"
+#include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Support/VirtualFileSystem.h"
+#include "llvm/Support/raw_os_ostream.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Testing/Support/Annotations.h"
+#include <cassert>
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+#include <memory>
+#include <ostream>
+#include <string>
+
+using namespace clang;
+using namespace clang::syntax;
+
+using ::testing::AllOf;
+using ::testing::Contains;
+using ::testing::ElementsAre;
+using ::testing::IsEmpty;
+using ::testing::Matcher;
+using ::testing::Pointwise;
+
+// Debug printers.
+// FIXME: This should live somewhere else or be implemented as 'operator
+// <<(raw_ostream&, T)'.
+namespace clang {
+namespace tok {
+inline void PrintTo(TokenKind K, std::ostream *OS) {
+  *OS << tok::getTokenName(K);
+}
+} // namespace tok
+namespace syntax {
+inline void PrintTo(const syntax::Token &T, std::ostream *OS) {
+  PrintTo(T.kind(), OS);
+  OS->flush();
+}
+} // namespace syntax
+} // namespace clang
+
+namespace {
+// Matchers for clang::Token.
+MATCHER_P(Kind, K, "") { return arg.kind() == K; }
+MATCHER_P2(HasText, Text, SourceMgr, "") {
+  return arg.text(*SourceMgr) == Text;
+}
+MATCHER_P2(IsIdent, Text, SourceMgr, "") {
+  return arg.kind() == tok::identifier && arg.text(*SourceMgr) == Text;
+}
+/// Checks the start and end location of a token are equal to SourceRng.
+MATCHER_P(RangeIs, SourceRng, "") {
+  return arg.location() == SourceRng.first &&
+         arg.endLocation() == SourceRng.second;
+}
+/// Checks the passed tuple has two similar tokens, i.e. both are of the same
+/// kind and have the same text.
+MATCHER_P(IsSameToken, SourceMgr, "") {
+  auto &L = std::get<0>(arg);
+  auto &R = std::get<1>(arg);
+  if (L.kind() != R.kind())
+    return false;
+  return L.text(*SourceMgr) == R.text(*SourceMgr);
+}
+
+class TokenBufferTest : public ::testing::Test {
+public:
+  /// Run the clang frontend, collect the preprocessed tokens from the frontend
+  /// invocation and store them in this->Tokens.
+  /// This also clears SourceManager before running the compiler.
+  void recordTokens(llvm::StringRef Code) {
+    class RecordTokens : public ASTFrontendAction {
+    public:
+      explicit RecordTokens(TokenBuffer &Result) : Result(Result) {}
+
+      bool BeginSourceFileAction(CompilerInstance &CI) override {
+        assert(!Collector && "expected only a single call to BeginSourceFile");
+        Collector.emplace(CI.getPreprocessor());
+        return true;
+      }
+      void EndSourceFileAction() override {
+        assert(Collector && "BeginSourceFileAction was never called");
+        Result = std::move(*Collector).consume();
+      }
+
+      std::unique_ptr<ASTConsumer>
+      CreateASTConsumer(CompilerInstance &CI, StringRef InFile) override {
+        return llvm::make_unique<ASTConsumer>();
+      }
+
+    private:
+      TokenBuffer &Result;
+      llvm::Optional<TokenCollector> Collector;
+    };
+
+    constexpr const char *FileName = "./input.cpp";
+    FS->addFile(FileName, time_t(), llvm::MemoryBuffer::getMemBufferCopy(""));
+    // Prepare to run a compiler.
+    std::vector<const char *> Args = {"tok-test", "-std=c++03", "-fsyntax-only",
+                                      FileName};
+    auto CI = createInvocationFromCommandLine(Args, Diags, FS);
+    assert(CI);
+    CI->getFrontendOpts().DisableFree = false;
+    CI->getPreprocessorOpts().addRemappedFile(
+        FileName, llvm::MemoryBuffer::getMemBufferCopy(Code).release());
+    LangOpts = *CI->getLangOpts();
+    CompilerInstance Compiler;
+    Compiler.setInvocation(std::move(CI));
+    if (!Diags->getClient())
+      Diags->setClient(new IgnoringDiagConsumer);
+    Compiler.setDiagnostics(Diags.get());
+    Compiler.setFileManager(FileMgr.get());
+    Compiler.setSourceManager(SourceMgr.get());
+
+    this->Buffer = TokenBuffer();
+    RecordTokens Recorder(this->Buffer);
+    ASSERT_TRUE(Compiler.ExecuteAction(Recorder))
+        << "failed to run the frontend";
+  }
+
+  /// Run syntax::tokenize() and return the results.
+  TokenBuffer tokenize(llvm::StringRef Text) {
+    // Null-terminate so that we always see 'tok::eof' at the end.
+    std::string NullTerminated = Text.str();
+    auto FID = SourceMgr->createFileID(llvm::MemoryBuffer::getMemBufferCopy(
+        StringRef(NullTerminated.data(), NullTerminated.size() + 1)));
+    return syntax::tokenize(FID, *SourceMgr, LangOpts);
+  }
+
+  /// Checks that lexing \p ExpectedText in raw mode would produce the same
+  /// token stream as the one stored in this->Buffer.tokens().
+  void checkTokens(llvm::StringRef ExpectedText) {
+    auto TokenizedCode = tokenize(ExpectedText);
+    std::vector<syntax::Token> ExpectedTokens = TokenizedCode.tokens();
+    EXPECT_THAT(std::vector<syntax::Token>(Buffer.tokens()),
+                Pointwise(IsSameToken(), ExpectedTokens))
+        << "\texpected tokens: " << ExpectedText;
+  }
+
+  struct ExpectedInvocation {
+    ExpectedInvocation(std::string From, std::string To,
+                      llvm::Optional<llvm::Range> Range = llvm::None)
+        : From(std::move(From)), To(std::move(To)), Range(Range) {}
+    /// A textual representation of the macro tokens.
+    std::string From;
+    /// A textual representation of the tokens after macro replacement.
+    std::string To;
+    /// A text range of the macro invocation in the source code.
+    llvm::Optional<llvm::Range> Range;
+  };
+  /// Checks that this->Buffer.macroInvocations() match the \p Expected ones.
+  void checkMacroInvocations(llvm::ArrayRef<ExpectedInvocation> Expected) {
+    auto Actual = Buffer.macroInvocations();
+    ASSERT_EQ(Actual.size(), Expected.size());
+
+    for (unsigned I = 0; I < Actual.size(); ++I) {
+      auto &A = Actual[I];
+      auto &E = Expected[I];
+
+      if (E.Range)
+        ASSERT_EQ(A.macroRange(Buffer, *SourceMgr),
+                  (std::pair<unsigned, unsigned>(E.Range->Begin, E.Range->End)))
+            << "\trange does not match";
+
+      ASSERT_THAT(
+          std::vector<syntax::Token>(A.macroTokens(Buffer)),
+          Pointwise(IsSameToken(), std::vector<syntax::Token>(
+                                       tokenize(E.From).tokens().drop_back())))
+          << "\tmacro tokens do not match, expected " << E.From;
+
+      ASSERT_THAT(
+          std::vector<syntax::Token>(A.tokens(Buffer)),
+          Pointwise(IsSameToken(), std::vector<syntax::Token>(
+                                       tokenize(E.To).tokens().drop_back())))
+          << "\ttokens after macro replacements do not match, expected " << E.To;
+    }
+  }
+
+  // Specialized versions of matchers that rely on SourceManager.
+  Matcher<syntax::Token> IsIdent(std::string Text) const {
+    return ::IsIdent(Text, SourceMgr.get());
+  }
+  Matcher<syntax::Token> HasText(std::string Text) const {
+    return ::HasText(Text, SourceMgr.get());
+  }
+  Matcher<syntax::Token> RangeIs(llvm::Range R) const {
+    std::pair<SourceLocation, SourceLocation> Ls;
+    Ls.first = SourceMgr->getLocForStartOfFile(SourceMgr->getMainFileID())
+                   .getLocWithOffset(R.Begin);
+    Ls.second = SourceMgr->getLocForStartOfFile(SourceMgr->getMainFileID())
+                    .getLocWithOffset(R.End);
+    return ::RangeIs(Ls);
+  }
+  Matcher<std::tuple<const syntax::Token &, const syntax::Token &>>
+  IsSameToken() const {
+    return ::IsSameToken(SourceMgr.get());
+  }
+
+  // Data fields.
+  llvm::IntrusiveRefCntPtr<DiagnosticsEngine> Diags =
+      new DiagnosticsEngine(new DiagnosticIDs, new DiagnosticOptions);
+  IntrusiveRefCntPtr<llvm::vfs::InMemoryFileSystem> FS =
+      new llvm::vfs::InMemoryFileSystem;
+  llvm::IntrusiveRefCntPtr<FileManager> FileMgr =
+      new FileManager(FileSystemOptions(), FS);
+  llvm::IntrusiveRefCntPtr<SourceManager> SourceMgr =
+      new SourceManager(*Diags, *FileMgr);
+  /// Contains last result of calling recordTokens().
+  TokenBuffer Buffer;
+  /// Contains options from last run of recordTokens().
+  LangOptions LangOpts;
+};
+
+TEST_F(TokenBufferTest, RawMode) {
+  EXPECT_THAT(tokenize("int main() {}").tokens(),
+              ElementsAre(Kind(tok::kw_int), IsIdent("main"),
+                          Kind(tok::l_paren), Kind(tok::r_paren),
+                          Kind(tok::l_brace), Kind(tok::r_brace),
+                          Kind(tok::eof)));
+  // Comments are ignored for now.
+  EXPECT_THAT(tokenize("/* foo */int a; // more comments").tokens(),
+              ElementsAre(Kind(tok::kw_int), IsIdent("a"), Kind(tok::semi),
+                          Kind(tok::eof)));
+}
+
+TEST_F(TokenBufferTest, Basic) {
+  recordTokens("int main() {}");
+  EXPECT_THAT(Buffer.tokens(),
+              ElementsAre(Kind(tok::kw_int), IsIdent("main"),
+                          Kind(tok::l_paren), Kind(tok::r_paren),
+                          Kind(tok::l_brace), Kind(tok::r_brace),
+                          Kind(tok::eof)));
+  // All kinds of whitespace are ignored.
+  recordTokens("\t\n  int\t\n  main\t\n  (\t\n  )\t\n{\t\n  }\t\n");
+  EXPECT_THAT(Buffer.tokens(),
+              ElementsAre(Kind(tok::kw_int), IsIdent("main"),
+                          Kind(tok::l_paren), Kind(tok::r_paren),
+                          Kind(tok::l_brace), Kind(tok::r_brace),
+                          Kind(tok::eof)));
+
+  llvm::Annotations Code(R"cpp(
+    $r1[[int]] $r2[[a]] $r3[[=]] $r4[["foo bar baz"]] $r5[[;]]
+  )cpp");
+  recordTokens(Code.code());
+  EXPECT_THAT(
+      Buffer.tokens(),
+      ElementsAre(AllOf(Kind(tok::kw_int), RangeIs(Code.range("r1"))),
+                  AllOf(Kind(tok::identifier), RangeIs(Code.range("r2"))),
+                  AllOf(Kind(tok::equal), RangeIs(Code.range("r3"))),
+                  AllOf(Kind(tok::string_literal), RangeIs(Code.range("r4"))),
+                  AllOf(Kind(tok::semi), RangeIs(Code.range("r5"))),
+                  Kind(tok::eof)));
+}
+
+TEST_F(TokenBufferTest, MacroDirectives) {
+  // Macro directives are not stored anywhere at the moment.
+  recordTokens(R"cpp(
+    #define FOO a
+    #include "unresolved_file.h"
+    #undef FOO
+    #ifdef X
+    #else
+    #endif
+    #ifndef Y
+    #endif
+    #if 1
+    #elif 2
+    #else
+    #endif
+    #pragma once
+    #pragma something lalala
+
+    int a;
+  )cpp");
+
+  checkTokens("int a;");
+  EXPECT_THAT(Buffer.macroInvocations(), IsEmpty());
+  EXPECT_THAT(Buffer.macroTokens(), IsEmpty());
+}
+
+TEST_F(TokenBufferTest, MacroReplacements) {
+  // A simple object-like macro.
+  llvm::Annotations Code(R"cpp(
+    #define INT int const
+    [[INT]] a;
+    )cpp");
+  recordTokens(Code.code());
+
+  checkTokens("int const a;");
+  checkMacroInvocations({{"INT", "int const", Code.range()}});
+
+  // A simple function-like macro.
+  Code = llvm::Annotations(R"cpp(
+    #define INT(a) const int
+    [[INT(10+10)]] a;
+    )cpp");
+  recordTokens(Code.code());
+
+  checkTokens("const int a;");
+  checkMacroInvocations({{"INT(10+10)", "const int", Code.range()}});
+
+  // Recursive macro replacements.
+  Code = llvm::Annotations(R"cpp(
+    #define ID(X) X
+    #define INT int const
+    [[ID(ID(INT))]] a;
+  )cpp");
+  recordTokens(Code.code());
+
+  checkTokens("int const a;");
+  checkMacroInvocations({{"ID(ID(INT))", "int const", Code.range()}});
+
+  // Empty macro replacement.
+  Code = llvm::Annotations(R"cpp(
+    #define EMPTY
+    #define EMPTY_FUNC(X)
+    $m[[EMPTY]]
+    $f[[EMPTY_FUNC(1+2+3)]]
+  )cpp");
+  recordTokens(Code.code());
+
+  checkTokens("");
+  checkMacroInvocations({{"EMPTY", "", Code.range("m")},
+                   {"EMPTY_FUNC(1+2+3)", "", Code.range("f")}});
+}
+
+TEST_F(TokenBufferTest, SpecialTokens) {
+  // Tokens coming from concatenations.
+  recordTokens(R"cpp(
+    #define CONCAT(a, b) a ## b
+    int a = CONCAT(1, 2);
+  )cpp");
+  checkTokens("int a = 12;");
+  // Multi-line tokens with slashes at the end.
+  recordTokens("i\\\nn\\\nt");
+  EXPECT_THAT(Buffer.tokens(),
+              ElementsAre(AllOf(Kind(tok::kw_int), HasText("i\\\nn\\\nt")),
+                          Kind(tok::eof)));
+  // FIXME: test tokens with digraphs and UCN identifiers.
+}
+
+TEST_F(TokenBufferTest, LateBoundTokens) {
+  // The parser eventually breaks the first '>>' into two tokens ('>' and '>'),
+  // but we choose to record them as a single token (for now).
+  llvm::Annotations Code(R"cpp(
+    template <class T>
+    struct foo { int a; };
+    int bar = foo<foo<int$br[[>>]]().a;
+    int baz = 10 $op[[>>]] 2;
+  )cpp");
+  recordTokens(Code.code());
+  EXPECT_THAT(std::vector<syntax::Token>(Buffer.tokens()),
+              AllOf(Contains(AllOf(Kind(tok::greatergreater),
+                                   RangeIs(Code.range("br")))),
+                    Contains(AllOf(Kind(tok::greatergreater),
+                                   RangeIs(Code.range("op"))))));
+}
+
+TEST_F(TokenBufferTest, DelayedParsing) {
+  llvm::StringLiteral Code = R"cpp(
+    struct Foo {
+      int method() {
+        // Parser will visit method bodies and initializers multiple time, but
+        // TokenBuffer should only record the first walk over the tokens;
+        return 100;
+      }
+      int a = 10;
+      int b = 20;
+
+      struct Subclass {
+        void foo() {
+          Foo().method();
+        }
+      };
+    };
+  )cpp";
+  recordTokens(Code);
+  // Checks that lexing in raw mode produces the same results, hence we're not
+  // recording any tokens twice and the order is the same.
+  checkTokens(Code);
+}
+
+TEST_F(TokenBufferTest, Offsets) {
+  llvm::Annotations Code("");
+  auto OfKind = [this](tok::TokenKind K) {
+    auto It = llvm::find_if(
+        Buffer.tokens(), [K](const syntax::Token &T) { return T.kind() == K; });
+    assert(It != Buffer.tokens().end());
+    return It;
+  };
+  auto Range = [&Code](llvm::StringRef Name) {
+    auto R = Code.range(Name);
+    return std::pair<unsigned, unsigned>(R.Begin, R.End);
+  };
+
+  Code = llvm::Annotations(R"cpp(
+    $all[[int $a[[a]] = $numbers[[100 + 200]];]]
+  )cpp");
+
+  recordTokens(Code.code());
+  EXPECT_EQ(Buffer.toOffsetRange(OfKind(tok::kw_int),
+                                 std::next(OfKind(tok::semi)), *SourceMgr),
+            Range("all"));
+  EXPECT_EQ(Buffer.toOffsetRange(OfKind(tok::identifier),
+                                 std::next(OfKind(tok::identifier)),
+                                 *SourceMgr),
+            Range("a"));
+  EXPECT_EQ(Buffer.toOffsetRange(OfKind(tok::numeric_constant),
+                                 OfKind(tok::semi), *SourceMgr),
+            Range("numbers"));
+
+  Code = llvm::Annotations(R"cpp(
+    #define ID(a) a
+    #define NUMBERS 100 + 200
+    $all[[ID(int) $a[[ID(a)]] = $numbers[[NUMBERS]];]]
+  )cpp");
+  recordTokens(Code.code());
+  EXPECT_EQ(Buffer.toOffsetRange(OfKind(tok::kw_int),
+                                 std::next(OfKind(tok::semi)), *SourceMgr),
+            Range("all"));
+  EXPECT_EQ(Buffer.toOffsetRange(OfKind(tok::identifier),
+                                 std::next(OfKind(tok::identifier)),
+                                 *SourceMgr),
+            Range("a"));
+  EXPECT_EQ(Buffer.toOffsetRange(OfKind(tok::numeric_constant),
+                                 OfKind(tok::semi), *SourceMgr),
+            Range("numbers"));
+  // Ranges not fully covering macro invocations should fail.
+  EXPECT_EQ(Buffer.toOffsetRange(OfKind(tok::numeric_constant),
+                                 std::next(OfKind(tok::numeric_constant)),
+                                 *SourceMgr),
+            llvm::None);
+  EXPECT_EQ(Buffer.toOffsetRange(OfKind(tok::plus),
+                                 std::next(OfKind(tok::plus)), *SourceMgr),
+            llvm::None);
+}
+
+} // namespace
Index: clang/unittests/Tooling/Syntax/CMakeLists.txt
===================================================================
--- /dev/null
+++ clang/unittests/Tooling/Syntax/CMakeLists.txt
@@ -0,0 +1,20 @@
+set(LLVM_LINK_COMPONENTS
+  ${LLVM_TARGETS_TO_BUILD}
+  Support
+  )
+
+add_clang_unittest(TokenBufferTest
+  TokenBufferTest.cpp
+)
+
+target_link_libraries(TokenBufferTest
+  PRIVATE
+  clangAST
+  clangBasic
+  clangFrontend
+  clangLex
+  clangSerialization
+  clangTooling
+  clangToolingSyntax
+  LLVMTestingSupport
+  )
Index: clang/unittests/Tooling/CMakeLists.txt
===================================================================
--- clang/unittests/Tooling/CMakeLists.txt
+++ clang/unittests/Tooling/CMakeLists.txt
@@ -67,3 +67,6 @@
   clangToolingInclusions
   clangToolingRefactor
   )
+
+
+add_subdirectory(Syntax)
Index: clang/lib/Tooling/Syntax/TokenBuffer.cpp
===================================================================
--- /dev/null
+++ clang/lib/Tooling/Syntax/TokenBuffer.cpp
@@ -0,0 +1,388 @@
+//===- TokenBuffer.cpp - store tokens of preprocessed files ---*- C++ -*-=====//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+#include "clang/Tooling/Syntax/TokenBuffer.h"
+#include "clang/Basic/Diagnostic.h"
+#include "clang/Basic/IdentifierTable.h"
+#include "clang/Basic/SourceLocation.h"
+#include "clang/Basic/SourceManager.h"
+#include "clang/Basic/TokenKinds.def"
+#include "clang/Lex/Preprocessor.h"
+#include "clang/Lex/Token.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/Optional.h"
+#include "llvm/Support/FormatVariadic.h"
+#include <iterator>
+
+using namespace clang;
+using namespace clang::syntax;
+
+syntax::Token::Token(const clang::Token &T)
+    : Token(T.getLocation(), T.getLength(), T.getKind()) {
+  assert(!T.isAnnotation());
+}
+llvm::StringRef syntax::Token::text(const SourceManager &SM) const {
+  bool Invalid = false;
+  const char *Start = SM.getCharacterData(location(), &Invalid);
+  assert(!Invalid);
+  return llvm::StringRef(Start, length());
+}
+
+TokenBuffer syntax::tokenize(FileID FID, const SourceManager &SM,
+                             const LangOptions &LO) {
+  std::vector<syntax::Token> Tokens;
+  IdentifierTable Identifiers(LO);
+  auto AddToken = [&](clang::Token T) {
+    if (T.getKind() == tok::raw_identifier && !T.needsCleaning() &&
+        !T.hasUCN()) { // FIXME: support needsCleaning and hasUCN cases.
+      clang::IdentifierInfo &II = Identifiers.get(T.getRawIdentifier());
+      T.setIdentifierInfo(&II);
+      T.setKind(II.getTokenID());
+    }
+    Tokens.push_back(syntax::Token(T));
+  };
+
+  Lexer L(FID, SM.getBuffer(FID), SM, LO);
+
+  clang::Token T;
+  while (!L.LexFromRawLexer(T))
+    AddToken(T);
+  AddToken(T);
+
+  return TokenBuffer(std::move(Tokens));
+}
+
+class TokenCollector::Callbacks : public PPCallbacks {
+public:
+  Callbacks(const SourceManager &SM, const LangOptions &LO, TokenBuffer &Result)
+      : Result(Result), SM(SM), LO(LO) {}
+
+  void FileChanged(SourceLocation Loc, FileChangeReason Reason,
+                   SrcMgr::CharacteristicKind FileType,
+                   FileID PrevFID) override {
+    assert(Loc.isFileID());
+    InsideMainFile = SM.getFileID(Loc) == SM.getMainFileID();
+    flushMacroInvocation();
+  }
+
+  void MacroDefined(const clang::Token &MacroNameTok,
+                    const MacroDirective *MD) override {
+    flushMacroInvocation();
+    handleMacroDirective(MacroNameTok.getLocation(), /*AnchorDiff=*/2);
+  }
+
+  void MacroUndefined(const clang::Token &MacroNameTok,
+                      const MacroDefinition &MD,
+                      const MacroDirective *Undef) override {
+    flushMacroInvocation();
+    handleMacroDirective(MacroNameTok.getLocation(), /*AnchorDiff=*/2);
+  }
+
+  void InclusionDirective(SourceLocation HashLoc,
+                          const clang::Token &IncludeTok, StringRef FileName,
+                          bool IsAngled, CharSourceRange FilenameRange,
+                          const FileEntry *File, StringRef SearchPath,
+                          StringRef RelativePath, const Module *Imported,
+                          SrcMgr::CharacteristicKind FileType) override {
+    flushMacroInvocation();
+    handleMacroDirective(IncludeTok.getLocation(), /*AnchorDiff=*/1);
+  }
+
+  void If(SourceLocation Loc, SourceRange ConditionRange,
+          ConditionValueKind ConditionValue) override {
+    flushMacroInvocation();
+    handleMacroDirective(Loc, /*AnchorDiff=*/1);
+  }
+
+  void Elif(SourceLocation Loc, SourceRange ConditionRange,
+            ConditionValueKind ConditionValue, SourceLocation IfLoc) override {
+    flushMacroInvocation();
+    handleMacroDirective(Loc, /*AnchorDiff=*/1);
+  }
+
+  void Ifdef(SourceLocation Loc, const clang::Token &MacroNameTok,
+             const MacroDefinition &MD) override {
+    flushMacroInvocation();
+    handleMacroDirective(Loc, /*AnchorDiff=*/1);
+  }
+
+  void Ifndef(SourceLocation Loc, const clang::Token &MacroNameTok,
+              const MacroDefinition &MD) override {
+    flushMacroInvocation();
+    handleMacroDirective(Loc, /*AnchorDiff=*/1);
+  }
+
+  void Else(SourceLocation Loc, SourceLocation IfLoc) override {
+    flushMacroInvocation(); // Unconditional, as in the other directives.
+    handleMacroDirective(Loc, /*AnchorDiff=*/1);
+  }
+
+  void Endif(SourceLocation Loc, SourceLocation IfLoc) override {
+    flushMacroInvocation();
+    handleMacroDirective(Loc, /*AnchorDiff=*/1);
+  }
+
+  // FIXME: missing moduleImport(), Ident(), ...
+
+  void PragmaDirective(SourceLocation Loc,
+                       PragmaIntroducerKind Introducer) override {
+    if (!InsideMainFile)
+      return;
+    assert(PragmaStart.isInvalid() && "Recursive #pragma directives?");
+    PragmaStart = Loc;
+  }
+
+  void tokenLexed(const clang::Token &T) {
+    if (!InsideMainFile)
+      return;
+    auto L = T.getLocation();
+    assert(L.isValid());
+
+    // Parser sometimes goes through the same tokens again, we are only
+    // interested in the initial iteration.
+    if (!Result.Tokens.empty() &&
+        !SM.isBeforeInTranslationUnit(Result.Tokens.back().location(), L))
+      return;
+    flushCurrentExpansion(L);
+
+    if (ExpansionStart.isValid() && SM.getExpansionLoc(L) != ExpansionStart) {
+      // There are intermediate replacements while processing macro arguments.
+      // Skip them, they will be reported again.
+      return;
+    }
+
+    DEBUG_WITH_TYPE("collect-tokens",
+                    llvm::dbgs() << llvm::formatv(
+                        "$[token], name - {0}, length - {1}, spelling - {2}\n",
+                        tok::getTokenName(T.getKind()), T.getLength(),
+                        Lexer::getSpelling(T, SM, LO)));
+    Result.Tokens.push_back(syntax::Token(T));
+    assert(Result.Tokens.back().location().isValid());
+
+    // Process the end of #pragma directive.
+    if (PragmaStart.isValid() && T.getKind() == tok::eod) {
+      handleMacroDirective(PragmaStart, /*AnchorDiff=*/0);
+      PragmaStart = SourceLocation();
+      return;
+    }
+  }
+
+  void MacroExpands(const clang::Token &MacroNameTok, const MacroDefinition &MD,
+                    SourceRange Range, const MacroArgs *Args) override {
+    if (!InsideMainFile)
+      return;
+
+    auto MacroNameLoc = MacroNameTok.getLocation();
+    flushCurrentExpansion(MacroNameLoc);
+
+    // Note that MacroNameTok was not reported yet.
+    auto ExpansionStart =
+        std::find_if(Result.Tokens.rbegin(), Result.Tokens.rend(),
+                     [&](const syntax::Token &T) {
+                       return SM.isBeforeInTranslationUnit(T.location(),
+                                                           MacroNameLoc);
+                     })
+            .base();
+    if (MacroInvocationFile.isValid()) {
+      // This is a recursive macro replacement, no need to record it.
+      DEBUG_WITH_TYPE("collect-tokens",
+                      llvm::dbgs() << llvm::formatv(
+                          "$[macro-invocation] dropping {0} macro tokens\n",
+                          std::distance(ExpansionStart, Result.Tokens.end())));
+      Result.Tokens.erase(ExpansionStart, Result.Tokens.end());
+      return;
+    }
+    // This is a new top-level macro invocation, record it.
+    MacroInvocation MI;
+    MI.BeginMacroToken = Result.MacroTokens.size();
+    MI.EndMacroToken =
+        MI.BeginMacroToken + (Result.Tokens.end() - ExpansionStart) + 1;
+    // Store the macro name and macro arguments, they are used when calculating
+    // the textual range of the macro invocation.
+    Result.MacroTokens.push_back(syntax::Token(MacroNameTok));
+    for (auto &T : llvm::make_range(ExpansionStart, Result.Tokens.end()))
+      Result.MacroTokens.push_back(T);
+    // Macro call tokens are not part of the token stream after preprocessing,
+    // so remove them.
+    DEBUG_WITH_TYPE("collect-tokens",
+                    llvm::dbgs() << llvm::formatv(
+                        "$[macro-invocation] dropping {0} macro tokens\n",
+                        std::distance(ExpansionStart, Result.Tokens.end())));
+    Result.Tokens.erase(ExpansionStart, Result.Tokens.end());
+
+    MI.BeginToken = Result.Tokens.size();
+    // MI.EndToken is filled after the macro invocation finishes.
+    Result.MacroInvocations.push_back(MI);
+    // We have to record where invocation ends in order to track it properly.
+    std::tie(MacroInvocationFile, ExpansionEndOffset) =
+        SM.getDecomposedLoc(Range.getEnd());
+    this->ExpansionStart = Range.getBegin();
+  }
+
+private:
+  void handleMacroDirective(SourceLocation Anchor, int AnchorDiff) {
+    if (!InsideMainFile)
+      return;
+
+    flushCurrentExpansion(Anchor);
+
+    assert(!Result.Tokens.empty());
+    assert(Result.Tokens.back().kind() == tok::eod);
+    auto MacroStart = std::find_if(Result.Tokens.rbegin(), Result.Tokens.rend(),
+                                   [&](const syntax::Token &T) {
+                                     return T.location() == Anchor;
+                                   })
+                          .base();
+    // MacroStart now points a few tokens after the start of the macro, e.g.
+    //   # define MACRO ^...
+    //   # include ^...
+    // we want to move it AnchorDiff tokens back, to 'define' or 'include'.
+    //   # ^define MACRO ...
+    //   # ^include ...
+    assert(std::distance(Result.Tokens.begin(), MacroStart) >= AnchorDiff);
+    std::advance(MacroStart, -AnchorDiff);
+
+    DEBUG_WITH_TYPE("collect-tokens",
+                    llvm::dbgs() << llvm::formatv(
+                        "$[pp-directive] dropping {0} macro directive tokens\n",
+                        std::distance(MacroStart, Result.Tokens.end())));
+    Result.Tokens.erase(MacroStart, Result.Tokens.end());
+  }
+
+private:
+  void flushMacroInvocation() {
+    if (!MacroInvocationFile.isValid())
+      return;
+    assert(!Result.MacroInvocations.empty());
+    assert(Result.MacroInvocations.back().EndToken == 0);
+    Result.MacroInvocations.back().EndToken = Result.Tokens.size();
+
+    MacroInvocationFile = FileID();
+    ExpansionStart = SourceLocation();
+    ExpansionEndOffset = 0;
+  }
+
+  void flushCurrentExpansion(SourceLocation L) {
+    // Nothing to flush unless a top-level macro invocation is being tracked.
+    if (!MacroInvocationFile.isValid())
+      return;
+    FileID File;
+    unsigned Offset;
+    std::tie(File, Offset) = SM.getDecomposedLoc(L);
+    // Check we are not inside the current macro arguments.
+    if (File != MacroInvocationFile || Offset <= ExpansionEndOffset)
+      return;
+    flushMacroInvocation();
+  }
+
+  bool InsideMainFile = false;
+  // The start location of the currently processed #pragma directive.
+  SourceLocation PragmaStart;
+  /// When valid, the file of the last active top-level macro invocation.
+  FileID MacroInvocationFile;
+  SourceLocation ExpansionStart;
+  unsigned ExpansionEndOffset = 0;
+  TokenBuffer &Result;
+  const SourceManager &SM;
+  const LangOptions &LO;
+};
+
+llvm::ArrayRef<syntax::Token>
+MacroInvocation::tokens(const TokenBuffer &B) const {
+  return B.tokens().slice(BeginToken,
+                          EndToken - BeginToken);
+}
+
+llvm::ArrayRef<syntax::Token>
+MacroInvocation::macroTokens(const TokenBuffer &B) const {
+  return B.macroTokens().slice(BeginMacroToken, EndMacroToken - BeginMacroToken);
+}
+
+std::pair<unsigned, unsigned>
+MacroInvocation::macroRange(const TokenBuffer &B,
+                           const SourceManager &SM) const {
+  auto M = macroTokens(B);
+  return {SM.getFileOffset(M.front().location()),
+          SM.getFileOffset(M.back().endLocation())};
+}
+
+TokenBuffer::TokenBuffer(std::vector<syntax::Token> Tokens)
+    : Tokens(std::move(Tokens)) {
+#ifndef NDEBUG
+  for (const auto &T : this->Tokens)
+    assert(T.location().isFileID());
+#endif
+}
+
+TokenCollector::TokenCollector(Preprocessor &PP) {
+  auto CBOwner = llvm::make_unique<Callbacks>(PP.getSourceManager(),
+                                              PP.getLangOpts(), Tokens);
+  auto *CB = CBOwner.get();
+
+  PP.addPPCallbacks(std::move(CBOwner));
+  PP.setTokenWatcher([CB](const clang::Token &T) { CB->tokenLexed(T); });
+}
+
+TokenBuffer TokenCollector::consume() && { return std::move(Tokens); }
+
+llvm::Optional<std::pair<unsigned, unsigned>>
+TokenBuffer::toOffsetRange(const Token *Begin, const Token *End,
+                           const SourceManager &SM) const {
+  assert(Begin < End);
+  unsigned BeginIndex = Begin - Tokens.data();
+  unsigned EndIndex = End - Tokens.data();
+
+  // Find the first macro call that intersects with our range.
+  auto FirstCall =
+      std::upper_bound(MacroInvocations.begin(), MacroInvocations.end(), BeginIndex,
+                       [](unsigned L, const MacroInvocation &R) {
+                         return L < R.BeginToken;
+                       });
+  if (FirstCall != MacroInvocations.begin()) {
+    --FirstCall;
+    if (FirstCall->EndToken <= BeginIndex)
+      FirstCall = MacroInvocations.end();
+  } else {
+    FirstCall = MacroInvocations.end();
+  }
+  // Find the last macro call that intersects with our range.
+  auto LastCall =
+      std::lower_bound(MacroInvocations.begin(), MacroInvocations.end(), EndIndex,
+                       [](const MacroInvocation &L, unsigned R) {
+                         return L.EndToken < R;
+                       });
+  if (LastCall != MacroInvocations.end() && EndIndex <= LastCall->BeginToken)
+    LastCall = MacroInvocations.end();
+  // Only allow changes that involve the whole macro calls, disallow anything
+  // that changes macros in between.
+  // FIXME: also allow changes uniquely mapping to macro arguments.
+  assert(FirstCall == MacroInvocations.end() || LastCall == MacroInvocations.end() ||
+         FirstCall <= LastCall);
+
+  // Check the first macro call is fully-covered.
+  if (FirstCall != MacroInvocations.end() &&
+      (FirstCall->BeginToken < BeginIndex ||
+       EndIndex < FirstCall->EndToken)) {
+    return llvm::None;
+  }
+  // Check the last macro call is fully-covered.
+  if (LastCall != MacroInvocations.end() &&
+      (LastCall->BeginToken < BeginIndex ||
+       EndIndex < LastCall->EndToken)) {
+    return llvm::None;
+  }
+
+  unsigned BeginOffset =
+      SM.getFileOffset(FirstCall != MacroInvocations.end()
+                           ? FirstCall->macroTokens(*this).front().location()
+                           : Begin->location());
+  unsigned EndOffset =
+      SM.getFileOffset(LastCall != MacroInvocations.end()
+                           ? LastCall->macroTokens(*this).back().endLocation()
+                           : std::prev(End)->endLocation());
+  return std::make_pair(BeginOffset, EndOffset);
+}
Index: clang/lib/Tooling/Syntax/CMakeLists.txt
===================================================================
--- /dev/null
+++ clang/lib/Tooling/Syntax/CMakeLists.txt
@@ -0,0 +1,10 @@
+set(LLVM_LINK_COMPONENTS Support)
+
+add_clang_library(clangToolingSyntax
+  TokenBuffer.cpp # Currently the only source file of this library.
+
+  LINK_LIBS # The clang libraries listed below are linked into this target.
+  clangBasic
+  clangFrontend
+  clangLex
+  )
Index: clang/lib/Tooling/CMakeLists.txt
===================================================================
--- clang/lib/Tooling/CMakeLists.txt
+++ clang/lib/Tooling/CMakeLists.txt
@@ -7,6 +7,7 @@
 add_subdirectory(Inclusions)
 add_subdirectory(Refactoring)
 add_subdirectory(ASTDiff)
+add_subdirectory(Syntax)
 
 add_clang_library(clangTooling
   AllTUsExecution.cpp
Index: clang/include/clang/Tooling/Syntax/TokenBuffer.h
===================================================================
--- /dev/null
+++ clang/include/clang/Tooling/Syntax/TokenBuffer.h
@@ -0,0 +1,201 @@
+//===- TokenBuffer.h - store tokens of preprocessed files -----*- C++ -*-=====//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CLANG_TOOLING_SYNTAX_TOKEN_BUFFER_H
+#define LLVM_CLANG_TOOLING_SYNTAX_TOKEN_BUFFER_H
+
+#include "clang/Basic/LangOptions.h"
+#include "clang/Basic/SourceLocation.h"
+#include "clang/Basic/SourceManager.h"
+#include "clang/Basic/TokenKinds.def"
+#include "clang/Basic/TokenKinds.h"
+#include "clang/Lex/Token.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/StringRef.h"
+#include <cstdint>
+
+namespace clang {
+class Preprocessor;
+
+namespace syntax {
+class TokenBuffer;
+
+/// A token coming directly from a file or from a macro invocation. Has just
+/// enough information to locate the token in the source code.
+class Token {
+public:
+  Token() = default;
+  Token(SourceLocation Location, unsigned Length, tok::TokenKind Kind)
+      : Location(Location), Length(Length), Kind(Kind) {}
+  /// EXPECTS: \p T is not an annotation token.
+  explicit Token(const clang::Token &T);
+
+  tok::TokenKind kind() const { return Kind; }
+  SourceLocation location() const { return Location; }
+  SourceLocation endLocation() const {
+    return Location.getLocWithOffset(Length);
+  }
+  unsigned length() const { return Length; }
+
+  /// Get the substring covered by the token. Note that it will include all
+  /// digraphs, newline continuations, etc. E.g. tokens for 'int' and
+  ///    in\
+  ///    t
+  /// both have the same kind tok::kw_int, but results of text() are different.
+  llvm::StringRef text(const SourceManager &SM) const;
+
+private:
+  SourceLocation Location;
+  unsigned Length = 0;
+  tok::TokenKind Kind = tok::NUM_TOKENS;
+};
+
+static_assert(sizeof(Token) <= 16, "Token is unreasonably large");
+
+/// A top-level macro invocation inside a file, e.g.
+///   #define FOO 1+2
+///   #define BAR(a) a + 1
+///   FOO // invocation #1, tokens = {'1','+','2'}, macroTokens = {'FOO'}.
+///   BAR(1) // invocation #2, tokens = {'1', '+', '1'},
+///                            macroTokens = {'BAR', '(', '1', ')'}.
+class MacroInvocation {
+public:
+  /// The tokens after preprocessor replacements.
+  llvm::ArrayRef<syntax::Token> tokens(const TokenBuffer &B) const;
+  /// Tokens that appear in the text of the file, i.e. the name of an
+  /// object-like macro or the name, arguments and parentheses of a
+  /// function-like macro.
+  llvm::ArrayRef<syntax::Token> macroTokens(const TokenBuffer &B) const;
+  /// The range covering macroTokens(); presumably file offsets -- TODO confirm.
+  std::pair<unsigned, unsigned> macroRange(const TokenBuffer &B,
+                                           const SourceManager &SM) const;
+
+private:
+  friend class TokenCollector;
+  friend class TokenBuffer;
+  unsigned BeginToken = 0; // Presumably an index into tokens(); TODO confirm.
+  unsigned EndToken = 0; // Presumably one past the last index; TODO confirm.
+  unsigned BeginMacroToken = 0; // Presumably indexes macroTokens(); confirm.
+  unsigned EndMacroToken = 0; // Presumably one past the last; TODO confirm.
+};
+
+/// A list of tokens obtained by lexing and preprocessing a text buffer and a
+/// set of helpers to allow mapping the tokens after preprocessing to the
+/// corresponding code written in a file. TokenBuffer has information about two
+/// token streams:
+///    1. tokens produced by the preprocessor after all macro replacements,
+///    2. original tokens from the source code of a file before any macro
+///       replacements occurred.
+/// The tokens for (1) are stored directly and can be accessed with the tokens()
+/// method. However, some of these tokens may come from macro invocations and so
+/// they don't correspond directly to any text in a file, e.g.
+///
+///     #define FOO 10
+///     int a = FOO;  // no token '10' in the file, just 'FOO'
+///
+/// For these tokens, TokenBuffer allows locating the macro name and macro
+/// arguments that were originally seen in the source code with the
+/// 'toOffsetRange()' method.
+///
+/// There are two ways to build a TokenBuffer:
+///   1. If you are running a clang frontend invocation, use the TokenCollector
+///      class,
+///   2. if you only need to lex a file, use the tokenize() helper.
+class TokenBuffer {
+public:
+  TokenBuffer() = default;
+  /// Assumes no macro replacements have taken place.
+  TokenBuffer(std::vector<syntax::Token> Tokens);
+
+  /// All tokens produced by the preprocessor after macro replacements. Source
+  /// locations found in the clang AST will always point to one of the tokens in
+  /// the corresponding token buffer.
+  llvm::ArrayRef<syntax::Token> tokens() const { return Tokens; }
+  /// Attempt to map a subrange [Begin, End) of tokens() into a contiguous
+  /// substring of the original source file. The transformation may not be
+  /// possible if the tokens cross macro invocations in the middle, e.g.
+  ///    #define FOO 1*2
+  ///    #define BAR 3*4
+  ///    FOO + BAR
+  ///
+  /// A call toOffsetRange(Begin="1", End="3") will return a range covering [FOO
+  /// +]. However, a call toOffsetRange(Begin="2", End="4") will return None,
+  /// because no range in the original source file uniquely corresponds to the
+  /// inputs.
+  llvm::Optional<std::pair<unsigned, unsigned>>
+  toOffsetRange(const Token *Begin, const Token *End,
+                const SourceManager &SM) const;
+
+  /// All top-level macro invocations from the corresponding file. Includes both
+  /// function-like and object-like macros. E.g. it would contain 3 entries for
+  /// the following code:
+  ///     #define FOO 2*5
+  ///     #define BAR(a,b) a+b+FOO
+  ///     BAR(FOO, FOO) // #1
+  ///     int a = FOO; // #2
+  ///     int b = BAR(a, BAR(6, FOO)); // #3
+  /// Note that neither macro replacements inside macro arguments (e.g. 'FOO' in
+  /// 'BAR(FOO, FOO)') nor recursive macro replacements are present in the
+  /// result.
+  llvm::ArrayRef<MacroInvocation> macroInvocations() const { return MacroInvocations; }
+  /// Tokens of macro directives and top-level macro invocations. These tokens
+  /// are not part of the final token stream produced by the preprocessor,
+  /// but they correspond to the tokens seen in the source code.
+  ///     #define DECL(name) int name = 10
+  ///     DECL(a);
+  /// For the input above, we would get tokens() = {"int", "a", "=", "10", ";"}
+  /// and macroTokens() = {"DECL", "(", "a", ")"}.
+  /// FIXME: we do not yet store tokens of directives, like #include, #define,
+  ///        #pragma, etc.
+  llvm::ArrayRef<syntax::Token> macroTokens() const { return MacroTokens; }
+
+private:
+  friend class TokenCollector;
+  friend class MacroInvocation;
+  /// Tokens produced after preprocessing, not including the tokens from the
+  /// #include'd files.
+  std::vector<syntax::Token> Tokens;
+  /// Tokens forming top-level macro invocations, i.e. all macro names and macro
+  /// arguments.
+  std::vector<syntax::Token> MacroTokens;
+  /// A list of all top-level macro invocations.
+  std::vector<MacroInvocation> MacroInvocations;
+};
+
+/// Lex the text buffer, corresponding to \p FID, in raw mode and record the
+/// resulting tokens. Does minimal post-processing on raw identifiers, setting
+/// their corresponding token kind. This is a very low-level function, most
+/// users should prefer to use TokenCollector. Lexing in raw mode produces
+/// wildly different results from what one might expect when running a C++
+/// frontend, e.g. the preprocessor does not run at all.
+TokenBuffer tokenize(FileID FID, const SourceManager &SM,
+                     const LangOptions &LO);
+
+/// Collects tokens for the main file while running the frontend action. An
+/// instance of this class should be created on
+/// FrontendAction::BeginSourceFile() and the results should be consumed after
+/// FrontendAction::Execute() finishes.
+class TokenCollector {
+public:
+  /// Adds the hooks to collect the tokens. Should be called before the
+  /// preprocessing starts, i.e. as a part of BeginSourceFile() or
+  /// CreateASTConsumer().
+  TokenCollector(Preprocessor &P);
+
+  /// Consumes the result. Must be called on an rvalue collector, after the
+  /// preprocessing is finished, i.e. after running Execute().
+  TokenBuffer consume() &&;
+
+private:
+  class Callbacks;
+  TokenBuffer Tokens;
+};
+
+} // namespace syntax
+} // namespace clang
+
+#endif
_______________________________________________
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

Reply via email to