ilya-biryukov updated this revision to Diff 204488.
ilya-biryukov marked 7 inline comments as done.
ilya-biryukov added a comment.

- A leaf node stores a single token
- Restructure code to avoid special-casing leaf nodes


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D61637/new/

https://reviews.llvm.org/D61637

Files:
  clang/include/clang/Tooling/Syntax/BuildTree.h
  clang/include/clang/Tooling/Syntax/Nodes.h
  clang/include/clang/Tooling/Syntax/Tree.h
  clang/lib/Tooling/Syntax/BuildTree.cpp
  clang/lib/Tooling/Syntax/CMakeLists.txt
  clang/lib/Tooling/Syntax/Nodes.cpp
  clang/lib/Tooling/Syntax/Tree.cpp
  clang/unittests/Tooling/Syntax/CMakeLists.txt
  clang/unittests/Tooling/Syntax/TreeTest.cpp
  llvm/utils/gn/secondary/clang/lib/Tooling/Syntax/BUILD.gn

Index: llvm/utils/gn/secondary/clang/lib/Tooling/Syntax/BUILD.gn
===================================================================
--- llvm/utils/gn/secondary/clang/lib/Tooling/Syntax/BUILD.gn
+++ llvm/utils/gn/secondary/clang/lib/Tooling/Syntax/BUILD.gn
@@ -8,6 +8,10 @@
     "//llvm/lib/Support",
   ]
   sources = [
+    "Arena.cpp",
+    "BuildFromAST.cpp",
+    "Cascade.cpp",
+    "Nodes.cpp",
     "Tokens.cpp",
   ]
 }
Index: clang/unittests/Tooling/Syntax/TreeTest.cpp
===================================================================
--- /dev/null
+++ clang/unittests/Tooling/Syntax/TreeTest.cpp
@@ -0,0 +1,160 @@
+//===- TreeTest.cpp -------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "clang/Tooling/Syntax/Tree.h"
+#include "clang/AST/ASTConsumer.h"
+#include "clang/AST/Decl.h"
+#include "clang/Frontend/CompilerInstance.h"
+#include "clang/Frontend/FrontendAction.h"
+#include "clang/Lex/PreprocessorOptions.h"
+#include "clang/Tooling/Syntax/BuildTree.h"
+#include "clang/Tooling/Syntax/Nodes.h"
+#include "clang/Tooling/Tooling.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/StringRef.h"
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include <cstdlib>
+
+using namespace clang;
+
+namespace {
+class SyntaxTreeTest : public ::testing::Test {
+protected:
+  // Build a syntax tree for the code.
+  syntax::TranslationUnit *buildTree(llvm::StringRef Code) {
+    // FIXME: this code is almost the identical to the one in TokensTest. Share
+    //        it.
+    class BuildSyntaxTree : public ASTConsumer {
+    public:
+      BuildSyntaxTree(syntax::TranslationUnit *&Root,
+                      std::unique_ptr<syntax::Arena> &Arena,
+                      std::unique_ptr<syntax::TokenCollector> Tokens)
+          : Root(Root), Arena(Arena), Tokens(std::move(Tokens)) {
+        assert(this->Tokens);
+      }
+
+      void HandleTranslationUnit(ASTContext &Ctx) override {
+        Arena = llvm::make_unique<syntax::Arena>(Ctx.getSourceManager(),
+                                                 Ctx.getLangOpts(),
+                                                 std::move(*Tokens).consume());
+        Tokens = nullptr; // make sure we fail if this gets called twice.
+        Root = syntax::buildSyntaxTree(*Arena, *Ctx.getTranslationUnitDecl());
+      }
+
+    private:
+      syntax::TranslationUnit *&Root;
+      std::unique_ptr<syntax::Arena> &Arena;
+      std::unique_ptr<syntax::TokenCollector> Tokens;
+    };
+
+    class BuildSyntaxTreeAction : public ASTFrontendAction {
+    public:
+      BuildSyntaxTreeAction(syntax::TranslationUnit *&Root,
+                            std::unique_ptr<syntax::Arena> &Arena)
+          : Root(Root), Arena(Arena) {}
+
+      std::unique_ptr<ASTConsumer>
+      CreateASTConsumer(CompilerInstance &CI, StringRef InFile) override {
+        // We start recording the tokens, ast consumer will take on the result.
+        auto Tokens =
+            llvm::make_unique<syntax::TokenCollector>(CI.getPreprocessor());
+        return llvm::make_unique<BuildSyntaxTree>(Root, Arena,
+                                                  std::move(Tokens));
+      }
+
+    private:
+      syntax::TranslationUnit *&Root;
+      std::unique_ptr<syntax::Arena> &Arena;
+    };
+
+    constexpr const char *FileName = "./input.cpp";
+    FS->addFile(FileName, time_t(), llvm::MemoryBuffer::getMemBufferCopy(""));
+    // Prepare to run a compiler.
+    std::vector<const char *> Args = {"tok-test", "-std=c++03", "-fsyntax-only",
+                                      FileName};
+    auto CI = createInvocationFromCommandLine(Args, Diags, FS);
+    assert(CI);
+    CI->getFrontendOpts().DisableFree = false;
+    CI->getPreprocessorOpts().addRemappedFile(
+        FileName, llvm::MemoryBuffer::getMemBufferCopy(Code).release());
+    CompilerInstance Compiler;
+    Compiler.setInvocation(std::move(CI));
+    if (!Diags->getClient())
+      Diags->setClient(new IgnoringDiagConsumer);
+    Compiler.setDiagnostics(Diags.get());
+    Compiler.setFileManager(FileMgr.get());
+    Compiler.setSourceManager(SourceMgr.get());
+
+    syntax::TranslationUnit *Root = nullptr;
+    BuildSyntaxTreeAction Recorder(Root, this->Arena);
+    if (!Compiler.ExecuteAction(Recorder)) {
+      ADD_FAILURE() << "failed to run the frontend";
+      std::abort();
+    }
+    return Root;
+  }
+
+  // Adds a file to the test VFS.
+  void addFile(llvm::StringRef Path, llvm::StringRef Contents) {
+    if (!FS->addFile(Path, time_t(),
+                     llvm::MemoryBuffer::getMemBufferCopy(Contents))) {
+      ADD_FAILURE() << "could not add a file to VFS: " << Path;
+    }
+  }
+
+  // Data fields.
+  llvm::IntrusiveRefCntPtr<DiagnosticsEngine> Diags =
+      new DiagnosticsEngine(new DiagnosticIDs, new DiagnosticOptions);
+  IntrusiveRefCntPtr<llvm::vfs::InMemoryFileSystem> FS =
+      new llvm::vfs::InMemoryFileSystem;
+  llvm::IntrusiveRefCntPtr<FileManager> FileMgr =
+      new FileManager(FileSystemOptions(), FS);
+  llvm::IntrusiveRefCntPtr<SourceManager> SourceMgr =
+      new SourceManager(*Diags, *FileMgr);
+  // Set after calling buildTree().
+  std::unique_ptr<syntax::Arena> Arena;
+};
+
+TEST_F(SyntaxTreeTest, Basic) {
+  std::pair</*Input*/ std::string, /*Expected*/ std::string> Cases[] = {{
+      R"cpp(
+int main() {}
+void foo() {}
+    )cpp",
+      R"txt(
+translation-unit
+|-top-level-decl
+| |-recovery-node
+| | |-int
+| | |-main
+| | |-(
+| | `-)
+| `-compound-statment
+|   |-{
+|   `-}
+|-top-level-decl
+| |-recovery-node
+| | |-void
+| | |-foo
+| | |-(
+| | `-)
+| `-compound-statment
+|   |-{
+|   `-}
+`-<eof>
+)txt"}};
+
+  for (const auto &T : Cases) {
+    auto *Root = buildTree(T.first);
+    std::string Expected = llvm::StringRef(T.second).trim().str();
+    std::string Actual = llvm::StringRef(Root->dump(*Arena)).trim();
+    EXPECT_EQ(Expected, Actual) << "the resulting dump is:\n" << Actual;
+  }
+}
+} // namespace
Index: clang/unittests/Tooling/Syntax/CMakeLists.txt
===================================================================
--- clang/unittests/Tooling/Syntax/CMakeLists.txt
+++ clang/unittests/Tooling/Syntax/CMakeLists.txt
@@ -3,6 +3,7 @@
   )
 
 add_clang_unittest(SyntaxTests
+  TreeTest.cpp
   TokensTest.cpp
 )
 
Index: clang/lib/Tooling/Syntax/Tree.cpp
===================================================================
--- /dev/null
+++ clang/lib/Tooling/Syntax/Tree.cpp
@@ -0,0 +1,137 @@
+//===- Tree.cpp -----------------------------------------------*- C++ -*-=====//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+#include "clang/Tooling/Syntax/Tree.h"
+#include "clang/Basic/TokenKinds.h"
+#include "clang/Tooling/Syntax/Nodes.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/Support/Casting.h"
+
+using namespace clang;
+
+syntax::Arena::Arena(SourceManager &SourceMgr, const LangOptions &LangOpts,
+                     TokenBuffer Tokens)
+    : SourceMgr(SourceMgr), LangOpts(LangOpts), Tokens(std::move(Tokens)) {}
+
+const clang::syntax::TokenBuffer &syntax::Arena::tokenBuffer() const {
+  return Tokens;
+}
+
+std::pair<FileID, llvm::ArrayRef<syntax::Token>>
+syntax::Arena::lexBuffer(std::unique_ptr<llvm::MemoryBuffer> Input) {
+  auto FID = SourceMgr.createFileID(std::move(Input));
+  auto It = ExtraTokens.try_emplace(FID, tokenize(FID, SourceMgr, LangOpts));
+  assert(It.second && "duplicate FileID");
+  return {FID, It.first->second};
+}
+
+syntax::Leaf::Leaf(const syntax::Token *Tok) : Node(NodeKind::Leaf), Tok(Tok) {
+  assert(Tok != nullptr);
+}
+
+bool syntax::Leaf::classof(const Node *N) {
+  return N->kind() == NodeKind::Leaf;
+}
+
+bool syntax::TreeNode::classof(const Node *N) {
+  return N->kind() > NodeKind::Leaf;
+}
+
+void syntax::TreeNode::prependChildLowLevel(Node *Child) {
+  assert(Child->Parent == nullptr);
+  assert(Child->NextSibling == nullptr);
+  Child->Parent = this;
+  Child->NextSibling = this->FirstChild;
+  this->FirstChild = Child;
+}
+
+namespace {
+static void traverse(const syntax::Node *N,
+                     llvm::function_ref<void(const syntax::Node *)> Visit) {
+  if (auto *T = dyn_cast<syntax::TreeNode>(N)) {
+    for (auto *C = T->firstChild(); C; C = C->nextSibling())
+      traverse(C, Visit);
+  }
+  Visit(N);
+}
+static void dumpTokens(llvm::raw_ostream &OS, ArrayRef<syntax::Token> Tokens,
+                       const SourceManager &SM) {
+  assert(!Tokens.empty());
+  bool First = true;
+  for (const auto &T : Tokens) {
+    if (!First)
+      OS << " ";
+    else
+      First = false;
+    // Handle 'eof' separately, calling text() on it produces an empty string.
+    if (T.kind() == tok::eof) {
+      OS << "<eof>";
+      continue;
+    }
+    OS << T.text(SM);
+  }
+}
+
+static void dumpTree(llvm::raw_ostream &OS, const syntax::Node *N,
+                     const syntax::Arena &A, std::vector<bool> IndentMask) {
+  if (auto *L = llvm::dyn_cast<syntax::Leaf>(N)) {
+    dumpTokens(OS, *L->token(), A.sourceManager());
+    OS << "\n";
+    return;
+  }
+
+  auto *T = llvm::cast<syntax::TreeNode>(N);
+  OS << T->kind() << "\n";
+
+  for (auto It = T->firstChild(); It != nullptr; It = It->nextSibling()) {
+    for (bool Filled : IndentMask) {
+      if (Filled)
+        OS << "| ";
+      else
+        OS << "  ";
+    }
+    if (!It->nextSibling()) {
+      OS << "`-";
+      IndentMask.push_back(false);
+    } else {
+      OS << "|-";
+      IndentMask.push_back(true);
+    }
+    dumpTree(OS, It, A, IndentMask);
+    IndentMask.pop_back();
+  }
+}
+} // namespace
+
+std::string syntax::Node::dump(const Arena &A) const {
+  std::string Str;
+  llvm::raw_string_ostream OS(Str);
+  dumpTree(OS, this, A, /*IndentMask=*/{});
+  return std::move(OS.str());
+}
+
+std::string syntax::Node::dumpTokens(const Arena &A) const {
+  std::string Storage;
+  llvm::raw_string_ostream OS(Storage);
+  traverse(this, [&](const syntax::Node *N) {
+    auto *L = llvm::dyn_cast<syntax::Leaf>(N);
+    if (!L)
+      return;
+    ::dumpTokens(OS, *L->token(), A.sourceManager());
+  });
+  return OS.str();
+}
+
+syntax::Leaf *syntax::TreeNode::findLeaf(tok::TokenKind K) {
+  for (auto *C = FirstChild; C; C = C->nextSibling()) {
+    auto *L = dyn_cast<Leaf>(C);
+    if (L && L->token()->kind() == K)
+      return L;
+  }
+  return nullptr;
+}
Index: clang/lib/Tooling/Syntax/Nodes.cpp
===================================================================
--- /dev/null
+++ clang/lib/Tooling/Syntax/Nodes.cpp
@@ -0,0 +1,35 @@
+//===- Nodes.cpp ----------------------------------------------*- C++ -*-=====//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+#include "clang/Tooling/Syntax/Nodes.h"
+#include "clang/Basic/TokenKinds.h"
+
+using namespace clang;
+
+llvm::raw_ostream& syntax::operator << (llvm::raw_ostream &OS, NodeKind K) {
+  switch (K) {
+  case NodeKind::TranslationUnit:
+    return OS << "translation-unit";
+  case NodeKind::RecoveryNode:
+    return OS << "recovery-node";
+  case NodeKind::Leaf:
+    return OS << "tokens";
+  case NodeKind::CompoundStatement:
+    return OS << "compound-statment";
+  case NodeKind::TopLevelDecl:
+    return OS << "top-level-decl";
+  }
+  llvm_unreachable("invalid NodeKind");
+}
+
+syntax::Leaf *syntax::CompoundStatement::lbrace() {
+  return findLeaf(tok::l_brace);
+}
+
+syntax::Leaf *syntax::CompoundStatement::rbrace() {
+  return findLeaf(tok::r_brace);
+}
Index: clang/lib/Tooling/Syntax/CMakeLists.txt
===================================================================
--- clang/lib/Tooling/Syntax/CMakeLists.txt
+++ clang/lib/Tooling/Syntax/CMakeLists.txt
@@ -1,10 +1,15 @@
 set(LLVM_LINK_COMPONENTS Support)
 
 add_clang_library(clangToolingSyntax
+  BuildTree.cpp
+  Nodes.cpp
   Tokens.cpp
+  Tree.cpp
 
   LINK_LIBS
+  clangAST
   clangBasic
   clangFrontend
   clangLex
+  clangToolingCore
   )
Index: clang/lib/Tooling/Syntax/BuildTree.cpp
===================================================================
--- /dev/null
+++ clang/lib/Tooling/Syntax/BuildTree.cpp
@@ -0,0 +1,268 @@
+//===- BuildTree.cpp ------------------------------------------*- C++ -*-=====//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+#include "clang/Tooling/Syntax/BuildTree.h"
+#include "clang/AST/RecursiveASTVisitor.h"
+#include "clang/Basic/LLVM.h"
+#include "clang/Basic/SourceLocation.h"
+#include "clang/Basic/SourceManager.h"
+#include "clang/Basic/TokenKinds.h"
+#include "clang/Lex/Lexer.h"
+#include "clang/Tooling/Syntax/Nodes.h"
+#include "clang/Tooling/Syntax/Tokens.h"
+#include "clang/Tooling/Syntax/Tree.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/Support/Allocator.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/FormatVariadic.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace clang;
+
+/// A helper class for constructing the syntax tree while traversing a clang
+/// AST. The tree is built left-to-right and bottom-up. At each point of the
+/// traversal we maintain a list of pending nodes.
+class syntax::TreeBuilder {
+public:
+  TreeBuilder(syntax::Arena &Arena)
+      : Arena(Arena), Unconsumed(Arena.tokenBuffer().expandedTokens()) {
+    advanceToken();
+  }
+
+  llvm::BumpPtrAllocator &allocator() { return Arena.allocator(); }
+
+  /// Populate children for \p New, assuming it covers the token range with
+  /// tokens covered located between \p First and \p Last (inclusive!).
+  /// All pending  nodes which fall into the range between \p First and \p Last
+  /// are added as children of the new node.
+  void learnNode(SourceLocation Fist, SourceLocation Last,
+                 syntax::TreeNode *New);
+
+  /// Add a leaf node for a token starting at \p Loc.
+  void learnTokenNode(SourceLocation Loc, tok::TokenKind Kind);
+
+  /// Finish building the tree and create a root TranslationUnit node. No
+  /// further calls to learn* methods are allowed after this call.
+  void learnRoot();
+
+  /// Consume the root node.
+  syntax::TranslationUnit *consume() && {
+    auto Nodes = std::move(Pending).finalize();
+    assert(Nodes.size() == 1 &&
+           Nodes.front().Node->kind() == NodeKind::TranslationUnit);
+    assert(Unconsumed.empty());
+    return cast<syntax::TranslationUnit>(Nodes.front().Node);
+  }
+
+private:
+  const syntax::Token *findToken(SourceLocation TokLoc) const;
+
+  void learnNodeImpl(const syntax::Token *Begin, const syntax::Token *End,
+                     syntax::TreeNode *New);
+
+  /// Advance until we see the leaf node for the \p UpTo token. All skipped
+  /// nodes are added as RecoveryNode.
+  void skipUntil(const syntax::Token *UpTo);
+
+  /// Look at the node for the next token.
+  syntax::Leaf *peek() { return NextLeaf; }
+
+  /// Advance to the next token.
+  void advanceToken() {
+    if (Unconsumed.empty()) {
+      assert(NextLeaf && "trying to advance past the end of the token stream");
+      NextLeaf = nullptr;
+      return;
+    }
+    NextLeaf = new (Arena.allocator()) Leaf(Unconsumed.begin());
+    Unconsumed = Unconsumed.drop_front();
+  }
+
+  struct NodeForRange {
+    NodeForRange(llvm::ArrayRef<syntax::Token> Tokens, syntax::Node *Node)
+        : Tokens(Tokens), Node(Node) {}
+
+    llvm::ArrayRef<syntax::Token> Tokens;
+    syntax::Node *Node;
+  };
+
+  /// A collection of trees for the a prefix of the expanded tokens.
+  /// Ensures that added nodes properly nest and cover the whole token stream.
+  struct Forest {
+    /// Add a leaf node for the next token.
+    void appendToken(syntax::Leaf *L) {
+      assert(Trees.empty() || Trees.back().Tokens.end() == L->token() &&
+                                  "all tokens must be covered");
+      Trees.push_back(NodeForRange{*L->token(), L});
+    }
+
+    /// Add \p Node to the forest and fill its children nodes based on the \p
+    /// Spanned range.
+    /// EXPECTS: \p Spanned is a suffix of tokens currently in the forest.
+    void foldChildren(llvm::ArrayRef<syntax::Token> Spanned,
+                      syntax::TreeNode *Node) {
+      assert(Trees.empty() || Trees.back().Tokens.end() == Spanned.end() &&
+                                  "all tokens must be covered");
+      assert(Node->firstChild() == nullptr && "node already has children");
+
+      for (; !Trees.empty() && Spanned.begin() <= Trees.back().Tokens.begin();
+           Trees.pop_back()) {
+        Node->prependChildLowLevel(Trees.back().Node);
+      }
+
+      assert(Trees.empty() || Trees.back().Tokens.end() == Spanned.begin());
+      Trees.push_back(NodeForRange{Spanned, Node});
+    }
+
+    std::vector<NodeForRange> finalize() && { return std::move(Trees); }
+
+    std::string str(const syntax::Arena &A) const {
+      std::string R;
+      for (const auto &N : Trees) {
+        R += llvm::formatv("- '{0}' covers '{1}'+{2} tokens\n", N.Node->kind(),
+                           N.Tokens.front().text(A.sourceManager()),
+                           N.Tokens.size() - 1);
+        R += N.Node->dump(A);
+      }
+      return R;
+    }
+
+  private:
+    /// A list of nodes and the token ranges they cover.
+    /// FIXME: we need only the first token of each node, save storage.
+    std::vector<NodeForRange> Trees;
+  };
+
+  /// For debugging purposes.
+  std::string str() { return Pending.str(Arena); }
+
+  syntax::Arena &Arena;
+  /// Tokens that were not consumed yet.
+  llvm::ArrayRef<syntax::Token> Unconsumed;
+  /// A node for the next token.
+  syntax::Leaf *NextLeaf = nullptr;
+  Forest Pending;
+};
+
+namespace {
+class BuildTreeVisitor : public RecursiveASTVisitor<BuildTreeVisitor> {
+public:
+  explicit BuildTreeVisitor(ASTContext &Ctx, syntax::TreeBuilder &Builder)
+      : Builder(Builder), LangOpts(Ctx.getLangOpts()) {}
+
+  bool shouldTraversePostOrder() const { return true; }
+
+  bool TraverseDecl(Decl *D) {
+    if (!D || isa<TranslationUnitDecl>(D))
+      return RecursiveASTVisitor::TraverseDecl(D);
+    if (!llvm::isa<TranslationUnitDecl>(D->getDeclContext()))
+      return true; // Only build top-level decls for now, do not recurse.
+    return RecursiveASTVisitor::TraverseDecl(D);
+  }
+
+  bool TraverseCompoundStmt(CompoundStmt *S) {
+    Builder.learnTokenNode(S->getLBracLoc(), tok::l_brace);
+    Builder.learnTokenNode(S->getRBracLoc(), tok::r_brace);
+
+    Builder.learnNode(S->getBeginLoc(), S->getEndLoc(),
+                      new (allocator()) syntax::CompoundStatement());
+    // (!) we do not recurse into compound statements for now.
+    return true;
+  }
+
+  bool VisitDecl(Decl *D) {
+    assert(llvm::isa<TranslationUnitDecl>(D->getDeclContext()) &&
+           "expected a top-level decl");
+    assert(!D->isImplicit());
+    Builder.learnNode(D->getBeginLoc(), D->getEndLoc(),
+                      new (allocator()) syntax::TopLevelDecl());
+    return true;
+  }
+
+  bool WalkUpFromTranslationUnitDecl(TranslationUnitDecl *TU) {
+    Builder.learnRoot();
+    // (!) we do not want to call VisitDecl() at this point.
+    return true;
+  }
+
+private:
+  /// A small helper to save some typing.
+  llvm::BumpPtrAllocator &allocator() { return Builder.allocator(); }
+
+  syntax::TreeBuilder &Builder;
+  const LangOptions &LangOpts;
+};
+} // namespace
+
+void syntax::TreeBuilder::learnNode(SourceLocation First, SourceLocation Last,
+                                    syntax::TreeNode *New) {
+  assert(First.isValid());
+  assert(Last.isValid());
+  assert(First == Last ||
+         Arena.sourceManager().isBeforeInTranslationUnit(First, Last));
+
+  learnNodeImpl(findToken(First), std::next(findToken(Last)), New);
+}
+
+void syntax::TreeBuilder::learnNodeImpl(const syntax::Token *Begin,
+                                        const syntax::Token *End,
+                                        syntax::TreeNode *New) {
+  skipUntil(End);
+  Pending.foldChildren(llvm::makeArrayRef(Begin, End), New);
+}
+
+void syntax::TreeBuilder::skipUntil(const syntax::Token *UpTo) {
+  assert(peek()->token() <= UpTo);
+
+  const syntax::Token *Begin = peek()->token();
+  // We do not need to skip any tokens.
+  if (Begin == UpTo)
+    return;
+
+  for (; peek()->token() != UpTo; advanceToken())
+    Pending.appendToken(peek());
+  Pending.foldChildren(llvm::makeArrayRef(Begin, UpTo),
+                       new (Arena.allocator()) syntax::RecoveryNode);
+}
+
+void syntax::TreeBuilder::learnTokenNode(SourceLocation Loc,
+                                         tok::TokenKind Kind) {
+  auto *T = findToken(Loc);
+  assert(T->kind() == Kind);
+
+  skipUntil(T);
+  Pending.appendToken(peek());
+  advanceToken();
+}
+
+void syntax::TreeBuilder::learnRoot() {
+  auto Tokens = Arena.tokenBuffer().expandedTokens();
+  // Add 'eof' as a separate token node, it should not go into recovery node.
+  learnTokenNode(Tokens.back().location(), tok::eof);
+  Pending.foldChildren(Tokens, new (Arena.allocator()) syntax::TranslationUnit);
+}
+
+const syntax::Token *
+syntax::TreeBuilder::findToken(SourceLocation TokLoc) const {
+  auto Tokens = Arena.tokenBuffer().expandedTokens();
+  auto &SM = Arena.sourceManager();
+  auto It = llvm::bsearch(Tokens, [&](const syntax::Token &T) {
+    return !SM.isBeforeInTranslationUnit(T.location(), TokLoc);
+  });
+  assert(It != Tokens.end());
+  assert(It->location() == TokLoc);
+  return &*It;
+}
+
+syntax::TranslationUnit *
+syntax::buildSyntaxTree(Arena &A, const TranslationUnitDecl &TU) {
+  TreeBuilder Builder(A);
+  BuildTreeVisitor(TU.getASTContext(), Builder).TraverseAST(TU.getASTContext());
+  return std::move(Builder).consume();
+}
Index: clang/include/clang/Tooling/Syntax/Tree.h
===================================================================
--- /dev/null
+++ clang/include/clang/Tooling/Syntax/Tree.h
@@ -0,0 +1,138 @@
+//===- Tree.h - structure of the syntax tree ------------------*- C++ -*-=====//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+// Defines the basic structure of the syntax tree. There are two kinds of nodes
+// in a tree:
+//   - leaf nodes correspond to a continuous subrange of tokens in expanded
+//     token stream.
+//   - composite (or tree) nodes correspond to language grammar constructs.
+//
+// The tree is initially built from an AST. Each node of a newly built tree
+// covers a continous subrange of expanded tokens (i.e. tokens after
+// preprocessing), the specific tokens coverered are stored in the leaf nodes of
+// a tree. A post-order traversal of a tree will visit leaf nodes in an order
+// corresponding the original order of expanded tokens.
+//===----------------------------------------------------------------------===//
+#ifndef LLVM_CLANG_TOOLING_SYNTAX_TREE_CASCADE_H
+#define LLVM_CLANG_TOOLING_SYNTAX_TREE_CASCADE_H
+
+#include "clang/Basic/TokenKinds.h"
+#include "clang/Basic/LangOptions.h"
+#include "clang/Basic/SourceLocation.h"
+#include "clang/Basic/SourceManager.h"
+#include "clang/Tooling/Syntax/Tokens.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/Support/Allocator.h"
+#include "llvm/ADT/ArrayRef.h"
+#include <cstdint>
+
+namespace clang {
+namespace syntax {
+
+/// A memory arena for syntax trees. Also tracks the underlying token buffers,
+/// source manager, etc.
+class Arena {
+public:
+  Arena(SourceManager &SourceMgr, const LangOptions &LangOpts,
+        TokenBuffer Tokens);
+
+  const SourceManager &sourceManager() const { return SourceMgr; }
+  const LangOptions &langOptions() const { return LangOpts; }
+
+  const TokenBuffer &tokenBuffer() const;
+  llvm::BumpPtrAllocator &allocator() { return Allocator; }
+
+  /// Add \p Buffer to the underlying source manager, tokenize it and store the
+  /// resulting tokens. Useful when there is a need to materialize tokens that
+  /// were not written in user code.
+  std::pair<FileID, llvm::ArrayRef<syntax::Token>>
+  lexBuffer(std::unique_ptr<llvm::MemoryBuffer> Buffer);
+
+private:
+  SourceManager &SourceMgr;
+  const LangOptions &LangOpts;
+  TokenBuffer Tokens;
+  /// IDs and storage for additional tokenized files.
+  llvm::DenseMap<FileID, std::vector<syntax::Token>> ExtraTokens;
+  /// Keeps all the allocated nodes and their intermediate data structures.
+  llvm::BumpPtrAllocator Allocator;
+};
+
+class TreeNode;
+class TreeBuilder;
+enum class NodeKind : uint16_t;
+
+/// A node in a syntax tree. Knows only about its parent and kind.
+class Node {
+public:
+  /// Newly created nodes are detached from a tree, parent and sibling links are
+  /// set when the node is added as a child to another one.
+  Node(NodeKind Kind) : Parent(nullptr), NextSibling(nullptr), Kind(Kind) {}
+  NodeKind kind() const { return Kind; }
+
+  const TreeNode *parent() const { return Parent; }
+  TreeNode *parent() { return Parent; }
+
+  const Node *nextSibling() const { return NextSibling; }
+  Node *nextSibling() { return NextSibling; }
+
+  /// Dumps the structure of a subtree. For debugging and testing purposes.
+  std::string dump(const Arena &A) const;
+  /// Dumps the tokens forming this subtree.
+  std::string dumpTokens(const Arena &A) const;
+
+private:
+  // TreeNode is allowed to change the Parent link.
+  friend class TreeNode;
+
+  TreeNode *Parent;
+  Node *NextSibling;
+  NodeKind Kind;
+};
+
+/// A leaf node points to a single token inside the expanded token stream.
+class Leaf final : public Node {
+public:
+  Leaf(const syntax::Token* T);
+  static bool classof(const Node *N);
+
+  const syntax::Token* token() const { return Tok; }
+
+private:
+  const syntax::Token* Tok;
+};
+
+/// A composite tree node that has children.
+class TreeNode : public Node {
+public:
+  using Node::Node;
+  static bool classof(const Node *N);
+
+  Node *firstChild() { return FirstChild; }
+  const Node *firstChild() const { return FirstChild; }
+
+protected:
+  /// Find a leaf with a single token of a corresponding kind.
+  /// EXPECTS: the leaf node of a corresponding kind is found.
+  /// ENSURES: the result is non-null.
+  syntax::Leaf *findLeaf(tok::TokenKind K);
+
+private:
+  /// Prepend \p Child to the list of children and and sets the parent pointer.
+  /// A very low-level operation that does not check any invariants, only used
+  /// by TreeBuilder.
+  void prependChildLowLevel(Node *Child);
+  friend class TreeBuilder;
+
+  Node *FirstChild = nullptr;
+};
+
+} // namespace syntax
+} // namespace clang
+
+#endif
Index: clang/include/clang/Tooling/Syntax/Nodes.h
===================================================================
--- /dev/null
+++ clang/include/clang/Tooling/Syntax/Nodes.h
@@ -0,0 +1,81 @@
+//===- Nodes.h - syntax nodes for C/C++ grammar constructs ----*- C++ -*-=====//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+// Syntax tree nodes for C, C++ and Objective-C grammar constructs.
+//===----------------------------------------------------------------------===//
+#ifndef LLVM_CLANG_TOOLING_SYNTAX_NODES_H
+#define LLVM_CLANG_TOOLING_SYNTAX_NODES_H
+#include "clang/Basic/TokenKinds.h"
+#include "clang/Lex/Token.h"
+#include "clang/Tooling/Syntax/Tokens.h"
+#include "clang/Tooling/Syntax/Tree.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Support/raw_ostream.h"
+namespace clang {
+namespace syntax {
+
+/// A kind of a syntax node, used for implementing casts.
+enum class NodeKind : uint16_t {
+  Leaf,
+  RecoveryNode,
+  TranslationUnit,
+  TopLevelDecl,
+  CompoundStatement,
+};
+/// For debugging purposes.
+llvm::raw_ostream& operator << (llvm::raw_ostream &OS, NodeKind K);
+
+/// A root node for a translation unit. Parent is always null.
+class TranslationUnit final : public TreeNode {
+public:
+  TranslationUnit() : TreeNode(NodeKind::TranslationUnit) {}
+  static bool classof(const Node *N) {
+    return N->kind() == NodeKind::TranslationUnit;
+  }
+};
+
+/// A tree node of an unknown kind, i.e. a syntax error or a construct from the
+/// clang AST without a syntax counterpart.
+/// These nodes can appear at any place in the syntax tree.
+class RecoveryNode final : public TreeNode {
+public:
+  RecoveryNode() : TreeNode(NodeKind::RecoveryNode) {}
+  static bool classof(const Node *N) {
+    return N->kind() == NodeKind::RecoveryNode;
+  }
+};
+
+/// FIXME: this node is temporary and will be replaced with nodes for various
+///        'declarations' and 'declarators' from the C/C++ grammar
+///
+/// Represents any top-level declaration. Only there to give the syntax tree a
+/// bit of structure until we implement syntax nodes for declarations and
+/// declarators.
+class TopLevelDecl final : public TreeNode {
+public:
+  TopLevelDecl() : TreeNode(NodeKind::TopLevelDecl) {}
+  static bool classof(const Node *N) {
+    return N->kind() == NodeKind::TopLevelDecl;
+  }
+};
+
+/// Represents a compound statement.
+class CompoundStatement final : public TreeNode {
+public:
+  CompoundStatement() : TreeNode(NodeKind::CompoundStatement) {}
+  static bool classof(const Node *N) {
+    return N->kind() == NodeKind::CompoundStatement;
+  }
+
+  Leaf *lbrace();
+  Leaf *rbrace();
+};
+
+} // namespace syntax
+} // namespace clang
+#endif
Index: clang/include/clang/Tooling/Syntax/BuildTree.h
===================================================================
--- /dev/null
+++ clang/include/clang/Tooling/Syntax/BuildTree.h
@@ -0,0 +1,36 @@
+//===- BuildTree.h - build syntax trees -----------------------*- C++ -*-=====//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+// Syntax tree is a parse tree for the C++ source code designed to allow
+// mutations.
+//
+// The code is divided in the following way:
+//   - 'Tree/Cascade.h' defines the basic structure of the syntax tree,
+//   - 'Tree/Nodes.h' defines the concrete node classes, corresponding to the
+//   - 'Tree.h' (this file) defines an operation to constuct a tree from an AST.
+//
+// This is still work in progress and highly experimental, we leave room for
+// ourselves to completely change the design and/or implementation.
+//===----------------------------------------------------------------------===//
+#ifndef LLVM_CLANG_TOOLING_SYNTAX_TREE_H
+#define LLVM_CLANG_TOOLING_SYNTAX_TREE_H
+
+#include "clang/AST/Decl.h"
+#include "clang/Tooling/Core/Replacement.h"
+#include "clang/Tooling/Syntax/Nodes.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/STLExtras.h"
+
+namespace clang {
+namespace syntax {
+
+/// Build a syntax tree for the main file.
+syntax::TranslationUnit *buildSyntaxTree(Arena &A,
+                                         const clang::TranslationUnitDecl &TU);
+} // namespace syntax
+} // namespace clang
+#endif
_______________________________________________
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

Reply via email to