[PATCH] D125667: [pseudo] A basic implementation of compiling cxx grammar at build time.

Haojian Wu via Phabricator via cfe-commits Tue, 24 May 2022 12:07:58 -0700

hokein updated this revision to Diff 431746.
hokein marked 3 inline comments as done.
hokein added a comment.


rebase, and emit string chunks rather than a long raw string literal (to make 
msvc happy)


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D125667/new/

https://reviews.llvm.org/D125667

Files:
  clang-tools-extra/pseudo/CMakeLists.txt
  clang-tools-extra/pseudo/gen/CMakeLists.txt
  clang-tools-extra/pseudo/gen/Main.cpp
  clang-tools-extra/pseudo/include/CMakeLists.txt
  clang-tools-extra/pseudo/include/clang-pseudo/cxx/CXX.h
  clang-tools-extra/pseudo/lib/CMakeLists.txt
  clang-tools-extra/pseudo/lib/Grammar.cpp
  clang-tools-extra/pseudo/lib/GrammarBNF.cpp
  clang-tools-extra/pseudo/lib/LRGraph.cpp
  clang-tools-extra/pseudo/lib/LRTable.cpp
  clang-tools-extra/pseudo/lib/LRTableBuild.cpp
  clang-tools-extra/pseudo/lib/cxx/CMakeLists.txt
  clang-tools-extra/pseudo/lib/cxx/CXX.cpp
  clang-tools-extra/pseudo/lib/grammar/CMakeLists.txt
  clang-tools-extra/pseudo/lib/grammar/Grammar.cpp
  clang-tools-extra/pseudo/lib/grammar/GrammarBNF.cpp
  clang-tools-extra/pseudo/lib/grammar/LRGraph.cpp
  clang-tools-extra/pseudo/lib/grammar/LRTable.cpp
  clang-tools-extra/pseudo/lib/grammar/LRTableBuild.cpp

Index: clang-tools-extra/pseudo/lib/LRTableBuild.cpp
===================================================================
--- /dev/null
+++ clang-tools-extra/pseudo/lib/LRTableBuild.cpp
@@ -1,146 +0,0 @@
-//===--- LRTableBuild.cpp - Build a LRTable from LRGraph ---------*- C++-*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "clang-pseudo/Grammar.h"
-#include "clang-pseudo/LRGraph.h"
-#include "clang-pseudo/LRTable.h"
-#include "clang/Basic/TokenKinds.h"
-#include <cstdint>
-
-namespace llvm {
-template <> struct DenseMapInfo<clang::pseudo::LRTable::Entry> {
-  using Entry = clang::pseudo::LRTable::Entry;
-  static inline Entry getEmptyKey() {
-    static Entry E{static_cast<clang::pseudo::SymbolID>(-1), 0,
-                   clang::pseudo::LRTable::Action::sentinel()};
-    return E;
-  }
-  static inline Entry getTombstoneKey() {
-    static Entry E{static_cast<clang::pseudo::SymbolID>(-2), 0,
-                   clang::pseudo::LRTable::Action::sentinel()};
-    return E;
-  }
-  static unsigned getHashValue(const Entry &I) {
-    return llvm::hash_combine(I.State, I.Symbol, I.Act.opaque());
-  }
-  static bool isEqual(const Entry &LHS, const Entry &RHS) {
-    return LHS.State == RHS.State && LHS.Symbol == RHS.Symbol &&
-           LHS.Act == RHS.Act;
-  }
-};
-} // namespace llvm
-
-namespace clang {
-namespace pseudo {
-
-class LRTable::Builder {
-public:
-  Builder(llvm::ArrayRef<std::pair<SymbolID, StateID>> StartStates)
-      : StartStates(StartStates) {}
-
-  bool insert(Entry E) { return Entries.insert(std::move(E)).second; }
-  LRTable build(const GrammarTable &GT) && {
-    // E.g. given the following parsing table with 3 states and 3 terminals:
-    //
-    //            a    b     c
-    // +-------+----+-------+-+
-    // |state0 |    | s0,r0 | |
-    // |state1 | acc|       | |
-    // |state2 |    |  r1   | |
-    // +-------+----+-------+-+
-    //
-    // The final LRTable:
-    //  - TerminalOffset: [a] = 0, [b] = 1, [c] = 4, [d] = 4 (d is a sentinel)
-    //  -  States:     [ 1,    0,  0,  2]
-    //    Actions:     [ acc, s0, r0, r1]
-    //                   ~~~ corresponding range for terminal a
-    //                        ~~~~~~~~~~ corresponding range for terminal b
-    // First step, we sort all entries by (Symbol, State, Action).
-    std::vector<Entry> Sorted(Entries.begin(), Entries.end());
-    llvm::sort(Sorted, [](const Entry &L, const Entry &R) {
-      return std::forward_as_tuple(L.Symbol, L.State, L.Act.opaque()) <
-             std::forward_as_tuple(R.Symbol, R.State, R.Act.opaque());
-    });
-
-    LRTable Table;
-    Table.Actions.reserve(Sorted.size());
-    Table.States.reserve(Sorted.size());
-    // We are good to finalize the States and Actions.
-    for (const auto &E : Sorted) {
-      Table.Actions.push_back(E.Act);
-      Table.States.push_back(E.State);
-    }
-    // Initialize the terminal and nonterminal offset, all ranges are empty by
-    // default.
-    Table.TerminalOffset = std::vector<uint32_t>(GT.Terminals.size() + 1, 0);
-    Table.NontermOffset = std::vector<uint32_t>(GT.Nonterminals.size() + 1, 0);
-    size_t SortedIndex = 0;
-    for (SymbolID NonterminalID = 0; NonterminalID < Table.NontermOffset.size();
-         ++NonterminalID) {
-      Table.NontermOffset[NonterminalID] = SortedIndex;
-      while (SortedIndex < Sorted.size() &&
-             Sorted[SortedIndex].Symbol == NonterminalID)
-        ++SortedIndex;
-    }
-    for (size_t Terminal = 0; Terminal < Table.TerminalOffset.size();
-         ++Terminal) {
-      Table.TerminalOffset[Terminal] = SortedIndex;
-      while (SortedIndex < Sorted.size() &&
-             Sorted[SortedIndex].Symbol ==
-                 tokenSymbol(static_cast<tok::TokenKind>(Terminal)))
-        ++SortedIndex;
-    }
-    Table.StartStates = std::move(StartStates);
-    return Table;
-  }
-
-private:
-  llvm::DenseSet<Entry> Entries;
-  std::vector<std::pair<SymbolID, StateID>> StartStates;
-};
-
-LRTable LRTable::buildForTests(const GrammarTable &GT,
-                               llvm::ArrayRef<Entry> Entries) {
-  Builder Build({});
-  for (const Entry &E : Entries)
-    Build.insert(E);
-  return std::move(Build).build(GT);
-}
-
-LRTable LRTable::buildSLR(const Grammar &G) {
-  auto Graph = LRGraph::buildLR0(G);
-  Builder Build(Graph.startStates());
-  for (const auto &T : Graph.edges()) {
-    Action Act = isToken(T.Label) ? Action::shift(T.Dst) : Action::goTo(T.Dst);
-    Build.insert({T.Src, T.Label, Act});
-  }
-  assert(Graph.states().size() <= (1 << StateBits) &&
-         "Graph states execceds the maximum limit!");
-  auto FollowSets = followSets(G);
-  for (StateID SID = 0; SID < Graph.states().size(); ++SID) {
-    for (const Item &I : Graph.states()[SID].Items) {
-      // If we've just parsed the start symbol, we can accept the input.
-      if (G.lookupRule(I.rule()).Target == G.underscore() && !I.hasNext()) {
-        Build.insert({SID, tokenSymbol(tok::eof), Action::accept(I.rule())});
-        continue;
-      }
-      if (!I.hasNext()) {
-        // If we've reached the end of a rule A := ..., then we can reduce if
-        // the next token is in the follow set of A.
-        for (SymbolID Follow : FollowSets[G.lookupRule(I.rule()).Target]) {
-          assert(isToken(Follow));
-          Build.insert({SID, Follow, Action::reduce(I.rule())});
-        }
-      }
-    }
-  }
-  return std::move(Build).build(G.table());
-}
-
-} // namespace pseudo
-} // namespace clang
Index: clang-tools-extra/pseudo/lib/LRTable.cpp
===================================================================
--- /dev/null
+++ clang-tools-extra/pseudo/lib/LRTable.cpp
@@ -1,135 +0,0 @@
-//===--- LRTable.cpp - Parsing table for LR parsers --------------*- C++-*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "clang-pseudo/LRTable.h"
-#include "clang-pseudo/Grammar.h"
-#include "llvm/ADT/ArrayRef.h"
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/FormatVariadic.h"
-#include "llvm/Support/raw_ostream.h"
-
-namespace clang {
-namespace pseudo {
-
-llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, const LRTable::Action &A) {
-  switch (A.kind()) {
-  case LRTable::Action::Shift:
-    return OS << llvm::formatv("shift state {0}", A.getShiftState());
-  case LRTable::Action::Reduce:
-    return OS << llvm::formatv("reduce by rule {0}", A.getReduceRule());
-  case LRTable::Action::GoTo:
-    return OS << llvm::formatv("go to state {0}", A.getGoToState());
-  case LRTable::Action::Accept:
-    return OS << "acc";
-  case LRTable::Action::Sentinel:
-    llvm_unreachable("unexpected Sentinel action kind!");
-  }
-  llvm_unreachable("unexpected action kind!");
-}
-
-std::string LRTable::dumpStatistics() const {
-  StateID NumOfStates = 0;
-  for (StateID It : States)
-    NumOfStates = std::max(It, NumOfStates);
-  return llvm::formatv(R"(
-Statistics of the LR parsing table:
-    number of states: {0}
-    number of actions: {1}
-    size of the table (bytes): {2}
-)",
-                       NumOfStates, Actions.size(), bytes())
-      .str();
-}
-
-std::string LRTable::dumpForTests(const Grammar &G) const {
-  std::string Result;
-  llvm::raw_string_ostream OS(Result);
-  StateID MaxState = 0;
-  for (StateID It : States)
-    MaxState = std::max(MaxState, It);
-  OS << "LRTable:\n";
-  for (StateID S = 0; S <= MaxState; ++S) {
-    OS << llvm::formatv("State {0}\n", S);
-    for (uint16_t Terminal = 0; Terminal < NumTerminals; ++Terminal) {
-      SymbolID TokID = tokenSymbol(static_cast<tok::TokenKind>(Terminal));
-      for (auto A : find(S, TokID)) {
-        if (A.kind() == LRTable::Action::Shift)
-          OS.indent(4) << llvm::formatv("'{0}': shift state {1}\n",
-                                        G.symbolName(TokID), A.getShiftState());
-        else if (A.kind() == LRTable::Action::Reduce)
-          OS.indent(4) << llvm::formatv("'{0}': reduce by rule {1} '{2}'\n",
-                                        G.symbolName(TokID), A.getReduceRule(),
-                                        G.dumpRule(A.getReduceRule()));
-        else if (A.kind() == LRTable::Action::Accept)
-          OS.indent(4) << llvm::formatv("'{0}': accept\n", G.symbolName(TokID));
-      }
-    }
-    for (SymbolID NontermID = 0; NontermID < G.table().Nonterminals.size();
-         ++NontermID) {
-      if (find(S, NontermID).empty())
-        continue;
-      OS.indent(4) << llvm::formatv("'{0}': go to state {1}\n",
-                                    G.symbolName(NontermID),
-                                    getGoToState(S, NontermID));
-    }
-  }
-  return OS.str();
-}
-
-llvm::ArrayRef<LRTable::Action> LRTable::getActions(StateID State,
-                                                    SymbolID Terminal) const {
-  assert(pseudo::isToken(Terminal) && "expect terminal symbol!");
-  return find(State, Terminal);
-}
-
-LRTable::StateID LRTable::getGoToState(StateID State,
-                                       SymbolID Nonterminal) const {
-  assert(pseudo::isNonterminal(Nonterminal) && "expected nonterminal symbol!");
-  auto Result = find(State, Nonterminal);
-  assert(Result.size() == 1 && Result.front().kind() == Action::GoTo);
-  return Result.front().getGoToState();
-}
-
-llvm::ArrayRef<LRTable::Action> LRTable::find(StateID Src, SymbolID ID) const {
-  size_t Idx = isToken(ID) ? static_cast<size_t>(symbolToToken(ID)) : ID;
-  assert(isToken(ID) ? Idx + 1 < TerminalOffset.size()
-                     : Idx + 1 < NontermOffset.size());
-  std::pair<size_t, size_t> TargetStateRange =
-      isToken(ID) ? std::make_pair(TerminalOffset[Idx], TerminalOffset[Idx + 1])
-                  : std::make_pair(NontermOffset[Idx], NontermOffset[Idx + 1]);
-  auto TargetedStates =
-      llvm::makeArrayRef(States.data() + TargetStateRange.first,
-                         States.data() + TargetStateRange.second);
-
-  assert(llvm::is_sorted(TargetedStates) &&
-         "subrange of the StateIdx should be sorted!");
-  const LRTable::StateID *Start = llvm::partition_point(
-      TargetedStates, [&Src](LRTable::StateID S) { return S < Src; });
-  if (Start == TargetedStates.end())
-    return {};
-  const LRTable::StateID *End = Start;
-  while (End != TargetedStates.end() && *End == Src)
-    ++End;
-  return llvm::makeArrayRef(&Actions[Start - States.data()],
-                            /*length=*/End - Start);
-}
-
-LRTable::StateID LRTable::getStartState(SymbolID Target) const {
-  assert(llvm::is_sorted(StartStates) && "StartStates must be sorted!");
-  auto It = llvm::partition_point(
-      StartStates, [Target](const std::pair<SymbolID, StateID> &X) {
-        return X.first < Target;
-      });
-  assert(It != StartStates.end() && It->first == Target &&
-         "target symbol doesn't have a start state!");
-  return It->second;
-}
-
-} // namespace pseudo
-} // namespace clang
Index: clang-tools-extra/pseudo/lib/LRGraph.cpp
===================================================================
--- /dev/null
+++ clang-tools-extra/pseudo/lib/LRGraph.cpp
@@ -1,242 +0,0 @@
-//===--- LRGraph.cpp - -------------------------------------------*- C++-*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "clang-pseudo/LRGraph.h"
-#include "clang-pseudo/Grammar.h"
-#include "llvm/ADT/DenseSet.h"
-#include "llvm/ADT/Hashing.h"
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/StringExtras.h"
-#include "llvm/Support/FormatVariadic.h"
-#include "llvm/Support/raw_ostream.h"
-
-using ItemSet = std::vector<clang::pseudo::Item>;
-
-namespace llvm {
-// Support clang::pseudo::Item as DenseMap keys.
-template <> struct DenseMapInfo<ItemSet> {
-  static inline ItemSet getEmptyKey() {
-    return {DenseMapInfo<clang::pseudo::Item>::getEmptyKey()};
-  }
-  static inline ItemSet getTombstoneKey() {
-    return {DenseMapInfo<clang::pseudo::Item>::getTombstoneKey()};
-  }
-  static unsigned getHashValue(const ItemSet &I) {
-    return llvm::hash_combine_range(I.begin(), I.end());
-  }
-  static bool isEqual(const ItemSet &LHS, const ItemSet &RHS) {
-    return LHS == RHS;
-  }
-};
-} // namespace llvm
-
-namespace clang {
-namespace pseudo {
-namespace {
-
-struct SortByNextSymbol {
-  SortByNextSymbol(const Grammar &G) : G(G) {}
-  bool operator()(const Item &L, const Item &R) {
-    if (L.hasNext() && R.hasNext() && L.next(G) != R.next(G))
-      return L.next(G) < R.next(G);
-    if (L.hasNext() != R.hasNext())
-      return L.hasNext() < R.hasNext(); //  a trailing dot is minimal.
-    return L < R;
-  }
-  const Grammar &G;
-};
-
-// Computes a closure of the given item set S:
-//  - extends the given S to contain all options for parsing next token;
-//  - nonterminals after a dot are recursively expanded into the begin-state
-//    of all production rules that produce that nonterminal;
-//
-// Given
-//   Grammar rules = [ _ := E, E := E - T, E := T, T := n, T := ( E ) ]
-//   Input = [ E := . T ]
-// returns [ E :=  . T, T := . n, T := . ( E ) ]
-State closure(ItemSet Queue, const Grammar &G) {
-  llvm::DenseSet<Item> InQueue = {Queue.begin(), Queue.end()};
-  // We reuse the passed-by-value Queue as the final result, as it's already
-  // initialized to the right elements.
-  size_t ItIndex = 0;
-  while (ItIndex < Queue.size()) {
-    const Item &ExpandingItem = Queue[ItIndex];
-    ++ItIndex;
-    if (!ExpandingItem.hasNext())
-      continue;
-
-    SymbolID NextSym = ExpandingItem.next(G);
-    if (pseudo::isToken(NextSym))
-      continue;
-    auto RRange = G.table().Nonterminals[NextSym].RuleRange;
-    for (RuleID RID = RRange.Start; RID < RRange.End; ++RID) {
-      Item NewItem = Item::start(RID, G);
-      if (InQueue.insert(NewItem).second) // new
-        Queue.push_back(std::move(NewItem));
-    }
-  }
-  Queue.shrink_to_fit();
-  llvm::sort(Queue, SortByNextSymbol(G));
-  return {std::move(Queue)};
-}
-
-// Returns all next (with a dot advanced) kernel item sets, partitioned by the
-// advanced symbol.
-//
-// Given
-//  S = [ E := . a b, E := E . - T ]
-// returns [
-//   {id(a), [ E := a . b ]},
-//   {id(-), [ E := E - . T ]}
-// ]
-std::vector<std::pair<SymbolID, ItemSet>>
-nextAvailableKernelItems(const State &S, const Grammar &G) {
-  std::vector<std::pair<SymbolID, ItemSet>> Results;
-  llvm::ArrayRef<Item> AllItems = S.Items;
-  AllItems = AllItems.drop_while([](const Item &I) { return !I.hasNext(); });
-  while (!AllItems.empty()) {
-    SymbolID AdvancedSymbol = AllItems.front().next(G);
-    auto Batch = AllItems.take_while([AdvancedSymbol, &G](const Item &I) {
-      assert(I.hasNext());
-      return I.next(G) == AdvancedSymbol;
-    });
-    assert(!Batch.empty());
-    AllItems = AllItems.drop_front(Batch.size());
-
-    // Advance a dot over the Symbol.
-    ItemSet Next;
-    for (const Item &I : Batch)
-      Next.push_back(I.advance());
-    // sort the set to keep order determinism for hash computation.
-    llvm::sort(Next);
-    Results.push_back({AdvancedSymbol, std::move(Next)});
-  }
-  return Results;
-}
-
-} // namespace
-
-std::string Item::dump(const Grammar &G) const {
-  const auto &Rule = G.lookupRule(RID);
-  auto ToNames = [&](llvm::ArrayRef<SymbolID> Syms) {
-    std::vector<llvm::StringRef> Results;
-    for (auto SID : Syms)
-      Results.push_back(G.symbolName(SID));
-    return Results;
-  };
-  return llvm::formatv("{0} := {1} â¢ {2}", G.symbolName(Rule.Target),
-                       llvm::join(ToNames(Rule.seq().take_front(DotPos)), " "),
-                       llvm::join(ToNames(Rule.seq().drop_front(DotPos)), " "))
-      .str();
-}
-
-std::string State::dump(const Grammar &G, unsigned Indent) const {
-  std::string Result;
-  llvm::raw_string_ostream OS(Result);
-  for (const auto &Item : Items)
-    OS.indent(Indent) << llvm::formatv("{0}\n", Item.dump(G));
-  return OS.str();
-}
-
-std::string LRGraph::dumpForTests(const Grammar &G) const {
-  std::string Result;
-  llvm::raw_string_ostream OS(Result);
-  OS << "States:\n";
-  for (StateID ID = 0; ID < States.size(); ++ID) {
-    OS << llvm::formatv("State {0}\n", ID);
-    OS << States[ID].dump(G, /*Indent*/ 4);
-  }
-  for (const auto &E : Edges) {
-    OS << llvm::formatv("{0} ->[{1}] {2}\n", E.Src, G.symbolName(E.Label),
-                        E.Dst);
-  }
-  return OS.str();
-}
-
-LRGraph LRGraph::buildLR0(const Grammar &G) {
-  class Builder {
-  public:
-    Builder(const Grammar &G) : G(G) {}
-
-    // Adds a given state if not existed.
-    std::pair<StateID, /*inserted*/ bool> insert(ItemSet KernelItems) {
-      assert(llvm::is_sorted(KernelItems) &&
-             "Item must be sorted before inserting to a hash map!");
-      auto It = StatesIndex.find(KernelItems);
-      if (It != StatesIndex.end())
-        return {It->second, false};
-      States.push_back(closure(KernelItems, G));
-      StateID NextStateID = States.size() - 1;
-      StatesIndex.insert({std::move(KernelItems), NextStateID});
-      return {NextStateID, true};
-    }
-
-    void insertEdge(StateID Src, StateID Dst, SymbolID Label) {
-      Edges.push_back({Src, Dst, Label});
-    }
-
-    // Returns a state with the given id.
-    const State &find(StateID ID) const {
-      assert(ID < States.size());
-      return States[ID];
-    }
-
-    void addStartState(SymbolID Sym, StateID State) {
-      StartStates.push_back({Sym, State});
-    }
-
-    LRGraph build() && {
-      States.shrink_to_fit();
-      Edges.shrink_to_fit();
-      llvm::sort(StartStates);
-      StartStates.shrink_to_fit();
-      return LRGraph(std::move(States), std::move(Edges),
-                     std::move(StartStates));
-    }
-
-  private:
-    // Key is the **kernel** item sets.
-    llvm::DenseMap<ItemSet, /*index of States*/ size_t> StatesIndex;
-    std::vector<State> States;
-    std::vector<Edge> Edges;
-    const Grammar &G;
-    std::vector<std::pair<SymbolID, StateID>> StartStates;
-  } Builder(G);
-
-  std::vector<StateID> PendingStates;
-  // Initialize states with the start symbol.
-  auto RRange = G.table().Nonterminals[G.underscore()].RuleRange;
-  for (RuleID RID = RRange.Start; RID < RRange.End; ++RID) {
-    auto StartState = std::vector<Item>{Item::start(RID, G)};
-    auto Result = Builder.insert(std::move(StartState));
-    assert(Result.second && "State must be new");
-    PendingStates.push_back(Result.first);
-
-    const Rule &StartRule = G.lookupRule(RID);
-    assert(StartRule.Size == 1 &&
-           "Start rule must have exactly one symbol in its body!");
-    Builder.addStartState(StartRule.seq().front(), Result.first);
-  }
-
-  while (!PendingStates.empty()) {
-    auto CurrentStateID = PendingStates.back();
-    PendingStates.pop_back();
-    for (auto Next :
-         nextAvailableKernelItems(Builder.find(CurrentStateID), G)) {
-      auto Insert = Builder.insert(Next.second);
-      if (Insert.second) // new state, insert to the pending queue.
-        PendingStates.push_back(Insert.first);
-      Builder.insertEdge(CurrentStateID, Insert.first, Next.first);
-    }
-  }
-  return std::move(Builder).build();
-}
-
-} // namespace pseudo
-} // namespace clang
Index: clang-tools-extra/pseudo/lib/GrammarBNF.cpp
===================================================================
--- /dev/null
+++ clang-tools-extra/pseudo/lib/GrammarBNF.cpp
@@ -1,300 +0,0 @@
-//===--- GrammarBNF.cpp - build grammar from BNF files  ----------*- C++-*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "clang-pseudo/Grammar.h"
-#include "clang/Basic/TokenKinds.h"
-#include "llvm/ADT/DenseSet.h"
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/StringExtras.h"
-#include "llvm/Support/FormatVariadic.h"
-#include <memory>
-#include <utility>
-
-namespace clang {
-namespace pseudo {
-
-namespace {
-static const llvm::StringRef OptSuffix = "_opt";
-static const llvm::StringRef StartSymbol = "_";
-
-// Builds grammar from BNF files.
-class GrammarBuilder {
-public:
-  GrammarBuilder(std::vector<std::string> &Diagnostics)
-      : Diagnostics(Diagnostics) {}
-
-  std::unique_ptr<Grammar> build(llvm::StringRef BNF) {
-    auto Specs = eliminateOptional(parse(BNF));
-
-    assert(llvm::all_of(Specs,
-                        [](const RuleSpec &R) {
-                          if (R.Target.endswith(OptSuffix))
-                            return false;
-                          return llvm::all_of(
-                              R.Sequence, [](const RuleSpec::Element &E) {
-                                return !E.Symbol.endswith(OptSuffix);
-                              });
-                        }) &&
-           "Optional symbols should be eliminated!");
-
-    auto T = std::make_unique<GrammarTable>();
-
-    // Assemble the name->ID and ID->nonterminal name maps.
-    llvm::DenseSet<llvm::StringRef> UniqueNonterminals;
-    llvm::DenseMap<llvm::StringRef, SymbolID> SymbolIds;
-    for (uint16_t I = 0; I < NumTerminals; ++I)
-      SymbolIds.try_emplace(T->Terminals[I], tokenSymbol(tok::TokenKind(I)));
-    auto Consider = [&](llvm::StringRef Name) {
-      if (!SymbolIds.count(Name))
-        UniqueNonterminals.insert(Name);
-    };
-    for (const auto &Spec : Specs) {
-      Consider(Spec.Target);
-      for (const RuleSpec::Element &Elt : Spec.Sequence)
-        Consider(Elt.Symbol);
-    }
-    llvm::for_each(UniqueNonterminals, [&T](llvm::StringRef Name) {
-      T->Nonterminals.emplace_back();
-      T->Nonterminals.back().Name = Name.str();
-    });
-    assert(T->Nonterminals.size() < (1 << (SymbolBits - 1)) &&
-           "Too many nonterminals to fit in SymbolID bits!");
-    llvm::sort(T->Nonterminals, [](const GrammarTable::Nonterminal &L,
-                                   const GrammarTable::Nonterminal &R) {
-      return L.Name < R.Name;
-    });
-    // Build name -> ID maps for nonterminals.
-    for (SymbolID SID = 0; SID < T->Nonterminals.size(); ++SID)
-      SymbolIds.try_emplace(T->Nonterminals[SID].Name, SID);
-
-    // Convert the rules.
-    T->Rules.reserve(Specs.size());
-    std::vector<SymbolID> Symbols;
-    auto Lookup = [SymbolIds](llvm::StringRef Name) {
-      auto It = SymbolIds.find(Name);
-      assert(It != SymbolIds.end() && "Didn't find the symbol in SymbolIds!");
-      return It->second;
-    };
-    for (const auto &Spec : Specs) {
-      assert(Spec.Sequence.size() <= Rule::MaxElements);
-      Symbols.clear();
-      for (const RuleSpec::Element &Elt : Spec.Sequence)
-        Symbols.push_back(Lookup(Elt.Symbol));
-      T->Rules.push_back(Rule(Lookup(Spec.Target), Symbols));
-    }
-    assert(T->Rules.size() < (1 << RuleBits) &&
-           "Too many rules to fit in RuleID bits!");
-    const auto &SymbolOrder = getTopologicalOrder(T.get());
-    llvm::stable_sort(
-        T->Rules, [&SymbolOrder](const Rule &Left, const Rule &Right) {
-          // Sorted by the topological order of the nonterminal Target.
-          return SymbolOrder[Left.Target] < SymbolOrder[Right.Target];
-        });
-    for (SymbolID SID = 0; SID < T->Nonterminals.size(); ++SID) {
-      auto StartIt = llvm::partition_point(T->Rules, [&](const Rule &R) {
-        return SymbolOrder[R.Target] < SymbolOrder[SID];
-      });
-      RuleID Start = StartIt - T->Rules.begin();
-      RuleID End = Start;
-      while (End < T->Rules.size() && T->Rules[End].Target == SID)
-        ++End;
-      T->Nonterminals[SID].RuleRange = {Start, End};
-    }
-    auto G = std::make_unique<Grammar>(std::move(T));
-    diagnoseGrammar(*G);
-    return G;
-  }
-
-  // Gets topological order for nonterminal symbols.
-  //
-  // The topological order is defined as: if a *single* nonterminal A produces
-  // (or transitively) a nonterminal B (that said, there is a production rule
-  // B := A), then A is less than B.
-  //
-  // It returns the sort key for each symbol, the array is indexed by SymbolID.
-  std::vector<unsigned> getTopologicalOrder(GrammarTable *T) {
-    std::vector<std::pair<SymbolID, SymbolID>> Dependencies;
-    for (const auto &Rule : T->Rules) {
-      // if A := B, A depends on B.
-      if (Rule.Size == 1 && pseudo::isNonterminal(Rule.Sequence[0]))
-        Dependencies.push_back({Rule.Target, Rule.Sequence[0]});
-    }
-    llvm::sort(Dependencies);
-    std::vector<SymbolID> Order;
-    // Each nonterminal state flows: NotVisited -> Visiting -> Visited.
-    enum State {
-      NotVisited,
-      Visiting,
-      Visited,
-    };
-    std::vector<State> VisitStates(T->Nonterminals.size(), NotVisited);
-    std::function<void(SymbolID)> DFS = [&](SymbolID SID) -> void {
-      if (VisitStates[SID] == Visited)
-        return;
-      if (VisitStates[SID] == Visiting) {
-        Diagnostics.push_back(
-            llvm::formatv("The grammar contains a cycle involving symbol {0}",
-                          T->Nonterminals[SID].Name));
-        return;
-      }
-      VisitStates[SID] = Visiting;
-      for (auto It = llvm::lower_bound(Dependencies,
-                                       std::pair<SymbolID, SymbolID>{SID, 0});
-           It != Dependencies.end() && It->first == SID; ++It)
-        DFS(It->second);
-      VisitStates[SID] = Visited;
-      Order.push_back(SID);
-    };
-    for (SymbolID ID = 0; ID != T->Nonterminals.size(); ++ID)
-      DFS(ID);
-    std::vector<unsigned> Result(T->Nonterminals.size(), 0);
-    for (size_t I = 0; I < Order.size(); ++I)
-      Result[Order[I]] = I;
-    return Result;
-  }
-
-private:
-  // Text representation of a BNF grammar rule.
-  struct RuleSpec {
-    llvm::StringRef Target;
-    struct Element {
-      llvm::StringRef Symbol; // Name of the symbol
-    };
-    std::vector<Element> Sequence;
-
-    std::string toString() const {
-      std::vector<llvm::StringRef> Body;
-      for (const auto &E : Sequence)
-        Body.push_back(E.Symbol);
-      return llvm::formatv("{0} := {1}", Target, llvm::join(Body, " "));
-    }
-  };
-
-  std::vector<RuleSpec> parse(llvm::StringRef Lines) {
-    std::vector<RuleSpec> Specs;
-    for (llvm::StringRef Line : llvm::split(Lines, '\n')) {
-      Line = Line.trim();
-      // Strip anything coming after the '#' (comment).
-      Line = Line.take_while([](char C) { return C != '#'; });
-      if (Line.empty())
-        continue;
-      RuleSpec Rule;
-      if (parseLine(Line, Rule))
-        Specs.push_back(std::move(Rule));
-    }
-    return Specs;
-  }
-
-  bool parseLine(llvm::StringRef Line, RuleSpec &Out) {
-    auto Parts = Line.split(":=");
-    if (Parts.first == Line) { // no separator in Line
-      Diagnostics.push_back(
-          llvm::formatv("Failed to parse '{0}': no separator :=", Line).str());
-      return false;
-    }
-
-    Out.Target = Parts.first.trim();
-    Out.Sequence.clear();
-    for (llvm::StringRef Chunk : llvm::split(Parts.second, ' ')) {
-      Chunk = Chunk.trim();
-      if (Chunk.empty())
-        continue; // skip empty
-
-      Out.Sequence.push_back({Chunk});
-    }
-    return true;
-  };
-
-  // Inlines all _opt symbols.
-  // For example, a rule E := id +_opt id, after elimination, we have two
-  // equivalent rules:
-  //   1) E := id + id
-  //   2) E := id id
-  std::vector<RuleSpec> eliminateOptional(llvm::ArrayRef<RuleSpec> Input) {
-    std::vector<RuleSpec> Results;
-    std::vector<RuleSpec::Element> Storage;
-    for (const auto &R : Input) {
-      eliminateOptionalTail(
-          R.Sequence, Storage, [&Results, &Storage, &R, this]() {
-            if (Storage.empty()) {
-              Diagnostics.push_back(
-                  llvm::formatv("Rule '{0}' has a nullable RHS", R.toString()));
-              return;
-            }
-            Results.push_back({R.Target, Storage});
-          });
-      assert(Storage.empty());
-    }
-    return Results;
-  }
-  void eliminateOptionalTail(llvm::ArrayRef<RuleSpec::Element> Elements,
-                             std::vector<RuleSpec::Element> &Result,
-                             llvm::function_ref<void()> CB) {
-    if (Elements.empty())
-      return CB();
-    auto Front = Elements.front();
-    if (!Front.Symbol.endswith(OptSuffix)) {
-      Result.push_back(std::move(Front));
-      eliminateOptionalTail(Elements.drop_front(1), Result, CB);
-      Result.pop_back();
-      return;
-    }
-    // Enumerate two options: skip the opt symbol, or inline the symbol.
-    eliminateOptionalTail(Elements.drop_front(1), Result, CB); // skip
-    Front.Symbol = Front.Symbol.drop_back(OptSuffix.size());   // drop "_opt"
-    Result.push_back(std::move(Front));
-    eliminateOptionalTail(Elements.drop_front(1), Result, CB);
-    Result.pop_back();
-  }
-
-  // Diagnoses the grammar and emit warnings if any.
-  void diagnoseGrammar(const Grammar &G) {
-    const auto &T = G.table();
-    for (SymbolID SID = 0; SID < T.Nonterminals.size(); ++SID) {
-      auto Range = T.Nonterminals[SID].RuleRange;
-      if (Range.Start == Range.End)
-        Diagnostics.push_back(
-            llvm::formatv("No rules for nonterminal: {0}", G.symbolName(SID)));
-      llvm::StringRef NameRef = T.Nonterminals[SID].Name;
-      if (llvm::all_of(NameRef, llvm::isAlpha) && NameRef.upper() == NameRef) {
-        Diagnostics.push_back(llvm::formatv(
-            "Token-like name {0} is used as a nonterminal", G.symbolName(SID)));
-      }
-    }
-    for (RuleID RID = 0; RID + 1u < T.Rules.size(); ++RID) {
-      if (T.Rules[RID] == T.Rules[RID + 1])
-        Diagnostics.push_back(
-            llvm::formatv("Duplicate rule: `{0}`", G.dumpRule(RID)));
-      // Warning for nullable nonterminals
-      if (T.Rules[RID].Size == 0)
-        Diagnostics.push_back(
-            llvm::formatv("Rule `{0}` has a nullable RHS", G.dumpRule(RID)));
-    }
-    // symbol-id -> used counts
-    std::vector<unsigned> UseCounts(T.Nonterminals.size(), 0);
-    for (const Rule &R : T.Rules)
-      for (SymbolID SID : R.seq())
-        if (isNonterminal(SID))
-          ++UseCounts[SID];
-    for (SymbolID SID = 0; SID < UseCounts.size(); ++SID)
-      if (UseCounts[SID] == 0 && T.Nonterminals[SID].Name != StartSymbol)
-        Diagnostics.push_back(
-            llvm::formatv("Nonterminal never used: {0}", G.symbolName(SID)));
-  }
-  std::vector<std::string> &Diagnostics;
-};
-} // namespace
-
-std::unique_ptr<Grammar>
-Grammar::parseBNF(llvm::StringRef BNF, std::vector<std::string> &Diagnostics) {
-  Diagnostics.clear();
-  return GrammarBuilder(Diagnostics).build(BNF);
-}
-
-} // namespace pseudo
-} // namespace clang
Index: clang-tools-extra/pseudo/lib/Grammar.cpp
===================================================================
--- /dev/null
+++ clang-tools-extra/pseudo/lib/Grammar.cpp
@@ -1,187 +0,0 @@
-//===--- Grammar.cpp - Grammar for clang pseudoparser  -----------*- C++-*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "clang-pseudo/Grammar.h"
-#include "clang/Basic/TokenKinds.h"
-#include "llvm/ADT/ArrayRef.h"
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/StringRef.h"
-#include "llvm/Support/FormatVariadic.h"
-#include "llvm/Support/raw_ostream.h"
-
-namespace clang {
-namespace pseudo {
-
-Rule::Rule(SymbolID Target, llvm::ArrayRef<SymbolID> Sequence)
-    : Target(Target), Size(static_cast<uint8_t>(Sequence.size())) {
-  assert(Sequence.size() <= Rule::MaxElements);
-  llvm::copy(Sequence, this->Sequence);
-}
-
-Grammar::Grammar(std::unique_ptr<GrammarTable> Table) : T(std::move(Table)) {
-  Underscore = *findNonterminal("_");
-}
-
-llvm::ArrayRef<Rule> Grammar::rulesFor(SymbolID SID) const {
-  assert(isNonterminal(SID));
-  const auto &R = T->Nonterminals[SID].RuleRange;
-  assert(R.End <= T->Rules.size());
-  return llvm::makeArrayRef(&T->Rules[R.Start], R.End - R.Start);
-}
-
-const Rule &Grammar::lookupRule(RuleID RID) const {
-  assert(RID < T->Rules.size());
-  return T->Rules[RID];
-}
-
-llvm::StringRef Grammar::symbolName(SymbolID SID) const {
-  if (isToken(SID))
-    return T->Terminals[symbolToToken(SID)];
-  return T->Nonterminals[SID].Name;
-}
-
-llvm::Optional<SymbolID> Grammar::findNonterminal(llvm::StringRef Name) const {
-  auto It = llvm::partition_point(
-      T->Nonterminals,
-      [&](const GrammarTable::Nonterminal &X) { return X.Name < Name; });
-  if (It != T->Nonterminals.end() && It->Name == Name)
-    return It - T->Nonterminals.begin();
-  return llvm::None;
-}
-
-std::string Grammar::dumpRule(RuleID RID) const {
-  std::string Result;
-  llvm::raw_string_ostream OS(Result);
-  const Rule &R = T->Rules[RID];
-  OS << symbolName(R.Target) << " :=";
-  for (SymbolID SID : R.seq())
-    OS << " " << symbolName(SID);
-  return Result;
-}
-
-std::string Grammar::dumpRules(SymbolID SID) const {
-  assert(isNonterminal(SID));
-  std::string Result;
-  const auto &Range = T->Nonterminals[SID].RuleRange;
-  for (RuleID RID = Range.Start; RID < Range.End; ++RID)
-    Result.append(dumpRule(RID)).push_back('\n');
-  return Result;
-}
-
-std::string Grammar::dump() const {
-  std::string Result;
-  llvm::raw_string_ostream OS(Result);
-  OS << "Nonterminals:\n";
-  for (SymbolID SID = 0; SID < T->Nonterminals.size(); ++SID)
-    OS << llvm::formatv("  {0} {1}\n", SID, symbolName(SID));
-  OS << "Rules:\n";
-  for (RuleID RID = 0; RID < T->Rules.size(); ++RID)
-    OS << llvm::formatv("  {0} {1}\n", RID, dumpRule(RID));
-  return OS.str();
-}
-
-std::vector<llvm::DenseSet<SymbolID>> firstSets(const Grammar &G) {
-  std::vector<llvm::DenseSet<SymbolID>> FirstSets(
-      G.table().Nonterminals.size());
-  auto ExpandFirstSet = [&FirstSets](SymbolID Target, SymbolID First) {
-    assert(isNonterminal(Target));
-    if (isToken(First))
-      return FirstSets[Target].insert(First).second;
-    bool Changed = false;
-    for (SymbolID SID : FirstSets[First])
-      Changed |= FirstSets[Target].insert(SID).second;
-    return Changed;
-  };
-
-  // A rule S := T ... implies elements in FIRST(S):
-  //  - if T is a terminal, FIRST(S) contains T
-  //  - if T is a nonterminal, FIRST(S) contains FIRST(T)
-  // Since FIRST(T) may not have been fully computed yet, FIRST(S) itself may
-  // end up being incomplete.
-  // We iterate until we hit a fixed point.
-  // (This isn't particularly efficient, but table building isn't on the
-  // critical path).
-  bool Changed = true;
-  while (Changed) {
-    Changed = false;
-    for (const auto &R : G.table().Rules)
-      // We only need to consider the first element because symbols are
-      // non-nullable.
-      Changed |= ExpandFirstSet(R.Target, R.seq().front());
-  }
-  return FirstSets;
-}
-
-std::vector<llvm::DenseSet<SymbolID>> followSets(const Grammar &G) {
-  auto FirstSets = firstSets(G);
-  std::vector<llvm::DenseSet<SymbolID>> FollowSets(
-      G.table().Nonterminals.size());
-  // Expand the follow set of a nonterminal symbol Y by adding all from the
-  // given symbol set.
-  auto ExpandFollowSet = [&FollowSets](SymbolID Y,
-                                       const llvm::DenseSet<SymbolID> &ToAdd) {
-    assert(isNonterminal(Y));
-    bool Changed = false;
-    for (SymbolID F : ToAdd)
-      Changed |= FollowSets[Y].insert(F).second;
-    return Changed;
-  };
-  // Follow sets is computed based on the following 3 rules, the computation
-  // is completed at a fixed point where there is no more new symbols can be
-  // added to any of the follow sets.
-  //
-  // Rule 1: add endmarker to the FOLLOW(S), where S is the start symbol of the
-  // augmented grammar, in our case it is '_'.
-  FollowSets[G.underscore()].insert(tokenSymbol(tok::eof));
-  bool Changed = true;
-  while (Changed) {
-    Changed = false;
-    for (const auto &R : G.table().Rules) {
-      // Rule 2: for a rule X := ... Y Z, we add all symbols from FIRST(Z) to
-      // FOLLOW(Y).
-      for (size_t I = 0; I + 1 < R.seq().size(); ++I) {
-        if (isToken(R.seq()[I]))
-          continue;
-        // We only need to consider the next symbol because symbols are
-        // non-nullable.
-        SymbolID Next = R.seq()[I + 1];
-        if (isToken(Next))
-          // First set for a terminal is itself.
-          Changed |= ExpandFollowSet(R.seq()[I], {Next});
-        else
-          Changed |= ExpandFollowSet(R.seq()[I], FirstSets[Next]);
-      }
-      // Rule 3: for a rule X := ... Z, we add all symbols from FOLLOW(X) to
-      // FOLLOW(Z).
-      SymbolID Z = R.seq().back();
-      if (isNonterminal(Z))
-        Changed |= ExpandFollowSet(Z, FollowSets[R.Target]);
-    }
-  }
-  return FollowSets;
-}
-
-static llvm::ArrayRef<std::string> getTerminalNames() {
-  static const std::vector<std::string> *TerminalNames = []() {
-    static std::vector<std::string> TerminalNames;
-    TerminalNames.reserve(NumTerminals);
-    for (unsigned I = 0; I < NumTerminals; ++I) {
-      tok::TokenKind K = static_cast<tok::TokenKind>(I);
-      if (const auto *Punc = tok::getPunctuatorSpelling(K))
-        TerminalNames.push_back(Punc);
-      else
-        TerminalNames.push_back(llvm::StringRef(tok::getTokenName(K)).upper());
-    }
-    return &TerminalNames;
-  }();
-  return *TerminalNames;
-}
-GrammarTable::GrammarTable() : Terminals(getTerminalNames()) {}
-
-} // namespace pseudo
-} // namespace clang
Index: clang-tools-extra/pseudo/lib/grammar/CMakeLists.txt
===================================================================
--- /dev/null
+++ clang-tools-extra/pseudo/lib/grammar/CMakeLists.txt
@@ -0,0 +1,18 @@
+set(LLVM_LINK_COMPONENTS Support)
+
+# This library intents to keep as minimal dependencies as possible, it is a base
+# library of the cxx generator, to avoid creating long dep paths in the build
+# graph.
+add_clang_library(clangPseudoGrammar
+  Grammar.cpp
+  GrammarBNF.cpp
+  LRGraph.cpp
+  LRTable.cpp
+  LRTableBuild.cpp
+
+  # FIXME: can we get rid of the clangBasic dependency? We need it for the
+  # clang::tok::getTokenName and clang::tok::getPunctuatorSpelling functions, we
+  # could consider remimplement these functions.
+  LINK_LIBS
+  clangBasic
+  )
Index: clang-tools-extra/pseudo/lib/cxx/CXX.cpp
===================================================================
--- /dev/null
+++ clang-tools-extra/pseudo/lib/cxx/CXX.cpp
@@ -0,0 +1,34 @@
+//===--- CXX.cpp - Define public interfaces for C++ grammar ---------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "clang-pseudo/cxx/CXX.h"
+#include "clang-pseudo/LRTable.h"
+
+namespace clang {
+namespace pseudo {
+namespace cxx {
+
+static const char *CXXBNF =
+#include "CXXBNF.inc"
+    ;
+
+const Grammar &getGrammar() {
+  static std::vector<std::string> Diags;
+  static Grammar *G = Grammar::parseBNF(CXXBNF, Diags).release();
+  assert(Diags.empty());
+  return *G;
+}
+
+const LRTable &getLRTable() {
+  static LRTable *Table = new LRTable(LRTable::buildSLR(getGrammar()));
+  return *Table;
+}
+
+} // namespace cxx
+} // namespace pseudo
+} // namespace clang
Index: clang-tools-extra/pseudo/lib/cxx/CMakeLists.txt
===================================================================
--- /dev/null
+++ clang-tools-extra/pseudo/lib/cxx/CMakeLists.txt
@@ -0,0 +1,9 @@
+add_clang_library(clangPseudoCXX
+  CXX.cpp
+
+  DEPENDS
+  cxx_gen
+
+  LINK_LIBS
+  clangPseudoGrammar
+  )
Index: clang-tools-extra/pseudo/lib/CMakeLists.txt
===================================================================
--- clang-tools-extra/pseudo/lib/CMakeLists.txt
+++ clang-tools-extra/pseudo/lib/CMakeLists.txt
@@ -1,3 +1,6 @@
+add_subdirectory(cxx)
+add_subdirectory(grammar)
+
 set(LLVM_LINK_COMPONENTS Support)
 
 add_clang_library(clangPseudo
@@ -5,15 +8,11 @@
   DirectiveTree.cpp
   Forest.cpp
   GLR.cpp
-  Grammar.cpp
-  GrammarBNF.cpp
   Lex.cpp
-  LRGraph.cpp
-  LRTable.cpp
-  LRTableBuild.cpp
   Token.cpp
 
   LINK_LIBS
   clangBasic
   clangLex
+  clangPseudoGrammar
   )
Index: clang-tools-extra/pseudo/include/clang-pseudo/cxx/CXX.h
===================================================================
--- /dev/null
+++ clang-tools-extra/pseudo/include/clang-pseudo/cxx/CXX.h
@@ -0,0 +1,51 @@
+//===--- CXX.h - Public interfaces for the C++ grammar -----------*- C++-*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+//  This file defines public interfaces for the C++ grammar
+//  (pseudo/lib/cxx.bnf). It provides a fast way to access core building pieces
+//  of the LR parser, e.g. Grammar, LRTable, rather than parsing the grammar
+//  file at the runtime.
+//
+//  We do a compilation of the C++ BNF grammar at build time, and generate
+//  critical data sources. The implementation of the interfaces are based on the
+//  generated data sources.
+//
+//  FIXME: not everything is fully compiled yet. The implementation of the
+//  interfaces are still parsing the grammar file at the runtime.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef CLANG_PSEUDO_CXX_CXX_H
+#define CLANG_PSEUDO_CXX_CXX_H
+
+#include "clang-pseudo/Grammar.h"
+
+namespace clang {
+namespace pseudo {
+class LRTable;
+
+namespace cxx {
+// Symbol represents nonterminal symbols in the C++ grammar.
+// It provides a simple uniform way to access a particular nonterminal.
+enum class Symbol : SymbolID {
+#define NONTERMINAL(X, Y) X = Y,
+#include "CXXSymbols.inc"
+#undef NONTERMINAL
+};
+
+// Returns the C++ grammar.
+const Grammar &getGrammar();
+// Returns the corresponding LRTable for the C++ grammar.
+const LRTable &getLRTable();
+
+} // namespace cxx
+
+} // namespace pseudo
+} // namespace clang
+
+#endif // CLANG_PSEUDO_CXX_CXX_H
Index: clang-tools-extra/pseudo/include/CMakeLists.txt
===================================================================
--- /dev/null
+++ clang-tools-extra/pseudo/include/CMakeLists.txt
@@ -0,0 +1,29 @@
+# The cxx.bnf grammar file
+set(cxx_bnf ${CMAKE_CURRENT_SOURCE_DIR}/../lib/cxx.bnf)
+
+# Generate inc files.
+set(cxx_symbols_inc ${CMAKE_CURRENT_BINARY_DIR}/CXXSymbols.inc)
+add_custom_command(OUTPUT ${cxx_symbols_inc}
+   COMMAND "${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/pseudo-gen"
+     --grammar ${cxx_bnf}
+     --emit-symbol-list
+     > ${cxx_symbols_inc}
+   COMMENT "Generating nonterminal symbol file for cxx grammar..."
+   DEPENDS pseudo-gen
+   VERBATIM)
+
+set(cxx_bnf_inc ${CMAKE_CURRENT_BINARY_DIR}/CXXBNF.inc)
+add_custom_command(OUTPUT ${cxx_bnf_inc}
+   COMMAND "${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/pseudo-gen"
+     --grammar ${cxx_bnf}
+     --emit-grammar-content
+     > ${cxx_bnf_inc}
+   COMMENT "Generating bnf string file for cxx grammar..."
+   DEPENDS pseudo-gen
+   VERBATIM)
+
+# add_custom_command does not create a new target, we need to deine a target
+# explicitly, so that other targets can depend on it.
+add_custom_target(cxx_gen
+    DEPENDS ${cxx_symbols_inc} ${cxx_bnf_inc}
+    VERBATIM)
Index: clang-tools-extra/pseudo/gen/Main.cpp
===================================================================
--- /dev/null
+++ clang-tools-extra/pseudo/gen/Main.cpp
@@ -0,0 +1,89 @@
+//===--- Main.cpp - Compile BNF grammar -----------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This is a tool to compile a BNF grammar, it is used by the build system to
+// generate a necessary data bits to statically construct core pieces (Grammar,
+// LRTable etc) of the LR parser.
+//
+//===----------------------------------------------------------------------===//
+
+#include "clang-pseudo/Grammar.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/FormatVariadic.h"
+#include "llvm/Support/MemoryBuffer.h"
+#include <algorithm>
+
+using llvm::cl::desc;
+using llvm::cl::init;
+using llvm::cl::opt;
+using llvm::cl::values;
+
+namespace {
+enum EmitType {
+  EmitSymbolList,
+  EmitGrammarContent,
+};
+
+opt<std::string> Grammar("grammar", desc("Parse a BNF grammar file."),
+                         init(""));
+opt<EmitType>
+    Emit(desc("which information to emit:"),
+         values(clEnumValN(EmitSymbolList, "emit-symbol-list",
+                           "Print nonterminal symbols (default)"),
+                clEnumValN(EmitGrammarContent, "emit-grammar-content",
+                           "Print the BNF grammar content as a string")));
+std::string readOrDie(llvm::StringRef Path) {
+  llvm::ErrorOr<std::unique_ptr<llvm::MemoryBuffer>> Text =
+      llvm::MemoryBuffer::getFile(Path);
+  if (std::error_code EC = Text.getError()) {
+    llvm::errs() << "Error: can't read grammar file '" << Path
+                 << "': " << EC.message() << "\n";
+    ::exit(1);
+  }
+  return Text.get()->getBuffer().str();
+}
+} // namespace
+
+int main(int argc, char *argv[]) {
+  llvm::cl::ParseCommandLineOptions(argc, argv, "");
+  if (!Grammar.getNumOccurrences()) {
+    llvm::errs() << "Grammar file must be provided!\n";
+    return 1;
+  }
+
+  std::string GrammarText = readOrDie(Grammar);
+  std::vector<std::string> Diags;
+  auto G = clang::pseudo::Grammar::parseBNF(GrammarText, Diags);
+
+  if (!Diags.empty()) {
+    llvm::errs() << llvm::join(Diags, "\n");
+    return 1;
+  }
+  switch (Emit) {
+
+  case EmitSymbolList:
+    for (clang::pseudo::SymbolID ID = 0; ID < G->table().Nonterminals.size();
+         ++ID) {
+      std::string Name = G->symbolName(ID).str();
+      // translation-unit -> translation_unit
+      std::replace(Name.begin(), Name.end(), '-', '_');
+      llvm::outs() << (llvm::formatv("NONTERMINAL({0}, {1})\n", Name, ID));
+    }
+    break;
+  case EmitGrammarContent:
+    for (llvm::StringRef Line : llvm::split(GrammarText, '\n')) {
+      llvm::outs() << '"';
+      llvm::outs().write_escaped((Line + "\n").str());
+      llvm::outs() << "\"\n";
+    }
+    break;
+  }
+
+  return 0;
+}
Index: clang-tools-extra/pseudo/gen/CMakeLists.txt
===================================================================
--- /dev/null
+++ clang-tools-extra/pseudo/gen/CMakeLists.txt
@@ -0,0 +1,10 @@
+set(LLVM_LINK_COMPONENTS Support)
+
+add_clang_executable(pseudo-gen
+  Main.cpp
+  )
+
+target_link_libraries(pseudo-gen
+  PRIVATE
+  clangPseudoGrammar
+  )
Index: clang-tools-extra/pseudo/CMakeLists.txt
===================================================================
--- clang-tools-extra/pseudo/CMakeLists.txt
+++ clang-tools-extra/pseudo/CMakeLists.txt
@@ -1,5 +1,7 @@
 include_directories(include)
 include_directories(${CMAKE_CURRENT_BINARY_DIR}/include)
+add_subdirectory(include)
+add_subdirectory(gen)
 add_subdirectory(lib)
 add_subdirectory(tool)
 add_subdirectory(fuzzer)

_______________________________________________
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

[PATCH] D125667: [pseudo] A basic implementation of compiling cxx grammar at build time.

Reply via email to