sammccall updated this revision to Diff 418127.
sammccall added a comment.

Smarter reduce algorithm
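
The "smarter" part is how reduces are ordered: pending reductions are grouped
into families (same start token, same produced symbol) and the families are
drained from a priority queue so that reductions covering fewer tokens finish
before the reductions that may consume their results (the real Family also
breaks ties using the rule's topological order). A minimal standalone sketch
of that ordering idea; Family and the field names below are illustrative, not
this patch's API:

  // Sketch only: demonstrates the max-heap ordering, not the real parser.
  #include <cassert>
  #include <queue>
  #include <tuple>
  #include <vector>

  struct Family {
    unsigned StartToken; // First token covered by the reduction.
    unsigned Symbol;     // Nonterminal the family produces.
    // std::priority_queue pops the *largest* element first, so a later
    // StartToken (i.e. a shorter reduction) is processed first.
    bool operator<(const Family &Other) const {
      return std::tie(StartToken, Symbol) <
             std::tie(Other.StartToken, Other.Symbol);
    }
  };

  int main() {
    std::priority_queue<Family> Pending;
    Pending.push({/*StartToken=*/0, /*Symbol=*/7}); // a long reduction
    Pending.push({/*StartToken=*/3, /*Symbol=*/4}); // a short reduction
    assert(Pending.top().StartToken == 3); // shortest reduction comes first
    return 0;
  }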


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D122408/new/

https://reviews.llvm.org/D122408

Files:
  clang-tools-extra/pseudo/include/clang-pseudo/Forest.h
  clang-tools-extra/pseudo/include/clang-pseudo/GLRParser.h
  clang-tools-extra/pseudo/lib/CMakeLists.txt
  clang-tools-extra/pseudo/lib/Forest.cpp
  clang-tools-extra/pseudo/lib/GLRParser.cpp
  clang-tools-extra/pseudo/tool/ClangPseudo.cpp

Index: clang-tools-extra/pseudo/tool/ClangPseudo.cpp
===================================================================
--- clang-tools-extra/pseudo/tool/ClangPseudo.cpp
+++ clang-tools-extra/pseudo/tool/ClangPseudo.cpp
@@ -7,6 +7,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "clang-pseudo/DirectiveMap.h"
+#include "clang-pseudo/GLRParser.h"
 #include "clang-pseudo/Grammar.h"
 #include "clang-pseudo/LRGraph.h"
 #include "clang-pseudo/LRTable.h"
@@ -35,6 +36,8 @@
 static opt<bool>
     PrintDirectiveMap("print-directive-map",
                       desc("Print directive structure of source code"));
+static opt<bool> PrintStats("print-stats", desc("Print processing statistics"));
+static opt<bool> PrintForest("print-forest", desc("Print parse forest"));
 
 static std::string readOrDie(llvm::StringRef Path) {
   llvm::ErrorOr<std::unique_ptr<llvm::MemoryBuffer>> Text =
@@ -50,6 +53,29 @@
 int main(int argc, char *argv[]) {
   llvm::cl::ParseCommandLineOptions(argc, argv, "");
 
+  clang::LangOptions LangOpts; // FIXME: use real options.
+  LangOpts.CPlusPlus = 1;
+  llvm::Optional<clang::pseudo::TokenStream> RawStream;
+  llvm::Optional<clang::pseudo::DirectiveMap> DirectiveStructure;
+  llvm::Optional<clang::pseudo::TokenStream> ParseableStream;
+  if (Source.getNumOccurrences()) {
+    std::string Text = readOrDie(Source);
+    RawStream = clang::pseudo::lex(Text, LangOpts);
+
+    if (PrintSource)
+      RawStream->print(llvm::outs());
+    if (PrintTokens)
+      llvm::outs() << *RawStream;
+
+    DirectiveStructure = clang::pseudo::DirectiveMap::parse(*RawStream);
+    if (PrintDirectiveMap)
+      llvm::outs() << *DirectiveStructure;
+
+    ParseableStream = clang::pseudo::stripComments(cook(*RawStream, LangOpts));
+    if (PrintTokens)
+      llvm::outs() << "Cooked:\n" << *ParseableStream;
+  }
+
   if (Grammar.getNumOccurrences()) {
     std::string Text = readOrDie(Grammar);
     std::vector<std::string> Diags;
@@ -65,23 +91,30 @@
       llvm::outs() << G->dump();
     if (PrintGraph)
       llvm::outs() << clang::pseudo::LRGraph::buildLR0(*G).dumpForTests(*G);
+    auto LRTable = clang::pseudo::LRTable::buildSLR(*G);
     if (PrintTable)
-      llvm::outs() << clang::pseudo::LRTable::buildSLR(*G).dumpForTests(*G);
-    return 0;
-  }
+      llvm::outs() << LRTable.dumpForTests(*G);
 
-  if (Source.getNumOccurrences()) {
-    std::string Text = readOrDie(Source);
-    clang::LangOptions LangOpts; // FIXME: use real options.
-    auto Stream = clang::pseudo::lex(Text, LangOpts);
-    auto Structure = clang::pseudo::DirectiveMap::parse(Stream);
+    if (ParseableStream) {
+      clang::pseudo::ForestArena Arena;
+      clang::pseudo::GLRParser Parser(LRTable, *G, Arena);
+      const auto *Root = Parser.parse(Arena.createTerminals(*ParseableStream));
+      if (Root) {
+        llvm::outs() << "parsed successfully!\n";
+        if (PrintForest)
+          llvm::outs() << Root->dumpRecursive(*G, /*Abbreviated=*/true);
+      } else {
+        llvm::outs() << "parse failed!\n";
+      }
+      if (PrintStats) {
+        llvm::outs() << "Forest bytes: " << Arena.bytes()
+                     << " nodes: " << Arena.nodeCount() << "\n";
+        llvm::outs() << "GSS bytes: " << Parser.getGSS().bytes()
+                     << " nodes: " << Parser.getGSS().nodeCount() << "\n";
+      }
+    }
 
-    if (PrintDirectiveMap)
-      llvm::outs() << Structure;
-    if (PrintSource)
-      Stream.print(llvm::outs());
-    if (PrintTokens)
-      llvm::outs() << Stream;
+    return 0;
   }
 
   return 0;
Index: clang-tools-extra/pseudo/lib/GLRParser.cpp
===================================================================
--- /dev/null
+++ clang-tools-extra/pseudo/lib/GLRParser.cpp
@@ -0,0 +1,332 @@
+//===--- GLRParser.cpp -------------------------------------------*- C++-*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "clang-pseudo/GLRParser.h"
+#include "clang-pseudo/Grammar.h"
+#include "clang-pseudo/LRTable.h"
+#include "clang/Basic/TokenKinds.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/FormatVariadic.h"
+#include <algorithm>
+#include <memory>
+#include <queue>
+#include <tuple>
+
+#define DEBUG_TYPE "GLRParser.cpp"
+
+namespace clang {
+namespace pseudo {
+
+using StateID = LRTable::StateID;
+
+const ForestNode *GLRParser::parse(llvm::ArrayRef<ForestNode> Terminals) {
+  std::vector<const GSS::Node *> NewHeads = {
+      GSS.addNode(/*State=*/0, nullptr, {})};
+  for (const ForestNode &Terminal : Terminals) {
+    LLVM_DEBUG(llvm::dbgs() << llvm::formatv("Next token {0} (id={1})\n",
+                                             G.symbolName(Terminal.symbol()),
+                                             Terminal.symbol()));
+    for (const auto &AS : NewHeads)
+      addActions(AS, Terminal.symbol());
+    NewHeads.clear();
+    performReduction(Terminal.symbol());
+    performShift(&Terminal, &NewHeads);
+  }
+  LLVM_DEBUG(llvm::dbgs() << llvm::formatv("Next is eof\n"));
+  for (const auto &AS : NewHeads)
+    addActions(AS, tokenSymbol(tok::eof));
+  performReduction(tokenSymbol(tok::eof));
+
+  // FIXME: recover if we don't accept, don't return null!
+  if (!PendingAccept.empty()) {
+    assert(PendingAccept.size() == 1);
+    LLVM_DEBUG(llvm::dbgs() << llvm::formatv("Accept: {0} accepted results:\n",
+                                             PendingAccept.size()));
+    for (const auto &A : PendingAccept)
+      LLVM_DEBUG(llvm::dbgs()
+                 << "  - " << G.symbolName(A.Head->Payload->symbol()) << "\n");
+    return PendingAccept.front().Head->Payload;
+  }
+  return nullptr;
+}
+
+void GLRParser::performShift(const ForestNode *Tok,
+                             std::vector<const GSS::Node *> *NewHeads) {
+  assert(Tok->kind() == ForestNode::Terminal);
+  assert(PendingReduce.empty() &&
+         "Reduce actions must be performed before shift actions");
+  LLVM_DEBUG(llvm::dbgs() << llvm::formatv("  Shift {0} ({1} active heads):\n",
+                                           G.symbolName(Tok->symbol()),
+                                           PendingShift.size()));
+
+  // Merge the stack -- if multiple stack heads will reach the same state after
+  // shifting a token, we shift only once by combining these heads.
+  //
+  // E.g. we have two heads (2, 3) in the GSS, and will shift both to reach 4:
+  //   0---1---2
+  //       └---3
+  // After the shift action, the GSS is:
+  //   0---1---2---4
+  //       └---3---┘
+  //
+  // We group pending shifts by their target state so we can merge them.
+  llvm::stable_sort(PendingShift, [](const Step &L, const Step &R) {
+    return L.Action.getShiftState() < R.Action.getShiftState();
+  });
+  auto Rest = llvm::makeArrayRef(PendingShift);
+  while (!Rest.empty()) {
+    // Collect the batch of PendingShift that have compatible shift states.
+    // Their heads become TempParents, the parents of the new GSS node.
+    StateID NextState = Rest.front().Action.getShiftState();
+    auto &Parents = TempGSSNodes;
+    for (Parents.clear();
+         !Rest.empty() && Rest.front().Action.getShiftState() == NextState;
+         Rest = Rest.drop_front()) {
+      assert(!llvm::is_contained(Parents, Rest.front().Head) &&
+             "shift: duplicated stack heads!");
+      Parents.push_back(Rest.front().Head);
+    }
+    LLVM_DEBUG(llvm::dbgs() << llvm::formatv("    --> S{0} ({1} heads)\n",
+                                             NextState, Parents.size()));
+    NewHeads->push_back(GSS.addNode(NextState, Tok, Parents));
+  }
+  PendingShift.clear();
+}
+
+namespace {
+// A KeyedQueue yields pairs of keys and values in key order, largest first.
+template <typename Key, typename Value>
+using KeyedQueue =
+    std::priority_queue<std::pair<Key, Value>,
+                        std::vector<std::pair<Key, Value>>, llvm::less_first>;
+
+template <typename T> void sortAndUnique(std::vector<T> &Vec) {
+  llvm::sort(Vec);
+  Vec.erase(std::unique(Vec.begin(), Vec.end()), Vec.end());
+}
+
+} // namespace
+
+// Perform reduces until no more are possible.
+//
+// Generally this means walking up from the heads, gathering the ForestNodes
+// that match the RHS of the rule we're reducing, combining them into a
+// sequence ForestNode, and ending up at a base node.
+// Then we push a new GSS node onto that base, taking care to:
+//  - pack alternative sequence ForestNodes into an ambiguous ForestNode.
+//  - use the same GSS node for multiple heads if the parse state matches.
+//
+// Examples of reduction:
+//   Before (simple):
+//     0--1(expr)--2(semi)
+//   After reducing 2 by `stmt := expr semi`:
+//     0--3(stmt)                // 3 is goto(0, stmt)
+//
+//   Before (splitting due to R/R conflict):
+//     0--1(IDENTIFIER)
+//   After reducing 1 by `class-name := IDENTIFIER` & `enum-name := IDENTIFIER`:
+//     0--2(class-name)          // 2 is goto(0, class-name)
+//     └--3(enum-name)           // 3 is goto(0, enum-name)
+//
+//   Before (splitting due to multiple bases):
+//     0--2(class-name)--4(STAR)
+//     └--3(enum-name)---┘
+//   After reducing 4 by ptr-operator := STAR:
+//     0--2(class-name)--5(ptr-operator)    // 5 is goto(2, ptr-operator)
+//     └--3(enum-name)---6(ptr-operator)    // 6 is goto(3, ptr-operator)
+//
+//   Before (joining due to same goto state):
+//     0--1(A)--3(C)
+//     └--2(B)--4(D)
+//   After reducing 3 by E := C and 4 by E := D
+//     0--1(A)--5(E)             // 5 is goto(1, E) and goto(2, E)
+//     └--2(B)--┘
+//
+//   Before (joining due to same base):
+//     0--1(A)--3(STAR)
+//     └--2(B)--4(STAR)
+//   After reducing 3 by C := A STAR and 4 by C := B STAR:
+//     0--5(C)                   // 5 is goto(0, C)
+void GLRParser::performReduction(SymbolID NextTok) {
+  // There are two interacting complications:
+  // 1.  Performing one reduce can unlock new reduces on the newly-created head.
+  // 2a. The ambiguous ForestNodes must be complete (have all sequence nodes).
+  //     This means we must have unlocked all the reduces that contribute to it.
+  // 2b. Similarly, the new GSS nodes must be complete (have all parents).
+  //
+  // We define a "family" of reduces as those that produce the same symbol and
+  // cover the same range of tokens. These are exactly the set of reductions
+  // whose sequence nodes would be covered by the same ambiguous node.
+  // We wish to process a whole family at a time (to satisfy complication 2),
+  // and can address complication 1 by carefully ordering the families:
+  // - Process families covering fewer tokens first.
+  //   A reduce can't depend on a longer reduce!
+  // - For equal token ranges: if S := T, process T families before S families.
+  //   Parsing T can't depend on an equal-length S, as the grammar is acyclic.
+  //
+  // This isn't quite enough: we don't know the token length of the reduction
+  // until we walk up the stack to perform the pop.
+  // So we perform the pop part upfront, and place the push specification on
+  // priority queues such that we can retrieve a family at a time.
+
+  // A reduction family is characterized by its token range and symbol produced.
+  // It is used as a key in the priority queues to group pushes by family.
+  struct Family {
+    Token::Index Start;
+    SymbolID Symbol;
+    // Rule must produce Symbol and can otherwise be arbitrary.
+    // RuleIDs have the topological order based on the acyclic grammar.
+    // FIXME: should SymbolIDs be so ordered instead?
+    RuleID Rule;
+
+    bool operator==(const Family &Other) const {
+      return Start == Other.Start && Symbol == Other.Symbol;
+    }
+    // The larger Family is the one that should be processed first.
+    bool operator<(const Family &Other) const {
+      if (Start != Other.Start)
+        return Start < Other.Start;
+      if (Symbol != Other.Symbol)
+        return Rule > Other.Rule;
+      assert(*this == Other);
+      return false;
+    }
+  };
+
+  // The base nodes are the heads after popping the GSS nodes we are reducing.
+  // We don't care which rule yielded each base. If Family.Symbol is S, the
+  // base includes an item X := ... • S ... and since the grammar is
+  // context-free, *all* parses of S are valid here.
+  // FIXME: reuse the queues across calls instead of reallocating.
+  KeyedQueue<Family, const GSS::Node *> Bases;
+
+  // A sequence is the ForestNode payloads of the GSS nodes we are reducing.
+  // These are the RHS of the rule; the RuleID is stored in the Family.
+  // They specify a sequence ForestNode we may build (but we dedup first).
+  using Sequence = llvm::SmallVector<const ForestNode *, Rule::MaxElements>;
+  KeyedQueue<Family, Sequence> Sequences;
+
+  Sequence TempSequence;
+  // Pop walks up the parent chain(s) for a reduction from Head by rule RID.
+  // Once we reach the end, record the bases and sequences.
+  auto Pop = [&](const GSS::Node *Head, RuleID RID) {
+    LLVM_DEBUG(llvm::dbgs() << "  Pop " << G.dumpRule(RID) << "\n");
+    const auto &Rule = G.lookupRule(RID);
+    Family F{/*Start=*/0, /*Symbol=*/Rule.Target, /*Rule=*/RID};
+    TempSequence.resize_for_overwrite(Rule.Size);
+    auto DFS = [&](const GSS::Node *N, unsigned I, auto &DFS) {
+      if (I == Rule.Size) {
+        F.Start = TempSequence.front()->startTokenIndex();
+        Bases.emplace(F, N);
+        LLVM_DEBUG(llvm::dbgs() << "    --> base at S" << N->State << "\n");
+        Sequences.emplace(F, TempSequence);
+        return;
+      }
+      TempSequence[Rule.Size - 1 - I] = N->Payload;
+      for (const GSS::Node *Parent : N->parents())
+        DFS(Parent, I + 1, DFS);
+    };
+    DFS(Head, 0, DFS);
+  };
+  auto PopPending = [&] {
+    for (const Step &Pending : PendingReduce)
+      Pop(Pending.Head, Pending.Action.getReduceRule());
+    PendingReduce.clear();
+  };
+
+  std::vector<std::pair</*Goto*/ StateID, const GSS::Node *>> FamilyBases;
+  std::vector<std::pair<RuleID, Sequence>> FamilySequences;
+  // Main reduction loop:
+  //  - pop as much as we can
+  //  - push one family
+  //  - this produces new heads which may enable more pops
+  PopPending();
+  while (!Bases.empty()) {
+    // We should always have bases and sequences for the same families.
+    Family F = Bases.top().first;
+    assert(!Sequences.empty());
+    assert(Sequences.top().first == F);
+
+    LLVM_DEBUG(llvm::dbgs() << "  Push " << G.symbolName(F.Symbol)
+                            << " from token " << F.Start << "\n");
+
+    // Grab the sequences for this family.
+    FamilySequences.clear();
+    do {
+      FamilySequences.emplace_back(Sequences.top().first.Rule,
+                                   Sequences.top().second);
+      Sequences.pop();
+    } while (!Sequences.empty() && Sequences.top().first == F);
+    // Build a forest node for each unique sequence.
+    sortAndUnique(FamilySequences);
+    auto &SequenceNodes = TempForestNodes;
+    SequenceNodes.clear();
+    for (const auto &SequenceSpec : FamilySequences)
+      SequenceNodes.push_back(&Forest.createSequence(
+          F.Symbol, SequenceSpec.first, SequenceSpec.second));
+    // Wrap in an ambiguous node if needed.
+    const ForestNode *Parsed =
+        SequenceNodes.size() == 1
+            ? SequenceNodes.front()
+            : &Forest.createAmbiguous(F.Symbol, SequenceNodes);
+    LLVM_DEBUG(llvm::dbgs() << "    --> " << Parsed->dump(G) << "\n");
+
+    // Grab the bases for this family.
+    // As well as deduplicating them, we'll group by the goto state.
+    FamilyBases.clear();
+    do {
+      FamilyBases.emplace_back(
+          ParsingTable.getGoToState(Bases.top().second->State, F.Symbol),
+          Bases.top().second);
+      Bases.pop();
+    } while (!Bases.empty() && Bases.top().first == F);
+    sortAndUnique(FamilyBases);
+    // Create a GSS node for each unique goto state.
+    llvm::ArrayRef<decltype(FamilyBases)::value_type> BasesLeft = FamilyBases;
+    while (!BasesLeft.empty()) {
+      StateID NextState = BasesLeft.front().first;
+      auto &Parents = TempGSSNodes;
+      Parents.clear();
+      for (const auto &Base : BasesLeft) {
+        if (Base.first != NextState)
+          break;
+        Parents.push_back(Base.second);
+      }
+      BasesLeft = BasesLeft.drop_front(Parents.size());
+
+      const GSS::Node *NewHead = GSS.addNode(NextState, Parsed, Parents);
+      addActions(NewHead, NextTok);
+    }
+    PopPending();
+  }
+  assert(Sequences.empty());
+}
+
+void GLRParser::addActions(const GSS::Node *Head, SymbolID NextTok) {
+  for (const auto &Action : ParsingTable.getActions(Head->State, NextTok)) {
+    switch (Action.kind()) {
+    case LRTable::Action::Shift:
+      PendingShift.push_back({Head, Action});
+      break;
+    case LRTable::Action::Reduce:
+      PendingReduce.push_back({Head, Action});
+      break;
+    case LRTable::Action::Accept:
+      PendingAccept.push_back({Head, Action});
+      break;
+    default:
+      llvm_unreachable("unexpected action kind!");
+    }
+  }
+}
+
+} // namespace pseudo
+} // namespace clang
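
An aside on the Pop lambda in performReduction above: it uses the "pass the
lambda to itself" recursion idiom to walk every parent chain of Rule.Size
nodes, filling the sequence right-to-left and reporting each base it reaches.
A standalone sketch of that idiom; the Node type and payload values below are
illustrative, not this patch's GSS:

  #include <cstdio>
  #include <vector>

  struct Node {
    int Payload;
    std::vector<const Node *> Parents;
  };

  int main() {
    // A tiny GSS-like DAG with a join: 3 -> {1, 2} -> 0.
    Node N0{0, {}}, N1{1, {&N0}}, N2{2, {&N0}}, N3{3, {&N1, &N2}};

    const unsigned RuleSize = 2; // Pretend the rule has two RHS symbols.
    std::vector<int> Sequence(RuleSize);
    auto DFS = [&](const Node *N, unsigned I, auto &DFS) -> void {
      if (I == RuleSize) {
        // N is the "base" left after popping RuleSize symbols.
        std::printf("base=%d sequence=[%d, %d]\n", N->Payload, Sequence[0],
                    Sequence[1]);
        return;
      }
      Sequence[RuleSize - 1 - I] = N->Payload; // Fill right-to-left.
      for (const Node *Parent : N->Parents)
        DFS(Parent, I + 1, DFS);
    };
    DFS(&N3, 0, DFS);
    // Prints: base=0 sequence=[1, 3] then base=0 sequence=[2, 3].
    return 0;
  }
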
Index: clang-tools-extra/pseudo/lib/Forest.cpp
===================================================================
--- clang-tools-extra/pseudo/lib/Forest.cpp
+++ clang-tools-extra/pseudo/lib/Forest.cpp
@@ -46,15 +46,23 @@
       };
   CountVisits(this);
 
+  // The box-drawing characters to add as each child is rendered.
+  struct LineDecoration {
+    std::string Prefix;         // Prepended to every line.
+    llvm::StringRef First;      // added to the child's line.
+    llvm::StringRef Subsequent; // added to descendants' lines.
+  };
+
   // We print a "#<id>" for nonterminal forest nodes that are being dumped
   // multiple times.
   llvm::DenseMap<const ForestNode *, size_t> ReferenceIds;
   std::string Result;
   constexpr Token::Index KEnd = std::numeric_limits<Token::Index>::max();
-  std::function<void(const ForestNode *, unsigned, Token::Index,
-                     llvm::Optional<SymbolID>)>
-      Dump = [&](const ForestNode *P, unsigned Level, Token::Index End,
-                 llvm::Optional<SymbolID> ElidedParent) {
+  std::function<void(const ForestNode *, Token::Index, llvm::Optional<SymbolID>,
+                     LineDecoration &LineDec)>
+      Dump = [&](const ForestNode *P, Token::Index End,
+                 llvm::Optional<SymbolID> ElidedParent,
+                 LineDecoration LineDec) {
         llvm::ArrayRef<const ForestNode *> Children;
         auto EndOfElement = [&](size_t ChildIndex) {
           return ChildIndex + 1 == Children.size()
@@ -72,18 +80,19 @@
               if (Children[I]->startTokenIndex() == P->startTokenIndex() &&
                   EndOfElement(I) == End) {
                 return Dump(
-                    Children[I], Level, End,
-                    /*ElidedParent=*/ElidedParent.getValueOr(P->symbol()));
+                    Children[I], End,
+                    /*ElidedParent=*/ElidedParent.getValueOr(P->symbol()),
+                    LineDec);
               }
           }
         }
 
-        // FIXME: pretty ascii trees
         if (End == KEnd)
           Result += llvm::formatv("[{0,3}, end) ", P->startTokenIndex());
         else
           Result += llvm::formatv("[{0,3}, {1,3}) ", P->startTokenIndex(), End);
-        Result.append(2 * Level, ' ');
+        Result += LineDec.Prefix;
+        Result += LineDec.First;
         if (ElidedParent.hasValue()) {
           Result += G.symbolName(*ElidedParent);
           Result += "~";
@@ -99,12 +108,23 @@
         }
         Result.push_back('\n');
 
-        ++Level;
-        for (size_t I = 0; I < Children.size(); ++I)
-          Dump(Children[I], Level,
-               P->kind() == Sequence ? EndOfElement(I) : End, llvm::None);
+        auto OldPrefixSize = LineDec.Prefix.size();
+        LineDec.Prefix += LineDec.Subsequent;
+        for (size_t I = 0; I < Children.size(); ++I) {
+          if (I == Children.size() - 1) {
+            LineDec.First = "└─";
+            LineDec.Subsequent = "  ";
+          } else {
+            LineDec.First = "├─";
+            LineDec.Subsequent = "│ ";
+          }
+          Dump(Children[I], P->kind() == Sequence ? EndOfElement(I) : End,
+               llvm::None, LineDec);
+        }
+        LineDec.Prefix.resize(OldPrefixSize);
       };
-  Dump(this, 0, KEnd, llvm::None);
+  LineDecoration LineDec;
+  Dump(this, KEnd, llvm::None, LineDec);
   return Result;
 }
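
The Forest.cpp change above replaces plain two-space indentation with
box-drawing connectors. The pattern: each child line gets a First connector
("├─" or "└─"), and everything under that child inherits a Subsequent piece
("│ " or "  ") into a shared, growing Prefix. A standalone sketch of the same
pattern, with First/Subsequent as parameters rather than a LineDecoration
struct; the Tree type below is illustrative:

  #include <cstdio>
  #include <string>
  #include <vector>

  struct Tree {
    const char *Name;
    std::vector<Tree> Children;
  };

  void dump(const Tree &T, std::string &Prefix, const char *First,
            const char *Subsequent) {
    std::printf("%s%s%s\n", Prefix.c_str(), First, T.Name);
    size_t OldSize = Prefix.size();
    Prefix += Subsequent; // Descendants of T draw under this prefix.
    for (size_t I = 0; I < T.Children.size(); ++I) {
      bool Last = I + 1 == T.Children.size();
      dump(T.Children[I], Prefix, Last ? "└─" : "├─", Last ? "  " : "│ ");
    }
    Prefix.resize(OldSize); // Restore, as the recursive Dump lambda does.
  }

  int main() {
    Tree T{"expr", {{"term", {{"IDENT", {}}}}, {"PLUS", {}}, {"term", {}}}};
    std::string Prefix;
    dump(T, Prefix, "", "");
    // expr
    // ├─term
    // │ └─IDENT
    // ├─PLUS
    // └─term
    return 0;
  }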
 
Index: clang-tools-extra/pseudo/lib/CMakeLists.txt
===================================================================
--- clang-tools-extra/pseudo/lib/CMakeLists.txt
+++ clang-tools-extra/pseudo/lib/CMakeLists.txt
@@ -3,6 +3,7 @@
 add_clang_library(clangPseudo
   DirectiveMap.cpp
   Forest.cpp
+  GLRParser.cpp
   Grammar.cpp
   GrammarBNF.cpp
   Lex.cpp
Index: clang-tools-extra/pseudo/include/clang-pseudo/GLRParser.h
===================================================================
--- /dev/null
+++ clang-tools-extra/pseudo/include/clang-pseudo/GLRParser.h
@@ -0,0 +1,161 @@
+//===--- GLRParser.h - Implement a standard GLR parser -----------*- C++-*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This implements a standard Generalized LR (GLR) parsing algorithm.
+//
+// The GLR parser behaves as a normal LR parser until it encounters a conflict.
+// To handle a conflict (where there are multiple actions that could be
+// performed), the parser simulates nondeterminism by doing a breadth-first
+// search over all the possibilities.
+//
+// Basic mechanisms of the GLR parser:
+//  - A number of processes operate in parallel.
+//  - Each process has its own parsing stack and behaves as a standard
+//    deterministic LR parser.
+//  - When a process encounters a conflict, it forks (one process for each
+//    available action).
+//  - When a process encounters an error, it is abandoned.
+//  - All processes are synchronized by the lookahead token: they perform
+//    shift actions at the same time, which means some processes need to wait
+//    until other processes have performed all reduce actions.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef CLANG_PSEUDO_GLRPARSER_H
+#define CLANG_PSEUDO_GLRPARSER_H
+
+#include "clang-pseudo/Forest.h"
+#include "clang-pseudo/Grammar.h"
+#include "clang-pseudo/LRTable.h"
+#include "llvm/Support/Allocator.h"
+#include <vector>
+
+namespace clang {
+namespace pseudo {
+
+// A Graph-Structured Stack represents all parse stacks of a GLR parser.
+//
+// Each node stores a parse state, the last parsed ForestNode, and the parent
+// node. There may be several heads (top of stack), and the parser operates by:
+// - shift: pushing terminal symbols on top of the stack
+// - reduce: replacing N symbols on top of the stack with one nonterminal
+//
+// The structure is a DAG rather than a stack:
+// - GLR allows multiple actions (conflicts) on the same head, producing forks
+//   where several nodes have the same parent
+// - The parser merges nodes with the same (state, ForestNode), producing joins
+//   where one node has multiple parents
+//
+// The parser is responsible for creating nodes and keeping track of the set of
+// heads. The GSS class is mostly an arena for them.
+struct GSS {
+  // A node represents a partial parse of the input up to some point.
+  //
+  // It is the equivalent of a frame in an LR parse stack.
+  // Like such a frame, it has an LR parse state and an AST node representing
+  // the last parsed symbol (a ForestNode in our case).
+  // Unlike a regular LR stack frame, it may have multiple parents.
+  //
+  // Nodes are not exactly pushed and popped on the stack: pushing is just
+  // allocating a new head node with a parent pointer to the old head. Popping
+  // is just forgetting about a node and remembering its parent instead.
+  struct alignas(struct Node *) Node {
+    // LR state describing how parsing should continue from this head.
+    LRTable::StateID State;
+    // Number of the parents of this node, which are stored as trailing objects.
+    // The parents hold previously parsed symbols, and may resume control after
+    // this node is reduced.
+    unsigned ParentCount;
+    // The parse node for the last parsed symbol.
+    // This symbol appears on the left of the dot in the parse state's items.
+    // (In the literature, the node is attached to the *edge* to the parent).
+    const ForestNode *Payload = nullptr;
+
+    // FIXME: Most nodes live a fairly short time, and are simply discarded.
+    // Is it worth refcounting them (we have empty padding) and returning to a
+    // freelist, to keep the working set small?
+
+    llvm::ArrayRef<const Node *> parents() const {
+      return llvm::makeArrayRef(reinterpret_cast<const Node *const *>(this + 1),
+                                ParentCount);
+    };
+    // Parents are stored as a trailing array of Node*.
+  };
+
+  // Allocates a new node in the graph.
+  const Node *addNode(LRTable::StateID State, const ForestNode *Symbol,
+                      llvm::ArrayRef<const Node *> Parents) {
+    ++NodeCount;
+    Node *Result = new (Arena.Allocate(
+        sizeof(Node) + Parents.size() * sizeof(Node *), alignof(Node)))
+        Node({State, static_cast<unsigned>(Parents.size())});
+    Result->Payload = Symbol;
+    if (!Parents.empty())
+      llvm::copy(Parents, reinterpret_cast<const Node **>(Result + 1));
+    return Result;
+  }
+
+  size_t bytes() const { return Arena.getTotalMemory() + sizeof(*this); }
+  size_t nodeCount() const { return NodeCount; }
+
+private:
+  llvm::BumpPtrAllocator Arena;
+  unsigned NodeCount = 0;
+};
+
+// FIXME: what's the useful public interface here?
+class GLRParser {
+public:
+  GLRParser(const LRTable &T, const Grammar &G, ForestArena &Arena)
+      : ParsingTable(T), G(G), Forest(Arena) {}
+
+  // FIXME: provide start symbol too?
+  const ForestNode *parse(llvm::ArrayRef<ForestNode> Terminals);
+
+  const GSS &getGSS() const { return GSS; }
+
+private:
+  // Apply PendingShift actions, producing NewHeads.
+  void performShift(const ForestNode *Tok,
+                    std::vector<const GSS::Node *> *NewHeads);
+  void performReduction(SymbolID Next);
+
+  void addActions(const GSS::Node *Head, SymbolID Next);
+
+  const LRTable &ParsingTable;
+  const Grammar &G;
+
+  // An active stack head can have multiple available actions (reduce/reduce
+  // or shift/reduce conflicts).
+  // A step is any one action applied to any one stack head.
+  struct Step {
+    // A corresponding stack head.
+    const GSS::Node *Head = nullptr;
+    // An action associated with the Head.
+    LRTable::Action Action = LRTable::Action::sentinel();
+  };
+  // A list of active shift actions.
+  std::vector<Step> PendingShift;
+  // A list of active reduce actions.
+  std::vector<Step> PendingReduce;
+  // A list of active accept actions.
+  std::vector<Step> PendingAccept;
+
+  // Temporary storage we reuse instead of reallocating each time.
+  std::vector<const GSS::Node *> TempGSSNodes;
+  std::vector<const ForestNode *> TempForestNodes;
+
+  GSS GSS;
+  ForestArena &Forest;
+  llvm::ArrayRef<ForestNode> Terminals;
+};
+
+} // namespace pseudo
+} // namespace clang
+
+#endif
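
The GSS node layout above relies on a trailing-array allocation idiom: the
fixed-size node header and its variable-length array of parent pointers are
allocated together from the BumpPtrAllocator, and parents() reinterprets the
memory immediately after the node. A standalone sketch of that idiom; the
simplified Node and addNode below are illustrative, not this patch's GSS:

  #include "llvm/ADT/ArrayRef.h"
  #include "llvm/ADT/STLExtras.h"
  #include "llvm/Support/Allocator.h"
  #include <cassert>
  #include <new>

  // alignas guarantees the pointer array placed right after the node is
  // properly aligned, as in the patch's GSS::Node.
  struct alignas(struct Node *) Node {
    unsigned State;
    unsigned ParentCount;
    llvm::ArrayRef<const Node *> parents() const {
      return llvm::makeArrayRef(
          reinterpret_cast<const Node *const *>(this + 1), ParentCount);
    }
  };

  const Node *addNode(llvm::BumpPtrAllocator &Arena, unsigned State,
                      llvm::ArrayRef<const Node *> Parents) {
    void *Mem = Arena.Allocate(sizeof(Node) + Parents.size() * sizeof(Node *),
                               alignof(Node));
    Node *N = new (Mem) Node{State, static_cast<unsigned>(Parents.size())};
    if (!Parents.empty()) // Copy parent pointers into the trailing array.
      llvm::copy(Parents, reinterpret_cast<const Node **>(N + 1));
    return N;
  }

  int main() {
    llvm::BumpPtrAllocator Arena;
    const Node *Root = addNode(Arena, /*State=*/0, {});
    const Node *Head = addNode(Arena, /*State=*/4, {Root});
    assert(Head->parents().size() == 1 && Head->parents().front() == Root);
    return 0;
  }
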
Index: clang-tools-extra/pseudo/include/clang-pseudo/Forest.h
===================================================================
--- clang-tools-extra/pseudo/include/clang-pseudo/Forest.h
+++ clang-tools-extra/pseudo/include/clang-pseudo/Forest.h
@@ -17,6 +17,9 @@
 //
 //===----------------------------------------------------------------------===//
 
+#ifndef CLANG_PSEUDO_FOREST_H
+#define CLANG_PSEUDO_FOREST_H
+
 #include "clang-pseudo/Grammar.h"
 #include "clang-pseudo/Token.h"
 #include "llvm/ADT/ArrayRef.h"
@@ -176,3 +179,5 @@
 
 } // namespace pseudo
 } // namespace clang
+
+#endif