[PATCH] D122303: [pseudo] Sort nonterminals based on their reduction order.

Haojian Wu via Phabricator via cfe-commits Wed, 23 Mar 2022 13:40:58 -0700

hokein updated this revision to Diff 417731.
hokein marked an inline comment as done.
hokein added a comment.


address comments, remove the extraction of TestGrammar.


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D122303/new/

https://reviews.llvm.org/D122303

Files:
  clang-tools-extra/pseudo/include/clang-pseudo/Grammar.h
  clang-tools-extra/pseudo/lib/GrammarBNF.cpp
  clang-tools-extra/pseudo/test/lr-build-basic.test
  clang-tools-extra/pseudo/test/lr-build-conflicts.test
  clang-tools-extra/pseudo/unittests/GrammarTest.cpp

Index: clang-tools-extra/pseudo/unittests/GrammarTest.cpp
===================================================================
--- clang-tools-extra/pseudo/unittests/GrammarTest.cpp
+++ clang-tools-extra/pseudo/unittests/GrammarTest.cpp
@@ -44,6 +44,16 @@
     return 0;
   }
 
+  RuleID ruleFor(llvm::StringRef NonterminalName) const {
+    auto RuleRange = G->table().Nonterminals[id(NonterminalName)].RuleRange;
+    if (RuleRange.End - RuleRange.Start == 1)
+      return G->table().Nonterminals[id(NonterminalName)].RuleRange.Start;
+    ADD_FAILURE() << "Expected a single rule for " << NonterminalName
+                  << ", but it has " << RuleRange.End - RuleRange.Start
+                  << " rule!\n";
+    return 0;
+  }
+
 protected:
   std::unique_ptr<Grammar> G;
   std::vector<std::string> Diags;
@@ -74,6 +84,21 @@
                                    Sequence(id("INT"), id(";"))));
 }
 
+TEST_F(GrammarTest, RuleIDSorted) {
+  build(R"bnf(
+    _ := x
+
+    x := y
+    y := z
+    z := IDENTIFIER
+  )bnf");
+  ASSERT_TRUE(Diags.empty());
+
+  EXPECT_LT(ruleFor("z"), ruleFor("y"));
+  EXPECT_LT(ruleFor("y"), ruleFor("x"));
+  EXPECT_LT(ruleFor("x"), ruleFor("_"));
+}
+
 TEST_F(GrammarTest, Diagnostics) {
   build(R"cpp(
     _ := ,_opt
@@ -82,6 +107,9 @@
     _ := IDENFIFIE # a typo of the terminal IDENFITIER
 
     invalid
+    # cycle
+    a := b
+    b := a
   )cpp");
 
   EXPECT_EQ(G->startSymbol(), id("_"));
@@ -91,7 +119,8 @@
                          "No rules for nonterminal: undefined-sym",
                          "Failed to parse 'invalid': no separator :=",
                          "Token-like name IDENFIFIE is used as a nonterminal",
-                         "No rules for nonterminal: IDENFIFIE"));
+                         "No rules for nonterminal: IDENFIFIE",
+                         "The grammar is cyclic, see symbol a"));
 }
 
 TEST_F(GrammarTest, FirstAndFollowSets) {
Index: clang-tools-extra/pseudo/test/lr-build-conflicts.test
===================================================================
--- clang-tools-extra/pseudo/test/lr-build-conflicts.test
+++ clang-tools-extra/pseudo/test/lr-build-conflicts.test
@@ -5,8 +5,8 @@
 # RUN: clang-pseudo -grammar %s -print-graph | FileCheck %s --check-prefix=GRAPH
 #      GRAPH: States
 # GRAPH-NEXT: State 0
-# GRAPH-NEXT:     _ :=  â¢ expr
 # GRAPH-NEXT:     expr :=  â¢ expr - expr
+# GRAPH-NEXT:     _ :=  â¢ expr
 # GRAPH-NEXT:     expr :=  â¢ IDENTIFIER
 # GRAPH-NEXT: State 1
 # GRAPH-NEXT:     _ := expr â¢ 
@@ -42,6 +42,6 @@
 # TABLE-NEXT:     'IDENTIFIER': shift state 2
 # TABLE-NEXT:     'expr': go to state 4
 # TABLE-NEXT: State 4
-# TABLE-NEXT:     'EOF': reduce by rule 2 'expr := expr - expr'
+# TABLE-NEXT:     'EOF': reduce by rule 0 'expr := expr - expr'
 # TABLE-NEXT:     '-': shift state 3
-# TABLE-NEXT:     '-': reduce by rule 2 'expr := expr - expr'
+# TABLE-NEXT:     '-': reduce by rule 0 'expr := expr - expr'
Index: clang-tools-extra/pseudo/test/lr-build-basic.test
===================================================================
--- clang-tools-extra/pseudo/test/lr-build-basic.test
+++ clang-tools-extra/pseudo/test/lr-build-basic.test
@@ -26,4 +26,4 @@
 # TABLE-NEXT: State 2
 # TABLE-NEXT:     'EOF': reduce by rule 1 'expr := id'
 # TABLE-NEXT: State 3
-# TABLE-NEXT:     'EOF': reduce by rule 2 'id := IDENTIFIER'
+# TABLE-NEXT:     'EOF': reduce by rule 0 'id := IDENTIFIER'
Index: clang-tools-extra/pseudo/lib/GrammarBNF.cpp
===================================================================
--- clang-tools-extra/pseudo/lib/GrammarBNF.cpp
+++ clang-tools-extra/pseudo/lib/GrammarBNF.cpp
@@ -9,6 +9,7 @@
 #include "clang-pseudo/Grammar.h"
 #include "clang/Basic/TokenKinds.h"
 #include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/Support/FormatVariadic.h"
 #include <memory>
@@ -87,23 +88,81 @@
     }
     assert(T->Rules.size() < (1 << RuleBits) &&
            "Too many rules to fit in RuleID bits!");
-    llvm::sort(T->Rules, [](const Rule &Left, const Rule &Right) {
-      // Sorted by the Target.
-      return std::tie(Left.Target, Left.Size) <
-             std::tie(Right.Target, Right.Size);
-    });
-    RuleID RulePos = 0;
+    const auto &TopologicalOrder = getTopologicalOrder(T.get());
+    llvm::stable_sort(
+        T->Rules, [&TopologicalOrder](const Rule &Left, const Rule &Right) {
+          // Sorted by the topological order of the nonterminal Target.
+          return TopologicalOrder[Left.Target] < TopologicalOrder[Right.Target];
+        });
     for (SymbolID SID = 0; SID < T->Nonterminals.size(); ++SID) {
-      RuleID Start = RulePos;
-      while (RulePos < T->Rules.size() && T->Rules[RulePos].Target == SID)
-        ++RulePos;
-      T->Nonterminals[SID].RuleRange = {Start, RulePos};
+      auto StartIt = llvm::partition_point(T->Rules, [&](const Rule &R) {
+        return TopologicalOrder[R.Target] < TopologicalOrder[SID];
+      });
+      RuleID Start = StartIt - T->Rules.begin();
+      RuleID End = Start;
+      while (End < T->Rules.size() &&
+             TopologicalOrder[T->Rules[End].Target] == TopologicalOrder[SID])
+        ++End;
+      T->Nonterminals[SID].RuleRange = {Start, End};
     }
     auto G = std::make_unique<Grammar>(std::move(T));
     diagnoseGrammar(*G);
     return G;
   }
 
+  // Gets topological order for nonterminal symbols.
+  //
+  // The topological order is defined as: if a *single* nonterminal A produces
+  // (or transitively) a nonterminal B (that said, there is a production rule
+  // B := A), then A is less than B.
+  //
+  // It returns a "lookup" vector where the index is SymbolID, and the value is
+  // the index of the SymbolID in the topological-orderred array.
+  std::vector<unsigned> getTopologicalOrder(GrammarTable *T) {
+    std::vector<std::pair<SymbolID, SymbolID>> Dependencies;
+    for (const auto &Rule : T->Rules) {
+      // if A := B, A depends on B.
+      if (Rule.Size == 1 && pseudo::isNonterminal(Rule.Sequence[0]))
+        Dependencies.push_back({Rule.Target, Rule.Sequence[0]});
+    }
+    llvm::sort(Dependencies);
+    std::vector<SymbolID> Order;
+    // Each nonterminal state flows: NotVisited -> Visiting -> Visited.
+    enum State {
+      NotVisited,
+      Visiting,
+      Visited,
+    };
+    std::vector<State> VisitStates(T->Nonterminals.size(), NotVisited);
+    std::function<void(SymbolID)> DFS = [&](SymbolID SID) -> void {
+      if (VisitStates[SID] == Visited)
+        return;
+      if (VisitStates[SID] == Visiting) {
+        Diagnostics.push_back(
+            llvm::formatv("The grammar is cyclic, see symbol {0}",
+                          T->Nonterminals[SID].Name));
+        return;
+      }
+      VisitStates[SID] = Visiting;
+      auto It = llvm::partition_point(
+          Dependencies, [&SID](const std::pair<SymbolID, SymbolID> &D) {
+            return D.first < SID;
+          });
+      while (It != Dependencies.end() && It->first == SID) {
+        DFS(It->second);
+        ++It;
+      }
+      VisitStates[SID] = Visited;
+      Order.push_back(SID);
+    };
+    for (SymbolID ID = 0; ID != T->Nonterminals.size(); ++ID)
+      DFS(ID);
+    std::vector<unsigned> Resuslt(T->Nonterminals.size(), 0);
+    for (size_t I = 0; I < Order.size(); ++I)
+      Resuslt[Order[I]] = I;
+    return Resuslt;
+  }
+
 private:
   // Text representation of a BNF grammar rule.
   struct RuleSpec {
Index: clang-tools-extra/pseudo/include/clang-pseudo/Grammar.h
===================================================================
--- clang-tools-extra/pseudo/include/clang-pseudo/Grammar.h
+++ clang-tools-extra/pseudo/include/clang-pseudo/Grammar.h
@@ -165,8 +165,15 @@
     } RuleRange;
   };
 
-  // The rules are sorted (and thus grouped) by target symbol.
-  // RuleID is the index of the vector.
+  // RuleID is an index into this table of rule definitions.
+  //
+  // Rules with the same target symbol (LHS) are grouped into a single range.
+  // The relative order of different target symbols is *not* by SymbolID, but
+  // rather a topological sort: if S := T then the rules producing T have lower
+  // RuleIDs than rules producing S.
+  // (This strange order simplifies the GLR parser: for a given token range, if
+  // we reduce in increasing RuleID order then we need never backtrack --
+  // prerequisite reductions are reached before dependent ones).
   std::vector<Rule> Rules;
   // A table of terminals (aka tokens). It corresponds to the clang::Token.
   // clang::tok::TokenKind is the index of the table.

_______________________________________________
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

[PATCH] D122303: [pseudo] Sort nonterminals based on their reduction order.

Reply via email to