hokein updated this revision to Diff 417731.
hokein marked an inline comment as done.
hokein added a comment.
address comments, remove the extraction of TestGrammar.
Repository:
rG LLVM Github Monorepo
CHANGES SINCE LAST ACTION
https://reviews.llvm.org/D122303/new/
https://reviews.llvm.org/D122303
Files:
clang-tools-extra/pseudo/include/clang-pseudo/Grammar.h
clang-tools-extra/pseudo/lib/GrammarBNF.cpp
clang-tools-extra/pseudo/test/lr-build-basic.test
clang-tools-extra/pseudo/test/lr-build-conflicts.test
clang-tools-extra/pseudo/unittests/GrammarTest.cpp
Index: clang-tools-extra/pseudo/unittests/GrammarTest.cpp
===================================================================
--- clang-tools-extra/pseudo/unittests/GrammarTest.cpp
+++ clang-tools-extra/pseudo/unittests/GrammarTest.cpp
@@ -44,6 +44,16 @@
return 0;
}
+ RuleID ruleFor(llvm::StringRef NonterminalName) const {
+ auto RuleRange = G->table().Nonterminals[id(NonterminalName)].RuleRange;
+ if (RuleRange.End - RuleRange.Start == 1)
+ return G->table().Nonterminals[id(NonterminalName)].RuleRange.Start;
+ ADD_FAILURE() << "Expected a single rule for " << NonterminalName
+ << ", but it has " << RuleRange.End - RuleRange.Start
+ << " rule!\n";
+ return 0;
+ }
+
protected:
std::unique_ptr<Grammar> G;
std::vector<std::string> Diags;
@@ -74,6 +84,21 @@
Sequence(id("INT"), id(";"))));
}
+TEST_F(GrammarTest, RuleIDSorted) {
+ build(R"bnf(
+ _ := x
+
+ x := y
+ y := z
+ z := IDENTIFIER
+ )bnf");
+ ASSERT_TRUE(Diags.empty());
+
+ EXPECT_LT(ruleFor("z"), ruleFor("y"));
+ EXPECT_LT(ruleFor("y"), ruleFor("x"));
+ EXPECT_LT(ruleFor("x"), ruleFor("_"));
+}
+
TEST_F(GrammarTest, Diagnostics) {
build(R"cpp(
_ := ,_opt
@@ -82,6 +107,9 @@
_ := IDENFIFIE # a typo of the terminal IDENFITIER
invalid
+ # cycle
+ a := b
+ b := a
)cpp");
EXPECT_EQ(G->startSymbol(), id("_"));
@@ -91,7 +119,8 @@
"No rules for nonterminal: undefined-sym",
"Failed to parse 'invalid': no separator :=",
"Token-like name IDENFIFIE is used as a nonterminal",
- "No rules for nonterminal: IDENFIFIE"));
+ "No rules for nonterminal: IDENFIFIE",
+ "The grammar is cyclic, see symbol a"));
}
TEST_F(GrammarTest, FirstAndFollowSets) {
Index: clang-tools-extra/pseudo/test/lr-build-conflicts.test
===================================================================
--- clang-tools-extra/pseudo/test/lr-build-conflicts.test
+++ clang-tools-extra/pseudo/test/lr-build-conflicts.test
@@ -5,8 +5,8 @@
# RUN: clang-pseudo -grammar %s -print-graph | FileCheck %s --check-prefix=GRAPH
# GRAPH: States
# GRAPH-NEXT: State 0
-# GRAPH-NEXT: _ := ⢠expr
# GRAPH-NEXT: expr := ⢠expr - expr
+# GRAPH-NEXT: _ := ⢠expr
# GRAPH-NEXT: expr := ⢠IDENTIFIER
# GRAPH-NEXT: State 1
# GRAPH-NEXT: _ := expr â¢
@@ -42,6 +42,6 @@
# TABLE-NEXT: 'IDENTIFIER': shift state 2
# TABLE-NEXT: 'expr': go to state 4
# TABLE-NEXT: State 4
-# TABLE-NEXT: 'EOF': reduce by rule 2 'expr := expr - expr'
+# TABLE-NEXT: 'EOF': reduce by rule 0 'expr := expr - expr'
# TABLE-NEXT: '-': shift state 3
-# TABLE-NEXT: '-': reduce by rule 2 'expr := expr - expr'
+# TABLE-NEXT: '-': reduce by rule 0 'expr := expr - expr'
Index: clang-tools-extra/pseudo/test/lr-build-basic.test
===================================================================
--- clang-tools-extra/pseudo/test/lr-build-basic.test
+++ clang-tools-extra/pseudo/test/lr-build-basic.test
@@ -26,4 +26,4 @@
# TABLE-NEXT: State 2
# TABLE-NEXT: 'EOF': reduce by rule 1 'expr := id'
# TABLE-NEXT: State 3
-# TABLE-NEXT: 'EOF': reduce by rule 2 'id := IDENTIFIER'
+# TABLE-NEXT: 'EOF': reduce by rule 0 'id := IDENTIFIER'
Index: clang-tools-extra/pseudo/lib/GrammarBNF.cpp
===================================================================
--- clang-tools-extra/pseudo/lib/GrammarBNF.cpp
+++ clang-tools-extra/pseudo/lib/GrammarBNF.cpp
@@ -9,6 +9,7 @@
#include "clang-pseudo/Grammar.h"
#include "clang/Basic/TokenKinds.h"
#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/Support/FormatVariadic.h"
#include <memory>
@@ -87,23 +88,81 @@
}
assert(T->Rules.size() < (1 << RuleBits) &&
"Too many rules to fit in RuleID bits!");
- llvm::sort(T->Rules, [](const Rule &Left, const Rule &Right) {
- // Sorted by the Target.
- return std::tie(Left.Target, Left.Size) <
- std::tie(Right.Target, Right.Size);
- });
- RuleID RulePos = 0;
+ const auto &TopologicalOrder = getTopologicalOrder(T.get());
+ llvm::stable_sort(
+ T->Rules, [&TopologicalOrder](const Rule &Left, const Rule &Right) {
+ // Sorted by the topological order of the nonterminal Target.
+ return TopologicalOrder[Left.Target] < TopologicalOrder[Right.Target];
+ });
for (SymbolID SID = 0; SID < T->Nonterminals.size(); ++SID) {
- RuleID Start = RulePos;
- while (RulePos < T->Rules.size() && T->Rules[RulePos].Target == SID)
- ++RulePos;
- T->Nonterminals[SID].RuleRange = {Start, RulePos};
+ auto StartIt = llvm::partition_point(T->Rules, [&](const Rule &R) {
+ return TopologicalOrder[R.Target] < TopologicalOrder[SID];
+ });
+ RuleID Start = StartIt - T->Rules.begin();
+ RuleID End = Start;
+ while (End < T->Rules.size() &&
+ TopologicalOrder[T->Rules[End].Target] == TopologicalOrder[SID])
+ ++End;
+ T->Nonterminals[SID].RuleRange = {Start, End};
}
auto G = std::make_unique<Grammar>(std::move(T));
diagnoseGrammar(*G);
return G;
}
+ // Gets topological order for nonterminal symbols.
+ //
+ // The topological order is defined as: if a *single* nonterminal A produces
+ // (or transitively) a nonterminal B (that said, there is a production rule
+ // B := A), then A is less than B.
+ //
+ // It returns a "lookup" vector where the index is SymbolID, and the value is
+ // the index of the SymbolID in the topological-orderred array.
+ std::vector<unsigned> getTopologicalOrder(GrammarTable *T) {
+ std::vector<std::pair<SymbolID, SymbolID>> Dependencies;
+ for (const auto &Rule : T->Rules) {
+ // if A := B, A depends on B.
+ if (Rule.Size == 1 && pseudo::isNonterminal(Rule.Sequence[0]))
+ Dependencies.push_back({Rule.Target, Rule.Sequence[0]});
+ }
+ llvm::sort(Dependencies);
+ std::vector<SymbolID> Order;
+ // Each nonterminal state flows: NotVisited -> Visiting -> Visited.
+ enum State {
+ NotVisited,
+ Visiting,
+ Visited,
+ };
+ std::vector<State> VisitStates(T->Nonterminals.size(), NotVisited);
+ std::function<void(SymbolID)> DFS = [&](SymbolID SID) -> void {
+ if (VisitStates[SID] == Visited)
+ return;
+ if (VisitStates[SID] == Visiting) {
+ Diagnostics.push_back(
+ llvm::formatv("The grammar is cyclic, see symbol {0}",
+ T->Nonterminals[SID].Name));
+ return;
+ }
+ VisitStates[SID] = Visiting;
+ auto It = llvm::partition_point(
+ Dependencies, [&SID](const std::pair<SymbolID, SymbolID> &D) {
+ return D.first < SID;
+ });
+ while (It != Dependencies.end() && It->first == SID) {
+ DFS(It->second);
+ ++It;
+ }
+ VisitStates[SID] = Visited;
+ Order.push_back(SID);
+ };
+ for (SymbolID ID = 0; ID != T->Nonterminals.size(); ++ID)
+ DFS(ID);
+ std::vector<unsigned> Resuslt(T->Nonterminals.size(), 0);
+ for (size_t I = 0; I < Order.size(); ++I)
+ Resuslt[Order[I]] = I;
+ return Resuslt;
+ }
+
private:
// Text representation of a BNF grammar rule.
struct RuleSpec {
Index: clang-tools-extra/pseudo/include/clang-pseudo/Grammar.h
===================================================================
--- clang-tools-extra/pseudo/include/clang-pseudo/Grammar.h
+++ clang-tools-extra/pseudo/include/clang-pseudo/Grammar.h
@@ -165,8 +165,15 @@
} RuleRange;
};
- // The rules are sorted (and thus grouped) by target symbol.
- // RuleID is the index of the vector.
+ // RuleID is an index into this table of rule definitions.
+ //
+ // Rules with the same target symbol (LHS) are grouped into a single range.
+ // The relative order of different target symbols is *not* by SymbolID, but
+ // rather a topological sort: if S := T then the rules producing T have lower
+ // RuleIDs than rules producing S.
+ // (This strange order simplifies the GLR parser: for a given token range, if
+ // we reduce in increasing RuleID order then we need never backtrack --
+ // prerequisite reductions are reached before dependent ones).
std::vector<Rule> Rules;
// A table of terminals (aka tokens). It corresponds to the clang::Token.
// clang::tok::TokenKind is the index of the table.
_______________________________________________
cfe-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits