[PATCH] D118196: [syntax][pseudo] Implement LR parsing table.

Haojian Wu via Phabricator via cfe-commits Mon, 31 Jan 2022 01:41:24 -0800

hokein updated this revision to Diff 404442.
hokein added a comment.

Polish the LRTable implementation, using two flat vectors


memory-size (bytes): 1380908 => 664600


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D118196/new/

https://reviews.llvm.org/D118196

Files:
  clang/include/clang/Tooling/Syntax/Pseudo/Grammar.h
  clang/include/clang/Tooling/Syntax/Pseudo/LRAutomaton.h
  clang/include/clang/Tooling/Syntax/Pseudo/LRTable.h
  clang/lib/Tooling/Syntax/Pseudo/CMakeLists.txt
  clang/lib/Tooling/Syntax/Pseudo/Grammar.cpp
  clang/lib/Tooling/Syntax/Pseudo/LRBuilder.cpp
  clang/lib/Tooling/Syntax/Pseudo/LRTable.cpp
  clang/tools/clang-pseudo/ClangPseudo.cpp
  clang/unittests/Tooling/Syntax/Pseudo/CMakeLists.txt
  clang/unittests/Tooling/Syntax/Pseudo/GrammarTests.cpp
  clang/unittests/Tooling/Syntax/Pseudo/LRBuilderTests.cpp

Index: clang/unittests/Tooling/Syntax/Pseudo/LRBuilderTests.cpp
===================================================================
--- /dev/null
+++ clang/unittests/Tooling/Syntax/Pseudo/LRBuilderTests.cpp
@@ -0,0 +1,112 @@
+//===--- LRBuilderTests.cpp --------------------------------------*- C++-*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "clang/Tooling/Syntax/Pseudo/LRAutomaton.h"
+#include "clang/Tooling/Syntax/Pseudo/LRTable.h"
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include <memory>
+
+namespace clang {
+namespace syntax {
+namespace pseudo {
+namespace {
+
+TEST(TableTest, BuildLRTable) {
+  struct TestCase {
+    llvm::StringRef BNF;
+    llvm::StringRef ExpectedStates;
+    llvm::StringRef ExpectedTable;
+  };
+
+  TestCase Cases[] = {
+      {
+          R"bnf(
+_ := expr
+expr := IDENTIFIER
+      )bnf",
+          R"(States:
+State 0
+    _ :=  â¢ expr
+    expr :=  â¢ IDENTIFIER
+State 1
+    expr := IDENTIFIER â¢ 
+State 2
+    _ := expr â¢ 
+)",
+          R"(LRTable:
+State 0
+    'IDENTIFIER': shift, and go to state 1
+    'expr': go to state 2
+State 1
+    'EOF': reduce by rule 1 'expr := IDENTIFIER'
+State 2
+    'EOF': accept
+)",
+      },
+      {
+          // A grammar with a S/R conflict in SLR table: (id-id)-id, or
+          // id-(id-id).
+          R"bnf(
+_ := expr
+expr := expr - expr  # S/R conflict at state 4 on '-' token
+expr := IDENTIFIER
+      )bnf",
+          R"(States:
+State 0
+    _ :=  â¢ expr
+    expr :=  â¢ IDENTIFIER
+    expr :=  â¢ expr - expr
+State 1
+    expr := IDENTIFIER â¢ 
+State 2
+    _ := expr â¢ 
+    expr := expr â¢ - expr
+State 3
+    expr :=  â¢ IDENTIFIER
+    expr :=  â¢ expr - expr
+    expr := expr - â¢ expr
+State 4
+    expr := expr â¢ - expr
+    expr := expr - expr â¢ 
+)",
+          R"(LRTable:
+State 0
+    'IDENTIFIER': shift, and go to state 1
+    'expr': go to state 2
+State 1
+    'EOF': reduce by rule 1 'expr := IDENTIFIER'
+    '-': reduce by rule 1 'expr := IDENTIFIER'
+State 2
+    'EOF': accept
+    '-': shift, and go to state 3
+State 3
+    'IDENTIFIER': shift, and go to state 1
+    'expr': go to state 4
+State 4
+    'EOF': reduce by rule 2 'expr := expr - expr'
+    '-': shift, and go to state 3
+    '-': reduce by rule 2 'expr := expr - expr'
+)",
+      },
+  };
+  for (const auto &C : Cases) {
+    std::vector<std::string> Diags;
+    auto G = Grammar::parseBNF(C.BNF, Diags);
+    ASSERT_THAT(Diags, testing::IsEmpty());
+    auto Automaton = LRAutomaton::build(*G);
+    auto Table = LRTable::build(*G);
+    EXPECT_EQ(Automaton.dumpForTests(*G), C.ExpectedStates);
+    EXPECT_EQ(Table.dumpForTests(*G), C.ExpectedTable);
+  }
+}
+
+} // namespace
+} // namespace pseudo
+} // namespace syntax
+} // namespace clang
Index: clang/unittests/Tooling/Syntax/Pseudo/GrammarTests.cpp
===================================================================
--- clang/unittests/Tooling/Syntax/Pseudo/GrammarTests.cpp
+++ clang/unittests/Tooling/Syntax/Pseudo/GrammarTests.cpp
@@ -1,4 +1,4 @@
-//===--- GrammarTests.cpp - grammar tests  ------------------------*- C++-*-===//
+//===--- GrammarTests.cpp - grammar tests  -----------------------*- C++-*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -19,6 +19,7 @@
 using testing::AllOf;
 using testing::ElementsAre;
 using testing::IsEmpty;
+using testing::Pair;
 using testing::UnorderedElementsAre;
 
 MATCHER_P(TargetID, SID, "") { return arg.target() == SID; }
@@ -34,7 +35,7 @@
     G = Grammar::parseBNF(BNF, Diags);
   }
 
-  SymbolID lookup(llvm::StringRef Name) const {
+  SymbolID id(llvm::StringRef Name) const {
     for (unsigned I = 0; I < NumTerminals; ++I)
       if (G->table().Terminals[I] == Name)
         return tokenSymbol(static_cast<tok::TokenKind>(I));
@@ -53,10 +54,10 @@
   build("expression := IDENTIFIER + expression # comment");
   EXPECT_THAT(Diags, IsEmpty());
 
-  auto ExpectedRule = AllOf(TargetID(lookup("expression")),
-                        Sequence(lookup("IDENTIFIER"), lookup("+"),
-                                       lookup("expression")));
-  auto ExpressionID = lookup("expression");
+  auto ExpectedRule =
+      AllOf(TargetID(id("expression")),
+            Sequence(id("IDENTIFIER"), id("+"), id("expression")));
+  auto ExpressionID = id("expression");
   // auto RuleID = 0;
   EXPECT_EQ(G->symbolName(ExpressionID), "expression");
   EXPECT_THAT(G->lookupRules(ExpressionID), UnorderedElementsAre(ExpectedRule));
@@ -71,11 +72,10 @@
   build("_ := CONST_opt INT ;_opt");
   EXPECT_THAT(Diags, IsEmpty());
   EXPECT_THAT(G->table().Rules,
-              UnorderedElementsAre(
-                  Sequence(lookup("INT")),
-                  Sequence(lookup("CONST"), lookup("INT")),
-                  Sequence(lookup("CONST"), lookup("INT"), lookup(";")),
-                  Sequence(lookup("INT"), lookup(";"))));
+              UnorderedElementsAre(Sequence(id("INT")),
+                                   Sequence(id("CONST"), id("INT")),
+                                   Sequence(id("CONST"), id("INT"), id(";")),
+                                   Sequence(id("INT"), id(";"))));
 }
 
 TEST_F(GrammarTest, Diagnostics) {
@@ -97,6 +97,66 @@
                          "No rules for nonterminal: IDENFIFIE"));
 }
 
+TEST_F(GrammarTest, FirstAndFollow) {
+  build(
+      R"bnf(
+_ := expr
+expr := expr - term
+expr := term
+term := IDENTIFIER
+term := ( expr )
+)bnf");
+  ASSERT_TRUE(Diags.empty());
+  auto ToPair = [](std::vector<llvm::DenseSet<SymbolID>> Input) {
+    std::vector<std::pair<SymbolID, llvm::DenseSet<SymbolID>>> Sets;
+    for (SymbolID ID = 0; ID < Input.size(); ++ID)
+      Sets.emplace_back(ID, std::move(Input[ID]));
+    return Sets;
+  };
+
+  EXPECT_THAT(
+      ToPair(G->firstSets()),
+      UnorderedElementsAre(
+          Pair(id("_"), UnorderedElementsAre(id("IDENTIFIER"), id("("))),
+          Pair(id("expr"), UnorderedElementsAre(id("IDENTIFIER"), id("("))),
+          Pair(id("term"), UnorderedElementsAre(id("IDENTIFIER"), id("(")))));
+  EXPECT_THAT(
+      ToPair(G->followSets()),
+      UnorderedElementsAre(
+          Pair(id("_"), UnorderedElementsAre(id("EOF"))),
+          Pair(id("expr"), UnorderedElementsAre(id("-"), id("EOF"), id(")"))),
+          Pair(id("term"), UnorderedElementsAre(id("-"), id("EOF"), id(")")))));
+
+  build(R"bnf(
+# A simplfied C++ decl-specifier-seq.
+_ := decl-specifier-seq
+decl-specifier-seq := decl-specifier decl-specifier-seq
+decl-specifier-seq := decl-specifier
+decl-specifier := simple-type-specifier
+decl-specifier := INLINE
+simple-type-specifier := INT
+   )bnf");
+  ASSERT_TRUE(Diags.empty());
+  EXPECT_THAT(
+      ToPair(G->firstSets()),
+      UnorderedElementsAre(
+          Pair(id("_"), UnorderedElementsAre(id("INLINE"), id("INT"))),
+          Pair(id("decl-specifier-seq"),
+               UnorderedElementsAre(id("INLINE"), id("INT"))),
+          Pair(id("simple-type-specifier"), UnorderedElementsAre(id("INT"))),
+          Pair(id("decl-specifier"),
+               UnorderedElementsAre(id("INLINE"), id("INT")))));
+  EXPECT_THAT(
+      ToPair(G->followSets()),
+      UnorderedElementsAre(
+          Pair(id("_"), UnorderedElementsAre(id("EOF"))),
+          Pair(id("decl-specifier-seq"), UnorderedElementsAre(id("EOF"))),
+          Pair(id("decl-specifier"),
+               UnorderedElementsAre(id("INLINE"), id("INT"), id("EOF"))),
+          Pair(id("simple-type-specifier"),
+               UnorderedElementsAre(id("INLINE"), id("INT"), id("EOF")))));
+}
+
 } // namespace
 } // namespace pseudo
 } // namespace syntax
Index: clang/unittests/Tooling/Syntax/Pseudo/CMakeLists.txt
===================================================================
--- clang/unittests/Tooling/Syntax/Pseudo/CMakeLists.txt
+++ clang/unittests/Tooling/Syntax/Pseudo/CMakeLists.txt
@@ -5,6 +5,7 @@
 add_custom_target(ClangPseudoUnitTests)
 add_unittest(ClangPseudoUnitTests ClangPseudoTests
   GrammarTests.cpp
+  LRBuilderTests.cpp
   )
 
 clang_target_link_libraries(ClangPseudoTests
Index: clang/tools/clang-pseudo/ClangPseudo.cpp
===================================================================
--- clang/tools/clang-pseudo/ClangPseudo.cpp
+++ clang/tools/clang-pseudo/ClangPseudo.cpp
@@ -7,6 +7,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "clang/Tooling/Syntax/Pseudo/Grammar.h"
+#include "clang/Tooling/Syntax/Pseudo/LRTable.h"
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/FormatVariadic.h"
@@ -42,6 +43,8 @@
       return 2;
     }
     llvm::errs() << llvm::formatv("grammar file {0} is parsed successfully\n", CheckGrammar);
+    llvm::errs()
+        << clang::syntax::pseudo::LRTable::build(*RSpecs).dumpStatistics();
     return 0;
   }
   return 0;
Index: clang/lib/Tooling/Syntax/Pseudo/LRTable.cpp
===================================================================
--- /dev/null
+++ clang/lib/Tooling/Syntax/Pseudo/LRTable.cpp
@@ -0,0 +1,79 @@
+//===--- LRTable.cpp - Parsing table for LR parsers --------------*- C++-*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "clang/Tooling/Syntax/Pseudo/LRTable.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/Support/FormatVariadic.h"
+#include "llvm/Support/raw_ostream.h"
+
+namespace clang {
+namespace syntax {
+namespace pseudo {
+
+std::string LRTable::dumpStatistics() const {
+  StateID NumOfStates = 0;
+  for (auto It : Index)
+    NumOfStates = std::max(It.From, NumOfStates);
+  return llvm::formatv(R"(
+Statistics of the LR parsing table:
+    number of states: {0}
+    number of actions: {1}
+    size of the table (bytes): {2}
+)",
+                       NumOfStates, Actions.size(), bytes())
+      .str();
+}
+
+std::string LRTable::dumpForTests(const Grammar &G) const {
+  std::string Result;
+  llvm::raw_string_ostream OS(Result);
+  StateID MaxState = 0;
+  for (const auto &It : Index)
+    MaxState = std::max(MaxState, It.From);
+  OS << "LRTable:\n";
+  for (StateID S = 0; S <= MaxState; ++S) {
+    OS << llvm::formatv("State {0}\n", S);
+    for (SymbolID ID : G.symbols()) {
+      for (auto A : find({S, ID})) {
+        if (A.isShift())
+          OS << llvm::formatv("    '{0}': shift, and go to state {1}\n",
+                              G.symbolName(ID), A.shift());
+        else if (A.isReduce())
+          OS << llvm::formatv("    '{0}': reduce by rule {1} '{2}'\n",
+                              G.symbolName(ID), A.reduce(),
+                              G.dumpRule(A.reduce()));
+        else if (A.isAccept())
+          OS << llvm::formatv("    '{0}': accept\n", G.symbolName(ID));
+      }
+    }
+    for (SymbolID ID = 0; ID < G.table().Nonterminals.size(); ++ID) {
+      if (find({S, ID}).empty())
+        continue;
+      OS << llvm::formatv("    '{0}': go to state {1}\n", G.symbolName(ID),
+                          goTo(S, ID));
+    }
+  }
+  return OS.str();
+}
+
+llvm::ArrayRef<LRTable::Action> LRTable::find(IndexKey Key) const {
+  assert(Index.size() == Actions.size());
+  auto It = llvm::partition_point(
+      Index, [&Key](const IndexKey &E) { return E < Key; });
+  if (It == Index.end())
+    return {};
+  size_t Start = It - Index.begin();
+  auto End = Start;
+  while (End < Index.size() && Index[End] == Key)
+    ++End;
+  return llvm::makeArrayRef(&Actions[Start], &Actions[End]);
+}
+
+} // namespace pseudo
+} // namespace syntax
+} // namespace clang
Index: clang/lib/Tooling/Syntax/Pseudo/LRBuilder.cpp
===================================================================
--- /dev/null
+++ clang/lib/Tooling/Syntax/Pseudo/LRBuilder.cpp
@@ -0,0 +1,240 @@
+//===--- LRBuilder.cpp - Build LRAutomaton and parsing tables ----*- C++-*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "clang/Tooling/Syntax/Pseudo/Grammar.h"
+#include "clang/Tooling/Syntax/Pseudo/LRAutomaton.h"
+#include "clang/Tooling/Syntax/Pseudo/LRTable.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/Hashing.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/Support/FormatVariadic.h"
+#include "llvm/Support/raw_ostream.h"
+#include <map>
+
+namespace clang {
+namespace syntax {
+namespace pseudo {
+
+std::string Item::dump(const Grammar &G) const {
+  const auto &Rule = G.lookupRule(RID);
+  auto ToNames = [&](llvm::ArrayRef<SymbolID> Syms) {
+    std::vector<llvm::StringRef> Results;
+    for (auto SID : Syms)
+      Results.push_back(G.symbolName(SID));
+    return Results;
+  };
+  return llvm::formatv("{0} := {1} â¢ {2}\n", G.symbolName(Rule.target()),
+                       llvm::join(ToNames(Rule.seq().take_front(DotPos)), " "),
+                       llvm::join(ToNames(Rule.seq().drop_front(DotPos)), " "))
+      .str();
+}
+
+llvm::hash_code hash_code(const State &S) {
+  return llvm::hash_combine_range(S.begin(), S.end());
+}
+
+std::string dump(const State &S, const Grammar &G) {
+  std::string Result;
+  llvm::raw_string_ostream OS(Result);
+  for (const auto &Item : S)
+    OS << llvm::formatv("    {0}", Item.dump(G));
+  return OS.str();
+}
+
+// Computes a closure of the given item set S:
+//  - extends the given S to contain all options for parsing next token;
+//  - nonterminals after a dot are recursively expanded into the begin-state
+//    of all production rules that produce that nonterminal;
+//
+// Given
+//   Grammar rules = [ _ := E, E := E-T, E := T, T := n, T := (E) ]
+//   Input = [ E := .T ]
+// returns [ E := .T, T := .n, T := .(E) ]
+static State closure(ItemSet IS, const Grammar &G) {
+  std::vector<Item> ProcessingItems = {IS.begin(), IS.end()};
+  while (!ProcessingItems.empty()) {
+    auto ProcessingItem = std::move(ProcessingItems.back());
+    ProcessingItems.pop_back();
+
+    if (!ProcessingItem.hasNext())
+      continue;
+    SymbolID NextSym = ProcessingItem.next(G);
+
+    if (pseudo::isToken(NextSym))
+      continue;
+    auto RRange = G.table().Nonterminals[NextSym].RuleRange;
+    for (RuleID RID = RRange.start; RID < RRange.end; ++RID) {
+      Item NewItem(RID, G.lookupRule(RID).size(), /*dot*/ 0);
+      if (IS.insert(NewItem).second) // new
+        ProcessingItems.push_back(std::move(NewItem));
+    }
+  }
+  return IS;
+}
+// For all items A := a.Xb in S, returns a closure itemset with a dot
+// advance of X.
+static State advance(const State &S, SymbolID X, const Grammar &G) {
+  ItemSet AdvanceX;
+  for (const Item &I : S) {
+    if (I.hasNext() && I.next(G) == X)
+      AdvanceX.insert(I.advance());
+  }
+  return closure(std::move(AdvanceX), G);
+}
+
+std::string LRAutomaton::dumpForTests(const Grammar &G) const {
+  std::string Result;
+  llvm::raw_string_ostream OS(Result);
+
+  OS << "States:\n";
+  for (StateID ID = 0; ID < States.size(); ++ID) {
+    OS << llvm::formatv("State {0}\n", ID);
+    OS << dump(States[ID], G);
+  }
+  return OS.str();
+}
+
+LRAutomaton LRAutomaton::build(const Grammar &G) {
+  class Builder {
+  public:
+    // Adds a given state if not existed.
+    std::pair<StateID, /*inserted*/ bool> insert(State S) {
+      auto Hash = hash_code(S);
+      auto It = StatesIndex.find(Hash);
+      if (It != StatesIndex.end())
+        return {It->second, false}; // no insertion took place.
+      States.push_back(std::move(S));
+      StatesIndex.insert({Hash, States.size() - 1});
+      return {States.size() - 1, true};
+    }
+    void insertEdge(StateID From, StateID To, SymbolID Input) {
+      Edges.push_back({From, To, Input});
+    }
+    // Returns a state with the given id.
+    const State &find(StateID ID) const {
+      assert(ID < States.size());
+      return States[ID];
+    }
+
+    LRAutomaton build() && {
+      States.resize(States.size());
+      Edges.resize(Edges.size());
+      return LRAutomaton(std::move(States), std::move(Edges));
+    }
+
+  private:
+    llvm::DenseMap<llvm::hash_code, size_t> StatesIndex;
+    std::vector<State> States;
+    std::vector<Edge> Edges;
+  } Builder;
+
+  std::vector<StateID> PendingStates;
+  // Initialize with the augmented symbol states.
+  auto RRange = G.table().Nonterminals[G.augmentedSymbol()].RuleRange;
+  for (RuleID RID = RRange.start; RID < RRange.end; ++RID) {
+    const auto &Rule = G.lookupRule(RID);
+    assert(Rule.size() == 1 &&
+           "augmented symbol rule must be _ := start_symbol.");
+    auto StartState = closure({{{RID, Rule.size(), 0}}}, G);
+    auto Result = Builder.insert(std::move(StartState));
+    assert(Result.second && "State must be new");
+    PendingStates.push_back(Result.first);
+  }
+
+  while (!PendingStates.empty()) {
+    auto CurrentStateID = PendingStates.back();
+    PendingStates.pop_back();
+    for (SymbolID Move : G.symbols()) {
+      State NextState = advance(Builder.find(CurrentStateID), Move, G);
+      if (NextState.empty())
+        continue;
+      auto Insert = Builder.insert(NextState);
+      if (Insert.second) // new state, insert to the processing.
+        PendingStates.push_back(Insert.first);
+      Builder.insertEdge(CurrentStateID, Insert.first, Move);
+    }
+  }
+  return std::move(Builder).build();
+}
+
+LRTable LRTable::build(const Grammar &G) {
+  class Builder {
+  public:
+    void addAction(StateID From, SymbolID Next, LRTable::Action New) {
+      auto It = Index.find({From, Next});
+      if (It == Index.end()) {
+        StorageActions.push_back(New);
+        Index.insert({{From, Next}, {StorageActions.size() - 1}});
+        return;
+      }
+      // Avoid duplicated actions.
+      if (llvm::any_of(It->second, [&](size_t index) {
+            return StorageActions[index] == New;
+          }))
+        return;
+      StorageActions.push_back(New);
+      It->second.push_back(StorageActions.size() - 1);
+    }
+    LRTable build() && {
+      LRTable Table;
+      Table.Actions.reserve(StorageActions.size());
+      Table.Index.reserve(StorageActions.size());
+      for (auto It : Index) {
+        for (auto Pos : It.second) {
+          Table.Actions.push_back(StorageActions[Pos]);
+          Table.Index.push_back(It.first);
+        }
+      }
+      assert(llvm::is_sorted(Table.Index));
+      return Table;
+    }
+
+  private:
+    std::vector<LRTable::Action> StorageActions;
+    // We want the iteration to be ordered.
+    std::map<LRTable::IndexKey, std::vector<size_t>> Index;
+  } Builder;
+
+  auto Automaton = LRAutomaton::build(G);
+  for (const auto &T : Automaton.edges())
+    Builder.addAction(T.From, T.Input,
+                      LRTable::Action{isToken(T.Input)
+                                          ? LRTable::Action::Kind::Shift
+                                          : LRTable::Action::Kind::GoTo,
+                                      {T.To}});
+
+  auto FollowSets = G.followSets();
+  for (StateID SID = 0; SID < Automaton.states().size(); ++SID) {
+    for (const Item &I : Automaton.states()[SID]) {
+      // If "_ := start_symbol ." in the State, set Action[State, eof] to
+      // accept.
+      if (G.lookupRule(I.ruleID()).target() == G.augmentedSymbol() &&
+          !I.hasNext()) {
+        Builder.addAction(SID, tokenSymbol(tok::eof),
+                          LRTable::Action{LRTable::Action::Kind::Accept, {}});
+        continue;
+      }
+      if (!I.hasNext()) {
+        // If a state i has an item "A := x.", set Action[i, a] to reduce
+        // "A := x" for all a in FOLLOW(A).
+        for (SymbolID Follow : FollowSets[I.rule(G).target()]) {
+          assert(isToken(Follow));
+          Builder.addAction(
+              SID, Follow,
+              LRTable::Action{LRTable::Action::Kind::Reduce, {I.ruleID()}});
+        }
+      }
+    }
+  }
+  return std::move(Builder).build();
+}
+
+} // namespace pseudo
+} // namespace syntax
+} // namespace clang
Index: clang/lib/Tooling/Syntax/Pseudo/Grammar.cpp
===================================================================
--- clang/lib/Tooling/Syntax/Pseudo/Grammar.cpp
+++ clang/lib/Tooling/Syntax/Pseudo/Grammar.cpp
@@ -7,6 +7,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "clang/Tooling/Syntax/Pseudo/Grammar.h"
+#include "clang/Basic/TokenKinds.h"
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/StringRef.h"
@@ -35,6 +36,87 @@
   return T->Rules[RID];
 }
 
+std::vector<llvm::DenseSet<SymbolID>> Grammar::firstSets() const {
+  std::vector<llvm::DenseSet<SymbolID>> FirstSets(T->Nonterminals.size());
+  auto ExpandFirstSet = [&FirstSets](SymbolID Target, SymbolID First) {
+    assert(isNonterminal(Target));
+    if (isToken(First))
+      return FirstSets[Target].insert(First).second;
+    bool Changed = false;
+    for (SymbolID SID : FirstSets[First])
+      Changed |= FirstSets[Target].insert(SID).second;
+    return Changed;
+  };
+
+  bool Changed = false;
+  do {
+    Changed = false;
+    for (const auto &R : T->Rules) {
+      assert(R.size() > 0);
+      Changed |= ExpandFirstSet(R.target(), R.seq().front());
+    }
+  } while (Changed);
+
+  return FirstSets;
+}
+
+std::vector<llvm::DenseSet<SymbolID>> Grammar::followSets() const {
+  auto FirstSets = firstSets();
+  std::vector<llvm::DenseSet<SymbolID>> FollowSets(T->Nonterminals.size());
+  // Expand the follow set of a non-terminal symbol Y by adding all from the
+  // given symbol set.
+  auto ExpandFollowSet = [&FollowSets](SymbolID Y,
+                                       const llvm::DenseSet<SymbolID> &ToAdd) {
+    assert(isNonterminal(Y));
+    bool Changed = false;
+    for (SymbolID F : ToAdd)
+      Changed |= FollowSets[Y].insert(F).second;
+    return Changed;
+  };
+  // Follow sets is computed based on the following 3 rules, the computation
+  // is completed at a fixed point where there is no more new symbols can be
+  // added to any of the follow sets.
+  // The algorithm doesn't consider nullable-nonterminal case, as we prohibit
+  // nullable-symbol in the grammar.
+  //
+  // Rule 1: add endmarker to the FOLLOW(S), where S is the start symbol.
+  FollowSets[augmentedSymbol()].insert(tokenSymbol(tok::eof));
+  bool Changed = false;
+  do {
+    Changed = false;
+    for (const auto &R : T->Rules) {
+      // Rule 2: for a rule X := ...Yz, we add all symbols from FIRST(z) to
+      // FOLLOW(Y).
+      for (size_t i = 0; i + 1 < R.seq().size(); ++i) {
+        if (isToken(R.seq()[i]))
+          continue;
+        SymbolID Next = R.seq()[i + 1];
+        // First set for a non-terminal is itself.
+        auto First =
+            isToken(Next) ? llvm::DenseSet<SymbolID>{Next} : FirstSets[Next];
+        Changed |= ExpandFollowSet(R.seq()[i], First);
+      }
+      SymbolID Z = R.seq().back();
+      if (!isNonterminal(Z))
+        continue;
+      // Rule 3: for a rule X := ....Z, we add all symbols from FOLLOW(X) to
+      // FOLLOW(Z).
+      Changed |= ExpandFollowSet(Z, FollowSets[R.target()]);
+    }
+  } while (Changed);
+  return FollowSets;
+}
+
+std::vector<SymbolID> Grammar::symbols() const {
+  std::vector<SymbolID> Syms;
+  for (uint16_t Terminal = 0; Terminal < NumTerminals; ++Terminal)
+    Syms.push_back(tokenSymbol(static_cast<tok::TokenKind>(Terminal)));
+  for (SymbolID Nonterminal = 0; Nonterminal < T->Nonterminals.size();
+       ++Nonterminal)
+    Syms.push_back(Nonterminal);
+  return Syms;
+}
+
 llvm::StringRef Grammar::symbolName(SymbolID SID) const {
   if (isToken(SID))
     return T->Terminals[symbolToToken(SID)];
Index: clang/lib/Tooling/Syntax/Pseudo/CMakeLists.txt
===================================================================
--- clang/lib/Tooling/Syntax/Pseudo/CMakeLists.txt
+++ clang/lib/Tooling/Syntax/Pseudo/CMakeLists.txt
@@ -3,6 +3,8 @@
 add_clang_library(clangSyntaxPseudo
   Grammar.cpp
   GrammarBNF.cpp
+  LRTable.cpp
+  LRBuilder.cpp
   
   LINK_LIBS
   clangBasic
Index: clang/include/clang/Tooling/Syntax/Pseudo/LRTable.h
===================================================================
--- /dev/null
+++ clang/include/clang/Tooling/Syntax/Pseudo/LRTable.h
@@ -0,0 +1,147 @@
+//===--- LRTable.h - Define LR Parsing Table ---------------------*- C++-*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CLANG_TOOLING_SYNTAX_PSEUDO_TABLE_H
+#define LLVM_CLANG_TOOLING_SYNTAX_PSEUDO_TABLE_H
+
+#include "clang/Tooling/Syntax/Pseudo/Grammar.h"
+#include "llvm/ADT/ArrayRef.h"
+#include <vector>
+
+namespace clang {
+namespace syntax {
+namespace pseudo {
+
+using StateID = uint16_t;
+
+// A dense (thus memory-efficient) implementation of the LR parsing table.
+//
+// The LR parsing table is used to drive LR parsing. It is being consulted by LR
+// parsers to make parsing decision. Conceptually, the parsing table answers the
+// question "Given a lookahead token and current state on top of the stack, what
+// is the next step?".
+//
+// The LR parsing table is constructed from a LR automaton, and it has two
+// kinds of entries:
+//   - Action(I, a) specifies the next action (shift/reduce/accept) on state I
+//     under a terminal a.
+//   - GoTo(I, X) specifies a transition from state I under a non-terminal X.
+//
+// Unlike a typical LR parsing table, it allows conflicted actions -- an Action
+// entry can have multiple available actions.
+class LRTable {
+public:
+  struct Action {
+    enum class Kind : uint8_t {
+      // Action table
+      Error = 0,
+      Shift,
+      Reduce,
+      Accept,
+      // GoTo table
+      GoTo,
+    };
+
+    bool isReduce() const { return K == Kind::Reduce; }
+    bool isShift() const { return K == Kind::Shift; }
+    bool isGoTo() const { return K == Kind::GoTo; }
+    bool isAccept() const { return K == Kind::Accept; }
+    StateID shift() const {
+      assert(isShift());
+      return Shift;
+    }
+    StateID goTo() const {
+      assert(isGoTo());
+      return GoToState;
+    }
+    RuleID reduce() const {
+      assert(isReduce());
+      return Rule;
+    }
+
+    bool operator==(const Action &L) const {
+      if (K != L.K)
+        return false;
+      switch (K) {
+      case Kind::Accept:
+      case Kind::Error:
+        return true;
+      case Kind::Shift:
+        return Shift == L.Shift;
+      case Kind::Reduce:
+        return Rule == L.Rule;
+      case Kind::GoTo:
+        return GoToState == L.GoToState;
+      }
+      llvm_unreachable("Unhandled Action::Kind");
+    }
+
+    Kind K = Kind::Error;
+    // Value
+    union {
+      StateID Shift;     // For shift action
+      RuleID Rule;       // For reduce action
+      StateID GoToState; // For go-to
+    };
+  };
+
+  // Returns all available actions for the state From on a terminal.
+  llvm::ArrayRef<Action> actions(StateID From, SymbolID Token) const {
+    assert(pseudo::isToken(Token));
+    auto Actions = find({From, Token});
+    assert(Actions.size() > 0);
+    return Actions;
+  }
+
+  // Returns the transisted state for the state From on a nonterminal.
+  StateID goTo(StateID From, SymbolID NonTerminal) const {
+    assert(pseudo::isNonterminal(NonTerminal));
+    auto Goto = find({From, NonTerminal});
+    assert(Goto.size() == 1 && Goto.front().isGoTo());
+    return Goto.front().goTo();
+  }
+
+  size_t bytes() const {
+    return sizeof(*this) + Actions.capacity() * sizeof(Action) +
+           Index.capacity() * sizeof(IndexKey);
+  }
+
+  std::string dumpStatistics() const;
+  std::string dumpForTests(const Grammar &G) const;
+
+  // Build a SLR(1) parsing table, see the Dragron book.
+  static LRTable build(const Grammar &G);
+
+private:
+  struct IndexKey {
+    StateID From;
+    SymbolID Next;
+    friend bool operator==(const IndexKey &L, const IndexKey &R) {
+      return std::tie(L.From, L.Next) == std::tie(R.From, R.Next);
+    }
+    friend bool operator<(const IndexKey &L, const IndexKey &R) {
+      return std::tie(L.From, L.Next) < std::tie(R.From, R.Next);
+    }
+  };
+  static_assert(sizeof(IndexKey) <= 4, "expected to be <= 4 bytes");
+
+  // Looks up available actions for the given Key.
+  // Returns empty if no available actions in the table.
+  llvm::ArrayRef<Action> find(IndexKey Key) const;
+
+  // A sorted table of lookup keys, lookup results are stored in Actions.
+  std::vector<IndexKey> Index; // Sorted
+  // A flat table of available actions, in parallel with Index.
+  std::vector<Action> Actions;
+};
+
+} // namespace pseudo
+} // namespace syntax
+} // namespace clang
+
+#endif // LLVM_CLANG_TOOLING_SYNTAX_PSEUDO_TABLE_H
Index: clang/include/clang/Tooling/Syntax/Pseudo/LRAutomaton.h
===================================================================
--- /dev/null
+++ clang/include/clang/Tooling/Syntax/Pseudo/LRAutomaton.h
@@ -0,0 +1,119 @@
+//===--- LRAutomaton.h - Build an LR automaton  ------------------*- C++-*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CLANG_TOOLING_SYNTAX_PSEUDO_LRAUTOMATON_H
+#define LLVM_CLANG_TOOLING_SYNTAX_PSEUDO_LRAUTOMATON_H
+
+#include "clang/Tooling/Syntax/Pseudo/Grammar.h"
+#include "clang/Tooling/Syntax/Pseudo/LRTable.h"
+#include "llvm/ADT/Hashing.h"
+#include <set>
+
+namespace clang {
+namespace syntax {
+namespace pseudo {
+
+// An LR item -- a grammar rule with a dot at some position of the body.
+// e.g. a production rule A := xy yields 3 items:
+//   A := .xy
+//   A := x.y
+//   A := xy.
+// An item indicates how much of a production rule has been recognized at a
+// position (described by dot), for example, A := x.y indicates that we have
+// recognized the x part from the input, and we hope next to see the input
+// derivable from y.
+class Item {
+public:
+  Item(RuleID ID, uint8_t RuleLength, uint8_t DotPos)
+      : RID(ID), RuleLength(RuleLength), DotPos(DotPos) {}
+
+  bool hasNext() const { return DotPos < RuleLength; }
+  SymbolID next(const Grammar &G) const {
+    assert(hasNext());
+    return rule(G).Sequence[DotPos];
+  }
+
+  Item advance() const {
+    assert(hasNext());
+    return Item(RID, RuleLength, DotPos + 1);
+  }
+  const pseudo::Rule &rule(const Grammar &G) const { return G.lookupRule(RID); }
+  RuleID ruleID() const { return RID; }
+  int dotPos() const { return DotPos; }
+  std::string dump(const Grammar &G) const;
+
+  bool operator==(const Item &I) const {
+    return DotPos == I.DotPos && RID == I.RID;
+  }
+  bool operator<(const Item &I) const {
+    return std::tie(RID, DotPos) < std::tie(I.RID, I.DotPos);
+  }
+  friend llvm::hash_code hash_value(const Item &I) {
+    return llvm::hash_combine(I.RID, I.DotPos);
+  }
+
+private:
+  RuleID RID;
+  uint8_t RuleLength; // the length of rule body.
+  uint8_t DotPos;
+};
+
+using ItemSet = std::set<Item>;
+// A state of the LR automaton is an item set, which contains all possible rules
+// that the LR parser may be parsing in that state.
+//
+// Conceptually, If we knew in advance what we're parsing, at any point we're
+// partway through parsing a production, sitting on a stack of partially parsed
+// productions. But because we don't know, there could be *several* productions
+// we're partway through. The set of possibilities is the parser state, and we
+// precompute all the transitions between these states.
+using State = ItemSet;
+llvm::hash_code hash_code(const State &S);
+std::string dump(const State &S, const Grammar &G);
+
+// A deterministic finite state automaton for LR parsing.
+//
+// Intuitively, an LR automaton is a transition graph. The graph has a
+// collection of nodes, called States. Each state corresponds to a particular
+// item set, which represents a condition that could occur duing the process of
+// parsing a production. Edges are directed from one state to another. Each edge
+// is labeled by a grammar symbol (terminal or non-terminal).
+//
+// The LR automaton is used to construct the LR parsing table (See comments in
+// LRTable) which is used to drive the LR parser.
+class LRAutomaton {
+public:
+  // Constructs an LR(0) automaton.
+  static LRAutomaton build(const Grammar &);
+
+  // Edge represents a transition in the LR automaton.
+  // If the parser is at state From, with a lookahead symbol Input, then it
+  // transit to state To.
+  struct Edge {
+    StateID From, To;
+    SymbolID Input;
+  };
+
+  llvm::ArrayRef<State> states() const { return States; }
+  llvm::ArrayRef<Edge> edges() const { return Edges; }
+
+  std::string dumpForTests(const Grammar &) const;
+
+private:
+  LRAutomaton(std::vector<State> States, std::vector<Edge> Edges)
+      : States(std::move(States)), Edges(std::move(Edges)) {}
+
+  std::vector<State> States;
+  std::vector<Edge> Edges;
+};
+
+} // namespace pseudo
+} // namespace syntax
+} // namespace clang
+
+#endif // LLVM_CLANG_TOOLING_SYNTAX_PSEUDO_LRAUTOMATON_H
Index: clang/include/clang/Tooling/Syntax/Pseudo/Grammar.h
===================================================================
--- clang/include/clang/Tooling/Syntax/Pseudo/Grammar.h
+++ clang/include/clang/Tooling/Syntax/Pseudo/Grammar.h
@@ -38,6 +38,7 @@
 
 #include "clang/Basic/TokenKinds.h"
 #include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/DenseSet.h"
 #include "llvm/ADT/StringRef.h"
 #include <cstdint>
 #include <vector>
@@ -123,6 +124,17 @@
   llvm::ArrayRef<Rule> lookupRules(SymbolID SID) const;
   const Rule &lookupRule(RuleID RID) const;
 
+  // Returns all symbols.
+  std::vector<SymbolID> symbols() const;
+
+  // Returns the SymbolID of the augmented symbol '_'
+  SymbolID augmentedSymbol() const { return 0; }
+
+  // Computes the FIRST(X) for all non-terminal symbol X.
+  std::vector<llvm::DenseSet<SymbolID>> firstSets() const;
+  // Computes the FOLLOW(X) for all non-terminal symbol X.
+  std::vector<llvm::DenseSet<SymbolID>> followSets() const;
+
   // Get symbol (terminal or non-terminal) name.
   // Terminals have names like "," (kw_comma) or "OPERATOR" (kw_operator).
   llvm::StringRef symbolName(SymbolID) const;
@@ -153,7 +165,7 @@
   // The rules are sorted (and thus grouped) by target symbol.
   // RuleID is the index of the vector.
   std::vector<Rule> Rules;
-  // A table of terminals (aka tokens). It correspond to the clang::Token.
+  // A table of terminals (aka tokens). It corresponds to the clang::Token.
   // clang::tok::TokenKind is the index of the table.
   std::vector<std::string> Terminals;
   // A table of nonterminals, sorted by name.

_______________________________________________
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

[PATCH] D118196: [syntax][pseudo] Implement LR parsing table.

Reply via email to