From 38aa15e180a166e3222cda72b42493949bd6a03d Mon Sep 17 00:00:00 2001
From: Adrian Vogelsgesang <avogelsgesang@tableau.com>
Date: Thu, 3 Jan 2019 02:00:58 +0100
Subject: [PATCH] lalr1.cc: LAC support

This commit implements lookahead correction (LAC) for the C++ skeleton.
LAC is a mechanism to make sure that we report the correct list of
expected tokens if a syntax error occurs.
So far, LAC was only supported for the C skeleton "yacc.c".

This commit:
* makes the yacc.c a bit more readable by duplicating the
  declaration of `int yyx;`
* adds LAC to the lalr1.cc skeleton
* updates the docs
---
 data/skeletons/lalr1.cc | 233 ++++++++++++++++++++++++++++++++++++----
 data/skeletons/yacc.c   |  19 ++--
 doc/bison.texi          |   2 +-
 3 files changed, 222 insertions(+), 32 deletions(-)

diff --git a/data/skeletons/lalr1.cc b/data/skeletons/lalr1.cc
index f76e3340..0b9dd726 100644
--- a/data/skeletons/lalr1.cc
+++ b/data/skeletons/lalr1.cc
@@ -20,6 +20,14 @@ m4_include(b4_skeletonsdir/[c++.m4])
 # api.value.type=variant is valid.
 m4_define([b4_value_type_setup_variant])
 
+# Check the value of %define parse.lac, where LAC stands for lookahead
+# correction.
+b4_percent_define_default([[parse.lac]], [[none]])
+b4_define_flag_if([lac])
+m4_define([b4_lac_flag],
+          [m4_if(b4_percent_define_get([[parse.lac]]),
+                 [none], [[0]], [[1]])])
+
 
 # b4_integral_parser_table_declare(TABLE-NAME, CONTENT, COMMENT)
 # --------------------------------------------------------------
@@ -220,7 +228,18 @@ m4_define([b4_shared_declarations],
   private:
     /// This class is not copyable.
     ]b4_parser_class[ (const ]b4_parser_class[&);
-    ]b4_parser_class[& operator= (const ]b4_parser_class[&);
+    ]b4_parser_class[& operator= (const ]b4_parser_class[&);]b4_lac_if([[
+
+    /// Check the lookahead `token`
+    /// \returns  true iff the token will be eventually shifted
+    bool yy_lac_check_ (token_number_type token) const;
+    /// Establish the initial context if no initial context currently exists.
+    /// \returns  true if a context already exists or the token will eventually be shifted
+    bool yy_lac_establish_ (token_number_type token);
+    /// Discard any previous initial lookahead context because of event
+    /// \param event  the event which caused the lookahead to be discarded.
+    ///               Only used for debugging output.
+    void yy_lac_discard_ (const char* event);]])[
 
     /// State numbers.
     typedef int state_type;
@@ -234,7 +253,7 @@ m4_define([b4_shared_declarations],
     /// Compute post-reduction state.
     /// \param yystate   the current state
     /// \param yysym     the nonterminal to push on the stack
-    state_type yy_lr_goto_state_ (state_type yystate, int yysym);
+    static state_type yy_lr_goto_state_ (state_type yystate, int yysym);
 
     /// Whether the given \c yypact_ value indicates a defaulted state.
     /// \param yyvalue   the value to check
@@ -344,7 +363,16 @@ m4_define([b4_shared_declarations],
     typedef stack<stack_symbol_type> stack_type;
 
     /// The stack.
-    stack_type yystack_;
+    stack_type yystack_;]b4_lac_if([[
+    /// The stack for LAC.
+    /// Logically, the lifetime of yylac_stack_ is confined to the function yy_lac_check_.
+    /// We just store it as a member of this class to hold on to the memory
+    /// and to avoid frequent reallocations. Since yy_lac_check_ is const, this member
+    /// must be mutable.
+    mutable stack<by_state> yylac_stack_;
+    /// Was an initial LAC context established?
+    bool yy_lac_established_;
+    ]],[])[
 
     /// Push a new state on the stack.
     /// \param m    a debug message to display
@@ -774,7 +802,11 @@ m4_if(b4_prefix, [yy], [],
     stack_symbol_type yyerror_range[3];]])[
 
     /// The return value of parse ().
-    int yyresult;
+    int yyresult;]b4_lac_if([[
+
+    /// Discard the LAC context in case there still is one
+    /// left from a previous invocation.
+    yy_lac_discard_ ("init");]])[
 
 #if YY_EXCEPTIONS
     try
@@ -843,14 +875,21 @@ b4_dollar_popdef])[]dnl
        to detect an error, take that action.  */
     yyn += yyla.type_get ();
     if (yyn < 0 || yylast_ < yyn || yycheck_[yyn] != yyla.type_get ())
-      goto yydefault;
+      {]b4_lac_if([[
+        if (!yy_lac_establish_ (yyla.type_get ()))
+           goto yyerrlab;]])[
+        goto yydefault;
+      }
 
     // Reduce or error.
     yyn = yytable_[yyn];
     if (yyn <= 0)
       {
         if (yy_table_value_is_error_ (yyn))
-          goto yyerrlab;
+          goto yyerrlab;]b4_lac_if([[
+        if (!yy_lac_establish_ (yyla.type_get ()))
+           goto yyerrlab;
+]])[
         yyn = -yyn;
         goto yyreduce;
       }
@@ -860,7 +899,8 @@ b4_dollar_popdef])[]dnl
       --yyerrstatus_;
 
     // Shift the lookahead token.
-    yypush_ ("Shifting", yyn, YY_MOVE (yyla));
+    yypush_ ("Shifting", yyn, YY_MOVE (yyla));]b4_lac_if([[
+    yy_lac_discard_ ("shift");]])[
     goto yynewstate;
 
 
@@ -1020,7 +1060,8 @@ b4_dollar_popdef])[]dnl
       yyerror_range[2].location = yyla.location;
       YYLLOC_DEFAULT (error_token.location, yyerror_range, 2);]])[
 
-      // Shift the error token.
+      // Shift the error token.]b4_lac_if([[
+      yy_lac_discard_ ("error recovery");]])[
       error_token.state = yyn;
       yypush_ ("Shifting", YY_MOVE (error_token));
     }
@@ -1085,8 +1126,141 @@ b4_dollar_popdef])[]dnl
   {
     error (]b4_join(b4_locations_if([yyexc.location]),
                     [[yyexc.what ()]])[);
+  }]b4_lac_if([[
+
+  // Check whether the lookahead `token` can eventually be shifted.
+  bool
+  ]b4_parser_class[::yy_lac_check_ (token_number_type token) const
+  {
+    // Logically, the lifetime of yylac_stack_ is confined to this function.
+    // Clear it to get rid of potential leftovers from a previous call.
+    yylac_stack_.clear();
+    // Simulate reductions until the token is shifted (accept) or an error is hit.
+#if ]b4_api_PREFIX[DEBUG
+    YYCDEBUG << "LAC: checking lookahead " << yytname_[token] << ':';
+#endif
+    int lac_top = 0;
+    while (1)
+      {
+        state_type top_state = yylac_stack_.size() ? yylac_stack_[0].state : yystack_[lac_top].state;
+        int yyrule = yypact_[top_state];
+        if (yy_pact_value_is_default_ (yyrule)
+            || (yyrule += token) < 0 || yylast_ < yyrule
+            || yycheck_[yyrule] != token)
+          {
+            // No explicit action for the token: use the default reduction, if any.
+            yyrule = yydefact_[top_state];
+            if (yyrule == 0)
+              {
+                YYCDEBUG << " Err\n";
+                return false;
+              }
+          }
+        else
+          {
+            // Use the action from yytable_: error, shift (positive) or reduce (negative).
+            yyrule = yytable_[yyrule];
+            if (yy_table_value_is_error_ (yyrule))
+              {
+                YYCDEBUG << " Err\n";
+                return false;
+              }
+            if (0 < yyrule)
+              {
+                YYCDEBUG << " S" << yyrule << '\n';
+                return true;
+              }
+            yyrule = -yyrule;
+          }
+        // Neither error nor shift: simulate the reduction by rule yyrule.
+        YYCDEBUG << " R" << yyrule - 1;
+        // Pop yylen entries, as given by yyr2_ for this rule, from the simulated stack.
+        {
+          unsigned yylen = yyr2_[yyrule];
+          // First pop as many entries as possible from the LAC stack.
+          size_t lac_size = yylac_stack_.size();
+          if (yylen < lac_size)
+            {
+              yylac_stack_.pop(yylen);
+              yylen = 0;
+            }
+          else if (lac_size)
+            {
+              yylac_stack_.clear();
+              yylen -= lac_size;
+            }
+          // Only afterwards look at the main stack, which must stay unmodified:
+          // popping its elements is simulated by incrementing lac_top.
+          lac_top += yylen;
+        }
+        // Recompute top_state now that the simulated stack has changed.
+        top_state = yylac_stack_.size() ? yylac_stack_[0].state : yystack_[lac_top].state;
+        // Push the post-reduction state computed by yy_lr_goto_state_ on the LAC stack.
+        state_type state = yy_lr_goto_state_ (top_state, yyr1_[yyrule]);
+        YYCDEBUG << " G" << state;
+        yylac_stack_.push(by_state(state));
+      }
+  }
+
+  // Establish the initial LAC context for `token` unless one already exists.
+  bool
+  ]b4_parser_class[::yy_lac_establish_ (token_number_type token)
+  {
+    /* Establish the initial context for the current lookahead if no initial
+       context is currently established.
+
+       We define a context as a snapshot of the parser stacks.  We define
+       the initial context for a lookahead as the context in which the
+       parser initially examines that lookahead in order to select a
+       syntactic action.  Thus, if the lookahead eventually proves
+       syntactically unacceptable (possibly in a later context reached via a
+       series of reductions), the initial context can be used to determine
+       the exact set of tokens that would be syntactically acceptable in the
+       lookahead's place.  Moreover, it is the context after which any
+       further semantic actions would be erroneous because they would be
+       determined by a syntactically unacceptable token.
+
+       yy_lac_establish_ should be invoked when a reduction is about to be
+       performed in an inconsistent state (which, for the purposes of LAC,
+       includes consistent states that don't know they're consistent because
+       their default reductions have been disabled).
+
+       For parse.lac=full: if no initial context is established yet, mark
+       it established and return whether yy_lac_check_ finds that the
+       lookahead can eventually be shifted from the current context.  If a
+       context is already established, return true without rechecking.  */
+    if(!yy_lac_established_)
+      {
+#if ]b4_api_PREFIX[DEBUG
+        YYCDEBUG << "LAC: initial context established for " << yytname_[token] << '\n';
+#endif
+        yy_lac_established_ = true;
+        return yy_lac_check_(token);
+      }
+    return true;
   }
 
+  // Discard any previously established initial lookahead context.
+  void
+  ]b4_parser_class[::yy_lac_discard_ (const char* evt)
+  {
+   /* Discard any previous initial lookahead context because of evt (only
+      used for debug output), which may be a lookahead change or an invalidation
+      of the currently established initial context for the current lookahead.
+
+      The most common example of a lookahead change is a shift.  An example
+      of both cases is syntax error recovery.  That is, a syntax error
+      occurs when the lookahead is syntactically erroneous for the
+      currently established initial context, so error recovery manipulates
+      the parser stacks to try to find a new initial context in which the
+      current lookahead is syntactically acceptable.  If it fails to find
+      such a context, it discards the lookahead.  */
+    if (yy_lac_established_) {
+      YYCDEBUG << "LAC: initial context discarded due to " << evt << '\n';
+      yy_lac_established_ = false;
+    }
+  }]])[
+
   // Generate an error message.
   std::string
   ]b4_parser_class[::yysyntax_error_ (]dnl
@@ -1115,35 +1289,50 @@ b4_error_verbose_if([state_type yystate, const symbol_type& yyla],
          a consistent state with a default action.  There might have
          been a previous inconsistent state, consistent state with a
          non-default action, or user semantic action that manipulated
-         yyla.  (However, yyla is currently not documented for users.)
+         yyla.  (However, yyla is currently not documented for users.)]b4_lac_if([[
+         In the first two cases, it might appear that the current syntax
+         error should have been detected in the previous state when
+         yy_lac_check_ was invoked.  However, at that time, there might
+         have been a different syntax error that discarded a different
+         initial context during error recovery, leaving behind the
+         current lookahead.]], [[
        - Of course, the expected token list depends on states to have
          correct lookahead information, and it depends on the parser not
          to perform extra reductions after fetching a lookahead from the
-         scanner and before detecting a syntax error.  Thus, state
-         merging (from LALR or IELR) and default reductions corrupt the
-         expected token list.  However, the list is correct for
-         canonical LR with one exception: it will still contain any
-         token that will not be accepted due to an error action in a
-         later state.
+         scanner and before detecting a syntax error.  Thus, state merging
+         (from LALR or IELR) and default reductions corrupt the expected
+         token list.  However, the list is correct for canonical LR with
+         one exception: it will still contain any token that will not be
+         accepted due to an error action in a later state.]])[
     */
     if (!yyla.empty ())
       {
         int yytoken = yyla.type_get ();
-        yyarg[yycount++] = yytname_[yytoken];
+        yyarg[yycount++] = yytname_[yytoken];]b4_lac_if([[
+
+#if ]b4_api_PREFIX[DEBUG
+        // Execute LAC once. We don't care if it is successful, we
+        // only do it for the sake of debugging output.
+        if (!yy_lac_established_) yy_lac_check_(yytoken);
+#endif]])[
+
         int yyn = yypact_[yystate];
         if (!yy_pact_value_is_default_ (yyn))
-          {
+          {]b4_lac_if([[
+            for (token_number_type yyx = 0; yyx < yyntokens_; ++yyx)
+              if (yyx != yyterror_ && yy_lac_check_(yyx))
+                {]], [[
             /* Start YYX at -YYN if negative to avoid negative indexes in
                YYCHECK.  In other words, skip the first -YYN actions for
                this state because they are default actions.  */
-            int yyxbegin = yyn < 0 ? -yyn : 0;
+            token_number_type yyxbegin = yyn < 0 ? -yyn : 0;
             // Stay within bounds of both yycheck and yytname.
-            int yychecklim = yylast_ - yyn + 1;
-            int yyxend = yychecklim < yyntokens_ ? yychecklim : yyntokens_;
-            for (int yyx = yyxbegin; yyx < yyxend; ++yyx)
+            token_number_type yychecklim = yylast_ - yyn + 1;
+            token_number_type yyxend = yychecklim < yyntokens_ ? yychecklim : yyntokens_;
+            for (token_number_type yyx = yyxbegin; yyx < yyxend; ++yyx)
               if (yycheck_[yyx + yyn] == yyx && yyx != yyterror_
                   && !yy_table_value_is_error_ (yytable_[yyx + yyn]))
-                {
+                {]])[
                   if (yycount == YYERROR_VERBOSE_ARGS_MAXIMUM)
                     {
                       yycount = 1;
diff --git a/data/skeletons/yacc.c b/data/skeletons/yacc.c
index e9c31a84..54dc6ea1 100644
--- a/data/skeletons/yacc.c
+++ b/data/skeletons/yacc.c
@@ -1188,15 +1188,8 @@ yysyntax_error (YYSIZE_T *yymsg_alloc, char **yymsg,
       YYDPRINTF ((stderr, "Constructing syntax error message\n"));]])[
       yyarg[yycount++] = yytname[yytoken];
       if (!yypact_value_is_default (yyn))
-        {]b4_lac_if([], [[
-          /* Start YYX at -YYN if negative to avoid negative indexes in
-             YYCHECK.  In other words, skip the first -YYN actions for
-             this state because they are default actions.  */
-          int yyxbegin = yyn < 0 ? -yyn : 0;
-          /* Stay within bounds of both yycheck and yytname.  */
-          int yychecklim = YYLAST - yyn + 1;
-          int yyxend = yychecklim < YYNTOKENS ? yychecklim : YYNTOKENS;]])[
-          int yyx;]b4_lac_if([[
+        {]b4_lac_if([[
+          int yyx;
 
           for (yyx = 0; yyx < YYNTOKENS; ++yyx)
             if (yyx != YYTERROR && yyx != YYUNDEFTOK)
@@ -1209,6 +1202,14 @@ yysyntax_error (YYSIZE_T *yymsg_alloc, char **yymsg,
                   if (yy_lac_status == 1)
                     continue;
                 }]], [[
+          /* Start YYX at -YYN if negative to avoid negative indexes in
+             YYCHECK.  In other words, skip the first -YYN actions for
+             this state because they are default actions.  */
+          int yyxbegin = yyn < 0 ? -yyn : 0;
+          /* Stay within bounds of both yycheck and yytname.  */
+          int yychecklim = YYLAST - yyn + 1;
+          int yyxend = yychecklim < YYNTOKENS ? yychecklim : YYNTOKENS;
+          int yyx;
 
           for (yyx = yyxbegin; yyx < yyxend; ++yyx)
             if (yycheck[yyx + yyn] == yyx && yyx != YYTERROR
diff --git a/doc/bison.texi b/doc/bison.texi
index c0b14383..c939b434 100644
--- a/doc/bison.texi
+++ b/doc/bison.texi
@@ -8697,7 +8697,7 @@ Enable LAC to improve syntax error handling.
 @item @code{none} (default)
 @item @code{full}
 @end itemize
-This feature is currently only available for deterministic parsers in C.
+This feature is currently only available for deterministic parsers in C and C++.
 @end deffn
 
 Conceptually, the LAC mechanism is straight-forward.  Whenever the parser
-- 
2.18.1

