[PATCH branch-1.6] eval: Reject incomplete base prefix

Eric Blake via M4-patches Wed, 28 May 2025 18:40:19 -0700

POSIX is clear that strtol("0x") is an error (there must be digits
after the explicit base); and other m4 implementations that rely on
strtol for eval have rejected "eval(0x)".  Although it can be argued
that rejecting something that used to silently work as 0 might break
existing scripts, we had never documented it as a feature and the risk
is low.


Meanwhile, parsing "eval(08)" as "0" followed by "8" always produces a
semantic error, since 8 was found when an operator was expected; but
the error message is nicer if a digit too large for the base triggers
the same new error as for an incomplete base ["invalid number"] rather
than the old way ["bad expression in eval (excess input)"].

* src/eval.c (MIN_PREC): New define to avoid a magic number to
parse_expr.
(BADNUM, INVALID_NUMBER): New enum values.  Shuffle values on
other eval_tokens to keep math consistent.
(eval_lex): Return BADNUM on failed numeric parse.
(primary, parse_expr, evaluate): Handle invalid numbers.
* doc/m4.texi (Eval): Test it.
* NEWS: Document the bug fix.
---

This patch is worth back-porting to m4 1.4.x.

For reference, BSD m4 and m4p both reject "eval(0x)".  Checking other
languages, bash rejects $((0b)) but accidentally accepts $((0x)).
gawk silently accepts 0x, but hey, it silently accepts garbage like 0q
too.  C, Python, Go, Rust, and many other languages reject
bare 0x.

 NEWS        |  4 +++
 doc/m4.texi |  8 +++++-
 src/eval.c  | 75 +++++++++++++++++++++++++++++++++++------------------
 3 files changed, 61 insertions(+), 26 deletions(-)

diff --git a/NEWS b/NEWS
index f25dac3d..960e42f7 100644
--- a/NEWS
+++ b/NEWS
@@ -122,6 +122,10 @@ GNU M4 NEWS - User visible changes.
    context of a macro name, rather than acting on the empty string.  This
    was already done for `define', `pushdef', `builtin', and `indir'.

+** Fix the `eval' builtin to reject input like `0x' that C does not
+   accept as an integer literal, rather than silently treating it as zero
+   (present since "the beginning").
+
 ** Enhance the `eval' builtin to understand the `?:' and `>>>' operators,
    and downgrade a failed parse due to an unknown operator from an error to
    a warning (the same as for all other syntax errors).  Further, the
diff --git a/doc/m4.texi b/doc/m4.texi
index 01b0a915..2bd2d042 100644
--- a/doc/m4.texi
+++ b/doc/m4.texi
@@ -7341,6 +7341,12 @@ Eval
 eval(`1+')
 @error{}m4:stdin:10: warning: eval: missing operand: '1+'
 @result{}
+eval(`0x')
+@error{}m4:stdin:11: warning: eval: invalid number: '0x'
+@result{}
+eval(`01239')
+@error{}m4:stdin:12: warning: eval: invalid number: '01239'
+@result{}
 define(`foo', `666')
 @result{}
 eval(0r36:foo)
@@ -7350,7 +7356,7 @@ Eval
 eval(`0R36:'defn(`foo'))
 @result{}7998
 eval(`foo / 6')
-@error{}m4:stdin:15: warning: eval: bad input: 'foo / 6'
+@error{}m4:stdin:17: warning: eval: bad input: 'foo / 6'
 @result{}
 eval(foo / 6)
 @result{}111
diff --git a/src/eval.c b/src/eval.c
index 236b34ca..18cae9e2 100644
--- a/src/eval.c
+++ b/src/eval.c
@@ -28,10 +28,13 @@

 /* Evaluates token types.  */

+#define MIN_PREC 2
+
 typedef enum eval_token
 {
-  /* Value / 10 is precedence order.  */
+  /* Value / 10 is precedence order, when Value / 10 >= MIN_PREC.  */
   ERROR = 0,
+  BADNUM,
   BADOP,
   EMPTY,
   EOTEXT,
@@ -41,28 +44,28 @@ typedef enum eval_token
   NOT,
   NUMBER,
   COLON,
-  QUESTION = 10,
-  LOR = 20,
-  LAND = 30,
-  OR = 40,
-  XOR = 50,
-  AND = 60,
-  EQ = 70,
+  QUESTION = 20,
+  LOR = 30,
+  LAND = 40,
+  OR = 50,
+  XOR = 60,
+  AND = 70,
+  EQ = 80,
   NOTEQ,
-  GT = 80,
+  GT = 90,
   GTEQ,
   LS,
   LSEQ,
-  LSHIFT = 90,
+  LSHIFT = 100,
   RSHIFT,
   URSHIFT,
   /* precedence given for binary op; PLUS and MINUS also serve as a unary op */
-  PLUS = 100,
+  PLUS = 110,
   MINUS,
-  TIMES = 110,
+  TIMES = 120,
   DIVIDE,
   MODULO,
-  EXPONENT = 120
+  EXPONENT = 130
 }
 eval_token;

@@ -82,6 +85,7 @@ typedef enum eval_error
   MISSING_COLON,
   UNKNOWN_INPUT,
   EXCESS_INPUT,
+  INVALID_NUMBER,
   INVALID_OPERATOR,
   MISSING_VALUE,
   EMPTY_ARGUMENT
@@ -141,6 +145,7 @@ eval_lex (int32_t *val)
          Therefore use an unsigned integer type to avoid undefined behaviour
          when parsing '-2147483648'.  */
       uint32_t value;
+      bool seen_digit = false;

       if (*eval_text == '0')
         {
@@ -166,19 +171,20 @@ eval_lex (int32_t *val)
               while (c_isdigit (*eval_text) && base <= 36)
                 base = 10 * base + *eval_text++ - '0';
               if (base == 0 || base > 36 || *eval_text != ':')
-                return ERROR;
+                return BADNUM;
               eval_text++;
               break;

             default:
               base = 8;
+              seen_digit = true;
             }
         }
       else
         base = 10;

       value = 0;
-      for (; *eval_text; eval_text++)
+      for (; *eval_text; eval_text++, seen_digit = true)
         {
           if (c_isdigit (*eval_text))
             digit = *eval_text - '0';
@@ -196,14 +202,16 @@ eval_lex (int32_t *val)
               else if (digit == 0 && value == 0)
                 continue;
               else
-                break;
+                return BADNUM;
             }
           else if (digit >= base)
-            break;
+            return BADNUM;
           else
             value = value * base + digit;
         }
       *val = value;
+      if (!seen_digit)
+        return BADNUM;
       return NUMBER;
     }

@@ -336,13 +344,15 @@ primary (int32_t *v1)
       /* Parenthesis */
     case LEFTP:
       er = primary (v1);
-      er = parse_expr (v1, er, 1);
+      er = parse_expr (v1, er, MIN_PREC);
       if (er >= SYNTAX_ERROR)
         return er;
       switch (eval_lex (&v2))
         {
         case ERROR:
           return UNKNOWN_INPUT;
+        case BADNUM:
+          return INVALID_NUMBER;
         case BADOP:
           return INVALID_OPERATOR;
         case RIGHTP:
@@ -374,6 +384,8 @@ primary (int32_t *v1)
       /* Anything else */
     case ERROR:
       return UNKNOWN_INPUT;
+    case BADNUM:
+      return INVALID_NUMBER;
     case BADOP:
       return INVALID_OPERATOR;
     case EMPTY:
@@ -550,14 +562,16 @@ parse_expr (int32_t *v1, eval_error er, unsigned min_prec)
           break;

         case QUESTION:
-          if (et2 == BADOP)
+          if (et2 == BADNUM)
+            er = INVALID_NUMBER;
+          else if (et2 == BADOP)
             er = INVALID_OPERATOR;
           else if (et2 != COLON)
             er = MISSING_COLON;
           else
             {
               er3 = primary (&v3);
-              er3 = parse_expr (&v3, er3, 1);
+              er3 = parse_expr (&v3, er3, MIN_PREC);
               if (er3 >= SYNTAX_ERROR)
                 return er3;
               if (*v1)
@@ -591,14 +605,21 @@ evaluate (const call_info *me, const char *expr, size_t len, int32_t *val)

   eval_init_lex (expr, len);
   err = primary (val);
-  err = parse_expr (val, err, 1);
+  err = parse_expr (val, err, MIN_PREC);

   if (err == NO_ERROR && eval_text != end_text)
     {
-      if (eval_lex (val) == BADOP)
-        err = INVALID_OPERATOR;
-      else
-        err = EXCESS_INPUT;
+      switch (eval_lex (val))
+        {
+        case BADNUM:
+          err = INVALID_NUMBER;
+          break;
+        case BADOP:
+          err = INVALID_OPERATOR;
+          break;
+        default:
+          err = EXCESS_INPUT;
+        }
     }

   if (err != NO_ERROR)
@@ -634,6 +655,10 @@ evaluate (const call_info *me, const char *expr, size_t len, int32_t *val)
       m4_warn (0, me, _("excess input: %s"), expr);
       break;

+    case INVALID_NUMBER:
+      m4_warn (0, me, _("invalid number: %s"), expr);
+      break;
+
     case INVALID_OPERATOR:
       m4_warn (0, me, _("invalid operator: %s"), expr);
       break;
-- 
2.49.0


_______________________________________________
M4-patches mailing list
M4-patches@gnu.org
https://lists.gnu.org/mailman/listinfo/m4-patches

[PATCH branch-1.6] eval: Reject incomplete base prefix

Reply via email to