When dealing with tokens, we often know the length of the token from
the time it was parsed or created by expansion; save this information
alongside the token instead of calling strlen() everywhere to re-learn
it, for a slight optimization.  The placement of the new member in
struct token_data is intentional to avoid changing the size of the
struct on 64-bit machines, even though the length is only meaningful for
text tokens.  The code already has a number of places that assume a maximum
token length bounded by int; scrubbing that to allow a full size_t
would be a larger patch.

* src/m4.h (token_data): Add len member.
(TOKEN_DATA_LEN, SYMBOL_TEXT_LEN): New macros.
* src/input.c (next_token): Remember size.
* src/macro.c (expand_argument, collect_arguments): Likewise.
* src/builtin.c (ARGLEN): New macro.
(define_user_macro): Set length, and warn user on oversize content.
(builtin_init): Use xasprintf to build the prefixed name.
(m4_eval): Use compile-time bound for radix.
(dump_args, m4_ifdef, m4_ifelse, m4_builtin, m4_indir, m4_defn)
(m4_maketemp, m4_mkstemp, m4_m4wrap, m4_len, m4_substr, m4_translit)
(m4_regexp, m4_patsubst, expand_user_macro): Utilize known size.
---
 src/builtin.c | 84 ++++++++++++++++++++++++++++-----------------------
 src/input.c   |  1 +
 src/m4.h      |  7 +++++
 src/macro.c   |  5 +++
 4 files changed, 59 insertions(+), 38 deletions(-)

diff --git a/src/builtin.c b/src/builtin.c
index 3447dc33..69eb2a8c 100644
--- a/src/builtin.c
+++ b/src/builtin.c
@@ -32,6 +32,7 @@
 #include "wait-process.h"

 #define ARG(i) (argc > (i) ? TOKEN_DATA_TEXT (argv[i]) : "")
+#define ARGLEN(i) (argc > (i) ? TOKEN_DATA_LEN (argv[i]) : 0)

 /* Initialization of builtin and predefined macros.  The table
    "builtin_tab" is both used for initialization, and by the "builtin"
@@ -290,6 +291,15 @@ define_user_macro (const char *name, const char *text, symbol_lookup mode)
 {
   symbol *s;
   char *defn = xstrdup (text ? text : "");
+  size_t len = strlen (defn);
+
+  if (len > INT_MAX)
+    {
+      M4ERROR ((warning_status, 0,
+                _("macro `%s' definition too long; truncating to INT_MAX bytes"),
+                  name));
+      len = INT_MAX;
+    }

   s = lookup_symbol (name, mode);
   if (SYMBOL_TYPE (s) == TOKEN_TEXT)
@@ -297,12 +307,12 @@ define_user_macro (const char *name, const char *text, symbol_lookup mode)

   SYMBOL_TYPE (s) = TOKEN_TEXT;
   SYMBOL_TEXT (s) = defn;
+  SYMBOL_TEXT_LEN (s) = len;

   /* Implement --warn-macro-sequence.  */
   if (macro_sequence_inuse && text)
     {
       regoff_t offset = 0;
-      size_t len = strlen (defn);

       while ((offset = re_search (&macro_sequence_buf, defn, len, offset,
                                   len - offset, &macro_sequence_regs)) >= 0)
@@ -345,9 +355,7 @@ builtin_init (void)
       {
         if (prefix_all_builtins)
           {
-            string = (char *) xmalloc (strlen (bp->name) + 4);
-            strcpy (string, "m4_");
-            strcat (string, bp->name);
+            string = xasprintf ("m4_%s", bp->name);
             define_builtin (string, bp, SYMBOL_INSERT);
             free (string);
           }
@@ -514,7 +522,7 @@ dump_args (struct obstack *obs, int argc, token_data **argv,
       if (quoted)
         obstack_grow (obs, lquote.string, lquote.length);
       obstack_grow (obs, TOKEN_DATA_TEXT (argv[i]),
-                    strlen (TOKEN_DATA_TEXT (argv[i])));
+                    TOKEN_DATA_LEN (argv[i]));
       if (quoted)
         obstack_grow (obs, rquote.string, rquote.length);
     }
@@ -623,27 +631,25 @@ static void
 m4_ifdef (struct obstack *obs, int argc, token_data **argv)
 {
   symbol *s;
-  const char *result;
+  int result = 0;

   if (bad_argc (argv[0], argc, 3, 4))
     return;
   s = lookup_symbol (ARG (1), SYMBOL_LOOKUP);

   if (s != NULL && SYMBOL_TYPE (s) != TOKEN_VOID)
-    result = ARG (2);
+    result = 2;
   else if (argc >= 4)
-    result = ARG (3);
-  else
-    result = NULL;
+    result = 3;

-  if (result != NULL)
-    obstack_grow (obs, result, strlen (result));
+  if (result)
+    obstack_grow (obs, ARG (result), ARGLEN (result));
 }

 static void
 m4_ifelse (struct obstack *obs, int argc, token_data **argv)
 {
-  const char *result;
+  int result;
   token_data *me = argv[0];

   if (argc == 2)
@@ -658,11 +664,11 @@ m4_ifelse (struct obstack *obs, int argc, token_data **argv)
   argv++;
   argc--;

-  result = NULL;
-  while (result == NULL)
+  result = 0;
+  while (!result)

     if (STREQ (ARG (0), ARG (1)))
-      result = ARG (2);
+      result = 2;

     else
       switch (argc)
@@ -672,7 +678,7 @@ m4_ifelse (struct obstack *obs, int argc, token_data **argv)

         case 4:
         case 5:
-          result = ARG (3);
+          result = 3;
           break;

         default:
@@ -680,7 +686,7 @@ m4_ifelse (struct obstack *obs, int argc, token_data **argv)
           argv += 3;
         }

-  obstack_grow (obs, result, strlen (result));
+  obstack_grow (obs, ARG (result), ARGLEN (result));
 }
 
 /*-------------------------------------------------------------------.
@@ -832,6 +838,7 @@ m4_builtin (struct obstack *obs, int argc, token_data **argv)
             {
               TOKEN_DATA_TYPE (argv[i]) = TOKEN_TEXT;
               TOKEN_DATA_TEXT (argv[i]) = (char *) "";
+              TOKEN_DATA_LEN (argv[i]) = 0;
             }
       bp->func (obs, argc - 1, argv + 1);
     }
@@ -873,6 +880,7 @@ m4_indir (struct obstack *obs, int argc, token_data **argv)
             {
               TOKEN_DATA_TYPE (argv[i]) = TOKEN_TEXT;
               TOKEN_DATA_TEXT (argv[i]) = (char *) "";
+              TOKEN_DATA_LEN (argv[i]) = 0;
             }
       call_macro (s, argc - 1, argv + 1, obs);
     }
@@ -906,7 +914,7 @@ m4_defn (struct obstack *obs, int argc, token_data **argv)
         {
         case TOKEN_TEXT:
           obstack_grow (obs, lquote.string, lquote.length);
-          obstack_grow (obs, SYMBOL_TEXT (s), strlen (SYMBOL_TEXT (s)));
+          obstack_grow (obs, SYMBOL_TEXT (s), SYMBOL_TEXT_LEN (s));
           obstack_grow (obs, rquote.string, rquote.length);
           break;

@@ -1099,7 +1107,7 @@ m4_eval (struct obstack *obs, int argc, token_data **argv)
   if (*ARG (2) && !numeric_arg (argv[0], ARG (2), &radix))
     return;

-  if (radix < 1 || radix > (int) strlen (digits))
+  if (radix < 1 || radix > 36)
     {
       M4ERROR ((warning_status, 0,
                 _("radix %d in builtin `%s' out of range"),
@@ -1464,7 +1472,7 @@ m4_maketemp (struct obstack *obs, int argc, token_data **argv)
            maketemp(XXXXXXXX) -> `X00nnnnn', where nnnnn is 16-bit pid
       */
       const char *str = ARG (1);
-      int len = strlen (str);
+      int len = ARGLEN (1);
       int i;
       int len2;
       const char *e;
@@ -1486,7 +1494,7 @@ m4_maketemp (struct obstack *obs, int argc, token_data **argv)
         }
     }
   else
-    mkstemp_helper (obs, ARG (0), ARG (1), strlen (ARG (1)));
+    mkstemp_helper (obs, ARG (0), ARG (1), ARGLEN (1));
 }

 static void
@@ -1494,7 +1502,7 @@ m4_mkstemp (struct obstack *obs, int argc, token_data **argv)
 {
   if (bad_argc (argv[0], argc, 2, 2))
     return;
-  mkstemp_helper (obs, ARG (0), ARG (1), strlen (ARG (1)));
+  mkstemp_helper (obs, ARG (0), ARG (1), ARGLEN (1));
 }

 /*----------------------------------------.
@@ -1589,7 +1597,7 @@ m4_m4wrap (struct obstack *obs, int argc, token_data **argv)
   if (bad_argc (argv[0], argc, 2, -1))
     return;
   if (no_gnu_extensions)
-    obstack_grow (obs, ARG (1), strlen (ARG (1)));
+    obstack_grow (obs, ARG (1), ARGLEN (1));
   else
     dump_args (obs, argc, argv, ' ', false);
   obstack_1grow (obs, '\0');
@@ -1743,7 +1751,7 @@ m4_len (struct obstack *obs, int argc, token_data **argv)
 {
   if (bad_argc (argv[0], argc, 2, 2))
     return;
-  shipout_int (obs, strlen (ARG (1)));
+  shipout_int (obs, ARGLEN (1));
 }

 /*-------------------------------------------------------------------.
@@ -1791,11 +1799,11 @@ m4_substr (struct obstack *obs, int argc, token_data **argv)
     {
       /* builtin(`substr') is blank, but substr(`abc') is abc.  */
       if (argc == 2)
-        obstack_grow (obs, ARG (1), strlen (ARG (1)));
+        obstack_grow (obs, ARG (1), ARGLEN (1));
       return;
     }

-  length = avail = strlen (ARG (1));
+  length = avail = ARGLEN (1);
   if (!numeric_arg (argv[0], ARG (2), &start))
     return;

@@ -1868,6 +1876,7 @@ static void
 m4_translit (struct obstack *obs, int argc, token_data **argv)
 {
   const char *data = ARG (1);
+  int datalen = ARGLEN (1);
   const char *from = ARG (2);
   const char *to;
   char map[UCHAR_MAX + 1];
@@ -1878,7 +1887,7 @@ m4_translit (struct obstack *obs, int argc, token_data **argv)
     {
       /* builtin(`translit') is blank, but translit(`abc') is abc.  */
       if (2 <= argc)
-        obstack_grow (obs, data, strlen (data));
+        obstack_grow (obs, data, datalen);
       return;
     }

@@ -1895,7 +1904,7 @@ m4_translit (struct obstack *obs, int argc, token_data **argv)
   if (!from[1] || !from[2])
     {
       const char *p;
-      size_t len = strlen (data);
+      size_t len = datalen;
       while ((p = (char *) memchr2 (data, from[0], from[1], len)))
         {
           obstack_grow (obs, data, p - data);
@@ -2079,7 +2088,7 @@ m4_regexp (struct obstack *obs, int argc, token_data **argv)
   regexp = TOKEN_DATA_TEXT (argv[2]);

   init_pattern_buffer (&buf, &regs);
-  msg = re_compile_pattern (regexp, strlen (regexp), &buf);
+  msg = re_compile_pattern (regexp, ARGLEN (2), &buf);

   if (msg != NULL)
     {
@@ -2089,7 +2098,7 @@ m4_regexp (struct obstack *obs, int argc, token_data **argv)
       return;
     }

-  length = strlen (victim);
+  length = ARGLEN (1);
   /* Avoid overhead of allocating regs if we won't use it.  */
   startpos = re_search (&buf, victim, length, 0, length,
                         argc == 3 ? NULL : &regs);
@@ -2132,14 +2141,14 @@ m4_patsubst (struct obstack *obs, int argc, token_data **argv)
     {
       /* builtin(`patsubst') is blank, but patsubst(`abc') is abc.  */
       if (argc == 2)
-        obstack_grow (obs, ARG (1), strlen (ARG (1)));
+        obstack_grow (obs, ARG (1), ARGLEN (1));
       return;
     }

   regexp = TOKEN_DATA_TEXT (argv[2]);

   init_pattern_buffer (&buf, &regs);
-  msg = re_compile_pattern (regexp, strlen (regexp), &buf);
+  msg = re_compile_pattern (regexp, ARGLEN (2), &buf);

   if (msg != NULL)
     {
@@ -2150,7 +2159,7 @@ m4_patsubst (struct obstack *obs, int argc, token_data **argv)
     }

   victim = TOKEN_DATA_TEXT (argv[1]);
-  length = strlen (victim);
+  length = ARGLEN (1);

   offset = 0;
   while (offset <= length)
@@ -2230,13 +2239,14 @@ expand_user_macro (struct obstack *obs, symbol *sym,
                    int argc, token_data **argv)
 {
   const char *text = SYMBOL_TEXT (sym);
+  const char *end = text + SYMBOL_TEXT_LEN (sym);
   int i;
   while (1)
     {
       const char *dollar = strchr (text, '$');
       if (!dollar)
         {
-          obstack_grow (obs, text, strlen (text));
+          obstack_grow (obs, text, end - text);
           return;
         }
       obstack_grow (obs, text, dollar - text);
@@ -2254,9 +2264,7 @@ expand_user_macro (struct obstack *obs, symbol *sym,
               for (i = 0; c_isdigit (*text); text++)
                 i = i*10 + (*text - '0');
             }
-          if (i < argc)
-            obstack_grow (obs, TOKEN_DATA_TEXT (argv[i]),
-                          strlen (TOKEN_DATA_TEXT (argv[i])));
+          obstack_grow (obs, ARG (i), ARGLEN (i));
           break;

         case '#': /* number of arguments */
diff --git a/src/input.c b/src/input.c
index f8d5c0ba..1695ad10 100644
--- a/src/input.c
+++ b/src/input.c
@@ -1020,6 +1020,7 @@ next_token (token_data *td, int *line)
   obstack_1grow (&token_stack, '\0');

   TOKEN_DATA_TYPE (td) = TOKEN_TEXT;
+  TOKEN_DATA_LEN (td) = obstack_object_size (&token_stack) - 1;
   TOKEN_DATA_TEXT (td) = (char *) obstack_finish (&token_stack);
 #ifdef ENABLE_CHANGEWORD
   if (orig_text == NULL)
diff --git a/src/m4.h b/src/m4.h
index 858cb0de..b84f4262 100644
--- a/src/m4.h
+++ b/src/m4.h
@@ -272,6 +272,11 @@ enum token_data_type
 struct token_data
 {
   enum token_data_type type;
+  /* Several places in the code only work with tokens no larger than
+     2G.  Although len only matters for a text token, putting it here
+     instead of in the union allows struct token_data to be
+     smaller.  */
+  int len;
   union
     {
       struct
@@ -288,6 +293,7 @@ struct token_data
 };

 #define TOKEN_DATA_TYPE(Td)             ((Td)->type)
+#define TOKEN_DATA_LEN(Td)              ((Td)->len)
 #define TOKEN_DATA_TEXT(Td)             ((Td)->u.u_t.text)
 #ifdef ENABLE_CHANGEWORD
 # define TOKEN_DATA_ORIG_TEXT(Td)       ((Td)->u.u_t.original_text)
@@ -381,6 +387,7 @@ struct symbol
 #define SYMBOL_NAME(S)          ((S)->name)
 #define SYMBOL_TYPE(S)          (TOKEN_DATA_TYPE (&(S)->data))
 #define SYMBOL_TEXT(S)          (TOKEN_DATA_TEXT (&(S)->data))
+#define SYMBOL_TEXT_LEN(S)      (TOKEN_DATA_LEN (&(S)->data))
 #define SYMBOL_FUNC(S)          (TOKEN_DATA_FUNC (&(S)->data))

 typedef enum symbol_lookup symbol_lookup;
diff --git a/src/macro.c b/src/macro.c
index 0e02b43c..76e469e2 100644
--- a/src/macro.c
+++ b/src/macro.c
@@ -145,6 +145,7 @@ expand_argument (struct obstack *obs, token_data *argp)
   int paren_level;
   const char *file = current_file;
   int line = current_line;
+  size_t len;

   TOKEN_DATA_TYPE (argp) = TOKEN_VOID;

@@ -168,12 +169,14 @@ expand_argument (struct obstack *obs, token_data *argp)
             {
               /* The argument MUST be finished, whether we want it or not.  */
               obstack_1grow (obs, '\0');
+              len = obstack_object_size (obs) - 1;
               text = (char *) obstack_finish (obs);

               if (TOKEN_DATA_TYPE (argp) == TOKEN_VOID)
                 {
                   TOKEN_DATA_TYPE (argp) = TOKEN_TEXT;
                   TOKEN_DATA_TEXT (argp) = text;
+                  TOKEN_DATA_LEN (argp) = len;
                 }
               return t == TOKEN_COMMA;
             }
@@ -235,6 +238,7 @@ collect_arguments (symbol *sym, struct obstack *argptr,

   TOKEN_DATA_TYPE (&td) = TOKEN_TEXT;
   TOKEN_DATA_TEXT (&td) = SYMBOL_NAME (sym);
+  TOKEN_DATA_LEN (&td) = strlen (SYMBOL_NAME (sym));
   tdp = (token_data *) obstack_copy (arguments, &td, sizeof td);
   obstack_ptr_grow (argptr, tdp);

@@ -249,6 +253,7 @@ collect_arguments (symbol *sym, struct obstack *argptr,
             {
               TOKEN_DATA_TYPE (&td) = TOKEN_TEXT;
               TOKEN_DATA_TEXT (&td) = (char *) "";
+              TOKEN_DATA_LEN (&td) = 0;
             }
           tdp = (token_data *) obstack_copy (arguments, &td, sizeof td);
           obstack_ptr_grow (argptr, tdp);
-- 
2.48.1


_______________________________________________
M4-patches mailing list
[email protected]
https://lists.gnu.org/mailman/listinfo/m4-patches

Reply via email to