When dealing with tokens, we often know the length of the token from
the time it was parsed or created by expansion; save this information
alongside the token instead of calling strlen() everywhere to re-learn
it, for a slight optimization. The placement of the new member in
struct token_data is intentional: it avoids changing the size of the
struct on 64-bit machines, even though the length only matters for text
tokens. The code already has a number of places that assume a maximum
token length bounded by int; scrubbing that to allow a full size_t
would be a larger patch.
* src/m4.h (token_data): Add len member.
(TOKEN_DATA_LEN, SYMBOL_TEXT_LEN): New macros.
* src/input.c (next_token): Remember size.
* src/macro.c (expand_argument, collect_arguments): Likewise.
* src/builtin.c (ARGLEN): New macro.
(define_user_macro): Set length, and warn user on oversize content.
(builtin_init): Simplify prefixed name construction with xasprintf.
(m4_eval): Use compile-time bound for radix.
(dump_args, m4_ifdef, m4_ifelse, m4_builtin, m4_indir, m4_defn)
(m4_maketemp, m4_mkstemp, m4_m4wrap, m4_len, m4_substr, m4_translit)
(m4_regexp, m4_patsubst, expand_user_macro): Utilize known size.
---
src/builtin.c | 84 ++++++++++++++++++++++++++++-----------------------
src/input.c | 1 +
src/m4.h | 7 +++++
src/macro.c | 5 +++
4 files changed, 59 insertions(+), 38 deletions(-)
diff --git a/src/builtin.c b/src/builtin.c
index 3447dc33..69eb2a8c 100644
--- a/src/builtin.c
+++ b/src/builtin.c
@@ -32,6 +32,7 @@
#include "wait-process.h"
#define ARG(i) (argc > (i) ? TOKEN_DATA_TEXT (argv[i]) : "")
+#define ARGLEN(i) (argc > (i) ? TOKEN_DATA_LEN (argv[i]) : 0)
/* Initialization of builtin and predefined macros. The table
"builtin_tab" is both used for initialization, and by the "builtin"
@@ -290,6 +291,15 @@ define_user_macro (const char *name, const char *text,
symbol_lookup mode)
{
symbol *s;
char *defn = xstrdup (text ? text : "");
+ size_t len = strlen (defn);
+
+ if (len > INT_MAX)
+ {
+ M4ERROR ((warning_status, 0,
+ _("macro `%s' definition too long; truncating to INT_MAX bytes"),
+ name));
+ len = INT_MAX;
+ }
s = lookup_symbol (name, mode);
if (SYMBOL_TYPE (s) == TOKEN_TEXT)
@@ -297,12 +307,12 @@ define_user_macro (const char *name, const char *text,
symbol_lookup mode)
SYMBOL_TYPE (s) = TOKEN_TEXT;
SYMBOL_TEXT (s) = defn;
+ SYMBOL_TEXT_LEN (s) = len;
/* Implement --warn-macro-sequence. */
if (macro_sequence_inuse && text)
{
regoff_t offset = 0;
- size_t len = strlen (defn);
while ((offset = re_search (&macro_sequence_buf, defn, len, offset,
len - offset, &macro_sequence_regs)) >= 0)
@@ -345,9 +355,7 @@ builtin_init (void)
{
if (prefix_all_builtins)
{
- string = (char *) xmalloc (strlen (bp->name) + 4);
- strcpy (string, "m4_");
- strcat (string, bp->name);
+ string = xasprintf ("m4_%s", bp->name);
define_builtin (string, bp, SYMBOL_INSERT);
free (string);
}
@@ -514,7 +522,7 @@ dump_args (struct obstack *obs, int argc, token_data **argv,
if (quoted)
obstack_grow (obs, lquote.string, lquote.length);
obstack_grow (obs, TOKEN_DATA_TEXT (argv[i]),
- strlen (TOKEN_DATA_TEXT (argv[i])));
+ TOKEN_DATA_LEN (argv[i]));
if (quoted)
obstack_grow (obs, rquote.string, rquote.length);
}
@@ -623,27 +631,25 @@ static void
m4_ifdef (struct obstack *obs, int argc, token_data **argv)
{
symbol *s;
- const char *result;
+ int result = 0;
if (bad_argc (argv[0], argc, 3, 4))
return;
s = lookup_symbol (ARG (1), SYMBOL_LOOKUP);
if (s != NULL && SYMBOL_TYPE (s) != TOKEN_VOID)
- result = ARG (2);
+ result = 2;
else if (argc >= 4)
- result = ARG (3);
- else
- result = NULL;
+ result = 3;
- if (result != NULL)
- obstack_grow (obs, result, strlen (result));
+ if (result)
+ obstack_grow (obs, ARG (result), ARGLEN (result));
}
static void
m4_ifelse (struct obstack *obs, int argc, token_data **argv)
{
- const char *result;
+ int result;
token_data *me = argv[0];
if (argc == 2)
@@ -658,11 +664,11 @@ m4_ifelse (struct obstack *obs, int argc, token_data
**argv)
argv++;
argc--;
- result = NULL;
- while (result == NULL)
+ result = 0;
+ while (!result)
if (STREQ (ARG (0), ARG (1)))
- result = ARG (2);
+ result = 2;
else
switch (argc)
@@ -672,7 +678,7 @@ m4_ifelse (struct obstack *obs, int argc, token_data **argv)
case 4:
case 5:
- result = ARG (3);
+ result = 3;
break;
default:
@@ -680,7 +686,7 @@ m4_ifelse (struct obstack *obs, int argc, token_data **argv)
argv += 3;
}
- obstack_grow (obs, result, strlen (result));
+ obstack_grow (obs, ARG (result), ARGLEN (result));
}
/*-------------------------------------------------------------------.
@@ -832,6 +838,7 @@ m4_builtin (struct obstack *obs, int argc, token_data
**argv)
{
TOKEN_DATA_TYPE (argv[i]) = TOKEN_TEXT;
TOKEN_DATA_TEXT (argv[i]) = (char *) "";
+ TOKEN_DATA_LEN (argv[i]) = 0;
}
bp->func (obs, argc - 1, argv + 1);
}
@@ -873,6 +880,7 @@ m4_indir (struct obstack *obs, int argc, token_data **argv)
{
TOKEN_DATA_TYPE (argv[i]) = TOKEN_TEXT;
TOKEN_DATA_TEXT (argv[i]) = (char *) "";
+ TOKEN_DATA_LEN (argv[i]) = 0;
}
call_macro (s, argc - 1, argv + 1, obs);
}
@@ -906,7 +914,7 @@ m4_defn (struct obstack *obs, int argc, token_data **argv)
{
case TOKEN_TEXT:
obstack_grow (obs, lquote.string, lquote.length);
- obstack_grow (obs, SYMBOL_TEXT (s), strlen (SYMBOL_TEXT (s)));
+ obstack_grow (obs, SYMBOL_TEXT (s), SYMBOL_TEXT_LEN (s));
obstack_grow (obs, rquote.string, rquote.length);
break;
@@ -1099,7 +1107,7 @@ m4_eval (struct obstack *obs, int argc, token_data **argv)
if (*ARG (2) && !numeric_arg (argv[0], ARG (2), &radix))
return;
- if (radix < 1 || radix > (int) strlen (digits))
+ if (radix < 1 || radix > 36)
{
M4ERROR ((warning_status, 0,
_("radix %d in builtin `%s' out of range"),
@@ -1464,7 +1472,7 @@ m4_maketemp (struct obstack *obs, int argc, token_data
**argv)
maketemp(XXXXXXXX) -> `X00nnnnn', where nnnnn is 16-bit pid
*/
const char *str = ARG (1);
- int len = strlen (str);
+ int len = ARGLEN (1);
int i;
int len2;
const char *e;
@@ -1486,7 +1494,7 @@ m4_maketemp (struct obstack *obs, int argc, token_data
**argv)
}
}
else
- mkstemp_helper (obs, ARG (0), ARG (1), strlen (ARG (1)));
+ mkstemp_helper (obs, ARG (0), ARG (1), ARGLEN (1));
}
static void
@@ -1494,7 +1502,7 @@ m4_mkstemp (struct obstack *obs, int argc, token_data
**argv)
{
if (bad_argc (argv[0], argc, 2, 2))
return;
- mkstemp_helper (obs, ARG (0), ARG (1), strlen (ARG (1)));
+ mkstemp_helper (obs, ARG (0), ARG (1), ARGLEN (1));
}
/*----------------------------------------.
@@ -1589,7 +1597,7 @@ m4_m4wrap (struct obstack *obs, int argc, token_data
**argv)
if (bad_argc (argv[0], argc, 2, -1))
return;
if (no_gnu_extensions)
- obstack_grow (obs, ARG (1), strlen (ARG (1)));
+ obstack_grow (obs, ARG (1), ARGLEN (1));
else
dump_args (obs, argc, argv, ' ', false);
obstack_1grow (obs, '\0');
@@ -1743,7 +1751,7 @@ m4_len (struct obstack *obs, int argc, token_data **argv)
{
if (bad_argc (argv[0], argc, 2, 2))
return;
- shipout_int (obs, strlen (ARG (1)));
+ shipout_int (obs, ARGLEN (1));
}
/*-------------------------------------------------------------------.
@@ -1791,11 +1799,11 @@ m4_substr (struct obstack *obs, int argc, token_data
**argv)
{
/* builtin(`substr') is blank, but substr(`abc') is abc. */
if (argc == 2)
- obstack_grow (obs, ARG (1), strlen (ARG (1)));
+ obstack_grow (obs, ARG (1), ARGLEN (1));
return;
}
- length = avail = strlen (ARG (1));
+ length = avail = ARGLEN (1);
if (!numeric_arg (argv[0], ARG (2), &start))
return;
@@ -1868,6 +1876,7 @@ static void
m4_translit (struct obstack *obs, int argc, token_data **argv)
{
const char *data = ARG (1);
+ int datalen = ARGLEN (1);
const char *from = ARG (2);
const char *to;
char map[UCHAR_MAX + 1];
@@ -1878,7 +1887,7 @@ m4_translit (struct obstack *obs, int argc, token_data
**argv)
{
/* builtin(`translit') is blank, but translit(`abc') is abc. */
if (2 <= argc)
- obstack_grow (obs, data, strlen (data));
+ obstack_grow (obs, data, datalen);
return;
}
@@ -1895,7 +1904,7 @@ m4_translit (struct obstack *obs, int argc, token_data
**argv)
if (!from[1] || !from[2])
{
const char *p;
- size_t len = strlen (data);
+ size_t len = datalen;
while ((p = (char *) memchr2 (data, from[0], from[1], len)))
{
obstack_grow (obs, data, p - data);
@@ -2079,7 +2088,7 @@ m4_regexp (struct obstack *obs, int argc, token_data
**argv)
regexp = TOKEN_DATA_TEXT (argv[2]);
init_pattern_buffer (&buf, &regs);
- msg = re_compile_pattern (regexp, strlen (regexp), &buf);
+ msg = re_compile_pattern (regexp, ARGLEN (2), &buf);
if (msg != NULL)
{
@@ -2089,7 +2098,7 @@ m4_regexp (struct obstack *obs, int argc, token_data
**argv)
return;
}
- length = strlen (victim);
+ length = ARGLEN (1);
/* Avoid overhead of allocating regs if we won't use it. */
startpos = re_search (&buf, victim, length, 0, length,
argc == 3 ? NULL : &regs);
@@ -2132,14 +2141,14 @@ m4_patsubst (struct obstack *obs, int argc, token_data
**argv)
{
/* builtin(`patsubst') is blank, but patsubst(`abc') is abc. */
if (argc == 2)
- obstack_grow (obs, ARG (1), strlen (ARG (1)));
+ obstack_grow (obs, ARG (1), ARGLEN (1));
return;
}
regexp = TOKEN_DATA_TEXT (argv[2]);
init_pattern_buffer (&buf, &regs);
- msg = re_compile_pattern (regexp, strlen (regexp), &buf);
+ msg = re_compile_pattern (regexp, ARGLEN (2), &buf);
if (msg != NULL)
{
@@ -2150,7 +2159,7 @@ m4_patsubst (struct obstack *obs, int argc, token_data
**argv)
}
victim = TOKEN_DATA_TEXT (argv[1]);
- length = strlen (victim);
+ length = ARGLEN (1);
offset = 0;
while (offset <= length)
@@ -2230,13 +2239,14 @@ expand_user_macro (struct obstack *obs, symbol *sym,
int argc, token_data **argv)
{
const char *text = SYMBOL_TEXT (sym);
+ const char *end = text + SYMBOL_TEXT_LEN (sym);
int i;
while (1)
{
const char *dollar = strchr (text, '$');
if (!dollar)
{
- obstack_grow (obs, text, strlen (text));
+ obstack_grow (obs, text, end - text);
return;
}
obstack_grow (obs, text, dollar - text);
@@ -2254,9 +2264,7 @@ expand_user_macro (struct obstack *obs, symbol *sym,
for (i = 0; c_isdigit (*text); text++)
i = i*10 + (*text - '0');
}
- if (i < argc)
- obstack_grow (obs, TOKEN_DATA_TEXT (argv[i]),
- strlen (TOKEN_DATA_TEXT (argv[i])));
+ obstack_grow (obs, ARG (i), ARGLEN (i));
break;
case '#': /* number of arguments */
diff --git a/src/input.c b/src/input.c
index f8d5c0ba..1695ad10 100644
--- a/src/input.c
+++ b/src/input.c
@@ -1020,6 +1020,7 @@ next_token (token_data *td, int *line)
obstack_1grow (&token_stack, '\0');
TOKEN_DATA_TYPE (td) = TOKEN_TEXT;
+ TOKEN_DATA_LEN (td) = obstack_object_size (&token_stack) - 1;
TOKEN_DATA_TEXT (td) = (char *) obstack_finish (&token_stack);
#ifdef ENABLE_CHANGEWORD
if (orig_text == NULL)
diff --git a/src/m4.h b/src/m4.h
index 858cb0de..b84f4262 100644
--- a/src/m4.h
+++ b/src/m4.h
@@ -272,6 +272,11 @@ enum token_data_type
struct token_data
{
enum token_data_type type;
+ /* Several places in the code only work with tokens no larger than
+ 2G. Although len only matters for a text token, putting it here
+ instead of in the union allows struct token_data to be
+ smaller. */
+ int len;
union
{
struct
@@ -288,6 +293,7 @@ struct token_data
};
#define TOKEN_DATA_TYPE(Td) ((Td)->type)
+#define TOKEN_DATA_LEN(Td) ((Td)->len)
#define TOKEN_DATA_TEXT(Td) ((Td)->u.u_t.text)
#ifdef ENABLE_CHANGEWORD
# define TOKEN_DATA_ORIG_TEXT(Td) ((Td)->u.u_t.original_text)
@@ -381,6 +387,7 @@ struct symbol
#define SYMBOL_NAME(S) ((S)->name)
#define SYMBOL_TYPE(S) (TOKEN_DATA_TYPE (&(S)->data))
#define SYMBOL_TEXT(S) (TOKEN_DATA_TEXT (&(S)->data))
+#define SYMBOL_TEXT_LEN(S) (TOKEN_DATA_LEN (&(S)->data))
#define SYMBOL_FUNC(S) (TOKEN_DATA_FUNC (&(S)->data))
typedef enum symbol_lookup symbol_lookup;
diff --git a/src/macro.c b/src/macro.c
index 0e02b43c..76e469e2 100644
--- a/src/macro.c
+++ b/src/macro.c
@@ -145,6 +145,7 @@ expand_argument (struct obstack *obs, token_data *argp)
int paren_level;
const char *file = current_file;
int line = current_line;
+ size_t len;
TOKEN_DATA_TYPE (argp) = TOKEN_VOID;
@@ -168,12 +169,14 @@ expand_argument (struct obstack *obs, token_data *argp)
{
/* The argument MUST be finished, whether we want it or not. */
obstack_1grow (obs, '\0');
+ len = obstack_object_size (obs) - 1;
text = (char *) obstack_finish (obs);
if (TOKEN_DATA_TYPE (argp) == TOKEN_VOID)
{
TOKEN_DATA_TYPE (argp) = TOKEN_TEXT;
TOKEN_DATA_TEXT (argp) = text;
+ TOKEN_DATA_LEN (argp) = len;
}
return t == TOKEN_COMMA;
}
@@ -235,6 +238,7 @@ collect_arguments (symbol *sym, struct obstack *argptr,
TOKEN_DATA_TYPE (&td) = TOKEN_TEXT;
TOKEN_DATA_TEXT (&td) = SYMBOL_NAME (sym);
+ TOKEN_DATA_LEN (&td) = strlen (SYMBOL_NAME (sym));
tdp = (token_data *) obstack_copy (arguments, &td, sizeof td);
obstack_ptr_grow (argptr, tdp);
@@ -249,6 +253,7 @@ collect_arguments (symbol *sym, struct obstack *argptr,
{
TOKEN_DATA_TYPE (&td) = TOKEN_TEXT;
TOKEN_DATA_TEXT (&td) = (char *) "";
+ TOKEN_DATA_LEN (&td) = 0;
}
tdp = (token_data *) obstack_copy (arguments, &td, sizeof td);
obstack_ptr_grow (argptr, tdp);
--
2.48.1
_______________________________________________
M4-patches mailing list
[email protected]
https://lists.gnu.org/mailman/listinfo/m4-patches