Module Name:    src
Committed By:   rillig
Date:           Sat Feb  3 19:25:16 UTC 2024

Modified Files:
        src/usr.bin/xlint/lint1: ckgetopt.c debug.c emit1.c externs1.h init.c
            lex.c lint1.h tree.c

Log Message:
lint: keep strings in their source representation

This allows further analysis depending on whether individual characters are
escaped as octal, hexadecimal or not at all.


To generate a diff of this commit:
cvs rdiff -u -r1.21 -r1.22 src/usr.bin/xlint/lint1/ckgetopt.c
cvs rdiff -u -r1.69 -r1.70 src/usr.bin/xlint/lint1/debug.c
cvs rdiff -u -r1.84 -r1.85 src/usr.bin/xlint/lint1/emit1.c
cvs rdiff -u -r1.214 -r1.215 src/usr.bin/xlint/lint1/externs1.h
cvs rdiff -u -r1.257 -r1.258 src/usr.bin/xlint/lint1/init.c
cvs rdiff -u -r1.211 -r1.212 src/usr.bin/xlint/lint1/lex.c \
    src/usr.bin/xlint/lint1/lint1.h
cvs rdiff -u -r1.601 -r1.602 src/usr.bin/xlint/lint1/tree.c

Please note that diffs are not public domain; they are subject to the
copyright notices on the relevant files.

Modified files:

Index: src/usr.bin/xlint/lint1/ckgetopt.c
diff -u src/usr.bin/xlint/lint1/ckgetopt.c:1.21 src/usr.bin/xlint/lint1/ckgetopt.c:1.22
--- src/usr.bin/xlint/lint1/ckgetopt.c:1.21	Sat Feb  3 12:57:12 2024
+++ src/usr.bin/xlint/lint1/ckgetopt.c	Sat Feb  3 19:25:16 2024
@@ -1,4 +1,4 @@
-/* $NetBSD: ckgetopt.c,v 1.21 2024/02/03 12:57:12 rillig Exp $ */
+/* $NetBSD: ckgetopt.c,v 1.22 2024/02/03 19:25:16 rillig Exp $ */
 
 /*-
  * Copyright (c) 2021 The NetBSD Foundation, Inc.
@@ -35,7 +35,7 @@
 
 #include <sys/cdefs.h>
 #if defined(__RCSID)
-__RCSID("$NetBSD: ckgetopt.c,v 1.21 2024/02/03 12:57:12 rillig Exp $");
+__RCSID("$NetBSD: ckgetopt.c,v 1.22 2024/02/03 19:25:16 rillig Exp $");
 #endif
 
 #include <stdbool.h>
@@ -100,7 +100,12 @@ is_getopt_condition(const tnode_t *tn, c
 	    && last_arg->tn_left->tn_op == ADDR
 	    && last_arg->tn_left->tn_left->tn_op == STRING
 	    && (str = last_arg->tn_left->tn_left->tn_string)->data != NULL) {
-		*out_options = xstrdup(str->data);
+		buffer buf;
+		buf_init(&buf);
+		quoted_iterator it = { .start = 0 };
+		while (quoted_next(str, &it))
+			buf_add_char(&buf, (char)it.value);
+		*out_options = buf.data;
 		return true;
 	}
 	return false;

Index: src/usr.bin/xlint/lint1/debug.c
diff -u src/usr.bin/xlint/lint1/debug.c:1.69 src/usr.bin/xlint/lint1/debug.c:1.70
--- src/usr.bin/xlint/lint1/debug.c:1.69	Fri Feb  2 16:25:58 2024
+++ src/usr.bin/xlint/lint1/debug.c	Sat Feb  3 19:25:16 2024
@@ -1,4 +1,4 @@
-/* $NetBSD: debug.c,v 1.69 2024/02/02 16:25:58 rillig Exp $ */
+/* $NetBSD: debug.c,v 1.70 2024/02/03 19:25:16 rillig Exp $ */
 
 /*-
  * Copyright (c) 2021 The NetBSD Foundation, Inc.
@@ -35,7 +35,7 @@
 
 #include <sys/cdefs.h>
 #if defined(__RCSID)
-__RCSID("$NetBSD: debug.c,v 1.69 2024/02/02 16:25:58 rillig Exp $");
+__RCSID("$NetBSD: debug.c,v 1.70 2024/02/03 19:25:16 rillig Exp $");
 #endif
 
 #include <stdlib.h>
@@ -235,11 +235,10 @@ debug_node(const tnode_t *tn) // NOLINT(
 		debug_printf("\n");
 		break;
 	case STRING:
-		debug_printf(", length %zu", tn->tn_string->len);
 		if (tn->tn_string->data != NULL)
-			// TODO: May contain \0 or control characters.
-			debug_printf(", \"%s\"", tn->tn_string->data);
-		debug_printf("\n");
+			debug_printf(", %s\n", tn->tn_string->data);
+		else
+			debug_printf(", length %zu\n", tn->tn_string->len);
 		break;
 	default:
 		debug_printf("\n");

Index: src/usr.bin/xlint/lint1/emit1.c
diff -u src/usr.bin/xlint/lint1/emit1.c:1.84 src/usr.bin/xlint/lint1/emit1.c:1.85
--- src/usr.bin/xlint/lint1/emit1.c:1.84	Sat Feb  3 12:57:12 2024
+++ src/usr.bin/xlint/lint1/emit1.c	Sat Feb  3 19:25:16 2024
@@ -1,4 +1,4 @@
-/* $NetBSD: emit1.c,v 1.84 2024/02/03 12:57:12 rillig Exp $ */
+/* $NetBSD: emit1.c,v 1.85 2024/02/03 19:25:16 rillig Exp $ */
 
 /*
  * Copyright (c) 1996 Christopher G. Demetriou.  All Rights Reserved.
@@ -38,9 +38,11 @@
 
 #include <sys/cdefs.h>
 #if defined(__RCSID)
-__RCSID("$NetBSD: emit1.c,v 1.84 2024/02/03 12:57:12 rillig Exp $");
+__RCSID("$NetBSD: emit1.c,v 1.85 2024/02/03 19:25:16 rillig Exp $");
 #endif
 
+#include <stdlib.h>
+
 #include "lint1.h"
 
 static void outtt(sym_t *, sym_t *);
@@ -367,10 +369,17 @@ outcall(const tnode_t *tn, bool retval_u
 		} else if (arg->tn_op == ADDR &&
 		    arg->tn_left->tn_op == STRING &&
 		    arg->tn_left->tn_string->data != NULL) {
-			/* constant string, write all format specifiers */
+			buffer buf;
+			buf_init(&buf);
+			quoted_iterator it = { .start = 0 };
+			while (quoted_next(arg->tn_left->tn_string, &it))
+				buf_add_char(&buf, (char)it.value);
+
+			/* string literal, write all format specifiers */
 			outchar('s');
 			outint(n);
-			outfstrg(arg->tn_left->tn_string->data);
+			outfstrg(buf.data);
+			free(buf.data);
 		}
 	}
 	outchar((char)(retval_discarded ? 'd' : retval_used ? 'u' : 'i'));

Index: src/usr.bin/xlint/lint1/externs1.h
diff -u src/usr.bin/xlint/lint1/externs1.h:1.214 src/usr.bin/xlint/lint1/externs1.h:1.215
--- src/usr.bin/xlint/lint1/externs1.h:1.214	Sat Feb  3 12:57:12 2024
+++ src/usr.bin/xlint/lint1/externs1.h	Sat Feb  3 19:25:16 2024
@@ -1,4 +1,4 @@
-/*	$NetBSD: externs1.h,v 1.214 2024/02/03 12:57:12 rillig Exp $	*/
+/*	$NetBSD: externs1.h,v 1.215 2024/02/03 19:25:16 rillig Exp $	*/
 
 /*
  * Copyright (c) 1994, 1995 Jochen Pohl
@@ -396,6 +396,7 @@ void lex_comment(void);
 void lex_slash_slash_comment(void);
 void lex_unknown_character(int);
 int lex_input(void);
+bool quoted_next(const buffer *, quoted_iterator *);
 
 /*
  * ckbool.c

Index: src/usr.bin/xlint/lint1/init.c
diff -u src/usr.bin/xlint/lint1/init.c:1.257 src/usr.bin/xlint/lint1/init.c:1.258
--- src/usr.bin/xlint/lint1/init.c:1.257	Thu Feb  1 18:37:06 2024
+++ src/usr.bin/xlint/lint1/init.c	Sat Feb  3 19:25:16 2024
@@ -1,4 +1,4 @@
-/*	$NetBSD: init.c,v 1.257 2024/02/01 18:37:06 rillig Exp $	*/
+/*	$NetBSD: init.c,v 1.258 2024/02/03 19:25:16 rillig Exp $	*/
 
 /*
  * Copyright (c) 1994, 1995 Jochen Pohl
@@ -38,7 +38,7 @@
 
 #include <sys/cdefs.h>
 #if defined(__RCSID)
-__RCSID("$NetBSD: init.c,v 1.257 2024/02/01 18:37:06 rillig Exp $");
+__RCSID("$NetBSD: init.c,v 1.258 2024/02/03 19:25:16 rillig Exp $");
 #endif
 
 #include <stdlib.h>
@@ -888,6 +888,12 @@ initialization_init_array_from_string(in
 		return false;
 
 	size_t len = tn->tn_string->len;
+	if (tn->tn_string->data != NULL) {
+		quoted_iterator it = { .start = 0 };
+		for (len = 0; quoted_next(tn->tn_string, &it); len++)
+			continue;
+	}
+
 	if (!tp->t_incomplete_array && (size_t)tp->t_dim < len) {
 		/* string literal too long (%lu) for target array (%lu) */
 		warning(187, (unsigned long)len, (unsigned long)tp->t_dim);

Index: src/usr.bin/xlint/lint1/lex.c
diff -u src/usr.bin/xlint/lint1/lex.c:1.211 src/usr.bin/xlint/lint1/lex.c:1.212
--- src/usr.bin/xlint/lint1/lex.c:1.211	Sat Feb  3 18:58:05 2024
+++ src/usr.bin/xlint/lint1/lex.c	Sat Feb  3 19:25:16 2024
@@ -1,4 +1,4 @@
-/* $NetBSD: lex.c,v 1.211 2024/02/03 18:58:05 rillig Exp $ */
+/* $NetBSD: lex.c,v 1.212 2024/02/03 19:25:16 rillig Exp $ */
 
 /*
  * Copyright (c) 1996 Christopher G. Demetriou.  All Rights Reserved.
@@ -38,7 +38,7 @@
 
 #include <sys/cdefs.h>
 #if defined(__RCSID)
-__RCSID("$NetBSD: lex.c,v 1.211 2024/02/03 18:58:05 rillig Exp $");
+__RCSID("$NetBSD: lex.c,v 1.212 2024/02/03 19:25:16 rillig Exp $");
 #endif
 
 #include <ctype.h>
@@ -753,191 +753,263 @@ lex_operator(int t, op_t o)
 	return t;
 }
 
-static int prev_byte = -1;
-
-static int
-read_escaped_oct(int c, bool wide)
+static buffer *
+read_quoted(bool *complete, bool wide, char delim)
 {
-	int n = 3;
-	int value = 0;
-	do {
-		value = (value << 3) + (c - '0');
-		c = read_byte();
-	} while (--n > 0 && '0' <= c && c <= '7');
-	prev_byte = c;
-	if (value > TARG_UCHAR_MAX && !wide) {
-		/* character escape does not fit in character */
-		warning(76);
-		value &= CHAR_MASK;
-	}
-	return value;
-}
-
-static int64_t
-read_escaped_hex(int c, bool wide)
-{
-	if (!allow_c90)
-		/* \x undefined in traditional C */
-		warning(82);
-	uint64_t value = 0;
-	uint64_t mask = value_bits(wide ? 32 : CHAR_SIZE);
-	int state = 0;		/* 0 = no digits, 1 = OK, 2 = overflow */
-	while (c = read_byte(), isxdigit(c)) {
-		c = isdigit(c) ? c - '0' : toupper(c) - 'A' + 10;
-		value = (value << 4) + c;
-		if (state == 2)
-			continue;
-		if ((value & ~mask) != 0) {
-			/* overflow in hex escape */
-			warning(75);
-			state = 2;
-		} else {
-			state = 1;
+	buffer *buf = xcalloc(1, sizeof(*buf));
+	buf_init(buf);
+	if (wide)
+		buf_add_char(buf, 'L');
+	buf_add_char(buf, delim);
+
+	for (;;) {
+		int c = read_byte();
+		if (c <= 0)
+			break;
+		buf_add_char(buf, (char)c);
+		if (c == '\n')
+			break;
+		if (c == delim) {
+			*complete = true;
+			return buf;
+		}
+		if (c == '\\') {
+			c = read_byte();
+			buf_add_char(buf, (char)(c <= 0 ? ' ' : c));
+			if (c <= 0)
+				break;
 		}
 	}
-	prev_byte = c;
-	if (state == 0) {
-		/* no hex digits follow \x */
-		error(74);
-	}
-	if (state == 2)
-		value &= mask;
-	return (int64_t)value;
+	*complete = false;
+	buf_add_char(buf, delim);
+	return buf;
 }
 
-static int64_t
-read_escaped_backslash(int delim, bool wide)
+bool
+quoted_next(const buffer *lit, quoted_iterator *it)
 {
-	int c;
+	const char *s = lit->data;
+	size_t len = lit->len;
 
-	switch (c = read_byte()) {
-	case '"':
-		if (!allow_c90 && delim == '\'')
-			/* \" inside character constants undef... */
-			warning(262);
-		return '"';
-	case '\'':
-		return '\'';
-	case '?':
-		if (!allow_c90)
-			/* \? undefined in traditional C */
-			warning(263);
-		return '?';
+	*it = (quoted_iterator){ .i = it->i, .start = it->i };
+
+	char delim = s[s[0] == 'L' ? 1 : 0];
+
+	bool in_the_middle = it->i > 0;
+	if (it->i == 0) {
+		it->start = s[0] == 'L' ? 2 : 1;
+		it->i = it->start;
+	}
+
+	for (;;) {
+		if (s[it->i] != delim)
+			break;
+		if (it->i + 1 == len)
+			return false;
+		it->next_literal = in_the_middle;
+		it->start += 2;
+		it->i += 2;
+	}
+
+again:
+	switch (s[it->i]) {
 	case '\\':
-		return '\\';
+		it->i++;
+		goto backslash;
+	case '\n':
+		it->unescaped_newline = true;
+		return false;
+	default:
+		it->value = (unsigned char)s[it->i++];
+		return true;
+	}
+
+backslash:
+	it->escaped = true;
+	if ('0' <= s[it->i] && s[it->i] <= '7')
+		goto octal_escape;
+	switch (s[it->i++]) {
+	case '\n':
+		goto again;
 	case 'a':
-		if (!allow_c90)
-			/* \a undefined in traditional C */
-			warning(81);
-		return '\a';
+		it->named_escape = true;
+		it->value = '\a';
+		it->invalid_escape = !allow_c90;
+		return true;
 	case 'b':
-		return '\b';
+		it->named_escape = true;
+		it->value = '\b';
+		return true;
 	case 'e':
-		if (!allow_gcc)
-			break;
-		/* Not in the C standard yet, compilers recognize it */
-		/* LINTED 79 */
-		return '\e';
+		it->named_escape = true;
+		it->value = '\033';
+		it->invalid_escape = !allow_gcc;
+		return true;
 	case 'f':
-		return '\f';
+		it->named_escape = true;
+		it->value = '\f';
+		return true;
 	case 'n':
-		return '\n';
+		it->named_escape = true;
+		it->value = '\n';
+		return true;
 	case 'r':
-		return '\r';
+		it->named_escape = true;
+		it->value = '\r';
+		return true;
 	case 't':
-		return '\t';
+		it->named_escape = true;
+		it->value = '\t';
+		return true;
 	case 'v':
-		if (!allow_c90)
-			/* \v undefined in traditional C */
-			warning(264);
-		return '\v';
-	case '8': case '9':
-		/* bad octal digit '%c' */
-		warning(77, c);
-		/* FALLTHROUGH */
-	case '0': case '1': case '2': case '3':
-	case '4': case '5': case '6': case '7':
-		return read_escaped_oct(c, wide);
+		it->named_escape = true;
+		it->value = '\v';
+		it->invalid_escape = !allow_c90;
+		return true;
 	case 'x':
-		return read_escaped_hex(c, wide);
-	case '\n':
-		return -3;
-	case EOF:
-		return -2;
+		goto hex_escape;
+	case '"':
+		it->literal_escape = true;
+		it->value = '"';
+		it->invalid_escape = !allow_c90 && delim == '\'';
+		return true;
+	case '?':
+		it->literal_escape = true;
+		it->value = '?';
+		it->invalid_escape = !allow_c90;
+		return true;
 	default:
-		break;
+		it->invalid_escape = true;
+		/* FALLTHROUGH */
+	case '\'':
+	case '\\':
+		it->literal_escape = true;
+		it->value = (unsigned char)s[it->i - 1];
+		return true;
 	}
-	if (isprint(c))
-		/* dubious escape \%c */
-		warning(79, c);
-	else
-		/* dubious escape \%o */
-		warning(80, c);
-	return c;
-}
 
-/*
- * Read a character which is part of a character constant or of a string
- * and handle escapes.
- *
- * 'delim' is '\'' for character constants and '"' for string literals.
- *
- * Returns -1 if the end of the character constant or string is reached,
- * -2 if the EOF is reached, and the character otherwise.
- */
-static int64_t
-get_escaped_char(int delim, bool wide)
-{
+octal_escape:
+	it->octal_digits++;
+	it->value = s[it->i++] - '0';
+	if ('0' <= s[it->i] && s[it->i] <= '7') {
+		it->octal_digits++;
+		it->value = 8 * it->value + (s[it->i++] - '0');
+		if ('0' <= s[it->i] && s[it->i] <= '7') {
+			it->octal_digits++;
+			it->value = 8 * it->value + (s[it->i++] - '0');
+			it->overflow = it->value > TARG_UCHAR_MAX
+			    && s[0] != 'L';
+		}
+	}
+	return true;
 
-	int64_t c = prev_byte;
-	if (c != -1)
-		prev_byte = -1;
-	else
-		c = read_byte();
+hex_escape:
+	for (;;) {
+		char ch = s[it->i];
+		unsigned digit_value;
+		if ('0' <= ch && ch <= '9')
+			digit_value = ch - '0';
+		else if ('A' <= ch && ch <= 'F')
+			digit_value = 10 + (ch - 'A');
+		else if ('a' <= ch && ch <= 'f')
+			digit_value = 10 + (ch - 'a');
+		else
+			break;
 
-	if (c == delim)
-		return -1;
-	switch (c) {
-	case '\n':
-		/* newline in string or char constant */
-		error(254);
-		return -2;
-	case '\0':
-		/* syntax error '%s' */
-		error(249, "EOF or null byte in literal");
-		return -2;
-	case EOF:
-		return -2;
-	case '\\':
-		c = read_escaped_backslash(delim, wide);
-		if (c == -3)
-			return get_escaped_char(delim, wide);
-		break;
-	default:
-		if (c != ' ' && (isspace(c) || iscntrl(c))) {
+		it->i++;
+		it->value = 16 * it->value + digit_value;
+		uint64_t limit = s[0] == 'L' ? TARG_UINT_MAX : TARG_UCHAR_MAX;
+		if (it->value > limit)
+			it->overflow = true;
+		if (it->hex_digits < 3)
+			it->hex_digits++;
+	}
+	it->missing_hex_digits = it->hex_digits == 0;
+	return true;
+}
+
+static void
+check_quoted(const buffer *buf, bool complete, char delim)
+{
+	quoted_iterator it = { .start = 0 };
+	while (quoted_next(buf, &it)) {
+		if (it.missing_hex_digits)
+			/* no hex digits follow \x */
+			error(74);
+		if (it.hex_digits > 0 && !allow_c90)
+			/* \x undefined in traditional C */
+			warning(82);
+		else if (!it.invalid_escape)
+			;
+		else if (it.value == '8' || it.value == '9')
+			/* bad octal digit '%c' */
+			warning(77, (int)it.value);
+		else if (it.literal_escape && it.value == '?')
+			/* \? undefined in traditional C */
+			warning(263);
+		else if (it.literal_escape && it.value == '"')
+			/* \" inside character constants undefined in ... */
+			warning(262);
+		else if (it.named_escape && it.value == '\a')
+			/* \a undefined in traditional C */
+			warning(81);
+		else if (it.named_escape && it.value == '\v')
+			/* \v undefined in traditional C */
+			warning(264);
+		else {
+			unsigned char ch = buf->data[it.i - 1];
+			if (isprint(ch))
+				/* dubious escape \%c */
+				warning(79, ch);
+			else
+				/* dubious escape \%o */
+				warning(80, ch);
+		}
+		if (it.overflow && it.hex_digits > 0)
+			/* overflow in hex escape */
+			warning(75);
+		if (it.overflow && it.octal_digits > 0)
+			/* character escape does not fit in character */
+			warning(76);
+		if (it.value < ' ' && !it.escaped && complete)
 			/* invisible character U+%04X in %s */
-			query_message(17, (unsigned int)c, delim == '"'
+			query_message(17, (unsigned)it.value, delim == '"'
 			    ? "string literal" : "character constant");
-		}
 	}
-	return c;
+	if (it.unescaped_newline)
+		/* newline in string or char constant */
+		error(254);
+	if (!complete && delim == '"')
+		/* unterminated string constant */
+		error(258);
+	if (!complete && delim == '\'')
+		/* unterminated character constant */
+		error(253);
+}
+
+static buffer *
+lex_quoted(char delim, bool wide)
+{
+	bool complete;
+	buffer *buf = read_quoted(&complete, wide, delim);
+	check_quoted(buf, complete, delim);
+	return buf;
 }
 
 /* Called if lex found a leading "'". */
 int
 lex_character_constant(void)
 {
+	buffer *buf = lex_quoted('\'', false);
 
 	size_t n = 0;
-	int64_t val = 0, c;
-	while ((c = get_escaped_char('\'', false)) >= 0) {
-		val = (int64_t)((uint64_t)val << CHAR_SIZE) + c;
+	uint64_t val = 0;
+	quoted_iterator it = { .start = 0 };
+	while (quoted_next(buf, &it)) {
+		val = (val << CHAR_SIZE) + it.value;
 		n++;
 	}
-	if (c == -2) {
-		/* unterminated character constant */
-		error(253);
-	} else if (n > sizeof(int) || (n > 1 && (pflag || hflag))) {
+	if (n > sizeof(int) || (n > 1 && (pflag || hflag))) {
 		/*
 		 * XXX: ^^ should rather be sizeof(TARG_INT). Luckily,
 		 * sizeof(int) is the same on all supported platforms.
@@ -947,17 +1019,19 @@ lex_character_constant(void)
 	} else if (n > 1) {
 		/* multi-character character constant */
 		warning(294);
-	} else if (n == 0) {
+	} else if (n == 0 && !it.unescaped_newline) {
 		/* empty character constant */
 		error(73);
 	}
-	if (n == 1)
-		val = convert_integer(val, CHAR, CHAR_SIZE);
+
+	int64_t cval = n == 1
+	    ? convert_integer((int64_t)val, CHAR, CHAR_SIZE)
+	    : (int64_t)val;
 
 	yylval.y_val = xcalloc(1, sizeof(*yylval.y_val));
 	yylval.y_val->v_tspec = INT;
 	yylval.y_val->v_char_constant = true;
-	yylval.y_val->u.integer = val;
+	yylval.y_val->u.integer = cval;
 
 	return T_CON;
 }
@@ -968,22 +1042,20 @@ lex_character_constant(void)
 int
 lex_wide_character_constant(void)
 {
-	char buf[MB_LEN_MAX + 1];
-	size_t nmax = MB_CUR_MAX;
+	buffer *buf = lex_quoted('\'', true);
 
-	int64_t c;
-	size_t n = 0;
-	while ((c = get_escaped_char('\'', true)) >= 0) {
+	static char wbuf[MB_LEN_MAX + 1];
+	size_t n = 0, nmax = MB_CUR_MAX;
+
+	quoted_iterator it = { .start = 0 };
+	while (quoted_next(buf, &it)) {
 		if (n < nmax)
-			buf[n] = (char)c;
+			wbuf[n] = (char)it.value;
 		n++;
 	}
 
 	wchar_t wc = 0;
-	if (c == -2) {
-		/* unterminated character constant */
-		error(253);
-	} else if (n == 0) {
+	if (n == 0) {
 		/* empty character constant */
 		error(73);
 	} else if (n > nmax) {
@@ -991,9 +1063,9 @@ lex_wide_character_constant(void)
 		/* too many characters in character constant */
 		error(71);
 	} else {
-		buf[n] = '\0';
+		wbuf[n] = '\0';
 		(void)mbtowc(NULL, NULL, 0);
-		if (mbtowc(&wc, buf, nmax) < 0)
+		if (mbtowc(&wc, wbuf, nmax) < 0)
 			/* invalid multibyte character */
 			error(291);
 	}
@@ -1246,18 +1318,7 @@ clear_warn_flags(void)
 int
 lex_string(void)
 {
-	buffer *buf = xcalloc(1, sizeof(*buf));
-	buf_init(buf);
-
-	int64_t c;
-	while ((c = get_escaped_char('"', false)) >= 0)
-		buf_add_char(buf, (char)c);
-	if (c == -2)
-		/* unterminated string constant */
-		error(258);
-
-
-	yylval.y_string = buf;
+	yylval.y_string = lex_quoted('"', false);
 	return T_STRING;
 }
 
@@ -1283,20 +1344,18 @@ wide_length(const buffer *buf)
 int
 lex_wide_string(void)
 {
-	int64_t c;
+	buffer *buf = lex_quoted('"', true);
 
-	buffer buf;
-	buf_init(&buf);
-	while ((c = get_escaped_char('"', true)) >= 0)
-		buf_add_char(&buf, (char)c);
-	if (c == -2)
-		/* unterminated string constant */
-		error(258);
+	buffer str;
+	buf_init(&str);
+	quoted_iterator it = { .start = 0 };
+	while (quoted_next(buf, &it))
+		buf_add_char(&str, (char)it.value);
 
-	buffer *str = xcalloc(1, sizeof(*str));
-	str->len = wide_length(&buf);
+	free(buf->data);
+	*buf = (buffer) { .len = wide_length(&str) };
 
-	yylval.y_string = str;
+	yylval.y_string = buf;
 	return T_STRING;
 }
 
Index: src/usr.bin/xlint/lint1/lint1.h
diff -u src/usr.bin/xlint/lint1/lint1.h:1.211 src/usr.bin/xlint/lint1/lint1.h:1.212
--- src/usr.bin/xlint/lint1/lint1.h:1.211	Thu Feb  1 18:37:06 2024
+++ src/usr.bin/xlint/lint1/lint1.h	Sat Feb  3 19:25:16 2024
@@ -1,4 +1,4 @@
-/* $NetBSD: lint1.h,v 1.211 2024/02/01 18:37:06 rillig Exp $ */
+/* $NetBSD: lint1.h,v 1.212 2024/02/03 19:25:16 rillig Exp $ */
 
 /*
  * Copyright (c) 1996 Christopher G. Demetriou.  All Rights Reserved.
@@ -311,9 +311,14 @@ typedef struct tnode {
 		} tn_s;
 		sym_t	*_tn_sym;	/* symbol if op == NAME */
 		val_t	_tn_val;	/* value if op == CON */
-		buffer	*_tn_string;	/* string if op == STRING; for wide
-					 * char strings, data is NULL but len
-					 * is valid */
+		buffer	*_tn_string;	/* string if op == STRING; for
+					 * character strings, 'data' points to
+					 * the concatenated string literals in
+					 * source form, and 'len' is the
+					 * length of the concatenation; for
+					 * wide strings, 'data' is NULL and
+					 * 'len' is the number of resulting
+					 * characters */
 	} tn_u;
 } tnode_t;
 
@@ -518,6 +523,22 @@ typedef enum {
 	LC_VARARGS,
 } lint_comment;
 
+typedef struct {
+	size_t start;
+	size_t i;
+	uint64_t value;
+	bool escaped;		/* \n, \003, \x24 */
+	bool named_escape;	/* \a, \n, etc. */
+	bool literal_escape;	/* \?, \\, etc. */
+	uint8_t octal_digits;	/* 1 to 3; 0 means not applicable */
+	uint8_t hex_digits;	/* 1 to 3; 0 means not applicable */
+	bool next_literal;	/* when a new string literal begins */
+	bool invalid_escape;	/* single-character escape, recoverable */
+	bool overflow;		/* for octal and hex escapes */
+	bool missing_hex_digits;
+	bool unescaped_newline;	/* stops iterating */
+} quoted_iterator;
+
 #include "externs1.h"
 
 #define lint_assert(cond)						\

Index: src/usr.bin/xlint/lint1/tree.c
diff -u src/usr.bin/xlint/lint1/tree.c:1.601 src/usr.bin/xlint/lint1/tree.c:1.602
--- src/usr.bin/xlint/lint1/tree.c:1.601	Thu Feb  1 21:19:13 2024
+++ src/usr.bin/xlint/lint1/tree.c	Sat Feb  3 19:25:16 2024
@@ -1,4 +1,4 @@
-/*	$NetBSD: tree.c,v 1.601 2024/02/01 21:19:13 rillig Exp $	*/
+/*	$NetBSD: tree.c,v 1.602 2024/02/03 19:25:16 rillig Exp $	*/
 
 /*
  * Copyright (c) 1994, 1995 Jochen Pohl
@@ -37,7 +37,7 @@
 
 #include <sys/cdefs.h>
 #if defined(__RCSID)
-__RCSID("$NetBSD: tree.c,v 1.601 2024/02/01 21:19:13 rillig Exp $");
+__RCSID("$NetBSD: tree.c,v 1.602 2024/02/03 19:25:16 rillig Exp $");
 #endif
 
 #include <float.h>
@@ -522,14 +522,19 @@ build_name(sym_t *sym, bool is_funcname)
 }
 
 tnode_t *
-build_string(buffer *strg)
+build_string(buffer *lit)
 {
-	size_t len = strg->len;
+	size_t value_len = lit->len;
+	if (lit->data != NULL) {
+		quoted_iterator it = { .start = 0 };
+		for (value_len = 0; quoted_next(lit, &it); value_len++)
+			continue;
+	}
 
 	type_t *tp = expr_zero_alloc(sizeof(*tp), "type");
 	tp->t_tspec = ARRAY;
-	tp->t_subt = gettyp(strg->data != NULL ? CHAR : WCHAR_TSPEC);
-	tp->t_dim = (int)(len + 1);
+	tp->t_subt = gettyp(lit->data != NULL ? CHAR : WCHAR_TSPEC);
+	tp->t_dim = (int)(value_len + 1);
 
 	tnode_t *n = expr_alloc_tnode();
 	n->tn_op = STRING;
@@ -537,15 +542,15 @@ build_string(buffer *strg)
 	n->tn_lvalue = true;
 
 	n->tn_string = expr_zero_alloc(sizeof(*n->tn_string), "tnode.string");
-	n->tn_string->len = len;
+	n->tn_string->len = lit->len;
 
-	if (strg->data != NULL) {
-		n->tn_string->data = expr_zero_alloc(len + 1,
+	if (lit->data != NULL) {
+		n->tn_string->data = expr_zero_alloc(lit->len + 1,
 		    "tnode.string.data");
-		(void)memcpy(n->tn_string->data, strg->data, len + 1);
-		free(strg->data);
+		(void)memcpy(n->tn_string->data, lit->data, lit->len + 1);
+		free(lit->data);
 	}
-	free(strg);
+	free(lit);
 
 	return n;
 }

Reply via email to