Module Name:    src
Committed By:   martin
Date:           Mon Sep 10 15:45:11 UTC 2018

Modified Files:
        src/bin/sh [netbsd-8]: expand.c parser.c syntax.c syntax.h

Log Message:
Pull up following revision(s) via patch (requested by kre in ticket #1015):

        bin/sh/expand.c: revision 1.124
        bin/sh/expand.c: revision 1.127
        bin/sh/parser.c: revision 1.148
        bin/sh/parser.c: revision 1.149
        bin/sh/syntax.c: revision 1.6
        bin/sh/syntax.h: revision 1.9 (partial)

First pass at fixing some of the more arcane pattern matching
possibilities that we do not currently handle all that well.

This mostly means (for now) making sure that quoted pattern
magic characters (as well as quoted sh syntax magic chars)
are properly marked, so they remain known as being quoted,
and do not turn into pattern magic.   Also, make sure that an
unquoted \ in a pattern always quotes whatever comes next
(which, unlike in regular expressions, includes inside []
matches).

 -

Part 2 of pattern matching (glob etc) fixes.
Attempt to correctly deal with \ (both when it is a literal,
in appropriate cases, and when it appears as CTLESC when it was
detected as a quoting character during parsing).

In a pattern, in sh, no quoted character can ever be anything other
than a literal character.   This is quite different than regular
expressions, and even different than other uses of glob matching,
where shell quoting is not an issue.

In something like
        ls ?\*.c
the ? is a meta-character, the * is a literal (it was quoted).  This
is nothing new; sh has always handled that properly.

But the same happens with
        VAR='?\*.c'
and
        ls $VAR
which has not always been handled correctly.   Of course, in
        ls "$VAR"
nothing in VAR is a meta-character (the entire expansion is quoted)
so even the '\' must match literally (or more accurately, no matching
happens - VAR simply contains an "unusual" filename).  But if it had
been
        ls *"$VAR"
then we would be looking for filenames that end with the literal 5
characters that make up $VAR.

The same kinds of things are required of matching patterns in case
statements, and sub-strings with the % and # operators in variable
expansions.

While here, the final remnant of the ancient !! pattern matching
hack has been removed (the code that actually implemented it was
long gone, but one small piece remained, not doing any real harm,
but potentially wasting time - if someone gave a pattern which would
once have invoked that hack.)


To generate a diff of this commit:
cvs rdiff -u -r1.110.2.4 -r1.110.2.5 src/bin/sh/expand.c
cvs rdiff -u -r1.132.2.5 -r1.132.2.6 src/bin/sh/parser.c
cvs rdiff -u -r1.3.26.1 -r1.3.26.2 src/bin/sh/syntax.c
cvs rdiff -u -r1.7.2.1 -r1.7.2.2 src/bin/sh/syntax.h

Please note that diffs are not public domain; they are subject to the
copyright notices on the relevant files.

Modified files:

Index: src/bin/sh/expand.c
diff -u src/bin/sh/expand.c:1.110.2.4 src/bin/sh/expand.c:1.110.2.5
--- src/bin/sh/expand.c:1.110.2.4	Fri Jul 13 14:32:01 2018
+++ src/bin/sh/expand.c	Mon Sep 10 15:45:11 2018
@@ -1,4 +1,4 @@
-/*	$NetBSD: expand.c,v 1.110.2.4 2018/07/13 14:32:01 martin Exp $	*/
+/*	$NetBSD: expand.c,v 1.110.2.5 2018/09/10 15:45:11 martin Exp $	*/
 
 /*-
  * Copyright (c) 1991, 1993
@@ -37,7 +37,7 @@
 #if 0
 static char sccsid[] = "@(#)expand.c	8.5 (Berkeley) 5/15/95";
 #else
-__RCSID("$NetBSD: expand.c,v 1.110.2.4 2018/07/13 14:32:01 martin Exp $");
+__RCSID("$NetBSD: expand.c,v 1.110.2.5 2018/09/10 15:45:11 martin Exp $");
 #endif
 #endif /* not lint */
 
@@ -924,7 +924,9 @@ evalvar(const char *p, int flag)
 					varlen++;
 			} else {
 				while (*val) {
-					if (quotes && syntax[(int)*val] == CCTL)
+					if (quotes && (varflags & VSQUOTE) &&
+					    (syntax[(int)*val] == CCTL ||
+					     syntax[(int)*val] == CBACK))
 						STPUTC(CTLESC, expdest);
 					STPUTC(*val++, expdest);
 				}
@@ -1106,7 +1108,7 @@ varvalue(const char *name, int quoted, i
 	int num;
 	char *p;
 	int i;
-	char sep;
+	int sep;
 	char **ap;
 	char const *syntax;
 
@@ -1164,10 +1166,14 @@ varvalue(const char *name, int quoted, i
 			STRTODEST(p);
 			if (!*ap)
 				break;
-			if (sep)
+			if (sep) {
+				if (quoted && (flag & (EXP_GLOB|EXP_CASE)) &&
+				    (SQSYNTAX[sep] == CCTL || SQSYNTAX[sep] == CSBACK))
+					STPUTC(CTLESC, expdest);
 				STPUTC(sep, expdest);
-			else if ((flag & (EXP_SPLIT|EXP_IN_QUOTES)) == EXP_SPLIT
-			    && !quoted && **ap != '\0')
+			} else
+			    if ((flag & (EXP_SPLIT|EXP_IN_QUOTES)) == EXP_SPLIT
+			      && !quoted && **ap != '\0')
 				STPUTC('\0', expdest);
 		}
 		return;
@@ -1457,22 +1463,59 @@ expmeta(char *enddir, char *name)
 			metaflag = 1;
 		else if (*p == '[') {
 			q = p + 1;
-			if (*q == '!')
+			if (*q == '!' || *q == '^')
 				q++;
 			for (;;) {
 				while (*q == CTLQUOTEMARK || *q == CTLNONL)
 					q++;
-				if (*q == CTLESC)
+				if (*q == ']') {
 					q++;
-				if (*q == '/' || *q == '\0')
+					metaflag = 1;
 					break;
-				if (*++q == ']') {
+				}
+				if (*q == '[' && q[1] == ':') {
+					/*
+					 * character class, look for :] ending
+					 * also stop on ']' (end bracket expr)
+					 * or '\0' or '/' (end pattern)
+					 */
+					while (*++q != '\0' && *q != ']' &&
+					    *q != '/') {
+						if (*q == CTLESC) {
+							if (*++q == '\0')
+								break;
+							if (*q == '/')
+								break;
+						} else if (*q == ':' &&
+						    q[1] == ']')
+							break;
+					}
+					if (*q == ':') {
+						/*
+						 * stopped at ':]'
+						 * still in [...]
+						 * skip ":]" and continue;
+						 */
+						q += 2;
+						continue;
+					}
+
+					/* done at end of pattern, not [...] */
+					if (*q == '\0' || *q == '/')
+						break;
+
+					/* found the ']', we have a [...] */
 					metaflag = 1;
+					q++;	/* skip ']' */
 					break;
 				}
+				if (*q == CTLESC)
+					q++;
+				/* end of pattern cannot be escaped */
+				if (*q == '/' || *q == '\0')
+					break;
+				q++;
 			}
-		} else if (*p == '!' && p[1] == '!'	&& (p == name || p[-1] == '/')) {
-			metaflag = 1;
 		} else if (*p == '\0')
 			break;
 		else if (*p == CTLQUOTEMARK || *p == CTLNONL)
@@ -1690,12 +1733,26 @@ patmatch(const char *pattern, const char
 	for (;;) {
 		switch (c = *p++) {
 		case '\0':
+			if (squoted && *q == CTLESC) {
+				if (q[1] == '\0')
+					q++;
+			}
 			if (*q != '\0')
 				goto backtrack;
 			return 1;
 		case CTLESC:
 			if (squoted && *q == CTLESC)
 				q++;
+			if (*p == '\0' && *q == '\0') {
+				VTRACE(DBG_MATCH, ("match-\\\n"));
+				return 1;
+			}
+			if (*q++ != *p++)
+				goto backtrack;
+			break;
+		case '\\':
+			if (squoted && *q == CTLESC)
+				q++;
 			if (*q++ != *p++)
 				goto backtrack;
 			break;
@@ -1725,6 +1782,10 @@ patmatch(const char *pattern, const char
 					q++;
 				}
 			}
+			if (c == CTLESC && p[1] == '\0') {
+				VTRACE(DBG_MATCH, ("match+\\\n"));
+				return 1;
+			}
 			/*
 			 * First try the shortest match for the '*' that
 			 * could work. We can forget any earlier '*' since
@@ -1739,19 +1800,31 @@ patmatch(const char *pattern, const char
 			int invert, found;
 			unsigned char chr;
 
+			/*
+			 * First quick check to see if there is a
+			 * possible matching ']' - if not, then this
+			 * is not a char class, and the '[' is just
+			 * a literal '['.
+			 *
+			 * This check will not detect all non classes, but
+			 * that's OK - It just means that we execute the
+			 * harder code sometimes when it it cannot succeed.
+			 */
 			endp = p;
-			if (*endp == '!')
+			if (*endp == '!' || *endp == '^')
 				endp++;
 			for (;;) {
 				while (*endp == CTLQUOTEMARK || *endp==CTLNONL)
 					endp++;
 				if (*endp == '\0')
-					goto dft;		/* no matching ] */
+					goto dft;	/* no matching ] */
 				if (*endp == CTLESC)
 					endp++;
 				if (*++endp == ']')
 					break;
 			}
+			/* end shortcut */
+
 			invert = 0;
 			savep = p, saveq = q;
 			invert = 0;
@@ -1762,6 +1835,8 @@ patmatch(const char *pattern, const char
 			found = 0;
 			if (*q == '\0')
 				return 0;
+			if (squoted && *q == CTLESC)
+				q++;
 			chr = (unsigned char)*q++;
 			c = *p++;
 			do {
@@ -1779,12 +1854,12 @@ patmatch(const char *pattern, const char
 						continue;
 					}
 				}
-				if (c == CTLESC)
+				if (c == CTLESC || c == '\\')
 					c = *p++;
 				wc = (unsigned char)c;
 				if (*p == '-' && p[1] != ']') {
 					p++;
-					if (*p == CTLESC)
+					if (*p == CTLESC || *p == '\\')
 						p++;
 					wc2 = (unsigned char)*p++;
 					if (   collate_range_cmp(chr, wc) >= 0

Index: src/bin/sh/parser.c
diff -u src/bin/sh/parser.c:1.132.2.5 src/bin/sh/parser.c:1.132.2.6
--- src/bin/sh/parser.c:1.132.2.5	Sat Aug 25 14:45:37 2018
+++ src/bin/sh/parser.c	Mon Sep 10 15:45:11 2018
@@ -1,4 +1,4 @@
-/*	$NetBSD: parser.c,v 1.132.2.5 2018/08/25 14:45:37 martin Exp $	*/
+/*	$NetBSD: parser.c,v 1.132.2.6 2018/09/10 15:45:11 martin Exp $	*/
 
 /*-
  * Copyright (c) 1991, 1993
@@ -37,7 +37,7 @@
 #if 0
 static char sccsid[] = "@(#)parser.c	8.7 (Berkeley) 5/16/95";
 #else
-__RCSID("$NetBSD: parser.c,v 1.132.2.5 2018/08/25 14:45:37 martin Exp $");
+__RCSID("$NetBSD: parser.c,v 1.132.2.6 2018/09/10 15:45:11 martin Exp $");
 #endif
 #endif /* not lint */
 
@@ -1633,7 +1633,7 @@ readtoken1(int firstc, char const *syn, 
 	for (c = firstc ;; c = pgetc_macro()) {	/* until of token */
 		if (syntax == ARISYNTAX)
 			out = insert_elided_nl(out);
-		CHECKSTRSPACE(4, out);	/* permit 4 calls to USTPUTC */
+		CHECKSTRSPACE(6, out);	/* permit 6 calls to USTPUTC */
 		switch (syntax[c]) {
 		case CNL:	/* '\n' */
 			if (syntax == BASESYNTAX && varnest == 0)
@@ -1646,6 +1646,9 @@ readtoken1(int firstc, char const *syn, 
 				setprompt(0);
 			continue;
 
+		case CSBACK:	/* single quoted backslash */
+			USTPUTC(CTLESC, out);
+			/* FALLTHROUGH */
 		case CWORD:
 			USTPUTC(c, out);
 			continue;
@@ -1672,9 +1675,11 @@ readtoken1(int firstc, char const *syn, 
 			}
 			quotef = 1;	/* current token is quoted */
 			if (ISDBLQUOTE() && c != '\\' && c != '`' &&
-			    c != '$' && (c != '"' || magicq))
+			    c != '$' && (c != '"' || magicq)) {
+				USTPUTC(CTLESC, out);
 				USTPUTC('\\', out);
-			if (SQSYNTAX[c] == CCTL)
+			}
+			if (SQSYNTAX[c] == CCTL || SQSYNTAX[c] == CSBACK)
 				USTPUTC(CTLESC, out);
 			else if (!magicq) {
 				USTPUTC(CTLQUOTEMARK, out);

Index: src/bin/sh/syntax.c
diff -u src/bin/sh/syntax.c:1.3.26.1 src/bin/sh/syntax.c:1.3.26.2
--- src/bin/sh/syntax.c:1.3.26.1	Sun Jul 23 14:58:14 2017
+++ src/bin/sh/syntax.c	Mon Sep 10 15:45:11 2018
@@ -1,7 +1,7 @@
-/*	$NetBSD: syntax.c,v 1.3.26.1 2017/07/23 14:58:14 snj Exp $	*/
+/*	$NetBSD: syntax.c,v 1.3.26.2 2018/09/10 15:45:11 martin Exp $	*/
 
 #include <sys/cdefs.h>
-__RCSID("$NetBSD: syntax.c,v 1.3.26.1 2017/07/23 14:58:14 snj Exp $");
+__RCSID("$NetBSD: syntax.c,v 1.3.26.2 2018/09/10 15:45:11 martin Exp $");
 
 #include <limits.h>
 #include "shell.h"
@@ -46,7 +46,7 @@ const char dqsyntax[257] = { CEOF,
     set('`', CBQUOTE)
     set('$', CVAR)
     set('}', CENDVAR)
-    /* ':/' for tilde expansion, '-' for [a\-x] pattern ranges */
+    /* ':/' for tilde expansion, '-]' for [a\-x] pattern ranges */
     set('!', CCTL)
     set('*', CCTL)
     set('?', CCTL)
@@ -56,6 +56,7 @@ const char dqsyntax[257] = { CEOF,
     set(':', CCTL)
     set('/', CCTL)
     set('-', CCTL)
+    set(']', CCTL)
 };
 
 /* syntax table used when in single quotes */
@@ -63,7 +64,8 @@ const char sqsyntax[257] = { CEOF,
     set_range(CTL_FIRST, CTL_LAST, CCTL)
     set('\n', CNL)
     set('\'', CSQUOTE)
-    /* ':/' for tilde expansion, '-' for [a\-x] pattern ranges */
+    set('\\', CSBACK)
+    /* ':/' for tilde expansion, '-]' for [a\-x] pattern ranges */
     set('!', CCTL)
     set('*', CCTL)
     set('?', CCTL)
@@ -73,6 +75,7 @@ const char sqsyntax[257] = { CEOF,
     set(':', CCTL)
     set('/', CCTL)
     set('-', CCTL)
+    set(']', CCTL)
 };
 
 /* syntax table used when in arithmetic */

Index: src/bin/sh/syntax.h
diff -u src/bin/sh/syntax.h:1.7.2.1 src/bin/sh/syntax.h:1.7.2.2
--- src/bin/sh/syntax.h:1.7.2.1	Sun Jul 23 14:58:14 2017
+++ src/bin/sh/syntax.h	Mon Sep 10 15:45:11 2018
@@ -1,4 +1,4 @@
-/*	$NetBSD: syntax.h,v 1.7.2.1 2017/07/23 14:58:14 snj Exp $	*/
+/*	$NetBSD: syntax.h,v 1.7.2.2 2018/09/10 15:45:11 martin Exp $	*/
 
 /*-
  * Copyright (c) 1991, 1993
@@ -49,6 +49,7 @@
 #define CEOF 10			/* end of file */
 #define CCTL 11			/* like CWORD, except it must be escaped */
 #define CSPCL 12		/* these terminate a word */
+#define CSBACK 13		/* a backslash in a single quote syntax */
 
 /* Syntax classes for is_ functions */
 #define ISDIGIT 01		/* a digit */

Reply via email to