On Sat, Jun 18, 2011 at 08:16:05PM -0600, Ingo Schwarze wrote: > When a regular expression has zero-length matches in a string, > both sed(1) global replacement (/g) and replacement of numbered > instances (e.g. /2) are broken. This is not even limited to sed -E. > Both Otto's patch and my own refactoring patch on misc@ only address > global replacement and leave numbered instances broken. > > Already now, code in the three parts of the switch statement > is rather repetitive, and by fixing the numbered instances case, > code duplication would become much worse. Thus, i propose > the following full rewrite of the whole switch statement. > > This is a bit scary because... > Well, sed(1) is not exactly the tool we want to break. > Hence, a test suite is included. > Both my patch and GNU sed(1) pass the test suite. > > Our -current sed fails 43 of these tests, quite a few of them > in spectacular ways, like > > $ echo | sed 's/$/x/2' # expect "" > x > $ echo aaa | sed 's/a*/x/2' # expect "aaa" > aaax > $ echo abc | sed -E 's/a|$/x/g' # expect "xbcx" > x > $ echo abc | sed -E 's/()/x/2' # expect "axbc" > xabc > $ echo abc | sed -E 's/()/x/42' # expect "abc" > xabc > > One common source of confusion on misc@ was that Perl allows > empty matches right after other matches, like in: > > $ perl -Mstrict -we '$_ = "abc"; s/b|()/x/g; print "$_\n";' > xaxxcx > $ perl -Mstrict -we '$_ = "a"; s/^|a|$/x/g; print "$_\n";' > xxx > > I consider that broken behaviour in Perl. For example, think about > the case of "a" =~ /^|a/. The branch /^/ matches at 0, length 0. > The branch /a/ matches at 0, length 1. So by greediness, the latter > ought to prevail. But then, we have already consumed the character, > and there is no way to get a second match. Hence, > > $ echo abc | sed -E 's/b|()/x/g' > xaxcx > $ echo a | sed -E 's/^|a|$/x/g' > x > > Comments? > Ingo
Couple of comments; - re_off is no longer used. - slen is unsigned. We need to be 100% sure it cannot wrap. ATM I'm not completely convinced the empty match case is safe. If you look at the FreeBSD code, it appears the case match[0].rm_eo == slen can happen, and in that case slen will wrap. They fixed it by making slen a signed type (regoff_t). See rev 1.30 and 1.31 in their tree. When I run the regress test suite from FreeBSD (in src/tools/regression/usr.bin/sed) I see a couple of failures with our sed. I did not have time yet to look if these failures have anything with this diff and if we should incoorporate them. -Otto > > > Index: usr.bin/sed/process.c > =================================================================== > RCS file: /cvs/src/usr.bin/sed/process.c,v > retrieving revision 1.15 > diff -u -p -r1.15 process.c > --- usr.bin/sed/process.c 27 Oct 2009 23:59:43 -0000 1.15 > +++ usr.bin/sed/process.c 19 Jun 2011 01:47:19 -0000 > @@ -333,60 +333,47 @@ substitute(struct s_command *cp) > n = cp->u.s->n; > lastempty = 1; > > - switch (n) { > - case 0: /* Global */ > - do { > - if (lastempty || match[0].rm_so != match[0].rm_eo) { > - /* Locate start of replaced string. */ > - re_off = match[0].rm_so; > - /* Copy leading retained string. */ > - cspace(&SS, s, re_off, APPEND); > - /* Add in regular expression. */ > - regsub(&SS, s, cp->u.s->new); > - } > + do { > + /* Copy the leading retained string. */ > + if (n <= 1 && match[0].rm_so) > + cspace(&SS, s, match[0].rm_so, APPEND); > > - /* Move past this match. */ > - if (match[0].rm_so != match[0].rm_eo) { > - s += match[0].rm_eo; > - slen -= match[0].rm_eo; > - lastempty = 0; > + /* Skip zero-length matches right after other matches. */ > + if (lastempty || match[0].rm_so || > + match[0].rm_so != match[0].rm_eo) { > + if (n <= 1) { > + /* Want this match: append replacement. */ > + regsub(&SS, s, cp->u.s->new); > + if (n == 1) > + n = -1; > } else { > - if (match[0].rm_so == 0) > - cspace(&SS, s, match[0].rm_so + 1, > - APPEND); > - else > - cspace(&SS, s + match[0].rm_so, 1, > - APPEND); > - s += match[0].rm_so + 1; > - slen -= match[0].rm_so + 1; > - lastempty = 1; > + /* Want a later match: append original. */ > + if (match[0].rm_eo) > + cspace(&SS, s, match[0].rm_eo, APPEND); > + n--; > } > - } while (slen > 0 && regexec_e(re, s, REG_NOTBOL, 0, slen)); > - /* Copy trailing retained string. */ > - if (slen > 0) > - cspace(&SS, s, slen, APPEND); > - break; > - default: /* Nth occurrence */ > - while (--n) { > - s += match[0].rm_eo; > - slen -= match[0].rm_eo; > - if (!regexec_e(re, s, REG_NOTBOL, 0, slen)) > - return (0); > } > - /* FALLTHROUGH */ > - case 1: /* 1st occurrence */ > - /* Locate start of replaced string. */ > - re_off = match[0].rm_so + (s - ps); > - /* Copy leading retained string. */ > - cspace(&SS, ps, re_off, APPEND); > - /* Add in regular expression. */ > - regsub(&SS, s, cp->u.s->new); > - /* Copy trailing retained string. */ > + > + /* After a zero-length match, advance one byte. */ > + if (match[0].rm_so == match[0].rm_eo) { > + cspace(&SS, s + match[0].rm_eo++, 1, APPEND); > + lastempty = 1; > + } else > + lastempty = 0; > + > + /* Move past this match. */ > s += match[0].rm_eo; > slen -= match[0].rm_eo; > + > + } while (n >= 0 && slen > 0 && regexec_e(re, s, REG_NOTBOL, 0, slen)); > + > + /* Did not find the requested number of matches. */ > + if (n > 1) > + return (0); > + > + /* Copy the trailing retained string. */ > + if (slen > 0) > cspace(&SS, s, slen, APPEND); > - break; > - } > > /* > * Swap the substitute space and the pattern space, and make sure > Index: regress/usr.bin/sed/Makefile > =================================================================== > RCS file: /cvs/src/regress/usr.bin/sed/Makefile,v > retrieving revision 1.2 > diff -u -p -r1.2 Makefile > --- regress/usr.bin/sed/Makefile 13 Oct 2008 13:27:33 -0000 1.2 > +++ regress/usr.bin/sed/Makefile 19 Jun 2011 01:47:19 -0000 > @@ -3,11 +3,14 @@ > > SED= /usr/bin/sed > > -REGRESS_TARGETS= sedtest hanoi math sierpinski > +REGRESS_TARGETS= sedtest substitute hanoi math sierpinski > > sedtest: > sh ${.CURDIR}/$@.sh ${SED} $@.out > diff ${.CURDIR}/$@.expected $@.out > + > +substitute: > + sh ${.CURDIR}/$@.sh > > hanoi: > ${SED} -f ${.CURDIR}/$@.sed ${.CURDIR}/$@.in > $@.out > Index: regress/usr.bin/sed/substitute.sh > =================================================================== > RCS file: regress/usr.bin/sed/substitute.sh > diff -N regress/usr.bin/sed/substitute.sh > --- /dev/null 1 Jan 1970 00:00:00 -0000 > +++ regress/usr.bin/sed/substitute.sh 19 Jun 2011 01:47:19 -0000 > @@ -0,0 +1,74 @@ > +#!/bin/sh > + > +# error counter > +err=0 > + > +# test function; arguments are: > +# input string > +# regular expression to replace > +# wanted output for /g (global substitution) > +# wanted output for /1 (substitution of first match) and so on > +t() { > + # global substitution > + in=$1 > + expr=$2 > + want=$3 > + shift 3 > + out=`echo "$in" | sed -E "s/$expr/x/g"` > + [ "X$out" = "X$want" ] || echo "$in/$expr/g/$want/$out ($((++err)))" > + > + # substitution with specific index > + num=1 > + while [ $# -gt 0 ]; do > + want=$1 > + shift > + out=`echo "$in" | sed -E "s/$expr/x/$num"` > + [ "X$out" = "X$want" ] || \ > + echo "$in/$expr/$num/$want/$out ($((++err)))" > + num=$((num+1)) > + done > + > + # substitution with excessive index > + out=`echo "$in" | sed -E "s/$expr/x/$num"` > + [ "X$out" = "X$in" ] || echo "$in/$expr/$num/=/$out ($((++err)))" > +} > + > +t '' ^ x x > +t '' '()' x x > +t '' '$' x x > +t '' '^|$' x x > +t a ^ xa xa > +t a '()' xax xa ax > +t a '$' ax ax > +t a '^|a' x x > +t a '^|$' xax xa ax > +t a '^|a|$' x x > +t a 'a|$' x x > +t ab ^ xab xab > +t ab '()' xaxbx xab axb abx > +t ab '$' abx abx > +t ab '^|a' xb xb > +t ab '^|b' xax xab ax > +t ab '^|$' xabx xab abx > +t ab '^|a|$' xbx xb abx > +t ab '^|b|$' xax xab ax > +t ab '^|a|b|$' xx xb ax > +t ab '^|ab|$' x x > +t ab 'a|()' xbx xb abx > +t ab 'a|$' xbx xb abx > +t ab 'ab|$' x x > +t ab 'b|()' xax xab ax > +t ab 'b|$' ax ax > +t abc '^|b' xaxc xabc axc > +t abc '^|b|$' xaxcx xabc axc abcx > +t abc '^|bc|$' xax xabc ax > +t abc 'ab|()' xcx xc abcx > +t abc 'ab|$' xcx xc abcx > +t abc 'b|()' xaxcx xabc axc abcx > +t abc 'bc|()' xax xabc ax > +t abc 'b|$' axcx axc abcx > +t aa a xx xa ax > +t aa 'a|()' xx xa ax > +t aa 'a*' x x > + > +exit $err