Re: counting zero-length matches in sed(1)

Otto Moerbeek Fri, 24 Jun 2011 12:42:02 -0700

On Sat, Jun 18, 2011 at 08:16:05PM -0600, Ingo Schwarze wrote:

> When a regular expression has zero-length matches in a string,
> both sed(1) global replacement (/g) and replacement of numbered
> instances (e.g. /2) are broken.  This is not even limited to sed -E.
> Both Otto's patch and my own refactoring patch on misc@ only address
> global replacement and leave numbered instances broken.
> 
> Already now, code in the three parts of the switch statement
> is rather repetitive, and by fixing the numbered instances case,
> code duplication would become much worse.  Thus, i propose
> the following full rewrite of the whole switch statement.
> 
> This is a bit scary because...
> Well, sed(1) is not exactly the tool we want to break.
> Hence, a test suite is included.
> Both my patch and GNU sed(1) pass the test suite.
> 
> Our -current sed fails 43 of these tests, quite a few of them
> in spectacular ways, like
> 
>   $ echo | sed 's/$/x/2'   # expect ""
>   x
>   $ echo aaa | sed 's/a*/x/2'   # expect "aaa"
>   aaax
>   $ echo abc | sed -E 's/a|$/x/g'   # expect "xbcx"
>   x
>   $ echo abc | sed -E 's/()/x/2'    # expect "axbc"
>   xabc
>   $ echo abc | sed -E 's/()/x/42'   # expect "abc"
>   xabc
> 
> One common source of confusion on misc@ was that Perl allows
> empty matches right after other matches, like in:
> 
>   $ perl -Mstrict -we '$_ = "abc"; s/b|()/x/g; print "$_\n";'
>   xaxxcx
>   $ perl -Mstrict -we '$_ = "a"; s/^|a|$/x/g; print "$_\n";'
>   xxx
> 
> I consider that broken behaviour in Perl.  For example, think about
> the case of "a" =~ /^|a/.  The branch /^/ matches at 0, length 0.
> The branch /a/ matches at 0, length 1.  So by greediness, the latter
> ought to prevail.  But then, we have already consumed the character,
> and there is no way to get a second match.  Hence,
> 
>   $ echo abc | sed -E 's/b|()/x/g'
>   xaxcx
>   $ echo a | sed -E 's/^|a|$/x/g'  
>   x
> 
> Comments?
>   Ingo


Couple of comments;

- re_off is no longer used.  

- slen is unsigned. We need to be 100% sure it cannot wrap. ATM I'm
not completely convinced the empty match case is safe. If you look at
the FreeBSD code, it appears the case match[0].rm_eo == slen can
happen, and in that case slen will wrap.  They fixed it by making slen
a signed type (regoff_t). See rev 1.30 and 1.31 in their tree. 

When I run the regress test suite from FreeBSD (in
src/tools/regression/usr.bin/sed) I see a couple of failures with our
sed. I did not have time yet to look if these failures have anything
with this diff and if we should incoorporate them. 

        -Otto

> 
> 
> Index: usr.bin/sed/process.c
> ===================================================================
> RCS file: /cvs/src/usr.bin/sed/process.c,v
> retrieving revision 1.15
> diff -u -p -r1.15 process.c
> --- usr.bin/sed/process.c     27 Oct 2009 23:59:43 -0000      1.15
> +++ usr.bin/sed/process.c     19 Jun 2011 01:47:19 -0000
> @@ -333,60 +333,47 @@ substitute(struct s_command *cp)
>       n = cp->u.s->n;
>       lastempty = 1;
>  
> -     switch (n) {
> -     case 0:                                 /* Global */
> -             do {
> -                     if (lastempty || match[0].rm_so != match[0].rm_eo) {
> -                             /* Locate start of replaced string. */
> -                             re_off = match[0].rm_so;
> -                             /* Copy leading retained string. */
> -                             cspace(&SS, s, re_off, APPEND);
> -                             /* Add in regular expression. */
> -                             regsub(&SS, s, cp->u.s->new);
> -                     }
> +     do {
> +             /* Copy the leading retained string. */
> +             if (n <= 1 && match[0].rm_so)
> +                     cspace(&SS, s, match[0].rm_so, APPEND);
>  
> -                     /* Move past this match. */
> -                     if (match[0].rm_so != match[0].rm_eo) {
> -                             s += match[0].rm_eo;
> -                             slen -= match[0].rm_eo;
> -                             lastempty = 0;
> +             /* Skip zero-length matches right after other matches. */
> +             if (lastempty || match[0].rm_so ||
> +                 match[0].rm_so != match[0].rm_eo) {
> +                     if (n <= 1) {
> +                             /* Want this match: append replacement. */
> +                             regsub(&SS, s, cp->u.s->new);
> +                             if (n == 1)
> +                                     n = -1;
>                       } else {
> -                             if (match[0].rm_so == 0)
> -                                     cspace(&SS, s, match[0].rm_so + 1,
> -                                         APPEND);
> -                             else
> -                                     cspace(&SS, s + match[0].rm_so, 1,
> -                                         APPEND);
> -                             s += match[0].rm_so + 1;
> -                             slen -= match[0].rm_so + 1;
> -                             lastempty = 1;
> +                             /* Want a later match: append original. */
> +                             if (match[0].rm_eo)
> +                                     cspace(&SS, s, match[0].rm_eo, APPEND);
> +                             n--;
>                       }
> -             } while (slen > 0 && regexec_e(re, s, REG_NOTBOL, 0, slen));
> -             /* Copy trailing retained string. */
> -             if (slen > 0)
> -                     cspace(&SS, s, slen, APPEND);
> -             break;
> -     default:                                /* Nth occurrence */
> -             while (--n) {
> -                     s += match[0].rm_eo;
> -                     slen -= match[0].rm_eo;
> -                     if (!regexec_e(re, s, REG_NOTBOL, 0, slen))
> -                             return (0);
>               }
> -             /* FALLTHROUGH */
> -     case 1:                                 /* 1st occurrence */
> -             /* Locate start of replaced string. */
> -             re_off = match[0].rm_so + (s - ps);
> -             /* Copy leading retained string. */
> -             cspace(&SS, ps, re_off, APPEND);
> -             /* Add in regular expression. */
> -             regsub(&SS, s, cp->u.s->new);
> -             /* Copy trailing retained string. */
> +
> +             /* After a zero-length match, advance one byte. */
> +             if (match[0].rm_so == match[0].rm_eo) {
> +                     cspace(&SS, s + match[0].rm_eo++, 1, APPEND);
> +                     lastempty = 1;
> +             } else
> +                     lastempty = 0;
> +
> +             /* Move past this match. */
>               s += match[0].rm_eo;
>               slen -= match[0].rm_eo;
> +
> +     } while (n >= 0 && slen > 0 && regexec_e(re, s, REG_NOTBOL, 0, slen));
> +
> +     /* Did not find the requested number of matches. */
> +     if (n > 1)
> +             return (0);
> +
> +     /* Copy the trailing retained string. */
> +     if (slen > 0)
>               cspace(&SS, s, slen, APPEND);
> -             break;
> -     }
>  
>       /*
>        * Swap the substitute space and the pattern space, and make sure
> Index: regress/usr.bin/sed/Makefile
> ===================================================================
> RCS file: /cvs/src/regress/usr.bin/sed/Makefile,v
> retrieving revision 1.2
> diff -u -p -r1.2 Makefile
> --- regress/usr.bin/sed/Makefile      13 Oct 2008 13:27:33 -0000      1.2
> +++ regress/usr.bin/sed/Makefile      19 Jun 2011 01:47:19 -0000
> @@ -3,11 +3,14 @@
>  
>  SED= /usr/bin/sed
>  
> -REGRESS_TARGETS= sedtest hanoi math sierpinski
> +REGRESS_TARGETS= sedtest substitute hanoi math sierpinski
>  
>  sedtest:
>       sh ${.CURDIR}/$@.sh ${SED} $@.out
>       diff ${.CURDIR}/$@.expected $@.out
> +
> +substitute:
> +     sh ${.CURDIR}/$@.sh
>  
>  hanoi:
>       ${SED} -f ${.CURDIR}/$@.sed ${.CURDIR}/$@.in > $@.out
> Index: regress/usr.bin/sed/substitute.sh
> ===================================================================
> RCS file: regress/usr.bin/sed/substitute.sh
> diff -N regress/usr.bin/sed/substitute.sh
> --- /dev/null 1 Jan 1970 00:00:00 -0000
> +++ regress/usr.bin/sed/substitute.sh 19 Jun 2011 01:47:19 -0000
> @@ -0,0 +1,74 @@
> +#!/bin/sh
> +
> +# error counter
> +err=0
> +
> +# test function; arguments are:
> +# input string
> +# regular expression to replace
> +# wanted output for /g (global substitution)
> +# wanted output for /1 (substitution of first match) and so on
> +t() {
> +     # global substitution
> +     in=$1
> +     expr=$2
> +     want=$3
> +     shift 3
> +     out=`echo "$in" | sed -E "s/$expr/x/g"`
> +     [ "X$out" = "X$want" ] || echo "$in/$expr/g/$want/$out ($((++err)))"
> +
> +     # substitution with specific index
> +     num=1
> +     while [ $# -gt 0 ]; do
> +             want=$1
> +             shift
> +             out=`echo "$in" | sed -E "s/$expr/x/$num"`
> +             [ "X$out" = "X$want" ] || \
> +                     echo "$in/$expr/$num/$want/$out ($((++err)))"
> +             num=$((num+1))
> +     done
> +
> +     # substitution with excessive index
> +     out=`echo "$in" | sed -E "s/$expr/x/$num"`
> +     [ "X$out" = "X$in" ] || echo "$in/$expr/$num/=/$out ($((++err)))"
> +}
> +
> +t '' ^ x x
> +t '' '()' x x
> +t '' '$' x x
> +t '' '^|$' x x
> +t a ^ xa xa
> +t a '()' xax xa ax
> +t a '$' ax ax
> +t a '^|a' x x
> +t a '^|$' xax xa ax
> +t a '^|a|$' x x
> +t a 'a|$' x x
> +t ab ^ xab xab
> +t ab '()' xaxbx xab axb abx
> +t ab '$' abx abx
> +t ab '^|a' xb xb
> +t ab '^|b' xax xab ax
> +t ab '^|$' xabx xab abx
> +t ab '^|a|$' xbx xb abx
> +t ab '^|b|$' xax xab ax
> +t ab '^|a|b|$' xx xb ax
> +t ab '^|ab|$' x x
> +t ab 'a|()' xbx xb abx
> +t ab 'a|$' xbx xb abx
> +t ab 'ab|$' x x
> +t ab 'b|()' xax xab ax
> +t ab 'b|$' ax ax
> +t abc '^|b' xaxc xabc axc
> +t abc '^|b|$' xaxcx xabc axc abcx
> +t abc '^|bc|$' xax xabc ax
> +t abc 'ab|()' xcx xc abcx
> +t abc 'ab|$' xcx xc abcx
> +t abc 'b|()' xaxcx xabc axc abcx
> +t abc 'bc|()' xax xabc ax
> +t abc 'b|$' axcx axc abcx
> +t aa a xx xa ax
> +t aa 'a|()' xx xa ax
> +t aa 'a*' x x
> +
> +exit $err

Re: counting zero-length matches in sed(1)

Reply via email to