When a regular expression has zero-length matches in a string,
both sed(1) global replacement (/g) and replacement of numbered
instances (e.g. /2) are broken.  This is not even limited to sed -E.
Both Otto's patch and my own refactoring patch on misc@ only address
global replacement and leave numbered instances broken.

Already now, code in the three parts of the switch statement
is rather repetitive, and by fixing the numbered instances case,
code duplication would become much worse.  Thus, i propose
the following full rewrite of the whole switch statement.

This is a bit scary because...
Well, sed(1) is not exactly the tool we want to break.
Hence, a test suite is included.
Both my patch and GNU sed(1) pass the test suite.

Our -current sed fails 43 of these tests, quite a few of them
in spectacular ways, like

  $ echo | sed 's/$/x/2'   # expect ""
  x
  $ echo aaa | sed 's/a*/x/2'   # expect "aaa"
  aaax
  $ echo abc | sed -E 's/a|$/x/g'   # expect "xbcx"
  x
  $ echo abc | sed -E 's/()/x/2'    # expect "axbc"
  xabc
  $ echo abc | sed -E 's/()/x/42'   # expect "abc"
  xabc

One common source of confusion on misc@ was that Perl allows
empty matches right after other matches, like in:

  $ perl -Mstrict -we '$_ = "abc"; s/b|()/x/g; print "$_\n";'
  xaxxcx
  $ perl -Mstrict -we '$_ = "a"; s/^|a|$/x/g; print "$_\n";'
  xxx

I consider that broken behaviour in Perl.  For example, think about
the case of "a" =~ /^|a/.  The branch /^/ matches at 0, length 0.
The branch /a/ matches at 0, length 1.  So by greediness, the latter
ought to prevail.  But then, we have already consumed the character,
and there is no way to get a second match.  Hence,

  $ echo abc | sed -E 's/b|()/x/g'
  xaxcx
  $ echo a | sed -E 's/^|a|$/x/g'  
  x

Comments?
  Ingo


Index: usr.bin/sed/process.c
===================================================================
RCS file: /cvs/src/usr.bin/sed/process.c,v
retrieving revision 1.15
diff -u -p -r1.15 process.c
--- usr.bin/sed/process.c       27 Oct 2009 23:59:43 -0000      1.15
+++ usr.bin/sed/process.c       19 Jun 2011 01:47:19 -0000
@@ -333,60 +333,47 @@ substitute(struct s_command *cp)
        n = cp->u.s->n;
        lastempty = 1;
 
-       switch (n) {
-       case 0:                                 /* Global */
-               do {
-                       if (lastempty || match[0].rm_so != match[0].rm_eo) {
-                               /* Locate start of replaced string. */
-                               re_off = match[0].rm_so;
-                               /* Copy leading retained string. */
-                               cspace(&SS, s, re_off, APPEND);
-                               /* Add in regular expression. */
-                               regsub(&SS, s, cp->u.s->new);
-                       }
+       do {
+               /* Copy the leading retained string. */
+               if (n <= 1 && match[0].rm_so)
+                       cspace(&SS, s, match[0].rm_so, APPEND);
 
-                       /* Move past this match. */
-                       if (match[0].rm_so != match[0].rm_eo) {
-                               s += match[0].rm_eo;
-                               slen -= match[0].rm_eo;
-                               lastempty = 0;
+               /* Skip zero-length matches right after other matches. */
+               if (lastempty || match[0].rm_so ||
+                   match[0].rm_so != match[0].rm_eo) {
+                       if (n <= 1) {
+                               /* Want this match: append replacement. */
+                               regsub(&SS, s, cp->u.s->new);
+                               if (n == 1)
+                                       n = -1;
                        } else {
-                               if (match[0].rm_so == 0)
-                                       cspace(&SS, s, match[0].rm_so + 1,
-                                           APPEND);
-                               else
-                                       cspace(&SS, s + match[0].rm_so, 1,
-                                           APPEND);
-                               s += match[0].rm_so + 1;
-                               slen -= match[0].rm_so + 1;
-                               lastempty = 1;
+                               /* Want a later match: append original. */
+                               if (match[0].rm_eo)
+                                       cspace(&SS, s, match[0].rm_eo, APPEND);
+                               n--;
                        }
-               } while (slen > 0 && regexec_e(re, s, REG_NOTBOL, 0, slen));
-               /* Copy trailing retained string. */
-               if (slen > 0)
-                       cspace(&SS, s, slen, APPEND);
-               break;
-       default:                                /* Nth occurrence */
-               while (--n) {
-                       s += match[0].rm_eo;
-                       slen -= match[0].rm_eo;
-                       if (!regexec_e(re, s, REG_NOTBOL, 0, slen))
-                               return (0);
                }
-               /* FALLTHROUGH */
-       case 1:                                 /* 1st occurrence */
-               /* Locate start of replaced string. */
-               re_off = match[0].rm_so + (s - ps);
-               /* Copy leading retained string. */
-               cspace(&SS, ps, re_off, APPEND);
-               /* Add in regular expression. */
-               regsub(&SS, s, cp->u.s->new);
-               /* Copy trailing retained string. */
+
+               /* After a zero-length match, advance one byte. */
+               if (match[0].rm_so == match[0].rm_eo) {
+                       cspace(&SS, s + match[0].rm_eo++, 1, APPEND);
+                       lastempty = 1;
+               } else
+                       lastempty = 0;
+
+               /* Move past this match. */
                s += match[0].rm_eo;
                slen -= match[0].rm_eo;
+
+       } while (n >= 0 && slen > 0 && regexec_e(re, s, REG_NOTBOL, 0, slen));
+
+       /* Did not find the requested number of matches. */
+       if (n > 1)
+               return (0);
+
+       /* Copy the trailing retained string. */
+       if (slen > 0)
                cspace(&SS, s, slen, APPEND);
-               break;
-       }
 
        /*
         * Swap the substitute space and the pattern space, and make sure
Index: regress/usr.bin/sed/Makefile
===================================================================
RCS file: /cvs/src/regress/usr.bin/sed/Makefile,v
retrieving revision 1.2
diff -u -p -r1.2 Makefile
--- regress/usr.bin/sed/Makefile        13 Oct 2008 13:27:33 -0000      1.2
+++ regress/usr.bin/sed/Makefile        19 Jun 2011 01:47:19 -0000
@@ -3,11 +3,14 @@
 
 SED=   /usr/bin/sed
 
-REGRESS_TARGETS= sedtest hanoi math sierpinski
+REGRESS_TARGETS= sedtest substitute hanoi math sierpinski
 
 sedtest:
        sh ${.CURDIR}/$@.sh ${SED} $@.out
        diff ${.CURDIR}/$@.expected $@.out
+
+substitute:
+       sh ${.CURDIR}/$@.sh
 
 hanoi:
        ${SED} -f ${.CURDIR}/$@.sed ${.CURDIR}/$@.in > $@.out
Index: regress/usr.bin/sed/substitute.sh
===================================================================
RCS file: regress/usr.bin/sed/substitute.sh
diff -N regress/usr.bin/sed/substitute.sh
--- /dev/null   1 Jan 1970 00:00:00 -0000
+++ regress/usr.bin/sed/substitute.sh   19 Jun 2011 01:47:19 -0000
@@ -0,0 +1,74 @@
+#!/bin/sh
+
+# error counter
+err=0
+
+# test function; arguments are:
+# input string
+# regular expression to replace
+# wanted output for /g (global substitution)
+# wanted output for /1 (substitution of first match) and so on
+t() {
+       # global substitution
+       in=$1
+       expr=$2
+       want=$3
+       shift 3
+       out=`echo "$in" | sed -E "s/$expr/x/g"`
+       [ "X$out" = "X$want" ] || echo "$in/$expr/g/$want/$out ($((++err)))"
+
+       # substitution with specific index
+       num=1
+       while [ $# -gt 0 ]; do
+               want=$1
+               shift
+               out=`echo "$in" | sed -E "s/$expr/x/$num"`
+               [ "X$out" = "X$want" ] || \
+                       echo "$in/$expr/$num/$want/$out ($((++err)))"
+               num=$((num+1))
+       done
+
+       # substitution with excessive index
+       out=`echo "$in" | sed -E "s/$expr/x/$num"`
+       [ "X$out" = "X$in" ] || echo "$in/$expr/$num/=/$out ($((++err)))"
+}
+
+t '' ^ x x
+t '' '()' x x
+t '' '$' x x
+t '' '^|$' x x
+t a ^ xa xa
+t a '()' xax xa ax
+t a '$' ax ax
+t a '^|a' x x
+t a '^|$' xax xa ax
+t a '^|a|$' x x
+t a 'a|$' x x
+t ab ^ xab xab
+t ab '()' xaxbx xab axb abx
+t ab '$' abx abx
+t ab '^|a' xb xb
+t ab '^|b' xax xab ax
+t ab '^|$' xabx xab abx
+t ab '^|a|$' xbx xb abx
+t ab '^|b|$' xax xab ax
+t ab '^|a|b|$' xx xb ax
+t ab '^|ab|$' x x
+t ab 'a|()' xbx xb abx
+t ab 'a|$' xbx xb abx
+t ab 'ab|$' x x
+t ab 'b|()' xax xab ax
+t ab 'b|$' ax ax
+t abc '^|b' xaxc xabc axc
+t abc '^|b|$' xaxcx xabc axc abcx
+t abc '^|bc|$' xax xabc ax
+t abc 'ab|()' xcx xc abcx
+t abc 'ab|$' xcx xc abcx
+t abc 'b|()' xaxcx xabc axc abcx
+t abc 'bc|()' xax xabc ax
+t abc 'b|$' axcx axc abcx
+t aa a xx xa ax
+t aa 'a|()' xx xa ax
+t aa 'a*' x x
+
+exit $err

Reply via email to