Re: counting zero-length matches in sed(1)

2011-09-17 Thread Ingo Schwarze
Before the 5.0 lock, i rewrote the sed(1) s/// loop to fix
multiple bugs with respect to zero-length matches.

The patch that went in,

  http://marc.info/?l=openbsd-techm=131145325927701w=2

had to be backed out due to a regression:
When the input did not end in a trailing newline character
and there was an empty match at the end, the committed code
added a spurious '\0' character to the output.

A fix for that was posted, but not committed again
to not disrupt the 5.0 release:

  http://marc.info/?l=openbsd-techm=131167188124500w=2

Here is the full patch against -current including the
fix to avoid the regression.

OK?


Index: process.c
===
RCS file: /cvs/src/usr.bin/sed/process.c,v
retrieving revision 1.17
diff -u -p -r1.17 process.c
--- process.c   26 Jul 2011 08:47:07 -  1.17
+++ process.c   17 Sep 2011 08:54:51 -
@@ -312,7 +312,7 @@ substitute(struct s_command *cp)
 {
SPACE tspace;
regex_t *re;
-   size_t re_off, slen;
+   regoff_t slen;
int n, lastempty;
char *s;
 
@@ -333,60 +333,55 @@ substitute(struct s_command *cp)
n = cp-u.s-n;
lastempty = 1;
 
-   switch (n) {
-   case 0: /* Global */
-   do {
-   if (lastempty || match[0].rm_so != match[0].rm_eo) {
-   /* Locate start of replaced string. */
-   re_off = match[0].rm_so;
-   /* Copy leading retained string. */
-   cspace(SS, s, re_off, APPEND);
-   /* Add in regular expression. */
-   regsub(SS, s, cp-u.s-new);
-   }
+   do {
+   /* Copy the leading retained string. */
+   if (n = 1  match[0].rm_so)
+   cspace(SS, s, match[0].rm_so, APPEND);
 
-   /* Move past this match. */
-   if (match[0].rm_so != match[0].rm_eo) {
-   s += match[0].rm_eo;
-   slen -= match[0].rm_eo;
-   lastempty = 0;
+   /* Skip zero-length matches right after other matches. */
+   if (lastempty || match[0].rm_so ||
+   match[0].rm_so != match[0].rm_eo) {
+   if (n = 1) {
+   /* Want this match: append replacement. */
+   regsub(SS, s, cp-u.s-new);
+   if (n == 1)
+   n = -1;
} else {
-   if (match[0].rm_so == 0)
-   cspace(SS, s, match[0].rm_so + 1,
-   APPEND);
-   else
-   cspace(SS, s + match[0].rm_so, 1,
-   APPEND);
-   s += match[0].rm_so + 1;
-   slen -= match[0].rm_so + 1;
-   lastempty = 1;
+   /* Want a later match: append original. */
+   if (match[0].rm_eo)
+   cspace(SS, s, match[0].rm_eo, APPEND);
+   n--;
}
-   } while (slen  0  regexec_e(re, s, REG_NOTBOL, 0, slen));
-   /* Copy trailing retained string. */
-   if (slen  0)
-   cspace(SS, s, slen, APPEND);
-   break;
-   default:/* Nth occurrence */
-   while (--n) {
-   s += match[0].rm_eo;
-   slen -= match[0].rm_eo;
-   if (!regexec_e(re, s, REG_NOTBOL, 0, slen))
-   return (0);
}
-   /* FALLTHROUGH */
-   case 1: /* 1st occurrence */
-   /* Locate start of replaced string. */
-   re_off = match[0].rm_so + (s - ps);
-   /* Copy leading retained string. */
-   cspace(SS, ps, re_off, APPEND);
-   /* Add in regular expression. */
-   regsub(SS, s, cp-u.s-new);
-   /* Copy trailing retained string. */
+
+   /* Move past this match. */
s += match[0].rm_eo;
slen -= match[0].rm_eo;
+
+   /*
+* After a zero-length match, advance one byte,
+* and at the end of the line, terminate.
+*/
+   if (match[0].rm_so == match[0].rm_eo) {
+   if (*s == '\0' || *s == '\n')
+   slen = -1;
+   else
+   

Re: counting zero-length matches in sed(1)

2011-07-26 Thread Ingo Schwarze
In case you wonder which patch i'm talking about, here it is.
Note this is not against -current, but against the backed-out
version for brevity.

Not fishing for OKs at this time, just FYI.


Index: process.c
===
RCS file: /cvs/src/usr.bin/sed/process.c,v
retrieving revision 1.16
diff -u -p -r1.16 process.c
--- process.c   24 Jul 2011 13:59:15 -  1.16
+++ process.c   26 Jul 2011 09:03:31 -
@@ -363,11 +363,12 @@ substitute(struct s_command *cp)
 * and at the end of the line, terminate.
 */
if (match[0].rm_so == match[0].rm_eo) {
-   if (*s == '\n')
+   if (*s == '\0' || *s == '\n')
slen = -1;
else
slen--;
-   cspace(SS, s++, 1, APPEND);
+   if (*s != '\0')
+   cspace(SS, s++, 1, APPEND);
lastempty = 1;
} else
lastempty = 0;


- Forwarded message from Ingo Schwarze schwa...@cvs.openbsd.org -

From: Ingo Schwarze schwa...@cvs.openbsd.org
Date: Tue, 26 Jul 2011 02:47:07 -0600 (MDT)
To: source-chan...@cvs.openbsd.org
Subject: CVS: cvs.openbsd.org: src

CVSROOT:/cvs
Module name:src
Changes by: schwa...@cvs.openbsd.org2011/07/26 02:47:07

Modified files:
usr.bin/sed: process.c 

Log message:
Backout previous, naddy@ found the following regression:
When the input does not end in a trailing newline character
and there is an empty match at the end, the new code adds
a spurious '\0' character.
I have a fix, but otto@ prefers backout and full re-evaluation
after release.

- End forwarded message -



Re: counting zero-length matches in sed(1)

2011-07-21 Thread Ingo Schwarze
So, finally, here is the update of my patch to fix zero-length
matches in sed(1).  Sorry for the ridiculous delay.

Changes with respect to the first version:

 - Removed the now unused local variable re_off (noticed by otto@).
 - Make slen signed, or zero-length matches at EOF without a trailing
   newline would cause it to underflow (suspected by otto@ after
   looking at FreeBSD code, suspicion confirmed myself).
 - Advancing after zero-length matches was not quite right:
   A zero-length match before the final character of a file caused the
   zero-length match after the final character to get lost if the file
   ended without a terminating newline (noticed myself by changing
   echo $in to echo -n $in in substitute.sh of our own test suite);
   fixed by reworking the end of the do...while loop, which also
   makes it easier to understand.

This now passes our own test suite both as it is and with -n,
and otto@ checked that the remaining failures with respect to the
FreeBSD test suite look unrelated to this particular patch.

I'm leaving the original rationale at the end, for reference.


Index: process.c
===
RCS file: /cvs/src/usr.bin/sed/process.c,v
retrieving revision 1.15
diff -u -p -r1.15 process.c
--- process.c   27 Oct 2009 23:59:43 -  1.15
+++ process.c   22 Jul 2011 00:11:49 -
@@ -312,7 +312,7 @@ substitute(struct s_command *cp)
 {
SPACE tspace;
regex_t *re;
-   size_t re_off, slen;
+   regoff_t slen;
int n, lastempty;
char *s;
 
@@ -333,60 +333,54 @@ substitute(struct s_command *cp)
n = cp-u.s-n;
lastempty = 1;
 
-   switch (n) {
-   case 0: /* Global */
-   do {
-   if (lastempty || match[0].rm_so != match[0].rm_eo) {
-   /* Locate start of replaced string. */
-   re_off = match[0].rm_so;
-   /* Copy leading retained string. */
-   cspace(SS, s, re_off, APPEND);
-   /* Add in regular expression. */
-   regsub(SS, s, cp-u.s-new);
-   }
+   do {
+   /* Copy the leading retained string. */
+   if (n = 1  match[0].rm_so)
+   cspace(SS, s, match[0].rm_so, APPEND);
 
-   /* Move past this match. */
-   if (match[0].rm_so != match[0].rm_eo) {
-   s += match[0].rm_eo;
-   slen -= match[0].rm_eo;
-   lastempty = 0;
+   /* Skip zero-length matches right after other matches. */
+   if (lastempty || match[0].rm_so ||
+   match[0].rm_so != match[0].rm_eo) {
+   if (n = 1) {
+   /* Want this match: append replacement. */
+   regsub(SS, s, cp-u.s-new);
+   if (n == 1)
+   n = -1;
} else {
-   if (match[0].rm_so == 0)
-   cspace(SS, s, match[0].rm_so + 1,
-   APPEND);
-   else
-   cspace(SS, s + match[0].rm_so, 1,
-   APPEND);
-   s += match[0].rm_so + 1;
-   slen -= match[0].rm_so + 1;
-   lastempty = 1;
+   /* Want a later match: append original. */
+   if (match[0].rm_eo)
+   cspace(SS, s, match[0].rm_eo, APPEND);
+   n--;
}
-   } while (slen  0  regexec_e(re, s, REG_NOTBOL, 0, slen));
-   /* Copy trailing retained string. */
-   if (slen  0)
-   cspace(SS, s, slen, APPEND);
-   break;
-   default:/* Nth occurrence */
-   while (--n) {
-   s += match[0].rm_eo;
-   slen -= match[0].rm_eo;
-   if (!regexec_e(re, s, REG_NOTBOL, 0, slen))
-   return (0);
}
-   /* FALLTHROUGH */
-   case 1: /* 1st occurrence */
-   /* Locate start of replaced string. */
-   re_off = match[0].rm_so + (s - ps);
-   /* Copy leading retained string. */
-   cspace(SS, ps, re_off, APPEND);
-   /* Add in regular expression. */
-   regsub(SS, s, cp-u.s-new);
-   /* Copy trailing retained string. */
+
+  

Re: counting zero-length matches in sed(1)

2011-06-24 Thread Otto Moerbeek
On Sat, Jun 18, 2011 at 08:16:05PM -0600, Ingo Schwarze wrote:

 When a regular expression has zero-length matches in a string,
 both sed(1) global replacement (/g) and replacement of numbered
 instances (e.g. /2) are broken.  This is not even limited to sed -E.
 Both Otto's patch and my own refactoring patch on misc@ only address
 global replacement and leave numbered instances broken.
 
 Already now, code in the three parts of the switch statement
 is rather repetitive, and by fixing the numbered instances case,
 code duplication would become much worse.  Thus, i propose
 the following full rewrite of the whole switch statement.
 
 This is a bit scary because...
 Well, sed(1) is not exactly the tool we want to break.
 Hence, a test suite is included.
 Both my patch and GNU sed(1) pass the test suite.
 
 Our -current sed fails 43 of these tests, quite a few of them
 in spectacular ways, like
 
   $ echo | sed 's/$/x/2'   # expect 
   x
   $ echo aaa | sed 's/a*/x/2'   # expect aaa
   aaax
   $ echo abc | sed -E 's/a|$/x/g'   # expect xbcx
   x
   $ echo abc | sed -E 's/()/x/2'# expect axbc
   xabc
   $ echo abc | sed -E 's/()/x/42'   # expect abc
   xabc
 
 One common source of confusion on misc@ was that Perl allows
 empty matches right after other matches, like in:
 
   $ perl -Mstrict -we '$_ = abc; s/b|()/x/g; print $_\n;'
   xaxxcx
   $ perl -Mstrict -we '$_ = a; s/^|a|$/x/g; print $_\n;'
   xxx
 
 I consider that broken behaviour in Perl.  For example, think about
 the case of a =~ /^|a/.  The branch /^/ matches at 0, length 0.
 The branch /a/ matches at 0, length 1.  So by greediness, the latter
 ought to prevail.  But then, we have already consumed the character,
 and there is no way to get a second match.  Hence,
 
   $ echo abc | sed -E 's/b|()/x/g'
   xaxcx
   $ echo a | sed -E 's/^|a|$/x/g'  
   x
 
 Comments?
   Ingo

Couple of comments;

- re_off is no longer used.  

- slen is unsigned. We need to be 100% sure it cannot wrap. ATM I'm
not completely convinced the empty match case is safe. If you look at
the FreeBSD code, it appears the case match[0].rm_eo == slen can
happen, and in that case slen will wrap.  They fixed it by making slen
a signed type (regoff_t). See rev 1.30 and 1.31 in their tree. 

When I run the regress test suite from FreeBSD (in
src/tools/regression/usr.bin/sed) I see a couple of failures with our
sed. I did not have time yet to look if these failures have anything
with this diff and if we should incoorporate them. 

-Otto

 
 
 Index: usr.bin/sed/process.c
 ===
 RCS file: /cvs/src/usr.bin/sed/process.c,v
 retrieving revision 1.15
 diff -u -p -r1.15 process.c
 --- usr.bin/sed/process.c 27 Oct 2009 23:59:43 -  1.15
 +++ usr.bin/sed/process.c 19 Jun 2011 01:47:19 -
 @@ -333,60 +333,47 @@ substitute(struct s_command *cp)
   n = cp-u.s-n;
   lastempty = 1;
  
 - switch (n) {
 - case 0: /* Global */
 - do {
 - if (lastempty || match[0].rm_so != match[0].rm_eo) {
 - /* Locate start of replaced string. */
 - re_off = match[0].rm_so;
 - /* Copy leading retained string. */
 - cspace(SS, s, re_off, APPEND);
 - /* Add in regular expression. */
 - regsub(SS, s, cp-u.s-new);
 - }
 + do {
 + /* Copy the leading retained string. */
 + if (n = 1  match[0].rm_so)
 + cspace(SS, s, match[0].rm_so, APPEND);
  
 - /* Move past this match. */
 - if (match[0].rm_so != match[0].rm_eo) {
 - s += match[0].rm_eo;
 - slen -= match[0].rm_eo;
 - lastempty = 0;
 + /* Skip zero-length matches right after other matches. */
 + if (lastempty || match[0].rm_so ||
 + match[0].rm_so != match[0].rm_eo) {
 + if (n = 1) {
 + /* Want this match: append replacement. */
 + regsub(SS, s, cp-u.s-new);
 + if (n == 1)
 + n = -1;
   } else {
 - if (match[0].rm_so == 0)
 - cspace(SS, s, match[0].rm_so + 1,
 - APPEND);
 - else
 - cspace(SS, s + match[0].rm_so, 1,
 - APPEND);
 - s += match[0].rm_so + 1;
 - slen -= match[0].rm_so + 1;
 - lastempty = 1;
 + /* Want a later match: append 

Re: counting zero-length matches in sed(1)

2011-06-19 Thread Otto Moerbeek
On Sat, Jun 18, 2011 at 08:16:05PM -0600, Ingo Schwarze wrote:

 When a regular expression has zero-length matches in a string,
 both sed(1) global replacement (/g) and replacement of numbered
 instances (e.g. /2) are broken.  This is not even limited to sed -E.
 Both Otto's patch and my own refactoring patch on misc@ only address
 global replacement and leave numbered instances broken.
 
 Already now, code in the three parts of the switch statement
 is rather repetitive, and by fixing the numbered instances case,
 code duplication would become much worse.  Thus, i propose
 the following full rewrite of the whole switch statement.
 
 This is a bit scary because...
 Well, sed(1) is not exactly the tool we want to break.
 Hence, a test suite is included.
 Both my patch and GNU sed(1) pass the test suite.
 
 Our -current sed fails 43 of these tests, quite a few of them
 in spectacular ways, like
 
   $ echo | sed 's/$/x/2'   # expect 
   x
   $ echo aaa | sed 's/a*/x/2'   # expect aaa
   aaax
   $ echo abc | sed -E 's/a|$/x/g'   # expect xbcx
   x
   $ echo abc | sed -E 's/()/x/2'# expect axbc
   xabc
   $ echo abc | sed -E 's/()/x/42'   # expect abc
   xabc
 
 One common source of confusion on misc@ was that Perl allows
 empty matches right after other matches, like in:
 
   $ perl -Mstrict -we '$_ = abc; s/b|()/x/g; print $_\n;'
   xaxxcx
   $ perl -Mstrict -we '$_ = a; s/^|a|$/x/g; print $_\n;'
   xxx
 
 I consider that broken behaviour in Perl.  For example, think about
 the case of a =~ /^|a/.  The branch /^/ matches at 0, length 0.
 The branch /a/ matches at 0, length 1.  So by greediness, the latter
 ought to prevail.  But then, we have already consumed the character,
 and there is no way to get a second match.  Hence,
 
   $ echo abc | sed -E 's/b|()/x/g'
   xaxcx
   $ echo a | sed -E 's/^|a|$/x/g'  
   x
 
 Comments?

Very good you're attacking this! When I was working on my first diff,
it already crossed my mind: this code is too complex, it should be
rewritten.  That said, a full review of this will take some effort. I
hope to do it the coming week. 

-Otto

   Ingo
 
 
 Index: usr.bin/sed/process.c
 ===
 RCS file: /cvs/src/usr.bin/sed/process.c,v
 retrieving revision 1.15
 diff -u -p -r1.15 process.c
 --- usr.bin/sed/process.c 27 Oct 2009 23:59:43 -  1.15
 +++ usr.bin/sed/process.c 19 Jun 2011 01:47:19 -
 @@ -333,60 +333,47 @@ substitute(struct s_command *cp)
   n = cp-u.s-n;
   lastempty = 1;
  
 - switch (n) {
 - case 0: /* Global */
 - do {
 - if (lastempty || match[0].rm_so != match[0].rm_eo) {
 - /* Locate start of replaced string. */
 - re_off = match[0].rm_so;
 - /* Copy leading retained string. */
 - cspace(SS, s, re_off, APPEND);
 - /* Add in regular expression. */
 - regsub(SS, s, cp-u.s-new);
 - }
 + do {
 + /* Copy the leading retained string. */
 + if (n = 1  match[0].rm_so)
 + cspace(SS, s, match[0].rm_so, APPEND);
  
 - /* Move past this match. */
 - if (match[0].rm_so != match[0].rm_eo) {
 - s += match[0].rm_eo;
 - slen -= match[0].rm_eo;
 - lastempty = 0;
 + /* Skip zero-length matches right after other matches. */
 + if (lastempty || match[0].rm_so ||
 + match[0].rm_so != match[0].rm_eo) {
 + if (n = 1) {
 + /* Want this match: append replacement. */
 + regsub(SS, s, cp-u.s-new);
 + if (n == 1)
 + n = -1;
   } else {
 - if (match[0].rm_so == 0)
 - cspace(SS, s, match[0].rm_so + 1,
 - APPEND);
 - else
 - cspace(SS, s + match[0].rm_so, 1,
 - APPEND);
 - s += match[0].rm_so + 1;
 - slen -= match[0].rm_so + 1;
 - lastempty = 1;
 + /* Want a later match: append original. */
 + if (match[0].rm_eo)
 + cspace(SS, s, match[0].rm_eo, APPEND);
 + n--;
   }
 - } while (slen  0  regexec_e(re, s, REG_NOTBOL, 0, slen));
 - /* Copy trailing retained string. */
 - if (slen  0)
 - cspace(SS, s, 

counting zero-length matches in sed(1)

2011-06-18 Thread Ingo Schwarze
When a regular expression has zero-length matches in a string,
both sed(1) global replacement (/g) and replacement of numbered
instances (e.g. /2) are broken.  This is not even limited to sed -E.
Both Otto's patch and my own refactoring patch on misc@ only address
global replacement and leave numbered instances broken.

Already now, code in the three parts of the switch statement
is rather repetitive, and by fixing the numbered instances case,
code duplication would become much worse.  Thus, i propose
the following full rewrite of the whole switch statement.

This is a bit scary because...
Well, sed(1) is not exactly the tool we want to break.
Hence, a test suite is included.
Both my patch and GNU sed(1) pass the test suite.

Our -current sed fails 43 of these tests, quite a few of them
in spectacular ways, like

  $ echo | sed 's/$/x/2'   # expect 
  x
  $ echo aaa | sed 's/a*/x/2'   # expect aaa
  aaax
  $ echo abc | sed -E 's/a|$/x/g'   # expect xbcx
  x
  $ echo abc | sed -E 's/()/x/2'# expect axbc
  xabc
  $ echo abc | sed -E 's/()/x/42'   # expect abc
  xabc

One common source of confusion on misc@ was that Perl allows
empty matches right after other matches, like in:

  $ perl -Mstrict -we '$_ = abc; s/b|()/x/g; print $_\n;'
  xaxxcx
  $ perl -Mstrict -we '$_ = a; s/^|a|$/x/g; print $_\n;'
  xxx

I consider that broken behaviour in Perl.  For example, think about
the case of a =~ /^|a/.  The branch /^/ matches at 0, length 0.
The branch /a/ matches at 0, length 1.  So by greediness, the latter
ought to prevail.  But then, we have already consumed the character,
and there is no way to get a second match.  Hence,

  $ echo abc | sed -E 's/b|()/x/g'
  xaxcx
  $ echo a | sed -E 's/^|a|$/x/g'  
  x

Comments?
  Ingo


Index: usr.bin/sed/process.c
===
RCS file: /cvs/src/usr.bin/sed/process.c,v
retrieving revision 1.15
diff -u -p -r1.15 process.c
--- usr.bin/sed/process.c   27 Oct 2009 23:59:43 -  1.15
+++ usr.bin/sed/process.c   19 Jun 2011 01:47:19 -
@@ -333,60 +333,47 @@ substitute(struct s_command *cp)
n = cp-u.s-n;
lastempty = 1;
 
-   switch (n) {
-   case 0: /* Global */
-   do {
-   if (lastempty || match[0].rm_so != match[0].rm_eo) {
-   /* Locate start of replaced string. */
-   re_off = match[0].rm_so;
-   /* Copy leading retained string. */
-   cspace(SS, s, re_off, APPEND);
-   /* Add in regular expression. */
-   regsub(SS, s, cp-u.s-new);
-   }
+   do {
+   /* Copy the leading retained string. */
+   if (n = 1  match[0].rm_so)
+   cspace(SS, s, match[0].rm_so, APPEND);
 
-   /* Move past this match. */
-   if (match[0].rm_so != match[0].rm_eo) {
-   s += match[0].rm_eo;
-   slen -= match[0].rm_eo;
-   lastempty = 0;
+   /* Skip zero-length matches right after other matches. */
+   if (lastempty || match[0].rm_so ||
+   match[0].rm_so != match[0].rm_eo) {
+   if (n = 1) {
+   /* Want this match: append replacement. */
+   regsub(SS, s, cp-u.s-new);
+   if (n == 1)
+   n = -1;
} else {
-   if (match[0].rm_so == 0)
-   cspace(SS, s, match[0].rm_so + 1,
-   APPEND);
-   else
-   cspace(SS, s + match[0].rm_so, 1,
-   APPEND);
-   s += match[0].rm_so + 1;
-   slen -= match[0].rm_so + 1;
-   lastempty = 1;
+   /* Want a later match: append original. */
+   if (match[0].rm_eo)
+   cspace(SS, s, match[0].rm_eo, APPEND);
+   n--;
}
-   } while (slen  0  regexec_e(re, s, REG_NOTBOL, 0, slen));
-   /* Copy trailing retained string. */
-   if (slen  0)
-   cspace(SS, s, slen, APPEND);
-   break;
-   default:/* Nth occurrence */
-   while (--n) {
-   s += match[0].rm_eo;
-   slen -= match[0].rm_eo;
-   if (!regexec_e(re, s, REG_NOTBOL, 0, slen))
-