Here's an updated diff based on a comment I found in vi's substitute
code.
When a 0-length match is found at exactly the position where the
previous non-0-length match ended, that match should be skipped.

Consider the following:
text: aabb
command: s/a*/X/g
without check: XXbXbX
with check: XbXbX

This command isn't possible in the current ed, since it bails out on its
infinite-loop detection, so the previous diff would still be an
improvement, but it's not what sed/vi does.

On 05/29/16 15:18, Martijn van Duren wrote:
> Hello tech@,
> 
> Here's a first attempt at fixing the beginning-of-word bug as also found
> in sed. It does a little more than I initially sed^ht out to do, but I
> also found some other unexpected behaviour.
> 1) It removes the isbinary test and the accompanying
> NUL_TO_NEWLINE/NEWLINE_TO_NUL conversions. If a NUL-byte is found in the
> text, ed detects it as a binary file and converts every NUL to a newline
> prior to doing the regexec commands. After this is done it converts
> every newline back to a NUL-byte. Luckily one can never find a native
> newline character in a binary file. Right? Right?
> 2) Currently ed aborts the substitution when it finds a position-only
> (zero-length) match at location 0, to prevent an infinite loop. Because
> of this s/^/X/g will never match. If this occurs, just increment the
> next offset by one.
> 3) The else clause in the do/while loop contains code similar to the if
> clause. So determine the offset, hoist the common code above the if, and
> remove the else clause for readability.
> 4) Don't increment the txt-pointer, but use REG_STARTEND|REG_NOTBOL
> instead. This fixes the actual bug.
> 
> There's still code in ed that uses the newline/NUL conversion, but we
> don't support binary editing in ed, so if you do so it's at your own
> risk.
> It *might* be possible to make it work properly if the conversions were
> removed and get_compiled_pattern in re.c would use REG_PEND, but I'm not
> going to chase windmills.
> 
> Any thoughts/comments?
> 
> Minimally tested.
> 
> martijn@
> 

Index: sub.c
===================================================================
RCS file: /cvs/src/bin/ed/sub.c,v
retrieving revision 1.15
diff -u -p -r1.15 sub.c
--- sub.c       22 Mar 2016 17:58:28 -0000      1.15
+++ sub.c       29 May 2016 14:16:29 -0000
@@ -180,52 +180,54 @@ substitute_matching_text(regex_t *pat, l
        int off = 0;
        int changed = 0;
        int matchno = 0;
-       int i = 0;
+       int nempty = -1;
+       off_t i = 0, skip;
        regmatch_t rm[SE_MAX];
        char *txt;
        char *eot;
 
        if ((txt = get_sbuf_line(lp)) == NULL)
                return ERR;
-       if (isbinary)
-               NUL_TO_NEWLINE(txt, lp->len);
        eot = txt + lp->len;
-       if (!regexec(pat, txt, SE_MAX, rm, 0)) {
+       rm[0].rm_so = 0;
+       rm[0].rm_eo = lp->len;
+       if (!regexec(pat, txt, SE_MAX, rm, REG_STARTEND)) {
                do {
-                       if (!kth || kth == ++matchno) {
-                               changed++;
-                               i = rm[0].rm_so;
-                               REALLOC(rbuf, rbufsz, off + i, ERR);
-                               if (isbinary)
-                                       NEWLINE_TO_NUL(txt, rm[0].rm_eo);
-                               memcpy(rbuf + off, txt, i);
-                               off += i;
+/* Don't allow an 0-length match after a non-0-length match */
+                       if (rm[0].rm_so == nempty && rm[0].rm_eo == nempty) {
+                               rm[0].rm_so++;
+                               rm[0].rm_eo = lp->len;
+                               nempty = -1;
+                               continue;
+                       }
+                       nempty = -1;
+                       skip = (!kth || kth == ++matchno) ?
+                           rm[0].rm_so : rm[0].rm_eo;
+                       REALLOC(rbuf, rbufsz, off + skip - i, ERR);
+                       memcpy(rbuf + off, txt + i, skip - i);
+                       off += (skip - i);
+                       i = rm[0].rm_eo;
+                       if (!kth || kth == matchno) {
+                               changed = 1;
                                if ((off = apply_subst_template(txt, rm, off,
                                    pat->re_nsub)) < 0)
                                        return ERR;
-                       } else {
-                               i = rm[0].rm_eo;
-                               REALLOC(rbuf, rbufsz, off + i, ERR);
-                               if (isbinary)
-                                       NEWLINE_TO_NUL(txt, i);
-                               memcpy(rbuf + off, txt, i);
-                               off += i;
+                               if (kth)
+                                       break;
                        }
-                       txt += rm[0].rm_eo;
-               } while (*txt && (!changed || ((gflag & GSG) && rm[0].rm_eo)) &&
-                   !regexec(pat, txt, SE_MAX, rm, REG_NOTBOL));
-               i = eot - txt;
-               REALLOC(rbuf, rbufsz, off + i + 2, ERR);
-               if (i > 0 && !rm[0].rm_eo && (gflag & GSG)) {
-                       seterrmsg("infinite substitution loop");
-                       return  ERR;
-               }
-               if (isbinary)
-                       NEWLINE_TO_NUL(txt, i);
-               memcpy(rbuf + off, txt, i);
-               memcpy(rbuf + off + i, "\n", 2);
+
+                       if (rm[0].rm_so == rm[0].rm_eo)
+                               rm[0].rm_so = rm[0].rm_eo + 1;
+                       else
+                               nempty = rm[0].rm_so = rm[0].rm_eo;
+                       rm[0].rm_eo = lp->len;
+               } while ((txt + i < eot) &&
+                   !regexec(pat, txt, SE_MAX, rm, REG_STARTEND | REG_NOTBOL));
+               REALLOC(rbuf, rbufsz, off + lp->len - i + 2, ERR);
+               memcpy(rbuf + off, txt + i, lp->len - i);
+               memcpy(rbuf + off + lp->len - i, "\n", 2);
        }
-       return changed ? off + i + 1 : 0;
+       return changed ? off + lp->len - i + 1 : 0;
 }

Reply via email to