Hi Martijn,

Martijn van Duren wrote on Sun, May 29, 2016 at 03:18:57PM +0200:

> 1) It removes the isbinary test and the accompanying
> NUL_TO_NEWLINE/NEWLINE_TO_NUL conversions. If a NUL-byte is found in a
> text ed detects it as a binary file and converts every NUL to a newline
> prior to doing the regexec commands. After this is done it converts
> every newline back to a NUL-byte. Luckily one can never find a native
> newline character in a binary file. Right? Right?

I think this part of your analysis is not quite right.
Even when processing a binary file, ed(1) still considers it
to consist of lines and stores each line separately.
So indeed, none of the lines can contain a native newline
character.  Consider:

   $ printf "a\000b\nc" > testfile
   $ printf ",n\n" | ed testfile | hexdump -C 
  5
  00000000  31 09 61 00 62 0a 32 09  63 0a   |1.a.b.2.c.|
  0000000a
   $ printf ",s/b/B/g\nwq\n" | ed testfile    
  5
  5
   $ hexdump -C testfile                   
  00000000  61 00 42 0a 63                   |a.B.c|
  00000005

The fact that the file still doesn't end in a newline proves that
it was treated as binary, the line numbers prove that it was split
into lines and reassembled, and both the NUL and the newline are
still intact even after doing a replacement.

> There's still code in ed that uses the newline/NUL conversion, but we
> don't support binary editing in ed, so if you do so it's at your own
> risk.

What makes you think that we do not support binary editing in ed?
I think we do.

There is one limitation.  When editing a file with ed(1) s///,
you cannot have '\n' in the replacement string because it would
end the command and there is no way to escape it.  So there is
no easy way to split lines with ed(1) s///.  But that doesn't have
much to do with binary files, it applies to text files as well.

Note that you can replace a byte with NUL in a binary file:

   $ printf "a\000b\ncde" > testfile         
   $ printf ",s/d/\000/\nwq\n" | ed testfile 
  7
  7
   $ hexdump -C testfile                     
  00000000  61 00 62 0a 63 00 65               |a.b.c.e|
  00000007

> It *might* be possible to make it work properly if the conversions were
> removed and get_compiled_pattern in re.c would use REG_PEND, but I"m not
> going to chase windmills.

That doesn't appear to be needed, stuff already works, even with NUL
in the pattern:

   $ printf "a\000b" > testfile                  
   $ printf "s/\000/x/\nwq\n" | ed testfile      
  3
  3
   $ hexdump -C testfile                    
  00000000  61 78 62                            |axb|
  00000003

> Any thoughts/comments?

Can you remove the changes deleting

        if (isbinary)
                NUL_TO_NEWLINE(txt, lp->len);

from your patch?
They break replacing NUL bytes in binary files for me:

   $ printf "a\000b" > testfile                   
   $ printf "H\ns/\000/x/\nwq\n" | ./obj/ed testfile
  3
  ?
  script, line 2: no match

I have not tested all other aspects yet, but we should get this
regression out of the way.

Thanks,
  Ingo


> Index: sub.c
> ===================================================================
> RCS file: /cvs/src/bin/ed/sub.c,v
> retrieving revision 1.15
> diff -u -p -r1.15 sub.c
> --- sub.c     22 Mar 2016 17:58:28 -0000      1.15
> +++ sub.c     29 May 2016 13:12:58 -0000
> @@ -180,52 +180,43 @@ substitute_matching_text(regex_t *pat, l
>       int off = 0;
>       int changed = 0;
>       int matchno = 0;
> -     int i = 0;
> +     off_t i = 0, skip;
>       regmatch_t rm[SE_MAX];
>       char *txt;
>       char *eot;
>  
>       if ((txt = get_sbuf_line(lp)) == NULL)
>               return ERR;
> -     if (isbinary)
> -             NUL_TO_NEWLINE(txt, lp->len);

Do not delete this.

>       eot = txt + lp->len;
> -     if (!regexec(pat, txt, SE_MAX, rm, 0)) {
> +     rm[0].rm_so = 0;
> +     rm[0].rm_eo = lp->len;
> +     if (!regexec(pat, txt, SE_MAX, rm, REG_STARTEND)) {
>               do {
> -                     if (!kth || kth == ++matchno) {
> -                             changed++;
> -                             i = rm[0].rm_so;
> -                             REALLOC(rbuf, rbufsz, off + i, ERR);
> -                             if (isbinary)
> -                                     NEWLINE_TO_NUL(txt, rm[0].rm_eo);
> -                             memcpy(rbuf + off, txt, i);
> -                             off += i;
> +                     skip = (!kth || kth == ++matchno) ?
> +                         rm[0].rm_so : rm[0].rm_eo;
> +                     REALLOC(rbuf, rbufsz, off + skip - i, ERR);
> +                     memcpy(rbuf + off, txt + i, skip - i);
> +                     off += (skip - i);
> +                     i = rm[0].rm_eo;
> +                     if (!kth || kth == matchno) {
> +                             changed = 1;
>                               if ((off = apply_subst_template(txt, rm, off,
>                                   pat->re_nsub)) < 0)
>                                       return ERR;
> -                     } else {
> -                             i = rm[0].rm_eo;
> -                             REALLOC(rbuf, rbufsz, off + i, ERR);
> -                             if (isbinary)
> -                                     NEWLINE_TO_NUL(txt, i);
> -                             memcpy(rbuf + off, txt, i);
> -                             off += i;
> +                             if (kth)
> +                                     break;
>                       }
> -                     txt += rm[0].rm_eo;
> -             } while (*txt && (!changed || ((gflag & GSG) && rm[0].rm_eo)) &&
> -                 !regexec(pat, txt, SE_MAX, rm, REG_NOTBOL));
> -             i = eot - txt;
> -             REALLOC(rbuf, rbufsz, off + i + 2, ERR);
> -             if (i > 0 && !rm[0].rm_eo && (gflag & GSG)) {
> -                     seterrmsg("infinite substitution loop");
> -                     return  ERR;
> -             }
> -             if (isbinary)
> -                     NEWLINE_TO_NUL(txt, i);
> -             memcpy(rbuf + off, txt, i);
> -             memcpy(rbuf + off + i, "\n", 2);
> +
> +                     rm[0].rm_so = (rm[0].rm_so == rm[0].rm_eo) ?
> +                         rm[0].rm_eo + 1 : rm[0].rm_eo;
> +                     rm[0].rm_eo = lp->len;
> +             } while ((txt + i < eot) &&
> +                 !regexec(pat, txt, SE_MAX, rm, REG_STARTEND | REG_NOTBOL));
> +             REALLOC(rbuf, rbufsz, off + lp->len - i + 2, ERR);
> +             memcpy(rbuf + off, txt + i, lp->len - i);
> +             memcpy(rbuf + off + lp->len - i, "\n", 2);
>       }
> -     return changed ? off + i + 1 : 0;
> +     return changed ? off + lp->len - i + 1 : 0;
>  }
> 

Reply via email to