ping any OKs?
On 05/05/16 00:17, Martijn van Duren wrote: > Hello tech@, > > On 04/23/16 07:21, Jonathan Gray wrote on bugs@: >> $ cat foo.sh >> #!/bin/sh >> >> $1 -r ' >> s/[[:space:]]//g >> s/\<sf([0-9]+),/ISL_SFLOAT@\1,/g >> ' >> $ cat foo.csv >> R32G32B32A32_FLOAT , 128, 1, 1, 1, sf32, sf32, sf32, sf32, , >> , , linear, >> $ ./foo.sh sed < foo.csv >> R32G32B32A32_FLOAT,128,1,1,1,ISL_SFLOAT@32,sf32,ISL_SFLOAT@32,sf32,,,,linear, >> $ ./foo.sh gsed < foo.csv >> R32G32B32A32_FLOAT,128,1,1,1,ISL_SFLOAT@32,ISL_SFLOAT@32,ISL_SFLOAT@32,ISL_SFLOAT@32,,,,linear, >> >> Encountered with code that was recently added to Mesa: >> >> https://cgit.freedesktop.org/mesa/mesa/tree/src/intel/isl/isl_format_layout_gen.bash >> > > After a long discussion with several other developers I came up with > the following diff. It fixes the bug and also passes both the > regression tests and a full bulk build (thanks to ajacoutot@). > > I'd like to post it here for wider exposure. Please test to as much sed > scripts as you can find. Comments and OKs welcome. > > Diff looks sane to jasper@ > > For those interested: The problem comes from the fact that the string > pointer increments to the end of the previous match and is then called > with the REG_NOTBOL. The REG_NOTBOL combined with a match at the begin > of the string causes our regex library to treat the word as not begin of > word. > The TRE implementation does the reverse and treats this case as if it > always is begin of word. This causes a similar bug under MacOS: > $ echo 'foo foofoo' | sed -E 's/\<foo/bar/g' > bar barbar > > I've solved this problem by converting sed to use REG_STARTEND more > explicitly. Although this isn't a POSIX specified flag, it is already > used by sed and shouldn't be a problem. > > The patch in engine.c from the library is because beginp is set in > matcher to start, so this one always results to true. > > I also tried to set up a testcase with gnu regex, but it turned out that > they don't set re_nsub after regcomp, so the sed compilation phase fails. > After examining gnu sed's code I found that it uses an own internal API > not specified by POSIX. > > Index: ./lib/libc/regex/engine.c > =================================================================== > RCS file: /cvs/src/lib/libc/regex/engine.c,v > retrieving revision 1.19 > diff -u -p -r1.19 engine.c > --- ./lib/libc/regex/engine.c 28 Dec 2015 23:01:22 -0000 1.19 > +++ ./lib/libc/regex/engine.c 2 May 2016 08:50:20 -0000 > @@ -674,7 +674,7 @@ fast(struct match *m, char *start, char > states fresh = m->fresh; > states tmp = m->tmp; > char *p = start; > - int c = (start == m->beginp) ? OUT : *(start-1); > + int c = (start == m->offp) ? OUT : *(start-1); > int lastc; /* previous c */ > int flagch; > int i; > @@ -758,7 +758,7 @@ slow(struct match *m, char *start, char > states empty = m->empty; > states tmp = m->tmp; > char *p = start; > - int c = (start == m->beginp) ? OUT : *(start-1); > + int c = (start == m->offp) ? OUT : *(start-1); > int lastc; /* previous c */ > int flagch; > int i; > Index: ./usr.bin/sed/process.c > =================================================================== > RCS file: /cvs/src/usr.bin/sed/process.c,v > retrieving revision 1.27 > diff -u -p -r1.27 process.c > --- ./usr.bin/sed/process.c 26 Oct 2015 14:08:47 -0000 1.27 > +++ ./usr.bin/sed/process.c 2 May 2016 08:50:21 -0000 > @@ -61,7 +61,8 @@ static SPACE HS, PS, SS; > static inline int applies(struct s_command *); > static void flush_appends(void); > static void lputs(char *); > -static inline int regexec_e(regex_t *, const char *, int, int, size_t); > +static inline int regexec_e(regex_t *, const char *, int, int, size_t, > + size_t); > static void regsub(SPACE *, char *, char *); > static int substitute(struct s_command *); > > @@ -267,7 +268,7 @@ new: if (!nflag && !pd) > * (lastline, linenumber, ps). > */ > #define MATCH(a) \ > - (a)->type == AT_RE ? regexec_e((a)->u.r, ps, 0, 1, psl) : \ > + (a)->type == AT_RE ? regexec_e((a)->u.r, ps, 0, 1, 0, psl) : \ > (a)->type == AT_LINE ? linenum == (a)->u.l : lastline() > > /* > @@ -335,6 +336,7 @@ substitute(struct s_command *cp) > regex_t *re; > regoff_t slen; > int n, lastempty; > + size_t le = 0; > char *s; > > s = ps; > @@ -346,7 +348,7 @@ substitute(struct s_command *cp) > cp->u.s->maxbref); > } > } > - if (!regexec_e(re, s, 0, 0, psl)) > + if (!regexec_e(re, s, 0, 0, 0, psl)) > return (0); > > SS.len = 0; /* Clean substitute space. */ > @@ -356,28 +358,29 @@ substitute(struct s_command *cp) > > do { > /* Copy the leading retained string. */ > - if (n <= 1 && match[0].rm_so) > - cspace(&SS, s, match[0].rm_so, APPEND); > + if (n <= 1 && (match[0].rm_so - le)) > + cspace(&SS, s, match[0].rm_so - le, APPEND); > > /* Skip zero-length matches right after other matches. */ > - if (lastempty || match[0].rm_so || > + if (lastempty || (match[0].rm_so - le) || > match[0].rm_so != match[0].rm_eo) { > if (n <= 1) { > /* Want this match: append replacement. */ > - regsub(&SS, s, cp->u.s->new); > + regsub(&SS, ps, cp->u.s->new); > if (n == 1) > n = -1; > } else { > /* Want a later match: append original. */ > - if (match[0].rm_eo) > - cspace(&SS, s, match[0].rm_eo, APPEND); > + if (match[0].rm_eo - le) > + cspace(&SS, s, match[0].rm_eo - le, > APPEND); > n--; > } > } > > /* Move past this match. */ > - s += match[0].rm_eo; > - slen -= match[0].rm_eo; > + s += (match[0].rm_eo - le); > + slen -= (match[0].rm_eo - le); > + le = match[0].rm_eo; > > /* > * After a zero-length match, advance one byte, > @@ -388,13 +391,15 @@ substitute(struct s_command *cp) > slen = -1; > else > slen--; > - if (*s != '\0') > + if (*s != '\0') { > cspace(&SS, s++, 1, APPEND); > + le++; > + } > lastempty = 1; > } else > lastempty = 0; > > - } while (n >= 0 && slen >= 0 && regexec_e(re, s, REG_NOTBOL, 0, slen)); > + } while (n >= 0 && slen >= 0 && regexec_e(re, ps, 0, 0, le, psl)); > > /* Did not find the requested number of matches. */ > if (n > 1) > @@ -509,7 +514,7 @@ lputs(char *s) > > static inline int > regexec_e(regex_t *preg, const char *string, int eflags, > - int nomatch, size_t slen) > + int nomatch, size_t start, size_t stop) > { > int eval; > > @@ -520,8 +525,8 @@ regexec_e(regex_t *preg, const char *str > defpreg = preg; > > /* Set anchors */ > - match[0].rm_so = 0; > - match[0].rm_eo = slen; > + match[0].rm_so = start; > + match[0].rm_eo = stop; > > eval = regexec(defpreg, string, > nomatch ? 0 : maxnsub + 1, match, eflags | REG_STARTEND); >