On Tue, Oct 29, 2002 at 07:21:01AM -0800, Jeff Bailey wrote: > On Tue, Oct 29, 2002 at 02:50:29PM +0000, Colin Watson wrote: > > Thanks a lot! Not to rush you, but any luck so far, or can I help? I've > > had five duplicates so far, so I have plenty of motivation ... > > Sorry, I haven't looked at it yet. I just finished my weekend from hell > (I had 2 speaking engagements, plus another conference, but had Marcus > Brinkmann visiting from Germany). It's on my todo list for this week, > though. > > If only I could put off that dayjob thing... =)
I know the feeling. :-) I think I've found a patch from CVS that clears up this segfault. First off, a gdb transcript to explain why I think this patch is relevant: (gdb) run foo Starting program: /home/cjwatson/src/debian/man-db/man-db-2.4.0/src/apropos foo foo (1) - No manpage for this program, utility or function. Program received signal SIGSEGV, Segmentation fault. 0x400c0968 in re_string_context_at (input=0x80bdfe8, idx=25, eflags=0, newline_anchor=0) at regex_internal.c:540 warning: Source file is more recent than executable. 540 c = re_string_byte_at (input, idx); (gdb) bt #0 0x400c0968 in re_string_context_at (input=0x80bdfe8, idx=25, eflags=0, newline_anchor=0) at regex_internal.c:540 #1 0x400c0749 in re_string_reconstruct (pstr=0xbffff700, idx=26, eflags=0, newline=0) at regex_internal.c:439 #2 0x400bba7e in re_search_internal (preg=0x804f260, string=0x80bdfa4 "memory management-related functions for use with TIFF files", length=59, start=0, range=70, stop=134995944, nmatch=0, pmatch=0x0, eflags=0) at regexec.c:665 #3 0x400bb191 in __regexec (preg=0x804f260, string=0x19 <Address 0x19 out of bounds>, nmatch=0, pmatch=0x0, eflags=0) at regexec.c:210 #4 0x08049901 in parse_whatis (page=0xbffffab0 "foo", lowpage=0x8050e00 "foo", whatis=0x80bdfa4 "memory management-related functions for use with TIFF files") at whatis.c:413 #5 0x08049a8e in apropos (page=0xbffffab0 "foo", lowpage=0x8050e00 "foo") at whatis.c:488 #6 0x08049d94 in search (page=0xbffffab0 "foo") at whatis.c:563 #7 0x0804a103 in main (argc=2, argv=0xbffff974) at whatis.c:711 (gdb) up #1 0x400c0749 in re_string_reconstruct (pstr=0xbffff700, idx=26, eflags=0, newline=0) at regex_internal.c:439 439 pstr->tip_context = re_string_context_at (pstr, offset - 1, eflags, (gdb) list 434 offset = idx; 435 } 436 437 if (offset != 0) 438 { 439 pstr->tip_context = re_string_context_at (pstr, offset - 1, eflags, 440 newline); 441 /* Are the characters which are already checked remain? 
*/ 442 if (offset < pstr->valid_len) 443 { (gdb) p offset $1 = 26 (gdb) p pstr->valid_len $2 = 0 (gdb) q Here's the patch. I've backported it such that it applies against glibc 2.3.1-4, since there were some whitespace changes in between. If you want to take those too for simplicity, I can put together a combined patch. The crucial bits are chunks 4 and 5 of regex_internal.c; I think the rest are dependencies of those. Note that the test suite is still running here. --------------------- PatchSet 46089 Date: 2002/11/13 04:00:08 Author: roland Log: 2002-11-11 Isamu Hasegawa <[EMAIL PROTECTED]> * posix/regex_internal.c (re_string_skip_chars): Also return the last wide character. (re_string_reconstruct): Calculate the context by itself when the offset points out of the valid range. (re_string_context_at): Use wide character when MB_CUR_MAX > 1. * posix/regex_internal.h (WIDE_NEWLINE_CHAR): New macro. (IS_WIDE_WORD_CHAR): New macro. (IS_WIDE_NEWLINE): New macro. Members: posix/regex_internal.c:1.14->1.15 posix/regex_internal.h:1.17->1.18 Index: posix/regex_internal.c =================================================================== RCS file: /cvs/glibc/libc/posix/regex_internal.c,v retrieving revision 1.14 retrieving revision 1.15 diff -p -u -u -r1.14 -r1.15 --- posix/regex_internal.c 6 Nov 2002 19:24:20 -0000 1.14 +++ posix/regex_internal.c 13 Nov 2002 04:00:08 -0000 1.15 @@ -66,7 +66,8 @@ static void re_string_construct_common ( re_string_t *pstr, RE_TRANSLATE_TYPE trans, int icase); #ifdef RE_ENABLE_I18N -static int re_string_skip_chars (re_string_t *pstr, int new_raw_idx); +static int re_string_skip_chars (re_string_t *pstr, int new_raw_idx, + wint_t *last_wc); #endif /* RE_ENABLE_I18N */ static re_dfastate_t *create_newstate_common (re_dfa_t *dfa, const re_node_set *nodes, @@ -333,21 +334,24 @@ build_wcs_upper_buffer (pstr) Return the index. 
*/ static int -re_string_skip_chars (pstr, new_raw_idx) +re_string_skip_chars (pstr, new_raw_idx, last_wc) re_string_t *pstr; int new_raw_idx; + wint_t *last_wc; { mbstate_t prev_st; int rawbuf_idx, mbclen; + wchar_t wc = 0; /* Skip the characters which are not necessary to check. */ for (rawbuf_idx = pstr->raw_mbs_idx + pstr->valid_len; rawbuf_idx < new_raw_idx;) { - int remain_len = pstr->len - rawbuf_idx; + int remain_len; + remain_len = pstr->len - rawbuf_idx; prev_st = pstr->cur_state; - mbclen = mbrlen ((const char *) pstr->raw_mbs + rawbuf_idx, remain_len, - &pstr->cur_state); + mbclen = mbrtowc (&wc, (const char *) pstr->raw_mbs + rawbuf_idx, + remain_len, &pstr->cur_state); if (BE (mbclen == (size_t) -2 || mbclen == (size_t) -1 || mbclen == 0, 0)) { /* We treat these cases as a singlebyte character. */ @@ -357,6 +361,7 @@ re_string_skip_chars (pstr, new_raw_idx) /* Then proceed the next character. */ rawbuf_idx += mbclen; } + *last_wc = (wint_t) wc; return rawbuf_idx; } #endif /* RE_ENABLE_I18N */ @@ -436,12 +441,12 @@ re_string_reconstruct (pstr, idx, eflags if (offset != 0) { - pstr->tip_context = re_string_context_at (pstr, offset - 1, eflags, - newline); /* Are the characters which are already checked remain? */ if (offset < pstr->valid_len) { /* Yes, move them to the front of the buffer. */ + pstr->tip_context = re_string_context_at (pstr, offset - 1, eflags, + newline); #ifdef RE_ENABLE_I18N if (MB_CUR_MAX > 1) memmove (pstr->wcs, pstr->wcs + offset, @@ -466,11 +471,26 @@ re_string_reconstruct (pstr, idx, eflags if (MB_CUR_MAX > 1) { int wcs_idx; - pstr->valid_len = re_string_skip_chars (pstr, idx) - idx; + wint_t wc; + pstr->valid_len = re_string_skip_chars (pstr, idx, &wc) - idx; for (wcs_idx = 0; wcs_idx < pstr->valid_len; ++wcs_idx) pstr->wcs[wcs_idx] = WEOF; + if (pstr->trans && wc <= 0xff) + wc = pstr->trans[wc]; + pstr->tip_context = (IS_WIDE_WORD_CHAR (wc) ? CONTEXT_WORD + : ((newline && IS_WIDE_NEWLINE (wc)) + ? 
CONTEXT_NEWLINE : 0)); } + else #endif /* RE_ENABLE_I18N */ + { + int c = pstr->raw_mbs[pstr->raw_mbs_idx + offset - 1]; + if (pstr->trans) + c = pstr->trans[c]; + pstr->tip_context = (IS_WORD_CHAR (c) ? CONTEXT_WORD + : ((newline && IS_NEWLINE (c)) + ? CONTEXT_NEWLINE : 0)); + } } if (!MBS_CASE_ALLOCATED (pstr)) { @@ -537,10 +557,32 @@ re_string_context_at (input, idx, eflags return ((eflags & REG_NOTEOL) ? CONTEXT_ENDBUF : CONTEXT_NEWLINE | CONTEXT_ENDBUF); } - c = re_string_byte_at (input, idx); - if (IS_WORD_CHAR (c)) - return CONTEXT_WORD; - return (newline_anchor && IS_NEWLINE (c)) ? CONTEXT_NEWLINE : 0; + if (MB_CUR_MAX == 1) + { + c = re_string_byte_at (input, idx); + if (IS_WORD_CHAR (c)) + return CONTEXT_WORD; + return (newline_anchor && IS_NEWLINE (c)) ? CONTEXT_NEWLINE : 0; + } + else + { + wint_t wc; + int wc_idx = idx; + while(input->wcs[wc_idx] == WEOF) + { +#ifdef DEBUG + /* It must not happen. */ + assert (wc_idx >= 0); +#endif + --wc_idx; + if (wc_idx < 0) + return input->tip_context; + } + wc = input->wcs[wc_idx]; + if (IS_WIDE_WORD_CHAR (wc)) + return CONTEXT_WORD; + return (newline_anchor && IS_WIDE_NEWLINE (wc)) ? CONTEXT_NEWLINE : 0; + } } /* Functions for set operation. */ Index: posix/regex_internal.h =================================================================== RCS file: /cvs/glibc/libc/posix/regex_internal.h,v retrieving revision 1.17 retrieving revision 1.18 diff -p -u -u -r1.17 -r1.18 --- posix/regex_internal.h 6 Nov 2002 20:35:39 -0000 1.17 +++ posix/regex_internal.h 13 Nov 2002 04:00:08 -0000 1.18 @@ -30,6 +30,7 @@ /* The character which represents newline. */ #define NEWLINE_CHAR '\n' +#define WIDE_NEWLINE_CHAR L'\n' /* Rename to standard API for using out of glibc. 
*/ #ifndef _LIBC @@ -355,6 +356,8 @@ typedef struct bin_tree_t bin_tree_t; #define IS_WORD_CHAR(ch) (isalnum (ch) || (ch) == '_') #define IS_NEWLINE(ch) ((ch) == NEWLINE_CHAR) +#define IS_WIDE_WORD_CHAR(ch) (iswalnum (ch) || (ch) == L'_') +#define IS_WIDE_NEWLINE(ch) ((ch) == WIDE_NEWLINE_CHAR) #define NOT_SATISFY_PREV_CONSTRAINT(constraint,context) \ ((((constraint) & PREV_WORD_CONSTRAINT) && !IS_WORD_CONTEXT (context)) \ --------------------- PatchSet 46090 Date: 2002/11/13 04:00:16 Author: roland Log: