Bug#510219: Info received (Bug#510219: sed: cannot tell if some chars are letters, digits, etc.)

2009-01-09 Thread Paolo Bonzini
This was bug 9697 in glibc's bugzilla, and is fixed by the patch at git
diff 4c2a6f3d 37bdc055ce^ in git://sources.redhat.com/git/glibc.git
(gitweb interface at http://repo.or.cz/w/glibc.git).

Another regex bug (bug 697 in glibc's bugzilla), by the way, is fixed by
the patch at git diff 37bdc055^ 9d565658.

Paolo



-- 
To UNSUBSCRIBE, email to debian-bugs-dist-requ...@lists.debian.org
with a subject of unsubscribe. Trouble? Contact listmas...@lists.debian.org



Bug#510219: Info received (Bug#510219: sed: cannot tell if some chars are letters, digits, etc.)

2009-01-09 Thread Clint Adams
On Fri, Jan 09, 2009 at 09:33:45AM +0100, Paolo Bonzini wrote:
> This was bug 9697 in glibc's bugzilla, and is fixed by the patch at git
> diff 4c2a6f3d 37bdc055ce^ in git://sources.redhat.com/git/glibc.git
> (gitweb interface at http://repo.or.cz/w/glibc.git).
>
> Another regex bug (bug 697 in glibc's bugzilla), by the way, is fixed by
> the patch at git diff 37bdc055^ 9d565658.

Attaching those two for posterity.
diff --git a/ChangeLog b/ChangeLog
index 182bd26..8829b44 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,10 @@
+2009-01-04  Paolo Bonzini  <bonz...@gnu.org>
+
+   [BZ 9697]
+   * posix/bug-regex17.c: Add testcases.
+   * posix/regcomp.c (re_compile_fastmap_iter): Rewrite COMPLEX_BRACKET
+   handling.
+
 2009-01-05  Martin Schwidefsky  <schwidef...@de.ibm.com>
 
* sysdeps/unix/sysv/linux/s390/bits/libc-vdso.h: New file.
diff --git a/posix/bug-regex17.c b/posix/bug-regex17.c
index b42f9b6..1c11a1d 100644
--- a/posix/bug-regex17.c
+++ b/posix/bug-regex17.c
@@ -1,5 +1,5 @@
-/* Turkish regular expression tests.
-   Copyright (C) 2002, 2003 Free Software Foundation, Inc.
+/* German regular expression tests.
+   Copyright (C) 2002, 2003, 2009 Free Software Foundation, Inc.
This file is part of the GNU C Library.
Contributed by Jakub Jelinek <ja...@redhat.com>, 2002.
 
@@ -33,10 +33,10 @@ struct
   int flags, nmatch;
   regmatch_t rm[5];
 } tests[] = {
-  /* \xc3\x84  LATIN CAPITAL LETTER A WITH DIAERESIS
- \xc3\x96  LATIN CAPITAL LETTER O WITH DIAERESIS
- \xc3\xa4  LATIN SMALL LETTER A WITH DIAERESIS
- \xc3\xb6  LATIN SMALL LETTER O WITH DIAERESIS  */
+  /* U+00C4  \xc3\x84  LATIN CAPITAL LETTER A WITH DIAERESIS
+     U+00D6  \xc3\x96  LATIN CAPITAL LETTER O WITH DIAERESIS
+     U+00E4  \xc3\xa4  LATIN SMALL LETTER A WITH DIAERESIS
+     U+00F6  \xc3\xb6  LATIN SMALL LETTER O WITH DIAERESIS  */
   { "\xc3\x84\xc3\x96*\xc3\xb6$", "aB\xc3\xa4\xc3\xb6\xc3\xb6\xc3\x96", REG_ICASE, 2,
     { { 2, 10 }, { -1, -1 } } },
   { "[\xc3\x84x]\xc3\x96*\xc3\xb6$", "aB\xc3\x84\xc3\xb6\xc3\xb6\xc3\x96", REG_ICASE, 2,
@@ -45,10 +45,22 @@ struct
 { { 2, 10 }, { -1, -1 } } },
   { "[^x]\xc3\x96*\xc3\xb6$", "aB\xc3\xa4\xc3\xb6\xc3\xb6\xc3\x96", REG_ICASE, 2,
 { { 2, 10 }, { -1, -1 } } },
+
+  /* Tests for bug 9697:
+     U+00DF  \xc3\x9f  LATIN SMALL LETTER SHARP S
+     U+02DA  \xcb\x9a  RING ABOVE
+     U+02E2  \xcb\xa2  MODIFIER LETTER SMALL S  */
+  { "[a-z]|[^a-z]", "\xcb\xa2", REG_EXTENDED, 2,
+    { { 0, 2 }, { -1, -1 } } },
+  { "[a-z]", "\xc3\x9f", REG_EXTENDED, 2,
+    { { 0, 2 }, { -1, -1 } } },
+  { "[^a-z]", "\xcb\x9a", REG_EXTENDED, 2,
+    { { 0, 2 }, { -1, -1 } } },
 };
 
-int
-main (void)
+
+static int
+do_test (void)
 {
   regex_t re;
   regmatch_t rm[5];
@@ -93,3 +105,6 @@ main (void)
 
   return ret;
 }
+
+#define TEST_FUNCTION do_test ()
+#include "../test-skeleton.c"
diff --git a/posix/regcomp.c b/posix/regcomp.c
index d5a0535..4843cfe 100644
--- a/posix/regcomp.c
+++ b/posix/regcomp.c
@@ -1,5 +1,6 @@
 /* Extended regular expression matching and search library.
-   Copyright (C) 2002,2003,2004,2005,2006,2007 Free Software Foundation, Inc.
+   Copyright (C) 2002,2003,2004,2005,2006,2007,2009
+   Free Software Foundation, Inc.
This file is part of the GNU C Library.
Contributed by Isamu Hasegawa <is...@yamato.ibm.com>.
 
@@ -350,47 +351,67 @@ re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state,
 #ifdef RE_ENABLE_I18N
   else if (type == COMPLEX_BRACKET)
{
- int i;
   re_charset_t *cset = dfa->nodes[node].opr.mbcset;
- if (cset->non_match || cset->ncoll_syms || cset->nequiv_classes
- || cset->nranges || cset->nchar_classes)
-   {
+ int i;
+
 # ifdef _LIBC
- if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0)
+ /* See if we have to try all bytes which start multiple collation
+elements.
+e.g. In da_DK, we want to catch 'a' since aa is a valid
+ collation element, and don't catch 'b' since 'b' is
+ the only collation element which starts from 'b' (and
+ it is caught by SIMPLE_BRACKET).  */
+ if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0
+     && (cset->ncoll_syms || cset->nranges))
{
- /* In this case we want to catch the bytes which are
-the first byte of any collation elements.
-e.g. In da_DK, we want to catch 'a' since aa
- is a valid collation element, and don't catch
- 'b' since 'b' is the only collation element
- which starts from 'b'.  */
  const int32_t *table = (const int32_t *)
_NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
   for (i = 0; i < SBC_MAX; ++i)
     if (table[i] < 0)