When using single-byte strings, invalid collating symbols and invalid
equivalence classes are treated as the character 0xFF:

    [[ $'\xFF' == [[.ERR.]] ]]     # 0
    [[ $'\xFF' == [[=ERR=]] ]]     # 0
    [[ Z == [A-[.ERR.]] ]]         # 0

The problem stems from the -1 return value of COLLSYM being stored in an
unsigned char, as well as the FOLD macro casting its argument to unsigned
char (even when unchanged).
--

Without FNM_NOESCAPE, escaping a `[' within a bracket expression usually
prevents it from acting as the start of a collating symbol, equivalence
class, or character class. However, when parsing the ending point (only)
of a range expression, an escaped `[' does not lose its special meaning
and can still act as the start of a collating symbol:

    [[ 2 == [[.two.]] ]]           # 0
    [[ 2 == [\[.two.]] ]]          # 1
    [[ 2 == [0-[.two.]] ]]         # 0
    [[ 2 == [0-\[.two.]] ]]        # 0

This seems like a straightforward order of operations swap in the code
that handles this specific case.
--

Another edge case concerns equivalence classes and character classes in
a position where they could act as a starting point or ending point of
a range expression. In neither case do they actally serve as a range end
(as expected). However, there is a difference in behavior depending on
their being encountered at the start of or at the end of (what looks
like) a range:

    [[ A == [[:digit:]-A] ]]       # 0
    [[ A == [A-[:digit:]] ]]       # 1
    [[ A == [A-[:digit:]  ]]       # 0

What happens is that the `[' that follows `-' is treated as the end of
a range expression, so the bracket expression consists of the range A-[
and the characters `:' `d' `i' `g' `i' `t' `:'.  The final `]' in the
second case above is outside of the bracket expression.

Neither behavior (nor even their combination) is obviously wrong.
Rather, the problem with this "range-aware" bracket expression parsing
behavior is that it is not consistent with code in the many other parts
of the shell that need to find the ends of bracket expressions.

For example, the code that splits an extglob pattern-list into patterns:

    [[ A ==   [A-[:digit:]  ]]     # 0
    [[ A == @([A-[:digit:]) ]]     # 1

or even the code in the BRACKMATCH function itself that looks for the
end of the bracket expression after it finds a match:

    [[ A == [XA-[:digit:]  ]]      # 0
    [[ X == [XA-[:digit:]  ]]      # 1
    [[ A == [XA-[:digit:]] ]]      # 1
    [[ X == [XA-[:digit:]] ]]      # 0

--
This patch simplifies how the loops in BRACKMATCH walk through the
pattern string, which makes the diff rather large (but I think the
end result is more straightforward, and fewer LOC). If you prefer a
series instead, please let me know.

The changes in behavior are:

* [=FAKE=], [.FAKE.], and A-[.FAKE.] no longer match any character
* A-\[.C.] is no longer treated as A-[.C.]
* A-[=E=] is treated the same as [=E=]-A (which bash treats as matching
  any of [=E=], `-', or `A'). Same for A-[:class:].

The last one is debatable, shells and regex implementations vary widely
in how they treat cases like this. But it's much easier to make the
bracket matching code treat A-[=E=] the same as [=E=]-A than it is to
teach all the other parts of the shell the difference.
---
 lib/glob/sm_loop.c | 203 ++++++++++++++++++---------------------------
 1 file changed, 82 insertions(+), 121 deletions(-)

diff --git a/lib/glob/sm_loop.c b/lib/glob/sm_loop.c
index 303e1f0c..ec54bad7 100644
--- a/lib/glob/sm_loop.c
+++ b/lib/glob/sm_loop.c
@@ -397,9 +397,10 @@ PARSE_SUBBRACKET (CHAR *p, int flags)
 static CHAR *
 BRACKMATCH (CHAR *p, U_CHAR test, int flags)
 {
-  register CHAR cstart, cend, c;
-  register int not;    /* Nonzero if the sense of the character class is 
inverted.  */
-  int forcecoll, isrange;
+  CHAR c;
+  INT cstart, cend;
+  int not;    /* Nonzero if the sense of the character class is inverted.  */
+  int forcecoll;
   INT pc;
   CHAR *savep;
   CHAR *close;
@@ -417,13 +418,13 @@ BRACKMATCH (CHAR *p, U_CHAR test, int flags)
   if (not = (*p == L('!') || *p == L('^')))
     ++p;
 
-  c = *p++;
   for (;;)
     {
-      /* Initialize cstart and cend in case `-' is the last
-        character of the pattern. */
-      cstart = cend = c;
-      forcecoll = 0;
+      c = *p++;
+
+      /* `]' ends the bracket expression, unless it is the first character. */
+      if (c == L(']') && (p > savep + not + 1))
+       break;
 
       /* POSIX.2 equivalence class:  [=c=].  See POSIX.2 2.8.3.2.  Find
         the end of the equivalence class, move the pattern pointer past
@@ -432,28 +433,15 @@ BRACKMATCH (CHAR *p, U_CHAR test, int flags)
        {
          p++;
          pc = COLLSYM (p, close - p);
-         pc = FOLD (pc);
          p = close + 2;
 
-         if (COLLEQUIV (test, pc))
-           {
-/*[*/        /* Move past the closing `]', since the first thing we do at
-                the `matched:' label is back p up one. */
-             p++;
-             goto matched;
-           }
+         if (pc != INVALID && COLLEQUIV (test, FOLD (pc)))
+           goto matched;
+
          else
-           {
-             c = *p++;
-             if (c == L('\0'))
-               return ((test == L('[')) ? savep : (CHAR *)0); /*]*/
-             else if (c == L('/') && (flags & FNM_PATHNAME))
-               return ((test == L('[')) ? savep : (CHAR *)0); /*]*/
-             else if (c == L(']'))
-               break;
-             c = FOLD (c);
-             continue;
-           }
+           /* Continue the loop here, since this expression can't be
+              the first part of a range expression. */
+           continue;
        }
 
       /* POSIX.2 character class expression.  See POSIX.2 2.8.3.2. */
@@ -490,26 +478,11 @@ BRACKMATCH (CHAR *p, U_CHAR test, int flags)
          p = close + 2;
 
          if (pc)
-           {
-/*[*/        /* Move past the closing `]', since the first thing we do at
-                the `matched:' label is back p up one. */
-             p++;
-             goto matched;
-           }
+           goto matched;
          else
-           {
-             /* continue the loop here, since this expression can't be
-                the first part of a range expression. */
-             c = *p++;
-             if (c == L('\0'))
-               return ((test == L('[')) ? savep : (CHAR *)0);
-             else if (c == L('/') && (flags & FNM_PATHNAME))
-               return ((test == L('[')) ? savep : (CHAR *)0); /*]*/
-             else if (c == L(']'))
-               break;
-             c = FOLD (c);
-             continue;
-           }
+           /* Continue the loop here, since this expression can't be
+              the first part of a range expression. */
+           continue;
        }
  
       /* POSIX.2 collating symbols.  See POSIX.2 2.8.3.2.  Find the end of
@@ -523,105 +496,92 @@ BRACKMATCH (CHAR *p, U_CHAR test, int flags)
          p = close + 2;
          forcecoll = 1;
        }
-
-      if (!(flags & FNM_NOESCAPE) && c == L('\\'))
+      else
        {
-         if (*p == '\0')
-           return ((test == L('[')) ? savep : (CHAR *)0);
-         else if (*p == L('/') && (flags & FNM_PATHNAME))
+         if (!(flags & FNM_NOESCAPE) && c == L('\\'))
+           c = *p++;
+
+         /* POSIX.2 2.8.3.1.2 says: `An expression containing a `[' that
+            is not preceded by a backslash and is not part of a bracket
+            expression produces undefined results.'  This implementation
+            treats the `[' as just a character to be matched if there is
+            not a closing `]'. */
+         if (c == L('\0'))
            return ((test == L('[')) ? savep : (CHAR *)0);
-         cstart = cend = *p++;
-       }
 
-      cstart = cend = FOLD (cstart);
-      isrange = 0;
-
-      /* POSIX.2 2.8.3.1.2 says: `An expression containing a `[' that
-        is not preceded by a backslash and is not part of a bracket
-        expression produces undefined results.'  This implementation
-        treats the `[' as just a character to be matched if there is
-        not a closing `]'. */
-      if (c == L('\0'))
-       return ((test == L('[')) ? savep : (CHAR *)0);
-
-      /* POSIX.2 2.13.3 says: `If a <slash> character is found following an
-         unescaped <left-square-bracket> character before a corresponding
-         <right-square-bracket> is found, the open bracket shall be treated
-         as an ordinary character.' If we find a slash in a bracket
-         expression and the flags indicate we're supposed to be treating the
-         string like a pathname, we have to treat the `[' as just a character
-         to be matched. */
-      if (c == L('/') && (flags & FNM_PATHNAME))
-       return ((test == L('[')) ? savep : (CHAR *)0);
-
-      c = *p++;
-      c = FOLD (c);
+         /* POSIX.2 2.13.3 says: `If a <slash> character is found following an
+            unescaped <left-square-bracket> character before a corresponding
+            <right-square-bracket> is found, the open bracket shall be treated
+            as an ordinary character.' If we find a slash in a bracket
+            expression and the flags indicate we're supposed to be treating the
+            string like a pathname, we have to treat the `[' as just a 
character
+            to be matched. */
+         if (c == L('/') && (flags & FNM_PATHNAME))
+           return ((test == L('[')) ? savep : (CHAR *)0);
 
-      if (c == L('\0'))
-       return ((test == L('[')) ? savep : (CHAR *)0);
-      else if (c == L('/') && (flags & FNM_PATHNAME))
-       return ((test == L('[')) ? savep : (CHAR *)0);
+         cstart = c;
+         forcecoll = 0;
+       }
 
-      /* This introduces a range, unless the `-' is the last
-        character of the class.  Find the end of the range
-        and move past it. */
-      if (c == L('-') && *p != L(']'))
+      /* Range expression, unless `-' is followed by a `]' or an equivalence
+        class or a character class.  POSIX.1-2024 9.3.5.7 says: ``The starting
+        range point and the ending range point shall be a collating element or
+        collating symbol. An equivalence class expression used as a starting
+        or ending point of a range expression produces unspecified results.''
+        We treat the `-' in `a-[=c=]' as matching itself to be consistent with
+        how we handled `[=c=]' above. */
+      if (p[0] == L('-') && p[1] != L(']') &&
+         !(p[1] == L('[') && (p[2] == L('=') || p[2] == L(':')) && 
PARSE_SUBBRACKET (p + 2, flags) != NULL))
        {
-         cend = *p++;
-         if (!(flags & FNM_NOESCAPE) && cend == L('\\'))
-           cend = *p++;
-         if (cend == L('\0'))
-           return ((test == L('[')) ? savep : (CHAR *)0);
-         else if (cend == L('/') && (flags & FNM_PATHNAME))
-           return ((test == L('[')) ? savep : (CHAR *)0);
-         if (cend == L('[') && *p == L('.') && (close = PARSE_SUBBRACKET (p, 
flags)) != NULL)
+         p++;          /* step over `-' */
+         c = *p++;
+         if (c == L('[') && *p == L('.') && (close = PARSE_SUBBRACKET (p, 
flags)) != NULL)
            {
              p++;
              cend = COLLSYM (p, close - p);
              p = close + 2;
              forcecoll = 1;
            }
-         cend = FOLD (cend);
+         else
+           {
+             if (!(flags & FNM_NOESCAPE) && c == L('\\'))
+               c = *p++;
+             if (c == L('\0'))
+               return ((test == L('[')) ? savep : (CHAR *)0);
+             else if (c == L('/') && (flags & FNM_PATHNAME))
+               return ((test == L('[')) ? savep : (CHAR *)0);
+             cend = c;
+           }
 
-         c = *p++;
+         /* A range with an invalid start or end matches nothing. */
+         if (cstart == INVALID || cend == INVALID)
+           continue;
 
+         cstart = FOLD (cstart);
+         cend = FOLD (cend);
          /* POSIX.2 2.8.3.2:  ``The ending range point shall collate
             equal to or higher than the starting range point; otherwise
             the expression shall be treated as invalid.''  Note that this
             applies to only the range expression; the rest of the bracket
             expression is still checked for matches. */
-         if (RANGECMP (cstart, cend, forcecoll) > 0)
-           {
-             if (c == L(']'))
-               break;
-             c = FOLD (c);
-             continue;
-           }
-         isrange = 1;
+         if (RANGECMP (cstart, cend, forcecoll) <= 0 &&
+             RANGECMP (test, cstart, forcecoll) >= 0 && RANGECMP (test, cend, 
forcecoll) <= 0)
+           goto matched;
+       }
+      else
+       {
+         /* Not a range. */
+         if (cstart != INVALID && test == FOLD (cstart))
+           goto matched;
        }
-
-      if (isrange == 0 && test == cstart)
-        goto matched;
-      if (isrange && RANGECMP (test, cstart, forcecoll) >= 0 && RANGECMP 
(test, cend, forcecoll) <= 0)
-       goto matched;
-
-      if (c == L(']'))
-       break;
     }
   /* No match. */
   return (!not ? (CHAR *)0 : p);
 
 matched:
   /* Skip the rest of the [...] that already matched.  */
-  c = *--p;
   while (1)
     {
-      /* A `[' without a matching `]' is just another character to match. */
-      if (c == L('\0'))
-       return ((test == L('[')) ? savep : (CHAR *)0);
-      else if (c == L('/') && (flags & FNM_PATHNAME))
-       return ((test == L('[')) ? savep : (CHAR *)0);
-
       c = *p++;
       if (c == L('[') && (*p == L('=') || *p == L(':') || *p == L('.')))
        {
@@ -637,15 +597,16 @@ matched:
          the bracket expression. */
       else if (c == L(']'))
        break;
-      else if (!(flags & FNM_NOESCAPE) && c == L('\\'))
+      else
        {
-         if (*p == '\0')
+         if (!(flags & FNM_NOESCAPE) && c == L('\\'))
+           c = *p++;
+         if (c == '\0')
            return ((test == L('[')) ? savep : (CHAR *)0);
          /* We don't allow backslash to quote slash if we're matching 
pathnames */
-         else if (*p == L('/') && (flags & FNM_PATHNAME))
+         else if (c == L('/') && (flags & FNM_PATHNAME))
            return ((test == L('[')) ? savep : (CHAR *)0);
          /* Posix issue 8 leaves this unspecified for the shell. */
-         ++p;
        }
     }
   return (not ? (CHAR *)0 : p);
-- 
2.51.0


Reply via email to