dfs         01/03/29 08:33:19

  Modified:    src/java/org/apache/oro/text/regex OpCode.java
                        Perl5Compiler.java Perl5Debug.java
                        Perl5Matcher.java
  Log:
  Applied Takashi's fix for his posix character class patch.
  
  Revision  Changes    Path
  1.5       +22 -11    jakarta-oro/src/java/org/apache/oro/text/regex/OpCode.java
  
  Index: OpCode.java
  ===================================================================
  RCS file: /home/cvs/jakarta-oro/src/java/org/apache/oro/text/regex/OpCode.java,v
  retrieving revision 1.4
  retrieving revision 1.5
  diff -u -r1.4 -r1.5
  --- OpCode.java       2001/01/29 00:19:00     1.4
  +++ OpCode.java       2001/03/29 16:33:17     1.5
  @@ -63,7 +63,7 @@
    * op-codes used in a compiled regular expression.
   
    @author <a href="mailto:[EMAIL PROTECTED]">Daniel F. Savarese</a>
  - @version $Id: OpCode.java,v 1.4 2001/01/29 00:19:00 dfs Exp $
  + @version $Id: OpCode.java,v 1.5 2001/03/29 16:33:17 dfs Exp $
    */
   final class OpCode {
   
  @@ -91,8 +91,8 @@
        _NOTHING = 15,  // no       Match empty string.
        _STAR    = 16,  // yes      Match this (simple) thing 0 or more times.
        _PLUS    = 17,  // yes      Match this (simple) thing 1 or more times.
  -     _ALNUM   = 18,  // no       Match any alphanumeric character
  -     _NALNUM  = 19,  // no       Match any non-alphanumeric character
  +     _WORD   = 18,   // no       Match any word character
  +     _NWORD  = 19,   // no       Match any non-word character
        _BOUND   = 20,  // no       Match "" at any word boundary
        _NBOUND  = 21,  // no       Match "" at any word non-boundary
        _SPACE   = 22,  // no       Match any whitespace character
  @@ -123,20 +123,29 @@
        _UPPER   = 45,
        _XDIGIT  = 46,
        _OPCODE  = 47,
  -     _ONECHAR = 48;
  +     _NOPCODE = 48,
  +     _ONECHAR = 49,
  +     _ALNUM   = 50,
  +     _ASCII   = 51;
   
     // Lengths of the various operands.
     static final int _operandLength[] = {
  -    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 0, 0, 0, 0, 0,
  -    0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0
  +    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // OpCode 0-9
  +    2, 2, 0, 0, 0, 0, 0, 0, 0, 0, // OpCode 10-19
  +    0, 0, 0, 0, 0, 0, 1, 1, 1, 0, // OpCode 20-29
  +    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // OpCode 30-39
  +    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // OpCode 40-49 
  +    0, 0                          // OpCode 50-51 
     };
   
     static final char _opType[] = {
        _END, _BOL, _BOL, _BOL, _EOL, _EOL, _EOL, _ANY, _ANY, _ANYOF, _CURLY,
  -     _CURLY, _BRANCH, _BACK, _EXACTLY, _NOTHING, _STAR, _PLUS, _ALNUM,
  -     _NALNUM, _BOUND, _NBOUND, _SPACE, _NSPACE, _DIGIT, _NDIGIT, _REF,
  +     _CURLY, _BRANCH, _BACK, _EXACTLY, _NOTHING, _STAR, _PLUS, _WORD,
  +     _NWORD, _BOUND, _NBOUND, _SPACE, _NSPACE, _DIGIT, _NDIGIT, _REF,
        _OPEN, _CLOSE, _MINMOD, _BOL, _BRANCH, _BRANCH, _END, _WHILEM,
  -     _ANYOFUN, _NANYOFUN
  +     _ANYOFUN, _NANYOFUN, _RANGE, _ALPHA, _BLANK, _CNTRL, _GRAPH,
  +     _LOWER, _PRINT, _PUNCT, _UPPER, _XDIGIT, _OPCODE, _NOPCODE,
  +     _ONECHAR, _ALNUM, _ASCII
     };
   
     static final char _opLengthVaries[] = {
  @@ -144,8 +153,10 @@
     };
   
     static final char _opLengthOne[] = {
  -    _ANY, _SANY, _ANYOF, _ALNUM, _NALNUM, _SPACE, _NSPACE, _DIGIT, _NDIGIT, 
  -    _ANYOFUN, _NANYOFUN
  +    _ANY, _SANY, _ANYOF, _WORD, _NWORD, _SPACE, _NSPACE, _DIGIT, _NDIGIT, 
  +    _ANYOFUN, _NANYOFUN, _ALPHA, _BLANK, _CNTRL, _GRAPH, _LOWER, _PRINT,
  +    _PUNCT, _UPPER, _XDIGIT, _OPCODE, _NOPCODE, _ONECHAR, _ALNUM,
  +    _ASCII
     };
   
     static final int  _NULL_OFFSET  = -1;
  
  
  
  1.8       +99 -84    jakarta-oro/src/java/org/apache/oro/text/regex/Perl5Compiler.java
  
  Index: Perl5Compiler.java
  ===================================================================
  RCS file: /home/cvs/jakarta-oro/src/java/org/apache/oro/text/regex/Perl5Compiler.java,v
  retrieving revision 1.7
  retrieving revision 1.8
  diff -u -r1.7 -r1.8
  --- Perl5Compiler.java        2001/01/29 00:22:05     1.7
  +++ Perl5Compiler.java        2001/03/29 16:33:17     1.8
  @@ -67,7 +67,7 @@
    * information about Perl5 regular expressions.
   
    @author <a href="mailto:[EMAIL PROTECTED]">Daniel F. Savarese</a>
  - @version $Id: Perl5Compiler.java,v 1.7 2001/01/29 00:22:05 dfs Exp $
  + @version $Id: Perl5Compiler.java,v 1.8 2001/03/29 16:33:17 dfs Exp $
   
    * @see PatternCompiler
    * @see MalformedPatternException
  @@ -110,18 +110,20 @@
     
     static {
       __hashPOSIX = new HashMap();
  -    __hashPOSIX.put("alnum",     new Character('w'));
  +    __hashPOSIX.put("alnum",     new Character(OpCode._ALNUM));
  +    __hashPOSIX.put("word",      new Character(OpCode._WORD));
       __hashPOSIX.put("alpha",     new Character(OpCode._ALPHA));
       __hashPOSIX.put("blank",     new Character(OpCode._BLANK));
       __hashPOSIX.put("cntrl",     new Character(OpCode._CNTRL));
  -    __hashPOSIX.put("digit",     new Character('d'));
  +    __hashPOSIX.put("digit",     new Character(OpCode._DIGIT));
       __hashPOSIX.put("graph",     new Character(OpCode._GRAPH));
       __hashPOSIX.put("lower",     new Character(OpCode._LOWER));
       __hashPOSIX.put("print",     new Character(OpCode._PRINT));
       __hashPOSIX.put("punct",     new Character(OpCode._PUNCT));
  -    __hashPOSIX.put("space",     new Character('s'));
  +    __hashPOSIX.put("space",     new Character(OpCode._SPACE));
       __hashPOSIX.put("upper",     new Character(OpCode._UPPER));
       __hashPOSIX.put("xdigit",    new Character(OpCode._XDIGIT));
  +    __hashPOSIX.put("ascii",     new Character(OpCode._ASCII));
     }
   
   
  @@ -642,12 +644,12 @@
          __getNextChar();
          break;
        case 'w':
  -       offset = __emitNode(OpCode._ALNUM);
  +       offset = __emitNode(OpCode._WORD);
          retFlags[0] |= (__NONNULL | __SIMPLE);
          __getNextChar();
          break;
        case 'W':
  -       offset = __emitNode(OpCode._NALNUM);
  +       offset = __emitNode(OpCode._NWORD);
          retFlags[0] |= (__NONNULL | __SIMPLE);
          __getNextChar();
          break;
  @@ -732,7 +734,8 @@
          if(__input._isAtEnd())
            throw new
              MalformedPatternException("Trailing \\ in expression.");
  -       // fall through to default
  +
  +     // fall through to default
        default:
          doDefault = true;
          break tryAgain;
  @@ -864,7 +867,6 @@
              break forLoop;
            }
            break;
  -
          case CharStringPointer._END_OF_STRING:
          case '\0':
            if(pOffset >= maxOffset)
  @@ -876,7 +878,6 @@
            break;
          } // end backslash switch
          break;
  -
        case '#':
          if((__modifierFlags[0] & __EXTENDED) != 0) {
            while(pOffset < maxOffset && __input._getValue(pOffset) != '\n')
  @@ -1106,7 +1107,9 @@
     private int __parseUnicodeClass() throws MalformedPatternException {
       boolean range = false, skipTest;
       char clss, lastclss = Character.MAX_VALUE;
  +
       int offset, numLength[] = { 0 };
  +    boolean negFlag[] = new boolean[1];
       boolean opcodeFlag; /* clss isn't character when this flag true. */
   
       if(__input._getValue() == '^') {
  @@ -1136,83 +1139,85 @@
          clss = __input._postIncrement();
        } else {
          /* try POSIX expression */
  -       char posixOpCode = __parsePOSIX();
  +       char posixOpCode = __parsePOSIX(negFlag);
          if(posixOpCode != 0){
            opcodeFlag = true;
            clss = posixOpCode;
          }
        }
  -
  -     switch(clss){
  -     case 'w':
  -       opcodeFlag = true;
  -       clss = OpCode._ALNUM;
  -       lastclss = Character.MAX_VALUE;
  -       break;
  -     case 'W':
  -       opcodeFlag = true;
  -       clss = OpCode._NALNUM;
  -       lastclss = Character.MAX_VALUE;
  -       break;
  -     case 's':
  -       opcodeFlag = true;
  -       clss = OpCode._SPACE;
  -       lastclss = Character.MAX_VALUE;
  -       break;
  -     case 'S':
  -       opcodeFlag = true;
  -       clss = OpCode._NSPACE;
  -       lastclss = Character.MAX_VALUE;
  -       break;
  -     case 'd':
  -       opcodeFlag = true;
  -       clss = OpCode._DIGIT;
  -       lastclss = Character.MAX_VALUE;
  -       break;
  -     case 'D':
  -       opcodeFlag = true;
  -       clss = OpCode._NDIGIT;
  -       lastclss = Character.MAX_VALUE;
  -       break;
  -     case 'n':
  -       clss = '\n';
  -       break;
  -     case 'r':
  -       clss = '\r';
  -       break;
  -     case 't':
  -       clss = '\t';
  -       break;
  -     case 'f':
  -       clss = '\f';
  -       break;
  -     case 'b':
  -       clss = '\b';
  -       break;
  -     case 'e':
  -       clss = '\033';
  -       break;
  -     case 'a':
  -       clss = '\007';
  -       break;
  -     case 'x':
  -       clss = (char)__parseHex(__input._array, __input._getOffset(), 2,
  -                               numLength);
  -       __input._increment(numLength[0]);
  -       break;
  -     case 'c':
  -       clss = __input._postIncrement();
  -       if(Character.isLowerCase(clss))
  -         clss = Character.toUpperCase(clss);
  -       clss ^= 64;
  -       break;
  -     case '0': case '1': case '2': case '3': case '4':
  -     case '5': case '6': case '7': case '8': case '9':
  -       clss = (char)__parseOctal(__input._array, __input._getOffset() - 1,
  -                                 3, numLength);
  -       __input._increment(numLength[0] - 1);
  -       break;
  -     default:
  +        if (opcodeFlag != true) {
  +       switch(clss){
  +       case 'w':
  +         opcodeFlag = true;
  +         clss = OpCode._WORD;
  +         lastclss = Character.MAX_VALUE;
  +         break;
  +       case 'W':
  +         opcodeFlag = true;
  +         clss = OpCode._NWORD;
  +         lastclss = Character.MAX_VALUE;
  +         break;
  +       case 's':
  +         opcodeFlag = true;
  +         clss = OpCode._SPACE;
  +         lastclss = Character.MAX_VALUE;
  +         break;
  +       case 'S':
  +         opcodeFlag = true;
  +         clss = OpCode._NSPACE;
  +         lastclss = Character.MAX_VALUE;
  +         break;
  +       case 'd':
  +         opcodeFlag = true;
  +         clss = OpCode._DIGIT;
  +         lastclss = Character.MAX_VALUE;
  +         break;
  +       case 'D':
  +         opcodeFlag = true;
  +         clss = OpCode._NDIGIT;
  +         lastclss = Character.MAX_VALUE;
  +         break;
  +       case 'n':
  +         clss = '\n';
  +         break;
  +       case 'r':
  +         clss = '\r';
  +         break;
  +       case 't':
  +         clss = '\t';
  +         break;
  +       case 'f':
  +         clss = '\f';
  +         break;
  +       case 'b':
  +         clss = '\b';
  +         break;
  +       case 'e':
  +         clss = '\033';
  +         break;
  +       case 'a':
  +         clss = '\007';
  +         break;
  +       case 'x':
  +         clss = (char)__parseHex(__input._array, __input._getOffset(), 2,
  +                                 numLength);
  +         __input._increment(numLength[0]);
  +         break;
  +       case 'c':
  +         clss = __input._postIncrement();
  +         if(Character.isLowerCase(clss))
  +             clss = Character.toUpperCase(clss);
  +         clss ^= 64;
  +         break;
  +       case '0': case '1': case '2': case '3': case '4':
  +       case '5': case '6': case '7': case '8': case '9':
  +         clss = (char)__parseOctal(__input._array, __input._getOffset() - 1,
  +                                   3, numLength);
  +         __input._increment(numLength[0] - 1);
  +         break;
  +       default:
  +            break;
  +          }
        }
         }
   
  @@ -1235,7 +1240,10 @@
   
       if(lastclss == clss) {
         if(opcodeFlag == true) {
  -     __emitCode(OpCode._OPCODE);
  +        if(negFlag[0] == false)
  +       __emitCode(OpCode._OPCODE);
  +        else 
  +       __emitCode(OpCode._NOPCODE);
         } else {
        __emitCode(OpCode._ONECHAR);
         }
  @@ -1281,7 +1289,7 @@
      * 
      * @return OpCode. return 0 when fail parsing POSIX expression.
      */
  -  private char __parsePOSIX() throws MalformedPatternException {
  +  private char __parsePOSIX(boolean negFlag[]) throws MalformedPatternException {
       int offset = __input._getOffset();
       int len = __input._getLength();
       int pos = offset;
  @@ -1290,6 +1298,12 @@
       Object opcode;
   
       if( value != ':' ) return 0;
  +    if( __input._getValue(pos) == '^' ) {
  +      negFlag[0] = true;
  +      pos++;
  +    } else {
  +      negFlag[0] = false;
  +    }
   
       buf = new StringBuffer();
       
  @@ -1311,7 +1325,8 @@
         return 0;
   
       __input._setOffset(pos);
  -
  +//    System.out.println("posix="+buf.toString()+":"+((Character)opcode).charValue());
  +    
       return ((Character)opcode).charValue();
     }
   
  
  
  
  1.4       +14 -3     jakarta-oro/src/java/org/apache/oro/text/regex/Perl5Debug.java
  
  Index: Perl5Debug.java
  ===================================================================
  RCS file: /home/cvs/jakarta-oro/src/java/org/apache/oro/text/regex/Perl5Debug.java,v
  retrieving revision 1.3
  retrieving revision 1.4
  diff -u -r1.3 -r1.4
  --- Perl5Debug.java   2001/01/29 00:19:01     1.3
  +++ Perl5Debug.java   2001/03/29 16:33:17     1.4
  @@ -68,7 +68,7 @@
    * comparison with the program generated by Perl5 with the -r option.
   
    @author <a href="mailto:[EMAIL PROTECTED]">Daniel F. Savarese</a>
  - @version $Id: Perl5Debug.java,v 1.3 2001/01/29 00:19:01 dfs Exp $
  + @version $Id: Perl5Debug.java,v 1.4 2001/03/29 16:33:17 dfs Exp $
   
    * @see Perl5Pattern
    */
  @@ -199,14 +199,25 @@
       case OpCode._NOTHING: str = "NOTHING"; break;
       case OpCode._BACK  : str = "BACK"; break;
       case OpCode._END   : str = "END"; break;
  -    case OpCode._ALNUM : str = "ALNUM"; break;
  -    case OpCode._NALNUM: str = "NALNUM"; break;
  +    case OpCode._WORD : str = "WORD"; break;
  +    case OpCode._NWORD: str = "NWORD"; break;
       case OpCode._BOUND : str = "BOUND"; break;
       case OpCode._NBOUND: str = "NBOUND"; break;
       case OpCode._SPACE : str = "SPACE"; break;
       case OpCode._NSPACE: str = "NSPACE"; break;
       case OpCode._DIGIT : str = "DIGIT"; break;
       case OpCode._NDIGIT: str = "NDIGIT"; break;
  +    case OpCode._ALPHA : str = "ALPHA"; break;
  +    case OpCode._BLANK : str = "BLANK"; break;
  +    case OpCode._CNTRL : str = "CNTRL"; break;
  +    case OpCode._GRAPH : str = "GRAPH"; break;
  +    case OpCode._LOWER : str = "LOWER"; break;
  +    case OpCode._PRINT : str = "PRINT"; break;
  +    case OpCode._PUNCT : str = "PUNCT"; break;
  +    case OpCode._UPPER : str = "UPPER"; break;
  +    case OpCode._XDIGIT: str = "XDIGIT"; break;
  +    case OpCode._ALNUM : str = "ALNUM"; break;
  +    case OpCode._ASCII : str = "ASCII"; break;
       case OpCode._CURLY :
         buffer.append("CURLY {");
         buffer.append((int)OpCode._getArg1(program, offset));
  
  
  
  1.10      +22 -14    jakarta-oro/src/java/org/apache/oro/text/regex/Perl5Matcher.java
  
  Index: Perl5Matcher.java
  ===================================================================
  RCS file: /home/cvs/jakarta-oro/src/java/org/apache/oro/text/regex/Perl5Matcher.java,v
  retrieving revision 1.9
  retrieving revision 1.10
  diff -u -r1.9 -r1.10
  --- Perl5Matcher.java 2001/01/29 00:22:05     1.9
  +++ Perl5Matcher.java 2001/03/29 16:33:18     1.10
  @@ -66,7 +66,7 @@
    * Perl5Compiler.
   
    @author <a href="mailto:[EMAIL PROTECTED]">Daniel F. Savarese</a>
  - @version $Id: Perl5Matcher.java,v 1.9 2001/01/29 00:22:05 dfs Exp $
  + @version $Id: Perl5Matcher.java,v 1.10 2001/03/29 16:33:18 dfs Exp $
   
    * @see PatternMatcher
    * @see Perl5Compiler
  @@ -512,7 +512,7 @@
          }
          break;
   
  -     case OpCode._ALNUM:
  +     case OpCode._WORD:
          while(__currentOffset < endOffset) {
            ch = __input[__currentOffset];
            if(OpCode._isWordCharacter(ch)) {
  @@ -527,7 +527,7 @@
          }
          break;
   
  -     case OpCode._NALNUM:
  +     case OpCode._NWORD:
          while(__currentOffset < endOffset) {
            ch = __input[__currentOffset];
            if(!OpCode._isWordCharacter(ch)) {
  @@ -637,14 +637,24 @@
        } else {
          offset+=2;
        }
  +
  +      } else if(__program[offset] == OpCode._ONECHAR) {
  +             offset++;
  +     if(__program[offset++] == code) return isANYOF;
   
  -      } else if( __program[offset] == OpCode._OPCODE ){
  +      } else {
  +     isANYOF = (__program[offset] == OpCode._OPCODE) 
  +       ? isANYOF : !isANYOF;
  +
        offset++;
        switch ( __program[offset++] ) {
        case OpCode._ALNUM:
  +       if(Character.isLetterOrDigit(code)) return isANYOF;
  +       break;
  +     case OpCode._WORD:
          if(OpCode._isWordCharacter(code)) return isANYOF;
          break;
  -     case OpCode._NALNUM:
  +     case OpCode._NWORD:
          if(!OpCode._isWordCharacter(code)) return isANYOF;
          break;
        case OpCode._SPACE:
  @@ -697,12 +707,10 @@
              (code >= 'a' && code <= 'f') ||
              (code >= 'A' && code <= 'F')) return isANYOF;
          break;
  -     }
  -      } else if((__program[offset++] == OpCode._ONECHAR) &&
  -             (__program[offset++] == code))
  -     {
  -       return isANYOF;
  +     case OpCode._ASCII:
  +       if(code < 0x80)return isANYOF;
        }
  +      } 
       }
       return !isANYOF;
     }
  @@ -785,12 +793,12 @@
         }
         break;
   
  -    case OpCode._ALNUM:
  +    case OpCode._WORD:
         while(scan < eol && OpCode._isWordCharacter(__input[scan]))
        ++scan;
         break;
   
  -    case OpCode._NALNUM:
  +    case OpCode._NWORD:
         while(scan < eol && !OpCode._isWordCharacter(__input[scan]))
        ++scan;
         break;
  @@ -953,7 +961,7 @@
        nextChar = (inputRemains ? __input[input] : __EOS);
        break;
   
  -      case OpCode._ALNUM:
  +      case OpCode._WORD:
        if(!inputRemains)
          return false;
        if(!OpCode._isWordCharacter(nextChar))
  @@ -962,7 +970,7 @@
        nextChar = (inputRemains ? __input[input] : __EOS);
        break;
   
  -      case OpCode._NALNUM:
  +      case OpCode._NWORD:
        if(!inputRemains && input >= __eol)
          return false;
        if(OpCode._isWordCharacter(nextChar))
  
  
  

Reply via email to