ChangeLog
2006-01-30  Ito Kazumitsu  <[EMAIL PROTECTED]>

        Fixes bug #26002
        * gnu/regexp/RE.java (initialize): Parse /\p{prop}/.
        (NamedProperty): New inner class.
        (getNamedProperty): New method.
        (getRETokenNamedProperty): New method.
        * gnu/regexp/RESyntax.java (RE_NAMED_PROPERTY): New syntax flag.
        * gnu/regexp/RETokenNamedProperty.java: New file.

Index: classpath/gnu/regexp/RE.java
===================================================================
RCS file: /cvsroot/classpath/classpath/gnu/regexp/RE.java,v
retrieving revision 1.13
diff -u -r1.13 RE.java
--- classpath/gnu/regexp/RE.java        30 Jan 2006 12:35:54 -0000      1.13
+++ classpath/gnu/regexp/RE.java        30 Jan 2006 13:09:08 -0000
@@ -459,6 +459,7 @@
            // FIXME: asciiEsc == 0 means asciiEsc is not set. But what if
            // \u0000 is used as a meaningful character?
             char asciiEsc = 0;
+           NamedProperty np = null;
            if (("dswDSW".indexOf(pattern[index]) != -1) && 
syntax.get(RESyntax.RE_CHAR_CLASS_ESC_IN_LISTS)) {
              switch (pattern[index]) {
              case 'D':
@@ -478,6 +479,12 @@
                break;
              }
            }
+           if (("pP".indexOf(pattern[index]) != -1) && 
syntax.get(RESyntax.RE_NAMED_PROPERTY)) {
+             np = getNamedProperty(pattern, index - 1, pLength);
+             if (np == null)
+               throw new REException("invalid escape sequence", 
REException.REG_ESCAPE, index);
+             index = index - 1 + np.len - 1;
+           }
            else {
              CharExpression ce = getCharExpression(pattern, index - 1, 
pLength, syntax);
              if (ce == null)
@@ -489,6 +496,8 @@
            
            if (posixID != -1) {
              options.addElement(new 
RETokenPOSIX(subIndex,posixID,insens,negate));
+           } else if (np != null) {
+             
options.addElement(getRETokenNamedProperty(subIndex,np,insens,index));
            } else if (asciiEsc != 0) {
              lastChar = asciiEsc;
            } else {
@@ -991,6 +1000,19 @@
          currentToken = new RETokenChar(subIndex,ce.ch,insens);
        }
 
+       // NAMED PROPERTY
+       // \p{prop}, \P{prop}
+
+       else if ((unit.bk && (unit.ch == 'p') && 
syntax.get(RESyntax.RE_NAMED_PROPERTY)) ||
+                (unit.bk && (unit.ch == 'P') && 
syntax.get(RESyntax.RE_NAMED_PROPERTY))) {
+         NamedProperty np = getNamedProperty(pattern, index - 2, pLength);
+         if (np == null)
+             throw new REException("invalid escape sequence", 
REException.REG_ESCAPE, index);
+         index = index - 2 + np.len;
+         addToken(currentToken);
+         currentToken = getRETokenNamedProperty(subIndex,np,insens,index);
+       }
+
        // NON-SPECIAL CHARACTER (or escape to make literal)
         //  c | \* for example
 
@@ -1126,6 +1148,73 @@
   }
 
   /**
+   * This class represents a substring in a pattern string expressing
+   * a named property.
+   * "\pA"      : Property named "A"
+   * "\p{prop}" : Property named "prop"
+   * "\PA"      : Property named "A" (Negated)
+   * "\P{prop}" : Property named "prop" (Negated)
+   */
+  private static class NamedProperty {
+    /** Property name */
+    String name;
+    /** Negated or not */
+    boolean negate;
+    /** length of this expression */
+    int len;
+  }
+
+  private NamedProperty getNamedProperty(char[] input, int pos, int lim) {
+    NamedProperty np = new NamedProperty();
+    char c = input[pos];
+    if (c == '\\') {
+      if (++pos >= lim) return null;
+      c = input[pos++];
+      switch(c) {
+      case 'p':
+        np.negate = false;
+        break;
+      case 'P':
+        np.negate = true;
+        break;
+      default:
+       return null;
+      }
+      c = input[pos++];
+      if (c == '{') {
+          int p = -1;
+         for (int i = pos; i < lim; i++) {
+             if (input[i] == '}') {
+                 p = i;
+                 break;
+             }
+         }
+         if (p < 0) return null;
+         int len = p - pos;
+          np.name = new String(input, pos, len);
+         np.len = len + 4;
+      }
+      else {
+          np.name = new String(input, pos - 1, 1);
+         np.len = 3;
+      }
+      return np;
+    }
+    else return null;
+  }
+
+  private static RETokenNamedProperty getRETokenNamedProperty(
+      int subIndex, NamedProperty np, boolean insens, int index)
+      throws REException {
+    try {
+       return new RETokenNamedProperty(subIndex, np.name, insens, np.negate);
+    }
+    catch (REException e) {
+       throw new REException(e.getMessage(), REException.REG_ESCAPE, index);
+    }
+  }
+
+  /**
    * Checks if the regular expression matches the input in its entirety.
    *
    * @param input The input text.
Index: classpath/gnu/regexp/RESyntax.java
===================================================================
RCS file: /cvsroot/classpath/classpath/gnu/regexp/RESyntax.java,v
retrieving revision 1.5
diff -u -r1.5 RESyntax.java
--- classpath/gnu/regexp/RESyntax.java  19 Jan 2006 13:45:51 -0000      1.5
+++ classpath/gnu/regexp/RESyntax.java  30 Jan 2006 13:09:08 -0000
@@ -222,7 +222,12 @@
    */
   public static final int RE_UNICODE_CHAR              = 29;
 
-  private static final int BIT_TOTAL                   = 30;
+  /**
+   * Syntax bit.  Allow named properties (\p{prop}, \P{prop}), as in Perl5.
+   */
+  public static final int RE_NAMED_PROPERTY            = 30;
+
+  private static final int BIT_TOTAL                   = 31;
 
   /**
    * Predefined syntax.
@@ -445,6 +450,7 @@
          .set(RE_EMBEDDED_FLAGS)         // (?imsx-imsx)
          .set(RE_OCTAL_CHAR)             // \0377
          .set(RE_HEX_CHAR)               // \x1b
+         .set(RE_NAMED_PROPERTY)         // \p{prop}, \P{prop}
          .makeFinal();
       
       RE_SYNTAX_PERL5_S = new RESyntax(RE_SYNTAX_PERL5)
Index: classpath/gnu/regexp/RETokenNamedProperty.java
===================================================================
RCS file: classpath/gnu/regexp/RETokenNamedProperty.java
diff -N classpath/gnu/regexp/RETokenNamedProperty.java
--- /dev/null   1 Jan 1970 00:00:00 -0000
+++ classpath/gnu/regexp/RETokenNamedProperty.java      30 Jan 2006 13:09:08 
-0000
@@ -0,0 +1,203 @@
+/* gnu/regexp/RETokenNamedProperty.java
+   Copyright (C) 2006 Free Software Foundation, Inc.
+
+This file is part of GNU Classpath.
+
+GNU Classpath is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2, or (at your option)
+any later version.
+
+GNU Classpath is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with GNU Classpath; see the file COPYING.  If not, write to the
+Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+02110-1301 USA.
+
+Linking this library statically or dynamically with other modules is
+making a combined work based on this library.  Thus, the terms and
+conditions of the GNU General Public License cover the whole
+combination.
+
+As a special exception, the copyright holders of this library give you
+permission to link this library with independent modules to produce an
+executable, regardless of the license terms of these independent
+modules, and to copy and distribute the resulting executable under
+terms of your choice, provided that you also meet, for each linked
+independent module, the terms and conditions of the license of that
+module.  An independent module is a module which is not derived from
+or based on this library.  If you modify this library, you may extend
+this exception to your version of the library, but you are not
+obligated to do so.  If you do not wish to do so, delete this
+exception statement from your version. */
+
+
+package gnu.regexp;
+
+final class RETokenNamedProperty extends REToken {
+  String name;
+  boolean insens;
+  boolean negate;
+  Handler handler;
+
+  RETokenNamedProperty(int subIndex, String name, boolean insens, boolean 
negate) throws REException {
+    super(subIndex);
+    this.name = name;
+    this.insens = insens;
+    this.negate = negate;
+    handler = getHandler(name); 
+  }
+
+    int getMinimumLength() {
+       return 1;
+    }
+
+    boolean match(CharIndexed input, REMatch mymatch) {
+    char ch = input.charAt(mymatch.index);
+    if (ch == CharIndexed.OUT_OF_BOUNDS)
+      return false;
+    
+    boolean retval = handler.includes(ch);
+    if (insens) {
+        retval = retval ||
+                 handler.includes(Character.toUpperCase(ch)) ||
+                 handler.includes(Character.toLowerCase(ch));
+    }
+
+    if (negate) retval = !retval;
+    if (retval) {
+       ++mymatch.index;
+       return next(input, mymatch);
+    }
+    else return false;
+  }
+
+  void dump(StringBuffer os) {
+    os.append("\\")
+      .append(negate ? "P" : "p")
+      .append("{" + name + "}");
+  }
+
+  private abstract static class Handler {
+      public abstract boolean includes(char c);
+  }
+
+  private Handler getHandler(String name) throws REException {
+      if (name.equals("Lower") ||
+          name.equals("Upper") ||
+          // name.equals("ASCII") ||
+          name.equals("Alpha") ||
+          name.equals("Digit") ||
+          name.equals("Alnum") ||
+          name.equals("Punct") ||
+          name.equals("Graph") ||
+          name.equals("Print") ||
+          name.equals("Blank") ||
+          name.equals("Cntrl") ||
+          name.equals("XDigit") ||
+          name.equals("Space") ) {
+         return new POSIXHandler(name);
+      }
+      if (name.startsWith("In")) {
+          throw new REException("Unicode block is not supported yet", 
REException.REG_ESCAPE, 0); 
+      }
+      if (name.startsWith("Is")) {
+          name = name.substring(2);
+      }
+      if (name.equals("Mc"))
+          return new UnicodeCategoryHandler(Character.COMBINING_SPACING_MARK);
+      if (name.equals("Pc"))
+          return new UnicodeCategoryHandler(Character.CONNECTOR_PUNCTUATION);
+      if (name.equals("Cc"))
+          return new UnicodeCategoryHandler(Character.CONTROL);
+      if (name.equals("Sc"))
+          return new UnicodeCategoryHandler(Character.CURRENCY_SYMBOL);
+      if (name.equals("Pd"))
+          return new UnicodeCategoryHandler(Character.DASH_PUNCTUATION);
+      if (name.equals("Nd"))
+          return new UnicodeCategoryHandler(Character.DECIMAL_DIGIT_NUMBER);
+      if (name.equals("Me"))
+          return new UnicodeCategoryHandler(Character.ENCLOSING_MARK);
+      if (name.equals("Pe"))
+          return new UnicodeCategoryHandler(Character.END_PUNCTUATION);
+      if (name.equals("Pf"))
+          return new UnicodeCategoryHandler(Character.FINAL_QUOTE_PUNCTUATION);
+      if (name.equals("Cf"))
+          return new UnicodeCategoryHandler(Character.FORMAT);
+      if (name.equals("Pi"))
+          return new 
UnicodeCategoryHandler(Character.INITIAL_QUOTE_PUNCTUATION);
+      if (name.equals("Nl"))
+          return new UnicodeCategoryHandler(Character.LETTER_NUMBER);
+      if (name.equals("Zl"))
+          return new UnicodeCategoryHandler(Character.LINE_SEPARATOR);
+      if (name.equals("Ll"))
+          return new UnicodeCategoryHandler(Character.LOWERCASE_LETTER);
+      if (name.equals("Sm"))
+          return new UnicodeCategoryHandler(Character.MATH_SYMBOL);
+      if (name.equals("Lm"))
+          return new UnicodeCategoryHandler(Character.MODIFIER_LETTER);
+      if (name.equals("Sk"))
+          return new UnicodeCategoryHandler(Character.MODIFIER_SYMBOL);
+      if (name.equals("Mn"))
+          return new UnicodeCategoryHandler(Character.NON_SPACING_MARK);
+      if (name.equals("Lo"))
+          return new UnicodeCategoryHandler(Character.OTHER_LETTER);
+      if (name.equals("No"))
+          return new UnicodeCategoryHandler(Character.OTHER_NUMBER);
+      if (name.equals("Po"))
+          return new UnicodeCategoryHandler(Character.OTHER_PUNCTUATION);
+      if (name.equals("So"))
+          return new UnicodeCategoryHandler(Character.OTHER_SYMBOL);
+      if (name.equals("Zp"))
+          return new UnicodeCategoryHandler(Character.PARAGRAPH_SEPARATOR);
+      if (name.equals("Co"))
+          return new UnicodeCategoryHandler(Character.PRIVATE_USE);
+      if (name.equals("Zs"))
+          return new UnicodeCategoryHandler(Character.SPACE_SEPARATOR);
+      if (name.equals("Ps"))
+          return new UnicodeCategoryHandler(Character.START_PUNCTUATION);
+      if (name.equals("Cs"))
+          return new UnicodeCategoryHandler(Character.SURROGATE);
+      if (name.equals("Lt"))
+          return new UnicodeCategoryHandler(Character.TITLECASE_LETTER);
+      if (name.equals("Cn"))
+          return new UnicodeCategoryHandler(Character.UNASSIGNED);
+      if (name.equals("Lu"))
+          return new UnicodeCategoryHandler(Character.UPPERCASE_LETTER);
+      throw new REException("unsupported name " + name, 
REException.REG_ESCAPE, 0);
+  }
+
+  private static class POSIXHandler extends Handler {
+      private RETokenPOSIX retoken;
+      private REMatch mymatch = new REMatch(0,0,0);
+      private char[] chars = new char[1];
+      private CharIndexedCharArray ca = new CharIndexedCharArray(chars, 0);
+      public POSIXHandler(String name) {
+            int posixId = RETokenPOSIX.intValue(name.toLowerCase());
+            if (posixId != -1)
+              retoken = new RETokenPOSIX(0,posixId,false,false);
+           else
+              throw new RuntimeException("Unknown posix ID: " + name);
+      }
+      public boolean includes(char c) {
+          chars[0] = c;
+          mymatch.index = 0;
+          return retoken.match(ca, mymatch);
+      }
+  }
+
+  private static class UnicodeCategoryHandler extends Handler {
+      public UnicodeCategoryHandler(byte category) {
+          this.category = (int)category;
+      }
+      private int category;
+      public boolean includes(char c) {
+          return Character.getType(c) == category;
+      }
+  }
+ 
+}

Reply via email to