The patch proposed on Mon, 23 Jan 2006 has been cancelled,
and here is another patch.

TODO:

This patch cannot make the following tests pass.

/(([a-c])b*?\2)*/
    ababbbcbc
 0: ababb
 1: bb
 2: b

gnu.regexp finds "ababbbcbc" instead of "ababb".

/^(b+?|a){1,2}?c/
    bbc
 0: bbc
 1: b

gnu.regexp finds "bbc" but captures "bb" for the group(1). 

ChangeLog
2006-01-28  Ito Kazumitsu  <[EMAIL PROTECTED]>

        Fixes bug #24876
        * gnu/regexp/gnu/regexp/RE.java(REG_TRY_ENTIRE_MATCH):
        New execution flag.
        (getMatchImpl): if REG_TRY_ENTIRE_MATCH is set, add an
        implicit RETokenEnd at the end of the regexp chain.
        Do not select the longest match, but select the first match.
        * gnu/regexp/RETokenOneOf.java: Corrected a typo in a comment.
        * gnu/regexp/RETokenRepeated.java (match): Rewrote stingy matching.

Index: classpath/gnu/regexp/RE.java
===================================================================
RCS file: /cvsroot/classpath/classpath/gnu/regexp/RE.java,v
retrieving revision 1.12
diff -u -r1.12 RE.java
--- classpath/gnu/regexp/RE.java        22 Jan 2006 02:22:21 -0000      1.12
+++ classpath/gnu/regexp/RE.java        28 Jan 2006 16:13:19 -0000
@@ -217,6 +217,13 @@
    */
   public static final int REG_NO_INTERPOLATE = 128;
 
+  /**
+   * Execution flag.
+   * Try to match the whole input string. An implicit match-end operator
+   * is added to this regexp.
+   */
+  public static final int REG_TRY_ENTIRE_MATCH = 256;
+
   /** Returns a string representing the version of the gnu.regexp package. */
   public static final String version() {
     return VERSION;
@@ -1337,23 +1344,34 @@
   }
 
   REMatch getMatchImpl(CharIndexed input, int anchor, int eflags, StringBuffer 
buffer) {
+      boolean tryEntireMatch = ((eflags & REG_TRY_ENTIRE_MATCH) != 0);
+      RE re = (tryEntireMatch ? (RE) this.clone() : this);
+      if (tryEntireMatch) {
+         re.chain(new RETokenEnd(0, null));
+      }
       // Create a new REMatch to hold results
       REMatch mymatch = new REMatch(numSubs, anchor, eflags);
       do {
          // Optimization: check if anchor + minimumLength > length
          if (minimumLength == 0 || input.charAt(minimumLength-1) != 
CharIndexed.OUT_OF_BOUNDS) {
-             if (match(input, mymatch)) {
-                 // Find longest match of them all to observe leftmost longest
-                 REMatch longest = mymatch;
+             if (re.match(input, mymatch)) {
+                 REMatch best = mymatch;
+                 // We assume that the match that coms first is the best.
+                 // And the following "The longer, the better" rule has
+                 // been commented out. The longest is not neccesarily
+                 // the best. For example, "a" out of "aaa" is the best
+                 // match for /a+?/.
+                 /*
+                 // Find best match of them all to observe leftmost longest
                  while ((mymatch = mymatch.next) != null) {
-                     if (mymatch.index > longest.index) {
-                         longest = mymatch;
+                     if (mymatch.index > best.index) {
+                       best = mymatch;
                      }
                  }
-                 
-                 longest.end[0] = longest.index;
-                 longest.finish(input);
-                 return longest;
+                 */
+                 best.end[0] = best.index;
+                 best.finish(input);
+                 return best;
              }
          }
          mymatch.clear(++anchor);
Index: classpath/gnu/regexp/RETokenOneOf.java
===================================================================
RCS file: /cvsroot/classpath/classpath/gnu/regexp/RETokenOneOf.java,v
retrieving revision 1.3
diff -u -r1.3 RETokenOneOf.java
--- classpath/gnu/regexp/RETokenOneOf.java      23 Jan 2006 13:18:10 -0000      
1.3
+++ classpath/gnu/regexp/RETokenOneOf.java      28 Jan 2006 16:13:19 -0000
@@ -98,7 +98,7 @@
     REMatch last = null;
     REToken tk;
     for (int i=0; i < options.size(); i++) {
-       // In ordaer that the backtracking can work,
+       // In order that the backtracking can work,
        // each option must be chained to the next token.
        // But the chain method has some side effect, so
        // we use clones.
Index: classpath/gnu/regexp/RETokenRepeated.java
===================================================================
RCS file: /cvsroot/classpath/classpath/gnu/regexp/RETokenRepeated.java,v
retrieving revision 1.6
diff -u -r1.6 RETokenRepeated.java
--- classpath/gnu/regexp/RETokenRepeated.java   22 Jan 2006 02:22:21 -0000      
1.6
+++ classpath/gnu/regexp/RETokenRepeated.java   28 Jan 2006 16:13:19 -0000
@@ -113,20 +113,24 @@
        REMatch recurrent;
        int lastIndex = mymatch.index;
 
-       do {
-           // Check for stingy match for each possibility.
-           if ((stingy && (numRepeats >= min)) || alwaysEmpty) {
-               REMatch result = matchRest(input, newMatch);
-               if (result != null) {
-                   mymatch.assignFrom(result);
-                   mymatch.empty = (mymatch.index == origin);
-                   return true;
-               }
-               else {
-               // Special case of {0}. It must always match an empty string.
-                   if (alwaysEmpty) return false;
-               }
+       // {0} needs some special treatment.
+       if (alwaysEmpty) {
+           REMatch result = matchRest(input, newMatch);
+           if (result != null) {
+               mymatch.assignFrom(result);
+               mymatch.empty = (mymatch.index == origin);
+               return true;
            }
+           else {
+               return false;
+           }
+       }
+
+       do {
+           // We want to check something like  
+           //    if (stingy && (numRepeats >= min)
+           // and neglect the further matching.  But experience tells
+           // such neglection may cause incomplete matching.
 
            doables = null;
            doablesLast = null;
@@ -222,8 +226,16 @@
            // desired.
            indexCount = 1;
        }
+       if (stingy) {
+           posIndex = (posIndex - indexCount - 1);
+           numRepeats = min - 1;
+       }
        while (indexCount-- > 0) {
-           --posIndex;
+           if (stingy) {
+               ++posIndex;
+               ++numRepeats;
+           }
+           else --posIndex;
            newMatch = (REMatch) positions.elementAt(posIndex);
            results = matchRest(input, newMatch);
            if (results != null) {
@@ -233,6 +245,7 @@
                } else {
                    // Order these from longest to shortest
                    // Start by assuming longest (more repeats)
+                   // If stingy the order is shortest to longest.
                    allResultsLast.next = results;
                }
                // Find new doablesLast

Reply via email to