The patch proposed on Mon, 23 Jan 2006 has been cancelled,
and here is another patch.
TODO:
This patch cannot make the following tests pass.
/(([a-c])b*?\2)*/
ababbbcbc
0: ababb
1: bb
2: b
gnu.regexp finds "ababbbcbc" instead of "ababb".
/^(b+?|a){1,2}?c/
bbc
0: bbc
1: b
gnu.regexp finds "bbc" but captures "bb" for the group(1).
ChangeLog
2006-01-28 Ito Kazumitsu <[EMAIL PROTECTED]>
Fixes bug #24876
* gnu/regexp/gnu/regexp/RE.java(REG_TRY_ENTIRE_MATCH):
New execution flag.
(getMatchImpl): if REG_TRY_ENTIRE_MATCH is set, add an
implicit RETokenEnd at the end of the regexp chain.
Do not select the longest match, but select the first match.
* gnu/regexp/RETokenOneOf.java: Corrected a typo in a comment.
* gnu/regexp/RETokenRepeated.java (match): Rewrote stingy matching.
Index: classpath/gnu/regexp/RE.java
===================================================================
RCS file: /cvsroot/classpath/classpath/gnu/regexp/RE.java,v
retrieving revision 1.12
diff -u -r1.12 RE.java
--- classpath/gnu/regexp/RE.java 22 Jan 2006 02:22:21 -0000 1.12
+++ classpath/gnu/regexp/RE.java 28 Jan 2006 16:13:19 -0000
@@ -217,6 +217,13 @@
*/
public static final int REG_NO_INTERPOLATE = 128;
+ /**
+ * Execution flag.
+ * Try to match the whole input string. An implicit match-end operator
+ * is added to this regexp.
+ */
+ public static final int REG_TRY_ENTIRE_MATCH = 256;
+
/** Returns a string representing the version of the gnu.regexp package. */
public static final String version() {
return VERSION;
@@ -1337,23 +1344,34 @@
}
REMatch getMatchImpl(CharIndexed input, int anchor, int eflags, StringBuffer
buffer) {
+ boolean tryEntireMatch = ((eflags & REG_TRY_ENTIRE_MATCH) != 0);
+ RE re = (tryEntireMatch ? (RE) this.clone() : this);
+ if (tryEntireMatch) {
+ re.chain(new RETokenEnd(0, null));
+ }
// Create a new REMatch to hold results
REMatch mymatch = new REMatch(numSubs, anchor, eflags);
do {
// Optimization: check if anchor + minimumLength > length
if (minimumLength == 0 || input.charAt(minimumLength-1) !=
CharIndexed.OUT_OF_BOUNDS) {
- if (match(input, mymatch)) {
- // Find longest match of them all to observe leftmost longest
- REMatch longest = mymatch;
+ if (re.match(input, mymatch)) {
+ REMatch best = mymatch;
+ // We assume that the match that coms first is the best.
+ // And the following "The longer, the better" rule has
+ // been commented out. The longest is not neccesarily
+ // the best. For example, "a" out of "aaa" is the best
+ // match for /a+?/.
+ /*
+ // Find best match of them all to observe leftmost longest
while ((mymatch = mymatch.next) != null) {
- if (mymatch.index > longest.index) {
- longest = mymatch;
+ if (mymatch.index > best.index) {
+ best = mymatch;
}
}
-
- longest.end[0] = longest.index;
- longest.finish(input);
- return longest;
+ */
+ best.end[0] = best.index;
+ best.finish(input);
+ return best;
}
}
mymatch.clear(++anchor);
Index: classpath/gnu/regexp/RETokenOneOf.java
===================================================================
RCS file: /cvsroot/classpath/classpath/gnu/regexp/RETokenOneOf.java,v
retrieving revision 1.3
diff -u -r1.3 RETokenOneOf.java
--- classpath/gnu/regexp/RETokenOneOf.java 23 Jan 2006 13:18:10 -0000
1.3
+++ classpath/gnu/regexp/RETokenOneOf.java 28 Jan 2006 16:13:19 -0000
@@ -98,7 +98,7 @@
REMatch last = null;
REToken tk;
for (int i=0; i < options.size(); i++) {
- // In ordaer that the backtracking can work,
+ // In order that the backtracking can work,
// each option must be chained to the next token.
// But the chain method has some side effect, so
// we use clones.
Index: classpath/gnu/regexp/RETokenRepeated.java
===================================================================
RCS file: /cvsroot/classpath/classpath/gnu/regexp/RETokenRepeated.java,v
retrieving revision 1.6
diff -u -r1.6 RETokenRepeated.java
--- classpath/gnu/regexp/RETokenRepeated.java 22 Jan 2006 02:22:21 -0000
1.6
+++ classpath/gnu/regexp/RETokenRepeated.java 28 Jan 2006 16:13:19 -0000
@@ -113,20 +113,24 @@
REMatch recurrent;
int lastIndex = mymatch.index;
- do {
- // Check for stingy match for each possibility.
- if ((stingy && (numRepeats >= min)) || alwaysEmpty) {
- REMatch result = matchRest(input, newMatch);
- if (result != null) {
- mymatch.assignFrom(result);
- mymatch.empty = (mymatch.index == origin);
- return true;
- }
- else {
- // Special case of {0}. It must always match an empty string.
- if (alwaysEmpty) return false;
- }
+ // {0} needs some special treatment.
+ if (alwaysEmpty) {
+ REMatch result = matchRest(input, newMatch);
+ if (result != null) {
+ mymatch.assignFrom(result);
+ mymatch.empty = (mymatch.index == origin);
+ return true;
}
+ else {
+ return false;
+ }
+ }
+
+ do {
+ // We want to check something like
+ // if (stingy && (numRepeats >= min)
+ // and neglect the further matching. But experience tells
+ // such neglection may cause incomplete matching.
doables = null;
doablesLast = null;
@@ -222,8 +226,16 @@
// desired.
indexCount = 1;
}
+ if (stingy) {
+ posIndex = (posIndex - indexCount - 1);
+ numRepeats = min - 1;
+ }
while (indexCount-- > 0) {
- --posIndex;
+ if (stingy) {
+ ++posIndex;
+ ++numRepeats;
+ }
+ else --posIndex;
newMatch = (REMatch) positions.elementAt(posIndex);
results = matchRest(input, newMatch);
if (results != null) {
@@ -233,6 +245,7 @@
} else {
// Order these from longest to shortest
// Start by assuming longest (more repeats)
+ // If stingy the order is shortest to longest.
allResultsLast.next = results;
}
// Find new doablesLast