Hi,
I am rewriting gnu/regexp/RETokenRepeated.java so that it
performs a depth-first search.
The good news is that we can move a few more cases from
gnu/testlet/java/util/regex/Pattern/testdata2
(tests which fail)
to
gnu/testlet/java/util/regex/Pattern/testdata1
(tests which pass).
The bad news is that the performance has worsened
a bit (although the number of nodes we have to visit
is equal to that of a width-first search, the overhead
of loop control may be heavier, I guess). And there
remains a very difficult case:
/^(b+?|a){1,2}?c/
bbc
0: bbc
1: b
The reluctant operator ? of b+? comes befoe that of
(){1,2}?, so the former should be stronger than the latter.
But in our implementation, b+? belongs to (){1,2}? and the
reluctance of the latter is stronger.
ChangeLog
2006-02-16 Ito Kazumitsu <[EMAIL PROTECTED]>
* gnu/regexp/REMatch.java(matchFlags): New int field used as
option flags passed to match methods.
(MF_FIND_ALL): New flag.
* gnu/regexp/RETokenOneOf.java(matchP): Unless MF_FIND_ALL is set,
do not try other possibilties once a match is found.
* gnu/regexp/RETokenRepeated.java(findDoables): Set MF_FIND_ALL
so that all possibilities can be found.
(match): Rewritten using new methods matchMinimum and _match.
(_match): New method which performs a depth-first recursive search.
(matchMinimum): New method.
(initVisited), (visitedContains), (addVisited): New methods for
manipulating an array of icharacter positions which _match has
already visited.
Index: classpath/gnu/regexp/REMatch.java
===================================================================
RCS file: /cvsroot/classpath/classpath/gnu/regexp/REMatch.java,v
retrieving revision 1.7
diff -u -r1.7 REMatch.java
--- classpath/gnu/regexp/REMatch.java 9 Feb 2006 13:44:59 -0000 1.7
+++ classpath/gnu/regexp/REMatch.java 16 Feb 2006 15:14:35 -0000
@@ -1,5 +1,5 @@
/* gnu/regexp/REMatch.java
- Copyright (C) 1998-2001, 2004 Free Software Foundation, Inc.
+ Copyright (C) 2006 Free Software Foundation, Inc.
This file is part of GNU Classpath.
@@ -69,6 +69,8 @@
REMatch next; // other possibility (to avoid having to use arrays)
boolean empty; // empty string matched. This flag is used only within
// RETokenRepeated.
+ int matchFlags; // flags passed to match methods
+ static final int MF_FIND_ALL = 0x01;
public Object clone() {
try {
Index: classpath/gnu/regexp/RETokenOneOf.java
===================================================================
RCS file: /cvsroot/classpath/classpath/gnu/regexp/RETokenOneOf.java,v
retrieving revision 1.7
diff -u -r1.7 RETokenOneOf.java
--- classpath/gnu/regexp/RETokenOneOf.java 13 Feb 2006 13:19:44 -0000
1.7
+++ classpath/gnu/regexp/RETokenOneOf.java 16 Feb 2006 15:14:35 -0000
@@ -187,6 +187,8 @@
}
private boolean matchP(CharIndexed input, REMatch mymatch, boolean
tryOnly) {
+ boolean stopMatchingIfSatisfied =
+ (mymatch.matchFlags & REMatch.MF_FIND_ALL) == 0;
REMatch.REMatchList newMatch = new REMatch.REMatchList();
REToken tk;
for (int i=0; i < options.size(); i++) {
@@ -204,6 +206,7 @@
if (tk.match(input, tryMatch)) { // match was successful
if (tryOnly) return true;
newMatch.addTail(tryMatch);
+ if (stopMatchingIfSatisfied) break;
} // is a match
} // try next option
if (tryOnly) return false;
Index: classpath/gnu/regexp/RETokenRepeated.java
===================================================================
RCS file: /cvsroot/classpath/classpath/gnu/regexp/RETokenRepeated.java,v
retrieving revision 1.9
diff -u -r1.9 RETokenRepeated.java
--- classpath/gnu/regexp/RETokenRepeated.java 6 Feb 2006 14:03:59 -0000
1.9
+++ classpath/gnu/regexp/RETokenRepeated.java 16 Feb 2006 15:14:35 -0000
@@ -1,5 +1,5 @@
/* gnu/regexp/RETokenRepeated.java
- Copyright (C) 1998-2001, 2004 Free Software Foundation, Inc.
+ Copyright (C) 2006 Free Software Foundation, Inc.
This file is part of GNU Classpath.
@@ -39,20 +39,19 @@
package gnu.regexp;
import java.util.Vector;
+import java.util.Arrays;
final class RETokenRepeated extends REToken {
private REToken token;
private int min,max;
private boolean stingy;
private boolean possessive;
- private boolean alwaysEmpty; // Special case of {0}
RETokenRepeated(int subIndex, REToken token, int min, int max) {
super(subIndex);
this.token = token;
this.min = min;
this.max = max;
- alwaysEmpty = (min == 0 && max == 0);
}
/** Sets the minimal matching mode to true. */
@@ -91,8 +90,6 @@
return (max * tmax);
}
- boolean stopMatchingIfSatisfied = true;
-
private static REMatch findDoables(REToken tk,
CharIndexed input, REMatch mymatch) {
@@ -105,7 +102,11 @@
int origin = recurrent.index;
tk = (REToken) tk.clone();
tk.next = tk.uncle = null;
+ recurrent.matchFlags |= REMatch.MF_FIND_ALL;
if (tk.match(input, recurrent)) {
+ for (REMatch m = recurrent; m != null; m = m.next) {
+ m.matchFlags &= ~REMatch.MF_FIND_ALL;
+ }
if (recurrent.index == origin) recurrent.empty = true;
// add all items in current to doables array
doables.addTail(recurrent);
@@ -123,131 +124,179 @@
// the subexpression back-reference operator allow that?
boolean match(CharIndexed input, REMatch mymatch) {
- // Possible positions for the next repeat to match at
- REMatch newMatch = mymatch;
- // {0} needs some special treatment.
- if (alwaysEmpty) {
- REMatch result = matchRest(input, newMatch);
- if (result != null) {
- mymatch.assignFrom(result);
- return true;
- }
- else {
- return false;
- }
- }
+ boolean stopMatchingIfSatisfied =
+ (mymatch.matchFlags & REMatch.MF_FIND_ALL) == 0;
- // number of times we've matched so far
- int numRepeats = 0;
-
- REMatch doables;
- int lastIndex = mymatch.index;
- boolean emptyMatchFound = false;
+ REMatch newMatch = matchMinimum(input, mymatch);
+ if (newMatch == null) return false;
- while (numRepeats < min) {
- doables = findDoables(token, input, newMatch);
+ // Array of positions we have already visited
+ int[] visited = initVisited();
+ for (REMatch m = newMatch; m != null; m = m.next) {
+ visited = addVisited(m.index, visited);
+ }
- // if none of the possibilities worked out,
- // it means that minimum number of repeats could not be found.
- if (doables == null) return false;
-
- // reassign where the next repeat can match
- newMatch = doables;
-
- // increment how many repeats we've successfully found
- ++numRepeats;
-
- if (newMatch.empty) {
- numRepeats = min;
- emptyMatchFound = true;
- break;
- }
- lastIndex = newMatch.index;
+ int max1 = decreaseMax(max, min);
+
+ newMatch = _match(input, newMatch, max1,
+ stopMatchingIfSatisfied, visited);
+ if (newMatch != null) {
+ mymatch.assignFrom(newMatch);
+ return true;
}
+ return false;
+ }
- Vector positions = new Vector();
+ private static int decreaseMax(int m, int n) {
+ if (m == Integer.MAX_VALUE) return m;
+ return m - n;
+ }
- while (numRepeats <= max) {
- // We want to check something like
- // if (stingy)
- // and neglect the further matching. But experience tells
- // such neglection may cause incomplete matching.
- // For example, if we neglect the seemingly unnecessay
- // matching, /^(b+?|a){1,2}?c/ cannot match "bbc".
- // On the other hand, if we do not stop the unnecessary
- // matching, /(([a-c])b*?\2)*/ matches "ababbbcbc"
- // entirely when we wan to find only "ababb".
- // In order to make regression tests pass, we do as we did.
- if (stopMatchingIfSatisfied && stingy) {
- REMatch results = matchRest(input, newMatch);
- if (results != null) {
- mymatch.assignFrom(results);
- return true;
- }
- }
- positions.add(newMatch);
- if (emptyMatchFound) break;
+ private static int[] initVisited() {
+ int[] visited = new int[32];
+ for (int i = 0; i < visited.length; i++) {
+ visited[i] = Integer.MAX_VALUE;
+ }
+ return visited;
+ }
- doables = findDoables(token, input, newMatch);
- if (doables == null) break;
+ private static boolean visitedContains(int n, int[] visited) {
+ return (Arrays.binarySearch(visited, n) >= 0);
+ }
- // doables.index == lastIndex occurs either
- // (1) when an empty string was the longest
- // that matched this token.
- // or
- // (2) when the same string matches this token many times.
- // For example, "acbab" itself matches "a.*b" and
- // its substrings "acb" and "ab" also match.
- // In this case, we do not have to go further until
- // numRepeats == max because the more numRepeats grows,
- // the shorter the substring matching this token becomes.
- // So the previous succesful match must have bee the best
- // match. But this is not necessarily the case if stingy.
- if (doables.index == lastIndex) {
- if (doables.empty) {
- emptyMatchFound = true;
- }
- else {
- if (!stingy) break;
- }
+ private static int[] addVisited(int n, int[] visited) {
+ int p = Arrays.binarySearch(visited, n);
+ if (p >= 0) return visited;
+ p = -1 - p;
+ if (p >= visited.length) {
+ System.err.println("p=" + p);
+ int[] newvisited = new int[visited.length + 32];
+ System.arraycopy(visited, 0, newvisited, 0, visited.length);
+ for (int i = visited.length; i < newvisited.length; i++) {
+ newvisited[i] = Integer.MAX_VALUE;
}
- numRepeats++;
- newMatch = doables;
- lastIndex = newMatch.index;
+ visited = newvisited;
+ }
+ if (visited[p] != Integer.MAX_VALUE) {
+ System.arraycopy(visited, p, visited, p+1, visited.length - p - 1);
}
+ visited[p] = n;
+ return visited;
+ }
+
+ private REMatch _match(CharIndexed input, REMatch mymatch,
+ int max1, boolean stopMatchingIfSatisfied,
+ int[] visited) {
- // We're greedy, but ease off until a true match is found.
- // At this point we've either got too many or just the right amount.
- // See if this numRepeats works with the rest of the regexp.
+ if (max1 == 0) {
+ return matchRest(input, mymatch);
+ }
+ max1 = decreaseMax(max1, 1);
REMatch.REMatchList allResults = new REMatch.REMatchList();
- int posCount = positions.size();
- int posIndex = (stingy ? 0 : posCount - 1);
+ // Depth-first search
+
+ for (REMatch cur = mymatch; cur != null; cur = cur.next) {
+
+ REMatch cur1 = (REMatch) cur.clone();
- while (posCount-- > 0) {
- REMatch m = (REMatch) positions.elementAt(posIndex);
- if (stingy) posIndex++; else posIndex--;
-
- REMatch results = matchRest(input, m);
- if (results != null) {
- // Order these from longest to shortest
- // Start by assuming longest (more repeats)
- // If stingy the order is shortest to longest.
- allResults.addTail(results);
+ if (stingy) {
+ REMatch results = matchRest(input, cur1);
+ if (results != null) {
+ if (stopMatchingIfSatisfied) {
+ return results;
+ }
+ allResults.addTail(results);
+ }
}
- else {
- if (possessive) break;
+
+ DO_THIS:
+ do {
+
+ boolean emptyMatchFound = false;
+ REMatch doables = findDoables(token, input, cur1);
+ if (doables == null) break DO_THIS;
+ if (doables.empty) emptyMatchFound = true;
+
+ if (!emptyMatchFound) {
+ REMatch.REMatchList list = new REMatch.REMatchList();
+ for (REMatch m = doables; m != null; m = m.next) {
+ REMatch m1 = (REMatch) m.clone();
+ int n = m1.index;
+ if (! visitedContains(n, visited)) {
+ visited = addVisited(n, visited);
+ list.addTail(m1);
+ }
+ }
+ if (list.head == null) break DO_THIS;
+ doables = list.head;
+ }
+
+ for (REMatch m = doables; m != null; m = m.next) {
+ if (! emptyMatchFound) {
+ REMatch m1 = _match(input, m, max1,
+ stopMatchingIfSatisfied, visited);
+ if (possessive) return m1;
+ if (m1 != null) {
+ if (stopMatchingIfSatisfied) {
+ return m1;
+ }
+ allResults.addTail(m1);
+ }
+ }
+ else {
+ REMatch m1 = matchRest(input, m);
+ if (m1 != null) {
+ if (stopMatchingIfSatisfied) {
+ return m1;
+ }
+ allResults.addTail(m1);
+ }
+ }
+ }
+
+ } while (false); // DO_THIS only once;
+
+ // This point itself is a candidate.
+ if (!stingy) {
+ REMatch m2 = matchRest(input, cur1);
+ if (m2 != null) {
+ if (stopMatchingIfSatisfied) {
+ return m2;
+ }
+ allResults.addTail(m2);
+ }
}
}
- if (allResults.head != null) {
- mymatch.assignFrom(allResults.head); // does this get all?
- return true;
+ return allResults.head;
+ }
+
+ private REMatch matchMinimum(CharIndexed input, final REMatch mymatch) {
+ // Possible positions for the next repeat to match at
+ REMatch newMatch = mymatch;
+
+ // number of times we've matched so far
+ int numRepeats = 0;
+
+ while (numRepeats < min) {
+ REMatch doables = findDoables(token, input, newMatch);
+
+ // if none of the possibilities worked out,
+ // it means that minimum number of repeats could not be found.
+ if (doables == null) return null;
+
+ // reassign where the next repeat can match
+ newMatch = doables;
+
+ // increment how many repeats we've successfully found
+ ++numRepeats;
+
+ if (newMatch.empty) break;
}
- // If we fall out, no matches.
- return false;
+ return newMatch;
}
private REMatch matchRest(CharIndexed input, final REMatch newMatch) {