This change does not affect the functionality, but hopefully
improves the readability of source codes.
ChangeLog
2006-01-31 Ito Kazumitsu <[EMAIL PROTECTED]>
* gnu/regexp/REMatch.java(REMatchList): New inner utility class
for making a list of REMatch instances.
* gnu/regexp/RETokenOneOf.java(match): Rewritten using REMatchList.
* gnu/regexp/RETokenRepeated.java(findDoables): New method.
(match): Rewritten using REMatchList.
(matchRest): Rewritten using REMatchList.
Index: classpath/gnu/regexp/REMatch.java
===================================================================
RCS file: /cvsroot/classpath/classpath/gnu/regexp/REMatch.java,v
retrieving revision 1.5
diff -u -r1.5 REMatch.java
--- classpath/gnu/regexp/REMatch.java 31 Jan 2006 14:57:43 -0000 1.5
+++ classpath/gnu/regexp/REMatch.java 31 Jan 2006 16:01:40 -0000
@@ -264,4 +264,42 @@
if (pos < input.length()) output.append(input.charAt(pos));
return output.toString();
}
+
+ static class REMatchList {
+ REMatch head;
+ REMatch tail;
+ REMatchList() {
+ head = tail = null;
+ }
+ /* Not used now. But we may need this some day?
+ void addHead(REMatch newone) {
+ if (head == null) {
+ head = newone;
+ tail = newone;
+ while (tail.next != null) {
+ tail = tail.next;
+ }
+ }
+ else {
+ REMatch tmp = newone;
+ while (tmp.next != null) tmp = tmp.next;
+ tmp.next = head;
+ head = newone;
+ }
+ }
+ */
+ void addTail(REMatch newone) {
+ if (head == null) {
+ head = newone;
+ tail = newone;
+ }
+ else {
+ tail.next = newone;
+ }
+ while (tail.next != null) {
+ tail = tail.next;
+ }
+ }
+ }
+
}
Index: classpath/gnu/regexp/RETokenOneOf.java
===================================================================
RCS file: /cvsroot/classpath/classpath/gnu/regexp/RETokenOneOf.java,v
retrieving revision 1.4
diff -u -r1.4 RETokenOneOf.java
--- classpath/gnu/regexp/RETokenOneOf.java 30 Jan 2006 12:35:54 -0000
1.4
+++ classpath/gnu/regexp/RETokenOneOf.java 31 Jan 2006 16:01:40 -0000
@@ -94,8 +94,7 @@
}
private boolean matchP(CharIndexed input, REMatch mymatch) {
- REMatch newMatch = null;
- REMatch last = null;
+ REMatch.REMatchList newMatch = new REMatch.REMatchList();
REToken tk;
for (int i=0; i < options.size(); i++) {
// In order that the backtracking can work,
@@ -108,21 +107,15 @@
tk.subIndex = this.subIndex;
REMatch tryMatch = (REMatch) mymatch.clone();
if (tk.match(input, tryMatch)) { // match was successful
- if (last == null) {
- newMatch = tryMatch;
- last = tryMatch;
- } else {
- last.next = tryMatch;
- last = tryMatch;
- }
+ newMatch.addTail(tryMatch);
} // is a match
} // try next option
- if (newMatch != null) {
+ if (newMatch.head != null) {
// set contents of mymatch equal to newMatch
// try each one that matched
- mymatch.assignFrom(newMatch);
+ mymatch.assignFrom(newMatch.head);
return true;
} else {
return false;
Index: classpath/gnu/regexp/RETokenRepeated.java
===================================================================
RCS file: /cvsroot/classpath/classpath/gnu/regexp/RETokenRepeated.java,v
retrieving revision 1.7
diff -u -r1.7 RETokenRepeated.java
--- classpath/gnu/regexp/RETokenRepeated.java 30 Jan 2006 12:35:54 -0000
1.7
+++ classpath/gnu/regexp/RETokenRepeated.java 31 Jan 2006 16:01:40 -0000
@@ -86,6 +86,27 @@
boolean stopMatchingIfSatisfied = true;
+ private static REMatch findDoables(REToken tk,
+ CharIndexed input, REMatch mymatch) {
+
+ REMatch.REMatchList doables = new REMatch.REMatchList();
+
+ // try next repeat at all possible positions
+ for (REMatch current = mymatch;
+ current != null; current = current.next) {
+ REMatch recurrent = (REMatch) current.clone();
+ int origin = recurrent.index;
+ tk = (REToken) tk.clone();
+ tk.next = tk.uncle = null;
+ if (tk.match(input, recurrent)) {
+ if (recurrent.index == origin) recurrent.empty = true;
+ // add all items in current to doables array
+ doables.addTail(recurrent);
+ }
+ }
+ return doables.head;
+ }
+
// We do need to save every possible point, but the number of clone()
// invocations here is really a killer for performance on non-stingy
// repeat operators. I'm open to suggestions...
@@ -95,24 +116,8 @@
// the subexpression back-reference operator allow that?
boolean match(CharIndexed input, REMatch mymatch) {
- // number of times we've matched so far
- int numRepeats = 0;
-
// Possible positions for the next repeat to match at
REMatch newMatch = mymatch;
- REMatch last = null;
- REMatch current;
-
- // Add the '0-repeats' index
- // positions.elementAt(z) == position [] in input after <<z>> matches
- Vector positions = new Vector();
- positions.addElement(newMatch);
-
- // Declare variables used in loop
- REMatch doables;
- REMatch doablesLast;
- REMatch recurrent;
- int lastIndex = mymatch.index;
// {0} needs some special treatment.
if (alwaysEmpty) {
@@ -126,9 +131,39 @@
}
}
- do {
+ // number of times we've matched so far
+ int numRepeats = 0;
+
+ REMatch doables;
+ int lastIndex = mymatch.index;
+ boolean emptyMatchFound = false;
+
+ while (numRepeats < min) {
+ doables = findDoables(token, input, newMatch);
+
+ // if none of the possibilities worked out,
+ // it means that minimum number of repeats could not be found.
+ if (doables == null) return false;
+
+ // reassign where the next repeat can match
+ newMatch = doables;
+
+ // increment how many repeats we've successfully found
+ ++numRepeats;
+
+ if (newMatch.empty) {
+ numRepeats = min;
+ emptyMatchFound = true;
+ break;
+ }
+ lastIndex = newMatch.index;
+ }
+
+ Vector positions = new Vector();
+
+ while (numRepeats <= max) {
// We want to check something like
- // if (stingy && (numRepeats >= min))
+ // if (stingy)
// and neglect the further matching. But experience tells
// such neglection may cause incomplete matching.
// For example, if we neglect the seemingly unnecessay
@@ -137,138 +172,71 @@
// matching, /(([a-c])b*?\2)*/ matches "ababbbcbc"
// entirely when we wan to find only "ababb".
// In order to make regression tests pass, we do as we did.
- if (stopMatchingIfSatisfied && stingy && (numRepeats >= min)) {
- REMatch result = matchRest(input, newMatch);
- if (result != null) {
- mymatch.assignFrom(result);
- return true;
- }
- }
-
- doables = null;
- doablesLast = null;
-
- // try next repeat at all possible positions
- for (current = newMatch; current != null; current = current.next) {
- recurrent = (REMatch) current.clone();
- int origin = recurrent.index;
- if (token.match(input, recurrent)) {
- if (recurrent.index == origin) recurrent.empty = true;
- // add all items in current to doables array
- if (doables == null) {
- doables = recurrent;
- doablesLast = recurrent;
- } else {
- // Order these from longest to shortest
- // Start by assuming longest (more repeats)
- doablesLast.next = recurrent;
- }
- // Find new doablesLast
- while (doablesLast.next != null) {
- doablesLast = doablesLast.next;
- }
+ if (stopMatchingIfSatisfied && stingy) {
+ REMatch results = matchRest(input, newMatch);
+ if (results != null) {
+ mymatch.assignFrom(results);
+ return true;
}
}
- // if none of the possibilities worked out, break out of do/while
+ positions.add(newMatch);
+ if (emptyMatchFound) break;
+
+ doables = findDoables(token, input, newMatch);
if (doables == null) break;
-
- // reassign where the next repeat can match
- newMatch = doables;
-
- // increment how many repeats we've successfully found
- ++numRepeats;
-
- positions.addElement(newMatch);
// doables.index == lastIndex occurs either
// (1) when an empty string was the longest
// that matched this token.
- // And this case occurs either
- // (1-1) when this token is always empty,
- // for example "()" or "(())".
- // (1-2) when this token is not always empty
- // but can match an empty string, for example,
- // "a*", "(a|)".
// or
// (2) when the same string matches this token many times.
// For example, "acbab" itself matches "a.*b" and
// its substrings "acb" and "ab" also match.
+ // In this case, we do not have to go further until
+ // numRepeats == max because the more numRepeats grows,
+ // the shorter the substring matching this token becomes.
+ // So the previous succesful match must have bee the best
+ // match. But this is not necessarily the case if stingy.
if (doables.index == lastIndex) {
if (doables.empty) {
- // Case (1): We break here, otherwise we will fall
- // into an endless loop.
- if (numRepeats < min) numRepeats = min;
- break;
- }
+ emptyMatchFound = true;
+ }
else {
- // Case (2): We cannot break here because, for example,
- // "acbacb" matches "a.*b" up to 2 times but
- // not 3 times. So we have to check numRepeats >= min.
- // But we do not have to go further until numRepeats == max
- // because the more numRepeats grows, the shorter the
- // substring matching this token becomes.
- if (numRepeats > min) {
- // This means the previous match was successful,
- // and that must be the best match. This match
- // resulted in shortening the matched substring.
- numRepeats--;
- positions.remove(positions.size() - 1);
- break;
- }
- if (numRepeats == min) break;
+ if (!stingy) break;
}
- }
- lastIndex = doables.index;
- } while (numRepeats < max);
-
- // If there aren't enough repeats, then fail
- if (numRepeats < min) return false;
-
- // We're greedy, but ease off until a true match is found
- int posIndex = positions.size();
-
+ }
+ numRepeats++;
+ newMatch = doables;
+ lastIndex = newMatch.index;
+ }
+
+ // We're greedy, but ease off until a true match is found.
// At this point we've either got too many or just the right amount.
// See if this numRepeats works with the rest of the regexp.
- REMatch allResults = null;
- REMatch allResultsLast = null;
- REMatch results = null;
- int indexCount = posIndex - min;
- if (indexCount <= 0) {
- // This case occurs when we exited the previous do loop before
- // numRepeats >= min because an empty string matched the token.
- // In this case, an empty string can match as many times as
- // desired.
- indexCount = 1;
- }
- if (stingy) {
- posIndex = (posIndex - indexCount - 1);
- }
- while (indexCount-- > 0) {
- if (stingy) ++posIndex; else --posIndex;
- newMatch = (REMatch) positions.elementAt(posIndex);
- results = matchRest(input, newMatch);
- if (results != null) {
- if (allResults == null) {
- allResults = results;
- allResultsLast = results;
- } else {
- // Order these from longest to shortest
- // Start by assuming longest (more repeats)
- // If stingy the order is shortest to longest.
- allResultsLast.next = results;
- }
- // Find new doablesLast
- while (allResultsLast.next != null) {
- allResultsLast = allResultsLast.next;
- }
+ REMatch.REMatchList allResults = new REMatch.REMatchList();
+
+ int posCount = positions.size();
+ int posIndex = (stingy ? 0 : posCount - 1);
+
+ while (posCount-- > 0) {
+ REMatch m = (REMatch) positions.elementAt(posIndex);
+ if (stingy) posIndex++; else posIndex--;
+
+ REMatch results = matchRest(input, m);
+ if (results != null) {
+ // Order these from longest to shortest
+ // Start by assuming longest (more repeats)
+ // If stingy the order is shortest to longest.
+ allResults.addTail(results);
+ }
+ else {
+ if (possessive) break;
}
- // else did not match rest of the tokens, try again on smaller
sample
- // or break out when performing possessive matching
- if (possessive) break;
}
- if (allResults != null) {
- mymatch.assignFrom(allResults); // does this get all?
+
+ if (allResults.head != null) {
+ mymatch.assignFrom(allResults.head); // does this get all?
return true;
}
// If we fall out, no matches.
@@ -277,27 +245,17 @@
private REMatch matchRest(CharIndexed input, final REMatch newMatch) {
REMatch current, single;
- REMatch doneIndex = null;
- REMatch doneIndexLast = null;
+ REMatch.REMatchList doneIndex = new REMatch.REMatchList();
// Test all possible matches for this number of repeats
for (current = newMatch; current != null; current = current.next) {
// clone() separates a single match from the chain
single = (REMatch) current.clone();
if (next(input, single)) {
// chain results to doneIndex
- if (doneIndex == null) {
- doneIndex = single;
- doneIndexLast = single;
- } else {
- doneIndexLast.next = single;
- }
- // Find new doneIndexLast
- while (doneIndexLast.next != null) {
- doneIndexLast = doneIndexLast.next;
- }
+ doneIndex.addTail(single);
}
}
- return doneIndex;
+ return doneIndex.head;
}
void dump(StringBuffer os) {