This patch implements the region-based matching methods in java.util.regex. There are still a number of minor bugs, due to missing features in our regex implementation, but I'd prefer to keep those fixes for further patches, as this one already alters things a great deal and does at least implement all the required methods and track the state correctly.
ChangeLog: 2008-05-11 Andrew John Hughes <[EMAIL PROTECTED]> * java/util/regex/Matcher.java: (regionStart): New variable. (regionEnd): Likewise. (transparentBounds): Likewise. (anchoringBounds): Likewise. (Matcher()): Initialise new variables. (find()): Alter to use new settings. (find(int)): Likewise. (lookingAt()): Likewise. (matches()): Likewise. (reset()): Reset region. (reset(CharSequence)): Documented. (toString()): Include new variables. (region(int,int)): Implemented. (regionStart()): Likewise. (regionEnd()): Likewise. (hasTransparentBounds()): Likewise. (useTransparentBounds(boolean)): Likewise. (hasAnchoringBounds()): Likewise. (useAnchoringBounds(boolean)): Likewise. -- Andrew :) Support Free Java! Contribute to GNU Classpath and the OpenJDK http://www.gnu.org/software/classpath http://openjdk.java.net PGP Key: 94EFD9D8 (http://subkeys.pgp.net) Fingerprint = F8EF F1EA 401E 2E60 15FA 7927 142C 2591 94EF D9D8
Index: java/util/regex/Matcher.java =================================================================== RCS file: /sources/classpath/classpath/java/util/regex/Matcher.java,v retrieving revision 1.20 diff -u -r1.20 Matcher.java --- java/util/regex/Matcher.java 16 Mar 2008 22:44:41 -0000 1.20 +++ java/util/regex/Matcher.java 12 May 2008 20:34:34 -0000 @@ -61,11 +61,45 @@ private int appendPosition; private REMatch match; + /** + * The start of the region of the input on which to match. + */ + private int regionStart; + + /** + * The end of the region of the input on which to match. + */ + private int regionEnd; + + /** + * True if the match process should look beyond the + * region marked by regionStart to regionEnd when + * performing lookAhead, lookBehind and boundary + * matching. + */ + private boolean transparentBounds; + + /** + * The flags that affect the anchoring bounds. + * If {@link #hasAnchoringBounds()} is {@code true}, + * the match process will honour the + * anchoring bounds: ^, \A, \Z, \z and $. If + * {@link #hasAnchoringBounds()} is {@code false}, + * the anchors are ignored and appropriate flags, + * stored in this variable, are used to provide this + * behaviour. 
+ */ + private int anchoringBounds; + Matcher(Pattern pattern, CharSequence input) { this.pattern = pattern; this.input = input; this.inputCharIndexed = RE.makeCharIndexed(input, 0); + regionStart = 0; + regionEnd = input.length(); + transparentBounds = false; + anchoringBounds = 0; } /** @@ -127,7 +161,11 @@ public boolean find () { boolean first = (match == null); - match = pattern.getRE().getMatch(inputCharIndexed, position); + if (transparentBounds || (regionStart == 0 && regionEnd == input.length())) + match = pattern.getRE().getMatch(inputCharIndexed, position, anchoringBounds); + else + match = pattern.getRE().getMatch(input.subSequence(regionStart, regionEnd), + position, anchoringBounds); if (match != null) { int endIndex = match.getEndIndex(); @@ -158,7 +196,11 @@ */ public boolean find (int start) { - match = pattern.getRE().getMatch(inputCharIndexed, start); + if (transparentBounds || (regionStart == 0 && regionEnd == input.length())) + match = pattern.getRE().getMatch(inputCharIndexed, start, anchoringBounds); + else + match = pattern.getRE().getMatch(input.subSequence(regionStart, regionEnd), + start, anchoringBounds); if (match != null) { position = match.getEndIndex(); @@ -220,7 +262,12 @@ public boolean lookingAt () { - match = pattern.getRE().getMatch(inputCharIndexed, 0, RE.REG_FIX_STARTING_POSITION, null); + if (transparentBounds || (regionStart == 0 && regionEnd == input.length())) + match = pattern.getRE().getMatch(inputCharIndexed, regionStart, + anchoringBounds|RE.REG_FIX_STARTING_POSITION|RE.REG_ANCHORINDEX); + else + match = pattern.getRE().getMatch(input.subSequence(regionStart, regionEnd), 0, + anchoringBounds|RE.REG_FIX_STARTING_POSITION); if (match != null) { if (match.getStartIndex() == 0) @@ -245,7 +292,12 @@ */ public boolean matches () { - match = pattern.getRE().getMatch(inputCharIndexed, 0, RE.REG_TRY_ENTIRE_MATCH|RE.REG_FIX_STARTING_POSITION, null); + if (transparentBounds || (regionStart == 0 && regionEnd == input.length())) + 
match = pattern.getRE().getMatch(inputCharIndexed, regionStart, + anchoringBounds|RE.REG_TRY_ENTIRE_MATCH|RE.REG_FIX_STARTING_POSITION|RE.REG_ANCHORINDEX); + else + match = pattern.getRE().getMatch(input.subSequence(regionStart, regionEnd), 0, + anchoringBounds|RE.REG_TRY_ENTIRE_MATCH|RE.REG_FIX_STARTING_POSITION); if (match != null) { if (match.getStartIndex() == 0) @@ -267,15 +319,39 @@ return pattern; } + /** + * Resets the internal state of the matcher, including + * resetting the region to its default state of encompassing + * the whole input. The state of {@link #hasTransparentBounds()} + * and {@link #hasAnchoringBounds()} are unaffected. + * + * @return a reference to this matcher. + * @see #regionStart() + * @see #regionEnd() + * @see #hasTransparentBounds() + * @see #hasAnchoringBounds() + */ public Matcher reset () { position = 0; match = null; + regionStart = 0; + regionEnd = input.length(); return this; } /** - * @param input The new input character sequence + * Resets the internal state of the matcher, including + * resetting the region to its default state of encompassing + * the whole input. The state of {@link #hasTransparentBounds()} + * and {@link #hasAnchoringBounds()} are unaffected. + * + * @param input The new input character sequence. + * @return a reference to this matcher. + * @see #regionStart() + * @see #regionEnd() + * @see #hasTransparentBounds() + * @see #hasAnchoringBounds() */ public Matcher reset (CharSequence input) { @@ -285,7 +361,7 @@ } /** - * @returns the index of a capturing group in this matcher's pattern + * @return the index of a capturing group in this matcher's pattern * * @exception IllegalStateException If no match has yet been attempted, * or if the previous match operation failed @@ -314,6 +390,7 @@ /** * @return True if and only if the matcher hit the end of input. 
+ * @since 1.5 */ public boolean hitEnd() { @@ -328,7 +405,9 @@ CPStringBuilder sb = new CPStringBuilder(); sb.append(this.getClass().getName()) .append("[pattern=").append(pattern.pattern()) - .append(" region=").append("0").append(",").append(input.length()) + .append(" region=").append(regionStart).append(",").append(regionEnd) + .append(" anchoringBounds=").append(anchoringBounds == 0) + .append(" transparentBounds=").append(transparentBounds) .append(" lastmatch=").append(match == null ? "" : match.toString()) .append("]"); return sb.toString(); @@ -338,4 +417,177 @@ { if (match == null) throw new IllegalStateException(); } + + /** + * <p> + * Defines the region of the input on which to match. + * By default, the {@link Matcher} attempts to match + * the whole string (from 0 to the length of the input), + * but a region between {@code start} (inclusive) and + * {@code end} (exclusive) on which to match may instead + * be defined using this method. + * </p> + * <p> + * The behaviour of region matching is further affected + * by the use of transparent or opaque bounds (see + * {@link #useTransparentBounds(boolean)}) and whether or not + * anchors ({@code ^} and {@code $}) are in use + * (see {@link #useAnchoringBounds(boolean)}). With transparent + * bounds, the matcher is aware of input outside the bounds + * set by this method, whereas, with opaque bounds (the default) + * only the input within the bounds is used. The use of + * anchors is affected by this setting; with transparent + * bounds, anchors will match the beginning of the real input, + * while with opaque bounds they match the beginning of the + * region. {@link #useAnchoringBounds(boolean)} can be used + * to turn on or off the matching of anchors. + * </p> + * + * @param start the start of the region (inclusive). + * @param end the end of the region (exclusive). + * @return a reference to this matcher. 
+ * @throws IndexOutOfBoundsException if either {@code start} or + * {@code end} are less than zero, + * if either {@code start} or + * {@code end} are greater than the + * length of the input, or if + * {@code start} is greater than + * {@code end}. + * @see #regionStart() + * @see #regionEnd() + * @see #hasTransparentBounds() + * @see #useTransparentBounds(boolean) + * @see #hasAnchoringBounds() + * @see #useAnchoringBounds(boolean) + * @since 1.5 + */ + public Matcher region(int start, int end) + { + int length = input.length(); + if (start < 0) + throw new IndexOutOfBoundsException("The start position was less than zero."); + if (start >= length) + throw new IndexOutOfBoundsException("The start position is after the end of the input."); + if (end < 0) + throw new IndexOutOfBoundsException("The end position was less than zero."); + if (end > length) + throw new IndexOutOfBoundsException("The end position is after the end of the input."); + if (start > end) + throw new IndexOutOfBoundsException("The start position is after the end position."); + reset(); + regionStart = start; + regionEnd = end; + return this; + } + + /** + * The start of the region on which to perform matches (inclusive). + * + * @return the start index of the region. + * @see #region(int,int) + * @see #regionEnd() + * @since 1.5 + */ + public int regionStart() + { + return regionStart; + } + + /** + * The end of the region on which to perform matches (exclusive). + * + * @return the end index of the region. + * @see #region(int,int) + * @see #regionStart() + * @since 1.5 + */ + public int regionEnd() + { + return regionEnd; + } + + /** + * Returns true if the bounds of the region marked by + * {@link #regionStart()} and {@link #regionEnd()} are + * transparent. 
When these bounds are transparent, the + * matching process can look beyond them to perform + * lookahead, lookbehind and boundary matching operations. + * By default, the bounds are opaque. + * + * @return true if the bounds of the matching region are + * transparent. + * @see #useTransparentBounds(boolean) + * @see #region(int,int) + * @see #regionStart() + * @see #regionEnd() + * @since 1.5 + */ + public boolean hasTransparentBounds() + { + return transparentBounds; + } + + /** + * Sets the transparency of the bounds of the region + * marked by {@link #regionStart()} and {@link #regionEnd()}. + * A value of {@code true} makes the bounds transparent, + * so the matcher can see beyond them to perform lookahead, + * lookbehind and boundary matching operations. A value + * of {@code false} (the default) makes the bounds opaque, + * restricting the match to the input region denoted + * by {@link #regionStart()} and {@link #regionEnd()}. + * + * @param transparent true if the bounds should be transparent. + * @return a reference to this matcher. + * @see #hasTransparentBounds() + * @see #region(int,int) + * @see #regionStart() + * @see #regionEnd() + * @since 1.5 + */ + public Matcher useTransparentBounds(boolean transparent) + { + transparentBounds = transparent; + return this; + } + + /** + * Returns true if the matcher will honour the use of + * the anchoring bounds: {@code ^}, {@code \A}, {@code \Z}, + * {@code \z} and {@code $}. By default, the anchors + * are used. Note that the effect of the anchors is + * also affected by {@link #hasTransparentBounds()}. + * + * @return true if the matcher will attempt to match + * the anchoring bounds. 
+ * @see #useAnchoringBounds(boolean) + * @see #hasTransparentBounds() + * @since 1.5 + */ + public boolean hasAnchoringBounds() + { + return anchoringBounds == 0; + } + + /** + * Enables or disables the use of the anchoring bounds: + * {@code ^}, {@code \A}, {@code \Z}, {@code \z} and + * {@code $}. By default, their use is enabled. When + * disabled, the matcher will not attempt to match + * the anchors. + * + * @param useAnchors true if anchoring bounds should be used. + * @return a reference to this matcher. + * @since 1.5 + * @see #hasAnchoringBounds() + */ + public Matcher useAnchoringBounds(boolean useAnchors) + { + if (useAnchors) + anchoringBounds = 0; + else + anchoringBounds = RE.REG_NOTBOL|RE.REG_NOTEOL; + return this; + } + }