This patch implements the region-based matching
methods in java.util.regex. There are still a number
of minor bugs, due to missing features in our regex
implementation, but I'd prefer to keep those fixes
for further patches, as this one already alters things
a great deal and does at least implement all the required
methods and track the state correctly.
ChangeLog:
2008-05-11 Andrew John Hughes <[EMAIL PROTECTED]>
* java/util/regex/Matcher.java:
(regionStart): New variable.
(regionEnd): Likewise.
(transparentBounds): Likewise.
(anchoringBounds): Likewise.
(Matcher()): Initialise new variables.
(find()): Alter to use new settings.
(find(int)): Likewise.
(lookingAt()): Likewise.
(matches()): Likewise.
(reset()): Reset region.
(reset(CharSequence)): Documented.
(toString()): Include new variables.
(region(int,int)): Implemented.
(regionStart()): Likewise.
(regionEnd()): Likewise.
(hasTransparentBounds()): Likewise.
(useTransparentBounds(boolean)): Likewise.
(hasAnchoringBounds()): Likewise.
(useAnchoringBounds(boolean)): Likewise.
--
Andrew :)
Support Free Java!
Contribute to GNU Classpath and the OpenJDK
http://www.gnu.org/software/classpath
http://openjdk.java.net
PGP Key: 94EFD9D8 (http://subkeys.pgp.net)
Fingerprint = F8EF F1EA 401E 2E60 15FA 7927 142C 2591 94EF D9D8
Index: java/util/regex/Matcher.java
===================================================================
RCS file: /sources/classpath/classpath/java/util/regex/Matcher.java,v
retrieving revision 1.20
diff -u -r1.20 Matcher.java
--- java/util/regex/Matcher.java 16 Mar 2008 22:44:41 -0000 1.20
+++ java/util/regex/Matcher.java 12 May 2008 20:34:34 -0000
@@ -61,11 +61,45 @@
private int appendPosition;
private REMatch match;
+ /**
+ * The start of the region of the input on which to match.
+ */
+ private int regionStart;
+
+ /**
+ * The end of the region of the input on which to match.
+ */
+ private int regionEnd;
+
+ /**
+ * True if the match process should look beyond the
+ * region marked by regionStart to regionEnd when
+ * performing lookAhead, lookBehind and boundary
+ * matching.
+ */
+ private boolean transparentBounds;
+
+ /**
+ * The flags that affect the anchoring bounds.
+ * If {@link #hasAnchoringBounds()} is {@code true},
+ * the match process will honour the
+ * anchoring bounds: ^, \A, \Z, \z and $. If
+ * {@link #hasAnchoringBounds()} is {@code false},
+ * the anchors are ignored and appropriate flags,
+ * stored in this variable, are used to provide this
+ * behaviour.
+ */
+ private int anchoringBounds;
+
Matcher(Pattern pattern, CharSequence input)
{
this.pattern = pattern;
this.input = input;
this.inputCharIndexed = RE.makeCharIndexed(input, 0);
+ regionStart = 0;
+ regionEnd = input.length();
+ transparentBounds = false;
+ anchoringBounds = 0;
}
/**
@@ -127,7 +161,11 @@
public boolean find ()
{
boolean first = (match == null);
- match = pattern.getRE().getMatch(inputCharIndexed, position);
+ if (transparentBounds || (regionStart == 0 && regionEnd == input.length()))
+ match = pattern.getRE().getMatch(inputCharIndexed, position,
anchoringBounds);
+ else
+ match = pattern.getRE().getMatch(input.subSequence(regionStart,
regionEnd),
+ position, anchoringBounds);
if (match != null)
{
int endIndex = match.getEndIndex();
@@ -158,7 +196,11 @@
*/
public boolean find (int start)
{
- match = pattern.getRE().getMatch(inputCharIndexed, start);
+ if (transparentBounds || (regionStart == 0 && regionEnd == input.length()))
+ match = pattern.getRE().getMatch(inputCharIndexed, start,
anchoringBounds);
+ else
+ match = pattern.getRE().getMatch(input.subSequence(regionStart,
regionEnd),
+ start, anchoringBounds);
if (match != null)
{
position = match.getEndIndex();
@@ -220,7 +262,12 @@
public boolean lookingAt ()
{
- match = pattern.getRE().getMatch(inputCharIndexed, 0,
RE.REG_FIX_STARTING_POSITION, null);
+ if (transparentBounds || (regionStart == 0 && regionEnd == input.length()))
+ match = pattern.getRE().getMatch(inputCharIndexed, regionStart,
+
anchoringBounds|RE.REG_FIX_STARTING_POSITION|RE.REG_ANCHORINDEX);
+ else
+ match = pattern.getRE().getMatch(input.subSequence(regionStart,
regionEnd), 0,
+
anchoringBounds|RE.REG_FIX_STARTING_POSITION);
if (match != null)
{
if (match.getStartIndex() == 0)
@@ -245,7 +292,12 @@
*/
public boolean matches ()
{
- match = pattern.getRE().getMatch(inputCharIndexed, 0,
RE.REG_TRY_ENTIRE_MATCH|RE.REG_FIX_STARTING_POSITION, null);
+ if (transparentBounds || (regionStart == 0 && regionEnd == input.length()))
+ match = pattern.getRE().getMatch(inputCharIndexed, regionStart,
+
anchoringBounds|RE.REG_TRY_ENTIRE_MATCH|RE.REG_FIX_STARTING_POSITION|RE.REG_ANCHORINDEX);
+ else
+ match = pattern.getRE().getMatch(input.subSequence(regionStart,
regionEnd), 0,
+
anchoringBounds|RE.REG_TRY_ENTIRE_MATCH|RE.REG_FIX_STARTING_POSITION);
if (match != null)
{
if (match.getStartIndex() == 0)
@@ -267,15 +319,39 @@
return pattern;
}
+ /**
+ * Resets the internal state of the matcher, including
+ * resetting the region to its default state of encompassing
+ * the whole input. The state of {@link #hasTransparentBounds()}
+ * and {@link #hasAnchoringBounds()} are unaffected.
+ *
+ * @return a reference to this matcher.
+ * @see #regionStart()
+ * @see #regionEnd()
+ * @see #hasTransparentBounds()
+ * @see #hasAnchoringBounds()
+ */
public Matcher reset ()
{
position = 0;
match = null;
+ regionStart = 0;
+ regionEnd = input.length();
return this;
}
/**
- * @param input The new input character sequence
+ * Resets the internal state of the matcher, including
+ * resetting the region to its default state of encompassing
+ * the whole input. The state of {@link #hasTransparentBounds()}
+ * and {@link #hasAnchoringBounds()} are unaffected.
+ *
+ * @param input The new input character sequence.
+ * @return a reference to this matcher.
+ * @see #regionStart()
+ * @see #regionEnd()
+ * @see #hasTransparentBounds()
+ * @see #hasAnchoringBounds()
*/
public Matcher reset (CharSequence input)
{
@@ -285,7 +361,7 @@
}
/**
- * @returns the index of a capturing group in this matcher's pattern
+ * @return the index of a capturing group in this matcher's pattern
*
* @exception IllegalStateException If no match has yet been attempted,
* or if the previous match operation failed
@@ -314,6 +390,7 @@
/**
* @return True if and only if the matcher hit the end of input.
+ * @since 1.5
*/
public boolean hitEnd()
{
@@ -328,7 +405,9 @@
CPStringBuilder sb = new CPStringBuilder();
sb.append(this.getClass().getName())
.append("[pattern=").append(pattern.pattern())
- .append(" region=").append("0").append(",").append(input.length())
+ .append(" region=").append(regionStart).append(",").append(regionEnd)
+ .append(" anchoringBounds=").append(anchoringBounds == 0)
+ .append(" transparentBounds=").append(transparentBounds)
.append(" lastmatch=").append(match == null ? "" : match.toString())
.append("]");
return sb.toString();
@@ -338,4 +417,177 @@
{
if (match == null) throw new IllegalStateException();
}
+
+ /**
+ * <p>
+ * Defines the region of the input on which to match.
+ * By default, the {@link Matcher} attempts to match
+ * the whole string (from 0 to the length of the input),
+ * but a region between {@code start} (inclusive) and
+ * {@code end} (exclusive) on which to match may instead
+ * be defined using this method.
+ * </p>
+ * <p>
+ * The behaviour of region matching is further affected
+ * by the use of transparent or opaque bounds (see
+ * {@link #useTransparentBounds(boolean)}) and whether or not
+ * anchors ({@code ^} and {@code $}) are in use
+ * (see {@link #useAnchoringBounds(boolean)}). With transparent
+ * bounds, the matcher is aware of input outside the bounds
+ * set by this method, whereas, with opaque bounds (the default)
+ * only the input within the bounds is used. The use of
+ * anchors is affected by this setting; with transparent
+ * bounds, anchors will match the beginning of the real input,
+ * while with opaque bounds they match the beginning of the
+ * region. {@link #useAnchoringBounds(boolean)} can be used
+ * to turn on or off the matching of anchors.
+ * </p>
+ *
+ * @param start the start of the region (inclusive).
+ * @param end the end of the region (exclusive).
+ * @return a reference to this matcher.
+ * @throws IndexOutOfBoundsException if either {@code start} or
+ * {@code end} are less than zero,
+ * if either {@code start} or
+ * {@code end} are greater than the
+ * length of the input, or if
+ * {@code start} is greater than
+ * {@code end}.
+ * @see #regionStart()
+ * @see #regionEnd()
+ * @see #hasTransparentBounds()
+ * @see #useTransparentBounds(boolean)
+ * @see #hasAnchoringBounds()
+ * @see #useAnchoringBounds(boolean)
+ * @since 1.5
+ */
+ public Matcher region(int start, int end)
+ {
+   int length = input.length();
+   if (start < 0)
+     throw new IndexOutOfBoundsException("The start position was less than zero.");
+   if (start > length) // start == length is legal: an empty region at the end of the input
+     throw new IndexOutOfBoundsException("The start position is after the end of the input.");
+   if (end < 0)
+     throw new IndexOutOfBoundsException("The end position was less than zero.");
+   if (end > length)
+     throw new IndexOutOfBoundsException("The end position is after the end of the input.");
+   if (start > end)
+     throw new IndexOutOfBoundsException("The start position is after the end position.");
+   reset();
+   regionStart = start;
+   regionEnd = end;
+   return this;
+ }
+
+ /**
+ * Returns the start of the region on which to perform matches (inclusive).
+ *
+ * @return the start index of the region.
+ * @see #region(int,int)
+ * @see #regionEnd()
+ * @since 1.5
+ */
+ public int regionStart()
+ {
+ return regionStart;
+ }
+
+ /**
+ * Returns the end of the region on which to perform matches (exclusive).
+ *
+ * @return the end index of the region.
+ * @see #region(int,int)
+ * @see #regionStart()
+ * @since 1.5
+ */
+ public int regionEnd()
+ {
+ return regionEnd;
+ }
+
+ /**
+ * Returns true if the bounds of the region marked by
+ * {@link #regionStart()} and {@link #regionEnd()} are
+ * transparent. When these bounds are transparent, the
+ * matching process can look beyond them to perform
+ * lookahead, lookbehind and boundary matching operations.
+ * By default, the bounds are opaque.
+ *
+ * @return true if the bounds of the matching region are
+ * transparent.
+ * @see #useTransparentBounds(boolean)
+ * @see #region(int,int)
+ * @see #regionStart()
+ * @see #regionEnd()
+ * @since 1.5
+ */
+ public boolean hasTransparentBounds()
+ {
+ return transparentBounds;
+ }
+
+ /**
+ * Sets the transparency of the bounds of the region
+ * marked by {@link #regionStart()} and {@link #regionEnd()}.
+ * A value of {@code true} makes the bounds transparent,
+ * so the matcher can see beyond them to perform lookahead,
+ * lookbehind and boundary matching operations. A value
+ * of {@code false} (the default) makes the bounds opaque,
+ * restricting the match to the input region denoted
+ * by {@link #regionStart()} and {@link #regionEnd()}.
+ *
+ * @param transparent true if the bounds should be transparent.
+ * @return a reference to this matcher.
+ * @see #hasTransparentBounds()
+ * @see #region(int,int)
+ * @see #regionStart()
+ * @see #regionEnd()
+ * @since 1.5
+ */
+ public Matcher useTransparentBounds(boolean transparent)
+ {
+ transparentBounds = transparent;
+ return this;
+ }
+
+ /**
+ * Returns true if the matcher will honour the use of
+ * the anchoring bounds: {@code ^}, {@code \A}, {@code \Z},
+ * {@code \z} and {@code $}. By default, the anchors
+ * are used. Note that the effect of the anchors is
+ * also affected by {@link #hasTransparentBounds()}.
+ *
+ * @return true if the matcher will attempt to match
+ * the anchoring bounds.
+ * @see #useAnchoringBounds(boolean)
+ * @see #hasTransparentBounds()
+ * @since 1.5
+ */
+ public boolean hasAnchoringBounds()
+ {
+ return anchoringBounds == 0;
+ }
+
+ /**
+ * Enables or disables the use of the anchoring bounds:
+ * {@code ^}, {@code \A}, {@code \Z}, {@code \z} and
+ * {@code $}. By default, their use is enabled. When
+ * disabled, the matcher will not attempt to match
+ * the anchors.
+ *
+ * @param useAnchors true if anchoring bounds should be used.
+ * @return a reference to this matcher.
+ * @since 1.5
+ * @see #hasAnchoringBounds()
+ */
+ public Matcher useAnchoringBounds(boolean useAnchors)
+ {
+ if (useAnchors)
+ anchoringBounds = 0;
+ else
+ anchoringBounds = RE.REG_NOTBOL|RE.REG_NOTEOL;
+ return this;
+ }
+
}