Author: vgritsenko Date: Thu Aug 11 20:04:07 2005 New Revision: 232192 URL: http://svn.apache.org/viewcvs?rev=232192&view=rev Log: Applied patches for Bug #27795: Add optimization for regexps which start with ^ (BOL)
Modified: jakarta/regexp/trunk/src/java/org/apache/regexp/RE.java jakarta/regexp/trunk/src/java/org/apache/regexp/REProgram.java jakarta/regexp/trunk/src/java/org/apache/regexp/RETest.java jakarta/regexp/trunk/xdocs/changes.xml Modified: jakarta/regexp/trunk/src/java/org/apache/regexp/RE.java URL: http://svn.apache.org/viewcvs/jakarta/regexp/trunk/src/java/org/apache/regexp/RE.java?rev=232192&r1=232191&r2=232192&view=diff ============================================================================== --- jakarta/regexp/trunk/src/java/org/apache/regexp/RE.java (original) +++ jakarta/regexp/trunk/src/java/org/apache/regexp/RE.java Thu Aug 11 20:04:07 2005 @@ -1414,6 +1414,43 @@ // Save string to search this.search = search; + // Can we optimize the search by looking for new lines? + if ((program.flags & REProgram.OPT_HASBOL) == REProgram.OPT_HASBOL) + { + // Non multi-line matching with BOL: Must match at '0' index + if ((matchFlags & MATCH_MULTILINE) == 0) + { + return i == 0 && matchAt(i); + } + + // Multi-line matching with BOL: Seek to next line + for ( ;! search.isEnd(i); i++) + { + // Skip if we are at the beginning of the line + if (isNewline(i)) + { + continue; + } + + // Match at the beginning of the line + if (matchAt(i)) + { + return true; + } + + // Skip to the end of line + for ( ;! search.isEnd(i); i++) + { + if (isNewline(i)) + { + break; + } + } + } + + return false; + } + // Can we optimize the search by looking for a prefix string? if (program.prefix == null) { Modified: jakarta/regexp/trunk/src/java/org/apache/regexp/REProgram.java URL: http://svn.apache.org/viewcvs/jakarta/regexp/trunk/src/java/org/apache/regexp/REProgram.java?rev=232192&r1=232191&r2=232192&view=diff ============================================================================== --- jakarta/regexp/trunk/src/java/org/apache/regexp/REProgram.java (original) +++ jakarta/regexp/trunk/src/java/org/apache/regexp/REProgram.java Thu Aug 11 20:04:07 2005 @@ -33,6 +33,7 @@ public class REProgram implements Serializable { static final int OPT_HASBACKREFS = 1; + static final int OPT_HASBOL = 2; char[] instruction; // The compiled regular expression 'program' int lenInstruction; // The amount of the instruction buffer in use @@ -81,7 +82,7 @@ // Ensure program has been compiled! if (lenInstruction != 0) { - // Return copy of program + // Return copy of program char[] ret = new char[lenInstruction]; System.arraycopy(instruction, 0, ret, 0, lenInstruction); return ret; @@ -116,16 +117,23 @@ if (lenInstruction >= RE.nodeSize && instruction[0 + RE.offsetOpcode] == RE.OP_BRANCH) { // to the end node - int next = instruction[0 + RE.offsetNext]; - if (instruction[next + RE.offsetOpcode] == RE.OP_END) + char next = instruction[0 + RE.offsetNext]; + if (instruction[next + RE.offsetOpcode] == RE.OP_END && lenInstruction >= (RE.nodeSize * 2)) { - // and the branch starts with an atom - if (lenInstruction >= (RE.nodeSize * 2) && instruction[RE.nodeSize + RE.offsetOpcode] == RE.OP_ATOM) + final char nextOp = instruction[RE.nodeSize + RE.offsetOpcode]; + // the branch starts with an atom + if (nextOp == RE.OP_ATOM) { // then get that atom as an prefix because there's no other choice int lenAtom = instruction[RE.nodeSize + RE.offsetOpdata]; prefix = new char[lenAtom]; System.arraycopy(instruction, RE.nodeSize * 2, prefix, 0, lenAtom); + } + // the branch starts with a BOL + else if (nextOp == RE.OP_BOL) + { + // then set the flag indicating that BOL is present + flags |= OPT_HASBOL; } } } Modified: jakarta/regexp/trunk/src/java/org/apache/regexp/RETest.java URL: http://svn.apache.org/viewcvs/jakarta/regexp/trunk/src/java/org/apache/regexp/RETest.java?rev=232192&r1=232191&r2=232192&view=diff ============================================================================== --- jakarta/regexp/trunk/src/java/org/apache/regexp/RETest.java (original) +++ jakarta/regexp/trunk/src/java/org/apache/regexp/RETest.java Thu Aug 11 20:04:07 2005 @@ -378,6 +378,12 @@ showParens(r); } + // Test for eol/bol symbols. + r = new RE("^abc$"); + if (r.match("\nabc")) { + fail("\"\\nabc\" matches \"^abc$\""); + } + // Test MATCH_MULTILINE. Test for eol/bol symbols. r = new RE("^abc$", RE.MATCH_MULTILINE); if (!r.match("\nabc")) { Modified: jakarta/regexp/trunk/xdocs/changes.xml URL: http://svn.apache.org/viewcvs/jakarta/regexp/trunk/xdocs/changes.xml?rev=232192&r1=232191&r2=232192&view=diff ============================================================================== --- jakarta/regexp/trunk/xdocs/changes.xml (original) +++ jakarta/regexp/trunk/xdocs/changes.xml Thu Aug 11 20:04:07 2005 @@ -34,53 +34,56 @@ <h3>Version 1.4-dev</h3> <ul> -<li>Fixed Bug +<li>Applied patches for Bug + <a href="http://issues.apache.org/bugzilla/show_bug.cgi?id=27795">27795</a>: + Add optimization for regexps which start with ^ (BOL) (VG)</li> +<li>Fixed Bug <a href="http://issues.apache.org/bugzilla/show_bug.cgi?id=25985">25985</a>: In MATCH_MULTILINE mode $ does not match end of line (VG)</li> -<li>Fixed Bug +<li>Fixed Bug <a href="http://issues.apache.org/bugzilla/show_bug.cgi?id=2121">2121</a>: '.' or '-' in bracket expression gives unexpected results (VG)</li> <li>Regexp is relicensed to <a href="http://www.apache.org/licenses/LICENSE-2.0"> Apache License, Version 2.0</a> (VG)</li> -<li>Fixed Bug +<li>Fixed Bug <a href="http://issues.apache.org/bugzilla/show_bug.cgi?id=2525">2525</a>: Leading zero-length string splitted by RE (VG)</li> -<li>Applied patches for Bug +<li>Applied patches for Bug <a href="http://issues.apache.org/bugzilla/show_bug.cgi?id=4137">4137</a>: Regexp match gets different results on different platforms (VG)</li> -<li>Applied patches for Bug +<li>Applied patches for Bug <a href="http://issues.apache.org/bugzilla/show_bug.cgi?id=3303">3303</a>: Unicode 3.0 character \\uFFFD (VG)</li> -<li>Applied patches for Bug +<li>Applied patches for Bug <a href="http://issues.apache.org/bugzilla/show_bug.cgi?id=3773">3773</a>: Problem with parsing greedy match modifiers (VG)</li> -<li>Applied patches for Bug +<li>Applied patches for Bug <a href="http://issues.apache.org/bugzilla/show_bug.cgi?id=3273">3273</a>: CharacterArrayCharacterIterator docs and implementation mismatch (VG)</li> -<li>Fixed Bug +<li>Fixed Bug <a href="http://issues.apache.org/bugzilla/show_bug.cgi?id=22928">22928</a>: subst() with REPLACE_BACKREFERENCES cuts first 2 characters (VG)</li> </ul> <h3>Version 1.3</h3> <ul> -<li>Fixed Bug +<li>Fixed Bug <a href="http://issues.apache.org/bugzilla/show_bug.cgi?id=22804">22804</a>: ArrayIndexOutOfBoundsException on negated classes (VG)</li> <li>New Feature: subst() can now process backreferences when flag REPLACE_BACKREFERENCES is set. See API docs for details. Patch provided by Tobias Schaefer. (VG)</li> -<li>Applied patches for Bug +<li>Applied patches for Bug <a href="http://issues.apache.org/bugzilla/show_bug.cgi?id=16592">16592</a>: Syntax error: Too many bracketed closures (limit is 10) (VG)</li> -<li>Fixed Bug +<li>Fixed Bug <a href="http://issues.apache.org/bugzilla/show_bug.cgi?id=5212">5212</a>, aka <a href="http://issues.apache.org/bugzilla/show_bug.cgi?id=14954">14954</a>: A bug caused by '-' in character class definition ('[...]') (VG)</li> -<li>Fixed Bug +<li>Fixed Bug <a href="http://issues.apache.org/bugzilla/show_bug.cgi?id=4057">4057</a>: \w does not match underscore (VG)</li> -<li>Fixed Bug +<li>Fixed Bug <a href="http://issues.apache.org/bugzilla/show_bug.cgi?id=1030">1030</a>, aka <a href="http://issues.apache.org/bugzilla/show_bug.cgi?id=10893">10893</a>: {n.m} notation work incorrect if n=0 (VG)</li> @@ -89,22 +92,22 @@ Expressions using {0,n} match 0 to n+1 times instead of 0 to n times. Now, expression "[a-z]{0,3}" matches "123abcdefg123" resulting in "" (empty string). (VG)</li> -<li>Fixed Bug +<li>Fixed Bug <a href="http://issues.apache.org/bugzilla/show_bug.cgi?id=306">306</a>: Why is the RE class not Serializable? (VG)</li> -<li>Applied patches for Bug +<li>Applied patches for Bug <a href="http://issues.apache.org/bugzilla/show_bug.cgi?id=3879">3879</a>: Expressions using {0,n} match 0 to n+1 times instead of 0 to n times. (JSS)</li> -<li>Applied patches for Bug +<li>Applied patches for Bug <a href="http://issues.apache.org/bugzilla/show_bug.cgi?id=7288">7288</a>: Bug in negative character ranges. (JSS)</li> -<li>Applied patches for Bug +<li>Applied patches for Bug <a href="http://issues.apache.org/bugzilla/show_bug.cgi?id=986">986</a>: Leading "\b" word boundary is ignored. (JSS)</li> -<li>Applied patches for Bug +<li>Applied patches for Bug <a href="http://issues.apache.org/bugzilla/show_bug.cgi?id=3877">3877</a>: {n} and {n,m} not thread safe. (JSS)</li> -<li>Applied patches for Bug +<li>Applied patches for Bug <a href="http://issues.apache.org/bugzilla/show_bug.cgi?id=8467">8467</a>: Number of paren pairs limited to 16 (JSS)</li> <li>Fixed RE.grep() documentation to reflect a String[] is returned @@ -117,7 +120,7 @@ <h3>Version 1.2</h3> <ul> <li>Updated to Ant 1.2 (JSS)</li> -<li>Documentation now built with <a +<li>Documentation now built with <a href="http://jakarta.apache.org/site/jakarta-site2.html">Anakia</a> (JSS)</li> <li><a href="http://jakarta.apache.org/cvsweb/index.cgi/jakarta-regexp/src/java/org/apache/regexp/RE.java?rev=1.3&content-type=text/vnd.viewcvs-markup">Fixed bug</a></li> <li><a href="http://jakarta.apache.org/cvsweb/index.cgi/jakarta-regexp/src/java/org/apache/regexp/RE.java?rev=1.4&content-type=text/vnd.viewcvs-markup"> --------------------------------------------------------------------- To unsubscribe, e-mail: [EMAIL PROTECTED] For additional commands, e-mail: [EMAIL PROTECTED]