Title: [90962] trunk
Revision
90962
Author
msab...@apple.com
Date
2011-07-13 16:05:40 -0700 (Wed, 13 Jul 2011)

Log Message

https://bugs.webkit.org/show_bug.cgi?id=64202
Enh: Improve handling of RegExp in the form of /.*blah.*/

Reviewed by Gavin Barraclough.

../../../../Volumes/Data/src/webkit/LayoutTests: 

New tests to support /.*<_expression_>.*/ enhancement.

* fast/regex/dotstar-expected.txt: Added.
* fast/regex/dotstar.html: Added.
* fast/regex/script-tests/dotstar.js: Added.

../../../../Volumes/Data/src/webkit/Source/_javascript_Core: 

Added code to both the Yarr interpreter and JIT to handle
these expressions a little differently.  First off, the terms
in between the leading and trailing .*'s cannot capture and
also this enhancement is limited to single alternative expressions.
If an _expression_ is of the right form with the aforementioned
restrictions, we process the inner terms and then look for the
beginning of the string and end of the string.  There is handling 
for multiline expressions to allow the beginning and end to be 
right after and right before newlines.

This enhancement speeds up expressions of this type 12x on
a MacBookPro.

Cleaned up 'case' statement indentation.

A new set of tests was added as LayoutTests/fast/regex/dotstar.html

* yarr/YarrInterpreter.cpp:
(JSC::Yarr::Interpreter::InputStream::end):
(JSC::Yarr::Interpreter::matchDotStarEnclosure):
(JSC::Yarr::Interpreter::matchDisjunction):
(JSC::Yarr::ByteCompiler::assertionDotStarEnclosure):
(JSC::Yarr::ByteCompiler::emitDisjunction):
* yarr/YarrInterpreter.h:
(JSC::Yarr::ByteTerm::DotStarEnclosure):
* yarr/YarrJIT.cpp:
(JSC::Yarr::YarrGenerator::generateDotStarEnclosure):
(JSC::Yarr::YarrGenerator::backtrackDotStarEnclosure):
(JSC::Yarr::YarrGenerator::generateTerm):
(JSC::Yarr::YarrGenerator::backtrackTerm):
* yarr/YarrPattern.cpp:
(JSC::Yarr::YarrPatternConstructor::setupAlternativeOffsets):
(JSC::Yarr::YarrPatternConstructor::containsCapturingTerms):
(JSC::Yarr::YarrPatternConstructor::optimizeDotStarWrappedExpressions):
(JSC::Yarr::YarrPattern::compile):
* yarr/YarrPattern.h:
(JSC::Yarr::PatternTerm::PatternTerm):

Modified Paths

Added Paths

Diff

Modified: trunk/LayoutTests/ChangeLog (90961 => 90962)


--- trunk/LayoutTests/ChangeLog	2011-07-13 23:00:52 UTC (rev 90961)
+++ trunk/LayoutTests/ChangeLog	2011-07-13 23:05:40 UTC (rev 90962)
@@ -1,3 +1,16 @@
+2011-07-13  Michael Saboff  <msab...@apple.com>
+
+        https://bugs.webkit.org/show_bug.cgi?id=64202
+        Enh: Improve handling of RegExp in the form of /.*blah.*/
+
+        Reviewed by Gavin Barraclough.
+
+        New tests to support /.*<_expression_>.*/ enhancement.
+
+        * fast/regex/dotstar-expected.txt: Added.
+        * fast/regex/dotstar.html: Added.
+        * fast/regex/script-tests/dotstar.js: Added.
+
 2011-07-13  Vincent Scheib  <sch...@chromium.org>
 
         [chromium] Update chromium test expectations.

Added: trunk/LayoutTests/fast/regex/dotstar-expected.txt (0 => 90962)


--- trunk/LayoutTests/fast/regex/dotstar-expected.txt	                        (rev 0)
+++ trunk/LayoutTests/fast/regex/dotstar-expected.txt	2011-07-13 23:05:40 UTC (rev 90962)
@@ -0,0 +1,118 @@
+This page tests handling of parentheses subexpressions.
+
+On success, you will see a series of "PASS" messages, followed by "TEST COMPLETE".
+
+
+PASS regexp1.exec('test') is null
+PASS regexp1.exec('blah') is ['blah']
+PASS regexp1.exec('1blah') is ['1blah']
+PASS regexp1.exec('blah1') is ['blah1']
+PASS regexp1.exec('blah blah blah') is ['blah blah blah']
+PASS regexp1.exec('blah\nsecond') is ['blah']
+PASS regexp1.exec('first\nblah') is ['blah']
+PASS regexp1.exec('first\nblah\nthird') is ['blah']
+PASS regexp1.exec('first\nblah2\nblah3') is ['blah2']
+PASS regexp2.exec('test') is null
+PASS regexp2.exec('blah') is ['blah']
+PASS regexp2.exec('1blah') is ['1blah']
+PASS regexp2.exec('blah1') is ['blah1']
+PASS regexp2.exec('blah blah blah') is ['blah blah blah']
+PASS regexp2.exec('blah\nsecond') is ['blah']
+PASS regexp2.exec('first\nblah') is null
+PASS regexp2.exec('first\nblah\nthird') is null
+PASS regexp2.exec('first\nblah2\nblah3') is null
+PASS regexp3.exec('test') is null
+PASS regexp3.exec('blah') is ['blah']
+PASS regexp3.exec('1blah') is ['1blah']
+PASS regexp3.exec('blah1') is ['blah1']
+PASS regexp3.exec('blah blah blah') is ['blah blah blah']
+PASS regexp3.exec('blah\nsecond') is null
+PASS regexp3.exec('first\nblah') is ['blah']
+PASS regexp3.exec('first\nblah\nthird') is null
+PASS regexp3.exec('first\nblah2\nblah3') is ['blah3']
+PASS regexp4.exec('test') is null
+PASS regexp4.exec('blah') is ['blah']
+PASS regexp4.exec('1blah') is ['1blah']
+PASS regexp4.exec('blah1') is ['blah1']
+PASS regexp4.exec('blah blah blah') is ['blah blah blah']
+PASS regexp4.exec('blah\nsecond') is null
+PASS regexp4.exec('first\nblah') is null
+PASS regexp4.exec('first\nblah\nthird') is null
+PASS regexp4.exec('first\nblah2\nblah3') is null
+PASS regexp5.exec('test') is null
+PASS regexp5.exec('blah') is ['blah']
+PASS regexp5.exec('1blah') is ['1blah']
+PASS regexp5.exec('blah1') is ['blah1']
+PASS regexp5.exec('blah blah blah') is ['blah blah blah']
+PASS regexp5.exec('blah\nsecond') is ['blah']
+PASS regexp5.exec('first\nblah') is ['blah']
+PASS regexp5.exec('first\nblah\nthird') is ['blah']
+PASS regexp5.exec('first\nblah2\nblah3') is ['blah2']
+PASS regexp6.exec('test') is null
+PASS regexp6.exec('blah') is ['blah']
+PASS regexp6.exec('1blah') is ['1blah']
+PASS regexp6.exec('blah1') is ['blah']
+PASS regexp6.exec('blah blah blah') is ['blah blah blah']
+PASS regexp6.exec('blah\nsecond') is ['blah']
+PASS regexp6.exec('first\nblah') is ['blah']
+PASS regexp6.exec('first\nblah\nthird') is ['blah']
+PASS regexp6.exec('first\nblah2\nblah3') is ['blah']
+PASS regexp7.exec('test') is null
+PASS regexp7.exec('blah') is ['blah']
+PASS regexp7.exec('1blah') is ['1blah']
+PASS regexp7.exec('blah1') is ['blah1']
+PASS regexp7.exec('blah blah blah') is ['blah blah blah']
+PASS regexp7.exec('blah\nsecond') is null
+PASS regexp7.exec('first\nblah') is null
+PASS regexp7.exec('first\nblah\nthird') is null
+PASS regexp7.exec('first\nblah2\nblah3') is null
+PASS regexp8.exec('test') is null
+PASS regexp8.exec('blah') is ['blah','']
+PASS regexp8.exec('1blah') is ['1blah','1']
+PASS regexp8.exec('blah1') is ['blah1','']
+PASS regexp8.exec('blah blah blah') is ['blah blah blah','blah blah ']
+PASS regexp8.exec('blah\nsecond') is null
+PASS regexp8.exec('first\nblah') is null
+PASS regexp8.exec('first\nblah\nthird') is null
+PASS regexp8.exec('first\nblah2\nblah3') is null
+PASS regexp9.exec('test') is null
+PASS regexp9.exec('blah') is ['blah']
+PASS regexp9.exec('1blah') is ['1blah']
+PASS regexp9.exec('blah1') is ['blah1']
+PASS regexp9.exec('blah blah blah') is ['blah blah blah']
+PASS regexp9.exec('blah\nsecond') is ['blah']
+PASS regexp9.exec('first\nblah') is ['blah']
+PASS regexp9.exec('first\nblah\nthird') is ['blah']
+PASS regexp9.exec('first\nblah2\nblah3') is ['blah2']
+PASS regexp10.exec('test') is null
+PASS regexp10.exec('blah') is ['blah']
+PASS regexp10.exec('1blah') is ['1blah']
+PASS regexp10.exec('blah1') is ['blah1']
+PASS regexp10.exec('blah blah blah') is ['blah blah blah']
+PASS regexp10.exec('blah\nsecond') is ['blah']
+PASS regexp10.exec('first\nblah') is ['blah']
+PASS regexp10.exec('first\nblah\nthird') is ['blah']
+PASS regexp10.exec('first\nblah2\nblah3') is ['blah2']
+PASS regexp11.exec('test') is null
+PASS regexp11.exec('blah') is ['blah']
+PASS regexp11.exec('1blah') is ['1blah']
+PASS regexp11.exec('blah1') is ['blah1']
+PASS regexp11.exec('blah blah blah') is ['blah blah blah']
+PASS regexp11.exec('blah\nsecond') is null
+PASS regexp11.exec('first\nblah') is ['blah']
+PASS regexp11.exec('first\nblah\nthird') is null
+PASS regexp11.exec('first\nblah2\nblah3') is ['blah3']
+PASS regexp12.exec('test') is null
+PASS regexp12.exec('blah') is ['blah']
+PASS regexp12.exec('1blah') is ['1blah']
+PASS regexp12.exec('blah1') is ['blah1']
+PASS regexp12.exec('blah blah blah') is ['blah blah blah']
+PASS regexp12.exec('blah\nsecond') is null
+PASS regexp12.exec('first\nblah') is ['blah']
+PASS regexp12.exec('first\nblah\nthird') is null
+PASS regexp12.exec('first\nblah2\nblah3') is ['blah3']
+PASS regexp13.exec('abc\n123') is ['abc\n123']
+PASS successfullyParsed is true
+
+TEST COMPLETE
+

Added: trunk/LayoutTests/fast/regex/dotstar.html (0 => 90962)


--- trunk/LayoutTests/fast/regex/dotstar.html	                        (rev 0)
+++ trunk/LayoutTests/fast/regex/dotstar.html	2011-07-13 23:05:40 UTC (rev 90962)
@@ -0,0 +1,13 @@
+<!DOCTYPE HTML PUBLIC "-//IETF//DTD HTML//EN">
+<html>
+<head>
+<link rel="stylesheet" href=""
+<script src=""
+</head>
+<body>
+<p id="description"></p>
+<div id="console"></div>
+<script src=""
+<script src=""
+</body>
+</html>

Added: trunk/LayoutTests/fast/regex/script-tests/dotstar.js (0 => 90962)


--- trunk/LayoutTests/fast/regex/script-tests/dotstar.js	                        (rev 0)
+++ trunk/LayoutTests/fast/regex/script-tests/dotstar.js	2011-07-13 23:05:40 UTC (rev 90962)
@@ -0,0 +1,139 @@
+description("This page tests handling of parentheses subexpressions.");
+
+var regexp1 = /.*blah.*/;
+shouldBeNull("regexp1.exec('test')");
+shouldBe("regexp1.exec('blah')", "['blah']");
+shouldBe("regexp1.exec('1blah')", "['1blah']");
+shouldBe("regexp1.exec('blah1')", "['blah1']");
+shouldBe("regexp1.exec('blah blah blah')", "['blah blah blah']");
+shouldBe("regexp1.exec('blah\\nsecond')", "['blah']");
+shouldBe("regexp1.exec('first\\nblah')", "['blah']");
+shouldBe("regexp1.exec('first\\nblah\\nthird')", "['blah']");
+shouldBe("regexp1.exec('first\\nblah2\\nblah3')", "['blah2']");
+
+var regexp2 = /^.*blah.*/;
+shouldBeNull("regexp2.exec('test')");
+shouldBe("regexp2.exec('blah')", "['blah']");
+shouldBe("regexp2.exec('1blah')", "['1blah']");
+shouldBe("regexp2.exec('blah1')", "['blah1']");
+shouldBe("regexp2.exec('blah blah blah')", "['blah blah blah']");
+shouldBe("regexp2.exec('blah\\nsecond')", "['blah']");
+shouldBeNull("regexp2.exec('first\\nblah')");
+shouldBeNull("regexp2.exec('first\\nblah\\nthird')");
+shouldBeNull("regexp2.exec('first\\nblah2\\nblah3')");
+
+var regexp3 = /.*blah.*$/;
+shouldBeNull("regexp3.exec('test')");
+shouldBe("regexp3.exec('blah')", "['blah']");
+shouldBe("regexp3.exec('1blah')", "['1blah']");
+shouldBe("regexp3.exec('blah1')", "['blah1']");
+shouldBe("regexp3.exec('blah blah blah')", "['blah blah blah']");
+shouldBeNull("regexp3.exec('blah\\nsecond')");
+shouldBe("regexp3.exec('first\\nblah')", "['blah']");
+shouldBeNull("regexp3.exec('first\\nblah\\nthird')");
+shouldBe("regexp3.exec('first\\nblah2\\nblah3')", "['blah3']");
+
+var regexp4 = /^.*blah.*$/;
+shouldBeNull("regexp4.exec('test')");
+shouldBe("regexp4.exec('blah')", "['blah']");
+shouldBe("regexp4.exec('1blah')", "['1blah']");
+shouldBe("regexp4.exec('blah1')", "['blah1']");
+shouldBe("regexp4.exec('blah blah blah')", "['blah blah blah']");
+shouldBeNull("regexp4.exec('blah\\nsecond')");
+shouldBeNull("regexp4.exec('first\\nblah')");
+shouldBeNull("regexp4.exec('first\\nblah\\nthird')");
+shouldBeNull("regexp4.exec('first\\nblah2\\nblah3')");
+
+var regexp5 = /.*?blah.*/;
+shouldBeNull("regexp5.exec('test')");
+shouldBe("regexp5.exec('blah')", "['blah']");
+shouldBe("regexp5.exec('1blah')", "['1blah']");
+shouldBe("regexp5.exec('blah1')", "['blah1']");
+shouldBe("regexp5.exec('blah blah blah')", "['blah blah blah']");
+shouldBe("regexp5.exec('blah\\nsecond')", "['blah']");
+shouldBe("regexp5.exec('first\\nblah')", "['blah']");
+shouldBe("regexp5.exec('first\\nblah\\nthird')", "['blah']");
+shouldBe("regexp5.exec('first\\nblah2\\nblah3')", "['blah2']");
+
+var regexp6 = /.*blah.*?/;
+shouldBeNull("regexp6.exec('test')");
+shouldBe("regexp6.exec('blah')", "['blah']");
+shouldBe("regexp6.exec('1blah')", "['1blah']");
+shouldBe("regexp6.exec('blah1')", "['blah']");
+shouldBe("regexp6.exec('blah blah blah')", "['blah blah blah']");
+shouldBe("regexp6.exec('blah\\nsecond')", "['blah']");
+shouldBe("regexp6.exec('first\\nblah')", "['blah']");
+shouldBe("regexp6.exec('first\\nblah\\nthird')", "['blah']");
+shouldBe("regexp6.exec('first\\nblah2\\nblah3')", "['blah']");
+
+var regexp7 = /^.*?blah.*?$/;
+shouldBeNull("regexp7.exec('test')");
+shouldBe("regexp7.exec('blah')", "['blah']");
+shouldBe("regexp7.exec('1blah')", "['1blah']");
+shouldBe("regexp7.exec('blah1')", "['blah1']");
+shouldBe("regexp7.exec('blah blah blah')", "['blah blah blah']");
+shouldBeNull("regexp7.exec('blah\\nsecond')");
+shouldBeNull("regexp7.exec('first\\nblah')");
+shouldBeNull("regexp7.exec('first\\nblah\\nthird')");
+shouldBeNull("regexp7.exec('first\\nblah2\\nblah3')");
+
+var regexp8 = /^(.*)blah.*$/;
+shouldBeNull("regexp8.exec('test')");
+shouldBe("regexp8.exec('blah')", "['blah','']");
+shouldBe("regexp8.exec('1blah')", "['1blah','1']");
+shouldBe("regexp8.exec('blah1')", "['blah1','']");
+shouldBe("regexp8.exec('blah blah blah')", "['blah blah blah','blah blah ']");
+shouldBeNull("regexp8.exec('blah\\nsecond')");
+shouldBeNull("regexp8.exec('first\\nblah')");
+shouldBeNull("regexp8.exec('first\\nblah\\nthird')");
+shouldBeNull("regexp8.exec('first\\nblah2\\nblah3')");
+
+var regexp9 = /.*blah.*/m;
+shouldBeNull("regexp9.exec('test')");
+shouldBe("regexp9.exec('blah')", "['blah']");
+shouldBe("regexp9.exec('1blah')", "['1blah']");
+shouldBe("regexp9.exec('blah1')", "['blah1']");
+shouldBe("regexp9.exec('blah blah blah')", "['blah blah blah']");
+shouldBe("regexp9.exec('blah\\nsecond')", "['blah']");
+shouldBe("regexp9.exec('first\\nblah')", "['blah']");
+shouldBe("regexp9.exec('first\\nblah\\nthird')", "['blah']");
+shouldBe("regexp9.exec('first\\nblah2\\nblah3')", "['blah2']");
+
+var regexp10 = /^.*blah.*/m;
+shouldBeNull("regexp10.exec('test')");
+shouldBe("regexp10.exec('blah')", "['blah']");
+shouldBe("regexp10.exec('1blah')", "['1blah']");
+shouldBe("regexp10.exec('blah1')", "['blah1']");
+shouldBe("regexp10.exec('blah blah blah')", "['blah blah blah']");
+shouldBe("regexp10.exec('blah\\nsecond')", "['blah']");
+shouldBe("regexp10.exec('first\\nblah')", "['blah']");
+shouldBe("regexp10.exec('first\\nblah\\nthird')", "['blah']");
+shouldBe("regexp10.exec('first\\nblah2\\nblah3')", "['blah2']");
+
+var regexp11 = /.*(?:blah).*$/;
+shouldBeNull("regexp11.exec('test')");
+shouldBe("regexp11.exec('blah')", "['blah']");
+shouldBe("regexp11.exec('1blah')", "['1blah']");
+shouldBe("regexp11.exec('blah1')", "['blah1']");
+shouldBe("regexp11.exec('blah blah blah')", "['blah blah blah']");
+shouldBeNull("regexp11.exec('blah\\nsecond')");
+shouldBe("regexp11.exec('first\\nblah')", "['blah']");
+shouldBeNull("regexp11.exec('first\\nblah\\nthird')");
+shouldBe("regexp11.exec('first\\nblah2\\nblah3')", "['blah3']");
+
+var regexp12 = /.*(?:blah|buzz|bang).*$/;
+shouldBeNull("regexp12.exec('test')");
+shouldBe("regexp12.exec('blah')", "['blah']");
+shouldBe("regexp12.exec('1blah')", "['1blah']");
+shouldBe("regexp12.exec('blah1')", "['blah1']");
+shouldBe("regexp12.exec('blah blah blah')", "['blah blah blah']");
+shouldBeNull("regexp12.exec('blah\\nsecond')");
+shouldBe("regexp12.exec('first\\nblah')", "['blah']");
+shouldBeNull("regexp12.exec('first\\nblah\\nthird')");
+shouldBe("regexp12.exec('first\\nblah2\\nblah3')", "['blah3']");
+
+var regexp13 = /.*\n\d+.*/;
+shouldBe("regexp13.exec('abc\\n123')", "['abc\\n123']");
+
+var successfullyParsed = true;
+

Modified: trunk/Source/_javascript_Core/ChangeLog (90961 => 90962)


--- trunk/Source/_javascript_Core/ChangeLog	2011-07-13 23:00:52 UTC (rev 90961)
+++ trunk/Source/_javascript_Core/ChangeLog	2011-07-13 23:05:40 UTC (rev 90962)
@@ -1,3 +1,48 @@
+2011-07-13  Michael Saboff  <msab...@apple.com>
+
+        https://bugs.webkit.org/show_bug.cgi?id=64202
+        Enh: Improve handling of RegExp in the form of /.*blah.*/
+
+        Reviewed by Gavin Barraclough.
+
+        Added code to both the Yarr interpreter and JIT to handle
+        these expressions a little differently.  First off, the terms
+        in between the leading and trailing .*'s cannot capture and
+        also this enhancement is limited to single alternative expressions.
+        If an _expression_ is of the right form with the aforementioned
+        restrictions, we process the inner terms and then look for the
+        beginning of the string and end of the string.  There is handling 
+        for multiline expressions to allow the beginning and end to be 
+        right after and right before newlines.
+
+        This enhancement speeds up expressions of this type 12x on
+        a MacBookPro.
+
+        Cleaned up 'case' statement indentation.
+
+        A new set of tests was added as LayoutTests/fast/regex/dotstar.html
+
+        * yarr/YarrInterpreter.cpp:
+        (JSC::Yarr::Interpreter::InputStream::end):
+        (JSC::Yarr::Interpreter::matchDotStarEnclosure):
+        (JSC::Yarr::Interpreter::matchDisjunction):
+        (JSC::Yarr::ByteCompiler::assertionDotStarEnclosure):
+        (JSC::Yarr::ByteCompiler::emitDisjunction):
+        * yarr/YarrInterpreter.h:
+        (JSC::Yarr::ByteTerm::DotStarEnclosure):
+        * yarr/YarrJIT.cpp:
+        (JSC::Yarr::YarrGenerator::generateDotStarEnclosure):
+        (JSC::Yarr::YarrGenerator::backtrackDotStarEnclosure):
+        (JSC::Yarr::YarrGenerator::generateTerm):
+        (JSC::Yarr::YarrGenerator::backtrackTerm):
+        * yarr/YarrPattern.cpp:
+        (JSC::Yarr::YarrPatternConstructor::setupAlternativeOffsets):
+        (JSC::Yarr::YarrPatternConstructor::containsCapturingTerms):
+        (JSC::Yarr::YarrPatternConstructor::optimizeDotStarWrappedExpressions):
+        (JSC::Yarr::YarrPattern::compile):
+        * yarr/YarrPattern.h:
+        (JSC::Yarr::PatternTerm::PatternTerm):
+
 2011-07-13  Xan Lopez  <xlo...@igalia.com>
 
         [GTK] Fix distcheck

Modified: trunk/Source/_javascript_Core/yarr/YarrInterpreter.cpp (90961 => 90962)


--- trunk/Source/_javascript_Core/yarr/YarrInterpreter.cpp	2011-07-13 23:00:52 UTC (rev 90961)
+++ trunk/Source/_javascript_Core/yarr/YarrInterpreter.cpp	2011-07-13 23:05:40 UTC (rev 90962)
@@ -243,6 +243,11 @@
             return pos == length;
         }
 
+        unsigned end()
+        {
+            return length;
+        }
+
         bool checkInput(int count)
         {
             if ((pos + count) <= length) {
@@ -1044,6 +1049,38 @@
         return JSRegExpErrorNoMatch;
     }
 
+    bool matchDotStarEnclosure(ByteTerm& term, DisjunctionContext* context)
+    {
+        UNUSED_PARAM(term);
+        unsigned matchBegin = context->matchBegin;
+
+        if (matchBegin) {
+            for (matchBegin--; true; matchBegin--) {
+                if (testCharacterClass(pattern->newlineCharacterClass, input.reread(matchBegin))) {
+                    ++matchBegin;
+                    break;
+                }
+
+                if (!matchBegin)
+                    break;
+            }
+        }
+
+        unsigned matchEnd = input.getPos();
+
+        for (; (matchEnd != input.end())
+             && (!testCharacterClass(pattern->newlineCharacterClass, input.reread(matchEnd))); matchEnd++) { }
+
+        if (((matchBegin && term.anchors.m_bol)
+             || ((matchEnd != input.end()) && term.anchors.m_eol))
+            && !pattern->m_multiline)
+            return false;
+
+        context->matchBegin = matchBegin;
+        context->matchEnd = matchEnd;
+        return true;
+    }
+
 #define MATCH_NEXT() { ++context->term; goto matchAgain; }
 #define BACKTRACK() { --context->term; goto backtrack; }
 #define currentTerm() (disjunction->terms[context->term])
@@ -1203,9 +1240,14 @@
                 MATCH_NEXT();
             BACKTRACK();
 
-            case ByteTerm::TypeUncheckInput:
-                input.uncheckInput(currentTerm().checkInputCount);
-                MATCH_NEXT();
+        case ByteTerm::TypeUncheckInput:
+            input.uncheckInput(currentTerm().checkInputCount);
+            MATCH_NEXT();
+                
+        case ByteTerm::TypeDotStarEnclosure:
+            if (matchDotStarEnclosure(currentTerm(), context))
+                return JSRegExpMatch;
+            BACKTRACK();
         }
 
         // We should never fall-through to here.
@@ -1242,91 +1284,94 @@
         case ByteTerm::TypeBodyAlternativeEnd:
             ASSERT_NOT_REACHED();
 
-            case ByteTerm::TypeAlternativeBegin:
-            case ByteTerm::TypeAlternativeDisjunction: {
-                int offset = currentTerm().alternative.next;
-                context->term += offset;
-                if (offset > 0)
-                    MATCH_NEXT();
-                BACKTRACK();
-            }
-            case ByteTerm::TypeAlternativeEnd: {
-                // We should never backtrack back into an alternative of the main body of the regex.
-                BackTrackInfoAlternative* backTrack = reinterpret_cast<BackTrackInfoAlternative*>(context->frame + currentTerm().frameLocation);
-                unsigned offset = backTrack->offset;
-                context->term -= offset;
-                BACKTRACK();
-            }
+        case ByteTerm::TypeAlternativeBegin:
+        case ByteTerm::TypeAlternativeDisjunction: {
+            int offset = currentTerm().alternative.next;
+            context->term += offset;
+            if (offset > 0)
+                MATCH_NEXT();
+            BACKTRACK();
+        }
+        case ByteTerm::TypeAlternativeEnd: {
+            // We should never backtrack back into an alternative of the main body of the regex.
+            BackTrackInfoAlternative* backTrack = reinterpret_cast<BackTrackInfoAlternative*>(context->frame + currentTerm().frameLocation);
+            unsigned offset = backTrack->offset;
+            context->term -= offset;
+            BACKTRACK();
+        }
 
-            case ByteTerm::TypeAssertionBOL:
-            case ByteTerm::TypeAssertionEOL:
-            case ByteTerm::TypeAssertionWordBoundary:
-                BACKTRACK();
+        case ByteTerm::TypeAssertionBOL:
+        case ByteTerm::TypeAssertionEOL:
+        case ByteTerm::TypeAssertionWordBoundary:
+            BACKTRACK();
 
-            case ByteTerm::TypePatternCharacterOnce:
-            case ByteTerm::TypePatternCharacterFixed:
-            case ByteTerm::TypePatternCharacterGreedy:
-            case ByteTerm::TypePatternCharacterNonGreedy:
-                if (backtrackPatternCharacter(currentTerm(), context))
-                    MATCH_NEXT();
-                BACKTRACK();
-            case ByteTerm::TypePatternCasedCharacterOnce:
-            case ByteTerm::TypePatternCasedCharacterFixed:
-            case ByteTerm::TypePatternCasedCharacterGreedy:
-            case ByteTerm::TypePatternCasedCharacterNonGreedy:
-                if (backtrackPatternCasedCharacter(currentTerm(), context))
-                    MATCH_NEXT();
-                BACKTRACK();
-            case ByteTerm::TypeCharacterClass:
-                if (backtrackCharacterClass(currentTerm(), context))
-                    MATCH_NEXT();
-                BACKTRACK();
-            case ByteTerm::TypeBackReference:
-                if (backtrackBackReference(currentTerm(), context))
-                    MATCH_NEXT();
-                BACKTRACK();
-            case ByteTerm::TypeParenthesesSubpattern: {
-                JSRegExpResult result = backtrackParentheses(currentTerm(), context);
+        case ByteTerm::TypePatternCharacterOnce:
+        case ByteTerm::TypePatternCharacterFixed:
+        case ByteTerm::TypePatternCharacterGreedy:
+        case ByteTerm::TypePatternCharacterNonGreedy:
+            if (backtrackPatternCharacter(currentTerm(), context))
+                MATCH_NEXT();
+            BACKTRACK();
+        case ByteTerm::TypePatternCasedCharacterOnce:
+        case ByteTerm::TypePatternCasedCharacterFixed:
+        case ByteTerm::TypePatternCasedCharacterGreedy:
+        case ByteTerm::TypePatternCasedCharacterNonGreedy:
+            if (backtrackPatternCasedCharacter(currentTerm(), context))
+                MATCH_NEXT();
+            BACKTRACK();
+        case ByteTerm::TypeCharacterClass:
+            if (backtrackCharacterClass(currentTerm(), context))
+                MATCH_NEXT();
+            BACKTRACK();
+        case ByteTerm::TypeBackReference:
+            if (backtrackBackReference(currentTerm(), context))
+                MATCH_NEXT();
+            BACKTRACK();
+        case ByteTerm::TypeParenthesesSubpattern: {
+            JSRegExpResult result = backtrackParentheses(currentTerm(), context);
 
-                if (result == JSRegExpMatch) {
-                    MATCH_NEXT();
-                } else if (result != JSRegExpNoMatch)
-                    return result;
+            if (result == JSRegExpMatch) {
+                MATCH_NEXT();
+            } else if (result != JSRegExpNoMatch)
+                return result;
 
-                BACKTRACK();
-            }
-            case ByteTerm::TypeParenthesesSubpatternOnceBegin:
-                if (backtrackParenthesesOnceBegin(currentTerm(), context))
-                    MATCH_NEXT();
-                BACKTRACK();
-            case ByteTerm::TypeParenthesesSubpatternOnceEnd:
-                if (backtrackParenthesesOnceEnd(currentTerm(), context))
-                    MATCH_NEXT();
-                BACKTRACK();
-            case ByteTerm::TypeParenthesesSubpatternTerminalBegin:
-                if (backtrackParenthesesTerminalBegin(currentTerm(), context))
-                    MATCH_NEXT();
-                BACKTRACK();
-            case ByteTerm::TypeParenthesesSubpatternTerminalEnd:
-                if (backtrackParenthesesTerminalEnd(currentTerm(), context))
-                    MATCH_NEXT();
-                BACKTRACK();
-            case ByteTerm::TypeParentheticalAssertionBegin:
-                if (backtrackParentheticalAssertionBegin(currentTerm(), context))
-                    MATCH_NEXT();
-                BACKTRACK();
-            case ByteTerm::TypeParentheticalAssertionEnd:
-                if (backtrackParentheticalAssertionEnd(currentTerm(), context))
-                    MATCH_NEXT();
-                BACKTRACK();
+            BACKTRACK();
+        }
+        case ByteTerm::TypeParenthesesSubpatternOnceBegin:
+            if (backtrackParenthesesOnceBegin(currentTerm(), context))
+                MATCH_NEXT();
+            BACKTRACK();
+        case ByteTerm::TypeParenthesesSubpatternOnceEnd:
+            if (backtrackParenthesesOnceEnd(currentTerm(), context))
+                MATCH_NEXT();
+            BACKTRACK();
+        case ByteTerm::TypeParenthesesSubpatternTerminalBegin:
+            if (backtrackParenthesesTerminalBegin(currentTerm(), context))
+                MATCH_NEXT();
+            BACKTRACK();
+        case ByteTerm::TypeParenthesesSubpatternTerminalEnd:
+            if (backtrackParenthesesTerminalEnd(currentTerm(), context))
+                MATCH_NEXT();
+            BACKTRACK();
+        case ByteTerm::TypeParentheticalAssertionBegin:
+            if (backtrackParentheticalAssertionBegin(currentTerm(), context))
+                MATCH_NEXT();
+            BACKTRACK();
+        case ByteTerm::TypeParentheticalAssertionEnd:
+            if (backtrackParentheticalAssertionEnd(currentTerm(), context))
+                MATCH_NEXT();
+            BACKTRACK();
 
-            case ByteTerm::TypeCheckInput:
-                input.uncheckInput(currentTerm().checkInputCount);
-                BACKTRACK();
+        case ByteTerm::TypeCheckInput:
+            input.uncheckInput(currentTerm().checkInputCount);
+            BACKTRACK();
 
-            case ByteTerm::TypeUncheckInput:
-                input.checkInput(currentTerm().checkInputCount);
-                BACKTRACK();
+        case ByteTerm::TypeUncheckInput:
+            input.checkInput(currentTerm().checkInputCount);
+            BACKTRACK();
+
+        case ByteTerm::TypeDotStarEnclosure:
+            ASSERT_NOT_REACHED();
         }
 
         ASSERT_NOT_REACHED();
@@ -1559,6 +1604,11 @@
         m_bodyDisjunction->terms[endTerm].atom.quantityType = quantityType;
     }
 
+    void assertionDotStarEnclosure(bool bolAnchored, bool eolAnchored)
+    {
+        m_bodyDisjunction->terms.append(ByteTerm::DotStarEnclosure(bolAnchored, eolAnchored));
+    }
+
     unsigned popParenthesesStack()
     {
         ASSERT(m_parenthesesStack.size());
@@ -1838,6 +1888,10 @@
                     }
                     break;
                 }
+
+                case PatternTerm::TypeDotStarEnclosure:
+                    assertionDotStarEnclosure(term.anchors.bolAnchor, term.anchors.eolAnchor);
+                    break;
                 }
             }
         }

Modified: trunk/Source/_javascript_Core/yarr/YarrInterpreter.h (90961 => 90962)


--- trunk/Source/_javascript_Core/yarr/YarrInterpreter.h	2011-07-13 23:00:52 UTC (rev 90961)
+++ trunk/Source/_javascript_Core/yarr/YarrInterpreter.h	2011-07-13 23:05:40 UTC (rev 90962)
@@ -71,6 +71,7 @@
         TypeParentheticalAssertionEnd,
         TypeCheckInput,
         TypeUncheckInput,
+        TypeDotStarEnclosure,
     } type;
     union {
         struct {
@@ -95,6 +96,10 @@
             int end;
             bool onceThrough;
         } alternative;
+        struct {
+            bool m_bol : 1;
+            bool m_eol : 1;
+        } anchors;
         unsigned checkInputCount;
     };
     unsigned frameLocation;
@@ -295,6 +300,14 @@
     {
         return ByteTerm(TypeSubpatternEnd);
     }
+    
+    static ByteTerm DotStarEnclosure(bool bolAnchor, bool eolAnchor)
+    {
+        ByteTerm term(TypeDotStarEnclosure);
+        term.anchors.m_bol = bolAnchor;
+        term.anchors.m_eol = eolAnchor;
+        return term;
+    }
 
     bool invert()
     {

Modified: trunk/Source/_javascript_Core/yarr/YarrJIT.cpp (90961 => 90962)


--- trunk/Source/_javascript_Core/yarr/YarrJIT.cpp	2011-07-13 23:00:52 UTC (rev 90961)
+++ trunk/Source/_javascript_Core/yarr/YarrJIT.cpp	2011-07-13 23:05:40 UTC (rev 90962)
@@ -995,6 +995,63 @@
         m_backtrackingState.fallthrough();
     }
 
+    void generateDotStarEnclosure(size_t opIndex)
+    {
+        YarrOp& op = m_ops[opIndex];
+        PatternTerm* term = op.m_term;
+
+        const RegisterID character = regT0;
+        const RegisterID matchPos = regT1;
+
+        JumpList foundBeginningNewLine;
+        JumpList saveStartIndex;
+        JumpList foundEndingNewLine;
+
+        if (m_pattern.m_body->m_hasFixedSize) {
+            move(index, matchPos);
+            sub32(Imm32(m_checked), matchPos);
+        } else
+            load32(Address(output), matchPos);
+
+        saveStartIndex.append(branchTest32(Zero, matchPos));
+        Label findBOLLoop(this);
+        sub32(TrustedImm32(1), matchPos);
+        load16(BaseIndex(input, matchPos, TimesTwo, 0), character);
+        matchCharacterClass(character, foundBeginningNewLine, m_pattern.newlineCharacterClass());
+        branchTest32(NonZero, matchPos).linkTo(findBOLLoop, this);
+        saveStartIndex.append(jump());
+
+        foundBeginningNewLine.link(this);
+        add32(TrustedImm32(1), matchPos); // Advance past newline
+        saveStartIndex.link(this);
+
+        if (!m_pattern.m_multiline && term->anchors.bolAnchor)
+            op.m_jumps.append(branchTest32(NonZero, matchPos));
+
+        store32(matchPos, Address(output));
+
+        move(index, matchPos);
+
+        Label findEOLLoop(this);        
+        foundEndingNewLine.append(branch32(Equal, matchPos, length));
+        load16(BaseIndex(input, matchPos, TimesTwo, 0), character);
+        matchCharacterClass(character, foundEndingNewLine, m_pattern.newlineCharacterClass());
+        add32(TrustedImm32(1), matchPos);
+        jump(findEOLLoop);
+
+        foundEndingNewLine.link(this);
+
+        if (!m_pattern.m_multiline && term->anchors.eolAnchor)
+            op.m_jumps.append(branch32(NotEqual, matchPos, length));
+
+        move(matchPos, index);
+    }
+
+    void backtrackDotStarEnclosure(size_t opIndex)
+    {
+        backtrackTermDefault(opIndex);
+    }
+    
     // Code generation/backtracking for simple terms
     // (pattern characters, character classes, and assertions).
     // These methods farm out work to the set of functions above.
@@ -1059,6 +1116,9 @@
         case PatternTerm::TypeBackReference:
             m_shouldFallBack = true;
             break;
+        case PatternTerm::TypeDotStarEnclosure:
+            generateDotStarEnclosure(opIndex);
+            break;
         }
     }
     void backtrackTerm(size_t opIndex)
@@ -1119,6 +1179,11 @@
         case PatternTerm::TypeParenthesesSubpattern:
         case PatternTerm::TypeParentheticalAssertion:
             ASSERT_NOT_REACHED();
+
+        case PatternTerm::TypeDotStarEnclosure:
+            backtrackDotStarEnclosure(opIndex);
+            break;
+
         case PatternTerm::TypeBackReference:
             m_shouldFallBack = true;
             break;

Modified: trunk/Source/_javascript_Core/yarr/YarrPattern.cpp (90961 => 90962)


--- trunk/Source/_javascript_Core/yarr/YarrPattern.cpp	2011-07-13 23:00:52 UTC (rev 90961)
+++ trunk/Source/_javascript_Core/yarr/YarrPattern.cpp	2011-07-13 23:05:40 UTC (rev 90962)
@@ -582,6 +582,11 @@
                 term.frameLocation = currentCallFrameSize;
                 currentCallFrameSize = setupDisjunctionOffsets(term.parentheses.disjunction, currentCallFrameSize + YarrStackSpaceForBackTrackInfoParentheticalAssertion, currentInputPosition);
                 break;
+
+            case PatternTerm::TypeDotStarEnclosure:
+                alternative->m_hasFixedSize = false;
+                term.inputPosition = initialInputPosition;
+                break;
             }
         }
 
@@ -675,6 +680,87 @@
         }
     }
 
+    bool containsCapturingTerms(PatternAlternative* alternative, size_t firstTermIndex, size_t lastTermIndex)
+    {
+        Vector<PatternTerm>& terms = alternative->m_terms;
+
+        for (size_t termIndex = firstTermIndex; termIndex <= lastTermIndex; ++termIndex) {
+            PatternTerm& term = terms[termIndex];
+
+            if (term.m_capture)
+                return true;
+
+            if (term.type == PatternTerm::TypeParenthesesSubpattern) {
+                PatternDisjunction* nestedDisjunction = term.parentheses.disjunction;
+                for (unsigned alt = 0; alt < nestedDisjunction->m_alternatives.size(); ++alt) {
+                    if (containsCapturingTerms(nestedDisjunction->m_alternatives[alt], 0, nestedDisjunction->m_alternatives[alt]->m_terms.size() - 1))
+                        return true;
+                }
+            }
+        }
+
+        return false;
+    }
+
+    // This optimization identifies alternatives in the form of 
+    // [^].*[?]<_expression_>.*[$] for expressions that don't have any 
+    // capturing terms. The alternative is changed to <_expression_> 
+    // followed by processing of the dot stars to find and adjust the 
+    // beginning and the end of the match.
+    void optimizeDotStarWrappedExpressions()
+    {
+        Vector<PatternAlternative*>& alternatives = m_pattern.m_body->m_alternatives;
+        if (alternatives.size() != 1)
+            return;
+
+        PatternAlternative* alternative = alternatives[0];
+        Vector<PatternTerm>& terms = alternative->m_terms;
+        if (terms.size() >= 3) {
+            bool startsWithBOL = false;
+            bool endsWithEOL = false;
+            size_t termIndex, firstExpressionTerm, lastExpressionTerm;
+
+            termIndex = 0;
+            if (terms[termIndex].type == PatternTerm::TypeAssertionBOL) {
+                startsWithBOL = true;
+                ++termIndex;
+            }
+            
+            PatternTerm& firstNonAnchorTerm = terms[termIndex];
+            if ((firstNonAnchorTerm.type != PatternTerm::TypeCharacterClass) || (firstNonAnchorTerm.characterClass != m_pattern.newlineCharacterClass()) || !((firstNonAnchorTerm.quantityType == QuantifierGreedy) || (firstNonAnchorTerm.quantityType == QuantifierNonGreedy)))
+                return;
+            
+            firstExpressionTerm = termIndex + 1;
+            
+            termIndex = terms.size() - 1;
+            if (terms[termIndex].type == PatternTerm::TypeAssertionEOL) {
+                endsWithEOL = true;
+                --termIndex;
+            }
+            
+            PatternTerm& lastNonAnchorTerm = terms[termIndex];
+            if ((lastNonAnchorTerm.type != PatternTerm::TypeCharacterClass) || (lastNonAnchorTerm.characterClass != m_pattern.newlineCharacterClass()) || (lastNonAnchorTerm.quantityType != QuantifierGreedy))
+                return;
+            
+            lastExpressionTerm = termIndex - 1;
+
+            if (firstExpressionTerm > lastExpressionTerm)
+                return;
+
+            if (!containsCapturingTerms(alternative, firstExpressionTerm, lastExpressionTerm)) {
+                for (termIndex = terms.size() - 1; termIndex > lastExpressionTerm; --termIndex)
+                    terms.remove(termIndex);
+
+                for (termIndex = firstExpressionTerm; termIndex > 0; --termIndex)
+                    terms.remove(termIndex - 1);
+
+                terms.append(PatternTerm(startsWithBOL, endsWithEOL));
+                
+                m_pattern.m_containsBOL = false;
+            }
+        }
+    }
+
 private:
     YarrPattern& m_pattern;
     PatternAlternative* m_alternative;
@@ -708,6 +794,7 @@
     }
 
     constructor.checkForTerminalParentheses();
+    constructor.optimizeDotStarWrappedExpressions();
     constructor.optimizeBOL();
         
     constructor.setupOffsets();

Modified: trunk/Source/_javascript_Core/yarr/YarrPattern.h (90961 => 90962)


--- trunk/Source/_javascript_Core/yarr/YarrPattern.h	2011-07-13 23:00:52 UTC (rev 90961)
+++ trunk/Source/_javascript_Core/yarr/YarrPattern.h	2011-07-13 23:05:40 UTC (rev 90962)
@@ -96,6 +96,7 @@
         TypeForwardReference,
         TypeParenthesesSubpattern,
         TypeParentheticalAssertion,
+        TypeDotStarEnclosure,
     } type;
     bool m_capture :1;
     bool m_invert :1;
@@ -110,6 +111,10 @@
             bool isCopy;
             bool isTerminal;
         } parentheses;
+        struct {
+            bool bolAnchor : 1;
+            bool eolAnchor : 1;
+        } anchors;
     };
     QuantifierType quantityType;
     unsigned quantityCount;
@@ -168,6 +173,17 @@
         quantityCount = 1;
     }
 
+    PatternTerm(bool bolAnchor, bool eolAnchor)
+        : type(TypeDotStarEnclosure)
+        , m_capture(false)
+        , m_invert(false)
+    {
+        anchors.bolAnchor = bolAnchor;
+        anchors.eolAnchor = eolAnchor;
+        quantityType = QuantifierFixedCount;
+        quantityCount = 1;
+    }
+    
     static PatternTerm ForwardReference()
     {
         return PatternTerm(TypeForwardReference);
_______________________________________________
webkit-changes mailing list
webkit-changes@lists.webkit.org
http://lists.webkit.org/mailman/listinfo.cgi/webkit-changes

Reply via email to