Author: [email protected]
Date: Wed Dec 17 02:59:14 2008
New Revision: 988

Modified:
    branches/bleeding_edge/src/ast.cc
    branches/bleeding_edge/src/ast.h
    branches/bleeding_edge/src/jsregexp.cc
    branches/bleeding_edge/src/jsregexp.h
    branches/bleeding_edge/src/parser.cc
    branches/bleeding_edge/test/cctest/test-regexp.cc

Log:
Each RegExtTree node can now report the min and max size of strings it can  
match.


Modified: branches/bleeding_edge/src/ast.cc
==============================================================================
--- branches/bleeding_edge/src/ast.cc   (original)
+++ branches/bleeding_edge/src/ast.cc   Wed Dec 17 02:59:14 2008
@@ -332,7 +332,7 @@

  void* RegExpUnparser::VisitQuantifier(RegExpQuantifier* that, void* data) {
    stream()->Add("(# %i ", that->min());
-  if (that->max() == RegExpQuantifier::kInfinity) {
+  if (that->max() == RegExpTree::kInfinity) {
      stream()->Add("- ");
    } else {
      stream()->Add("%i ", that->max());
@@ -378,6 +378,36 @@
    RegExpUnparser unparser;
    Accept(&unparser, NULL);
    return unparser.ToString();
+}
+
+
+RegExpDisjunction::RegExpDisjunction(ZoneList<RegExpTree*>* alternatives)
+    : alternatives_(alternatives) {
+  RegExpTree* first_alternative = alternatives->at(0);
+  min_match_ = first_alternative->min_match();
+  max_match_ = first_alternative->max_match();
+  for (int i = 1; i < alternatives->length(); i++) {
+    RegExpTree* alternative = alternatives->at(i);
+    min_match_ = Min(min_match_, alternative->min_match());
+    max_match_ = Max(max_match_, alternative->max_match());
+  }
+}
+
+
+RegExpAlternative::RegExpAlternative(ZoneList<RegExpTree*>* nodes)
+    : nodes_(nodes) {
+  min_match_ = 0;
+  max_match_ = 0;
+  for (int i = 0; i < nodes->length(); i++) {
+    RegExpTree* node = nodes->at(i);
+    min_match_ += node->min_match();
+    int node_max_match = node->max_match();
+    if (kInfinity - max_match_ < node_max_match) {
+      max_match_ = kInfinity;
+    } else {
+      max_match_ += node->max_match();
+    }
+  }
  }



Modified: branches/bleeding_edge/src/ast.h
==============================================================================
--- branches/bleeding_edge/src/ast.h    (original)
+++ branches/bleeding_edge/src/ast.h    Wed Dec 17 02:59:14 2008
@@ -1213,11 +1213,14 @@

  class RegExpTree: public ZoneObject {
   public:
+  static const int kInfinity = (1<<31)-1;
    virtual ~RegExpTree() { }
    virtual void* Accept(RegExpVisitor* visitor, void* data) = 0;
    virtual RegExpNode* ToNode(RegExpCompiler* compiler,
                               RegExpNode* on_success) = 0;
    virtual bool IsTextElement() { return false; }
+  virtual int min_match() = 0;
+  virtual int max_match() = 0;
    virtual void AppendToText(RegExpText* text);
    SmartPointer<const char> ToString();
  #define MAKE_ASTYPE(Name)                                                   
\
@@ -1230,47 +1233,37 @@

  class RegExpDisjunction: public RegExpTree {
   public:
-  explicit RegExpDisjunction(ZoneList<RegExpTree*>* alternatives)
-      : alternatives_(alternatives) { }
+  explicit RegExpDisjunction(ZoneList<RegExpTree*>* alternatives);
    virtual void* Accept(RegExpVisitor* visitor, void* data);
    virtual RegExpNode* ToNode(RegExpCompiler* compiler,
                               RegExpNode* on_success);
    virtual RegExpDisjunction* AsDisjunction();
    virtual bool IsDisjunction();
+  virtual int min_match() { return min_match_; }
+  virtual int max_match() { return max_match_; }
    ZoneList<RegExpTree*>* alternatives() { return alternatives_; }
   private:
    ZoneList<RegExpTree*>* alternatives_;
+  int min_match_;
+  int max_match_;
  };


  class RegExpAlternative: public RegExpTree {
   public:
-  explicit RegExpAlternative(ZoneList<RegExpTree*>* nodes) : nodes_(nodes)  
{ }
+  explicit RegExpAlternative(ZoneList<RegExpTree*>* nodes);
    virtual void* Accept(RegExpVisitor* visitor, void* data);
    virtual RegExpNode* ToNode(RegExpCompiler* compiler,
                               RegExpNode* on_success);
    virtual RegExpAlternative* AsAlternative();
    virtual bool IsAlternative();
+  virtual int min_match() { return min_match_; }
+  virtual int max_match() { return max_match_; }
    ZoneList<RegExpTree*>* nodes() { return nodes_; }
   private:
    ZoneList<RegExpTree*>* nodes_;
-};
-
-
-class RegExpText: public RegExpTree {
- public:
-  RegExpText() : elements_(2) { }
-  virtual void* Accept(RegExpVisitor* visitor, void* data);
-  virtual RegExpNode* ToNode(RegExpCompiler* compiler,
-                             RegExpNode* on_success);
-  virtual RegExpText* AsText();
-  virtual bool IsText();
-  virtual bool IsTextElement() { return true; }
-  virtual void AppendToText(RegExpText* text);
-  void AddElement(TextElement elm) { elements_.Add(elm); }
-  ZoneList<TextElement>* elements() { return &elements_; }
- private:
-  ZoneList<TextElement> elements_;
+  int min_match_;
+  int max_match_;
  };


@@ -1290,6 +1283,8 @@
                               RegExpNode* on_success);
    virtual RegExpAssertion* AsAssertion();
    virtual bool IsAssertion();
+  virtual int min_match() { return 0; }
+  virtual int max_match() { return 0; }
    Type type() { return type_; }
   private:
    Type type_;
@@ -1312,6 +1307,8 @@
    virtual RegExpCharacterClass* AsCharacterClass();
    virtual bool IsCharacterClass();
    virtual bool IsTextElement() { return true; }
+  virtual int min_match() { return 1; }
+  virtual int max_match() { return 1; }
    virtual void AppendToText(RegExpText* text);
    ZoneList<CharacterRange>* ranges() { return ranges_; }
    bool is_negated() { return is_negated_; }
@@ -1330,20 +1327,53 @@
    virtual RegExpAtom* AsAtom();
    virtual bool IsAtom();
    virtual bool IsTextElement() { return true; }
+  virtual int min_match() { return data_.length(); }
+  virtual int max_match() { return data_.length(); }
    virtual void AppendToText(RegExpText* text);
    Vector<const uc16> data() { return data_; }
+  int length() { return data_.length(); }
   private:
    Vector<const uc16> data_;
  };


+class RegExpText: public RegExpTree {
+ public:
+  RegExpText() : elements_(2), length_(0) {}
+  virtual void* Accept(RegExpVisitor* visitor, void* data);
+  virtual RegExpNode* ToNode(RegExpCompiler* compiler,
+                             RegExpNode* on_success);
+  virtual RegExpText* AsText();
+  virtual bool IsText();
+  virtual bool IsTextElement() { return true; }
+  virtual int min_match() { return length_; }
+  virtual int max_match() { return length_; }
+  virtual void AppendToText(RegExpText* text);
+  void AddElement(TextElement elm)  {
+    elements_.Add(elm);
+    length_ += elm.length();
+  };
+  ZoneList<TextElement>* elements() { return &elements_; }
+ private:
+  ZoneList<TextElement> elements_;
+  int length_;
+};
+
+
  class RegExpQuantifier: public RegExpTree {
   public:
    RegExpQuantifier(int min, int max, bool is_greedy, RegExpTree* body)
        : min_(min),
          max_(max),
          is_greedy_(is_greedy),
-        body_(body) { }
+        body_(body),
+        min_match_(min * body->min_match()) {
+    if (max > 0 && body->max_match() > kInfinity / max) {
+      max_match_ = kInfinity;
+    } else {
+      max_match_ = max * body->max_match();
+    }
+  }
    virtual void* Accept(RegExpVisitor* visitor, void* data);
    virtual RegExpNode* ToNode(RegExpCompiler* compiler,
                               RegExpNode* on_success);
@@ -1355,18 +1385,19 @@
                              RegExpNode* on_success);
    virtual RegExpQuantifier* AsQuantifier();
    virtual bool IsQuantifier();
+  virtual int min_match() { return min_match_; }
+  virtual int max_match() { return max_match_; }
    int min() { return min_; }
    int max() { return max_; }
    bool is_greedy() { return is_greedy_; }
    RegExpTree* body() { return body_; }
-  // We just use a very large integer value as infinity because 2^30
-  // is infinite in practice.
-  static const int kInfinity = (1 << 30);
   private:
    int min_;
    int max_;
    bool is_greedy_;
    RegExpTree* body_;
+  int min_match_;
+  int max_match_;
  };


@@ -1389,6 +1420,8 @@
                              RegExpNode* on_success);
    virtual RegExpCapture* AsCapture();
    virtual bool IsCapture();
+  virtual int min_match() { return body_->min_match(); }
+  virtual int max_match() { return body_->max_match(); }
    RegExpTree* body() { return body_; }
    int index() { return index_; }
    inline CaptureAvailability available() { return available_; }
@@ -1414,6 +1447,8 @@
                               RegExpNode* on_success);
    virtual RegExpLookahead* AsLookahead();
    virtual bool IsLookahead();
+  virtual int min_match() { return 0; }
+  virtual int max_match() { return 0; }
    RegExpTree* body() { return body_; }
    bool is_positive() { return is_positive_; }
   private:
@@ -1431,6 +1466,8 @@
                               RegExpNode* on_success);
    virtual RegExpBackReference* AsBackReference();
    virtual bool IsBackReference();
+  virtual int min_match() { return capture_->min_match(); }
+  virtual int max_match() { return capture_->max_match(); }
    int index() { return capture_->index(); }
    RegExpCapture* capture() { return capture_; }
   private:
@@ -1446,6 +1483,8 @@
                               RegExpNode* on_success);
    virtual RegExpEmpty* AsEmpty();
    virtual bool IsEmpty();
+  virtual int min_match() { return 0; }
+  virtual int max_match() { return 0; }
    static RegExpEmpty* GetInstance() { return &kInstance; }
   private:
    static RegExpEmpty kInstance;

Modified: branches/bleeding_edge/src/jsregexp.cc
==============================================================================
--- branches/bleeding_edge/src/jsregexp.cc      (original)
+++ branches/bleeding_edge/src/jsregexp.cc      Wed Dec 17 02:59:14 2008
@@ -1193,6 +1193,16 @@
  }


+int TextElement::length() {
+  if (type == ATOM) {
+    return data.u_atom->length();
+  } else {
+    ASSERT(type == CHAR_CLASS);
+    return 1;
+  }
+}
+
+
  DispatchTable* ChoiceNode::GetTable(bool ignore_case) {
    if (table_ == NULL) {
      table_ = new DispatchTable();
@@ -2667,7 +2677,7 @@
    // TODO(someone): clear captures on repetition and handle empty
    //   matches.
    bool has_min = min > 0;
-  bool has_max = max < RegExpQuantifier::kInfinity;
+  bool has_max = max < RegExpTree::kInfinity;
    bool needs_counter = has_min || has_max;
    int reg_ctr = needs_counter ? compiler->AllocateRegister() : -1;
    ChoiceNode* center = new LoopChoiceNode(2);
@@ -3784,7 +3794,7 @@
    //   since we don't even handle ^ yet I'm saving that optimization for
    //   later.
    RegExpNode* node = RegExpQuantifier::ToNode(0,
-                                              RegExpQuantifier::kInfinity,
+                                              RegExpTree::kInfinity,
                                                false,
                                                new  
RegExpCharacterClass('*'),
                                                &compiler,

Modified: branches/bleeding_edge/src/jsregexp.h
==============================================================================
--- branches/bleeding_edge/src/jsregexp.h       (original)
+++ branches/bleeding_edge/src/jsregexp.h       Wed Dec 17 02:59:14 2008
@@ -439,6 +439,7 @@
    explicit TextElement(Type t) : type(t), cp_offset(-1) { }
    static TextElement Atom(RegExpAtom* atom);
    static TextElement CharClass(RegExpCharacterClass* char_class);
+  int length();
    Type type;
    union {
      RegExpAtom* u_atom;

Modified: branches/bleeding_edge/src/parser.cc
==============================================================================
--- branches/bleeding_edge/src/parser.cc        (original)
+++ branches/bleeding_edge/src/parser.cc        Wed Dec 17 02:59:14 2008
@@ -470,9 +470,8 @@
    } else if (terms_.length() > 0) {
      ASSERT(last_added_ == ADD_ATOM);
      atom = terms_.RemoveLast();
-    if (atom->IsLookahead() || atom->IsAssertion()) {
-      // Guaranteed not to match a non-empty string.
-      // Assertion as an atom can happen as, e.g., (?:\b)
+    if (atom->max_match() == 0) {
+      // Guaranteed to only match an empty string.
        LAST(ADD_TERM);
        if (min == 0) {
          return;
@@ -3797,12 +3796,12 @@
      //   {
      case '*':
        min = 0;
-      max = RegExpQuantifier::kInfinity;
+      max = RegExpTree::kInfinity;
        Advance();
        break;
      case '+':
        min = 1;
-      max = RegExpQuantifier::kInfinity;
+      max = RegExpTree::kInfinity;
        Advance();
        break;
      case '?':
@@ -3968,7 +3967,7 @@
    } else if (current() == ',') {
      Advance();
      if (current() == '}') {
-      max = RegExpQuantifier::kInfinity;
+      max = RegExpTree::kInfinity;
        Advance();
      } else {
        while (IsDecimalDigit(current())) {

Modified: branches/bleeding_edge/test/cctest/test-regexp.cc
==============================================================================
--- branches/bleeding_edge/test/cctest/test-regexp.cc   (original)
+++ branches/bleeding_edge/test/cctest/test-regexp.cc   Wed Dec 17 02:59:14  
2008
@@ -76,9 +76,36 @@
    return result.simple;
  }

+struct MinMaxPair {
+  int min_match;
+  int max_match;
+};
+
+static MinMaxPair CheckMinMaxMatch(const char* input) {
+  V8::Initialize(NULL);
+  v8::HandleScope scope;
+  unibrow::Utf8InputBuffer<> buffer(input, strlen(input));
+  ZoneScope zone_scope(DELETE_ON_EXIT);
+  FlatStringReader reader(CStrVector(input));
+  RegExpCompileData result;
+  CHECK(v8::internal::ParseRegExp(&reader, false, &result));
+  CHECK(result.tree != NULL);
+  CHECK(result.error.is_null());
+  int min_match = result.tree->min_match();
+  int max_match = result.tree->max_match();
+  MinMaxPair pair = { min_match, max_match };
+  return pair;
+}
+
+

  #define CHECK_PARSE_EQ(input, expected) CHECK_EQ(expected, *Parse(input))
  #define CHECK_SIMPLE(input, simple) CHECK_EQ(simple, CheckSimple(input));
+#define CHECK_MIN_MAX(input, min,  
max)                                         \
+  { MinMaxPair min_max =  
CheckMinMaxMatch(input);                              \
+    CHECK_EQ(min,  
min_max.min_match);                                          \
+    CHECK_EQ(max,  
min_max.max_match);                                          \
+  }

  TEST(Parser) {
    V8::Initialize(NULL);
@@ -251,6 +278,55 @@
    CHECK_PARSE_EQ("{12z}", "'{12z}'");
    CHECK_PARSE_EQ("{12,", "'{12,'");
    CHECK_PARSE_EQ("{12,3b", "'{12,3b'");
+
+  CHECK_MIN_MAX("a", 1, 1);
+  CHECK_MIN_MAX("abc", 3, 3);
+  CHECK_MIN_MAX("a[bc]d", 3, 3);
+  CHECK_MIN_MAX("a|bc", 1, 2);
+  CHECK_MIN_MAX("ab|c", 1, 2);
+  CHECK_MIN_MAX("a||bc", 0, 2);
+  CHECK_MIN_MAX("|", 0, 0);
+  CHECK_MIN_MAX("(?:ab)", 2, 2);
+  CHECK_MIN_MAX("(?:ab|cde)", 2, 3);
+  CHECK_MIN_MAX("(?:ab)|cde", 2, 3);
+  CHECK_MIN_MAX("(ab)", 2, 2);
+  CHECK_MIN_MAX("(ab|cde)", 2, 3);
+  CHECK_MIN_MAX("(ab)\\1", 4, 4);
+  CHECK_MIN_MAX("(ab|cde)\\1", 4, 6);
+  CHECK_MIN_MAX("(?:ab)?", 0, 2);
+  CHECK_MIN_MAX("(?:ab)*", 0, RegExpTree::kInfinity);
+  CHECK_MIN_MAX("(?:ab)+", 2, RegExpTree::kInfinity);
+  CHECK_MIN_MAX("a?", 0, 1);
+  CHECK_MIN_MAX("a*", 0, RegExpTree::kInfinity);
+  CHECK_MIN_MAX("a+", 1, RegExpTree::kInfinity);
+  CHECK_MIN_MAX("a??", 0, 1);
+  CHECK_MIN_MAX("a*?", 0, RegExpTree::kInfinity);
+  CHECK_MIN_MAX("a+?", 1, RegExpTree::kInfinity);
+  CHECK_MIN_MAX("(?:a?)?", 0, 1);
+  CHECK_MIN_MAX("(?:a*)?", 0, RegExpTree::kInfinity);
+  CHECK_MIN_MAX("(?:a+)?", 0, RegExpTree::kInfinity);
+  CHECK_MIN_MAX("(?:a?)+", 0, RegExpTree::kInfinity);
+  CHECK_MIN_MAX("(?:a*)+", 0, RegExpTree::kInfinity);
+  CHECK_MIN_MAX("(?:a+)+", 1, RegExpTree::kInfinity);
+  CHECK_MIN_MAX("(?:a?)*", 0, RegExpTree::kInfinity);
+  CHECK_MIN_MAX("(?:a*)*", 0, RegExpTree::kInfinity);
+  CHECK_MIN_MAX("(?:a+)*", 0, RegExpTree::kInfinity);
+  CHECK_MIN_MAX("a{0}", 0, 0);
+  CHECK_MIN_MAX("(?:a+){0}", 0, 0);
+  CHECK_MIN_MAX("(?:a+){0,0}", 0, 0);
+  CHECK_MIN_MAX("a*b", 1, RegExpTree::kInfinity);
+  CHECK_MIN_MAX("a+b", 2, RegExpTree::kInfinity);
+  CHECK_MIN_MAX("a*b|c", 1, RegExpTree::kInfinity);
+  CHECK_MIN_MAX("a+b|c", 1, RegExpTree::kInfinity);
+  CHECK_MIN_MAX("(?:a{5,1000000}){3,1000000}", 15, RegExpTree::kInfinity);
+  CHECK_MIN_MAX("(?:ab){4,7}", 8, 14);
+  CHECK_MIN_MAX("a\\bc", 2, 2);
+  CHECK_MIN_MAX("a\\Bc", 2, 2);
+  CHECK_MIN_MAX("a\\sc", 3, 3);
+  CHECK_MIN_MAX("a\\Sc", 3, 3);
+  CHECK_MIN_MAX("a(?=b)c", 2, 2);
+  CHECK_MIN_MAX("a(?=bbb|bb)c", 2, 2);
+  CHECK_MIN_MAX("a(?!bbb|bb)c", 2, 2);
  }

  TEST(ParserRegression) {

--~--~---------~--~----~------------~-------~--~----~
v8-dev mailing list
[email protected]
http://groups.google.com/group/v8-dev
-~----------~----~----~----~------~----~------~--~---

Reply via email to