Author: [EMAIL PROTECTED]
Date: Tue Oct 28 04:18:25 2008
New Revision: 623

Modified:
    branches/experimental/regexp2000/src/ast.cc
    branches/experimental/regexp2000/src/ast.h
    branches/experimental/regexp2000/src/jsregexp.cc
    branches/experimental/regexp2000/src/parser.cc
    branches/experimental/regexp2000/test/cctest/test-regexp.cc

Log:
Added parsing of integer escapes as backreferences.


Modified: branches/experimental/regexp2000/src/ast.cc
==============================================================================
--- branches/experimental/regexp2000/src/ast.cc (original)
+++ branches/experimental/regexp2000/src/ast.cc Tue Oct 28 04:18:25 2008
@@ -328,6 +328,13 @@
  }


+void* RegExpUnparser::VisitBackreference(RegExpBackreference* that,
+                                         void* data) {
+  stream()->Add("(<- %i)", that->index());  // Printed as e.g. "(<- 2)".
+  return NULL;  // No result to hand back; |data| is not consulted here.
+}
+
+
  void* RegExpUnparser::VisitEmpty(RegExpEmpty* that, void* data) {
    stream()->Put('%');
    return NULL;

Modified: branches/experimental/regexp2000/src/ast.h
==============================================================================
--- branches/experimental/regexp2000/src/ast.h  (original)
+++ branches/experimental/regexp2000/src/ast.h  Tue Oct 28 04:18:25 2008
@@ -1193,6 +1193,7 @@
    VISIT(Quantifier)                                                  \
    VISIT(Capture)                                                     \
    VISIT(Lookahead)                                                   \
+  VISIT(Backreference)                                               \
    VISIT(Empty)


@@ -1354,6 +1355,16 @@
   private:
    RegExpTree* body_;
    bool is_positive_;
+};
+
+
+class RegExpBackreference: public RegExpTree {  // Tree node: backreference.
+ public:
+  explicit RegExpBackreference(int index) : index_(index) { }
+  virtual void* Accept(RegExpVisitor* visitor, void* data);
+  int index() { return index_; }  // The index passed to the constructor.
+ private:
+  int index_;  // Set once at construction; no setter is provided.
  };



Modified: branches/experimental/regexp2000/src/jsregexp.cc
==============================================================================
--- branches/experimental/regexp2000/src/jsregexp.cc    (original)
+++ branches/experimental/regexp2000/src/jsregexp.cc    Tue Oct 28 04:18:25  
2008
@@ -795,6 +795,14 @@


  template <typename Char>
+void* RegExpCompiler<Char>::VisitBackreference(RegExpBackreference* that,
+                                               void* rest) {
+  UNIMPLEMENTED();  // Stub: this patch adds parsing only, not compilation.
+  return NULL;
+}
+
+
+template <typename Char>
  void* RegExpCompiler<Char>::VisitEmpty(RegExpEmpty* that, void* rest) {
    return rest;
  }

Modified: branches/experimental/regexp2000/src/parser.cc
==============================================================================
--- branches/experimental/regexp2000/src/parser.cc      (original)
+++ branches/experimental/regexp2000/src/parser.cc      Tue Oct 28 04:18:25 2008
@@ -252,6 +252,12 @@
    uc32 ParseControlEscape(bool* ok);
    uc32 ParseOctalLiteral(bool* ok);

+  // Tries to parse the input as a backreference.  If successful it
+  // stores the result in the output parameter and returns true.  If
+  // it fails it will push back the characters read so the same characters
+  // can be reparsed.
+  bool ParseBackreferenceIndex(int* index_out);
+
    CharacterRange ParseClassAtom(bool* ok);
    RegExpTree* ReportError(Vector<const char> message, bool* ok);
    void Advance();
@@ -270,6 +276,9 @@
    int captures_seen_;
    unibrow::CharacterStream* in_;
    Handle<String>* error_;
+  static const int kMaxPushback = 5;
+  int pushback_count_;
+  uc32 pushback_buffer_[kMaxPushback];
  };


@@ -3208,7 +3217,8 @@


  //  
----------------------------------------------------------------------------
-// Regular expressions.
+// Regular expressions
+

  RegExpParser::RegExpParser(unibrow::CharacterStream* in, Handle<String>*  
error)
    : current_(kEndMarker),
@@ -3217,14 +3227,20 @@
      has_next_(true),
      captures_seen_(0),
      in_(in),
-    error_(error) {
+    error_(error),
+    pushback_count_(0) {
    Advance(2);
  }

+
  void RegExpParser::Advance() {
    current_ = next_;
    has_more_ = has_next_;
-  if (in()->has_more()) {
+  if (pushback_count_ > 0) {
+    pushback_count_--;
+    next_ = pushback_buffer_[pushback_count_];
+    has_next_ = true;
+  } else if (in()->has_more()) {
      next_ = in()->GetNext();
    } else {
      next_ = kEndMarker;
@@ -3232,23 +3248,27 @@
    }
  }

+
  void RegExpParser::Advance(int dist) {
    for (int i = 0; i < dist; i++)
      Advance();
  }

+
  RegExpTree* RegExpParser::ReportError(Vector<const char> message, bool*  
ok) {
    *ok = false;
    *error_ = Factory::NewStringFromAscii(message, NOT_TENURED);
    return NULL;
  }

+
  // Pattern ::
  //   Disjunction
  RegExpTree* RegExpParser::ParsePattern(bool* ok) {
    return ParseDisjunction(ok);
  }

+
  // Disjunction ::
  //   Alternative
  //   Alternative | Disjunction
@@ -3268,10 +3288,12 @@
    }
  }

+
  static bool IsAlternativeTerminator(uc32 c) {
    return c == '|' || c == ')' || c == RegExpParser::kEndMarker;
  }

+
  // Alternative ::
  //   [empty]
  //   Alternative Term
@@ -3332,6 +3354,52 @@
  }


+// Tries to read a decimal backreference index.  On entry current() must
+// be the backslash and next() a digit in the range '1'..'9'.
+//
+// Returns true and stores the index in |index_out| if the digits name a
+// capture that has already been seen; the digits are then consumed and
+// current() is the first character after them.
+//
+// Returns false if the escape cannot be a backreference.  In that case
+// the input is left positioned exactly as it was on entry (current() is
+// the backslash, next() the first digit) so that the caller can reparse
+// the same characters, e.g. as an octal escape.
+bool RegExpParser::ParseBackreferenceIndex(int* index_out) {
+  ASSERT_EQ('\\', current());
+  ASSERT('1' <= next() && next() <= '9');
+  ASSERT_EQ(0, pushback_count_);
+  if (captures_seen_ == 0)
+    return false;
+  int value = next() - '0';
+  if (value > captures_seen_)
+    return false;
+  // Nothing has been consumed up to here, so the early returns above do
+  // not need to restore anything.
+  static const int kMaxChars = kMaxPushback - 2;
+  EmbeddedVector<uc32, kMaxChars> chars_seen;
+  chars_seen[0] = next();
+  int char_count = 1;
+  Advance(2);
+  while (true) {
+    uc32 c = current();
+    if (!IsDecimalDigit(c)) {
+      // The digits read so far form a valid capture index.
+      *index_out = value;
+      return true;
+    }
+    int next_value = 10 * value + (c - '0');
+    // Give up if the index would exceed the number of captures seen, or
+    // if accepting another digit would overflow chars_seen.  The bound
+    // must be >=: chars_seen only has room for kMaxChars digits, so once
+    // char_count has reached kMaxChars there is no slot left for |c|.
+    if (next_value > captures_seen_ || char_count >= kMaxChars) {
+      // Restore the input.  The pushback buffer is popped from the top,
+      // so the characters are stored in reverse reading order: first
+      // the pending lookahead (if there is one), then the digit we
+      // rejected, then the accepted digits from last down to second.
+      // Dropping the lookahead here would lose the character following
+      // the rejected digit.
+      int count = 0;
+      // NOTE(review): relies on has_next_ being false exactly when
+      // next() is the end marker -- confirm against Advance().
+      if (has_next_)
+        pushback_buffer_[count++] = next();
+      pushback_buffer_[count++] = c;
+      for (int i = char_count - 1; i >= 1; i--)
+        pushback_buffer_[count++] = chars_seen[i];
+      ASSERT(count <= kMaxPushback);
+      pushback_count_ = count;
+      // The backslash and the first digit go straight back into the
+      // two-character lookahead instead of through the buffer; this
+      // keeps the worst case (kMaxChars + 1 entries) within
+      // kMaxPushback.
+      current_ = '\\';
+      has_more_ = true;
+      next_ = chars_seen[0];
+      has_next_ = true;
+      return false;
+    }
+    value = next_value;
+    chars_seen[char_count++] = c;
+    Advance();
+  }
+}
+
+
  // Term ::
  //   Assertion
  //   Atom
@@ -3384,15 +3452,27 @@
              atom = new  
RegExpCharacterClass(CharacterRange::CharacterClass(c));
              goto has_read_atom;
            }
-          // Todo backreferences
+          case '1': case '2': case '3': case '4': case '5': case '6':
+          case '7': case '8': case '9': {
+            int index = 0;
+            if (ParseBackreferenceIndex(&index)) {
+              atom = new RegExpBackreference(index);
+              goto has_read_atom;
+            } else {
+              // If this is not a backreference we go to the atom parser
+              // which will read it as an octal escape.
+              goto parse_atom;
+            }
+          }
            default:
-            break;
+            goto parse_atom;
          }
        }
        // All other escapes fall through to the default case since
        // they correspond to single characters that can be
        // represented within atoms.
      default: {
+     parse_atom:
        atom = ParseAtom(CHECK_OK);
        break;
      }
@@ -3405,6 +3485,7 @@
      //   *
      //   +
      //   ?
+    //   {
      case '*':
        min = 0;
        max = RegExpQuantifier::kInfinity;

Modified: branches/experimental/regexp2000/test/cctest/test-regexp.cc
==============================================================================
--- branches/experimental/regexp2000/test/cctest/test-regexp.cc (original)
+++ branches/experimental/regexp2000/test/cctest/test-regexp.cc Tue Oct 28  
04:18:25 2008
@@ -140,13 +140,33 @@
    ExpectParse("\\11", "'\t'");
    ExpectParse("\\11a", "'\ta'");
    ExpectParse("\\011", "'\t'");
+  ExpectParse("\\00011", "'\t'");
    ExpectParse("\\118", "'\t8'");
    ExpectParse("\\111", "'I'");
    ExpectParse("\\1111", "'I1'");
+  ExpectParse("(.)(.)(.)\\1", "(: (^ [&.]) (^ [&.]) (^ [&.]) (<- 1))");
+  ExpectParse("(.)(.)(.)\\2", "(: (^ [&.]) (^ [&.]) (^ [&.]) (<- 2))");
+  ExpectParse("(.)(.)(.)\\3", "(: (^ [&.]) (^ [&.]) (^ [&.]) (<- 3))");
+  ExpectParse("(.)(.)(.)\\4", "(: (^ [&.]) (^ [&.]) (^ [&.]) '\x04')");
+  ExpectParse("(.)(.)(.)\\1*", "(: (^ [&.]) (^ [&.]) (^ [&.])"
+                               " (# 0 - g (<- 1)))");
+  ExpectParse("(.)(.)(.)\\2*", "(: (^ [&.]) (^ [&.]) (^ [&.])"
+                               " (# 0 - g (<- 2)))");
+  ExpectParse("(.)(.)(.)\\3*", "(: (^ [&.]) (^ [&.]) (^ [&.])"
+                               " (# 0 - g (<- 3)))");
+  ExpectParse("(.)(.)(.)\\4*", "(: (^ [&.]) (^ [&.]) (^ [&.])"
+                               " (# 0 - g '\x04'))");
+  ExpectParse("(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)\\10",
+              "(: (^ [&.]) (^ [&.]) (^ [&.]) (^ [&.]) (^ [&.]) (^ [&.])"
+              " (^ [&.]) (^ [&.]) (^ [&.]) (^ [&.]) (<- 10))");
+  ExpectParse("(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)\\11",
+              "(: (^ [&.]) (^ [&.]) (^ [&.]) (^ [&.]) (^ [&.]) (^ [&.])"
+              " (^ [&.]) (^ [&.]) (^ [&.]) (^ [&.]) '\x09')");
    ExpectParse("[\\0]", "[\0]");
    ExpectParse("[\\11]", "[\t]");
    ExpectParse("[\\11a]", "[\t a]");
    ExpectParse("[\\011]", "[\t]");
+  ExpectParse("[\\00011]", "[\t]");
    ExpectParse("[\\118]", "[\t 8]");
    ExpectParse("[\\111]", "[I]");
    ExpectParse("[\\1111]", "[I 1]");

--~--~---------~--~----~------------~-------~--~----~
v8-dev mailing list
v8-dev@googlegroups.com
http://groups.google.com/group/v8-dev
-~----------~----~----~----~------~----~------~--~---

Reply via email to