Reviewers: Michael Starzinger,

Message:
string: optimize non-ASCII string splitting

Optimize String.split() for UCS-2 strings where the search pattern is a single
ASCII or UCS-2 character.

Gives a 15% performance boost on x64 with the below micro-benchmark:

  var s = "Zum iech jeitzt wa, der stolz fergiess Dauschen op. " +
          "Dat rout Mier Faarwen no. Halm fond d'Blumme do hie, " +
          "Kënnt dénen derfir dat ke. Wuel main rem vu, hu main " +
          "zwëschen gei. Un lait schaddreg hie, gin hire drun " +
          "Hémecht en, hun hire dämpen da.";
  for (var i = 0; i < 1e7; ++i) s.split(" ");

Description:
Optimize non-ASCII string splitting with single-character search pattern

Please review this at https://codereview.chromium.org/11299163/

Affected files:
  M src/objects.h
  M src/runtime.cc
  M test/mjsunit/string-split.js


Index: src/objects.h
diff --git a/src/objects.h b/src/objects.h
index 7d46c068589bfb707a43e93d84bf5cc74e328275..5f04fe1e61d929ba47e86c45e3e804ac5644da0d 100644
--- a/src/objects.h
+++ b/src/objects.h
@@ -7189,6 +7189,13 @@ class String: public HeapObject {
       ASSERT_EQ(TWO_BYTE, state_);
       return Vector<const uc16>::cast(buffer_);
     }
+    // Return the character length of the string.
+    unsigned Length() const {
+      if (state_ == ASCII) return buffer_.length();
+      if (state_ == TWO_BYTE) return buffer_.length() / 2;
+      UNREACHABLE();
+      return 0;  // Placate compiler.
+    }

    private:
     enum State { NON_FLAT, ASCII, TWO_BYTE };
Index: src/runtime.cc
diff --git a/src/runtime.cc b/src/runtime.cc
index a15e1f50539de0b18ac3d4526f83d63cb1076eb5..ba41459d02707ef4ec1ee43a4aeaac0fe27f9ed4 100644
--- a/src/runtime.cc
+++ b/src/runtime.cc
@@ -2762,6 +2762,23 @@ void FindAsciiStringIndices(Vector<const char> subject,
 }


+void FindTwoByteStringIndices(const Vector<const uc16> subject,
+                              uc16 pattern,
+                              ZoneList<int>* indices,
+                              unsigned int limit,
+                              Zone* zone) {
+  ASSERT(limit > 0);
+  const uc16* subject_start = subject.start();
+  const uc16* subject_end = subject_start + subject.length();
+ for (const uc16* pos = subject_start; pos < subject_end && limit > 0; pos++) {
+    if (*pos == pattern) {
+      indices->Add(static_cast<int>(pos - subject_start), zone);
+      limit--;
+    }
+  }
+}
+
+
 template <typename SubjectChar, typename PatternChar>
 void FindStringIndices(Isolate* isolate,
                        Vector<const SubjectChar> subject,
@@ -2825,13 +2842,22 @@ void FindStringIndicesDispatch(Isolate* isolate,
       }
     } else {
       Vector<const uc16> subject_vector = subject_content.ToUC16Vector();
-      if (pattern_content.IsAscii()) {
-        FindStringIndices(isolate,
-                          subject_vector,
-                          pattern_content.ToAsciiVector(),
-                          indices,
-                          limit,
-                          zone);
+      if (pattern_content.Length() == 1) {
+        uc16 pattern_char = pattern_content.IsAscii() ?
+                            pattern_content.ToAsciiVector()[0] :
+                            pattern_content.ToUC16Vector()[0];
+        FindTwoByteStringIndices(subject_vector,
+                                 pattern_char,
+                                 indices,
+                                 limit,
+                                 zone);
+      } else if (pattern_content.IsAscii()) {
+          FindStringIndices(isolate,
+                            subject_vector,
+                            pattern_content.ToAsciiVector(),
+                            indices,
+                            limit,
+                            zone);
       } else {
         FindStringIndices(isolate,
                           subject_vector,
Index: test/mjsunit/string-split.js
diff --git a/test/mjsunit/string-split.js b/test/mjsunit/string-split.js
index d8412f0eed561cd710270efd6b2e62a2c9b7fc16..1308244cabd538eeba01f2d5b27d94c736978991 100644
--- a/test/mjsunit/string-split.js
+++ b/test/mjsunit/string-split.js
@@ -66,6 +66,23 @@ assertArrayEquals(["div", "#i", "d", ".class"], "div#id.class".split(/(?=[d#.])/

 assertArrayEquals(["a", "b", "c"], "abc".split(/(?=.)/));

+assertArrayEquals(["Wenige", "sind", "auserwählt."],
+                  "Wenige sind auserwählt.".split(" "));
+
+assertArrayEquals([], "Wenige sind auserwählt.".split(" ", 0));
+
+assertArrayEquals(["Wenige"], "Wenige sind auserwählt.".split(" ", 1));
+
+assertArrayEquals(["Wenige", "sind"], "Wenige sind auserwählt.".split(" ", 2));
+
+assertArrayEquals(["Wenige", "sind", "auserwählt."],
+                  "Wenige sind auserwählt.".split(" ", 3));
+
+assertArrayEquals(["Wenige sind auserw", "hlt."],
+                  "Wenige sind auserwählt.".split("ä"));
+
+assertArrayEquals(["Wenige sind ", "."],
+                  "Wenige sind auserwählt.".split("auserwählt"));

 /* "ab".split(/((?=.))/)
  *


--
v8-dev mailing list
[email protected]
http://groups.google.com/group/v8-dev

Reply via email to