Revision: 13248
Author:   [email protected]
Date:     Thu Dec 20 01:20:37 2012
Log:      Remove Utf8InputBuffer

[email protected]
BUG=

Review URL: https://chromiumcodereview.appspot.com/11649018
Patch from Dan Carney <[email protected]>.
http://code.google.com/p/v8/source/detail?r=13248

Modified:
 /branches/bleeding_edge/src/debug-agent.cc
 /branches/bleeding_edge/src/heap.cc
 /branches/bleeding_edge/src/objects.cc
 /branches/bleeding_edge/src/objects.h
 /branches/bleeding_edge/src/scanner.h
 /branches/bleeding_edge/src/unicode-inl.h
 /branches/bleeding_edge/src/unicode.cc
 /branches/bleeding_edge/src/unicode.h
 /branches/bleeding_edge/test/cctest/test-regexp.cc

=======================================
--- /branches/bleeding_edge/src/debug-agent.cc  Wed Jun  6 01:37:34 2012
+++ /branches/bleeding_edge/src/debug-agent.cc  Thu Dec 20 01:20:37 2012
@@ -192,21 +192,14 @@
     }

     // Convert UTF-8 to UTF-16.
-    unibrow::Utf8InputBuffer<> buf(msg, StrLength(msg));
-    int len = 0;
-    while (buf.has_more()) {
-      buf.GetNext();
-      len++;
-    }
-    ScopedVector<int16_t> temp(len + 1);
-    buf.Reset(msg, StrLength(msg));
-    for (int i = 0; i < len; i++) {
-      temp[i] = buf.GetNext();
-    }
+    unibrow::Utf8Decoder<128> decoder(msg, StrLength(msg));
+    int utf16_length = decoder.Utf16Length();
+    ScopedVector<uint16_t> temp(utf16_length + 1);
+    decoder.WriteUtf16(temp.start(), utf16_length);

     // Send the request received to the debugger.
-    v8::Debug::SendCommand(reinterpret_cast<const uint16_t *>(temp.start()),
-                           len,
+    v8::Debug::SendCommand(temp.start(),
+                           utf16_length,
                            NULL,
                            reinterpret_cast<v8::Isolate*>(agent_->isolate()));

=======================================
--- /branches/bleeding_edge/src/heap.cc Wed Dec 19 05:57:51 2012
+++ /branches/bleeding_edge/src/heap.cc Thu Dec 20 01:20:37 2012
@@ -4546,37 +4546,31 @@
                                               PretenureFlag pretenure) {
   // Continue counting the number of characters in the UTF-8 string, starting
   // from the first non-ascii character or word.
-  int chars = non_ascii_start;
   Access<UnicodeCache::Utf8Decoder>
       decoder(isolate_->unicode_cache()->utf8_decoder());
-  decoder->Reset(string.start() + non_ascii_start, string.length() - chars);
-  while (decoder->has_more()) {
-    uint32_t r = decoder->GetNext();
-    if (r <= unibrow::Utf16::kMaxNonSurrogateCharCode) {
-      chars++;
-    } else {
-      chars += 2;
-    }
-  }
-
+  decoder->Reset(string.start() + non_ascii_start,
+                 string.length() - non_ascii_start);
+  int utf16_length = decoder->Utf16Length();
+  ASSERT(utf16_length > 0);
+  // Allocate string.
   Object* result;
-  { MaybeObject* maybe_result = AllocateRawTwoByteString(chars, pretenure);
+  {
+    int chars = non_ascii_start + utf16_length;
+    MaybeObject* maybe_result = AllocateRawTwoByteString(chars, pretenure);
     if (!maybe_result->ToObject(&result)) return maybe_result;
   }
-
   // Convert and copy the characters into the new object.
   SeqTwoByteString* twobyte = SeqTwoByteString::cast(result);
-  decoder->Reset(string.start(), string.length());
-  int i = 0;
-  while (i < chars) {
-    uint32_t r = decoder->GetNext();
-    if (r > unibrow::Utf16::kMaxNonSurrogateCharCode) {
-      twobyte->SeqTwoByteStringSet(i++, unibrow::Utf16::LeadSurrogate(r));
-      twobyte->SeqTwoByteStringSet(i++, unibrow::Utf16::TrailSurrogate(r));
-    } else {
-      twobyte->SeqTwoByteStringSet(i++, r);
+  // Copy ascii portion.
+  uint16_t* data = twobyte->GetChars();
+  if (non_ascii_start != 0) {
+    const char* ascii_data = string.start();
+    for (int i = 0; i < non_ascii_start; i++) {
+      *data++ = *ascii_data++;
     }
   }
+  // Now write the remainder.
+  decoder->WriteUtf16(data, utf16_length);
   return result;
 }

=======================================
--- /branches/bleeding_edge/src/objects.cc      Wed Dec 19 07:17:01 2012
+++ /branches/bleeding_edge/src/objects.cc      Thu Dec 20 01:20:37 2012
@@ -7641,14 +7641,20 @@


 bool String::IsEqualTo(Vector<const char> str) {
-  Isolate* isolate = GetIsolate();
   int slen = length();
-  Access<UnicodeCache::Utf8Decoder>
-      decoder(isolate->unicode_cache()->utf8_decoder());
-  decoder->Reset(str.start(), str.length());
+  // Can't check exact length equality, but we can check bounds.
+  int str_len = str.length();
+  if (str_len < slen ||
+      str_len > slen*static_cast<int>(unibrow::Utf8::kMaxEncodedSize)) {
+    return false;
+  }
   int i;
-  for (i = 0; i < slen && decoder->has_more(); i++) {
-    uint32_t r = decoder->GetNext();
+  unsigned remaining_in_str = static_cast<unsigned>(str_len);
+  const uint8_t* utf8_data = reinterpret_cast<const uint8_t*>(str.start());
+  for (i = 0; i < slen && remaining_in_str > 0; i++) {
+    unsigned cursor = 0;
+    uint32_t r = unibrow::Utf8::ValueOf(utf8_data, remaining_in_str, &cursor);
+    ASSERT(cursor > 0 && cursor <= remaining_in_str);
     if (r > unibrow::Utf16::kMaxNonSurrogateCharCode) {
       if (i > slen - 1) return false;
       if (Get(i++) != unibrow::Utf16::LeadSurrogate(r)) return false;
@@ -7656,8 +7662,10 @@
     } else {
       if (Get(i) != r) return false;
     }
+    utf8_data += cursor;
+    remaining_in_str -= cursor;
   }
-  return i == slen && !decoder->has_more();
+  return i == slen && remaining_in_str == 0;
 }


@@ -7862,46 +7870,51 @@
 }


-uint32_t StringHasher::ComputeHashField(unibrow::CharacterStream* buffer,
-                                        int length,
-                                        uint32_t seed) {
-  typedef unibrow::Utf16 u;
-  StringHasher hasher(length, seed);
-  // Very long strings have a trivial hash that doesn't inspect the
-  // string contents.
-  if (hasher.has_trivial_hash()) {
-    return hasher.GetHashField();
-  }
-  // Do the iterative array index computation as long as there is a
-  // chance this is an array index.
-  if (hasher.is_array_index_) {
-    while (buffer->has_more()) {
-      uint32_t c = buffer->GetNext();
-      if (c > u::kMaxNonSurrogateCharCode) {
-        uint16_t c1 = u::LeadSurrogate(c);
-        uint16_t c2 = u::TrailSurrogate(c);
-        hasher.AddCharacter(c1);
-        hasher.AddCharacter(c2);
-        if (!hasher.UpdateIndex(c1)) break;
-        if (!hasher.UpdateIndex(c2)) break;
-      } else {
-        hasher.AddCharacter(c);
-        if (!hasher.UpdateIndex(c)) break;
-      }
-    }
+uint32_t StringHasher::ComputeUtf8Hash(Vector<const char> chars,
+                                       uint32_t seed,
+                                       int* utf16_length_out) {
+  int vector_length = chars.length();
+  // Handle some edge cases
+  if (vector_length <= 1) {
+    ASSERT(vector_length == 0 ||
+           static_cast<uint8_t>(chars.start()[0]) <=
+               unibrow::Utf8::kMaxOneByteChar);
+    *utf16_length_out = vector_length;
+    return HashSequentialString(chars.start(), vector_length, seed);
   }
-  // Process the remaining characters without updating the array
-  // index.
-  while (buffer->has_more()) {
-    ASSERT(!hasher.is_array_index_);
-    uint32_t c = buffer->GetNext();
-    if (c > u::kMaxNonSurrogateCharCode) {
-      hasher.AddCharacter(u::LeadSurrogate(c));
-      hasher.AddCharacter(u::TrailSurrogate(c));
+  // Start with a fake length which won't affect computation.
+  // It will be updated later.
+  StringHasher hasher(String::kMaxArrayIndexSize, seed);
+  unsigned remaining = static_cast<unsigned>(vector_length);
+  const uint8_t* stream = reinterpret_cast<const uint8_t*>(chars.start());
+  int utf16_length = 0;
+  bool is_index = true;
+  ASSERT(hasher.is_array_index_);
+  while (remaining > 0) {
+    unsigned consumed = 0;
+    uint32_t c = unibrow::Utf8::ValueOf(stream, remaining, &consumed);
+    ASSERT(consumed > 0 && consumed <= remaining);
+    stream += consumed;
+    remaining -= consumed;
+    bool is_two_characters = c > unibrow::Utf16::kMaxNonSurrogateCharCode;
+    utf16_length += is_two_characters ? 2 : 1;
+    // No need to keep hashing. But we do need to calculate utf16_length.
+    if (utf16_length > String::kMaxHashCalcLength) continue;
+    if (is_two_characters) {
+      uint16_t c1 = unibrow::Utf16::LeadSurrogate(c);
+      uint16_t c2 = unibrow::Utf16::TrailSurrogate(c);
+      hasher.AddCharacter(c1);
+      hasher.AddCharacter(c2);
+      if (is_index) is_index = hasher.UpdateIndex(c1);
+      if (is_index) is_index = hasher.UpdateIndex(c2);
     } else {
       hasher.AddCharacter(c);
+      if (is_index) is_index = hasher.UpdateIndex(c);
     }
   }
+  *utf16_length_out = static_cast<int>(utf16_length);
+  // Must set length here so that hash computation is correct.
+  hasher.length_ = utf16_length;
   return hasher.GetHashField();
 }

@@ -11716,10 +11729,7 @@

   uint32_t Hash() {
     if (hash_field_ != 0) return hash_field_ >> String::kHashShift;
-    unibrow::Utf8InputBuffer<> buffer(string_.start(),
-                                      static_cast<unsigned>(string_.length()));
-    chars_ = buffer.Utf16Length();
-    hash_field_ = StringHasher::ComputeHashField(&buffer, chars_, seed_);
+    hash_field_ = StringHasher::ComputeUtf8Hash(string_, seed_, &chars_);
     uint32_t result = hash_field_ >> String::kHashShift;
     ASSERT(result != 0); // Ensure that the hash value of 0 is never computed.
     return result;
=======================================
--- /branches/bleeding_edge/src/objects.h       Wed Dec 19 05:27:20 2012
+++ /branches/bleeding_edge/src/objects.h       Thu Dec 20 01:20:37 2012
@@ -6934,9 +6934,10 @@
                                               int length,
                                               uint32_t seed);

-  static uint32_t ComputeHashField(unibrow::CharacterStream* buffer,
-                                   int length,
-                                   uint32_t seed);
+  // Reads all the data, even for long strings and computes the utf16 length.
+  static uint32_t ComputeUtf8Hash(Vector<const char> chars,
+                                  uint32_t seed,
+                                  int* utf16_length_out);

   // Calculated hash value for a string consisting of 1 to
   // String::kMaxArrayIndexSize digits with no leading zeros (except "0").
=======================================
--- /branches/bleeding_edge/src/scanner.h       Mon Apr 16 08:54:02 2012
+++ /branches/bleeding_edge/src/scanner.h       Thu Dec 20 01:20:37 2012
@@ -145,7 +145,7 @@
 // Caching predicates used by scanners.
  public:
   UnicodeCache() {}
-  typedef unibrow::Utf8InputBuffer<1024> Utf8Decoder;
+  typedef unibrow::Utf8Decoder<512> Utf8Decoder;

   StaticResource<Utf8Decoder>* utf8_decoder() {
     return &utf8_decoder_;
@@ -315,8 +315,6 @@
   // -1 is outside of the range of any real source code.
   static const int kNoOctalLocation = -1;

-  typedef unibrow::Utf8InputBuffer<1024> Utf8Decoder;
-
   explicit Scanner(UnicodeCache* scanner_contants);

   void Initialize(Utf16CharacterStream* source);
=======================================
--- /branches/bleeding_edge/src/unicode-inl.h   Tue Aug 28 02:37:41 2012
+++ /branches/bleeding_edge/src/unicode-inl.h   Thu Dec 20 01:20:37 2012
@@ -240,10 +240,51 @@
   buffer_ = R::ReadBlock(input_, util_buffer_, s, &remaining_, &offset_);
 }

-template <unsigned s>
-Utf8InputBuffer<s>::Utf8InputBuffer(const char* data, unsigned length)
-    : InputBuffer<Utf8, Buffer<const char*>, s>(Buffer<const char*>(data,
-                                                                    length)) {
+Utf8DecoderBase::Utf8DecoderBase()
+  : unbuffered_start_(NULL),
+    utf16_length_(0),
+    last_byte_of_buffer_unused_(false) {}
+
+Utf8DecoderBase::Utf8DecoderBase(uint16_t* buffer,
+                                 unsigned buffer_length,
+                                 const uint8_t* stream,
+                                 unsigned stream_length) {
+  Reset(buffer, buffer_length, stream, stream_length);
+}
+
+template<unsigned kBufferSize>
+Utf8Decoder<kBufferSize>::Utf8Decoder(const char* stream, unsigned length)
+  : Utf8DecoderBase(buffer_,
+                    kBufferSize,
+                    reinterpret_cast<const uint8_t*>(stream),
+                    length) {
+}
+
+template<unsigned kBufferSize>
+void Utf8Decoder<kBufferSize>::Reset(const char* stream, unsigned length) {
+  Utf8DecoderBase::Reset(buffer_,
+                         kBufferSize,
+                         reinterpret_cast<const uint8_t*>(stream),
+                         length);
+}
+
+template <unsigned kBufferSize>
+unsigned Utf8Decoder<kBufferSize>::WriteUtf16(uint16_t* data,
+                                              unsigned length) const {
+  ASSERT(length > 0);
+  if (length > utf16_length_) length = utf16_length_;
+  // memcpy everything in buffer.
+  unsigned buffer_length =
+      last_byte_of_buffer_unused_ ? kBufferSize - 1 : kBufferSize;
+  unsigned memcpy_length = length <= buffer_length ? length : buffer_length;
+  memcpy(data, buffer_, memcpy_length*sizeof(uint16_t));
+  if (length <= buffer_length) return length;
+  ASSERT(unbuffered_start_ != NULL);
+  // Copy the rest the slow way.
+  WriteUtf16Slow(unbuffered_start_,
+                 data + buffer_length,
+                 length - buffer_length);
+  return length;
 }

 }  // namespace unibrow
=======================================
--- /branches/bleeding_edge/src/unicode.cc      Mon Mar 12 05:35:28 2012
+++ /branches/bleeding_edge/src/unicode.cc      Thu Dec 20 01:20:37 2012
@@ -276,58 +276,6 @@
   return kBadChar;
 }

-
-const byte* Utf8::ReadBlock(Buffer<const char*> str, byte* buffer,
-    unsigned capacity, unsigned* chars_read_ptr, unsigned* offset_ptr) {
-  unsigned offset = *offset_ptr;
-  // Bail out early if we've reached the end of the string.
-  if (offset == str.length()) {
-    *chars_read_ptr = 0;
-    return NULL;
-  }
-  const byte* data = reinterpret_cast<const byte*>(str.data());
-  if (data[offset] <= kMaxOneByteChar) {
-    // The next character is an ASCII char so we scan forward over
-    // the following ASCII characters and return the next pure ASCII
-    // substring
-    const byte* result = data + offset;
-    offset++;
-    while ((offset < str.length()) && (data[offset] <= kMaxOneByteChar))
-      offset++;
-    *chars_read_ptr = offset - *offset_ptr;
-    *offset_ptr = offset;
-    return result;
-  } else {
-    // The next character is non-ASCII so we just fill the buffer
-    unsigned cursor = 0;
-    unsigned chars_read = 0;
-    while (offset < str.length()) {
-      uchar c = data[offset];
-      if (c <= kMaxOneByteChar) {
-        // Fast case for ASCII characters
-        if (!CharacterStream::EncodeAsciiCharacter(c,
-                                                   buffer,
-                                                   capacity,
-                                                   cursor))
-          break;
-        offset += 1;
-      } else {
-        unsigned chars = 0;
-        c = Utf8::ValueOf(data + offset, str.length() - offset, &chars);
-        if (!CharacterStream::EncodeNonAsciiCharacter(c,
-                                                      buffer,
-                                                      capacity,
-                                                      cursor))
-          break;
-        offset += chars;
-      }
-      chars_read++;
-    }
-    *offset_ptr = offset;
-    *chars_read_ptr = chars_read;
-    return buffer;
-  }
-}

 unsigned CharacterStream::Length() {
   unsigned result = 0;
@@ -355,6 +303,75 @@
     GetNext();
   }
 }
+
+void Utf8DecoderBase::Reset(uint16_t* buffer,
+                            unsigned buffer_length,
+                            const uint8_t* stream,
+                            unsigned stream_length) {
+  // Assume everything will fit in the buffer and stream won't be needed.
+  last_byte_of_buffer_unused_ = false;
+  unbuffered_start_ = NULL;
+  bool writing_to_buffer = true;
+  // Loop until stream is read, writing to buffer as long as buffer has space.
+  unsigned utf16_length = 0;
+  while (stream_length != 0) {
+    unsigned cursor = 0;
+    uint32_t character = Utf8::ValueOf(stream, stream_length, &cursor);
+    ASSERT(cursor > 0 && cursor <= stream_length);
+    stream += cursor;
+    stream_length -= cursor;
+    bool is_two_characters = character > Utf16::kMaxNonSurrogateCharCode;
+    utf16_length += is_two_characters ? 2 : 1;
+    // Don't need to write to the buffer, but still need utf16_length.
+    if (!writing_to_buffer) continue;
+    // Write out the characters to the buffer.
+    // Must check for equality with buffer_length as we've already updated it.
+    if (utf16_length <= buffer_length) {
+      if (is_two_characters) {
+        *buffer++ = Utf16::LeadSurrogate(character);
+        *buffer++ = Utf16::TrailSurrogate(character);
+      } else {
+        *buffer++ = character;
+      }
+      if (utf16_length == buffer_length) {
+        // Just wrote last character of buffer
+        writing_to_buffer = false;
+        unbuffered_start_ = stream;
+      }
+      continue;
+    }
+    // Have gone over buffer.
+    // Last char of buffer is unused, set cursor back.
+    ASSERT(is_two_characters);
+    writing_to_buffer = false;
+    last_byte_of_buffer_unused_ = true;
+    unbuffered_start_ = stream - cursor;
+  }
+  utf16_length_ = utf16_length;
+}
+
+
+void Utf8DecoderBase::WriteUtf16Slow(const uint8_t* stream,
+                                     uint16_t* data,
+                                     unsigned data_length) {
+  while (data_length != 0) {
+    unsigned cursor = 0;
+    uint32_t character = Utf8::ValueOf(stream, Utf8::kMaxEncodedSize, &cursor);
+    // There's a total lack of bounds checking for stream
+    // as it was already done in Reset.
+    stream += cursor;
+    if (character > unibrow::Utf16::kMaxNonSurrogateCharCode) {
+      *data++ = Utf16::LeadSurrogate(character);
+      *data++ = Utf16::TrailSurrogate(character);
+      ASSERT(data_length > 1);
+      data_length -= 2;
+    } else {
+      *data++ = character;
+      data_length -= 1;
+    }
+  }
+}
+

 // Uppercase:            point.category == 'Lu'

=======================================
--- /branches/bleeding_edge/src/unicode.h       Wed Dec 19 05:27:20 2012
+++ /branches/bleeding_edge/src/unicode.h       Thu Dec 20 01:20:37 2012
@@ -29,7 +29,8 @@
 #define V8_UNICODE_H_

 #include <sys/types.h>
-
+#include <stdint.h>
+#include <globals.h>
 /**
  * \file
  * Definitions and convenience functions for working with unicode.
@@ -140,10 +141,10 @@
   // One UTF-16 surrogate is endoded (illegally) as 3 UTF-8 bytes.
   // The illegality stems from the surrogate not being part of a pair.
   static const int kUtf8BytesToCodeASurrogate = 3;
-  static inline uchar LeadSurrogate(int char_code) {
+  static inline uint16_t LeadSurrogate(uint32_t char_code) {
     return 0xd800 + (((char_code - 0x10000) >> 10) & 0x3ff);
   }
-  static inline uchar TrailSurrogate(int char_code) {
+  static inline uint16_t TrailSurrogate(uint32_t char_code) {
     return 0xdc00 + (char_code & 0x3ff);
   }
 };
@@ -154,8 +155,6 @@
   static inline uchar Length(uchar chr, int previous);
   static inline unsigned Encode(
       char* out, uchar c, int previous);
-  static const byte* ReadBlock(Buffer<const char*> str, byte* buffer,
-      unsigned capacity, unsigned* chars_read, unsigned* offset);
   static uchar CalculateValue(const byte* str,
                               unsigned length,
                               unsigned* cursor);
@@ -241,17 +240,42 @@
   byte util_buffer_[kSize];
 };

-// --- U t f 8   I n p u t   B u f f e r ---

-template <unsigned s = 256>
-class Utf8InputBuffer : public InputBuffer<Utf8, Buffer<const char*>, s> {
+class Utf8DecoderBase {
  public:
-  inline Utf8InputBuffer() { }
-  inline Utf8InputBuffer(const char* data, unsigned length);
-  inline void Reset(const char* data, unsigned length) {
-    InputBuffer<Utf8, Buffer<const char*>, s>::Reset(
-        Buffer<const char*>(data, length));
-  }
+  // Initialization done in subclass.
+  inline Utf8DecoderBase();
+  inline Utf8DecoderBase(uint16_t* buffer,
+                         unsigned buffer_length,
+                         const uint8_t* stream,
+                         unsigned stream_length);
+  inline unsigned Utf16Length() const { return utf16_length_; }
+ protected:
+  // This reads all characters and sets the utf16_length_.
+  // The first buffer_length utf16 chars are cached in the buffer.
+  void Reset(uint16_t* buffer,
+             unsigned buffer_length,
+             const uint8_t* stream,
+             unsigned stream_length);
+  static void WriteUtf16Slow(const uint8_t* stream,
+                             uint16_t* data,
+                             unsigned length);
+  const uint8_t* unbuffered_start_;
+  unsigned utf16_length_;
+  bool last_byte_of_buffer_unused_;
+ private:
+  DISALLOW_COPY_AND_ASSIGN(Utf8DecoderBase);
+};
+
+template <unsigned kBufferSize>
+class Utf8Decoder : public Utf8DecoderBase {
+ public:
+  inline Utf8Decoder() {}
+  inline Utf8Decoder(const char* stream, unsigned length);
+  inline void Reset(const char* stream, unsigned length);
+  inline unsigned WriteUtf16(uint16_t* data, unsigned length) const;
+ private:
+  uint16_t buffer_[kBufferSize];
 };


=======================================
--- /branches/bleeding_edge/test/cctest/test-regexp.cc Thu Nov 15 05:31:27 2012
+++ /branches/bleeding_edge/test/cctest/test-regexp.cc Thu Dec 20 01:20:37 2012
@@ -98,7 +98,6 @@
 static bool CheckSimple(const char* input) {
   V8::Initialize(NULL);
   v8::HandleScope scope;
-  unibrow::Utf8InputBuffer<> buffer(input, StrLength(input));
   ZoneScope zone_scope(Isolate::Current()->runtime_zone(), DELETE_ON_EXIT);
   FlatStringReader reader(Isolate::Current(), CStrVector(input));
   RegExpCompileData result;
@@ -117,7 +116,6 @@
 static MinMaxPair CheckMinMaxMatch(const char* input) {
   V8::Initialize(NULL);
   v8::HandleScope scope;
-  unibrow::Utf8InputBuffer<> buffer(input, StrLength(input));
   ZoneScope zone_scope(Isolate::Current()->runtime_zone(), DELETE_ON_EXIT);
   FlatStringReader reader(Isolate::Current(), CStrVector(input));
   RegExpCompileData result;

--
v8-dev mailing list
[email protected]
http://groups.google.com/group/v8-dev

Reply via email to