Revision: 13248
Author: [email protected]
Date: Thu Dec 20 01:20:37 2012
Log: Remove Utf8InputBuffer
[email protected]
BUG=
Review URL: https://chromiumcodereview.appspot.com/11649018
Patch from Dan Carney <[email protected]>.
http://code.google.com/p/v8/source/detail?r=13248
Modified:
/branches/bleeding_edge/src/debug-agent.cc
/branches/bleeding_edge/src/heap.cc
/branches/bleeding_edge/src/objects.cc
/branches/bleeding_edge/src/objects.h
/branches/bleeding_edge/src/scanner.h
/branches/bleeding_edge/src/unicode-inl.h
/branches/bleeding_edge/src/unicode.cc
/branches/bleeding_edge/src/unicode.h
/branches/bleeding_edge/test/cctest/test-regexp.cc
=======================================
--- /branches/bleeding_edge/src/debug-agent.cc Wed Jun 6 01:37:34 2012
+++ /branches/bleeding_edge/src/debug-agent.cc Thu Dec 20 01:20:37 2012
@@ -192,21 +192,14 @@
}
// Convert UTF-8 to UTF-16.
- unibrow::Utf8InputBuffer<> buf(msg, StrLength(msg));
- int len = 0;
- while (buf.has_more()) {
- buf.GetNext();
- len++;
- }
- ScopedVector<int16_t> temp(len + 1);
- buf.Reset(msg, StrLength(msg));
- for (int i = 0; i < len; i++) {
- temp[i] = buf.GetNext();
- }
+ unibrow::Utf8Decoder<128> decoder(msg, StrLength(msg));
+ int utf16_length = decoder.Utf16Length();
+ ScopedVector<uint16_t> temp(utf16_length + 1);
+ decoder.WriteUtf16(temp.start(), utf16_length);
// Send the request received to the debugger.
- v8::Debug::SendCommand(reinterpret_cast<const uint16_t *>(temp.start()),
- len,
+ v8::Debug::SendCommand(temp.start(),
+ utf16_length,
NULL,
reinterpret_cast<v8::Isolate*>(agent_->isolate()));
=======================================
--- /branches/bleeding_edge/src/heap.cc Wed Dec 19 05:57:51 2012
+++ /branches/bleeding_edge/src/heap.cc Thu Dec 20 01:20:37 2012
@@ -4546,37 +4546,31 @@
PretenureFlag pretenure) {
// Continue counting the number of characters in the UTF-8 string, starting
// from the first non-ascii character or word.
- int chars = non_ascii_start;
Access<UnicodeCache::Utf8Decoder>
decoder(isolate_->unicode_cache()->utf8_decoder());
- decoder->Reset(string.start() + non_ascii_start, string.length() - chars);
- while (decoder->has_more()) {
- uint32_t r = decoder->GetNext();
- if (r <= unibrow::Utf16::kMaxNonSurrogateCharCode) {
- chars++;
- } else {
- chars += 2;
- }
- }
-
+ decoder->Reset(string.start() + non_ascii_start,
+ string.length() - non_ascii_start);
+ int utf16_length = decoder->Utf16Length();
+ ASSERT(utf16_length > 0);
+ // Allocate string.
Object* result;
- { MaybeObject* maybe_result = AllocateRawTwoByteString(chars, pretenure);
+ {
+ int chars = non_ascii_start + utf16_length;
+ MaybeObject* maybe_result = AllocateRawTwoByteString(chars, pretenure);
if (!maybe_result->ToObject(&result)) return maybe_result;
}
-
// Convert and copy the characters into the new object.
SeqTwoByteString* twobyte = SeqTwoByteString::cast(result);
- decoder->Reset(string.start(), string.length());
- int i = 0;
- while (i < chars) {
- uint32_t r = decoder->GetNext();
- if (r > unibrow::Utf16::kMaxNonSurrogateCharCode) {
- twobyte->SeqTwoByteStringSet(i++, unibrow::Utf16::LeadSurrogate(r));
- twobyte->SeqTwoByteStringSet(i++, unibrow::Utf16::TrailSurrogate(r));
- } else {
- twobyte->SeqTwoByteStringSet(i++, r);
+ // Copy ascii portion.
+ uint16_t* data = twobyte->GetChars();
+ if (non_ascii_start != 0) {
+ const char* ascii_data = string.start();
+ for (int i = 0; i < non_ascii_start; i++) {
+ *data++ = *ascii_data++;
}
}
+ // Now write the remainder.
+ decoder->WriteUtf16(data, utf16_length);
return result;
}
=======================================
--- /branches/bleeding_edge/src/objects.cc Wed Dec 19 07:17:01 2012
+++ /branches/bleeding_edge/src/objects.cc Thu Dec 20 01:20:37 2012
@@ -7641,14 +7641,20 @@
bool String::IsEqualTo(Vector<const char> str) {
- Isolate* isolate = GetIsolate();
int slen = length();
- Access<UnicodeCache::Utf8Decoder>
- decoder(isolate->unicode_cache()->utf8_decoder());
- decoder->Reset(str.start(), str.length());
+ // Can't check exact length equality, but we can check bounds.
+ int str_len = str.length();
+ if (str_len < slen ||
+ str_len > slen*static_cast<int>(unibrow::Utf8::kMaxEncodedSize)) {
+ return false;
+ }
int i;
- for (i = 0; i < slen && decoder->has_more(); i++) {
- uint32_t r = decoder->GetNext();
+ unsigned remaining_in_str = static_cast<unsigned>(str_len);
+ const uint8_t* utf8_data = reinterpret_cast<const uint8_t*>(str.start());
+ for (i = 0; i < slen && remaining_in_str > 0; i++) {
+ unsigned cursor = 0;
+ uint32_t r = unibrow::Utf8::ValueOf(utf8_data, remaining_in_str, &cursor);
+ ASSERT(cursor > 0 && cursor <= remaining_in_str);
if (r > unibrow::Utf16::kMaxNonSurrogateCharCode) {
if (i > slen - 1) return false;
if (Get(i++) != unibrow::Utf16::LeadSurrogate(r)) return false;
@@ -7656,8 +7662,10 @@
} else {
if (Get(i) != r) return false;
}
+ utf8_data += cursor;
+ remaining_in_str -= cursor;
}
- return i == slen && !decoder->has_more();
+ return i == slen && remaining_in_str == 0;
}
@@ -7862,46 +7870,51 @@
}
-uint32_t StringHasher::ComputeHashField(unibrow::CharacterStream* buffer,
- int length,
- uint32_t seed) {
- typedef unibrow::Utf16 u;
- StringHasher hasher(length, seed);
- // Very long strings have a trivial hash that doesn't inspect the
- // string contents.
- if (hasher.has_trivial_hash()) {
- return hasher.GetHashField();
- }
- // Do the iterative array index computation as long as there is a
- // chance this is an array index.
- if (hasher.is_array_index_) {
- while (buffer->has_more()) {
- uint32_t c = buffer->GetNext();
- if (c > u::kMaxNonSurrogateCharCode) {
- uint16_t c1 = u::LeadSurrogate(c);
- uint16_t c2 = u::TrailSurrogate(c);
- hasher.AddCharacter(c1);
- hasher.AddCharacter(c2);
- if (!hasher.UpdateIndex(c1)) break;
- if (!hasher.UpdateIndex(c2)) break;
- } else {
- hasher.AddCharacter(c);
- if (!hasher.UpdateIndex(c)) break;
- }
- }
+uint32_t StringHasher::ComputeUtf8Hash(Vector<const char> chars,
+ uint32_t seed,
+ int* utf16_length_out) {
+ int vector_length = chars.length();
+ // Handle some edge cases
+ if (vector_length <= 1) {
+ ASSERT(vector_length == 0 ||
+ static_cast<uint8_t>(chars.start()[0]) <=
+ unibrow::Utf8::kMaxOneByteChar);
+ *utf16_length_out = vector_length;
+ return HashSequentialString(chars.start(), vector_length, seed);
}
- // Process the remaining characters without updating the array
- // index.
- while (buffer->has_more()) {
- ASSERT(!hasher.is_array_index_);
- uint32_t c = buffer->GetNext();
- if (c > u::kMaxNonSurrogateCharCode) {
- hasher.AddCharacter(u::LeadSurrogate(c));
- hasher.AddCharacter(u::TrailSurrogate(c));
+ // Start with a fake length which won't affect computation.
+ // It will be updated later.
+ StringHasher hasher(String::kMaxArrayIndexSize, seed);
+ unsigned remaining = static_cast<unsigned>(vector_length);
+ const uint8_t* stream = reinterpret_cast<const uint8_t*>(chars.start());
+ int utf16_length = 0;
+ bool is_index = true;
+ ASSERT(hasher.is_array_index_);
+ while (remaining > 0) {
+ unsigned consumed = 0;
+ uint32_t c = unibrow::Utf8::ValueOf(stream, remaining, &consumed);
+ ASSERT(consumed > 0 && consumed <= remaining);
+ stream += consumed;
+ remaining -= consumed;
+ bool is_two_characters = c > unibrow::Utf16::kMaxNonSurrogateCharCode;
+ utf16_length += is_two_characters ? 2 : 1;
+ // No need to keep hashing. But we do need to calculate utf16_length.
+ if (utf16_length > String::kMaxHashCalcLength) continue;
+ if (is_two_characters) {
+ uint16_t c1 = unibrow::Utf16::LeadSurrogate(c);
+ uint16_t c2 = unibrow::Utf16::TrailSurrogate(c);
+ hasher.AddCharacter(c1);
+ hasher.AddCharacter(c2);
+ if (is_index) is_index = hasher.UpdateIndex(c1);
+ if (is_index) is_index = hasher.UpdateIndex(c2);
} else {
hasher.AddCharacter(c);
+ if (is_index) is_index = hasher.UpdateIndex(c);
}
}
+ *utf16_length_out = static_cast<int>(utf16_length);
+ // Must set length here so that hash computation is correct.
+ hasher.length_ = utf16_length;
return hasher.GetHashField();
}
@@ -11716,10 +11729,7 @@
uint32_t Hash() {
if (hash_field_ != 0) return hash_field_ >> String::kHashShift;
- unibrow::Utf8InputBuffer<> buffer(string_.start(),
- static_cast<unsigned>(string_.length()));
- chars_ = buffer.Utf16Length();
- hash_field_ = StringHasher::ComputeHashField(&buffer, chars_, seed_);
+ hash_field_ = StringHasher::ComputeUtf8Hash(string_, seed_, &chars_);
uint32_t result = hash_field_ >> String::kHashShift;
ASSERT(result != 0); // Ensure that the hash value of 0 is never computed.
return result;
=======================================
--- /branches/bleeding_edge/src/objects.h Wed Dec 19 05:27:20 2012
+++ /branches/bleeding_edge/src/objects.h Thu Dec 20 01:20:37 2012
@@ -6934,9 +6934,10 @@
int length,
uint32_t seed);
- static uint32_t ComputeHashField(unibrow::CharacterStream* buffer,
- int length,
- uint32_t seed);
+ // Reads all the data, even for long strings and computes the utf16 length.
+ static uint32_t ComputeUtf8Hash(Vector<const char> chars,
+ uint32_t seed,
+ int* utf16_length_out);
// Calculated hash value for a string consisting of 1 to
// String::kMaxArrayIndexSize digits with no leading zeros (except "0").
=======================================
--- /branches/bleeding_edge/src/scanner.h Mon Apr 16 08:54:02 2012
+++ /branches/bleeding_edge/src/scanner.h Thu Dec 20 01:20:37 2012
@@ -145,7 +145,7 @@
// Caching predicates used by scanners.
public:
UnicodeCache() {}
- typedef unibrow::Utf8InputBuffer<1024> Utf8Decoder;
+ typedef unibrow::Utf8Decoder<512> Utf8Decoder;
StaticResource<Utf8Decoder>* utf8_decoder() {
return &utf8_decoder_;
@@ -315,8 +315,6 @@
// -1 is outside of the range of any real source code.
static const int kNoOctalLocation = -1;
- typedef unibrow::Utf8InputBuffer<1024> Utf8Decoder;
-
explicit Scanner(UnicodeCache* scanner_contants);
void Initialize(Utf16CharacterStream* source);
=======================================
--- /branches/bleeding_edge/src/unicode-inl.h Tue Aug 28 02:37:41 2012
+++ /branches/bleeding_edge/src/unicode-inl.h Thu Dec 20 01:20:37 2012
@@ -240,10 +240,51 @@
buffer_ = R::ReadBlock(input_, util_buffer_, s, &remaining_, &offset_);
}
-template <unsigned s>
-Utf8InputBuffer<s>::Utf8InputBuffer(const char* data, unsigned length)
- : InputBuffer<Utf8, Buffer<const char*>, s>(Buffer<const char*>(data,
- length)) {
+Utf8DecoderBase::Utf8DecoderBase()
+ : unbuffered_start_(NULL),
+ utf16_length_(0),
+ last_byte_of_buffer_unused_(false) {}
+
+Utf8DecoderBase::Utf8DecoderBase(uint16_t* buffer,
+ unsigned buffer_length,
+ const uint8_t* stream,
+ unsigned stream_length) {
+ Reset(buffer, buffer_length, stream, stream_length);
+}
+
+template<unsigned kBufferSize>
+Utf8Decoder<kBufferSize>::Utf8Decoder(const char* stream, unsigned length)
+ : Utf8DecoderBase(buffer_,
+ kBufferSize,
+ reinterpret_cast<const uint8_t*>(stream),
+ length) {
+}
+
+template<unsigned kBufferSize>
+void Utf8Decoder<kBufferSize>::Reset(const char* stream, unsigned length) {
+ Utf8DecoderBase::Reset(buffer_,
+ kBufferSize,
+ reinterpret_cast<const uint8_t*>(stream),
+ length);
+}
+
+template <unsigned kBufferSize>
+unsigned Utf8Decoder<kBufferSize>::WriteUtf16(uint16_t* data,
+ unsigned length) const {
+ ASSERT(length > 0);
+ if (length > utf16_length_) length = utf16_length_;
+ // memcpy everything in buffer.
+ unsigned buffer_length =
+ last_byte_of_buffer_unused_ ? kBufferSize - 1 : kBufferSize;
+ unsigned memcpy_length = length <= buffer_length ? length : buffer_length;
+ memcpy(data, buffer_, memcpy_length*sizeof(uint16_t));
+ if (length <= buffer_length) return length;
+ ASSERT(unbuffered_start_ != NULL);
+ // Copy the rest the slow way.
+ WriteUtf16Slow(unbuffered_start_,
+ data + buffer_length,
+ length - buffer_length);
+ return length;
}
} // namespace unibrow
=======================================
--- /branches/bleeding_edge/src/unicode.cc Mon Mar 12 05:35:28 2012
+++ /branches/bleeding_edge/src/unicode.cc Thu Dec 20 01:20:37 2012
@@ -276,58 +276,6 @@
return kBadChar;
}
-
-const byte* Utf8::ReadBlock(Buffer<const char*> str, byte* buffer,
- unsigned capacity, unsigned* chars_read_ptr, unsigned* offset_ptr) {
- unsigned offset = *offset_ptr;
- // Bail out early if we've reached the end of the string.
- if (offset == str.length()) {
- *chars_read_ptr = 0;
- return NULL;
- }
- const byte* data = reinterpret_cast<const byte*>(str.data());
- if (data[offset] <= kMaxOneByteChar) {
- // The next character is an ASCII char so we scan forward over
- // the following ASCII characters and return the next pure ASCII
- // substring
- const byte* result = data + offset;
- offset++;
- while ((offset < str.length()) && (data[offset] <= kMaxOneByteChar))
- offset++;
- *chars_read_ptr = offset - *offset_ptr;
- *offset_ptr = offset;
- return result;
- } else {
- // The next character is non-ASCII so we just fill the buffer
- unsigned cursor = 0;
- unsigned chars_read = 0;
- while (offset < str.length()) {
- uchar c = data[offset];
- if (c <= kMaxOneByteChar) {
- // Fast case for ASCII characters
- if (!CharacterStream::EncodeAsciiCharacter(c,
- buffer,
- capacity,
- cursor))
- break;
- offset += 1;
- } else {
- unsigned chars = 0;
- c = Utf8::ValueOf(data + offset, str.length() - offset, &chars);
- if (!CharacterStream::EncodeNonAsciiCharacter(c,
- buffer,
- capacity,
- cursor))
- break;
- offset += chars;
- }
- chars_read++;
- }
- *offset_ptr = offset;
- *chars_read_ptr = chars_read;
- return buffer;
- }
-}
unsigned CharacterStream::Length() {
unsigned result = 0;
@@ -355,6 +303,75 @@
GetNext();
}
}
+
+void Utf8DecoderBase::Reset(uint16_t* buffer,
+ unsigned buffer_length,
+ const uint8_t* stream,
+ unsigned stream_length) {
+ // Assume everything will fit in the buffer and stream won't be needed.
+ last_byte_of_buffer_unused_ = false;
+ unbuffered_start_ = NULL;
+ bool writing_to_buffer = true;
+ // Loop until stream is read, writing to buffer as long as buffer has space.
+ unsigned utf16_length = 0;
+ while (stream_length != 0) {
+ unsigned cursor = 0;
+ uint32_t character = Utf8::ValueOf(stream, stream_length, &cursor);
+ ASSERT(cursor > 0 && cursor <= stream_length);
+ stream += cursor;
+ stream_length -= cursor;
+ bool is_two_characters = character > Utf16::kMaxNonSurrogateCharCode;
+ utf16_length += is_two_characters ? 2 : 1;
+ // Don't need to write to the buffer, but still need utf16_length.
+ if (!writing_to_buffer) continue;
+ // Write out the characters to the buffer.
+ // Must check for equality with buffer_length as we've already updated it.
+ if (utf16_length <= buffer_length) {
+ if (is_two_characters) {
+ *buffer++ = Utf16::LeadSurrogate(character);
+ *buffer++ = Utf16::TrailSurrogate(character);
+ } else {
+ *buffer++ = character;
+ }
+ if (utf16_length == buffer_length) {
+ // Just wrote last character of buffer
+ writing_to_buffer = false;
+ unbuffered_start_ = stream;
+ }
+ continue;
+ }
+ // Have gone over buffer.
+ // Last char of buffer is unused, set cursor back.
+ ASSERT(is_two_characters);
+ writing_to_buffer = false;
+ last_byte_of_buffer_unused_ = true;
+ unbuffered_start_ = stream - cursor;
+ }
+ utf16_length_ = utf16_length;
+}
+
+
+void Utf8DecoderBase::WriteUtf16Slow(const uint8_t* stream,
+ uint16_t* data,
+ unsigned data_length) {
+ while (data_length != 0) {
+ unsigned cursor = 0;
+ uint32_t character = Utf8::ValueOf(stream, Utf8::kMaxEncodedSize, &cursor);
+ // There's a total lack of bounds checking for stream
+ // as it was already done in Reset.
+ stream += cursor;
+ if (character > unibrow::Utf16::kMaxNonSurrogateCharCode) {
+ *data++ = Utf16::LeadSurrogate(character);
+ *data++ = Utf16::TrailSurrogate(character);
+ ASSERT(data_length > 1);
+ data_length -= 2;
+ } else {
+ *data++ = character;
+ data_length -= 1;
+ }
+ }
+}
+
// Uppercase: point.category == 'Lu'
=======================================
--- /branches/bleeding_edge/src/unicode.h Wed Dec 19 05:27:20 2012
+++ /branches/bleeding_edge/src/unicode.h Thu Dec 20 01:20:37 2012
@@ -29,7 +29,8 @@
#define V8_UNICODE_H_
#include <sys/types.h>
-
+#include <stdint.h>
+#include <globals.h>
/**
* \file
* Definitions and convenience functions for working with unicode.
@@ -140,10 +141,10 @@
// One UTF-16 surrogate is endoded (illegally) as 3 UTF-8 bytes.
// The illegality stems from the surrogate not being part of a pair.
static const int kUtf8BytesToCodeASurrogate = 3;
- static inline uchar LeadSurrogate(int char_code) {
+ static inline uint16_t LeadSurrogate(uint32_t char_code) {
return 0xd800 + (((char_code - 0x10000) >> 10) & 0x3ff);
}
- static inline uchar TrailSurrogate(int char_code) {
+ static inline uint16_t TrailSurrogate(uint32_t char_code) {
return 0xdc00 + (char_code & 0x3ff);
}
};
@@ -154,8 +155,6 @@
static inline uchar Length(uchar chr, int previous);
static inline unsigned Encode(
char* out, uchar c, int previous);
- static const byte* ReadBlock(Buffer<const char*> str, byte* buffer,
- unsigned capacity, unsigned* chars_read, unsigned* offset);
static uchar CalculateValue(const byte* str,
unsigned length,
unsigned* cursor);
@@ -241,17 +240,42 @@
byte util_buffer_[kSize];
};
-// --- U t f 8 I n p u t B u f f e r ---
-template <unsigned s = 256>
-class Utf8InputBuffer : public InputBuffer<Utf8, Buffer<const char*>, s> {
+class Utf8DecoderBase {
public:
- inline Utf8InputBuffer() { }
- inline Utf8InputBuffer(const char* data, unsigned length);
- inline void Reset(const char* data, unsigned length) {
- InputBuffer<Utf8, Buffer<const char*>, s>::Reset(
- Buffer<const char*>(data, length));
- }
+ // Initialization done in subclass.
+ inline Utf8DecoderBase();
+ inline Utf8DecoderBase(uint16_t* buffer,
+ unsigned buffer_length,
+ const uint8_t* stream,
+ unsigned stream_length);
+ inline unsigned Utf16Length() const { return utf16_length_; }
+ protected:
+ // This reads all characters and sets the utf16_length_.
+ // The first buffer_length utf16 chars are cached in the buffer.
+ void Reset(uint16_t* buffer,
+ unsigned buffer_length,
+ const uint8_t* stream,
+ unsigned stream_length);
+ static void WriteUtf16Slow(const uint8_t* stream,
+ uint16_t* data,
+ unsigned length);
+ const uint8_t* unbuffered_start_;
+ unsigned utf16_length_;
+ bool last_byte_of_buffer_unused_;
+ private:
+ DISALLOW_COPY_AND_ASSIGN(Utf8DecoderBase);
+};
+
+template <unsigned kBufferSize>
+class Utf8Decoder : public Utf8DecoderBase {
+ public:
+ inline Utf8Decoder() {}
+ inline Utf8Decoder(const char* stream, unsigned length);
+ inline void Reset(const char* stream, unsigned length);
+ inline unsigned WriteUtf16(uint16_t* data, unsigned length) const;
+ private:
+ uint16_t buffer_[kBufferSize];
};
=======================================
--- /branches/bleeding_edge/test/cctest/test-regexp.cc Thu Nov 15 05:31:27 2012
+++ /branches/bleeding_edge/test/cctest/test-regexp.cc Thu Dec 20 01:20:37 2012
@@ -98,7 +98,6 @@
static bool CheckSimple(const char* input) {
V8::Initialize(NULL);
v8::HandleScope scope;
- unibrow::Utf8InputBuffer<> buffer(input, StrLength(input));
ZoneScope zone_scope(Isolate::Current()->runtime_zone(), DELETE_ON_EXIT);
FlatStringReader reader(Isolate::Current(), CStrVector(input));
RegExpCompileData result;
@@ -117,7 +116,6 @@
static MinMaxPair CheckMinMaxMatch(const char* input) {
V8::Initialize(NULL);
v8::HandleScope scope;
- unibrow::Utf8InputBuffer<> buffer(input, StrLength(input));
ZoneScope zone_scope(Isolate::Current()->runtime_zone(), DELETE_ON_EXIT);
FlatStringReader reader(Isolate::Current(), CStrVector(input));
RegExpCompileData result;
--
v8-dev mailing list
[email protected]
http://groups.google.com/group/v8-dev