Revision: 6115
Author: [email protected]
Date: Wed Dec 22 12:14:19 2010
Log: Change scanner buffers to not use utf-8.
Make preparser keep its symbol text itself instead of relying on the
scanner.
Review URL: http://codereview.chromium.org/6075005
http://code.google.com/p/v8/source/detail?r=6115
Modified:
/branches/bleeding_edge/src/factory.cc
/branches/bleeding_edge/src/factory.h
/branches/bleeding_edge/src/globals.h
/branches/bleeding_edge/src/heap-inl.h
/branches/bleeding_edge/src/heap.cc
/branches/bleeding_edge/src/heap.h
/branches/bleeding_edge/src/objects.cc
/branches/bleeding_edge/src/objects.h
/branches/bleeding_edge/src/parser.cc
/branches/bleeding_edge/src/parser.h
/branches/bleeding_edge/src/preparse-data.cc
/branches/bleeding_edge/src/preparse-data.h
/branches/bleeding_edge/src/preparser.cc
/branches/bleeding_edge/src/scanner-base.cc
/branches/bleeding_edge/src/scanner-base.h
/branches/bleeding_edge/src/scanner.cc
/branches/bleeding_edge/src/scanner.h
/branches/bleeding_edge/src/utils.h
/branches/bleeding_edge/test/cctest/test-parsing.cc
=======================================
--- /branches/bleeding_edge/src/factory.cc Tue Dec 7 03:31:57 2010
+++ /branches/bleeding_edge/src/factory.cc Wed Dec 22 12:14:19 2010
@@ -98,6 +98,14 @@
Handle<String> Factory::LookupSymbol(Vector<const char> string) {
CALL_HEAP_FUNCTION(Heap::LookupSymbol(string), String);
}
+
+Handle<String> Factory::LookupAsciiSymbol(Vector<const char> string) {
+ CALL_HEAP_FUNCTION(Heap::LookupAsciiSymbol(string), String);
+}
+
+Handle<String> Factory::LookupTwoByteSymbol(Vector<const uc16> string) {
+ CALL_HEAP_FUNCTION(Heap::LookupTwoByteSymbol(string), String);
+}
Handle<String> Factory::NewStringFromAscii(Vector<const char> string,
=======================================
--- /branches/bleeding_edge/src/factory.h Tue Dec 7 03:31:57 2010
+++ /branches/bleeding_edge/src/factory.h Wed Dec 22 12:14:19 2010
@@ -61,6 +61,8 @@
PretenureFlag pretenure);
static Handle<String> LookupSymbol(Vector<const char> str);
+ static Handle<String> LookupAsciiSymbol(Vector<const char> str);
+ static Handle<String> LookupTwoByteSymbol(Vector<const uc16> str);
static Handle<String> LookupAsciiSymbol(const char* str) {
return LookupSymbol(CStrVector(str));
}
=======================================
--- /branches/bleeding_edge/src/globals.h Wed Dec 15 00:07:27 2010
+++ /branches/bleeding_edge/src/globals.h Wed Dec 22 12:14:19 2010
@@ -181,10 +181,6 @@
#define USING_BSD_ABI
#endif
-// Code-point values in Unicode 4.0 are 21 bits wide.
-typedef uint16_t uc16;
-typedef int32_t uc32;
-
// -----------------------------------------------------------------------------
// Constants
@@ -228,6 +224,15 @@
const int kBinary32MantissaBits = 23;
const int kBinary32ExponentShift = 23;
+// ASCII/UC16 constants
+// Code-point values in Unicode 4.0 are 21 bits wide.
+typedef uint16_t uc16;
+typedef int32_t uc32;
+const int kASCIISize = kCharSize;
+const int kUC16Size = sizeof(uc16); // NOLINT
+const uc32 kMaxAsciiCharCode = 0x7f;
+const uint32_t kMaxAsciiCharCodeU = 0x7fu;
+
// The expression OFFSET_OF(type, field) computes the byte-offset
// of the specified field relative to the containing type. This
=======================================
--- /branches/bleeding_edge/src/heap-inl.h Wed Dec 22 03:31:18 2010
+++ /branches/bleeding_edge/src/heap-inl.h Wed Dec 22 12:14:19 2010
@@ -61,6 +61,71 @@
return AllocateInternalSymbol(&buffer, chars, hash_field);
}
+
+MaybeObject* Heap::AllocateAsciiSymbol(Vector<const char> str,
+ uint32_t hash_field) {
+ if (str.length() > SeqAsciiString::kMaxLength) {
+ return Failure::OutOfMemoryException();
+ }
+ // Compute map and object size.
+ Map* map = ascii_symbol_map();
+ int size = SeqAsciiString::SizeFor(str.length());
+
+ // Allocate string.
+ Object* result;
+ { MaybeObject* maybe_result = (size > MaxObjectSizeInPagedSpace())
+ ? lo_space_->AllocateRaw(size)
+ : old_data_space_->AllocateRaw(size);
+ if (!maybe_result->ToObject(&result)) return maybe_result;
+ }
+
+ reinterpret_cast<HeapObject*>(result)->set_map(map);
+ // Set length and hash fields of the allocated string.
+ String* answer = String::cast(result);
+ answer->set_length(str.length());
+ answer->set_hash_field(hash_field);
+
+ ASSERT_EQ(size, answer->Size());
+
+ // Fill in the characters.
+ memcpy(answer->address() + SeqAsciiString::kHeaderSize,
+ str.start(), str.length());
+
+ return answer;
+}
+
+
+MaybeObject* Heap::AllocateTwoByteSymbol(Vector<const uc16> str,
+ uint32_t hash_field) {
+ if (str.length() > SeqTwoByteString::kMaxLength) {
+ return Failure::OutOfMemoryException();
+ }
+ // Compute map and object size.
+ Map* map = symbol_map();
+ int size = SeqTwoByteString::SizeFor(str.length());
+
+ // Allocate string.
+ Object* result;
+ { MaybeObject* maybe_result = (size > MaxObjectSizeInPagedSpace())
+ ? lo_space_->AllocateRaw(size)
+ : old_data_space_->AllocateRaw(size);
+ if (!maybe_result->ToObject(&result)) return maybe_result;
+ }
+
+ reinterpret_cast<HeapObject*>(result)->set_map(map);
+ // Set length and hash fields of the allocated string.
+ String* answer = String::cast(result);
+ answer->set_length(str.length());
+ answer->set_hash_field(hash_field);
+
+ ASSERT_EQ(size, answer->Size());
+
+ // Fill in the characters.
+ memcpy(answer->address() + SeqTwoByteString::kHeaderSize,
+ str.start(), str.length() * kUC16Size);
+
+ return answer;
+}
MaybeObject* Heap::CopyFixedArray(FixedArray* src) {
return CopyFixedArrayWithMap(src, src->map());
=======================================
--- /branches/bleeding_edge/src/heap.cc Wed Dec 22 08:07:59 2010
+++ /branches/bleeding_edge/src/heap.cc Wed Dec 22 12:14:19 2010
@@ -4011,6 +4011,36 @@
ASSERT(symbol != NULL);
return symbol;
}
+
+
+MaybeObject* Heap::LookupAsciiSymbol(Vector<const char> string) {
+ Object* symbol = NULL;
+ Object* new_table;
+ { MaybeObject* maybe_new_table =
+ symbol_table()->LookupAsciiSymbol(string, &symbol);
+ if (!maybe_new_table->ToObject(&new_table)) return maybe_new_table;
+ }
+ // Can't use set_symbol_table because SymbolTable::cast knows that
+ // SymbolTable is a singleton and checks for identity.
+ roots_[kSymbolTableRootIndex] = new_table;
+ ASSERT(symbol != NULL);
+ return symbol;
+}
+
+
+MaybeObject* Heap::LookupTwoByteSymbol(Vector<const uc16> string) {
+ Object* symbol = NULL;
+ Object* new_table;
+ { MaybeObject* maybe_new_table =
+ symbol_table()->LookupTwoByteSymbol(string, &symbol);
+ if (!maybe_new_table->ToObject(&new_table)) return maybe_new_table;
+ }
+ // Can't use set_symbol_table because SymbolTable::cast knows that
+ // SymbolTable is a singleton and checks for identity.
+ roots_[kSymbolTableRootIndex] = new_table;
+ ASSERT(symbol != NULL);
+ return symbol;
+}
MaybeObject* Heap::LookupSymbol(String* string) {
=======================================
--- /branches/bleeding_edge/src/heap.h Tue Dec 21 07:10:45 2010
+++ /branches/bleeding_edge/src/heap.h Wed Dec 22 12:14:19 2010
@@ -431,6 +431,14 @@
int chars,
uint32_t hash_field);
+ MUST_USE_RESULT static inline MaybeObject* AllocateAsciiSymbol(
+ Vector<const char> str,
+ uint32_t hash_field);
+
+ MUST_USE_RESULT static inline MaybeObject* AllocateTwoByteSymbol(
+ Vector<const uc16> str,
+ uint32_t hash_field);
+
MUST_USE_RESULT static MaybeObject* AllocateInternalSymbol(
unibrow::CharacterStream* buffer, int chars, uint32_t hash_field);
@@ -686,6 +694,9 @@
// failed.
// Please note this function does not perform a garbage collection.
MUST_USE_RESULT static MaybeObject* LookupSymbol(Vector<const char> str);
+ MUST_USE_RESULT static MaybeObject* LookupAsciiSymbol(Vector<const char> str);
+ MUST_USE_RESULT static MaybeObject* LookupTwoByteSymbol(
+ Vector<const uc16> str);
MUST_USE_RESULT static MaybeObject* LookupAsciiSymbol(const char* str) {
return LookupSymbol(CStrVector(str));
}
=======================================
--- /branches/bleeding_edge/src/objects.cc Mon Dec 20 02:38:19 2010
+++ /branches/bleeding_edge/src/objects.cc Wed Dec 22 12:14:19 2010
@@ -5141,6 +5141,26 @@
}
return i == slen && !decoder->has_more();
}
+
+
+bool String::IsAsciiEqualTo(Vector<const char> str) {
+ int slen = length();
+ if (str.length() != slen) return false;
+ for (int i = 0; i < slen; i++) {
+ if (Get(i) != static_cast<uint16_t>(str[i])) return false;
+ }
+ return true;
+}
+
+
+bool String::IsTwoByteEqualTo(Vector<const uc16> str) {
+ int slen = length();
+ if (str.length() != slen) return false;
+ for (int i = 0; i < slen; i++) {
+ if (Get(i) != str[i]) return false;
+ }
+ return true;
+}
template <typename schar>
@@ -8086,6 +8106,85 @@
};
+template <typename Char>
+class SequentialSymbolKey : public HashTableKey {
+ public:
+ explicit SequentialSymbolKey(Vector<const Char> string)
+ : string_(string), hash_field_(0) { }
+
+ uint32_t Hash() {
+ StringHasher hasher(string_.length());
+
+ // Very long strings have a trivial hash that doesn't inspect the
+ // string contents.
+ if (hasher.has_trivial_hash()) {
+ hash_field_ = hasher.GetHashField();
+ } else {
+ int i = 0;
+ // Do the iterative array index computation as long as there is a
+ // chance this is an array index.
+ while (i < string_.length() && hasher.is_array_index()) {
+ hasher.AddCharacter(static_cast<uc32>(string_[i]));
+ i++;
+ }
+
+ // Process the remaining characters without updating the array
+ // index.
+ while (i < string_.length()) {
+ hasher.AddCharacterNoIndex(static_cast<uc32>(string_[i]));
+ i++;
+ }
+ hash_field_ = hasher.GetHashField();
+ }
+
+ uint32_t result = hash_field_ >> String::kHashShift;
+ ASSERT(result != 0); // Ensure that the hash value of 0 is never computed.
+ return result;
+ }
+
+
+ uint32_t HashForObject(Object* other) {
+ return String::cast(other)->Hash();
+ }
+
+ Vector<const Char> string_;
+ uint32_t hash_field_;
+};
+
+
+
+class AsciiSymbolKey : public SequentialSymbolKey<char> {
+ public:
+ explicit AsciiSymbolKey(Vector<const char> str)
+ : SequentialSymbolKey<char>(str) { }
+
+ bool IsMatch(Object* string) {
+ return String::cast(string)->IsAsciiEqualTo(string_);
+ }
+
+ MaybeObject* AsObject() {
+ if (hash_field_ == 0) Hash();
+ return Heap::AllocateAsciiSymbol(string_, hash_field_);
+ }
+};
+
+
+class TwoByteSymbolKey : public SequentialSymbolKey<uc16> {
+ public:
+ explicit TwoByteSymbolKey(Vector<const uc16> str)
+ : SequentialSymbolKey<uc16>(str) { }
+
+ bool IsMatch(Object* string) {
+ return String::cast(string)->IsTwoByteEqualTo(string_);
+ }
+
+ MaybeObject* AsObject() {
+ if (hash_field_ == 0) Hash();
+ return Heap::AllocateTwoByteSymbol(string_, hash_field_);
+ }
+};
+
+
// SymbolKey carries a string/symbol object as key.
class SymbolKey : public HashTableKey {
public:
@@ -8829,6 +8928,19 @@
return LookupKey(&key, s);
}
+
+MaybeObject* SymbolTable::LookupAsciiSymbol(Vector<const char> str,
+ Object** s) {
+ AsciiSymbolKey key(str);
+ return LookupKey(&key, s);
+}
+
+
+MaybeObject* SymbolTable::LookupTwoByteSymbol(Vector<const uc16> str,
+ Object** s) {
+ TwoByteSymbolKey key(str);
+ return LookupKey(&key, s);
+}
MaybeObject* SymbolTable::LookupKey(HashTableKey* key, Object** s) {
int entry = FindEntry(key);
=======================================
--- /branches/bleeding_edge/src/objects.h Wed Dec 22 03:31:18 2010
+++ /branches/bleeding_edge/src/objects.h Wed Dec 22 12:14:19 2010
@@ -2327,6 +2327,10 @@
// been enlarged. If the return value is not a failure, the symbol
// pointer *s is set to the symbol found.
MUST_USE_RESULT MaybeObject* LookupSymbol(Vector<const char> str, Object** s);
+ MUST_USE_RESULT MaybeObject* LookupAsciiSymbol(Vector<const char> str,
+ Object** s);
+ MUST_USE_RESULT MaybeObject* LookupTwoByteSymbol(Vector<const uc16> str,
+ Object** s);
MUST_USE_RESULT MaybeObject* LookupString(String* key, Object** s);
// Looks up a symbol that is equal to the given string and returns
@@ -5074,6 +5078,8 @@
// String equality operations.
inline bool Equals(String* other);
bool IsEqualTo(Vector<const char> str);
+ bool IsAsciiEqualTo(Vector<const char> str);
+ bool IsTwoByteEqualTo(Vector<const uc16> str);
// Return a UTF8 representation of the string. The string is null
// terminated but may optionally contain nulls. Length is returned
=======================================
--- /branches/bleeding_edge/src/parser.cc Mon Dec 20 02:44:41 2010
+++ /branches/bleeding_edge/src/parser.cc Wed Dec 22 12:14:19 2010
@@ -323,22 +323,24 @@
}
-Handle<String> Parser::LookupSymbol(int symbol_id,
- Vector<const char> string) {
+Handle<String> Parser::LookupSymbol(int symbol_id) {
// Length of symbol cache is the number of identified symbols.
// If we are larger than that, or negative, it's not a cached symbol.
// This might also happen if there is no preparser symbol data, even
// if there is some preparser data.
if (static_cast<unsigned>(symbol_id)
>= static_cast<unsigned>(symbol_cache_.length())) {
- return Factory::LookupSymbol(string);
- }
- return LookupCachedSymbol(symbol_id, string);
+ if (scanner().is_literal_ascii()) {
+ return Factory::LookupAsciiSymbol(scanner().literal_ascii_string());
+ } else {
+ return Factory::LookupTwoByteSymbol(scanner().literal_uc16_string());
+ }
+ }
+ return LookupCachedSymbol(symbol_id);
}
-Handle<String> Parser::LookupCachedSymbol(int symbol_id,
- Vector<const char> string) {
+Handle<String> Parser::LookupCachedSymbol(int symbol_id) {
// Make sure the cache is large enough to hold the symbol identifier.
if (symbol_cache_.length() <= symbol_id) {
// Increase length to index + 1.
@@ -347,7 +349,11 @@
}
Handle<String> result = symbol_cache_.at(symbol_id);
if (result.is_null()) {
- result = Factory::LookupSymbol(string);
+ if (scanner().is_literal_ascii()) {
+ result = Factory::LookupAsciiSymbol(scanner().literal_ascii_string());
+ } else {
+ result = Factory::LookupTwoByteSymbol(scanner().literal_uc16_string());
+ }
symbol_cache_.at(symbol_id) = result;
return result;
}
@@ -615,11 +621,11 @@
// identical calls.
ExternalTwoByteStringUC16CharacterStream stream(
Handle<ExternalTwoByteString>::cast(source), 0, source->length());
- scanner_.Initialize(&stream, JavaScriptScanner::kAllLiterals);
+ scanner_.Initialize(&stream);
return DoParseProgram(source, in_global_context, &zone_scope);
} else {
GenericStringUC16CharacterStream stream(source, 0, source->length());
- scanner_.Initialize(&stream, JavaScriptScanner::kAllLiterals);
+ scanner_.Initialize(&stream);
return DoParseProgram(source, in_global_context, &zone_scope);
}
}
@@ -705,7 +711,7 @@
FunctionLiteral* Parser::ParseLazy(Handle<SharedFunctionInfo> info,
UC16CharacterStream* source,
ZoneScope* zone_scope) {
- scanner_.Initialize(source, JavaScriptScanner::kAllLiterals);
+ scanner_.Initialize(source);
ASSERT(target_stack_ == NULL);
Handle<String> name(String::cast(info->name()));
@@ -757,7 +763,7 @@
if (pre_data() != NULL) {
symbol_id = pre_data()->GetSymbolIdentifier();
}
- return LookupSymbol(symbol_id, scanner().literal());
+ return LookupSymbol(symbol_id);
}
@@ -2715,8 +2721,9 @@
case Token::NUMBER: {
Consume(Token::NUMBER);
- double value =
- StringToDouble(scanner().literal(), ALLOW_HEX | ALLOW_OCTALS);
+ ASSERT(scanner().is_literal_ascii());
+ double value = StringToDouble(scanner().literal_ascii_string(),
+ ALLOW_HEX | ALLOW_OCTALS);
result = NewNumberLiteral(value);
break;
}
@@ -3066,8 +3073,9 @@
}
case Token::NUMBER: {
Consume(Token::NUMBER);
- double value =
- StringToDouble(scanner().literal(), ALLOW_HEX | ALLOW_OCTALS);
+ ASSERT(scanner().is_literal_ascii());
+ double value = StringToDouble(scanner().literal_ascii_string(),
+ ALLOW_HEX | ALLOW_OCTALS);
key = NewNumberLiteral(value);
break;
}
@@ -3137,11 +3145,9 @@
int literal_index = temp_scope_->NextMaterializedLiteralIndex();
- Handle<String> js_pattern =
- Factory::NewStringFromUtf8(scanner().next_literal(), TENURED);
+ Handle<String> js_pattern = NextLiteralString(TENURED);
scanner().ScanRegExpFlags();
- Handle<String> js_flags =
- Factory::NewStringFromUtf8(scanner().next_literal(), TENURED);
+ Handle<String> js_flags = NextLiteralString(TENURED);
Next();
return new RegExpLiteral(js_pattern, js_flags, literal_index);
@@ -3423,10 +3429,10 @@
bool* ok) {
Expect(Token::IDENTIFIER, ok);
if (!*ok) return Handle<String>();
- if (scanner().literal_length() == 3) {
- const char* token = scanner().literal_string();
- *is_get = strcmp(token, "get") == 0;
- *is_set = !*is_get && strcmp(token, "set") == 0;
+ if (scanner().is_literal_ascii() && scanner().literal_length() == 3) {
+ const char* token = scanner().literal_ascii_string().start();
+ *is_get = strncmp(token, "get", 3) == 0;
+ *is_set = !*is_get && strncmp(token, "set", 3) == 0;
}
return GetSymbol(ok);
}
@@ -3604,9 +3610,11 @@
if (literal_length == 0) {
return Factory::empty_string();
}
- const char* literal_string = scanner_.literal_string();
- Vector<const char> literal(literal_string, literal_length);
- return Factory::NewStringFromUtf8(literal);
+ if (scanner_.is_literal_ascii()) {
+ return Factory::NewStringFromAscii(scanner_.literal_ascii_string());
+ } else {
+ return Factory::NewStringFromTwoByte(scanner_.literal_uc16_string());
+ }
}
@@ -3618,7 +3626,8 @@
return GetString();
}
case Token::NUMBER: {
- double value = StringToDouble(scanner_.literal(),
+ ASSERT(scanner_.is_literal_ascii());
+ double value = StringToDouble(scanner_.literal_ascii_string(),
NO_FLAGS, // Hex, octal or trailing junk.
OS::nan_value());
return Factory::NewNumber(value);
@@ -4597,10 +4606,9 @@
// Create a Scanner for the preparser to use as input, and preparse the source.
static ScriptDataImpl* DoPreParse(UC16CharacterStream* source,
bool allow_lazy,
- ParserRecorder* recorder,
- int literal_flags) {
+ ParserRecorder* recorder) {
V8JavaScriptScanner scanner;
- scanner.Initialize(source, literal_flags);
+ scanner.Initialize(source);
intptr_t stack_limit = StackGuard::real_climit();
if (!preparser::PreParser::PreParseProgram(&scanner,
recorder,
@@ -4628,8 +4636,7 @@
return NULL;
}
PartialParserRecorder recorder;
- return DoPreParse(source, allow_lazy, &recorder,
- JavaScriptScanner::kNoLiterals);
+ return DoPreParse(source, allow_lazy, &recorder);
}
@@ -4638,9 +4645,7 @@
Handle<Script> no_script;
bool allow_lazy = FLAG_lazy && (extension == NULL);
CompleteParserRecorder recorder;
- int kPreParseLiteralsFlags =
- JavaScriptScanner::kLiteralString | JavaScriptScanner::kLiteralIdentifier;
- return DoPreParse(source, allow_lazy, &recorder, kPreParseLiteralsFlags);
+ return DoPreParse(source, allow_lazy, &recorder);
}
=======================================
--- /branches/bleeding_edge/src/parser.h Tue Dec 7 06:03:59 2010
+++ /branches/bleeding_edge/src/parser.h Wed Dec 22 12:14:19 2010
@@ -578,6 +578,26 @@
bool Check(Token::Value token);
void ExpectSemicolon(bool* ok);
+ Handle<String> LiteralString(PretenureFlag tenured) {
+ if (scanner().is_literal_ascii()) {
+ return Factory::NewStringFromAscii(scanner().literal_ascii_string(),
+ tenured);
+ } else {
+ return Factory::NewStringFromTwoByte(scanner().literal_uc16_string(),
+ tenured);
+ }
+ }
+
+ Handle<String> NextLiteralString(PretenureFlag tenured) {
+ if (scanner().is_next_literal_ascii()) {
+ return Factory::NewStringFromAscii(scanner().next_literal_ascii_string(),
+ tenured);
+ } else {
+ return Factory::NewStringFromTwoByte(scanner().next_literal_uc16_string(),
+ tenured);
+ }
+ }
+
Handle<String> GetSymbol(bool* ok);
// Get odd-ball literals.
@@ -612,11 +632,9 @@
Scope* NewScope(Scope* parent, Scope::Type type, bool inside_with);
- Handle<String> LookupSymbol(int symbol_id,
- Vector<const char> string);
-
- Handle<String> LookupCachedSymbol(int symbol_id,
- Vector<const char> string);
+ Handle<String> LookupSymbol(int symbol_id);
+
+ Handle<String> LookupCachedSymbol(int symbol_id);
Expression* NewCall(Expression* expression,
ZoneList<Expression*>* arguments,
=======================================
--- /branches/bleeding_edge/src/preparse-data.cc Tue Dec 7 03:01:02 2010
+++ /branches/bleeding_edge/src/preparse-data.cc Wed Dec 22 12:14:19 2010
@@ -110,26 +110,29 @@
CompleteParserRecorder::CompleteParserRecorder()
: FunctionLoggingParserRecorder(),
+ literal_chars_(0),
symbol_store_(0),
- symbol_entries_(0),
+ symbol_keys_(0),
symbol_table_(vector_compare),
symbol_id_(0) {
}
-void CompleteParserRecorder::LogSymbol(
- int start, const char* literal_chars, int length) {
- if (!is_recording_) return;
-
- Vector<const char> literal(literal_chars, length);
- int hash = vector_hash(literal);
- HashMap::Entry* entry = symbol_table_.Lookup(&literal, hash, true);
+void CompleteParserRecorder::LogSymbol(int start,
+ int hash,
+ bool is_ascii,
+ Vector<const byte> literal_bytes) {
+ Key key = { is_ascii, literal_bytes };
+ HashMap::Entry* entry = symbol_table_.Lookup(&key, hash, true);
int id = static_cast<int>(reinterpret_cast<intptr_t>(entry->value));
if (id == 0) {
+ // Copy literal contents for later comparison.
+ key.literal_bytes =
+ Vector<const byte>::cast(literal_chars_.AddBlock(literal_bytes));
// Put (symbol_id_ + 1) into entry and increment it.
id = ++symbol_id_;
entry->value = reinterpret_cast<void*>(id);
- Vector<Vector<const char> > symbol = symbol_entries_.AddBlock(1, literal);
+ Vector<Key> symbol = symbol_keys_.AddBlock(1, key);
entry->key = &symbol[0];
}
WriteNumber(id - 1);
=======================================
--- /branches/bleeding_edge/src/preparse-data.h Tue Dec 7 03:01:02 2010
+++ /branches/bleeding_edge/src/preparse-data.h Wed Dec 22 12:14:19 2010
@@ -75,7 +75,8 @@
int properties) = 0;
// Logs a symbol creation of a literal or identifier.
- virtual void LogSymbol(int start, const char* symbol, int length) = 0;
+ virtual void LogAsciiSymbol(int start, Vector<const char> literal) { }
+ virtual void LogUC16Symbol(int start, Vector<const uc16> literal) { }
// Logs an error message and marks the log as containing an error.
// Further logging will be ignored, and ExtractData will return a vector
@@ -165,7 +166,8 @@
class PartialParserRecorder : public FunctionLoggingParserRecorder {
public:
PartialParserRecorder() : FunctionLoggingParserRecorder() { }
- virtual void LogSymbol(int start, const char* symbol, int length) { }
+ virtual void LogAsciiSymbol(int start, Vector<const char> literal) { }
+ virtual void LogUC16Symbol(int start, Vector<const uc16> literal) { }
virtual ~PartialParserRecorder() { }
virtual Vector<unsigned> ExtractData();
virtual int symbol_position() { return 0; }
@@ -181,7 +183,17 @@
CompleteParserRecorder();
virtual ~CompleteParserRecorder() { }
- virtual void LogSymbol(int start, const char* symbol, int length);
+ virtual void LogAsciiSymbol(int start, Vector<const char> literal) {
+ if (!is_recording_) return;
+ int hash = vector_hash(literal);
+ LogSymbol(start, hash, true, Vector<const byte>::cast(literal));
+ }
+
+ virtual void LogUC16Symbol(int start, Vector<const uc16> literal) {
+ if (!is_recording_) return;
+ int hash = vector_hash(literal);
+ LogSymbol(start, hash, false, Vector<const byte>::cast(literal));
+ }
virtual Vector<unsigned> ExtractData();
@@ -189,10 +201,21 @@
virtual int symbol_ids() { return symbol_id_; }
private:
- static int vector_hash(Vector<const char> string) {
+ struct Key {
+ bool is_ascii;
+ Vector<const byte> literal_bytes;
+ };
+
+ virtual void LogSymbol(int start,
+ int hash,
+ bool is_ascii,
+ Vector<const byte> literal);
+
+ template <typename Char>
+ static int vector_hash(Vector<const Char> string) {
int hash = 0;
for (int i = 0; i < string.length(); i++) {
- int c = string[i];
+ int c = static_cast<int>(string[i]);
hash += c;
hash += (hash << 10);
hash ^= (hash >> 6);
@@ -201,18 +224,21 @@
}
static bool vector_compare(void* a, void* b) {
- Vector<const char>* string1 = reinterpret_cast<Vector<const char>* >(a);
- Vector<const char>* string2 = reinterpret_cast<Vector<const char>* >(b);
- int length = string1->length();
- if (string2->length() != length) return false;
- return memcmp(string1->start(), string2->start(), length) == 0;
+ Key* string1 = reinterpret_cast<Key*>(a);
+ Key* string2 = reinterpret_cast<Key*>(b);
+ if (string1->is_ascii != string2->is_ascii) return false;
+ int length = string1->literal_bytes.length();
+ if (string2->literal_bytes.length() != length) return false;
+ return memcmp(string1->literal_bytes.start(),
+ string2->literal_bytes.start(), length) == 0;
}
// Write a non-negative number to the symbol store.
void WriteNumber(int number);
+ Collector<byte> literal_chars_;
Collector<byte> symbol_store_;
- Collector<Vector<const char> > symbol_entries_;
+ Collector<Key> symbol_keys_;
HashMap symbol_table_;
int symbol_id_;
};
=======================================
--- /branches/bleeding_edge/src/preparser.cc Tue Dec 7 06:03:59 2010
+++ /branches/bleeding_edge/src/preparser.cc Wed Dec 22 12:14:19 2010
@@ -1121,23 +1121,23 @@
PreParser::Identifier PreParser::GetIdentifierSymbol() {
- const char* literal_chars = scanner_->literal_string();
- int literal_length = scanner_->literal_length();
int identifier_pos = scanner_->location().beg_pos;
-
- log_->LogSymbol(identifier_pos, literal_chars, literal_length);
-
- return kUnknownExpression;
+ if (scanner_->is_literal_ascii()) {
+ log_->LogAsciiSymbol(identifier_pos, scanner_->literal_ascii_string());
+ } else {
+ log_->LogUC16Symbol(identifier_pos, scanner_->literal_uc16_string());
+ }
+ return kUnknownIdentifier;
}
PreParser::Expression PreParser::GetStringSymbol() {
- const char* literal_chars = scanner_->literal_string();
- int literal_length = scanner_->literal_length();
-
- int literal_position = scanner_->location().beg_pos;
- log_->LogSymbol(literal_position, literal_chars, literal_length);
-
+ int identifier_pos = scanner_->location().beg_pos;
+ if (scanner_->is_literal_ascii()) {
+ log_->LogAsciiSymbol(identifier_pos, scanner_->literal_ascii_string());
+ } else {
+ log_->LogUC16Symbol(identifier_pos, scanner_->literal_uc16_string());
+ }
return kUnknownExpression;
}
@@ -1154,7 +1154,8 @@
if (i::Token::IsKeyword(next)) {
int pos = scanner_->location().beg_pos;
const char* keyword = i::Token::String(next);
- log_->LogSymbol(pos, keyword, i::StrLength(keyword));
+ log_->LogAsciiSymbol(pos, i::Vector<const char>(keyword,
+ i::StrLength(keyword)));
return kUnknownExpression;
}
if (next == i::Token::IDENTIFIER) {
@@ -1173,8 +1174,8 @@
bool* is_set,
bool* ok) {
Expect(i::Token::IDENTIFIER, CHECK_OK);
- if (scanner_->literal_length() == 3) {
- const char* token = scanner_->literal_string();
+ if (scanner_->is_literal_ascii() && scanner_->literal_length() == 3) {
+ const char* token = scanner_->literal_ascii_string().start();
*is_get = strncmp(token, "get", 3) == 0;
*is_set = !*is_get && strncmp(token, "set", 3) == 0;
}
=======================================
--- /branches/bleeding_edge/src/scanner-base.cc Wed Dec 8 02:06:40 2010
+++ /branches/bleeding_edge/src/scanner-base.cc Wed Dec 22 12:14:19 2010
@@ -33,28 +33,6 @@
namespace v8 {
namespace internal {
-
-//
----------------------------------------------------------------------------
-// LiteralCollector
-
-LiteralCollector::LiteralCollector()
- : buffer_(kInitialCapacity), recording_(false) { }
-
-
-LiteralCollector::~LiteralCollector() {}
-
-
-void LiteralCollector::AddCharSlow(uc32 c) {
- ASSERT(static_cast<unsigned>(c) > unibrow::Utf8::kMaxOneByteChar);
- int length = unibrow::Utf8::Length(c);
- Vector<char> block = buffer_.AddBlock(length, '\0');
-#ifdef DEBUG
- int written_length = unibrow::Utf8::Encode(block.start(), c);
- CHECK_EQ(length, written_length);
-#else
- unibrow::Utf8::Encode(block.start(), c);
-#endif
-}
// ----------------------------------------------------------------------------
// Character predicates
@@ -256,7 +234,7 @@
void JavaScriptScanner::Scan() {
- next_.literal_chars = Vector<const char>();
+ next_.literal_chars = NULL;
Token::Value token;
do {
// Remember the position of the next token
@@ -561,7 +539,7 @@
uc32 quote = c0_;
Advance(); // consume quote
- LiteralScope literal(this, kLiteralString);
+ LiteralScope literal(this);
while (c0_ != quote && c0_ >= 0
&& !ScannerConstants::kIsLineTerminator.get(c0_)) {
uc32 c = c0_;
@@ -592,7 +570,7 @@
enum { DECIMAL, HEX, OCTAL } kind = DECIMAL;
- LiteralScope literal(this, kLiteralNumber);
+ LiteralScope literal(this);
if (seen_period) {
// we have already seen a decimal point of the float
AddLiteralChar('.');
@@ -681,7 +659,7 @@
Token::Value JavaScriptScanner::ScanIdentifierOrKeyword() {
ASSERT(ScannerConstants::kIsIdentifierStart.get(c0_));
- LiteralScope literal(this, kLiteralIdentifier);
+ LiteralScope literal(this);
KeywordMatcher keyword_match;
// Scan identifier start character.
if (c0_ == '\\') {
@@ -747,7 +725,7 @@
// Scan regular expression body: According to ECMA-262, 3rd, 7.8.5,
// the scanner should pass uninterpreted bodies to the RegExp
// constructor.
- LiteralScope literal(this, kLiteralRegExp);
+ LiteralScope literal(this);
if (seen_equal)
AddLiteralChar('=');
@@ -773,7 +751,7 @@
bool JavaScriptScanner::ScanRegExpFlags() {
// Scan regular expression flags.
- LiteralScope literal(this, kLiteralRegExpFlags);
+ LiteralScope literal(this);
while (ScannerConstants::kIsIdentifierPart.get(c0_)) {
if (c0_ == '\\') {
uc32 c = ScanIdentifierUnicodeEscape();
=======================================
--- /branches/bleeding_edge/src/scanner-base.h Tue Dec 7 06:03:59 2010
+++ /branches/bleeding_edge/src/scanner-base.h Wed Dec 22 12:14:19 2010
@@ -141,61 +141,103 @@
};
// ----------------------------------------------------------------------------
-// LiteralCollector - Collector of chars of literals.
-
-class LiteralCollector {
+// LiteralBuffer - Collector of chars of literals.
+
+class LiteralBuffer {
public:
- LiteralCollector();
- ~LiteralCollector();
-
- inline void AddChar(uc32 c) {
- if (recording_) {
- if (static_cast<unsigned>(c) <= unibrow::Utf8::kMaxOneByteChar) {
- buffer_.Add(static_cast<char>(c));
- } else {
- AddCharSlow(c);
- }
- }
+ LiteralBuffer() : is_ascii_(true), position_(0), backing_store_() { }
+
+ ~LiteralBuffer() {
+ if (backing_store_.length() > 0) {
+ backing_store_.Dispose();
+ }
+ }
+
+ inline void AddChar(uc16 character) {
+ if (position_ >= backing_store_.length()) ExpandBuffer();
+ if (is_ascii_) {
+ if (character < kMaxAsciiCharCodeU) {
+ backing_store_[position_] = static_cast<byte>(character);
+ position_ += kASCIISize;
+ return;
+ }
+ ConvertToUC16();
+ }
+ *reinterpret_cast<uc16*>(&backing_store_[position_]) = character;
+ position_ += kUC16Size;
}
- void StartLiteral() {
- buffer_.StartSequence();
- recording_ = true;
+ bool is_ascii() { return is_ascii_; }
+
+ Vector<const uc16> uc16_literal() {
+ ASSERT(!is_ascii_);
+ ASSERT((position_ & 0x1) == 0);
+ return Vector<const uc16>(
+ reinterpret_cast<const uc16*>(backing_store_.start()),
+ position_ >> 1);
}
- Vector<const char> EndLiteral() {
- if (recording_) {
- recording_ = false;
- buffer_.Add(kEndMarker);
- Vector<char> sequence = buffer_.EndSequence();
- return Vector<const char>(sequence.start(), sequence.length());
- }
- return Vector<const char>();
+ Vector<const char> ascii_literal() {
+ ASSERT(is_ascii_);
+ return Vector<const char>(
+ reinterpret_cast<const char*>(backing_store_.start()),
+ position_);
}
- void DropLiteral() {
- if (recording_) {
- recording_ = false;
- buffer_.DropSequence();
- }
+ int length() {
+ return is_ascii_ ? position_ : (position_ >> 1);
}
void Reset() {
- buffer_.Reset();
- }
-
- // The end marker added after a parsed literal.
- // Using zero allows the usage of strlen and similar functions on
- // identifiers and numbers (but not strings, since they may contain zero
- // bytes).
- static const char kEndMarker = '\x00';
+ position_ = 0;
+ is_ascii_ = true;
+ }
private:
- static const int kInitialCapacity = 256;
- SequenceCollector<char, 4> buffer_;
- bool recording_;
- void AddCharSlow(uc32 c);
+ static const int kInitialCapacity = 16;
+ static const int kGrowthFactory = 4;
+ static const int kMinConversionSlack = 256;
+ static const int kMaxGrowth = 1 * MB;
+ inline int NewCapacity(int min_capacity) {
+ int capacity = Max(min_capacity, backing_store_.length());
+ int new_capacity = Min(capacity * kGrowthFactory, capacity + kMaxGrowth);
+ return new_capacity;
+ }
+
+ void ExpandBuffer() {
+ Vector<byte> new_store = Vector<byte>::New(NewCapacity(kInitialCapacity));
+ memcpy(new_store.start(), backing_store_.start(), position_);
+ backing_store_.Dispose();
+ backing_store_ = new_store;
+ }
+
+ void ConvertToUC16() {
+ ASSERT(is_ascii_);
+ Vector<byte> new_store;
+ int new_content_size = position_ * kUC16Size;
+ if (new_content_size > backing_store_.length()) {
+ new_store = Vector<byte>::New(NewCapacity(new_content_size));
+ } else {
+ new_store = backing_store_;
+ }
+ char* src = reinterpret_cast<char*>(backing_store_.start());
+ uc16* dst = reinterpret_cast<uc16*>(new_store.start());
+ for (int i = position_ - 1; i >= 0; i--) {
+ dst[i] = src[i];
+ }
+ if (new_store.start() != backing_store_.start()) {
+ backing_store_.Dispose();
+ backing_store_ = new_store;
+ }
+ position_ = new_content_size;
+ is_ascii_ = false;
+ }
+
+ bool is_ascii_;
+ int position_;
+ Vector<byte> backing_store_;
};
+
// ----------------------------------------------------------------------------
// Scanner base-class.
@@ -241,35 +283,40 @@
// collected for identifiers, strings, and numbers.
// These functions only give the correct result if the literal
// was scanned between calls to StartLiteral() and TerminateLiteral().
- const char* literal_string() const {
- return current_.literal_chars.start();
- }
-
+ bool is_literal_ascii() {
+ ASSERT_NOT_NULL(current_.literal_chars);
+ return current_.literal_chars->is_ascii();
+ }
+ Vector<const char> literal_ascii_string() {
+ ASSERT_NOT_NULL(current_.literal_chars);
+ return current_.literal_chars->ascii_literal();
+ }
+ Vector<const uc16> literal_uc16_string() {
+ ASSERT_NOT_NULL(current_.literal_chars);
+ return current_.literal_chars->uc16_literal();
+ }
int literal_length() const {
- // Excluding terminal '\x00' added by TerminateLiteral().
- return current_.literal_chars.length() - 1;
- }
-
- Vector<const char> literal() const {
- return Vector<const char>(literal_string(), literal_length());
+ ASSERT_NOT_NULL(current_.literal_chars);
+ return current_.literal_chars->length();
}
// Returns the literal string for the next token (the token that
// would be returned if Next() were called).
- const char* next_literal_string() const {
- return next_.literal_chars.start();
- }
-
-
- // Returns the length of the next token (that would be returned if
- // Next() were called).
+ bool is_next_literal_ascii() {
+ ASSERT_NOT_NULL(next_.literal_chars);
+ return next_.literal_chars->is_ascii();
+ }
+ Vector<const char> next_literal_ascii_string() {
+ ASSERT_NOT_NULL(next_.literal_chars);
+ return next_.literal_chars->ascii_literal();
+ }
+ Vector<const uc16> next_literal_uc16_string() {
+ ASSERT_NOT_NULL(next_.literal_chars);
+ return next_.literal_chars->uc16_literal();
+ }
int next_literal_length() const {
- // Excluding terminal '\x00' added by TerminateLiteral().
- return next_.literal_chars.length() - 1;
- }
-
- Vector<const char> next_literal() const {
- return Vector<const char>(next_literal_string(), next_literal_length());
+ ASSERT_NOT_NULL(next_.literal_chars);
+ return next_.literal_chars->length();
}
static const int kCharacterLookaheadBufferSize = 1;
@@ -279,7 +326,7 @@
struct TokenDesc {
Token::Value token;
Location location;
- Vector<const char> literal_chars;
+ LiteralBuffer* literal_chars;
};
// Call this after setting source_ to the input.
@@ -288,29 +335,31 @@
ASSERT(kCharacterLookaheadBufferSize == 1);
Advance();
// Initialize current_ to not refer to a literal.
- current_.literal_chars = Vector<const char>();
- // Reset literal buffer.
- literal_buffer_.Reset();
+ current_.literal_chars = NULL;
}
// Literal buffer support
inline void StartLiteral() {
- literal_buffer_.StartLiteral();
+ LiteralBuffer* free_buffer = (current_.literal_chars == &literal_buffer1_) ?
+ &literal_buffer2_ : &literal_buffer1_;
+ free_buffer->Reset();
+ next_.literal_chars = free_buffer;
}
inline void AddLiteralChar(uc32 c) {
- literal_buffer_.AddChar(c);
+ ASSERT_NOT_NULL(next_.literal_chars);
+ next_.literal_chars->AddChar(c);
}
// Complete scanning of a literal.
inline void TerminateLiteral() {
- next_.literal_chars = literal_buffer_.EndLiteral();
+ // Does nothing in the current implementation.
}
// Stops scanning of a literal and drop the collected characters,
// e.g., due to an encountered error.
inline void DropLiteral() {
- literal_buffer_.DropLiteral();
+ next_.literal_chars = NULL;
}
inline void AddLiteralCharAdvance() {
@@ -347,6 +396,10 @@
int source_pos() {
return source_->pos() - kCharacterLookaheadBufferSize;
}
+
+ // Buffers collecting literal strings, numbers, etc.
+ LiteralBuffer literal_buffer1_;
+ LiteralBuffer literal_buffer2_;
TokenDesc current_; // desc for current token (as returned by Next())
TokenDesc next_; // desc for next token (one token look-ahead)
@@ -354,9 +407,6 @@
// Input stream. Must be initialized to an UC16CharacterStream.
UC16CharacterStream* source_;
- // Buffer to hold literal values (identifiers, strings, numbers)
- // using '\x00'-terminated UTF-8 encoding. Handles allocation internally.
- LiteralCollector literal_buffer_;
// One Unicode character look-ahead; c0_ < 0 at the end of the input.
uc32 c0_;
@@ -367,28 +417,14 @@
class JavaScriptScanner : public Scanner {
public:
-
- // Bit vector representing set of types of literals.
- enum LiteralType {
- kNoLiterals = 0,
- kLiteralNumber = 1,
- kLiteralIdentifier = 2,
- kLiteralString = 4,
- kLiteralRegExp = 8,
- kLiteralRegExpFlags = 16,
- kAllLiterals = 31
- };
-
// A LiteralScope that disables recording of some types of JavaScript
// literals. If the scanner is configured to not record the specific
// type of literal, the scope will not call StartLiteral.
class LiteralScope {
public:
- LiteralScope(JavaScriptScanner* self, LiteralType type)
+ explicit LiteralScope(JavaScriptScanner* self)
: scanner_(self), complete_(false) {
- if (scanner_->RecordsLiteral(type)) {
- scanner_->StartLiteral();
- }
+ scanner_->StartLiteral();
}
~LiteralScope() {
if (!complete_) scanner_->DropLiteral();
@@ -429,11 +465,6 @@
// characters, but works for seeking forward until simple delimiter
// tokens, which is what it is used for.
void SeekForward(int pos);
-
- // Whether this scanner records the given literal type or not.
- bool RecordsLiteral(LiteralType type) {
- return (literal_flags_ & type) != 0;
- }
protected:
bool SkipWhiteSpace();
@@ -458,7 +489,6 @@
// If the escape sequence cannot be decoded the result is kBadChar.
uc32 ScanIdentifierUnicodeEscape();
- int literal_flags_;
bool has_line_terminator_before_next_;
};
=======================================
--- /branches/bleeding_edge/src/scanner.cc Tue Dec 7 06:03:59 2010
+++ /branches/bleeding_edge/src/scanner.cc Wed Dec 22 12:14:19 2010
@@ -324,10 +324,8 @@
V8JavaScriptScanner::V8JavaScriptScanner() : JavaScriptScanner() { }
-void V8JavaScriptScanner::Initialize(UC16CharacterStream* source,
- int literal_flags) {
+void V8JavaScriptScanner::Initialize(UC16CharacterStream* source) {
source_ = source;
- literal_flags_ = literal_flags | kLiteralIdentifier;
// Need to capture identifiers in order to recognize "get" and "set"
// in object literals.
Init();
@@ -377,7 +375,7 @@
void JsonScanner::ScanJson() {
- next_.literal_chars = Vector<const char>();
+ next_.literal_chars = NULL;
Token::Value token;
do {
// Remember the position of the next token
=======================================
--- /branches/bleeding_edge/src/scanner.h Tue Dec 7 06:03:59 2010
+++ /branches/bleeding_edge/src/scanner.h Wed Dec 22 12:14:19 2010
@@ -134,8 +134,7 @@
class V8JavaScriptScanner : public JavaScriptScanner {
public:
V8JavaScriptScanner();
- void Initialize(UC16CharacterStream* source,
- int literal_flags = kAllLiterals);
+ void Initialize(UC16CharacterStream* source);
};
=======================================
--- /branches/bleeding_edge/src/utils.h Tue Dec 7 03:31:57 2010
+++ /branches/bleeding_edge/src/utils.h Wed Dec 22 12:14:19 2010
@@ -528,6 +528,24 @@
}
return Vector<T>(position, size);
}
+
+
+ // Add a contiguous block of elements and return a vector backed
+ // by the added block.
+ // A basic Collector will keep this vector valid as long as the Collector
+ // is alive.
+ inline Vector<T> AddBlock(Vector<const T> source) {
+ if (source.length() > current_chunk_.length() - index_) {
+ Grow(source.length());
+ }
+ T* position = current_chunk_.start() + index_;
+ index_ += source.length();
+ size_ += source.length();
+ for (int i = 0; i < source.length(); i++) {
+ position[i] = source[i];
+ }
+ return Vector<T>(position, source.length());
+ }
// Write the contents of the collector into the provided vector.
=======================================
--- /branches/bleeding_edge/test/cctest/test-parsing.cc Tue Dec 7 06:03:59 2010
+++ /branches/bleeding_edge/test/cctest/test-parsing.cc Wed Dec 22 12:14:19 2010
@@ -573,7 +573,7 @@
int skip_pos = 0, // Zero means not skipping.
int skip_to = 0) {
i::V8JavaScriptScanner scanner;
- scanner.Initialize(stream, i::JavaScriptScanner::kAllLiterals);
+ scanner.Initialize(stream);
int i = 0;
do {
--
v8-dev mailing list
[email protected]
http://groups.google.com/group/v8-dev