sammccall created this revision.
sammccall added a reviewer: ioeric.
Herald added subscribers: cfe-commits, kadircet, arphaman, jkorous, MaskRay, 
ilya-biryukov.

The eventual goal is 8 bytes, which carries a nonzero risk of collisions with
huge indexes. This patch truncates to 16 bytes, which should shake out any
issues with truncation itself; we can lower the size further later.


Repository:
  rCTE Clang Tools Extra

https://reviews.llvm.org/D53587

Files:
  clangd/index/Index.cpp
  clangd/index/Index.h
  clangd/index/Serialization.cpp
  unittests/clangd/SerializationTests.cpp

Index: unittests/clangd/SerializationTests.cpp
===================================================================
--- unittests/clangd/SerializationTests.cpp
+++ unittests/clangd/SerializationTests.cpp
@@ -27,7 +27,7 @@
 const char *YAML = R"(
 ---
 !Symbol
-ID: 057557CEBF6E6B2DD437FBF60CC58F352D1DF856
+ID: 057557CEBF6E6B2DD437FBF60CC58F35
 Name:   'Foo1'
 Scope:   'clang::'
 SymInfo:
@@ -53,7 +53,7 @@
 ...
 ---
 !Symbol
-ID: 057557CEBF6E6B2DD437FBF60CC58F352D1DF858
+ID: 057557CEBF6E6B2DD437FBF60CC58F36
 Name:   'Foo2'
 Scope:   'clang::'
 SymInfo:
@@ -72,7 +72,7 @@
 CompletionSnippetSuffix:    '-snippet'
 ...
 !Refs
-ID: 057557CEBF6E6B2DD437FBF60CC58F352D1DF856
+ID: 057557CEBF6E6B2DD437FBF60CC58F35
 References:
   - Kind: 4
     Location:
@@ -98,15 +98,14 @@
   auto ParsedYAML = readIndexFile(YAML);
   ASSERT_TRUE(bool(ParsedYAML)) << ParsedYAML.takeError();
   ASSERT_TRUE(bool(ParsedYAML->Symbols));
-  EXPECT_THAT(
-      *ParsedYAML->Symbols,
-      UnorderedElementsAre(ID("057557CEBF6E6B2DD437FBF60CC58F352D1DF856"),
-                           ID("057557CEBF6E6B2DD437FBF60CC58F352D1DF858")));
+  EXPECT_THAT(*ParsedYAML->Symbols,
+              UnorderedElementsAre(ID("057557CEBF6E6B2DD437FBF60CC58F35"),
+                                   ID("057557CEBF6E6B2DD437FBF60CC58F36")));
 
   auto Sym1 = *ParsedYAML->Symbols->find(
-      cantFail(SymbolID::fromStr("057557CEBF6E6B2DD437FBF60CC58F352D1DF856")));
+      cantFail(SymbolID::fromStr("057557CEBF6E6B2DD437FBF60CC58F35")));
   auto Sym2 = *ParsedYAML->Symbols->find(
-      cantFail(SymbolID::fromStr("057557CEBF6E6B2DD437FBF60CC58F352D1DF858")));
+      cantFail(SymbolID::fromStr("057557CEBF6E6B2DD437FBF60CC58F36")));
 
   EXPECT_THAT(Sym1, QName("clang::Foo1"));
   EXPECT_EQ(Sym1.Signature, "");
@@ -128,11 +127,11 @@
   EXPECT_TRUE(Sym2.Flags & Symbol::Deprecated);
 
   ASSERT_TRUE(bool(ParsedYAML->Refs));
-  EXPECT_THAT(*ParsedYAML->Refs,
-              UnorderedElementsAre(
-                  Pair(cantFail(SymbolID::fromStr(
-                           "057557CEBF6E6B2DD437FBF60CC58F352D1DF856")),
-                       testing::SizeIs(1))));
+  EXPECT_THAT(
+      *ParsedYAML->Refs,
+      UnorderedElementsAre(
+          Pair(cantFail(SymbolID::fromStr("057557CEBF6E6B2DD437FBF60CC58F35")),
+               testing::SizeIs(1))));
   auto Ref1 = ParsedYAML->Refs->begin()->second.front();
   EXPECT_EQ(Ref1.Kind, RefKind::Reference);
   EXPECT_EQ(Ref1.Location.FileURI, "file:///path/foo.cc");
Index: clangd/index/Serialization.cpp
===================================================================
--- clangd/index/Serialization.cpp
+++ clangd/index/Serialization.cpp
@@ -300,7 +300,7 @@
 
 // REFS ENCODING
 // A refs section has data grouped by Symbol. Each symbol has:
-//  - SymbolID: 20 bytes
+//  - SymbolID: 16 bytes
 //  - NumRefs: varint
 //  - Ref[NumRefs]
 // Fields of Ref are encoded in turn, see implementation.
Index: clangd/index/Index.h
===================================================================
--- clangd/index/Index.h
+++ clangd/index/Index.h
@@ -89,7 +89,7 @@
 // The class identifies a particular C++ symbol (class, function, method, etc).
 //
 // As USRs (Unified Symbol Resolution) could be large, especially for functions
-// with long type arguments, SymbolID is using 160-bits SHA1(USR) values to
+// with long type arguments, SymbolID is using SHA1(USR) values to
 // guarantee the uniqueness of symbols while using a relatively small amount of
 // memory (vs storing USRs directly).
 //
@@ -106,13 +106,16 @@
     return HashValue < Sym.HashValue;
   }
 
-  constexpr static size_t RawSize = 20;
+  // The stored hash is truncated to RawSize bytes.
+  // This trades off memory against the number of symbols we can handle.
+  // FIXME: can we reduce this further to 8 bytes?
+  constexpr static size_t RawSize = 16;
   llvm::StringRef raw() const {
     return StringRef(reinterpret_cast<const char *>(HashValue.data()), RawSize);
   }
   static SymbolID fromRaw(llvm::StringRef);
 
-  // Returns a 40-bytes hex encoded string.
+  // Returns a hex encoded string.
   std::string str() const;
   static llvm::Expected<SymbolID> fromStr(llvm::StringRef);
 
Index: clangd/index/Index.cpp
===================================================================
--- clangd/index/Index.cpp
+++ clangd/index/Index.cpp
@@ -43,8 +43,11 @@
             << "-" << L.End.line() << ":" << L.End.column() << ")";
 }
 
-SymbolID::SymbolID(StringRef USR)
-    : HashValue(SHA1::hash(arrayRefFromStringRef(USR))) {}
+SymbolID::SymbolID(StringRef USR) {
+  auto Hash = SHA1::hash(arrayRefFromStringRef(USR));
+  static_assert(sizeof(Hash) >= RawSize, "RawSize larger than SHA1");
+  memcpy(HashValue.data(), Hash.data(), RawSize);
+}
 
 raw_ostream &operator<<(raw_ostream &OS, const SymbolID &ID) {
   return OS << toHex(ID.raw());
_______________________________________________
cfe-commits mailing list
cfe-commits@lists.llvm.org
http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

Reply via email to