sammccall created this revision.
sammccall added a reviewer: ilya-biryukov.
Herald added subscribers: cfe-commits, mgrang, klimek.

This improves a few things:

- the insert -> freeze -> read sequence is now enforced/communicated by the 
type system
- SymbolSlab::const_iterator iterates over symbols, not over id-symbol pairs
- we avoid permanently storing a second copy of the IDs, and the string map's 
hashtable

The slab size is now down to 21.8MB for the LLVM project.
Of this only 2.7MB is strings, the rest is #symbols * `sizeof(Symbol)`.
`sizeof(Symbol)` is currently 96, which seems too big - I think
SymbolInfo isn't efficiently packed. That's a topic for another patch!

Also added simple API to see the memory usage/#symbols of a slab, since
it seems likely we will continue to care about this.


Repository:
  rCTE Clang Tools Extra

https://reviews.llvm.org/D41506

Files:
  clangd/index/FileIndex.cpp
  clangd/index/Index.cpp
  clangd/index/Index.h
  clangd/index/SymbolCollector.cpp
  clangd/index/SymbolCollector.h
  clangd/index/SymbolYAML.cpp
  unittests/clangd/CodeCompleteTests.cpp
  unittests/clangd/FileIndexTests.cpp
  unittests/clangd/IndexTests.cpp
  unittests/clangd/SymbolCollectorTests.cpp

Index: unittests/clangd/SymbolCollectorTests.cpp
===================================================================
--- unittests/clangd/SymbolCollectorTests.cpp
+++ unittests/clangd/SymbolCollectorTests.cpp
@@ -32,8 +32,7 @@
 
 // GMock helpers for matching Symbol.
 MATCHER_P(QName, Name, "") {
-  return (arg.second.Scope + (arg.second.Scope.empty() ? "" : "::") +
-          arg.second.Name).str() == Name;
+  return (arg.Scope + (arg.Scope.empty() ? "" : "::") + arg.Name).str() == Name;
 }
 
 namespace clang {
Index: unittests/clangd/IndexTests.cpp
===================================================================
--- unittests/clangd/IndexTests.cpp
+++ unittests/clangd/IndexTests.cpp
@@ -45,18 +45,18 @@
 std::shared_ptr<std::vector<const Symbol *>>
 generateSymbols(std::vector<std::string> QualifiedNames,
                 std::weak_ptr<SlabAndPointers> *WeakSymbols = nullptr) {
-  auto Slab = std::make_shared<SlabAndPointers>();
-  if (WeakSymbols)
-    *WeakSymbols = Slab;
-
+  SymbolSlab::Builder Slab;
   for (llvm::StringRef QName : QualifiedNames)
-    Slab->Slab.insert(symbol(QName));
+    Slab.insert(symbol(QName));
 
-  for (const auto &Sym : Slab->Slab)
-    Slab->Pointers.push_back(&Sym.second);
-
-  auto *Pointers = &Slab->Pointers;
-  return {std::move(Slab), Pointers};
+  auto Storage = std::make_shared<SlabAndPointers>();
+  Storage->Slab = std::move(Slab).build();
+  for (const auto &Sym : Storage->Slab)
+    Storage->Pointers.push_back(&Sym);
+  if (WeakSymbols)
+    *WeakSymbols = Storage;
+  auto *Pointers = &Storage->Pointers;
+  return {std::move(Storage), Pointers};
 }
 
 // Create a slab of symbols with IDs and names [Begin, End], otherwise identical
Index: unittests/clangd/FileIndexTests.cpp
===================================================================
--- unittests/clangd/FileIndexTests.cpp
+++ unittests/clangd/FileIndexTests.cpp
@@ -28,9 +28,11 @@
   return Sym;
 }
 
-void addNumSymbolsToSlab(int Begin, int End, SymbolSlab *Slab) {
+std::unique_ptr<SymbolSlab> numSlab(int Begin, int End) {
+  SymbolSlab::Builder Slab;
   for (int i = Begin; i <= End; i++)
-    Slab->insert(symbol(std::to_string(i)));
+    Slab.insert(symbol(std::to_string(i)));
+  return llvm::make_unique<SymbolSlab>(std::move(Slab).build());
 }
 
 std::vector<std::string>
@@ -45,46 +47,29 @@
   FileSymbols FS;
   EXPECT_THAT(getSymbolNames(*FS.allSymbols()), UnorderedElementsAre());
 
-  auto Slab = llvm::make_unique<SymbolSlab>();
-  addNumSymbolsToSlab(1, 3, Slab.get());
-
-  FS.update("f1", std::move(Slab));
-
+  FS.update("f1", numSlab(1, 3));
   EXPECT_THAT(getSymbolNames(*FS.allSymbols()),
               UnorderedElementsAre("1", "2", "3"));
 }
 
 TEST(FileSymbolsTest, Overlap) {
   FileSymbols FS;
-
-  auto Slab = llvm::make_unique<SymbolSlab>();
-  addNumSymbolsToSlab(1, 3, Slab.get());
-
-  FS.update("f1", std::move(Slab));
-
-  Slab = llvm::make_unique<SymbolSlab>();
-  addNumSymbolsToSlab(3, 5, Slab.get());
-
-  FS.update("f2", std::move(Slab));
-
+  FS.update("f1", numSlab(1, 3));
+  FS.update("f2", numSlab(3, 5));
   EXPECT_THAT(getSymbolNames(*FS.allSymbols()),
               UnorderedElementsAre("1", "2", "3", "3", "4", "5"));
 }
 
 TEST(FileSymbolsTest, SnapshotAliveAfterRemove) {
   FileSymbols FS;
 
-  auto Slab = llvm::make_unique<SymbolSlab>();
-  addNumSymbolsToSlab(1, 3, Slab.get());
-
-  FS.update("f1", std::move(Slab));
+  FS.update("f1", numSlab(1, 3));
 
   auto Symbols = FS.allSymbols();
   EXPECT_THAT(getSymbolNames(*Symbols), UnorderedElementsAre("1", "2", "3"));
 
   FS.update("f1", nullptr);
   EXPECT_THAT(getSymbolNames(*FS.allSymbols()), UnorderedElementsAre());
-
   EXPECT_THAT(getSymbolNames(*Symbols), UnorderedElementsAre("1", "2", "3"));
 }
 
Index: unittests/clangd/CodeCompleteTests.cpp
===================================================================
--- unittests/clangd/CodeCompleteTests.cpp
+++ unittests/clangd/CodeCompleteTests.cpp
@@ -449,6 +449,7 @@
     std::vector<const Symbol *> Pointers;
   };
   auto Snap = std::make_shared<Snapshot>();
+  SymbolSlab::Builder Slab;
   for (const auto &Pair : Symbols) {
     Symbol Sym;
     Sym.ID = SymbolID(Pair.first);
@@ -462,10 +463,11 @@
       Sym.Scope = QName.substr(0, Pos);
     }
     Sym.SymInfo.Kind = Pair.second;
-    Snap->Slab.insert(std::move(Sym));
+    Slab.insert(Sym);
   }
+  Snap->Slab = std::move(Slab).build();
   for (auto &Iter : Snap->Slab)
-    Snap->Pointers.push_back(&Iter.second);
+    Snap->Pointers.push_back(&Iter);
   auto S = std::shared_ptr<std::vector<const Symbol *>>(std::move(Snap),
                                                         &Snap->Pointers);
   I->build(std::move(S));
Index: clangd/index/SymbolYAML.cpp
===================================================================
--- clangd/index/SymbolYAML.cpp
+++ clangd/index/SymbolYAML.cpp
@@ -127,20 +127,18 @@
   std::vector<Symbol> S;
   llvm::yaml::Input Yin(YAMLContent);
   Yin >> S;
-  SymbolSlab Syms;
+  SymbolSlab::Builder Syms;
   for (auto& Sym : S)
-    Syms.insert(std::move(Sym));
-  return Syms;
+    Syms.insert(Sym);
+  return std::move(Syms).build();
 }
 
 std::string SymbolToYAML(const SymbolSlab& Symbols) {
   std::string Str;
   llvm::raw_string_ostream OS(Str);
   llvm::yaml::Output Yout(OS);
-  for (auto &Pair : Symbols) {
-    Symbol MutableSymbol = Pair.second;
-    Yout<< MutableSymbol;
-  }
+  for (Symbol S : Symbols) // copy: Yout<< requires mutability.
+    Yout<< S;
   return OS.str();
 }
 
Index: clangd/index/SymbolCollector.h
===================================================================
--- clangd/index/SymbolCollector.h
+++ clangd/index/SymbolCollector.h
@@ -29,13 +29,11 @@
                       unsigned Offset,
                       index::IndexDataConsumer::ASTNodeInfo ASTNode) override;
 
-  void finish() override;
-
-  SymbolSlab takeSymbols() { return std::move(Symbols); }
+  SymbolSlab takeSymbols() { return std::move(Symbols).build(); }
 
 private:
   // All Symbols collected from the AST.
-  SymbolSlab Symbols;
+  SymbolSlab::Builder Symbols;
 };
 
 } // namespace clangd
Index: clangd/index/SymbolCollector.cpp
===================================================================
--- clangd/index/SymbolCollector.cpp
+++ clangd/index/SymbolCollector.cpp
@@ -91,7 +91,7 @@
 
     std::string USR(Buff.data(), Buff.size());
     auto ID = SymbolID(USR);
-    if (Symbols.find(ID) != Symbols.end())
+    if (Symbols.find(ID) != nullptr)
       return true;
 
     auto &SM = ND->getASTContext().getSourceManager();
@@ -107,7 +107,5 @@
   return true;
 }
 
-void SymbolCollector::finish() { Symbols.freeze(); }
-
 } // namespace clangd
 } // namespace clang
Index: clangd/index/Index.h
===================================================================
--- clangd/index/Index.h
+++ clangd/index/Index.h
@@ -13,9 +13,9 @@
 #include "../Context.h"
 #include "clang/Index/IndexSymbol.h"
 #include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DenseSet.h"
 #include "llvm/ADT/Hashing.h"
 #include "llvm/ADT/StringExtras.h"
-#include "llvm/ADT/StringSet.h"
 #include <array>
 #include <string>
 
@@ -49,6 +49,9 @@
   bool operator==(const SymbolID &Sym) const {
     return HashValue == Sym.HashValue;
   }
+  bool operator<(const SymbolID &Sym) const {
+    return HashValue < Sym.HashValue;
+  }
 
 private:
   friend llvm::hash_code hash_value(const SymbolID &ID) {
@@ -70,6 +73,31 @@
 // "<<" operator.
 void operator>>(llvm::StringRef HexStr, SymbolID &ID);
 
+} // namespace clangd
+} // namespace clang
+namespace llvm {
+// Support SymbolIDs as DenseMap keys.
+template <> struct DenseMapInfo<clang::clangd::SymbolID> {
+  static inline clang::clangd::SymbolID getEmptyKey() {
+    static clang::clangd::SymbolID EmptyKey("EMPTYKEY");
+    return EmptyKey;
+  }
+  static inline clang::clangd::SymbolID getTombstoneKey() {
+    static clang::clangd::SymbolID TombstoneKey("TOMBSTONEKEY");
+    return TombstoneKey;
+  }
+  static unsigned getHashValue(const clang::clangd::SymbolID &Sym) {
+    return hash_value(Sym);
+  }
+  static bool isEqual(const clang::clangd::SymbolID &LHS,
+                      const clang::clangd::SymbolID &RHS) {
+    return LHS == RHS;
+  }
+};
+} // namespace llvm
+namespace clang {
+namespace clangd {
+
 // The class presents a C++ symbol, e.g. class, function.
 //
 // WARNING: Symbols do not own much of their underlying data - typically strings
@@ -102,38 +130,56 @@
   // FIXME: add code completion information.
 };
 
-// A symbol container that stores a set of symbols. The container will maintain
-// the lifetime of the symbols.
+// An immutable symbol container that stores a set of symbols.
+// The container will maintain the lifetime of the symbols.
 class SymbolSlab {
 public:
-  using const_iterator = llvm::DenseMap<SymbolID, Symbol>::const_iterator;
+  using const_iterator = std::vector<Symbol>::const_iterator;
 
   SymbolSlab() = default;
 
   const_iterator begin() const;
   const_iterator end() const;
   const_iterator find(const SymbolID &SymID) const;
 
-  // Once called, no more symbols would be added to the SymbolSlab. This
-  // operation is irreversible.
-  void freeze();
+  size_t size() const { return Symbols.size(); }
+  // Estimates the total memory usage.
+  size_t bytes() const {
+    return sizeof(*this) + Arena.getTotalMemory() +
+           Symbols.capacity() * sizeof(Symbol);
+  }
 
-  // Adds the symbol to this slab.
-  // This is a deep copy: underlying strings will be owned by the slab.
-  void insert(const Symbol& S);
+  // SymbolSlab::Builder is a mutable container that can 'freeze' to SymbolSlab.
+  // The frozen SymbolSlab will use less memory.
+  class Builder {
+   public:
+     // Adds a symbol, overwriting any existing one with the same ID.
+     // This is a deep copy: underlying strings will be owned by the slab.
+     void insert(const Symbol& S);
+
+     // Returns the symbol with an ID, if it exists. Valid until next insert().
+     const Symbol* find(const SymbolID &ID) {
+       auto I = SymbolIndex.find(ID);
+       return I == SymbolIndex.end() ? nullptr : &Symbols[I->second];
+     }
+
+     // Consumes the builder to finalize the slab.
+     SymbolSlab build() &&;
+
+   private:
+     llvm::BumpPtrAllocator Arena;
+     // Intern table for strings. Contents are on the arena.
+     llvm::DenseSet<llvm::StringRef> Strings;
+     std::vector<Symbol> Symbols;
+     llvm::DenseMap<SymbolID, size_t> SymbolIndex;
+  };
 
 private:
-  // Replaces S with a reference to the same string, owned by this slab.
-  void intern(llvm::StringRef &S) {
-    S = S.empty() ? llvm::StringRef() : Strings.insert(S).first->getKey();
-  }
-
-  bool Frozen = false;
+  SymbolSlab(llvm::BumpPtrAllocator Arena, std::vector<Symbol> Symbols)
+      : Arena(std::move(Arena)), Symbols(std::move(Symbols)) {}
 
-  // Intern table for strings. Not StringPool as we don't refcount, just insert.
-  // We use BumpPtrAllocator to avoid lots of tiny allocations for nodes.
-  llvm::StringSet<llvm::BumpPtrAllocator> Strings;
-  llvm::DenseMap<SymbolID, Symbol> Symbols;
+  llvm::BumpPtrAllocator Arena;
+  std::vector<Symbol> Symbols;
 };
 
 struct FuzzyFindRequest {
@@ -174,27 +220,4 @@
 
 } // namespace clangd
 } // namespace clang
-
-namespace llvm {
-
-template <> struct DenseMapInfo<clang::clangd::SymbolID> {
-  static inline clang::clangd::SymbolID getEmptyKey() {
-    static clang::clangd::SymbolID EmptyKey("EMPTYKEY");
-    return EmptyKey;
-  }
-  static inline clang::clangd::SymbolID getTombstoneKey() {
-    static clang::clangd::SymbolID TombstoneKey("TOMBSTONEKEY");
-    return TombstoneKey;
-  }
-  static unsigned getHashValue(const clang::clangd::SymbolID &Sym) {
-    return hash_value(Sym);
-  }
-  static bool isEqual(const clang::clangd::SymbolID &LHS,
-                      const clang::clangd::SymbolID &RHS) {
-    return LHS == RHS;
-  }
-};
-
-} // namespace llvm
-
 #endif // LLVM_CLANG_TOOLS_EXTRA_CLANGD_INDEX_INDEX_H
Index: clangd/index/Index.cpp
===================================================================
--- clangd/index/Index.cpp
+++ clangd/index/Index.cpp
@@ -9,20 +9,22 @@
 
 #include "Index.h"
 #include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/DenseSet.h"
 #include "llvm/Support/SHA1.h"
 
 namespace clang {
 namespace clangd {
+using namespace llvm;
 
-SymbolID::SymbolID(llvm::StringRef USR)
-    : HashValue(llvm::SHA1::hash(arrayRefFromStringRef(USR))) {}
+SymbolID::SymbolID(StringRef USR)
+    : HashValue(SHA1::hash(arrayRefFromStringRef(USR))) {}
 
-llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, const SymbolID &ID) {
-  OS << toHex(llvm::toStringRef(ID.HashValue));
+raw_ostream &operator<<(raw_ostream &OS, const SymbolID &ID) {
+  OS << toHex(toStringRef(ID.HashValue));
   return OS;
 }
 
-void operator>>(llvm::StringRef Str, SymbolID &ID) {
+void operator>>(StringRef Str, SymbolID &ID) {
   std::string HexString = fromHex(Str);
   assert(HexString.size() == 20);
   std::copy(HexString.begin(), HexString.end(), ID.HashValue.begin());
@@ -32,23 +34,57 @@
 
 SymbolSlab::const_iterator SymbolSlab::end() const { return Symbols.end(); }
 
-SymbolSlab::const_iterator SymbolSlab::find(const SymbolID &SymID) const {
-  return Symbols.find(SymID);
+SymbolSlab::const_iterator SymbolSlab::find(const SymbolID &ID) const {
+  auto It = std::lower_bound(Symbols.begin(), Symbols.end(), ID,
+                             [](const Symbol &S, const SymbolID &I) {
+                               return S.ID == I;
+                             });
+  if (It != Symbols.end() || It->ID == ID)
+    return It;
+  return Symbols.end();
 }
 
-void SymbolSlab::freeze() { Frozen = true; }
+// Replace S with a reference to the same string owned by the arena.
+static void intern(StringRef &S, DenseSet<StringRef> &StringTable,
+                   BumpPtrAllocator &Arena) {
+  auto R = StringTable.insert(S);
+  if (R.second) { // New entry added to the table, we need to copy the string.
+    char *Data = Arena.Allocate<char>(S.size());
+    memcpy(Data, S.data(), S.size());
+    *R.first = StringRef(Data, S.size());
+  }
+  S = *R.first;
+}
+
+// Copy the underlying data of the symbol into a strings table.
+void own(Symbol &S, DenseSet<StringRef> &Strings, BumpPtrAllocator &Arena) {
+  auto Intern = [&](StringRef &S) { intern(S, Strings, Arena); };
 
-void SymbolSlab::insert(const Symbol &S) {
-  assert(!Frozen && "Can't insert a symbol after the slab has been frozen!");
-  auto ItInserted = Symbols.try_emplace(S.ID, S);
-  if (!ItInserted.second)
-    return;
-  auto &Sym = ItInserted.first->second;
+  Intern(S.Name);
+  Intern(S.Scope);
+  Intern(S.CanonicalDeclaration.FilePath);
+}
+
+void SymbolSlab::Builder::insert(const Symbol &S) {
+  auto R = SymbolIndex.try_emplace(S.ID, Symbols.size());
+  if (R.second) {
+    Symbols.push_back(S);
+    own(Symbols.back(), Strings, Arena);
+  } else
+    own(Symbols[R.first->second] = S, Strings, Arena);
+}
 
-  // We inserted a new symbol, so copy the underlying data.
-  intern(Sym.Name);
-  intern(Sym.Scope);
-  intern(Sym.CanonicalDeclaration.FilePath);
+SymbolSlab SymbolSlab::Builder::build() && {
+  Symbols = {Symbols.begin(), Symbols.end()}; // Force shrink-to-fit.
+  // Sort symbols so the slab can binary search over them.
+  std::sort(Symbols.begin(), Symbols.end(),
+            [](const Symbol &L, const Symbol &R) { return L.ID < R.ID; });
+  // We may have unused strings, so copy the used stuff over to another arena.
+  BumpPtrAllocator Arena;
+  DenseSet<StringRef> Strings;
+  for (auto &S : Symbols)
+    own(S, Strings, Arena);
+  return SymbolSlab(std::move(Arena), std::move(Symbols));
 }
 
 } // namespace clangd
Index: clangd/index/FileIndex.cpp
===================================================================
--- clangd/index/FileIndex.cpp
+++ clangd/index/FileIndex.cpp
@@ -54,7 +54,7 @@
     for (const auto &FileAndSlab : FileToSlabs) {
       Snap->KeepAlive.push_back(FileAndSlab.second);
       for (const auto &Iter : *FileAndSlab.second)
-        Snap->Pointers.push_back(&Iter.second);
+        Snap->Pointers.push_back(&Iter);
     }
   }
   auto *Pointers = &Snap->Pointers;
_______________________________________________
cfe-commits mailing list
cfe-commits@lists.llvm.org
http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

Reply via email to