[PATCH] D51481: [clangd] Implement proximity path boosting for Dex

Kirill Bobyrev via Phabricator via cfe-commits Thu, 30 Aug 2018 03:41:45 -0700

kbobyrev updated this revision to Diff 163294.
kbobyrev added a comment.

Stop query generation as soon as one valid URI scheme was found.



https://reviews.llvm.org/D51481

Files:
  clang-tools-extra/clangd/CMakeLists.txt
  clang-tools-extra/clangd/index/dex/DexIndex.cpp
  clang-tools-extra/clangd/index/dex/DexIndex.h
  clang-tools-extra/clangd/index/dex/Token.cpp
  clang-tools-extra/clangd/index/dex/Token.h
  clang-tools-extra/clangd/tool/ClangdMain.cpp
  clang-tools-extra/unittests/clangd/DexIndexTests.cpp

Index: clang-tools-extra/unittests/clangd/DexIndexTests.cpp
===================================================================
--- clang-tools-extra/unittests/clangd/DexIndexTests.cpp
+++ clang-tools-extra/unittests/clangd/DexIndexTests.cpp
@@ -7,6 +7,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "FuzzyMatch.h"
 #include "TestIndex.h"
 #include "index/Index.h"
 #include "index/Merge.h"
@@ -29,6 +30,12 @@
 namespace dex {
 namespace {
 
+std::vector<std::string> URISchemes = {"file"};
+
+//===----------------------------------------------------------------------===//
+// Query Iterators tests.
+//===----------------------------------------------------------------------===//
+
 std::vector<DocID> consumeIDs(Iterator &It) {
   auto IDAndScore = consume(It);
   std::vector<DocID> IDs(IDAndScore.size());
@@ -324,16 +331,39 @@
   EXPECT_THAT(ElementBoost, 3);
 }
 
+//===----------------------------------------------------------------------===//
+// Search Token tests.
+//===----------------------------------------------------------------------===//
+
 testing::Matcher<std::vector<Token>>
-trigramsAre(std::initializer_list<std::string> Trigrams) {
+tokensAre(std::initializer_list<std::string> Strings, Token::Kind Kind) {
   std::vector<Token> Tokens;
-  for (const auto &Symbols : Trigrams) {
-    Tokens.push_back(Token(Token::Kind::Trigram, Symbols));
+  for (const auto &TokenData : Strings) {
+    Tokens.push_back(Token(Kind, TokenData));
   }
   return testing::UnorderedElementsAreArray(Tokens);
 }
 
-TEST(DexIndexTrigrams, IdentifierTrigrams) {
+testing::Matcher<std::vector<Token>>
+trigramsAre(std::initializer_list<std::string> Trigrams) {
+  return tokensAre(Trigrams, Token::Kind::Trigram);
+}
+
+testing::Matcher<std::vector<Token>>
+pathsAre(std::initializer_list<std::string> Paths) {
+  return tokensAre(Paths, Token::Kind::Path);
+}
+
+testing::Matcher<std::vector<std::pair<Token, float>>> proximityPathsAre(
+    std::initializer_list<std::pair<std::string, float>> ProximityPaths) {
+  std::vector<std::pair<Token, float>> Result;
+  for (const auto &P : ProximityPaths) {
+    Result.push_back({Token(Token::Kind::Path, P.first), P.second});
+  }
+  return testing::UnorderedElementsAreArray(Result);
+}
+
+TEST(DexSearchTokens, IdentifierTrigrams) {
   EXPECT_THAT(generateIdentifierTrigrams("X86"),
               trigramsAre({"x86", "x$$", "x8$"}));
 
@@ -374,7 +404,7 @@
                    "hkl", "ijk", "ikl", "jkl", "klm", "ab$", "ad$"}));
 }
 
-TEST(DexIndexTrigrams, QueryTrigrams) {
+TEST(DexSearchTokens, QueryTrigrams) {
   EXPECT_THAT(generateQueryTrigrams("c"), trigramsAre({"c$$"}));
   EXPECT_THAT(generateQueryTrigrams("cl"), trigramsAre({"cl$"}));
   EXPECT_THAT(generateQueryTrigrams("cla"), trigramsAre({"cla"}));
@@ -407,9 +437,31 @@
                            "hij", "ijk", "jkl", "klm"}));
 }
 
+TEST(DexSearchTokens, SymbolPath) {
+  EXPECT_THAT(generateProximityPaths(
+                  "file:///clang-tools-extra/clangd/index/dex/Token.h"),
+              pathsAre({"file:///clang-tools-extra/clangd/index/dex/",
+                        "file:///clang-tools-extra/clangd/index/",
+                        "file:///clang-tools-extra/clangd/",
+                        "file:///clang-tools-extra/", "file:///"}));
+}
+
+TEST(DexSearchTokens, QueryProximityPaths) {
+  EXPECT_THAT(generateQueryProximityPaths("/a/b/c/d/e/f/g.h", URISchemes),
+              proximityPathsAre({{"file:///a/b/c/d/e/f/", 6.},
+                                 {"file:///a/b/c/d/e/", 5.},
+                                 {"file:///a/b/c/d/", 4.},
+                                 {"file:///a/b/c/", 3.},
+                                 {"file:///a/b/", 2.}}));
+}
+
+//===----------------------------------------------------------------------===//
+// Index tests.
+//===----------------------------------------------------------------------===//
+
 TEST(DexIndex, Lookup) {
   DexIndex I;
-  I.build(generateSymbols({"ns::abc", "ns::xyz"}));
+  I.build(generateSymbols({"ns::abc", "ns::xyz"}), URISchemes);
   EXPECT_THAT(lookup(I, SymbolID("ns::abc")), UnorderedElementsAre("ns::abc"));
   EXPECT_THAT(lookup(I, {SymbolID("ns::abc"), SymbolID("ns::xyz")}),
               UnorderedElementsAre("ns::abc", "ns::xyz"));
@@ -421,7 +473,8 @@
 TEST(DexIndex, FuzzyFind) {
   DexIndex Index;
   Index.build(generateSymbols({"ns::ABC", "ns::BCD", "::ABC", "ns::nested::ABC",
-                               "other::ABC", "other::A"}));
+                               "other::ABC", "other::A"}),
+              URISchemes);
   FuzzyFindRequest Req;
   Req.Query = "ABC";
   Req.Scopes = {"ns::"};
@@ -444,7 +497,8 @@
 TEST(DexIndexTest, FuzzyMatchQ) {
   DexIndex I;
   I.build(
-      generateSymbols({"LaughingOutLoud", "LionPopulation", "LittleOldLady"}));
+      generateSymbols({"LaughingOutLoud", "LionPopulation", "LittleOldLady"}),
+      URISchemes);
   FuzzyFindRequest Req;
   Req.Query = "lol";
   Req.MaxCandidateCount = 2;
@@ -455,14 +509,14 @@
 TEST(DexIndexTest, DexIndexSymbolsRecycled) {
   DexIndex I;
   std::weak_ptr<SlabAndPointers> Symbols;
-  I.build(generateNumSymbols(0, 10, &Symbols));
+  I.build(generateNumSymbols(0, 10, &Symbols), URISchemes);
   FuzzyFindRequest Req;
   Req.Query = "7";
   EXPECT_THAT(match(I, Req), UnorderedElementsAre("7"));
 
   EXPECT_FALSE(Symbols.expired());
   // Release old symbols.
-  I.build(generateNumSymbols(0, 0));
+  I.build(generateNumSymbols(0, 0), URISchemes);
   EXPECT_TRUE(Symbols.expired());
 }
 
@@ -483,14 +537,14 @@
   FuzzyFindRequest Req;
   Req.Query = "7";
   DexIndex I;
-  I.build(std::move(Symbols));
+  I.build(std::move(Symbols), URISchemes);
   auto Matches = match(I, Req);
   EXPECT_EQ(Matches.size(), 4u);
 }
 
 TEST(DexIndexTest, DexIndexLimitedNumMatches) {
   DexIndex I;
-  I.build(generateNumSymbols(0, 100));
+  I.build(generateNumSymbols(0, 100), URISchemes);
   FuzzyFindRequest Req;
   Req.Query = "5";
   Req.MaxCandidateCount = 3;
@@ -503,7 +557,8 @@
 TEST(DexIndexTest, FuzzyMatch) {
   DexIndex I;
   I.build(
-      generateSymbols({"LaughingOutLoud", "LionPopulation", "LittleOldLady"}));
+      generateSymbols({"LaughingOutLoud", "LionPopulation", "LittleOldLady"}),
+      URISchemes);
   FuzzyFindRequest Req;
   Req.Query = "lol";
   Req.MaxCandidateCount = 2;
@@ -513,68 +568,116 @@
 
 TEST(DexIndexTest, MatchQualifiedNamesWithoutSpecificScope) {
   DexIndex I;
-  I.build(generateSymbols({"a::y1", "b::y2", "y3"}));
+  I.build(generateSymbols({"a::y1", "b::y2", "y3"}), URISchemes);
   FuzzyFindRequest Req;
   Req.Query = "y";
   EXPECT_THAT(match(I, Req), UnorderedElementsAre("a::y1", "b::y2", "y3"));
 }
 
 TEST(DexIndexTest, MatchQualifiedNamesWithGlobalScope) {
   DexIndex I;
-  I.build(generateSymbols({"a::y1", "b::y2", "y3"}));
+  I.build(generateSymbols({"a::y1", "b::y2", "y3"}), URISchemes);
   FuzzyFindRequest Req;
   Req.Query = "y";
   Req.Scopes = {""};
   EXPECT_THAT(match(I, Req), UnorderedElementsAre("y3"));
 }
 
 TEST(DexIndexTest, MatchQualifiedNamesWithOneScope) {
   DexIndex I;
-  I.build(generateSymbols({"a::y1", "a::y2", "a::x", "b::y2", "y3"}));
+  I.build(generateSymbols({"a::y1", "a::y2", "a::x", "b::y2", "y3"}),
+          URISchemes);
   FuzzyFindRequest Req;
   Req.Query = "y";
   Req.Scopes = {"a::"};
   EXPECT_THAT(match(I, Req), UnorderedElementsAre("a::y1", "a::y2"));
 }
 
 TEST(DexIndexTest, MatchQualifiedNamesWithMultipleScopes) {
   DexIndex I;
-  I.build(generateSymbols({"a::y1", "a::y2", "a::x", "b::y3", "y3"}));
+  I.build(generateSymbols({"a::y1", "a::y2", "a::x", "b::y3", "y3"}),
+          URISchemes);
   FuzzyFindRequest Req;
   Req.Query = "y";
   Req.Scopes = {"a::", "b::"};
   EXPECT_THAT(match(I, Req), UnorderedElementsAre("a::y1", "a::y2", "b::y3"));
 }
 
 TEST(DexIndexTest, NoMatchNestedScopes) {
   DexIndex I;
-  I.build(generateSymbols({"a::y1", "a::b::y2"}));
+  I.build(generateSymbols({"a::y1", "a::b::y2"}), URISchemes);
   FuzzyFindRequest Req;
   Req.Query = "y";
   Req.Scopes = {"a::"};
   EXPECT_THAT(match(I, Req), UnorderedElementsAre("a::y1"));
 }
 
 TEST(DexIndexTest, IgnoreCases) {
   DexIndex I;
-  I.build(generateSymbols({"ns::ABC", "ns::abc"}));
+  I.build(generateSymbols({"ns::ABC", "ns::abc"}), URISchemes);
   FuzzyFindRequest Req;
   Req.Query = "AB";
   Req.Scopes = {"ns::"};
   EXPECT_THAT(match(I, Req), UnorderedElementsAre("ns::ABC", "ns::abc"));
 }
 
 TEST(DexIndexTest, Lookup) {
   DexIndex I;
-  I.build(generateSymbols({"ns::abc", "ns::xyz"}));
+  I.build(generateSymbols({"ns::abc", "ns::xyz"}), URISchemes);
   EXPECT_THAT(lookup(I, SymbolID("ns::abc")), UnorderedElementsAre("ns::abc"));
   EXPECT_THAT(lookup(I, {SymbolID("ns::abc"), SymbolID("ns::xyz")}),
               UnorderedElementsAre("ns::abc", "ns::xyz"));
   EXPECT_THAT(lookup(I, {SymbolID("ns::nonono"), SymbolID("ns::xyz")}),
               UnorderedElementsAre("ns::xyz"));
   EXPECT_THAT(lookup(I, SymbolID("ns::nonono")), UnorderedElementsAre());
 }
 
+TEST(DexIndexTest, ProximityPathsBoosting) {
+  DexIndex I;
+
+  // All strings should be stored in local variables to ensure their lifetime is
+  // the same as Slab's.
+  std::string RootFileURI = "file::///file.h";
+  std::string CloseFileURI = "file:///a/b/c/d/e/file.h";
+
+  // Populate Symbol Slab
+  std::vector<const Symbol *> Slab;
+  // Symbol in the root will perfectly match the request, but its canonical
+  // declaration is in the root directory.
+  Symbol RootSymbol;
+  std::string RootSymbolName = "abc";
+  RootSymbol.Name = RootSymbolName;
+  RootSymbol.CanonicalDeclaration.FileURI = RootFileURI;
+  Slab.push_back(&RootSymbol);
+
+  // Another symbol has much lower Fuzzy Matching Score, but it is very close to
+  // the directory the request would come from.
+  Symbol CloseSymbol;
+  std::string CloseSymbolName = "AbsolutelyBizzareCoincidenceMatch";
+  CloseSymbol.Name = CloseSymbolName;
+  CloseSymbol.CanonicalDeclaration.FileURI = CloseFileURI;
+
+  // Insert duplicates so that only the boosted symbols end up in the scoring
+  // stage.
+  for (size_t Index = 0; Index < I.getScoringItemsMultiplier(); ++Index)
+    Slab.push_back(&CloseSymbol);
+
+  I.build(std::make_shared<std::vector<const Symbol *>>(Slab), URISchemes);
+
+
+  FuzzyFindRequest Req;
+  Req.Query = "abc";
+  Req.MaxCandidateCount = 1;
+  // FuzzyFind request comes from the file which is far from the root.
+  Req.ProximityPaths.push_back("/a/b/c/d/e/f/file.h");
+
+  FuzzyMatcher Matcher(Req.Query);
+  EXPECT_TRUE(Matcher.match(RootSymbolName) > Matcher.match(CloseSymbolName));
+
+  EXPECT_THAT(match(I, Req),
+              UnorderedElementsAre("AbsolutelyBizzareCoincidenceMatch"));
+}
+
 } // namespace
 } // namespace dex
 } // namespace clangd
Index: clang-tools-extra/clangd/tool/ClangdMain.cpp
===================================================================
--- clang-tools-extra/clangd/tool/ClangdMain.cpp
+++ clang-tools-extra/clangd/tool/ClangdMain.cpp
@@ -42,7 +42,9 @@
 // Build an in-memory static index for global symbols from a YAML-format file.
 // The size of global symbols should be relatively small, so that all symbols
 // can be managed in memory.
-std::unique_ptr<SymbolIndex> buildStaticIndex(llvm::StringRef YamlSymbolFile) {
+std::unique_ptr<SymbolIndex>
+buildStaticIndex(llvm::StringRef YamlSymbolFile,
+                 std::vector<std::string> &URISchemes) {
   auto Buffer = llvm::MemoryBuffer::getFile(YamlSymbolFile);
   if (!Buffer) {
     llvm::errs() << "Can't open " << YamlSymbolFile << "\n";
@@ -53,8 +55,9 @@
   for (auto Sym : Slab)
     SymsBuilder.insert(Sym);
 
-  return UseDex ? dex::DexIndex::build(std::move(SymsBuilder).build())
-                : MemIndex::build(std::move(SymsBuilder).build());
+  return UseDex
+             ? dex::DexIndex::build(std::move(SymsBuilder).build(), URISchemes)
+             : MemIndex::build(std::move(SymsBuilder).build());
 }
 
 } // namespace
@@ -298,7 +301,7 @@
   Opts.BuildDynamicSymbolIndex = EnableIndex;
   std::unique_ptr<SymbolIndex> StaticIdx;
   if (EnableIndex && !YamlSymbolFile.empty()) {
-    StaticIdx = buildStaticIndex(YamlSymbolFile);
+    StaticIdx = buildStaticIndex(YamlSymbolFile, Opts.URISchemes);
     Opts.StaticIndex = StaticIdx.get();
   }
   Opts.AsyncThreadsCount = WorkerThreadsCount;
Index: clang-tools-extra/clangd/index/dex/Token.h
===================================================================
--- clang-tools-extra/clangd/index/dex/Token.h
+++ clang-tools-extra/clangd/index/dex/Token.h
@@ -48,15 +48,18 @@
     Trigram,
     /// Scope primitives, e.g. "symbol belongs to namespace foo::bar".
     ///
-    /// Data stroes full scope name , e.g. "foo::bar::baz::" or "" (for global
+    /// Data stroes full scope name, e.g. "foo::bar::baz::" or "" (for global
     /// scope).
     Scope,
+    /// Path to symbol declaration.
+    ///
+    /// Data stores URI path to the directory with the trailing '/', e.g.
+    /// "file:///path/to/clang-tools-extra/clangd/index/dex/".
+    Path,
     /// Internal Token type for invalid/special tokens, e.g. empty tokens for
     /// llvm::DenseMap.
     Sentinel,
     /// FIXME(kbobyrev): Add other Token Kinds
-    /// * Path with full or relative path to the directory in which symbol is
-    ///   defined
     /// * Type with qualified type name or its USR
   };
 
@@ -81,6 +84,23 @@
   }
 };
 
+/// Returns Search Token for each parent directory of given Path. Should be used
+/// within the index build process.
+///
+/// When Limit is set, returns no more than Limit tokens.
+std::vector<Token>
+generateProximityPaths(llvm::StringRef Path,
+                       size_t Limit = std::numeric_limits<size_t>::max());
+
+/// Returns Search Token and its boost for each path created via converting
+/// given absolute path by using provided URI scheme.
+///
+/// Boost starts with Count and decreases by 1 for each parent directory token.
+std::vector<std::pair<Token, float>>
+generateQueryProximityPaths(llvm::StringRef AbsolutePath,
+                            llvm::ArrayRef<std::string> URISchemes,
+                            size_t Count = 5);
+
 } // namespace dex
 } // namespace clangd
 } // namespace clang
@@ -109,4 +129,4 @@
 
 } // namespace llvm
 
-#endif // LLVM_CLANG_TOOLS_EXTRA_CLANGD_DEX_TOKEN_H
+#endif
Index: clang-tools-extra/clangd/index/dex/Token.cpp
===================================================================
--- /dev/null
+++ clang-tools-extra/clangd/index/dex/Token.cpp
@@ -0,0 +1,58 @@
+//===--- Token.cpp - Symbol Search primitive --------------------*- C++ -*-===//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "Token.h"
+#include "URI.h"
+#include <cassert>
+
+namespace clang {
+namespace clangd {
+namespace dex {
+
+std::vector<Token> generateProximityPaths(llvm::StringRef URIPath,
+                                          size_t Limit) {
+  std::vector<Token> Result;
+  const static char Separator = '/';
+  for (size_t SeparatorPosition = URIPath.find_last_of(Separator);
+       SeparatorPosition != StringRef::npos && !URIPath.endswith("://") &&
+       Limit > 0;
+       SeparatorPosition = URIPath.find_last_of(Separator), --Limit) {
+    // Drop everything after last '/'.
+    URIPath = URIPath.drop_back(URIPath.size() - SeparatorPosition - 1);
+    Result.push_back(Token(Token::Kind::Path, URIPath));
+    // Drop trailing '/' so that the next one could be found.
+    URIPath = URIPath.drop_back();
+  }
+  return Result;
+}
+
+std::vector<std::pair<Token, float>>
+generateQueryProximityPaths(llvm::StringRef AbsolutePath,
+                            llvm::ArrayRef<std::string> URISchemes,
+                            size_t Count) {
+  std::vector<std::pair<Token, float>> Result;
+  for (const auto Scheme : URISchemes) {
+    float Boost = Count + 1;
+    auto URI = URI::create(AbsolutePath, Scheme);
+    // For some paths, conversion to different URI schemes is impossible. These
+    // should be just skipped.
+    if (!URI)
+      continue;
+    for (const auto ProximityPath :
+         generateProximityPaths(URI.get().toString(), Count))
+      Result.push_back({ProximityPath, Boost--});
+    // As soon as one valid URI scheme for given absolute path is found, the
+    // token generation process should be stopped.
+    break;
+  }
+  return Result;
+}
+
+} // namespace dex
+} // namespace clangd
+} // namespace clang
Index: clang-tools-extra/clangd/index/dex/DexIndex.h
===================================================================
--- clang-tools-extra/clangd/index/dex/DexIndex.h
+++ clang-tools-extra/clangd/index/dex/DexIndex.h
@@ -41,10 +41,12 @@
 public:
   /// \brief (Re-)Build index for `Symbols`. All symbol pointers must remain
   /// accessible as long as `Symbols` is kept alive.
-  void build(std::shared_ptr<std::vector<const Symbol *>> Symbols);
+  void build(std::shared_ptr<std::vector<const Symbol *>> Symbols,
+             llvm::ArrayRef<std::string> URISchemes);
 
   /// \brief Build index from a symbol slab.
-  static std::unique_ptr<SymbolIndex> build(SymbolSlab Slab);
+  static std::unique_ptr<SymbolIndex>
+  build(SymbolSlab Slab, llvm::ArrayRef<std::string> URISchemes);
 
   bool
   fuzzyFind(const FuzzyFindRequest &Req,
@@ -59,8 +61,15 @@
 
   size_t estimateMemoryUsage() const override;
 
-private:
+  /// For fuzzyFind() Dex retrieves getRetrievalItemsMultiplier() more items
+  /// than requested via Req.MaxCandidateCount in the first stage of filtering.
+  size_t getRetrievalItemsMultiplier() const;
+
+  /// For fuzzyFind() Dex scores getScoringItemsMultiplier() more items
+  /// than requested via Req.MaxCandidateCount in the second stage of filtering.
+  size_t getScoringItemsMultiplier() const;
 
+private:
   mutable std::mutex Mutex;
 
   std::shared_ptr<std::vector<const Symbol *>> Symbols /*GUARDED_BY(Mutex)*/;
@@ -73,6 +82,10 @@
   // Inverted index is used to retrieve posting lists which are processed during
   // the fuzzyFind process.
   llvm::DenseMap<Token, PostingList> InvertedIndex /*GUARDED_BY(Mutex)*/;
+  std::vector<std::string> URISchemes /*GUARDED BY(Mutex)*/;
+
+  const static size_t RETRIEVAL_ITEMS_MULTIPLIER = 100;
+  const static size_t SCORING_ITEMS_MULTIPLIER = 10;
 };
 
 } // namespace dex
Index: clang-tools-extra/clangd/index/dex/DexIndex.cpp
===================================================================
--- clang-tools-extra/clangd/index/dex/DexIndex.cpp
+++ clang-tools-extra/clangd/index/dex/DexIndex.cpp
@@ -26,17 +26,21 @@
 // Returns the tokens which are given symbols's characteristics. For example,
 // trigrams and scopes.
 // FIXME(kbobyrev): Support more token types:
-// * Path proximity
 // * Types
 std::vector<Token> generateSearchTokens(const Symbol &Sym) {
   std::vector<Token> Result = generateIdentifierTrigrams(Sym.Name);
   Result.push_back(Token(Token::Kind::Scope, Sym.Scope));
+  for (const auto &PathToken :
+       generateProximityPaths(Sym.CanonicalDeclaration.FileURI)) {
+    Result.push_back(PathToken);
+  }
   return Result;
 }
 
 } // namespace
 
-void DexIndex::build(std::shared_ptr<std::vector<const Symbol *>> Syms) {
+void DexIndex::build(std::shared_ptr<std::vector<const Symbol *>> Syms,
+                     llvm::ArrayRef<std::string> ServerURISchemes) {
   llvm::DenseMap<SymbolID, const Symbol *> TempLookupTable;
   llvm::DenseMap<const Symbol *, float> TempSymbolQuality;
   for (const Symbol *Sym : *Syms) {
@@ -66,15 +70,17 @@
     Symbols = std::move(Syms);
     InvertedIndex = std::move(TempInvertedIndex);
     SymbolQuality = std::move(TempSymbolQuality);
+    URISchemes = ServerURISchemes;
   }
 
   vlog("Built DexIndex with estimated memory usage {0} bytes.",
        estimateMemoryUsage());
 }
 
-std::unique_ptr<SymbolIndex> DexIndex::build(SymbolSlab Slab) {
+std::unique_ptr<SymbolIndex>
+DexIndex::build(SymbolSlab Slab, llvm::ArrayRef<std::string> URISchemes) {
   auto Idx = llvm::make_unique<DexIndex>();
-  Idx->build(getSymbolsFromSlab(std::move(Slab)));
+  Idx->build(getSymbolsFromSlab(std::move(Slab)), URISchemes);
   return std::move(Idx);
 }
 
@@ -117,6 +123,26 @@
     if (!ScopeIterators.empty())
       TopLevelChildren.push_back(createOr(move(ScopeIterators)));
 
+    // Add proximity paths boosting.
+    std::vector<std::unique_ptr<Iterator>> BoostingIterators;
+    // This should generate proximity path boosting iterators for each different
+    // URI Scheme the Index is aware of.
+    for (const auto &ProximityPath : Req.ProximityPaths) {
+      for (const auto &P :
+           generateQueryProximityPaths(ProximityPath, URISchemes)) {
+        const auto It = InvertedIndex.find(P.first);
+        if (It != InvertedIndex.end())
+          // FIXME(kbobyrev): Append LIMIT on top of every BOOST iterator.
+          BoostingIterators.push_back(
+              createBoost(create(It->second), P.second));
+      }
+    }
+    // Boosting iterators do not actually filter symbols. In order to preserve
+    // the validity of resulting query, TRUE iterator should be added along
+    // BOOSTs.
+    BoostingIterators.push_back(createTrue(Symbols->size()));
+    TopLevelChildren.push_back(createOr(move(BoostingIterators)));
+
     // Use TRUE iterator if both trigrams and scopes from the query are not
     // present in the symbol index.
     auto QueryIterator = TopLevelChildren.empty()
@@ -127,16 +153,24 @@
     // FIXME(kbobyrev): Pre-scoring retrieval threshold should be adjusted as
     // using 100x of the requested number might not be good in practice, e.g.
     // when the requested number of items is small.
-    const unsigned ItemsToRetrieve = 100 * Req.MaxCandidateCount;
+    const size_t ItemsToRetrieve =
+        getRetrievalItemsMultiplier() * Req.MaxCandidateCount;
     auto Root = createLimit(move(QueryIterator), ItemsToRetrieve);
-    // FIXME(kbobyrev): Add boosting to the query and utilize retrieved
-    // boosting scores.
     std::vector<std::pair<DocID, float>> SymbolDocIDs = consume(*Root);
 
+    // Sort items using boosting score as the key.
+    std::sort(begin(SymbolDocIDs), end(SymbolDocIDs),
+              [](const std::pair<DocID, float> &LHS,
+                 const std::pair<DocID, float> &RHS) {
+                return LHS.second > RHS.second;
+              });
+
     // Retrieve top Req.MaxCandidateCount items.
+    const size_t ItemsToScore =
+        getScoringItemsMultiplier() * Req.MaxCandidateCount;
     std::priority_queue<std::pair<float, const Symbol *>> Top;
-    for (const auto &P : SymbolDocIDs) {
-      const DocID SymbolDocID = P.first;
+    for (size_t I = 0; I < std::min(SymbolDocIDs.size(), ItemsToScore); ++I) {
+      const DocID SymbolDocID = SymbolDocIDs[I].first;
       const auto *Sym = (*Symbols)[SymbolDocID];
       const llvm::Optional<float> Score = Filter.match(Sym->Name);
       if (!Score)
@@ -188,6 +222,14 @@
   return Bytes;
 }
 
+size_t DexIndex::getRetrievalItemsMultiplier() const {
+  return RETRIEVAL_ITEMS_MULTIPLIER;
+}
+
+size_t DexIndex::getScoringItemsMultiplier() const {
+  return SCORING_ITEMS_MULTIPLIER;
+}
+
 } // namespace dex
 } // namespace clangd
 } // namespace clang
Index: clang-tools-extra/clangd/CMakeLists.txt
===================================================================
--- clang-tools-extra/clangd/CMakeLists.txt
+++ clang-tools-extra/clangd/CMakeLists.txt
@@ -46,6 +46,7 @@
 
   index/dex/DexIndex.cpp
   index/dex/Iterator.cpp
+  index/dex/Token.cpp
   index/dex/Trigram.cpp
 
   LINK_LIBS

_______________________________________________
cfe-commits mailing list
cfe-commits@lists.llvm.org
http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

[PATCH] D51481: [clangd] Implement proximity path boosting for Dex

Reply via email to