[PATCH] D51481: [clangd] Implement proximity path boosting for Dex

Kirill Bobyrev via Phabricator via cfe-commits Mon, 03 Sep 2018 03:26:42 -0700

kbobyrev updated this revision to Diff 163680.
kbobyrev marked 17 inline comments as done.
kbobyrev added a comment.


Address a round of comments, refactor code.


https://reviews.llvm.org/D51481

Files:
  clang-tools-extra/;
  clang-tools-extra/clangd/CMakeLists.txt
  clang-tools-extra/clangd/index/dex/DexIndex.cpp
  clang-tools-extra/clangd/index/dex/DexIndex.h
  clang-tools-extra/clangd/index/dex/Token.cpp
  clang-tools-extra/clangd/index/dex/Token.h
  clang-tools-extra/clangd/tool/ClangdMain.cpp
  clang-tools-extra/unittests/clangd/DexIndexTests.cpp
  clang-tools-extra/unittests/clangd/TestFS.cpp
  clang-tools-extra/unittests/clangd/TestFS.h

Index: clang-tools-extra/unittests/clangd/TestFS.h
===================================================================
--- clang-tools-extra/unittests/clangd/TestFS.h
+++ clang-tools-extra/unittests/clangd/TestFS.h
@@ -59,7 +59,7 @@
 };
 
 // Returns an absolute (fake) test directory for this OS.
-const char *testRoot();
+std::string testRoot();
 
 // Returns a suitable absolute path for this OS.
 std::string testPath(PathRef File);
Index: clang-tools-extra/unittests/clangd/TestFS.cpp
===================================================================
--- clang-tools-extra/unittests/clangd/TestFS.cpp
+++ clang-tools-extra/unittests/clangd/TestFS.cpp
@@ -64,7 +64,7 @@
       FileName, std::move(CommandLine), "")};
 }
 
-const char *testRoot() {
+std::string testRoot() {
 #ifdef _WIN32
   return "C:\\clangd-test";
 #else
Index: clang-tools-extra/unittests/clangd/DexIndexTests.cpp
===================================================================
--- clang-tools-extra/unittests/clangd/DexIndexTests.cpp
+++ clang-tools-extra/unittests/clangd/DexIndexTests.cpp
@@ -7,6 +7,8 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "FuzzyMatch.h"
+#include "TestFS.h"
 #include "TestIndex.h"
 #include "index/Index.h"
 #include "index/Merge.h"
@@ -29,6 +31,8 @@
 namespace dex {
 namespace {
 
+std::vector<std::string> URISchemes = {"unittest"};
+
 std::vector<DocID> consumeIDs(Iterator &It) {
   auto IDAndScore = consume(It);
   std::vector<DocID> IDs(IDAndScore.size());
@@ -325,14 +329,33 @@
 }
 
 testing::Matcher<std::vector<Token>>
-trigramsAre(std::initializer_list<std::string> Trigrams) {
+tokensAre(std::initializer_list<std::string> Strings, Token::Kind Kind) {
   std::vector<Token> Tokens;
-  for (const auto &Symbols : Trigrams) {
-    Tokens.push_back(Token(Token::Kind::Trigram, Symbols));
+  for (const auto &TokenData : Strings) {
+    Tokens.push_back(Token(Kind, TokenData));
   }
   return testing::UnorderedElementsAreArray(Tokens);
 }
 
+testing::Matcher<std::vector<Token>>
+trigramsAre(std::initializer_list<std::string> Trigrams) {
+  return tokensAre(Trigrams, Token::Kind::Trigram);
+}
+
+testing::Matcher<std::vector<Token>>
+pathsAre(std::initializer_list<std::string> Paths) {
+  return tokensAre(Paths, Token::Kind::PathURI);
+}
+
+testing::Matcher<std::vector<Token>>
+proximityPathsAre(std::initializer_list<std::string> ProximityPaths) {
+  std::vector<Token> Result;
+  for (const auto &Path : ProximityPaths) {
+    Result.emplace_back(Token::Kind::PathURI, Path);
+  }
+  return testing::UnorderedElementsAreArray(Result);
+}
+
 TEST(DexIndexTrigrams, IdentifierTrigrams) {
   EXPECT_THAT(generateIdentifierTrigrams("X86"),
               trigramsAre({"x86", "x$$", "x8$"}));
@@ -407,8 +430,27 @@
                            "hij", "ijk", "jkl", "klm"}));
 }
 
+TEST(DexSearchTokens, SymbolPath) {
+  EXPECT_THAT(generateProximityPathURIs(
+                  "unittest:///clang-tools-extra/clangd/index/Token.h"),
+              pathsAre({"unittest:///clang-tools-extra/clangd/index/Token.h",
+                        "unittest:///clang-tools-extra/clangd/index",
+                        "unittest:///clang-tools-extra/clangd",
+                        "unittest:///clang-tools-extra", "unittest:///"}));
+}
+
+TEST(DexSearchTokens, QueryProximityDistances) {
+  EXPECT_THAT(generateQueryProximityPathURIs(testRoot() + "/a/b/c/d/e/f/g.h",
+                                             URISchemes),
+              proximityPathsAre({{"unittest:///a/b/c/d/e/f/g.h"},
+                                 {"unittest:///a/b/c/d/e/f"},
+                                 {"unittest:///a/b/c/d/e"},
+                                 {"unittest:///a/b/c/d"},
+                                 {"unittest:///a/b/c"}}));
+}
+
 TEST(DexIndex, Lookup) {
-  DexIndex I;
+  DexIndex I(URISchemes);
   I.build(generateSymbols({"ns::abc", "ns::xyz"}));
   EXPECT_THAT(lookup(I, SymbolID("ns::abc")), UnorderedElementsAre("ns::abc"));
   EXPECT_THAT(lookup(I, {SymbolID("ns::abc"), SymbolID("ns::xyz")}),
@@ -419,7 +461,7 @@
 }
 
 TEST(DexIndex, FuzzyFind) {
-  DexIndex Index;
+  DexIndex Index(URISchemes);
   Index.build(generateSymbols({"ns::ABC", "ns::BCD", "::ABC", "ns::nested::ABC",
                                "other::ABC", "other::A"}));
   FuzzyFindRequest Req;
@@ -442,7 +484,7 @@
 }
 
 TEST(DexIndexTest, FuzzyMatchQ) {
-  DexIndex I;
+  DexIndex I(URISchemes);
   I.build(
       generateSymbols({"LaughingOutLoud", "LionPopulation", "LittleOldLady"}));
   FuzzyFindRequest Req;
@@ -453,7 +495,7 @@
 }
 
 TEST(DexIndexTest, DexIndexSymbolsRecycled) {
-  DexIndex I;
+  DexIndex I(URISchemes);
   std::weak_ptr<SlabAndPointers> Symbols;
   I.build(generateNumSymbols(0, 10, &Symbols));
   FuzzyFindRequest Req;
@@ -482,14 +524,14 @@
 
   FuzzyFindRequest Req;
   Req.Query = "7";
-  DexIndex I;
+  DexIndex I(URISchemes);
   I.build(std::move(Symbols));
   auto Matches = match(I, Req);
   EXPECT_EQ(Matches.size(), 4u);
 }
 
 TEST(DexIndexTest, DexIndexLimitedNumMatches) {
-  DexIndex I;
+  DexIndex I(URISchemes);
   I.build(generateNumSymbols(0, 100));
   FuzzyFindRequest Req;
   Req.Query = "5";
@@ -501,7 +543,7 @@
 }
 
 TEST(DexIndexTest, FuzzyMatch) {
-  DexIndex I;
+  DexIndex I(URISchemes);
   I.build(
       generateSymbols({"LaughingOutLoud", "LionPopulation", "LittleOldLady"}));
   FuzzyFindRequest Req;
@@ -512,60 +554,60 @@
 }
 
 TEST(DexIndexTest, MatchQualifiedNamesWithoutSpecificScope) {
-  DexIndex I;
+  DexIndex I(URISchemes);
   I.build(generateSymbols({"a::y1", "b::y2", "y3"}));
   FuzzyFindRequest Req;
   Req.Query = "y";
   EXPECT_THAT(match(I, Req), UnorderedElementsAre("a::y1", "b::y2", "y3"));
 }
 
 TEST(DexIndexTest, MatchQualifiedNamesWithGlobalScope) {
-  DexIndex I;
+  DexIndex I(URISchemes);
   I.build(generateSymbols({"a::y1", "b::y2", "y3"}));
   FuzzyFindRequest Req;
   Req.Query = "y";
   Req.Scopes = {""};
   EXPECT_THAT(match(I, Req), UnorderedElementsAre("y3"));
 }
 
 TEST(DexIndexTest, MatchQualifiedNamesWithOneScope) {
-  DexIndex I;
+  DexIndex I(URISchemes);
   I.build(generateSymbols({"a::y1", "a::y2", "a::x", "b::y2", "y3"}));
   FuzzyFindRequest Req;
   Req.Query = "y";
   Req.Scopes = {"a::"};
   EXPECT_THAT(match(I, Req), UnorderedElementsAre("a::y1", "a::y2"));
 }
 
 TEST(DexIndexTest, MatchQualifiedNamesWithMultipleScopes) {
-  DexIndex I;
+  DexIndex I(URISchemes);
   I.build(generateSymbols({"a::y1", "a::y2", "a::x", "b::y3", "y3"}));
   FuzzyFindRequest Req;
   Req.Query = "y";
   Req.Scopes = {"a::", "b::"};
   EXPECT_THAT(match(I, Req), UnorderedElementsAre("a::y1", "a::y2", "b::y3"));
 }
 
 TEST(DexIndexTest, NoMatchNestedScopes) {
-  DexIndex I;
+  DexIndex I(URISchemes);
   I.build(generateSymbols({"a::y1", "a::b::y2"}));
   FuzzyFindRequest Req;
   Req.Query = "y";
   Req.Scopes = {"a::"};
   EXPECT_THAT(match(I, Req), UnorderedElementsAre("a::y1"));
 }
 
 TEST(DexIndexTest, IgnoreCases) {
-  DexIndex I;
+  DexIndex I(URISchemes);
   I.build(generateSymbols({"ns::ABC", "ns::abc"}));
   FuzzyFindRequest Req;
   Req.Query = "AB";
   Req.Scopes = {"ns::"};
   EXPECT_THAT(match(I, Req), UnorderedElementsAre("ns::ABC", "ns::abc"));
 }
 
 TEST(DexIndexTest, Lookup) {
-  DexIndex I;
+  DexIndex I(URISchemes);
   I.build(generateSymbols({"ns::abc", "ns::xyz"}));
   EXPECT_THAT(lookup(I, SymbolID("ns::abc")), UnorderedElementsAre("ns::abc"));
   EXPECT_THAT(lookup(I, {SymbolID("ns::abc"), SymbolID("ns::xyz")}),
@@ -575,6 +617,58 @@
   EXPECT_THAT(lookup(I, SymbolID("ns::nonono")), UnorderedElementsAre());
 }
 
+TEST(DexIndexTest, ProximityPathsBoosting) {
+  DexIndex I(URISchemes);
+
+  // All strings should be stored in local variables to ensure their lifetime is
+  // the same as Slab's.
+  auto RootURI = URI::create(testRoot() + "/file.h", "unittest");
+  assert(static_cast<bool>(RootURI));
+  auto CloseFileURI = URI::create(testRoot() + "/a/b/c/d/e/file.h", "unittest");
+  assert(static_cast<bool>(CloseFileURI));
+
+  // Populate Symbol Slab
+  std::vector<const Symbol *> Slab;
+  // Symbol in the root will perfectly match the request, but its canonical
+  // declaration is in the root directory.
+  Symbol RootSymbol;
+  std::string RootSymbolName = "abc";
+  RootSymbol.Name = RootSymbolName;
+  std::string RootFilename = RootURI->toString();
+  RootSymbol.CanonicalDeclaration.FileURI = RootFilename;
+  Slab.push_back(&RootSymbol);
+
+  // Another symbol has much lower Fuzzy Matching Score, but it is very close to
+  // the directory the request would come from.
+  Symbol CloseSymbol;
+  std::string CloseSymbolName = "AbsolutelyBizzareCoincidenceMatch";
+  CloseSymbol.Name = CloseSymbolName;
+  std::string CloseFileName = CloseFileURI->toString();
+  CloseSymbol.CanonicalDeclaration.FileURI = CloseFileName;
+
+  FuzzyFindRequest Req;
+  Req.Query = "abc";
+  Req.MaxCandidateCount = 1;
+  // FuzzyFind request comes from the file which is far from the root.
+  Req.ProximityPaths.push_back(testRoot() + "/a/b/c/d/e/f/file.h");
+
+  FuzzyMatcher Matcher(Req.Query);
+  EXPECT_TRUE(Matcher.match(RootSymbolName) > Matcher.match(CloseSymbolName));
+
+  // Insert duplicates so that only the boosted symbols end up in the scoring
+  // stage.
+  // FIXME(kbobyrev): 10 is the number of items passed to the last stage which
+  // involves fuzzy scoring. Would having access to that information be better
+  // here?
+  for (size_t Index = 0; Index < 10; ++Index)
+    Slab.push_back(&CloseSymbol);
+
+  I.build(std::make_shared<std::vector<const Symbol *>>(Slab));
+
+  EXPECT_THAT(match(I, Req),
+              UnorderedElementsAre("AbsolutelyBizzareCoincidenceMatch"));
+}
+
 } // namespace
 } // namespace dex
 } // namespace clangd
Index: clang-tools-extra/clangd/tool/ClangdMain.cpp
===================================================================
--- clang-tools-extra/clangd/tool/ClangdMain.cpp
+++ clang-tools-extra/clangd/tool/ClangdMain.cpp
@@ -42,7 +42,9 @@
 // Build an in-memory static index for global symbols from a YAML-format file.
 // The size of global symbols should be relatively small, so that all symbols
 // can be managed in memory.
-std::unique_ptr<SymbolIndex> buildStaticIndex(llvm::StringRef YamlSymbolFile) {
+std::unique_ptr<SymbolIndex>
+buildStaticIndex(llvm::StringRef YamlSymbolFile,
+                 std::vector<std::string> &URISchemes) {
   auto Buffer = llvm::MemoryBuffer::getFile(YamlSymbolFile);
   if (!Buffer) {
     llvm::errs() << "Can't open " << YamlSymbolFile << "\n";
@@ -53,9 +55,10 @@
   for (auto Sym : Slab)
     SymsBuilder.insert(Sym);
 
-  return UseDex ? dex::DexIndex::build(std::move(SymsBuilder).build())
-                : MemIndex::build(std::move(SymsBuilder).build(),
-                                  SymbolOccurrenceSlab::createEmpty());
+  return UseDex
+             ? dex::DexIndex::build(std::move(SymsBuilder).build(), URISchemes)
+             : MemIndex::build(std::move(SymsBuilder).build(),
+                               SymbolOccurrenceSlab::createEmpty());
 }
 
 } // namespace
@@ -299,7 +302,7 @@
   Opts.BuildDynamicSymbolIndex = EnableIndex;
   std::unique_ptr<SymbolIndex> StaticIdx;
   if (EnableIndex && !YamlSymbolFile.empty()) {
-    StaticIdx = buildStaticIndex(YamlSymbolFile);
+    StaticIdx = buildStaticIndex(YamlSymbolFile, Opts.URISchemes);
     Opts.StaticIndex = StaticIdx.get();
   }
   Opts.AsyncThreadsCount = WorkerThreadsCount;
Index: clang-tools-extra/clangd/index/dex/Token.h
===================================================================
--- clang-tools-extra/clangd/index/dex/Token.h
+++ clang-tools-extra/clangd/index/dex/Token.h
@@ -41,22 +41,32 @@
   /// Kind specifies Token type which defines semantics for the internal
   /// representation. Each Kind has different representation stored in Data
   /// field.
+  // FIXME(kbobyrev): Storing Data hash would be more efficient than storing raw
+  // strings. For example, PathURI store URIs of each directory and its parents,
+  // which induces a lot of overhead because these paths tend to be long and
+  // each parent directory is a prefix.
   enum class Kind {
     /// Represents trigram used for fuzzy search of unqualified symbol names.
     ///
     /// Data contains 3 bytes with trigram contents.
     Trigram,
     /// Scope primitives, e.g. "symbol belongs to namespace foo::bar".
     ///
-    /// Data stroes full scope name , e.g. "foo::bar::baz::" or "" (for global
+    /// Data stroes full scope name, e.g. "foo::bar::baz::" or "" (for global
     /// scope).
     Scope,
+    /// Path URI to symbol declaration.
+    ///
+    /// Data stores path URI of file and each parent directory of symbol
+    /// declaration location.
+    ///
+    /// Example: "file:///path/to/clang-tools-extra/clangd/index/SymbolIndex.h"
+    /// and all its parents.
+    PathURI,
     /// Internal Token type for invalid/special tokens, e.g. empty tokens for
     /// llvm::DenseMap.
     Sentinel,
     /// FIXME(kbobyrev): Add other Token Kinds
-    /// * Path with full or relative path to the directory in which symbol is
-    ///   defined
     /// * Type with qualified type name or its USR
   };
 
@@ -81,6 +91,16 @@
   }
 };
 
+/// Returns Search Token for a number of parent directories of given Path.
+/// Should be used within the index build process.
+std::vector<Token> generateProximityPathURIs(llvm::StringRef Path);
+
+/// Returns Search Token and its distance from the absolute path for the first
+/// URI scheme allows valid conversions from AbsolutePath.
+std::vector<Token>
+generateQueryProximityPathURIs(llvm::StringRef AbsolutePath,
+                               llvm::ArrayRef<std::string> URISchemes);
+
 } // namespace dex
 } // namespace clangd
 } // namespace clang
Index: clang-tools-extra/clangd/index/dex/Token.cpp
===================================================================
--- /dev/null
+++ clang-tools-extra/clangd/index/dex/Token.cpp
@@ -0,0 +1,70 @@
+//===--- Token.cpp - Symbol Search primitive --------------------*- C++ -*-===//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "Token.h"
+#include "URI.h"
+#include "llvm/Support/Path.h"
+#include <cassert>
+
+namespace clang {
+namespace clangd {
+namespace dex {
+
+std::vector<Token> generateProximityPathURIs(llvm::StringRef URIPath) {
+  std::vector<Token> Result;
+  // Don't generate any tokens for empty paths.
+  if (URIPath.empty())
+    return Result;
+  auto ParsedURI = URI::parse(URIPath);
+  assert(static_cast<bool>(ParsedURI) &&
+         "Non-empty argument of generateProximityPathURIs() should be a valid "
+         "URI.");
+  const auto Scheme = ParsedURI->scheme();
+  const auto Authority = ParsedURI->authority();
+  StringRef Path = ParsedURI->body();
+  // FIXME(kbobyrev): Currently, this is a heuristic which defines the maximum
+  // size of resulting vector. Some projects might want to have higher limit if
+  // the file hierarchy is deeper. For the generic case, it would be useful to
+  // calculate Limit in the index build stage by calculating the maximum depth
+  // of the project source tree at runtime.
+  size_t Limit = 5;
+  while (!Path.empty() && Limit--) {
+    // FIXME(kbobyrev): Parsing and encoding path to URIs is not necessary and
+    // could be optimized.
+    URI TokenURI(Scheme, Authority, Path);
+    Result.emplace_back(Token::Kind::PathURI, TokenURI.toString());
+    Path = llvm::sys::path::parent_path(Path, llvm::sys::path::Style::posix);
+  }
+  return Result;
+}
+
+std::vector<Token>
+generateQueryProximityPathURIs(llvm::StringRef AbsolutePath,
+                               llvm::ArrayRef<std::string> URISchemes) {
+  std::vector<Token> Result;
+  for (const auto Scheme : URISchemes) {
+    auto URI = URI::create(AbsolutePath, Scheme);
+    // For some paths, conversion to different URI schemes is impossible. These
+    // should be just skipped.
+    if (!static_cast<bool>(URI)) {
+      // Ignore the error.
+      llvm::consumeError(URI.takeError());
+      continue;
+    }
+    for (const auto ProximityPath : generateProximityPathURIs(URI->toString()))
+      Result.emplace_back(ProximityPath);
+    // As soon as one valid URI scheme for given absolute path is found, the
+    // token generation process should be stopped.
+    break;
+  }
+  return Result;
+}
+
+} // namespace dex
+} // namespace clangd
+} // namespace clang
Index: clang-tools-extra/clangd/index/dex/DexIndex.h
===================================================================
--- clang-tools-extra/clangd/index/dex/DexIndex.h
+++ clang-tools-extra/clangd/index/dex/DexIndex.h
@@ -39,12 +39,21 @@
 // index on disk and then load it if available.
 class DexIndex : public SymbolIndex {
 public:
+  /// URISchemes are used to convert proximity paths from FuzzyFindRequest to
+  /// the desired URIs. URISchemes should contain a list of schemes sorted by
+  /// the priority, because each proximity path from FuzzyFindRequest will be
+  /// converted to the first scheme from URISchemes list which is valid for that
+  /// path.
+  explicit DexIndex(llvm::ArrayRef<std::string> URISchemes)
+      : URISchemes(URISchemes) {}
+
   /// \brief (Re-)Build index for `Symbols`. All symbol pointers must remain
   /// accessible as long as `Symbols` is kept alive.
-  void build(std::shared_ptr<std::vector<const Symbol *>> Syms);
+  void build(std::shared_ptr<std::vector<const Symbol *>> Symbols);
 
   /// \brief Build index from a symbol slab.
-  static std::unique_ptr<SymbolIndex> build(SymbolSlab Slab);
+  static std::unique_ptr<SymbolIndex>
+  build(SymbolSlab Slab, llvm::ArrayRef<std::string> URISchemes);
 
   bool
   fuzzyFind(const FuzzyFindRequest &Req,
@@ -60,23 +69,24 @@
   size_t estimateMemoryUsage() const override;
 
 private:
-
   mutable std::mutex Mutex;
 
   std::shared_ptr<std::vector<const Symbol *>> Symbols /*GUARDED_BY(Mutex)*/;
   llvm::DenseMap<SymbolID, const Symbol *> LookupTable /*GUARDED_BY(Mutex)*/;
-  llvm::DenseMap<const Symbol *, float> SymbolQuality /*GUARDED_BY(Mutex)*/;
+  std::vector<float> SymbolQuality /*GUARDED_BY(Mutex)*/;
   // Inverted index is a mapping from the search token to the posting list,
   // which contains all items which can be characterized by such search token.
   // For example, if the search token is scope "std::", the corresponding
   // posting list would contain all indices of symbols defined in namespace std.
   // Inverted index is used to retrieve posting lists which are processed during
   // the fuzzyFind process.
   llvm::DenseMap<Token, PostingList> InvertedIndex /*GUARDED_BY(Mutex)*/;
+
+  const std::vector<std::string> URISchemes;
 };
 
 } // namespace dex
 } // namespace clangd
 } // namespace clang
 
-#endif
+#endif // LLVM_CLANG_TOOLS_EXTRA_CLANGD_INDEX_DEX_DEXINDEX_H
Index: clang-tools-extra/clangd/index/dex/DexIndex.cpp
===================================================================
--- clang-tools-extra/clangd/index/dex/DexIndex.cpp
+++ clang-tools-extra/clangd/index/dex/DexIndex.cpp
@@ -8,6 +8,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "DexIndex.h"
+#include "../../FileDistance.h"
 #include "../../FuzzyMatch.h"
 #include "../../Logger.h"
 #include <algorithm>
@@ -26,29 +27,31 @@
 // Returns the tokens which are given symbols's characteristics. For example,
 // trigrams and scopes.
 // FIXME(kbobyrev): Support more token types:
-// * Path proximity
 // * Types
 std::vector<Token> generateSearchTokens(const Symbol &Sym) {
   std::vector<Token> Result = generateIdentifierTrigrams(Sym.Name);
   Result.emplace_back(Token::Kind::Scope, Sym.Scope);
+  for (const auto &PathToken :
+       generateProximityPathURIs(Sym.CanonicalDeclaration.FileURI))
+    Result.push_back(PathToken);
   return Result;
 }
 
 } // namespace
 
 void DexIndex::build(std::shared_ptr<std::vector<const Symbol *>> Syms) {
   llvm::DenseMap<SymbolID, const Symbol *> TempLookupTable;
-  llvm::DenseMap<const Symbol *, float> TempSymbolQuality;
+  llvm::DenseMap<const Symbol *, float> SymbolQualityLookup;
   for (const Symbol *Sym : *Syms) {
     TempLookupTable[Sym->ID] = Sym;
-    TempSymbolQuality[Sym] = quality(*Sym);
+    SymbolQualityLookup[Sym] = quality(*Sym);
   }
 
   // Symbols are sorted by symbol qualities so that items in the posting lists
   // are stored in the descending order of symbol quality.
   std::sort(begin(*Syms), end(*Syms),
             [&](const Symbol *LHS, const Symbol *RHS) {
-              return TempSymbolQuality[LHS] > TempSymbolQuality[RHS];
+              return SymbolQualityLookup[LHS] > SymbolQualityLookup[RHS];
             });
   llvm::DenseMap<Token, PostingList> TempInvertedIndex;
   // Populate TempInvertedIndex with posting lists for index symbols.
@@ -58,6 +61,10 @@
       TempInvertedIndex[Token].push_back(SymbolRank);
   }
 
+  std::vector<float> TempSymbolQuality(Syms->size());
+  for (size_t I = 0; I < Syms->size(); ++I)
+    TempSymbolQuality[I] = SymbolQualityLookup[(*Syms)[I]];
+
   {
     std::lock_guard<std::mutex> Lock(Mutex);
 
@@ -72,8 +79,9 @@
        estimateMemoryUsage());
 }
 
-std::unique_ptr<SymbolIndex> DexIndex::build(SymbolSlab Slab) {
-  auto Idx = llvm::make_unique<DexIndex>();
+std::unique_ptr<SymbolIndex>
+DexIndex::build(SymbolSlab Slab, llvm::ArrayRef<std::string> URISchemes) {
+  auto Idx = llvm::make_unique<DexIndex>(URISchemes);
   Idx->build(getSymbolsFromSlab(std::move(Slab)));
   return std::move(Idx);
 }
@@ -117,6 +125,33 @@
     if (!ScopeIterators.empty())
       TopLevelChildren.push_back(createOr(move(ScopeIterators)));
 
+    // Add proximity paths boosting.
+    std::vector<std::unique_ptr<Iterator>> BoostingIterators;
+    // Try to build BOOST iterator for each Proximity Path provided by
+    // FuzzyFindRequest. Boosting factor should depend on the distance to the
+    // Proximity Path: the closer processed path is, the higher boosting factor.
+    for (const auto &ProximityPath : Req.ProximityPaths) {
+      const auto ProximityPathURIs =
+          generateQueryProximityPathURIs(ProximityPath, URISchemes);
+      for (size_t Distance = 0; Distance < ProximityPathURIs.size();
+           ++Distance) {
+        const auto It = InvertedIndex.find(ProximityPathURIs[Distance]);
+        if (It != InvertedIndex.end()) {
+          // FIXME(kbobyrev): Append LIMIT on top of every BOOST iterator.
+          // Each parent directory is boosted by PARENTS_TO_BOOST + 1 - Level.
+          float BoostFactor =
+              std::exp(Distance * 0.4f / FileDistanceOptions().UpCost);
+          BoostingIterators.push_back(
+              createBoost(create(It->second), BoostFactor));
+        }
+      }
+    }
+    // Boosting iterators do not actually filter symbols. In order to preserve
+    // the validity of resulting query, TRUE iterator should be added along
+    // BOOSTs.
+    BoostingIterators.push_back(createTrue(Symbols->size()));
+    TopLevelChildren.push_back(createOr(move(BoostingIterators)));
+
     // Use TRUE iterator if both trigrams and scopes from the query are not
     // present in the symbol index.
     auto QueryIterator = TopLevelChildren.empty()
@@ -127,23 +162,35 @@
     // FIXME(kbobyrev): Pre-scoring retrieval threshold should be adjusted as
     // using 100x of the requested number might not be good in practice, e.g.
     // when the requested number of items is small.
-    const unsigned ItemsToRetrieve = 100 * Req.MaxCandidateCount;
+    const size_t ItemsToRetrieve = 100 * Req.MaxCandidateCount;
     auto Root = createLimit(move(QueryIterator), ItemsToRetrieve);
-    // FIXME(kbobyrev): Add boosting to the query and utilize retrieved
-    // boosting scores.
+
     std::vector<std::pair<DocID, float>> SymbolDocIDs = consume(*Root);
 
+    // Sort items using a combination of boosting score and quality.
+    // FIXME(kbobyrev): Consider merging this stage with the next one? This is
+    // a performance/quality trade-off and if the performance is not an issue
+    // there's plenty of quality to be bought by sacrificing part of
+    // performance.
+    std::sort(begin(SymbolDocIDs), end(SymbolDocIDs),
+              [&](const std::pair<DocID, float> &LHS,
+                  const std::pair<DocID, float> &RHS) {
+                return SymbolQuality[LHS.first] * LHS.second >
+                       SymbolQuality[RHS.first] * RHS.second;
+              });
+
     // Retrieve top Req.MaxCandidateCount items.
+    const size_t ItemsToScore = 10 * Req.MaxCandidateCount;
     std::priority_queue<std::pair<float, const Symbol *>> Top;
-    for (const auto &P : SymbolDocIDs) {
-      const DocID SymbolDocID = P.first;
+    for (size_t I = 0; I < std::min(SymbolDocIDs.size(), ItemsToScore); ++I) {
+      const DocID SymbolDocID = SymbolDocIDs[I].first;
       const auto *Sym = (*Symbols)[SymbolDocID];
       const llvm::Optional<float> Score = Filter.match(Sym->Name);
       if (!Score)
         continue;
       // Multiply score by a negative factor so that Top stores items with the
       // highest actual score.
-      Top.emplace(-(*Score) * SymbolQuality.find(Sym)->second, Sym);
+      Top.emplace(-(*Score) * SymbolQuality[SymbolDocID], Sym);
       if (Top.size() > Req.MaxCandidateCount) {
         More = true;
         Top.pop();
Index: clang-tools-extra/clangd/CMakeLists.txt
===================================================================
--- clang-tools-extra/clangd/CMakeLists.txt
+++ clang-tools-extra/clangd/CMakeLists.txt
@@ -46,6 +46,7 @@
 
   index/dex/DexIndex.cpp
   index/dex/Iterator.cpp
+  index/dex/Token.cpp
   index/dex/Trigram.cpp
 
   LINK_LIBS
Index: clang-tools-extra/;
===================================================================
--- /dev/null
+++ clang-tools-extra/;
@@ -0,0 +1,68 @@
+//===--- Token.cpp - Symbol Search primitive --------------------*- C++ -*-===//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "Token.h"
+#include "URI.h"
+#include "llvm/Support/Path.h"
+#include <cassert>
+
+namespace clang {
+namespace clangd {
+namespace dex {
+
+std::vector<Token> generateProximityPaths(llvm::StringRef URIPath,
+                                          size_t Limit) {
+  std::vector<Token> Result;
+  // Don't generate any tokens for empty paths.
+  if (URIPath.empty())
+    return Result;
+  auto ParsedURI = URI::parse(URIPath);
+  assert(
+      static_cast<bool>(ParsedURI) &&
+      "Non-empty argument of generateProximityPaths() should be a valid URI.");
+  const auto Scheme = ParsedURI.get().scheme();
+  const auto Authority = ParsedURI.get().authority();
+  StringRef Path = ParsedURI.get().body();
+  // Get parent directory of the file with symbol declaration.
+  Path = llvm::sys::path::parent_path(Path);
+  while (!Path.empty() && Limit--) {
+    URI TokenURI(Scheme, Authority, Path);
+    Result.emplace_back(Token::Kind::PathURI, TokenURI.toString());
+    Path = llvm::sys::path::parent_path(Path);
+  }
+  return Result;
+}
+
+std::vector<std::pair<Token, unsigned>>
+generateQueryProximityPaths(llvm::StringRef AbsolutePath,
+                            llvm::ArrayRef<std::string> URISchemes,
+                            size_t Count) {
+  std::vector<std::pair<Token, unsigned>> Result;
+  for (const auto Scheme : URISchemes) {
+    unsigned Distance = 0;
+    auto URI = URI::create(AbsolutePath, Scheme);
+    // For some paths, conversion to different URI schemes is impossible. These
+    // should be just skipped.
+    if (!static_cast<bool>(URI)) {
+      // Ignore the error.
+      handleAllErrors(URI.takeError(), [](const llvm::ErrorInfoBase &) {});
+      continue;
+    }
+    for (const auto ProximityPath :
+         generateProximityPaths(URI.get().toString(), Count))
+      Result.emplace_back(ProximityPath, Distance++);
+    // As soon as one valid URI scheme for given absolute path is found, the
+    // token generation process should be stopped.
+    break;
+  }
+  return Result;
+}
+
+} // namespace dex
+} // namespace clangd
+} // namespace clang

_______________________________________________
cfe-commits mailing list
cfe-commits@lists.llvm.org
http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

[PATCH] D51481: [clangd] Implement proximity path boosting for Dex

Reply via email to