[PATCH] D51481: [clangd] Implement proximity path boosting for Dex

Kirill Bobyrev via Phabricator via cfe-commits Tue, 04 Sep 2018 03:05:49 -0700

kbobyrev updated this revision to Diff 163773.
kbobyrev marked 12 inline comments as done.
kbobyrev added a comment.


- Rebase on top of the parent patch
- Apply many refactorings where appropriate
- Write more comments and documentation
- Abstract pieces of code which are shared by multiple pieces of Clangd


https://reviews.llvm.org/D51481

Files:
  clang-tools-extra/clangd/CMakeLists.txt
  clang-tools-extra/clangd/Quality.cpp
  clang-tools-extra/clangd/Quality.h
  clang-tools-extra/clangd/index/dex/DexIndex.cpp
  clang-tools-extra/clangd/index/dex/DexIndex.h
  clang-tools-extra/clangd/index/dex/Iterator.cpp
  clang-tools-extra/clangd/index/dex/Iterator.h
  clang-tools-extra/clangd/index/dex/Token.cpp
  clang-tools-extra/clangd/index/dex/Token.h
  clang-tools-extra/clangd/tool/ClangdMain.cpp
  clang-tools-extra/unittests/clangd/DexIndexTests.cpp
  clang-tools-extra/unittests/clangd/TestIndex.cpp
  clang-tools-extra/unittests/clangd/TestIndex.h

Index: clang-tools-extra/unittests/clangd/TestIndex.h
===================================================================
--- clang-tools-extra/unittests/clangd/TestIndex.h
+++ clang-tools-extra/unittests/clangd/TestIndex.h
@@ -39,6 +39,13 @@
                                const FuzzyFindRequest &Req,
                                bool *Incomplete = nullptr);
 
+// Performs fuzzy matching-based symbol lookup given a query and an index.
+// Incomplete is set true if more items than requested can be retrieved, false
+// otherwise. Returns filename URIs of matched symbols canonical declarations.
+std::vector<std::string> matchDeclURIs(const SymbolIndex &I,
+                                       const FuzzyFindRequest &Req,
+                                       bool *Incomplete = nullptr);
+
 // Returns qualified names of symbols with any of IDs in the index.
 std::vector<std::string> lookup(const SymbolIndex &I,
                                 llvm::ArrayRef<SymbolID> IDs);
Index: clang-tools-extra/unittests/clangd/TestIndex.cpp
===================================================================
--- clang-tools-extra/unittests/clangd/TestIndex.cpp
+++ clang-tools-extra/unittests/clangd/TestIndex.cpp
@@ -55,6 +55,18 @@
   return Matches;
 }
 
+std::vector<std::string> matchDeclURIs(const SymbolIndex &I,
+                                       const FuzzyFindRequest &Req,
+                                       bool *Incomplete) {
+  std::vector<std::string> Matches;
+  bool IsIncomplete = I.fuzzyFind(Req, [&](const Symbol &Sym) {
+    Matches.push_back(Sym.CanonicalDeclaration.FileURI);
+  });
+  if (Incomplete)
+    *Incomplete = IsIncomplete;
+  return Matches;
+}
+
 // Returns qualified names of symbols with any of IDs in the index.
 std::vector<std::string> lookup(const SymbolIndex &I,
                                 llvm::ArrayRef<SymbolID> IDs) {
Index: clang-tools-extra/unittests/clangd/DexIndexTests.cpp
===================================================================
--- clang-tools-extra/unittests/clangd/DexIndexTests.cpp
+++ clang-tools-extra/unittests/clangd/DexIndexTests.cpp
@@ -7,6 +7,8 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "FuzzyMatch.h"
+#include "TestFS.h"
 #include "TestIndex.h"
 #include "index/Index.h"
 #include "index/Merge.h"
@@ -30,11 +32,17 @@
 namespace dex {
 namespace {
 
+std::vector<std::string> URISchemes = {"unittest"};
+
+//===----------------------------------------------------------------------===//
+// Query iterator tests.
+//===----------------------------------------------------------------------===//
+
 std::vector<DocID> consumeIDs(Iterator &It) {
   auto IDAndScore = consume(It);
   std::vector<DocID> IDs(IDAndScore.size());
   for (size_t I = 0; I < IDAndScore.size(); ++I)
-    IDs[I] = IDAndScore[I].first;
+    IDs[I] = IDAndScore[I].ID;
   return IDs;
 }
 
@@ -325,15 +333,38 @@
   EXPECT_THAT(ElementBoost, 3);
 }
 
+//===----------------------------------------------------------------------===//
+// Search token tests.
+//===----------------------------------------------------------------------===//
+
 testing::Matcher<std::vector<Token>>
-trigramsAre(std::initializer_list<std::string> Trigrams) {
+tokensAre(std::initializer_list<std::string> Strings, Token::Kind Kind) {
   std::vector<Token> Tokens;
-  for (const auto &Symbols : Trigrams) {
-    Tokens.push_back(Token(Token::Kind::Trigram, Symbols));
+  for (const auto &TokenData : Strings) {
+    Tokens.push_back(Token(Kind, TokenData));
   }
   return testing::UnorderedElementsAreArray(Tokens);
 }
 
+testing::Matcher<std::vector<Token>>
+trigramsAre(std::initializer_list<std::string> Trigrams) {
+  return tokensAre(Trigrams, Token::Kind::Trigram);
+}
+
+testing::Matcher<std::vector<Token>>
+pathsAre(std::initializer_list<std::string> Paths) {
+  return tokensAre(Paths, Token::Kind::ProximityURI);
+}
+
+testing::Matcher<std::vector<Token>>
+proximityPathsAre(std::initializer_list<std::string> ProximityPaths) {
+  std::vector<Token> Result;
+  for (const auto &Path : ProximityPaths) {
+    Result.emplace_back(Token::Kind::ProximityURI, Path);
+  }
+  return testing::UnorderedElementsAreArray(Result);
+}
+
 TEST(DexIndexTrigrams, IdentifierTrigrams) {
   EXPECT_THAT(generateIdentifierTrigrams("X86"),
               trigramsAre({"x86", "x$$", "x8$"}));
@@ -408,8 +439,32 @@
                            "hij", "ijk", "jkl", "klm"}));
 }
 
+TEST(DexSearchTokens, SymbolPath) {
+  EXPECT_THAT(generateProximityURIs(
+                  "unittest:///clang-tools-extra/clangd/index/Token.h"),
+              pathsAre({"unittest:///clang-tools-extra/clangd/index/Token.h",
+                        "unittest:///clang-tools-extra/clangd/index",
+                        "unittest:///clang-tools-extra/clangd",
+                        "unittest:///clang-tools-extra", "unittest:///"}));
+}
+
+TEST(DexSearchTokens, QueryProximityDistances) {
+  llvm::SmallString<30> Path(testRoot());
+  llvm::sys::path::append(Path, "/a/b/c/d/e/f/g.h");
+  EXPECT_THAT(generateQueryProximityURIs(Path, URISchemes),
+              proximityPathsAre({{"unittest:///a/b/c/d/e/f/g.h"},
+                                 {"unittest:///a/b/c/d/e/f"},
+                                 {"unittest:///a/b/c/d/e"},
+                                 {"unittest:///a/b/c/d"},
+                                 {"unittest:///a/b/c"}}));
+}
+
+//===----------------------------------------------------------------------===//
+// Index tests.
+//===----------------------------------------------------------------------===//
+
 TEST(DexIndex, Lookup) {
-  auto I = DexIndex::build(generateSymbols({"ns::abc", "ns::xyz"}));
+  auto I = DexIndex::build(generateSymbols({"ns::abc", "ns::xyz"}), URISchemes);
   EXPECT_THAT(lookup(*I, SymbolID("ns::abc")), UnorderedElementsAre("ns::abc"));
   EXPECT_THAT(lookup(*I, {SymbolID("ns::abc"), SymbolID("ns::xyz")}),
               UnorderedElementsAre("ns::abc", "ns::xyz"));
@@ -421,7 +476,8 @@
 TEST(DexIndex, FuzzyFind) {
   auto Index = DexIndex::build(
       generateSymbols({"ns::ABC", "ns::BCD", "::ABC", "ns::nested::ABC",
-                       "other::ABC", "other::A"}));
+                       "other::ABC", "other::A"}),
+      URISchemes);
   FuzzyFindRequest Req;
   Req.Query = "ABC";
   Req.Scopes = {"ns::"};
@@ -443,7 +499,8 @@
 
 TEST(DexIndexTest, FuzzyMatchQ) {
   auto I = DexIndex::build(
-      generateSymbols({"LaughingOutLoud", "LionPopulation", "LittleOldLady"}));
+      generateSymbols({"LaughingOutLoud", "LionPopulation", "LittleOldLady"}),
+      URISchemes);
   FuzzyFindRequest Req;
   Req.Query = "lol";
   Req.MaxCandidateCount = 2;
@@ -461,12 +518,12 @@
                                  symbol("2") /* duplicate */};
   FuzzyFindRequest Req;
   Req.Query = "2";
-  DexIndex I(Symbols);
+  DexIndex I(Symbols, URISchemes);
   EXPECT_THAT(match(I, Req), ElementsAre("2", "2"));
 }
 
 TEST(DexIndexTest, DexIndexLimitedNumMatches) {
-  auto I = DexIndex::build(generateNumSymbols(0, 100));
+  auto I = DexIndex::build(generateNumSymbols(0, 100), URISchemes);
   FuzzyFindRequest Req;
   Req.Query = "5";
   Req.MaxCandidateCount = 3;
@@ -478,73 +535,132 @@
 
 TEST(DexIndexTest, FuzzyMatch) {
   auto I = DexIndex::build(
-      generateSymbols({"LaughingOutLoud", "LionPopulation", "LittleOldLady"}));
+      generateSymbols({"LaughingOutLoud", "LionPopulation", "LittleOldLady"}),
+      URISchemes);
   FuzzyFindRequest Req;
   Req.Query = "lol";
   Req.MaxCandidateCount = 2;
   EXPECT_THAT(match(*I, Req),
               UnorderedElementsAre("LaughingOutLoud", "LittleOldLady"));
 }
 
 TEST(DexIndexTest, MatchQualifiedNamesWithoutSpecificScope) {
-  auto I = DexIndex::build(generateSymbols({"a::y1", "b::y2", "y3"}));
+  auto I =
+      DexIndex::build(generateSymbols({"a::y1", "b::y2", "y3"}), URISchemes);
   FuzzyFindRequest Req;
   Req.Query = "y";
   EXPECT_THAT(match(*I, Req), UnorderedElementsAre("a::y1", "b::y2", "y3"));
 }
 
 TEST(DexIndexTest, MatchQualifiedNamesWithGlobalScope) {
-  auto I = DexIndex::build(generateSymbols({"a::y1", "b::y2", "y3"}));
+  auto I =
+      DexIndex::build(generateSymbols({"a::y1", "b::y2", "y3"}), URISchemes);
   FuzzyFindRequest Req;
   Req.Query = "y";
   Req.Scopes = {""};
   EXPECT_THAT(match(*I, Req), UnorderedElementsAre("y3"));
 }
 
 TEST(DexIndexTest, MatchQualifiedNamesWithOneScope) {
   auto I = DexIndex::build(
-      generateSymbols({"a::y1", "a::y2", "a::x", "b::y2", "y3"}));
+      generateSymbols({"a::y1", "a::y2", "a::x", "b::y2", "y3"}), URISchemes);
   FuzzyFindRequest Req;
   Req.Query = "y";
   Req.Scopes = {"a::"};
   EXPECT_THAT(match(*I, Req), UnorderedElementsAre("a::y1", "a::y2"));
 }
 
 TEST(DexIndexTest, MatchQualifiedNamesWithMultipleScopes) {
   auto I = DexIndex::build(
-      generateSymbols({"a::y1", "a::y2", "a::x", "b::y3", "y3"}));
+      generateSymbols({"a::y1", "a::y2", "a::x", "b::y3", "y3"}), URISchemes);
   FuzzyFindRequest Req;
   Req.Query = "y";
   Req.Scopes = {"a::", "b::"};
   EXPECT_THAT(match(*I, Req), UnorderedElementsAre("a::y1", "a::y2", "b::y3"));
 }
 
 TEST(DexIndexTest, NoMatchNestedScopes) {
-  auto I = DexIndex::build(generateSymbols({"a::y1", "a::b::y2"}));
+  auto I = DexIndex::build(generateSymbols({"a::y1", "a::b::y2"}), URISchemes);
   FuzzyFindRequest Req;
   Req.Query = "y";
   Req.Scopes = {"a::"};
   EXPECT_THAT(match(*I, Req), UnorderedElementsAre("a::y1"));
 }
 
 TEST(DexIndexTest, IgnoreCases) {
-  auto I = DexIndex::build(generateSymbols({"ns::ABC", "ns::abc"}));
+  auto I = DexIndex::build(generateSymbols({"ns::ABC", "ns::abc"}), URISchemes);
   FuzzyFindRequest Req;
   Req.Query = "AB";
   Req.Scopes = {"ns::"};
   EXPECT_THAT(match(*I, Req), UnorderedElementsAre("ns::ABC", "ns::abc"));
 }
 
 TEST(DexIndexTest, Lookup) {
-  auto I = DexIndex::build(generateSymbols({"ns::abc", "ns::xyz"}));
+  auto I = DexIndex::build(generateSymbols({"ns::abc", "ns::xyz"}), URISchemes);
   EXPECT_THAT(lookup(*I, SymbolID("ns::abc")), UnorderedElementsAre("ns::abc"));
   EXPECT_THAT(lookup(*I, {SymbolID("ns::abc"), SymbolID("ns::xyz")}),
               UnorderedElementsAre("ns::abc", "ns::xyz"));
   EXPECT_THAT(lookup(*I, {SymbolID("ns::nonono"), SymbolID("ns::xyz")}),
               UnorderedElementsAre("ns::xyz"));
   EXPECT_THAT(lookup(*I, SymbolID("ns::nonono")), UnorderedElementsAre());
 }
 
+TEST(DexIndexTest, ProximityPathsBoosting) {
+  // All strings should be stored in local variables to ensure their lifetime is
+  // the same as Slab's.
+  std::string Name = "abc";
+
+  // Populate Symbol Slab: generate two symbols with the same characteristics
+  // and hence quality, but different CanonicalDeclaration file.
+  SymbolSlab::Builder Builder;
+
+  Symbol RootSymbol;
+  RootSymbol.Name = Name;
+  llvm::SmallString<30> RootPath(testRoot());
+  llvm::sys::path::append(RootPath, "file.h");
+  auto RootURI = URI::create(RootPath, "unittest");
+  assert(RootURI);
+  std::string RootFilename = RootURI->toString();
+  RootSymbol.CanonicalDeclaration.FileURI = RootFilename;
+  RootSymbol.ID = SymbolID("RootSymbol");
+  Builder.insert(RootSymbol);
+
+  Symbol CloseSymbol;
+  CloseSymbol.Name = Name;
+  llvm::SmallString<30> ClosePath(testRoot());
+  llvm::sys::path::append(ClosePath, "/a/b/c/d/e/f/file.h");
+  auto CloseFileURI = URI::create(ClosePath, "unittest");
+  assert(CloseFileURI);
+  std::string CloseFileName = CloseFileURI->toString();
+  CloseSymbol.CanonicalDeclaration.FileURI = CloseFileName;
+  CloseSymbol.ID = SymbolID("CloseSymbol");
+  Builder.insert(CloseSymbol);
+
+  auto I = DexIndex::build(std::move(Builder).build(), URISchemes);
+
+  FuzzyFindRequest Req;
+  Req.Query = "abc";
+  // Return only one element, because the order of callback calls is not
+  // specified: items with lower score can be matched first. Therefore, to check
+  // whether the necessary item has higher score, test should only retrieve one
+  // element (with highest cumulative score).
+  Req.MaxCandidateCount = 1;
+
+  // FuzzyFind request comes from the file which is far from the root: expect
+  // CloseSymbol to come out.
+  llvm::SmallString<30> RequestPath(testRoot());
+  llvm::sys::path::append(RequestPath, "/a/b/c/d/e/f/file.h");
+  Req.ProximityPaths = {RequestPath.str()};
+  EXPECT_THAT(matchDeclURIs(*I, Req), ElementsAre(CloseFileURI->toString()));
+
+  // FuzzyFind request comes from the file which is close to the root: expect
+  // RootSymbol to come out.
+  RequestPath = testRoot();
+  llvm::sys::path::append(RequestPath, "file.h");
+  Req.ProximityPaths = {RequestPath.str()};
+  EXPECT_THAT(matchDeclURIs(*I, Req), ElementsAre(RootURI->toString()));
+}
+
 } // namespace
 } // namespace dex
 } // namespace clangd
Index: clang-tools-extra/clangd/tool/ClangdMain.cpp
===================================================================
--- clang-tools-extra/clangd/tool/ClangdMain.cpp
+++ clang-tools-extra/clangd/tool/ClangdMain.cpp
@@ -42,7 +42,9 @@
 // Build an in-memory static index for global symbols from a YAML-format file.
 // The size of global symbols should be relatively small, so that all symbols
 // can be managed in memory.
-std::unique_ptr<SymbolIndex> buildStaticIndex(llvm::StringRef YamlSymbolFile) {
+std::unique_ptr<SymbolIndex>
+buildStaticIndex(llvm::StringRef YamlSymbolFile,
+                 llvm::ArrayRef<std::string> URISchemes) {
   auto Buffer = llvm::MemoryBuffer::getFile(YamlSymbolFile);
   if (!Buffer) {
     llvm::errs() << "Can't open " << YamlSymbolFile << "\n";
@@ -53,9 +55,10 @@
   for (auto Sym : Slab)
     SymsBuilder.insert(Sym);
 
-  return UseDex ? dex::DexIndex::build(std::move(SymsBuilder).build())
-                : MemIndex::build(std::move(SymsBuilder).build(),
-                                  SymbolOccurrenceSlab::createEmpty());
+  return UseDex
+             ? dex::DexIndex::build(std::move(SymsBuilder).build(), URISchemes)
+             : MemIndex::build(std::move(SymsBuilder).build(),
+                               SymbolOccurrenceSlab::createEmpty());
 }
 
 } // namespace
@@ -299,7 +302,7 @@
   Opts.BuildDynamicSymbolIndex = EnableIndex;
   std::unique_ptr<SymbolIndex> StaticIdx;
   if (EnableIndex && !YamlSymbolFile.empty()) {
-    StaticIdx = buildStaticIndex(YamlSymbolFile);
+    StaticIdx = buildStaticIndex(YamlSymbolFile, Opts.URISchemes);
     Opts.StaticIndex = StaticIdx.get();
   }
   Opts.AsyncThreadsCount = WorkerThreadsCount;
Index: clang-tools-extra/clangd/index/dex/Token.h
===================================================================
--- clang-tools-extra/clangd/index/dex/Token.h
+++ clang-tools-extra/clangd/index/dex/Token.h
@@ -41,22 +41,31 @@
   /// Kind specifies Token type which defines semantics for the internal
   /// representation. Each Kind has different representation stored in Data
   /// field.
+  // FIXME(kbobyrev): Storing Data hash would be more efficient than storing raw
+  // strings. For example, PathURI store URIs of each directory and its parents,
+  // which induces a lot of overhead because these paths tend to be long and
+  // each parent directory is a prefix.
   enum class Kind {
     /// Represents trigram used for fuzzy search of unqualified symbol names.
     ///
     /// Data contains 3 bytes with trigram contents.
     Trigram,
     /// Scope primitives, e.g. "symbol belongs to namespace foo::bar".
     ///
-    /// Data stroes full scope name , e.g. "foo::bar::baz::" or "" (for global
+    /// Data stroes full scope name, e.g. "foo::bar::baz::" or "" (for global
     /// scope).
     Scope,
+    /// Path Proximity URI to symbol declaration.
+    ///
+    /// Data stores path URI of symbol declaration file or its parent.
+    ///
+    /// Example: "file:///path/to/clang-tools-extra/clangd/index/SymbolIndex.h"
+    /// and some amount of its parents.
+    ProximityURI,
     /// Internal Token type for invalid/special tokens, e.g. empty tokens for
     /// llvm::DenseMap.
     Sentinel,
     /// FIXME(kbobyrev): Add other Token Kinds
-    /// * Path with full or relative path to the directory in which symbol is
-    ///   defined
     /// * Type with qualified type name or its USR
   };
 
@@ -81,6 +90,16 @@
   }
 };
 
+/// Returns Search Token for a number of parent directories of given Path.
+/// Should be used within the index build process.
+std::vector<Token> generateProximityURIs(llvm::StringRef Path);
+
+/// Returns Search Token and its distance from the absolute path for the first
+/// URI scheme allows valid conversions from AbsolutePath.
+std::vector<Token>
+generateQueryProximityURIs(llvm::StringRef AbsolutePath,
+                           llvm::ArrayRef<std::string> URISchemes);
+
 } // namespace dex
 } // namespace clangd
 } // namespace clang
Index: clang-tools-extra/clangd/index/dex/Token.cpp
===================================================================
--- /dev/null
+++ clang-tools-extra/clangd/index/dex/Token.cpp
@@ -0,0 +1,73 @@
+//===--- Token.cpp - Symbol Search primitive --------------------*- C++ -*-===//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "Token.h"
+#include "URI.h"
+#include "llvm/Support/Path.h"
+#include <cassert>
+
+namespace clang {
+namespace clangd {
+namespace dex {
+
+std::vector<Token> generateProximityURIs(llvm::StringRef URIPath) {
+  std::vector<Token> Result;
+  // Don't generate any tokens for empty paths.
+  if (URIPath.empty())
+    return Result;
+  auto ParsedURI = URI::parse(URIPath);
+  assert(ParsedURI &&
+         "Non-empty argument of generateProximityURIs() should be a valid "
+         "URI.");
+  const auto Scheme = ParsedURI->scheme();
+  const auto Authority = ParsedURI->authority();
+  StringRef Path = ParsedURI->body();
+  // FIXME(kbobyrev): Currently, this is a heuristic which defines the maximum
+  // size of resulting vector. Some projects might want to have higher limit if
+  // the file hierarchy is deeper. For the generic case, it would be useful to
+  // calculate Limit in the index build stage by calculating the maximum depth
+  // of the project source tree at runtime.
+  size_t Limit = 5;
+  // Insert original URI before the loop: this would save a redundant iteration
+  // with a URI parse.
+  Result.emplace_back(Token::Kind::ProximityURI, ParsedURI->toString());
+  while (!Path.empty() && --Limit) {
+    // FIXME(kbobyrev): Parsing and encoding path to URIs is not necessary and
+    // could be optimized.
+    Path = llvm::sys::path::parent_path(Path, llvm::sys::path::Style::posix);
+    URI TokenURI(Scheme, Authority, Path);
+    Result.emplace_back(Token::Kind::ProximityURI, TokenURI.toString());
+  }
+  return Result;
+}
+
+std::vector<Token>
+generateQueryProximityURIs(llvm::StringRef AbsolutePath,
+                           llvm::ArrayRef<std::string> URISchemes) {
+  std::vector<Token> Result;
+  for (const auto Scheme : URISchemes) {
+    auto URI = URI::create(AbsolutePath, Scheme);
+    // For some paths, conversion to different URI schemes is impossible. These
+    // should be just skipped.
+    if (!URI) {
+      // Ignore the error.
+      llvm::consumeError(URI.takeError());
+      continue;
+    }
+    for (const auto ProximityPath : generateProximityURIs(URI->toString()))
+      Result.emplace_back(ProximityPath);
+    // As soon as one valid URI scheme for given absolute path is found, the
+    // token generation process should be stopped.
+    break;
+  }
+  return Result;
+}
+
+} // namespace dex
+} // namespace clangd
+} // namespace clang
Index: clang-tools-extra/clangd/index/dex/Iterator.h
===================================================================
--- clang-tools-extra/clangd/index/dex/Iterator.h
+++ clang-tools-extra/clangd/index/dex/Iterator.h
@@ -121,15 +121,22 @@
   virtual llvm::raw_ostream &dump(llvm::raw_ostream &OS) const = 0;
 };
 
+struct DocIDAndScore {
+  DocID ID;
+  float Score;
+};
+
+bool operator>(const DocIDAndScore &LHS, const DocIDAndScore &RHS);
+
 /// Advances the iterator until it is exhausted. Returns pairs of document IDs
 /// with the corresponding boosting score.
 ///
 /// Boosting can be seen as a compromise between retrieving too many items and
 /// calculating finals score for each of them (which might be very expensive)
 /// and not retrieving enough items so that items with very high final score
 /// would not be processed. Boosting score is a computationally efficient way
 /// to acquire preliminary scores of requested items.
-std::vector<std::pair<DocID, float>> consume(Iterator &It);
+std::vector<DocIDAndScore> consume(Iterator &It);
 
 /// Returns a document iterator over given PostingList.
 ///
Index: clang-tools-extra/clangd/index/dex/Iterator.cpp
===================================================================
--- clang-tools-extra/clangd/index/dex/Iterator.cpp
+++ clang-tools-extra/clangd/index/dex/Iterator.cpp
@@ -389,10 +389,14 @@
 
 } // end namespace
 
-std::vector<std::pair<DocID, float>> consume(Iterator &It) {
-  std::vector<std::pair<DocID, float>> Result;
+bool operator>(const DocIDAndScore &LHS, const DocIDAndScore &RHS) {
+  return LHS.Score > RHS.Score;
+}
+
+std::vector<DocIDAndScore> consume(Iterator &It) {
+  std::vector<DocIDAndScore> Result;
   for (; !It.reachedEnd(); It.advance())
-    Result.emplace_back(It.peek(), It.consume());
+    Result.push_back({It.peek(), It.consume()});
   return Result;
 }
 
Index: clang-tools-extra/clangd/index/dex/DexIndex.h
===================================================================
--- clang-tools-extra/clangd/index/dex/DexIndex.h
+++ clang-tools-extra/clangd/index/dex/DexIndex.h
@@ -39,22 +39,26 @@
 class DexIndex : public SymbolIndex {
 public:
   // All symbols must outlive this index.
-  template <typename Range> DexIndex(Range &&Symbols) {
+  template <typename Range>
+  DexIndex(Range &&Symbols, llvm::ArrayRef<std::string> URISchemes)
+      : URISchemes(URISchemes) {
     for (auto &&Sym : Symbols)
       this->Symbols.push_back(&Sym);
     buildIndex();
   }
   // Symbols are owned by BackingData, Index takes ownership.
   template <typename Range, typename Payload>
-  DexIndex(Range &&Symbols, Payload &&BackingData)
-      : DexIndex(std::forward<Range>(Symbols)) {
+  DexIndex(Range &&Symbols, Payload &&BackingData,
+           llvm::ArrayRef<std::string> URISchemes)
+      : DexIndex(std::forward<Range>(Symbols), URISchemes) {
     KeepAlive = std::shared_ptr<void>(
         std::make_shared<Payload>(std::move(BackingData)), nullptr);
   }
 
   /// Builds an index from a slab. The index takes ownership of the slab.
-  static std::unique_ptr<SymbolIndex> build(SymbolSlab Slab) {
-    return llvm::make_unique<DexIndex>(Slab, std::move(Slab));
+  static std::unique_ptr<SymbolIndex>
+  build(SymbolSlab Slab, llvm::ArrayRef<std::string> URISchemes) {
+    return llvm::make_unique<DexIndex>(Slab, std::move(Slab), URISchemes);
   }
 
   bool
@@ -73,21 +77,25 @@
 private:
   void buildIndex();
 
+  /// Stores symbols in the descending order using symbol quality as the key.
   std::vector<const Symbol *> Symbols;
+  /// Maps DocID to the quality of underlying symbol.
+  std::vector<float> SymbolQuality;
   llvm::DenseMap<SymbolID, const Symbol *> LookupTable;
-  llvm::DenseMap<const Symbol *, float> SymbolQuality;
-  // Inverted index is a mapping from the search token to the posting list,
-  // which contains all items which can be characterized by such search token.
-  // For example, if the search token is scope "std::", the corresponding
-  // posting list would contain all indices of symbols defined in namespace std.
-  // Inverted index is used to retrieve posting lists which are processed during
-  // the fuzzyFind process.
+  /// Inverted index is a mapping from the search token to the posting list,
+  /// which contains all items which can be characterized by such search token.
+  /// For example, if the search token is scope "std::", the corresponding
+  /// posting list would contain all indices of symbols defined in namespace
+  /// std. Inverted index is used to retrieve posting lists which are processed
+  /// during the fuzzyFind process.
   llvm::DenseMap<Token, PostingList> InvertedIndex;
   std::shared_ptr<void> KeepAlive; // poor man's move-only std::any
+
+  const std::vector<std::string> URISchemes;
 };
 
 } // namespace dex
 } // namespace clangd
 } // namespace clang
 
-#endif
+#endif // LLVM_CLANG_TOOLS_EXTRA_CLANGD_INDEX_DEX_DEXINDEX_H
Index: clang-tools-extra/clangd/index/dex/DexIndex.cpp
===================================================================
--- clang-tools-extra/clangd/index/dex/DexIndex.cpp
+++ clang-tools-extra/clangd/index/dex/DexIndex.cpp
@@ -8,8 +8,10 @@
 //===----------------------------------------------------------------------===//
 
 #include "DexIndex.h"
+#include "../../FileDistance.h"
 #include "../../FuzzyMatch.h"
 #include "../../Logger.h"
+#include "../../Quality.h"
 #include <algorithm>
 #include <queue>
 
@@ -31,23 +33,44 @@
 std::vector<Token> generateSearchTokens(const Symbol &Sym) {
   std::vector<Token> Result = generateIdentifierTrigrams(Sym.Name);
   Result.emplace_back(Token::Kind::Scope, Sym.Scope);
+  for (const auto &ProximityToken :
+       generateProximityURIs(Sym.CanonicalDeclaration.FileURI))
+    Result.emplace_back(ProximityToken);
   return Result;
 }
 
+struct SymbolAndScore {
+  const Symbol *Sym;
+  float Score;
+};
+
+bool operator>(const SymbolAndScore &LHS, const SymbolAndScore &RHS) {
+  return LHS.Score > RHS.Score;
+}
+
 } // namespace
 
 void DexIndex::buildIndex() {
-  for (const Symbol *Sym : Symbols) {
+  std::vector<SymbolAndScore> SymbolAndScores(Symbols.size());
+
+  for (size_t I = 0; I < Symbols.size(); ++I) {
+    const Symbol *Sym = Symbols[I];
     LookupTable[Sym->ID] = Sym;
-    SymbolQuality[Sym] = quality(*Sym);
+    SymbolAndScores[I] = {Sym, static_cast<float>(quality(*Sym))};
   }
 
   // Symbols are sorted by symbol qualities so that items in the posting lists
   // are stored in the descending order of symbol quality.
-  std::sort(begin(Symbols), end(Symbols),
-            [&](const Symbol *LHS, const Symbol *RHS) {
-              return SymbolQuality[LHS] > SymbolQuality[RHS];
-            });
+  std::sort(begin(SymbolAndScores), end(SymbolAndScores),
+            std::greater<SymbolAndScore>());
+
+  // SymbolQuality was empty up until now.
+  SymbolQuality.resize(Symbols.size());
+  // Populate internal storage using Symbol + Score pairs.
+  for (size_t I = 0; I < SymbolAndScores.size(); ++I) {
+    Symbols[I] = SymbolAndScores[I].Sym;
+    SymbolQuality[I] = SymbolAndScores[I].Score;
+  }
 
   // Populate TempInvertedIndex with posting lists for index symbols.
   for (DocID SymbolRank = 0; SymbolRank < Symbols.size(); ++SymbolRank) {
@@ -96,6 +119,34 @@
   if (!ScopeIterators.empty())
     TopLevelChildren.push_back(createOr(move(ScopeIterators)));
 
+  // Add proximity paths boosting.
+  std::vector<std::unique_ptr<Iterator>> BoostingIterators;
+  // Try to build BOOST iterator for each Proximity Path provided by
+  // FuzzyFindRequest. Boosting factor should depend on the distance to the
+  // Proximity Path: the closer processed path is, the higher boosting factor.
+  for (const auto &ProximityPath : Req.ProximityPaths) {
+    const auto ProximityURIs =
+        generateQueryProximityURIs(ProximityPath, URISchemes);
+    for (size_t Distance = 0; Distance < ProximityURIs.size(); ++Distance) {
+      const auto It = InvertedIndex.find(ProximityURIs[Distance]);
+      if (It != InvertedIndex.end()) {
+        // FIXME(kbobyrev): Append LIMIT on top of every BOOST iterator.
+        // parentProximityScore() scales score within [0, 1] range so that it is
+        // aligned with final Quality measurements.
+        float BoostFactor = 1 + parentProximityScore(Distance);
+        BoostingIterators.push_back(
+            createBoost(create(It->second), BoostFactor));
+      }
+    }
+  }
+  // Boosting iterators do not actually filter symbols. In order to preserve
+  // the validity of resulting query, TRUE iterator should be added along
+  // BOOSTs.
+  if (!BoostingIterators.empty()) {
+    BoostingIterators.push_back(createTrue(Symbols.size()));
+    TopLevelChildren.push_back(createOr(move(BoostingIterators)));
+  }
+
   // Use TRUE iterator if both trigrams and scopes from the query are not
   // present in the symbol index.
   auto QueryIterator = TopLevelChildren.empty()
@@ -106,32 +157,43 @@
   // FIXME(kbobyrev): Pre-scoring retrieval threshold should be adjusted as
   // using 100x of the requested number might not be good in practice, e.g.
   // when the requested number of items is small.
-  const unsigned ItemsToRetrieve = 100 * Req.MaxCandidateCount;
+  const size_t ItemsToRetrieve = 100 * Req.MaxCandidateCount;
   auto Root = createLimit(move(QueryIterator), ItemsToRetrieve);
-  // FIXME(kbobyrev): Add boosting to the query and utilize retrieved
-  // boosting scores.
-  std::vector<std::pair<DocID, float>> SymbolDocIDs = consume(*Root);
+
+  std::vector<DocIDAndScore> IDAndScores = consume(*Root);
+
+  // Multiply scores by the quality once to use the cumulative score in both
+  // subsequent filtering stages.
+  for (size_t I = 0; I < IDAndScores.size(); ++I)
+    IDAndScores[I].Score *= SymbolQuality[IDAndScores[I].ID];
+
+  // Sort items using a combination of boosting score and quality.
+  // FIXME(kbobyrev): Consider merging this stage with the next one? This is
+  // a performance/quality trade-off and if the performance is not an issue
+  // there's plenty of quality to be bought by sacrificing part of
+  // performance.
+  std::sort(begin(IDAndScores), end(IDAndScores),
+            std::greater<DocIDAndScore>());
 
   // Retrieve top Req.MaxCandidateCount items.
-  std::priority_queue<std::pair<float, const Symbol *>> Top;
-  for (const auto &P : SymbolDocIDs) {
-    const DocID SymbolDocID = P.first;
+  const size_t ItemsToScore = 10 * Req.MaxCandidateCount;
+  TopN<DocIDAndScore> Top(Req.MaxCandidateCount);
+  for (size_t I = 0; I < std::min(IDAndScores.size(), ItemsToScore); ++I) {
+    const DocID SymbolDocID = IDAndScores[I].ID;
     const auto *Sym = Symbols[SymbolDocID];
     const llvm::Optional<float> Score = Filter.match(Sym->Name);
     if (!Score)
       continue;
-    // Multiply score by a negative factor so that Top stores items with the
-    // highest actual score.
-    Top.emplace(-(*Score) * SymbolQuality.find(Sym)->second, Sym);
-    if (Top.size() > Req.MaxCandidateCount) {
+    // If Top.push(...) returns true, it means that it had to pop an item. In
+    // this case, it is possible to retrieve more symbols.
+    if (Top.push({SymbolDocID, (*Score) * SymbolQuality[SymbolDocID]}))
       More = true;
-      Top.pop();
-    }
   }
 
-  // Apply callback to the top Req.MaxCandidateCount items.
-  for (; !Top.empty(); Top.pop())
-    Callback(*Top.top().second);
+  // Apply callback to the top Req.MaxCandidateCount items in the descending
+  // order of cumulative score.
+  for (const auto &Item : std::move(Top).items())
+    Callback(*Symbols[Item.ID]);
   return More;
 }
 
Index: clang-tools-extra/clangd/Quality.h
===================================================================
--- clang-tools-extra/clangd/Quality.h
+++ clang-tools-extra/clangd/Quality.h
@@ -125,6 +125,10 @@
 /// Combine symbol quality and relevance into a single score.
 float evaluateSymbolAndRelevance(float SymbolQuality, float SymbolRelevance);
 
+/// Returns proximity score given distance between a parent and a child
+/// directories URIs.
+float parentProximityScore(unsigned Distance);
+
 /// TopN<T> is a lossy container that preserves only the "best" N elements.
 template <typename T, typename Compare = std::greater<T>> class TopN {
 public:
Index: clang-tools-extra/clangd/Quality.cpp
===================================================================
--- clang-tools-extra/clangd/Quality.cpp
+++ clang-tools-extra/clangd/Quality.cpp
@@ -296,13 +296,17 @@
   NeedsFixIts = !SemaCCResult.FixIts.empty();
 }
 
+float parentProximityScore(unsigned Distance) {
+  return std::exp(Distance * -.4f / FileDistanceOptions().UpCost);
+}
+
 static std::pair<float, unsigned> proximityScore(llvm::StringRef SymbolURI,
                                                  URIDistance *D) {
   if (!D || SymbolURI.empty())
     return {0.f, 0u};
   unsigned Distance = D->distance(SymbolURI);
   // Assume approximately default options are used for sensible scoring.
-  return {std::exp(Distance * -0.4f / FileDistanceOptions().UpCost), Distance};
+  return {parentProximityScore(Distance), Distance};
 }
 
 float SymbolRelevanceSignals::evaluate() const {
Index: clang-tools-extra/clangd/CMakeLists.txt
===================================================================
--- clang-tools-extra/clangd/CMakeLists.txt
+++ clang-tools-extra/clangd/CMakeLists.txt
@@ -46,6 +46,7 @@
 
   index/dex/DexIndex.cpp
   index/dex/Iterator.cpp
+  index/dex/Token.cpp
   index/dex/Trigram.cpp
 
   LINK_LIBS

_______________________________________________
cfe-commits mailing list
cfe-commits@lists.llvm.org
http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

[PATCH] D51481: [clangd] Implement proximity path boosting for Dex

Reply via email to