kbobyrev updated this revision to Diff 163680. kbobyrev marked 17 inline comments as done. kbobyrev added a comment.
Address a round of comments, refactor code. https://reviews.llvm.org/D51481 Files: clang-tools-extra/; clang-tools-extra/clangd/CMakeLists.txt clang-tools-extra/clangd/index/dex/DexIndex.cpp clang-tools-extra/clangd/index/dex/DexIndex.h clang-tools-extra/clangd/index/dex/Token.cpp clang-tools-extra/clangd/index/dex/Token.h clang-tools-extra/clangd/tool/ClangdMain.cpp clang-tools-extra/unittests/clangd/DexIndexTests.cpp clang-tools-extra/unittests/clangd/TestFS.cpp clang-tools-extra/unittests/clangd/TestFS.h
Index: clang-tools-extra/unittests/clangd/TestFS.h =================================================================== --- clang-tools-extra/unittests/clangd/TestFS.h +++ clang-tools-extra/unittests/clangd/TestFS.h @@ -59,7 +59,7 @@ }; // Returns an absolute (fake) test directory for this OS. -const char *testRoot(); +std::string testRoot(); // Returns a suitable absolute path for this OS. std::string testPath(PathRef File); Index: clang-tools-extra/unittests/clangd/TestFS.cpp =================================================================== --- clang-tools-extra/unittests/clangd/TestFS.cpp +++ clang-tools-extra/unittests/clangd/TestFS.cpp @@ -64,7 +64,7 @@ FileName, std::move(CommandLine), "")}; } -const char *testRoot() { +std::string testRoot() { #ifdef _WIN32 return "C:\\clangd-test"; #else Index: clang-tools-extra/unittests/clangd/DexIndexTests.cpp =================================================================== --- clang-tools-extra/unittests/clangd/DexIndexTests.cpp +++ clang-tools-extra/unittests/clangd/DexIndexTests.cpp @@ -7,6 +7,8 @@ // //===----------------------------------------------------------------------===// +#include "FuzzyMatch.h" +#include "TestFS.h" #include "TestIndex.h" #include "index/Index.h" #include "index/Merge.h" @@ -29,6 +31,8 @@ namespace dex { namespace { +std::vector<std::string> URISchemes = {"unittest"}; + std::vector<DocID> consumeIDs(Iterator &It) { auto IDAndScore = consume(It); std::vector<DocID> IDs(IDAndScore.size()); @@ -325,14 +329,33 @@ } testing::Matcher<std::vector<Token>> -trigramsAre(std::initializer_list<std::string> Trigrams) { +tokensAre(std::initializer_list<std::string> Strings, Token::Kind Kind) { std::vector<Token> Tokens; - for (const auto &Symbols : Trigrams) { - Tokens.push_back(Token(Token::Kind::Trigram, Symbols)); + for (const auto &TokenData : Strings) { + Tokens.push_back(Token(Kind, TokenData)); } return testing::UnorderedElementsAreArray(Tokens); } +testing::Matcher<std::vector<Token>> +trigramsAre(std::initializer_list<std::string> Trigrams) { + return tokensAre(Trigrams, Token::Kind::Trigram); +} + +testing::Matcher<std::vector<Token>> +pathsAre(std::initializer_list<std::string> Paths) { + return tokensAre(Paths, Token::Kind::PathURI); +} + +testing::Matcher<std::vector<Token>> +proximityPathsAre(std::initializer_list<std::string> ProximityPaths) { + std::vector<Token> Result; + for (const auto &Path : ProximityPaths) { + Result.emplace_back(Token::Kind::PathURI, Path); + } + return testing::UnorderedElementsAreArray(Result); +} + TEST(DexIndexTrigrams, IdentifierTrigrams) { EXPECT_THAT(generateIdentifierTrigrams("X86"), trigramsAre({"x86", "x$$", "x8$"})); @@ -407,8 +430,27 @@ "hij", "ijk", "jkl", "klm"})); } +TEST(DexSearchTokens, SymbolPath) { + EXPECT_THAT(generateProximityPathURIs( + "unittest:///clang-tools-extra/clangd/index/Token.h"), + pathsAre({"unittest:///clang-tools-extra/clangd/index/Token.h", + "unittest:///clang-tools-extra/clangd/index", + "unittest:///clang-tools-extra/clangd", + "unittest:///clang-tools-extra", "unittest:///"})); +} + +TEST(DexSearchTokens, QueryProximityDistances) { + EXPECT_THAT(generateQueryProximityPathURIs(testRoot() + "/a/b/c/d/e/f/g.h", + URISchemes), + proximityPathsAre({{"unittest:///a/b/c/d/e/f/g.h"}, + {"unittest:///a/b/c/d/e/f"}, + {"unittest:///a/b/c/d/e"}, + {"unittest:///a/b/c/d"}, + {"unittest:///a/b/c"}})); +} + TEST(DexIndex, Lookup) { - DexIndex I; + DexIndex I(URISchemes); I.build(generateSymbols({"ns::abc", "ns::xyz"})); EXPECT_THAT(lookup(I, SymbolID("ns::abc")), UnorderedElementsAre("ns::abc")); EXPECT_THAT(lookup(I, {SymbolID("ns::abc"), SymbolID("ns::xyz")}), @@ -419,7 +461,7 @@ } TEST(DexIndex, FuzzyFind) { - DexIndex Index; + DexIndex Index(URISchemes); Index.build(generateSymbols({"ns::ABC", "ns::BCD", "::ABC", "ns::nested::ABC", "other::ABC", "other::A"})); FuzzyFindRequest Req; @@ -442,7 +484,7 @@ } TEST(DexIndexTest, FuzzyMatchQ) { - DexIndex I; + DexIndex I(URISchemes); I.build( generateSymbols({"LaughingOutLoud", "LionPopulation", "LittleOldLady"})); FuzzyFindRequest Req; @@ -453,7 +495,7 @@ } TEST(DexIndexTest, DexIndexSymbolsRecycled) { - DexIndex I; + DexIndex I(URISchemes); std::weak_ptr<SlabAndPointers> Symbols; I.build(generateNumSymbols(0, 10, &Symbols)); FuzzyFindRequest Req; @@ -482,14 +524,14 @@ FuzzyFindRequest Req; Req.Query = "7"; - DexIndex I; + DexIndex I(URISchemes); I.build(std::move(Symbols)); auto Matches = match(I, Req); EXPECT_EQ(Matches.size(), 4u); } TEST(DexIndexTest, DexIndexLimitedNumMatches) { - DexIndex I; + DexIndex I(URISchemes); I.build(generateNumSymbols(0, 100)); FuzzyFindRequest Req; Req.Query = "5"; @@ -501,7 +543,7 @@ } TEST(DexIndexTest, FuzzyMatch) { - DexIndex I; + DexIndex I(URISchemes); I.build( generateSymbols({"LaughingOutLoud", "LionPopulation", "LittleOldLady"})); FuzzyFindRequest Req; @@ -512,60 +554,60 @@ } TEST(DexIndexTest, MatchQualifiedNamesWithoutSpecificScope) { - DexIndex I; + DexIndex I(URISchemes); I.build(generateSymbols({"a::y1", "b::y2", "y3"})); FuzzyFindRequest Req; Req.Query = "y"; EXPECT_THAT(match(I, Req), UnorderedElementsAre("a::y1", "b::y2", "y3")); } TEST(DexIndexTest, MatchQualifiedNamesWithGlobalScope) { - DexIndex I; + DexIndex I(URISchemes); I.build(generateSymbols({"a::y1", "b::y2", "y3"})); FuzzyFindRequest Req; Req.Query = "y"; Req.Scopes = {""}; EXPECT_THAT(match(I, Req), UnorderedElementsAre("y3")); } TEST(DexIndexTest, MatchQualifiedNamesWithOneScope) { - DexIndex I; + DexIndex I(URISchemes); I.build(generateSymbols({"a::y1", "a::y2", "a::x", "b::y2", "y3"})); FuzzyFindRequest Req; Req.Query = "y"; Req.Scopes = {"a::"}; EXPECT_THAT(match(I, Req), UnorderedElementsAre("a::y1", "a::y2")); } TEST(DexIndexTest, MatchQualifiedNamesWithMultipleScopes) { - DexIndex I; + DexIndex I(URISchemes); I.build(generateSymbols({"a::y1", "a::y2", "a::x", "b::y3", "y3"})); FuzzyFindRequest Req; Req.Query = "y"; Req.Scopes = {"a::", "b::"}; EXPECT_THAT(match(I, Req), UnorderedElementsAre("a::y1", "a::y2", "b::y3")); } TEST(DexIndexTest, NoMatchNestedScopes) { - DexIndex I; + DexIndex I(URISchemes); I.build(generateSymbols({"a::y1", "a::b::y2"})); FuzzyFindRequest Req; Req.Query = "y"; Req.Scopes = {"a::"}; EXPECT_THAT(match(I, Req), UnorderedElementsAre("a::y1")); } TEST(DexIndexTest, IgnoreCases) { - DexIndex I; + DexIndex I(URISchemes); I.build(generateSymbols({"ns::ABC", "ns::abc"})); FuzzyFindRequest Req; Req.Query = "AB"; Req.Scopes = {"ns::"}; EXPECT_THAT(match(I, Req), UnorderedElementsAre("ns::ABC", "ns::abc")); } TEST(DexIndexTest, Lookup) { - DexIndex I; + DexIndex I(URISchemes); I.build(generateSymbols({"ns::abc", "ns::xyz"})); EXPECT_THAT(lookup(I, SymbolID("ns::abc")), UnorderedElementsAre("ns::abc")); EXPECT_THAT(lookup(I, {SymbolID("ns::abc"), SymbolID("ns::xyz")}), @@ -575,6 +617,58 @@ EXPECT_THAT(lookup(I, SymbolID("ns::nonono")), UnorderedElementsAre()); } +TEST(DexIndexTest, ProximityPathsBoosting) { + DexIndex I(URISchemes); + + // All strings should be stored in local variables to ensure their lifetime is + // the same as Slab's. + auto RootURI = URI::create(testRoot() + "/file.h", "unittest"); + assert(static_cast<bool>(RootURI)); + auto CloseFileURI = URI::create(testRoot() + "/a/b/c/d/e/file.h", "unittest"); + assert(static_cast<bool>(CloseFileURI)); + + // Populate Symbol Slab + std::vector<const Symbol *> Slab; + // Symbol in the root will perfectly match the request, but its canonical + // declaration is in the root directory. + Symbol RootSymbol; + std::string RootSymbolName = "abc"; + RootSymbol.Name = RootSymbolName; + std::string RootFilename = RootURI->toString(); + RootSymbol.CanonicalDeclaration.FileURI = RootFilename; + Slab.push_back(&RootSymbol); + + // Another symbol has much lower Fuzzy Matching Score, but it is very close to + // the directory the request would come from. + Symbol CloseSymbol; + std::string CloseSymbolName = "AbsolutelyBizzareCoincidenceMatch"; + CloseSymbol.Name = CloseSymbolName; + std::string CloseFileName = CloseFileURI->toString(); + CloseSymbol.CanonicalDeclaration.FileURI = CloseFileName; + + FuzzyFindRequest Req; + Req.Query = "abc"; + Req.MaxCandidateCount = 1; + // FuzzyFind request comes from the file which is far from the root. + Req.ProximityPaths.push_back(testRoot() + "/a/b/c/d/e/f/file.h"); + + FuzzyMatcher Matcher(Req.Query); + EXPECT_TRUE(Matcher.match(RootSymbolName) > Matcher.match(CloseSymbolName)); + + // Insert duplicates so that only the boosted symbols end up in the scoring + // stage. + // FIXME(kbobyrev): 10 is the number of items passed to the last stage which + // involves fuzzy scoring. Would having access to that information be better + // here? + for (size_t Index = 0; Index < 10; ++Index) + Slab.push_back(&CloseSymbol); + + I.build(std::make_shared<std::vector<const Symbol *>>(Slab)); + + EXPECT_THAT(match(I, Req), + UnorderedElementsAre("AbsolutelyBizzareCoincidenceMatch")); +} + } // namespace } // namespace dex } // namespace clangd Index: clang-tools-extra/clangd/tool/ClangdMain.cpp =================================================================== --- clang-tools-extra/clangd/tool/ClangdMain.cpp +++ clang-tools-extra/clangd/tool/ClangdMain.cpp @@ -42,7 +42,9 @@ // Build an in-memory static index for global symbols from a YAML-format file. // The size of global symbols should be relatively small, so that all symbols // can be managed in memory. -std::unique_ptr<SymbolIndex> buildStaticIndex(llvm::StringRef YamlSymbolFile) { +std::unique_ptr<SymbolIndex> +buildStaticIndex(llvm::StringRef YamlSymbolFile, + std::vector<std::string> &URISchemes) { auto Buffer = llvm::MemoryBuffer::getFile(YamlSymbolFile); if (!Buffer) { llvm::errs() << "Can't open " << YamlSymbolFile << "\n"; @@ -53,9 +55,10 @@ for (auto Sym : Slab) SymsBuilder.insert(Sym); - return UseDex ? dex::DexIndex::build(std::move(SymsBuilder).build()) - : MemIndex::build(std::move(SymsBuilder).build(), - SymbolOccurrenceSlab::createEmpty()); + return UseDex + ? dex::DexIndex::build(std::move(SymsBuilder).build(), URISchemes) + : MemIndex::build(std::move(SymsBuilder).build(), + SymbolOccurrenceSlab::createEmpty()); } } // namespace @@ -299,7 +302,7 @@ Opts.BuildDynamicSymbolIndex = EnableIndex; std::unique_ptr<SymbolIndex> StaticIdx; if (EnableIndex && !YamlSymbolFile.empty()) { - StaticIdx = buildStaticIndex(YamlSymbolFile); + StaticIdx = buildStaticIndex(YamlSymbolFile, Opts.URISchemes); Opts.StaticIndex = StaticIdx.get(); } Opts.AsyncThreadsCount = WorkerThreadsCount; Index: clang-tools-extra/clangd/index/dex/Token.h =================================================================== --- clang-tools-extra/clangd/index/dex/Token.h +++ clang-tools-extra/clangd/index/dex/Token.h @@ -41,22 +41,32 @@ /// Kind specifies Token type which defines semantics for the internal /// representation. Each Kind has different representation stored in Data /// field. + // FIXME(kbobyrev): Storing Data hash would be more efficient than storing raw + // strings. For example, PathURI store URIs of each directory and its parents, + // which induces a lot of overhead because these paths tend to be long and + // each parent directory is a prefix. enum class Kind { /// Represents trigram used for fuzzy search of unqualified symbol names. /// /// Data contains 3 bytes with trigram contents. Trigram, /// Scope primitives, e.g. "symbol belongs to namespace foo::bar". /// - /// Data stroes full scope name , e.g. "foo::bar::baz::" or "" (for global + /// Data stroes full scope name, e.g. "foo::bar::baz::" or "" (for global /// scope). Scope, + /// Path URI to symbol declaration. + /// + /// Data stores path URI of file and each parent directory of symbol + /// declaration location. + /// + /// Example: "file:///path/to/clang-tools-extra/clangd/index/SymbolIndex.h" + /// and all its parents. + PathURI, /// Internal Token type for invalid/special tokens, e.g. empty tokens for /// llvm::DenseMap. Sentinel, /// FIXME(kbobyrev): Add other Token Kinds - /// * Path with full or relative path to the directory in which symbol is - /// defined /// * Type with qualified type name or its USR }; @@ -81,6 +91,16 @@ } }; +/// Returns Search Token for a number of parent directories of given Path. +/// Should be used within the index build process. +std::vector<Token> generateProximityPathURIs(llvm::StringRef Path); + +/// Returns Search Token and its distance from the absolute path for the first +/// URI scheme allows valid conversions from AbsolutePath. +std::vector<Token> +generateQueryProximityPathURIs(llvm::StringRef AbsolutePath, + llvm::ArrayRef<std::string> URISchemes); + } // namespace dex } // namespace clangd } // namespace clang Index: clang-tools-extra/clangd/index/dex/Token.cpp =================================================================== --- /dev/null +++ clang-tools-extra/clangd/index/dex/Token.cpp @@ -0,0 +1,70 @@ +//===--- Token.cpp - Symbol Search primitive --------------------*- C++ -*-===// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#include "Token.h" +#include "URI.h" +#include "llvm/Support/Path.h" +#include <cassert> + +namespace clang { +namespace clangd { +namespace dex { + +std::vector<Token> generateProximityPathURIs(llvm::StringRef URIPath) { + std::vector<Token> Result; + // Don't generate any tokens for empty paths. + if (URIPath.empty()) + return Result; + auto ParsedURI = URI::parse(URIPath); + assert(static_cast<bool>(ParsedURI) && + "Non-empty argument of generateProximityPathURIs() should be a valid " + "URI."); + const auto Scheme = ParsedURI->scheme(); + const auto Authority = ParsedURI->authority(); + StringRef Path = ParsedURI->body(); + // FIXME(kbobyrev): Currently, this is a heuristic which defines the maximum + // size of resulting vector. Some projects might want to have higher limit if + // the file hierarchy is deeper. For the generic case, it would be useful to + // calculate Limit in the index build stage by calculating the maximum depth + // of the project source tree at runtime. + size_t Limit = 5; + while (!Path.empty() && Limit--) { + // FIXME(kbobyrev): Parsing and encoding path to URIs is not necessary and + // could be optimized. + URI TokenURI(Scheme, Authority, Path); + Result.emplace_back(Token::Kind::PathURI, TokenURI.toString()); + Path = llvm::sys::path::parent_path(Path, llvm::sys::path::Style::posix); + } + return Result; +} + +std::vector<Token> +generateQueryProximityPathURIs(llvm::StringRef AbsolutePath, + llvm::ArrayRef<std::string> URISchemes) { + std::vector<Token> Result; + for (const auto Scheme : URISchemes) { + auto URI = URI::create(AbsolutePath, Scheme); + // For some paths, conversion to different URI schemes is impossible. These + // should be just skipped. + if (!static_cast<bool>(URI)) { + // Ignore the error. + llvm::consumeError(URI.takeError()); + continue; + } + for (const auto ProximityPath : generateProximityPathURIs(URI->toString())) + Result.emplace_back(ProximityPath); + // As soon as one valid URI scheme for given absolute path is found, the + // token generation process should be stopped. + break; + } + return Result; +} + +} // namespace dex +} // namespace clangd +} // namespace clang Index: clang-tools-extra/clangd/index/dex/DexIndex.h =================================================================== --- clang-tools-extra/clangd/index/dex/DexIndex.h +++ clang-tools-extra/clangd/index/dex/DexIndex.h @@ -39,12 +39,21 @@ // index on disk and then load it if available. class DexIndex : public SymbolIndex { public: + /// URISchemes are used to convert proximity paths from FuzzyFindRequest to + /// the desired URIs. URISchemes should contain a list of schemes sorted by + /// the priority, because each proximity path from FuzzyFindRequest will be + /// converted to the first scheme from URISchemes list which is valid for that + /// path. + explicit DexIndex(llvm::ArrayRef<std::string> URISchemes) + : URISchemes(URISchemes) {} + /// \brief (Re-)Build index for `Symbols`. All symbol pointers must remain /// accessible as long as `Symbols` is kept alive. - void build(std::shared_ptr<std::vector<const Symbol *>> Syms); + void build(std::shared_ptr<std::vector<const Symbol *>> Symbols); /// \brief Build index from a symbol slab. - static std::unique_ptr<SymbolIndex> build(SymbolSlab Slab); + static std::unique_ptr<SymbolIndex> + build(SymbolSlab Slab, llvm::ArrayRef<std::string> URISchemes); bool fuzzyFind(const FuzzyFindRequest &Req, @@ -60,23 +69,24 @@ size_t estimateMemoryUsage() const override; private: - mutable std::mutex Mutex; std::shared_ptr<std::vector<const Symbol *>> Symbols /*GUARDED_BY(Mutex)*/; llvm::DenseMap<SymbolID, const Symbol *> LookupTable /*GUARDED_BY(Mutex)*/; - llvm::DenseMap<const Symbol *, float> SymbolQuality /*GUARDED_BY(Mutex)*/; + std::vector<float> SymbolQuality /*GUARDED_BY(Mutex)*/; // Inverted index is a mapping from the search token to the posting list, // which contains all items which can be characterized by such search token. // For example, if the search token is scope "std::", the corresponding // posting list would contain all indices of symbols defined in namespace std. // Inverted index is used to retrieve posting lists which are processed during // the fuzzyFind process. llvm::DenseMap<Token, PostingList> InvertedIndex /*GUARDED_BY(Mutex)*/; + + const std::vector<std::string> URISchemes; }; } // namespace dex } // namespace clangd } // namespace clang -#endif +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANGD_INDEX_DEX_DEXINDEX_H Index: clang-tools-extra/clangd/index/dex/DexIndex.cpp =================================================================== --- clang-tools-extra/clangd/index/dex/DexIndex.cpp +++ clang-tools-extra/clangd/index/dex/DexIndex.cpp @@ -8,6 +8,7 @@ //===----------------------------------------------------------------------===// #include "DexIndex.h" +#include "../../FileDistance.h" #include "../../FuzzyMatch.h" #include "../../Logger.h" #include <algorithm> @@ -26,29 +27,31 @@ // Returns the tokens which are given symbols's characteristics. For example, // trigrams and scopes. // FIXME(kbobyrev): Support more token types: -// * Path proximity // * Types std::vector<Token> generateSearchTokens(const Symbol &Sym) { std::vector<Token> Result = generateIdentifierTrigrams(Sym.Name); Result.emplace_back(Token::Kind::Scope, Sym.Scope); + for (const auto &PathToken : + generateProximityPathURIs(Sym.CanonicalDeclaration.FileURI)) + Result.push_back(PathToken); return Result; } } // namespace void DexIndex::build(std::shared_ptr<std::vector<const Symbol *>> Syms) { llvm::DenseMap<SymbolID, const Symbol *> TempLookupTable; - llvm::DenseMap<const Symbol *, float> TempSymbolQuality; + llvm::DenseMap<const Symbol *, float> SymbolQualityLookup; for (const Symbol *Sym : *Syms) { TempLookupTable[Sym->ID] = Sym; - TempSymbolQuality[Sym] = quality(*Sym); + SymbolQualityLookup[Sym] = quality(*Sym); } // Symbols are sorted by symbol qualities so that items in the posting lists // are stored in the descending order of symbol quality. std::sort(begin(*Syms), end(*Syms), [&](const Symbol *LHS, const Symbol *RHS) { - return TempSymbolQuality[LHS] > TempSymbolQuality[RHS]; + return SymbolQualityLookup[LHS] > SymbolQualityLookup[RHS]; }); llvm::DenseMap<Token, PostingList> TempInvertedIndex; // Populate TempInvertedIndex with posting lists for index symbols. @@ -58,6 +61,10 @@ TempInvertedIndex[Token].push_back(SymbolRank); } + std::vector<float> TempSymbolQuality(Syms->size()); + for (size_t I = 0; I < Syms->size(); ++I) + TempSymbolQuality[I] = SymbolQualityLookup[(*Syms)[I]]; + { std::lock_guard<std::mutex> Lock(Mutex); @@ -72,8 +79,9 @@ estimateMemoryUsage()); } -std::unique_ptr<SymbolIndex> DexIndex::build(SymbolSlab Slab) { - auto Idx = llvm::make_unique<DexIndex>(); +std::unique_ptr<SymbolIndex> +DexIndex::build(SymbolSlab Slab, llvm::ArrayRef<std::string> URISchemes) { + auto Idx = llvm::make_unique<DexIndex>(URISchemes); Idx->build(getSymbolsFromSlab(std::move(Slab))); return std::move(Idx); } @@ -117,6 +125,33 @@ if (!ScopeIterators.empty()) TopLevelChildren.push_back(createOr(move(ScopeIterators))); + // Add proximity paths boosting. + std::vector<std::unique_ptr<Iterator>> BoostingIterators; + // Try to build BOOST iterator for each Proximity Path provided by + // FuzzyFindRequest. Boosting factor should depend on the distance to the + // Proximity Path: the closer processed path is, the higher boosting factor. + for (const auto &ProximityPath : Req.ProximityPaths) { + const auto ProximityPathURIs = + generateQueryProximityPathURIs(ProximityPath, URISchemes); + for (size_t Distance = 0; Distance < ProximityPathURIs.size(); + ++Distance) { + const auto It = InvertedIndex.find(ProximityPathURIs[Distance]); + if (It != InvertedIndex.end()) { + // FIXME(kbobyrev): Append LIMIT on top of every BOOST iterator. + // Each parent directory is boosted by PARENTS_TO_BOOST + 1 - Level. + float BoostFactor = + std::exp(Distance * 0.4f / FileDistanceOptions().UpCost); + BoostingIterators.push_back( + createBoost(create(It->second), BoostFactor)); + } + } + } + // Boosting iterators do not actually filter symbols. In order to preserve + // the validity of resulting query, TRUE iterator should be added along + // BOOSTs. + BoostingIterators.push_back(createTrue(Symbols->size())); + TopLevelChildren.push_back(createOr(move(BoostingIterators))); + // Use TRUE iterator if both trigrams and scopes from the query are not // present in the symbol index. auto QueryIterator = TopLevelChildren.empty() @@ -127,23 +162,35 @@ // FIXME(kbobyrev): Pre-scoring retrieval threshold should be adjusted as // using 100x of the requested number might not be good in practice, e.g. // when the requested number of items is small. - const unsigned ItemsToRetrieve = 100 * Req.MaxCandidateCount; + const size_t ItemsToRetrieve = 100 * Req.MaxCandidateCount; auto Root = createLimit(move(QueryIterator), ItemsToRetrieve); - // FIXME(kbobyrev): Add boosting to the query and utilize retrieved - // boosting scores. + std::vector<std::pair<DocID, float>> SymbolDocIDs = consume(*Root); + // Sort items using a combination of boosting score and quality. + // FIXME(kbobyrev): Consider merging this stage with the next one? This is + // a performance/quality trade-off and if the performance is not an issue + // there's plenty of quality to be bought by sacrificing part of + // performance. + std::sort(begin(SymbolDocIDs), end(SymbolDocIDs), + [&](const std::pair<DocID, float> &LHS, + const std::pair<DocID, float> &RHS) { + return SymbolQuality[LHS.first] * LHS.second > + SymbolQuality[RHS.first] * RHS.second; + }); + // Retrieve top Req.MaxCandidateCount items. + const size_t ItemsToScore = 10 * Req.MaxCandidateCount; std::priority_queue<std::pair<float, const Symbol *>> Top; - for (const auto &P : SymbolDocIDs) { - const DocID SymbolDocID = P.first; + for (size_t I = 0; I < std::min(SymbolDocIDs.size(), ItemsToScore); ++I) { + const DocID SymbolDocID = SymbolDocIDs[I].first; const auto *Sym = (*Symbols)[SymbolDocID]; const llvm::Optional<float> Score = Filter.match(Sym->Name); if (!Score) continue; // Multiply score by a negative factor so that Top stores items with the // highest actual score. - Top.emplace(-(*Score) * SymbolQuality.find(Sym)->second, Sym); + Top.emplace(-(*Score) * SymbolQuality[SymbolDocID], Sym); if (Top.size() > Req.MaxCandidateCount) { More = true; Top.pop(); Index: clang-tools-extra/clangd/CMakeLists.txt =================================================================== --- clang-tools-extra/clangd/CMakeLists.txt +++ clang-tools-extra/clangd/CMakeLists.txt @@ -46,6 +46,7 @@ index/dex/DexIndex.cpp index/dex/Iterator.cpp + index/dex/Token.cpp index/dex/Trigram.cpp LINK_LIBS Index: clang-tools-extra/; =================================================================== --- /dev/null +++ clang-tools-extra/; @@ -0,0 +1,68 @@ +//===--- Token.cpp - Symbol Search primitive --------------------*- C++ -*-===// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#include "Token.h" +#include "URI.h" +#include "llvm/Support/Path.h" +#include <cassert> + +namespace clang { +namespace clangd { +namespace dex { + +std::vector<Token> generateProximityPaths(llvm::StringRef URIPath, + size_t Limit) { + std::vector<Token> Result; + // Don't generate any tokens for empty paths. + if (URIPath.empty()) + return Result; + auto ParsedURI = URI::parse(URIPath); + assert( + static_cast<bool>(ParsedURI) && + "Non-empty argument of generateProximityPaths() should be a valid URI."); + const auto Scheme = ParsedURI.get().scheme(); + const auto Authority = ParsedURI.get().authority(); + StringRef Path = ParsedURI.get().body(); + // Get parent directory of the file with symbol declaration. + Path = llvm::sys::path::parent_path(Path); + while (!Path.empty() && Limit--) { + URI TokenURI(Scheme, Authority, Path); + Result.emplace_back(Token::Kind::PathURI, TokenURI.toString()); + Path = llvm::sys::path::parent_path(Path); + } + return Result; +} + +std::vector<std::pair<Token, unsigned>> +generateQueryProximityPaths(llvm::StringRef AbsolutePath, + llvm::ArrayRef<std::string> URISchemes, + size_t Count) { + std::vector<std::pair<Token, unsigned>> Result; + for (const auto Scheme : URISchemes) { + unsigned Distance = 0; + auto URI = URI::create(AbsolutePath, Scheme); + // For some paths, conversion to different URI schemes is impossible. These + // should be just skipped. + if (!static_cast<bool>(URI)) { + // Ignore the error. + handleAllErrors(URI.takeError(), [](const llvm::ErrorInfoBase &) {}); + continue; + } + for (const auto ProximityPath : + generateProximityPaths(URI.get().toString(), Count)) + Result.emplace_back(ProximityPath, Distance++); + // As soon as one valid URI scheme for given absolute path is found, the + // token generation process should be stopped. + break; + } + return Result; +} + +} // namespace dex +} // namespace clangd +} // namespace clang
_______________________________________________ cfe-commits mailing list cfe-commits@lists.llvm.org http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits