kbobyrev updated this revision to Diff 160576. kbobyrev added a comment. Don't separate the logic for "long" and "short" queries: https://reviews.llvm.org/D50517 (https://reviews.llvm.org/rCTE339548) introduced incomplete trigrams which can be used on for "short" queries, too.
https://reviews.llvm.org/D50337 Files: clang-tools-extra/clangd/CMakeLists.txt clang-tools-extra/clangd/index/dex/DexIndex.cpp clang-tools-extra/clangd/index/dex/DexIndex.h clang-tools-extra/clangd/index/dex/Token.h clang-tools-extra/unittests/clangd/CMakeLists.txt clang-tools-extra/unittests/clangd/DexIndexTests.cpp clang-tools-extra/unittests/clangd/IndexTests.cpp clang-tools-extra/unittests/clangd/TestIndexOperations.cpp clang-tools-extra/unittests/clangd/TestIndexOperations.h
Index: clang-tools-extra/unittests/clangd/TestIndexOperations.cpp =================================================================== --- /dev/null +++ clang-tools-extra/unittests/clangd/TestIndexOperations.cpp @@ -0,0 +1,89 @@ +//===-- IndexHelpers.cpp ----------------------------------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#include "TestIndexOperations.h" + +namespace clang { +namespace clangd { + +Symbol symbol(llvm::StringRef QName) { + Symbol Sym; + Sym.ID = SymbolID(QName.str()); + size_t Pos = QName.rfind("::"); + if (Pos == llvm::StringRef::npos) { + Sym.Name = QName; + Sym.Scope = ""; + } else { + Sym.Name = QName.substr(Pos + 2); + Sym.Scope = QName.substr(0, Pos + 2); + } + return Sym; +} + +// Create a slab of symbols with the given qualified names as both IDs and +// names. The life time of the slab is managed by the returned shared pointer. +// If \p WeakSymbols is provided, it will be pointed to the managed object in +// the returned shared pointer. +std::shared_ptr<std::vector<const Symbol *>> +generateSymbols(std::vector<std::string> QualifiedNames, + std::weak_ptr<SlabAndPointers> *WeakSymbols) { + SymbolSlab::Builder Slab; + for (llvm::StringRef QName : QualifiedNames) + Slab.insert(symbol(QName)); + + auto Storage = std::make_shared<SlabAndPointers>(); + Storage->Slab = std::move(Slab).build(); + for (const auto &Sym : Storage->Slab) + Storage->Pointers.push_back(&Sym); + if (WeakSymbols) + *WeakSymbols = Storage; + auto *Pointers = &Storage->Pointers; + return {std::move(Storage), Pointers}; +} + +// Create a slab of symbols with IDs and names [Begin, End], otherwise identical +// to the `generateSymbols` above. +std::shared_ptr<std::vector<const Symbol *>> +generateNumSymbols(int Begin, int End, + std::weak_ptr<SlabAndPointers> *WeakSymbols) { + std::vector<std::string> Names; + for (int i = Begin; i <= End; i++) + Names.push_back(std::to_string(i)); + return generateSymbols(Names, WeakSymbols); +} + +std::string getQualifiedName(const Symbol &Sym) { + return (Sym.Scope + Sym.Name).str(); +} + +std::vector<std::string> match(const SymbolIndex &I, + const FuzzyFindRequest &Req, bool *Incomplete) { + std::vector<std::string> Matches; + bool IsIncomplete = I.fuzzyFind(Req, [&](const Symbol &Sym) { + Matches.push_back(clang::clangd::getQualifiedName(Sym)); + }); + if (Incomplete) + *Incomplete = IsIncomplete; + return Matches; +} + +// Returns qualified names of symbols with any of IDs in the index. +std::vector<std::string> lookup(const SymbolIndex &I, + llvm::ArrayRef<SymbolID> IDs) { + LookupRequest Req; + Req.IDs.insert(IDs.begin(), IDs.end()); + std::vector<std::string> Results; + I.lookup(Req, [&](const Symbol &Sym) { + Results.push_back(getQualifiedName(Sym)); + }); + return Results; +} + +} // namespace clangd +} // namespace clang Index: clang-tools-extra/unittests/clangd/TestIndexOperations.h =================================================================== --- /dev/null +++ clang-tools-extra/unittests/clangd/TestIndexOperations.h @@ -0,0 +1,57 @@ +//===-- IndexHelpers.h ------------------------------------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_CLANG_TOOLS_EXTRA_UNITTESTS_CLANGD_INDEXTESTCOMMON_H +#define LLVM_CLANG_TOOLS_EXTRA_UNITTESTS_CLANGD_INDEXTESTCOMMON_H + +#include "index/Index.h" +#include "index/Merge.h" +#include "index/dex/DexIndex.h" +#include "index/dex/Iterator.h" +#include "index/dex/Token.h" +#include "index/dex/Trigram.h" + +namespace clang { +namespace clangd { + +Symbol symbol(llvm::StringRef QName); + +struct SlabAndPointers { + SymbolSlab Slab; + std::vector<const Symbol *> Pointers; +}; + +// Create a slab of symbols with the given qualified names as both IDs and +// names. The life time of the slab is managed by the returned shared pointer. +// If \p WeakSymbols is provided, it will be pointed to the managed object in +// the returned shared pointer. +std::shared_ptr<std::vector<const Symbol *>> +generateSymbols(std::vector<std::string> QualifiedNames, + std::weak_ptr<SlabAndPointers> *WeakSymbols = nullptr); + +// Create a slab of symbols with IDs and names [Begin, End], otherwise identical +// to the `generateSymbols` above. +std::shared_ptr<std::vector<const Symbol *>> +generateNumSymbols(int Begin, int End, + std::weak_ptr<SlabAndPointers> *WeakSymbols = nullptr); + +std::string getQualifiedName(const Symbol &Sym); + +std::vector<std::string> match(const SymbolIndex &I, + const FuzzyFindRequest &Req, + bool *Incomplete = nullptr); + +// Returns qualified names of symbols with any of IDs in the index. +std::vector<std::string> lookup(const SymbolIndex &I, + llvm::ArrayRef<SymbolID> IDs); + +} // namespace clangd +} // namespace clang + +#endif Index: clang-tools-extra/unittests/clangd/IndexTests.cpp =================================================================== --- clang-tools-extra/unittests/clangd/IndexTests.cpp +++ clang-tools-extra/unittests/clangd/IndexTests.cpp @@ -7,33 +7,20 @@ // //===----------------------------------------------------------------------===// +#include "TestIndexOperations.h" #include "index/Index.h" #include "index/MemIndex.h" #include "index/Merge.h" #include "gmock/gmock.h" #include "gtest/gtest.h" -using testing::UnorderedElementsAre; using testing::Pointee; +using testing::UnorderedElementsAre; namespace clang { namespace clangd { namespace { -Symbol symbol(llvm::StringRef QName) { - Symbol Sym; - Sym.ID = SymbolID(QName.str()); - size_t Pos = QName.rfind("::"); - if (Pos == llvm::StringRef::npos) { - Sym.Name = QName; - Sym.Scope = ""; - } else { - Sym.Name = QName.substr(Pos + 2); - Sym.Scope = QName.substr(0, Pos + 2); - } - return Sym; -} - MATCHER_P(Named, N, "") { return arg.Name == N; } TEST(SymbolSlab, FindAndIterate) { @@ -52,59 +39,6 @@ EXPECT_THAT(*S.find(SymbolID(Sym)), Named(Sym)); } -struct SlabAndPointers { - SymbolSlab Slab; - std::vector<const Symbol *> Pointers; -}; - -// Create a slab of symbols with the given qualified names as both IDs and -// names. The life time of the slab is managed by the returned shared pointer. -// If \p WeakSymbols is provided, it will be pointed to the managed object in -// the returned shared pointer. -std::shared_ptr<std::vector<const Symbol *>> -generateSymbols(std::vector<std::string> QualifiedNames, - std::weak_ptr<SlabAndPointers> *WeakSymbols = nullptr) { - SymbolSlab::Builder Slab; - for (llvm::StringRef QName : QualifiedNames) - Slab.insert(symbol(QName)); - - auto Storage = std::make_shared<SlabAndPointers>(); - Storage->Slab = std::move(Slab).build(); - for (const auto &Sym : Storage->Slab) - Storage->Pointers.push_back(&Sym); - if (WeakSymbols) - *WeakSymbols = Storage; - auto *Pointers = &Storage->Pointers; - return {std::move(Storage), Pointers}; -} - -// Create a slab of symbols with IDs and names [Begin, End], otherwise identical -// to the `generateSymbols` above. -std::shared_ptr<std::vector<const Symbol *>> -generateNumSymbols(int Begin, int End, - std::weak_ptr<SlabAndPointers> *WeakSymbols = nullptr) { - std::vector<std::string> Names; - for (int i = Begin; i <= End; i++) - Names.push_back(std::to_string(i)); - return generateSymbols(Names, WeakSymbols); -} - -std::string getQualifiedName(const Symbol &Sym) { - return (Sym.Scope + Sym.Name).str(); -} - -std::vector<std::string> match(const SymbolIndex &I, - const FuzzyFindRequest &Req, - bool *Incomplete = nullptr) { - std::vector<std::string> Matches; - bool IsIncomplete = I.fuzzyFind(Req, [&](const Symbol &Sym) { - Matches.push_back(getQualifiedName(Sym)); - }); - if (Incomplete) - *Incomplete = IsIncomplete; - return Matches; -} - TEST(MemIndexTest, MemIndexSymbolsRecycled) { MemIndex I; std::weak_ptr<SlabAndPointers> Symbols; @@ -212,18 +146,6 @@ EXPECT_THAT(match(I, Req), UnorderedElementsAre("ns::ABC", "ns::abc")); } -// Returns qualified names of symbols with any of IDs in the index. -std::vector<std::string> lookup(const SymbolIndex &I, - llvm::ArrayRef<SymbolID> IDs) { - LookupRequest Req; - Req.IDs.insert(IDs.begin(), IDs.end()); - std::vector<std::string> Results; - I.lookup(Req, [&](const Symbol &Sym) { - Results.push_back(getQualifiedName(Sym)); - }); - return Results; -} - TEST(MemIndexTest, Lookup) { MemIndex I; I.build(generateSymbols({"ns::abc", "ns::xyz"})); @@ -269,7 +191,7 @@ TEST(MergeTest, Merge) { Symbol L, R; L.ID = R.ID = SymbolID("hello"); - L.Name = R.Name = "Foo"; // same in both + L.Name = R.Name = "Foo"; // same in both L.CanonicalDeclaration.FileURI = "file:///left.h"; // differs R.CanonicalDeclaration.FileURI = "file:///right.h"; L.References = 1; Index: clang-tools-extra/unittests/clangd/DexIndexTests.cpp =================================================================== --- clang-tools-extra/unittests/clangd/DexIndexTests.cpp +++ clang-tools-extra/unittests/clangd/DexIndexTests.cpp @@ -7,6 +7,10 @@ // //===----------------------------------------------------------------------===// +#include "TestIndexOperations.h" +#include "index/Index.h" +#include "index/Merge.h" +#include "index/dex/DexIndex.h" #include "index/dex/Iterator.h" #include "index/dex/Token.h" #include "index/dex/Trigram.h" @@ -17,11 +21,13 @@ #include <string> #include <vector> +using ::testing::ElementsAre; +using ::testing::UnorderedElementsAre; + namespace clang { namespace clangd { namespace dex { - -using ::testing::ElementsAre; +namespace { TEST(DexIndexIterators, DocumentIterator) { const PostingList L = {4, 7, 8, 20, 42, 100}; @@ -333,6 +339,223 @@ "hij", "ijk", "jkl", "klm"})); } +TEST(DexIndex, Lookup) { + DexIndex I; + I.build(generateSymbols({"ns::abc", "ns::xyz"})); + EXPECT_THAT(lookup(I, SymbolID("ns::abc")), UnorderedElementsAre("ns::abc")); + EXPECT_THAT(lookup(I, {SymbolID("ns::abc"), SymbolID("ns::xyz")}), + UnorderedElementsAre("ns::abc", "ns::xyz")); + EXPECT_THAT(lookup(I, {SymbolID("ns::nonono"), SymbolID("ns::xyz")}), + UnorderedElementsAre("ns::xyz")); + EXPECT_THAT(lookup(I, SymbolID("ns::nonono")), UnorderedElementsAre()); +} + +// FIXME(kbobyrev): Use 3+ symbols long names so that trigram index is used. +TEST(MergeDexIndex, Lookup) { + DexIndex I, J; + I.build(generateSymbols({"ns::A", "ns::B"})); + J.build(generateSymbols({"ns::B", "ns::C"})); + EXPECT_THAT(lookup(*mergeIndex(&I, &J), SymbolID("ns::A")), + UnorderedElementsAre("ns::A")); + EXPECT_THAT(lookup(*mergeIndex(&I, &J), SymbolID("ns::B")), + UnorderedElementsAre("ns::B")); + EXPECT_THAT(lookup(*mergeIndex(&I, &J), SymbolID("ns::C")), + UnorderedElementsAre("ns::C")); + EXPECT_THAT( + lookup(*mergeIndex(&I, &J), {SymbolID("ns::A"), SymbolID("ns::B")}), + UnorderedElementsAre("ns::A", "ns::B")); + EXPECT_THAT( + lookup(*mergeIndex(&I, &J), {SymbolID("ns::A"), SymbolID("ns::C")}), + UnorderedElementsAre("ns::A", "ns::C")); + EXPECT_THAT(lookup(*mergeIndex(&I, &J), SymbolID("ns::D")), + UnorderedElementsAre()); + EXPECT_THAT(lookup(*mergeIndex(&I, &J), {}), UnorderedElementsAre()); +} + +// FIXME(kbobyrev): Add more tests on DexIndex? Right now, it's mostly a wrapper +// around DexIndex, adopting tests from IndexTests.cpp sounds reasonable. +// However, most of these tests use short names (<3 symbols) for symbol lookups, +// which currently are dispatched to DexIndex and hence just duplicating these +// tests while substituting DexIndex with DexIndex is not a viable solution. +TEST(DexIndex, FuzzyFind) { + DexIndex Index; + Index.build(generateSymbols( + {"ns::ABC", "ns::BCD", "::ABC", "ns::nested::ABC", "other::ABC"})); + FuzzyFindRequest Req; + Req.Query = "ABC"; + Req.Scopes = {"ns::"}; + EXPECT_THAT(match(Index, Req), UnorderedElementsAre("ns::ABC")); + Req.Scopes = {"ns::", "ns::nested::"}; + EXPECT_THAT(match(Index, Req), + UnorderedElementsAre("ns::ABC", "ns::nested::ABC")); +} + +TEST(DexIndexTest, FuzzyMatchQ) { + DexIndex I; + I.build( + generateSymbols({"LaughingOutLoud", "LionPopulation", "LittleOldLady"})); + FuzzyFindRequest Req; + Req.Query = "lol"; + Req.MaxCandidateCount = 2; + EXPECT_THAT(match(I, Req), + UnorderedElementsAre("LaughingOutLoud", "LittleOldLady")); +} + +TEST(DexIndexTest, DexIndexSymbolsRecycled) { + DexIndex I; + std::weak_ptr<SlabAndPointers> Symbols; + I.build(generateNumSymbols(0, 10, &Symbols)); + FuzzyFindRequest Req; + Req.Query = "7"; + EXPECT_THAT(match(I, Req), UnorderedElementsAre("7")); + + EXPECT_FALSE(Symbols.expired()); + // Release old symbols. + I.build(generateNumSymbols(0, 0)); + EXPECT_TRUE(Symbols.expired()); +} + +// FIXME(kbobyrev): This test is different for DexIndex and MemIndex: while +// MemIndex manages response deduplication, DexIndex simply returns all matched +// symbols which means there might be equivalent symbols in the response. +// Before drop-in replacement of MemIndex with DexIndex happens, FileIndex +// should handle deduplication instead. +TEST(DexIndexTest, DexIndexDeduplicate) { + auto Symbols = generateNumSymbols(0, 10); + + // Inject duplicates. + auto Sym = symbol("7"); + Symbols->push_back(&Sym); + Symbols->push_back(&Sym); + Symbols->push_back(&Sym); + + FuzzyFindRequest Req; + Req.Query = "7"; + DexIndex I; + I.build(std::move(Symbols)); + auto Matches = match(I, Req); + EXPECT_EQ(Matches.size(), 4u); +} + +TEST(DexIndexTest, DexIndexLimitedNumMatches) { + DexIndex I; + I.build(generateNumSymbols(0, 100)); + FuzzyFindRequest Req; + Req.Query = "5"; + Req.MaxCandidateCount = 3; + bool Incomplete; + auto Matches = match(I, Req, &Incomplete); + EXPECT_EQ(Matches.size(), Req.MaxCandidateCount); + EXPECT_TRUE(Incomplete); +} + +TEST(DexIndexTest, FuzzyMatch) { + DexIndex I; + I.build( + generateSymbols({"LaughingOutLoud", "LionPopulation", "LittleOldLady"})); + FuzzyFindRequest Req; + Req.Query = "lol"; + Req.MaxCandidateCount = 2; + EXPECT_THAT(match(I, Req), + UnorderedElementsAre("LaughingOutLoud", "LittleOldLady")); +} + +TEST(DexIndexTest, MatchQualifiedNamesWithoutSpecificScope) { + DexIndex I; + I.build(generateSymbols({"a::y1", "b::y2", "y3"})); + FuzzyFindRequest Req; + Req.Query = "y"; + EXPECT_THAT(match(I, Req), UnorderedElementsAre("a::y1", "b::y2", "y3")); +} + +TEST(DexIndexTest, MatchQualifiedNamesWithGlobalScope) { + DexIndex I; + I.build(generateSymbols({"a::y1", "b::y2", "y3"})); + FuzzyFindRequest Req; + Req.Query = "y"; + Req.Scopes = {""}; + EXPECT_THAT(match(I, Req), UnorderedElementsAre("y3")); +} + +TEST(DexIndexTest, MatchQualifiedNamesWithOneScope) { + DexIndex I; + I.build(generateSymbols({"a::y1", "a::y2", "a::x", "b::y2", "y3"})); + FuzzyFindRequest Req; + Req.Query = "y"; + Req.Scopes = {"a::"}; + EXPECT_THAT(match(I, Req), UnorderedElementsAre("a::y1", "a::y2")); +} + +TEST(DexIndexTest, MatchQualifiedNamesWithMultipleScopes) { + DexIndex I; + I.build(generateSymbols({"a::y1", "a::y2", "a::x", "b::y3", "y3"})); + FuzzyFindRequest Req; + Req.Query = "y"; + Req.Scopes = {"a::", "b::"}; + EXPECT_THAT(match(I, Req), UnorderedElementsAre("a::y1", "a::y2", "b::y3")); +} + +TEST(DexIndexTest, NoMatchNestedScopes) { + DexIndex I; + I.build(generateSymbols({"a::y1", "a::b::y2"})); + FuzzyFindRequest Req; + Req.Query = "y"; + Req.Scopes = {"a::"}; + EXPECT_THAT(match(I, Req), UnorderedElementsAre("a::y1")); +} + +TEST(DexIndexTest, IgnoreCases) { + DexIndex I; + I.build(generateSymbols({"ns::ABC", "ns::abc"})); + FuzzyFindRequest Req; + Req.Query = "AB"; + Req.Scopes = {"ns::"}; + EXPECT_THAT(match(I, Req), UnorderedElementsAre("ns::ABC", "ns::abc")); +} + +TEST(DexIndexTest, Lookup) { + DexIndex I; + I.build(generateSymbols({"ns::abc", "ns::xyz"})); + EXPECT_THAT(lookup(I, SymbolID("ns::abc")), UnorderedElementsAre("ns::abc")); + EXPECT_THAT(lookup(I, {SymbolID("ns::abc"), SymbolID("ns::xyz")}), + UnorderedElementsAre("ns::abc", "ns::xyz")); + EXPECT_THAT(lookup(I, {SymbolID("ns::nonono"), SymbolID("ns::xyz")}), + UnorderedElementsAre("ns::xyz")); + EXPECT_THAT(lookup(I, SymbolID("ns::nonono")), UnorderedElementsAre()); +} + +TEST(DexMergeIndexTest, Lookup) { + DexIndex I, J; + I.build(generateSymbols({"ns::A", "ns::B"})); + J.build(generateSymbols({"ns::B", "ns::C"})); + EXPECT_THAT(lookup(*mergeIndex(&I, &J), SymbolID("ns::A")), + UnorderedElementsAre("ns::A")); + EXPECT_THAT(lookup(*mergeIndex(&I, &J), SymbolID("ns::B")), + UnorderedElementsAre("ns::B")); + EXPECT_THAT(lookup(*mergeIndex(&I, &J), SymbolID("ns::C")), + UnorderedElementsAre("ns::C")); + EXPECT_THAT( + lookup(*mergeIndex(&I, &J), {SymbolID("ns::A"), SymbolID("ns::B")}), + UnorderedElementsAre("ns::A", "ns::B")); + EXPECT_THAT( + lookup(*mergeIndex(&I, &J), {SymbolID("ns::A"), SymbolID("ns::C")}), + UnorderedElementsAre("ns::A", "ns::C")); + EXPECT_THAT(lookup(*mergeIndex(&I, &J), SymbolID("ns::D")), + UnorderedElementsAre()); + EXPECT_THAT(lookup(*mergeIndex(&I, &J), {}), UnorderedElementsAre()); +} + +TEST(DexMergeIndexTest, FuzzyFind) { + DexIndex I, J; + I.build(generateSymbols({"ns::A", "ns::B"})); + J.build(generateSymbols({"ns::B", "ns::C"})); + FuzzyFindRequest Req; + Req.Scopes = {"ns::"}; + EXPECT_THAT(match(*mergeIndex(&I, &J), Req), + UnorderedElementsAre("ns::A", "ns::B", "ns::C")); +} + +} // namespace } // namespace dex } // namespace clangd } // namespace clang Index: clang-tools-extra/unittests/clangd/CMakeLists.txt =================================================================== --- clang-tools-extra/unittests/clangd/CMakeLists.txt +++ clang-tools-extra/unittests/clangd/CMakeLists.txt @@ -30,6 +30,7 @@ SyncAPI.cpp TUSchedulerTests.cpp TestFS.cpp + TestIndexOperations.cpp TestTU.cpp ThreadingTests.cpp TraceTests.cpp Index: clang-tools-extra/clangd/index/dex/Token.h =================================================================== --- clang-tools-extra/clangd/index/dex/Token.h +++ clang-tools-extra/clangd/index/dex/Token.h @@ -22,9 +22,9 @@ #ifndef LLVM_CLANG_TOOLS_EXTRA_CLANGD_DEX_TOKEN_H #define LLVM_CLANG_TOOLS_EXTRA_CLANGD_DEX_TOKEN_H +#include "../Index.h" #include "llvm/ADT/DenseMap.h" #include "llvm/Support/raw_ostream.h" - #include <string> #include <vector> Index: clang-tools-extra/clangd/index/dex/DexIndex.cpp =================================================================== --- /dev/null +++ clang-tools-extra/clangd/index/dex/DexIndex.cpp @@ -0,0 +1,250 @@ +//===--- DexIndex.cpp - Dex Symbol Index Implementation ---------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#include "DexIndex.h" +#include "../../FuzzyMatch.h" +#include "../../Logger.h" +#include <algorithm> + +namespace clang { +namespace clangd { +namespace dex { + +namespace { + +// Applies Callback to each symbol from the Scores in the order of decreasing +// symbol quality. +void useCallback(llvm::function_ref<void(const Symbol &)> Callback, + std::vector<std::pair<float, const Symbol *>> &Scores) { + // First, sort retrieved symbols by the cumulative score to apply callbacks + // in the order of descending score. + std::sort(begin(Scores), end(Scores), + [](const std::pair<float, const Symbol *> &LHS, + const std::pair<float, const Symbol *> &RHS) { + return LHS.first > RHS.first; + }); + + for (size_t CandidateRank = 0; CandidateRank < Scores.size(); ++CandidateRank) + Callback(*Scores[CandidateRank].second); +} + +// Determines whether query contains enough letters to construct a trigram. +bool isShortQuery(std::string Query) { + // Apply fuzzy matching text segmentation. + std::vector<CharRole> Roles(Query.size()); + calculateRoles(Query, llvm::makeMutableArrayRef(Roles.data(), Query.size())); + + unsigned ValidSymbols = 0; + for (const auto &Role : Roles) { + if (Role == Head || Role == Tail) + ++ValidSymbols; + if (ValidSymbols >= 3) + return true; + } + + return false; +} + +// Returns the tokens which are given symbol's characteristics. Currently, the +// generated tokens only contain fuzzy matching trigrams and symbol's scope, +// but in the future this will also return path proximity tokens and other +// types of tokens such as symbol type (if applicable). +// Returns the tokens which are given symbols's characteristics. For example, +// trigrams and scopes. +// FIXME(kbobyrev): Support more token types: +// * Path proximity +// * Types +std::vector<Token> generateSearchTokens(const Symbol &Sym) { + std::vector<Token> Result = generateIdentifierTrigrams(Sym.Name); + Result.push_back(Token(Token::Kind::Scope, Sym.Scope)); + return Result; +} + +} // namespace + +void DexIndex::build(std::shared_ptr<std::vector<const Symbol *>> Syms) { + llvm::DenseMap<SymbolID, const Symbol *> TempLookupTable; + llvm::DenseMap<const Symbol *, float> TempSymbolQuality; + for (const Symbol *Sym : *Syms) { + TempLookupTable[Sym->ID] = Sym; + TempSymbolQuality[Sym] = quality(*Sym); + } + + // Symbols are sorted by symbol qualities so that items in the posting lists + // are stored in the descending order of symbol quality. + // FIXME(kbobyrev): Should we also store symbol quality? It is used later in + // fuzzy matching stage, too. If measuring symbol quality gets expensive, this + // might be more efficient. + std::sort(begin(*Syms), end(*Syms), + [&](const Symbol *LHS, const Symbol *RHS) { + return TempSymbolQuality[LHS] > TempSymbolQuality[RHS]; + }); + llvm::DenseMap<Token, PostingList> TempInvertedIndex; + // Populate TempInvertedIndex with posting lists for index symbols. + for (DocID SymbolRank = 0; SymbolRank < Syms->size(); ++SymbolRank) { + const auto *Sym = (*Syms)[SymbolRank]; + for (const auto &Token : generateSearchTokens(*Sym)) + TempInvertedIndex[Token].push_back(SymbolRank); + } + + { + std::lock_guard<std::mutex> Lock(Mutex); + + // Replace outdated index with the new one. + LookupTable = std::move(TempLookupTable); + Symbols = std::move(Syms); + InvertedIndex = std::move(TempInvertedIndex); + SymbolQuality = std::move(TempSymbolQuality); + } +} + +bool DexIndex::fuzzyFind( + const FuzzyFindRequest &Req, + llvm::function_ref<void(const Symbol &)> Callback) const { + assert(!StringRef(Req.Query).contains("::") && + "There must be no :: in query."); + FuzzyMatcher Filter(Req.Query); + bool More = false; + // The separation of long queries and short queries is temporary and will be + // removed once incomplete trigrams are generated from both identifiers and + // queries. + More = isShortQuery(Req.Query) ? fuzzyFindLongQuery(Callback, Req) + : fuzzyFindShortQuery(Callback, Req); + return More; +} + +/// Handles fuzzy matching requests for which trigrams can not be generated. +/// This approach currently does not use trigram index and query iterators and +/// will be revisited in the future. +// FIXME(kbobyrev): Implement short queries (<3 length) using posting lists and +// incomplete trigrams. Unigram and bigram posting lists are likely to be very +// dense, which is why posting list compression should be also implemented in +// that approach. +bool DexIndex::fuzzyFindShortQuery( + llvm::function_ref<void(const Symbol &)> Callback, + const FuzzyFindRequest &Req) const { + bool More = false; + std::vector<std::pair<float, const Symbol *>> Scores; + FuzzyMatcher Filter(Req.Query); + + { + std::lock_guard<std::mutex> Lock(Mutex); + + // FIXME(kbobyrev): This can be very expensive. We should first filter + // symbols by scopes and types using query iterators. + for (DocID ID = 0; ID < Symbols->size(); ++ID) { + const auto *Sym = (*Symbols)[ID]; + const llvm::Optional<float> Score = Filter.match(Sym->Name); + // Match scopes from the query. + if (!Req.Scopes.empty() && !llvm::is_contained(Req.Scopes, Sym->Scope)) + continue; + if (Req.RestrictForCodeCompletion && !Sym->IsIndexedForCodeCompletion) + continue; + if (Score) { + if (Scores.size() == Req.MaxCandidateCount) { + More = true; + break; + } + Scores.push_back( + std::make_pair(*Score * SymbolQuality.find(Sym)->second, Sym)); + } + } + useCallback(Callback, Scores); + } + return More; +} + +/// Constructs iterators over tokens extracted from the query and exhausts it +/// while applying Callback to each symbol in the order of decreasing quality +/// of the matched symbols. +bool DexIndex::fuzzyFindLongQuery( + llvm::function_ref<void(const Symbol &)> Callback, + const FuzzyFindRequest &Req) const { + bool More = false; + std::vector<DocID> SymbolDocIDs; + // FIXME(kbobyrev): Scoring all matched symbols and then processing + // MaxCandidateCount items is rather expensive, this should be replaced by + // boosting and two-stage filtering at some point as proposed in Dex design + // document. + std::vector<std::pair<float, const Symbol *>> Scores; + FuzzyMatcher Filter(Req.Query); + + { + std::lock_guard<std::mutex> Lock(Mutex); + + // Generate query trigrams and construct AND iterator over all query + // trigrams. + const auto TrigramTokens = generateIdentifierTrigrams(Req.Query); + std::vector<std::unique_ptr<Iterator>> TrigramIterators; + for (const auto &Trigram : TrigramTokens) { + const auto It = InvertedIndex.find(Trigram); + if (It != InvertedIndex.end()) + TrigramIterators.push_back(create(It->second)); + } + + std::vector<std::unique_ptr<Iterator>> TopLevelChildren; + TopLevelChildren.push_back(createAnd(move(TrigramIterators))); + + // Add OR iterator for scopes if the request contains scopes. + if (!Req.Scopes.empty()) { + // Generate scope tokens for search query. + std::vector<std::unique_ptr<Iterator>> ScopeIterators; + for (const auto &Scope : Req.Scopes) { + const auto It = InvertedIndex.find(Token(Token::Kind::Scope, Scope)); + if (It != InvertedIndex.end()) + ScopeIterators.push_back(create(It->second)); + } + TopLevelChildren.push_back(createOr(move(ScopeIterators))); + } + + auto QueryIterator = createAnd(move(TopLevelChildren)); + // FIXME(kbobyrev): Limit the total number of consumed symbols using + // Req.MaxCandidateCount. That would require properly implementing Limit + // iterator. + SymbolDocIDs = consume(*QueryIterator, Req.MaxCandidateCount); + More = SymbolDocIDs.size() == Req.MaxCandidateCount && + !QueryIterator->reachedEnd(); + + // Populate scores. + // FIXME(kbobyrev): A more generic way of populating scores would be + // passing a callback to iterator consumer, otherwise SymbolDocIDs are + // stored but not actually used elsewhere. + for (size_t SymbolIdx = 0; SymbolIdx < SymbolDocIDs.size(); ++SymbolIdx) { + const auto Sym = (*Symbols)[SymbolDocIDs[SymbolIdx]]; + const auto Score = Filter.match(Sym->Name); + assert(Score.hasValue() && "Matched symbol has all Fuzzy Matching " + "trigrams, it should match the query"); + Scores.push_back( + std::make_pair(*Score * SymbolQuality.find(Sym)->second, Sym)); + } + + useCallback(Callback, Scores); + } + + return More; +} + +void DexIndex::lookup(const LookupRequest &Req, + llvm::function_ref<void(const Symbol &)> Callback) const { + for (const auto &ID : Req.IDs) { + auto I = LookupTable.find(ID); + if (I != LookupTable.end()) + Callback(*I->second); + } +} + +void DexIndex::findOccurrences( + const OccurrencesRequest &Req, + llvm::function_ref<void(const SymbolOccurrence &)> Callback) const { + log("findOccurrences is not implemented."); +} + +} // namespace dex +} // namespace clangd +} // namespace clang Index: clang-tools-extra/clangd/index/dex/DexIndex.h =================================================================== --- /dev/null +++ clang-tools-extra/clangd/index/dex/DexIndex.h @@ -0,0 +1,81 @@ +//===--- DexIndex.h - Dex Symbol Index Implementation -----------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This defines Dex - a symbol index implementation based on query iterators +// over symbol tokens, such as fuzzy matching trigrams, scopes, types, etc. +// While consuming more memory and having longer build stage due to +// preprocessing, Dex will have substantially lower latency. It will also allow +// efficient symbol searching which is crucial for operations like code +// completion, and can be very important for a number of different code +// transformations which will be eventually supported by Clangd. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANGD_INDEX_DEX_DEXINDEX_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANGD_INDEX_DEX_DEXINDEX_H + +#include "../Index.h" +#include "../MemIndex.h" +#include "Iterator.h" +#include "Token.h" +#include "Trigram.h" +#include <mutex> + +namespace clang { +namespace clangd { +namespace dex { + +/// In-memory Dex trigram-based index implementation. +// FIXME(kbobyrev): Introduce serialization and deserialization of the symbol +// index so that it can be loaded from the disk. Since static index is not +// changed frequently, it's safe to assume that it has to be built only once +// (when the clangd process starts). Therefore, it can be easier to store built +// index on disk and then load it if available. +class DexIndex : public SymbolIndex { +public: + /// \brief (Re-)Build index for `Symbols`. All symbol pointers must remain + /// accessible as long as `Symbols` is kept alive. + void build(std::shared_ptr<std::vector<const Symbol *>> Symbols); + + bool + fuzzyFind(const FuzzyFindRequest &Req, + llvm::function_ref<void(const Symbol &)> Callback) const override; + + virtual void + lookup(const LookupRequest &Req, + llvm::function_ref<void(const Symbol &)> Callback) const override; + + void findOccurrences(const OccurrencesRequest &Req, + llvm::function_ref<void(const SymbolOccurrence &)> + Callback) const override; + +private: + bool fuzzyFindLongQuery(llvm::function_ref<void(const Symbol &)> Callback, + const FuzzyFindRequest &Req) const; + bool fuzzyFindShortQuery(llvm::function_ref<void(const Symbol &)> Callback, + const FuzzyFindRequest &Req) const; + + std::shared_ptr<std::vector<const Symbol *>> Symbols; + llvm::DenseMap<SymbolID, const Symbol *> LookupTable; + llvm::DenseMap<const Symbol *, float> SymbolQuality; + mutable std::mutex Mutex; + // Inverted index is a mapping from the search token to the posting list, + // which contains all items which can be characterized by such search token. + // For example, if the search token is scope "std::", the corresponding + // posting list would contain all indices of symbols defined in namespace std. + // Inverted index is used to retrieve posting lists which are processed during + // the fuzzyFind process. + llvm::DenseMap<Token, PostingList> InvertedIndex; +}; + +} // namespace dex +} // namespace clangd +} // namespace clang + +#endif Index: clang-tools-extra/clangd/CMakeLists.txt =================================================================== --- clang-tools-extra/clangd/CMakeLists.txt +++ clang-tools-extra/clangd/CMakeLists.txt @@ -43,6 +43,7 @@ index/SymbolCollector.cpp index/SymbolYAML.cpp + index/dex/DexIndex.cpp index/dex/Iterator.cpp index/dex/Trigram.cpp
_______________________________________________ cfe-commits mailing list cfe-commits@lists.llvm.org http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits