[PATCH] D50517: [clangd] Generate incomplete trigrams for the Dex index

Kirill Bobyrev via Phabricator via cfe-commits Fri, 10 Aug 2018 03:12:54 -0700

kbobyrev updated this revision to Diff 160081.
kbobyrev marked 5 inline comments as done.
kbobyrev added a comment.


Address a round of comments.

I have added few comments to get additional feedback before further changes are 
made.


https://reviews.llvm.org/D50517

Files:
  clang-tools-extra/clangd/index/dex/Iterator.h
  clang-tools-extra/clangd/index/dex/Trigram.cpp
  clang-tools-extra/clangd/index/dex/Trigram.h
  clang-tools-extra/unittests/clangd/DexIndexTests.cpp

Index: clang-tools-extra/unittests/clangd/DexIndexTests.cpp
===================================================================
--- clang-tools-extra/unittests/clangd/DexIndexTests.cpp
+++ clang-tools-extra/unittests/clangd/DexIndexTests.cpp
@@ -250,45 +250,55 @@
 }
 
 TEST(DexIndexTrigrams, IdentifierTrigrams) {
-  EXPECT_THAT(generateIdentifierTrigrams("X86"), trigramsAre({"x86"}));
+  EXPECT_THAT(generateIdentifierTrigrams("X86"),
+              trigramsAre({"x86", "x$$", "x8$"}));
 
-  EXPECT_THAT(generateIdentifierTrigrams("nl"), trigramsAre({}));
+  EXPECT_THAT(generateIdentifierTrigrams("nl"), trigramsAre({"nl$", "n$$"}));
+
+  EXPECT_THAT(generateIdentifierTrigrams("n"), trigramsAre({"n$$"}));
 
   EXPECT_THAT(generateIdentifierTrigrams("clangd"),
-              trigramsAre({"cla", "lan", "ang", "ngd"}));
+              trigramsAre({"c$$", "cl$", "cla", "lan", "ang", "ngd"}));
 
   EXPECT_THAT(generateIdentifierTrigrams("abc_def"),
-              trigramsAre({"abc", "abd", "ade", "bcd", "bde", "cde", "def"}));
+              trigramsAre({"a$$", "d$$", "abc", "abd", "ade", "bcd", "bde",
+                           "cde", "def", "ab$", "ad$", "de$"}));
 
-  EXPECT_THAT(
-      generateIdentifierTrigrams("a_b_c_d_e_"),
-      trigramsAre({"abc", "abd", "acd", "ace", "bcd", "bce", "bde", "cde"}));
+  EXPECT_THAT(generateIdentifierTrigrams("a_b_c_d_e_"),
+              trigramsAre({"a$$", "b$$", "c$$", "d$$", "e$$", "abc", "abd",
+                           "acd", "ace", "bcd", "bce", "bde", "cde", "ab$",
+                           "ac$", "bc$", "bd$", "cd$", "ce$", "de$"}));
 
-  EXPECT_THAT(
-      generateIdentifierTrigrams("unique_ptr"),
-      trigramsAre({"uni", "unp", "upt", "niq", "nip", "npt", "iqu", "iqp",
-                   "ipt", "que", "qup", "qpt", "uep", "ept", "ptr"}));
+  EXPECT_THAT(generateIdentifierTrigrams("unique_ptr"),
+              trigramsAre({"u$$", "p$$", "uni", "unp", "upt", "niq", "nip",
+                           "npt", "iqu", "iqp", "ipt", "que", "qup", "qpt",
+                           "uep", "ept", "ptr", "un$", "up$", "pt$"}));
 
   EXPECT_THAT(generateIdentifierTrigrams("TUDecl"),
-              trigramsAre({"tud", "tde", "ude", "dec", "ecl"}));
-
-  EXPECT_THAT(generateIdentifierTrigrams("IsOK"),
-              trigramsAre({"iso", "iok", "sok"}));
-
-  EXPECT_THAT(generateIdentifierTrigrams("abc_defGhij__klm"),
-              trigramsAre({
-                  "abc", "abd", "abg", "ade", "adg", "adk", "agh", "agk", "bcd",
-                  "bcg", "bde", "bdg", "bdk", "bgh", "bgk", "cde", "cdg", "cdk",
-                  "cgh", "cgk", "def", "deg", "dek", "dgh", "dgk", "dkl", "efg",
-                  "efk", "egh", "egk", "ekl", "fgh", "fgk", "fkl", "ghi", "ghk",
-                  "gkl", "hij", "hik", "hkl", "ijk", "ikl", "jkl", "klm",
-              }));
+              trigramsAre({"t$$", "d$$", "tud", "tde", "ude", "dec", "ecl",
+                           "tu$", "td$", "de$"}));
+
+  EXPECT_THAT(
+      generateIdentifierTrigrams("IsOK"),
+      trigramsAre({"i$$", "o$$", "iso", "iok", "sok", "is$", "io$", "ok$"}));
+
+  EXPECT_THAT(
+      generateIdentifierTrigrams("abc_defGhij__klm"),
+      trigramsAre(
+          {"a$$", "d$$", "g$$", "k$$", "abc", "abd", "abg", "ade", "adg", "adk",
+           "agh", "agk", "bcd", "bcg", "bde", "bdg", "bdk", "bgh", "bgk", "cde",
+           "cdg", "cdk", "cgh", "cgk", "def", "deg", "dek", "dgh", "dgk", "dkl",
+           "efg", "efk", "egh", "egk", "ekl", "fgh", "fgk", "fkl", "ghi", "ghk",
+           "gkl", "hij", "hik", "hkl", "ijk", "ikl", "jkl", "klm", "ab$", "ad$",
+           "ag$", "de$", "dg$", "dk$", "gh$", "gk$", "kl$"}));
 }
 
 TEST(DexIndexTrigrams, QueryTrigrams) {
-  EXPECT_THAT(generateQueryTrigrams("X86"), trigramsAre({"x86"}));
+  EXPECT_THAT(generateQueryTrigrams("c"), trigramsAre({"c$$"}));
+  EXPECT_THAT(generateQueryTrigrams("cl"), trigramsAre({"cl$"}));
+  EXPECT_THAT(generateQueryTrigrams("cla"), trigramsAre({"cla"}));
 
-  EXPECT_THAT(generateQueryTrigrams("nl"), trigramsAre({}));
+  EXPECT_THAT(generateQueryTrigrams("X86"), trigramsAre({"x86"}));
 
   EXPECT_THAT(generateQueryTrigrams("clangd"),
               trigramsAre({"cla", "lan", "ang", "ngd"}));
Index: clang-tools-extra/clangd/index/dex/Trigram.h
===================================================================
--- clang-tools-extra/clangd/index/dex/Trigram.h
+++ clang-tools-extra/clangd/index/dex/Trigram.h
@@ -44,6 +44,14 @@
 /// to the next character, move to the start of the next segment, or skip over a
 /// segment.
 ///
+/// Special kind of trigrams - incomplete trigrams is also present in the
+/// result. Incomplete trigrams contain END_MARKER ('$') at the end. Result
+/// contains unigrams which contain the first symbol of each fuzzy matching
+/// segment start, i.e. symbols which are assigned Head role during fuzzy
+/// matching segmentation stage. The result also contains all bigrams which are
+/// generated in the same way sequences of length 3 are created. This means that
+/// for short queries Dex index would do special kind of prefix matching.
+///
 /// Note: the returned list of trigrams does not have duplicates, if any trigram
 /// belongs to more than one class it is only inserted once.
 std::vector<Token> generateIdentifierTrigrams(llvm::StringRef Identifier);
@@ -53,6 +61,11 @@
 /// Query is segmented using FuzzyMatch API and downcasted to lowercase. Then,
 /// the simplest trigrams - sequences of three consecutive letters and digits
 /// are extracted and returned after deduplication.
+///
+/// For short queries (Query contains less than 3 letters and digits) this
+/// returns a single trigram with all valid identifier symbols.
+///
+/// Example: Query - "_u$_p", Trigram - "up".
 std::vector<Token> generateQueryTrigrams(llvm::StringRef Query);
 
 } // namespace dex
Index: clang-tools-extra/clangd/index/dex/Trigram.cpp
===================================================================
--- clang-tools-extra/clangd/index/dex/Trigram.cpp
+++ clang-tools-extra/clangd/index/dex/Trigram.cpp
@@ -10,11 +10,9 @@
 #include "Trigram.h"
 #include "../../FuzzyMatch.h"
 #include "Token.h"
-
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/DenseSet.h"
 #include "llvm/ADT/StringExtras.h"
-
 #include <cctype>
 #include <queue>
 #include <string>
@@ -25,12 +23,22 @@
 namespace clangd {
 namespace dex {
 
-// FIXME(kbobyrev): Deal with short symbol symbol names. A viable approach would
-// be generating unigrams and bigrams here, too. This would prevent symbol index
-// from applying fuzzy matching on a tremendous number of symbols and allow
-// supplementary retrieval for short queries.
-//
-// Short names (total segment length <3 characters) are currently ignored.
+namespace {
+
+/// This is used to mark unigrams and bigrams and distinct them from complete
+/// trigrams. Since '$' is not present in valid identifier names, it is safe to
+/// use it as the special symbol.
+const auto END_MARKER = '$';
+
+void insertChars(DenseSet<Token> &UniqueTrigrams, std::array<char, 4> Chars) {
+  const auto Trigram = Token(Token::Kind::Trigram, Chars.data());
+  if (!UniqueTrigrams.count(Trigram)) {
+    UniqueTrigrams.insert(Trigram);
+  }
+}
+
+} // namespace
+
 std::vector<Token> generateIdentifierTrigrams(llvm::StringRef Identifier) {
   // Apply fuzzy matching text segmentation.
   std::vector<CharRole> Roles(Identifier.size());
@@ -59,25 +67,37 @@
     }
   }
 
+  // Iterate through valid seqneces of three characters Fuzzy Matcher can
+  // process.
   DenseSet<Token> UniqueTrigrams;
-  std::array<char, 4> Chars;
   for (size_t I = 0; I < LowercaseIdentifier.size(); ++I) {
     // Skip delimiters.
     if (Roles[I] != Head && Roles[I] != Tail)
       continue;
+
+    // Generate unigrams using the first symbol of each fuzzy matching segment.
+    if (Roles[I] == Head) {
+      insertChars(UniqueTrigrams,
+                  {{LowercaseIdentifier[I], END_MARKER, END_MARKER, 0}});
+    }
+
     for (const unsigned J : Next[I]) {
       if (!J)
         continue;
+
+      // Generate trigrams only if the first character is the segment start.
+      // Example: "StringStartsWith" would yield "st$", "ss$" but not "tr$".
+      if (Roles[I] == Head) {
+        insertChars(UniqueTrigrams, {{LowercaseIdentifier[I],
+                                      LowercaseIdentifier[J], END_MARKER, 0}});
+      }
+
       for (const unsigned K : Next[J]) {
         if (!K)
           continue;
-        Chars = {{LowercaseIdentifier[I], LowercaseIdentifier[J],
-                  LowercaseIdentifier[K], 0}};
-        auto Trigram = Token(Token::Kind::Trigram, Chars.data());
-        // Push unique trigrams to the result.
-        if (!UniqueTrigrams.count(Trigram)) {
-          UniqueTrigrams.insert(Trigram);
-        }
+        insertChars(UniqueTrigrams,
+                    {{LowercaseIdentifier[I], LowercaseIdentifier[J],
+                      LowercaseIdentifier[K], 0}});
       }
     }
   }
@@ -89,13 +109,19 @@
   return Result;
 }
 
-// FIXME(kbobyrev): Similarly, to generateIdentifierTrigrams, this ignores short
-// inputs (total segment length <3 characters).
+// FIXME(kbobyrev): Correctly handle empty trigrams "$$$".
 std::vector<Token> generateQueryTrigrams(llvm::StringRef Query) {
   // Apply fuzzy matching text segmentation.
   std::vector<CharRole> Roles(Query.size());
   calculateRoles(Query, llvm::makeMutableArrayRef(Roles.data(), Query.size()));
 
+  // Additional pass is necessary to count valid identifier characters.
+  // Depending on that, this function might return incomplete trigram.
+  unsigned ValidSymbolsCount = 0;
+  for (size_t I = 0; I < Roles.size(); ++I)
+    if (Roles[I] == Head || Roles[I] == Tail)
+      ++ValidSymbolsCount;
+
   std::string LowercaseQuery = Query.lower();
 
   DenseSet<Token> UniqueTrigrams;
@@ -110,9 +136,27 @@
 
     if (Chars.size() > 3)
       Chars.pop_front();
+
+    if (ValidSymbolsCount < 3 && Chars.size() == ValidSymbolsCount) {
+      while (Chars.size() < 3) {
+        Chars.push_back(END_MARKER);
+      }
+      auto Trigram =
+          Token(Token::Kind::Trigram, std::string(begin(Chars), end(Chars)));
+      UniqueTrigrams.insert(Trigram);
+      // The symbol does not have sufficient number of valid symbols for
+      // complete trigrams, all valid symbols were seen at this point.
+      break;
+    }
+
     if (Chars.size() == 3) {
       auto Trigram =
           Token(Token::Kind::Trigram, std::string(begin(Chars), end(Chars)));
+
+      if (Chars.front() == END_MARKER)
+        Trigram = Token(Token::Kind::Trigram,
+                        std::string(Chars.rbegin(), Chars.rend()));
+
       // Push unique trigrams to the result.
       if (!UniqueTrigrams.count(Trigram)) {
         UniqueTrigrams.insert(Trigram);
Index: clang-tools-extra/clangd/index/dex/Iterator.h
===================================================================
--- clang-tools-extra/clangd/index/dex/Iterator.h
+++ clang-tools-extra/clangd/index/dex/Iterator.h
@@ -47,6 +47,14 @@
 /// Contains sorted sequence of DocIDs all of which belong to symbols matching
 /// certain criteria, i.e. containing a Search Token. PostingLists are values
 /// for the inverted index.
+// FIXME(kbobyrev): Posting lists for incomplete trigrams (one/two symbols) are
+// likely to be very dense and hence require special attention so that the index
+// doesn't use too much memory. Possible solution would be to construct
+// compressed posting lists which consist of ranges of DocIDs instead of
+// distinct DocIDs. A special case would be the empty query: for that case
+// TrueIterator should be implemented - an iterator which doesn't actually store
+// any PostingList within itself, but "contains" all DocIDs in range
+// [0, IndexSize).
 using PostingList = std::vector<DocID>;
 /// Immutable reference to PostingList object.
 using PostingListRef = llvm::ArrayRef<DocID>;

_______________________________________________
cfe-commits mailing list
cfe-commits@lists.llvm.org
http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

[PATCH] D50517: [clangd] Generate incomplete trigrams for the Dex index

Reply via email to