kbobyrev created this revision.
kbobyrev added reviewers: ioeric, ilya-biryukov.
kbobyrev added a project: clang-tools-extra.
Herald added subscribers: arphaman, jkorous, MaskRay.

Currently, the query trigram generator simply yields the `u_p` trigram for the
`u_p` query. This is not optimal, since the user is most likely trying to match
two heads (e.g. `unique_ptr`) with such a query. This patch addresses the issue
by generating the incomplete trigram `up$` from the two heads instead.


https://reviews.llvm.org/D50700

Files:
  clang-tools-extra/clangd/index/dex/Trigram.cpp
  clang-tools-extra/clangd/index/dex/Trigram.h
  clang-tools-extra/unittests/clangd/DexIndexTests.cpp


Index: clang-tools-extra/unittests/clangd/DexIndexTests.cpp
===================================================================
--- clang-tools-extra/unittests/clangd/DexIndexTests.cpp
+++ clang-tools-extra/unittests/clangd/DexIndexTests.cpp
@@ -300,6 +300,8 @@
   EXPECT_THAT(generateQueryTrigrams("__"), trigramsAre({"__$"}));
   EXPECT_THAT(generateQueryTrigrams("___"), trigramsAre({"___"}));
 
+  EXPECT_THAT(generateQueryTrigrams("u_p"), trigramsAre({"up$"}));
+
   EXPECT_THAT(generateQueryTrigrams("X86"), trigramsAre({"x86"}));
 
   EXPECT_THAT(generateQueryTrigrams("clangd"),
Index: clang-tools-extra/clangd/index/dex/Trigram.h
===================================================================
--- clang-tools-extra/clangd/index/dex/Trigram.h
+++ clang-tools-extra/clangd/index/dex/Trigram.h
@@ -68,7 +68,10 @@
 ///
 /// For short queries (less than 3 characters with Head or Tail roles in Fuzzy
 /// Matching segmentation) this returns a single trigram with the first
-/// characters (up to 3) to perfrom prefix match.
+/// characters (up to 3) to perform prefix match. However, if the query is short
+/// but it contains two HEAD symbols then the returned trigram would be an
+/// incomplete bigram with those two heads. This would help to match
+/// "unique_ptr" and similar symbols with "u_p" query.
 std::vector<Token> generateQueryTrigrams(llvm::StringRef Query);
 
 } // namespace dex
Index: clang-tools-extra/clangd/index/dex/Trigram.cpp
===================================================================
--- clang-tools-extra/clangd/index/dex/Trigram.cpp
+++ clang-tools-extra/clangd/index/dex/Trigram.cpp
@@ -128,9 +128,15 @@
   // Additional pass is necessary to count valid identifier characters.
   // Depending on that, this function might return incomplete trigram.
   unsigned ValidSymbolsCount = 0;
-  for (size_t I = 0; I < Roles.size(); ++I)
-    if (Roles[I] == Head || Roles[I] == Tail)
+  unsigned Heads = 0;
+  for (size_t I = 0; I < Roles.size(); ++I) {
+    if (Roles[I] == Head) {
+      ++ValidSymbolsCount;
+      ++Heads;
+    } else if (Roles[I] == Tail) {
       ++ValidSymbolsCount;
+    }
+  }
 
   std::string LowercaseQuery = Query.lower();
 
@@ -140,13 +146,26 @@
   // If the number of symbols which can form fuzzy matching trigram is not
   // sufficient, generate a single incomplete trigram for query.
   if (ValidSymbolsCount < 3) {
-    std::string Symbols = {{END_MARKER, END_MARKER, END_MARKER}};
+    std::string Symbols;
+    // If the query is not long enough to form a trigram but contains two heads
+    // the returned trigram should be "xy$" where "x" and "y" are the heads.
+    // This might be particularly important for cases like "u_p" to match
+    // "unique_ptr" and similar symbols from the C++ Standard Library.
+    if (Heads == 2) {
+      for (size_t I = 0; I < LowercaseQuery.size(); ++I)
+        if (Roles[I] == Head)
+          Symbols += LowercaseQuery[I];
+
+      Symbols += END_MARKER;
+    } else {
+      Symbols = {{END_MARKER, END_MARKER, END_MARKER}};
       if (LowercaseQuery.size() > 0)
         Symbols[0] = LowercaseQuery[0];
       if (LowercaseQuery.size() > 1)
         Symbols[1] = LowercaseQuery[1];
       if (LowercaseQuery.size() > 2)
         Symbols[2] = LowercaseQuery[2];
+    }
     const auto Trigram = Token(Token::Kind::Trigram, Symbols);
     UniqueTrigrams.insert(Trigram);
   } else {


Index: clang-tools-extra/unittests/clangd/DexIndexTests.cpp
===================================================================
--- clang-tools-extra/unittests/clangd/DexIndexTests.cpp
+++ clang-tools-extra/unittests/clangd/DexIndexTests.cpp
@@ -300,6 +300,8 @@
   EXPECT_THAT(generateQueryTrigrams("__"), trigramsAre({"__$"}));
   EXPECT_THAT(generateQueryTrigrams("___"), trigramsAre({"___"}));
 
+  EXPECT_THAT(generateQueryTrigrams("u_p"), trigramsAre({"up$"}));
+
   EXPECT_THAT(generateQueryTrigrams("X86"), trigramsAre({"x86"}));
 
   EXPECT_THAT(generateQueryTrigrams("clangd"),
Index: clang-tools-extra/clangd/index/dex/Trigram.h
===================================================================
--- clang-tools-extra/clangd/index/dex/Trigram.h
+++ clang-tools-extra/clangd/index/dex/Trigram.h
@@ -68,7 +68,10 @@
 ///
 /// For short queries (less than 3 characters with Head or Tail roles in Fuzzy
 /// Matching segmentation) this returns a single trigram with the first
-/// characters (up to 3) to perfrom prefix match.
+/// characters (up to 3) to perform prefix match. However, if the query is short
+/// but it contains two HEAD symbols then the returned trigram would be an
+/// incomplete bigram with those two heads. This would help to match
+/// "unique_ptr" and similar symbols with "u_p" query.
 std::vector<Token> generateQueryTrigrams(llvm::StringRef Query);
 
 } // namespace dex
Index: clang-tools-extra/clangd/index/dex/Trigram.cpp
===================================================================
--- clang-tools-extra/clangd/index/dex/Trigram.cpp
+++ clang-tools-extra/clangd/index/dex/Trigram.cpp
@@ -128,9 +128,15 @@
   // Additional pass is necessary to count valid identifier characters.
   // Depending on that, this function might return incomplete trigram.
   unsigned ValidSymbolsCount = 0;
-  for (size_t I = 0; I < Roles.size(); ++I)
-    if (Roles[I] == Head || Roles[I] == Tail)
+  unsigned Heads = 0;
+  for (size_t I = 0; I < Roles.size(); ++I) {
+    if (Roles[I] == Head) {
+      ++ValidSymbolsCount;
+      ++Heads;
+    } else if (Roles[I] == Tail) {
       ++ValidSymbolsCount;
+    }
+  }
 
   std::string LowercaseQuery = Query.lower();
 
@@ -140,13 +146,26 @@
   // If the number of symbols which can form fuzzy matching trigram is not
   // sufficient, generate a single incomplete trigram for query.
   if (ValidSymbolsCount < 3) {
-    std::string Symbols = {{END_MARKER, END_MARKER, END_MARKER}};
+    std::string Symbols;
+    // If the query is not long enough to form a trigram but contains two heads
+    // the returned trigram should be "xy$" where "x" and "y" are the heads.
+    // This might be particularly important for cases like "u_p" to match
+    // "unique_ptr" and similar symbols from the C++ Standard Library.
+    if (Heads == 2) {
+      for (size_t I = 0; I < LowercaseQuery.size(); ++I)
+        if (Roles[I] == Head)
+          Symbols += LowercaseQuery[I];
+
+      Symbols += END_MARKER;
+    } else {
+      Symbols = {{END_MARKER, END_MARKER, END_MARKER}};
       if (LowercaseQuery.size() > 0)
         Symbols[0] = LowercaseQuery[0];
       if (LowercaseQuery.size() > 1)
         Symbols[1] = LowercaseQuery[1];
       if (LowercaseQuery.size() > 2)
         Symbols[2] = LowercaseQuery[2];
+    }
     const auto Trigram = Token(Token::Kind::Trigram, Symbols);
     UniqueTrigrams.insert(Trigram);
   } else {
_______________________________________________
cfe-commits mailing list
cfe-commits@lists.llvm.org
http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

Reply via email to