[PATCH] D44003: [clangd] Fix unintentionally loose fuzzy matching, and the tests masking it.

Sam McCall via Phabricator via cfe-commits Fri, 02 Mar 2018 04:49:19 -0800

sammccall updated this revision to Diff 136719.
sammccall added a comment.

test tweaks



Repository:
  rCTE Clang Tools Extra

https://reviews.llvm.org/D44003

Files:
  clangd/FuzzyMatch.cpp
  clangd/FuzzyMatch.h
  unittests/clangd/FuzzyMatchTests.cpp
  unittests/clangd/IndexTests.cpp

Index: unittests/clangd/IndexTests.cpp
===================================================================
--- unittests/clangd/IndexTests.cpp
+++ unittests/clangd/IndexTests.cpp
@@ -116,14 +116,6 @@
   EXPECT_TRUE(Symbols.expired());
 }
 
-TEST(MemIndexTest, MemIndexMatchSubstring) {
-  MemIndex I;
-  I.build(generateNumSymbols(5, 25));
-  FuzzyFindRequest Req;
-  Req.Query = "5";
-  EXPECT_THAT(match(I, Req), UnorderedElementsAre("5", "15", "25"));
-}
-
 TEST(MemIndexTest, MemIndexDeduplicate) {
   auto Symbols = generateNumSymbols(0, 10);
 
@@ -166,46 +158,46 @@
 
 TEST(MemIndexTest, MatchQualifiedNamesWithoutSpecificScope) {
   MemIndex I;
-  I.build(generateSymbols({"a::xyz", "b::yz", "yz"}));
+  I.build(generateSymbols({"a::y1", "b::y2", "y3"}));
   FuzzyFindRequest Req;
   Req.Query = "y";
-  EXPECT_THAT(match(I, Req), UnorderedElementsAre("a::xyz", "b::yz", "yz"));
+  EXPECT_THAT(match(I, Req), UnorderedElementsAre("a::y1", "b::y2", "y3"));
 }
 
 TEST(MemIndexTest, MatchQualifiedNamesWithGlobalScope) {
   MemIndex I;
-  I.build(generateSymbols({"a::xyz", "b::yz", "yz"}));
+  I.build(generateSymbols({"a::y1", "b::y2", "y3"}));
   FuzzyFindRequest Req;
   Req.Query = "y";
   Req.Scopes = {""};
-  EXPECT_THAT(match(I, Req), UnorderedElementsAre("yz"));
+  EXPECT_THAT(match(I, Req), UnorderedElementsAre("y3"));
 }
 
 TEST(MemIndexTest, MatchQualifiedNamesWithOneScope) {
   MemIndex I;
-  I.build(generateSymbols({"a::xyz", "a::yy", "a::xz", "b::yz", "yz"}));
+  I.build(generateSymbols({"a::y1", "a::y2", "a::x", "b::y2", "y3"}));
   FuzzyFindRequest Req;
   Req.Query = "y";
   Req.Scopes = {"a"};
-  EXPECT_THAT(match(I, Req), UnorderedElementsAre("a::xyz", "a::yy"));
+  EXPECT_THAT(match(I, Req), UnorderedElementsAre("a::y1", "a::y2"));
 }
 
 TEST(MemIndexTest, MatchQualifiedNamesWithMultipleScopes) {
   MemIndex I;
-  I.build(generateSymbols({"a::xyz", "a::yy", "a::xz", "b::yz", "yz"}));
+  I.build(generateSymbols({"a::y1", "a::y2", "a::x", "b::y3", "y3"}));
   FuzzyFindRequest Req;
   Req.Query = "y";
   Req.Scopes = {"a", "b"};
-  EXPECT_THAT(match(I, Req), UnorderedElementsAre("a::xyz", "a::yy", "b::yz"));
+  EXPECT_THAT(match(I, Req), UnorderedElementsAre("a::y1", "a::y2", "b::y3"));
 }
 
 TEST(MemIndexTest, NoMatchNestedScopes) {
   MemIndex I;
-  I.build(generateSymbols({"a::xyz", "a::b::yy"}));
+  I.build(generateSymbols({"a::y1", "a::b::y2"}));
   FuzzyFindRequest Req;
   Req.Query = "y";
   Req.Scopes = {"a"};
-  EXPECT_THAT(match(I, Req), UnorderedElementsAre("a::xyz"));
+  EXPECT_THAT(match(I, Req), UnorderedElementsAre("a::y1"));
 }
 
 TEST(MemIndexTest, IgnoreCases) {
Index: unittests/clangd/FuzzyMatchTests.cpp
===================================================================
--- unittests/clangd/FuzzyMatchTests.cpp
+++ unittests/clangd/FuzzyMatchTests.cpp
@@ -20,16 +20,27 @@
 using testing::Not;
 
 struct ExpectedMatch {
-  ExpectedMatch(StringRef Annotated) : Word(Annotated), Annotated(Annotated) {
+  // Annotations are optional, and will not be asserted if absent.
+  ExpectedMatch(StringRef Match) : Word(Match), Annotated(Match) {
     for (char C : "[]")
       Word.erase(std::remove(Word.begin(), Word.end(), C), Word.end());
+    if (Word.size() == Annotated->size())
+      Annotated = None;
+  }
+  bool accepts(StringRef ActualAnnotated) const {
+    return !Annotated || ActualAnnotated == *Annotated;
   }
   std::string Word;
-  StringRef Annotated;
+
+  friend raw_ostream &operator<<(raw_ostream &OS, const ExpectedMatch &M) {
+    return OS << "'" << M.Word;
+    if (M.Annotated)
+      OS << "' as " << *M.Annotated;
+  }
+
+private:
+  Optional<StringRef> Annotated;
 };
-raw_ostream &operator<<(raw_ostream &OS, const ExpectedMatch &M) {
-  return OS << "'" << M.Word << "' as " << M.Annotated;
-}
 
 struct MatchesMatcher : public testing::MatcherInterface<StringRef> {
   ExpectedMatch Candidate;
@@ -47,7 +58,7 @@
     FuzzyMatcher Matcher(Pattern);
     auto Result = Matcher.match(Candidate.Word);
     auto AnnotatedMatch = Matcher.dumpLast(*OS << "\n");
-    return Result && AnnotatedMatch == Candidate.Annotated;
+    return Result && Candidate.accepts(AnnotatedMatch);
   }
 };
 
@@ -122,6 +133,7 @@
   EXPECT_THAT("sl", matches("[S]Visual[L]oggerLogsList"));
   EXPECT_THAT("sllll", matches("[S]Visua[lL]ogger[L]ogs[L]ist"));
   EXPECT_THAT("Three", matches("H[T]ML[HRE]l[e]ment"));
+  EXPECT_THAT("b", Not(matches("NDEBUG")));
   EXPECT_THAT("Three", matches("[Three]"));
   EXPECT_THAT("fo", Not(matches("barfoo")));
   EXPECT_THAT("fo", matches("bar_[fo]o"));
@@ -151,8 +163,9 @@
   EXPECT_THAT("g", matches("zz[G]roup"));
 
   EXPECT_THAT("aaaa", matches("_a_[aaaa]")); // Prefer consecutive.
-  EXPECT_THAT("printf", matches("s[printf]"));
-  EXPECT_THAT("str", matches("o[str]eam"));
+  // These would ideally match, but would need special segmentation rules.
+  EXPECT_THAT("printf", Not(matches("s[printf]")));
+  EXPECT_THAT("str", Not(matches("o[str]eam")));
 }
 
 struct RankMatcher : public testing::MatcherInterface<StringRef> {
@@ -188,9 +201,8 @@
         llvm::raw_string_ostream Info(Buf);
         auto AnnotatedMatch = Matcher.dumpLast(Info);
 
-        if (AnnotatedMatch != Str.Annotated) {
-          *OS << "\nMatched " << Str.Word << " as " << AnnotatedMatch
-              << " instead of " << Str.Annotated << "\n"
+        if (!Str.accepts(AnnotatedMatch)) {
+          *OS << "\nDoesn't match " << Str << ", but " << AnnotatedMatch << "\n"
               << Info.str();
           Ok = false;
         } else if (LastScore && *LastScore < *Score) {
Index: clangd/FuzzyMatch.h
===================================================================
--- clangd/FuzzyMatch.h
+++ clangd/FuzzyMatch.h
@@ -56,23 +56,25 @@
 
   bool init(llvm::StringRef Word);
   void buildGraph();
-  void calculateRoles(const char *Text, CharRole *Out, int N);
-  int skipPenalty(int W, Action Last);
-  int matchBonus(int P, int W, Action Last);
+  void calculateRoles(const char *Text, CharRole *Out, int &Types, int N);
+  bool allowMatch(int P, int W) const;
+  int skipPenalty(int W, Action Last) const;
+  int matchBonus(int P, int W, Action Last) const;
 
   // Pattern data is initialized by the constructor, then constant.
   char Pat[MaxPat];         // Pattern data
   int PatN;                 // Length
   char LowPat[MaxPat];      // Pattern in lowercase
   CharRole PatRole[MaxPat]; // Pattern segmentation info
-  bool CaseSensitive;       // Case-sensitive match if pattern has uppercase
+  int PatTypeSet;           // Bitmask of 1<<CharType
   float ScoreScale;         // Normalizes scores for the pattern length.
 
   // Word data is initialized on each call to match(), mostly by init().
   char Word[MaxWord];         // Word data
   int WordN;                  // Length
   char LowWord[MaxWord];      // Word in lowercase
   CharRole WordRole[MaxWord]; // Word segmentation info
+  int WordTypeSet;            // Bitmask of 1<<CharType
   bool WordContainsPattern;   // Simple substring check
 
   // Cumulative best-match score table.
Index: clangd/FuzzyMatch.cpp
===================================================================
--- clangd/FuzzyMatch.cpp
+++ clangd/FuzzyMatch.cpp
@@ -33,6 +33,8 @@
 //    Legal if the characters match.
 //  - Moving down (consuming a pattern character) is never legal.
 //    Never legal: all pattern characters must match something.
+// Characters are matched case-insensitively.
+// The first pattern character may only match the start of a word segment.
 //
 // The scoring is based on heuristics:
 //  - when matching a character, apply a bonus or penalty depending on the
@@ -74,21 +76,19 @@
 static constexpr int PerfectBonus = 3; // Perfect per-pattern-char score.
 
 FuzzyMatcher::FuzzyMatcher(StringRef Pattern)
-    : PatN(std::min<int>(MaxPat, Pattern.size())), CaseSensitive(false),
+    : PatN(std::min<int>(MaxPat, Pattern.size())),
       ScoreScale(PatN ? float{1} / (PerfectBonus * PatN) : 0), WordN(0) {
   std::copy(Pattern.begin(), Pattern.begin() + PatN, Pat);
-  for (int I = 0; I < PatN; ++I) {
+  for (int I = 0; I < PatN; ++I)
     LowPat[I] = lower(Pat[I]);
-    CaseSensitive |= LowPat[I] != Pat[I];
-  }
   Scores[0][0][Miss] = {0, Miss};
   Scores[0][0][Match] = {AwfulScore, Miss};
   for (int P = 0; P <= PatN; ++P)
     for (int W = 0; W < P; ++W)
       for (Action A : {Miss, Match})
         Scores[P][W][A] = {AwfulScore, Miss};
   if (PatN > 0)
-    calculateRoles(Pat, PatRole, PatN);
+    calculateRoles(Pat, PatRole, PatTypeSet, PatN);
 }
 
 Optional<float> FuzzyMatcher::match(StringRef Word) {
@@ -177,16 +177,21 @@
 template <typename T> static T packedLookup(const uint8_t *Data, int I) {
   return static_cast<T>((Data[I >> 2] >> ((I & 3) * 2)) & 3);
 }
-void FuzzyMatcher::calculateRoles(const char *Text, CharRole *Out, int N) {
+void FuzzyMatcher::calculateRoles(const char *Text, CharRole *Out, int &TypeSet,
+                                  int N) {
   assert(N > 0);
+  CharType Type = packedLookup<CharType>(CharTypes, Text[0]);
+  TypeSet = 1 << Type;
   // Types holds a sliding window of (Prev, Curr, Next) types.
   // Initial value is (Empty, Empty, type of Text[0]).
-  int Types = packedLookup<CharType>(CharTypes, Text[0]);
+  int Types = Type;
   // Rotate slides in the type of the next character.
   auto Rotate = [&](CharType T) { Types = ((Types << 2) | T) & 0x3f; };
   for (int I = 0; I < N - 1; ++I) {
     // For each character, rotate in the next, and look up the role.
-    Rotate(packedLookup<CharType>(CharTypes, Text[I + 1]));
+    Type = packedLookup<CharType>(CharTypes, Text[I + 1]);
+    TypeSet |= 1 << Type;
+    Rotate(Type);
     *Out++ = packedLookup<CharRole>(CharRoles, Types);
   }
   // For the last character, the "next character" is Empty.
@@ -214,7 +219,10 @@
       ++P;
   }
 
-  calculateRoles(Word, WordRole, WordN);
+  // FIXME: some words are hard to tokenize algorithmically.
+  // e.g. vsprintf is V S Print F, and should match [pri] but not [int].
+  // We could add a tokenization dictionary for common stdlib names.
+  calculateRoles(Word, WordRole, WordTypeSet, WordN);
   return true;
 }
 
@@ -247,7 +255,7 @@
                         ? ScoreInfo{MatchMissScore, Match}
                         : ScoreInfo{MissMissScore, Miss};
 
-      if (LowPat[P] != LowWord[W]) { // No match possible.
+      if (!allowMatch(P, W)) {
         Score[Match] = {AwfulScore, Miss};
       } else {
         auto &PreMatch = Scores[P][W];
@@ -261,23 +269,38 @@
   }
 }
 
-int FuzzyMatcher::skipPenalty(int W, Action Last) {
+bool FuzzyMatcher::allowMatch(int P, int W) const {
+  if (LowPat[P] != LowWord[W])
+    return false;
+  // We require a "strong" match for the first pattern character only.
+  if (P > 0)
+    return true;
+  // Obvious "strong match" for first char: match against a word head.
+  // We're banning matches outright, so conservatively accept some other cases
+  // where our segmentation might be wrong:
+  //  - allow matching B in ABCDef (but not in NDEBUG)
+  //  - we'd like to accept print in sprintf, but too many false positives
+  return WordRole[W] != Tail ||
+         (Word[W] != LowWord[W] && WordTypeSet & 1 << Lower);
+}
+
+int FuzzyMatcher::skipPenalty(int W, Action Last) const {
   int S = 0;
   if (WordRole[W] == Head) // Skipping a segment.
     S += 1;
   if (Last == Match) // Non-consecutive match.
     S += 2;          // We'd rather skip a segment than split our match.
   return S;
 }
 
-int FuzzyMatcher::matchBonus(int P, int W, Action Last) {
+int FuzzyMatcher::matchBonus(int P, int W, Action Last) const {
   assert(LowPat[P] == LowWord[W]);
   int S = 1;
   // Bonus: pattern so far is a (case-insensitive) prefix of the word.
   if (P == W) // We can't skip pattern characters, so we must have matched all.
     ++S;
   // Bonus: case matches, or a Head in the pattern aligns with one in the word.
-  if ((Pat[P] == Word[W] && (CaseSensitive || P == W)) ||
+  if ((Pat[P] == Word[W] && ((PatTypeSet & 1 << Upper) || P == W)) ||
       (PatRole[P] == Head && WordRole[W] == Head))
     ++S;
   // Penalty: matching inside a segment (and previous char wasn't matched).
@@ -312,7 +335,7 @@
                               Scores[PatN][WordN][Miss].Score))) {
     OS << "Substring check passed, but all matches are forbidden\n";
   }
-  if (!CaseSensitive)
+  if (!(PatTypeSet & 1 << Upper))
     OS << "Lowercase query, so scoring ignores case\n";
 
   // Traverse Matched table backwards to reconstruct the Pattern/Word mapping.

_______________________________________________
cfe-commits mailing list
cfe-commits@lists.llvm.org
http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

[PATCH] D44003: [clangd] Fix unintentionally loose fuzzy matching, and the tests masking it.

Reply via email to