[PATCH] D40060: [clangd] Fuzzy match scorer

Sam McCall via Phabricator via cfe-commits Thu, 30 Nov 2017 10:46:18 -0800

sammccall updated this revision to Diff 124968.
sammccall marked 5 inline comments as done.
sammccall added a comment.


- added more VSCode tests, and made test assert matched characters. This 
uncovered algorithm problems
- cache now includes "did previous character match" in the key (scoring depends 
on this, so we gave incorrect results)
- added a penalty for non-consecutive matches
- first character matching inside a segment downgraded from a ban to a penalty 
This allows [stream] to match "istream"
- don't award case bonuses if the query is all lowercase. This helps matches 
like [ccm] -> [c]ode[C]ompletec[m] compete with [c]odeComplete[cm]


https://reviews.llvm.org/D40060

Files:
  clangd/CMakeLists.txt
  clangd/FuzzyMatch.cpp
  clangd/FuzzyMatch.h
  unittests/clangd/CMakeLists.txt
  unittests/clangd/FuzzyMatchTests.cpp

Index: unittests/clangd/FuzzyMatchTests.cpp
===================================================================
--- /dev/null
+++ unittests/clangd/FuzzyMatchTests.cpp
@@ -0,0 +1,252 @@
+//===-- FuzzyMatchTests.cpp - String fuzzy matcher tests --------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "FuzzyMatch.h"
+
+#include "llvm/ADT/StringExtras.h"
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+
+namespace clang {
+namespace clangd {
+namespace {
+using namespace llvm;
+using testing::Not;
+
+struct ExpectedMatch {
+  ExpectedMatch(StringRef Annotated) : Word(Annotated), Annotated(Annotated) {
+    for (char C : "[]")
+      Word.erase(std::remove(Word.begin(), Word.end(), C), Word.end());
+  }
+  std::string Word;
+  StringRef Annotated;
+};
+raw_ostream &operator<<(raw_ostream &OS, const ExpectedMatch &M) {
+  return OS << "'" << M.Word << "' as " << M.Annotated;
+}
+
+struct MatchesMatcher : public testing::MatcherInterface<StringRef> {
+  ExpectedMatch Candidate;
+  MatchesMatcher(ExpectedMatch Candidate) : Candidate(std::move(Candidate)) {}
+
+  void DescribeTo(::std::ostream *OS) const override {
+    raw_os_ostream(*OS) << "Matches " << Candidate;
+  }
+
+  bool MatchAndExplain(StringRef Pattern,
+                       testing::MatchResultListener *L) const override {
+    std::unique_ptr<raw_ostream> OS(
+        L->stream() ? (raw_ostream *)(new raw_os_ostream(*L->stream()))
+                    : new raw_null_ostream());
+    FuzzyMatcher Matcher(Pattern);
+    auto Result = Matcher.match(Candidate.Word);
+    auto AnnotatedMatch = Matcher.dumpLast(*OS << "\n");
+    return Result && AnnotatedMatch == Candidate.Annotated;
+  }
+};
+
+// Accepts patterns that match a given word.
+// Dumps the debug tables on match failure.
+testing::Matcher<StringRef> matches(StringRef M) {
+  return testing::MakeMatcher<StringRef>(new MatchesMatcher(M));
+}
+
+TEST(FuzzyMatch, Matches) {
+  EXPECT_THAT("u_p", matches("[u]nique[_p]tr"));
+  EXPECT_THAT("up", matches("[u]nique_[p]tr"));
+  EXPECT_THAT("uq", matches("[u]ni[q]ue_ptr"));
+  EXPECT_THAT("qp", Not(matches("unique_ptr")));
+  EXPECT_THAT("log", Not(matches("SVGFEMorphologyElement")));
+
+  EXPECT_THAT("tit", matches("win.[tit]"));
+  EXPECT_THAT("title", matches("win.[title]"));
+  EXPECT_THAT("WordCla", matches("[Word]Character[Cla]ssifier"));
+  EXPECT_THAT("WordCCla", matches("[WordC]haracter[Cla]ssifier"));
+
+  EXPECT_THAT("dete", Not(matches("editor.quickSuggestionsDelay")));
+
+  EXPECT_THAT("highlight", matches("editorHover[Highlight]"));
+  EXPECT_THAT("hhighlight", matches("editor[H]over[Highlight]"));
+  EXPECT_THAT("dhhighlight", Not(matches("editorHoverHighlight")));
+
+  EXPECT_THAT("-moz", matches("[-moz]-foo"));
+  EXPECT_THAT("moz", matches("-[moz]-foo"));
+  EXPECT_THAT("moza", matches("-[moz]-[a]nimation"));
+
+  EXPECT_THAT("ab", matches("[ab]A"));
+  EXPECT_THAT("ccm", matches("[c]a[cm]elCase"));
+  EXPECT_THAT("bti", Not(matches("the_black_knight")));
+  EXPECT_THAT("ccm", Not(matches("camelCase")));
+  EXPECT_THAT("cmcm", Not(matches("camelCase")));
+  EXPECT_THAT("BK", matches("the_[b]lack_[k]night"));
+  EXPECT_THAT("KeyboardLayout=", Not(matches("KeyboardLayout")));
+  EXPECT_THAT("LLL", matches("SVisual[L]ogger[L]ogs[L]ist"));
+  EXPECT_THAT("LLLL", Not(matches("SVilLoLosLi")));
+  EXPECT_THAT("LLLL", Not(matches("SVisualLoggerLogsList")));
+  EXPECT_THAT("TEdit", matches("[T]ext[Edit]"));
+  EXPECT_THAT("TEdit", matches("[T]ext[Edit]or"));
+  EXPECT_THAT("TEdit", matches("[Te]xte[dit]"));
+  EXPECT_THAT("TEdit", matches("[t]ext_[edit]"));
+  EXPECT_THAT("TEditDit", matches("[T]ext[Edit]or[D]ecorat[i]on[T]ype"));
+  EXPECT_THAT("TEdit", matches("[T]ext[Edit]orDecorationType"));
+  EXPECT_THAT("Tedit", matches("[T]ext[Edit]"));
+  EXPECT_THAT("ba", Not(matches("?AB?")));
+  EXPECT_THAT("bkn", matches("the_[b]lack_[kn]ight"));
+  EXPECT_THAT("bt", matches("the_[b]lack_knigh[t]"));
+  EXPECT_THAT("ccm", matches("[c]amelCase[cm]"));
+  EXPECT_THAT("fdm", matches("[f]in[dM]odel"));
+  EXPECT_THAT("fob", matches("[fo]o[b]ar"));
+  EXPECT_THAT("fobz", Not(matches("foobar")));
+  EXPECT_THAT("foobar", matches("[foobar]"));
+  EXPECT_THAT("form", matches("editor.[form]atOnSave"));
+  EXPECT_THAT("g p", matches("[G]it:[ P]ull"));
+  EXPECT_THAT("g p", matches("[G]it:[ P]ull"));
+  EXPECT_THAT("gip", matches("[Gi]t: [P]ull"));
+  EXPECT_THAT("gip", matches("[Gi]t: [P]ull"));
+  EXPECT_THAT("gp", matches("[G]it: [P]ull"));
+  EXPECT_THAT("gp", matches("[G]it_Git_[P]ull"));
+  EXPECT_THAT("is", matches("[I]mport[S]tatement"));
+  EXPECT_THAT("is", matches("[is]Valid"));
+  EXPECT_THAT("lowrd", matches("[low]Wo[rd]"));
+  EXPECT_THAT("myvable", matches("[myva]ria[ble]"));
+  EXPECT_THAT("no", Not(matches("")));
+  EXPECT_THAT("no", Not(matches("match")));
+  EXPECT_THAT("ob", Not(matches("foobar")));
+  EXPECT_THAT("sl", matches("[S]Visual[L]oggerLogsList"));
+  EXPECT_THAT("sllll", matches("[S]Visua[lL]ogger[L]ogs[L]ist"));
+  EXPECT_THAT("Three", matches("H[T]ML[HRE]l[e]ment"));
+  EXPECT_THAT("Three", matches("[Three]"));
+  EXPECT_THAT("fo", Not(matches("barfoo")));
+  EXPECT_THAT("fo", matches("bar_[fo]o"));
+  EXPECT_THAT("fo", matches("bar_[Fo]o"));
+  EXPECT_THAT("fo", matches("bar [fo]o"));
+  EXPECT_THAT("fo", matches("bar.[fo]o"));
+  EXPECT_THAT("fo", matches("bar/[fo]o"));
+  EXPECT_THAT("fo", matches("bar\\[fo]o"));
+
+  EXPECT_THAT(
+      "aaaaaa",
+      matches("[aaaaaa]aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"
+              "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"));
+  EXPECT_THAT("baba", Not(matches("ababababab")));
+  EXPECT_THAT("fsfsfs", Not(matches("dsafdsafdsafdsafdsafdsafdsafasdfdsa")));
+  EXPECT_THAT("fsfsfsfsfsfsfsf",
+              Not(matches("dsafdsafdsafdsafdsafdsafdsafasdfdsafdsafdsafdsafdsfd"
+                          "safdsfdfdfasdnfdsajfndsjnafjndsajlknfdsa")));
+
+  EXPECT_THAT("  g", matches("[  g]roup"));
+  EXPECT_THAT("g", matches("  [g]roup"));
+  EXPECT_THAT("g g", Not(matches("  groupGroup")));
+  EXPECT_THAT("g g", matches("  [g]roup[ G]roup"));
+  EXPECT_THAT(" g g", matches(" [ g]roup[ G]roup"));
+  EXPECT_THAT("zz", matches("[zz]Group"));
+  EXPECT_THAT("zzg", matches("[zzG]roup"));
+  EXPECT_THAT("g", matches("zz[G]roup"));
+
+  EXPECT_THAT("aaaa", matches("_a_[aaaa]")); // Prefer consecutive.
+  EXPECT_THAT("printf", matches("s[printf]"));
+  EXPECT_THAT("str", matches("o[str]eam"));
+}
+
+struct RankMatcher : public testing::MatcherInterface<StringRef> {
+  std::vector<ExpectedMatch> RankedStrings;
+  RankMatcher(std::initializer_list<ExpectedMatch> RankedStrings)
+      : RankedStrings(RankedStrings) {}
+
+  void DescribeTo(::std::ostream *OS) const override {
+    raw_os_ostream O(*OS);
+    O << "Ranks strings in order: [";
+    for (const auto &Str : RankedStrings)
+      O << "\n\t" << Str;
+    O << "\n]";
+  }
+
+  bool MatchAndExplain(StringRef Pattern,
+                       testing::MatchResultListener *L) const override {
+    std::unique_ptr<raw_ostream> OS(
+        L->stream() ? (raw_ostream *)(new raw_os_ostream(*L->stream()))
+                    : new raw_null_ostream());
+    FuzzyMatcher Matcher(Pattern);
+    const ExpectedMatch *LastMatch;
+    Optional<float> LastScore;
+    bool Ok = true;
+    for (const auto& Str : RankedStrings) {
+      auto Score = Matcher.match(Str.Word);
+      if (!Score) {
+        *OS << "\nDoesn't match '" << Str.Word << "'";
+        Matcher.dumpLast(*OS << "\n");
+        Ok = false;
+      } else {
+        std::string Buf;
+        llvm::raw_string_ostream Info(Buf);
+        auto AnnotatedMatch = Matcher.dumpLast(Info);
+
+        if (AnnotatedMatch != Str.Annotated) {
+          *OS << "\nMatched " << Str.Word << " as " << AnnotatedMatch
+              << " instead of " << Str.Annotated << "\n"
+              << Info.str();
+          Ok = false;
+        } else if (LastScore && *LastScore < *Score) {
+          *OS << "\nRanks '" << Str.Word << "'=" << *Score << " above '"
+              << LastMatch->Word << "'=" << *LastScore << "\n"
+              << Info.str();
+          Matcher.match(LastMatch->Word);
+          Matcher.dumpLast(*OS << "\n");
+          Ok = false;
+        }
+      }
+      LastMatch = &Str;
+      LastScore = Score;
+    }
+    return Ok;
+  }
+};
+
+// Accepts patterns that match all the strings and rank them in the given order.
+// Dumps the debug tables on match failure.
+template <typename... T> testing::Matcher<StringRef> ranks(T... RankedStrings) {
+  return testing::MakeMatcher<StringRef>(
+      new RankMatcher{ExpectedMatch(RankedStrings)...});
+}
+
+TEST(FuzzyMatch, Ranking) {
+  EXPECT_THAT("eb", ranks("[e]mplace_[b]ack", "[e]m[b]ed"));
+  EXPECT_THAT("cons",
+              ranks("[cons]ole", "[Cons]ole", "ArrayBuffer[Cons]tructor"));
+  EXPECT_THAT("foo", ranks("[foo]", "[Foo]"));
+  EXPECT_THAT("onMess",
+              ranks("[onMess]age", "[onmess]age", "[on]This[M]ega[Es]cape[s]"));
+  EXPECT_THAT("CC", ranks("[C]amel[C]ase", "[c]amel[C]ase"));
+  EXPECT_THAT("cC", ranks("[c]amel[C]ase", "[C]amel[C]ase"));
+  EXPECT_THAT("p", ranks("[p]arse", "[p]osix", "[p]afdsa", "[p]ath", "[p]"));
+  EXPECT_THAT("pa", ranks("[pa]rse", "[pa]th", "[pa]fdsa"));
+  EXPECT_THAT("log", ranks("[log]", "Scroll[Log]icalPosition"));
+  EXPECT_THAT("e", ranks("[e]lse", "Abstract[E]lement"));
+  EXPECT_THAT("workbench.sideb",
+              ranks("[workbench.sideB]ar.location",
+                    "[workbench.]editor.default[SideB]ySideLayout"));
+  EXPECT_THAT("editor.r", ranks("[editor.r]enderControlCharacter",
+                                "[editor.]overview[R]ulerlanes",
+                                "diff[Editor.r]enderSideBySide"));
+  EXPECT_THAT("-mo", ranks("[-mo]z-columns", "-ms-ime[-mo]de"));
+  EXPECT_THAT("convertModelPosition",
+              ranks("[convertModelPosition]ToViewPosition",
+                    "[convert]ViewTo[ModelPosition]"));
+  EXPECT_THAT("is", ranks("[is]ValidViewletId", "[i]mport [s]tatement"));
+  EXPECT_THAT("title", ranks("window.[title]",
+                             "files.[t]r[i]m[T]rai[l]ingWhit[e]space"));
+  EXPECT_THAT("strcpy", ranks("[strcpy]", "[strcpy]_s", "[str]n[cpy]"));
+  EXPECT_THAT("close", ranks("workbench.quickOpen.[close]OnFocusOut",
+                             "[c]ss.[l]int.imp[o]rt[S]tat[e]ment",
+                             "[c]ss.co[lo]rDecorator[s].[e]nable"));
+}
+
+} // namespace
+} // namespace clangd
+} // namespace clang
Index: unittests/clangd/CMakeLists.txt
===================================================================
--- unittests/clangd/CMakeLists.txt
+++ unittests/clangd/CMakeLists.txt
@@ -10,6 +10,7 @@
 
 add_extra_unittest(ClangdTests
   ClangdTests.cpp
+  FuzzyMatchTests.cpp
   JSONExprTests.cpp
   TraceTests.cpp
   )
Index: clangd/FuzzyMatch.h
===================================================================
--- /dev/null
+++ clangd/FuzzyMatch.h
@@ -0,0 +1,76 @@
+//===--- FuzzyMatch.h - Approximate identifier matching  ---------*- C++-*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements fuzzy-matching of strings against identifiers.
+// It indicates both the existence and quality of a match:
+// 'eb' matches both 'emplace_back' and 'embed', the former has a better score.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANGD_FUZZYMATCH_H
+#define LLVM_CLANG_TOOLS_EXTRA_CLANGD_FUZZYMATCH_H
+
+#include "llvm/ADT/Optional.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/Support/raw_ostream.h"
+
+namespace clang {
+namespace clangd {
+
+// A matcher capable of matching and scoring strings against a single pattern.
+// It's optimized for matching against many strings - match() does not allocate.
+class FuzzyMatcher {
+public:
+  // Characters beyond MaxPat are ignored.
+  FuzzyMatcher(llvm::StringRef Pattern);
+
+  // If Word matches the pattern, return a score in [0,1] (higher is better).
+  // Characters beyond MaxWord are ignored.
+  llvm::Optional<float> match(llvm::StringRef Word);
+
+  // Dump internal state from the last match() to the stream, for debugging.
+  // Returns the pattern with [] around matched characters, e.g.
+  //   [u_p] + "unique_ptr" --> "[u]nique[_p]tr"
+  llvm::SmallString<256> dumpLast(llvm::raw_ostream &) const;
+
+private:
+  // We truncate the pattern and the word to bound the cost of matching.
+  constexpr static int MaxPat = 63, MaxWord = 127;
+  enum CharRole : char; // For segmentation.
+  enum CharType : char; // For segmentation.
+  enum Action { Miss = 0, Match = 1 };
+
+  bool init(llvm::StringRef Word);
+  void buildGraph();
+  void calculateRoles(const char *Text, CharRole *Out, int N);
+  int skipPenalty(int W, Action Last);
+  int matchBonus(int P, int W, Action Last);
+
+  // Pattern data is initialized by constructor, Word data by match() -> init().
+  int PatN, WordN;                               // Length of pattern and word.
+  char Pat[MaxPat], Word[MaxWord];               // Raw pattern and word data.
+  char LowPat[MaxPat], LowWord[MaxWord];         // Lowercase.
+  CharRole PatRole[MaxPat], WordRole[MaxWord];   // Segmentation info.
+  bool CaseSensitive;                            // Detected from pattern.
+  float ScoreScale;                              // Pattern normalizing factor.
+  bool WordContainsPattern;                      // dumpLast() debugging only.
+
+  // Cumulative best-match score table.
+  struct ScoreInfo {
+    signed int Score : 15;
+    Action Prev : 1;
+  };
+  ScoreInfo Scores[MaxPat + 1][MaxWord + 1][/* Last Action */2];
+};
+
+} // namespace clangd
+} // namespace clang
+
+#endif
Index: clangd/FuzzyMatch.cpp
===================================================================
--- /dev/null
+++ clangd/FuzzyMatch.cpp
@@ -0,0 +1,373 @@
+//===--- FuzzyMatch.h - Approximate identifier matching  ---------*- C++-*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// To check for a match between a Pattern ('u_p') and a Word ('unique_ptr'),
+// we consider the possible partial match states:
+//
+//     u n i q u e _ p t r
+//   +---------------------
+//   |A . . . . . . . . . .
+//  u|
+//   |. . . . . . . . . . .
+//  _|
+//   |. . . . . . . O . . .
+//  p|
+//   |. . . . . . . . . . B
+//
+// Each dot represents some prefix of the pattern being matched against some
+// prefix of the word.
+//   - A is the initial state: '' matched against ''
+//   - O is an intermediate state: 'u_' matched against 'unique_'
+//   - B is the target state: 'u_p' matched against 'unique_ptr'
+//
+// We aim to find the best path from A->B.
+//  - Moving right (consuming a word character)
+//    Always legal: not all word characters must match.
+//  - Moving diagonally (consuming both a word and pattern character)
+//    Legal if the characters match.
+//  - Moving down (consuming a pattern character) is never legal.
+//    Never legal: all pattern characters must match something.
+//
+// The scoring is based on heuristics:
+//  - when matching a character, apply a bonus or penalty depending on the
+//    match quality (does case match, do word segments align, etc)
+//  - when skipping a character, apply a penalty if it hurts the match
+//    (it starts a word segment, or splits the matched region, etc)
+//
+// These heuristics require the ability to "look backward" one character, to
+// see whether it was matched or not. Therefore the dynamic-programming matrix
+// has an extra dimension (last character matched).
+// Each entry also has an additional flag indicating whether the last-but-one
+// character matched, which is needed to trace back through the scoring table
+// and reconstruct the match.
+//
+// We treat strings as byte-sequences, so only ASCII has first-class support.
+//
+// This algorithm was inspired by VS code's client-side filtering, and aims
+// to be mostly-compatible.
+//
+//===----------------------------------------------------------------------===//
+
+#include "FuzzyMatch.h"
+#include "llvm/ADT/Optional.h"
+#include "llvm/Support/Format.h"
+
+using namespace llvm;
+using namespace clang::clangd;
+
+const int FuzzyMatcher::MaxPat;
+const int FuzzyMatcher::MaxWord;
+
+static char lower(char C) { return C >= 'A' && C <= 'Z' ? C + ('a' - 'A') : C; }
+// A "negative infinity" score that won't overflow.
+// We use this to mark unreachable states and forbidden solutions.
+// Score field is 15 bits wide, min value is -2^14, we use half of that.
+static constexpr int AwfulScore = -(1 << 13);
+static bool isAwful(int S) { return S < AwfulScore / 2; }
+static constexpr int PerfectBonus = 2; // Perfect per-pattern-char score.
+
+FuzzyMatcher::FuzzyMatcher(StringRef Pattern)
+    : PatN(std::min<int>(MaxPat, Pattern.size())), WordN(0),
+      CaseSensitive(false), ScoreScale(float{1} / (PerfectBonus * PatN)) {
+  memcpy(Pat, Pattern.data(), PatN);
+  for (int I = 0; I < PatN; ++I) {
+    LowPat[I] = lower(Pat[I]);
+    CaseSensitive |= LowPat[I] != Pat[I];
+  }
+  Scores[0][0][Miss] = {0, Miss};
+  Scores[0][0][Match] = {AwfulScore, Miss};
+  for (int P = 0; P <= PatN; ++P)
+    for (int W = 0; W < P; ++W)
+      for (Action A : {Miss, Match})
+        Scores[P][W][A] = {AwfulScore, Miss};
+  calculateRoles(Pat, PatRole, PatN);
+}
+
+Optional<float> FuzzyMatcher::match(StringRef Word) {
+  if (!PatN)
+    return 1;
+  if (!(WordContainsPattern = init(Word)))
+    return None;
+  buildGraph();
+  auto Best = std::max(Scores[PatN][WordN][Miss].Score,
+                            Scores[PatN][WordN][Match].Score);
+  if (isAwful(Best))
+    return None;
+  return ScoreScale * std::min(PerfectBonus * PatN, std::max<int>(0, Best));
+}
+
+// Segmentation of words and patterns.
+// A name like "fooBar_baz" consists of several parts foo, bar, baz.
+// Aligning segmentation of word and pattern improves the fuzzy-match.
+// For example: [lol] matches "LaughingOutLoud" better than "LionPopulation"
+//
+// First we classify each character into types (uppercase, lowercase, etc).
+// Then we look at the sequence: e.g. [upper, lower] is the start of a segment.
+
+// We only distinguish the types of characters that affect segmentation.
+// It's not obvious how to segment digits, we treat them as lowercase letters.
+// As we don't decode UTF-8, we treat bytes over 127 as lowercase too.
+// This means we require exact (case-sensitive) match.
+enum FuzzyMatcher::CharType : char {
+  Empty = 0,       // Before-the-start and after-the-end (and control chars).
+  Lower = 1,       // Lowercase letters, digits, and non-ASCII bytes.
+  Upper = 2,       // Uppercase letters.
+  Punctuation = 3, // ASCII punctuation (including Space)
+};
+
+// We get CharTypes from a lookup table. Each is 2 bits, 4 fit in each byte.
+// The top 6 bits of the char select the byte, the bottom 2 select the offset.
+// e.g. 'q' = 010100 01 = byte 28 (55), bits 3-2 (01) -> Lower.
+constexpr static uint8_t CharTypes[] = {
+    0x00, 0x00, 0x00, 0x00, // Control characters
+    0x00, 0x00, 0x00, 0x00, // Control characters
+    0xff, 0xff, 0xff, 0xff, // Punctuation
+    0x55, 0x55, 0xf5, 0xff, // Numbers->Lower, more Punctuation.
+    0xab, 0xaa, 0xaa, 0xaa, // @ and A-O
+    0xaa, 0xaa, 0xea, 0xff, // P-Z, more Punctuation.
+    0x57, 0x55, 0x55, 0x55, // ` and a-o
+    0x55, 0x55, 0xd5, 0x3f, // p-z, Punctuation, DEL.
+    0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, // Bytes over 127 -> Lower.
+    0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, // (probably UTF-8).
+    0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55,
+    0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55,
+};
+
+// Each character's Role is the Head or Tail of a segment, or a Separator.
+// e.g. XMLHttpRequest_Async
+//      +--+---+------ +----
+//      ^Head   ^Tail ^Separator
+enum FuzzyMatcher::CharRole : char {
+  Unknown = 0,   // Stray control characters or impossible states.
+  Tail = 1,      // Part of a word segment, but not the first character.
+  Head = 2,      // The first character of a word segment.
+  Separator = 3, // Punctuation characters that separate word segments.
+};
+
+// The Role can be determined from the Type of a character and its neighbors:
+//
+//   Example  | Chars | Type | Role
+//   ---------+--------------+-----
+//   F(o)oBar | Foo   | Ull  | Tail
+//   Foo(B)ar | oBa   | lUl  | Head
+//   (f)oo    | ^fo   | Ell  | Head
+//   H(T)TP   | HTT   | UUU  | Tail
+//
+// Our lookup table maps a 6 bit key (Prev, Curr, Next) to a 2-bit Role.
+// A byte packs 4 Roles. (Prev, Curr) selects a byte, Next selects the offset.
+// e.g. Lower, Upper, Lower -> 01 10 01 -> byte 6 (aa), bits 3-2 (10) -> Head.
+constexpr static uint8_t CharRoles[] = {
+    // clang-format off
+    //         Curr= Empty Lower Upper Separ
+    /* Prev=Empty */ 0x00, 0xaa, 0xaa, 0xff, // At start, Lower|Upper->Head
+    /* Prev=Lower */ 0x00, 0x55, 0xaa, 0xff, // In word, Upper->Head;Lower->Tail
+    /* Prev=Upper */ 0x00, 0x55, 0x59, 0xff, // Ditto, but U(U)U->Tail
+    /* Prev=Separ */ 0x00, 0xaa, 0xaa, 0xff, // After separator, like at start
+    // clang-format on
+};
+
+template <typename T> static T packedLookup(const uint8_t *Data, int I) {
+  return static_cast<T>((Data[I >> 2] >> ((I & 3) * 2)) & 3);
+}
+void FuzzyMatcher::calculateRoles(const char *Text, CharRole *Out, int N) {
+  // Types holds a sliding window of (Prev, Curr, Next) types.
+  // Initial value is (Empty, Empty, type of Text[0]).
+  int Types = packedLookup<CharType>(CharTypes, Text[0]);
+  // Rotate slides in the type of the next character.
+  auto Rotate = [&](CharType T) { Types = ((Types << 2) | T) & 0x3f; };
+  for (int I = 0; I < N - 1; ++I) {
+    // For each character, rotate in the next, and look up the role.
+    Rotate(packedLookup<CharType>(CharTypes, Text[I + 1]));
+    *Out++ = packedLookup<CharRole>(CharRoles, Types);
+  }
+  // For the last character, the "next character" is Empty.
+  Rotate(Empty);
+  *Out++ = packedLookup<CharRole>(CharRoles, Types);
+}
+
+// Sets up the data structures matching Word.
+// Returns false if we can cheaply determine that no match is possible.
+bool FuzzyMatcher::init(StringRef NewWord) {
+  WordN = std::min<int>(MaxWord, NewWord.size());
+  if (PatN > WordN)
+    return false;
+  memcpy(Word, NewWord.data(), WordN);
+  for (int I = 0; I < WordN; ++I)
+    LowWord[I] = lower(Word[I]);
+
+  // Cheap subsequence check.
+  for (int W = 0, P = 0; P != PatN; ++W) {
+    if (W == WordN)
+      return false;
+    if (LowWord[W] == LowPat[P])
+      ++P;
+  }
+
+  calculateRoles(Word, WordRole, WordN);
+  return true;
+}
+
+// The forwards pass finds the mappings of Pattern onto Word.
+// Score = best score achieved matching Word[..W] against Pat[..P].
+// Unlike other tables, indices range from 0 to N *inclusive*
+// Matched = whether we chose to match Word[W] with Pat[P] or not.
+//
+// Points are mostly assigned to matched characters, with 1 being a good score
+// and 2 being a great one. So we treat the score range as [0, 2 * PatN].
+// This range is not strict: we can apply larger bonuses/penalties, or penalize
+// non-matched characters.
+void FuzzyMatcher::buildGraph() {
+  for (int W = 0; W < WordN; ++W) {
+    Scores[0][W + 1][Miss] = {Scores[0][W][Miss].Score - skipPenalty(W, Miss),
+                             Miss};
+    Scores[0][W + 1][Match] = {AwfulScore, Miss};
+  }
+  for (int P = 0; P < PatN; ++P) {
+    for (int W = P; W < WordN; ++W) {
+      auto &Score = Scores[P + 1][W + 1], &PreMiss = Scores[P + 1][W];
+
+      auto MatchMissScore = PreMiss[Match].Score;
+      auto MissMissScore = PreMiss[Miss].Score;
+      if (P < PatN - 1) { // Skipping trailing characters is always free.
+        MatchMissScore -= skipPenalty(W, Match);
+        MissMissScore -= skipPenalty(W, Miss);
+      }
+      Score[Miss] = (MatchMissScore > MissMissScore)
+                        ? ScoreInfo{MatchMissScore, Match}
+                        : ScoreInfo{MissMissScore, Miss};
+
+      if (LowPat[P] != LowWord[W]) { // No match possible.
+        Score[Match] = {AwfulScore, Miss};
+      } else {
+        auto &PreMatch = Scores[P][W];
+        auto MatchMatchScore = PreMatch[Match].Score + matchBonus(P, W, Match);
+        auto MissMatchScore = PreMatch[Miss].Score + matchBonus(P, W, Miss);
+        Score[Match] = (MatchMatchScore > MissMatchScore)
+                           ? ScoreInfo{MatchMatchScore, Match}
+                           : ScoreInfo{MissMatchScore, Miss};
+      }
+    }
+  }
+}
+
+int FuzzyMatcher::skipPenalty(int W, Action Last) {
+  int S = 0;
+  if (WordRole[W] == Head) // Skipping a segment.
+    S += 1;
+  if (Last == Match) // Non-consecutive match.
+    S += 2;          // We'd rather skip a segment than split our match.
+  return S;
+}
+
+int FuzzyMatcher::matchBonus(int P, int W, Action Last) {
+  assert(LowPat[P] == LowWord[W]);
+  int S = 0;
+  // Bonus: pattern so far is a (case-insensitive) prefix of the word.
+  if (P == W) // We can't skip pattern characters, so we must have matched all.
+    ++S;
+  // Bonus: case matches, or a Head in the pattern aligns with one in the word.
+  if ((Pat[P] == Word[W] && CaseSensitive) ||
+      (PatRole[P] == Head && WordRole[W] == Head))
+    ++S;
+  // Penalty: matching inside a segment (and previous char wasn't matched).
+  if (WordRole[W] == Tail && P && Last == Miss)
+    S -= 3;
+  // Penalty: a Head in the pattern matches in the middle of a word segment.
+  if (PatRole[P] == Head && WordRole[W] == Tail)
+    --S;
+  // Penalty: matching the first pattern character in the middle of a segment.
+  if (P == 0 && WordRole[W] == Tail)
+    S -= 4;
+  assert(S <= PerfectBonus);
+  return S;
+}
+
+llvm::SmallString<256> FuzzyMatcher::dumpLast(llvm::raw_ostream &OS) const {
+  llvm::SmallString<256> Result;
+  OS << "=== Match \"" << StringRef(Word, WordN) << "\" against ["
+     << StringRef(Pat, PatN) << "] ===\n";
+  if (!WordContainsPattern) {
+    OS << "Substring check failed.\n";
+    return Result;
+  } else if (isAwful(std::max(Scores[PatN][WordN][Match].Score,
+                              Scores[PatN][WordN][Miss].Score))) {
+    OS << "Substring check passed, but all matches are forbidden\n";
+  }
+  if (!CaseSensitive)
+    OS << "Lowercase query, so scoring ignores case\n";
+
+  // Traverse Matched table backwards to reconstruct the Pattern/Word mapping.
+  // The Score table has cumulative scores, subtracting along this path gives
+  // us the per-letter scores.
+  Action Last =
+      (Scores[PatN][WordN][Match].Score > Scores[PatN][WordN][Miss].Score)
+          ? Match
+          : Miss;
+  int S[MaxWord];
+  Action A[MaxWord];
+  for (int W = WordN - 1, P = PatN - 1; W >= 0; --W) {
+    A[W] = Last;
+    const auto &Cell = Scores[P + 1][W + 1][Last];
+    if (Last == Match)
+      --P;
+    const auto &Prev = Scores[P + 1][W][Cell.Prev];
+    S[W] = Cell.Score - Prev.Score;
+    Last = Cell.Prev;
+  }
+  for (int I = 0; I < WordN; ++I) {
+    if (A[I] == Match && (I == 0 || A[I - 1] == Miss))
+      Result.push_back('[');
+    if (A[I] == Miss && I > 0 && A[I - 1] == Match)
+      Result.push_back(']');
+    Result.push_back(Word[I]);
+  }
+  if (A[WordN - 1] == Match)
+    Result.push_back(']');
+
+  for (char C : StringRef(Word, WordN))
+    OS << " " << C << " ";
+  OS << "\n";
+  for (int I = 0, J = 0; I < WordN; I++)
+    OS << " " << (A[I] == Match ? Pat[J++] : ' ') << " ";
+  OS << "\n";
+  for (int I = 0; I < WordN; I++)
+    OS << format("%2d ", S[I]);
+  OS << "\n";
+
+  OS << "\nSegmentation:";
+  OS << "\n'" << StringRef(Word, WordN) << "'\n ";
+  for (int I = 0; I < WordN; ++I)
+    OS << "?-+ "[static_cast<int>(WordRole[I])];
+  OS << "\n[" << StringRef(Pat, PatN) << "]\n ";
+  for (int I = 0; I < PatN; ++I)
+    OS << "?-+ "[static_cast<int>(PatRole[I])];
+  OS << "\n";
+
+  OS << "\nScoring table (last-Miss, last-Match):\n";
+  OS << " |    ";
+  for (char C : StringRef(Word, WordN))
+    OS << "  " << C << " ";
+  OS << "\n";
+  OS << "-+----" << std::string(WordN * 4, '-') << "\n";
+  for (int I = 0; I <= PatN; ++I) {
+    for (Action A : {Miss, Match}) {
+      OS << ((I && A == Miss) ? Pat[I - 1] : ' ') << "|";
+      for (int J = 0; J <= WordN; ++J) {
+        if (!isAwful(Scores[I][J][A].Score))
+          OS << format("%3d%c", Scores[I][J][A].Score,
+                       Scores[I][J][A].Prev == Match ? '*' : ' ');
+        else
+          OS << "    ";
+      }
+      OS << "\n";
+    }
+  }
+
+  return Result;
+}
Index: clangd/CMakeLists.txt
===================================================================
--- clangd/CMakeLists.txt
+++ clangd/CMakeLists.txt
@@ -8,6 +8,7 @@
   ClangdUnit.cpp
   ClangdUnitStore.cpp
   DraftStore.cpp
+  FuzzyMatch.cpp
   GlobalCompilationDatabase.cpp
   JSONRPCDispatcher.cpp
   JSONExpr.cpp

_______________________________________________
cfe-commits mailing list
cfe-commits@lists.llvm.org
http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

[PATCH] D40060: [clangd] Fuzzy match scorer

Reply via email to