sammccall updated this revision to Diff 122956.
sammccall added a comment.

clang-format


https://reviews.llvm.org/D40060

Files:
  clangd/CMakeLists.txt
  clangd/FuzzyMatch.cpp
  clangd/FuzzyMatch.h
  unittests/clangd/CMakeLists.txt
  unittests/clangd/FuzzyMatchTests.cpp

Index: unittests/clangd/FuzzyMatchTests.cpp
===================================================================
--- /dev/null
+++ unittests/clangd/FuzzyMatchTests.cpp
@@ -0,0 +1,130 @@
+//===-- FuzzyMatchTests.cpp - String fuzzy matcher tests --------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "FuzzyMatch.h"
+
+#include "llvm/ADT/StringExtras.h"
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+
+namespace clang {
+namespace clangd {
+namespace {
+using namespace llvm;
+using testing::Not;
+
+struct MatchesMatcher : public testing::MatcherInterface<StringRef> {
+  StringRef Candidate;
+  MatchesMatcher(StringRef Candidate) : Candidate(Candidate) {}
+
+  void DescribeTo(::std::ostream *OS) const override {
+    *OS << "Matches '" << Candidate.str() << "'";
+  }
+
+  bool MatchAndExplain(StringRef Pattern,
+                       testing::MatchResultListener *L) const override {
+    std::unique_ptr<raw_ostream> OS(
+        L->stream() ? (raw_ostream *)(new raw_os_ostream(*L->stream()))
+                    : new raw_null_ostream());
+    FuzzyMatcher Matcher(Pattern);
+    auto Result = Matcher.match(Candidate);
+    Matcher.dumpLast(*OS << "\n");
+    return !!Result;
+  }
+};
+
+// Accepts patterns that match a given word.
+// Dumps the debug tables on match failure.
+testing::Matcher<StringRef> matches(StringRef Word) {
+  return testing::MakeMatcher<StringRef>(new MatchesMatcher(Word));
+}
+
+TEST(FuzzyMatch, Matches) {
+  EXPECT_THAT("u_p", matches("unique_ptr"));
+  EXPECT_THAT("up", matches("unique_ptr"));
+  EXPECT_THAT("uq", matches("unique_ptr"));
+  EXPECT_THAT("qp", Not(matches("unique_ptr")));
+  EXPECT_THAT("log", Not(matches("SVGFEMorphologyElement")));
+}
+
+struct RankMatcher : public testing::MatcherInterface<StringRef> {
+  std::vector<StringRef> RankedStrings;
+  RankMatcher(std::initializer_list<StringRef> RankedStrings)
+      : RankedStrings(RankedStrings) {}
+
+  void DescribeTo(::std::ostream *OS) const override {
+    *OS << "Ranks strings in order: ["
+        << join(RankedStrings.begin(), RankedStrings.end(), ", ") << "]";
+  }
+
+  bool MatchAndExplain(StringRef Pattern,
+                       testing::MatchResultListener *L) const override {
+    std::unique_ptr<raw_ostream> OS(
+        L->stream() ? (raw_ostream *)(new raw_os_ostream(*L->stream()))
+                    : new raw_null_ostream());
+    FuzzyMatcher Matcher(Pattern);
+    StringRef LastString;
+    Optional<float> LastScore;
+    bool Ok = true;
+    for (StringRef Str : RankedStrings) {
+      auto Score = Matcher.match(Str);
+      if (!Score) {
+        *OS << "\nDoesn't match '" << Str.str() << "'";
+        Matcher.dumpLast(*OS << "\n");
+        Ok = false;
+      } else if (LastScore && *LastScore < *Score) {
+        *OS << "\nRanks '" << Str.str() << "'=" << *Score << " above '"
+            << LastString.str() << "'=" << *LastScore;
+        Matcher.dumpLast(*OS << "\n");
+        Matcher.match(LastString);
+        Matcher.dumpLast(*OS << "\n");
+        Ok = false;
+      }
+      LastString = Str;
+      LastScore = Score;
+    }
+    return Ok;
+  }
+};
+
+// Accepts patterns that match all the strings and rank them in the given order.
+// Dumps the debug tables on match failure.
+template <typename... T> testing::Matcher<StringRef> ranks(T... RankedStrings) {
+  return testing::MakeMatcher<StringRef>(new RankMatcher{RankedStrings...});
+}
+
+TEST(FuzzyMatch, Ranking) {
+  EXPECT_THAT("eb", ranks("emplace_back", "embed"));
+  EXPECT_THAT("cons", ranks("console", "Console", "ArrayBufferConstructor"));
+  EXPECT_THAT("foo", ranks("foo", "Foo"));
+  EXPECT_THAT("onMess", ranks("onMessage", "onmessage", "onThisMegaEscapes"));
+  EXPECT_THAT("CC", ranks("CamelCase", "camelCase"));
+  EXPECT_THAT("cC", ranks("camelCase", "CamelCase"));
+  EXPECT_THAT("p", ranks("parse", "posix", "pafdsa", "path", "p"));
+  EXPECT_THAT("pa", ranks("parse", "path", "pafdsa"));
+  EXPECT_THAT("log", ranks("log", "ScrollLogicalPosition"));
+  EXPECT_THAT("e", ranks("else", "AbstractElement"));
+  EXPECT_THAT("workbench.sideb",
+              ranks("workbench.sideBar.location",
+                    "workbench.editor.defaultSideBySideLayout"));
+  EXPECT_THAT("editor.r", ranks("editor.renderControlCharacter",
+                                "editor.overviewRulerlanes",
+                                "diffEditor.renderSideBySide"));
+  EXPECT_THAT("-mo", ranks("-moz-columns", "-ms-ime-mode"));
+  EXPECT_THAT("convertModelPosition",
+              ranks("convertModelPositionToViewPosition",
+                    "convertViewToModelPosition"));
+  EXPECT_THAT("is", ranks("isValidViewletId", "import statement"));
+  EXPECT_THAT("title", ranks("window.title", "files.trimTrailingWhitespace"));
+  EXPECT_THAT("strcpy", ranks("strcpy", "strcpy_s", "strncpy"));
+}
+
+} // namespace
+} // namespace clangd
+} // namespace clang
Index: unittests/clangd/CMakeLists.txt
===================================================================
--- unittests/clangd/CMakeLists.txt
+++ unittests/clangd/CMakeLists.txt
@@ -10,6 +10,7 @@
 
 add_extra_unittest(ClangdTests
   ClangdTests.cpp
+  FuzzyMatchTests.cpp
   JSONExprTests.cpp
   TraceTests.cpp
   )
Index: clangd/FuzzyMatch.h
===================================================================
--- /dev/null
+++ clangd/FuzzyMatch.h
@@ -0,0 +1,64 @@
+//===--- FuzzyMatch.h - Approximate identifier matching  ---------*- C++-*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements fuzzy-matching of strings against identifiers.
+// It indicates both the existence and quality of a match:
+// 'eb' matches both 'emplace_back' and 'embed', the former has a better score.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANGD_FUZZYMATCH_H
+#define LLVM_CLANG_TOOLS_EXTRA_CLANGD_FUZZYMATCH_H
+
+#include "llvm/ADT/Optional.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Support/raw_ostream.h"
+
+namespace clang {
+namespace clangd {
+
+class FuzzyMatcher {
+public:
+  FuzzyMatcher(llvm::StringRef Pattern);
+
+  // If Word matches the pattern, return a score in [0,1] (higher is better).
+  llvm::Optional<float> match(llvm::StringRef Word);
+
+  // Dump internal state from the last match() to the stream, for debugging.
+  void dumpLast(llvm::raw_ostream &) const;
+
+private:
+  constexpr static int MaxPat = 63;
+  constexpr static int MaxWord = 127;
+  enum CharRole { Unknown, Tail, Head, Separator };
+
+  bool init(llvm::StringRef Word);
+  void buildGraph();
+  void calculateRoles(const char *Text, CharRole *Out, int N);
+  int skipPenalty(int W);
+  int matchBonus(int P, int W);
+
+  char Pat[MaxPat];
+  char LPat[MaxPat];
+  int NPat;
+  char Word[MaxWord];
+  char LWord[MaxWord];
+  int NWord;
+  CharRole PatRole[MaxPat];
+  CharRole WordRole[MaxWord];
+  int Score[MaxPat + 1][MaxWord + 1]; // Oversized for alignment.
+  bool MatchGraph[MaxPat + 1][MaxWord + 1];
+  bool IsSubstring;
+  float ScoreScale;
+};
+
+} // namespace clangd
+} // namespace clang
+
+#endif
Index: clangd/FuzzyMatch.cpp
===================================================================
--- /dev/null
+++ clangd/FuzzyMatch.cpp
@@ -0,0 +1,287 @@
+//===--- FuzzyMatch.h - Approximate identifier matching  ---------*- C++-*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// To check for a match between a Pattern ('u_p') and a Word ('unique_ptr'),
+// we consider the possible partial match states:
+//
+//     u n i q u e _ p t r
+//   +---------------------
+//   |A . . . . . . . . . .
+//  u|
+//   |. . . . . . . . . . .
+//  _|
+//   |. . . . . . . O . . .
+//  p|
+//   |. . . . . . . . . . B
+//
+// Each dot represents some prefix of the pattern being matched against some
+// prefix of the word.
+//   - A is the initial state: '' matched against ''
+//   - O is an intermediate state: 'u_' matched against 'unique_'
+//   - B is the target state: 'u_p' matched against 'unique_ptr'
+//
+// We aim to find the best path from A->B.
+//  - Moving right (consuming a word character)
+//    Always legal: not all word characters must match.
+//  - Moving diagonally (consuming both a word and pattern character)
+//    Legal if the characters match.
+//  - Moving down (consuming a pattern character) is never legal.
+//    Never legal: all pattern characters must match something.
+//
+// The scoring is based on heuristics:
+//  - when matching a character, apply a bonus or penalty depending on the
+//    match quality (does case match, do word segments align, etc)
+//  - when skipping a character, apply a penalty if it starts a word segment
+//  - if the first pattern character matches the middle of a word segment, then
+//    apply such a huge penalty that this match is never used.
+//
+// We treat strings as byte-sequences, so only ASCII has first-class support.
+//
+// This algorithm is inspired by VS code's client-side filtering.
+//
+//===----------------------------------------------------------------------===//
+
+#include "FuzzyMatch.h"
+#include "llvm/ADT/Optional.h"
+#include "llvm/Support/Format.h"
+
+using namespace clang::clangd;
+using namespace llvm;
+
+const int FuzzyMatcher::MaxPat;
+const int FuzzyMatcher::MaxWord;
+
+static char lower(char C) { return C >= 'A' && C <= 'Z' ? C + ('a' - 'A') : C; }
+// A "negative infinity" score that won't overflow.
+// We use this to mark unreachable states and forbidden solutions.
+static constexpr int AwfulScore = std::numeric_limits<int>::min() / 2;
+static bool isAwful(int S) { return S < AwfulScore / 2; }
+
+FuzzyMatcher::FuzzyMatcher(StringRef Pattern)
+    : NPat(std::min<int>(MaxPat, Pattern.size())), NWord(0),
+      ScoreScale(0.5f / NPat) {
+  memcpy(Pat, Pattern.data(), NPat);
+  for (int I = 0; I < NPat; ++I)
+    LPat[I] = lower(Pat[I]);
+  Score[0][0] = 0;
+  for (int P = 0; P <= NPat; ++P)
+    for (int W = 0; W < P; ++W)
+      Score[P][W] = AwfulScore;
+  calculateRoles(Pat, PatRole, NPat);
+}
+
+Optional<float> FuzzyMatcher::match(StringRef Word) {
+  if (!NPat)
+    return 1;
+  if (!(IsSubstring = init(Word)))
+    return None;
+  buildGraph();
+  if (isAwful(Score[NPat][NWord]))
+    return None;
+  return ScoreScale * std::min(2 * NPat, std::max(0, Score[NPat][NWord]));
+}
+
+// Segmentation of words and patterns
+// We mark each character as the Head or Tail of a segment, or a Separator.
+// e.g. XMLHttpRequest_Async
+//      +--+---+------ +----
+//
+// This happens in two stages:
+//   1. We classify chars into Types:
+//        Upper is uppercase letters
+//        Lower is lowercase letters, and also numbers
+//        Punctuation is most other things
+//   2. A characters segment role can be determined by the Types of itself and
+//      its neighbors. e.g. the middle entry in (Upper, Upper, Lower) is a Head.
+//      The Empty type is used to represent the start/end of string.
+//
+// Both of these stages are performed using lookup tables.
+// Type and Role can each be represented into 2 bits, so we pack 4 into a byte.
+namespace {
+enum CharType { Empty, Lower, Upper, Punctuation };
+// 4 packed CharTypes per entry, indexed by char.
+constexpr uint8_t CharTypes[] = {
+    0x00, 0x00, 0x00, 0x00, // Control characters
+    0x00, 0x00, 0x00, 0x00, // Control characters
+    0xff, 0xff, 0xff, 0xff, // Punctuation
+    0x55, 0x55, 0xf5, 0xff, // Numbers->Lower, more Punctuation.
+    0xab, 0xaa, 0xaa, 0xaa, // @ and A-O
+    0xaa, 0xaa, 0xea, 0xff, // P-Z, more Punctuation.
+    0x57, 0x55, 0x55, 0x55, // ` and a-o
+    0x55, 0x55, 0xd5, 0x3f, // p-z, Punctuation, DEL.
+    0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, // Bytes over 127 -> Lower.
+    0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, // This includes UTF-8.
+    0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55,
+    0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55,
+};
+// 4 packed CharRoles per entry, indexed by a packed triple of CharTypes:
+// (previous, current, next).
+constexpr uint8_t CharRoles[] = {
+    0x00, 0xaa, 0xaa, 0xff, // At start of word, Lower|Upper -> Head
+    0x00, 0x55, 0xaa, 0xff, // After Lower, Upper->Head, Lower->Tail
+    0x00, 0x55, 0x59, 0xff, // After Upper, Upper->Tail, Lower->Tail
+                            // Exception: [Upper] Upper [Lower]->Head
+    0x00, 0xaa, 0xaa, 0xff, // After Separator, Lower|Upper -> Head
+};
+template <typename T> T PackedLookup(const uint8_t *Data, int I) {
+  return static_cast<T>((Data[I >> 2] >> ((I & 3) * 2)) & 3);
+}
+} // namespace
+void FuzzyMatcher::calculateRoles(const char *Text, CharRole *Out, int N) {
+  int Types = PackedLookup<CharType>(CharTypes, Text[0]);
+  auto Rotate = [&](CharType T) { Types = ((Types << 2) | T) & 0x3f; };
+  for (int I = 0; I < N - 1; ++I) {
+    Rotate(PackedLookup<CharType>(CharTypes, Text[I + 1]));
+    *Out++ = PackedLookup<CharRole>(CharRoles, Types);
+  }
+  Rotate(Empty);
+  *Out++ = PackedLookup<CharRole>(CharRoles, Types);
+}
+
+// Sets up the data structures matching Word.
+// Returns false if we can cheaply determine that no match is possible.
+bool FuzzyMatcher::init(StringRef NewWord) {
+  NWord = std::min<int>(MaxWord, NewWord.size());
+  if (NPat > NWord)
+    return false;
+  memcpy(Word, NewWord.data(), NWord);
+  for (int I = 0; I < NWord; ++I)
+    LWord[I] = lower(Word[I]);
+
+  // Cheap subsequence check.
+  for (int W = 0, P = 0; P != NPat; ++W) {
+    if (W == NWord)
+      return false;
+    if (LWord[W] == LPat[P])
+      ++P;
+  }
+
+  calculateRoles(Word, WordRole, NWord);
+  return true;
+}
+
+// The forwards pass finds the mappings of Pattern onto Word.
+// Score = best score achieved matching Word[..W] against Pat[..P].
+// Unlike other tables, indices range from 0 to N *inclusive*
+// Matched = whether we chose to match Word[W] with Pat[P] or not.
+void FuzzyMatcher::buildGraph() {
+  for (int W = 0; W < NWord; ++W)
+    Score[0][W + 1] = Score[0][W] - skipPenalty(W);
+  for (int P = 0; P < NPat; ++P) {
+    for (int W = P; W < NWord; ++W) {
+      int Left = Score[P + 1][W];
+      if (P < NPat - 1)
+        Left -= skipPenalty(W);
+      if (LPat[P] == LWord[W]) {
+        int Diag = Score[P][W] + matchBonus(P, W);
+        if (Diag >= Left) {
+          Left = Score[P + 1][W + 1] = Diag;
+          Matched[P][W] = true;
+          continue;
+        }
+      }
+      Score[P + 1][W + 1] = Left;
+      Matched[P][W] = false;
+    }
+  }
+}
+
+int FuzzyMatcher::skipPenalty(int W) {
+  // Penalty: skipped a segment.
+  if (WordRole[W] == Head)
+    return 2;
+  return 0;
+}
+
+int FuzzyMatcher::matchBonus(int P, int W) {
+  // Forbidden: matching the first pattern character in the middle of a segment.
+  if (!P && WordRole[W] == Tail)
+    return AwfulScore;
+  int S = 0;
+  // Bonus: pattern is part of a word prefix.
+  if (P == W)
+    ++S;
+  // Bonus: case matches, or an asserted word break matches an actual.
+  if (Pat[P] == Word[W] || (PatRole[P] == Head && WordRole[W] == Head))
+    ++S;
+  // Penalty: matching inside a word where the previous didn't match.
+  if (WordRole[W] == Tail && P && !Matched[P - 1][W - 1])
+    S -= 3;
+  // Penalty: asserted word break but no actual.
+  if (PatRole[P] == Head && WordRole[W] == Tail)
+    --S;
+  return S;
+}
+
+void FuzzyMatcher::dumpLast(llvm::raw_ostream &OS) const {
+  OS << "=== Match \"" << StringRef(Word, NWord) << "\" against ["
+     << StringRef(Pat, NPat) << "] ===\n";
+  if (!IsSubstring) {
+    OS << "Substring check failed.\n";
+    return;
+  }
+
+  // Traverse Matched table backwards to reconstruct the Pattern/Word mapping.
+  // The Score table has cumulative scores, subtracting along this path gives
+  // us the per-letter scores.
+  int S[MaxWord];
+  bool M[MaxWord] = {};
+  for (int W = NWord - 1, P = NPat - 1; W >= 0; --W) {
+    if (P >= 0 && Matched[P][W]) {
+      S[W] = Score[P + 1][W + 1] - Score[P][W];
+      --P;
+      M[W] = true;
+    } else
+      S[W] = Score[P + 1][W + 1] - Score[P + 1][W];
+  }
+  for (char C : StringRef(Word, NWord))
+    OS << " " << C << " ";
+  OS << "\n";
+  for (int I = 0, J = 0; I < NWord; I++)
+    OS << " " << (M[I] ? Pat[J++] : ' ') << " ";
+  OS << "\n";
+  for (int I = 0; I < NWord; I++)
+    OS << format("%2d ", S[I]);
+  OS << "\n";
+
+  OS << "\nSegmentation:";
+  OS << "\n'" << StringRef(Word, NWord) << "'\n ";
+  for (int I = 0; I < NWord; ++I)
+    OS << "?-+ "[static_cast<int>(WordRole[I])];
+  OS << "\n[" << StringRef(Pat, NPat) << "]\n ";
+  for (int I = 0; I < NPat; ++I)
+    OS << "?-+ "[static_cast<int>(PatRole[I])];
+  OS << "\n";
+
+  OS << "\nScoring table:\n";
+  OS << " |    ";
+  for (char C : StringRef(Word, NWord))
+    OS << "  " << C << " ";
+  OS << "\n";
+  OS << "-+----" << std::string(NWord * 4, '-') << "\n";
+  for (int I = 0; I <= NPat; ++I) {
+    OS << (I ? Pat[I - 1] : ' ') << "|";
+    for (int J = 0; J <= NWord; ++J)
+      if (isAwful(Score[I][J]))
+        OS << format("%3d ", Score[I][J]);
+      else
+        OS << "    ";
+    OS << "\n";
+  }
+
+  OS << "\nMatch graph:\n";
+  OS << " |" << StringRef(Word, NWord) << "\n";
+  OS << "-+" << std::string(NWord, '-') << "\n";
+  for (int I = 0; I < NPat; ++I) {
+    OS << Pat[I] << "|";
+    for (int J = 0; J < NWord; ++J)
+      OS << " 1"[static_cast<int>(Matched[I][J])];
+    OS << "\n";
+  }
+}
Index: clangd/CMakeLists.txt
===================================================================
--- clangd/CMakeLists.txt
+++ clangd/CMakeLists.txt
@@ -8,6 +8,7 @@
   ClangdUnit.cpp
   ClangdUnitStore.cpp
   DraftStore.cpp
+  FuzzyMatch.cpp
   GlobalCompilationDatabase.cpp
   JSONRPCDispatcher.cpp
   JSONExpr.cpp
_______________________________________________
cfe-commits mailing list
cfe-commits@lists.llvm.org
http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

Reply via email to