sammccall updated this revision to Diff 124096. sammccall marked 15 inline comments as done. sammccall added a comment.
Addressing review comments and generally improving comments. https://reviews.llvm.org/D40060 Files: clangd/CMakeLists.txt clangd/FuzzyMatch.cpp clangd/FuzzyMatch.h unittests/clangd/CMakeLists.txt unittests/clangd/FuzzyMatchTests.cpp
Index: unittests/clangd/FuzzyMatchTests.cpp =================================================================== --- /dev/null +++ unittests/clangd/FuzzyMatchTests.cpp @@ -0,0 +1,130 @@ +//===-- FuzzyMatchTests.cpp - String fuzzy matcher tests --------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#include "FuzzyMatch.h" + +#include "llvm/ADT/StringExtras.h" +#include "gmock/gmock.h" +#include "gtest/gtest.h" + +namespace clang { +namespace clangd { +namespace { +using namespace llvm; +using testing::Not; + +struct MatchesMatcher : public testing::MatcherInterface<StringRef> { + StringRef Candidate; + MatchesMatcher(StringRef Candidate) : Candidate(Candidate) {} + + void DescribeTo(::std::ostream *OS) const override { + *OS << "Matches '" << Candidate.str() << "'"; + } + + bool MatchAndExplain(StringRef Pattern, + testing::MatchResultListener *L) const override { + std::unique_ptr<raw_ostream> OS( + L->stream() ? (raw_ostream *)(new raw_os_ostream(*L->stream())) + : new raw_null_ostream()); + FuzzyMatcher Matcher(Pattern); + auto Result = Matcher.match(Candidate); + Matcher.dumpLast(*OS << "\n"); + return !!Result; + } +}; + +// Accepts patterns that match a given word. +// Dumps the debug tables on match failure. +testing::Matcher<StringRef> matches(StringRef Word) { + return testing::MakeMatcher<StringRef>(new MatchesMatcher(Word)); +} + +TEST(FuzzyMatch, Matches) { + EXPECT_THAT("u_p", matches("unique_ptr")); + EXPECT_THAT("up", matches("unique_ptr")); + EXPECT_THAT("uq", matches("unique_ptr")); + EXPECT_THAT("qp", Not(matches("unique_ptr"))); + EXPECT_THAT("log", Not(matches("SVGFEMorphologyElement"))); +} + +struct RankMatcher : public testing::MatcherInterface<StringRef> { + std::vector<StringRef> RankedStrings; + RankMatcher(std::initializer_list<StringRef> RankedStrings) + : RankedStrings(RankedStrings) {} + + void DescribeTo(::std::ostream *OS) const override { + *OS << "Ranks strings in order: [" + << join(RankedStrings.begin(), RankedStrings.end(), ", ") << "]"; + } + + bool MatchAndExplain(StringRef Pattern, + testing::MatchResultListener *L) const override { + std::unique_ptr<raw_ostream> OS( + L->stream() ? (raw_ostream *)(new raw_os_ostream(*L->stream())) + : new raw_null_ostream()); + FuzzyMatcher Matcher(Pattern); + StringRef LastString; + Optional<float> LastScore; + bool Ok = true; + for (StringRef Str : RankedStrings) { + auto Score = Matcher.match(Str); + if (!Score) { + *OS << "\nDoesn't match '" << Str.str() << "'"; + Matcher.dumpLast(*OS << "\n"); + Ok = false; + } else if (LastScore && *LastScore < *Score) { + *OS << "\nRanks '" << Str.str() << "'=" << *Score << " above '" + << LastString.str() << "'=" << *LastScore; + Matcher.dumpLast(*OS << "\n"); + Matcher.match(LastString); + Matcher.dumpLast(*OS << "\n"); + Ok = false; + } + LastString = Str; + LastScore = Score; + } + return Ok; + } +}; + +// Accepts patterns that match all the strings and rank them in the given order. +// Dumps the debug tables on match failure. +template <typename... T> testing::Matcher<StringRef> ranks(T... RankedStrings) { + return testing::MakeMatcher<StringRef>(new RankMatcher{RankedStrings...}); +} + +TEST(FuzzyMatch, Ranking) { + EXPECT_THAT("eb", ranks("emplace_back", "embed")); + EXPECT_THAT("cons", ranks("console", "Console", "ArrayBufferConstructor")); + EXPECT_THAT("foo", ranks("foo", "Foo")); + EXPECT_THAT("onMess", ranks("onMessage", "onmessage", "onThisMegaEscapes")); + EXPECT_THAT("CC", ranks("CamelCase", "camelCase")); + EXPECT_THAT("cC", ranks("camelCase", "CamelCase")); + EXPECT_THAT("p", ranks("parse", "posix", "pafdsa", "path", "p")); + EXPECT_THAT("pa", ranks("parse", "path", "pafdsa")); + EXPECT_THAT("log", ranks("log", "ScrollLogicalPosition")); + EXPECT_THAT("e", ranks("else", "AbstractElement")); + EXPECT_THAT("workbench.sideb", + ranks("workbench.sideBar.location", + "workbench.editor.defaultSideBySideLayout")); + EXPECT_THAT("editor.r", ranks("editor.renderControlCharacter", + "editor.overviewRulerlanes", + "diffEditor.renderSideBySide")); + EXPECT_THAT("-mo", ranks("-moz-columns", "-ms-ime-mode")); + EXPECT_THAT("convertModelPosition", + ranks("convertModelPositionToViewPosition", + "convertViewToModelPosition")); + EXPECT_THAT("is", ranks("isValidViewletId", "import statement")); + EXPECT_THAT("title", ranks("window.title", "files.trimTrailingWhitespace")); + EXPECT_THAT("strcpy", ranks("strcpy", "strcpy_s", "strncpy")); +} + +} // namespace +} // namespace clangd +} // namespace clang Index: unittests/clangd/CMakeLists.txt =================================================================== --- unittests/clangd/CMakeLists.txt +++ unittests/clangd/CMakeLists.txt @@ -10,6 +10,7 @@ add_extra_unittest(ClangdTests ClangdTests.cpp + FuzzyMatchTests.cpp JSONExprTests.cpp TraceTests.cpp ) Index: clangd/FuzzyMatch.h =================================================================== --- /dev/null +++ clangd/FuzzyMatch.h @@ -0,0 +1,67 @@ +//===--- FuzzyMatch.h - Approximate identifier matching ---------*- C++-*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements fuzzy-matching of strings against identifiers. +// It indicates both the existence and quality of a match: +// 'eb' matches both 'emplace_back' and 'embed', the former has a better score. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANGD_FUZZYMATCH_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANGD_FUZZYMATCH_H + +#include "llvm/ADT/Optional.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/Support/raw_ostream.h" + +namespace clang { +namespace clangd { +namespace detail { using ScoreT = int_least16_t; } + +// A matcher capable of matching and scoring strings against a single pattern. +// It's optimized for matching against many strings - match() does not allocate. +class FuzzyMatcher { +public: + // Characters beyond MaxPat are ignored. + FuzzyMatcher(llvm::StringRef Pattern); + + // If Word matches the pattern, return a score in [0,1] (higher is better). + // Characters beyond MaxWord are ignored. + llvm::Optional<float> match(llvm::StringRef Word); + + // Dump internal state from the last match() to the stream, for debugging. + void dumpLast(llvm::raw_ostream &) const; + +private: + // We truncate the pattern and the word to bound the cost of matching. + constexpr static int MaxPat = 63, MaxWord = 127; + enum CharRole : char; // For segmentation. + enum CharType : char; // For segmentation. + + bool init(llvm::StringRef Word); + void buildGraph(); + void calculateRoles(const char *Text, CharRole *Out, int N); + detail::ScoreT skipPenalty(int W); + detail::ScoreT matchBonus(int P, int W); + + int PatN, WordN; // Length of pattern and word. + char Pat[MaxPat], Word[MaxWord]; // Raw pattern and word data. + char LowPat[MaxPat], LowWord[MaxWord]; // Lowercase. + CharRole PatRole[MaxPat], WordRole[MaxWord]; // Segmentation info. + detail::ScoreT Score[MaxPat + 1][MaxWord + 1]; // Cumulative best-match. + // Whether the last char was matched to produce the best-match. + bool Matched[MaxPat + 1][MaxWord + 1]; // Oversized by one for nice alignment. + float ScoreScale; // Conversion from integer to normalized score. + bool IsSubstring; // Retained for dumpLast() debugging only. +}; + +} // namespace clangd +} // namespace clang + +#endif Index: clangd/FuzzyMatch.cpp =================================================================== --- /dev/null +++ clangd/FuzzyMatch.cpp @@ -0,0 +1,328 @@ +//===--- FuzzyMatch.h - Approximate identifier matching ---------*- C++-*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// To check for a match between a Pattern ('u_p') and a Word ('unique_ptr'), +// we consider the possible partial match states: +// +// u n i q u e _ p t r +// +--------------------- +// |A . . . . . . . . . . +// u| +// |. . . . . . . . . . . +// _| +// |. . . . . . . O . . . +// p| +// |. . . . . . . . . . B +// +// Each dot represents some prefix of the pattern being matched against some +// prefix of the word. +// - A is the initial state: '' matched against '' +// - O is an intermediate state: 'u_' matched against 'unique_' +// - B is the target state: 'u_p' matched against 'unique_ptr' +// +// We aim to find the best path from A->B. +// - Moving right (consuming a word character) +// Always legal: not all word characters must match. +// - Moving diagonally (consuming both a word and pattern character) +// Legal if the characters match. +// - Moving down (consuming a pattern character) is never legal. +// Never legal: all pattern characters must match something. +// +// The scoring is based on heuristics: +// - when matching a character, apply a bonus or penalty depending on the +// match quality (does case match, do word segments align, etc) +// - when skipping a character, apply a penalty if it starts a word segment +// - if the first pattern character matches the middle of a word segment, then +// apply such a huge penalty that this match is never used. +// +// We treat strings as byte-sequences, so only ASCII has first-class support. +// +// This algorithm is inspired by VS code's client-side filtering. +// +//===----------------------------------------------------------------------===// + +#include "FuzzyMatch.h" +#include "llvm/ADT/Optional.h" +#include "llvm/Support/Format.h" + +using namespace llvm; +using namespace clang::clangd; +using clang::clangd::detail::ScoreT; + +const int FuzzyMatcher::MaxPat; +const int FuzzyMatcher::MaxWord; + +static char lower(char C) { return C >= 'A' && C <= 'Z' ? C + ('a' - 'A') : C; } +// A "negative infinity" score that won't overflow. +// We use this to mark unreachable states and forbidden solutions. +static constexpr ScoreT AwfulScore = std::numeric_limits<ScoreT>::min() / 2; +static bool isAwful(ScoreT S) { return S < AwfulScore / 2; } +static constexpr int PerfectBonus = 2; // Perfect per-pattern-char score. + +FuzzyMatcher::FuzzyMatcher(StringRef Pattern) + : PatN(std::min<int>(MaxPat, Pattern.size())), WordN(0), + ScoreScale(float{1} / (PerfectBonus * PatN)) { + memcpy(Pat, Pattern.data(), PatN); + for (int I = 0; I < PatN; ++I) + LowPat[I] = lower(Pat[I]); + Score[0][0] = 0; + for (int P = 0; P <= PatN; ++P) + for (int W = 0; W < P; ++W) + Score[P][W] = AwfulScore; + calculateRoles(Pat, PatRole, PatN); +} + +Optional<float> FuzzyMatcher::match(StringRef Word) { + if (!PatN) + return 1; + if (!(IsSubstring = init(Word))) + return None; + buildGraph(); + if (isAwful(Score[PatN][WordN])) + return None; + return ScoreScale * + std::min(PerfectBonus * PatN, std::max<int>(0, Score[PatN][WordN])); +} + +// Segmentation of words and patterns. +// A name like "fooBar_baz" consists of several parts foo, bar, baz. +// Aligning segmentation of word and pattern improves the fuzzy-match. +// For example: [lol] matches "LaughingOutLoud" better than "LionPopulation" +// +// First we classify each character into types (uppercase, lowercase, etc). +// Then we look at the sequence: e.g. [upper, lower] is the start of a segment. + +// We only distinguish the types of characters that affect segmentation. +// It's not obvious how to segment digits, we treat them as lowercase letters. +// As we don't decode UTF-8, we treat bytes over 127 as lowercase too. +// This means we require exact (case-sensitive) match. +enum FuzzyMatcher::CharType : char { + Empty = 0, // Before-the-start and after-the-end (and control chars). + Lower = 1, // Lowercase letters, digits, and non-ASCII bytes. + Upper = 2, // Uppercase letters. + Punctuation = 3, // ASCII punctuation (including Space) +}; + +// We get CharTypes from a lookup table. Each is 2 bits, 4 fit in each byte. +// The top 6 bits of the char select the byte, the bottom 2 select the offset. +// e.g. 'q' = 010100 01 = byte 28 (55), bits 3-2 (01) -> Lower. +constexpr static uint8_t CharTypes[] = { + 0x00, 0x00, 0x00, 0x00, // Control characters + 0x00, 0x00, 0x00, 0x00, // Control characters + 0xff, 0xff, 0xff, 0xff, // Punctuation + 0x55, 0x55, 0xf5, 0xff, // Numbers->Lower, more Punctuation. + 0xab, 0xaa, 0xaa, 0xaa, // @ and A-O + 0xaa, 0xaa, 0xea, 0xff, // P-Z, more Punctuation. + 0x57, 0x55, 0x55, 0x55, // ` and a-o + 0x55, 0x55, 0xd5, 0x3f, // p-z, Punctuation, DEL. + 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, // Bytes over 127 -> Lower. + 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, // (probably UTF-8). + 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, + 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, +}; + +// Each character's Role is the Head or Tail of a segment, or a Separator. +// e.g. XMLHttpRequest_Async +// +--+---+------ +---- +// ^Head ^Tail ^Separator +enum FuzzyMatcher::CharRole : char { + Unknown = 0, // Stray control characters or impossible states. + Tail = 1, // Part of a word segment, but not the first character. + Head = 2, // The first character of a word segment. + Separator = 3, // Punctuation characters that separate word segments. +}; + +// The Role can be determined from the Type of a character and its neighbors: +// +// Example | Chars | Type | Role +// ---------+--------------+----- +// F(o)oBar | Foo | Ull | Tail +// Foo(B)ar | oBa | lUl | Head +// (f)oo | ^fo | Ell | Head +// H(T)TP | HTT | UUU | Tail +// +// Our lookup table maps a 6 bit key (Prev, Curr, Next) to a 2-bit Role. +// A byte packs 4 Roles. (Prev, Curr) selects a byte, Next selects the offset. +// e.g. Lower, Upper, Lower -> 01 10 01 -> byte 6 (aa), bits 3-2 (10) -> Head. +constexpr static uint8_t CharRoles[] = { + // clang-format off + // Curr= Empty Lower Upper Separ + /* Prev=Empty */ 0x00, 0xaa, 0xaa, 0xff, // At start, Lower|Upper->Head + /* Prev=Lower */ 0x00, 0x55, 0xaa, 0xff, // In word, Upper->Head;Lower->Tail + /* Prev=Upper */ 0x00, 0x55, 0x59, 0xff, // Ditto, but U(U)U->Tail + /* Prev=Separ */ 0x00, 0xaa, 0xaa, 0xff, // After separator, like at start + // clang-format on +}; + +template <typename T> static T packedLookup(const uint8_t *Data, int I) { + return static_cast<T>((Data[I >> 2] >> ((I & 3) * 2)) & 3); +} +void FuzzyMatcher::calculateRoles(const char *Text, CharRole *Out, int N) { + // Types holds a sliding window of (Prev, Curr, Next) types. + // Initial value is (Empty, Empty, type of Text[0]). + int Types = packedLookup<CharType>(CharTypes, Text[0]); + // Rotate slides in the type of the next character. + auto Rotate = [&](CharType T) { Types = ((Types << 2) | T) & 0x3f; }; + for (int I = 0; I < N - 1; ++I) { + // For each character, rotate in the next, and look up the role. + Rotate(packedLookup<CharType>(CharTypes, Text[I + 1])); + *Out++ = packedLookup<CharRole>(CharRoles, Types); + } + // For the last character, the "next character" is Empty. + Rotate(Empty); + *Out++ = packedLookup<CharRole>(CharRoles, Types); +} + +// Sets up the data structures matching Word. +// Returns false if we can cheaply determine that no match is possible. +bool FuzzyMatcher::init(StringRef NewWord) { + WordN = std::min<int>(MaxWord, NewWord.size()); + if (PatN > WordN) + return false; + memcpy(Word, NewWord.data(), WordN); + for (int I = 0; I < WordN; ++I) + LowWord[I] = lower(Word[I]); + + // Cheap subsequence check. + for (int W = 0, P = 0; P != PatN; ++W) { + if (W == WordN) + return false; + if (LowWord[W] == LowPat[P]) + ++P; + } + + calculateRoles(Word, WordRole, WordN); + return true; +} + +// The forwards pass finds the mappings of Pattern onto Word. +// Score = best score achieved matching Word[..W] against Pat[..P]. +// Unlike other tables, indices range from 0 to N *inclusive* +// Matched = whether we chose to match Word[W] with Pat[P] or not. +// +// Points are mostly assigned to matched characters, with 1 being a good score +// and 2 being a great one. So we treat the score range as [0, 2 * PatN]. +// This range is not strict: we can apply larger bonuses/penalties, or penalize +// non-matched characters. +void FuzzyMatcher::buildGraph() { + for (int W = 0; W < WordN; ++W) + Score[0][W + 1] = Score[0][W] - skipPenalty(W); + for (int P = 0; P < PatN; ++P) { + for (int W = P; W < WordN; ++W) { + ScoreT Left = Score[P + 1][W]; // Score if not matching char W. + if (P < PatN - 1) // Skipping trailing characters is free. + Left -= skipPenalty(W); + if (LowPat[P] == LowWord[W]) { // Is a match possible? + ScoreT Diag = Score[P][W] + matchBonus(P, W); // Score if matching. + if (Diag >= Left) { // Is matching better than skipping? + Left = Score[P + 1][W + 1] = Diag; + Matched[P][W] = true; + continue; + } + } + Score[P + 1][W + 1] = Left; + Matched[P][W] = false; + } + } +} + +ScoreT FuzzyMatcher::skipPenalty(int W) { + if (WordRole[W] == Head) // Skipping a segment. + return 2; + return 0; +} + +ScoreT FuzzyMatcher::matchBonus(int P, int W) { + assert(LowPat[P] == LowWord[W]); + // Forbidden: matching the first pattern character in the middle of a segment. + if (!P && WordRole[W] == Tail) + return AwfulScore; + ScoreT S = 0; + // Bonus: pattern so far is a (case-insensitive) prefix of the word. + if (P == W) // We can't skip pattern characters, so we must have matched all. + ++S; + // Bonus: case matches, or a Head in the pattern aligns with one in the word. + if (Pat[P] == Word[W] || (PatRole[P] == Head && WordRole[W] == Head)) + ++S; + // Penalty: matching inside a segment (and previous char wasn't matched). + if (WordRole[W] == Tail && P && !Matched[P - 1][W - 1]) + S -= 3; + // Penalty: a Head in the pattern matches in the middle of a word segment. + if (PatRole[P] == Head && WordRole[W] == Tail) + --S; + assert(S <= PerfectBonus); + return S; +} + +void FuzzyMatcher::dumpLast(llvm::raw_ostream &OS) const { + OS << "=== Match \"" << StringRef(Word, WordN) << "\" against [" + << StringRef(Pat, PatN) << "] ===\n"; + if (!IsSubstring) { + OS << "Substring check failed.\n"; + return; + } + + // Traverse Matched table backwards to reconstruct the Pattern/Word mapping. + // The Score table has cumulative scores, subtracting along this path gives + // us the per-letter scores. + ScoreT S[MaxWord]; + bool M[MaxWord] = {}; + for (int W = WordN - 1, P = PatN - 1; W >= 0; --W) { + if (P >= 0 && Matched[P][W]) { + S[W] = Score[P + 1][W + 1] - Score[P][W]; + --P; + M[W] = true; + } else + S[W] = Score[P + 1][W + 1] - Score[P + 1][W]; + } + for (char C : StringRef(Word, WordN)) + OS << " " << C << " "; + OS << "\n"; + for (int I = 0, J = 0; I < WordN; I++) + OS << " " << (M[I] ? Pat[J++] : ' ') << " "; + OS << "\n"; + for (int I = 0; I < WordN; I++) + OS << format("%2d ", S[I]); + OS << "\n"; + + OS << "\nSegmentation:"; + OS << "\n'" << StringRef(Word, WordN) << "'\n "; + for (int I = 0; I < WordN; ++I) + OS << "?-+ "[static_cast<int>(WordRole[I])]; + OS << "\n[" << StringRef(Pat, PatN) << "]\n "; + for (int I = 0; I < PatN; ++I) + OS << "?-+ "[static_cast<int>(PatRole[I])]; + OS << "\n"; + + OS << "\nScoring table:\n"; + OS << " | "; + for (char C : StringRef(Word, WordN)) + OS << " " << C << " "; + OS << "\n"; + OS << "-+----" << std::string(WordN * 4, '-') << "\n"; + for (int I = 0; I <= PatN; ++I) { + OS << (I ? Pat[I - 1] : ' ') << "|"; + for (int J = 0; J <= WordN; ++J) + if (isAwful(Score[I][J])) + OS << format("%3d ", Score[I][J]); + else + OS << " "; + OS << "\n"; + } + + OS << "\nMatch graph:\n"; + OS << " |" << StringRef(Word, WordN) << "\n"; + OS << "-+" << std::string(WordN, '-') << "\n"; + for (int I = 0; I < PatN; ++I) { + OS << Pat[I] << "|"; + for (int J = 0; J < WordN; ++J) + OS << " 1"[static_cast<int>(Matched[I][J])]; + OS << "\n"; + } +} Index: clangd/CMakeLists.txt =================================================================== --- clangd/CMakeLists.txt +++ clangd/CMakeLists.txt @@ -8,6 +8,7 @@ ClangdUnit.cpp ClangdUnitStore.cpp DraftStore.cpp + FuzzyMatch.cpp GlobalCompilationDatabase.cpp JSONRPCDispatcher.cpp JSONExpr.cpp
_______________________________________________ cfe-commits mailing list cfe-commits@lists.llvm.org http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits