sammccall updated this revision to Diff 250730. sammccall added a comment. Fix '_' rules.
Repository: rG LLVM Github Monorepo CHANGES SINCE LAST ACTION https://reviews.llvm.org/D75687/new/ https://reviews.llvm.org/D75687 Files: clang-tools-extra/clangd/FormattedString.cpp clang-tools-extra/clangd/FormattedString.h clang-tools-extra/clangd/unittests/FormattedStringTests.cpp clang-tools-extra/clangd/unittests/HoverTests.cpp
Index: clang-tools-extra/clangd/unittests/HoverTests.cpp =================================================================== --- clang-tools-extra/clangd/unittests/HoverTests.cpp +++ clang-tools-extra/clangd/unittests/HoverTests.cpp @@ -1905,7 +1905,7 @@ llvm::StringRef ExpectedMarkdown = R"md(### variable `foo` --- -Value \= `val` +Value = `val` --- ```cpp Index: clang-tools-extra/clangd/unittests/FormattedStringTests.cpp =================================================================== --- clang-tools-extra/clangd/unittests/FormattedStringTests.cpp +++ clang-tools-extra/clangd/unittests/FormattedStringTests.cpp @@ -17,25 +17,96 @@ namespace markup { namespace { -TEST(Render, Escaping) { - // Check some ASCII punctuation - Paragraph P; - P.appendText("*!`"); - EXPECT_EQ(P.asMarkdown(), "\\*\\!\\`"); +std::string escape(llvm::StringRef Text) { + return Paragraph().appendText(Text.str()).asMarkdown(); +} + +MATCHER_P(escaped, C, "") { + return testing::ExplainMatchResult(::testing::HasSubstr(std::string{'\\', C}), + arg, result_listener); +} +MATCHER(escapedNone, "") { + return testing::ExplainMatchResult(::testing::Not(::testing::HasSubstr("\\")), + arg, result_listener); +} + +TEST(Render, Escaping) { // Check all ASCII punctuation. - P = Paragraph(); std::string Punctuation = R"txt(!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~)txt"; - // Same text, with each character escaped. - std::string EscapedPunctuation; - EscapedPunctuation.reserve(2 * Punctuation.size()); - for (char C : Punctuation) - EscapedPunctuation += std::string("\\") + C; - P.appendText(Punctuation); - EXPECT_EQ(P.asMarkdown(), EscapedPunctuation); + std::string EscapedPunc = R"txt(!"#$%&'()\*+,-./:;<=>?@[\\]^\_\`{|}~)txt"; + EXPECT_EQ(escape(Punctuation), EscapedPunc); + + // Inline code + EXPECT_EQ(escape("`foo`"), R"(\`foo\`)"); + EXPECT_EQ(escape("`foo"), R"(\`foo)"); + EXPECT_EQ(escape("foo`"), R"(foo\`)"); + EXPECT_EQ(escape("``foo``"), R"(\`\`foo\`\`)"); + // Code blocks + EXPECT_EQ(escape("```"), R"(\`\`\`)"); // This could also be inline code! + EXPECT_EQ(escape("~~~"), R"(\~~~)"); + + // Rulers and headings + EXPECT_THAT(escape("## Heading"), escaped('#')); + EXPECT_THAT(escape("Foo # bar"), escapedNone()); + EXPECT_EQ(escape("---"), R"(\---)"); + EXPECT_EQ(escape("-"), R"(\-)"); + EXPECT_EQ(escape("==="), R"(\===)"); + EXPECT_EQ(escape("="), R"(\=)"); + EXPECT_EQ(escape("***"), R"(\*\*\*)"); // \** could start emphasis! + + // HTML tags. + EXPECT_THAT(escape("<pre"), escaped('<')); + EXPECT_THAT(escape("< pre"), escapedNone()); + EXPECT_THAT(escape("if a<b then"), escaped('<')); + EXPECT_THAT(escape("if a<b then c."), escapedNone()); + EXPECT_THAT(escape("if a<b then c='foo'."), escaped('<')); + EXPECT_THAT(escape("std::vector<T>"), escaped('<')); + EXPECT_THAT(escape("std::vector<std::string>"), escaped('<')); + EXPECT_THAT(escape("std::map<int, int>"), escapedNone()); + // Autolinks + EXPECT_THAT(escape("Email <f...@bar.com>"), escapedNone()); + EXPECT_THAT(escape("Website <http://foo.bar>"), escapedNone()); + + // Bullet lists. + EXPECT_THAT(escape("- foo"), escaped('-')); + EXPECT_THAT(escape("* foo"), escaped('*')); + EXPECT_THAT(escape("+ foo"), escaped('+')); + EXPECT_THAT(escape("+"), escaped('+')); + EXPECT_THAT(escape("a + foo"), escapedNone()); + EXPECT_THAT(escape("a+ foo"), escapedNone()); + EXPECT_THAT(escape("1. foo"), escaped('.')); + EXPECT_THAT(escape("a. foo"), escapedNone()); + + // Emphasis. + EXPECT_EQ(escape("*foo*"), R"(\*foo\*)"); + EXPECT_EQ(escape("**foo**"), R"(\*\*foo\*\*)"); + EXPECT_THAT(escape("*foo"), escaped('*')); + EXPECT_THAT(escape("foo *"), escapedNone()); + EXPECT_THAT(escape("foo * bar"), escapedNone()); + EXPECT_THAT(escape("foo_bar"), escapedNone()); + EXPECT_THAT(escape("foo _bar"), escaped('_')); + EXPECT_THAT(escape("foo_ bar"), escaped('_')); + EXPECT_THAT(escape("foo _ bar"), escapedNone()); + + // HTML entities. + EXPECT_THAT(escape("fish &chips;"), escaped('&')); + EXPECT_THAT(escape("fish & chips;"), escapedNone()); + EXPECT_THAT(escape("fish &chips"), escapedNone()); + EXPECT_THAT(escape("foo * bar"), escaped('&')); + EXPECT_THAT(escape("foo ¯ bar"), escaped('&')); + EXPECT_THAT(escape("foo &?; bar"), escapedNone()); + + // Links. + EXPECT_THAT(escape("[foo](bar)"), escaped(']')); + EXPECT_THAT(escape("[foo]: bar"), escaped(']')); + // No need to escape these, as the target never exists. + EXPECT_THAT(escape("[foo][]"), escapedNone()); + EXPECT_THAT(escape("[foo][bar]"), escapedNone()); + EXPECT_THAT(escape("[foo]"), escapedNone()); // In code blocks we don't need to escape ASCII punctuation. - P = Paragraph(); + Paragraph P = Paragraph(); P.appendCode("* foo !+ bar * baz"); EXPECT_EQ(P.asMarkdown(), "`* foo !+ bar * baz`"); Index: clang-tools-extra/clangd/FormattedString.h =================================================================== --- clang-tools-extra/clangd/FormattedString.h +++ clang-tools-extra/clangd/FormattedString.h @@ -95,6 +95,8 @@ BulletList &addBulletList(); /// Doesn't contain any trailing newlines. + /// We try to make the markdown human-readable, e.g. avoid extra escaping. + /// At least one client (coc.nvim) displays the markdown verbatim! std::string asMarkdown() const; /// Doesn't contain any trailing newlines. std::string asPlainText() const; Index: clang-tools-extra/clangd/FormattedString.cpp =================================================================== --- clang-tools-extra/clangd/FormattedString.cpp +++ clang-tools-extra/clangd/FormattedString.cpp @@ -12,6 +12,7 @@ #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringExtras.h" #include "llvm/ADT/StringRef.h" +#include "llvm/Support/Compiler.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/FormatVariadic.h" #include "llvm/Support/raw_ostream.h" @@ -26,23 +27,143 @@ namespace markup { namespace { + +// Is <contents a plausible start to an HTML tag? +// Contents may not be the rest of the line, but it's the rest of the plain +// text, so we expect to see at least the tag name. +bool looksLikeTag(llvm::StringRef Contents) { + if (Contents.empty()) + return false; + if (Contents.front() == '!' || Contents.front() == '?' || + Contents.front() == '/') + return true; + // Check the start of the tag name. + if (!llvm::isAlpha(Contents.front())) + return false; + // Drop rest of the tag name, and following whitespace. + Contents = Contents + .drop_while([](char C) { + return llvm::isAlnum(C) || C == '-' || C == '_' || C == ':'; + }) + .drop_while(isWhitespace); + // The rest of the tag consists of attributes, which have restrictive names. + // If we hit '=', all bets are off (attribute values can contain anything). + for (; !Contents.empty(); Contents = Contents.drop_front()) { + if (llvm::isAlnum(Contents.front()) || isWhitespace(Contents.front())) + continue; + if (Contents.front() == '>' || Contents.startswith("/>")) + return true; // May close the tag. + if (Contents.front() == '=') + return true; // Don't try to parse attribute values. + return false; // Random punctuation means this isn't a tag. + } + return true; // Potentially incomplete tag. +} + +// Tests whether C should be backslash-escaped in markdown. +// The string being escaped is Before + C + After. This is part of a paragraph. +// StartsLine indicates whether `Before` is the start of the line. +// After may not be everything until the end of the line. +// +// It's always safe to escape punctuation, but want minimal escaping. +// The strategy is to escape the first character of anything that might start +// a markdown grammar construct. +bool needsLeadingEscape(char C, llvm::StringRef Before, llvm::StringRef After, + bool StartsLine) { + assert(Before.take_while(isWhitespace).empty()); + auto RulerLength = [&]() -> /*Length*/ unsigned { + if (!StartsLine || !Before.empty()) + return false; + llvm::StringRef A = After.rtrim(); + return llvm::all_of(A, [C](char D) { return C == D; }) ? 1 + A.size() : 0; + }; + auto IsBullet = [&]() { + return StartsLine && Before.empty() && + (After.empty() || After.startswith(" ")); + }; + auto SpaceSurrounds = [&]() { + return (After.empty() || isWhitespace(After.front())) && + (Before.empty() || isWhitespace(Before.back())); + }; + auto WordSurrounds = [&]() { + return (!After.empty() && llvm::isAlnum(After.front())) && + (!Before.empty() && llvm::isAlnum(Before.back())); + }; + + switch (C) { + case '\\': // Escaped character. + return true; + case '`': // Code block or inline code + // Any number of backticks can delimit an inline code block that can end + // anywhere (including on another line). We must escape them all. + return true; + case '~': // Code block + return StartsLine && Before.empty() && After.startswith("~~"); + case '#': { // ATX heading. + if (!StartsLine || !Before.empty()) + return false; + llvm::StringRef Rest = After.ltrim(C); + return Rest.empty() || Rest.startswith(" "); + } + case ']': // Link or link reference. + // We escape ] rather than [ here, because it's more constrained: + // ](...) is an in-line link + // ]: is a link reference + // The following are only links if the link reference exists: + // ] by itself is a shortcut link + // ][...] is an out-of-line link + // Because we never emit link references, we don't need to handle these. + return After.startswith(":") || After.startswith("("); + case '=': // Setex heading. + return RulerLength() > 0; + case '_': // Horizontal ruler or matched delimiter. + if (RulerLength() >= 3) + return true; + // Not a delimiter if surrounded by space, or inside a word. + // (The rules at word boundaries are subtle). + return !(SpaceSurrounds() || WordSurrounds()); + case '-': // Setex heading, horizontal ruler, or bullet. + if (RulerLength() > 0) + return true; + return IsBullet(); + case '+': // Bullet list. + return IsBullet(); + case '*': // Bullet list, horizontal ruler, or delimiter. + return IsBullet() || RulerLength() >= 3 || !SpaceSurrounds(); + case '<': // HTML tag (or autolink, which we choose not to escape) + return looksLikeTag(After); + case '>': // Quote marker. Needs escaping at start of line. + return StartsLine && Before.empty(); + case '&': { // HTML entity reference + auto End = After.find(';'); + if (End == llvm::StringRef::npos) + return false; + llvm::StringRef Content = After.substr(0, End); + if (Content.consume_front("#")) { + if (Content.consume_front("x") || Content.consume_front("X")) + return llvm::all_of(Content, llvm::isHexDigit); + return llvm::all_of(Content, llvm::isDigit); + } + return llvm::all_of(Content, llvm::isAlpha); + } + case '.': // Numbered list indicator. Escape 12. -> 12\. at start of line. + case ')': + return StartsLine && !Before.empty() && + llvm::all_of(Before, llvm::isDigit) && After.startswith(" "); + default: + return false; + } +} + /// Escape a markdown text block. Ensures the punctuation will not introduce /// any of the markdown constructs. -std::string renderText(llvm::StringRef Input) { - // Escaping ASCII punctuation ensures we can't start a markdown construct. - constexpr llvm::StringLiteral Punctuation = - R"txt(!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~)txt"; - +std::string renderText(llvm::StringRef Input, bool StartsLine) { std::string R; - for (size_t From = 0; From < Input.size();) { - size_t Next = Input.find_first_of(Punctuation, From); - R += Input.substr(From, Next - From); - if (Next == llvm::StringRef::npos) - break; - R += "\\"; - R += Input[Next]; - - From = Next + 1; + for (unsigned I = 0; I < Input.size(); ++I) { + if (needsLeadingEscape(Input[I], Input.substr(0, I), Input.substr(I + 1), + StartsLine)) + R.push_back('\\'); + R.push_back(Input[I]); } return R; } @@ -236,7 +357,7 @@ OS << Sep; switch (C.Kind) { case Chunk::PlainText: - OS << renderText(C.Contents); + OS << renderText(C.Contents, Sep.empty()); break; case Chunk::InlineCode: OS << renderInlineBlock(C.Contents);
_______________________________________________ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits