================
@@ -0,0 +1,665 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "Markdown.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Support/Allocator.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/DebugLog.h"
+#include "llvm/Support/StringSaver.h"
+#include <cassert>
+#include <memory>
+#include <string>
+
+#define DEBUG_TYPE "clang-doc"
+
+using namespace llvm;
+
+namespace clang::doc::markdown {
+
+// Copies the elements of Vec into a single contiguous arena allocation and
+// returns an ArrayRef over that copy. An empty Vec produces an empty ArrayRef
+// without requesting any memory from Arena.
+template <typename T>
+static ArrayRef<T> allocateArray(SmallVectorImpl<T> &Vec,
+                                 BumpPtrAllocator &Arena) {
+  const size_t Count = Vec.size();
+  if (Count == 0)
+    return ArrayRef<T>();
+  // Arena memory is raw, so the elements are copy-constructed in place.
+  T *Storage = Arena.Allocate<T>(Count);
+  std::uninitialized_copy(Vec.begin(), Vec.end(), Storage);
+  return ArrayRef<T>(Storage, Count);
+}
+
+// Returns true if Line is a table separator row: it is made up solely of '|',
+// '-', ':', and space characters, and at least one of them is a '-'.
+static bool isSepRow(StringRef Line) {
+  if (!Line.contains('-'))
+    return false;
+  const bool OnlySeparatorChars =
+      Line.find_first_not_of("|-: ") == StringRef::npos;
+  return OnlySeparatorChars;
+}
+
+// Returns true if Line opens with a bullet list marker, i.e. one of '-', '*',
+// or '+' immediately followed by a space.
+static bool isListItem(StringRef Line) {
+  // The marker and its trailing space occupy the first two characters.
+  if (Line.size() < 2 || Line[1] != ' ')
+    return false;
+  switch (Line[0]) {
+  case '-':
+  case '*':
+  case '+':
+    return true;
+  default:
+    return false;
+  }
+}
+
+// Returns true if Line opens with an ordered list marker: at least one digit,
+// then a period, then a space (for example "1. " or "42. ").
+static bool isOrderedListItem(StringRef Line) {
+  // Strip the leading digits; what remains must start with ". ".
+  StringRef AfterDigits = Line.ltrim("0123456789");
+  const bool HasDigits = AfterDigits.size() < Line.size();
+  return HasDigits && AfterDigits.starts_with(". ");
+}
+
+// Returns true if Line is a thematic break: at least three copies of the same
+// marker character ('-', '*', or '_'), optionally interleaved with spaces, and
+// nothing else. The marker is taken from the first character, so Line is
+// expected to be trimmed already.
+static bool isThematicBreak(StringRef Line) {
+  if (Line.empty())
+    return false;
+  const char Marker = Line.front();
+  if (Marker != '-' && Marker != '*' && Marker != '_')
+    return false;
+  // Any character other than the marker or a space disqualifies the line.
+  const char Allowed[] = {Marker, ' ', '\0'};
+  if (Line.find_first_not_of(Allowed) != StringRef::npos)
+    return false;
+  return Line.count(Marker) >= 3;
+}
+
+// Returns true if Line is a block quote line: either it starts with "> ", or
+// it is a bare ">" marking an empty quote line.
+static bool isBlockQuote(StringRef Line) {
+  if (Line.empty() || Line.front() != '>')
+    return false;
+  // The marker must either stand alone or be followed by a space.
+  return Line.size() == 1 || Line[1] == ' ';
+}
+
+// Returns the ATX heading level (1 to 6) when Line starts with one to six '#'
+// characters followed by a space, and 0 otherwise. Seven or more '#'
+// characters, or a run of '#' with no following space, yield 0 so the line
+// falls back to plain text.
+static unsigned atxHeadingLevel(StringRef Line) {
+  // Measure the leading run of '#' characters.
+  size_t NumHashes = 0;
+  while (NumHashes < Line.size() && Line[NumHashes] == '#')
+    ++NumHashes;
+  const bool LevelInRange = NumHashes >= 1 && NumHashes <= 6;
+  const bool SpaceFollows =
+      NumHashes < Line.size() && Line[NumHashes] == ' ';
+  if (!LevelInRange || !SpaceFollows)
+    return 0;
+  return static_cast<unsigned>(NumHashes);
+}
+
+// Forward-only cursor over the lines of a paragraph. The lines are handed back
+// exactly as given (untrimmed); callers trim when they want a normalized view.
+class LineReader {
+public:
+  explicit LineReader(ArrayRef<StringRef> Lines) : Lines(Lines) {}
+
+  // Reports whether all lines have been consumed.
+  bool atEnd() const { return Lines.size() <= Cursor; }
+
+  // Returns the current line without consuming it. Calling this when atEnd()
+  // is a precondition violation.
+  StringRef peek() const {
+    assert(!atEnd() && "peek past end of input");
+    return Lines[Cursor];
+  }
+
+  // Returns the line Offset positions past the cursor without consuming
+  // anything; peek(0) is the current line. A position beyond the last line
+  // yields an empty StringRef.
+  StringRef peek(size_t Offset) const {
+    const size_t Target = Cursor + Offset;
+    if (Target >= Lines.size())
+      return StringRef();
+    return Lines[Target];
+  }
+
+  // Returns the current line and moves the cursor to the next one. Calling
+  // this when atEnd() is a precondition violation.
+  StringRef advance() {
+    assert(!atEnd() && "advance past end of input");
+    StringRef Current = Lines[Cursor];
+    ++Cursor;
+    return Current;
+  }
+
+private:
+  ArrayRef<StringRef> Lines;
+  size_t Cursor = 0;
+};
+
+// Forward cursor over the characters of a string. position() and seek() expose
+// the raw index so the cursor can be combined with index-based helpers.
+class CharReader {
+public:
+  explicit CharReader(StringRef S) : S(S) {}
+
+  // Reports whether the cursor is at or beyond the end of the string.
+  bool atEnd() const { return S.size() <= Cursor; }
+
+  // Returns the current character without consuming it. Calling this when
+  // atEnd() is a precondition violation.
+  char peek() const {
+    assert(!atEnd() && "peek past end of input");
+    return S[Cursor];
+  }
+
+  // Returns the current character and steps past it. Calling this when
+  // atEnd() is a precondition violation.
+  char advance() {
+    assert(!atEnd() && "advance past end of input");
+    const char Current = S[Cursor];
+    ++Cursor;
+    return Current;
+  }
+
+  // Returns the current index into the string.
+  size_t position() const { return Cursor; }
+
+  // Repositions the cursor at the absolute index NewPos, e.g. to skip over a
+  // span that was matched by index. NewPos is not range-checked here.
+  void seek(size_t NewPos) { Cursor = NewPos; }
+
+private:
+  StringRef S;
+  size_t Cursor = 0;
+};
+
+// Returns how many consecutive copies of C appear in S beginning at index
+// Start. The result is 0 when S[Start] is not C or Start is past the end.
+static size_t countRun(StringRef S, size_t Start, char C) {
+  size_t Length = 0;
+  for (size_t I = Start; I < S.size() && S[I] == C; ++I)
+    ++Length;
+  return Length;
+}
+
+// Removes exactly one leading and one trailing space from a code span's
+// content, per CommonMark §6.1. The content is returned unchanged unless it
+// both begins and ends with a space and contains a non-space character.
+static StringRef trimCodeSpan(StringRef Code) {
+  const bool PaddedOnBothSides =
+      Code.size() >= 2 && Code.front() == ' ' && Code.back() == ' ';
+  if (!PaddedOnBothSides)
+    return Code;
+  // Content consisting only of spaces is left as is.
+  if (Code.find_first_not_of(' ') == StringRef::npos)
+    return Code;
+  return Code.substr(1, Code.size() - 2);
+}
+
+// Whitespace test used by the CommonMark flanking rules. The start and end of
+// the string are passed in as '\0' and count as whitespace.
+static bool isFlankWhitespace(char C) {
+  if (C == '\0')
+    return true;
+  return isSpace(C);
+}
+
+// Decides whether a delimiter run may open and/or close emphasis, following
+// the CommonMark §6.2 flanking rules.
+//
+// Before and After are the characters immediately preceding and following the
+// run, with '\0' standing in for the string boundaries. Marker is the run's
+// delimiter character. The results are written to CanOpen and CanClose.
+static void computeFlanking(char Before, char Marker, char After,
+                            bool &CanOpen, bool &CanClose) {
+  const bool SpaceBefore = isFlankWhitespace(Before);
+  const bool SpaceAfter = isFlankWhitespace(After);
+  const bool PunctBefore = isPunct(Before);
+  const bool PunctAfter = isPunct(After);
+
+  const bool LeftFlanking =
+      !SpaceAfter && (!PunctAfter || SpaceBefore || PunctBefore);
+  const bool RightFlanking =
+      !SpaceBefore && (!PunctBefore || SpaceAfter || PunctAfter);
+
+  if (Marker != '_') {
+    CanOpen = LeftFlanking;
+    CanClose = RightFlanking;
+    return;
+  }
+
+  // An underscore run must not open or close emphasis in the middle of a
+  // word, so a run flanking on both sides needs adjacent punctuation.
+  CanOpen = LeftFlanking && (!RightFlanking || PunctBefore);
+  CanClose = RightFlanking && (!LeftFlanking || PunctAfter);
+}
+
+namespace {
+// One piece of inline content while emphasis is being resolved. A piece is
+// either a finished content node (text, code span, or a built emphasis or
+// strong node) or a run of delimiter characters that may still open or close
+// emphasis. Pieces form a doubly linked list through Prev/Next so matched runs
+// can be spliced out without shifting the others.
+struct InlinePiece {
+ // Content node, or null while this piece is a delimiter run.
+ MDNode *Node = nullptr;
+ // '*' or '_' for a delimiter run.
+ char Ch = 0;
+ // Delimiters still available in the run.
+ size_t Len = 0;
+ // Original run length, for the multiple-of-three rule.
+ unsigned OrigLen = 0;
+ // Whether this run may open / close emphasis.
+ // NOTE(review): presumably filled in from computeFlanking - confirm in
+ // parseInline.
+ bool CanOpen = false;
+ bool CanClose = false;
+ // Links to the neighboring pieces in the doubly linked list.
+ // NOTE(review): these look like indices into the piece pool, with -1
+ // meaning "no neighbor" - confirm against parseInline.
+ int Prev = -1;
+ int Next = -1;
+};
+} // namespace
+
+// Parses the inline content of a single line into a sequence of inline nodes:
+// inline code (`code`), emphasis (*text* or _text_), and strong (**text** or
+// __text__). Emphasis is resolved with a CommonMark-style delimiter stack: a
+// first pass tokenizes the line into text, code spans, and delimiter runs
(each
+// tagged with its flanking flags), then a second pass walks closers back to
+// openers, honoring the multiple-of-three rule. Unmatched runs stay as text.
+//
+// TODO: This does not yet handle links, autolinks, or backslash escapes.
+static ArrayRef<MDNode *> parseInline(StringRef S, BumpPtrAllocator &Arena,
+ StringSaver &Saver) {
+ SmallVector<InlinePiece> Pool;
+ int Head = -1, Tail = -1;
+
+ auto makePiece = [&]() -> int {
----------------
ilovepi wrote:
You should prefer to capture only what you actually use if practical.
```suggestion
auto makePiece = [&Pool]() -> int {
```
But why do you need this API? It seems like other lambdas just want to insert
into the vector, and emplace_back() already returns a reference to the
newly inserted item.
https://github.com/llvm/llvm-project/pull/202991
_______________________________________________
cfe-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits