[clang-tools-extra] [llvm] [clang-doc] Add standalone Markdown parsing library (PR #202991)

Paul Kirth via cfe-commits Thu, 18 Jun 2026 09:55:51 -0700

================
@@ -0,0 +1,665 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "Markdown.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Support/Allocator.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/DebugLog.h"
+#include "llvm/Support/StringSaver.h"
+#include <cassert>
+#include <memory>
+#include <string>
+
+#define DEBUG_TYPE "clang-doc"
+
+using namespace llvm;
+
+namespace clang::doc::markdown {
+
+// Copies the elements of Vec into one contiguous arena allocation and returns
+// a view of the copy. An empty Vec yields an empty ArrayRef without touching
+// the arena.
+template <typename T>
+static ArrayRef<T> allocateArray(SmallVectorImpl<T> &Vec,
+                                 BumpPtrAllocator &Arena) {
+  const size_t Count = Vec.size();
+  if (Count == 0)
+    return {};
+  T *Storage = Arena.Allocate<T>(Count);
+  std::uninitialized_copy(Vec.begin(), Vec.end(), Storage);
+  return ArrayRef<T>(Storage, Count);
+}
+
+// Recognizes a table separator row: the line may consist only of '|', '-',
+// ':' and space characters, and must contain at least one '-'.
+static bool isSepRow(StringRef Line) {
+  const bool HasDash = Line.find('-') != StringRef::npos;
+  const bool OnlyRowChars = Line.find_first_not_of("|-: ") == StringRef::npos;
+  return HasDash && OnlyRowChars;
+}
+
+// Recognizes a bullet list item: the first character is one of '-', '*' or
+// '+', and the second character is a space.
+static bool isListItem(StringRef Line) {
+  if (Line.size() < 2 || Line[1] != ' ')
+    return false;
+  const char Marker = Line[0];
+  return Marker == '-' || Marker == '*' || Marker == '+';
+}
+
+// Recognizes an ordered list item: at least one leading digit, immediately
+// followed by a period and then a space (e.g. "1. ", "42. ").
+static bool isOrderedListItem(StringRef Line) {
+  const size_t DigitCount = Line.find_first_not_of("0123456789");
+  // npos means the line is empty or all digits; 0 means no leading digit.
+  if (DigitCount == StringRef::npos || DigitCount == 0)
+    return false;
+  return Line.substr(DigitCount).starts_with(". ");
+}
+
+// Recognizes a thematic break: the line is made up of three or more copies of
+// a single marker character ('-', '*' or '_') with only spaces allowed between
+// them. The caller is expected to pass an already trimmed line.
+static bool isThematicBreak(StringRef Line) {
+  if (Line.empty())
+    return false;
+  const char Marker = Line.front();
+  if (Marker != '-' && Marker != '*' && Marker != '_')
+    return false;
+  size_t Markers = 0;
+  for (size_t I = 0, E = Line.size(); I != E; ++I) {
+    if (Line[I] == Marker) {
+      ++Markers;
+      continue;
+    }
+    // Anything other than the marker or a space disqualifies the line.
+    if (Line[I] != ' ')
+      return false;
+  }
+  return Markers >= 3;
+}
+
+// Returns true if Line is a block quote line: it either starts with "> " or
+// is a bare ">" marking an empty quote line.
+static bool isBlockQuote(StringRef Line) {
+  return Line.starts_with("> ") || Line == ">";
+}
+
+// Computes the ATX heading level of Line. A heading is one to six leading '#'
+// characters followed by a space; the count of '#' is the level. Any other
+// line, including one with seven or more '#', yields 0 and is treated as plain
+// text by the caller.
+static unsigned atxHeadingLevel(StringRef Line) {
+  const size_t Hashes = Line.find_first_not_of('#');
+  // npos: the line is empty or consists solely of '#', so no space follows.
+  if (Hashes == StringRef::npos)
+    return 0;
+  const bool ValidCount = Hashes >= 1 && Hashes <= 6;
+  if (!ValidCount || Line[Hashes] != ' ')
+    return 0;
+  return Hashes;
+}
+
+// Forward-only cursor over a sequence of lines. The lines are handed out
+// exactly as stored (untrimmed); trimming is left to the caller.
+class LineReader {
+public:
+  explicit LineReader(ArrayRef<StringRef> Lines) : Lines(Lines) {}
+
+  // Whether the cursor has moved past the last line.
+  bool atEnd() const { return Lines.size() <= Pos; }
+
+  // Returns the line under the cursor without consuming it. Asserts that the
+  // cursor is not at the end.
+  StringRef peek() const {
+    assert(!atEnd() && "peek past end of input");
+    return Lines[Pos];
+  }
+
+  // Looks Offset lines ahead of the cursor (0 is the current line). Returns an
+  // empty StringRef when that position lies beyond the last line.
+  StringRef peek(size_t Offset) const {
+    const size_t Target = Pos + Offset;
+    if (Target >= Lines.size())
+      return StringRef();
+    return Lines[Target];
+  }
+
+  // Returns the line under the cursor and steps to the next one. Asserts that
+  // the cursor is not at the end.
+  StringRef advance() {
+    assert(!atEnd() && "advance past end of input");
+    const StringRef Current = Lines[Pos];
+    ++Pos;
+    return Current;
+  }
+
+private:
+  ArrayRef<StringRef> Lines;
+  size_t Pos = 0;
+};
+
+// Cursor over the characters of a string. Besides peek/advance it exposes the
+// raw index through position() and seek(), so it can be combined with helpers
+// that work on absolute offsets into the same string.
+class CharReader {
+public:
+  explicit CharReader(StringRef S) : S(S) {}
+
+  // Whether the cursor has moved past the last character.
+  bool atEnd() const { return S.size() <= Pos; }
+
+  // Returns the character under the cursor without consuming it. Asserts that
+  // the cursor is not at the end.
+  char peek() const {
+    assert(!atEnd() && "peek past end of input");
+    return S[Pos];
+  }
+
+  // Returns the character under the cursor and steps past it. Asserts that the
+  // cursor is not at the end.
+  char advance() {
+    assert(!atEnd() && "advance past end of input");
+    const char Current = S[Pos];
+    ++Pos;
+    return Current;
+  }
+
+  // Index of the character under the cursor.
+  size_t position() const { return Pos; }
+
+  // Repositions the cursor at the absolute index NewPos. No bounds check is
+  // performed here.
+  void seek(size_t NewPos) { Pos = NewPos; }
+
+private:
+  StringRef S;
+  size_t Pos = 0;
+};
+
+// Counts how many times C repeats back to back in S beginning at index Start.
+// Yields 0 when S[Start] is not C or Start is at or past the end of S.
+static size_t countRun(StringRef S, size_t Start, char C) {
+  size_t RunLen = 0;
+  for (size_t I = Start; I < S.size() && S[I] == C; ++I)
+    ++RunLen;
+  return RunLen;
+}
+
+// Normalizes the content of a code span per CommonMark §6.1: when the content
+// both starts and ends with a space and is not made up entirely of spaces, one
+// space is removed from each end. Otherwise the content is returned unchanged.
+static StringRef trimCodeSpan(StringRef Code) {
+  if (Code.size() < 2)
+    return Code;
+  const bool Padded = Code.front() == ' ' && Code.back() == ' ';
+  const bool AllSpaces = Code.find_first_not_of(' ') == StringRef::npos;
+  if (!Padded || AllSpaces)
+    return Code;
+  return Code.drop_front().drop_back();
+}
+
+// Whitespace test used by the CommonMark flanking rules. The start and end of
+// the string are passed in as '\0' and count as whitespace, in addition to any
+// character isSpace() accepts.
+static bool isFlankWhitespace(char C) { return C == '\0' || isSpace(C); }
+
+// Computes whether a delimiter run can open or close emphasis, per the
+// CommonMark §6.2 flanking rules.
+//
+// Before and After are the characters immediately before and after the run;
+// pass '\0' for either one at a string boundary, where it is treated as
+// whitespace. Marker is the run's delimiter character: '_' gets the stricter
+// intraword treatment, every other marker uses plain left/right flanking. The
+// results are written to CanOpen and CanClose.
+static void computeFlanking(char Before, char Marker, char After, bool &CanOpen,
+                            bool &CanClose) {
+  bool AfterWS = isFlankWhitespace(After);
+  bool BeforeWS = isFlankWhitespace(Before);
+  bool AfterPunct = isPunct(After);
+  bool BeforePunct = isPunct(Before);
+  // Left-flanking: not followed by whitespace, and if followed by punctuation
+  // then preceded by whitespace or punctuation. Right-flanking mirrors this.
+  bool LeftFlanking = !AfterWS && (!AfterPunct || BeforeWS || BeforePunct);
+  bool RightFlanking = !BeforeWS && (!BeforePunct || AfterWS || AfterPunct);
+  if (Marker == '_') {
+    // Underscore does not open or close emphasis intraword.
+    CanOpen = LeftFlanking && (!RightFlanking || BeforePunct);
+    CanClose = RightFlanking && (!LeftFlanking || AfterPunct);
+  } else {
+    CanOpen = LeftFlanking;
+    CanClose = RightFlanking;
+  }
+}
+
+namespace {
+// One piece of inline content while emphasis is being resolved. A piece is
+// either a finished content node (text, code span, or a built emphasis or
+// strong node) or a run of delimiter characters that may still open or close
+// emphasis. Pieces form a doubly linked list through Prev/Next so matched runs
+// can be spliced out without shifting the others.
+struct InlinePiece {
+  // Content node, or null while this piece is a delimiter run.
+  MDNode *Node = nullptr;
+  // '*' or '_' for a delimiter run.
+  char Ch = 0;
+  // Delimiters still available in the run.
+  size_t Len = 0;
+  // Original run length, for the multiple-of-three rule.
+  unsigned OrigLen = 0;
+  // Flanking flags: whether the run may open / close emphasis.
+  bool CanOpen = false;
+  bool CanClose = false;
+  // Links to the neighboring pieces; -1 means there is no neighbor.
+  int Prev = -1;
+  int Next = -1;
+};
+} // namespace
+
+// Parses the inline content of a single line into a sequence of inline nodes:
+// inline code (`code`), emphasis (*text* or _text_), and strong (**text** or
+// __text__). Emphasis is resolved with a CommonMark-style delimiter stack: a
+// first pass tokenizes the line into text, code spans, and delimiter runs
+// (each tagged with its flanking flags), then a second pass walks closers back
+// to openers, honoring the multiple-of-three rule. Unmatched runs stay as
+// text.
+//
+// S is the line to parse. Nodes and the returned array are allocated in Arena;
+// node text is copied through Saver.
+//
+// TODO: This does not yet handle links, autolinks, or backslash escapes.
+static ArrayRef<MDNode *> parseInline(StringRef S, BumpPtrAllocator &Arena,
+                                      StringSaver &Saver) {
+  // Pieces live in Pool and are linked by index; Head/Tail are the ends of the
+  // list, -1 while it is empty.
+  SmallVector<InlinePiece> Pool;
+  int Head = -1, Tail = -1;
+
+  auto makePiece = [&]() -> int {
+    Pool.emplace_back();
+    return Pool.size() - 1;
+  };
+  auto linkAtTail = [&](int Idx) {
+    Pool[Idx].Prev = Tail;
+    (Tail != -1 ? Pool[Tail].Next : Head) = Idx;
+    Tail = Idx;
+  };
+  auto appendNode = [&](MDNode *N) {
+    int Idx = makePiece();
+    Pool[Idx].Node = N;
+    linkAtTail(Idx);
+  };
+  // Content nodes pass through; a leftover delimiter run becomes a TextNode of
+  // its remaining characters.
+  auto pieceNode = [&](int P) -> MDNode * {
+    if (Pool[P].Node)
+      return Pool[P].Node;
+    return new (Arena)
+        TextNode(Saver.save(std::string(Pool[P].Len, Pool[P].Ch)));
+  };
+  // Merges adjacent TextNodes so unmatched delimiters coalesce with
+  // neighboring text, then copies the result into the arena.
+  auto finalize = [&](SmallVectorImpl<MDNode *> &Nodes) -> ArrayRef<MDNode *> {
+    SmallVector<MDNode *> Merged;
+    for (MDNode *Nd : Nodes) {
+      if (isa<TextNode>(Nd) && !Merged.empty() &&
+          isa<TextNode>(Merged.back())) {
+        StringRef Prev = cast<TextNode>(Merged.back())->Text;
+        StringRef Cur = cast<TextNode>(Nd)->Text;
+        Merged.back() =
+            new (Arena) TextNode(Saver.save(Prev.str() + Cur.str()));
+      } else {
+        Merged.push_back(Nd);
+      }
+    }
+    return allocateArray(Merged, Arena);
+  };
+
+  // Phase 1: tokenize the line into text, code spans, and delimiter runs.
+  CharReader Reader(S);
+  // Start of the pending plain-text span that has not been emitted yet.
+  size_t TextStart = 0;
+  auto flushText = [&](size_t End) {
+    if (End > TextStart)
+      appendNode(new (Arena) TextNode(
+          Saver.save(S.substr(TextStart, End - TextStart))));
+  };
+
+  while (!Reader.atEnd()) {
+    size_t Pos = Reader.position();
+    char C = Reader.peek();
+
+    // Inline code span: an opening backtick run closed by a run of the same
+    // length.
+    if (C == '`') {
+      size_t OpenLen = countRun(S, Pos, '`');
+      size_t ClosePos = Pos + OpenLen;
+      // Skip over whole backtick runs of the wrong length, and over single
+      // non-backtick characters, until a run of exactly OpenLen is found.
+      while (ClosePos < S.size() && countRun(S, ClosePos, '`') != OpenLen)
+        ClosePos += S[ClosePos] == '`' ? countRun(S, ClosePos, '`') : 1;
+      if (ClosePos < S.size()) {
+        flushText(Pos);
+        StringRef Code =
+            trimCodeSpan(S.substr(Pos + OpenLen, ClosePos - (Pos + OpenLen)));
+        appendNode(new (Arena) InlineCodeNode(Saver.save(Code)));
+        Reader.seek(ClosePos + OpenLen);
+        TextStart = Reader.position();
+        continue;
+      }
+      // No closing run; leave the backticks as literal text.
+      Reader.seek(Pos + OpenLen);
+      continue;
+    }
+
+    // Delimiter run for emphasis or strong.
+    if (C == '*' || C == '_') {
+      size_t RunLen = countRun(S, Pos, C);
+      flushText(Pos);
+      char Before = Pos == 0 ? '\0' : S[Pos - 1];
+      char After = Pos + RunLen < S.size() ? S[Pos + RunLen] : '\0';
+      int Idx = makePiece();
+      InlinePiece &D = Pool[Idx];
+      D.Ch = C;
+      D.Len = RunLen;
+      D.OrigLen = RunLen;
+      computeFlanking(Before, C, After, D.CanOpen, D.CanClose);
+      linkAtTail(Idx);
+      Reader.seek(Pos + RunLen);
+      TextStart = Reader.position();
+      continue;
+    }
+
+    Reader.advance();
+  }
+  flushText(S.size());
+
+  // Phase 2: match closers back to openers. OpenersBottom records, per closer
+  // kind, how far back a failed search needs to look, keyed by delimiter char,
+  // run length mod 3, and whether the closer can also open.
+  int OpenersBottom[12];
+  for (int &B : OpenersBottom)
+    B = -1;
+  auto bucket = [](const InlinePiece &P) {
+    return (P.Ch == '_' ? 6 : 0) + (P.OrigLen % 3) * 2 + (P.CanOpen ? 1 : 0);
+  };
+
+  int Current = Head;
+  while (Current != -1) {
+    // Advance to the next run that can close.
+    while (Current != -1 &&
+           !(Pool[Current].Ch && Pool[Current].CanClose && Pool[Current].Len))
+      Current = Pool[Current].Next;
+    if (Current == -1)
+      break;
+    int Closer = Current;
+    int Key = bucket(Pool[Closer]);
+
+    // Search back for the nearest matching opener.
+    int Opener = Pool[Closer].Prev;
+    bool Found = false;
+    while (Opener != -1 && Opener != OpenersBottom[Key]) {
+      InlinePiece &O = Pool[Opener];
+      if (O.Ch == Pool[Closer].Ch && O.Len && O.CanOpen) {
+        // Multiple-of-three rule: when either run can both open and close, the
+        // pair is rejected if the original lengths sum to a multiple of three
+        // unless both lengths are themselves multiples of three.
+        unsigned Sum = O.OrigLen + Pool[Closer].OrigLen;
+        bool OddMatch = (O.CanClose || Pool[Closer].CanOpen) && Sum % 3 == 0 &&
+                        !(O.OrigLen % 3 == 0 && Pool[Closer].OrigLen % 3 == 0);
+        if (!OddMatch) {
+          Found = true;
+          break;
+        }
+      }
+      Opener = Pool[Opener].Prev;
+    }
+
+    if (!Found) {
+      OpenersBottom[Key] = Pool[Closer].Prev;
+      // A run that cannot also open will never match anything; keep its text
+      // but stop treating it as a delimiter.
+      if (!Pool[Closer].CanOpen)
+        Pool[Closer].CanClose = false;
+      Current = Pool[Closer].Next;
+      continue;
+    }
+
+    // Wrap the pieces between opener and closer, consuming one delimiter from
+    // each side for emphasis or two for strong.
+    unsigned Use = Pool[Opener].Len >= 2 && Pool[Closer].Len >= 2 ? 2 : 1;
+    SmallVector<MDNode *> Inner;
+    for (int P = Pool[Opener].Next; P != Closer; P = Pool[P].Next)
+      Inner.push_back(pieceNode(P));
+    Pool[Opener].Len -= Use;
+    Pool[Closer].Len -= Use;
+    MDNode *Emph =
+        Use == 2
+            ? static_cast<MDNode *>(new (Arena) StrongNode(finalize(Inner)))
+            : static_cast<MDNode *>(new (Arena) EmphasisNode(finalize(Inner)));
+    // Link the new node directly between opener and closer; the wrapped pieces
+    // are thereby dropped from the list.
+    int EP = makePiece();
+    Pool[EP].Node = Emph;
+    Pool[EP].Prev = Opener;
+    Pool[EP].Next = Closer;
+    Pool[Opener].Next = EP;
+    Pool[Closer].Prev = EP;
+
+    // Drop the opener or closer once its run is fully consumed.
+    if (Pool[Opener].Len == 0) {
+      int Pr = Pool[Opener].Prev;
+      Pool[EP].Prev = Pr;
+      (Pr != -1 ? Pool[Pr].Next : Head) = EP;
+    }
+    if (Pool[Closer].Len == 0) {
+      int Nx = Pool[Closer].Next;
+      Pool[EP].Next = Nx;
+      (Nx != -1 ? Pool[Nx].Prev : Tail) = EP;
+      Current = Nx;
+    } else {
+      // The closer still has delimiters left; try to match it again.
+      Current = Closer;
+    }
+  }
+
+  // Phase 3: collect the surviving pieces, dropping fully consumed delimiters.
+  SmallVector<MDNode *> Result;
+  for (int P = Head; P != -1; P = Pool[P].Next)
+    if (Pool[P].Node || Pool[P].Len)
+      Result.push_back(pieceNode(P));
+  return finalize(Result);
+}
+
+// Parses a fenced code block opened with ``` or ~~~. The cursor must be on the
+// opening fence; the fence, body lines, and closing fence are consumed.
+//
+// TODO: Follow CommonMark spec §4.5 more closely. Opening fences may be
+// indented up to 3 spaces, the closing fence must use the same character and
+// be at least as long as the opening fence, and the closing fence may only be
+// followed by spaces.
+static FencedCodeNode *parseFencedCode(LineReader &Reader,
+                                       BumpPtrAllocator &Arena,
+                                       StringSaver &Saver) {
+  StringRef Open = Reader.peek().trim();
+  char Fence = Open[0];
+  StringRef Lang = Saver.save(Open.drop_front(3).trim());
+  Reader.advance(); // consume opening fence
+  SmallVector<StringRef> CodeLines;
+  while (!Reader.atEnd()) {
+    StringRef CodeLine = Reader.peek().trim();
+    if (CodeLine.size() >= 3 &&
+        llvm::all_of(CodeLine.take_front(3),
+                     [Fence](char C) { return C == Fence; }))
+      break;
+    CodeLines.push_back(Saver.save(Reader.advance()));
+  }
+  if (!Reader.atEnd())
+    Reader.advance(); // consume closing fence
----------------
ilovepi wrote:


```suggestion
  // consume closing fence
  if (!Reader.atEnd())
    Reader.advance();
```

https://github.com/llvm/llvm-project/pull/202991
_______________________________________________
cfe-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

[clang-tools-extra] [llvm] [clang-doc] Add standalone Markdown parsing library (PR #202991)

Reply via email to