[clang-tools-extra] [llvm] [clang-doc] Add standalone Markdown parsing library (PR #202991)

Paul Kirth via cfe-commits Tue, 16 Jun 2026 13:56:06 -0700

================
@@ -0,0 +1,665 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM 
Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "Markdown.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Support/Allocator.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/DebugLog.h"
+#include "llvm/Support/StringSaver.h"
+#include <cassert>
+#include <memory>
+#include <string>
+
+#define DEBUG_TYPE "clang-doc"
+
+using namespace llvm;
+
+namespace clang::doc::markdown {
+
+// Allocates a contiguous array of T in the arena and returns an ArrayRef.
+template <typename T>
+static ArrayRef<T> allocateArray(SmallVectorImpl<T> &Vec,
+                                 BumpPtrAllocator &Arena) {
+  if (Vec.empty())
+    return {};
+  T *Allocated = Arena.Allocate<T>(Vec.size());
+  std::uninitialized_copy(Vec.begin(), Vec.end(), Allocated);
+  return ArrayRef<T>(Allocated, Vec.size());
+}
+
+// A line is a table separator if it only contains |, -, :, and spaces,
+// and has at least one -.
+static bool isSepRow(StringRef Line) {
+  return Line.contains('-') &&
+         Line.find_first_not_of("|-: ") == StringRef::npos;
+}
+
+// Returns true if Line begins with a bullet list marker (-, *, or +)
+// followed by a space.
+static bool isListItem(StringRef Line) {
+  return Line.starts_with("- ") || Line.starts_with("* ") ||
+         Line.starts_with("+ ");
+}
+
+// Returns true if Line begins with an ordered list marker: one or more digits
+// followed by a period and a space (e.g. "1. ", "42. ").
+static bool isOrderedListItem(StringRef Line) {
+  size_t Dot = Line.find_first_not_of("0123456789");
+  return Dot != StringRef::npos && Dot > 0 && Line[Dot] == '.' &&
+         Dot + 1 < Line.size() && Line[Dot + 1] == ' ';
+}
+
+// Returns true if Line is a thematic break: three or more matching -, *, or _
+// characters, optionally separated by spaces, with nothing else. Line is
+// expected to be trimmed.
+static bool isThematicBreak(StringRef Line) {
+  char Marker = Line.empty() ? '\0' : Line[0];
+  if (Marker != '-' && Marker != '*' && Marker != '_')
+    return false;
+  unsigned Count = 0;
+  for (char C : Line) {
+    if (C == Marker)
+      ++Count;
+    else if (C != ' ')
+      return false;
+  }
+  return Count >= 3;
+}
+
+// Returns true if Line is a block quote line: it starts with "> ", or is a 
bare
+// ">" marking an empty quote line.
+static bool isBlockQuote(StringRef Line) {
+  return Line.starts_with("> ") || Line == ">";
+}
+
+// Returns the ATX heading level (1 to 6) when Line is an ATX heading: one to
+// six leading # characters followed by a space. Returns 0 otherwise, so seven
+// or more # characters fall back to plain text.
+static unsigned atxHeadingLevel(StringRef Line) {
+  size_t Level = Line.find_first_not_of('#');
+  if (Level == StringRef::npos || Level < 1 || Level > 6 || Line[Level] != ' ')
+    return 0;
+  return Level;
+}
+
+// A forward cursor over the lines of a paragraph. Lines are stored untrimmed;
+// callers trim where they need a normalized view.
+class LineReader {
+public:
+  explicit LineReader(ArrayRef<StringRef> Lines) : Lines(Lines) {}
+
+  // True once every line has been consumed.
+  bool atEnd() const { return Pos >= Lines.size(); }
+
+  // The current line, untrimmed. Must not be called when atEnd().
+  StringRef peek() const {
+    assert(!atEnd() && "peek past end of input");
+    return Lines[Pos];
+  }
+
+  // The line Offset positions ahead of the cursor, or an empty StringRef when
+  // that position is past the end. peek(0) is the current line.
+  StringRef peek(size_t Offset) const {
+    size_t Target = Pos + Offset;
+    return Target < Lines.size() ? Lines[Target] : StringRef();
+  }
+
+  // Consume the current line and return it, untrimmed. Must not be called when
+  // atEnd().
+  StringRef advance() {
+    assert(!atEnd() && "advance past end of input");
+    return Lines[Pos++];
+  }
+
+private:
+  ArrayRef<StringRef> Lines;
+  size_t Pos = 0;
+};
+
+// A forward cursor over the characters of a string. position() and seek() let
+// it interoperate with the index-based run and delimiter helpers below.
+class CharReader {
+public:
+  explicit CharReader(StringRef S) : S(S) {}
+
+  // True once every character has been consumed.
+  bool atEnd() const { return Pos >= S.size(); }
+
+  // The current character. Must not be called when atEnd().
+  char peek() const {
+    assert(!atEnd() && "peek past end of input");
+    return S[Pos];
+  }
+
+  // Consume the current character and return it. Must not be called when
+  // atEnd().
+  char advance() {
+    assert(!atEnd() && "advance past end of input");
+    return S[Pos++];
+  }
+
+  // The current scan position, for substring, run, and delimiter computations.
+  size_t position() const { return Pos; }
+
+  // Move the cursor to an absolute position, used to skip past a matched span.
+  void seek(size_t NewPos) { Pos = NewPos; }
+
+private:
+  StringRef S;
+  size_t Pos = 0;
+};
+
+// Returns the number of consecutive copies of C starting at S[Start].
+static size_t countRun(StringRef S, size_t Start, char C) {
+  size_t I = Start;
+  while (I < S.size() && S[I] == C)
+    ++I;
+  return I - Start;
+}
+
+// Strips one leading and one trailing space from a code span's content when
+// both are present and the content is not all spaces, per CommonMark §6.1.
+static StringRef trimCodeSpan(StringRef Code) {
+  if (Code.size() >= 2 && Code.front() == ' ' && Code.back() == ' ' &&
+      Code.find_first_not_of(' ') != StringRef::npos)
+    return Code.drop_front().drop_back();
+  return Code;
+}
+
+// Treats the start and end of the string (passed as '\0') as whitespace for 
the
+// CommonMark flanking rules.
+static bool isFlankWhitespace(char C) { return C == '\0' || isSpace(C); }
+
+// Computes whether a delimiter run can open or close emphasis, from the
+// characters immediately before and after the run, per the CommonMark §6.2
+// flanking rules. Before and After are '\0' at the string boundaries.
+static void computeFlanking(char Before, char Marker, char After, bool 
&CanOpen,
+                            bool &CanClose) {
+  bool AfterWS = isFlankWhitespace(After);
+  bool BeforeWS = isFlankWhitespace(Before);
+  bool AfterPunct = isPunct(After);
+  bool BeforePunct = isPunct(Before);
+  bool LeftFlanking = !AfterWS && (!AfterPunct || BeforeWS || BeforePunct);
+  bool RightFlanking = !BeforeWS && (!BeforePunct || AfterWS || AfterPunct);
+  if (Marker == '_') {
+    // Underscore does not open or close emphasis intraword.
+    CanOpen = LeftFlanking && (!RightFlanking || BeforePunct);
+    CanClose = RightFlanking && (!LeftFlanking || AfterPunct);
+  } else {
+    CanOpen = LeftFlanking;
+    CanClose = RightFlanking;
+  }
+}
+
+namespace {
+// One piece of inline content while emphasis is being resolved. A piece is
+// either a finished content node (text, code span, or a built emphasis or
+// strong node) or a run of delimiter characters that may still open or close
+// emphasis. Pieces form a doubly linked list through Prev/Next so matched runs
+// can be spliced out without shifting the others.
+struct InlinePiece {
+  MDNode *Node = nullptr; // content node, or null while this is a delimiter 
run
+  char Ch = 0;            // '*' or '_' for a delimiter run
+  size_t Len = 0;         // delimiters still available in the run
+  unsigned OrigLen = 0;   // original run length, for the multiple-of-three 
rule
+  bool CanOpen = false;
+  bool CanClose = false;
+  int Prev = -1;
+  int Next = -1;
+};
+} // namespace
+
+// Parses the inline content of a single line into a sequence of inline nodes:
+// inline code (`code`), emphasis (*text* or _text_), and strong (**text** or
+// __text__). Emphasis is resolved with a CommonMark-style delimiter stack: a
+// first pass tokenizes the line into text, code spans, and delimiter runs 
(each
+// tagged with its flanking flags), then a second pass walks closers back to
+// openers, honoring the multiple-of-three rule. Unmatched runs stay as text.
+//
+// TODO: This does not yet handle links, autolinks, or backslash escapes.
+static ArrayRef<MDNode *> parseInline(StringRef S, BumpPtrAllocator &Arena,
+                                      StringSaver &Saver) {
+  SmallVector<InlinePiece> Pool;
+  int Head = -1, Tail = -1;
+
+  auto makePiece = [&]() -> int {
+    Pool.emplace_back();
+    return Pool.size() - 1;
+  };
+  auto linkAtTail = [&](int Idx) {
+    Pool[Idx].Prev = Tail;
+    (Tail != -1 ? Pool[Tail].Next : Head) = Idx;
+    Tail = Idx;
+  };
+  auto appendNode = [&](MDNode *N) {
+    int Idx = makePiece();
+    Pool[Idx].Node = N;
+    linkAtTail(Idx);
+  };
+  // Content nodes pass through; a leftover delimiter run becomes a TextNode of
+  // its remaining characters.
+  auto pieceNode = [&](int P) -> MDNode * {
+    if (Pool[P].Node)
+      return Pool[P].Node;
+    return new (Arena)
+        TextNode(Saver.save(std::string(Pool[P].Len, Pool[P].Ch)));
+  };
+  // Merges adjacent TextNodes so unmatched delimiters coalesce with 
neighboring
+  // text, then copies the result into the arena.
+  auto finalize = [&](SmallVectorImpl<MDNode *> &Nodes) -> ArrayRef<MDNode *> {
+    SmallVector<MDNode *> Merged;
+    for (MDNode *Nd : Nodes) {
+      if (isa<TextNode>(Nd) && !Merged.empty() &&
+          isa<TextNode>(Merged.back())) {
+        StringRef Prev = cast<TextNode>(Merged.back())->Text;
+        StringRef Cur = cast<TextNode>(Nd)->Text;
+        Merged.back() =
+            new (Arena) TextNode(Saver.save(Prev.str() + Cur.str()));
+      } else {
+        Merged.push_back(Nd);
+      }
+    }
+    return allocateArray(Merged, Arena);
+  };
+
+  // Phase 1: tokenize the line into text, code spans, and delimiter runs.
+  CharReader Reader(S);
+  size_t TextStart = 0;
+  auto flushText = [&](size_t End) {
+    if (End > TextStart)
+      appendNode(new (Arena) TextNode(
+          Saver.save(S.substr(TextStart, End - TextStart))));
+  };
+
+  while (!Reader.atEnd()) {
+    size_t Pos = Reader.position();
+    char C = Reader.peek();
+
+    // Inline code span: an opening backtick run closed by a run of the same
+    // length.
+    if (C == '`') {
+      size_t OpenLen = countRun(S, Pos, '`');
+      size_t ClosePos = Pos + OpenLen;
+      while (ClosePos < S.size() && countRun(S, ClosePos, '`') != OpenLen)
+        ClosePos += S[ClosePos] == '`' ? countRun(S, ClosePos, '`') : 1;
+      if (ClosePos < S.size()) {
+        flushText(Pos);
+        StringRef Code =
+            trimCodeSpan(S.substr(Pos + OpenLen, ClosePos - (Pos + OpenLen)));
+        appendNode(new (Arena) InlineCodeNode(Saver.save(Code)));
+        Reader.seek(ClosePos + OpenLen);
+        TextStart = Reader.position();
+        continue;
+      }
+      // No closing run; leave the backticks as literal text.
+      Reader.seek(Pos + OpenLen);
+      continue;
+    }
+
+    // Delimiter run for emphasis or strong.
+    if (C == '*' || C == '_') {
+      size_t RunLen = countRun(S, Pos, C);
+      flushText(Pos);
+      char Before = Pos == 0 ? '\0' : S[Pos - 1];
+      char After = Pos + RunLen < S.size() ? S[Pos + RunLen] : '\0';
+      int Idx = makePiece();
+      InlinePiece &D = Pool[Idx];
+      D.Ch = C;
+      D.Len = RunLen;
+      D.OrigLen = RunLen;
+      computeFlanking(Before, C, After, D.CanOpen, D.CanClose);
+      linkAtTail(Idx);
+      Reader.seek(Pos + RunLen);
+      TextStart = Reader.position();
+      continue;
+    }
+
+    Reader.advance();
+  }
+  flushText(S.size());
+
+  // Phase 2: match closers back to openers. OpenersBottom records, per closer
+  // kind, how far back a failed search needs to look, keyed by delimiter char,
+  // run length mod 3, and whether the closer can also open.
+  int OpenersBottom[12];
+  for (int &B : OpenersBottom)
+    B = -1;
+  auto bucket = [](const InlinePiece &P) {
+    return (P.Ch == '_' ? 6 : 0) + (P.OrigLen % 3) * 2 + (P.CanOpen ? 1 : 0);
+  };
+
+  int Current = Head;
+  while (Current != -1) {
----------------
ilovepi wrote:


This loop is ... well its pretty crazy. I think this needs more clear 
documentation about what its tryign to do  and how it does works (e.g. the goal 
and algorithm). I also think its quite hard to reason about give that there are 
so many lambdas, and arbitrary bits of arithmetic that interact in non-obvious 
ways.

To be frank: I'll never approve this as written. Its convoluted, poorly 
documented, and feels completely arbitrary in its implementation. Please 
refactor this in to smaller, better documented pieces that reviewers can 
actually reason about. 

Further,  I'm not really a fan of this whole approach. Where do these buckets 
suddenly come from? how are they actually used? what's happening with all this 
modulus math? why does any of that make sense. There's no indication that any 
of this is remotely correct other than some new tests, and I cant say I have 
much confidence in my ability to reason through every choice here.

So lets take another pass at this. Ideally starting from a very simple base 
with only the minimal amount of testing for the basics, and subsequent patches 
that add individual pieces over all the functionality under the sun.

Additionally, please put yourself in our shoes: if I handed you this code, 
which you'd never seen before, could you easily explain how it all works 
together?  how the edge cases get worked out? why the modulus operations are 
correct? Would you be able to reconstruct the algorithm easily in pseudo-code 
and explain it to a collegue?

https://github.com/llvm/llvm-project/pull/202991
_______________________________________________
cfe-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

[clang-tools-extra] [llvm] [clang-doc] Add standalone Markdown parsing library (PR #202991)

Reply via email to