[clang-tools-extra] [llvm] [clang-doc] Add standalone Markdown parsing library (PR #202991)

Paul Kirth via cfe-commits Tue, 16 Jun 2026 13:55:45 -0700

================
@@ -0,0 +1,665 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM 
Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "Markdown.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Support/Allocator.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/DebugLog.h"
+#include "llvm/Support/StringSaver.h"
+#include <cassert>
+#include <memory>
+#include <string>
+
+#define DEBUG_TYPE "clang-doc"
+
+using namespace llvm;
+
+namespace clang::doc::markdown {
+
+// Allocates a contiguous array of T in the arena and returns an ArrayRef.
+template <typename T>
+static ArrayRef<T> allocateArray(SmallVectorImpl<T> &Vec,
+                                 BumpPtrAllocator &Arena) {
+  if (Vec.empty())
+    return {};
+  T *Allocated = Arena.Allocate<T>(Vec.size());
+  std::uninitialized_copy(Vec.begin(), Vec.end(), Allocated);
+  return ArrayRef<T>(Allocated, Vec.size());
+}
+
+// A line is a table separator if it only contains |, -, :, and spaces,
+// and has at least one -.
+static bool isSepRow(StringRef Line) {
+  return Line.contains('-') &&
+         Line.find_first_not_of("|-: ") == StringRef::npos;
+}
+
+// Returns true if Line begins with a bullet list marker (-, *, or +)
+// followed by a space.
+static bool isListItem(StringRef Line) {
+  return Line.starts_with("- ") || Line.starts_with("* ") ||
+         Line.starts_with("+ ");
+}
+
+// Returns true if Line begins with an ordered list marker: one or more digits
+// followed by a period and a space (e.g. "1. ", "42. ").
+static bool isOrderedListItem(StringRef Line) {
+  size_t Dot = Line.find_first_not_of("0123456789");
+  return Dot != StringRef::npos && Dot > 0 && Line[Dot] == '.' &&
+         Dot + 1 < Line.size() && Line[Dot + 1] == ' ';
+}
+
+// Returns true if Line is a thematic break: three or more matching -, *, or _
+// characters, optionally separated by spaces, with nothing else. Line is
+// expected to be trimmed.
+static bool isThematicBreak(StringRef Line) {
+  char Marker = Line.empty() ? '\0' : Line[0];
+  if (Marker != '-' && Marker != '*' && Marker != '_')
+    return false;
+  unsigned Count = 0;
+  for (char C : Line) {
+    if (C == Marker)
+      ++Count;
+    else if (C != ' ')
+      return false;
+  }
+  return Count >= 3;
+}
+
+// Returns true if Line is a block quote line: it starts with "> ", or is a 
bare
+// ">" marking an empty quote line.
+static bool isBlockQuote(StringRef Line) {
+  return Line.starts_with("> ") || Line == ">";
+}
+
+// Returns the ATX heading level (1 to 6) when Line is an ATX heading: one to
+// six leading # characters followed by a space. Returns 0 otherwise, so seven
+// or more # characters fall back to plain text.
+static unsigned atxHeadingLevel(StringRef Line) {
+  size_t Level = Line.find_first_not_of('#');
+  if (Level == StringRef::npos || Level < 1 || Level > 6 || Line[Level] != ' ')
+    return 0;
+  return Level;
+}
+
+// A forward cursor over the lines of a paragraph. Lines are stored untrimmed;
+// callers trim where they need a normalized view.
+class LineReader {
+public:
+  explicit LineReader(ArrayRef<StringRef> Lines) : Lines(Lines) {}
+
+  // True once every line has been consumed.
+  bool atEnd() const { return Pos >= Lines.size(); }
+
+  // The current line, untrimmed. Must not be called when atEnd().
+  StringRef peek() const {
+    assert(!atEnd() && "peek past end of input");
+    return Lines[Pos];
+  }
+
+  // The line Offset positions ahead of the cursor, or an empty StringRef when
+  // that position is past the end. peek(0) is the current line.
+  StringRef peek(size_t Offset) const {
+    size_t Target = Pos + Offset;
+    return Target < Lines.size() ? Lines[Target] : StringRef();
+  }
+
+  // Consume the current line and return it, untrimmed. Must not be called when
+  // atEnd().
+  StringRef advance() {
+    assert(!atEnd() && "advance past end of input");
+    return Lines[Pos++];
+  }
+
+private:
+  ArrayRef<StringRef> Lines;
+  size_t Pos = 0;
+};
+
+// A forward cursor over the characters of a string. position() and seek() let
+// it interoperate with the index-based run and delimiter helpers below.
+class CharReader {
+public:
+  explicit CharReader(StringRef S) : S(S) {}
+
+  // True once every character has been consumed.
+  bool atEnd() const { return Pos >= S.size(); }
+
+  // The current character. Must not be called when atEnd().
+  char peek() const {
+    assert(!atEnd() && "peek past end of input");
+    return S[Pos];
+  }
+
+  // Consume the current character and return it. Must not be called when
+  // atEnd().
+  char advance() {
+    assert(!atEnd() && "advance past end of input");
+    return S[Pos++];
+  }
+
+  // The current scan position, for substring, run, and delimiter computations.
+  size_t position() const { return Pos; }
+
+  // Move the cursor to an absolute position, used to skip past a matched span.
+  void seek(size_t NewPos) { Pos = NewPos; }
+
+private:
+  StringRef S;
+  size_t Pos = 0;
+};
+
+// Returns the number of consecutive copies of C starting at S[Start].
+static size_t countRun(StringRef S, size_t Start, char C) {
+  size_t I = Start;
+  while (I < S.size() && S[I] == C)
+    ++I;
+  return I - Start;
+}
+
+// Strips one leading and one trailing space from a code span's content when
+// both are present and the content is not all spaces, per CommonMark §6.1.
+static StringRef trimCodeSpan(StringRef Code) {
+  if (Code.size() >= 2 && Code.front() == ' ' && Code.back() == ' ' &&
+      Code.find_first_not_of(' ') != StringRef::npos)
+    return Code.drop_front().drop_back();
+  return Code;
+}
+
+// Treats the start and end of the string (passed as '\0') as whitespace for 
the
+// CommonMark flanking rules.
+static bool isFlankWhitespace(char C) { return C == '\0' || isSpace(C); }
+
+// Computes whether a delimiter run can open or close emphasis, from the
+// characters immediately before and after the run, per the CommonMark §6.2
+// flanking rules. Before and After are '\0' at the string boundaries.
+static void computeFlanking(char Before, char Marker, char After, bool 
&CanOpen,
+                            bool &CanClose) {
+  bool AfterWS = isFlankWhitespace(After);
+  bool BeforeWS = isFlankWhitespace(Before);
+  bool AfterPunct = isPunct(After);
+  bool BeforePunct = isPunct(Before);
+  bool LeftFlanking = !AfterWS && (!AfterPunct || BeforeWS || BeforePunct);
+  bool RightFlanking = !BeforeWS && (!BeforePunct || AfterWS || AfterPunct);
+  if (Marker == '_') {
+    // Underscore does not open or close emphasis intraword.
+    CanOpen = LeftFlanking && (!RightFlanking || BeforePunct);
+    CanClose = RightFlanking && (!LeftFlanking || AfterPunct);
+  } else {
+    CanOpen = LeftFlanking;
+    CanClose = RightFlanking;
+  }
+}
+
+namespace {
+// One piece of inline content while emphasis is being resolved. A piece is
+// either a finished content node (text, code span, or a built emphasis or
+// strong node) or a run of delimiter characters that may still open or close
+// emphasis. Pieces form a doubly linked list through Prev/Next so matched runs
+// can be spliced out without shifting the others.
+struct InlinePiece {
+  MDNode *Node = nullptr; // content node, or null while this is a delimiter 
run
+  char Ch = 0;            // '*' or '_' for a delimiter run
+  size_t Len = 0;         // delimiters still available in the run
+  unsigned OrigLen = 0;   // original run length, for the multiple-of-three 
rule
+  bool CanOpen = false;
+  bool CanClose = false;
+  int Prev = -1;
+  int Next = -1;
+};
----------------
ilovepi wrote:


I  think we need some more clarity on the delineation between the MDNodes and 
this struct, because to me this feels almost like part of what I'd typically 
refer to as an AST or IR.  Can you elaborate, either here or int he file level 
documentation how these fit together? Is this a convenience struct, i.e., some 
data grouped to basically be passed as parameters? Something akin to a context 
object? or is it representing something more fundamental in the MD?

Also, I'd consider running the tidy checks for padding bytes and ordering, 
since I think you've ordered the fields in a fairly sub-optimal way.

```console
clang-tidy -p=~/llvm-fork/build *.cpp 
--checks=-*,'clang-analyzer-optin.performance.Padding' --extra-arg='-Xclang' 
--extra-arg=-analyzer-config --extra-arg=-Xclang 
--extra-arg='optin.performance.Padding:AllowedPad=0'
```

https://github.com/llvm/llvm-project/pull/202991
_______________________________________________
cfe-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

[clang-tools-extra] [llvm] [clang-doc] Add standalone Markdown parsing library (PR #202991)

Reply via email to