This is an automated email from the ASF dual-hosted git repository.

kwin pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/maven-doxia.git


The following commit(s) were added to refs/heads/master by this push:
     new fa5358fc Use JSoup to convert HTML to XHTML after parsing with Flexmark
fa5358fc is described below

commit fa5358fc3142e53df70376175e55516f40d91938
Author: Konrad Windszus <[email protected]>
AuthorDate: Fri Sep 12 11:43:44 2025 +0200

    Use JSoup to convert HTML to XHTML after parsing with Flexmark
    
    That way valid XHTML is passed to the XHTML parser.
    This is crucial as markdown may contain elements which are not properly
    closed (for example inline HTML5 with unclosed <p> tags).
    
    This closes #999
---
 doxia-modules/doxia-module-markdown/pom.xml              |  7 ++++++-
 .../maven/doxia/module/markdown/MarkdownParser.java      | 16 ++++++++++++----
 .../maven/doxia/module/markdown/MarkdownParserTest.java  |  9 ++++++++-
 .../doxia-module-markdown/src/test/resources/html.md     |  4 ++++
 4 files changed, 30 insertions(+), 6 deletions(-)

diff --git a/doxia-modules/doxia-module-markdown/pom.xml 
b/doxia-modules/doxia-module-markdown/pom.xml
index f0a4e209..aff1ea86 100644
--- a/doxia-modules/doxia-module-markdown/pom.xml
+++ b/doxia-modules/doxia-module-markdown/pom.xml
@@ -140,7 +140,12 @@ under the License.
       <artifactId>flexmark-ext-yaml-front-matter</artifactId>
       <version>${flexmarkVersion}</version>
     </dependency>
-
+    <dependency>
+      <!-- for converting html to xhtml -->
+      <groupId>org.jsoup</groupId>
+      <artifactId>jsoup</artifactId>
+      <version>1.21.2</version>
+    </dependency>
     <dependency>
       <groupId>org.slf4j</groupId>
       <artifactId>slf4j-api</artifactId>
diff --git 
a/doxia-modules/doxia-module-markdown/src/main/java/org/apache/maven/doxia/module/markdown/MarkdownParser.java
 
b/doxia-modules/doxia-module-markdown/src/main/java/org/apache/maven/doxia/module/markdown/MarkdownParser.java
index 175ee615..b9654da7 100644
--- 
a/doxia-modules/doxia-module-markdown/src/main/java/org/apache/maven/doxia/module/markdown/MarkdownParser.java
+++ 
b/doxia-modules/doxia-module-markdown/src/main/java/org/apache/maven/doxia/module/markdown/MarkdownParser.java
@@ -59,6 +59,8 @@ import org.apache.maven.doxia.parser.ParseException;
 import org.apache.maven.doxia.sink.Sink;
 import org.apache.maven.doxia.util.HtmlTools;
 import org.codehaus.plexus.util.xml.pull.XmlPullParser;
+import org.jsoup.Jsoup;
+import org.jsoup.nodes.Document;
 
 /**
  * <p>
@@ -172,14 +174,14 @@ public class MarkdownParser extends AbstractTextParser 
implements TextMarkup {
     public void parse(Reader source, Sink sink, String reference) throws 
ParseException {
         try {
             // Markdown to HTML (using flexmark-java library)
-            String html = toHtml(source);
+            String xhtml = toXhtml(source);
 
             // TODO: add locator for the markdown source (not the intermediate 
HTML format)
             // this requires writing a custom renderer not leveraging the 
XHTML parser
 
             // then HTML to Sink API
             parser.setEmitComments(isEmitComments());
-            parser.parse(html, getWrappedSink(sink), "Intermediate HTML from " 
+ reference);
+            parser.parse(xhtml, getWrappedSink(sink), "Intermediate HTML from 
" + reference);
         } catch (IOException e) {
             throw new ParseException("Failed reading Markdown source 
document", e);
         }
@@ -272,7 +274,7 @@ public class MarkdownParser extends AbstractTextParser 
implements TextMarkup {
      * @return HTML content generated by flexmark-java
      * @throws IOException passed through
      */
-    String toHtml(Reader source) throws IOException {
+    String toXhtml(Reader source) throws IOException {
         // Read the source
         StringBuilder markdownText = new 
StringBuilder(IOUtils.toString(source));
 
@@ -314,7 +316,13 @@ public class MarkdownParser extends AbstractTextParser 
implements TextMarkup {
         html.append("</body>");
         html.append("</html>");
 
-        return html.toString();
+        return toXhtml(html.toString());
+    }
+
+    private String toXhtml(String html) {
+        final Document document = Jsoup.parse(html);
+        
document.outputSettings().syntax(Document.OutputSettings.Syntax.xml).prettyPrint(false);
+        return document.html();
     }
 
     /**
diff --git 
a/doxia-modules/doxia-module-markdown/src/test/java/org/apache/maven/doxia/module/markdown/MarkdownParserTest.java
 
b/doxia-modules/doxia-module-markdown/src/test/java/org/apache/maven/doxia/module/markdown/MarkdownParserTest.java
index 8b9f2466..f3743ce1 100644
--- 
a/doxia-modules/doxia-module-markdown/src/test/java/org/apache/maven/doxia/module/markdown/MarkdownParserTest.java
+++ 
b/doxia-modules/doxia-module-markdown/src/test/java/org/apache/maven/doxia/module/markdown/MarkdownParserTest.java
@@ -628,6 +628,7 @@ public class MarkdownParserTest extends AbstractParserTest {
                 "table",
                 "tableRows",
                 "text",
+                "unknown", // tbody start
                 "tableRow",
                 "tableHeaderCell",
                 "text",
@@ -640,6 +641,7 @@ public class MarkdownParserTest extends AbstractParserTest {
                 "tableCell_",
                 "tableRow_",
                 "text",
+                "unknown", // tbody end
                 "tableRows_",
                 "table_",
                 "text",
@@ -669,7 +671,7 @@ public class MarkdownParserTest extends AbstractParserTest {
 
     protected String parseFileToHtml(String file) throws ParseException, 
IOException {
         try (Reader reader = getTestReader(file)) {
-            return parser.toHtml(reader).toString();
+            return parser.toXhtml(reader).toString();
         }
     }
 
@@ -860,6 +862,11 @@ public class MarkdownParserTest extends AbstractParserTest 
{
         assertSinkDoesNotContain(eventList.iterator(), "comment", "comment_");
     }
 
+    @Test
+    public void testHtmlInMarkdown() throws ParseException, IOException {
+        parseFileToEventTestingSink("html");
+    }
+
     protected static void assertComment(ListIterator<SinkEventElement> it, 
String comment) {
         assertSinkEquals(it.next(), "comment", comment);
         // every comment ends with a line break in the emitted html which 
leads to an additional text event containing a
diff --git a/doxia-modules/doxia-module-markdown/src/test/resources/html.md 
b/doxia-modules/doxia-module-markdown/src/test/resources/html.md
new file mode 100644
index 00000000..c9222d42
--- /dev/null
+++ b/doxia-modules/doxia-module-markdown/src/test/resources/html.md
@@ -0,0 +1,4 @@
+
+# HTML in Markdown
+
+This is a <p> test.
\ No newline at end of file

Reply via email to