[PATCH devel 4/6] htmlprag: Allow users to parameterize the parent constraints.

Maxim Cournoyer Thu, 04 Mar 2021 19:04:17 -0800

These constraints are used to make the tokenizer more lenient in the
face of incomplete/invalid HTML; unfortunately it causes valid HTML to
be parsed incorrectly.  This change allows users to parameterize the
alist that defines which elements are treated specially via the
%PARENT-CONSTRAINTS parameter, or to disable the pragmatic behavior
completely via the %STRICT-TOKENIZER? parameter or the STRICT?
keyword argument of the HTML->SXML procedure (and its variants).


* src/htmlprag.scm: Update doc.
(%default-parent-constraints): New variable.
(%parent-constraints): New parameter.
(%strict-tokenizer?): Likewise.
(parse-html/tokenizer)[strict?]: New keyword argument.  Adjust to use
the newly added parameters and argument.
(htmlprag-internal:parse-html)[strict?]: New argument.
(test-htmlprag): Add tests.
(html->sxml-0nf, html->sxml-1nf, html->sxml-2nf)[strict?]: New argument.
---
 src/htmlprag.scm | 143 ++++++++++++++++++++++++++++++++++-------------
 1 file changed, 104 insertions(+), 39 deletions(-)

diff --git a/src/htmlprag.scm b/src/htmlprag.scm
index 3bd352b..79a7b2f 100644
--- a/src/htmlprag.scm
+++ b/src/htmlprag.scm
@@ -1,6 +1,7 @@
 ;; (htmlprag) -- pragmatic parsing of real-world HTML
 ;; Copyright (C) 2003-2004 Neil W. Van Dyke <neil at neilvandyke.org>
 ;; Modified 2004 by Andy Wingo to fit in with guile-lib.
+;; Modified 2021 by Maxim Cournoyer to parameterize the parent constraints.
 
 ;; This program is free software: you can redistribute it and/or modify
 ;; it under the terms of the GNU Lesser General Public License as
@@ -39,7 +40,12 @@
 ;;; defeat a strict or validating parser.  HtmlPrag's handling of errors is
 ;;; intended to generally emulate popular Web browsers' interpretation of the
 ;;; structure of erroneous HTML.  We euphemistically term this kind of parse
-;;; ``pragmatic.''
+;;; ``pragmatic.''  To disable the pragmatic behavior and parse HTML more
+;;; rigidly, the @code{%strict-tokenizer?} parameter can be set to
+;;; @code{#true}.  In this mode of operation, one ended HTML tags will not be
+;;; treated specially, for example, and their content will be coalesced.  On
+;;; the other side, valid HTML will parse more accurately.  When working with
+;;; HTML known to be valid, it makes sense to use this mode of operation.
 ;;;
 ;;; HtmlPrag also has some support for [XHTML], although XML namespace
 ;;; qualifiers [XML-Names] are currently accepted but stripped from the
@@ -1076,6 +1082,48 @@
 ;;; input port.  This procedure is used internally, and generally should not be
 ;;; called directly.
 
+;;; The alist below defines constraints about what possible parents a
+;;; tag may have.  This exists to allow parsing malformed HTML, e.g.,
+;;; in the presence of missing closing tags.  The drawback is that
+;;; this restricts where these tags may nested; for example, the
+;;; following HTML fragment
+;;; "<body><blockquote><p>foo</p>\n</blockquote></body>" parses to
+;;; '(*TOP* (body (blockquote) (p "foo") "\n")), which incorrectly
+;;; forces the 'p' tag to be a child of 'body' rather than of
+;;; 'blockquote'.
+(define %default-parent-constraints
+  '((area     . (map))
+    (body     . (html))
+    (caption  . (table))
+    (colgroup . (table))
+    (dd       . (dl))
+    (dt       . (dl))
+    (frame    . (frameset))
+    (head     . (html))
+    (isindex  . (head))
+    (li       . (dir menu ol ul))
+    (meta     . (head))
+    (noframes . (frameset))
+    (option   . (select))
+    (p        . (body td th))
+    (param    . (applet))
+    (tbody    . (table))
+    (td       . (tr))
+    (th       . (tr))
+    (thead    . (table))
+    (title    . (head))
+    (tr       . (table tbody thead))))
+
+;;; The following parameter enables users to parameterize which
+;;; constraints to use when tokenizing HTML.
+(define %parent-constraints (make-parameter %default-parent-constraints))
+
+;;; The following switch is disabled for historical reasons.  When true,
+;;; it disables the use of the above %parent-constraints parameter.
+;;; TODO: Set to #true when bumping the major version, which is a
+;;; better default in modern times, where most HTML is valid.
+(define %strict-tokenizer? (make-parameter #false))
+
 (define parse-html/tokenizer
   ;; TODO: Document the algorithm, then see if rewriting as idiomatic Scheme
   ;;       can make it more clear.
@@ -1084,32 +1132,9 @@
             ;;       acceptable way to parse old HTML that uses the `p' element
             ;;       as a paragraph terminator.
             htmlprag-internal:empty-elements)
-           (parent-constraints
-            ;; TODO: Maybe make this an option.
-            '((area     . (map))
-              (body     . (html))
-              (caption  . (table))
-              (colgroup . (table))
-              (dd       . (dl))
-              (dt       . (dl))
-              (frame    . (frameset))
-              (head     . (html))
-              (isindex  . (head))
-              (li       . (dir menu ol ul))
-              (meta     . (head))
-              (noframes . (frameset))
-              (option   . (select))
-              (p        . (body td th))
-              (param    . (applet))
-              (tbody    . (table))
-              (td       . (tr))
-              (th       . (tr))
-              (thead    . (table))
-              (title    . (head))
-              (tr       . (table tbody thead))))
            (start-tag-name (lambda (tag-token) (car tag-token)))
            (end-tag-name   (lambda (tag-token) (list-ref tag-token 1))))
-    (lambda (tokenizer normalized?)
+    (lambda* (tokenizer normalized? #:key (strict? 'unset))
       ;; Example `begs' value:
       ;;
       ;; ( ((head ...) . ( (title ...)                         ))
@@ -1169,7 +1194,14 @@
                            (add-to-current-beg tok))
                           ((eqv? kind shtml-start-symbol)
                            (let* ((name (start-tag-name tok))
-                                  (cell (assq name parent-constraints)))
+                                  ;; If STRICT? is a boolean, it means the
+                                  ;; user explicitly provided it, in which
+                                  ;; case it takes precedence over the
+                                  ;; %strict-tokenizer? parameter.
+                                  (cell (and (not (if (boolean? strict?)
+                                                      strict?
+                                                      (%strict-tokenizer?)))
+                                             (assq name 
(%parent-constraints)))))
                              (and cell (finish-begs-upto (cdr cell) begs))
                              (add-to-current-beg tok)
                              (or (memq name empty-elements)
@@ -1207,7 +1239,7 @@
 ;; variants, and should not be used directly by programs.  The interface is
 ;; likely to change in future versions of HtmlPrag.
 
-(define (htmlprag-internal:parse-html input normalized? top?)
+(define (htmlprag-internal:parse-html input normalized? top? strict?)
   (let ((parse
          (lambda ()
            (parse-html/tokenizer
@@ -1219,15 +1251,16 @@
                           "invalid input type"
                           input)))
              normalized?)
-            normalized?))))
+            normalized?
+            #:strict? strict?))))
     (if top?
         (cons shtml-top-symbol (parse))
         (parse))))
 
-;;; @defproc  html->sxml-0nf input
-;;; @defprocx html->sxml-1nf input
-;;; @defprocx html->sxml-2nf input
-;;; @defprocx html->sxml     input
+;;; @defproc  html->sxml-0nf input strict?
+;;; @defprocx html->sxml-1nf input strict?
+;;; @defprocx html->sxml-2nf input strict?
+;;; @defprocx html->sxml     input strict?
 ;;;
 ;;; Permissively parse HTML from @var{input}, which is either an input port or
 ;;; a string, and emit an SHTML equivalent or approximation.  To borrow and
@@ -1257,7 +1290,15 @@
 ;;; Note that in the emitted SHTML the text token @code{"still < bold"} is
 ;;; @emph{not} inside the @code{b} element, which represents an unfortunate
 ;;; failure to emulate all the quirks-handling behavior of some popular Web
-;;; browsers.
+;;; browsers.  When correctness is preferred over pragmatism, the
+;;; @code{%strict-tokenizer?} parameter can be set to true.  In the above
+;;; example, the unbound elements would nested, but valid HTML would parse
+;;; correctly, without the parsing quirk mentioned above.  Alternatively, the
+;;; @code{strict?} keyword argument can be set to true, in which case it takes
+;;; precedence over the value of the @code{%strict-tokenizer?} parameter.
+;;; Finally, the %parent-constraints parameter can also be used to customize
+;;; which elements should be treated specially, when operating in the default
+;;; pragmatic mode.
 ;;;
 ;;; The procedures @code{html->sxml-@var{n}nf} for @var{n} 0 through 2
 ;;; correspond to 0th through 2nd normal forms of SXML as specified in [SXML],
@@ -1267,9 +1308,12 @@
 ;;; @code{html->sxml-0nf}, and can be used in scripts and interactively, when
 ;;; terseness is important and any normal form of SXML would suffice.
 
-(define (html->sxml-0nf input) (htmlprag-internal:parse-html input #f #t))
-(define (html->sxml-1nf input) (htmlprag-internal:parse-html input #f #t))
-(define (html->sxml-2nf input) (htmlprag-internal:parse-html input #t #t))
+(define* (html->sxml-0nf input #:key (strict? 'unset))
+  (htmlprag-internal:parse-html input #f #t strict?))
+(define* (html->sxml-1nf input #:key (strict? 'unset))
+  (htmlprag-internal:parse-html input #f #t strict?))
+(define* (html->sxml-2nf input #:key (strict? 'unset))
+  (htmlprag-internal:parse-html input #t #t strict?))
 
 (define html->sxml  html->sxml-0nf)
 (define html->shtml html->sxml-0nf)
@@ -1660,7 +1704,7 @@
 ;;; @defproc test-htmlprag
 ;;;
 ;;; Run the test suite.  A log will be printed to the default output port.
-;;; Returns true iff all tests pass.
+;;; Returns true if all tests pass.
 
 (define (test-htmlprag)
   (letrec ((passed      0)
@@ -1708,10 +1752,10 @@
                                 (display ";;  ")
                                 (write expected)
                                 (newline))))))
-           (t1 (lambda (input expected)
+           (t1 (lambda* (input expected #:key (strict? 'unset))
                  (test html->shtml
                        'html->shtml
-                       (list input)
+                       (list input #:strict? strict?)
                        (cons shtml-top-symbol expected))))
            (t2 (lambda (input expected)
                  (test shtml->html
@@ -2019,6 +2063,20 @@
          "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Strict//EN\""
          " \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd\";>"))
 
+    (parameterize ((%strict-tokenizer? #true))
+      ;; When using the strict tokenizer mode, the 'p' tag is correctly parsed
+      ;; as a child of blockquote.
+      (t1 "<body><blockquote><p>foo</p>\n</blockquote></body>"
+          '((body (blockquote (p "foo") "\n"))))
+      ;; Ensure the strict? argument takes precedence over %strict-tokenizer?.
+      (t1 "<body><blockquote><p>foo</p>\n</blockquote></body>"
+          '((body (blockquote) (p "foo") "\n"))
+          #:strict? #false)
+      ;; In strict tokenizer mode, missing closing tags are not handled
+      ;; specially.
+      (t1 "<body><blockquote><p>foo\n</blockquote></body>"
+          '((body (blockquote (p "foo\n"))))))
+
     ;; TODO: Write more test cases for HTML encoding.
 
     ;; TODO: Document this.
@@ -2213,6 +2271,9 @@ shtml-entity-value
 make-html-tokenizer
 tokenize-html
 shtml-token-kind
+%default-parent-constraints
+%parent-constraints
+%strict-tokenizer?
 parse-html/tokenizer
 html->sxml-0nf
 html->sxml-1nf
@@ -2227,3 +2288,7 @@ test-htmlprag
 )
 ;;; arch-tag: 491d7e61-5690-4b76-bc8f-d70315c10ed5
 ;;; htmlprag.scm ends here
+
+;; Local Variables:
+;; fill-column: 78
+;; End:
-- 
2.30.1

[PATCH devel 4/6] htmlprag: Allow users to parameterize the parent constraints.

Reply via email to