These constraints are used to make the tokenizer more lenient in the face of incomplete/invalid HTML; unfortunately it causes valid HTML to be parsed incorrectly. This change allows users to parameterize the alist that defines which elements are treated specially via the %PARENT-CONSTRAINTS parameter, or to disable the pragmatic behavior completely via the %STRICT-TOKENIZER? parameter or the STRICT? keyword argument of the HTML->SXML procedure (and its variants).
* src/htmlprag.scm: Update doc. (%default-parent-constraints): New variable. (%parent-constraints): New parameter. (%strict-tokenizer?): Likewise. (parse-html/tokenizer)[strict?]: New keyword argument. Adjust to use the newly added parameters and argument. (htmlprag-internal:parse-html)[strict?]: New argument. (test-htmlprag): Add tests. (html->sxml-0nf, html->sxml-1nf, html->sxml-2nf)[strict?]: New argument. --- src/htmlprag.scm | 143 ++++++++++++++++++++++++++++++++++------------- 1 file changed, 104 insertions(+), 39 deletions(-) diff --git a/src/htmlprag.scm b/src/htmlprag.scm index 3bd352b..79a7b2f 100644 --- a/src/htmlprag.scm +++ b/src/htmlprag.scm @@ -1,6 +1,7 @@ ;; (htmlprag) -- pragmatic parsing of real-world HTML ;; Copyright (C) 2003-2004 Neil W. Van Dyke <neil at neilvandyke.org> ;; Modified 2004 by Andy Wingo to fit in with guile-lib. +;; Modified 2021 by Maxim Cournoyer to parameterize the parent constraints. ;; This program is free software: you can redistribute it and/or modify ;; it under the terms of the GNU Lesser General Public License as @@ -39,7 +40,12 @@ ;;; defeat a strict or validating parser. HtmlPrag's handling of errors is ;;; intended to generally emulate popular Web browsers' interpretation of the ;;; structure of erroneous HTML. We euphemistically term this kind of parse -;;; ``pragmatic.'' +;;; ``pragmatic.'' To disable the pragmatic behavior and parse HTML more +;;; rigidly, the @code{%strict-tokenizer?} parameter can be set to +;;; @code{#true}. In this mode of operation, one ended HTML tags will not be +;;; treated specially, for example, and their content will be coalesced. On +;;; the other side, valid HTML will parse more accurately. When working with +;;; HTML known to be valid, it makes sense to use this mode of operation. ;;; ;;; HtmlPrag also has some support for [XHTML], although XML namespace ;;; qualifiers [XML-Names] are currently accepted but stripped from the @@ -1076,6 +1082,48 @@ ;;; input port. This procedure is used internally, and generally should not be ;;; called directly. +;;; The alist below defines constraints about what possible parents a +;;; tag may have. This exists to allow parsing malformed HTML, e.g., +;;; in the presence of missing closing tags. The drawback is that +;;; this restricts where these tags may nested; for example, the +;;; following HTML fragment +;;; "<body><blockquote><p>foo</p>\n</blockquote></body>" parses to +;;; '(*TOP* (body (blockquote) (p "foo") "\n")), which incorrectly +;;; forces the 'p' tag to be a child of 'body' rather than of +;;; 'blockquote'. +(define %default-parent-constraints + '((area . (map)) + (body . (html)) + (caption . (table)) + (colgroup . (table)) + (dd . (dl)) + (dt . (dl)) + (frame . (frameset)) + (head . (html)) + (isindex . (head)) + (li . (dir menu ol ul)) + (meta . (head)) + (noframes . (frameset)) + (option . (select)) + (p . (body td th)) + (param . (applet)) + (tbody . (table)) + (td . (tr)) + (th . (tr)) + (thead . (table)) + (title . (head)) + (tr . (table tbody thead)))) + +;;; The following parameter enables users to parameterize which +;;; constraints to use when tokenizing HTML. +(define %parent-constraints (make-parameter %default-parent-constraints)) + +;;; The following switch is disabled for historical reasons. When true, +;;; it disables the use of the above %parent-constraints parameter. +;;; TODO: Set to #true when bumping the major version, which is a +;;; better default in modern times, where most HTML is valid. +(define %strict-tokenizer? (make-parameter #false)) + (define parse-html/tokenizer ;; TODO: Document the algorithm, then see if rewriting as idiomatic Scheme ;; can make it more clear. @@ -1084,32 +1132,9 @@ ;; acceptable way to parse old HTML that uses the `p' element ;; as a paragraph terminator. htmlprag-internal:empty-elements) - (parent-constraints - ;; TODO: Maybe make this an option. - '((area . (map)) - (body . (html)) - (caption . (table)) - (colgroup . (table)) - (dd . (dl)) - (dt . (dl)) - (frame . (frameset)) - (head . (html)) - (isindex . (head)) - (li . (dir menu ol ul)) - (meta . (head)) - (noframes . (frameset)) - (option . (select)) - (p . (body td th)) - (param . (applet)) - (tbody . (table)) - (td . (tr)) - (th . (tr)) - (thead . (table)) - (title . (head)) - (tr . (table tbody thead)))) (start-tag-name (lambda (tag-token) (car tag-token))) (end-tag-name (lambda (tag-token) (list-ref tag-token 1)))) - (lambda (tokenizer normalized?) + (lambda* (tokenizer normalized? #:key (strict? 'unset)) ;; Example `begs' value: ;; ;; ( ((head ...) . ( (title ...) )) @@ -1169,7 +1194,14 @@ (add-to-current-beg tok)) ((eqv? kind shtml-start-symbol) (let* ((name (start-tag-name tok)) - (cell (assq name parent-constraints))) + ;; If STRICT? is a boolean, it means the + ;; user explicitly provided it, in which + ;; case it takes precedence over the + ;; %strict-tokenizer? parameter. + (cell (and (not (if (boolean? strict?) + strict? + (%strict-tokenizer?))) + (assq name (%parent-constraints))))) (and cell (finish-begs-upto (cdr cell) begs)) (add-to-current-beg tok) (or (memq name empty-elements) @@ -1207,7 +1239,7 @@ ;; variants, and should not be used directly by programs. The interface is ;; likely to change in future versions of HtmlPrag. -(define (htmlprag-internal:parse-html input normalized? top?) +(define (htmlprag-internal:parse-html input normalized? top? strict?) (let ((parse (lambda () (parse-html/tokenizer @@ -1219,15 +1251,16 @@ "invalid input type" input))) normalized?) - normalized?)))) + normalized? + #:strict? strict?)))) (if top? (cons shtml-top-symbol (parse)) (parse)))) -;;; @defproc html->sxml-0nf input -;;; @defprocx html->sxml-1nf input -;;; @defprocx html->sxml-2nf input -;;; @defprocx html->sxml input +;;; @defproc html->sxml-0nf input strict? +;;; @defprocx html->sxml-1nf input strict? +;;; @defprocx html->sxml-2nf input strict? +;;; @defprocx html->sxml input strict? ;;; ;;; Permissively parse HTML from @var{input}, which is either an input port or ;;; a string, and emit an SHTML equivalent or approximation. To borrow and @@ -1257,7 +1290,15 @@ ;;; Note that in the emitted SHTML the text token @code{"still < bold"} is ;;; @emph{not} inside the @code{b} element, which represents an unfortunate ;;; failure to emulate all the quirks-handling behavior of some popular Web -;;; browsers. +;;; browsers. When correctness is preferred over pragmatism, the +;;; @code{%strict-tokenizer?} parameter can be set to true. In the above +;;; example, the unbound elements would nested, but valid HTML would parse +;;; correctly, without the parsing quirk mentioned above. Alternatively, the +;;; @code{strict?} keyword argument can be set to true, in which case it takes +;;; precedence over the value of the @code{%strict-tokenizer?} parameter. +;;; Finally, the %parent-constraints parameter can also be used to customize +;;; which elements should be treated specially, when operating in the default +;;; pragmatic mode. ;;; ;;; The procedures @code{html->sxml-@var{n}nf} for @var{n} 0 through 2 ;;; correspond to 0th through 2nd normal forms of SXML as specified in [SXML], @@ -1267,9 +1308,12 @@ ;;; @code{html->sxml-0nf}, and can be used in scripts and interactively, when ;;; terseness is important and any normal form of SXML would suffice. -(define (html->sxml-0nf input) (htmlprag-internal:parse-html input #f #t)) -(define (html->sxml-1nf input) (htmlprag-internal:parse-html input #f #t)) -(define (html->sxml-2nf input) (htmlprag-internal:parse-html input #t #t)) +(define* (html->sxml-0nf input #:key (strict? 'unset)) + (htmlprag-internal:parse-html input #f #t strict?)) +(define* (html->sxml-1nf input #:key (strict? 'unset)) + (htmlprag-internal:parse-html input #f #t strict?)) +(define* (html->sxml-2nf input #:key (strict? 'unset)) + (htmlprag-internal:parse-html input #t #t strict?)) (define html->sxml html->sxml-0nf) (define html->shtml html->sxml-0nf) @@ -1660,7 +1704,7 @@ ;;; @defproc test-htmlprag ;;; ;;; Run the test suite. A log will be printed to the default output port. -;;; Returns true iff all tests pass. +;;; Returns true if all tests pass. (define (test-htmlprag) (letrec ((passed 0) @@ -1708,10 +1752,10 @@ (display ";; ") (write expected) (newline)))))) - (t1 (lambda (input expected) + (t1 (lambda* (input expected #:key (strict? 'unset)) (test html->shtml 'html->shtml - (list input) + (list input #:strict? strict?) (cons shtml-top-symbol expected)))) (t2 (lambda (input expected) (test shtml->html @@ -2019,6 +2063,20 @@ "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Strict//EN\"" " \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd\">")) + (parameterize ((%strict-tokenizer? #true)) + ;; When using the strict tokenizer mode, the 'p' tag is correctly parsed + ;; as a child of blockquote. + (t1 "<body><blockquote><p>foo</p>\n</blockquote></body>" + '((body (blockquote (p "foo") "\n")))) + ;; Ensure the strict? argument takes precedence over %strict-tokenizer?. + (t1 "<body><blockquote><p>foo</p>\n</blockquote></body>" + '((body (blockquote) (p "foo") "\n")) + #:strict? #false) + ;; In strict tokenizer mode, missing closing tags are not handled + ;; specially. + (t1 "<body><blockquote><p>foo\n</blockquote></body>" + '((body (blockquote (p "foo\n")))))) + ;; TODO: Write more test cases for HTML encoding. ;; TODO: Document this. @@ -2213,6 +2271,9 @@ shtml-entity-value make-html-tokenizer tokenize-html shtml-token-kind +%default-parent-constraints +%parent-constraints +%strict-tokenizer? parse-html/tokenizer html->sxml-0nf html->sxml-1nf @@ -2227,3 +2288,7 @@ test-htmlprag ) ;;; arch-tag: 491d7e61-5690-4b76-bc8f-d70315c10ed5 ;;; htmlprag.scm ends here + +;; Local Variables: +;; fill-column: 78 +;; End: -- 2.30.1