https://github.com/python/cpython/commit/d8f6297e6d678f635d78cc59776ab7b246bfb03c commit: d8f6297e6d678f635d78cc59776ab7b246bfb03c branch: 3.11 author: Miss Islington (bot) <[email protected]> committer: ambv <[email protected]> date: 2025-07-12T14:24:39+02:00 summary:
[3.11] gh-102555: Fix comment parsing in HTMLParser according to the HTML5 standard (GH-135664) (GH-136274) * "--!>" now ends the comment. * "-- >" no longer ends the comment. * Support abnormally ended empty comments "<!-->" and "<!--->". --------- (cherry picked from commit 8ac7613dc8b8f82253d7c0e2b6ef6ed703a0a1ee) Co-author: Kerim Kabirov <[email protected]> Co-authored-by: Serhiy Storchaka <[email protected]> Co-authored-by: Ezio Melotti <[email protected]> files: A Misc/NEWS.d/next/Security/2025-06-18-13-28-08.gh-issue-102555.nADrzJ.rst M Lib/html/parser.py M Lib/test/test_htmlparser.py diff --git a/Lib/html/parser.py b/Lib/html/parser.py index 9c38008bbfd06b..5105b95c50a1e0 100644 --- a/Lib/html/parser.py +++ b/Lib/html/parser.py @@ -27,7 +27,8 @@ starttagopen = re.compile('<[a-zA-Z]') endtagopen = re.compile('</[a-zA-Z]') piclose = re.compile('>') -commentclose = re.compile(r'--\s*>') +commentclose = re.compile(r'--!?>') +commentabruptclose = re.compile(r'-?>') # Note: # 1) if you change tagfind/attrfind remember to update locatestarttagend too; # 2) if you change tagfind/attrfind and/or locatestarttagend the parser will @@ -290,6 +291,21 @@ def parse_html_declaration(self, i): else: return self.parse_bogus_comment(i) + # Internal -- parse comment, return length or -1 if not terminated + # see https://html.spec.whatwg.org/multipage/parsing.html#comment-start-state + def parse_comment(self, i, report=True): + rawdata = self.rawdata + assert rawdata.startswith('<!--', i), 'unexpected call to parse_comment()' + match = commentclose.search(rawdata, i+4) + if not match: + match = commentabruptclose.match(rawdata, i+4) + if not match: + return -1 + if report: + j = match.start() + self.handle_comment(rawdata[i+4: j]) + return match.end() + # Internal -- parse bogus comment, return length or -1 if not terminated # see http://www.w3.org/TR/html5/tokenization.html#bogus-comment-state def parse_bogus_comment(self, i, report=1): diff --git a/Lib/test/test_htmlparser.py 
b/Lib/test/test_htmlparser.py index df775c11310146..d784ee945e0f1d 100644 --- a/Lib/test/test_htmlparser.py +++ b/Lib/test/test_htmlparser.py @@ -321,17 +321,45 @@ def test_comments(self): html = ("<!-- I'm a valid comment -->" '<!--me too!-->' '<!------>' + '<!----->' '<!---->' + # abrupt-closing-of-empty-comment + '<!--->' + '<!-->' '<!----I have many hyphens---->' '<!-- I have a > in the middle -->' - '<!-- and I have -- in the middle! -->') + '<!-- and I have -- in the middle! -->' + '<!--incorrectly-closed-comment--!>' + '<!----!>' + '<!----!-->' + '<!---- >-->' + '<!---!>-->' + '<!--!>-->' + # nested-comment + '<!-- <!-- nested --> -->' + '<!--<!-->' + '<!--<!--!>' + ) expected = [('comment', " I'm a valid comment "), ('comment', 'me too!'), ('comment', '--'), + ('comment', '-'), + ('comment', ''), + ('comment', ''), ('comment', ''), ('comment', '--I have many hyphens--'), ('comment', ' I have a > in the middle '), - ('comment', ' and I have -- in the middle! ')] + ('comment', ' and I have -- in the middle! '), + ('comment', 'incorrectly-closed-comment'), + ('comment', ''), + ('comment', '--!'), + ('comment', '-- >'), + ('comment', '-!>'), + ('comment', '!>'), + ('comment', ' <!-- nested '), ('data', ' -->'), + ('comment', '<!'), + ('comment', '<!'), + ] self._run_check(html, expected) def test_condcoms(self): diff --git a/Misc/NEWS.d/next/Security/2025-06-18-13-28-08.gh-issue-102555.nADrzJ.rst b/Misc/NEWS.d/next/Security/2025-06-18-13-28-08.gh-issue-102555.nADrzJ.rst new file mode 100644 index 00000000000000..71d15ee0852ebd --- /dev/null +++ b/Misc/NEWS.d/next/Security/2025-06-18-13-28-08.gh-issue-102555.nADrzJ.rst @@ -0,0 +1,3 @@ +Fix comment parsing in :class:`html.parser.HTMLParser` according to the +HTML5 standard. ``--!>`` now ends the comment. ``-- >`` no longer ends the +comment. Support abnormally ended empty comments ``<!-->`` and ``<!--->``. 
_______________________________________________ Python-checkins mailing list -- [email protected] To unsubscribe send an email to [email protected] https://mail.python.org/mailman3//lists/python-checkins.python.org Member address: [email protected]
