https://github.com/python/cpython/commit/7317e0bbb733e0ec91e6e8ef6065855a99cc8aad commit: 7317e0bbb733e0ec91e6e8ef6065855a99cc8aad branch: 3.10 author: Miss Islington (bot) <[email protected]> committer: ambv <[email protected]> date: 2025-10-07T14:12:23+02:00 summary:
[3.10] gh-135661: Fix CDATA section parsing in HTMLParser (GH-135665) (GH-137774) (GH-139660) "] ]>" and "]] >" no longer end the CDATA section. Make CDATA section parsing context dependent. Add private method HTMLParser._set_support_cdata() to change the context. If called with True, "<![CDATA[" starts a CDATA section which ends with "]]>". If called with False, "<![CDATA[" starts a bogus comment which ends with ">". (cherry picked from commit 0cbbfc462119b9107b373c24d2bda5a1271bed36) (cherry picked from commit dcf24768c918c41821cda6fe6a1aa20ce26545dd) Co-authored-by: Serhiy Storchaka <[email protected]> files: A Misc/NEWS.d/next/Security/2025-06-18-13-34-55.gh-issue-135661.NZlpWf.rst M Lib/html/parser.py M Lib/test/test_htmlparser.py diff --git a/Lib/html/parser.py b/Lib/html/parser.py index c609ee99b4d6a9..8724c22f8ff289 100644 --- a/Lib/html/parser.py +++ b/Lib/html/parser.py @@ -127,6 +127,7 @@ def reset(self): self.lasttag = '???' self.interesting = interesting_normal self.cdata_elem = None + self._support_cdata = True self._escapable = True _markupbase.ParserBase.reset(self) @@ -164,6 +165,19 @@ def clear_cdata_mode(self): self.cdata_elem = None self._escapable = True + def _set_support_cdata(self, flag=True): + """Enable or disable support of the CDATA sections. + If enabled, "<[CDATA[" starts a CDATA section which ends with "]]>". + If disabled, "<[CDATA[" starts a bogus comments which ends with ">". + + This method is not called by default. Its purpose is to be called + in custom handle_starttag() and handle_endtag() methods, with + value that depends on the adjusted current node. + See https://html.spec.whatwg.org/multipage/parsing.html#markup-declaration-open-state + for details. + """ + self._support_cdata = flag + # Internal -- handle data as far as reasonable. May leave state # and data to be processed by a subsequent call. If 'end' is # true, force handling all data as if followed by EOF marker. 
@@ -238,7 +252,7 @@ def goahead(self, end): j -= len(suffix) break self.handle_comment(rawdata[i+4:j]) - elif startswith("<![CDATA[", i): + elif startswith("<![CDATA[", i) and self._support_cdata: self.unknown_decl(rawdata[i+3:]) elif rawdata[i:i+9].lower() == '<!doctype': self.handle_decl(rawdata[i+2:]) @@ -314,8 +328,12 @@ def parse_html_declaration(self, i): if rawdata[i:i+4] == '<!--': # this case is actually already handled in goahead() return self.parse_comment(i) - elif rawdata[i:i+3] == '<![': - return self.parse_marked_section(i) + elif rawdata[i:i+9] == '<![CDATA[' and self._support_cdata: + j = rawdata.find(']]>', i+9) + if j < 0: + return -1 + self.unknown_decl(rawdata[i+3: j]) + return j + 3 elif rawdata[i:i+9].lower() == '<!doctype': # find the closing > gtpos = rawdata.find('>', i+9) @@ -323,6 +341,15 @@ def parse_html_declaration(self, i): return -1 self.handle_decl(rawdata[i+2:gtpos]) return gtpos+1 + elif rawdata[i:i+3] == '<![': + j = rawdata.find('>', i+3) + if j < 0: + return -1 + if rawdata[j-1] == ']': + self.unknown_decl(rawdata[i+3: j-1]) + else: + self.handle_comment(rawdata[i+2: j]) + return j + 1 else: return self.parse_bogus_comment(i) diff --git a/Lib/test/test_htmlparser.py b/Lib/test/test_htmlparser.py index 4a67420ae14fe1..a7be7a6e20224a 100644 --- a/Lib/test/test_htmlparser.py +++ b/Lib/test/test_htmlparser.py @@ -9,10 +9,13 @@ class EventCollector(html.parser.HTMLParser): - def __init__(self, *args, **kw): + def __init__(self, *args, autocdata=False, **kw): + self.autocdata = autocdata self.events = [] self.append = self.events.append html.parser.HTMLParser.__init__(self, *args, **kw) + if autocdata: + self._set_support_cdata(False) def get_events(self): # Normalize the list of events so that buffer artefacts don't @@ -33,12 +36,16 @@ def get_events(self): def handle_starttag(self, tag, attrs): self.append(("starttag", tag, attrs)) + if self.autocdata and tag == 'svg': + self._set_support_cdata(True) def handle_startendtag(self, 
tag, attrs): self.append(("startendtag", tag, attrs)) def handle_endtag(self, tag): self.append(("endtag", tag)) + if self.autocdata and tag == 'svg': + self._set_support_cdata(False) # all other markup @@ -739,10 +746,6 @@ def test_eof_in_declarations(self): ('<!', [('comment', '')]), ('<!-', [('comment', '-')]), ('<![', [('comment', '[')]), - ('<![CDATA[', [('unknown decl', 'CDATA[')]), - ('<![CDATA[x', [('unknown decl', 'CDATA[x')]), - ('<![CDATA[x]', [('unknown decl', 'CDATA[x]')]), - ('<![CDATA[x]]', [('unknown decl', 'CDATA[x]]')]), ('<!DOCTYPE', [('decl', 'DOCTYPE')]), ('<!DOCTYPE ', [('decl', 'DOCTYPE ')]), ('<!DOCTYPE html', [('decl', 'DOCTYPE html')]), @@ -755,6 +758,18 @@ def test_eof_in_declarations(self): for html, expected in data: self._run_check(html, expected) + @support.subTests('content', ['', 'x', 'x]', 'x]]']) + def test_eof_in_cdata(self, content): + self._run_check('<![CDATA[' + content, + [('unknown decl', 'CDATA[' + content)]) + self._run_check('<![CDATA[' + content, + [('comment', '[CDATA[' + content)], + collector=EventCollector(autocdata=True)) + self._run_check('<svg><text y="100"><![CDATA[' + content, + [('starttag', 'svg', []), + ('starttag', 'text', [('y', '100')]), + ('unknown decl', 'CDATA[' + content)]) + def test_bogus_comments(self): html = ('<!ELEMENT br EMPTY>' '<! 
not really a comment >' @@ -804,8 +819,57 @@ def test_broken_condcoms(self): ('startendtag', 'img', [('src', 'mammoth.bmp')]), ('unknown decl', 'endif') ] + self._run_check(html, expected) + @support.subTests('content', [ + 'just some plain text', + '<!-- not a comment -->', + '¬-an-entity-ref;', + "<not a='start tag'>", + '', + '[[I have many brackets]]', + 'I have a > in the middle', + 'I have a ]] in the middle', + '] ]>', + ']] >', + ('\n' + ' if (a < b && a > b) {\n' + ' printf("[<marquee>How?</marquee>]");\n' + ' }\n'), + ]) + def test_cdata_section_content(self, content): + # See "13.2.5.42 Markup declaration open state", + # "13.2.5.69 CDATA section state", and issue bpo-32876. + html = f'<svg><text y="100"><![CDATA[{content}]]></text></svg>' + expected = [ + ('starttag', 'svg', []), + ('starttag', 'text', [('y', '100')]), + ('unknown decl', 'CDATA[' + content), + ('endtag', 'text'), + ('endtag', 'svg'), + ] + self._run_check(html, expected) + self._run_check(html, expected, collector=EventCollector(autocdata=True)) + + def test_cdata_section(self): + # See "13.2.5.42 Markup declaration open state". 
+ html = ('<![CDATA[foo<br>bar]]>' + '<svg><text y="100"><![CDATA[foo<br>bar]]></text></svg>' + '<![CDATA[foo<br>bar]]>') + expected = [ + ('comment', '[CDATA[foo<br'), + ('data', 'bar]]>'), + ('starttag', 'svg', []), + ('starttag', 'text', [('y', '100')]), + ('unknown decl', 'CDATA[foo<br>bar'), + ('endtag', 'text'), + ('endtag', 'svg'), + ('comment', '[CDATA[foo<br'), + ('data', 'bar]]>'), + ] + self._run_check(html, expected, collector=EventCollector(autocdata=True)) + def test_convert_charrefs_dropped_text(self): # #23144: make sure that all the events are triggered when # convert_charrefs is True, even if we don't call .close() diff --git a/Misc/NEWS.d/next/Security/2025-06-18-13-34-55.gh-issue-135661.NZlpWf.rst b/Misc/NEWS.d/next/Security/2025-06-18-13-34-55.gh-issue-135661.NZlpWf.rst new file mode 100644 index 00000000000000..fe000d936aae9d --- /dev/null +++ b/Misc/NEWS.d/next/Security/2025-06-18-13-34-55.gh-issue-135661.NZlpWf.rst @@ -0,0 +1,5 @@ +Fix CDATA section parsing in :class:`html.parser.HTMLParser` according to +the HTML5 standard: ``] ]>`` and ``]] >`` no longer end the CDATA section. +Add private method ``_set_support_cdata()`` which can be used to specify +how to parse ``<[CDATA[`` --- as a CDATA section in foreign content +(SVG or MathML) or as a bogus comment in the HTML namespace. _______________________________________________ Python-checkins mailing list -- [email protected] To unsubscribe send an email to [email protected] https://mail.python.org/mailman3//lists/python-checkins.python.org Member address: [email protected]
