https://github.com/python/cpython/commit/3a623c6c55200e131302e961b535ecf91e89db91
commit: 3a623c6c55200e131302e961b535ecf91e89db91
branch: 3.10
author: Serhiy Storchaka <[email protected]>
committer: ambv <[email protected]>
date: 2025-10-31T17:55:58+01:00
summary:
[3.10] gh-137836: Support more RAWTEXT and PLAINTEXT elements in HTMLParser
(GH-137837) (GH-140842) (GH-140853)
(cherry picked from commit a17c57eee5b5cc81390750d07e4800b19c0c3084)
(cherry picked from commit 0329bd11c7e98484727bbb9062d53a8fa53ac7fd)
files:
A Misc/NEWS.d/next/Security/2025-08-15-23-08-44.gh-issue-137836.b55rhh.rst
M Doc/library/html.parser.rst
M Lib/html/parser.py
M Lib/test/test_htmlparser.py
diff --git a/Doc/library/html.parser.rst b/Doc/library/html.parser.rst
index 03aff25ce6117a..c1d056a5447edf 100644
--- a/Doc/library/html.parser.rst
+++ b/Doc/library/html.parser.rst
@@ -15,14 +15,18 @@
This module defines a class :class:`HTMLParser` which serves as the basis for
parsing text files formatted in HTML (HyperText Mark-up Language) and XHTML.
-.. class:: HTMLParser(*, convert_charrefs=True)
+.. class:: HTMLParser(*, convert_charrefs=True, scripting=False)
Create a parser instance able to parse invalid markup.
- If *convert_charrefs* is ``True`` (the default), all character
- references (except the ones in ``script``/``style`` elements) are
+ If *convert_charrefs* is true (the default), all character
+ references (except the ones in elements like ``script`` and ``style``) are
automatically converted to the corresponding Unicode characters.
+ If *scripting* is false (the default), the content of the ``noscript``
+ element is parsed normally; if it's true, it's returned as is without
+ being parsed.
+
An :class:`.HTMLParser` instance is fed HTML data and calls handler methods
when start tags, end tags, text, comments, and other markup elements are
encountered. The user should subclass :class:`.HTMLParser` and override its
@@ -37,6 +41,9 @@ parsing text files formatted in HTML (HyperText Mark-up
Language) and XHTML.
.. versionchanged:: 3.5
The default value for argument *convert_charrefs* is now ``True``.
+ .. versionchanged:: 3.10.20
+ Added the *scripting* parameter.
+
Example HTML Parser Application
-------------------------------
@@ -159,15 +166,15 @@ implementations do nothing (except for
:meth:`~HTMLParser.handle_startendtag`):
.. method:: HTMLParser.handle_data(data)
This method is called to process arbitrary data (e.g. text nodes and the
- content of ``<script>...</script>`` and ``<style>...</style>``).
+ content of elements like ``script`` and ``style``).
.. method:: HTMLParser.handle_entityref(name)
This method is called to process a named character reference of the form
``&name;`` (e.g. ``&gt;``), where *name* is a general entity reference
- (e.g. ``'gt'``). This method is never called if *convert_charrefs* is
- ``True``.
+ (e.g. ``'gt'``).
+ This method is only called if *convert_charrefs* is false.
.. method:: HTMLParser.handle_charref(name)
@@ -175,8 +182,8 @@ implementations do nothing (except for
:meth:`~HTMLParser.handle_startendtag`):
This method is called to process decimal and hexadecimal numeric character
references of the form ``&#NNN;`` and ``&#xNNN;``. For example, the decimal
equivalent for ``&gt;`` is ``&#62;``, whereas the hexadecimal is ``&#x3E;``;
- in this case the method will receive ``'62'`` or ``'x3E'``. This method
- is never called if *convert_charrefs* is ``True``.
+ in this case the method will receive ``'62'`` or ``'x3E'``.
+ This method is only called if *convert_charrefs* is false.
.. method:: HTMLParser.handle_comment(data)
@@ -284,8 +291,8 @@ Parsing an element with a few attributes and a title::
Data : Python
End tag : h1
-The content of ``script`` and ``style`` elements is returned as is, without
-further parsing::
+The content of elements like ``script`` and ``style`` is returned as is,
+without further parsing::
>>> parser.feed('<style type="text/css">#python { color: green }</style>')
Start tag: style
@@ -294,10 +301,10 @@ further parsing::
End tag : style
>>> parser.feed('<script type="text/javascript">'
- ... 'alert("<strong>hello!</strong>");</script>')
+ ... 'alert("<strong>hello! &#9786;</strong>");</script>')
Start tag: script
attr: ('type', 'text/javascript')
- Data : alert("<strong>hello!</strong>");
+ Data : alert("<strong>hello! &#9786;</strong>");
End tag : script
Parsing comments::
@@ -317,7 +324,7 @@ correct char (note: these 3 references are all equivalent
to ``'>'``)::
Feeding incomplete chunks to :meth:`~HTMLParser.feed` works, but
:meth:`~HTMLParser.handle_data` might be called more than once
-(unless *convert_charrefs* is set to ``True``)::
+if *convert_charrefs* is false::
>>> for chunk in ['<sp', 'an>buff', 'ered ', 'text</s', 'pan>']:
... parser.feed(chunk)
diff --git a/Lib/html/parser.py b/Lib/html/parser.py
index 8724c22f8ff289..62134d376e1654 100644
--- a/Lib/html/parser.py
+++ b/Lib/html/parser.py
@@ -109,16 +109,24 @@ class HTMLParser(_markupbase.ParserBase):
argument.
"""
- CDATA_CONTENT_ELEMENTS = ("script", "style")
+ # See the HTML5 specs section "13.4 Parsing HTML fragments".
+ # https://html.spec.whatwg.org/multipage/parsing.html#parsing-html-fragments
+ # CDATA_CONTENT_ELEMENTS are parsed in RAWTEXT mode
+ CDATA_CONTENT_ELEMENTS = ("script", "style", "xmp", "iframe", "noembed", "noframes")
RCDATA_CONTENT_ELEMENTS = ("textarea", "title")
- def __init__(self, *, convert_charrefs=True):
+ def __init__(self, *, convert_charrefs=True, scripting=False):
"""Initialize and reset this instance.
- If convert_charrefs is True (the default), all character references
+ If convert_charrefs is true (the default), all character references
are automatically converted to the corresponding Unicode characters.
+
+ If *scripting* is false (the default), the content of the
+ ``noscript`` element is parsed normally; if it's true,
+ it's returned as is without being parsed.
"""
self.convert_charrefs = convert_charrefs
+ self.scripting = scripting
self.reset()
def reset(self):
@@ -153,7 +161,9 @@ def get_starttag_text(self):
def set_cdata_mode(self, elem, *, escapable=False):
self.cdata_elem = elem.lower()
self._escapable = escapable
- if escapable and not self.convert_charrefs:
+ if self.cdata_elem == 'plaintext':
+ self.interesting = re.compile(r'\Z')
+ elif escapable and not self.convert_charrefs:
self.interesting = re.compile(r'&|</%s(?=[\t\n\r\f />])' %
self.cdata_elem,
re.IGNORECASE|re.ASCII)
else:
@@ -441,8 +451,10 @@ def parse_starttag(self, i):
self.handle_startendtag(tag, attrs)
else:
self.handle_starttag(tag, attrs)
- if tag in self.CDATA_CONTENT_ELEMENTS:
- self.set_cdata_mode(tag)
+ if (tag in self.CDATA_CONTENT_ELEMENTS or
+ (self.scripting and tag == "noscript") or
+ tag == "plaintext"):
+ self.set_cdata_mode(tag, escapable=False)
elif tag in self.RCDATA_CONTENT_ELEMENTS:
self.set_cdata_mode(tag, escapable=True)
return endpos
diff --git a/Lib/test/test_htmlparser.py b/Lib/test/test_htmlparser.py
index a7be7a6e20224a..1c1be3ff476886 100644
--- a/Lib/test/test_htmlparser.py
+++ b/Lib/test/test_htmlparser.py
@@ -7,6 +7,18 @@
from test import support
+SAMPLE_RCDATA = (
+ '<!-- not a comment -->'
+ "<not a='start tag'>"
+ '<![CDATA[not a cdata]]>'
+ '<!not a bogus comment>'
+ '</not a bogus comment>'
+ '\u2603'
+)
+
+SAMPLE_RAWTEXT = SAMPLE_RCDATA + '&amp;&#9786;'
+
+
class EventCollector(html.parser.HTMLParser):
def __init__(self, *args, autocdata=False, **kw):
@@ -292,30 +304,20 @@ def test_get_starttag_text(self):
'Date().getTime()+\'"><\\/s\'+\'cript>\');\n//]]>'),
'\n<!-- //\nvar foo = 3.14;\n// -->\n',
'<!-- \u2603 -->',
- 'foo = "</ script>"',
- 'foo = "</scripture>"',
- 'foo = "</script\v>"',
- 'foo = "</script\xa0>"',
- 'foo = "</ſcript>"',
- 'foo = "</scrıpt>"',
])
def test_script_content(self, content):
s = f'<script>{content}</script>'
- self._run_check(s, [("starttag", "script", []),
- ("data", content),
- ("endtag", "script")])
+ self._run_check(s, [
+ ("starttag", "script", []),
+ ("data", content),
+ ("endtag", "script"),
+ ])
@support.subTests('content', [
'a::before { content: "<!-- not a comment -->"; }',
'a::before { content: "&not-an-entity-ref;"; }',
'a::before { content: "<not a=\'start tag\'>"; }',
'a::before { content: "\u2603"; }',
- 'a::before { content: "< /style>"; }',
- 'a::before { content: "</ style>"; }',
- 'a::before { content: "</styled>"; }',
- 'a::before { content: "</style\v>"; }',
- 'a::before { content: "</style\xa0>"; }',
- 'a::before { content: "</ſtyle>"; }',
])
def test_style_content(self, content):
s = f'<style>{content}</style>'
@@ -323,47 +325,59 @@ def test_style_content(self, content):
("data", content),
("endtag", "style")])
- @support.subTests('content', [
- '<!-- not a comment -->',
- "<not a='start tag'>",
- '<![CDATA[not a cdata]]>',
- '<!not a bogus comment>',
- '</not a bogus comment>',
- '\u2603',
- '< /title>',
- '</ title>',
- '</titled>',
- '</title\v>',
- '</title\xa0>',
- '</tıtle>',
+ @support.subTests('tag', ['title', 'textarea'])
+ def test_rcdata_content(self, tag):
+ source = f"<{tag}>{SAMPLE_RCDATA}</{tag}>"
+ self._run_check(source, [
+ ("starttag", tag, []),
+ ("data", SAMPLE_RCDATA),
+ ("endtag", tag),
])
- def test_title_content(self, content):
- source = f"<title>{content}</title>"
+ source = f"<{tag}>&amp;</{tag}>"
self._run_check(source, [
- ("starttag", "title", []),
- ("data", content),
- ("endtag", "title"),
+ ("starttag", tag, []),
+ ('entityref', 'amp'),
+ ("endtag", tag),
])
- @support.subTests('content', [
- '<!-- not a comment -->',
- "<not a='start tag'>",
- '<![CDATA[not a cdata]]>',
- '<!not a bogus comment>',
- '</not a bogus comment>',
- '\u2603',
- '< /textarea>',
- '</ textarea>',
- '</textareable>',
- '</textarea\v>',
- '</textarea\xa0>',
+ @support.subTests('tag',
+ ['style', 'xmp', 'iframe', 'noembed', 'noframes', 'script'])
+ def test_rawtext_content(self, tag):
+ source = f"<{tag}>{SAMPLE_RAWTEXT}</{tag}>"
+ self._run_check(source, [
+ ("starttag", tag, []),
+ ("data", SAMPLE_RAWTEXT),
+ ("endtag", tag),
+ ])
+
+ def test_noscript_content(self):
+ source = f"<noscript>{SAMPLE_RAWTEXT}</noscript>"
+ # scripting=False -- normal mode
+ self._run_check(source, [
+ ('starttag', 'noscript', []),
+ ('comment', ' not a comment '),
+ ('starttag', 'not', [('a', 'start tag')]),
+ ('unknown decl', 'CDATA[not a cdata'),
+ ('comment', 'not a bogus comment'),
+ ('endtag', 'not'),
+ ('data', '☃'),
+ ('entityref', 'amp'),
+ ('charref', '9786'),
+ ('endtag', 'noscript'),
])
- def test_textarea_content(self, content):
- source = f"<textarea>{content}</textarea>"
+ # scripting=True -- RAWTEXT mode
+ self._run_check(source, [
+ ("starttag", "noscript", []),
+ ("data", SAMPLE_RAWTEXT),
+ ("endtag", "noscript"),
+ ], collector=EventCollector(scripting=True))
+
+ def test_plaintext_content(self):
+ content = SAMPLE_RAWTEXT + '</plaintext>' # not closing
+ source = f"<plaintext>{content}"
self._run_check(source, [
- ("starttag", "textarea", []),
+ ("starttag", "plaintext", []),
("data", content),
- ("endtag", "textarea"),
])
@support.subTests('endtag', ['script', 'SCRIPT', 'script ', 'script\n',
@@ -380,52 +394,65 @@ def test_script_closing_tag(self, endtag):
("endtag", "script")],
collector=EventCollectorNoNormalize(convert_charrefs=False))
- @support.subTests('endtag', ['style', 'STYLE', 'style ', 'style\n',
- 'style/', 'style foo=bar', 'style foo=">"'])
- def test_style_closing_tag(self, endtag):
- content = """
- b::before { content: "<!-- not a comment -->"; }
- p::before { content: "&not-an-entity-ref;"; }
- a::before { content: "<i>"; }
- a::after { content: "</i>"; }
- """
- s = f'<StyLE>{content}</{endtag}>'
- self._run_check(s, [("starttag", "style", []),
- ("data", content),
- ("endtag", "style")],
-
collector=EventCollectorNoNormalize(convert_charrefs=False))
-
- @support.subTests('endtag', ['title', 'TITLE', 'title ', 'title\n',
- 'title/', 'title foo=bar', 'title foo=">"'])
- def test_title_closing_tag(self, endtag):
- content = "<!-- not a comment --><i>Egg &amp; Spam</i>"
- s = f'<TitLe>{content}</{endtag}>'
- self._run_check(s, [("starttag", "title", []),
- ('data', '<!-- not a comment --><i>Egg &
Spam</i>'),
- ("endtag", "title")],
-
collector=EventCollectorNoNormalize(convert_charrefs=True))
- self._run_check(s, [("starttag", "title", []),
- ('data', '<!-- not a comment --><i>Egg '),
- ('entityref', 'amp'),
- ('data', ' Spam</i>'),
- ("endtag", "title")],
-
collector=EventCollectorNoNormalize(convert_charrefs=False))
-
- @support.subTests('endtag', ['textarea', 'TEXTAREA', 'textarea ',
'textarea\n',
- 'textarea/', 'textarea foo=bar', 'textarea
foo=">"'])
- def test_textarea_closing_tag(self, endtag):
- content = "<!-- not a comment --><i>Egg &amp; Spam</i>"
- s = f'<TexTarEa>{content}</{endtag}>'
- self._run_check(s, [("starttag", "textarea", []),
- ('data', '<!-- not a comment --><i>Egg &
Spam</i>'),
- ("endtag", "textarea")],
-
collector=EventCollectorNoNormalize(convert_charrefs=True))
- self._run_check(s, [("starttag", "textarea", []),
- ('data', '<!-- not a comment --><i>Egg '),
- ('entityref', 'amp'),
- ('data', ' Spam</i>'),
- ("endtag", "textarea")],
-
collector=EventCollectorNoNormalize(convert_charrefs=False))
+ @support.subTests('tag', [
+ 'script', 'style', 'xmp', 'iframe', 'noembed', 'noframes',
+ 'textarea', 'title', 'noscript',
+ ])
+ def test_closing_tag(self, tag):
+ for endtag in [tag, tag.upper(), f'{tag} ', f'{tag}\n',
+ f'{tag}/', f'{tag} foo=bar', f'{tag} foo=">"']:
+ content = "<!-- not a comment --><i>Spam</i>"
+ s = f'<{tag.upper()}>{content}</{endtag}>'
+ self._run_check(s, [
+ ("starttag", tag, []),
+ ('data', content),
+ ("endtag", tag),
+ ], collector=EventCollectorNoNormalize(convert_charrefs=False, scripting=True))
+
+ @support.subTests('tag', [
+ 'script', 'style', 'xmp', 'iframe', 'noembed', 'noframes',
+ 'textarea', 'title', 'noscript',
+ ])
+ def test_invalid_closing_tag(self, tag):
+ content = (
+ f'< /{tag}>'
+ f'</ {tag}>'
+ f'</{tag}x>'
+ f'</{tag}\v>'
+ f'</{tag}\xa0>'
+ )
+ source = f"<{tag}>{content}</{tag}>"
+ self._run_check(source, [
+ ("starttag", tag, []),
+ ("data", content),
+ ("endtag", tag),
+ ], collector=EventCollector(convert_charrefs=False, scripting=True))
+
+ @support.subTests('tag,endtag', [
+ ('title', 'tıtle'),
+ ('style', 'ſtyle'),
+ ('style', 'ﬅyle'),
+ ('style', 'ﬆyle'),
+ ('iframe', 'ıframe'),
+ ('noframes', 'noframeſ'),
+ ('noscript', 'noſcript'),
+ ('noscript', 'noscrıpt'),
+ ('script', 'ſcript'),
+ ('script', 'scrıpt'),
+ ])
+ def test_invalid_nonascii_closing_tag(self, tag, endtag):
+ content = f"<br></{endtag}>"
+ source = f"<{tag}>{content}"
+ self._run_check(source, [
+ ("starttag", tag, []),
+ ("data", content),
+ ], collector=EventCollector(convert_charrefs=False, scripting=True))
+ source = f"<{tag}>{content}</{tag}>"
+ self._run_check(source, [
+ ("starttag", tag, []),
+ ("data", content),
+ ("endtag", tag),
+ ], collector=EventCollector(convert_charrefs=False, scripting=True))
@support.subTests('tail,end', [
('', False),
diff --git a/Misc/NEWS.d/next/Security/2025-08-15-23-08-44.gh-issue-137836.b55rhh.rst b/Misc/NEWS.d/next/Security/2025-08-15-23-08-44.gh-issue-137836.b55rhh.rst
new file mode 100644
index 00000000000000..c30c9439a76a19
--- /dev/null
+++ b/Misc/NEWS.d/next/Security/2025-08-15-23-08-44.gh-issue-137836.b55rhh.rst
@@ -0,0 +1,3 @@
+Add support of the "plaintext" element, RAWTEXT elements "xmp", "iframe",
+"noembed" and "noframes", and optionally RAWTEXT element "noscript" in
+:class:`html.parser.HTMLParser`.
_______________________________________________
Python-checkins mailing list -- [email protected]
To unsubscribe send an email to [email protected]
https://mail.python.org/mailman3//lists/python-checkins.python.org
Member address: [email protected]