[Python-checkins] [3.9] gh-118350: Fix support of elements "textarea" and "title" in HTMLParser (GH-135310) (GH-137784)

ambv Sat, 13 Sep 2025 13:52:25 -0700

https://github.com/python/cpython/commit/4dea0fb67b7064eb637d859de0b5d3ce4989bd1c
commit: 4dea0fb67b7064eb637d859de0b5d3ce4989bd1c
branch: 3.9
author: Serhiy Storchaka <[email protected]>
committer: ambv <[email protected]>
date: 2025-09-13T22:35:13+02:00
summary:


[3.9] gh-118350: Fix support of elements "textarea" and "title" in HTMLParser 
(GH-135310) (GH-137784)

(cherry picked from commit 4d02f31cdd45d81b95540d9076222b709d4f2335)

Co-authored-by: Timon Viola <[email protected]>
Co-authored-by: Łukasz Langa <[email protected]>

files:
A Misc/NEWS.d/next/Security/2025-06-09-20-38-25.gh-issue-118350.KgWCcP.rst
M Lib/html/parser.py
M Lib/test/test_htmlparser.py

diff --git a/Lib/html/parser.py b/Lib/html/parser.py
index c7dde2e911f935..c609ee99b4d6a9 100644
--- a/Lib/html/parser.py
+++ b/Lib/html/parser.py
@@ -110,6 +110,7 @@ class HTMLParser(_markupbase.ParserBase):
     """
 
     CDATA_CONTENT_ELEMENTS = ("script", "style")
+    RCDATA_CONTENT_ELEMENTS = ("textarea", "title")
 
     def __init__(self, *, convert_charrefs=True):
         """Initialize and reset this instance.
@@ -126,6 +127,7 @@ def reset(self):
         self.lasttag = '???'
         self.interesting = interesting_normal
         self.cdata_elem = None
+        self._escapable = True
         _markupbase.ParserBase.reset(self)
 
     def feed(self, data):
@@ -147,14 +149,20 @@ def get_starttag_text(self):
         """Return full source of start tag: '<...>'."""
         return self.__starttag_text
 
-    def set_cdata_mode(self, elem):
+    def set_cdata_mode(self, elem, *, escapable=False):
         self.cdata_elem = elem.lower()
-        self.interesting = re.compile(r'</%s(?=[\t\n\r\f />])' % 
self.cdata_elem,
-                                      re.IGNORECASE|re.ASCII)
+        self._escapable = escapable
+        if escapable and not self.convert_charrefs:
+            self.interesting = re.compile(r'&|</%s(?=[\t\n\r\f />])' % 
self.cdata_elem,
+                                          re.IGNORECASE|re.ASCII)
+        else:
+            self.interesting = re.compile(r'</%s(?=[\t\n\r\f />])' % 
self.cdata_elem,
+                                          re.IGNORECASE|re.ASCII)
 
     def clear_cdata_mode(self):
         self.interesting = interesting_normal
         self.cdata_elem = None
+        self._escapable = True
 
     # Internal -- handle data as far as reasonable.  May leave state
     # and data to be processed by a subsequent call.  If 'end' is
@@ -187,7 +195,7 @@ def goahead(self, end):
                         break
                     j = n
             if i < j:
-                if self.convert_charrefs and not self.cdata_elem:
+                if self.convert_charrefs and self._escapable:
                     self.handle_data(unescape(rawdata[i:j]))
                 else:
                     self.handle_data(rawdata[i:j])
@@ -289,7 +297,7 @@ def goahead(self, end):
                 assert 0, "interesting.search() lied"
         # end while
         if end and i < n:
-            if self.convert_charrefs and not self.cdata_elem:
+            if self.convert_charrefs and self._escapable:
                 self.handle_data(unescape(rawdata[i:n]))
             else:
                 self.handle_data(rawdata[i:n])
@@ -408,6 +416,8 @@ def parse_starttag(self, i):
             self.handle_starttag(tag, attrs)
             if tag in self.CDATA_CONTENT_ELEMENTS:
                 self.set_cdata_mode(tag)
+            elif tag in self.RCDATA_CONTENT_ELEMENTS:
+                self.set_cdata_mode(tag, escapable=True)
         return endpos
 
     # Internal -- check to see if we have a complete starttag; return end
diff --git a/Lib/test/test_htmlparser.py b/Lib/test/test_htmlparser.py
index 8493fe11259286..93f6ceae6f1422 100644
--- a/Lib/test/test_htmlparser.py
+++ b/Lib/test/test_htmlparser.py
@@ -316,6 +316,49 @@ def test_style_content(self, content):
                             ("data", content),
                             ("endtag", "style")])
 
+    @support.subTests('content', [
+            '<!-- not a comment -->',
+            "<not a='start tag'>",
+            '<![CDATA[not a cdata]]>',
+            '<!not a bogus comment>',
+            '</not a bogus comment>',
+            '\u2603',
+            '< /title>',
+            '</ title>',
+            '</titled>',
+            '</title\v>',
+            '</title\xa0>',
+            '</tıtle>',
+        ])
+    def test_title_content(self, content):
+        source = f"<title>{content}</title>"
+        self._run_check(source, [
+            ("starttag", "title", []),
+            ("data", content),
+            ("endtag", "title"),
+        ])
+
+    @support.subTests('content', [
+            '<!-- not a comment -->',
+            "<not a='start tag'>",
+            '<![CDATA[not a cdata]]>',
+            '<!not a bogus comment>',
+            '</not a bogus comment>',
+            '\u2603',
+            '< /textarea>',
+            '</ textarea>',
+            '</textareable>',
+            '</textarea\v>',
+            '</textarea\xa0>',
+        ])
+    def test_textarea_content(self, content):
+        source = f"<textarea>{content}</textarea>"
+        self._run_check(source, [
+            ("starttag", "textarea", []),
+            ("data", content),
+            ("endtag", "textarea"),
+        ])
+
     @support.subTests('endtag', ['script', 'SCRIPT', 'script ', 'script\n',
                                  'script/', 'script foo=bar', 'script 
foo=">"'])
     def test_script_closing_tag(self, endtag):
@@ -345,6 +388,38 @@ def test_style_closing_tag(self, endtag):
                             ("endtag", "style")],
                         
collector=EventCollectorNoNormalize(convert_charrefs=False))
 
+    @support.subTests('endtag', ['title', 'TITLE', 'title ', 'title\n',
+                                 'title/', 'title foo=bar', 'title foo=">"'])
+    def test_title_closing_tag(self, endtag):
+        content = "<!-- not a comment --><i>Egg &amp; Spam</i>"
+        s = f'<TitLe>{content}</{endtag}>'
+        self._run_check(s, [("starttag", "title", []),
+                            ('data', '<!-- not a comment --><i>Egg & 
Spam</i>'),
+                            ("endtag", "title")],
+                        
collector=EventCollectorNoNormalize(convert_charrefs=True))
+        self._run_check(s, [("starttag", "title", []),
+                            ('data', '<!-- not a comment --><i>Egg '),
+                            ('entityref', 'amp'),
+                            ('data', ' Spam</i>'),
+                            ("endtag", "title")],
+                        
collector=EventCollectorNoNormalize(convert_charrefs=False))
+
+    @support.subTests('endtag', ['textarea', 'TEXTAREA', 'textarea ', 
'textarea\n',
+                                 'textarea/', 'textarea foo=bar', 'textarea 
foo=">"'])
+    def test_textarea_closing_tag(self, endtag):
+        content = "<!-- not a comment --><i>Egg &amp; Spam</i>"
+        s = f'<TexTarEa>{content}</{endtag}>'
+        self._run_check(s, [("starttag", "textarea", []),
+                            ('data', '<!-- not a comment --><i>Egg & 
Spam</i>'),
+                            ("endtag", "textarea")],
+                        
collector=EventCollectorNoNormalize(convert_charrefs=True))
+        self._run_check(s, [("starttag", "textarea", []),
+                            ('data', '<!-- not a comment --><i>Egg '),
+                            ('entityref', 'amp'),
+                            ('data', ' Spam</i>'),
+                            ("endtag", "textarea")],
+                        
collector=EventCollectorNoNormalize(convert_charrefs=False))
+
     @support.subTests('tail,end', [
         ('', False),
         ('<', False),
@@ -362,6 +437,27 @@ def test_eof_in_script(self, tail, end):
                             ("data", content if end else content + tail)],
                         
collector=EventCollectorNoNormalize(convert_charrefs=False))
 
+    @support.subTests('tail,end', [
+        ('', False),
+        ('<', False),
+        ('</', False),
+        ('</t', False),
+        ('</title', False),
+        ('</title ', True),
+        ('</title foo=bar', True),
+        ('</title foo=">', True),
+    ])
+    def test_eof_in_title(self, tail, end):
+        s = f'<TitLe>Egg &amp; Spam{tail}'
+        self._run_check(s, [("starttag", "title", []),
+                            ("data", "Egg & Spam" + ('' if end else tail))],
+                        
collector=EventCollectorNoNormalize(convert_charrefs=True))
+        self._run_check(s, [("starttag", "title", []),
+                            ('data', 'Egg '),
+                            ('entityref', 'amp'),
+                            ('data', ' Spam' + ('' if end else tail))],
+                        
collector=EventCollectorNoNormalize(convert_charrefs=False))
+
     def test_comments(self):
         html = ("<!-- I'm a valid comment -->"
                 '<!--me too!-->'
diff --git 
a/Misc/NEWS.d/next/Security/2025-06-09-20-38-25.gh-issue-118350.KgWCcP.rst 
b/Misc/NEWS.d/next/Security/2025-06-09-20-38-25.gh-issue-118350.KgWCcP.rst
new file mode 100644
index 00000000000000..6ad3caf33b2201
--- /dev/null
+++ b/Misc/NEWS.d/next/Security/2025-06-09-20-38-25.gh-issue-118350.KgWCcP.rst
@@ -0,0 +1,2 @@
+Fix support of escapable raw text mode (elements "textarea" and "title")
+in :class:`html.parser.HTMLParser`.

_______________________________________________
Python-checkins mailing list -- [email protected]
To unsubscribe send an email to [email protected]
https://mail.python.org/mailman3//lists/python-checkins.python.org
Member address: [email protected]

[Python-checkins] [3.9] gh-118350: Fix support of elements "textarea" and "title" in HTMLParser (GH-135310) (GH-137784)

Reply via email to