https://github.com/python/cpython/commit/7636a66635a0da849cfccd06a52d0a21fb692271
commit: 7636a66635a0da849cfccd06a52d0a21fb692271
branch: main
author: Serhiy Storchaka <[email protected]>
committer: serhiy-storchaka <[email protected]>
date: 2025-08-17T13:37:50+03:00
summary:
gh-135661: Fix parsing unterminated bogus comments in HTMLParser (GH-137873)
Bogus comments that start with "<![CDATA[" should not include the starting "!"
in their value.
files:
M Lib/html/parser.py
M Lib/test/test_htmlparser.py
diff --git a/Lib/html/parser.py b/Lib/html/parser.py
index 75bf8adae6d70a..5d7050dad2396b 100644
--- a/Lib/html/parser.py
+++ b/Lib/html/parser.py
@@ -271,11 +271,8 @@ def goahead(self, end):
j -= len(suffix)
break
self.handle_comment(rawdata[i+4:j])
- elif startswith("<![CDATA[", i):
- if self._support_cdata:
- self.unknown_decl(rawdata[i+3:])
- else:
- self.handle_comment(rawdata[i+1:])
+ elif startswith("<![CDATA[", i) and self._support_cdata:
+ self.unknown_decl(rawdata[i+3:])
elif rawdata[i:i+9].lower() == '<!doctype':
self.handle_decl(rawdata[i+2:])
elif startswith("<!", i):
@@ -350,15 +347,12 @@ def parse_html_declaration(self, i):
if rawdata[i:i+4] == '<!--':
# this case is actually already handled in goahead()
return self.parse_comment(i)
- elif rawdata[i:i+9] == '<![CDATA[':
- if self._support_cdata:
- j = rawdata.find(']]>', i+9)
- if j < 0:
- return -1
- self.unknown_decl(rawdata[i+3: j])
- return j + 3
- else:
- return self.parse_bogus_comment(i)
+ elif rawdata[i:i+9] == '<![CDATA[' and self._support_cdata:
+ j = rawdata.find(']]>', i+9)
+ if j < 0:
+ return -1
+ self.unknown_decl(rawdata[i+3: j])
+ return j + 3
elif rawdata[i:i+9].lower() == '<!doctype':
# find the closing >
gtpos = rawdata.find('>', i+9)
diff --git a/Lib/test/test_htmlparser.py b/Lib/test/test_htmlparser.py
index fff41dab321acd..6a1d69335a0616 100644
--- a/Lib/test/test_htmlparser.py
+++ b/Lib/test/test_htmlparser.py
@@ -791,7 +791,7 @@ def test_eof_in_cdata(self, content):
self._run_check('<![CDATA[' + content,
[('unknown decl', 'CDATA[' + content)])
self._run_check('<![CDATA[' + content,
- [('comment', '![CDATA[' + content)],
+ [('comment', '[CDATA[' + content)],
collector=EventCollector(autocdata=True))
self._run_check('<svg><text y="100"><![CDATA[' + content,
[('starttag', 'svg', []),
_______________________________________________
Python-checkins mailing list -- [email protected]
To unsubscribe send an email to [email protected]
https://mail.python.org/mailman3//lists/python-checkins.python.org
Member address: [email protected]