https://github.com/python/cpython/commit/95296a9d40aa2d58502a09e86e2a93c03df23366
commit: 95296a9d40aa2d58502a09e86e2a93c03df23366
branch: main
author: Serhiy Storchaka <[email protected]>
committer: serhiy-storchaka <[email protected]>
date: 2025-11-19T13:55:10+02:00
summary:
gh-140875: Fix handling of unclosed charrefs before EOF in HTMLParser
(GH-140904)
files:
A Misc/NEWS.d/next/Library/2025-11-02-10-44-23.gh-issue-140875.wt6B37.rst
M Lib/html/parser.py
M Lib/test/test_htmlparser.py
diff --git a/Lib/html/parser.py b/Lib/html/parser.py
index e50620de800d63..80fb8c3f929f6b 100644
--- a/Lib/html/parser.py
+++ b/Lib/html/parser.py
@@ -24,6 +24,7 @@
entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
charref = re.compile('&#(?:[0-9]+|[xX][0-9a-fA-F]+)[^0-9a-fA-F]')
+incomplete_charref = re.compile('&#(?:[0-9]|[xX][0-9a-fA-F])')
attr_charref =
re.compile(r'&(#[0-9]+|#[xX][0-9a-fA-F]+|[a-zA-Z][a-zA-Z0-9]*)[;=]?')
starttagopen = re.compile('<[a-zA-Z]')
@@ -304,10 +305,20 @@ def goahead(self, end):
k = k - 1
i = self.updatepos(i, k)
continue
+ match = incomplete_charref.match(rawdata, i)
+ if match:
+ if end:
+ self.handle_charref(rawdata[i+2:])
+ i = self.updatepos(i, n)
+ break
+ # incomplete
+ break
+ elif i + 3 < n: # larger than "&#x"
+ # not the end of the buffer, and can't be confused
+ # with some other construct
+ self.handle_data("&#")
+ i = self.updatepos(i, i + 2)
else:
- if ";" in rawdata[i:]: # bail by consuming &#
- self.handle_data(rawdata[i:i+2])
- i = self.updatepos(i, i+2)
break
elif startswith('&', i):
match = entityref.match(rawdata, i)
@@ -321,15 +332,13 @@ def goahead(self, end):
continue
match = incomplete.match(rawdata, i)
if match:
- # match.group() will contain at least 2 chars
- if end and match.group() == rawdata[i:]:
- k = match.end()
- if k <= i:
- k = n
- i = self.updatepos(i, i + 1)
+ if end:
+ self.handle_entityref(rawdata[i+1:])
+ i = self.updatepos(i, n)
+ break
# incomplete
break
- elif (i + 1) < n:
+ elif i + 1 < n:
# not the end of the buffer, and can't be confused
# with some other construct
self.handle_data("&")
diff --git a/Lib/test/test_htmlparser.py b/Lib/test/test_htmlparser.py
index 19dde9362a43b6..e4eff1ea17a670 100644
--- a/Lib/test/test_htmlparser.py
+++ b/Lib/test/test_htmlparser.py
@@ -109,12 +109,13 @@ def get_events(self):
class TestCaseBase(unittest.TestCase):
- def get_collector(self):
- return EventCollector(convert_charrefs=False)
+ def get_collector(self, convert_charrefs=False):
+ return EventCollector(convert_charrefs=convert_charrefs)
- def _run_check(self, source, expected_events, collector=None):
+ def _run_check(self, source, expected_events,
+ *, collector=None, convert_charrefs=False):
if collector is None:
- collector = self.get_collector()
+ collector = self.get_collector(convert_charrefs=convert_charrefs)
parser = collector
for s in source:
parser.feed(s)
@@ -128,7 +129,7 @@ def _run_check(self, source, expected_events,
collector=None):
def _run_check_extra(self, source, events):
self._run_check(source, events,
- EventCollectorExtra(convert_charrefs=False))
+ collector=EventCollectorExtra(convert_charrefs=False))
class HTMLParserTestCase(TestCaseBase):
@@ -187,10 +188,87 @@ def test_malformatted_charref(self):
])
def test_unclosed_entityref(self):
- self._run_check("&entityref foo", [
- ("entityref", "entityref"),
- ("data", " foo"),
- ])
+ self._run_check('&gt &lt', [('entityref', 'gt'), ('data', ' '),
('entityref', 'lt')],
+ convert_charrefs=False)
+ self._run_check('&gt &lt', [('data', '> <')], convert_charrefs=True)
+
+ self._run_check('&undefined &lt',
+ [('entityref', 'undefined'), ('data', ' '),
('entityref', 'lt')],
+ convert_charrefs=False)
+ self._run_check('&undefined &lt', [('data', '&undefined <')],
+ convert_charrefs=True)
+
+ self._run_check('&gtundefined &lt',
+ [('entityref', 'gtundefined'), ('data', ' '),
('entityref', 'lt')],
+ convert_charrefs=False)
+ self._run_check('&gtundefined &lt', [('data', '>undefined <')],
+ convert_charrefs=True)
+
+ self._run_check('& &lt', [('data', '& '), ('entityref', 'lt')],
+ convert_charrefs=False)
+ self._run_check('& &lt', [('data', '& <')], convert_charrefs=True)
+
+ def test_eof_in_entityref(self):
+ self._run_check('&gt', [('entityref', 'gt')], convert_charrefs=False)
+ self._run_check('&gt', [('data', '>')], convert_charrefs=True)
+
+ self._run_check('&g', [('entityref', 'g')], convert_charrefs=False)
+ self._run_check('&g', [('data', '&g')], convert_charrefs=True)
+
+ self._run_check('&undefined', [('entityref', 'undefined')],
+ convert_charrefs=False)
+ self._run_check('&undefined', [('data', '&undefined')],
+ convert_charrefs=True)
+
+ self._run_check('&gtundefined', [('entityref', 'gtundefined')],
+ convert_charrefs=False)
+ self._run_check('&gtundefined', [('data', '>undefined')],
+ convert_charrefs=True)
+
+ self._run_check('&', [('data', '&')], convert_charrefs=False)
+ self._run_check('&', [('data', '&')], convert_charrefs=True)
+
+ def test_unclosed_charref(self):
+ self._run_check('&#123 &lt', [('charref', '123'), ('data', ' '),
('entityref', 'lt')],
+ convert_charrefs=False)
+ self._run_check('&#123 &lt', [('data', '{ <')], convert_charrefs=True)
+ self._run_check('&#xab &lt', [('charref', 'xab'), ('data', ' '),
('entityref', 'lt')],
+ convert_charrefs=False)
+ self._run_check('&#xab &lt', [('data', '\xab <')],
convert_charrefs=True)
+
+ self._run_check('&#123456789 &lt',
+ [('charref', '123456789'), ('data', ' '),
('entityref', 'lt')],
+ convert_charrefs=False)
+ self._run_check('&#123456789 &lt', [('data', '\ufffd <')],
+ convert_charrefs=True)
+ self._run_check('&#x123456789 &lt',
+ [('charref', 'x123456789'), ('data', ' '),
('entityref', 'lt')],
+ convert_charrefs=False)
+ self._run_check('&#x123456789 &lt', [('data', '\ufffd <')],
+ convert_charrefs=True)
+
+ self._run_check('&# &lt', [('data', '&# '), ('entityref', 'lt')],
convert_charrefs=False)
+ self._run_check('&# &lt', [('data', '&# <')], convert_charrefs=True)
+ self._run_check('&#x &lt', [('data', '&#x '), ('entityref', 'lt')],
convert_charrefs=False)
+ self._run_check('&#x &lt', [('data', '&#x <')], convert_charrefs=True)
+
+ def test_eof_in_charref(self):
+ self._run_check('&#123', [('charref', '123')], convert_charrefs=False)
+ self._run_check('&#123', [('data', '{')], convert_charrefs=True)
+ self._run_check('&#xab', [('charref', 'xab')], convert_charrefs=False)
+ self._run_check('&#xab', [('data', '\xab')], convert_charrefs=True)
+
+ self._run_check('&#123456789', [('charref', '123456789')],
+ convert_charrefs=False)
+ self._run_check('&#123456789', [('data', '\ufffd')],
convert_charrefs=True)
+ self._run_check('&#x123456789', [('charref', 'x123456789')],
+ convert_charrefs=False)
+ self._run_check('&#x123456789', [('data', '\ufffd')],
convert_charrefs=True)
+
+ self._run_check('&#', [('data', '&#')], convert_charrefs=False)
+ self._run_check('&#', [('data', '&#')], convert_charrefs=True)
+ self._run_check('&#x', [('data', '&#x')], convert_charrefs=False)
+ self._run_check('&#x', [('data', '&#x')], convert_charrefs=True)
def test_bad_nesting(self):
# Strangely, this *is* supposed to test that overlapping
@@ -762,20 +840,6 @@ def test_correct_detection_of_start_tags(self):
]
self._run_check(html, expected)
- def test_EOF_in_charref(self):
- # see #17802
- # This test checks that the UnboundLocalError reported in the issue
- # is not raised, however I'm not sure the returned values are correct.
- # Maybe HTMLParser should use self.unescape for these
- data = [
- ('a&', [('data', 'a&')]),
- ('a&b', [('data', 'ab')]),
- ('a&b ', [('data', 'a'), ('entityref', 'b'), ('data', ' ')]),
- ('a&b;', [('data', 'a'), ('entityref', 'b')]),
- ]
- for html, expected in data:
- self._run_check(html, expected)
-
def test_eof_in_comments(self):
data = [
('<!--', [('comment', '')]),
diff --git
a/Misc/NEWS.d/next/Library/2025-11-02-10-44-23.gh-issue-140875.wt6B37.rst
b/Misc/NEWS.d/next/Library/2025-11-02-10-44-23.gh-issue-140875.wt6B37.rst
new file mode 100644
index 00000000000000..c08a8966d53401
--- /dev/null
+++ b/Misc/NEWS.d/next/Library/2025-11-02-10-44-23.gh-issue-140875.wt6B37.rst
@@ -0,0 +1,3 @@
+Fix handling of unclosed character references (named and numerical)
+followed by the end of file in :class:`html.parser.HTMLParser` with
+``convert_charrefs=False``.
_______________________________________________
Python-checkins mailing list -- [email protected]
To unsubscribe send an email to [email protected]
https://mail.python.org/mailman3//lists/python-checkins.python.org
Member address: [email protected]