https://github.com/python/cpython/commit/38d4b436ca767351db834189b3a5379406cd52a8
commit: 38d4b436ca767351db834189b3a5379406cd52a8
branch: main
author: Serhiy Storchaka <[email protected]>
committer: serhiy-storchaka <[email protected]>
date: 2025-10-20T20:08:47+03:00
summary:
gh-63161: Fix tokenize.detect_encoding() (GH-139446)
* Support non-UTF-8 shebang and comments if non-UTF-8 encoding is specified.
* Detect decoding error for non-UTF-8 encoding.
* Detect null bytes in source code.
files:
A Misc/NEWS.d/next/Library/2025-09-30-12-52-54.gh-issue-63161.mECM1A.rst
M Lib/test/test_tokenize.py
M Lib/tokenize.py
diff --git a/Lib/test/test_tokenize.py b/Lib/test/test_tokenize.py
index 8fdd03f347b632..d274726eed2e65 100644
--- a/Lib/test/test_tokenize.py
+++ b/Lib/test/test_tokenize.py
@@ -1495,6 +1495,61 @@ def test_cookie_second_line_noncommented_first_line(self):
expected = [b"print('\xc2\xa3')\n"]
self.assertEqual(consumed_lines, expected)
+ def test_first_non_utf8_coding_line(self):
+ lines = (
+ b'#coding:iso-8859-15 \xa4\n',
+ b'print(something)\n'
+ )
+ encoding, consumed_lines = tokenize.detect_encoding(self.get_readline(lines))
+ self.assertEqual(encoding, 'iso-8859-15')
+ self.assertEqual(consumed_lines, list(lines[:1]))
+
+ def test_first_utf8_coding_line_error(self):
+ lines = (
+ b'#coding:ascii \xc3\xa4\n',
+ b'print(something)\n'
+ )
+ with self.assertRaises(SyntaxError):
+ tokenize.detect_encoding(self.get_readline(lines))
+
+ def test_second_non_utf8_coding_line(self):
+ lines = (
+ b'#!/usr/bin/python\n',
+ b'#coding:iso-8859-15 \xa4\n',
+ b'print(something)\n'
+ )
+ encoding, consumed_lines = tokenize.detect_encoding(self.get_readline(lines))
+ self.assertEqual(encoding, 'iso-8859-15')
+ self.assertEqual(consumed_lines, list(lines[:2]))
+
+ def test_second_utf8_coding_line_error(self):
+ lines = (
+ b'#!/usr/bin/python\n',
+ b'#coding:ascii \xc3\xa4\n',
+ b'print(something)\n'
+ )
+ with self.assertRaises(SyntaxError):
+ tokenize.detect_encoding(self.get_readline(lines))
+
+ def test_non_utf8_shebang(self):
+ lines = (
+ b'#!/home/\xa4/bin/python\n',
+ b'#coding:iso-8859-15\n',
+ b'print(something)\n'
+ )
+ encoding, consumed_lines = tokenize.detect_encoding(self.get_readline(lines))
+ self.assertEqual(encoding, 'iso-8859-15')
+ self.assertEqual(consumed_lines, list(lines[:2]))
+
+ def test_utf8_shebang_error(self):
+ lines = (
+ b'#!/home/\xc3\xa4/bin/python\n',
+ b'#coding:ascii\n',
+ b'print(something)\n'
+ )
+ with self.assertRaises(SyntaxError):
+ tokenize.detect_encoding(self.get_readline(lines))
+
def test_cookie_second_line_empty_first_line(self):
lines = (
b'\n',
@@ -1548,6 +1603,28 @@ def test_double_coding_utf8(self):
self.assertEqual(encoding, 'utf-8')
self.assertEqual(consumed_lines, list(lines[:1]))
+ def test_nul_in_first_coding_line(self):
+ lines = (
+ b'#coding:iso8859-15\x00\n',
+ b'\n',
+ b'\n',
+ b'print(something)\n'
+ )
+ with self.assertRaisesRegex(SyntaxError,
+ "source code cannot contain null bytes"):
+ tokenize.detect_encoding(self.get_readline(lines))
+
+ def test_nul_in_second_coding_line(self):
+ lines = (
+ b'#!/usr/bin/python\n',
+ b'#coding:iso8859-15\x00\n',
+ b'\n',
+ b'print(something)\n'
+ )
+ with self.assertRaisesRegex(SyntaxError,
+ "source code cannot contain null bytes"):
+ tokenize.detect_encoding(self.get_readline(lines))
+
def test_latin1_normalization(self):
# See get_normal_name() in Parser/tokenizer/helpers.c.
encodings = ("latin-1", "iso-8859-1", "iso-latin-1", "latin-1-unix",
diff --git a/Lib/tokenize.py b/Lib/tokenize.py
index 7e71755068e1df..1f31258ce361c9 100644
--- a/Lib/tokenize.py
+++ b/Lib/tokenize.py
@@ -36,7 +36,7 @@
from token import EXACT_TOKEN_TYPES
import _tokenize
-cookie_re = re.compile(r'^[ \t\f]*#.*?coding[:=][ \t]*([-\w.]+)', re.ASCII)
+cookie_re = re.compile(br'^[ \t\f]*#.*?coding[:=][ \t]*([-\w.]+)', re.ASCII)
blank_re = re.compile(br'^[ \t\f]*(?:[#\r\n]|$)', re.ASCII)
import token
@@ -385,22 +385,23 @@ def read_or_stop():
except StopIteration:
return b''
- def find_cookie(line):
+ def check(line, encoding):
+ # Check if the line matches the encoding.
+ if 0 in line:
+ raise SyntaxError("source code cannot contain null bytes")
try:
- # Decode as UTF-8. Either the line is an encoding declaration,
- # in which case it should be pure ASCII, or it must be UTF-8
- # per default encoding.
- line_string = line.decode('utf-8')
+ line.decode(encoding)
except UnicodeDecodeError:
msg = "invalid or missing encoding declaration"
if filename is not None:
msg = '{} for {!r}'.format(msg, filename)
raise SyntaxError(msg)
- match = cookie_re.match(line_string)
+ def find_cookie(line):
+ match = cookie_re.match(line)
if not match:
return None
- encoding = _get_normal_name(match.group(1))
+ encoding = _get_normal_name(match.group(1).decode())
try:
codec = lookup(encoding)
except LookupError:
@@ -433,18 +434,23 @@ def find_cookie(line):
encoding = find_cookie(first)
if encoding:
+ check(first, encoding)
return encoding, [first]
if not blank_re.match(first):
+ check(first, default)
return default, [first]
second = read_or_stop()
if not second:
+ check(first, default)
return default, [first]
encoding = find_cookie(second)
if encoding:
+ check(first + second, encoding)
return encoding, [first, second]
+ check(first + second, default)
return default, [first, second]
diff --git a/Misc/NEWS.d/next/Library/2025-09-30-12-52-54.gh-issue-63161.mECM1A.rst b/Misc/NEWS.d/next/Library/2025-09-30-12-52-54.gh-issue-63161.mECM1A.rst
new file mode 100644
index 00000000000000..3daed20d099a8a
--- /dev/null
+++ b/Misc/NEWS.d/next/Library/2025-09-30-12-52-54.gh-issue-63161.mECM1A.rst
@@ -0,0 +1,3 @@
+Fix :func:`tokenize.detect_encoding`. Support non-UTF-8 shebang and comments
+if non-UTF-8 encoding is specified. Detect decoding error for non-UTF-8
+encoding. Detect null bytes in source code.
_______________________________________________
Python-checkins mailing list -- [email protected]
To unsubscribe send an email to [email protected]
https://mail.python.org/mailman3//lists/python-checkins.python.org
Member address: [email protected]