https://github.com/python/cpython/commit/9983c7d4416cac8deb2fded1ec9c7daf786c3a02
commit: 9983c7d4416cac8deb2fded1ec9c7daf786c3a02
branch: main
author: Serhiy Storchaka <[email protected]>
committer: serhiy-storchaka <[email protected]>
date: 2025-05-18T22:21:06+03:00
summary:
gh-133890: Handle UnicodeEncodeError in tarfile (GH-134147)
UnicodeEncodeError is now handled the same way as OSError during
TarFile member extraction.
files:
A Misc/NEWS.d/next/Library/2025-05-17-18-08-35.gh-issue-133890.onn9_X.rst
M Lib/tarfile.py
M Lib/test/test_tarfile.py
diff --git a/Lib/tarfile.py b/Lib/tarfile.py
index 13889d768021b1..212b71f6509740 100644
--- a/Lib/tarfile.py
+++ b/Lib/tarfile.py
@@ -2439,7 +2439,7 @@ def _get_extract_tarinfo(self, member, filter_function, path):
unfiltered = tarinfo
try:
tarinfo = filter_function(tarinfo, path)
- except (OSError, FilterError) as e:
+ except (OSError, UnicodeEncodeError, FilterError) as e:
self._handle_fatal_error(e)
except ExtractError as e:
self._handle_nonfatal_error(e)
@@ -2460,7 +2460,7 @@ def _extract_one(self, tarinfo, path, set_attrs, numeric_owner):
self._extract_member(tarinfo, os.path.join(path, tarinfo.name),
set_attrs=set_attrs,
numeric_owner=numeric_owner)
- except OSError as e:
+ except (OSError, UnicodeEncodeError) as e:
self._handle_fatal_error(e)
except ExtractError as e:
self._handle_nonfatal_error(e)
diff --git a/Lib/test/test_tarfile.py b/Lib/test/test_tarfile.py
index 2d9649237a9382..2018a20afd1b18 100644
--- a/Lib/test/test_tarfile.py
+++ b/Lib/test/test_tarfile.py
@@ -3490,11 +3490,12 @@ class ArchiveMaker:
with t.open() as tar:
... # `tar` is now a TarFile with 'filename' in it!
"""
- def __init__(self):
+ def __init__(self, **kwargs):
self.bio = io.BytesIO()
+ self.tar_kwargs = dict(kwargs)
def __enter__(self):
- self.tar_w = tarfile.TarFile(mode='w', fileobj=self.bio)
+ self.tar_w = tarfile.TarFile(mode='w', fileobj=self.bio, **self.tar_kwargs)
return self
def __exit__(self, *exc):
@@ -4073,7 +4074,10 @@ def test_tar_filter(self):
# that in the test archive.)
with tarfile.TarFile.open(tarname) as tar:
for tarinfo in tar.getmembers():
- filtered = tarfile.tar_filter(tarinfo, '')
+ try:
+ filtered = tarfile.tar_filter(tarinfo, '')
+ except UnicodeEncodeError:
+ continue
self.assertIs(filtered.name, tarinfo.name)
self.assertIs(filtered.type, tarinfo.type)
@@ -4084,11 +4088,48 @@ def test_data_filter(self):
for tarinfo in tar.getmembers():
try:
filtered = tarfile.data_filter(tarinfo, '')
- except tarfile.FilterError:
+ except (tarfile.FilterError, UnicodeEncodeError):
continue
self.assertIs(filtered.name, tarinfo.name)
self.assertIs(filtered.type, tarinfo.type)
+ @unittest.skipIf(sys.platform == 'win32', 'requires native bytes paths')
+ def test_filter_unencodable(self):
+ # Sanity check using a valid path.
+ tarinfo = tarfile.TarInfo(os_helper.TESTFN)
+ filtered = tarfile.tar_filter(tarinfo, '')
+ self.assertIs(filtered.name, tarinfo.name)
+ filtered = tarfile.data_filter(tarinfo, '')
+ self.assertIs(filtered.name, tarinfo.name)
+
+ tarinfo = tarfile.TarInfo('test\x00')
+ self.assertRaises(ValueError, tarfile.tar_filter, tarinfo, '')
+ self.assertRaises(ValueError, tarfile.data_filter, tarinfo, '')
+ tarinfo = tarfile.TarInfo('\ud800')
+ self.assertRaises(UnicodeEncodeError, tarfile.tar_filter, tarinfo, '')
+ self.assertRaises(UnicodeEncodeError, tarfile.data_filter, tarinfo, '')
+
+ @unittest.skipIf(sys.platform == 'win32', 'requires native bytes paths')
+ def test_extract_unencodable(self):
+ # Create a member with name \xed\xa0\x80 which is UTF-8 encoded
+ # lone surrogate \ud800.
+ with ArchiveMaker(encoding='ascii', errors='surrogateescape') as arc:
+ arc.add('\udced\udca0\udc80')
+ with os_helper.temp_cwd() as tmp:
+ tar = arc.open(encoding='utf-8', errors='surrogatepass',
+ errorlevel=1)
+ self.assertEqual(tar.getnames(), ['\ud800'])
+ with self.assertRaises(UnicodeEncodeError):
+ tar.extractall()
+ self.assertEqual(os.listdir(), [])
+
+ tar = arc.open(encoding='utf-8', errors='surrogatepass',
+ errorlevel=0, debug=1)
+ with support.captured_stderr() as stderr:
+ tar.extractall()
+ self.assertEqual(os.listdir(), [])
+ self.assertIn('tarfile: UnicodeEncodeError ', stderr.getvalue())
+
def test_change_default_filter_on_instance(self):
tar = tarfile.TarFile(tarname, 'r')
def strict_filter(tarinfo, path):
diff --git a/Misc/NEWS.d/next/Library/2025-05-17-18-08-35.gh-issue-133890.onn9_X.rst b/Misc/NEWS.d/next/Library/2025-05-17-18-08-35.gh-issue-133890.onn9_X.rst
new file mode 100644
index 00000000000000..44565a5424e65b
--- /dev/null
+++ b/Misc/NEWS.d/next/Library/2025-05-17-18-08-35.gh-issue-133890.onn9_X.rst
@@ -0,0 +1,2 @@
+The :mod:`tarfile` module now handles :exc:`UnicodeEncodeError` in the same
+way as :exc:`OSError` when a member cannot be extracted.
_______________________________________________
Python-checkins mailing list -- [email protected]
To unsubscribe send an email to [email protected]
https://mail.python.org/mailman3/lists/python-checkins.python.org/
Member address: [email protected]