https://github.com/python/cpython/commit/7595e6743ac78ac0dd19418176f66d251668fafc
commit: 7595e6743ac78ac0dd19418176f66d251668fafc
branch: main
author: Bénédikt Tran <[email protected]>
committer: encukou <[email protected]>
date: 2024-06-21T14:22:38+02:00
summary:

gh-120380: fix Python implementation of `pickle.Pickler` for `bytes` and 
`bytearray` objects in protocol version 5. (GH-120422)

files:
A Misc/NEWS.d/next/Core and 
Builtins/2024-06-12-18-23-15.gh-issue-120380.edtqjq.rst
M Lib/pickle.py
M Lib/test/pickletester.py

diff --git a/Lib/pickle.py b/Lib/pickle.py
index 33c97c8c5efb28..d719ceb7a0b8e8 100644
--- a/Lib/pickle.py
+++ b/Lib/pickle.py
@@ -782,14 +782,10 @@ def save_float(self, obj):
             self.write(FLOAT + repr(obj).encode("ascii") + b'\n')
     dispatch[float] = save_float
 
-    def save_bytes(self, obj):
-        if self.proto < 3:
-            if not obj: # bytes object is empty
-                self.save_reduce(bytes, (), obj=obj)
-            else:
-                self.save_reduce(codecs.encode,
-                                 (str(obj, 'latin1'), 'latin1'), obj=obj)
-            return
+    def _save_bytes_no_memo(self, obj):
+        # helper for writing bytes objects for protocol >= 3
+        # without memoizing them
+        assert self.proto >= 3
         n = len(obj)
         if n <= 0xff:
             self.write(SHORT_BINBYTES + pack("<B", n) + obj)
@@ -799,9 +795,29 @@ def save_bytes(self, obj):
             self._write_large_bytes(BINBYTES + pack("<I", n), obj)
         else:
             self.write(BINBYTES + pack("<I", n) + obj)
+
+    def save_bytes(self, obj):
+        if self.proto < 3:
+            if not obj: # bytes object is empty
+                self.save_reduce(bytes, (), obj=obj)
+            else:
+                self.save_reduce(codecs.encode,
+                                 (str(obj, 'latin1'), 'latin1'), obj=obj)
+            return
+        self._save_bytes_no_memo(obj)
         self.memoize(obj)
     dispatch[bytes] = save_bytes
 
+    def _save_bytearray_no_memo(self, obj):
+        # helper for writing bytearray objects for protocol >= 5
+        # without memoizing them
+        assert self.proto >= 5
+        n = len(obj)
+        if n >= self.framer._FRAME_SIZE_TARGET:
+            self._write_large_bytes(BYTEARRAY8 + pack("<Q", n), obj)
+        else:
+            self.write(BYTEARRAY8 + pack("<Q", n) + obj)
+
     def save_bytearray(self, obj):
         if self.proto < 5:
             if not obj:  # bytearray is empty
@@ -809,11 +825,7 @@ def save_bytearray(self, obj):
             else:
                 self.save_reduce(bytearray, (bytes(obj),), obj=obj)
             return
-        n = len(obj)
-        if n >= self.framer._FRAME_SIZE_TARGET:
-            self._write_large_bytes(BYTEARRAY8 + pack("<Q", n), obj)
-        else:
-            self.write(BYTEARRAY8 + pack("<Q", n) + obj)
+        self._save_bytearray_no_memo(obj)
         self.memoize(obj)
     dispatch[bytearray] = save_bytearray
 
@@ -832,10 +844,18 @@ def save_picklebuffer(self, obj):
                 if in_band:
                     # Write data in-band
                     # XXX The C implementation avoids a copy here
+                    buf = m.tobytes()
+                    in_memo = id(buf) in self.memo
                     if m.readonly:
-                        self.save_bytes(m.tobytes())
+                        if in_memo:
+                            self._save_bytes_no_memo(buf)
+                        else:
+                            self.save_bytes(buf)
                     else:
-                        self.save_bytearray(m.tobytes())
+                        if in_memo:
+                            self._save_bytearray_no_memo(buf)
+                        else:
+                            self.save_bytearray(buf)
                 else:
                     # Write data out-of-band
                     self.write(NEXT_BUFFER)
diff --git a/Lib/test/pickletester.py b/Lib/test/pickletester.py
index 93e7dbbd103934..9922591ce7114a 100644
--- a/Lib/test/pickletester.py
+++ b/Lib/test/pickletester.py
@@ -1845,6 +1845,25 @@ def test_bytes(self):
                 p = self.dumps(s, proto)
                 self.assert_is_copy(s, self.loads(p))
 
+    def test_bytes_memoization(self):
+        for proto in protocols:
+            for array_type in [bytes, ZeroCopyBytes]:
+                for s in b'', b'xyz', b'xyz'*100:
+                    with self.subTest(proto=proto, array_type=array_type, s=s, 
independent=False):
+                        b = array_type(s)
+                        p = self.dumps((b, b), proto)
+                        x, y = self.loads(p)
+                        self.assertIs(x, y)
+                        self.assert_is_copy((b, b), (x, y))
+
+                    with self.subTest(proto=proto, array_type=array_type, s=s, 
independent=True):
+                        b1, b2 = array_type(s), array_type(s)
+                        p = self.dumps((b1, b2), proto)
+                        # Note that (b1, b2) = self.loads(p) might have 
identical
+                        # components, i.e., b1 is b2, but this is not always 
the
+                        # case if the content is large (equality still holds).
+                        self.assert_is_copy((b1, b2), self.loads(p))
+
     def test_bytearray(self):
         for proto in protocols:
             for s in b'', b'xyz', b'xyz'*100:
@@ -1864,13 +1883,31 @@ def test_bytearray(self):
                     self.assertNotIn(b'bytearray', p)
                     self.assertTrue(opcode_in_pickle(pickle.BYTEARRAY8, p))
 
-    def test_bytearray_memoization_bug(self):
+    def test_bytearray_memoization(self):
         for proto in protocols:
-            for s in b'', b'xyz', b'xyz'*100:
-                b = bytearray(s)
-                p = self.dumps((b, b), proto)
-                b1, b2 = self.loads(p)
-                self.assertIs(b1, b2)
+            for array_type in [bytearray, ZeroCopyBytearray]:
+                for s in b'', b'xyz', b'xyz'*100:
+                    with self.subTest(proto=proto, array_type=array_type, s=s, 
independent=False):
+                        b = array_type(s)
+                        p = self.dumps((b, b), proto)
+                        b1, b2 = self.loads(p)
+                        self.assertIs(b1, b2)
+
+                    with self.subTest(proto=proto, array_type=array_type, s=s, 
independent=True):
+                        b1a, b2a = array_type(s), array_type(s)
+                        # Unlike bytes, equal but independent bytearray 
objects are
+                        # never identical.
+                        self.assertIsNot(b1a, b2a)
+
+                        p = self.dumps((b1a, b2a), proto)
+                        b1b, b2b = self.loads(p)
+                        self.assertIsNot(b1b, b2b)
+
+                        self.assertIsNot(b1a, b1b)
+                        self.assert_is_copy(b1a, b1b)
+
+                        self.assertIsNot(b2a, b2b)
+                        self.assert_is_copy(b2a, b2b)
 
     def test_ints(self):
         for proto in protocols:
diff --git a/Misc/NEWS.d/next/Core and 
Builtins/2024-06-12-18-23-15.gh-issue-120380.edtqjq.rst b/Misc/NEWS.d/next/Core 
and Builtins/2024-06-12-18-23-15.gh-issue-120380.edtqjq.rst
new file mode 100644
index 00000000000000..c682a0b7666416
--- /dev/null
+++ b/Misc/NEWS.d/next/Core and 
Builtins/2024-06-12-18-23-15.gh-issue-120380.edtqjq.rst 
@@ -0,0 +1,3 @@
+Fix Python implementation of :class:`pickle.Pickler` for :class:`bytes` and
+:class:`bytearray` objects when using protocol version 5. Patch by Bénédikt
+Tran.

_______________________________________________
Python-checkins mailing list -- [email protected]
To unsubscribe send an email to [email protected]
https://mail.python.org/mailman3/lists/python-checkins.python.org/
Member address: [email protected]

Reply via email to