This evening I had a couple of hours to spare and happened to read Guido's plea for help near the beginning of it. I picked up a failing test case that no one had claimed and did what I could: it's not finished, but it fixes approximately 75% of the errors in test_tarfile. I concentrated on fixing problems that the test case turned up; a pure inspection of the source would turn up lots of things I missed, I'm sure. I hope it's useful; it probably needs minor attention from me on what the Right Thing to do is for encoding and decoding: ascii? I had to do a .decode('latin-1') to pass the umlaut-in-a-filename test, but I'm not at all sure that that's the true Right Thing. Anyway, here's a start; I'm explicitly *not* claiming that I'll ever touch this source code again; I don't want to block anyone else from working on it. Enjoy.
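
To make that encoding question concrete before the patch: here is roughly the umlaut case that pushed me toward latin-1. This is an illustrative sketch only; the filename bytes are made up, and the snippet just mimics what the patched nts() does rather than calling the module.

    # A name field as it might come out of a tar header: raw bytes, NUL-padded.
    field = b"\xc4rger.txt" + b"\0" * 5       # hypothetical filename with an umlaut (0xC4)
    name = field[:field.find(b"\0")]          # use the bytes up to the first NUL
    print(name.decode('latin-1'))             # 'Ärger.txt'; latin-1 can decode any byte
    # name.decode('ascii') would raise UnicodeDecodeError on the 0xC4 byte,
    # and utf-8 would also fail here, since 0xC4 is not followed by a
    # continuation byte. That's the choice I'm unsure about.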
--pj

Index: tarfile.py
===================================================================
--- tarfile.py  (revision 56785)
+++ tarfile.py  (working copy)
@@ -72,33 +72,33 @@
 #---------------------------------------------------------
 # tar constants
 #---------------------------------------------------------
-NUL = "\0"                      # the null character
+NUL = b"\0"                     # the null character
 BLOCKSIZE = 512                 # length of processing blocks
 RECORDSIZE = BLOCKSIZE * 20     # length of records
-GNU_MAGIC = "ustar  \0"         # magic gnu tar string
-POSIX_MAGIC = "ustar\x0000"     # magic posix tar string
+GNU_MAGIC = b"ustar  \0"        # magic gnu tar string
+POSIX_MAGIC = b"ustar\x0000"    # magic posix tar string

 LENGTH_NAME = 100               # maximum length of a filename
 LENGTH_LINK = 100               # maximum length of a linkname
 LENGTH_PREFIX = 155             # maximum length of the prefix field

-REGTYPE = "0"                   # regular file
-AREGTYPE = "\0"                 # regular file
-LNKTYPE = "1"                   # link (inside tarfile)
-SYMTYPE = "2"                   # symbolic link
-CHRTYPE = "3"                   # character special device
-BLKTYPE = "4"                   # block special device
-DIRTYPE = "5"                   # directory
-FIFOTYPE = "6"                  # fifo special device
-CONTTYPE = "7"                  # contiguous file
+REGTYPE = b"0"                  # regular file
+AREGTYPE = b"\0"                # regular file
+LNKTYPE = b"1"                  # link (inside tarfile)
+SYMTYPE = b"2"                  # symbolic link
+CHRTYPE = b"3"                  # character special device
+BLKTYPE = b"4"                  # block special device
+DIRTYPE = b"5"                  # directory
+FIFOTYPE = b"6"                 # fifo special device
+CONTTYPE = b"7"                 # contiguous file

-GNUTYPE_LONGNAME = "L"          # GNU tar longname
-GNUTYPE_LONGLINK = "K"          # GNU tar longlink
-GNUTYPE_SPARSE = "S"            # GNU tar sparse file
+GNUTYPE_LONGNAME = b"L"         # GNU tar longname
+GNUTYPE_LONGLINK = b"K"         # GNU tar longlink
+GNUTYPE_SPARSE = b"S"           # GNU tar sparse file

-XHDTYPE = "x"                   # POSIX.1-2001 extended header
-XGLTYPE = "g"                   # POSIX.1-2001 global header
-SOLARIS_XHDTYPE = "X"           # Solaris extended header
+XHDTYPE = b"x"                  # POSIX.1-2001 extended header
+XGLTYPE = b"g"                  # POSIX.1-2001 global header
+SOLARIS_XHDTYPE = b"X"          # Solaris extended header

 USTAR_FORMAT = 0                # POSIX.1-1988 (ustar) format
 GNU_FORMAT = 1                  # GNU tar format
@@ -176,6 +176,9 @@
 def stn(s, length):
     """Convert a python string to a null-terminated string buffer.
     """
+    #return s[:length].encode('ascii') + (length - len(s)) * NUL
+    if type(s) != type(b''):
+        s = s.encode('ascii')
     return s[:length] + (length - len(s)) * NUL

 def nts(s):
@@ -184,8 +187,8 @@
     # Use the string up to the first null char.
     p = s.find("\0")
     if p == -1:
-        return s
-    return s[:p]
+        return s.decode('latin-1')
+    return s[:p].decode('latin-1')

 def nti(s):
     """Convert a number field to a python number.
@@ -214,7 +217,7 @@
     # encoding, the following digits-1 bytes are a big-endian
     # representation. This allows values up to (256**(digits-1))-1.
     if 0 <= n < 8 ** (digits - 1):
-        s = "%0*o" % (digits - 1, n) + NUL
+        s = ("%0*o" % (digits - 1, n)).encode('ascii') + NUL
     else:
         if format != GNU_FORMAT or n >= 256 ** (digits - 1):
             raise ValueError("overflow in number field")
@@ -412,7 +415,7 @@
         self.comptype = comptype
         self.fileobj = fileobj
         self.bufsize = bufsize
-        self.buf = ""
+        self.buf = b""
         self.pos = 0
         self.closed = False

@@ -434,7 +437,7 @@
             except ImportError:
                 raise CompressionError("bz2 module is not available")
             if mode == "r":
-                self.dbuf = ""
+                self.dbuf = b""
                 self.cmp = bz2.BZ2Decompressor()
             else:
                 self.cmp = bz2.BZ2Compressor()
@@ -451,10 +454,10 @@
                                             self.zlib.DEF_MEM_LEVEL,
                                             0)
         timestamp = struct.pack("<L", int(time.time()))
-        self.__write("\037\213\010\010%s\002\377" % timestamp)
+        self.__write(b"\037\213\010\010" + timestamp + b"\002\377")
         if self.name.endswith(".gz"):
             self.name = self.name[:-3]
-        self.__write(self.name + NUL)
+        self.__write(self.name.encode('ascii') + NUL)

     def write(self, s):
         """Write string s to the stream.
@@ -487,7 +490,7 @@
         if self.mode == "w" and self.buf:
             self.fileobj.write(self.buf)
-            self.buf = ""
+            self.buf = b""

         if self.comptype == "gz":
             # The native zlib crc is an unsigned 32-bit integer, but
             # the Python wrapper implicitly casts that to a signed C
@@ -507,12 +510,12 @@
         """Initialize for reading a gzip compressed fileobj.
         """
         self.cmp = self.zlib.decompressobj(-self.zlib.MAX_WBITS)
-        self.dbuf = ""
+        self.dbuf = b""

         # taken from gzip.GzipFile with some alterations
-        if self.__read(2) != "\037\213":
+        if self.__read(2) != b"\037\213":
             raise ReadError("not a gzip file")
-        if self.__read(1) != "\010":
+        if self.__read(1) != b"\010":
             raise CompressionError("unsupported compression method")

         flag = ord(self.__read(1))
@@ -564,7 +567,7 @@
                 if not buf:
                     break
                 t.append(buf)
-            buf = "".join(t)
+            buf = b"".join(t)
         else:
             buf = self._read(size)
         self.pos += len(buf)
@@ -588,7 +591,7 @@
                raise ReadError("invalid compressed data")
            t.append(buf)
            c += len(buf)
-        t = "".join(t)
+        t = b"".join(t)
        self.dbuf = t[size:]
        return t[:size]

@@ -604,7 +607,7 @@
                break
            t.append(buf)
            c += len(buf)
-        t = "".join(t)
+        t = b"".join(t)
        self.buf = t[size:]
        return t[:size]
 # class _Stream
@@ -655,7 +658,7 @@
         if self.mode == "r":
             self.bz2obj = bz2.BZ2Decompressor()
             self.fileobj.seek(0)
-            self.buf = ""
+            self.buf = b""
         else:
             self.bz2obj = bz2.BZ2Compressor()

@@ -670,7 +673,7 @@
             except EOFError:
                 break
             x += len(data)
-        self.buf = "".join(b)
+        self.buf = b"".join(b)

         buf = self.buf[:size]
         self.buf = self.buf[size:]
@@ -753,7 +756,7 @@
                 break
             size -= len(buf)
             data.append(buf)
-        return "".join(data)
+        return b"".join(data)

     def readsparsesection(self, size):
         """Read a single section of a sparse file.
@@ -761,7 +764,7 @@
         section = self.sparse.find(self.position)

         if section is None:
-            return ""
+            return b""

         size = min(size, section.offset + section.size - self.position)

@@ -793,7 +796,7 @@
         self.size = tarinfo.size

         self.position = 0
-        self.buffer = ""
+        self.buffer = b""

     def read(self, size=None):
         """Read at most size bytes from the file. If size is not
@@ -802,11 +805,11 @@
         if self.closed:
             raise ValueError("I/O operation on closed file")

-        buf = ""
+        buf = b""
         if self.buffer:
             if size is None:
                 buf = self.buffer
-                self.buffer = ""
+                self.buffer = b""
             else:
                 buf = self.buffer[:size]
                 self.buffer = self.buffer[size:]
@@ -827,16 +830,16 @@
         if self.closed:
             raise ValueError("I/O operation on closed file")

-        if "\n" in self.buffer:
-            pos = self.buffer.find("\n") + 1
+        if b"\n" in self.buffer:
+            pos = self.buffer.find(b"\n") + 1
         else:
             buffers = [self.buffer]
             while True:
                 buf = self.fileobj.read(self.blocksize)
                 buffers.append(buf)
-                if not buf or "\n" in buf:
-                    self.buffer = "".join(buffers)
-                    pos = self.buffer.find("\n") + 1
+                if not buf or b"\n" in buf:
+                    self.buffer = b"".join(buffers)
+                    pos = self.buffer.find(b"\n") + 1
                     if pos == 0:
                         # no newline found.
                         pos = len(self.buffer)
@@ -848,7 +851,7 @@
         buf = self.buffer[:pos]
         self.buffer = self.buffer[pos:]
         self.position += len(buf)
-        return buf
+        return buf.decode()

     def readlines(self):
         """Return a list with all remaining lines.
@@ -886,7 +889,7 @@
         else:
             raise ValueError("Invalid argument")

-        self.buffer = ""
+        self.buffer = b""
         self.fileobj.seek(self.position)

     def close(self):
@@ -1015,7 +1018,7 @@
         """
         info["magic"] = GNU_MAGIC

-        buf = ""
+        buf = b""
         if len(info["linkname"]) > LENGTH_LINK:
             buf += self._create_gnu_long_header(info["linkname"], GNUTYPE_LONGLINK)

@@ -1071,7 +1074,7 @@
         if pax_headers:
             buf = self._create_pax_generic_header(pax_headers)
         else:
-            buf = ""
+            buf = b""

         return buf + self._create_header(info, USTAR_FORMAT)

@@ -1108,7 +1111,7 @@
             itn(info.get("gid", 0), 8, format),
             itn(info.get("size", 0), 12, format),
             itn(info.get("mtime", 0), 12, format),
-            "        ", # checksum field
+            b"        ", # checksum field
             info.get("type", REGTYPE),
             stn(info.get("linkname", ""), 100),
             stn(info.get("magic", POSIX_MAGIC), 8),
@@ -1119,9 +1122,9 @@
             stn(info.get("prefix", ""), 155)
         ]

-        buf = struct.pack("%ds" % BLOCKSIZE, "".join(parts))
+        buf = struct.pack("%ds" % BLOCKSIZE, b"".join(parts))
         chksum = calc_chksums(buf[-BLOCKSIZE:])[0]
-        buf = buf[:-364] + "%06o\0" % chksum + buf[-357:]
+        buf = buf[:-364] + ("%06o\0" % chksum).encode('ascii') + buf[-357:]
         return buf

     @staticmethod
@@ -1139,10 +1142,10 @@
         """Return a GNUTYPE_LONGNAME or GNUTYPE_LONGLINK sequence
           for name.
         """
-        name += NUL
+        name = name.encode('ascii') + NUL

         info = {}
-        info["name"] = "././@LongLink"
+        info["name"] = b"././@LongLink"
         info["type"] = type
         info["size"] = len(name)
         info["magic"] = GNU_MAGIC
@@ -1324,7 +1327,7 @@
             lastpos = offset + numbytes
             pos += 24

-        isextended = ord(buf[482])
+        isextended = buf[482]
         origsize = nti(buf[483:495])

         # If the isextended flag is given,
@@ -1344,7 +1347,7 @@
                 realpos += numbytes
                 lastpos = offset + numbytes
                 pos += 24
-            isextended = ord(buf[504])
+            isextended = buf[504]

         if lastpos < origsize:
             sp.append(_hole(lastpos, origsize - lastpos))

Index: test/test_tarfile.py
===================================================================
--- test/test_tarfile.py  (revision 56784)
+++ test/test_tarfile.py  (working copy)
@@ -115,7 +115,7 @@
         fobj.seek(0, 2)
         self.assertEqual(tarinfo.size, fobj.tell(),
                          "seek() to file's end failed")
-        self.assert_(fobj.read() == "",
+        self.assert_(fobj.read() == b"",
                      "read() at file's end did not return empty string")
         fobj.seek(-tarinfo.size, 2)
         self.assertEqual(0, fobj.tell(),