Hi,
See attached patch: python3_bytes_filename.patch
With the patch applied, you get:
- open() supports bytes
- listdir(unicode) -> returns only unicode and *skips* invalid (undecodable) filenames
(as asked by Guido)
- os.getcwdu() is removed
- os.getcwdb() is created -> returns bytes
- glob.glob() supports bytes
- fnmatch.filter() supports bytes
- posixpath.join() and posixpath.split() support bytes
Mixing bytes and str is invalid. Examples raising a TypeError:
- posixpath.join(b'x', 'y')
- fnmatch.filter([b'x', 'y'], '*')
- fnmatch.filter([b'x', b'y'], '*')
- glob.glob1('.', b'*')
- glob.glob1(b'.', '*')
$ diffstat ~/python3_bytes_filename.patch
Lib/fnmatch.py | 7 +++-
Lib/glob.py | 15 ++++++---
Lib/io.py | 2 -
Lib/posixpath.py | 20 ++++++++----
Modules/posixmodule.c | 83 ++++++++++++++++++--------------------------------
5 files changed, 62 insertions(+), 65 deletions(-)
TODO:
- review this patch :-)
- support non-ASCII bytes in fnmatch.filter()
- fix other functions, e.g. posixpath.isabs() and fnmatch.fnmatchcase()
- fix functions written in C: grep FileSystemDefaultEncoding
- make sure that mixing bytes and str is rejected
--
Victor Stinner aka haypo
http://www.haypocalc.com/blog/
Index: Lib/posixpath.py
===================================================================
--- Lib/posixpath.py (révision 66687)
+++ Lib/posixpath.py (copie de travail)
@@ -59,14 +59,18 @@
"""Join two or more pathname components, inserting '/' as needed.
If any component is an absolute path, all previous path components
will be discarded."""
+ if isinstance(a, bytes):
+ sep = b'/'
+ else:
+ sep = '/'
path = a
for b in p:
- if b.startswith('/'):
+ if b.startswith(sep):
path = b
- elif path == '' or path.endswith('/'):
+ elif not path or path.endswith(sep):
path += b
else:
- path += '/' + b
+ path += sep + b
return path
@@ -78,10 +82,14 @@
def split(p):
"""Split a pathname. Returns tuple "(head, tail)" where "tail" is
everything after the final slash. Either part may be empty."""
- i = p.rfind('/') + 1
+ if isinstance(p, bytes):
+ sep = b'/'
+ else:
+ sep = '/'
+ i = p.rfind(sep) + 1
head, tail = p[:i], p[i:]
- if head and head != '/'*len(head):
- head = head.rstrip('/')
+ if head and head != sep*len(head):
+ head = head.rstrip(sep)
return head, tail
Index: Lib/glob.py
===================================================================
--- Lib/glob.py (révision 66687)
+++ Lib/glob.py (copie de travail)
@@ -27,7 +27,7 @@
return
dirname, basename = os.path.split(pathname)
if not dirname:
- for name in glob1(os.curdir, basename):
+ for name in glob1(None, basename):
yield name
return
if has_magic(dirname):
@@ -49,9 +49,8 @@
def glob1(dirname, pattern):
if not dirname:
dirname = os.curdir
- if isinstance(pattern, str) and not isinstance(dirname, str):
- dirname = str(dirname, sys.getfilesystemencoding() or
- sys.getdefaultencoding())
+ if isinstance(pattern, bytes):
+ dirname = dirname.encode("ASCII")
try:
names = os.listdir(dirname)
except os.error:
@@ -73,6 +72,12 @@
magic_check = re.compile('[*?[]')
+magic_check_bytes = re.compile(b'[*?[]')
def has_magic(s):
- return magic_check.search(s) is not None
+ if isinstance(s, bytes):
+ match = magic_check_bytes.search(s)
+ else:
+ match = magic_check.search(s)
+ return match is not None
+
Index: Lib/fnmatch.py
===================================================================
--- Lib/fnmatch.py (révision 66687)
+++ Lib/fnmatch.py (copie de travail)
@@ -43,7 +43,12 @@
result=[]
pat=os.path.normcase(pat)
if not pat in _cache:
- res = translate(pat)
+ if isinstance(pat, bytes):
+ pat_str = str(pat, "ASCII")
+ res_str = translate(pat_str)
+ res = res_str.encode("ASCII")
+ else:
+ res = translate(pat)
_cache[pat] = re.compile(res)
match=_cache[pat].match
if os.path is posixpath:
Index: Lib/io.py
===================================================================
--- Lib/io.py (révision 66687)
+++ Lib/io.py (copie de travail)
@@ -180,7 +180,7 @@
opened in a text mode, and for bytes a BytesIO can be used like a file
opened in a binary mode.
"""
- if not isinstance(file, (str, int)):
+ if not isinstance(file, (str, bytes, int)):
raise TypeError("invalid file: %r" % file)
if not isinstance(mode, str):
raise TypeError("invalid mode: %r" % mode)
Index: Modules/posixmodule.c
===================================================================
--- Modules/posixmodule.c (révision 66687)
+++ Modules/posixmodule.c (copie de travail)
@@ -1968,63 +1968,18 @@
#ifdef HAVE_GETCWD
-PyDoc_STRVAR(posix_getcwd__doc__,
-"getcwd() -> path\n\n\
-Return a string representing the current working directory.");
-
static PyObject *
-posix_getcwd(PyObject *self, PyObject *noargs)
+posix_getcwd(int use_bytes)
{
- int bufsize_incr = 1024;
- int bufsize = 0;
- char *tmpbuf = NULL;
- char *res = NULL;
- PyObject *dynamic_return;
-
- Py_BEGIN_ALLOW_THREADS
- do {
- bufsize = bufsize + bufsize_incr;
- tmpbuf = malloc(bufsize);
- if (tmpbuf == NULL) {
- break;
- }
-#if defined(PYOS_OS2) && defined(PYCC_GCC)
- res = _getcwd2(tmpbuf, bufsize);
-#else
- res = getcwd(tmpbuf, bufsize);
-#endif
-
- if (res == NULL) {
- free(tmpbuf);
- }
- } while ((res == NULL) && (errno == ERANGE));
- Py_END_ALLOW_THREADS
-
- if (res == NULL)
- return posix_error();
-
- dynamic_return = PyUnicode_FromString(tmpbuf);
- free(tmpbuf);
-
- return dynamic_return;
-}
-
-PyDoc_STRVAR(posix_getcwdu__doc__,
-"getcwdu() -> path\n\n\
-Return a unicode string representing the current working directory.");
-
-static PyObject *
-posix_getcwdu(PyObject *self, PyObject *noargs)
-{
char buf[1026];
char *res;
#ifdef Py_WIN_WIDE_FILENAMES
- DWORD len;
- if (unicode_file_names()) {
+ if (!use_bytes && unicode_file_names()) {
wchar_t wbuf[1026];
wchar_t *wbuf2 = wbuf;
PyObject *resobj;
+ DWORD len;
Py_BEGIN_ALLOW_THREADS
len = GetCurrentDirectoryW(sizeof wbuf/ sizeof wbuf[0], wbuf);
/* If the buffer is large enough, len does not include the
@@ -2059,8 +2014,30 @@
Py_END_ALLOW_THREADS
if (res == NULL)
return posix_error();
+ if (use_bytes)
+ return PyBytes_FromStringAndSize(buf, strlen(buf));
return PyUnicode_Decode(buf, strlen(buf), Py_FileSystemDefaultEncoding,"strict");
}
+
+PyDoc_STRVAR(posix_getcwd__doc__,
+"getcwd() -> path\n\n\
+Return a unicode string representing the current working directory.");
+
+static PyObject *
+posix_getcwd_unicode(PyObject *self)
+{
+ return posix_getcwd(0);
+}
+
+PyDoc_STRVAR(posix_getcwdb__doc__,
+"getcwdb() -> path\n\n\
+Return a bytes string representing the current working directory.");
+
+static PyObject *
+posix_getcwd_bytes(PyObject *self)
+{
+ return posix_getcwd(1);
+}
#endif
@@ -2378,9 +2355,9 @@
v = w;
}
else {
- /* fall back to the original byte string, as
- discussed in patch #683592 */
PyErr_Clear();
+ Py_DECREF(v);
+ continue;
}
}
if (PyList_Append(d, v) != 0) {
@@ -6810,8 +6787,10 @@
{"ctermid", posix_ctermid, METH_NOARGS, posix_ctermid__doc__},
#endif
#ifdef HAVE_GETCWD
- {"getcwd", posix_getcwd, METH_NOARGS, posix_getcwd__doc__},
- {"getcwdu", posix_getcwdu, METH_NOARGS, posix_getcwdu__doc__},
+ {"getcwd", (PyCFunction)posix_getcwd_unicode,
+ METH_NOARGS, posix_getcwd__doc__},
+ {"getcwdb", (PyCFunction)posix_getcwd_bytes,
+ METH_NOARGS, posix_getcwdb__doc__},
#endif
#ifdef HAVE_LINK
{"link", posix_link, METH_VARARGS, posix_link__doc__},
_______________________________________________
Python-3000 mailing list
[email protected]
http://mail.python.org/mailman/listinfo/python-3000
Unsubscribe:
http://mail.python.org/mailman/options/python-3000/archive%40mail-archive.com