Charles-François Natali <neolo...@free.fr> added the comment:
Here's a patch with tests and documentation.
I noticed something surprising:
walk() with followlinks=False returns links to directories as
directories (in dirnames).
I find this surprising, since if you don't follow symlinks, those are
just files (and you don't recurse into them). Also, it's a pain when you
want to remove the entries listed in dirnames, since you have to
distinguish between a link and a directory (unlink()/rmdir() or
unlinkat() without/with AT_REMOVEDIR).
To be consistent with this behavior, I had to change fdwalk() (I
renamed it to be consistent with fdlistdir()) to perform a call to
fstatat() without AT_SYMLINK_NOFOLLOW, since otherwise it would report
such links as files.
So the bottom line is that because of this, you can have up to 3
stat() calls per entry:
- fstatat(rootfd, name)
- fstatat(rootfd, name, AT_SYMLINK_NOFOLLOW) right before opening the directory
- fstat(dirfd) right after open to check that we're dealing with the same file
(walk() currently uses two stat() per entry, so it's not too bad).
----------
Added file: http://bugs.python.org/file24197/fdwalk.diff
_______________________________________
Python tracker <rep...@bugs.python.org>
<http://bugs.python.org/issue13734>
_______________________________________
diff --git a/Doc/library/os.rst b/Doc/library/os.rst
--- a/Doc/library/os.rst
+++ b/Doc/library/os.rst
@@ -2240,6 +2240,58 @@
os.rmdir(os.path.join(root, name))
+.. function:: fdwalk(top, topdown=True, onerror=None, followlinks=False)
+
+ .. index::
+ single: directory; walking
+ single: directory; traversal
+
+ This behaves exactly like :func:`walk`, except that it yields a 4-tuple
+ ``(dirpath, dirnames, filenames, dirfd)``.
+
+ *dirpath*, *dirnames* and *filenames* are identical to :func:`walk` output,
+ and *dirfd* is an open file descriptor to the directory.
+
+ .. note::
+
+ Since :func:`fdwalk` yields file descriptors, those are only valid until
+ the next iteration step, so you should duplicate them (e.g. with
+ :func:`dup`) if you want to keep them longer.
+
+ .. note::
+
+ Contrary to :func:`walk`, modifying the dirnames list in-place won't
+ affect the directories traversed.
+
+ This example displays the number of bytes taken by non-directory files in each
+ directory under the starting directory::
+
+ import os
+ for root, dirs, files, rootfd in os.fdwalk('python/Lib/email'):
+ print(root, "consumes", end="")
+ print(sum([os.fstatat(rootfd, name).st_size for name in files]),
+ end="")
+ print("bytes in", len(files), "non-directory files")
+
+ In the next example, walking the tree bottom-up is essential:
+ :func:`unlinkat` doesn't allow deleting a directory before the directory is
+ empty::
+
+ # Delete everything reachable from the directory named in "top",
+ # assuming there are no symbolic links.
+ # CAUTION: This is dangerous! For example, if top == '/', it
+ # could delete all your disk files.
+ import os
+ for root, dirs, files, rootfd in os.fdwalk(top, topdown=False):
+ for name in files:
+ os.unlinkat(rootfd, name)
+ for name in dirs:
+ os.unlinkat(rootfd, name, os.AT_REMOVEDIR)
+
+ Availability: Unix.
+
+ .. versionadded:: 3.3
+
.. _os-process:
Process Management
diff --git a/Doc/whatsnew/3.3.rst b/Doc/whatsnew/3.3.rst
--- a/Doc/whatsnew/3.3.rst
+++ b/Doc/whatsnew/3.3.rst
@@ -478,6 +478,10 @@
(Patch submitted by Giampaolo Rodolà in :issue:`10784`.)
+* The :mod:`os` module has a new :func:`~os.fdwalk` function similar to
+ :func:`~os.walk` except that it also yields an open file descriptor to the
+ directories visited. This is especially useful to avoid symlink races.
+
* "at" functions (:issue:`4761`):
* :func:`~os.faccessat`
diff --git a/Lib/os.py b/Lib/os.py
--- a/Lib/os.py
+++ b/Lib/os.py
@@ -24,6 +24,7 @@
#'
import sys, errno
+import stat as st
_names = sys.builtin_module_names
@@ -32,6 +33,9 @@
"defpath", "name", "path", "devnull",
"SEEK_SET", "SEEK_CUR", "SEEK_END"]
+def _exists(name):
+ return name in globals()
+
def _get_exports_list(module):
try:
return list(module.__all__)
@@ -120,7 +124,13 @@
umask(mask)
return mode & ~mask
-#'
+def _are_same_file(stat1, stat2):
+ """Helper function that checks whether two stat results refer to the same
+ file.
+ """
+ return (stat1.st_mode == stat2.st_mode and stat1.st_ino == stat2.st_ino and
+ stat1.st_dev == stat2.st_dev)
+#
# Super directory utilities.
# (Inspired by Eric Raymond; the doc strings are mostly his)
@@ -151,7 +161,6 @@
try:
mkdir(name, mode)
except OSError as e:
- import stat as st
if not (e.errno == errno.EEXIST and exist_ok and path.isdir(name) and
st.S_IMODE(lstat(name).st_mode) == _get_masked_mode(mode)):
raise
@@ -298,6 +307,94 @@
__all__.append("walk")
+if _exists("openat"):
+
+ def fdwalk(top, topdown=True, onerror=None, followlinks=False):
+ """Directory tree generator.
+
+ This behaves exactly like walk(), except that it yields a 4-tuple
+
+ dirpath, dirnames, filenames, dirfd
+
+ dirpath, dirnames and filenames are identical to walk() output, and
+ dirfd is an open file descriptor to dirpath.
+
+ The advantage of fdwalk() over walk() is that it's safe against symlink
+ races (when followlinks is False).
+
+ Caution:
+ Since fdwalk() yields file descriptors, those are only valid until the
+ next iteration step, so you should dup() them if you want to keep them
+ for a longer period.
+ Also, contrary to walk(), modifying the dirnames list in-place won't
+ affect the directories traversed.
+
+ Example:
+
+ import os
+ for root, dirs, files, rootfd in os.fdwalk('python/Lib/email'):
+ print(root, "consumes", end="")
+ print(sum([os.fstatat(rootfd, name).st_size for name in files]),
+ end="")
+ print("bytes in", len(files), "non-directory files")
+ """
+ # Note: To guard against symlink races, we use the standard
+ # lstat()/open()/fstat() trick.
+ orig_st = lstat(top)
+ topfd = open(top, O_RDONLY)
+ try:
+ if (followlinks or (st.S_ISDIR(orig_st.st_mode) and
+ _are_same_file(orig_st, fstat(topfd)))):
+ for x in _fdwalk(topfd, top, topdown, onerror, followlinks):
+ yield x
+ finally:
+ close(topfd)
+
+ def _fdwalk(topfd, toppath, topdown, onerror, followlinks):
+ try:
+ names = fdlistdir(topfd)
+ except error as err:
+ if onerror is not None:
+ onerror(err)
+ return
+
+ dirs, nondirs = [], []
+ for name in names:
+ # Here, we don't use AT_SYMLINK_NOFOLLOW to be consistent with
+ # walk() which reports symlinks to directories as directories. We do
+ # however check for symlinks before recursing into a subdirectory.
+ if st.S_ISDIR(fstatat(topfd, name).st_mode):
+ dirs.append(name)
+ else:
+ nondirs.append(name)
+
+ # whether to follow symlinks
+ flag = 0 if followlinks else AT_SYMLINK_NOFOLLOW
+
+ if topdown:
+ yield toppath, dirs, nondirs, topfd
+
+ for name in dirs:
+ try:
+ orig_st = fstatat(topfd, name, flag)
+ dirfd = openat(topfd, name, O_RDONLY)
+ except error as err:
+ if onerror is not None:
+ onerror(err)
+ return
+ try:
+ if followlinks or _are_same_file(orig_st, fstat(dirfd)):
+ dirpath = path.join(toppath, name)
+ for x in _fdwalk(dirfd, dirpath, topdown, onerror,
followlinks):
+ yield x
+ finally:
+ close(dirfd)
+
+ if not topdown:
+ yield toppath, dirs, nondirs, topfd
+
+ __all__.append("fdwalk")
+
# Make sure os.environ exists, at least
try:
environ
@@ -598,9 +695,6 @@
fsencode, fsdecode = _fscodec()
del _fscodec
-def _exists(name):
- return name in globals()
-
# Supply spawn*() (probably only for Unix)
if _exists("fork") and not _exists("spawnv") and _exists("execv"):
diff --git a/Lib/test/test_os.py b/Lib/test/test_os.py
--- a/Lib/test/test_os.py
+++ b/Lib/test/test_os.py
@@ -20,6 +20,8 @@
import asyncore
import asynchat
import socket
+import itertools
+import stat
try:
import threading
except ImportError:
@@ -147,7 +149,6 @@
if not hasattr(os, "stat"):
return
- import stat
result = os.stat(fname)
# Make sure direct access works
@@ -464,7 +465,7 @@
class WalkTests(unittest.TestCase):
"""Tests for os.walk()."""
- def test_traversal(self):
+ def setUp(self):
import os
from os.path import join
@@ -569,6 +570,57 @@
os.remove(dirname)
os.rmdir(support.TESTFN)
+@unittest.skipUnless(hasattr(os, 'fdwalk'), "Test needs os.fdwalk()")
+class FdWalkTests(WalkTests):
+ """Tests for os.fdwalk()."""
+
+ def test_compare_to_walk(self):
+ # compare with walk() results
+ for topdown, followlinks in itertools.product((True, False), repeat=2):
+ args = support.TESTFN, topdown, None, followlinks
+ expected = {}
+ for root, dirs, files in os.walk(*args):
+ expected[root] = (set(dirs), set(files))
+
+ for root, dirs, files, rootfd in os.fdwalk(*args):
+ self.assertEqual(expected[root], (set(dirs), set(files)))
+
+ def test_dir_fd(self):
+ # check returned file descriptors
+ for topdown, followlinks in itertools.product((True, False), repeat=2):
+ args = support.TESTFN, topdown, None, followlinks
+ for root, dirs, files, rootfd in os.fdwalk(*args):
+ # check that the FD is valid
+ os.fstat(rootfd)
+ # check that fdlistdir() returns consistent information
+ self.assertEqual(set(os.fdlistdir(rootfd)), set(dirs) |
set(files))
+
+ def test_fd_leak(self):
+ # Since we're opening a lot of FDs, we must be careful to avoid leaks:
+ # we both check that calling fdwalk() a large number of times doesn't
+ # yield EMFILE, and that the minimum allocated FD hasn't changed.
+ minfd = os.dup(1)
+ os.close(minfd)
+ for i in range(1024):
+ for x in os.fdwalk(support.TESTFN):
+ pass
+ newfd = os.dup(1)
+ self.addCleanup(os.close, newfd)
+ self.assertEqual(newfd, minfd)
+
+ def tearDown(self):
+ # cleanup
+ for root, dirs, files, rootfd in os.fdwalk(support.TESTFN,
topdown=False):
+ for name in files:
+ os.unlinkat(rootfd, name)
+ for name in dirs:
+ st = os.fstatat(rootfd, name, os.AT_SYMLINK_NOFOLLOW)
+ if stat.S_ISDIR(st.st_mode):
+ os.unlinkat(rootfd, name, os.AT_REMOVEDIR)
+ else:
+ os.unlinkat(rootfd, name)
+ os.rmdir(support.TESTFN)
+
class MakedirTests(unittest.TestCase):
def setUp(self):
os.mkdir(support.TESTFN)
@@ -1683,6 +1735,7 @@
StatAttributeTests,
EnvironTests,
WalkTests,
+ FdWalkTests,
MakedirTests,
DevNullTests,
URandomTests,
_______________________________________________
Python-bugs-list mailing list
Unsubscribe:
http://mail.python.org/mailman/options/python-bugs-list/archive%40mail-archive.com