Charles-François Natali <neolo...@free.fr> added the comment:

Here are two new versions, both addressing Antoine's and Nick's comments:
- fwalk-3.diff is just an updated version
- fwalk-single_fd.diff doesn't use more than 2 FDs to walk a directory
tree, instead of the depth of directory tree. It's not as simple and
clean as I'd like it to be, but it should be much more robust, and
still safe (please make sure about that :-).
I was a little worried about the performance impact, so I did some
trivial benchmarks:
- O(depth) fwalk() is actually a tiny bit faster than walk() (it may
be because we don't do as much path lookup)
- O(1) fwalk() is around 20% slower, on a pure-traversal benchmark (so
in a realistic use case where we would actually do something with the
values returned by fwalk() the difference shouldn't be that
noticeable)

----------
Added file: http://bugs.python.org/file24382/fwalk-3.diff
Added file: http://bugs.python.org/file24383/fwalk-single_fd.diff

_______________________________________
Python tracker <rep...@bugs.python.org>
<http://bugs.python.org/issue13734>
_______________________________________
diff --git a/Doc/library/os.rst b/Doc/library/os.rst
--- a/Doc/library/os.rst
+++ b/Doc/library/os.rst
@@ -2251,6 +2251,56 @@
               os.rmdir(os.path.join(root, name))
 
 
+.. function:: fwalk(top, topdown=True, onerror=None, followlinks=False)
+
+   .. index::
+      single: directory; walking
+      single: directory; traversal
+
+    This behaves exactly like :func:`walk`, except that it yields a 4-tuple
+    ``(dirpath, dirnames, filenames, dirfd)``.
+
+   *dirpath*, *dirnames* and *filenames* are identical to :func:`walk` output,
+   and *dirfd* is a file descriptor referring to the directory *dirpath*.
+
+   .. note::
+
+      Since :func:`fwalk` yields file descriptors, those are only valid until
+      the next iteration step, so you should duplicate them (e.g. with
+      :func:`dup`) if you want to keep them longer.
+
+   This example displays the number of bytes taken by non-directory files in 
each
+   directory under the starting directory, except that it doesn't look under 
any
+   CVS subdirectory::
+
+      import os
+      for root, dirs, files, rootfd in os.fwalk('python/Lib/email'):
+          print(root, "consumes", end="")
+          print(sum([os.fstatat(rootfd, name).st_size for name in files]),
+                end="")
+          print("bytes in", len(files), "non-directory files")
+          if 'CVS' in dirs:
+              dirs.remove('CVS')  # don't visit CVS directories
+
+   In the next example, walking the tree bottom-up is essential:
+   :func:`unlinkat` doesn't allow deleting a directory before the directory is
+   empty::
+
+      # Delete everything reachable from the directory named in "top",
+      # assuming there are no symbolic links.
+      # CAUTION:  This is dangerous!  For example, if top == '/', it
+      # could delete all your disk files.
+      import os
+      for root, dirs, files, rootfd in os.fwalk(top, topdown=False):
+          for name in files:
+              os.unlinkat(rootfd, name)
+          for name in dirs:
+              os.unlinkat(rootfd, name, os.AT_REMOVEDIR)
+
+   Availability: Unix.
+
+   .. versionadded:: 3.3
+
 .. _os-process:
 
 Process Management
diff --git a/Doc/whatsnew/3.3.rst b/Doc/whatsnew/3.3.rst
--- a/Doc/whatsnew/3.3.rst
+++ b/Doc/whatsnew/3.3.rst
@@ -497,6 +497,10 @@
 
   (Patch submitted by Giampaolo Rodolà in :issue:`10784`.)
 
+* The :mod:`os` module has a new :func:`~os.fwalk` function similar to
+  :func:`~os.walk` except that it also yields file descriptors referring to the
+  directories visited. This is especially useful to avoid symlink races.
+
 * "at" functions (:issue:`4761`):
 
   * :func:`~os.faccessat`
diff --git a/Lib/os.py b/Lib/os.py
--- a/Lib/os.py
+++ b/Lib/os.py
@@ -24,6 +24,7 @@
 #'
 
 import sys, errno
+import stat as st
 
 _names = sys.builtin_module_names
 
@@ -32,6 +33,9 @@
            "defpath", "name", "path", "devnull",
            "SEEK_SET", "SEEK_CUR", "SEEK_END"]
 
+def _exists(name):
+    return name in globals()
+
 def _get_exports_list(module):
     try:
         return list(module.__all__)
@@ -120,7 +124,12 @@
     umask(mask)
     return mode & ~mask
 
-#'
+def _are_same_file(stat1, stat2):
+    """Helper function that checks whether two stat results refer to the same
+    file.
+    """
+    return (stat1.st_ino == stat2.st_ino and stat1.st_dev == stat2.st_dev)
+#
 
 # Super directory utilities.
 # (Inspired by Eric Raymond; the doc strings are mostly his)
@@ -151,7 +160,6 @@
     try:
         mkdir(name, mode)
     except OSError as e:
-        import stat as st
         if not (e.errno == errno.EEXIST and exist_ok and path.isdir(name) and
                 st.S_IMODE(lstat(name).st_mode) == _get_masked_mode(mode)):
             raise
@@ -298,6 +306,94 @@
 
 __all__.append("walk")
 
+if _exists("openat"):
+
+    def fwalk(top, topdown=True, onerror=None, followlinks=False):
+        """Directory tree generator.
+
+        This behaves exactly like walk(), except that it yields a 4-tuple
+
+            dirpath, dirnames, filenames, dirfd
+
+        `dirpath`, `dirnames` and `filenames` are identical to walk() output,
+        and `dirfd` is a file descriptor referring to the directory `dirpath`.
+
+        The advantage of walkfd() over walk() is that it's safe against symlink
+        races (when followlinks is False).
+
+        Caution:
+        Since fwalk() yields file descriptors, those are only valid until the
+        next iteration step, so you should dup() them if you want to keep them
+        for a longer period.
+
+        Example:
+
+        import os
+        for root, dirs, files, rootfd in os.fwalk('python/Lib/email'):
+            print(root, "consumes", end="")
+            print(sum([os.fstatat(rootfd, name).st_size for name in files]),
+                  end="")
+            print("bytes in", len(files), "non-directory files")
+            if 'CVS' in dirs:
+                dirs.remove('CVS')  # don't visit CVS directories
+        """
+        # Note: To guard against symlink races, we use the standard
+        # lstat()/open()/fstat() trick.
+        orig_st = lstat(top)
+        topfd = open(top, O_RDONLY)
+        try:
+            if (followlinks or (st.S_ISDIR(orig_st.st_mode) and
+                               _are_same_file(orig_st, fstat(topfd)))):
+                for x in _fwalk(topfd, top, topdown, onerror, followlinks):
+                    yield x
+        finally:
+            close(topfd)
+
+    def _fwalk(topfd, toppath, topdown, onerror, followlinks):
+        try:
+            names = fdlistdir(topfd)
+        except error as err:
+            if onerror is not None:
+                onerror(err)
+            return
+
+        dirs, nondirs = [], []
+        for name in names:
+            # Here, we don't use AT_SYMLINK_NOFOLLOW to be consistent with
+            # walk() which reports symlinks to directories as directories. We 
do
+            # however check for symlinks before recursing into a subdirectory.
+            if st.S_ISDIR(fstatat(topfd, name).st_mode):
+                dirs.append(name)
+            else:
+                nondirs.append(name)
+
+        # whether to follow symlinks
+        flag = 0 if followlinks else AT_SYMLINK_NOFOLLOW
+
+        if topdown:
+            yield toppath, dirs, nondirs, topfd
+
+        for name in dirs:
+            try:
+                orig_st = fstatat(topfd, name, flag)
+                dirfd = openat(topfd, name, O_RDONLY)
+            except error as err:
+                if onerror is not None:
+                    onerror(err)
+                return
+            try:
+                if followlinks or _are_same_file(orig_st, fstat(dirfd)):
+                    dirpath = path.join(toppath, name)
+                    for x in _fwalk(dirfd, dirpath, topdown, onerror, 
followlinks):
+                        yield x
+            finally:
+                close(dirfd)
+
+        if not topdown:
+            yield toppath, dirs, nondirs, topfd
+
+    __all__.append("fwalk")
+
 # Make sure os.environ exists, at least
 try:
     environ
@@ -598,9 +694,6 @@
 fsencode, fsdecode = _fscodec()
 del _fscodec
 
-def _exists(name):
-    return name in globals()
-
 # Supply spawn*() (probably only for Unix)
 if _exists("fork") and not _exists("spawnv") and _exists("execv"):
 
diff --git a/Lib/test/test_os.py b/Lib/test/test_os.py
--- a/Lib/test/test_os.py
+++ b/Lib/test/test_os.py
@@ -20,6 +20,8 @@
 import asyncore
 import asynchat
 import socket
+import itertools
+import stat
 try:
     import threading
 except ImportError:
@@ -159,7 +161,6 @@
         if not hasattr(os, "stat"):
             return
 
-        import stat
         result = os.stat(fname)
 
         # Make sure direct access works
@@ -476,7 +477,7 @@
 class WalkTests(unittest.TestCase):
     """Tests for os.walk()."""
 
-    def test_traversal(self):
+    def setUp(self):
         import os
         from os.path import join
 
@@ -586,6 +587,58 @@
                     os.remove(dirname)
         os.rmdir(support.TESTFN)
 
+@unittest.skipUnless(hasattr(os, 'fwalk'), "Test needs os.fwalk()")
+class fwalkTests(WalkTests):
+    """Tests for os.fwalk()."""
+
+    def test_compare_to_walk(self):
+        # compare with walk() results
+        for topdown, followlinks in itertools.product((True, False), repeat=2):
+            args = support.TESTFN, topdown, None, followlinks
+            expected = {}
+            for root, dirs, files in os.walk(*args):
+                expected[root] = (set(dirs), set(files))
+
+            for root, dirs, files, rootfd in os.fwalk(*args):
+                self.assertIn(root, expected)
+                self.assertEqual(expected[root], (set(dirs), set(files)))
+
+    def test_dir_fd(self):
+        # check returned file descriptors
+        for topdown, followlinks in itertools.product((True, False), repeat=2):
+            args = support.TESTFN, topdown, None, followlinks
+            for root, dirs, files, rootfd in os.fwalk(*args):
+                # check that the FD is valid
+                os.fstat(rootfd)
+                # check that fdlistdir() returns consistent information
+                self.assertEqual(set(os.fdlistdir(rootfd)), set(dirs) | 
set(files))
+
+    def test_fd_leak(self):
+        # Since we're opening a lot of FDs, we must be careful to avoid leaks:
+        # we both check that calling fwalk() a large number of times doesn't
+        # yield EMFILE, and that the minimum allocated FD hasn't changed.
+        minfd = os.dup(1)
+        os.close(minfd)
+        for i in range(512):
+            for x in os.fwalk(support.TESTFN):
+                pass
+        newfd = os.dup(1)
+        self.addCleanup(os.close, newfd)
+        self.assertEqual(newfd, minfd)
+
+    def tearDown(self):
+        # cleanup
+        for root, dirs, files, rootfd in os.fwalk(support.TESTFN, 
topdown=False):
+            for name in files:
+                os.unlinkat(rootfd, name)
+            for name in dirs:
+                st = os.fstatat(rootfd, name, os.AT_SYMLINK_NOFOLLOW)
+                if stat.S_ISDIR(st.st_mode):
+                    os.unlinkat(rootfd, name, os.AT_REMOVEDIR)
+                else:
+                    os.unlinkat(rootfd, name)
+        os.rmdir(support.TESTFN)
+
 class MakedirTests(unittest.TestCase):
     def setUp(self):
         os.mkdir(support.TESTFN)
@@ -1700,6 +1753,7 @@
         StatAttributeTests,
         EnvironTests,
         WalkTests,
+        fwalkTests,
         MakedirTests,
         DevNullTests,
         URandomTests,
diff --git a/Doc/library/os.rst b/Doc/library/os.rst
--- a/Doc/library/os.rst
+++ b/Doc/library/os.rst
@@ -2251,6 +2251,56 @@
               os.rmdir(os.path.join(root, name))
 
 
+.. function:: fwalk(top, topdown=True, onerror=None, followlinks=False)
+
+   .. index::
+      single: directory; walking
+      single: directory; traversal
+
+    This behaves exactly like :func:`walk`, except that it yields a 4-tuple
+    ``(dirpath, dirnames, filenames, dirfd)``.
+
+   *dirpath*, *dirnames* and *filenames* are identical to :func:`walk` output,
+   and *dirfd* is a file descriptor referring to the directory *dirpath*.
+
+   .. note::
+
+      Since :func:`fwalk` yields file descriptors, those are only valid until
+      the next iteration step, so you should duplicate them (e.g. with
+      :func:`dup`) if you want to keep them longer.
+
+   This example displays the number of bytes taken by non-directory files in 
each
+   directory under the starting directory, except that it doesn't look under 
any
+   CVS subdirectory::
+
+      import os
+      for root, dirs, files, rootfd in os.fwalk('python/Lib/email'):
+          print(root, "consumes", end="")
+          print(sum([os.fstatat(rootfd, name).st_size for name in files]),
+                end="")
+          print("bytes in", len(files), "non-directory files")
+          if 'CVS' in dirs:
+              dirs.remove('CVS')  # don't visit CVS directories
+
+   In the next example, walking the tree bottom-up is essential:
+   :func:`unlinkat` doesn't allow deleting a directory before the directory is
+   empty::
+
+      # Delete everything reachable from the directory named in "top",
+      # assuming there are no symbolic links.
+      # CAUTION:  This is dangerous!  For example, if top == '/', it
+      # could delete all your disk files.
+      import os
+      for root, dirs, files, rootfd in os.fwalk(top, topdown=False):
+          for name in files:
+              os.unlinkat(rootfd, name)
+          for name in dirs:
+              os.unlinkat(rootfd, name, os.AT_REMOVEDIR)
+
+   Availability: Unix.
+
+   .. versionadded:: 3.3
+
 .. _os-process:
 
 Process Management
diff --git a/Doc/whatsnew/3.3.rst b/Doc/whatsnew/3.3.rst
--- a/Doc/whatsnew/3.3.rst
+++ b/Doc/whatsnew/3.3.rst
@@ -497,6 +497,10 @@
 
   (Patch submitted by Giampaolo Rodolà in :issue:`10784`.)
 
+* The :mod:`os` module has a new :func:`~os.fwalk` function similar to
+  :func:`~os.walk` except that it also yields file descriptors referring to the
+  directories visited. This is especially useful to avoid symlink races.
+
 * "at" functions (:issue:`4761`):
 
   * :func:`~os.faccessat`
diff --git a/Lib/os.py b/Lib/os.py
--- a/Lib/os.py
+++ b/Lib/os.py
@@ -24,6 +24,7 @@
 #'
 
 import sys, errno
+import stat as st
 
 _names = sys.builtin_module_names
 
@@ -32,6 +33,9 @@
            "defpath", "name", "path", "devnull",
            "SEEK_SET", "SEEK_CUR", "SEEK_END"]
 
+def _exists(name):
+    return name in globals()
+
 def _get_exports_list(module):
     try:
         return list(module.__all__)
@@ -120,7 +124,12 @@
     umask(mask)
     return mode & ~mask
 
-#'
+def _are_same_file(stat1, stat2):
+    """Helper function that checks whether two stat results refer to the same
+    file.
+    """
+    return (stat1.st_ino == stat2.st_ino and stat1.st_dev == stat2.st_dev)
+#
 
 # Super directory utilities.
 # (Inspired by Eric Raymond; the doc strings are mostly his)
@@ -151,7 +160,6 @@
     try:
         mkdir(name, mode)
     except OSError as e:
-        import stat as st
         if not (e.errno == errno.EEXIST and exist_ok and path.isdir(name) and
                 st.S_IMODE(lstat(name).st_mode) == _get_masked_mode(mode)):
             raise
@@ -298,6 +306,115 @@
 
 __all__.append("walk")
 
+if _exists("openat"):
+
+    def fwalk(top, topdown=True, onerror=None, followlinks=False):
+        """Directory tree generator.
+
+        This behaves exactly like walk(), except that it yields a 4-tuple
+
+            dirpath, dirnames, filenames, dirfd
+
+        `dirpath`, `dirnames` and `filenames` are identical to walk() output,
+        and `dirfd` is a file descriptor referring to the directory `dirpath`.
+
+        The advantage of walkfd() over walk() is that it's safe against symlink
+        races (when followlinks is False).
+
+        Caution:
+        Since fwalk() yields file descriptors, those are only valid until the
+        next iteration step, so you should dup() them if you want to keep them
+        for a longer period.
+
+        Example:
+
+        import os
+        for root, dirs, files, rootfd in os.fwalk('python/Lib/email'):
+            print(root, "consumes", end="")
+            print(sum([os.fstatat(rootfd, name).st_size for name in files]),
+                  end="")
+            print("bytes in", len(files), "non-directory files")
+            if 'CVS' in dirs:
+                dirs.remove('CVS')  # don't visit CVS directories
+        """
+        # Note: To guard against symlink races, we use the standard
+        # lstat()/open()/fstat() trick.
+        orig_st = lstat(top)
+        topfd = open(top, O_RDONLY)
+        if (followlinks or (st.S_ISDIR(orig_st.st_mode) and
+                            _are_same_file(orig_st, fstat(topfd)))):
+            for x in _fwalk(topfd, top, topdown, onerror, followlinks):
+                yield x
+        else:
+            close(topfd)
+
+    def _fwalk(topfd, toppath, topdown, onerror, followlinks):
+        """fwalk() backend - `topfd` is always closed."""
+        # Note: We use openat()/fdlistdir()/fstatat() to recurse into
+        # subdirectories in a safe way. However, we don't want to keep an open
+        # FD at each directory level, because we could run out of FDs on deeply
+        # nested directory hierarchies. So, we have to close the current 
`topfd`
+        # FD before recursing into a subdirectory, and re-open it right after.
+        # Of couse, we must check that the directory hasn't changed in between.
+
+        try:
+            orig_top_st = fstat(topfd)
+            names = fdlistdir(topfd)
+        except error as err:
+            close(topfd)
+            if onerror is not None:
+                onerror(err)
+            return
+
+        dirs, nondirs = [], []
+        for name in names:
+            # Here, we don't use AT_SYMLINK_NOFOLLOW to be consistent with
+            # walk() which reports symlinks to directories as directories. We 
do
+            # however check for symlinks before recursing into a subdirectory.
+            if st.S_ISDIR(fstatat(topfd, name).st_mode):
+                dirs.append(name)
+            else:
+                nondirs.append(name)
+
+        # whether to follow symlinks
+        flag = 0 if followlinks else AT_SYMLINK_NOFOLLOW
+
+        if topdown:
+            yield toppath, dirs, nondirs, topfd
+
+        for name in dirs:
+            try:
+                orig_st = fstatat(topfd, name, flag)
+                dirfd = openat(topfd, name, O_RDONLY)
+            except error as err:
+                close(topfd)
+                if onerror is not None:
+                    onerror(err)
+                return
+
+            if followlinks or _are_same_file(orig_st, fstat(dirfd)):
+                # close the current directory's FD before recursing into
+                # subdirectory
+                close(topfd)
+
+                dirpath = path.join(toppath, name)
+                for x in _fwalk(dirfd, dirpath, topdown, onerror, followlinks):
+                    yield x
+
+                # re-open it, and check it hasn't changed
+                topfd = open(toppath, O_RDONLY)
+                if not (followlinks or _are_same_file(orig_top_st, 
fstat(topfd))):
+                    break
+            else:
+                close(dirfd)
+
+        if not topdown:
+            yield toppath, dirs, nondirs, topfd
+
+        close(topfd)
+
+    __all__.append("fwalk")
+
 # Make sure os.environ exists, at least
 try:
     environ
@@ -598,9 +715,6 @@
 fsencode, fsdecode = _fscodec()
 del _fscodec
 
-def _exists(name):
-    return name in globals()
-
 # Supply spawn*() (probably only for Unix)
 if _exists("fork") and not _exists("spawnv") and _exists("execv"):
 
diff --git a/Lib/test/test_os.py b/Lib/test/test_os.py
--- a/Lib/test/test_os.py
+++ b/Lib/test/test_os.py
@@ -20,6 +20,8 @@
 import asyncore
 import asynchat
 import socket
+import itertools
+import stat
 try:
     import threading
 except ImportError:
@@ -159,7 +161,6 @@
         if not hasattr(os, "stat"):
             return
 
-        import stat
         result = os.stat(fname)
 
         # Make sure direct access works
@@ -476,7 +477,7 @@
 class WalkTests(unittest.TestCase):
     """Tests for os.walk()."""
 
-    def test_traversal(self):
+    def setUp(self):
         import os
         from os.path import join
 
@@ -586,6 +587,58 @@
                     os.remove(dirname)
         os.rmdir(support.TESTFN)
 
+@unittest.skipUnless(hasattr(os, 'fwalk'), "Test needs os.fwalk()")
+class FwalkTests(WalkTests):
+    """Tests for os.fwalk()."""
+
+    def test_compare_to_walk(self):
+        # compare with walk() results
+        for topdown, followlinks in itertools.product((True, False), repeat=2):
+            args = support.TESTFN, topdown, None, followlinks
+            expected = {}
+            for root, dirs, files in os.walk(*args):
+                expected[root] = (set(dirs), set(files))
+
+            for root, dirs, files, rootfd in os.fwalk(*args):
+                self.assertIn(root, expected)
+                self.assertEqual(expected[root], (set(dirs), set(files)))
+
+    def test_dir_fd(self):
+        # check returned file descriptors
+        for topdown, followlinks in itertools.product((True, False), repeat=2):
+            args = support.TESTFN, topdown, None, followlinks
+            for root, dirs, files, rootfd in os.fwalk(*args):
+                # check that the FD is valid
+                os.fstat(rootfd)
+                # check that fdlistdir() returns consistent information
+                self.assertEqual(set(os.fdlistdir(rootfd)), set(dirs) | 
set(files))
+
+    def test_fd_leak(self):
+        # Since we're opening a lot of FDs, we must be careful to avoid leaks:
+        # we both check that calling fwalk() a large number of times doesn't
+        # yield EMFILE, and that the minimum allocated FD hasn't changed.
+        minfd = os.dup(1)
+        os.close(minfd)
+        for i in range(512):
+            for x in os.fwalk(support.TESTFN):
+                pass
+        newfd = os.dup(1)
+        self.addCleanup(os.close, newfd)
+        self.assertEqual(newfd, minfd)
+
+    def tearDown(self):
+        # cleanup
+        for root, dirs, files, rootfd in os.fwalk(support.TESTFN, 
topdown=False):
+            for name in files:
+                os.unlinkat(rootfd, name)
+            for name in dirs:
+                st = os.fstatat(rootfd, name, os.AT_SYMLINK_NOFOLLOW)
+                if stat.S_ISDIR(st.st_mode):
+                    os.unlinkat(rootfd, name, os.AT_REMOVEDIR)
+                else:
+                    os.unlinkat(rootfd, name)
+        os.rmdir(support.TESTFN)
+
 class MakedirTests(unittest.TestCase):
     def setUp(self):
         os.mkdir(support.TESTFN)
@@ -1700,6 +1753,7 @@
         StatAttributeTests,
         EnvironTests,
         WalkTests,
+        FwalkTests,
         MakedirTests,
         DevNullTests,
         URandomTests,
_______________________________________________
Python-bugs-list mailing list
Unsubscribe: 
http://mail.python.org/mailman/options/python-bugs-list/archive%40mail-archive.com

Reply via email to