This addresses a FIXME in src/copy.c:

----
-/* FIXME: rewrite this to use a hash table so we avoid the quadratic
-   performance hit that's probably noticeable only on trees deeper
-   than a few hundred levels.  See use of active_dir_map in remove.c  */
----

The performance benefit is there, but on my machine with a PATH_MAX of
4096 it's hard to see, because the userland work `cp' does is dwarfed
by the work the kernel does on its behalf:

----
$ time src/cp.master -r a b

real    0m54.032s
user    0m3.236s    <-- coreutils HEAD
sys     0m47.335s

$ time src/cp -r a c

real    0m53.475s
user    0m0.624s    <-- with patch
sys     0m48.639s
----

Thanks,

Bo
From 224328e4bda44aa25cd5c98b1c13751ecea865c7 Mon Sep 17 00:00:00 2001
From: Bo Borgerson <[EMAIL PROTECTED]>
Date: Wed, 16 Apr 2008 13:44:36 -0400
Subject: [PATCH] Use a hash rather than a linked-list for cycle check in cp

* NEWS: List the change of cycle check behavior.
* src/copy.c (struct dir_info): These go in the hash.
(static Hash_table *ancestry_hash): This is the hash.
(static size_t dir_info_hasher): Simple hasher based on INO.
(static bool dir_info_comparator): Checks equivalence of INO and DEV.
(static bool ancestry_insert): Insert a dir_info entry for the current dir.
(static bool ancestry_delete): Delete the hash entry for the current dir.
(copy_internal): Now calls ancestry_insert and ancestry_delete instead of
managing the linked-list inline and calling is_ancestor for the cycle check.

Signed-off-by: Bo Borgerson <[EMAIL PROTECTED]>
---
 NEWS       |    2 +
 src/copy.c |  106 +++++++++++++++++++++++++++++++++++++++--------------------
 2 files changed, 72 insertions(+), 36 deletions(-)

diff --git a/NEWS b/NEWS
index 3a584e9..111e0c1 100644
--- a/NEWS
+++ b/NEWS
@@ -69,6 +69,8 @@ GNU coreutils NEWS                                    -*- outline -*-
 
   seq gives better diagnostics for invalid formats.
 
+  cp now uses a hash for cycle checks rather than a linked-list of ancestry.
+
 ** Portability
 
   rm now works properly even on systems like BeOS and Haiku,
diff --git a/src/copy.c b/src/copy.c
index c2f21a3..8e673f0 100644
--- a/src/copy.c
+++ b/src/copy.c
@@ -83,19 +83,74 @@ rpl_mkfifo (char const *file, mode_t mode)
 #define SAME_GROUP(A, B) ((A).st_gid == (B).st_gid)
 #define SAME_OWNER_AND_GROUP(A, B) (SAME_OWNER (A, B) && SAME_GROUP (A, B))
 
-struct dir_list
+
+/* This is for looking up directories by inode/device to ensure we don't
+   have any cycles. */
+static Hash_table *ancestors_hash;
+
+enum { INIT_ANCESTRY_SIZE = 47 };
+
+struct dir_info
 {
-  struct dir_list *parent;
   ino_t ino;
   dev_t dev;
 };
 
+static size_t
+dir_info_hasher (const void *entry, size_t tabsize)
+{
+  const struct dir_info *node = entry;
+  return node->ino % tabsize;
+}
+
+static bool
+dir_info_comparator (const void *e1, const void *e2)
+{
+  const struct dir_info *n1 = e1, *n2 = e2;
+  return n1->ino == n2->ino && n1->dev == n2->dev;
+}
+
+/* First check whether this directory has been seen.  If it has then
+   return false because we've encountered a cycle in the graph. If it
+   hasn't, then insert it into the ancestry hash and return true. */
+
+static bool
+ancestry_insert (struct dir_info *dir, struct stat *src_sb)
+{
+  dir->ino = src_sb->st_ino;
+  dir->dev = src_sb->st_dev;
+
+  if (! ancestors_hash)
+    {
+      ancestors_hash = hash_initialize (INIT_ANCESTRY_SIZE, NULL,
+					dir_info_hasher,
+					dir_info_comparator, NULL);
+      if (! ancestors_hash)
+	xalloc_die ();
+    }
+
+  if (hash_lookup (ancestors_hash, dir))
+    return false;
+
+  hash_insert (ancestors_hash, dir);
+
+  return true;
+}
+
+/* This is called when recursion through DIR is complete.  Note that
+   `*dir' is actually a pointer into the stack of the caller, so there's
+   no free here. */
+static inline void
+ancestry_delete (const struct dir_info *dir)
+{
+  hash_delete (ancestors_hash, dir);
+}
+
 /* Initial size of the cp.dest_info hash table.  */
 #define DEST_INFO_INITIAL_CAPACITY 61
 
 static bool copy_internal (char const *src_name, char const *dst_name,
 			   bool new_dst, dev_t device,
-			   struct dir_list *ancestors,
 			   const struct cp_options *x,
 			   bool command_line_arg,
 			   bool *copy_into_self,
@@ -110,23 +165,6 @@ static char const *top_level_dst_name;
 /* The invocation name of this program.  */
 extern char *program_name;
 
-/* FIXME: describe */
-/* FIXME: rewrite this to use a hash table so we avoid the quadratic
-   performance hit that's probably noticeable only on trees deeper
-   than a few hundred levels.  See use of active_dir_map in remove.c  */
-
-static bool
-is_ancestor (const struct stat *sb, const struct dir_list *ancestors)
-{
-  while (ancestors != 0)
-    {
-      if (ancestors->ino == sb->st_ino && ancestors->dev == sb->st_dev)
-	return true;
-      ancestors = ancestors->parent;
-    }
-  return false;
-}
-
 /* Read the contents of the directory SRC_NAME_IN, and recursively
    copy the contents to DST_NAME_IN.  NEW_DST is true if
    DST_NAME_IN is a directory that was created previously in the
@@ -137,8 +175,8 @@ is_ancestor (const struct stat *sb, const struct dir_list *ancestors)
 
 static bool
 copy_dir (char const *src_name_in, char const *dst_name_in, bool new_dst,
-	  const struct stat *src_sb, struct dir_list *ancestors,
-	  const struct cp_options *x, bool *copy_into_self)
+	  const struct stat *src_sb, const struct cp_options *x,
+	  bool *copy_into_self)
 {
   char *name_space;
   char *namep;
@@ -167,7 +205,7 @@ copy_dir (char const *src_name_in, char const *dst_name_in, bool new_dst,
       char *dst_name = file_name_concat (dst_name_in, namep, NULL);
 
       ok &= copy_internal (src_name, dst_name, new_dst, src_sb->st_dev,
-			   ancestors, &non_command_line_options, false,
+			   &non_command_line_options, false,
 			   &local_copy_into_self, NULL);
       *copy_into_self |= local_copy_into_self;
 
@@ -1056,7 +1094,6 @@ static bool
 copy_internal (char const *src_name, char const *dst_name,
 	       bool new_dst,
 	       dev_t device,
-	       struct dir_list *ancestors,
 	       const struct cp_options *x,
 	       bool command_line_arg,
 	       bool *copy_into_self,
@@ -1673,27 +1710,21 @@ copy_internal (char const *src_name, char const *dst_name,
 
   if (S_ISDIR (src_mode))
     {
-      struct dir_list *dir;
+      struct dir_info dir;
 
-      /* If this directory has been copied before during the
+      /* Insert the current directory in the list of parents.
+         If this directory has been copied before during the
          recursion, there is a symbolic link to an ancestor
          directory of the symbolic link.  It is impossible to
          continue to copy this, unless we've got an infinite disk.  */
 
-      if (is_ancestor (&src_sb, ancestors))
+      if (! ancestry_insert (&dir, &src_sb))
 	{
 	  error (0, 0, _("cannot copy cyclic symbolic link %s"),
 		 quote (src_name));
 	  goto un_backup;
 	}
 
-      /* Insert the current directory in the list of parents.  */
-
-      dir = alloca (sizeof *dir);
-      dir->parent = ancestors;
-      dir->ino = src_sb.st_ino;
-      dir->dev = src_sb.st_dev;
-
       if (new_dst || !S_ISDIR (dst_sb.st_mode))
 	{
 	  /* POSIX says mkdir's behavior is implementation-defined when
@@ -1753,9 +1784,12 @@ copy_internal (char const *src_name, char const *dst_name,
 	     this fails -- otherwise, the failure to read a single file
 	     in a source directory would cause the containing destination
 	     directory not to have owner/perms set properly.  */
-	  delayed_ok = copy_dir (src_name, dst_name, new_dst, &src_sb, dir, x,
+	  delayed_ok = copy_dir (src_name, dst_name, new_dst, &src_sb, x,
 				 copy_into_self);
 	}
+
+      /* This whole branch has been copied now. */
+      ancestry_delete (&dir);
     }
   else if (x->symbolic_link)
     {
@@ -2097,7 +2131,7 @@ copy (char const *src_name, char const *dst_name,
   top_level_src_name = src_name;
   top_level_dst_name = dst_name;
 
-  return copy_internal (src_name, dst_name, nonexistent_dst, 0, NULL,
+  return copy_internal (src_name, dst_name, nonexistent_dst, 0,
 			options, true, copy_into_self, rename_succeeded);
 }
 
-- 
1.5.2.5

_______________________________________________
Bug-coreutils mailing list
Bug-coreutils@gnu.org
http://lists.gnu.org/mailman/listinfo/bug-coreutils

Reply via email to