Greetings,

  For the folk on linux-fsdevel:
     I have been working on the filehandle -> dentry lookup code in
     knfsd to get it to make more effective use of the dcache. (it
     previously had several caches of it's own which I didn't think
     were necessary).

     My first (released) attempt was accepted in to 2.3.23 (or there
     abouts) by Linus, and works quite well.  However, I have since
     realised that it showed flagrent disregard for VFS locking
     protocols.

     I am sending this to linux-fsdevel in there hope that there are
     people there who understand the dcache deeply and can comment on
     my code.

  I have attempted add proper VFS locking to knfsd and the patch below
  is the result.   I would appreciate it if anyone with a good
  understanding of, and interest in, the dcache would look it over
  to see if I have missed anything.
  I have tested the patch a reasonable amount, but as it is very hard
  to produce the sort of race that would be problematic, I can really
  only rely of code-review to increaes my confidence of correctness.

 A brief overview of what is happening is that knfsd tries to build a
 dcache path from some file or directory up to the root - or some other
 pre-existing part of the dcache. It then splices it in.
 If, at the same time, some other process tries to build a path down
 to the file (or something near by) the two could run into each other
 and create confusion.

 The worst case scenario is if we have, say
      a/b/c/d/e/f/g/h
  where a/b/c is in the dcache, and knfsd needs to access h.
  It starts building h, then g/h then f/g/h,
  At this point some process does
    rename a/b/c/d/e/f/g a/g

  knfsd will notice that a/b/c/d/e/f has appearing in the dcache and
  will back off and try again, but if, due to intense memory presure,
  d/e/f gets flushed from the dcache before knfsd gets a look-in, it
  will happily create e/ and d/ and splice in d/e/f/g/h, which would
  now be wrong.

  I think this case is extremely unlikely, and unsolvable without help
  from individual filesystems.  I do plan to get to that eventually,
  but for now, can anyone see any problems with this patch other than
  the above?

Thanks for your time,
NeilBrown

  This patch is against 2.3.28

*** fs/nfsd/nfsfh.c     1999/11/15 04:54:30     1.1
--- fs/nfsd/nfsfh.c     1999/11/19 01:40:34     1.2
***************
*** 5,11 ****
   *
   * Copyright (C) 1995, 1996 Olaf Kirch <[EMAIL PROTECTED]>
   * Portions Copyright (C) 1999 G. Allen Morris III <[EMAIL PROTECTED]>
!  * Extensive cleanup by Neil Brown <[EMAIL PROTECTED]> Southern-Spring 1999
   */
  
  #include <linux/sched.h>
--- 5,11 ----
   *
   * Copyright (C) 1995, 1996 Olaf Kirch <[EMAIL PROTECTED]>
   * Portions Copyright (C) 1999 G. Allen Morris III <[EMAIL PROTECTED]>
!  * Extensive rewrite by Neil Brown <[EMAIL PROTECTED]> Southern-Spring 1999
   */
  
  #include <linux/sched.h>
***************
*** 63,69 ****
  }
  
  /*
!  * Read a directory and return the name of the specified entry.
   */
  static int get_ino_name(struct dentry *dentry, struct qstr *name, unsigned long ino)
  {
--- 63,69 ----
  }
  
  /*
!  * Read a directory and return the name of the specified entry.  i_sem is already 
down().
   */
  static int get_ino_name(struct dentry *dentry, struct qstr *name, unsigned long ino)
  {
***************
*** 94,102 ****
        buffer.sequence = 0;
        while (1) {
                int old_seq = buffer.sequence;
-               down(&dir->i_sem);
                error = file.f_op->readdir(&file, &buffer, filldir_one);
-               up(&dir->i_sem);
                if (error < 0)
                        break;
  
--- 94,100 ----
***************
*** 118,124 ****
  /* this should be provided by each filesystem in an nfsd_operations interface as
   * iget isn't really the right interface
   */
! static inline struct dentry *nfsd_iget(struct super_block *sb, unsigned long ino, 
__u32 generation)
  {
  
        /* iget isn't really right if the inode is currently unallocated!!
--- 116,122 ----
  /* this should be provided by each filesystem in an nfsd_operations interface as
   * iget isn't really the right interface
   */
! static struct dentry *nfsd_iget(struct super_block *sb, unsigned long ino, __u32 
generation)
  {
  
        /* iget isn't really right if the inode is currently unallocated!!
***************
*** 209,214 ****
--- 207,213 ----
         * it is well connected.  But nobody returns different dentrys do they?
         */
        pdentry = child->d_inode->i_op->lookup(child->d_inode, tdentry);
+       d_drop(tdentry); /* we never want ".." hashed */
        if (!pdentry) {
                /* I don't want to return a ".." dentry.
                 * I would prefer to return an unconnected "IS_ROOT" dentry,
***************
*** 233,242 ****
                if (pdentry == NULL)
                        pdentry = ERR_PTR(-ENOMEM);
        }
!       dput(tdentry); /* it was never rehashed, it will be discarded */
        return pdentry;
  }
  
  /*
   * This is the basic lookup mechanism for turning an NFS file handle
   * into a dentry.
--- 232,293 ----
                if (pdentry == NULL)
                        pdentry = ERR_PTR(-ENOMEM);
        }
!       dput(tdentry); /* it is not hashed, it will be discarded */
        return pdentry;
  }
  
+ static struct dentry *splice(struct dentry *child, struct dentry *parent)
+ {
+       int err = 0;
+       struct qstr qs;
+       char namebuf[256];
+       struct list_head *lp;
+       struct dentry *tmp;
+       /* child is an IS_ROOT (anonymous) dentry, but it is hypothesised that
+        * it should be a child of parent.
+        * We see if we can find a name and, if we can - splice it in.
+        * We hold the i_sem on the parent the whole time to try to follow locking 
+protocols.
+        */
+       qs.name = namebuf;
+       down(&parent->d_inode->i_sem);
+ 
+       /* Now, things might have changed while we waited.
+        * Possibly a friendly filesystem found child and spliced it in in response
+        * to a lookup (though nobody does this yet).  In this case, just succeed.
+        */
+       if (child->d_parent == parent) goto out;
+       /* Possibly a new dentry has been made for this child->d_inode in parent by
+        * a lookup.  In this case return that dentry. caller must notice and act 
+accordingly
+        */
+       for (lp = child->d_inode->i_dentry.next; lp != &child->d_inode->i_dentry ; 
+lp=lp->next) {
+               tmp = list_entry(lp,struct dentry, d_alias);
+               if (tmp->d_parent == parent) {
+                       child = dget(tmp);
+                       goto out;
+               }
+       }
+       /* well, if we can find a name for child in parent, it should be safe to 
+splice it in */
+       err = get_ino_name(parent, &qs, child->d_inode->i_ino);
+       if (err)
+               goto out;
+       tmp = d_lookup(parent, &qs);
+       if (tmp) {
+               /* Now that IS odd.  I wonder what it means... */
+               err = -EEXIST;
+               printk("nfsd-fh: found a name that I didn't expect: %s/%s\n", 
+parent->d_name.name, qs.name);
+               dput(tmp);
+               goto out;
+       }
+       err = d_splice(child, parent, &qs);
+       dprintk("nfsd_fh: found name %s for ino %ld\n", child->d_name.name, 
+child->d_inode->i_ino);
+  out:
+       up(&parent->d_inode->i_sem);
+       if (err)
+               return ERR_PTR(err);
+       else
+               return child;
+ }
+ 
  /*
   * This is the basic lookup mechanism for turning an NFS file handle
   * into a dentry.
***************
*** 248,262 ****
  find_fh_dentry(struct super_block *sb, struct knfs_fh *fh, int needpath)
  {
        struct dentry *dentry, *result = NULL;
!       struct qstr qs;
!       char namebuf[256];
        int  found =0;
        u32 err;
  
!       qs.name = namebuf;
        /*
         * Attempt to find the inode.
         */
        result = nfsd_iget(sb, fh->fh_ino, fh->fh_generation);
        err = PTR_ERR(result);
        if (IS_ERR(result))
--- 299,320 ----
  find_fh_dentry(struct super_block *sb, struct knfs_fh *fh, int needpath)
  {
        struct dentry *dentry, *result = NULL;
!       struct dentry *tmp;
        int  found =0;
        u32 err;
+       /* This semaphore is needed to make sure that only one unconnected (free)
+        * dcache path ever exists, as otherwise two partial paths might get
+        * joined together, which would be very confusing.
+        * If there is ever an unconnected non-root directory, then this lock
+        * must be held.  This could sensibly be per-filesystem.
+        */
+       static DECLARE_MUTEX(free_path_sem);
  
!       nfsdstats.fh_lookup++;
        /*
         * Attempt to find the inode.
         */
+  retry:
        result = nfsd_iget(sb, fh->fh_ino, fh->fh_generation);
        err = PTR_ERR(result);
        if (IS_ERR(result))
***************
*** 269,278 ****
        if (!IS_ROOT(result) || result->d_inode->i_sb->s_root ==result)
                return result;
  
!       /* result is now a "root" dentry, which may be adequate as it stands, or else
         * will get spliced into the dcache tree */
  
        if (!S_ISDIR(result->d_inode->i_mode) && ! needpath) {
                return result;
        }
  
--- 327,337 ----
        if (!IS_ROOT(result) || result->d_inode->i_sb->s_root ==result)
                return result;
  
!       /* result is now an anonymous dentry, which may be adequate as it stands, or 
else
         * will get spliced into the dcache tree */
  
        if (!S_ISDIR(result->d_inode->i_mode) && ! needpath) {
+               nfsdstats.fh_anon++;
                return result;
        }
  
***************
*** 280,287 ****
--- 339,348 ----
         * location in the tree.
         */
        dprintk("nfs_fh: need to look harder for %d/%d\n",sb->s_dev,fh->fh_ino);
+       down(&free_path_sem);
        found = 0;
        if (!S_ISDIR(result->d_inode->i_mode)) {
+               nfsdstats.fh_nocache_nondir++;
                if (fh->fh_dirino == 0)
                        goto err_result; /* don't know how to find parent */
                else {
***************
*** 297,318 ****
                        }
                        if (!IS_ROOT(dentry) || dentry->d_inode->i_sb->s_root ==dentry)
                                found = 1;
!                       err = get_ino_name(dentry, &qs, result->d_inode->i_ino);
!                       if (err)
!                               goto err_dentry;
! 
!                       /* OK, we have the name in parent of inode,  lets fill in the 
dentry */
!                       err = d_splice(result, dentry, &qs);
!                       if (err)
                                goto err_dentry;
                }
!       }
!       else
                dentry = dget(result);
  
        while(!found) {
                /* LOOP INVARIANT */
!               /* haven't found a place in the tree yet, but we do have a path
                 * from dentry down to result, and dentry is a directory.
                 * Have a hold on dentry and result */
                struct dentry *pdentry;
--- 358,383 ----
                        }
                        if (!IS_ROOT(dentry) || dentry->d_inode->i_sb->s_root ==dentry)
                                found = 1;
!                       tmp = splice(result, dentry);
!                       err = PTR_ERR(tmp);
!                       if (IS_ERR(tmp))
                                goto err_dentry;
+                       if (tmp != result) {
+                               /* it is safe to just use tmp instead, but we must 
+discard result first */
+                               d_drop(result);
+                               dput(result);
+                               result = tmp;
+                               /* If !found, then this is really wierd, but it 
+shouldn't hurt */
+                       }
                }
!       } else {
!               nfsdstats.fh_nocache_dir++;
                dentry = dget(result);
+       }
  
        while(!found) {
                /* LOOP INVARIANT */
!               /* haven't found a place in the tree yet, but we do have a free path
                 * from dentry down to result, and dentry is a directory.
                 * Have a hold on dentry and result */
                struct dentry *pdentry;
***************
*** 334,356 ****
                if (!IS_ROOT(pdentry) || parent->i_sb->s_root == pdentry)
                        found = 1;
  
!               err = get_ino_name(pdentry, &qs, dentry->d_inode->i_ino);
!               if (err) {
                        dput(pdentry);
                        goto err_dentry;
                }
!               err = d_splice(dentry, pdentry, &qs);
!               dprintk("nfsd_fh: found name %s for ino %ld\n", dentry->d_name.name, 
dentry->d_inode->i_ino);
                dput(dentry);
                dentry = pdentry;
        }
        dput(dentry);
        return result;
  
  err_dentry:
        dput(dentry);
  err_result:
        dput(result);
  err_out:
        if (err == -ESTALE)
                nfsdstats.fh_stale++;
--- 399,439 ----
                if (!IS_ROOT(pdentry) || parent->i_sb->s_root == pdentry)
                        found = 1;
  
!               tmp = splice(dentry, pdentry);
!               if (tmp != dentry) {
!                       /* Something wrong.  We need to drop thw whole dentry->result 
path
!                        * whatever it was
!                        */
!                       struct dentry *d;
!                       for (d=result ; d ; d=(d->d_parent == d)?NULL:d->d_parent)
!                               d_drop(d);
!               }
!               if (IS_ERR(tmp)) {
!                       err = PTR_ERR(tmp);
                        dput(pdentry);
                        goto err_dentry;
                }
!               if (tmp != dentry) {
!                       /* we lost a race,  try again
!                        */
!                       dput(tmp);
!                       dput(dentry);
!                       dput(result);   /* this will discard the whole free path, so 
we can up the semaphore */
!                       up(&free_path_sem);
!                       goto retry;
!               }
                dput(dentry);
                dentry = pdentry;
        }
        dput(dentry);
+       up(&free_path_sem);
        return result;
  
  err_dentry:
        dput(dentry);
  err_result:
        dput(result);
+       up(&free_path_sem);
  err_out:
        if (err == -ESTALE)
                nfsdstats.fh_stale++;
*** fs/nfsd/stats.c     1999/11/15 04:54:37     1.1
--- fs/nfsd/stats.c     1999/11/19 01:41:02
***************
*** 38,48 ****
                        nfsdstats.rchits,
                        nfsdstats.rcmisses,
                        nfsdstats.rcnocache,
!                       nfsdstats.fh_cached,
!                       nfsdstats.fh_valid,
!                       nfsdstats.fh_fixup,
                        nfsdstats.fh_lookup,
!                       nfsdstats.fh_stale);
  
        /* Assume we haven't hit EOF yet. Will be set by svc_proc_read. */
        *eof = 0;
--- 38,49 ----
                        nfsdstats.rchits,
                        nfsdstats.rcmisses,
                        nfsdstats.rcnocache,
!                       nfsdstats.fh_stale,
                        nfsdstats.fh_lookup,
!                       nfsdstats.fh_anon,
!                       nfsdstats.fh_nocache_dir,
!                       nfsdstats.fh_nocache_nondir);
! 
  
        /* Assume we haven't hit EOF yet. Will be set by svc_proc_read. */
        *eof = 0;
*** include/linux/nfsd/stats.h  1999/11/15 04:55:07     1.1
--- include/linux/nfsd/stats.h  1999/11/19 01:41:44
***************
*** 13,23 ****
        unsigned int    rchits;         /* repcache hits */
        unsigned int    rcmisses;       /* repcache hits */
        unsigned int    rcnocache;      /* uncached reqs */
-       unsigned int    fh_cached;      /* dentry cached */
-       unsigned int    fh_valid;       /* dentry validated */
-       unsigned int    fh_fixup;       /* dentry fixup validated */
-       unsigned int    fh_lookup;      /* new lookup required */
        unsigned int    fh_stale;       /* FH stale error */
  };
  
  #ifdef __KERNEL__
--- 13,23 ----
        unsigned int    rchits;         /* repcache hits */
        unsigned int    rcmisses;       /* repcache hits */
        unsigned int    rcnocache;      /* uncached reqs */
        unsigned int    fh_stale;       /* FH stale error */
+       unsigned int    fh_lookup;      /* dentry cached */
+       unsigned int    fh_anon;        /* anon file dentry returned */
+       unsigned int    fh_nocache_dir; /* filehandle not foudn in dcache */
+       unsigned int    fh_nocache_nondir;      /* filehandle not foudn in dcache */
  };
  
  #ifdef __KERNEL__

Reply via email to