Diff below changes the representation of a NFS mount point. It is adapted from NetBSD with some tweaks of mine and is required for properly locking NFSnodes.
The idea is to keep a reference to the root vnode in 'struct nfsmount' instead of doing a lookup every time VFS_ROOT() is called. Calls to nfs_root() in the diskless path also become useless and we don't have to play with locking there. Finally we look at the number of refcounts when unmounting a filesystem, this is useful when debugging async races as reported by bluhm@ on bugs@. Note that nfs_root() already returns a "locked" node, that's why I'm using vput(9) and not vrele(9). However in the current context of NFS these functions are identical since VOP_LOCK/UNLOCK are noop. I have tested this on diskless & non-diskless, but I'm looking for more tests 'cause this area is critical. Comments? Oks? Index: nfs/nfs_node.c =================================================================== RCS file: /cvs/src/sys/nfs/nfs_node.c,v retrieving revision 1.66 diff -u -p -r1.66 nfs_node.c --- nfs/nfs_node.c 28 Mar 2018 09:40:26 -0000 1.66 +++ nfs/nfs_node.c 3 Apr 2018 12:22:12 -0000 @@ -138,19 +138,7 @@ loop: /* we now have an nfsnode on this vnode */ vp->v_flag &= ~VLARVAL; np->n_vnode = vp; - rw_init(&np->n_commitlock, "nfs_commitlk"); - - /* - * Are we getting the root? If so, make sure the vnode flags - * are correct - */ - if ((fhsize == nmp->nm_fhsize) && !bcmp(fh, nmp->nm_fh, fhsize)) { - if (vp->v_type == VNON) - vp->v_type = VDIR; - vp->v_flag |= VROOT; - } - np->n_fhp = &np->n_fh; bcopy(fh, np->n_fhp, fhsize); np->n_fhsize = fhsize; Index: nfs/nfs_vfsops.c =================================================================== RCS file: /cvs/src/sys/nfs/nfs_vfsops.c,v retrieving revision 1.116 diff -u -p -r1.116 nfs_vfsops.c --- nfs/nfs_vfsops.c 10 Feb 2018 05:24:23 -0000 1.116 +++ nfs/nfs_vfsops.c 3 Apr 2018 13:14:40 -0000 @@ -71,11 +71,13 @@ extern struct nfsstats nfsstats; extern int nfs_ticks; extern u_int32_t nfs_procids[NFS_NPROCS]; -int nfs_sysctl(int *, u_int, void *, size_t *, void *, size_t, struct proc *); -int nfs_checkexp(struct mount *, struct mbuf *, int *, struct ucred **); -struct mount *nfs_mount_diskless(struct nfs_dlmount *, char *, int); +int nfs_sysctl(int *, u_int, void *, size_t *, void *, size_t, + struct proc *); +int nfs_checkexp(struct mount *, struct mbuf *, int *, struct ucred **); +struct mount *nfs_mount_diskless(struct nfs_dlmount *, char *, int, + struct vnode **, struct proc *p); int mountnfs(struct nfs_args *, struct mount *, struct mbuf *, - const char *, char *); + const char *, char *, struct vnode **, struct proc *p); int nfs_quotactl(struct mount *, int, uid_t, caddr_t, struct proc *); int nfs_root(struct mount *, struct vnode **); int nfs_start(struct mount *, int, struct proc *); @@ -123,15 +125,13 @@ nfs_statfs(struct mount *mp, struct stat struct nfsmount *nmp = VFSTONFS(mp); int error = 0, retattr; struct ucred *cred; - struct nfsnode *np; u_quad_t tquad; info.nmi_v3 = (nmp->nm_flag & NFSMNT_NFSV3); - error = nfs_nget(mp, (nfsfh_t *)nmp->nm_fh, nmp->nm_fhsize, &np); + error = nfs_root(mp, &vp); if (error) return (error); - vp = NFSTOV(np); cred = crget(); cred->cr_ngroups = 0; if (info.nmi_v3 && (nmp->nm_flag & NFSMNT_GOTFSINFO) == 0) @@ -178,7 +178,7 @@ nfs_statfs(struct mount *mp, struct stat copy_statfs_info(sbp, mp); m_freem(info.nmi_mrep); nfsmout: - vrele(vp); + vput(vp); crfree(cred); return (error); } @@ -281,14 +281,14 @@ nfs_mountroot(void) */ if (nfs_boot_getfh(&nfs_diskless.nd_boot, "root", &nfs_diskless.nd_root, -1)) panic("nfs_mountroot: root"); - mp = nfs_mount_diskless(&nfs_diskless.nd_root, "/", 0); - nfs_root(mp, &rootvp); + mp = nfs_mount_diskless(&nfs_diskless.nd_root, "/", 0, &vp, procp); printf("root on %s\n", nfs_diskless.nd_root.ndm_host); /* * Link it into the mount list. */ TAILQ_INSERT_TAIL(&mountlist, mp, mnt_list); + rootvp = vp; vfs_unbusy(mp); /* Get root attributes (for the time). */ @@ -333,8 +333,8 @@ nfs_mountroot(void) */ error = nfs_boot_getfh(&nfs_diskless.nd_boot, "swap", &nfs_diskless.nd_swap, 5); if (!error) { - mp = nfs_mount_diskless(&nfs_diskless.nd_swap, "/swap", 0); - nfs_root(mp, &vp); + mp = nfs_mount_diskless(&nfs_diskless.nd_swap, "/swap", 0, &vp, + procp); vfs_unbusy(mp); /* @@ -376,7 +376,8 @@ nfs_mountroot(void) * Internal version of mount system call for diskless setup. */ struct mount * -nfs_mount_diskless(struct nfs_dlmount *ndmntp, char *mntname, int mntflag) +nfs_mount_diskless(struct nfs_dlmount *ndmntp, char *mntname, int mntflag, + struct vnode **vpp, struct proc *p) { struct mount *mp; struct mbuf *m; @@ -392,7 +393,7 @@ nfs_mount_diskless(struct nfs_dlmount *n (m->m_len = ndmntp->ndm_args.addr->sa_len)); error = mountnfs(&ndmntp->ndm_args, mp, m, mntname, - ndmntp->ndm_args.hostname); + ndmntp->ndm_args.hostname, vpp, p); if (error) panic("nfs_mountroot: mount %s failed: %d", mntname, error); @@ -556,6 +557,7 @@ nfs_mount(struct mount *mp, const char * int error; struct nfs_args *args = data; struct mbuf *nam; + struct vnode *vp; char hst[MNAMELEN]; size_t len; u_char nfh[NFSX_V3FHMAX]; @@ -599,7 +601,7 @@ nfs_mount(struct mount *mp, const char * if (error) return (error); args->fh = nfh; - error = mountnfs(args, mp, nam, path, hst); + error = mountnfs(args, mp, nam, path, hst, &vp, p); return (error); } @@ -608,9 +610,12 @@ nfs_mount(struct mount *mp, const char * */ int mountnfs(struct nfs_args *argp, struct mount *mp, struct mbuf *nam, - const char *pth, char *hst) + const char *pth, char *hst, struct vnode **vpp, struct proc *p) { struct nfsmount *nmp; + struct nfsnode *np; + struct vnode *vp; + struct vattr attr; int error; if (mp->mnt_flag & MNT_UPDATE) { @@ -633,12 +638,10 @@ mountnfs(struct nfs_args *argp, struct m nmp->nm_readdirsize = NFS_READDIRSIZE; nmp->nm_numgrps = NFS_MAXGRPS; nmp->nm_readahead = NFS_DEFRAHEAD; - nmp->nm_fhsize = argp->fhsize; nmp->nm_acregmin = NFS_MINATTRTIMO; nmp->nm_acregmax = NFS_MAXATTRTIMO; nmp->nm_acdirmin = NFS_MINATTRTIMO; nmp->nm_acdirmax = NFS_MAXATTRTIMO; - bcopy(argp->fh, nmp->nm_fh, argp->fhsize); mp->mnt_stat.f_namemax = MAXNAMLEN; memset(mp->mnt_stat.f_mntonname, 0, MNAMELEN); strlcpy(mp->mnt_stat.f_mntonname, pth, MNAMELEN); @@ -673,6 +676,30 @@ mountnfs(struct nfs_args *argp, struct m * point. */ mp->mnt_stat.f_iosize = NFS_MAXDGRAMDATA; + error = nfs_nget(mp, (nfsfh_t *)argp->fh, argp->fhsize, &np); + if (error) + goto bad; + vp = NFSTOV(np); + error = VOP_GETATTR(vp, &attr, p->p_ucred, p); + if (error) { + vput(vp); + goto bad; + } + + /* + * A reference count is needed on the nfsnode representing the + * remote root. If this object is not persistent, then backward + * traversals of the mount point (i.e. "..") will not work if + * the nfsnode gets flushed out of the cache. Ufs does not have + * this problem, because one can identify root inodes by their + * number == ROOTINO (2). So, just unlock, but no rele. + */ + nmp->nm_vnode = vp; + if (vp->v_type == VNON) + vp->v_type = VDIR; + vp->v_flag = VROOT; + VOP_UNLOCK(vp, curproc); + *vpp = vp; return (0); bad: @@ -687,18 +714,35 @@ int nfs_unmount(struct mount *mp, int mntflags, struct proc *p) { struct nfsmount *nmp; - int error, flags; + struct vnode *vp; + int error, flags = 0; nmp = VFSTONFS(mp); - flags = 0; + error = nfs_root(mp, &vp); + if (error) + return (error); + + if ((mntflags & MNT_FORCE) == 0 && vp->v_usecount > 2) { + vput(vp); + return (EBUSY); + } if (mntflags & MNT_FORCE) flags |= FORCECLOSE; - error = vflush(mp, NULL, flags); - if (error) + error = vflush(mp, vp, flags); + if (error) { + vput(vp); return (error); + } + /* + * There are two references count to get rid of here: one + * from mountnfs() and one from nfs_root() above. + */ + vrele(vp); + vput(vp); + vgone(vp); nfs_disconnect(nmp); m_freem(nmp->nm_nam); timeout_del(&nmp->nm_rtimeout); @@ -713,15 +757,19 @@ nfs_unmount(struct mount *mp, int mntfla int nfs_root(struct mount *mp, struct vnode **vpp) { + struct vnode *vp; struct nfsmount *nmp; - struct nfsnode *np; int error; nmp = VFSTONFS(mp); - error = nfs_nget(mp, (nfsfh_t *)nmp->nm_fh, nmp->nm_fhsize, &np); - if (error) + vp = nmp->nm_vnode; + vref(vp); + error = vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, curproc); + if (error) { + vrele(vp); return (error); - *vpp = NFSTOV(np); + } + *vpp = vp; return (0); } Index: nfs/nfsmount.h =================================================================== RCS file: /cvs/src/sys/nfs/nfsmount.h,v retrieving revision 1.27 diff -u -p -r1.27 nfsmount.h --- nfs/nfsmount.h 22 Feb 2017 11:42:46 -0000 1.27 +++ nfs/nfsmount.h 3 Apr 2018 12:22:12 -0000 @@ -49,12 +49,11 @@ struct nfsmount { nm_ntree; /* filehandle/node tree */ TAILQ_HEAD(reqs, nfsreq) nm_reqsq; /* request queue for this mount. */ - struct timeout nm_rtimeout; /* timeout (scans/resends nm_reqsq). */ - int nm_flag; /* Flags for soft/hard... */ + struct timeout nm_rtimeout; /* timeout (scans/resends nm_reqsq). */ struct mount *nm_mountp; /* Vfs structure for this filesystem */ + struct vnode *nm_vnode; /* vnode of root dir */ + int nm_flag; /* Flags for soft/hard... */ int nm_numgrps; /* Max. size of groupslist */ - u_char nm_fh[NFSX_V3FHMAX]; /* File handle of root dir */ - int nm_fhsize; /* Size of root file handle */ struct socket *nm_so; /* Rpc socket */ int nm_sotype; /* Type of socket */ int nm_soproto; /* and protocol */