The branch main has been updated by rmacklem: URL: https://cgit.FreeBSD.org/src/commit/?id=72e57bc264179818f55b49deb97bc40f56c2b936
commit 72e57bc264179818f55b49deb97bc40f56c2b936 Author: Rick Macklem <[email protected]> AuthorDate: 2026-06-04 01:26:36 +0000 Commit: Rick Macklem <[email protected]> CommitDate: 2026-06-04 01:28:56 +0000 nfsd: Add support for striped Flexible File layout Without this patch, the NFSv4.1/4.2 pNFS server configuration did not support striping. This was mainly because the Linux client driver did not support it either. The Linux client driver for Flexible File layout does now support striping. (Linux kernel version 6.18 or newer) As such, this patch adds striping support. The configuration is currently just two new sysctls called vfs.nfsd.pnfsstripeunit - Size (in bytes) of a stripe vfs.nfsd.pnfsstripecnt - # of DSs to stripe across A setting of 0 for the first sysctl and 1 for the second disables striping. A patch that allows use of a different striping configuration for each exported MDS file system is planned for the future. The pnfsdscopymr may be broken by this patch, but since no one reported that they were actually using a pNFS server configuration, I do not believe that will be a problem at this time. Until the FreeBSD NFSv4.1/4.2 client is patched to handle striped flexible file layouts, mounts to a striped pNFS configuration must be done without the "pnfs" mount option. (Linux systems with a kernel version of 6.18 or newer should be able to handle a striped pNFS configuration.) Future patches that convert the pNFS server to a "loosely coupled" configuration (which allows the use of non-FreeBSD servers as DSs) are anticipated. --- sys/fs/nfs/nfs.h | 3 + sys/fs/nfs/nfs_var.h | 8 +- sys/fs/nfs/nfsproto.h | 16 +- sys/fs/nfs/nfsrvstate.h | 13 + sys/fs/nfsserver/nfs_nfsdport.c | 1075 +++++++++++++++++++++++++++++--------- sys/fs/nfsserver/nfs_nfsdserv.c | 16 +- sys/fs/nfsserver/nfs_nfsdstate.c | 91 ++-- 7 files changed, 908 insertions(+), 314 deletions(-) diff --git a/sys/fs/nfs/nfs.h b/sys/fs/nfs/nfs.h index b30e4f17b7c5..1056595e8cde 100644 --- a/sys/fs/nfs/nfs.h +++ b/sys/fs/nfs/nfs.h @@ -199,9 +199,12 @@ struct nfsd_nfsd_args { * NFSDEV_MAXMIRRORS - Maximum level of mirroring for a DS. * (Most will only put files on two DSs, but this setting allows up to 4.) * NFSDEV_MAXVERS - maximum number of NFS versions supported by Flex File. + * NFSDEV_MAXSTRIPE - sanity limit for maximum number of DSs used to stripe a + * file, which is nfsrv_maxstripecnt set by the nfsd option. */ #define NFSDEV_MAXMIRRORS 4 #define NFSDEV_MAXVERS 4 +#define NFSDEV_MAXSTRIPE 1024 struct nfsd_pnfsd_args { int op; /* Which pNFSd op to perform. */ diff --git a/sys/fs/nfs/nfs_var.h b/sys/fs/nfs/nfs_var.h index 8d8de381cbaf..00ed32ae705e 100644 --- a/sys/fs/nfs/nfs_var.h +++ b/sys/fs/nfs/nfs_var.h @@ -761,10 +761,10 @@ int nfsvno_testexp(struct nfsrv_descript *, struct nfsexstuff *); uint32_t nfsrv_hashfh(fhandle_t *); uint32_t nfsrv_hashsessionid(uint8_t *); void nfsrv_backupstable(void); -int nfsrv_dsgetdevandfh(struct vnode *, NFSPROC_T *, int *, fhandle_t *, - char *); -int nfsrv_dsgetsockmnt(struct vnode *, int, char *, int *, int *, - NFSPROC_T *, struct vnode **, fhandle_t *, char *, char *, +int nfsrv_dsgetdevandfh(struct vnode *, NFSPROC_T *, int *, uint64_t *, int *, + fhandle_t **, char **); +int nfsrv_dsgetsockmnt(struct vnode *, int, char *, int *, int *, int *, + uint64_t *, NFSPROC_T *, struct vnode ***, fhandle_t **, char **, char *, struct vnode **, struct nfsmount **, struct nfsmount *, int *, int *); int nfsrv_dscreate(struct vnode *, struct vattr *, struct vattr *, fhandle_t *, struct pnfsdsfile *, struct pnfsdsattr *, char *, diff --git a/sys/fs/nfs/nfsproto.h b/sys/fs/nfs/nfsproto.h index 1054c85d8e8f..467018c39535 100644 --- a/sys/fs/nfs/nfsproto.h +++ b/sys/fs/nfs/nfsproto.h @@ -282,9 +282,19 @@ #define NFSX_V4NAMEDATTRFH 3 #define NFSX_V4FILELAYOUT (4 * NFSX_UNSIGNED + NFSX_V4DEVICEID + \ NFSX_HYPER + NFSM_RNDUP(NFSX_V4PNFSFH)) -#define NFSX_V4FLEXLAYOUT(m) (NFSX_HYPER + 3 * NFSX_UNSIGNED + \ - ((m) * (NFSX_V4DEVICEID + NFSX_STATEID + NFSM_RNDUP(NFSX_V4PNFSFH) + \ - 8 * NFSX_UNSIGNED))) + +/* + * NFSX_V4FLEXLAYOUT() is the size in bytes of the XDR for ff_layout4, given + * m - # of mirrors + * s - # of stripes + * NFSX_HYPER is for ffl_stripe_unit + * "m + 3" calculates the counts of ffl_mirrors and ffm_data_servers plus + * ffl_flags and ffl_stats_collect_hint. + * The final section calculates the size of ff_data_server4. + */ +#define NFSX_V4FLEXLAYOUT(m, s) (NFSX_HYPER + ((m) + 3) * NFSX_UNSIGNED + \ + (((m) * (s)) * (NFSX_V4DEVICEID + NFSX_STATEID + \ + NFSM_RNDUP(NFSX_V4PNFSFH) + 7 * NFSX_UNSIGNED))) /* sizes common to multiple NFS versions */ #define NFSX_FHMAX (NFSX_V4FHMAX) diff --git a/sys/fs/nfs/nfsrvstate.h b/sys/fs/nfs/nfsrvstate.h index 0e93f87234b2..91365f66bde3 100644 --- a/sys/fs/nfs/nfsrvstate.h +++ b/sys/fs/nfs/nfsrvstate.h @@ -403,7 +403,20 @@ struct nfsdontlist { * in the metadata file's extended attribute called pnfsd.dsfile. */ #define PNFS_FILENAME_LEN (2 * sizeof(fhandle_t)) +struct opnfsdsfile { + fhandle_t dsf_fh; + uint32_t dsf_dir; + union { + struct sockaddr_in sin; + struct sockaddr_in6 sin6; + } dsf_nam; + char dsf_filename[PNFS_FILENAME_LEN + 1]; +}; + +/* New structure with stripe fields. */ struct pnfsdsfile { + uint32_t dsf_stripecnt; + uint64_t dsf_stripesiz; fhandle_t dsf_fh; uint32_t dsf_dir; union { diff --git a/sys/fs/nfsserver/nfs_nfsdport.c b/sys/fs/nfsserver/nfs_nfsdport.c index 1e6936ed351c..7fc8c426ba64 100644 --- a/sys/fs/nfsserver/nfs_nfsdport.c +++ b/sys/fs/nfsserver/nfs_nfsdport.c @@ -71,6 +71,9 @@ extern int nfs_bufpackets; extern u_long sb_max_adj; extern struct nfsv4lock nfsv4rootfs_lock; +static uint64_t nfsrv_stripesiz = 0; +static int nfsrv_maxstripecnt = 1; + VNET_DECLARE(int, nfsrv_numnfsd); VNET_DECLARE(struct nfsrv_stablefirst, nfsrv_stablefirst); VNET_DECLARE(SVCPOOL *, nfsrvd_pool); @@ -132,7 +135,7 @@ static int nfsrv_createiovecw(int, struct mbuf *, char *, struct iovec **, static void nfs_dtypetovtype(struct nfsvattr *, struct vnode *, uint8_t); static void nfsrv_pnfscreate(struct vnode *, struct vattr *, struct ucred *, NFSPROC_T *); -static void nfsrv_pnfsremovesetup(struct vnode *, NFSPROC_T *, struct vnode **, +static void nfsrv_pnfsremovesetup(struct vnode *, NFSPROC_T *, struct vnode ***, int *, char *, fhandle_t *); static void nfsrv_pnfsremove(struct vnode **, int, char *, fhandle_t *, NFSPROC_T *); @@ -141,27 +144,36 @@ static int nfsrv_proxyds(struct vnode *, off_t, int, struct ucred *, struct nfsvattr *, struct acl *, off_t *, int, bool *); static int nfsrv_setextattr(struct vnode *, struct nfsvattr *, NFSPROC_T *); static int nfsrv_readdsrpc(fhandle_t *, off_t, int, struct ucred *, - NFSPROC_T *, struct nfsmount *, struct mbuf **, struct mbuf **); + NFSPROC_T *, struct nfsmount **, int, int, uint64_t, struct mbuf **, + struct mbuf **, int *); static int nfsrv_writedsrpc(fhandle_t *, off_t, int, struct ucred *, - NFSPROC_T *, struct vnode *, struct nfsmount **, int, struct mbuf **, - char *, int *); + NFSPROC_T *, struct vnode *, struct nfsmount **, int, int, uint64_t, + struct mbuf **, char *, int *); +#ifdef notnow static int nfsrv_allocatedsrpc(fhandle_t *, off_t, off_t, struct ucred *, NFSPROC_T *, struct vnode *, struct nfsmount **, int, int *); static int nfsrv_deallocatedsrpc(fhandle_t *, off_t, off_t, struct ucred *, NFSPROC_T *, struct vnode *, struct nfsmount **, int, int *); static int nfsrv_setacldsrpc(fhandle_t *, struct ucred *, NFSPROC_T *, struct vnode *, struct nfsmount **, int, struct acl *, int *); -static int nfsrv_setattrdsrpc(fhandle_t *, struct ucred *, NFSPROC_T *, - struct vnode *, struct nfsmount **, int, struct nfsvattr *, int *); -static int nfsrv_getattrdsrpc(fhandle_t *, struct ucred *, NFSPROC_T *, - struct vnode *, struct nfsmount *, struct nfsvattr *); +#endif +static int nfsrv_setattrdsrpc(fhandle_t *, struct vnode *, struct ucred *, + NFSPROC_T *, struct nfsmount **, int, int, struct nfsvattr *, int *); +static int nfsrv_setattrdsdorpc(fhandle_t *, struct vnode *, struct ucred *, + NFSPROC_T *, struct nfsmount *, struct nfsvattr *, struct nfsvattr *); +static int nfsrv_getattrdsrpc(fhandle_t *, struct vnode *, struct ucred *, + NFSPROC_T *, struct nfsmount **, int, struct nfsvattr *, int *); +static int nfsrv_getattrdsdorpc(fhandle_t *, struct vnode *, struct ucred *, + NFSPROC_T *, struct nfsmount *, struct nfsvattr *); +#ifdef notnow static int nfsrv_seekdsrpc(fhandle_t *, off_t *, int, bool *, struct ucred *, NFSPROC_T *, struct nfsmount *); +#endif static int nfsrv_putfhname(fhandle_t *, char *); static int nfsrv_pnfslookupds(struct vnode *, struct vnode *, struct pnfsdsfile *, struct vnode **, NFSPROC_T *); -static void nfsrv_pnfssetfh(struct vnode *, struct pnfsdsfile *, char *, char *, - struct vnode *, NFSPROC_T *); +static void nfsrv_pnfssetfh(struct vnode *, struct pnfsdsfile *, char **, + char *, struct vnode *, NFSPROC_T *); static int nfsrv_dsremove(struct vnode *, char *, struct ucred *, NFSPROC_T *); static int nfsrv_dssetacl(struct vnode *, struct acl *, struct ucred *, NFSPROC_T *); @@ -330,6 +342,55 @@ SYSCTL_PROC(_vfs_nfsd, OID_AUTO, enable_locallocks, CTLTYPE_INT | CTLFLAG_MPSAFE | CTLFLAG_RW, NULL, 0, sysctl_dolocallocks, "IU", "Enable nfsd to acquire local locks on files"); +static int +sysctl_stripecnt(SYSCTL_HANDLER_ARGS) +{ + int error, newmaxstripecnt; + + newmaxstripecnt = nfsrv_maxstripecnt; + error = sysctl_handle_int(oidp, &newmaxstripecnt, 0, req); + if (error != 0 || req->newptr == NULL) + return (error); + if (newmaxstripecnt == nfsrv_maxstripecnt) + return (0); + if (newnfs_numnfsd > 0) + return (EPERM); + if (jailed(curthread->td_ucred)) + return (EINVAL); + if (newmaxstripecnt <= 0 || newmaxstripecnt > NFSDEV_MAXSTRIPE) + return (EINVAL); + nfsrv_maxstripecnt = newmaxstripecnt; + return (0); +} +SYSCTL_PROC(_vfs_nfsd, OID_AUTO, pnfsstripecnt, + CTLTYPE_INT | CTLFLAG_MPSAFE | CTLFLAG_RW, NULL, 0, + sysctl_stripecnt, "IU", "Set the #stripes for a pNFS server"); + +static int +sysctl_stripeunit(SYSCTL_HANDLER_ARGS) +{ + uint64_t newstripesiz; + int error; + + newstripesiz = nfsrv_stripesiz; + error = sysctl_handle_64(oidp, &newstripesiz, 0, req); + if (error != 0 || req->newptr == NULL) + return (error); + if (newstripesiz == nfsrv_stripesiz) + return (0); + if (newnfs_numnfsd > 0) + return (EPERM); + if (jailed(curthread->td_ucred)) + return (EINVAL); + nfsrv_stripesiz = newstripesiz; + if (newstripesiz == 0) + nfsrv_maxstripecnt = 1; + return (0); +} +SYSCTL_PROC(_vfs_nfsd, OID_AUTO, pnfsstripeunit, + CTLTYPE_U64 | CTLFLAG_MPSAFE | CTLFLAG_RW, NULL, 0, + sysctl_stripeunit, "QU", "Set the stripe unit length for a pNFS server"); + #define MAX_REORDERED_RPC 16 #define NUM_HEURISTIC 1031 #define NHUSE_INIT 64 @@ -1492,14 +1553,14 @@ int nfsvno_removesub(struct nameidata *ndp, bool is_v4, struct nfsrv_descript *nd, struct thread *p, struct nfsexstuff *exp) { - struct vnode *vp, *dsdvp[NFSDEV_MAXMIRRORS], *newvp; + struct vnode *vp, **dsdvpp, *newvp; struct mount *mp; - int error = 0, mirrorcnt, ret; + int error = 0, dsfilecnt, ret; char fname[PNFS_FILENAME_LEN + 1]; fhandle_t fh; vp = ndp->ni_vp; - dsdvp[0] = NULL; + dsdvpp = NULL; if (vp->v_type == VDIR) { error = NFSERR_ISDIR; } else if (is_v4) { @@ -1510,12 +1571,18 @@ nfsvno_removesub(struct nameidata *ndp, bool is_v4, struct nfsrv_descript *nd, error = nfsrv_checkremove(vp, 1, NULL, nd->nd_clientid, p); } - if (error == 0) - nfsrv_pnfsremovesetup(vp, p, dsdvp, &mirrorcnt, fname, &fh); - if (!error) + if (error == 0) { + nfsrv_pnfsremovesetup(vp, p, &dsdvpp, &dsfilecnt, fname, &fh); + NFSD_DEBUG(4, "nfsrv_pnfsremovesetup err=%d dsfilecnt=%d\n", + error, dsfilecnt); error = VOP_REMOVE(ndp->ni_dvp, vp, &ndp->ni_cnd); - if (error == 0 && dsdvp[0] != NULL) - nfsrv_pnfsremove(dsdvp, mirrorcnt, fname, &fh, p); + } + if (error == 0 && dsdvpp != NULL) { + nfsrv_pnfsremove(dsdvpp, dsfilecnt, fname, &fh, p); + NFSD_DEBUG(4, "aft nfsrv_pnfsremove dsfilecnt=%d fname=%s\n", + dsfilecnt, fname); + } + free(dsdvpp, M_TEMP); if (is_v4 && (nd->nd_flag & ND_NFSV41) != 0 && error == 0) error = nfsvno_getfh(vp, &fh, p); if (ndp->ni_dvp == vp) @@ -1596,13 +1663,13 @@ int nfsvno_rename(struct nameidata *fromndp, struct nameidata *tondp, struct nfsrv_descript *nd, struct thread *p) { - struct vnode *fvp, *tvp, *tdvp, *dsdvp[NFSDEV_MAXMIRRORS], *newvp; + struct vnode *fvp, *tvp, *tdvp, **dsdvpp, *newvp; struct mount *mp; - int error = 0, mirrorcnt, ret; + int error = 0, dsfilecnt, ret; char fname[PNFS_FILENAME_LEN + 1]; fhandle_t fh, fh2; - dsdvp[0] = NULL; + dsdvpp = NULL; fvp = fromndp->ni_vp; if (nd->nd_repstat != 0) { vrele(fromndp->ni_dvp); @@ -1693,10 +1760,10 @@ nfsvno_rename(struct nameidata *fromndp, struct nameidata *tondp, if ((nd->nd_flag & ND_NFSV41) != 0) error = nfsvno_getfh(tvp, &fh2, p); if (error == 0) - nfsrv_pnfsremovesetup(tvp, p, dsdvp, &mirrorcnt, fname, - &fh); + nfsrv_pnfsremovesetup(tvp, p, &dsdvpp, &dsfilecnt, + fname, &fh); NFSD_DEBUG(4, "nfsvno_rename: pnfsremovesetup" - " dsdvp=%p\n", dsdvp[0]); + " dsdvpp=%p\n", dsdvpp); } out: mp = NULL; @@ -1738,13 +1805,14 @@ out: } /* - * If dsdvp[0] != NULL, it was set up by nfsrv_pnfsremovesetup() and + * If dsdvpp != NULL, it was set up by nfsrv_pnfsremovesetup() and * if the rename succeeded, the DS file for the tvp needs to be * removed. */ - if (error == 0 && dsdvp[0] != NULL) { - nfsrv_pnfsremove(dsdvp, mirrorcnt, fname, &fh, p); + if (error == 0 && dsdvpp != NULL) { + nfsrv_pnfsremove(dsdvpp, dsfilecnt, fname, &fh, p); NFSD_DEBUG(4, "nfsvno_rename: pnfsremove\n"); + free(dsdvpp, M_TEMP); } /* Use ret to determine if the file still exists. */ @@ -4678,26 +4746,31 @@ static void nfsrv_pnfscreate(struct vnode *vp, struct vattr *vap, struct ucred *cred, NFSPROC_T *p) { - struct nfsrvdscreate *dsc, *tdsc = NULL; + struct nfsrvdscreate *dsc, *tdsc; struct nfsdevice *ds, *tds, *fds; struct mount *mp; struct pnfsdsfile *pf, *tpf; struct pnfsdsattr dsattr; struct vattr va; - struct vnode *dvp[NFSDEV_MAXMIRRORS]; + struct vnode **dvp; struct nfsmount *nmp; fhandle_t fh; uid_t vauid; gid_t vagid; u_short vamode; struct ucred *tcred; - int dsdir[NFSDEV_MAXMIRRORS], error, i, mirrorcnt, ret; + int *dsdir, error, i, j, mirrorcnt, ret, stripecnt; int failpos, timo; /* Get a DS server directory in a round-robin order. */ mirrorcnt = 1; mp = vp->v_mount; ds = fds = NULL; + i = j = 0; + dvp = malloc(sizeof(*dvp) * nfsrv_maxpnfsmirror * nfsrv_maxstripecnt, + M_TEMP, M_WAITOK); + dsdir = malloc(sizeof(*dsdir) * nfsrv_maxpnfsmirror * + nfsrv_maxstripecnt, M_TEMP, M_WAITOK); NFSDDSLOCK(); /* * Search for the first entry that handles this MDS fs, but use the @@ -4705,49 +4778,85 @@ nfsrv_pnfscreate(struct vnode *vp, struct vattr *vap, struct ucred *cred, */ TAILQ_FOREACH(tds, &nfsrv_devidhead, nfsdev_list) { if (tds->nfsdev_nmp != NULL) { + i++; if (tds->nfsdev_mdsisset == 0 && ds == NULL) ds = tds; else if (tds->nfsdev_mdsisset != 0 && fsidcmp( &mp->mnt_stat.f_fsid, &tds->nfsdev_mdsfsid) == 0) { - ds = fds = tds; - break; + if (j == 0) + ds = fds = tds; + if (nfsrv_stripesiz == 0) + break; + j++; } } } if (ds == NULL) { NFSDDSUNLOCK(); + free(dvp, M_TEMP); + free(dsdir, M_TEMP); NFSD_DEBUG(4, "nfsrv_pnfscreate: no srv\n"); return; } + + /* + * i or j is the count of devices. The stripecnt is that number + * of devices devided by the number of mirrors. + */ + stripecnt = 0; + if (nfsrv_stripesiz > 0) { + if (j > 0) + stripecnt = j / nfsrv_maxpnfsmirror; + else + stripecnt = i / nfsrv_maxpnfsmirror; + if (stripecnt > nfsrv_maxstripecnt) + stripecnt = nfsrv_maxstripecnt; + } + if (stripecnt == 0) + stripecnt = 1; + + /* Set the first device as found above. */ i = dsdir[0] = ds->nfsdev_nextdir; ds->nfsdev_nextdir = (ds->nfsdev_nextdir + 1) % nfsrv_dsdirsize; dvp[0] = ds->nfsdev_dsdir[i]; tds = TAILQ_NEXT(ds, nfsdev_list); - if (nfsrv_maxpnfsmirror > 1 && tds != NULL) { + if ((nfsrv_maxpnfsmirror > 1 || stripecnt > 1) && tds != NULL) { + j = 1; /* Stripe number */ + mirrorcnt = 0; TAILQ_FOREACH_FROM(tds, &nfsrv_devidhead, nfsdev_list) { if (tds->nfsdev_nmp != NULL && ((tds->nfsdev_mdsisset == 0 && fds == NULL) || (tds->nfsdev_mdsisset != 0 && fds != NULL && fsidcmp(&mp->mnt_stat.f_fsid, &tds->nfsdev_mdsfsid) == 0))) { - dsdir[mirrorcnt] = i; - dvp[mirrorcnt] = tds->nfsdev_dsdir[i]; - mirrorcnt++; - if (mirrorcnt >= nfsrv_maxpnfsmirror) - break; + dsdir[mirrorcnt * stripecnt + j] = i; + dvp[mirrorcnt * stripecnt + j] = + tds->nfsdev_dsdir[i]; + j++; + if (j >= stripecnt) { + mirrorcnt++; + if (mirrorcnt >= nfsrv_maxpnfsmirror) + break; + j = 0; + } } } - } + if (mirrorcnt == 0) { + mirrorcnt = 1; + stripecnt = j; + } + } else + stripecnt = 1; /* Put at end of list to implement round-robin usage. */ TAILQ_REMOVE(&nfsrv_devidhead, ds, nfsdev_list); TAILQ_INSERT_TAIL(&nfsrv_devidhead, ds, nfsdev_list); NFSDDSUNLOCK(); dsc = NULL; - if (mirrorcnt > 1) - tdsc = dsc = malloc(sizeof(*dsc) * (mirrorcnt - 1), M_TEMP, + j = mirrorcnt * stripecnt; + if (j > 1) + tdsc = dsc = malloc(sizeof(*dsc) * (j - 1), M_TEMP, M_WAITOK | M_ZERO); - tpf = pf = malloc(sizeof(*pf) * nfsrv_maxpnfsmirror, M_TEMP, M_WAITOK | - M_ZERO); + tpf = pf = malloc(sizeof(*pf) * j, M_TEMP, M_WAITOK | M_ZERO); error = nfsvno_getfh(vp, &fh, p); if (error == 0) @@ -4775,7 +4884,9 @@ nfsrv_pnfscreate(struct vnode *vp, struct vattr *vap, struct ucred *cred, * additional mirrors. */ failpos = -1; - for (i = 0; i < mirrorcnt - 1 && error == 0; i++, tpf++, tdsc++) { + for (i = 0; i < j - 1 && error == 0; i++, tpf++, tdsc++) { + tpf->dsf_stripecnt = stripecnt; + tpf->dsf_stripesiz = nfsrv_stripesiz; tpf->dsf_dir = dsdir[i]; tdsc->tcred = tcred; tdsc->p = p; @@ -4806,11 +4917,13 @@ nfsrv_pnfscreate(struct vnode *vp, struct vattr *vap, struct ucred *cred, } } if (error == 0) { - tpf->dsf_dir = dsdir[mirrorcnt - 1]; - error = nfsrv_dscreate(dvp[mirrorcnt - 1], vap, &va, &fh, tpf, + tpf->dsf_stripecnt = stripecnt; + tpf->dsf_stripesiz = nfsrv_stripesiz; + tpf->dsf_dir = dsdir[j - 1]; + error = nfsrv_dscreate(dvp[j - 1], vap, &va, &fh, tpf, &dsattr, NULL, tcred, p, NULL); - if (failpos == -1 && mirrorcnt > 1 && nfsds_failerr(error)) { - failpos = mirrorcnt - 1; + if (failpos == -1 && j > 1 && nfsds_failerr(error)) { + failpos = j - 1; error = 0; } } @@ -4818,7 +4931,7 @@ nfsrv_pnfscreate(struct vnode *vp, struct vattr *vap, struct ucred *cred, if (timo < 1) timo = 1; /* Wait for kernel task(s) to complete. */ - for (tdsc = dsc, i = 0; i < mirrorcnt - 1; i++, tdsc++) { + for (tdsc = dsc, i = 0; i < j - 1; i++, tdsc++) { while (tdsc->inprog != 0 && tdsc->done == 0) tsleep(&tdsc->tsk, PVFS, "srvdcr", timo); if (tdsc->err != 0) { @@ -4830,7 +4943,7 @@ nfsrv_pnfscreate(struct vnode *vp, struct vattr *vap, struct ucred *cred, } /* - * If failpos has been set, that mirror has failed, so it needs + * If failpos has been set, that DS has failed, so it needs * to be disabled. */ if (failpos >= 0) { @@ -4855,25 +4968,12 @@ nfsrv_pnfscreate(struct vnode *vp, struct vattr *vap, struct ucred *cred, NFSFREECRED(tcred); if (error == 0) { ASSERT_VOP_ELOCKED(vp, "nfsrv_pnfscreate vp"); - - NFSD_DEBUG(4, "nfsrv_pnfscreate: mirrorcnt=%d maxmirror=%d\n", - mirrorcnt, nfsrv_maxpnfsmirror); - /* - * For all mirrors that couldn't be created, fill in the - * *pf structure, but with an IP address == 0.0.0.0. - */ - tpf = pf + mirrorcnt; - for (i = mirrorcnt; i < nfsrv_maxpnfsmirror; i++, tpf++) { - *tpf = *pf; - tpf->dsf_sin.sin_family = AF_INET; - tpf->dsf_sin.sin_len = sizeof(struct sockaddr_in); - tpf->dsf_sin.sin_addr.s_addr = 0; - tpf->dsf_sin.sin_port = 0; - } + NFSD_DEBUG(4, "nfsrv_pnfscreate: mirrorcnt=%d stripecnt=%d\n", + mirrorcnt, stripecnt); error = vn_extattr_set(vp, IO_NODELOCKED, EXTATTR_NAMESPACE_SYSTEM, "pnfsd.dsfile", - sizeof(*pf) * nfsrv_maxpnfsmirror, (char *)pf, p); + sizeof(*pf) * mirrorcnt * stripecnt, (char *)pf, p); if (error == 0) error = vn_extattr_set(vp, IO_NODELOCKED, EXTATTR_NAMESPACE_SYSTEM, "pnfsd.dsattr", @@ -4883,25 +4983,28 @@ nfsrv_pnfscreate(struct vnode *vp, struct vattr *vap, struct ucred *cred, error); } else printf("pNFS: pnfscreate=%d\n", error); + free(dvp, M_TEMP); + free(dsdir, M_TEMP); free(pf, M_TEMP); free(dsc, M_TEMP); } /* * Get the information needed to remove the pNFS Data Server file from the - * Metadata file. Upon success, ddvp is set non-NULL to the locked - * DS directory vnode. The caller must unlock *ddvp when done with it. + * Metadata file. Upon success, *dvppp is set to an array of locked + * DS directory vnode(s). The caller must unlock this array of *dvp when done + * with it. */ static void -nfsrv_pnfsremovesetup(struct vnode *vp, NFSPROC_T *p, struct vnode **dvpp, - int *mirrorcntp, char *fname, fhandle_t *fhp) +nfsrv_pnfsremovesetup(struct vnode *vp, NFSPROC_T *p, struct vnode ***dvppp, + int *dsfilecntp, char *fname, fhandle_t *fhp) { struct vattr va; struct ucred *tcred; char *buf; int buflen, error; - dvpp[0] = NULL; + *dvppp = NULL; /* If not an exported regular file or not a pNFS server, just return. */ if (vp->v_type != VREG || (vp->v_mount->mnt_flag & MNT_EXPORTED) == 0 || nfsrv_devidcnt == 0) @@ -4924,11 +5027,12 @@ nfsrv_pnfsremovesetup(struct vnode *vp, NFSPROC_T *p, struct vnode **dvpp, return; } - buflen = 1024; + buflen = sizeof(struct pnfsdsfile) * NFSDEV_MAXMIRRORS * + NFSDEV_MAXSTRIPE; buf = malloc(buflen, M_TEMP, M_WAITOK); /* Get the directory vnode for the DS mount and the file handle. */ - error = nfsrv_dsgetsockmnt(vp, 0, buf, &buflen, mirrorcntp, p, dvpp, - NULL, NULL, fname, NULL, NULL, NULL, NULL, NULL); + error = nfsrv_dsgetsockmnt(vp, 0, buf, &buflen, dsfilecntp, NULL, NULL, + p, dvppp, NULL, NULL, fname, NULL, NULL, NULL, NULL, NULL); free(buf, M_TEMP); if (error != 0) printf("pNFS: nfsrv_pnfsremovesetup getsockmnt=%d\n", error); @@ -5006,28 +5110,31 @@ start_dsremove(void *arg, int pending) * removed to set up the dvp and fill in the FH. */ static void -nfsrv_pnfsremove(struct vnode **dvp, int mirrorcnt, char *fname, fhandle_t *fhp, - NFSPROC_T *p) +nfsrv_pnfsremove(struct vnode **dvpp, int dsfilecnt, char *fname, + fhandle_t *fhp, NFSPROC_T *p) { struct ucred *tcred; struct nfsrvdsremove *dsrm, *tdsrm; struct nfsdevice *ds; struct nfsmount *nmp; + struct vnode **tdvpp; int failpos, i, ret, timo; tcred = newnfs_getcred(); dsrm = NULL; - if (mirrorcnt > 1) - dsrm = malloc(sizeof(*dsrm) * mirrorcnt - 1, M_TEMP, M_WAITOK); + if (dsfilecnt > 1) + tdsrm = dsrm = malloc(sizeof(*dsrm) * dsfilecnt - 1, M_TEMP, + M_WAITOK); /* * Remove the file on each DS mirror, using kernel process(es) for the * additional mirrors. */ failpos = -1; - for (tdsrm = dsrm, i = 0; i < mirrorcnt - 1; i++, tdsrm++) { + tdvpp = dvpp; + for (i = 0; i < dsfilecnt - 1; i++, tdsrm++, tdvpp++) { tdsrm->tcred = tcred; tdsrm->p = p; - tdsrm->dvp = dvp[i]; + tdsrm->dvp = *tdvpp; strlcpy(tdsrm->fname, fname, PNFS_FILENAME_LEN + 1); tdsrm->inprog = 0; tdsrm->done = 0; @@ -5038,19 +5145,19 @@ nfsrv_pnfsremove(struct vnode **dvp, int mirrorcnt, char *fname, fhandle_t *fhp, NFSD_DEBUG(4, "nfsrv_pnfsremove: nfs_pnfsio=%d\n", ret); } if (ret != 0) { - ret = nfsrv_dsremove(dvp[i], fname, tcred, p); + ret = nfsrv_dsremove(tdsrm->dvp, fname, tcred, p); if (failpos == -1 && nfsds_failerr(ret)) failpos = i; } } - ret = nfsrv_dsremove(dvp[mirrorcnt - 1], fname, tcred, p); - if (failpos == -1 && mirrorcnt > 1 && nfsds_failerr(ret)) - failpos = mirrorcnt - 1; + ret = nfsrv_dsremove(*tdvpp, fname, tcred, p); + if (failpos == -1 && dsfilecnt > 1 && nfsds_failerr(ret)) + failpos = dsfilecnt - 1; timo = hz / 50; /* Wait for 20msec. */ if (timo < 1) timo = 1; /* Wait for kernel task(s) to complete. */ - for (tdsrm = dsrm, i = 0; i < mirrorcnt - 1; i++, tdsrm++) { + for (tdsrm = dsrm, i = 0; i < dsfilecnt - 1; i++, tdsrm++) { while (tdsrm->inprog != 0 && tdsrm->done == 0) tsleep(&tdsrm->tsk, PVFS, "srvdsrm", timo); if (failpos == -1 && nfsds_failerr(tdsrm->err)) @@ -5062,7 +5169,8 @@ nfsrv_pnfsremove(struct vnode **dvp, int mirrorcnt, char *fname, fhandle_t *fhp, * to be disabled. */ if (failpos >= 0) { - nmp = VFSTONFS(dvp[failpos]->v_mount); + tdvpp = dvpp + failpos; + nmp = VFSTONFS((*tdvpp)->v_mount); NFSLOCKMNT(nmp); if ((nmp->nm_privflag & (NFSMNTP_FORCEDISM | NFSMNTP_CANCELRPCS)) == 0) { @@ -5147,14 +5255,16 @@ nfsrv_proxyds(struct vnode *vp, off_t off, int cnt, struct ucred *cred, struct mbuf **mpp2, struct nfsvattr *nap, struct acl *aclp, off_t *offp, int content, bool *eofp) { - struct nfsmount *nmp[NFSDEV_MAXMIRRORS], *failnmp; - fhandle_t fh[NFSDEV_MAXMIRRORS]; - struct vnode *dvp[NFSDEV_MAXMIRRORS]; + struct nfsmount **nmp, *failnmp; + fhandle_t *fhp; + struct vnode **dvp; struct nfsdevice *ds; struct pnfsdsattr dsattr; struct opnfsdsattr odsattr; char *buf; int buflen, error, failpos, i, mirrorcnt, origmircnt, trycnt; + int stripecnt; + uint64_t stripesiz; NFSD_DEBUG(4, "in nfsrv_proxyds\n"); /* @@ -5165,7 +5275,8 @@ nfsrv_proxyds(struct vnode *vp, off_t off, int cnt, struct ucred *cred, nfsrv_devidcnt == 0) return (ENOENT); - buflen = 1024; + buflen = sizeof(struct pnfsdsfile) * NFSDEV_MAXMIRRORS * + NFSDEV_MAXSTRIPE; buf = malloc(buflen, M_TEMP, M_WAITOK); error = 0; @@ -5228,23 +5339,34 @@ nfsrv_proxyds(struct vnode *vp, off_t off, int cnt, struct ucred *cred, origmircnt = -1; trycnt = 0; tryagain: + nmp = NULL; + dvp = NULL; + fhp = NULL; if (error == 0) { - buflen = 1024; + buflen = sizeof(struct pnfsdsfile) * NFSDEV_MAXMIRRORS * + NFSDEV_MAXSTRIPE; if (ioproc == NFSPROC_READDS && NFSVOPISLOCKED(vp) == LK_EXCLUSIVE) printf("nfsrv_proxyds: Readds vp exclusively locked\n"); + if (ioproc == NFSPROC_WRITEDS && NFSVOPISLOCKED(vp) == + LK_SHARED) + NFSVOPLOCK(vp, LK_UPGRADE | LK_RETRY); error = nfsrv_dsgetsockmnt(vp, LK_SHARED, buf, &buflen, - &mirrorcnt, p, dvp, fh, NULL, NULL, NULL, NULL, NULL, - NULL, NULL); + &mirrorcnt, &stripecnt, &stripesiz, p, &dvp, &fhp, NULL, + NULL, NULL, NULL, NULL, NULL, NULL); if (error == 0) { - for (i = 0; i < mirrorcnt; i++) - nmp[i] = VFSTONFS(dvp[i]->v_mount); + nmp = malloc(sizeof(*nmp) * mirrorcnt * stripecnt, + M_TEMP, M_WAITOK); + for (i = 0; i < mirrorcnt * stripecnt; i++) + if (dvp[i] != NULL) + nmp[i] = VFSTONFS(dvp[i]->v_mount); + else + nmp[i] = NULL; } else printf("pNFS: proxy getextattr sockaddr=%d\n", error); } else printf("pNFS: nfsrv_dsgetsockmnt=%d\n", error); if (error == 0) { - failpos = -1; if (origmircnt == -1) origmircnt = mirrorcnt; /* @@ -5261,24 +5383,26 @@ tryagain: * failed mirror#. */ if (ioproc == NFSPROC_READDS) { - error = nfsrv_readdsrpc(fh, off, cnt, cred, p, nmp[0], - mpp, mpp2); - if (nfsds_failerr(error) && mirrorcnt > 1) { + error = nfsrv_readdsrpc(fhp, off, cnt, cred, p, nmp, + mirrorcnt, stripecnt, stripesiz, mpp, mpp2, + &failpos); + if (failpos >= 0 && mirrorcnt > 1) { /* * Setting failpos will cause the mirror * to be disabled and then a retry of this * read is required. */ - failpos = 0; error = 0; trycnt++; } } else if (ioproc == NFSPROC_WRITEDS) - error = nfsrv_writedsrpc(fh, off, cnt, cred, p, vp, - &nmp[0], mirrorcnt, mpp, cp, &failpos); + error = nfsrv_writedsrpc(fhp, off, cnt, cred, p, vp, + nmp, mirrorcnt, stripecnt, stripesiz, mpp, cp, + &failpos); else if (ioproc == NFSPROC_SETATTR) - error = nfsrv_setattrdsrpc(fh, cred, p, vp, &nmp[0], - mirrorcnt, nap, &failpos); + error = nfsrv_setattrdsrpc(fhp, vp, cred, p, nmp, + mirrorcnt, stripecnt, nap, &failpos); +#ifdef notnow else if (ioproc == NFSPROC_SETACL) error = nfsrv_setacldsrpc(fh, cred, p, vp, &nmp[0], mirrorcnt, aclp, &failpos); @@ -5301,16 +5425,16 @@ tryagain: else if (ioproc == NFSPROC_DEALLOCATE) error = nfsrv_deallocatedsrpc(fh, off, *offp, cred, p, vp, &nmp[0], mirrorcnt, &failpos); +#endif else { - error = nfsrv_getattrdsrpc(&fh[mirrorcnt - 1], cred, p, - vp, nmp[mirrorcnt - 1], nap); - if (nfsds_failerr(error) && mirrorcnt > 1) { + error = nfsrv_getattrdsrpc(fhp, vp, cred, p, + nmp, stripecnt, nap, &failpos); + if (failpos >= 0 && mirrorcnt > 1) { /* * Setting failpos will cause the mirror * to be disabled and then a retry of this * getattr is required. */ - failpos = mirrorcnt - 1; error = 0; trycnt++; } @@ -5335,8 +5459,12 @@ tryagain: } NFSUNLOCKMNT(failnmp); } - for (i = 0; i < mirrorcnt; i++) - NFSVOPUNLOCK(dvp[i]); + for (i = 0; i < mirrorcnt * stripecnt; i++) + if (dvp[i] != NULL) + NFSVOPUNLOCK(dvp[i]); + free(dvp, M_TEMP); + free(nmp, M_TEMP); + free(fhp, M_TEMP); NFSD_DEBUG(4, "nfsrv_proxyds: aft RPC=%d trya=%d\n", error, trycnt); /* Try the Read/Getattr again if a mirror was deleted. */ @@ -5361,46 +5489,111 @@ tryagain: */ int nfsrv_dsgetsockmnt(struct vnode *vp, int lktype, char *buf, int *buflenp, - int *mirrorcntp, NFSPROC_T *p, struct vnode **dvpp, fhandle_t *fhp, - char *devid, char *fnamep, struct vnode **nvpp, struct nfsmount **newnmpp, + int *mirrorcntp, int *stripecntp, uint64_t *stripesizp, NFSPROC_T *p, + struct vnode ***dvppp, fhandle_t **fhpp, + char **devid, char *fnamep, struct vnode **nvpp, struct nfsmount **newnmpp, struct nfsmount *curnmp, int *ippos, int *dsdirp) { struct vnode *dvp, *nvp = NULL, **tdvpp; struct mount *mp; struct nfsmount *nmp, *newnmp; + fhandle_t *tfhp; struct sockaddr *sad; struct sockaddr_in *sin; struct nfsdevice *ds, *tds, *fndds; struct pnfsdsfile *pf; + struct opnfsdsfile *opf; uint32_t dsdir; - int error, fhiszero, fnd, gotone, i, mirrorcnt; + int error, fhiszero, fnd, gotmirror, gotone, i, j, k, l, m, mirrorcnt; + char *tdevid; + bool dvplocked; ASSERT_VOP_LOCKED(vp, "nfsrv_dsgetsockmnt vp"); - *mirrorcntp = 1; - tdvpp = dvpp; + dvplocked = false; + if (mirrorcntp != NULL) + *mirrorcntp = 1; + if (stripecntp != NULL) + *stripecntp = 1; + if (stripesizp != NULL) + *stripesizp = 0; if (nvpp != NULL) *nvpp = NULL; - if (dvpp != NULL) - *dvpp = NULL; + if (dvppp != NULL) + *dvppp = NULL; + if (fhpp != NULL) + *fhpp = NULL; + if (devid != NULL) + *devid = NULL; + tdevid = NULL; if (ippos != NULL) *ippos = -1; if (newnmpp != NULL) newnmp = *newnmpp; else newnmp = NULL; + if (fnamep != NULL) + fnamep[0] = '\0'; mp = vp->v_mount; error = vn_extattr_get(vp, IO_NODELOCKED, EXTATTR_NAMESPACE_SYSTEM, "pnfsd.dsfile", buflenp, buf, p); - mirrorcnt = *buflenp / sizeof(*pf); - if (error == 0 && (mirrorcnt < 1 || mirrorcnt > NFSDEV_MAXMIRRORS || - *buflenp != sizeof(*pf) * mirrorcnt)) + if (error == 0 && *buflenp > 0) { + j = *buflenp / sizeof(*pf); + if (*buflenp != sizeof(*pf) * j) { + /* Try opnfsdsfile. */ + j = *buflenp / sizeof(*opf); + if (j >= 1 && *buflenp == sizeof(*opf) * j) { + char *tbuf; + + tbuf = malloc(*buflenp, M_TEMP, M_WAITOK); + memcpy(tbuf, buf, *buflenp); + pf = (struct pnfsdsfile *)buf; + opf = (struct opnfsdsfile *)tbuf; + for (k = 0; k < j; k++, pf++, opf++) { + memcpy(&pf->dsf_fh, opf, sizeof(*opf)); + pf->dsf_stripecnt = 1; + pf->dsf_stripesiz = 0; + } + free(tbuf, M_TEMP); + } else + error = ENOATTR; + } else if (j < 1) + error = ENOATTR; + } else if (error == 0) error = ENOATTR; + if (error != 0) + return (error); pf = (struct pnfsdsfile *)buf; + if (pf->dsf_stripesiz > 0) { + mirrorcnt = j / pf->dsf_stripecnt; + k = pf->dsf_stripecnt; + } else { + mirrorcnt = j; + k = 1; + } + if (mirrorcnt < 1 || mirrorcnt > NFSDEV_MAXMIRRORS || + k < 1 || k > NFSDEV_MAXSTRIPE || j != mirrorcnt * k) + return (ENOATTR); + if (stripecntp != NULL) + *stripecntp = k; + if (stripesizp != NULL) + *stripesizp = pf->dsf_stripesiz; + + /* Allocate a large enough array for dvppp, if required. */ + if (dvppp != NULL) + tdvpp = *dvppp = malloc(sizeof(*tdvpp) * mirrorcnt * k, M_TEMP, + M_WAITOK | M_ZERO); + if (fhpp != NULL) + tfhp = *fhpp = malloc(sizeof(*tfhp) * mirrorcnt * k, M_TEMP, + M_WAITOK); + if (devid != NULL) + tdevid = *devid = malloc(NFSX_V4DEVICEID * mirrorcnt * k, + M_TEMP, M_WAITOK); + /* If curnmp != NULL, check for a match in the mirror list. */ - if (curnmp != NULL && error == 0) { + if (curnmp != NULL) { fnd = 0; - for (i = 0; i < mirrorcnt; i++, pf++) { + for (i = 0; i < j; i += k, pf += k) { sad = (struct sockaddr *)&pf->dsf_sin; if (nfsaddr2_match(sad, curnmp->nm_nam)) { if (ippos != NULL) @@ -5413,11 +5606,12 @@ nfsrv_dsgetsockmnt(struct vnode *vp, int lktype, char *buf, int *buflenp, error = ENXIO; } - gotone = 0; + gotmirror = gotone = 0; + l = 0; /* Index for tdvpp and tfhp. */ pf = (struct pnfsdsfile *)buf; - NFSD_DEBUG(4, "nfsrv_dsgetsockmnt: mirrorcnt=%d err=%d\n", mirrorcnt, - error); - for (i = 0; i < mirrorcnt && error == 0; i++, pf++) { + NFSD_DEBUG(4, "nfsrv_dsgetsockmnt: mirrorcnt=%d stripecnt=%d err=%d\n", + mirrorcnt, k, error); + for (i = 0; i < j && error == 0; i++, pf++) { fhiszero = 0; sad = (struct sockaddr *)&pf->dsf_sin; sin = &pf->dsf_sin; @@ -5486,6 +5680,7 @@ nfsrv_dsgetsockmnt(struct vnode *vp, int lktype, char *buf, int *buflenp, dvp = fndds->nfsdev_dsdir[dsdir]; if (lktype != 0 || fhiszero != 0 || (nvpp != NULL && *nvpp == NULL)) { + dvplocked = true; if (fhiszero != 0) error = vn_lock(dvp, LK_EXCLUSIVE); @@ -5525,16 +5720,21 @@ nfsrv_dsgetsockmnt(struct vnode *vp, int lktype, char *buf, int *buflenp, if (error == 0) { gotone++; NFSD_DEBUG(4, "gotone=%d\n", gotone); - if (devid != NULL) { + if (tdevid != NULL) { NFSBCOPY(fndds->nfsdev_deviceid, - devid, NFSX_V4DEVICEID); *** 1111 LINES SKIPPED ***
