On Thu, Nov 22, 2012 at 12:46:54PM +0100, Manuel Bouyer wrote: > Here's another patch, after comments from chs@ in a private mail. > > There really are 2 parts here: > - avoid unecessery list walk at truncate time. This is the change to > uvm_vnp_setsize() (truncate between pgend and oldsize instead of 0, which > always triggers a list walk - we know there is nothing beyond oldsize) > and ffs_truncate() (vtruncbuf() is needed only for non-VREG files, > and will always trigger a list walk). > > This help for the local write case, where on my test system syscalls/s rise > from 170 (when write succeed) to 570 (when EDQUOT is returned). Without > this patch, the syscalls/s would fall to less than 20. > > But this doesn't help much for nfsd, which can get write requests way beyond > the real end of file (the client's idea of end of file is wrong). > In such a case, the distance between pgend and oldsize in uvm_vnp_setsize() > is large enough to trigger a list walk, even through there is really only > a few pages to free. I guess fixing this would require an interface change > to pass to uvm_vnp_setsize() for which the allocation did really take place. > But I found a workaround: when a write error occurs, free all the > pages associated with the vnode in ffs_write(). This way, on subsequent writes > only new pages should be in the list, and freeing them is fast. > > With this change, the nfsd performances don't change when the quota limit > is hit.
In fact, with this patch alone, the nfsd performances drops from about 1250 writes/s to 700 when the quota limit is hit (and we keep a disk activity of 140MB/s when no data write occurs). To preserve the performance, the attached patch is also needed. What this does is detect early that we'll have an allocation failure in ffs_balloc(), and bail out before we allocated any indirect block that we would need to freed in the error path. This needs an additionnal flag to to chkdq(), to test a quota allocation without updating the quota values. -- Manuel Bouyer <bou...@antioche.eu.org> NetBSD: 26 ans d'experience feront toujours la difference --
Index: ffs/ffs_alloc.c =================================================================== RCS file: /cvsroot/src/sys/ufs/ffs/ffs_alloc.c,v retrieving revision 1.130 diff -u -p -r1.130 ffs_alloc.c --- ffs/ffs_alloc.c 28 Nov 2011 08:05:07 -0000 1.130 +++ ffs/ffs_alloc.c 22 Nov 2012 17:27:28 -0000 @@ -110,7 +110,6 @@ static daddr_t ffs_alloccg(struct inode static daddr_t ffs_alloccgblk(struct inode *, struct buf *, daddr_t, int); static ino_t ffs_dirpref(struct inode *); static daddr_t ffs_fragextend(struct inode *, int, daddr_t, int, int); -static void ffs_fserr(struct fs *, u_int, const char *); static daddr_t ffs_hashalloc(struct inode *, int, daddr_t, int, int, daddr_t (*)(struct inode *, int, daddr_t, int, int)); static daddr_t ffs_nodealloccg(struct inode *, int, daddr_t, int, int); @@ -2014,17 +2013,3 @@ ffs_mapsearch(struct fs *fs, struct cg * panic("ffs_alloccg: block not in map"); /* return (-1); */ } - -/* - * Fserr prints the name of a file system with an error diagnostic. - * - * The form of the error message is: - * fs: error message - */ -static void -ffs_fserr(struct fs *fs, u_int uid, const char *cp) -{ - - log(LOG_ERR, "uid %d, pid %d, command %s, on %s: %s\n", - uid, curproc->p_pid, curproc->p_comm, fs->fs_fsmnt, cp); -} Index: ffs/ffs_balloc.c =================================================================== RCS file: /cvsroot/src/sys/ufs/ffs/ffs_balloc.c,v retrieving revision 1.54 diff -u -p -r1.54 ffs_balloc.c --- ffs/ffs_balloc.c 23 Apr 2011 07:36:02 -0000 1.54 +++ ffs/ffs_balloc.c 22 Nov 2012 17:27:28 -0000 @@ -111,6 +111,7 @@ ffs_balloc_ufs1(struct vnode *vp, off_t int32_t *blkp, *allocblk, allociblk[NIADDR + 1]; int32_t *allocib; int unwindidx = -1; + off_t wantedblks; #ifdef FFS_EI const int needswap = UFS_FSNEEDSWAP(fs); #endif @@ -263,9 +264,43 @@ ffs_balloc_ufs1(struct vnode *vp, off_t return (error); /* + * before doing any allocations, check if we have enough space. + * This avoids going through allocations/deallocations if + * the filesystem or quota is short in space, and a badly-written + * software keeps retrying the write. + * Here we may check a size larger than what's really needed + * is indirect blocks are already there, but it's too large + * by at most 3 blocks. Not a big deal. + */ + wantedblks = (flags & B_METAONLY) ? num : (num + 1); + if (kauth_authorize_system(cred, KAUTH_SYSTEM_FS_RESERVEDSPACE, 0, + NULL, NULL, NULL) != 0) { + if (blkstofrags(fs, wantedblks) > freespace(fs, fs->fs_minfree)) { + ffs_fserr(fs, kauth_cred_geteuid(cred), + "file system full"); + uprintf("\n%s: write failed, file system is full\n", + fs->fs_fsmnt); + return (ENOSPC); + } + } else { + if (wantedblks > fs->fs_cstotal.cs_nbfree) { + ffs_fserr(fs, kauth_cred_geteuid(cred), + "file system full"); + uprintf("\n%s: write failed, file system is full\n", + fs->fs_fsmnt); + return (ENOSPC); + } + } + +#if defined(QUOTA) || defined(QUOTA2) + if ((error = chkdq(ip, btodb(lblktosize(fs, wantedblks)), cred, + TRYONLY)) != 0) + return error; +#endif + + /* * Fetch the first indirect block allocating if necessary. */ - --num; nb = ufs_rw32(ip->i_ffs1_ib[indirs[0].in_off], needswap); allocib = NULL; @@ -533,6 +571,7 @@ ffs_balloc_ufs2(struct vnode *vp, off_t daddr_t *blkp, *allocblk, allociblk[NIADDR + 1]; int64_t *allocib; int unwindidx = -1; + off_t wantedblks; #ifdef FFS_EI const int needswap = UFS_FSNEEDSWAP(fs); #endif @@ -793,6 +832,40 @@ ffs_balloc_ufs2(struct vnode *vp, off_t return (error); /* + * before doing any allocations, check if we have enough space. + * This avoids going through allocations/deallocations if + * the filesystem or quota is short in space, and a badly-written + * software keeps retrying the write. + * Here we may check a size larger than what's really needed + * is indirect blocks are already there, but it's too large + * by at most 3 blocks. Not a big deal. + */ + wantedblks = (flags & B_METAONLY) ? num : (num + 1); + if (kauth_authorize_system(cred, KAUTH_SYSTEM_FS_RESERVEDSPACE, 0, + NULL, NULL, NULL) != 0) { + if (blkstofrags(fs, wantedblks) > freespace(fs, fs->fs_minfree)) { + ffs_fserr(fs, kauth_cred_geteuid(cred), + "file system full"); + uprintf("\n%s: write failed, file system is full\n", + fs->fs_fsmnt); + return (ENOSPC); + } + } else { + if (wantedblks > fs->fs_cstotal.cs_nbfree) { + ffs_fserr(fs, kauth_cred_geteuid(cred), + "file system full"); + uprintf("\n%s: write failed, file system is full\n", + fs->fs_fsmnt); + return (ENOSPC); + } + } + +#if defined(QUOTA) || defined(QUOTA2) + if ((error = chkdq(ip, btodb(lblktosize(fs, wantedblks)), cred, + TRYONLY)) != 0) + return error; +#endif + /* * Fetch the first indirect block allocating if necessary. */ Index: ffs/ffs_extern.h =================================================================== RCS file: /cvsroot/src/sys/ufs/ffs/ffs_extern.h,v retrieving revision 1.78 diff -u -p -r1.78 ffs_extern.h --- ffs/ffs_extern.h 17 Jun 2011 14:23:52 -0000 1.78 +++ ffs/ffs_extern.h 22 Nov 2012 17:27:28 -0000 @@ -193,6 +193,7 @@ void ffs_cg_swap(struct cg *, struct cg #if defined(_KERNEL) void ffs_load_inode(struct buf *, struct inode *, struct fs *, ino_t); int ffs_getblk(struct vnode *, daddr_t, daddr_t, int, bool, buf_t **); +void ffs_fserr(struct fs *, u_int, const char *); #endif /* defined(_KERNEL) */ void ffs_fragacct(struct fs *, int, int32_t[], int, int); int ffs_isblock(struct fs *, u_char *, int32_t); Index: ffs/ffs_subr.c =================================================================== RCS file: /cvsroot/src/sys/ufs/ffs/ffs_subr.c,v retrieving revision 1.47 diff -u -p -r1.47 ffs_subr.c --- ffs/ffs_subr.c 14 Aug 2011 12:37:09 -0000 1.47 +++ ffs/ffs_subr.c 22 Nov 2012 17:27:28 -0000 @@ -64,6 +64,7 @@ void panic(const char *, ...) #include <sys/inttypes.h> #include <sys/pool.h> #include <sys/fstrans.h> +#include <sys/syslog.h> #include <ufs/ufs/inode.h> #include <ufs/ufs/ufsmount.h> #include <ufs/ufs/ufs_extern.h> @@ -132,6 +133,20 @@ ffs_getblk(struct vnode *vp, daddr_t lbl return error; } + +/* + * Fserr prints the name of a file system with an error diagnostic. + * + * The form of the error message is: + * fs: error message + */ +void +ffs_fserr(struct fs *fs, u_int uid, const char *cp) +{ + + log(LOG_ERR, "uid %d, pid %d, command %s, on %s: %s\n", + uid, curproc->p_pid, curproc->p_comm, fs->fs_fsmnt, cp); +} #endif /* _KERNEL */ /* Index: ufs/ufs_extern.h =================================================================== RCS file: /cvsroot/src/sys/ufs/ufs/ufs_extern.h,v retrieving revision 1.71 diff -u -p -r1.71 ufs_extern.h --- ufs/ufs_extern.h 1 Feb 2012 05:34:43 -0000 1.71 +++ ufs/ufs_extern.h 22 Nov 2012 17:27:28 -0000 @@ -144,6 +144,7 @@ int ufs_blkatoff(struct vnode *, off_t, * Flags to chkdq() and chkiq() */ #define FORCE 0x01 /* force usage changes independent of limits */ +#define TRYONLY 0x02 /* check limits but don't update usage */ void ufsquota_init(struct inode *); void ufsquota_free(struct inode *); int chkdq(struct inode *, int64_t, kauth_cred_t, int); Index: ufs/ufs_quota.c =================================================================== RCS file: /cvsroot/src/sys/ufs/ufs/ufs_quota.c,v retrieving revision 1.108.2.2 diff -u -p -r1.108.2.2 ufs_quota.c --- ufs/ufs_quota.c 13 Sep 2012 22:24:27 -0000 1.108.2.2 +++ ufs/ufs_quota.c 22 Nov 2012 17:27:28 -0000 @@ -154,6 +154,7 @@ chkdq(struct inode *ip, int64_t change, int chkiq(struct inode *ip, int32_t change, kauth_cred_t cred, int flags) { + KASSERT((flags & TRYONLY) == 0); /* do not track snapshot usage, or we will deadlock */ if ((ip->i_flags & SF_SNAPSHOT) != 0) return 0; Index: ufs/ufs_quota1.c =================================================================== RCS file: /cvsroot/src/sys/ufs/ufs/ufs_quota1.c,v retrieving revision 1.18 diff -u -p -r1.18 ufs_quota1.c --- ufs/ufs_quota1.c 2 Feb 2012 03:00:48 -0000 1.18 +++ ufs/ufs_quota1.c 22 Nov 2012 17:27:28 -0000 @@ -71,6 +71,8 @@ chkdq1(struct inode *ip, int64_t change, if (change == 0) return (0); if (change < 0) { + if (flags & TRYONLY) + return (0); for (i = 0; i < MAXQUOTAS; i++) { if ((dq = ip->i_dquot[i]) == NODQUOT) continue; @@ -100,6 +102,8 @@ chkdq1(struct inode *ip, int64_t change, return (error); } } + if (flags & TRYONLY) + return (0); for (i = 0; i < MAXQUOTAS; i++) { if ((dq = ip->i_dquot[i]) == NODQUOT) continue; Index: ufs/ufs_quota2.c =================================================================== RCS file: /cvsroot/src/sys/ufs/ufs/ufs_quota2.c,v retrieving revision 1.34.2.1 diff -u -p -r1.34.2.1 ufs_quota2.c --- ufs/ufs_quota2.c 1 Oct 2012 19:55:22 -0000 1.34.2.1 +++ ufs/ufs_quota2.c 22 Nov 2012 17:27:28 -0000 @@ -465,6 +465,8 @@ quota2_check(struct inode *ip, int vtype return 0; } if (change < 0) { + if (flags & TRYONLY) + return (0); for (i = 0; i < MAXQUOTAS; i++) { dq = ip->i_dquot[i]; if (dq == NODQUOT) @@ -551,7 +553,7 @@ quota2_check(struct inode *ip, int vtype if (dq == NODQUOT) continue; KASSERT(q2e[i] != NULL); - if (error == 0) { + if (error == 0 && (flags & TRYONLY) == 0) { q2vp = &q2e[i]->q2e_val[vtype]; ncurblks = ufs_rw64(q2vp->q2v_cur, needswap); q2vp->q2v_cur = ufs_rw64(ncurblks + change, needswap);