Author: mckusick
Date: Sat Aug 18 22:21:59 2018
New Revision: 338031
URL: https://svnweb.freebsd.org/changeset/base/338031

Log:
  Replace the TRIM consolidation framework originally added in -r337396,
  driven by problems found with the algorithms being tested for TRIM
  consolidation.
  
  Reported by:  Peter Holm
  Suggested by: kib
  Reviewed by:  kib
  Sponsored by: Netflix

Modified:
  head/sys/ufs/ffs/ffs_alloc.c
  head/sys/ufs/ffs/ffs_balloc.c
  head/sys/ufs/ffs/ffs_extern.h
  head/sys/ufs/ffs/ffs_inode.c
  head/sys/ufs/ffs/ffs_snapshot.c
  head/sys/ufs/ffs/ffs_softdep.c
  head/sys/ufs/ffs/ffs_vfsops.c
  head/sys/ufs/ffs/softdep.h
  head/sys/ufs/ufs/ufsmount.h

Modified: head/sys/ufs/ffs/ffs_alloc.c
==============================================================================
--- head/sys/ufs/ffs/ffs_alloc.c        Sat Aug 18 22:07:48 2018        (r338030)
+++ head/sys/ufs/ffs/ffs_alloc.c        Sat Aug 18 22:21:59 2018        (r338031)
@@ -110,8 +110,6 @@ static ufs2_daddr_t
 static void    ffs_blkfree_cg(struct ufsmount *, struct fs *,
                    struct vnode *, ufs2_daddr_t, long, ino_t,
                    struct workhead *);
-static void    ffs_blkfree_trim_completed(struct buf *);
-static void    ffs_blkfree_trim_task(void *ctx, int pending __unused);
 #ifdef INVARIANTS
 static int     ffs_checkblk(struct inode *, ufs2_daddr_t, long);
 #endif
@@ -395,8 +393,24 @@ retry:
        if (bno > 0) {
                bp->b_blkno = fsbtodb(fs, bno);
                if (!DOINGSOFTDEP(vp))
+                       /*
+                        * The usual case is that a smaller fragment that
+                        * was just allocated has been replaced with a bigger
+                        * fragment or a full-size block. If it is marked as
+                        * B_DELWRI, the current contents have not been written
+                        * to disk. It is possible that the block was written
+                        * earlier, but very uncommon. If the block has never
+                        * been written, there is no need to send a BIO_DELETE
+                        * for it when it is freed. The gain from avoiding the
+                        * TRIMs for the common case of unwritten blocks far
+                        * exceeds the cost of the write amplification for the
+                        * uncommon case of failing to send a TRIM for a block
+                        * that had been written.
+                        */
                        ffs_blkfree(ump, fs, ump->um_devvp, bprev, (long)osize,
-                           ip->i_number, vp->v_type, NULL);
+                           ip->i_number, vp->v_type, NULL,
+                           (bp->b_flags & B_DELWRI) != 0 ?
+                           NOTRIM_KEY : SINGLETON_KEY);
                delta = btodb(nsize - osize);
                DIP_SET(ip, i_blocks, DIP(ip, i_blocks) + delta);
                if (flags & IO_EXT)
@@ -521,7 +535,7 @@ ffs_reallocblks_ufs1(ap)
        struct fs *fs;
        struct inode *ip;
        struct vnode *vp;
-       struct buf *sbp, *ebp;
+       struct buf *sbp, *ebp, *bp;
        ufs1_daddr_t *bap, *sbap, *ebap;
        struct cluster_save *buflist;
        struct ufsmount *ump;
@@ -730,14 +744,30 @@ ffs_reallocblks_ufs1(ap)
                printf("\n\tnew:");
 #endif
        for (blkno = newblk, i = 0; i < len; i++, blkno += fs->fs_frag) {
+               bp = buflist->bs_children[i];
                if (!DOINGSOFTDEP(vp))
+                       /*
+                        * The usual case is that a set of N-contiguous blocks
+                        * that was just allocated has been replaced with a
+                        * set of N+1-contiguous blocks. If they are marked as
+                        * B_DELWRI, the current contents have not been written
+                        * to disk. It is possible that the blocks were written
+                        * earlier, but very uncommon. If the blocks have never
+                        * been written, there is no need to send a BIO_DELETE
+                        * for them when they are freed. The gain from avoiding
+                        * the TRIMs for the common case of unwritten blocks
+                        * far exceeds the cost of the write amplification for
+                        * the uncommon case of failing to send a TRIM for the
+                        * blocks that had been written.
+                        */
                        ffs_blkfree(ump, fs, ump->um_devvp,
-                           dbtofsb(fs, buflist->bs_children[i]->b_blkno),
-                           fs->fs_bsize, ip->i_number, vp->v_type, NULL);
-               buflist->bs_children[i]->b_blkno = fsbtodb(fs, blkno);
+                           dbtofsb(fs, bp->b_blkno),
+                           fs->fs_bsize, ip->i_number, vp->v_type, NULL,
+                           (bp->b_flags & B_DELWRI) != 0 ?
+                           NOTRIM_KEY : SINGLETON_KEY);
+               bp->b_blkno = fsbtodb(fs, blkno);
 #ifdef INVARIANTS
-               if (!ffs_checkblk(ip,
-                  dbtofsb(fs, buflist->bs_children[i]->b_blkno), fs->fs_bsize))
+               if (!ffs_checkblk(ip, dbtofsb(fs, bp->b_blkno), fs->fs_bsize))
                        panic("ffs_reallocblks: unallocated block 3");
 #endif
 #ifdef DEBUG
@@ -771,7 +801,7 @@ ffs_reallocblks_ufs2(ap)
        struct fs *fs;
        struct inode *ip;
        struct vnode *vp;
-       struct buf *sbp, *ebp;
+       struct buf *sbp, *ebp, *bp;
        ufs2_daddr_t *bap, *sbap, *ebap;
        struct cluster_save *buflist;
        struct ufsmount *ump;
@@ -978,14 +1008,30 @@ ffs_reallocblks_ufs2(ap)
                printf("\n\tnew:");
 #endif
        for (blkno = newblk, i = 0; i < len; i++, blkno += fs->fs_frag) {
+               bp = buflist->bs_children[i];
                if (!DOINGSOFTDEP(vp))
+                       /*
+                        * The usual case is that a set of N-contiguous blocks
+                        * that was just allocated has been replaced with a
+                        * set of N+1-contiguous blocks. If they are marked as
+                        * B_DELWRI, the current contents have not been written
+                        * to disk. It is possible that the blocks were written
+                        * earlier, but very uncommon. If the blocks have never
+                        * been written, there is no need to send a BIO_DELETE
+                        * for them when they are freed. The gain from avoiding
+                        * the TRIMs for the common case of unwritten blocks
+                        * far exceeds the cost of the write amplification for
+                        * the uncommon case of failing to send a TRIM for the
+                        * blocks that had been written.
+                        */
                        ffs_blkfree(ump, fs, ump->um_devvp,
-                           dbtofsb(fs, buflist->bs_children[i]->b_blkno),
-                           fs->fs_bsize, ip->i_number, vp->v_type, NULL);
-               buflist->bs_children[i]->b_blkno = fsbtodb(fs, blkno);
+                           dbtofsb(fs, bp->b_blkno),
+                           fs->fs_bsize, ip->i_number, vp->v_type, NULL,
+                           (bp->b_flags & B_DELWRI) != 0 ?
+                           NOTRIM_KEY : SINGLETON_KEY);
+               bp->b_blkno = fsbtodb(fs, blkno);
 #ifdef INVARIANTS
-               if (!ffs_checkblk(ip,
-                  dbtofsb(fs, buflist->bs_children[i]->b_blkno), fs->fs_bsize))
+               if (!ffs_checkblk(ip, dbtofsb(fs, bp->b_blkno), fs->fs_bsize))
                        panic("ffs_reallocblks: unallocated block 3");
 #endif
 #ifdef DEBUG
@@ -1823,8 +1869,7 @@ gotit:
        /* XXX Fixme. */
        UFS_UNLOCK(ump);
        if (DOINGSOFTDEP(ITOV(ip)))
-               softdep_setup_blkmapdep(bp, UFSTOVFS(ump), blkno,
-                   size, 0);
+               softdep_setup_blkmapdep(bp, UFSTOVFS(ump), blkno, size, 0);
        UFS_LOCK(ump);
        return (blkno);
 }
@@ -2254,6 +2299,17 @@ ffs_blkfree_cg(ump, fs, devvp, bno, size, inum, dephd)
        bdwrite(bp);
 }
 
+/*
+ * Structures and routines associated with trim management.
+ */
+MALLOC_DEFINE(M_TRIM, "ufs_trim", "UFS trim structures");
+
+#define        TRIMLIST_HASH(ump, key) \
+       (&(ump)->um_trimhash[(key) & (ump)->um_trimlisthashsize])
+
+static void    ffs_blkfree_trim_completed(struct buf *);
+static void    ffs_blkfree_trim_task(void *ctx, int pending __unused);
+
 struct ffs_blkfree_trim_params {
        struct task task;
        struct ufsmount *ump;
@@ -2277,7 +2333,7 @@ ffs_blkfree_trim_task(ctx, pending)
            tp->inum, tp->pdephd);
        vn_finished_secondary_write(UFSTOVFS(tp->ump));
        atomic_add_int(&tp->ump->um_trim_inflight, -1);
-       free(tp, M_TEMP);
+       free(tp, M_TRIM);
 }
 
 static void
@@ -2287,14 +2343,46 @@ ffs_blkfree_trim_completed(bp)
        struct ffs_blkfree_trim_params *tp;
 
        tp = bp->b_fsprivate1;
-       free(bp, M_TEMP);
+       free(bp, M_TRIM);
        TASK_INIT(&tp->task, 0, ffs_blkfree_trim_task, tp);
        taskqueue_enqueue(tp->ump->um_trim_tq, &tp->task);
 }
 
+/*
+ * Allocate a new key to use to identify a range of blocks.
+ */
+u_long
+ffs_blkrelease_start(ump, devvp, inum)
+       struct ufsmount *ump;
+       struct vnode *devvp;
+       ino_t inum;
+{
+       static u_long masterkey;
+       u_long key;
+
+       if ((ump->um_flags & UM_CANDELETE) == 0)
+               return (SINGLETON_KEY);
+       do {
+               key = atomic_fetchadd_long(&masterkey, 1);
+       } while (key < FIRST_VALID_KEY);
+       return (key);
+}
+
+/*
+ * Deallocate a key that has been used to identify a range of blocks.
+ */
 void
-ffs_blkfree(ump, fs, devvp, bno, size, inum, vtype, dephd)
+ffs_blkrelease_finish(ump, key)
        struct ufsmount *ump;
+       u_long key;
+{
+
+       return;
+}
+
+void
+ffs_blkfree(ump, fs, devvp, bno, size, inum, vtype, dephd, key)
+       struct ufsmount *ump;
        struct fs *fs;
        struct vnode *devvp;
        ufs2_daddr_t bno;
@@ -2302,6 +2390,7 @@ ffs_blkfree(ump, fs, devvp, bno, size, inum, vtype, de
        ino_t inum;
        enum vtype vtype;
        struct workhead *dephd;
+       u_long key;
 {
        struct mount *mp;
        struct buf *bp;
@@ -2319,10 +2408,11 @@ ffs_blkfree(ump, fs, devvp, bno, size, inum, vtype, de
                return;
        }
        /*
-        * Nothing to delay if TRIM is disabled, or the operation is
-        * performed on the snapshot.
+        * Nothing to delay if TRIM is not required for this block or TRIM
+        * is disabled or the operation is performed on a snapshot.
         */
-       if (((ump->um_flags) & UM_CANDELETE) == 0 || devvp->v_type == VREG) {
+       if (key == NOTRIM_KEY || ((ump->um_flags & UM_CANDELETE) == 0) ||
+           devvp->v_type == VREG) {
                ffs_blkfree_cg(ump, fs, devvp, bno, size, inum, dephd);
                return;
        }
@@ -2334,7 +2424,7 @@ ffs_blkfree(ump, fs, devvp, bno, size, inum, vtype, de
         * and write some new data into it.
         */
        atomic_add_int(&ump->um_trim_inflight, 1);
-       tp = malloc(sizeof(struct ffs_blkfree_trim_params), M_TEMP, M_WAITOK);
+       tp = malloc(sizeof(struct ffs_blkfree_trim_params), M_TRIM, M_WAITOK);
        tp->ump = ump;
        tp->devvp = devvp;
        tp->bno = bno;
@@ -2347,7 +2437,7 @@ ffs_blkfree(ump, fs, devvp, bno, size, inum, vtype, de
        } else
                tp->pdephd = NULL;
 
-       bp = malloc(sizeof(*bp), M_TEMP, M_WAITOK | M_ZERO);
+       bp = malloc(sizeof(*bp), M_TRIM, M_WAITOK | M_ZERO);
        bp->b_iocmd = BIO_DELETE;
        bp->b_iooffset = dbtob(fsbtodb(fs, bno));
        bp->b_iodone = ffs_blkfree_trim_completed;
@@ -2822,6 +2912,7 @@ sysctl_ffs_fsck(SYSCTL_HANDLER_ARGS)
        struct fs *fs;
        ufs2_daddr_t blkno;
        long blkcnt, blksize;
+       u_long key;
        struct file *fp, *vfp;
        cap_rights_t rights;
        int filetype, error;
@@ -2956,15 +3047,18 @@ sysctl_ffs_fsck(SYSCTL_HANDLER_ARGS)
                blkno = cmd.value;
                blkcnt = cmd.size;
                blksize = fs->fs_frag - (blkno % fs->fs_frag);
+               key = ffs_blkrelease_start(ump, ump->um_devvp, UFS_ROOTINO);
                while (blkcnt > 0) {
-                       if (blksize > blkcnt)
+                       if (blkcnt < blksize)
                                blksize = blkcnt;
                        ffs_blkfree(ump, fs, ump->um_devvp, blkno,
-                           blksize * fs->fs_fsize, UFS_ROOTINO, VDIR, NULL);
+                           blksize * fs->fs_fsize, UFS_ROOTINO, 
+                           VDIR, NULL, key);
                        blkno += blksize;
                        blkcnt -= blksize;
                        blksize = fs->fs_frag;
                }
+               ffs_blkrelease_finish(ump, key);
                break;
 
        /*

Modified: head/sys/ufs/ffs/ffs_balloc.c
==============================================================================
--- head/sys/ufs/ffs/ffs_balloc.c       Sat Aug 18 22:07:48 2018        (r338030)
+++ head/sys/ufs/ffs/ffs_balloc.c       Sat Aug 18 22:21:59 2018        (r338031)
@@ -553,7 +553,7 @@ fail:
                lbns_remfree++;
 #endif
                ffs_blkfree(ump, fs, ump->um_devvp, *blkp, fs->fs_bsize,
-                   ip->i_number, vp->v_type, NULL);
+                   ip->i_number, vp->v_type, NULL, SINGLETON_KEY);
        }
        return (error);
 }
@@ -1147,7 +1147,7 @@ fail:
                lbns_remfree++;
 #endif
                ffs_blkfree(ump, fs, ump->um_devvp, *blkp, fs->fs_bsize,
-                   ip->i_number, vp->v_type, NULL);
+                   ip->i_number, vp->v_type, NULL, SINGLETON_KEY);
        }
        return (error);
 }

Modified: head/sys/ufs/ffs/ffs_extern.h
==============================================================================
--- head/sys/ufs/ffs/ffs_extern.h       Sat Aug 18 22:07:48 2018        (r338030)
+++ head/sys/ufs/ffs/ffs_extern.h       Sat Aug 18 22:21:59 2018        (r338031)
@@ -63,9 +63,11 @@ int  ffs_balloc_ufs2(struct vnode *a_vp, off_t a_starto
             struct ucred *a_cred, int a_flags, struct buf **a_bpp);
 int    ffs_blkatoff(struct vnode *, off_t, char **, struct buf **);
 void   ffs_blkfree(struct ufsmount *, struct fs *, struct vnode *,
-           ufs2_daddr_t, long, ino_t, enum vtype, struct workhead *);
+           ufs2_daddr_t, long, ino_t, enum vtype, struct workhead *, u_long);
 ufs2_daddr_t ffs_blkpref_ufs1(struct inode *, ufs_lbn_t, int, ufs1_daddr_t *);
 ufs2_daddr_t ffs_blkpref_ufs2(struct inode *, ufs_lbn_t, int, ufs2_daddr_t *);
+void   ffs_blkrelease_finish(struct ufsmount *, u_long);
+u_long ffs_blkrelease_start(struct ufsmount *, struct vnode *, ino_t);
 int    ffs_checkfreefile(struct fs *, struct vnode *, ino_t);
 void   ffs_clrblock(struct fs *, u_char *, ufs1_daddr_t);
 void   ffs_clusteracct(struct fs *, struct cg *, ufs1_daddr_t, int);
@@ -111,10 +113,26 @@ vfs_vget_t ffs_vget;
 int    ffs_vgetf(struct mount *, ino_t, int, struct vnode **, int);
 void   process_deferred_inactive(struct mount *mp);
 
+/*
+ * Flags to ffs_vgetf
+ */
 #define        FFSV_FORCEINSMQ 0x0001
 
+/*
+ * Flags to ffs_reload
+ */
 #define        FFSR_FORCE      0x0001
 #define        FFSR_UNSUSPEND  0x0002
+
+/*
+ * Definitions for TRIM interface
+ *
+ * Special keys and recommended hash table size
+ */
+#define        NOTRIM_KEY      1       /* never written, so don't call trim for it */
+#define        SINGLETON_KEY   2       /* only block being freed, so trim it now */
+#define        FIRST_VALID_KEY 3       /* first valid key describing a block range */
+#define        MAXTRIMIO       1024    /* maximum expected outstanding trim requests */
 
 extern struct vop_vector ffs_vnodeops1;
 extern struct vop_vector ffs_fifoops1;

Modified: head/sys/ufs/ffs/ffs_inode.c
==============================================================================
--- head/sys/ufs/ffs/ffs_inode.c        Sat Aug 18 22:07:48 2018        (r338030)
+++ head/sys/ufs/ffs/ffs_inode.c        Sat Aug 18 22:21:59 2018        (r338031)
@@ -197,6 +197,7 @@ ffs_truncate(vp, length, flags, cred)
        int needextclean, extblocks;
        int offset, size, level, nblocks;
        int i, error, allerror, indiroff, waitforupdate;
+       u_long key;
        off_t osize;
 
        ip = VTOI(vp);
@@ -275,7 +276,7 @@ ffs_truncate(vp, length, flags, cred)
                                        continue;
                                ffs_blkfree(ump, fs, ITODEVVP(ip), oldblks[i],
                                    sblksize(fs, osize, i), ip->i_number,
-                                   vp->v_type, NULL);
+                                   vp->v_type, NULL, SINGLETON_KEY);
                        }
                }
        }
@@ -523,7 +524,7 @@ ffs_truncate(vp, length, flags, cred)
                                DIP_SET(ip, i_ib[level], 0);
                                ffs_blkfree(ump, fs, ump->um_devvp, bn,
                                    fs->fs_bsize, ip->i_number,
-                                   vp->v_type, NULL);
+                                   vp->v_type, NULL, SINGLETON_KEY);
                                blocksreleased += nblocks;
                        }
                }
@@ -534,6 +535,7 @@ ffs_truncate(vp, length, flags, cred)
        /*
         * All whole direct blocks or frags.
         */
+       key = ffs_blkrelease_start(ump, ump->um_devvp, ip->i_number);
        for (i = UFS_NDADDR - 1; i > lastblock; i--) {
                long bsize;
 
@@ -543,9 +545,10 @@ ffs_truncate(vp, length, flags, cred)
                DIP_SET(ip, i_db[i], 0);
                bsize = blksize(fs, ip, i);
                ffs_blkfree(ump, fs, ump->um_devvp, bn, bsize, ip->i_number,
-                   vp->v_type, NULL);
+                   vp->v_type, NULL, key);
                blocksreleased += btodb(bsize);
        }
+       ffs_blkrelease_finish(ump, key);
        if (lastblock < 0)
                goto done;
 
@@ -575,7 +578,8 @@ ffs_truncate(vp, length, flags, cred)
                         */
                        bn += numfrags(fs, newspace);
                        ffs_blkfree(ump, fs, ump->um_devvp, bn,
-                          oldspace - newspace, ip->i_number, vp->v_type, NULL);
+                          oldspace - newspace, ip->i_number, vp->v_type,
+                          NULL, SINGLETON_KEY);
                        blocksreleased += btodb(oldspace - newspace);
                }
        }
@@ -634,8 +638,10 @@ ffs_indirtrunc(ip, lbn, dbn, lastbn, level, countp)
 {
        struct buf *bp;
        struct fs *fs;
+       struct ufsmount *ump;
        struct vnode *vp;
        caddr_t copy = NULL;
+       u_long key;
        int i, nblocks, error = 0, allerror = 0;
        ufs2_daddr_t nb, nlbn, last;
        ufs2_daddr_t blkcount, factor, blocksreleased = 0;
@@ -644,6 +650,7 @@ ffs_indirtrunc(ip, lbn, dbn, lastbn, level, countp)
 #define BAP(ip, i) (I_IS_UFS1(ip) ? bap1[i] : bap2[i])
 
        fs = ITOFS(ip);
+       ump = ITOUMP(ip);
 
        /*
         * Calculate index in current block of last
@@ -719,6 +726,7 @@ ffs_indirtrunc(ip, lbn, dbn, lastbn, level, countp)
        /*
         * Recursively free totally unused blocks.
         */
+       key = ffs_blkrelease_start(ump, ITODEVVP(ip), ip->i_number);
        for (i = NINDIR(fs) - 1, nlbn = lbn + 1 - i * factor; i > last;
            i--, nlbn += factor) {
                nb = BAP(ip, i);
@@ -730,10 +738,11 @@ ffs_indirtrunc(ip, lbn, dbn, lastbn, level, countp)
                                allerror = error;
                        blocksreleased += blkcount;
                }
-               ffs_blkfree(ITOUMP(ip), fs, ITODEVVP(ip), nb, fs->fs_bsize,
-                   ip->i_number, vp->v_type, NULL);
+               ffs_blkfree(ump, fs, ITODEVVP(ip), nb, fs->fs_bsize,
+                   ip->i_number, vp->v_type, NULL, key);
                blocksreleased += nblocks;
        }
+       ffs_blkrelease_finish(ump, key);
 
        /*
         * Recursively free last partial block.

Modified: head/sys/ufs/ffs/ffs_snapshot.c
==============================================================================
--- head/sys/ufs/ffs/ffs_snapshot.c     Sat Aug 18 22:07:48 2018        (r338030)
+++ head/sys/ufs/ffs/ffs_snapshot.c     Sat Aug 18 22:21:59 2018        (r338031)
@@ -583,7 +583,7 @@ loop:
                        if (len != 0 && len < fs->fs_bsize) {
                                ffs_blkfree(ump, copy_fs, vp,
                                    DIP(xp, i_db[loc]), len, xp->i_number,
-                                   xvp->v_type, NULL);
+                                   xvp->v_type, NULL, SINGLETON_KEY);
                                blkno = DIP(xp, i_db[loc]);
                                DIP_SET(xp, i_db[loc], 0);
                        }
@@ -1265,7 +1265,7 @@ mapacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, expung
                if (blkno == BLK_SNAP)
                        blkno = blkstofrags(fs, lblkno);
                ffs_blkfree(ITOUMP(ip), fs, vp, blkno, fs->fs_bsize, inum,
-                   vp->v_type, NULL);
+                   vp->v_type, NULL, SINGLETON_KEY);
        }
        return (0);
 }
@@ -1549,7 +1549,7 @@ mapacct_ufs2(vp, oldblkp, lastblkp, fs, lblkno, expung
                if (blkno == BLK_SNAP)
                        blkno = blkstofrags(fs, lblkno);
                ffs_blkfree(ITOUMP(ip), fs, vp, blkno, fs->fs_bsize, inum,
-                   vp->v_type, NULL);
+                   vp->v_type, NULL, SINGLETON_KEY);
        }
        return (0);
 }

Modified: head/sys/ufs/ffs/ffs_softdep.c
==============================================================================
--- head/sys/ufs/ffs/ffs_softdep.c      Sat Aug 18 22:07:48 2018        (r338030)
+++ head/sys/ufs/ffs/ffs_softdep.c      Sat Aug 18 22:21:59 2018        (r338031)
@@ -869,7 +869,7 @@ static      void cancel_allocdirect(struct allocdirectlst *
            struct allocdirect *, struct freeblks *);
 static int check_inode_unwritten(struct inodedep *);
 static int free_inodedep(struct inodedep *);
-static void freework_freeblock(struct freework *);
+static void freework_freeblock(struct freework *, u_long);
 static void freework_enqueue(struct freework *);
 static int handle_workitem_freeblocks(struct freeblks *, int);
 static int handle_complete_freeblocks(struct freeblks *, int);
@@ -884,7 +884,7 @@ static      struct allocindir *newallocindir(struct inode *
            ufs2_daddr_t, ufs_lbn_t);
 static void handle_workitem_freefrag(struct freefrag *);
 static struct freefrag *newfreefrag(struct inode *, ufs2_daddr_t, long,
-           ufs_lbn_t);
+           ufs_lbn_t, u_long);
 static void allocdirect_merge(struct allocdirectlst *,
            struct allocdirect *, struct allocdirect *);
 static struct freefrag *allocindir_merge(struct allocindir *,
@@ -5289,7 +5289,22 @@ softdep_setup_allocdirect(ip, off, newblkno, oldblkno,
        KASSERT(MOUNTEDSOFTDEP(mp) != 0,
            ("softdep_setup_allocdirect called on non-softdep filesystem"));
        if (oldblkno && oldblkno != newblkno)
-               freefrag = newfreefrag(ip, oldblkno, oldsize, lbn);
+               /*
+                * The usual case is that a smaller fragment that
+                * was just allocated has been replaced with a bigger
+                * fragment or a full-size block. If it is marked as
+                * B_DELWRI, the current contents have not been written
+                * to disk. It is possible that the block was written
+                * earlier, but very uncommon. If the block has never
+                * been written, there is no need to send a BIO_DELETE
+                * for it when it is freed. The gain from avoiding the
+                * TRIMs for the common case of unwritten blocks far
+                * exceeds the cost of the write amplification for the
+                * uncommon case of failing to send a TRIM for a block
+                * that had been written.
+                */
+               freefrag = newfreefrag(ip, oldblkno, oldsize, lbn,
+                   (bp->b_flags & B_DELWRI) != 0 ? NOTRIM_KEY : SINGLETON_KEY);
        else
                freefrag = NULL;
 
@@ -5566,11 +5581,12 @@ newjfreefrag(freefrag, ip, blkno, size, lbn)
  * Allocate a new freefrag structure.
  */
 static struct freefrag *
-newfreefrag(ip, blkno, size, lbn)
+newfreefrag(ip, blkno, size, lbn, key)
        struct inode *ip;
        ufs2_daddr_t blkno;
        long size;
        ufs_lbn_t lbn;
+       u_long key;
 {
        struct freefrag *freefrag;
        struct ufsmount *ump;
@@ -5591,6 +5607,7 @@ newfreefrag(ip, blkno, size, lbn)
        freefrag->ff_vtype = ITOV(ip)->v_type;
        freefrag->ff_blkno = blkno;
        freefrag->ff_fragsize = size;
+       freefrag->ff_key = key;
 
        if (MOUNTEDSUJ(UFSTOVFS(ump))) {
                freefrag->ff_jdep = (struct worklist *)
@@ -5636,7 +5653,8 @@ handle_workitem_freefrag(freefrag)
        }
        FREE_LOCK(ump);
        ffs_blkfree(ump, ump->um_fs, ump->um_devvp, freefrag->ff_blkno,
-          freefrag->ff_fragsize, freefrag->ff_inum, freefrag->ff_vtype, &wkhd);
+          freefrag->ff_fragsize, freefrag->ff_inum, freefrag->ff_vtype,
+          &wkhd, freefrag->ff_key);
        ACQUIRE_LOCK(ump);
        WORKITEM_FREE(freefrag, D_FREEFRAG);
        FREE_LOCK(ump);
@@ -5676,7 +5694,22 @@ softdep_setup_allocext(ip, off, newblkno, oldblkno, ne
 
        lbn = bp->b_lblkno;
        if (oldblkno && oldblkno != newblkno)
-               freefrag = newfreefrag(ip, oldblkno, oldsize, lbn);
+               /*
+                * The usual case is that a smaller fragment that
+                * was just allocated has been replaced with a bigger
+                * fragment or a full-size block. If it is marked as
+                * B_DELWRI, the current contents have not been written
+                * to disk. It is possible that the block was written
+                * earlier, but very uncommon. If the block has never
+                * been written, there is no need to send a BIO_DELETE
+                * for it when it is freed. The gain from avoiding the
+                * TRIMs for the common case of unwritten blocks far
+                * exceeds the cost of the write amplification for the
+                * uncommon case of failing to send a TRIM for a block
+                * that had been written.
+                */
+               freefrag = newfreefrag(ip, oldblkno, oldsize, lbn,
+                   (bp->b_flags & B_DELWRI) != 0 ? NOTRIM_KEY : SINGLETON_KEY);
        else
                freefrag = NULL;
 
@@ -5789,7 +5822,8 @@ newallocindir(ip, ptrno, newblkno, oldblkno, lbn)
        struct jnewblk *jnewblk;
 
        if (oldblkno)
-               freefrag = newfreefrag(ip, oldblkno, ITOFS(ip)->fs_bsize, lbn);
+               freefrag = newfreefrag(ip, oldblkno, ITOFS(ip)->fs_bsize, lbn,
+                   SINGLETON_KEY);
        else
                freefrag = NULL;
        ACQUIRE_LOCK(ITOUMP(ip));
@@ -7724,8 +7758,9 @@ free_inodedep(inodedep)
  * in memory immediately.
  */
 static void
-freework_freeblock(freework)
+freework_freeblock(freework, key)
        struct freework *freework;
+       u_long key;
 {
        struct freeblks *freeblks;
        struct jnewblk *jnewblk;
@@ -7779,10 +7814,10 @@ freework_freeblock(freework)
        FREE_LOCK(ump);
        freeblks_free(ump, freeblks, btodb(bsize));
        CTR4(KTR_SUJ,
-           "freework_freeblock: ino %d blkno %jd lbn %jd size %ld",
+           "freework_freeblock: ino %jd blkno %jd lbn %jd size %d",
            freeblks->fb_inum, freework->fw_blkno, freework->fw_lbn, bsize);
        ffs_blkfree(ump, fs, freeblks->fb_devvp, freework->fw_blkno, bsize,
-           freeblks->fb_inum, freeblks->fb_vtype, &wkhd);
+           freeblks->fb_inum, freeblks->fb_vtype, &wkhd, key);
        ACQUIRE_LOCK(ump);
        /*
         * The jnewblk will be discarded and the bits in the map never
@@ -7835,7 +7870,7 @@ handle_workitem_indirblk(freework)
                return;
        }
        if (freework->fw_off == NINDIR(fs)) {
-               freework_freeblock(freework);
+               freework_freeblock(freework, SINGLETON_KEY);
                return;
        }
        freework->fw_state |= INPROGRESS;
@@ -7894,10 +7929,12 @@ handle_workitem_freeblocks(freeblks, flags)
        struct allocindir *aip;
        struct ufsmount *ump;
        struct worklist *wk;
+       u_long key;
 
        KASSERT(LIST_EMPTY(&freeblks->fb_jblkdephd),
            ("handle_workitem_freeblocks: Journal entries not written."));
        ump = VFSTOUFS(freeblks->fb_list.wk_mp);
+       key = ffs_blkrelease_start(ump, freeblks->fb_devvp, freeblks->fb_inum);
        ACQUIRE_LOCK(ump);
        while ((wk = LIST_FIRST(&freeblks->fb_freeworkhd)) != NULL) {
                WORKLIST_REMOVE(wk);
@@ -7935,7 +7972,7 @@ handle_workitem_freeblocks(freeblks, flags)
                        if (freework->fw_lbn <= -UFS_NDADDR)
                                handle_workitem_indirblk(freework);
                        else
-                               freework_freeblock(freework);
+                               freework_freeblock(freework, key);
                        continue;
                default:
                        panic("handle_workitem_freeblocks: Unknown type %s",
@@ -7948,6 +7985,7 @@ handle_workitem_freeblocks(freeblks, flags)
                freeblks = NULL;
        }
        FREE_LOCK(ump);
+       ffs_blkrelease_finish(ump, key);
        if (freeblks)
                return handle_complete_freeblocks(freeblks, flags);
        return (0);
@@ -8080,13 +8118,9 @@ indir_trunc(freework, dbn, lbn)
        ufs1_daddr_t *bap1;
        ufs2_daddr_t nb, nnb, *bap2;
        ufs_lbn_t lbnadd, nlbn;
-       int i, nblocks, ufs1fmt;
-       int freedblocks;
-       int goingaway;
-       int freedeps;
-       int needj;
-       int level;
-       int cnt;
+       u_long key;
+       int nblocks, ufs1fmt, freedblocks;
+       int goingaway, freedeps, needj, level, cnt, i;
 
        freeblks = freework->fw_freeblks;
        ump = VFSTOUFS(freeblks->fb_list.wk_mp);
@@ -8180,6 +8214,7 @@ indir_trunc(freework, dbn, lbn)
         * arranges for the current level to be freed when subordinates
         * are free when journaling.
         */
+       key = ffs_blkrelease_start(ump, freeblks->fb_devvp, freeblks->fb_inum);
        for (i = freework->fw_off; i < NINDIR(fs); i++, nb = nnb) {
                if (i != NINDIR(fs) - 1) {
                        if (ufs1fmt)
@@ -8215,13 +8250,14 @@ indir_trunc(freework, dbn, lbn)
                                freedeps++;
                        }
                        CTR3(KTR_SUJ,
-                           "indir_trunc: ino %d blkno %jd size %ld",
+                           "indir_trunc: ino %jd blkno %jd size %d",
                            freeblks->fb_inum, nb, fs->fs_bsize);
                        ffs_blkfree(ump, fs, freeblks->fb_devvp, nb,
                            fs->fs_bsize, freeblks->fb_inum,
-                           freeblks->fb_vtype, &wkhd);
+                           freeblks->fb_vtype, &wkhd, key);
                }
        }
+       ffs_blkrelease_finish(ump, key);
        if (goingaway) {
                bp->b_flags |= B_INVAL | B_NOCACHE;
                brelse(bp);
@@ -8244,7 +8280,7 @@ indir_trunc(freework, dbn, lbn)
                if (level == 0)
                        freeblks->fb_cgwait += freedeps;
                if (freework->fw_ref == 0)
-                       freework_freeblock(freework);
+                       freework_freeblock(freework, SINGLETON_KEY);
                FREE_LOCK(ump);
                return;
        }
@@ -8253,10 +8289,10 @@ indir_trunc(freework, dbn, lbn)
         */
        dbn = dbtofsb(fs, dbn);
        CTR3(KTR_SUJ,
-           "indir_trunc 2: ino %d blkno %jd size %ld",
+           "indir_trunc 2: ino %jd blkno %jd size %d",
            freeblks->fb_inum, dbn, fs->fs_bsize);
        ffs_blkfree(ump, fs, freeblks->fb_devvp, dbn, fs->fs_bsize,
-           freeblks->fb_inum, freeblks->fb_vtype, NULL);
+           freeblks->fb_inum, freeblks->fb_vtype, NULL, SINGLETON_KEY);
        /* Non SUJ softdep does single-threaded truncations. */
        if (freework->fw_blkno == dbn) {
                freework->fw_state |= ALLCOMPLETE;

Modified: head/sys/ufs/ffs/ffs_vfsops.c
==============================================================================
--- head/sys/ufs/ffs/ffs_vfsops.c       Sat Aug 18 22:07:48 2018        (r338030)
+++ head/sys/ufs/ffs/ffs_vfsops.c       Sat Aug 18 22:21:59 2018        (r338031)
@@ -978,6 +978,8 @@ ffs_mountfs(devvp, mp, td)
                            taskqueue_thread_enqueue, &ump->um_trim_tq);
                        taskqueue_start_threads(&ump->um_trim_tq, 1, PVFS,
                            "%s trim", mp->mnt_stat.f_mntonname);
+                       ump->um_trimhash = hashinit(MAXTRIMIO, M_TRIM,
+                           &ump->um_trimlisthashsize);
                }
        }
 
@@ -1256,6 +1258,7 @@ ffs_unmount(mp, mntflags)
                        pause("ufsutr", hz);
                taskqueue_drain_all(ump->um_trim_tq);
                taskqueue_free(ump->um_trim_tq);
+               free (ump->um_trimhash, M_TRIM);
        }
        g_topology_lock();
        if (ump->um_fsckpid > 0) {

Modified: head/sys/ufs/ffs/softdep.h
==============================================================================
--- head/sys/ufs/ffs/softdep.h  Sat Aug 18 22:07:48 2018        (r338030)
+++ head/sys/ufs/ffs/softdep.h  Sat Aug 18 22:21:59 2018        (r338031)
@@ -557,6 +557,7 @@ struct freefrag {
        long    ff_fragsize;            /* size of fragment being deleted */
        ino_t   ff_inum;                /* owning inode number */
        enum    vtype ff_vtype;         /* owning inode's file type */
+       int     ff_key;                 /* trim key when deleted */
 };
 
 /*

Modified: head/sys/ufs/ufs/ufsmount.h
==============================================================================
--- head/sys/ufs/ufs/ufsmount.h Sat Aug 18 22:07:48 2018        (r338030)
+++ head/sys/ufs/ufs/ufsmount.h Sat Aug 18 22:21:59 2018        (r338031)
@@ -47,6 +47,7 @@ struct ufs_args {
 
 #ifdef MALLOC_DECLARE
 MALLOC_DECLARE(M_UFSMNT);
+MALLOC_DECLARE(M_TRIM);
 #endif
 
 struct buf;
@@ -63,6 +64,7 @@ struct inodedep;
 
 TAILQ_HEAD(inodedeplst, inodedep);
 LIST_HEAD(bmsafemaphd, bmsafemap);
+LIST_HEAD(trimlist_hashhead, ffs_blkfree_trim_params);
 
 /*
  * This structure describes the UFS specific mount structure data.
@@ -70,7 +72,6 @@ LIST_HEAD(bmsafemaphd, bmsafemap);
  * UFS (UFS1, UFS2, etc).
  *
  * Lock reference:
- *     a - atomic operations
  *     c - set at allocation then constant until freed
  *     i - ufsmount interlock (UFS_LOCK / UFS_UNLOCK)
  *     q - associated quota file is locked
@@ -99,8 +100,13 @@ struct ufsmount {
        char    um_qflags[MAXQUOTAS];           /* (i) quota specific flags */
        int64_t um_savedmaxfilesize;            /* (c) track maxfilesize */
        u_int   um_flags;                       /* (i) filesystem flags */
-       u_int   um_trim_inflight;               /* (a) outstanding trim count */
+       u_int   um_trim_inflight;               /* (i) outstanding trim count */
+       u_int   um_trim_inflight_blks;          /* (i) outstanding trim blks */
+       u_long  um_trim_total;                  /* (i) total trim count */
+       u_long  um_trim_total_blks;             /* (i) total trim block count */
        struct  taskqueue *um_trim_tq;          /* (c) trim request queue */
+       struct  trimlist_hashhead *um_trimhash; /* (i) trimlist hash table */
+       u_long  um_trimlisthashsize;            /* (i) trim hash table size-1 */
                                                /* (c) - below function ptrs */
        int     (*um_balloc)(struct vnode *, off_t, int, struct ucred *,
                    int, struct buf **);
_______________________________________________
svn-src-all@freebsd.org mailing list
https://lists.freebsd.org/mailman/listinfo/svn-src-all
To unsubscribe, send any mail to "svn-src-all-unsubscr...@freebsd.org"

Reply via email to