Author: mckusick
Date: Mon Aug  6 21:09:11 2018
New Revision: 337396
URL: https://svnweb.freebsd.org/changeset/base/337396

Log:
  Put in place the framework for consolidating contiguous blocks into
  a smaller number of larger TRIM requests. The hope had been to have
  the full TRIM consolidation in place for 12.0, but the algorithms
  are still under development and need further testing. With this
  framework in place it will be possible to easily add TRIM consolidation
  once the optimal strategy has been found.
  
  The only functional change with this patch is the elimination of TRIM
  requests for blocks that are freed before they are likely to have
  been written.
  
  Reviewed by: kib
  Discussed with: Warner Losh and Chuck Silvers
  Sponsored by: Netflix

Modified:
  head/sys/ufs/ffs/ffs_alloc.c
  head/sys/ufs/ffs/ffs_balloc.c
  head/sys/ufs/ffs/ffs_extern.h
  head/sys/ufs/ffs/ffs_inode.c
  head/sys/ufs/ffs/ffs_snapshot.c
  head/sys/ufs/ffs/ffs_softdep.c
  head/sys/ufs/ffs/ffs_vfsops.c
  head/sys/ufs/ffs/softdep.h
  head/sys/ufs/ufs/ufsmount.h

Modified: head/sys/ufs/ffs/ffs_alloc.c
==============================================================================
--- head/sys/ufs/ffs/ffs_alloc.c        Mon Aug  6 20:39:27 2018        (r337395)
+++ head/sys/ufs/ffs/ffs_alloc.c        Mon Aug  6 21:09:11 2018        (r337396)
@@ -110,8 +110,6 @@ static ufs2_daddr_t
 static void    ffs_blkfree_cg(struct ufsmount *, struct fs *,
                    struct vnode *, ufs2_daddr_t, long, ino_t,
                    struct workhead *);
-static void    ffs_blkfree_trim_completed(struct buf *);
-static void    ffs_blkfree_trim_task(void *ctx, int pending __unused);
 #ifdef INVARIANTS
 static int     ffs_checkblk(struct inode *, ufs2_daddr_t, long);
 #endif
@@ -395,8 +393,23 @@ retry:
        if (bno > 0) {
                bp->b_blkno = fsbtodb(fs, bno);
                if (!DOINGSOFTDEP(vp))
+                       /*
+                        * The usual case is that a smaller fragment that
+                        * was just allocated has been replaced with a bigger
+                        * fragment or a full-size block. If it is marked as
+                        * B_DELWRI, the current contents have not been written
+                        * to disk. It is possible that the block was written
+                        * earlier, but very uncommon. If the block has never
+                        * been written, there is no need to send a BIO_DELETE
+                        * for it when it is freed. The gain from avoiding the
+                        * TRIMs for the common case of unwritten blocks far
+                        * exceeds the cost of the write amplification for the
+                        * uncommon case of failing to send a TRIM for a block
+                        * that had been written.
+                        */
                        ffs_blkfree(ump, fs, ump->um_devvp, bprev, (long)osize,
-                           ip->i_number, vp->v_type, NULL);
+                           ip->i_number, vp->v_type, NULL,
+                           (bp->b_flags & B_DELWRI) != 0 ? NOTRIM : SINGLETON);
                delta = btodb(nsize - osize);
                DIP_SET(ip, i_blocks, DIP(ip, i_blocks) + delta);
                if (flags & IO_EXT)
@@ -521,7 +534,7 @@ ffs_reallocblks_ufs1(ap)
        struct fs *fs;
        struct inode *ip;
        struct vnode *vp;
-       struct buf *sbp, *ebp;
+       struct buf *sbp, *ebp, *bp;
        ufs1_daddr_t *bap, *sbap, *ebap;
        struct cluster_save *buflist;
        struct ufsmount *ump;
@@ -730,14 +743,29 @@ ffs_reallocblks_ufs1(ap)
                printf("\n\tnew:");
 #endif
        for (blkno = newblk, i = 0; i < len; i++, blkno += fs->fs_frag) {
+               bp = buflist->bs_children[i];
                if (!DOINGSOFTDEP(vp))
+                       /*
+                        * The usual case is that a set of N-contiguous blocks
+                        * that was just allocated has been replaced with a
+                        * set of N+1-contiguous blocks. If they are marked as
+                        * B_DELWRI, the current contents have not been written
+                        * to disk. It is possible that the blocks were written
+                        * earlier, but very uncommon. If the blocks have never
+                        * been written, there is no need to send a BIO_DELETE
+                        * for them when they are freed. The gain from avoiding
+                        * the TRIMs for the common case of unwritten blocks
+                        * far exceeds the cost of the write amplification for
+                        * the uncommon case of failing to send a TRIM for the
+                        * blocks that had been written.
+                        */
                        ffs_blkfree(ump, fs, ump->um_devvp,
-                           dbtofsb(fs, buflist->bs_children[i]->b_blkno),
-                           fs->fs_bsize, ip->i_number, vp->v_type, NULL);
-               buflist->bs_children[i]->b_blkno = fsbtodb(fs, blkno);
+                           dbtofsb(fs, bp->b_blkno),
+                           fs->fs_bsize, ip->i_number, vp->v_type, NULL,
+                           (bp->b_flags & B_DELWRI) != 0 ? NOTRIM : SINGLETON);
+               bp->b_blkno = fsbtodb(fs, blkno);
 #ifdef INVARIANTS
-               if (!ffs_checkblk(ip,
-                  dbtofsb(fs, buflist->bs_children[i]->b_blkno), fs->fs_bsize))
+               if (!ffs_checkblk(ip, dbtofsb(fs, bp->b_blkno), fs->fs_bsize))
                        panic("ffs_reallocblks: unallocated block 3");
 #endif
 #ifdef DEBUG
@@ -771,7 +799,7 @@ ffs_reallocblks_ufs2(ap)
        struct fs *fs;
        struct inode *ip;
        struct vnode *vp;
-       struct buf *sbp, *ebp;
+       struct buf *sbp, *ebp, *bp;
        ufs2_daddr_t *bap, *sbap, *ebap;
        struct cluster_save *buflist;
        struct ufsmount *ump;
@@ -978,14 +1006,29 @@ ffs_reallocblks_ufs2(ap)
                printf("\n\tnew:");
 #endif
        for (blkno = newblk, i = 0; i < len; i++, blkno += fs->fs_frag) {
+               bp = buflist->bs_children[i];
                if (!DOINGSOFTDEP(vp))
+                       /*
+                        * The usual case is that a set of N-contiguous blocks
+                        * that was just allocated has been replaced with a
+                        * set of N+1-contiguous blocks. If they are marked as
+                        * B_DELWRI, the current contents have not been written
+                        * to disk. It is possible that the blocks were written
+                        * earlier, but very uncommon. If the blocks have never
+                        * been written, there is no need to send a BIO_DELETE
+                        * for them when they are freed. The gain from avoiding
+                        * the TRIMs for the common case of unwritten blocks
+                        * far exceeds the cost of the write amplification for
+                        * the uncommon case of failing to send a TRIM for the
+                        * blocks that had been written.
+                        */
                        ffs_blkfree(ump, fs, ump->um_devvp,
-                           dbtofsb(fs, buflist->bs_children[i]->b_blkno),
-                           fs->fs_bsize, ip->i_number, vp->v_type, NULL);
-               buflist->bs_children[i]->b_blkno = fsbtodb(fs, blkno);
+                           dbtofsb(fs, bp->b_blkno),
+                           fs->fs_bsize, ip->i_number, vp->v_type, NULL,
+                           (bp->b_flags & B_DELWRI) != 0 ? NOTRIM : SINGLETON);
+               bp->b_blkno = fsbtodb(fs, blkno);
 #ifdef INVARIANTS
-               if (!ffs_checkblk(ip,
-                  dbtofsb(fs, buflist->bs_children[i]->b_blkno), fs->fs_bsize))
+               if (!ffs_checkblk(ip, dbtofsb(fs, bp->b_blkno), fs->fs_bsize))
                        panic("ffs_reallocblks: unallocated block 3");
 #endif
 #ifdef DEBUG
@@ -1823,8 +1866,7 @@ gotit:
        /* XXX Fixme. */
        UFS_UNLOCK(ump);
        if (DOINGSOFTDEP(ITOV(ip)))
-               softdep_setup_blkmapdep(bp, UFSTOVFS(ump), blkno,
-                   size, 0);
+               softdep_setup_blkmapdep(bp, UFSTOVFS(ump), blkno, size, 0);
        UFS_LOCK(ump);
        return (blkno);
 }
@@ -2254,6 +2296,17 @@ ffs_blkfree_cg(ump, fs, devvp, bno, size, inum, dephd)
        bdwrite(bp);
 }
 
+/*
+ * Structures and routines associated with trim management.
+ */
+MALLOC_DEFINE(M_TRIM, "ufs_trim", "UFS trim structures");
+
+#define        TRIMLIST_HASH(ump, inum) \
+       (&(ump)->um_trimhash[(inum) & (ump)->um_trimlisthashsize])
+
+static void    ffs_blkfree_trim_completed(struct buf *);
+static void    ffs_blkfree_trim_task(void *ctx, int pending __unused);
+
 struct ffs_blkfree_trim_params {
        struct task task;
        struct ufsmount *ump;
@@ -2277,7 +2330,7 @@ ffs_blkfree_trim_task(ctx, pending)
            tp->inum, tp->pdephd);
        vn_finished_secondary_write(UFSTOVFS(tp->ump));
        atomic_add_int(&tp->ump->um_trim_inflight, -1);
-       free(tp, M_TEMP);
+       free(tp, M_TRIM);
 }
 
 static void
@@ -2287,13 +2340,13 @@ ffs_blkfree_trim_completed(bp)
        struct ffs_blkfree_trim_params *tp;
 
        tp = bp->b_fsprivate1;
-       free(bp, M_TEMP);
+       free(bp, M_TRIM);
        TASK_INIT(&tp->task, 0, ffs_blkfree_trim_task, tp);
        taskqueue_enqueue(tp->ump->um_trim_tq, &tp->task);
 }
 
 void
-ffs_blkfree(ump, fs, devvp, bno, size, inum, vtype, dephd)
+ffs_blkfree(ump, fs, devvp, bno, size, inum, vtype, dephd, trimtype)
        struct ufsmount *ump;
        struct fs *fs;
        struct vnode *devvp;
@@ -2302,6 +2355,7 @@ ffs_blkfree(ump, fs, devvp, bno, size, inum, vtype, de
        ino_t inum;
        enum vtype vtype;
        struct workhead *dephd;
+       int trimtype;
 {
        struct mount *mp;
        struct buf *bp;
@@ -2319,10 +2373,11 @@ ffs_blkfree(ump, fs, devvp, bno, size, inum, vtype, de
                return;
        }
        /*
-        * Nothing to delay if TRIM is disabled, or the operation is
-        * performed on the snapshot.
+        * Nothing to delay if TRIM is not required for this block or TRIM
+        * is disabled or the operation is performed on a snapshot.
         */
-       if (((ump->um_flags) & UM_CANDELETE) == 0 || devvp->v_type == VREG) {
+       if (trimtype == NOTRIM || ((ump->um_flags & UM_CANDELETE) == 0) ||
+           devvp->v_type == VREG) {
                ffs_blkfree_cg(ump, fs, devvp, bno, size, inum, dephd);
                return;
        }
@@ -2334,7 +2389,7 @@ ffs_blkfree(ump, fs, devvp, bno, size, inum, vtype, de
         * and write some new data into it.
         */
        atomic_add_int(&ump->um_trim_inflight, 1);
-       tp = malloc(sizeof(struct ffs_blkfree_trim_params), M_TEMP, M_WAITOK);
+       tp = malloc(sizeof(struct ffs_blkfree_trim_params), M_TRIM, M_WAITOK);
        tp->ump = ump;
        tp->devvp = devvp;
        tp->bno = bno;
@@ -2347,7 +2402,7 @@ ffs_blkfree(ump, fs, devvp, bno, size, inum, vtype, de
        } else
                tp->pdephd = NULL;
 
-       bp = malloc(sizeof(*bp), M_TEMP, M_WAITOK | M_ZERO);
+       bp = malloc(sizeof(*bp), M_TRIM, M_WAITOK | M_ZERO);
        bp->b_iocmd = BIO_DELETE;
        bp->b_iooffset = dbtob(fsbtodb(fs, bno));
        bp->b_iodone = ffs_blkfree_trim_completed;
@@ -2824,7 +2879,7 @@ sysctl_ffs_fsck(SYSCTL_HANDLER_ARGS)
        long blkcnt, blksize;
        struct file *fp, *vfp;
        cap_rights_t rights;
-       int filetype, error;
+       int filetype, trimtype, error;
        static struct fileops *origops, bufferedops;
 
        if (req->newlen > sizeof cmd)
@@ -2956,14 +3011,17 @@ sysctl_ffs_fsck(SYSCTL_HANDLER_ARGS)
                blkno = cmd.value;
                blkcnt = cmd.size;
                blksize = fs->fs_frag - (blkno % fs->fs_frag);
+               trimtype = (blksize < blkcnt) ? STARTFREE : SINGLETON;
                while (blkcnt > 0) {
                        if (blksize > blkcnt)
                                blksize = blkcnt;
                        ffs_blkfree(ump, fs, ump->um_devvp, blkno,
-                           blksize * fs->fs_fsize, UFS_ROOTINO, VDIR, NULL);
+                           blksize * fs->fs_fsize, UFS_ROOTINO,
+                           VDIR, NULL, trimtype);
                        blkno += blksize;
                        blkcnt -= blksize;
                        blksize = fs->fs_frag;
+                       trimtype = (blksize < blkcnt) ? CONTINUEFREE : ENDFREE;
                }
                break;
 

Modified: head/sys/ufs/ffs/ffs_balloc.c
==============================================================================
--- head/sys/ufs/ffs/ffs_balloc.c       Mon Aug  6 20:39:27 2018        (r337395)
+++ head/sys/ufs/ffs/ffs_balloc.c       Mon Aug  6 21:09:11 2018        (r337396)
@@ -553,7 +553,7 @@ fail:
                lbns_remfree++;
 #endif
                ffs_blkfree(ump, fs, ump->um_devvp, *blkp, fs->fs_bsize,
-                   ip->i_number, vp->v_type, NULL);
+                   ip->i_number, vp->v_type, NULL, SINGLETON);
        }
        return (error);
 }
@@ -1147,7 +1147,7 @@ fail:
                lbns_remfree++;
 #endif
                ffs_blkfree(ump, fs, ump->um_devvp, *blkp, fs->fs_bsize,
-                   ip->i_number, vp->v_type, NULL);
+                   ip->i_number, vp->v_type, NULL, SINGLETON);
        }
        return (error);
 }

Modified: head/sys/ufs/ffs/ffs_extern.h
==============================================================================
--- head/sys/ufs/ffs/ffs_extern.h       Mon Aug  6 20:39:27 2018        (r337395)
+++ head/sys/ufs/ffs/ffs_extern.h       Mon Aug  6 21:09:11 2018        (r337396)
@@ -63,7 +63,7 @@ int   ffs_balloc_ufs2(struct vnode *a_vp, off_t a_starto
             struct ucred *a_cred, int a_flags, struct buf **a_bpp);
 int    ffs_blkatoff(struct vnode *, off_t, char **, struct buf **);
 void   ffs_blkfree(struct ufsmount *, struct fs *, struct vnode *,
-           ufs2_daddr_t, long, ino_t, enum vtype, struct workhead *);
+           ufs2_daddr_t, long, ino_t, enum vtype, struct workhead *, int);
 ufs2_daddr_t ffs_blkpref_ufs1(struct inode *, ufs_lbn_t, int, ufs1_daddr_t *);
 ufs2_daddr_t ffs_blkpref_ufs2(struct inode *, ufs_lbn_t, int, ufs2_daddr_t *);
 int    ffs_checkfreefile(struct fs *, struct vnode *, ino_t);
@@ -111,10 +111,27 @@ vfs_vget_t ffs_vget;
 int    ffs_vgetf(struct mount *, ino_t, int, struct vnode **, int);
 void   process_deferred_inactive(struct mount *mp);
 
+/*
+ * Flags to ffs_vgetf
+ */
 #define        FFSV_FORCEINSMQ 0x0001
 
+/*
+ * Flags to ffs_reload
+ */
 #define        FFSR_FORCE      0x0001
 #define        FFSR_UNSUSPEND  0x0002
+
+/*
+ * Trim type to ffs_blkfree - used to help with BIO_DELETE (trim) requests
+ */
+#define        NOTRIM          1       /* never written, so don't call trim for it */
+#define        SINGLETON       2       /* only block being freed, so trim it now */
+#define        STARTFREE       3       /* beginning to free for this inum */
+#define        CONTINUEFREE    4       /* additional block free for this inum */
+#define        ENDFREE         5       /* last block to free for this inum */
+
+#define        MAXTRIMIO       1024    /* maximum expected outstanding trim requests */
 
 extern struct vop_vector ffs_vnodeops1;
 extern struct vop_vector ffs_fifoops1;

Modified: head/sys/ufs/ffs/ffs_inode.c
==============================================================================
--- head/sys/ufs/ffs/ffs_inode.c        Mon Aug  6 20:39:27 2018        (r337395)
+++ head/sys/ufs/ffs/ffs_inode.c        Mon Aug  6 21:09:11 2018        (r337396)
@@ -195,7 +195,7 @@ ffs_truncate(vp, length, flags, cred)
        struct ufsmount *ump;
        int softdeptrunc, journaltrunc;
        int needextclean, extblocks;
-       int offset, size, level, nblocks;
+       int trimtype, firstfree, offset, size, level, nblocks;
        int i, error, allerror, indiroff, waitforupdate;
        off_t osize;
 
@@ -275,7 +275,7 @@ ffs_truncate(vp, length, flags, cred)
                                        continue;
                                ffs_blkfree(ump, fs, ITODEVVP(ip), oldblks[i],
                                    sblksize(fs, osize, i), ip->i_number,
-                                   vp->v_type, NULL);
+                                   vp->v_type, NULL, SINGLETON);
                        }
                }
        }
@@ -523,7 +523,7 @@ ffs_truncate(vp, length, flags, cred)
                                DIP_SET(ip, i_ib[level], 0);
                                ffs_blkfree(ump, fs, ump->um_devvp, bn,
                                    fs->fs_bsize, ip->i_number,
-                                   vp->v_type, NULL);
+                                   vp->v_type, NULL, SINGLETON);
                                blocksreleased += nblocks;
                        }
                }
@@ -534,6 +534,7 @@ ffs_truncate(vp, length, flags, cred)
        /*
         * All whole direct blocks or frags.
         */
+       firstfree = 1;
        for (i = UFS_NDADDR - 1; i > lastblock; i--) {
                long bsize;
 
@@ -542,8 +543,23 @@ ffs_truncate(vp, length, flags, cred)
                        continue;
                DIP_SET(ip, i_db[i], 0);
                bsize = blksize(fs, ip, i);
+               if (firstfree) {
+                       if (i - 1 == lastblock || DIP(ip, i_db[i - 1]) == 0) {
+                               trimtype = SINGLETON;
+                       } else {
+                               trimtype = STARTFREE;
+                               firstfree = 0;
+                       }
+               } else {
+                       if (i - 1 == lastblock || DIP(ip, i_db[i - 1]) == 0) {
+                               trimtype = ENDFREE;
+                               firstfree = 1;
+                       } else {
+                               trimtype = CONTINUEFREE;
+                       }
+               }
                ffs_blkfree(ump, fs, ump->um_devvp, bn, bsize, ip->i_number,
-                   vp->v_type, NULL);
+                   vp->v_type, NULL, trimtype);
                blocksreleased += btodb(bsize);
        }
        if (lastblock < 0)
@@ -575,7 +591,8 @@ ffs_truncate(vp, length, flags, cred)
                         */
                        bn += numfrags(fs, newspace);
                        ffs_blkfree(ump, fs, ump->um_devvp, bn,
-                          oldspace - newspace, ip->i_number, vp->v_type, NULL);
+                          oldspace - newspace, ip->i_number, vp->v_type,
+                          NULL, SINGLETON);
                        blocksreleased += btodb(oldspace - newspace);
                }
        }
@@ -636,7 +653,7 @@ ffs_indirtrunc(ip, lbn, dbn, lastbn, level, countp)
        struct fs *fs;
        struct vnode *vp;
        caddr_t copy = NULL;
-       int i, nblocks, error = 0, allerror = 0;
+       int i, trimtype, nblocks, firstfree, error = 0, allerror = 0;
        ufs2_daddr_t nb, nlbn, last;
        ufs2_daddr_t blkcount, factor, blocksreleased = 0;
        ufs1_daddr_t *bap1 = NULL;
@@ -719,6 +736,7 @@ ffs_indirtrunc(ip, lbn, dbn, lastbn, level, countp)
        /*
         * Recursively free totally unused blocks.
         */
+       firstfree = 1;
        for (i = NINDIR(fs) - 1, nlbn = lbn + 1 - i * factor; i > last;
            i--, nlbn += factor) {
                nb = BAP(ip, i);
@@ -730,8 +748,23 @@ ffs_indirtrunc(ip, lbn, dbn, lastbn, level, countp)
                                allerror = error;
                        blocksreleased += blkcount;
                }
+               if (firstfree) {
+                       if (i - 1 == last || BAP(ip, i - 1) == 0) {
+                               trimtype = SINGLETON;
+                       } else {
+                               trimtype = STARTFREE;
+                               firstfree = 0;
+                       }
+               } else {
+                       if (i - 1 == last || BAP(ip, i - 1) == 0) {
+                               trimtype = ENDFREE;
+                               firstfree = 1;
+                       } else {
+                               trimtype = CONTINUEFREE;
+                       }
+               }
                ffs_blkfree(ITOUMP(ip), fs, ITODEVVP(ip), nb, fs->fs_bsize,
-                   ip->i_number, vp->v_type, NULL);
+                   ip->i_number, vp->v_type, NULL, trimtype);
                blocksreleased += nblocks;
        }
 

Modified: head/sys/ufs/ffs/ffs_snapshot.c
==============================================================================
--- head/sys/ufs/ffs/ffs_snapshot.c     Mon Aug  6 20:39:27 2018        (r337395)
+++ head/sys/ufs/ffs/ffs_snapshot.c     Mon Aug  6 21:09:11 2018        (r337396)
@@ -583,7 +583,7 @@ loop:
                        if (len != 0 && len < fs->fs_bsize) {
                                ffs_blkfree(ump, copy_fs, vp,
                                    DIP(xp, i_db[loc]), len, xp->i_number,
-                                   xvp->v_type, NULL);
+                                   xvp->v_type, NULL, SINGLETON);
                                blkno = DIP(xp, i_db[loc]);
                                DIP_SET(xp, i_db[loc], 0);
                        }
@@ -1265,7 +1265,7 @@ mapacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, expung
                if (blkno == BLK_SNAP)
                        blkno = blkstofrags(fs, lblkno);
                ffs_blkfree(ITOUMP(ip), fs, vp, blkno, fs->fs_bsize, inum,
-                   vp->v_type, NULL);
+                   vp->v_type, NULL, SINGLETON);
        }
        return (0);
 }
@@ -1549,7 +1549,7 @@ mapacct_ufs2(vp, oldblkp, lastblkp, fs, lblkno, expung
                if (blkno == BLK_SNAP)
                        blkno = blkstofrags(fs, lblkno);
                ffs_blkfree(ITOUMP(ip), fs, vp, blkno, fs->fs_bsize, inum,
-                   vp->v_type, NULL);
+                   vp->v_type, NULL, SINGLETON);
        }
        return (0);
 }

Modified: head/sys/ufs/ffs/ffs_softdep.c
==============================================================================
--- head/sys/ufs/ffs/ffs_softdep.c      Mon Aug  6 20:39:27 2018        (r337395)
+++ head/sys/ufs/ffs/ffs_softdep.c      Mon Aug  6 21:09:11 2018        (r337396)
@@ -869,7 +869,7 @@ static      void cancel_allocdirect(struct allocdirectlst *
            struct allocdirect *, struct freeblks *);
 static int check_inode_unwritten(struct inodedep *);
 static int free_inodedep(struct inodedep *);
-static void freework_freeblock(struct freework *);
+static void freework_freeblock(struct freework *, int);
 static void freework_enqueue(struct freework *);
 static int handle_workitem_freeblocks(struct freeblks *, int);
 static int handle_complete_freeblocks(struct freeblks *, int);
@@ -884,7 +884,7 @@ static      struct allocindir *newallocindir(struct inode *
            ufs2_daddr_t, ufs_lbn_t);
 static void handle_workitem_freefrag(struct freefrag *);
 static struct freefrag *newfreefrag(struct inode *, ufs2_daddr_t, long,
-           ufs_lbn_t);
+           ufs_lbn_t, int);
 static void allocdirect_merge(struct allocdirectlst *,
            struct allocdirect *, struct allocdirect *);
 static struct freefrag *allocindir_merge(struct allocindir *,
@@ -5289,7 +5289,22 @@ softdep_setup_allocdirect(ip, off, newblkno, oldblkno,
        KASSERT(MOUNTEDSOFTDEP(mp) != 0,
            ("softdep_setup_allocdirect called on non-softdep filesystem"));
        if (oldblkno && oldblkno != newblkno)
-               freefrag = newfreefrag(ip, oldblkno, oldsize, lbn);
+               /*
+                * The usual case is that a smaller fragment that
+                * was just allocated has been replaced with a bigger
+                * fragment or a full-size block. If it is marked as
+                * B_DELWRI, the current contents have not been written
+                * to disk. It is possible that the block was written
+                * earlier, but very uncommon. If the block has never
+                * been written, there is no need to send a BIO_DELETE
+                * for it when it is freed. The gain from avoiding the
+                * TRIMs for the common case of unwritten blocks far
+                * exceeds the cost of the write amplification for the
+                * uncommon case of failing to send a TRIM for a block
+                * that had been written.
+                */
+               freefrag = newfreefrag(ip, oldblkno, oldsize, lbn,
+                   (bp->b_flags & B_DELWRI) != 0 ? NOTRIM : SINGLETON);
        else
                freefrag = NULL;
 
@@ -5566,11 +5581,12 @@ newjfreefrag(freefrag, ip, blkno, size, lbn)
  * Allocate a new freefrag structure.
  */
 static struct freefrag *
-newfreefrag(ip, blkno, size, lbn)
+newfreefrag(ip, blkno, size, lbn, trimtype)
        struct inode *ip;
        ufs2_daddr_t blkno;
        long size;
        ufs_lbn_t lbn;
+       int trimtype;
 {
        struct freefrag *freefrag;
        struct ufsmount *ump;
@@ -5591,6 +5607,7 @@ newfreefrag(ip, blkno, size, lbn)
        freefrag->ff_vtype = ITOV(ip)->v_type;
        freefrag->ff_blkno = blkno;
        freefrag->ff_fragsize = size;
+       freefrag->ff_trimtype = trimtype;
 
        if (MOUNTEDSUJ(UFSTOVFS(ump))) {
                freefrag->ff_jdep = (struct worklist *)
@@ -5636,7 +5653,8 @@ handle_workitem_freefrag(freefrag)
        }
        FREE_LOCK(ump);
        ffs_blkfree(ump, ump->um_fs, ump->um_devvp, freefrag->ff_blkno,
-          freefrag->ff_fragsize, freefrag->ff_inum, freefrag->ff_vtype, &wkhd);
+          freefrag->ff_fragsize, freefrag->ff_inum, freefrag->ff_vtype, &wkhd,
+          freefrag->ff_trimtype);
        ACQUIRE_LOCK(ump);
        WORKITEM_FREE(freefrag, D_FREEFRAG);
        FREE_LOCK(ump);
@@ -5676,7 +5694,22 @@ softdep_setup_allocext(ip, off, newblkno, oldblkno, ne
 
        lbn = bp->b_lblkno;
        if (oldblkno && oldblkno != newblkno)
-               freefrag = newfreefrag(ip, oldblkno, oldsize, lbn);
+               /*
+                * The usual case is that a smaller fragment that
+                * was just allocated has been replaced with a bigger
+                * fragment or a full-size block. If it is marked as
+                * B_DELWRI, the current contents have not been written
+                * to disk. It is possible that the block was written
+                * earlier, but very uncommon. If the block has never
+                * been written, there is no need to send a BIO_DELETE
+                * for it when it is freed. The gain from avoiding the
+                * TRIMs for the common case of unwritten blocks far
+                * exceeds the cost of the write amplification for the
+                * uncommon case of failing to send a TRIM for a block
+                * that had been written.
+                */
+               freefrag = newfreefrag(ip, oldblkno, oldsize, lbn,
+                   (bp->b_flags & B_DELWRI) != 0 ? NOTRIM : SINGLETON);
        else
                freefrag = NULL;
 
@@ -5789,7 +5822,8 @@ newallocindir(ip, ptrno, newblkno, oldblkno, lbn)
        struct jnewblk *jnewblk;
 
        if (oldblkno)
-               freefrag = newfreefrag(ip, oldblkno, ITOFS(ip)->fs_bsize, lbn);
+               freefrag = newfreefrag(ip, oldblkno, ITOFS(ip)->fs_bsize, lbn,
+                   SINGLETON);
        else
                freefrag = NULL;
        ACQUIRE_LOCK(ITOUMP(ip));
@@ -7724,8 +7758,9 @@ free_inodedep(inodedep)
  * in memory immediately.
  */
 static void
-freework_freeblock(freework)
+freework_freeblock(freework, trimtype)
        struct freework *freework;
+       int trimtype;
 {
        struct freeblks *freeblks;
        struct jnewblk *jnewblk;
@@ -7779,10 +7814,10 @@ freework_freeblock(freework)
        FREE_LOCK(ump);
        freeblks_free(ump, freeblks, btodb(bsize));
        CTR4(KTR_SUJ,
-           "freework_freeblock: ino %d blkno %jd lbn %jd size %ld",
+           "freework_freeblock: ino %jd blkno %jd lbn %jd size %d",
            freeblks->fb_inum, freework->fw_blkno, freework->fw_lbn, bsize);
        ffs_blkfree(ump, fs, freeblks->fb_devvp, freework->fw_blkno, bsize,
-           freeblks->fb_inum, freeblks->fb_vtype, &wkhd);
+           freeblks->fb_inum, freeblks->fb_vtype, &wkhd, trimtype);
        ACQUIRE_LOCK(ump);
        /*
         * The jnewblk will be discarded and the bits in the map never
@@ -7835,7 +7870,7 @@ handle_workitem_indirblk(freework)
                return;
        }
        if (freework->fw_off == NINDIR(fs)) {
-               freework_freeblock(freework);
+               freework_freeblock(freework, SINGLETON);
                return;
        }
        freework->fw_state |= INPROGRESS;
@@ -7889,16 +7924,19 @@ handle_workitem_freeblocks(freeblks, flags)
        struct freeblks *freeblks;
        int flags;
 {
-       struct freework *freework;
+       struct freework *freework, *prevfreework;
        struct newblk *newblk;
        struct allocindir *aip;
        struct ufsmount *ump;
        struct worklist *wk;
+       int trimtype;
 
        KASSERT(LIST_EMPTY(&freeblks->fb_jblkdephd),
            ("handle_workitem_freeblocks: Journal entries not written."));
        ump = VFSTOUFS(freeblks->fb_list.wk_mp);
        ACQUIRE_LOCK(ump);
+       prevfreework = NULL;
+       trimtype = 0;
        while ((wk = LIST_FIRST(&freeblks->fb_freeworkhd)) != NULL) {
                WORKLIST_REMOVE(wk);
                switch (wk->wk_type) {
@@ -7932,16 +7970,26 @@ handle_workitem_freeblocks(freeblks, flags)
 
                case D_FREEWORK:
                        freework = WK_FREEWORK(wk);
-                       if (freework->fw_lbn <= -UFS_NDADDR)
+                       if (freework->fw_lbn <= -UFS_NDADDR) {
                                handle_workitem_indirblk(freework);
-                       else
-                               freework_freeblock(freework);
+                               continue;
+                       } else if (prevfreework == NULL) {
+                               trimtype = SINGLETON;
+                       } else if (trimtype == SINGLETON) {
+                               freework_freeblock(prevfreework, STARTFREE);
+                               trimtype = ENDFREE;
+                       } else {
+                               freework_freeblock(prevfreework, CONTINUEFREE);
+                       }
+                       prevfreework = freework;
                        continue;
                default:
                        panic("handle_workitem_freeblocks: Unknown type %s",
                            TYPENAME(wk->wk_type));
                }
        }
+       if (prevfreework != NULL)
+               freework_freeblock(prevfreework, trimtype);
        if (freeblks->fb_ref != 0) {
                freeblks->fb_state &= ~INPROGRESS;
                wake_worklist(&freeblks->fb_list);
@@ -8080,13 +8128,8 @@ indir_trunc(freework, dbn, lbn)
        ufs1_daddr_t *bap1;
        ufs2_daddr_t nb, nnb, *bap2;
        ufs_lbn_t lbnadd, nlbn;
-       int i, nblocks, ufs1fmt;
-       int freedblocks;
-       int goingaway;
-       int freedeps;
-       int needj;
-       int level;
-       int cnt;
+       int nblocks, ufs1fmt, firstfree, trimtype, freedblocks;
+       int goingaway, freedeps, needj, level, cnt, i;
 
        freeblks = freework->fw_freeblks;
        ump = VFSTOUFS(freeblks->fb_list.wk_mp);
@@ -8180,6 +8223,7 @@ indir_trunc(freework, dbn, lbn)
         * arranges for the current level to be freed when subordinates
         * are free when journaling.
         */
+       firstfree = 1;
        for (i = freework->fw_off; i < NINDIR(fs); i++, nb = nnb) {
                if (i != NINDIR(fs) - 1) {
                        if (ufs1fmt)
@@ -8215,11 +8259,26 @@ indir_trunc(freework, dbn, lbn)
                                freedeps++;
                        }
                        CTR3(KTR_SUJ,
-                           "indir_trunc: ino %d blkno %jd size %ld",
+                           "indir_trunc: ino %jd blkno %jd size %d",
                            freeblks->fb_inum, nb, fs->fs_bsize);
+                       if (firstfree) {
+                               if (i == NINDIR(fs) - 1 || nnb == 0) {
+                                       trimtype = SINGLETON;
+                               } else {
+                                       trimtype = STARTFREE;
+                                       firstfree = 0;
+                               }
+                       } else {
+                               if (i == NINDIR(fs) - 1 || nnb == 0) {
+                                       trimtype = ENDFREE;
+                                       firstfree = 1;
+                               } else {
+                                       trimtype = CONTINUEFREE;
+                               }
+                       }
                        ffs_blkfree(ump, fs, freeblks->fb_devvp, nb,
                            fs->fs_bsize, freeblks->fb_inum,
-                           freeblks->fb_vtype, &wkhd);
+                           freeblks->fb_vtype, &wkhd, trimtype);
                }
        }
        if (goingaway) {
@@ -8244,7 +8303,7 @@ indir_trunc(freework, dbn, lbn)
                if (level == 0)
                        freeblks->fb_cgwait += freedeps;
                if (freework->fw_ref == 0)
-                       freework_freeblock(freework);
+                       freework_freeblock(freework, SINGLETON);
                FREE_LOCK(ump);
                return;
        }
@@ -8253,10 +8312,10 @@ indir_trunc(freework, dbn, lbn)
         */
        dbn = dbtofsb(fs, dbn);
        CTR3(KTR_SUJ,
-           "indir_trunc 2: ino %d blkno %jd size %ld",
+           "indir_trunc 2: ino %jd blkno %jd size %d",
            freeblks->fb_inum, dbn, fs->fs_bsize);
        ffs_blkfree(ump, fs, freeblks->fb_devvp, dbn, fs->fs_bsize,
-           freeblks->fb_inum, freeblks->fb_vtype, NULL);
+           freeblks->fb_inum, freeblks->fb_vtype, NULL, SINGLETON);
        /* Non SUJ softdep does single-threaded truncations. */
        if (freework->fw_blkno == dbn) {
                freework->fw_state |= ALLCOMPLETE;

Modified: head/sys/ufs/ffs/ffs_vfsops.c
==============================================================================
--- head/sys/ufs/ffs/ffs_vfsops.c       Mon Aug  6 20:39:27 2018        (r337395)
+++ head/sys/ufs/ffs/ffs_vfsops.c       Mon Aug  6 21:09:11 2018        (r337396)
@@ -978,6 +978,8 @@ ffs_mountfs(devvp, mp, td)
                            taskqueue_thread_enqueue, &ump->um_trim_tq);
                        taskqueue_start_threads(&ump->um_trim_tq, 1, PVFS,
                            "%s trim", mp->mnt_stat.f_mntonname);
+                       ump->um_trimhash = hashinit(MAXTRIMIO, M_TRIM,
+                           &ump->um_trimlisthashsize);
                }
        }
 
@@ -1256,6 +1258,7 @@ ffs_unmount(mp, mntflags)
                        pause("ufsutr", hz);
                taskqueue_drain_all(ump->um_trim_tq);
                taskqueue_free(ump->um_trim_tq);
+               free (ump->um_trimhash, M_TRIM);
        }
        g_topology_lock();
        if (ump->um_fsckpid > 0) {

Modified: head/sys/ufs/ffs/softdep.h
==============================================================================
--- head/sys/ufs/ffs/softdep.h  Mon Aug  6 20:39:27 2018        (r337395)
+++ head/sys/ufs/ffs/softdep.h  Mon Aug  6 21:09:11 2018        (r337396)
@@ -557,6 +557,7 @@ struct freefrag {
        long    ff_fragsize;            /* size of fragment being deleted */
        ino_t   ff_inum;                /* owning inode number */
        enum    vtype ff_vtype;         /* owning inode's file type */
+       int     ff_trimtype;            /* trim status when deleted */
 };
 
 /*

Modified: head/sys/ufs/ufs/ufsmount.h
==============================================================================
--- head/sys/ufs/ufs/ufsmount.h Mon Aug  6 20:39:27 2018        (r337395)
+++ head/sys/ufs/ufs/ufsmount.h Mon Aug  6 21:09:11 2018        (r337396)
@@ -47,6 +47,7 @@ struct ufs_args {
 
 #ifdef MALLOC_DECLARE
 MALLOC_DECLARE(M_UFSMNT);
+MALLOC_DECLARE(M_TRIM);
 #endif
 
 struct buf;
@@ -63,6 +64,7 @@ struct inodedep;
 
 TAILQ_HEAD(inodedeplst, inodedep);
 LIST_HEAD(bmsafemaphd, bmsafemap);
+LIST_HEAD(trimlist_hashhead, ffs_blkfree_trim_params);
 
 /*
  * This structure describes the UFS specific mount structure data.
@@ -101,6 +103,8 @@ struct ufsmount {
        u_int   um_flags;                       /* (i) filesystem flags */
        u_int   um_trim_inflight;               /* (a) outstanding trim count */
        struct  taskqueue *um_trim_tq;          /* (c) trim request queue */
+       struct  trimlist_hashhead *um_trimhash; /* (i) trimlist hash table */
+       u_long  um_trimlisthashsize;            /* (i) trim hash table size-1 */
                                                /* (c) - below function ptrs */
        int     (*um_balloc)(struct vnode *, off_t, int, struct ucred *,
                    int, struct buf **);
_______________________________________________
svn-src-all@freebsd.org mailing list
https://lists.freebsd.org/mailman/listinfo/svn-src-all
To unsubscribe, send any mail to "svn-src-all-unsubscribe@freebsd.org"

Reply via email to