Attached is final version of the patch. It uses MEDIA prefix for the
flags, but keeps the FUA/DPO - i.e names are B_MEDIA_FUA, B_MEDIA_DPO.
For wapbl it introduces a sysctl to use the feature, default is off
for now.

I plan to commit this later in the week or early next week, unless
there are some serious objections.

Jaromir

2017-03-05 23:22 GMT+01:00 Jaromír Doleček <jaromir.dole...@gmail.com>:
> Here is an updated patch. It was updated to check for the FUA support
> for SCSI, using the MODE SENSE device-specific flag. Code was tested
> with QEMU emulated bha(4) and nvme. WAPBL code was updated to use the
> flag. It keeps the flag naming for now.
>
> In the patch, WAPBL sets the flag for journal writes, and also for the
> metadata buffer for bawrite() call after journal commit.
>
> There is possible layer violation for metadata write - b_flags are
> supposed to be set by owner of the buffer. Not sure how strict we
> want/need to be there - perhaps introduce another flag field? Also the
> flag
> probably needs to be unset in biodone hook, so that the code
> guarantees the buffer in buffer cache doesn't accidentaly keep it over
> to another I/O.
>
> Jaromir
? dev/ic/TODO.nvme
Index: sys/buf.h
===================================================================
RCS file: /cvsroot/src/sys/sys/buf.h,v
retrieving revision 1.126
diff -u -p -r1.126 buf.h
--- sys/buf.h   26 Dec 2016 23:12:33 -0000      1.126
+++ sys/buf.h   27 Mar 2017 22:31:22 -0000
@@ -198,16 +198,21 @@ struct buf {
 #define        B_RAW           0x00080000      /* Set by physio for raw 
transfers. */
 #define        B_READ          0x00100000      /* Read buffer. */
 #define        B_DEVPRIVATE    0x02000000      /* Device driver private flag. 
*/
+#define        B_MEDIA_FUA     0x08000000      /* Set Force Unit Access for 
media. */
+#define        B_MEDIA_DPO     0x10000000      /* Set Disable Page Out for 
media. */
 
 #define BUF_FLAGBITS \
     "\20\1AGE\3ASYNC\4BAD\5BUSY\10DELWRI" \
     "\12DONE\13COWDONE\15GATHERED\16INVAL\17LOCKED\20NOCACHE" \
-    "\23PHYS\24RAW\25READ\32DEVPRIVATE\33VFLUSH"
+    "\23PHYS\24RAW\25READ\32DEVPRIVATE\33VFLUSH\34MEDIAFUA\35MEDIADPO"
 
 /* Avoid weird code due to B_WRITE being a "pseudo flag" */
 #define BUF_ISREAD(bp) (((bp)->b_flags & B_READ) == B_READ)
 #define BUF_ISWRITE(bp)        (((bp)->b_flags & B_READ) == B_WRITE)
 
+/* Media flags, to be passed for nested I/O */
+#define B_MEDIA_FLAGS  (B_MEDIA_FUA|B_MEDIA_DPO)
+
 /*
  * This structure describes a clustered I/O.  It is stored in the b_saveaddr
  * field of the buffer on which I/O is done.  At I/O completion, cluster
Index: sys/dkio.h
===================================================================
RCS file: /cvsroot/src/sys/sys/dkio.h,v
retrieving revision 1.22
diff -u -p -r1.22 dkio.h
--- sys/dkio.h  8 Dec 2015 20:36:15 -0000       1.22
+++ sys/dkio.h  27 Mar 2017 22:31:22 -0000
@@ -85,6 +85,8 @@
 #define        DKCACHE_RCHANGE 0x000100 /* read enable is changeable */
 #define        DKCACHE_WCHANGE 0x000200 /* write enable is changeable */
 #define        DKCACHE_SAVE    0x010000 /* cache parameters are savable/save 
them */
+#define        DKCACHE_FUA     0x020000 /* Force Unit Access supported */
+#define        DKCACHE_DPO     0x040000 /* Disable Page Out supported */
 
                /* sync disk cache */
 #define        DIOCCACHESYNC   _IOW('d', 118, int)     /* sync cache (force?) 
*/
Index: kern/vfs_bio.c
===================================================================
RCS file: /cvsroot/src/sys/kern/vfs_bio.c,v
retrieving revision 1.271
diff -u -p -r1.271 vfs_bio.c
--- kern/vfs_bio.c      21 Mar 2017 10:46:49 -0000      1.271
+++ kern/vfs_bio.c      27 Mar 2017 22:31:22 -0000
@@ -2027,7 +2027,7 @@ nestiobuf_iodone(buf_t *bp)
 void
 nestiobuf_setup(buf_t *mbp, buf_t *bp, int offset, size_t size)
 {
-       const int b_read = mbp->b_flags & B_READ;
+       const int b_pass = mbp->b_flags & (B_READ|B_MEDIA_FLAGS);
        struct vnode *vp = mbp->b_vp;
 
        KASSERT(mbp->b_bcount >= offset + size);
@@ -2035,14 +2035,14 @@ nestiobuf_setup(buf_t *mbp, buf_t *bp, i
        bp->b_dev = mbp->b_dev;
        bp->b_objlock = mbp->b_objlock;
        bp->b_cflags = BC_BUSY;
-       bp->b_flags = B_ASYNC | b_read;
+       bp->b_flags = B_ASYNC | b_pass;
        bp->b_iodone = nestiobuf_iodone;
        bp->b_data = (char *)mbp->b_data + offset;
        bp->b_resid = bp->b_bcount = size;
        bp->b_bufsize = bp->b_bcount;
        bp->b_private = mbp;
        BIO_COPYPRIO(bp, mbp);
-       if (!b_read && vp != NULL) {
+       if (BUF_ISWRITE(bp) && vp != NULL) {
                mutex_enter(vp->v_interlock);
                vp->v_numoutput++;
                mutex_exit(vp->v_interlock);
Index: kern/vfs_wapbl.c
===================================================================
RCS file: /cvsroot/src/sys/kern/vfs_wapbl.c,v
retrieving revision 1.92
diff -u -p -r1.92 vfs_wapbl.c
--- kern/vfs_wapbl.c    17 Mar 2017 03:19:46 -0000      1.92
+++ kern/vfs_wapbl.c    27 Mar 2017 22:31:23 -0000
@@ -71,6 +71,7 @@ __KERNEL_RCSID(0, "$NetBSD: vfs_wapbl.c,
 static struct sysctllog *wapbl_sysctl;
 static int wapbl_flush_disk_cache = 1;
 static int wapbl_verbose_commit = 0;
+static int wapbl_use_fua = 0;  /* switched off by default for now */
 
 static inline size_t wapbl_space_free(size_t, off_t, off_t);
 
@@ -230,6 +231,16 @@ struct wapbl {
        u_char *wl_buffer;      /* l:   buffer for wapbl_buffered_write() */
        daddr_t wl_buffer_dblk; /* l:   buffer disk block address */
        size_t wl_buffer_used;  /* l:   buffer current use */
+
+       int wl_dkcache;         /* r:   disk cache flags */
+#define WAPBL_USE_FUA(wl)      \
+               (wapbl_use_fua && ISSET((wl)->wl_dkcache, DKCACHE_FUA))
+#define WAPBL_JFLAGS(wl)       \
+               (WAPBL_USE_FUA(wl) ? (wl)->wl_jwrite_flags : 0)
+#define WAPBL_MFLAGS(wl)       \
+               (WAPBL_USE_FUA(wl) ? (wl)->wl_mwrite_flags : 0)
+       int wl_jwrite_flags;    /* r:   journal write flags */
+       int wl_mwrite_flags;    /* r:   metadata write flags */
 };
 
 #ifdef WAPBL_DEBUG_PRINT
@@ -281,6 +292,8 @@ static void wapbl_deallocation_free(stru
 static void wapbl_evcnt_init(struct wapbl *);
 static void wapbl_evcnt_free(struct wapbl *);
 
+static void wapbl_dkcache_init(struct wapbl *);
+
 #if 0
 int wapbl_replay_verify(struct wapbl_replay *, struct vnode *);
 #endif
@@ -335,6 +348,18 @@ wapbl_sysctl_init(void)
                       SYSCTL_DESCR("show time and size of wapbl log commits"),
                       NULL, 0, &wapbl_verbose_commit, 0,
                       CTL_CREATE, CTL_EOL);
+       if (rv)
+               return rv;
+
+       rv = sysctl_createv(&wapbl_sysctl, 0, &rnode, &cnode,
+                      CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
+                      CTLTYPE_INT, "use_fua",
+                      SYSCTL_DESCR("use FUA/DPO instead of cash flush if 
available"),
+                      NULL, 0, &wapbl_use_fua, 0,
+                      CTL_CREATE, CTL_EOL);
+       if (rv)
+               return rv;
+
        return rv;
 }
 
@@ -391,6 +416,30 @@ wapbl_evcnt_free(struct wapbl *wl)
        evcnt_detach(&wl->wl_ev_cacheflush);
 }
 
+static void
+wapbl_dkcache_init(struct wapbl *wl)
+{
+       int error;
+
+       /* Get disk cache flags */
+       error = VOP_IOCTL(wl->wl_devvp, DIOCGCACHE, &wl->wl_dkcache,
+           FWRITE, FSCRED);
+       if (error) {
+               /* behave as if there was a write cache */
+               wl->wl_dkcache = DKCACHE_WRITE;
+       }
+
+       /* Use FUA instead of cache flush if available */
+       if (ISSET(wl->wl_dkcache, DKCACHE_FUA)) {
+               wl->wl_jwrite_flags |= B_MEDIA_FUA;
+               wl->wl_mwrite_flags |= B_MEDIA_FUA;
+       }
+
+       /* Use DPO for journal writes if available */
+       if (ISSET(wl->wl_dkcache, DKCACHE_DPO))
+               wl->wl_jwrite_flags |= B_MEDIA_DPO;
+}
+
 static int
 wapbl_start_flush_inodes(struct wapbl *wl, struct wapbl_replay *wr)
 {
@@ -563,6 +612,8 @@ wapbl_start(struct wapbl ** wlp, struct 
 
        wapbl_evcnt_init(wl);
 
+       wapbl_dkcache_init(wl);
+
        /* Initialize the commit header */
        {
                struct wapbl_wc_header *wc;
@@ -809,7 +860,6 @@ wapbl_doio(void *data, size_t len, struc
        struct buf *bp;
        int error;
 
-       KASSERT((flags & ~(B_WRITE | B_READ)) == 0);
        KASSERT(devvp->v_type == VBLK);
 
        if ((flags & (B_WRITE | B_READ)) == B_WRITE) {
@@ -823,7 +873,7 @@ wapbl_doio(void *data, size_t len, struc
 
        bp = getiobuf(devvp, true);
        bp->b_flags = flags;
-       bp->b_cflags = BC_BUSY; /* silly & dubious */
+       bp->b_cflags = BC_BUSY; /* mandatory, asserted by biowait() */
        bp->b_dev = devvp->v_rdev;
        bp->b_data = data;
        bp->b_bufsize = bp->b_resid = bp->b_bcount = len;
@@ -898,7 +948,8 @@ wapbl_buffered_flush(struct wapbl *wl)
                return 0;
 
        error = wapbl_doio(wl->wl_buffer, wl->wl_buffer_used,
-           wl->wl_devvp, wl->wl_buffer_dblk, B_WRITE);
+           wl->wl_devvp, wl->wl_buffer_dblk,
+           B_WRITE | WAPBL_JFLAGS(wl));
        wl->wl_buffer_used = 0;
 
        wl->wl_ev_journalwrite.ev_count++;
@@ -948,12 +999,10 @@ wapbl_buffered_write(void *data, size_t 
        if (len >= resid) {
                memcpy(wl->wl_buffer + wl->wl_buffer_used, data, resid);
                wl->wl_buffer_used += resid;
-               error = wapbl_doio(wl->wl_buffer, wl->wl_buffer_used,
-                   wl->wl_devvp, wl->wl_buffer_dblk, B_WRITE);
+               error = wapbl_buffered_flush(wl);
                data = (uint8_t *)data + resid;
                len -= resid;
                wl->wl_buffer_dblk = pbn + btodb(resid);
-               wl->wl_buffer_used = 0;
                if (error)
                        return error;
        }
@@ -1500,6 +1549,13 @@ wapbl_biodone(struct buf *bp)
        }
 
        /*
+        * Make sure that the buf doesn't retain the media flags, so that
+        * e.g. wapbl_use_fua has immediate effect on any following I/O.
+        * The flags will be set again if needed by another I/O.
+        */
+       bp->b_flags &= ~B_MEDIA_FLAGS;
+
+       /*
         * Release the buffer here. wapbl_flush() may wait for the
         * log to become empty and we better unbusy the buffer before
         * wapbl_flush() returns.
@@ -1754,6 +1810,10 @@ wapbl_flush(struct wapbl *wl, int waitfo
                }
                bp->b_iodone = wapbl_biodone;
                bp->b_private = we;
+
+               /* make sure the block is saved sync when FUA in use */
+               bp->b_flags |= WAPBL_MFLAGS(wl);
+
                bremfree(bp);
                wapbl_remove_buf_locked(wl, bp);
                mutex_exit(&wl->wl_mtx);
@@ -2201,7 +2261,8 @@ wapbl_cache_sync(struct wapbl *wl, const
        int force = 1;
        int error;
 
-       if (!wapbl_flush_disk_cache) {
+       /* Skip full cache sync if disabled, or when using FUA */
+       if (!wapbl_flush_disk_cache || WAPBL_USE_FUA(wl)) {
                return 0;
        }
        if (verbose) {
Index: dev/ic/ld_nvme.c
===================================================================
RCS file: /cvsroot/src/sys/dev/ic/ld_nvme.c,v
retrieving revision 1.14
diff -u -p -r1.14 ld_nvme.c
--- dev/ic/ld_nvme.c    28 Feb 2017 20:55:09 -0000      1.14
+++ dev/ic/ld_nvme.c    27 Mar 2017 22:31:23 -0000
@@ -152,11 +152,15 @@ static int
 ld_nvme_start(struct ld_softc *ld, struct buf *bp)
 {
        struct ld_nvme_softc *sc = device_private(ld->sc_dv);
+       int flags = BUF_ISWRITE(bp) ? 0 : NVME_NS_CTX_F_READ;
+
+       if (bp->b_flags & B_MEDIA_FUA)
+               flags |= NVME_NS_CTX_F_FUA;
 
        return nvme_ns_dobio(sc->sc_nvme, sc->sc_nsid, sc,
            bp, bp->b_data, bp->b_bcount,
            sc->sc_ld.sc_secsize, bp->b_rawblkno,
-           BUF_ISWRITE(bp) ? 0 : NVME_NS_CTX_F_READ,
+           flags,
            ld_nvme_biodone);
 }
 
@@ -221,7 +225,11 @@ ld_nvme_getcache(struct ld_softc *ld, in
        int error;
        struct ld_nvme_softc *sc = device_private(ld->sc_dv);
 
-       *addr = 0;
+       /*
+        * DPO not supported, Dataset Management (DSM) field doesn't specify
+        * the same semantics.
+        */ 
+       *addr = DKCACHE_FUA;
 
        if (!nvme_has_volatile_write_cache(sc->sc_nvme)) {
                /* cache simply not present */
Index: dev/ic/nvme.c
===================================================================
RCS file: /cvsroot/src/sys/dev/ic/nvme.c,v
retrieving revision 1.25
diff -u -p -r1.25 nvme.c
--- dev/ic/nvme.c       28 Feb 2017 20:53:50 -0000      1.25
+++ dev/ic/nvme.c       27 Mar 2017 22:31:23 -0000
@@ -727,6 +727,9 @@ nvme_ns_io_fill(struct nvme_queue *q, st
 
        htolem64(&sqe->slba, ccb->nnc_blkno);
 
+       if (ISSET(ccb->nnc_flags, NVME_NS_CTX_F_FUA))
+               htolem16(&sqe->ioflags, NVM_SQE_IO_FUA);
+
        /* guaranteed by upper layers, but check just in case */
        KASSERT((ccb->nnc_datasize % ccb->nnc_secsize) == 0);
        htolem16(&sqe->nlb, (ccb->nnc_datasize / ccb->nnc_secsize) - 1);
Index: dev/ic/nvmevar.h
===================================================================
RCS file: /cvsroot/src/sys/dev/ic/nvmevar.h,v
retrieving revision 1.12
diff -u -p -r1.12 nvmevar.h
--- dev/ic/nvmevar.h    28 Feb 2017 20:53:50 -0000      1.12
+++ dev/ic/nvmevar.h    27 Mar 2017 22:31:23 -0000
@@ -64,6 +64,7 @@ struct nvme_ccb {
        uint16_t        nnc_flags;
 #define        NVME_NS_CTX_F_READ      __BIT(0)
 #define        NVME_NS_CTX_F_POLL      __BIT(1)
+#define        NVME_NS_CTX_F_FUA       __BIT(2)
 
        struct buf      *nnc_buf;
        daddr_t         nnc_blkno;
Index: dev/scsipi/scsipi_disk.h
===================================================================
RCS file: /cvsroot/src/sys/dev/scsipi/scsipi_disk.h,v
retrieving revision 1.21
diff -u -p -r1.21 scsipi_disk.h
--- dev/scsipi/scsipi_disk.h    25 Dec 2007 18:33:42 -0000      1.21
+++ dev/scsipi/scsipi_disk.h    27 Mar 2017 22:31:23 -0000
@@ -62,9 +62,10 @@ struct scsipi_rw_10 {
        u_int8_t opcode;
        u_int8_t byte2;
 #define        SRWB_RELADDR    0x01    /* obsolete */
-#define        SRWB_FUA_NV     0x02    /* force unit access non-volatile cache 
*/
-#define        SRWB_FUA        0x08    /* force unit access */
-#define        SRWB_DPO        0x10    /* disable page out */
+#define        SRWB_FUA_NV     0x02    /* force unit access non-volatile cache 
(SCSI-3) */
+#define        SRWB_RESV2      0x04    /* reserved (SCSI-2) */
+#define        SRWB_FUA        0x08    /* force unit access volatile cache 
(SCSI-2) */
+#define        SRWB_DPO        0x10    /* disable page out (SCSI-2) */
 #define        SRWB_PROTECT(x) ((x) << 5)
        u_int8_t addr[4];
        u_int8_t reserved;
@@ -159,4 +160,7 @@ struct scsipi_capacity_descriptor {
 #define        SCSIPI_CAP_DESC_CODE_FORMATTED          0x2
 #define        SCSIPI_CAP_DESC_CODE_NONE               0x3
 
+/* defines for the device specific byte in the mode select/sense header */
+#define        SMH_DSP_DPOFUA          0x10
+
 #endif /* _DEV_SCSIPI_SCSIPI_DISK_H_ */
Index: dev/scsipi/sd.c
===================================================================
RCS file: /cvsroot/src/sys/dev/scsipi/sd.c,v
retrieving revision 1.322
diff -u -p -r1.322 sd.c
--- dev/scsipi/sd.c     21 Dec 2016 21:28:30 -0000      1.322
+++ dev/scsipi/sd.c     27 Mar 2017 22:31:23 -0000
@@ -654,6 +654,7 @@ sd_diskstart(device_t dev, struct buf *b
        struct scsipi_generic *cmdp;
        struct scsipi_xfer *xs;
        int error, flags, nblks, cmdlen;
+       int cdb_flags;
 
        mutex_enter(chan_mtx(chan));
 
@@ -698,12 +699,27 @@ sd_diskstart(device_t dev, struct buf *b
                nblks = howmany(bp->b_bcount, sd->params.blksize);
 
        /*
+        * Pass FUA and/or DPO if requested. Must be done before CDB
+        * selection, as 6-byte CDB doesn't support the flags.
+        */
+       cdb_flags = 0;
+
+       if (bp->b_flags & B_MEDIA_FUA)
+               cdb_flags |= SRWB_FUA;
+
+       if (bp->b_flags & B_MEDIA_DPO)
+               cdb_flags |= SRWB_DPO;
+
+       /*
         * Fill out the scsi command.  Use the smallest CDB possible
-        * (6-byte, 10-byte, or 16-byte).
+        * (6-byte, 10-byte, or 16-byte). If we need FUA or DPO,
+        * need to use 10-byte or bigger, as the 6-byte doesn't support
+        * the flags.
         */
        if (((bp->b_rawblkno & 0x1fffff) == bp->b_rawblkno) &&
            ((nblks & 0xff) == nblks) &&
-           !(periph->periph_quirks & PQUIRK_ONLYBIG)) {
+           !(periph->periph_quirks & PQUIRK_ONLYBIG) &&
+           !cdb_flags) {
                /* 6-byte CDB */
                memset(&cmd_small, 0, sizeof(cmd_small));
                cmd_small.opcode = (bp->b_flags & B_READ) ?
@@ -732,6 +748,9 @@ sd_diskstart(device_t dev, struct buf *b
                cmdp = (struct scsipi_generic *)&cmd16;
        }
 
+       if (cdb_flags)
+               cmdp->bytes[0] = cdb_flags;
+
        /*
         * Figure out what flags to use.
         */
@@ -1796,7 +1815,9 @@ sd_getcache(struct sd_softc *sd, int *bi
        int error, bits = 0;
        int big;
        union scsi_disk_pages *pages;
+       uint8_t dev_spec;
 
+       /* only SCSI-2 and later supported */
        if (periph->periph_version < 2)
                return (EOPNOTSUPP);
 
@@ -1806,10 +1827,13 @@ sd_getcache(struct sd_softc *sd, int *bi
        if (error)
                return (error);
 
-       if (big)
+       if (big) {
                pages = (void *)(&scsipi_sense.header.big + 1);
-       else
+               dev_spec = scsipi_sense.header.big.dev_spec;
+       } else {
                pages = (void *)(&scsipi_sense.header.small + 1);
+               dev_spec = scsipi_sense.header.small.dev_spec;
+       }
 
        if ((pages->caching_params.flags & CACHING_RCD) == 0)
                bits |= DKCACHE_READ;
@@ -1818,6 +1842,13 @@ sd_getcache(struct sd_softc *sd, int *bi
        if (pages->caching_params.pg_code & PGCODE_PS)
                bits |= DKCACHE_SAVE;
 
+       /*
+        * Support for FUA/DPO, defined starting with SCSI-2. Use only
+        * if device claims to support it, according to the MODE SENSE.
+        */
+       if (ISSET(dev_spec, SMH_DSP_DPOFUA))
+               bits |= DKCACHE_FUA | DKCACHE_DPO;
+
        memset(&scsipi_sense, 0, sizeof(scsipi_sense));
        error = sd_mode_sense(sd, SMS_DBD, &scsipi_sense,
            sizeof(scsipi_sense.pages.caching_params),

Reply via email to