Here is an updated patch. It now checks for FUA support on SCSI
devices, using the MODE SENSE device-specific flag. The code was tested
with QEMU-emulated bha(4) and nvme. The WAPBL code was updated to use
the flag. The flag naming is kept for now.

In the patch, WAPBL sets the flag for journal writes, and also on the
metadata buffers passed to bawrite() after the journal commit.

There is a possible layer violation for the metadata write - b_flags are
supposed to be set by the owner of the buffer. Not sure how strict we
want/need to be there - perhaps introduce another flag field? Also, the
flag probably needs to be unset in the biodone hook, so that the code
guarantees a buffer in the buffer cache doesn't accidentally carry it
over to another I/O.

Jaromir
? dev/ic/TODO.nvme
Index: sys/buf.h
===================================================================
RCS file: /cvsroot/src/sys/sys/buf.h,v
retrieving revision 1.126
diff -u -p -r1.126 buf.h
--- sys/buf.h   26 Dec 2016 23:12:33 -0000      1.126
+++ sys/buf.h   5 Mar 2017 22:08:35 -0000
@@ -198,11 +198,13 @@ struct buf {
 #define        B_RAW           0x00080000      /* Set by physio for raw transfers. */
 #define        B_READ          0x00100000      /* Read buffer. */
 #define        B_DEVPRIVATE    0x02000000      /* Device driver private flag. */
+#define        B_FUA           0x08000000      /* Force Unit Access flag (mandatory). */
+#define        B_DPO           0x10000000      /* Disable Page Out flag (advisory). */
 
 #define BUF_FLAGBITS \
     "\20\1AGE\3ASYNC\4BAD\5BUSY\10DELWRI" \
     "\12DONE\13COWDONE\15GATHERED\16INVAL\17LOCKED\20NOCACHE" \
-    "\23PHYS\24RAW\25READ\32DEVPRIVATE\33VFLUSH"
+    "\23PHYS\24RAW\25READ\32DEVPRIVATE\33VFLUSH\34FUA\35DPO"
 
 /* Avoid weird code due to B_WRITE being a "pseudo flag" */
 #define BUF_ISREAD(bp) (((bp)->b_flags & B_READ) == B_READ)
Index: sys/dkio.h
===================================================================
RCS file: /cvsroot/src/sys/sys/dkio.h,v
retrieving revision 1.22
diff -u -p -r1.22 dkio.h
--- sys/dkio.h  8 Dec 2015 20:36:15 -0000       1.22
+++ sys/dkio.h  5 Mar 2017 22:08:35 -0000
@@ -85,6 +85,8 @@
 #define        DKCACHE_RCHANGE 0x000100 /* read enable is changeable */
 #define        DKCACHE_WCHANGE 0x000200 /* write enable is changeable */
 #define        DKCACHE_SAVE    0x010000 /* cache parameters are savable/save them */
+#define        DKCACHE_FUA     0x020000 /* Force Unit Access supported */
+#define        DKCACHE_DPO     0x040000 /* Disable Page Out supported */
 
                /* sync disk cache */
 #define        DIOCCACHESYNC   _IOW('d', 118, int)     /* sync cache (force?) */
Index: kern/vfs_wapbl.c
===================================================================
RCS file: /cvsroot/src/sys/kern/vfs_wapbl.c,v
retrieving revision 1.87
diff -u -p -r1.87 vfs_wapbl.c
--- kern/vfs_wapbl.c    5 Mar 2017 13:57:29 -0000       1.87
+++ kern/vfs_wapbl.c    5 Mar 2017 22:08:35 -0000
@@ -70,6 +70,7 @@ __KERNEL_RCSID(0, "$NetBSD: vfs_wapbl.c,
 static struct sysctllog *wapbl_sysctl;
 static int wapbl_flush_disk_cache = 1;
 static int wapbl_verbose_commit = 0;
+static int wapbl_use_fua = 1;
 
 static inline size_t wapbl_space_free(size_t, off_t, off_t);
 
@@ -229,6 +230,12 @@ struct wapbl {
        u_char *wl_buffer;      /* l:   buffer for wapbl_buffered_write() */
        daddr_t wl_buffer_dblk; /* l:   buffer disk block address */
        size_t wl_buffer_used;  /* l:   buffer current use */
+
+       int wl_dkcache;         /* r:   disk cache flags */
+#define WAPBL_USE_FUA(wl)      /* wapbl_use_fua set and device has FUA */ \
+               (wapbl_use_fua && ISSET((wl)->wl_dkcache, DKCACHE_FUA))
+       int wl_jwrite_flags;    /* r:   journal write flags */
+       int wl_mwrite_flags;    /* r:   metadata write flags */
 };
 
 #ifdef WAPBL_DEBUG_PRINT
@@ -280,6 +287,8 @@ static void wapbl_deallocation_free(stru
 static void wapbl_evcnt_init(struct wapbl *);
 static void wapbl_evcnt_free(struct wapbl *);
 
+static void wapbl_dkcache_init(struct wapbl *);
+
 #if 0
 int wapbl_replay_verify(struct wapbl_replay *, struct vnode *);
 #endif
@@ -390,6 +399,30 @@ wapbl_evcnt_free(struct wapbl *wl)
        evcnt_detach(&wl->wl_ev_cacheflush);
 }
 
+static void
+wapbl_dkcache_init(struct wapbl *wl)
+{
+       int error;
+
+       /* Query wl_devvp for its DKCACHE_* flags via DIOCGCACHE */
+       error = VOP_IOCTL(wl->wl_devvp, DIOCGCACHE, &wl->wl_dkcache,
+           FWRITE, FSCRED);
+       if (error) {
+               /* query failed: assume a write cache, and no FUA/DPO support */
+               wl->wl_dkcache = DKCACHE_WRITE;
+       }
+
+       /* FUA enabled and supported: tag journal and metadata writes */
+       if (WAPBL_USE_FUA(wl)) {
+               wl->wl_jwrite_flags |= B_FUA;
+               wl->wl_mwrite_flags |= B_FUA;
+       }
+
+       /* DPO is applied to journal writes only; it is not gated on FUA */
+       if (ISSET(wl->wl_dkcache, DKCACHE_DPO))
+               wl->wl_jwrite_flags |= B_DPO;
+}
+
 static int
 wapbl_start_flush_inodes(struct wapbl *wl, struct wapbl_replay *wr)
 {
@@ -562,6 +595,8 @@ wapbl_start(struct wapbl ** wlp, struct 
 
        wapbl_evcnt_init(wl);
 
+       wapbl_dkcache_init(wl);
+
        /* Initialize the commit header */
        {
                struct wapbl_wc_header *wc;
@@ -808,7 +843,6 @@ wapbl_doio(void *data, size_t len, struc
        struct buf *bp;
        int error;
 
-       KASSERT((flags & ~(B_WRITE | B_READ)) == 0);
        KASSERT(devvp->v_type == VBLK);
 
        if ((flags & (B_WRITE | B_READ)) == B_WRITE) {
@@ -822,7 +856,7 @@ wapbl_doio(void *data, size_t len, struc
 
        bp = getiobuf(devvp, true);
        bp->b_flags = flags;
-       bp->b_cflags = BC_BUSY; /* silly & dubious */
+       bp->b_cflags = BC_BUSY; /* mandatory, asserted by biowait() */
        bp->b_dev = devvp->v_rdev;
        bp->b_data = data;
        bp->b_bufsize = bp->b_resid = bp->b_bcount = len;
@@ -897,7 +931,8 @@ wapbl_buffered_flush(struct wapbl *wl)
                return 0;
 
        error = wapbl_doio(wl->wl_buffer, wl->wl_buffer_used,
-           wl->wl_devvp, wl->wl_buffer_dblk, B_WRITE);
+           wl->wl_devvp, wl->wl_buffer_dblk,
+           B_WRITE | wl->wl_jwrite_flags);
        wl->wl_buffer_used = 0;
 
        wl->wl_ev_journalwrite.ev_count++;
@@ -947,12 +982,10 @@ wapbl_buffered_write(void *data, size_t 
        if (len >= resid) {
                memcpy(wl->wl_buffer + wl->wl_buffer_used, data, resid);
                wl->wl_buffer_used += resid;
-               error = wapbl_doio(wl->wl_buffer, wl->wl_buffer_used,
-                   wl->wl_devvp, wl->wl_buffer_dblk, B_WRITE);
+               error = wapbl_buffered_flush(wl);
                data = (uint8_t *)data + resid;
                len -= resid;
                wl->wl_buffer_dblk = pbn + btodb(resid);
-               wl->wl_buffer_used = 0;
                if (error)
                        return error;
        }
@@ -1498,6 +1531,9 @@ wapbl_biodone(struct buf *bp)
                mutex_exit(&wl->wl_mtx);
        }
 
+       /* XXX unset FUA again here? */
+       /* bp->b_flags &= ~wl->wl_mwrite_flags; */
+
        /*
         * Release the buffer here. wapbl_flush() may wait for the
         * log to become empty and we better unbusy the buffer before
@@ -1753,6 +1789,10 @@ wapbl_flush(struct wapbl *wl, int waitfo
                }
                bp->b_iodone = wapbl_biodone;
                bp->b_private = we;
+
+               /* make sure the block is saved sync when FUA in use */
+               bp->b_flags |= wl->wl_mwrite_flags;
+
                bremfree(bp);
                wapbl_remove_buf_locked(wl, bp);
                mutex_exit(&wl->wl_mtx);
@@ -2200,7 +2240,7 @@ wapbl_cache_sync(struct wapbl *wl, const
        int force = 1;
        int error;
 
-       if (!wapbl_flush_disk_cache) {
+       if (!wapbl_flush_disk_cache || WAPBL_USE_FUA(wl)) {
                return 0;
        }
        if (verbose) {
Index: dev/ic/ld_nvme.c
===================================================================
RCS file: /cvsroot/src/sys/dev/ic/ld_nvme.c,v
retrieving revision 1.14
diff -u -p -r1.14 ld_nvme.c
--- dev/ic/ld_nvme.c    28 Feb 2017 20:55:09 -0000      1.14
+++ dev/ic/ld_nvme.c    5 Mar 2017 22:08:35 -0000
@@ -152,11 +152,15 @@ static int
 ld_nvme_start(struct ld_softc *ld, struct buf *bp)
 {
        struct ld_nvme_softc *sc = device_private(ld->sc_dv);
+       int flags = BUF_ISWRITE(bp) ? 0 : NVME_NS_CTX_F_READ;
+
+       if (bp->b_flags & B_FUA)
+               flags |= NVME_NS_CTX_F_FUA;
 
        return nvme_ns_dobio(sc->sc_nvme, sc->sc_nsid, sc,
            bp, bp->b_data, bp->b_bcount,
            sc->sc_ld.sc_secsize, bp->b_rawblkno,
-           BUF_ISWRITE(bp) ? 0 : NVME_NS_CTX_F_READ,
+           flags,
            ld_nvme_biodone);
 }
 
@@ -221,7 +225,11 @@ ld_nvme_getcache(struct ld_softc *ld, in
        int error;
        struct ld_nvme_softc *sc = device_private(ld->sc_dv);
 
-       *addr = 0;
+       /*
+        * DPO not supported, Dataset Management (DSM) field doesn't specify
+        * the same semantics.
+        */ 
+       *addr = DKCACHE_FUA;
 
        if (!nvme_has_volatile_write_cache(sc->sc_nvme)) {
                /* cache simply not present */
Index: dev/ic/nvme.c
===================================================================
RCS file: /cvsroot/src/sys/dev/ic/nvme.c,v
retrieving revision 1.25
diff -u -p -r1.25 nvme.c
--- dev/ic/nvme.c       28 Feb 2017 20:53:50 -0000      1.25
+++ dev/ic/nvme.c       5 Mar 2017 22:08:35 -0000
@@ -727,6 +727,9 @@ nvme_ns_io_fill(struct nvme_queue *q, st
 
        htolem64(&sqe->slba, ccb->nnc_blkno);
 
+       if (ISSET(ccb->nnc_flags, NVME_NS_CTX_F_FUA))
+               htolem16(&sqe->ioflags, NVM_SQE_IO_FUA);
+
        /* guaranteed by upper layers, but check just in case */
        KASSERT((ccb->nnc_datasize % ccb->nnc_secsize) == 0);
        htolem16(&sqe->nlb, (ccb->nnc_datasize / ccb->nnc_secsize) - 1);
Index: dev/ic/nvmevar.h
===================================================================
RCS file: /cvsroot/src/sys/dev/ic/nvmevar.h,v
retrieving revision 1.12
diff -u -p -r1.12 nvmevar.h
--- dev/ic/nvmevar.h    28 Feb 2017 20:53:50 -0000      1.12
+++ dev/ic/nvmevar.h    5 Mar 2017 22:08:35 -0000
@@ -64,6 +64,7 @@ struct nvme_ccb {
        uint16_t        nnc_flags;
 #define        NVME_NS_CTX_F_READ      __BIT(0)
 #define        NVME_NS_CTX_F_POLL      __BIT(1)
+#define        NVME_NS_CTX_F_FUA       __BIT(2)
 
        struct buf      *nnc_buf;
        daddr_t         nnc_blkno;
Index: dev/scsipi/scsipi_disk.h
===================================================================
RCS file: /cvsroot/src/sys/dev/scsipi/scsipi_disk.h,v
retrieving revision 1.21
diff -u -p -r1.21 scsipi_disk.h
--- dev/scsipi/scsipi_disk.h    25 Dec 2007 18:33:42 -0000      1.21
+++ dev/scsipi/scsipi_disk.h    5 Mar 2017 22:08:35 -0000
@@ -62,9 +62,10 @@ struct scsipi_rw_10 {
        u_int8_t opcode;
        u_int8_t byte2;
 #define        SRWB_RELADDR    0x01    /* obsolete */
-#define        SRWB_FUA_NV     0x02    /* force unit access non-volatile cache */
-#define        SRWB_FUA        0x08    /* force unit access */
-#define        SRWB_DPO        0x10    /* disable page out */
+#define        SRWB_FUA_NV     0x02    /* force unit access non-volatile cache (SCSI-3) */
+#define        SRWB_RESV2      0x04    /* reserved (SCSI-2) */
+#define        SRWB_FUA        0x08    /* force unit access volatile cache (SCSI-2) */
+#define        SRWB_DPO        0x10    /* disable page out (SCSI-2) */
 #define        SRWB_PROTECT(x) ((x) << 5)
        u_int8_t addr[4];
        u_int8_t reserved;
@@ -159,4 +160,7 @@ struct scsipi_capacity_descriptor {
 #define        SCSIPI_CAP_DESC_CODE_FORMATTED          0x2
 #define        SCSIPI_CAP_DESC_CODE_NONE               0x3
 
+/* defines for the device specific byte in the mode select/sense header */
+#define        SMH_DSP_DPOFUA          0x10
+
 #endif /* _DEV_SCSIPI_SCSIPI_DISK_H_ */
Index: dev/scsipi/sd.c
===================================================================
RCS file: /cvsroot/src/sys/dev/scsipi/sd.c,v
retrieving revision 1.322
diff -u -p -r1.322 sd.c
--- dev/scsipi/sd.c     21 Dec 2016 21:28:30 -0000      1.322
+++ dev/scsipi/sd.c     5 Mar 2017 22:08:35 -0000
@@ -654,6 +654,7 @@ sd_diskstart(device_t dev, struct buf *b
        struct scsipi_generic *cmdp;
        struct scsipi_xfer *xs;
        int error, flags, nblks, cmdlen;
+       int cdb_flags;
 
        mutex_enter(chan_mtx(chan));
 
@@ -698,12 +699,27 @@ sd_diskstart(device_t dev, struct buf *b
                nblks = howmany(bp->b_bcount, sd->params.blksize);
 
        /*
+        * Pass FUA and/or DPO if requested. Must be done before CDB
+        * selection, as 6-byte CDB doesn't support the flags.
+        */
+       cdb_flags = 0;
+
+       if (bp->b_flags & B_FUA)
+               cdb_flags |= SRWB_FUA;
+
+       if (bp->b_flags & B_DPO)
+               cdb_flags |= SRWB_DPO;
+
+       /*
         * Fill out the scsi command.  Use the smallest CDB possible
-        * (6-byte, 10-byte, or 16-byte).
+        * (6-byte, 10-byte, or 16-byte). If we need FUA or DPO,
+        * need to use 10-byte or bigger, as the 6-byte doesn't support
+        * the flags.
         */
        if (((bp->b_rawblkno & 0x1fffff) == bp->b_rawblkno) &&
            ((nblks & 0xff) == nblks) &&
-           !(periph->periph_quirks & PQUIRK_ONLYBIG)) {
+           !(periph->periph_quirks & PQUIRK_ONLYBIG) &&
+           !cdb_flags) {
                /* 6-byte CDB */
                memset(&cmd_small, 0, sizeof(cmd_small));
                cmd_small.opcode = (bp->b_flags & B_READ) ?
@@ -732,6 +748,9 @@ sd_diskstart(device_t dev, struct buf *b
                cmdp = (struct scsipi_generic *)&cmd16;
        }
 
+       if (cdb_flags)
+               cmdp->bytes[0] = cdb_flags;
+
        /*
         * Figure out what flags to use.
         */
@@ -1796,7 +1815,9 @@ sd_getcache(struct sd_softc *sd, int *bi
        int error, bits = 0;
        int big;
        union scsi_disk_pages *pages;
+       uint8_t dev_spec;
 
+       /* only SCSI-2 and later supported */
        if (periph->periph_version < 2)
                return (EOPNOTSUPP);
 
@@ -1806,10 +1827,13 @@ sd_getcache(struct sd_softc *sd, int *bi
        if (error)
                return (error);
 
-       if (big)
+       if (big) {
                pages = (void *)(&scsipi_sense.header.big + 1);
-       else
+               dev_spec = scsipi_sense.header.big.dev_spec;
+       } else {
                pages = (void *)(&scsipi_sense.header.small + 1);
+               dev_spec = scsipi_sense.header.small.dev_spec;
+       }
 
        if ((pages->caching_params.flags & CACHING_RCD) == 0)
                bits |= DKCACHE_READ;
@@ -1818,6 +1842,13 @@ sd_getcache(struct sd_softc *sd, int *bi
        if (pages->caching_params.pg_code & PGCODE_PS)
                bits |= DKCACHE_SAVE;
 
+       /*
+        * Support for FUA/DPO, defined starting with SCSI-2. Use only
+        * if device claims to support it, according to the MODE SENSE.
+        */
+       if (ISSET(dev_spec, SMH_DSP_DPOFUA))
+               bits |= DKCACHE_FUA | DKCACHE_DPO;
+
        memset(&scsipi_sense, 0, sizeof(scsipi_sense));
        error = sd_mode_sense(sd, SMS_DBD, &scsipi_sense,
            sizeof(scsipi_sense.pages.caching_params),

Reply via email to