Attached is final version of the patch. It uses MEDIA prefix for the flags, but keeps the FUA/DPO - i.e names are B_MEDIA_FUA, B_MEDIA_DPO. For wapbl it introduces a sysctl to use the feature, default is off for now.
I plan to commit this later in the week or early next week, unless there are some serious objections. Jaromir 2017-03-05 23:22 GMT+01:00 Jaromír Doleček <jaromir.dole...@gmail.com>: > Here is an updated patch. It was updated to check for the FUA support > for SCSI, using the MODE SENSE device-specific flag. Code was tested > with QEMU emulated bha(4) and nvme. WAPBL code was updated to use the > flag. It keeps the flag naming for now. > > In the patch, WAPBL sets the flag for journal writes, and also for the > metadata buffer for bawrite() call after journal commit. > > There is possible layer violation for metadata write - b_flags are > supposed to be set by owner of the buffer. Not sure how strict we > want/need to be there - perhaps introduce another flag field? Also the > flag > probably needs to be unset in biodone hook, so that the code > guarantees the buffer in buffer cache doesn't accidentaly keep it over > to another I/O. > > Jaromir
? dev/ic/TODO.nvme Index: sys/buf.h =================================================================== RCS file: /cvsroot/src/sys/sys/buf.h,v retrieving revision 1.126 diff -u -p -r1.126 buf.h --- sys/buf.h 26 Dec 2016 23:12:33 -0000 1.126 +++ sys/buf.h 27 Mar 2017 22:31:22 -0000 @@ -198,16 +198,21 @@ struct buf { #define B_RAW 0x00080000 /* Set by physio for raw transfers. */ #define B_READ 0x00100000 /* Read buffer. */ #define B_DEVPRIVATE 0x02000000 /* Device driver private flag. */ +#define B_MEDIA_FUA 0x08000000 /* Set Force Unit Access for media. */ +#define B_MEDIA_DPO 0x10000000 /* Set Disable Page Out for media. */ #define BUF_FLAGBITS \ "\20\1AGE\3ASYNC\4BAD\5BUSY\10DELWRI" \ "\12DONE\13COWDONE\15GATHERED\16INVAL\17LOCKED\20NOCACHE" \ - "\23PHYS\24RAW\25READ\32DEVPRIVATE\33VFLUSH" + "\23PHYS\24RAW\25READ\32DEVPRIVATE\33VFLUSH\34MEDIAFUA\35MEDIADPO" /* Avoid weird code due to B_WRITE being a "pseudo flag" */ #define BUF_ISREAD(bp) (((bp)->b_flags & B_READ) == B_READ) #define BUF_ISWRITE(bp) (((bp)->b_flags & B_READ) == B_WRITE) +/* Media flags, to be passed for nested I/O */ +#define B_MEDIA_FLAGS (B_MEDIA_FUA|B_MEDIA_DPO) + /* * This structure describes a clustered I/O. It is stored in the b_saveaddr * field of the buffer on which I/O is done. At I/O completion, cluster Index: sys/dkio.h =================================================================== RCS file: /cvsroot/src/sys/sys/dkio.h,v retrieving revision 1.22 diff -u -p -r1.22 dkio.h --- sys/dkio.h 8 Dec 2015 20:36:15 -0000 1.22 +++ sys/dkio.h 27 Mar 2017 22:31:22 -0000 @@ -85,6 +85,8 @@ #define DKCACHE_RCHANGE 0x000100 /* read enable is changeable */ #define DKCACHE_WCHANGE 0x000200 /* write enable is changeable */ #define DKCACHE_SAVE 0x010000 /* cache parameters are savable/save them */ +#define DKCACHE_FUA 0x020000 /* Force Unit Access supported */ +#define DKCACHE_DPO 0x040000 /* Disable Page Out supported */ /* sync disk cache */ #define DIOCCACHESYNC _IOW('d', 118, int) /* sync cache (force?) */ Index: kern/vfs_bio.c =================================================================== RCS file: /cvsroot/src/sys/kern/vfs_bio.c,v retrieving revision 1.271 diff -u -p -r1.271 vfs_bio.c --- kern/vfs_bio.c 21 Mar 2017 10:46:49 -0000 1.271 +++ kern/vfs_bio.c 27 Mar 2017 22:31:22 -0000 @@ -2027,7 +2027,7 @@ nestiobuf_iodone(buf_t *bp) void nestiobuf_setup(buf_t *mbp, buf_t *bp, int offset, size_t size) { - const int b_read = mbp->b_flags & B_READ; + const int b_pass = mbp->b_flags & (B_READ|B_MEDIA_FLAGS); struct vnode *vp = mbp->b_vp; KASSERT(mbp->b_bcount >= offset + size); @@ -2035,14 +2035,14 @@ nestiobuf_setup(buf_t *mbp, buf_t *bp, i bp->b_dev = mbp->b_dev; bp->b_objlock = mbp->b_objlock; bp->b_cflags = BC_BUSY; - bp->b_flags = B_ASYNC | b_read; + bp->b_flags = B_ASYNC | b_pass; bp->b_iodone = nestiobuf_iodone; bp->b_data = (char *)mbp->b_data + offset; bp->b_resid = bp->b_bcount = size; bp->b_bufsize = bp->b_bcount; bp->b_private = mbp; BIO_COPYPRIO(bp, mbp); - if (!b_read && vp != NULL) { + if (BUF_ISWRITE(bp) && vp != NULL) { mutex_enter(vp->v_interlock); vp->v_numoutput++; mutex_exit(vp->v_interlock); Index: kern/vfs_wapbl.c =================================================================== RCS file: /cvsroot/src/sys/kern/vfs_wapbl.c,v retrieving revision 1.92 diff -u -p -r1.92 vfs_wapbl.c --- kern/vfs_wapbl.c 17 Mar 2017 03:19:46 -0000 1.92 +++ kern/vfs_wapbl.c 27 Mar 2017 22:31:23 -0000 @@ -71,6 +71,7 @@ __KERNEL_RCSID(0, "$NetBSD: vfs_wapbl.c, static struct sysctllog *wapbl_sysctl; static int wapbl_flush_disk_cache = 1; static int wapbl_verbose_commit = 0; +static int wapbl_use_fua = 0; /* switched off by default for now */ static inline size_t wapbl_space_free(size_t, off_t, off_t); @@ -230,6 +231,16 @@ struct wapbl { u_char *wl_buffer; /* l: buffer for wapbl_buffered_write() */ daddr_t wl_buffer_dblk; /* l: buffer disk block address */ size_t wl_buffer_used; /* l: buffer current use */ + + int wl_dkcache; /* r: disk cache flags */ +#define WAPBL_USE_FUA(wl) \ + (wapbl_use_fua && ISSET((wl)->wl_dkcache, DKCACHE_FUA)) +#define WAPBL_JFLAGS(wl) \ + (WAPBL_USE_FUA(wl) ? (wl)->wl_jwrite_flags : 0) +#define WAPBL_MFLAGS(wl) \ + (WAPBL_USE_FUA(wl) ? (wl)->wl_mwrite_flags : 0) + int wl_jwrite_flags; /* r: journal write flags */ + int wl_mwrite_flags; /* r: metadata write flags */ }; #ifdef WAPBL_DEBUG_PRINT @@ -281,6 +292,8 @@ static void wapbl_deallocation_free(stru static void wapbl_evcnt_init(struct wapbl *); static void wapbl_evcnt_free(struct wapbl *); +static void wapbl_dkcache_init(struct wapbl *); + #if 0 int wapbl_replay_verify(struct wapbl_replay *, struct vnode *); #endif @@ -335,6 +348,18 @@ wapbl_sysctl_init(void) SYSCTL_DESCR("show time and size of wapbl log commits"), NULL, 0, &wapbl_verbose_commit, 0, CTL_CREATE, CTL_EOL); + if (rv) + return rv; + + rv = sysctl_createv(&wapbl_sysctl, 0, &rnode, &cnode, + CTLFLAG_PERMANENT|CTLFLAG_READWRITE, + CTLTYPE_INT, "use_fua", + SYSCTL_DESCR("use FUA/DPO instead of cash flush if available"), + NULL, 0, &wapbl_use_fua, 0, + CTL_CREATE, CTL_EOL); + if (rv) + return rv; + return rv; } @@ -391,6 +416,30 @@ wapbl_evcnt_free(struct wapbl *wl) evcnt_detach(&wl->wl_ev_cacheflush); } +static void +wapbl_dkcache_init(struct wapbl *wl) +{ + int error; + + /* Get disk cache flags */ + error = VOP_IOCTL(wl->wl_devvp, DIOCGCACHE, &wl->wl_dkcache, + FWRITE, FSCRED); + if (error) { + /* behave as if there was a write cache */ + wl->wl_dkcache = DKCACHE_WRITE; + } + + /* Use FUA instead of cache flush if available */ + if (ISSET(wl->wl_dkcache, DKCACHE_FUA)) { + wl->wl_jwrite_flags |= B_MEDIA_FUA; + wl->wl_mwrite_flags |= B_MEDIA_FUA; + } + + /* Use DPO for journal writes if available */ + if (ISSET(wl->wl_dkcache, DKCACHE_DPO)) + wl->wl_jwrite_flags |= B_MEDIA_DPO; +} + static int wapbl_start_flush_inodes(struct wapbl *wl, struct wapbl_replay *wr) { @@ -563,6 +612,8 @@ wapbl_start(struct wapbl ** wlp, struct wapbl_evcnt_init(wl); + wapbl_dkcache_init(wl); + /* Initialize the commit header */ { struct wapbl_wc_header *wc; @@ -809,7 +860,6 @@ wapbl_doio(void *data, size_t len, struc struct buf *bp; int error; - KASSERT((flags & ~(B_WRITE | B_READ)) == 0); KASSERT(devvp->v_type == VBLK); if ((flags & (B_WRITE | B_READ)) == B_WRITE) { @@ -823,7 +873,7 @@ wapbl_doio(void *data, size_t len, struc bp = getiobuf(devvp, true); bp->b_flags = flags; - bp->b_cflags = BC_BUSY; /* silly & dubious */ + bp->b_cflags = BC_BUSY; /* mandatory, asserted by biowait() */ bp->b_dev = devvp->v_rdev; bp->b_data = data; bp->b_bufsize = bp->b_resid = bp->b_bcount = len; @@ -898,7 +948,8 @@ wapbl_buffered_flush(struct wapbl *wl) return 0; error = wapbl_doio(wl->wl_buffer, wl->wl_buffer_used, - wl->wl_devvp, wl->wl_buffer_dblk, B_WRITE); + wl->wl_devvp, wl->wl_buffer_dblk, + B_WRITE | WAPBL_JFLAGS(wl)); wl->wl_buffer_used = 0; wl->wl_ev_journalwrite.ev_count++; @@ -948,12 +999,10 @@ wapbl_buffered_write(void *data, size_t if (len >= resid) { memcpy(wl->wl_buffer + wl->wl_buffer_used, data, resid); wl->wl_buffer_used += resid; - error = wapbl_doio(wl->wl_buffer, wl->wl_buffer_used, - wl->wl_devvp, wl->wl_buffer_dblk, B_WRITE); + error = wapbl_buffered_flush(wl); data = (uint8_t *)data + resid; len -= resid; wl->wl_buffer_dblk = pbn + btodb(resid); - wl->wl_buffer_used = 0; if (error) return error; } @@ -1500,6 +1549,13 @@ wapbl_biodone(struct buf *bp) } /* + * Make sure that the buf doesn't retain the media flags, so that + * e.g. wapbl_use_fua has immediate effect on any following I/O. + * The flags will be set again if needed by another I/O. + */ + bp->b_flags &= ~B_MEDIA_FLAGS; + + /* * Release the buffer here. wapbl_flush() may wait for the * log to become empty and we better unbusy the buffer before * wapbl_flush() returns. @@ -1754,6 +1810,10 @@ wapbl_flush(struct wapbl *wl, int waitfo } bp->b_iodone = wapbl_biodone; bp->b_private = we; + + /* make sure the block is saved sync when FUA in use */ + bp->b_flags |= WAPBL_MFLAGS(wl); + bremfree(bp); wapbl_remove_buf_locked(wl, bp); mutex_exit(&wl->wl_mtx); @@ -2201,7 +2261,8 @@ wapbl_cache_sync(struct wapbl *wl, const int force = 1; int error; - if (!wapbl_flush_disk_cache) { + /* Skip full cache sync if disabled, or when using FUA */ + if (!wapbl_flush_disk_cache || WAPBL_USE_FUA(wl)) { return 0; } if (verbose) { Index: dev/ic/ld_nvme.c =================================================================== RCS file: /cvsroot/src/sys/dev/ic/ld_nvme.c,v retrieving revision 1.14 diff -u -p -r1.14 ld_nvme.c --- dev/ic/ld_nvme.c 28 Feb 2017 20:55:09 -0000 1.14 +++ dev/ic/ld_nvme.c 27 Mar 2017 22:31:23 -0000 @@ -152,11 +152,15 @@ static int ld_nvme_start(struct ld_softc *ld, struct buf *bp) { struct ld_nvme_softc *sc = device_private(ld->sc_dv); + int flags = BUF_ISWRITE(bp) ? 0 : NVME_NS_CTX_F_READ; + + if (bp->b_flags & B_MEDIA_FUA) + flags |= NVME_NS_CTX_F_FUA; return nvme_ns_dobio(sc->sc_nvme, sc->sc_nsid, sc, bp, bp->b_data, bp->b_bcount, sc->sc_ld.sc_secsize, bp->b_rawblkno, - BUF_ISWRITE(bp) ? 0 : NVME_NS_CTX_F_READ, + flags, ld_nvme_biodone); } @@ -221,7 +225,11 @@ ld_nvme_getcache(struct ld_softc *ld, in int error; struct ld_nvme_softc *sc = device_private(ld->sc_dv); - *addr = 0; + /* + * DPO not supported, Dataset Management (DSM) field doesn't specify + * the same semantics. + */ + *addr = DKCACHE_FUA; if (!nvme_has_volatile_write_cache(sc->sc_nvme)) { /* cache simply not present */ Index: dev/ic/nvme.c =================================================================== RCS file: /cvsroot/src/sys/dev/ic/nvme.c,v retrieving revision 1.25 diff -u -p -r1.25 nvme.c --- dev/ic/nvme.c 28 Feb 2017 20:53:50 -0000 1.25 +++ dev/ic/nvme.c 27 Mar 2017 22:31:23 -0000 @@ -727,6 +727,9 @@ nvme_ns_io_fill(struct nvme_queue *q, st htolem64(&sqe->slba, ccb->nnc_blkno); + if (ISSET(ccb->nnc_flags, NVME_NS_CTX_F_FUA)) + htolem16(&sqe->ioflags, NVM_SQE_IO_FUA); + /* guaranteed by upper layers, but check just in case */ KASSERT((ccb->nnc_datasize % ccb->nnc_secsize) == 0); htolem16(&sqe->nlb, (ccb->nnc_datasize / ccb->nnc_secsize) - 1); Index: dev/ic/nvmevar.h =================================================================== RCS file: /cvsroot/src/sys/dev/ic/nvmevar.h,v retrieving revision 1.12 diff -u -p -r1.12 nvmevar.h --- dev/ic/nvmevar.h 28 Feb 2017 20:53:50 -0000 1.12 +++ dev/ic/nvmevar.h 27 Mar 2017 22:31:23 -0000 @@ -64,6 +64,7 @@ struct nvme_ccb { uint16_t nnc_flags; #define NVME_NS_CTX_F_READ __BIT(0) #define NVME_NS_CTX_F_POLL __BIT(1) +#define NVME_NS_CTX_F_FUA __BIT(2) struct buf *nnc_buf; daddr_t nnc_blkno; Index: dev/scsipi/scsipi_disk.h =================================================================== RCS file: /cvsroot/src/sys/dev/scsipi/scsipi_disk.h,v retrieving revision 1.21 diff -u -p -r1.21 scsipi_disk.h --- dev/scsipi/scsipi_disk.h 25 Dec 2007 18:33:42 -0000 1.21 +++ dev/scsipi/scsipi_disk.h 27 Mar 2017 22:31:23 -0000 @@ -62,9 +62,10 @@ struct scsipi_rw_10 { u_int8_t opcode; u_int8_t byte2; #define SRWB_RELADDR 0x01 /* obsolete */ -#define SRWB_FUA_NV 0x02 /* force unit access non-volatile cache */ -#define SRWB_FUA 0x08 /* force unit access */ -#define SRWB_DPO 0x10 /* disable page out */ +#define SRWB_FUA_NV 0x02 /* force unit access non-volatile cache (SCSI-3) */ +#define SRWB_RESV2 0x04 /* reserved (SCSI-2) */ +#define SRWB_FUA 0x08 /* force unit access volatile cache (SCSI-2) */ +#define SRWB_DPO 0x10 /* disable page out (SCSI-2) */ #define SRWB_PROTECT(x) ((x) << 5) u_int8_t addr[4]; u_int8_t reserved; @@ -159,4 +160,7 @@ struct scsipi_capacity_descriptor { #define SCSIPI_CAP_DESC_CODE_FORMATTED 0x2 #define SCSIPI_CAP_DESC_CODE_NONE 0x3 +/* defines for the device specific byte in the mode select/sense header */ +#define SMH_DSP_DPOFUA 0x10 + #endif /* _DEV_SCSIPI_SCSIPI_DISK_H_ */ Index: dev/scsipi/sd.c =================================================================== RCS file: /cvsroot/src/sys/dev/scsipi/sd.c,v retrieving revision 1.322 diff -u -p -r1.322 sd.c --- dev/scsipi/sd.c 21 Dec 2016 21:28:30 -0000 1.322 +++ dev/scsipi/sd.c 27 Mar 2017 22:31:23 -0000 @@ -654,6 +654,7 @@ sd_diskstart(device_t dev, struct buf *b struct scsipi_generic *cmdp; struct scsipi_xfer *xs; int error, flags, nblks, cmdlen; + int cdb_flags; mutex_enter(chan_mtx(chan)); @@ -698,12 +699,27 @@ sd_diskstart(device_t dev, struct buf *b nblks = howmany(bp->b_bcount, sd->params.blksize); /* + * Pass FUA and/or DPO if requested. Must be done before CDB + * selection, as 6-byte CDB doesn't support the flags. + */ + cdb_flags = 0; + + if (bp->b_flags & B_MEDIA_FUA) + cdb_flags |= SRWB_FUA; + + if (bp->b_flags & B_MEDIA_DPO) + cdb_flags |= SRWB_DPO; + + /* * Fill out the scsi command. Use the smallest CDB possible - * (6-byte, 10-byte, or 16-byte). + * (6-byte, 10-byte, or 16-byte). If we need FUA or DPO, + * need to use 10-byte or bigger, as the 6-byte doesn't support + * the flags. */ if (((bp->b_rawblkno & 0x1fffff) == bp->b_rawblkno) && ((nblks & 0xff) == nblks) && - !(periph->periph_quirks & PQUIRK_ONLYBIG)) { + !(periph->periph_quirks & PQUIRK_ONLYBIG) && + !cdb_flags) { /* 6-byte CDB */ memset(&cmd_small, 0, sizeof(cmd_small)); cmd_small.opcode = (bp->b_flags & B_READ) ? @@ -732,6 +748,9 @@ sd_diskstart(device_t dev, struct buf *b cmdp = (struct scsipi_generic *)&cmd16; } + if (cdb_flags) + cmdp->bytes[0] = cdb_flags; + /* * Figure out what flags to use. */ @@ -1796,7 +1815,9 @@ sd_getcache(struct sd_softc *sd, int *bi int error, bits = 0; int big; union scsi_disk_pages *pages; + uint8_t dev_spec; + /* only SCSI-2 and later supported */ if (periph->periph_version < 2) return (EOPNOTSUPP); @@ -1806,10 +1827,13 @@ sd_getcache(struct sd_softc *sd, int *bi if (error) return (error); - if (big) + if (big) { pages = (void *)(&scsipi_sense.header.big + 1); - else + dev_spec = scsipi_sense.header.big.dev_spec; + } else { pages = (void *)(&scsipi_sense.header.small + 1); + dev_spec = scsipi_sense.header.small.dev_spec; + } if ((pages->caching_params.flags & CACHING_RCD) == 0) bits |= DKCACHE_READ; @@ -1818,6 +1842,13 @@ sd_getcache(struct sd_softc *sd, int *bi if (pages->caching_params.pg_code & PGCODE_PS) bits |= DKCACHE_SAVE; + /* + * Support for FUA/DPO, defined starting with SCSI-2. Use only + * if device claims to support it, according to the MODE SENSE. + */ + if (ISSET(dev_spec, SMH_DSP_DPOFUA)) + bits |= DKCACHE_FUA | DKCACHE_DPO; + memset(&scsipi_sense, 0, sizeof(scsipi_sense)); error = sd_mode_sense(sd, SMS_DBD, &scsipi_sense, sizeof(scsipi_sense.pages.caching_params),