Module Name: src
Committed By: jdolecek
Date: Mon Apr 10 21:34:37 UTC 2017
Modified Files:
src/sys/kern: vfs_wapbl.c
Log Message:
Improve performance of journal writes by parallelizing the I/O: use 4 bufs
by default, and add the sysctl vfs.wapbl.journal_iobufs to control the count.
This also removes the need to allocate an iobuf during commit, so it
might help to avoid deadlocks during memory shortages such as PR kern/47030.
To generate a diff of this commit:
cvs rdiff -u -r1.94 -r1.95 src/sys/kern/vfs_wapbl.c
Please note that diffs are not public domain; they are subject to the
copyright notices on the relevant files.
Modified files:
Index: src/sys/kern/vfs_wapbl.c
diff -u src/sys/kern/vfs_wapbl.c:1.94 src/sys/kern/vfs_wapbl.c:1.95
--- src/sys/kern/vfs_wapbl.c:1.94 Mon Apr 10 19:52:38 2017
+++ src/sys/kern/vfs_wapbl.c Mon Apr 10 21:34:37 2017
@@ -1,4 +1,4 @@
-/* $NetBSD: vfs_wapbl.c,v 1.94 2017/04/10 19:52:38 jdolecek Exp $ */
+/* $NetBSD: vfs_wapbl.c,v 1.95 2017/04/10 21:34:37 jdolecek Exp $ */
/*-
* Copyright (c) 2003, 2008, 2009 The NetBSD Foundation, Inc.
@@ -36,7 +36,7 @@
#define WAPBL_INTERNAL
#include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: vfs_wapbl.c,v 1.94 2017/04/10 19:52:38 jdolecek Exp $");
+__KERNEL_RCSID(0, "$NetBSD: vfs_wapbl.c,v 1.95 2017/04/10 21:34:37 jdolecek Exp $");
#include <sys/param.h>
#include <sys/bitops.h>
@@ -72,6 +72,7 @@ static struct sysctllog *wapbl_sysctl;
static int wapbl_flush_disk_cache = 1;
static int wapbl_verbose_commit = 0;
static int wapbl_allow_fuadpo = 0; /* switched off by default for now */
+static int wapbl_journal_iobufs = 4;
static inline size_t wapbl_space_free(size_t, off_t, off_t);
@@ -191,6 +192,8 @@ struct wapbl {
char wl_ev_group[EVCNT_STRING_MAX]; /* r */
struct evcnt wl_ev_commit; /* l */
struct evcnt wl_ev_journalwrite; /* l */
+ struct evcnt wl_ev_jbufs_bio_nowait; /* l */
+ struct evcnt wl_ev_jbufs_bio_wait; /* l */
struct evcnt wl_ev_metawrite; /* lm */
struct evcnt wl_ev_cacheflush; /* l */
#endif
@@ -228,9 +231,9 @@ struct wapbl {
SIMPLEQ_HEAD(, wapbl_entry) wl_entries; /* On disk transaction
accounting */
- u_char *wl_buffer; /* l: buffer for wapbl_buffered_write() */
- daddr_t wl_buffer_dblk; /* l: buffer disk block address */
- size_t wl_buffer_used; /* l: buffer current use */
+ /* buffers for wapbl_buffered_write() */
+ TAILQ_HEAD(, buf) wl_iobufs; /* l: Free or filling bufs */
+ TAILQ_HEAD(, buf) wl_iobufs_busy; /* l: In-transit bufs */
int wl_dkcache; /* r: disk cache flags */
#define WAPBL_USE_FUA(wl) \
@@ -360,6 +363,15 @@ wapbl_sysctl_init(void)
if (rv)
return rv;
+ rv = sysctl_createv(&wapbl_sysctl, 0, &rnode, &cnode,
+ CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
+ CTLTYPE_INT, "journal_iobufs",
+ SYSCTL_DESCR("count of bufs used for journal I/O (max async count)"),
+ NULL, 0, &wapbl_journal_iobufs, 0,
+ CTL_CREATE, CTL_EOL);
+ if (rv)
+ return rv;
+
return rv;
}
@@ -401,6 +413,10 @@ wapbl_evcnt_init(struct wapbl *wl)
NULL, wl->wl_ev_group, "commit");
evcnt_attach_dynamic(&wl->wl_ev_journalwrite, EVCNT_TYPE_MISC,
NULL, wl->wl_ev_group, "journal sync block write");
+ evcnt_attach_dynamic(&wl->wl_ev_jbufs_bio_nowait, EVCNT_TYPE_MISC,
+ NULL, wl->wl_ev_group, "journal I/O bufs no wait");
+ evcnt_attach_dynamic(&wl->wl_ev_jbufs_bio_wait, EVCNT_TYPE_MISC,
+ NULL, wl->wl_ev_group, "journal I/O bufs biowait");
evcnt_attach_dynamic(&wl->wl_ev_metawrite, EVCNT_TYPE_MISC,
NULL, wl->wl_ev_group, "metadata finished block write");
evcnt_attach_dynamic(&wl->wl_ev_cacheflush, EVCNT_TYPE_MISC,
@@ -412,6 +428,8 @@ wapbl_evcnt_free(struct wapbl *wl)
{
evcnt_detach(&wl->wl_ev_commit);
evcnt_detach(&wl->wl_ev_journalwrite);
+ evcnt_detach(&wl->wl_ev_jbufs_bio_nowait);
+ evcnt_detach(&wl->wl_ev_jbufs_bio_wait);
evcnt_detach(&wl->wl_ev_metawrite);
evcnt_detach(&wl->wl_ev_cacheflush);
}
@@ -605,9 +623,6 @@ wapbl_start(struct wapbl ** wlp, struct
wl->wl_dealloclim = wl->wl_bufbytes_max / mp->mnt_stat.f_bsize / 2;
TAILQ_INIT(&wl->wl_dealloclist);
- wl->wl_buffer = wapbl_alloc(MAXPHYS);
- wl->wl_buffer_used = 0;
-
wapbl_inodetrk_init(wl, WAPBL_INODETRK_SIZE);
wapbl_evcnt_init(wl);
@@ -630,6 +645,25 @@ wapbl_start(struct wapbl ** wlp, struct
wl->wl_wc_scratch = wapbl_alloc(len);
}
+ TAILQ_INIT(&wl->wl_iobufs);
+ TAILQ_INIT(&wl->wl_iobufs_busy);
+ for (int i = 0; i < wapbl_journal_iobufs; i++) {
+ struct buf *bp;
+
+ if ((bp = geteblk(MAXPHYS)) == NULL)
+ goto errout;
+
+ mutex_enter(&bufcache_lock);
+ mutex_enter(devvp->v_interlock);
+ bgetvp(devvp, bp);
+ mutex_exit(devvp->v_interlock);
+ mutex_exit(&bufcache_lock);
+
+ bp->b_dev = devvp->v_rdev;
+
+ TAILQ_INSERT_TAIL(&wl->wl_iobufs, bp, b_wapbllist);
+ }
+
/*
* if there was an existing set of unlinked but
* allocated inodes, preserve it in the new
@@ -656,7 +690,13 @@ wapbl_start(struct wapbl ** wlp, struct
wapbl_discard(wl);
wapbl_free(wl->wl_wc_scratch, wl->wl_wc_header->wc_len);
wapbl_free(wl->wl_wc_header, wl->wl_wc_header->wc_len);
- wapbl_free(wl->wl_buffer, MAXPHYS);
+ while (!TAILQ_EMPTY(&wl->wl_iobufs)) {
+ struct buf *bp;
+
+ bp = TAILQ_FIRST(&wl->wl_iobufs);
+ TAILQ_REMOVE(&wl->wl_iobufs, bp, b_wapbllist);
+ brelse(bp, BC_INVAL);
+ }
wapbl_inodetrk_free(wl);
wapbl_free(wl, sizeof(*wl));
@@ -832,10 +872,17 @@ wapbl_stop(struct wapbl *wl, int force)
KASSERT(wl->wl_inohashcnt == 0);
KASSERT(TAILQ_EMPTY(&wl->wl_dealloclist));
KASSERT(wl->wl_dealloccnt == 0);
+ KASSERT(TAILQ_EMPTY(&wl->wl_iobufs_busy));
wapbl_free(wl->wl_wc_scratch, wl->wl_wc_header->wc_len);
wapbl_free(wl->wl_wc_header, wl->wl_wc_header->wc_len);
- wapbl_free(wl->wl_buffer, MAXPHYS);
+ while (!TAILQ_EMPTY(&wl->wl_iobufs)) {
+ struct buf *bp;
+
+ bp = TAILQ_FIRST(&wl->wl_iobufs);
+ TAILQ_REMOVE(&wl->wl_iobufs, bp, b_wapbllist);
+ brelse(bp, BC_INVAL);
+ }
wapbl_inodetrk_free(wl);
wapbl_evcnt_free(wl);
@@ -853,14 +900,10 @@ wapbl_stop(struct wapbl *wl, int force)
* Unbuffered disk I/O
*/
-static int
-wapbl_doio(void *data, size_t len, struct vnode *devvp, daddr_t pbn, int flags)
+static void
+wapbl_doio_accounting(struct vnode *devvp, int flags)
{
struct pstats *pstats = curlwp->l_proc->p_stats;
- struct buf *bp;
- int error;
-
- KASSERT(devvp->v_type == VBLK);
if ((flags & (B_WRITE | B_READ)) == B_WRITE) {
mutex_enter(devvp->v_interlock);
@@ -871,6 +914,18 @@ wapbl_doio(void *data, size_t len, struc
pstats->p_ru.ru_inblock++;
}
+}
+
+static int
+wapbl_doio(void *data, size_t len, struct vnode *devvp, daddr_t pbn, int flags)
+{
+ struct buf *bp;
+ int error;
+
+ KASSERT(devvp->v_type == VBLK);
+
+ wapbl_doio_accounting(devvp, flags);
+
bp = getiobuf(devvp, true);
bp->b_flags = flags;
bp->b_cflags = BC_BUSY; /* mandatory, asserted by biowait() */
@@ -935,24 +990,77 @@ wapbl_read(void *data, size_t len, struc
*/
/*
+ * wapbl_buffered_write_async(wl, bp)
+ *
+ * Send buffer for asynchronous write.
+ */
+static void
+wapbl_buffered_write_async(struct wapbl *wl, struct buf *bp)
+{
+ wapbl_doio_accounting(wl->wl_devvp, bp->b_flags);
+
+ KASSERT(TAILQ_FIRST(&wl->wl_iobufs) == bp);
+ TAILQ_REMOVE(&wl->wl_iobufs, bp, b_wapbllist);
+
+ bp->b_flags = B_WRITE | WAPBL_JFLAGS(wl);
+ bp->b_cflags = BC_BUSY; /* mandatory, asserted by biowait() */
+ bp->b_oflags = 0;
+ bp->b_bcount = bp->b_resid;
+ BIO_SETPRIO(bp, BPRIO_TIMECRITICAL);
+
+ VOP_STRATEGY(wl->wl_devvp, bp);
+
+ wl->wl_ev_journalwrite.ev_count++;
+
+ TAILQ_INSERT_TAIL(&wl->wl_iobufs_busy, bp, b_wapbllist);
+}
+
+/*
* wapbl_buffered_flush(wl)
*
* Flush any buffered writes from wapbl_buffered_write.
*/
static int
-wapbl_buffered_flush(struct wapbl *wl)
+wapbl_buffered_flush(struct wapbl *wl, bool full)
{
- int error;
+ int error = 0;
+ struct buf *bp, *bnext;
+ bool only_done = true, found = false;
- if (wl->wl_buffer_used == 0)
- return 0;
+ /* if there is outstanding buffered write, send it now */
+ if ((bp = TAILQ_FIRST(&wl->wl_iobufs)) && bp->b_resid > 0)
+ wapbl_buffered_write_async(wl, bp);
+
+ /* wait for I/O to complete */
+again:
+ TAILQ_FOREACH_SAFE(bp, &wl->wl_iobufs_busy, b_wapbllist, bnext) {
+ if (!full && only_done) {
+ /* skip unfinished */
+ if (!ISSET(bp->b_oflags, BO_DONE))
+ continue;
+ }
+
+ if (ISSET(bp->b_oflags, BO_DONE))
+ wl->wl_ev_jbufs_bio_nowait.ev_count++;
+ else
+ wl->wl_ev_jbufs_bio_wait.ev_count++;
- error = wapbl_doio(wl->wl_buffer, wl->wl_buffer_used,
- wl->wl_devvp, wl->wl_buffer_dblk,
- B_WRITE | WAPBL_JFLAGS(wl));
- wl->wl_buffer_used = 0;
+ TAILQ_REMOVE(&wl->wl_iobufs_busy, bp, b_wapbllist);
+ error = biowait(bp);
- wl->wl_ev_journalwrite.ev_count++;
+ /* reset for reuse */
+ bp->b_blkno = bp->b_resid = 0;
+ TAILQ_INSERT_TAIL(&wl->wl_iobufs, bp, b_wapbllist);
+ found = true;
+
+ if (!full)
+ break;
+ }
+
+ if (!found && only_done && !TAILQ_EMPTY(&wl->wl_iobufs_busy)) {
+ only_done = false;
+ goto again;
+ }
return error;
}
@@ -967,49 +1075,63 @@ wapbl_buffered_flush(struct wapbl *wl)
static int
wapbl_buffered_write(void *data, size_t len, struct wapbl *wl, daddr_t pbn)
{
- int error;
size_t resid;
+ struct buf *bp;
+
+again:
+ bp = TAILQ_FIRST(&wl->wl_iobufs);
+
+ if (bp == NULL) {
+ /* No more buffers, wait for any previous I/O to finish. */
+ wapbl_buffered_flush(wl, false);
+
+ bp = TAILQ_FIRST(&wl->wl_iobufs);
+ KASSERT(bp != NULL);
+ }
/*
* If not adjacent to buffered data flush first. Disk block
* address is always valid for non-empty buffer.
*/
- if (wl->wl_buffer_used > 0 &&
- pbn != wl->wl_buffer_dblk + btodb(wl->wl_buffer_used)) {
- error = wapbl_buffered_flush(wl);
- if (error)
- return error;
+ if ((bp->b_resid > 0 && pbn != bp->b_blkno + btodb(bp->b_resid))) {
+ wapbl_buffered_write_async(wl, bp);
+ goto again;
}
+
/*
* If this write goes to an empty buffer we have to
* save the disk block address first.
*/
- if (wl->wl_buffer_used == 0)
- wl->wl_buffer_dblk = pbn;
+ if (bp->b_blkno == 0)
+ bp->b_blkno = pbn;
+
/*
- * Remaining space so this buffer ends on a MAXPHYS boundary.
+ * Remaining space so this buffer ends on a buffer size boundary.
*
* Cannot become less or equal zero as the buffer would have been
* flushed on the last call then.
*/
- resid = MAXPHYS - dbtob(wl->wl_buffer_dblk % btodb(MAXPHYS)) -
- wl->wl_buffer_used;
+ resid = bp->b_bufsize - dbtob(bp->b_blkno % btodb(bp->b_bufsize)) -
+ bp->b_resid;
KASSERT(resid > 0);
KASSERT(dbtob(btodb(resid)) == resid);
+
+ if (len < resid)
+ resid = len;
+
+ memcpy((uint8_t *)bp->b_data + bp->b_resid, data, resid);
+ bp->b_resid += resid;
+
if (len >= resid) {
- memcpy(wl->wl_buffer + wl->wl_buffer_used, data, resid);
- wl->wl_buffer_used += resid;
- error = wapbl_buffered_flush(wl);
+ /* Just filled the buf, or data did not fit */
+ wapbl_buffered_write_async(wl, bp);
+
data = (uint8_t *)data + resid;
len -= resid;
- wl->wl_buffer_dblk = pbn + btodb(resid);
- if (error)
- return error;
- }
- KASSERT(len < MAXPHYS);
- if (len > 0) {
- memcpy(wl->wl_buffer + wl->wl_buffer_used, data, len);
- wl->wl_buffer_used += len;
+ pbn += btodb(resid);
+
+ if (len > 0)
+ goto again;
}
return 0;
@@ -2014,6 +2136,30 @@ wapbl_print(struct wapbl *wl,
}
(*pr)("\n");
}
+
+ (*pr)("iobufs free =");
+ TAILQ_FOREACH(bp, &wl->wl_iobufs, b_wapbllist) {
+ if (!TAILQ_NEXT(bp, b_wapbllist)) {
+ (*pr)(" %p", bp);
+ } else if ((++cnt % 6) == 0) {
+ (*pr)(" %p,\n\t", bp);
+ } else {
+ (*pr)(" %p,", bp);
+ }
+ }
+ (*pr)("\n");
+
+ (*pr)("iobufs busy =");
+ TAILQ_FOREACH(bp, &wl->wl_iobufs_busy, b_wapbllist) {
+ if (!TAILQ_NEXT(bp, b_wapbllist)) {
+ (*pr)(" %p", bp);
+ } else if ((++cnt % 6) == 0) {
+ (*pr)(" %p,\n\t", bp);
+ } else {
+ (*pr)(" %p,", bp);
+ }
+ }
+ (*pr)("\n");
}
}
@@ -2315,7 +2461,7 @@ wapbl_write_commit(struct wapbl *wl, off
int error;
daddr_t pbn;
- error = wapbl_buffered_flush(wl);
+ error = wapbl_buffered_flush(wl, true);
if (error)
return error;
/*
@@ -2352,7 +2498,7 @@ wapbl_write_commit(struct wapbl *wl, off
error = wapbl_buffered_write(wc, wc->wc_len, wl, pbn);
if (error)
return error;
- error = wapbl_buffered_flush(wl);
+ error = wapbl_buffered_flush(wl, true);
if (error)
return error;