Module Name:	src
Committed By:	bouyer
Date:		Wed Sep 23 17:48:55 UTC 2009
Modified Files:
	src/sys/arch/xen/xen: xbd_xenbus.c xbdback_xenbus.c

Log Message:
xbdback: implement and publish "feature-flush-cache".
xbd: if feature-flush-cache is present, use it for DIOCCACHESYNC. If not
present, make DIOCCACHESYNC return EOPNOTSUPP and warn on first call.
Should improve WAPBL reliability of Xen guests on a NetBSD dom0.
Unfortunately, not all Linux guests seem to support this feature, and using
feature-write-barrier would require a B_BARRIER flag in the buffer.


To generate a diff of this commit:
cvs rdiff -u -r1.42 -r1.43 src/sys/arch/xen/xen/xbd_xenbus.c
cvs rdiff -u -r1.24 -r1.25 src/sys/arch/xen/xen/xbdback_xenbus.c

Please note that diffs are not public domain; they are subject to the
copyright notices on the relevant files.
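For context (an illustration, not part of the commit): once the backend
advertises feature-flush-cache, the new frontend path is reached through the
ordinary DIOCCACHESYNC ioctl. A minimal userland sketch, assuming the guest's
xbd disk is reachable through the raw device /dev/rxbd0d (an assumption;
adjust for your configuration):

/* flush the write cache of an xbd disk from a domU (illustrative only) */
#include <sys/ioctl.h>
#include <sys/dkio.h>

#include <err.h>
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	int fd, force = 0;	/* DIOCCACHESYNC takes an int "force" flag */

	/* /dev/rxbd0d is an assumption; use your xbd disk's raw device */
	fd = open("/dev/rxbd0d", O_RDWR);
	if (fd == -1)
		err(1, "open");

	if (ioctl(fd, DIOCCACHESYNC, &force) == -1) {
		if (errno == EOPNOTSUPP)
			/* backend did not publish feature-flush-cache */
			warnx("cache flush not supported by backend");
		else
			err(1, "DIOCCACHESYNC");
	} else
		printf("cache flushed\n");

	close(fd);
	return 0;
}

dkctl(8)'s synccache command should exercise the same ioctl, so a custom tool
is not strictly needed.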
Modified files:

Index: src/sys/arch/xen/xen/xbd_xenbus.c
diff -u src/sys/arch/xen/xen/xbd_xenbus.c:1.42 src/sys/arch/xen/xen/xbd_xenbus.c:1.43
--- src/sys/arch/xen/xen/xbd_xenbus.c:1.42	Mon Sep 21 21:59:30 2009
+++ src/sys/arch/xen/xen/xbd_xenbus.c	Wed Sep 23 17:48:55 2009
@@ -1,4 +1,4 @@
-/* $NetBSD: xbd_xenbus.c,v 1.42 2009/09/21 21:59:30 bouyer Exp $ */
+/* $NetBSD: xbd_xenbus.c,v 1.43 2009/09/23 17:48:55 bouyer Exp $ */
 
 /*
  * Copyright (c) 2006 Manuel Bouyer.
@@ -31,7 +31,7 @@
  */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: xbd_xenbus.c,v 1.42 2009/09/21 21:59:30 bouyer Exp $");
+__KERNEL_RCSID(0, "$NetBSD: xbd_xenbus.c,v 1.43 2009/09/23 17:48:55 bouyer Exp $");
 
 #include "opt_xen.h"
 #include "rnd.h"
@@ -84,11 +84,24 @@
 struct xbd_req {
 	SLIST_ENTRY(xbd_req) req_next;
 	uint16_t req_id; /* ID passed to backend */
-	grant_ref_t req_gntref[BLKIF_MAX_SEGMENTS_PER_REQUEST];
-	int req_nr_segments; /* number of segments in this request */
-	struct buf *req_bp; /* buffer associated with this request */
-	void *req_data; /* pointer to the data buffer */
+	union {
+		struct {
+			grant_ref_t req_gntref[BLKIF_MAX_SEGMENTS_PER_REQUEST];
+			int req_nr_segments; /* number of segments in this request */
+			struct buf *req_bp; /* buffer associated with this request */
+			void *req_data; /* pointer to the data buffer */
+		} req_rw;
+		struct {
+			int s_error;
+			volatile int s_done;
+		} req_sync;
+	} u;
 };
+#define req_gntref	u.req_rw.req_gntref
+#define req_nr_segments	u.req_rw.req_nr_segments
+#define req_bp		u.req_rw.req_bp
+#define req_data	u.req_rw.req_data
+#define req_sync	u.req_sync
 
 struct xbd_xenbus_softc {
 	device_t sc_dev;
@@ -104,6 +117,7 @@
 
 	struct xbd_req sc_reqs[XBD_RING_SIZE];
 	SLIST_HEAD(,xbd_req) sc_xbdreq_head; /* list of free requests */
+	bool sc_xbdreq_wait; /* special waiting on xbd_req */
 
 	int sc_backend_status; /* our status with backend */
 #define BLKIF_STATE_DISCONNECTED 0
@@ -119,6 +133,7 @@
 	uint64_t sc_xbdsize; /* size of disk in DEV_BSIZE */
 	u_long sc_info; /* VDISK_* */
 	u_long sc_handle; /* from backend */
+	int sc_cache_flush; /* backend supports BLKIF_OP_FLUSH_DISKCACHE */
 #if NRND > 0
 	rndsource_element_t sc_rnd_source;
 #endif
@@ -518,6 +533,7 @@
 {
 	int err;
 	unsigned long long sectors;
+	u_long cache_flush;
 
 	err = xenbus_read_ul(NULL, sc->sc_xbusd->xbusd_path,
 	    "virtual-device", &sc->sc_handle, 10);
@@ -541,6 +557,14 @@
 	if (err)
 		panic("%s: can't read number from %s/sector-size\n",
 		    device_xname(sc->sc_dev), sc->sc_xbusd->xbusd_otherend);
+	err = xenbus_read_ul(NULL, sc->sc_xbusd->xbusd_otherend,
+	    "feature-flush-cache", &cache_flush, 10);
+	if (err)
+		cache_flush = 0;
+	if (cache_flush > 0)
+		sc->sc_cache_flush = 1;
+	else
+		sc->sc_cache_flush = 0;
 
 	xenbus_switch_state(sc->sc_xbusd, NULL, XenbusStateConnected);
 }
@@ -564,9 +588,16 @@
 	for (i = sc->sc_ring.rsp_cons; i != resp_prod; i++) {
 		blkif_response_t *rep = RING_GET_RESPONSE(&sc->sc_ring, i);
 		struct xbd_req *xbdreq = &sc->sc_reqs[rep->id];
-		bp = xbdreq->req_bp;
 		DPRINTF(("xbd_handler(%p): b_bcount = %ld\n",
-		    bp, (long)bp->b_bcount));
+		    xbdreq->req_bp, (long)bp->b_bcount));
+		bp = xbdreq->req_bp;
+		if (rep->operation == BLKIF_OP_FLUSH_DISKCACHE) {
+			xbdreq->req_sync.s_error = rep->status;
+			xbdreq->req_sync.s_done = 1;
+			wakeup(xbdreq);
+			/* caller will free the req */
+			continue;
+		}
 		for (seg = xbdreq->req_nr_segments - 1; seg >= 0; seg--) {
 			if (__predict_false(
 			    xengnt_status(xbdreq->req_gntref[seg]))) {
@@ -608,13 +639,15 @@
 		biodone(bp);
 		SLIST_INSERT_HEAD(&sc->sc_xbdreq_head, xbdreq, req_next);
 	}
+done:
 	xen_rmb();
 	sc->sc_ring.rsp_cons = i;
 	RING_FINAL_CHECK_FOR_RESPONSES(&sc->sc_ring, more_to_do);
 	if (more_to_do)
 		goto again;
-done:
 	dk_iodone(sc->sc_di, &sc->sc_dksc);
+	if (sc->sc_xbdreq_wait)
+		wakeup(&sc->sc_xbdreq_wait);
 	return 1;
 }
 
@@ -717,6 +750,10 @@
 	struct dk_softc *dksc;
 	int error;
 	struct disk *dk;
+	int s;
+	struct xbd_req *xbdreq;
+	blkif_request_t *req;
+	int notify;
 
 	DPRINTF(("xbdioctl(%d, %08lx, %p, %d, %p)\n",
 	    dev, cmd, data, flag, l));
@@ -731,6 +768,57 @@
 	case DIOCSSTRATEGY:
 		error = EOPNOTSUPP;
 		break;
+	case DIOCCACHESYNC:
+		if (sc->sc_cache_flush <= 0) {
+			if (sc->sc_cache_flush == 0) {
+				aprint_error_dev(sc->sc_dev,
+				    "WARNING: cache flush not supported "
+				    "by backend\n");
+				sc->sc_cache_flush = -1;
+			}
+			return EOPNOTSUPP;
+		}
+
+		s = splbio();
+
+		while (RING_FULL(&sc->sc_ring)) {
+			sc->sc_xbdreq_wait = 1;
+			tsleep(&sc->sc_xbdreq_wait, PRIBIO, "xbdreq", 0);
+		}
+		sc->sc_xbdreq_wait = 0;
+
+		xbdreq = SLIST_FIRST(&sc->sc_xbdreq_head);
+		if (__predict_false(xbdreq == NULL)) {
+			DPRINTF(("xbdioctl: no req\n"));
+			error = ENOMEM;
+		} else {
+			SLIST_REMOVE_HEAD(&sc->sc_xbdreq_head, req_next);
+			req = RING_GET_REQUEST(&sc->sc_ring,
+			    sc->sc_ring.req_prod_pvt);
+			req->id = xbdreq->req_id;
+			req->operation = BLKIF_OP_FLUSH_DISKCACHE;
+			req->handle = sc->sc_handle;
+			xbdreq->req_sync.s_done = 0;
+			sc->sc_ring.req_prod_pvt++;
+			RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&sc->sc_ring,
+			    notify);
+			if (notify)
+				hypervisor_notify_via_evtchn(sc->sc_evtchn);
+			/* request sent, no wait for completion */
+			while (xbdreq->req_sync.s_done == 0) {
+				tsleep(xbdreq, PRIBIO, "xbdsync", 0);
+			}
+			if (xbdreq->req_sync.s_error == BLKIF_RSP_EOPNOTSUPP)
+				error = EOPNOTSUPP;
+			else if (xbdreq->req_sync.s_error == BLKIF_RSP_OKAY)
+				error = 0;
+			else
+				error = EIO;
+			SLIST_INSERT_HEAD(&sc->sc_xbdreq_head, xbdreq,
+			    req_next);
+		}
+		splx(s);
+		break;
 	default:
 		error = dk_ioctl(sc->sc_di, dksc, dev, cmd, data, flag, l);
 		break;
@@ -788,7 +876,7 @@
 	}
 
-	if (RING_FULL(&sc->sc_ring)) {
+	if (RING_FULL(&sc->sc_ring) || sc->sc_xbdreq_wait) {
 		DPRINTF(("xbdstart: ring_full\n"));
 		ret = -1;
 		goto out;

Index: src/sys/arch/xen/xen/xbdback_xenbus.c
diff -u src/sys/arch/xen/xen/xbdback_xenbus.c:1.24 src/sys/arch/xen/xen/xbdback_xenbus.c:1.25
--- src/sys/arch/xen/xen/xbdback_xenbus.c:1.24	Wed Jan 21 09:55:53 2009
+++ src/sys/arch/xen/xen/xbdback_xenbus.c	Wed Sep 23 17:48:55 2009
@@ -1,4 +1,4 @@
-/* $NetBSD: xbdback_xenbus.c,v 1.24 2009/01/21 09:55:53 cegger Exp $ */
+/* $NetBSD: xbdback_xenbus.c,v 1.25 2009/09/23 17:48:55 bouyer Exp $ */
 
 /*
  * Copyright (c) 2006 Manuel Bouyer.
@@ -31,7 +31,7 @@
  */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: xbdback_xenbus.c,v 1.24 2009/01/21 09:55:53 cegger Exp $");
+__KERNEL_RCSID(0, "$NetBSD: xbdback_xenbus.c,v 1.25 2009/09/23 17:48:55 bouyer Exp $");
 
 #include <sys/types.h>
 #include <sys/param.h>
@@ -91,6 +91,31 @@
  * it's finished, set xbdi->xbdi_cont (see below) to NULL and the return
  * doesn't matter. Otherwise it's passed as the second parameter to
  * the new value of xbdi->xbdi_cont.
+ * Here's how the call graph is supposed to be for a single I/O:
+ * xbdback_co_main()
+ *        |          |-> xbdback_co_cache_doflush() -> stall
+ *        |          xbdback_co_cache_flush2() <- xbdback_co_flush_done() <-
+ *        |          |                                                     |
+ *        |          |-> xbdback_co_cache_flush() -> xbdback_co_flush() --
+ * xbdback_co_main_loop() -> xbdback_co_main_done() -> xbdback_co_flush()
+ *        |                       |                           |
+ *        |                  xbdback_co_main_done2() <- xbdback_co_flush_done()
+ *        |                       |
+ *        |                  xbdback_co_main() or NULL
+ * xbdback_co_io() -> xbdback_co_main_incr() -> xbdback_co_main_loop()
+ *        |
+ * xbdback_co_io_gotreq() -> xbdback_co_flush() -> xbdback_co_flush()
+ *        |                       |                       |
+ * xbdback_co_io_loop() --- <---------------- xbdback_co_flush_done()
+ *        |                       |
+ * xbdback_co_io_gotio()          |
+ *        |                       |
+ * xbdback_co_io_gotio2()<-
+ *        |          |--------> xbdback_co_io_gotfrag
+ *        |          |
+ * xbdback_co_io_gotfrag2() <----------|
+ *        |          |--> xbdback_co_io_loop()
+ * xbdback_co_main_incr()
  */
 
 typedef void *(* xbdback_cont_t)(struct xbdback_instance *, void *);
@@ -144,6 +169,7 @@
 	grant_ref_t xbdi_thisgrt, xbdi_lastgrt; /* grants */
 	/* other state */
 	int xbdi_same_page; /* are we merging two segments on the same page? */
+	uint xbdi_pendingreqs; /* number of I/O in fly */
 };
 /* Manipulation of the above reference count. */
 /* xxx...@panix.com: not MP-safe, and move the i386 asm elsewhere. */
@@ -180,16 +206,35 @@
  */
 struct xbdback_io {
 	struct work xio_work;
-	struct buf xio_buf; /* our I/O */
 	/* The instance pointer is duplicated for convenience. */
 	struct xbdback_instance *xio_xbdi; /* our xbd instance */
-	SLIST_HEAD(, xbdback_fragment) xio_rq; /* xbd requests involved */
-	vaddr_t xio_vaddr; /* the virtual address to map the request at */
-	grant_ref_t xio_gref[XENSHM_MAX_PAGES_PER_REQUEST]; /* grants to map */
-	grant_handle_t xio_gh[XENSHM_MAX_PAGES_PER_REQUEST];/* grants release */
-	uint16_t xio_nrma; /* number of guest pages */
-	uint16_t xio_mapped;
+	uint8_t xio_operation;
+	union {
+		struct {
+			struct buf xio_buf; /* our I/O */
+			/* xbd requests involved */
+			SLIST_HEAD(, xbdback_fragment) xio_rq;
+			/* the virtual address to map the request at */
+			vaddr_t xio_vaddr;
+			/* grants to map */
+			grant_ref_t xio_gref[XENSHM_MAX_PAGES_PER_REQUEST];
+			/* grants release */
+			grant_handle_t xio_gh[XENSHM_MAX_PAGES_PER_REQUEST];
+			uint16_t xio_nrma; /* number of guest pages */
+			uint16_t xio_mapped;
+		} xio_rw;
+		uint64_t xio_flush_id;
+	} u;
 };
+#define xio_buf		u.xio_rw.xio_buf
+#define xio_rq		u.xio_rw.xio_rq
+#define xio_vaddr	u.xio_rw.xio_vaddr
+#define xio_gref	u.xio_rw.xio_gref
+#define xio_gh		u.xio_rw.xio_gh
+#define xio_nrma	u.xio_rw.xio_nrma
+#define xio_mapped	u.xio_rw.xio_mapped
+
+#define xio_flush_id	u.xio_flush_id
 
 /*
  * Rather than have the xbdback_io keep an array of the
@@ -236,6 +281,10 @@
 static void *xbdback_co_main_done(struct xbdback_instance *, void *);
 static void *xbdback_co_main_done2(struct xbdback_instance *, void *);
 
+static void *xbdback_co_cache_flush(struct xbdback_instance *, void *);
+static void *xbdback_co_cache_flush2(struct xbdback_instance *, void *);
+static void *xbdback_co_cache_doflush(struct xbdback_instance *, void *);
+
 static void *xbdback_co_io(struct xbdback_instance *, void *);
 static void *xbdback_co_io_gotreq(struct xbdback_instance *, void *);
 static void *xbdback_co_io_loop(struct xbdback_instance *, void *);
@@ -742,6 +791,13 @@
 		    xbusd->xbusd_path, err);
 		goto abort;
 	}
+	err = xenbus_printf(xbt, xbusd->xbusd_path, "feature-flush-cache",
+	    "%u", 1);
+	if (err) {
+		printf("xbdback: failed to write %s/feature-flush-cache: %d\n",
+		    xbusd->xbusd_path, err);
+		goto abort;
+	}
 	err = xenbus_transaction_end(xbt, 0);
 	if (err == EAGAIN)
 		goto again;
@@ -861,6 +917,10 @@
 	case BLKIF_OP_WRITE:
 		xbdi->xbdi_cont = xbdback_co_io;
 		break;
+	case BLKIF_OP_FLUSH_DISKCACHE:
+		xbdi_get(xbdi);
+		xbdi->xbdi_cont = xbdback_co_cache_flush;
+		break;
 	default:
 		printf("xbdback_evthandler domain %d: unknown "
 		    "operation %d\n", xbdi->xbdi_domid, req->operation);
@@ -911,6 +971,50 @@
 }
 
 static void *
+xbdback_co_cache_flush(struct xbdback_instance *xbdi, void *obj)
+{
+	(void)obj;
+	XENPRINTF(("xbdback_co_cache_flush %p %p\n", xbdi, obj));
+	if (xbdi->xbdi_io != NULL) {
+		xbdi->xbdi_cont = xbdback_co_flush;
+		xbdi->xbdi_cont_aux = xbdback_co_cache_flush2;
+	} else {
+		xbdi->xbdi_cont = xbdback_co_cache_flush2;
+	}
+	return xbdi;
+}
+
+static void *
+xbdback_co_cache_flush2(struct xbdback_instance *xbdi, void *obj)
+{
+	(void)obj;
+	XENPRINTF(("xbdback_co_cache_flush2 %p %p\n", xbdi, obj));
+	if (xbdi->xbdi_pendingreqs > 0) {
+		/* event or iodone will restart processing */
+		xbdi->xbdi_cont = NULL;
+		return NULL;
+	}
+	xbdi->xbdi_cont = xbdback_co_cache_doflush;
+	return xbdback_pool_get(&xbdback_io_pool, xbdi);
+}
+
+static void *
+xbdback_co_cache_doflush(struct xbdback_instance *xbdi, void *obj)
+{
+	struct xbdback_io *xbd_io;
+
+	XENPRINTF(("xbdback_co_cache_doflush %p %p\n", xbdi, obj));
+	xbd_io = xbdi->xbdi_io = obj;
+	xbd_io->xio_xbdi = xbdi;
+	xbd_io->xio_operation = xbdi->xbdi_xen_req.operation;
+	xbd_io->xio_flush_id = xbdi->xbdi_xen_req.id;
+	workqueue_enqueue(xbdback_workqueue, &xbdi->xbdi_io->xio_work, NULL);
+	/* xbdback_do_io() will advance req pointer and restart processing */
+	xbdi->xbdi_cont = xbdback_co_cache_doflush;
+	return NULL;
+}
+
+static void *
 xbdback_co_io(struct xbdback_instance *xbdi, void *obj)
 {
 	int error;
@@ -1051,7 +1155,6 @@
 	if (xbdi->xbdi_io == NULL) {
 		xbdi->xbdi_cont = xbdback_co_io_gotio;
 		xio = xbdback_pool_get(&xbdback_io_pool, xbdi);
-		buf_init(&xio->xio_buf);
 		return xio;
 	} else {
 		xbdi->xbdi_cont = xbdback_co_io_gotio2;
@@ -1073,12 +1176,15 @@
 	int buf_flags;
 
 	xbdi_get(xbdi);
+	atomic_inc_uint(&xbdi->xbdi_pendingreqs);
 	xbd_io = xbdi->xbdi_io = obj;
+	buf_init(&xbd_io->xio_buf);
 	xbd_io->xio_xbdi = xbdi;
 	SLIST_INIT(&xbd_io->xio_rq);
 	xbd_io->xio_nrma = 0;
 	xbd_io->xio_mapped = 0;
+	xbd_io->xio_operation = xbdi->xbdi_xen_req.operation;
 
 	start_offset = xbdi->xbdi_this_fs * VBD_BSIZE;
 
@@ -1208,6 +1314,33 @@
 	struct xbdback_io *xbd_io = (void *)wk;
 	KASSERT(&xbd_io->xio_work == wk);
 
+	if (xbd_io->xio_operation == BLKIF_OP_FLUSH_DISKCACHE) {
+		int error;
+		struct xbdback_instance *xbdi = xbd_io->xio_xbdi;
+
+		error = VOP_IOCTL(xbdi->xbdi_vp, DIOCCACHESYNC, NULL, FWRITE,
+		    kauth_cred_get());
+		if (error) {
+			aprint_error("xbdback %s: DIOCCACHESYNC returned %d\n",
+			    xbdi->xbdi_xbusd->xbusd_path, error);
+			if (error == EOPNOTSUPP || error == ENOTTY)
+				error = BLKIF_RSP_EOPNOTSUPP;
+			else
+				error = BLKIF_RSP_ERROR;
+		} else
+			error = BLKIF_RSP_OKAY;
+		xbdback_send_reply(xbdi, xbd_io->xio_flush_id,
+		    xbd_io->xio_operation, error);
+		xbdback_pool_put(&xbdback_io_pool, xbd_io);
+		xbdi_put(xbdi);
+		/* handle next IO */
+		xbdi->xbdi_io = NULL;
+		xbdi->xbdi_cont = xbdback_co_main_incr;
+		xbdback_trampoline(xbdi, xbdi);
+		return;
+	}
+
+	/* should be read or write */
 	xbd_io->xio_buf.b_data = (void *)((vaddr_t)xbd_io->xio_buf.b_data +
 	    xbd_io->xio_vaddr);
 #ifdef DIAGNOSTIC
@@ -1295,8 +1428,14 @@
 		xbdback_pool_put(&xbdback_request_pool, xbd_req);
 	}
 	xbdi_put(xbdi);
+	atomic_dec_uint(&xbdi->xbdi_pendingreqs);
 	buf_destroy(&xbd_io->xio_buf);
 	xbdback_pool_put(&xbdback_io_pool, xbd_io);
+	if (xbdi->xbdi_cont == NULL) {
+		/* check if there is more work to do */
+		xbdi->xbdi_cont = xbdback_co_main;
+		xbdback_trampoline(xbdi, xbdi);
+	}
 }
 
 /*
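A side note on the frontend's DIOCCACHESYNC path above: it is a plain
synchronous completion handshake. The ioctl queues a BLKIF_OP_FLUSH_DISKCACHE
request and tsleep()s on the xbd_req; the interrupt handler records the status
in req_sync.s_error, sets req_sync.s_done and wakeup()s the sleeper. The sketch
below is a userland analogue of that pattern using pthreads in place of
tsleep()/wakeup(); it is illustrative only, not driver code.

/*
 * Userland analogue of the frontend's flush completion handshake:
 * the submitter sleeps on the request until the completion side
 * marks it done and wakes it up.
 */
#include <pthread.h>
#include <stdio.h>

struct sync_req {
	pthread_mutex_t mtx;
	pthread_cond_t cv;
	int done;		/* plays the role of req_sync.s_done */
	int error;		/* plays the role of req_sync.s_error */
};

static void *
completion_side(void *arg)	/* stands in for the interrupt handler */
{
	struct sync_req *r = arg;

	pthread_mutex_lock(&r->mtx);
	r->error = 0;			/* "BLKIF_RSP_OKAY" */
	r->done = 1;
	pthread_cond_signal(&r->cv);	/* wakeup(xbdreq) equivalent */
	pthread_mutex_unlock(&r->mtx);
	return NULL;
}

int
main(void)
{
	struct sync_req r = { PTHREAD_MUTEX_INITIALIZER,
	    PTHREAD_COND_INITIALIZER, 0, -1 };
	pthread_t t;

	pthread_create(&t, NULL, completion_side, &r);

	/* submitter: wait until the completion side marks the request done */
	pthread_mutex_lock(&r.mtx);
	while (r.done == 0)
		pthread_cond_wait(&r.cv, &r.mtx);	/* tsleep() equivalent */
	pthread_mutex_unlock(&r.mtx);

	printf("flush completed, status %d\n", r.error);
	pthread_join(t, NULL);
	return 0;
}

The loop around the wait mirrors the driver's loop around tsleep(), guarding
against spurious wakeups.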