Module Name:    src
Committed By:   bouyer
Date:           Wed Sep 23 17:48:55 UTC 2009

Modified Files:
        src/sys/arch/xen/xen: xbd_xenbus.c xbdback_xenbus.c

Log Message:
xbdback: implement and publish "feature-flush-cache".
xbd: if feature-flush-cache is present, use it for DIOCCACHESYNC.
 If not present, make DIOCCACHESYNC return EOPNOTSUPP and warn on
 first call.
Should improve WAPBL reliability for Xen guests running on a NetBSD dom0.
Unfortunately, not all Linux guests seem to support this feature, and using
feature-write-barrier would require a B_BARRIER flag in the buffer.
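
For illustration only (not part of this commit): from a NetBSD guest, the
new path can be exercised with a plain DIOCCACHESYNC ioctl on the xbd
device. A minimal sketch, assuming a device path of /dev/rxbd0d (an
example; adjust for the actual guest configuration):

	#include <sys/ioctl.h>
	#include <sys/dkio.h>

	#include <err.h>
	#include <fcntl.h>
	#include <stdio.h>
	#include <unistd.h>

	int
	main(void)
	{
		int fd, force = 0;

		/* example device path; depends on the guest's disk layout */
		fd = open("/dev/rxbd0d", O_RDWR, 0);
		if (fd == -1)
			err(1, "open");
		/*
		 * With feature-flush-cache present this issues
		 * BLKIF_OP_FLUSH_DISKCACHE to the backend; otherwise it
		 * now fails with EOPNOTSUPP (and the kernel warns once).
		 */
		if (ioctl(fd, DIOCCACHESYNC, &force) == -1)
			warn("DIOCCACHESYNC");
		else
			printf("cache flush issued\n");
		close(fd);
		return 0;
	}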


To generate a diff of this commit:
cvs rdiff -u -r1.42 -r1.43 src/sys/arch/xen/xen/xbd_xenbus.c
cvs rdiff -u -r1.24 -r1.25 src/sys/arch/xen/xen/xbdback_xenbus.c

Please note that diffs are not public domain; they are subject to the
copyright notices on the relevant files.

Index: src/sys/arch/xen/xen/xbd_xenbus.c
diff -u src/sys/arch/xen/xen/xbd_xenbus.c:1.42 src/sys/arch/xen/xen/xbd_xenbus.c:1.43
--- src/sys/arch/xen/xen/xbd_xenbus.c:1.42	Mon Sep 21 21:59:30 2009
+++ src/sys/arch/xen/xen/xbd_xenbus.c	Wed Sep 23 17:48:55 2009
@@ -1,4 +1,4 @@
-/*      $NetBSD: xbd_xenbus.c,v 1.42 2009/09/21 21:59:30 bouyer Exp $      */
+/*      $NetBSD: xbd_xenbus.c,v 1.43 2009/09/23 17:48:55 bouyer Exp $      */
 
 /*
  * Copyright (c) 2006 Manuel Bouyer.
@@ -31,7 +31,7 @@
  */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: xbd_xenbus.c,v 1.42 2009/09/21 21:59:30 bouyer Exp $");
+__KERNEL_RCSID(0, "$NetBSD: xbd_xenbus.c,v 1.43 2009/09/23 17:48:55 bouyer Exp $");
 
 #include "opt_xen.h"
 #include "rnd.h"
@@ -84,11 +84,24 @@
 struct xbd_req {
 	SLIST_ENTRY(xbd_req) req_next;
 	uint16_t req_id; /* ID passed to backend */
-	grant_ref_t req_gntref[BLKIF_MAX_SEGMENTS_PER_REQUEST];
-	int req_nr_segments; /* number of segments in this request */
-	struct buf *req_bp; /* buffer associated with this request */
-	void *req_data; /* pointer to the data buffer */
+	union {
+	    struct {
+		grant_ref_t req_gntref[BLKIF_MAX_SEGMENTS_PER_REQUEST];
+		int req_nr_segments; /* number of segments in this request */
+		struct buf *req_bp; /* buffer associated with this request */
+		void *req_data; /* pointer to the data buffer */
+	    } req_rw;
+	    struct {
+		    int s_error;
+		    volatile int s_done;
+	    } req_sync;
+	} u;
 };
+#define req_gntref	u.req_rw.req_gntref
+#define req_nr_segments	u.req_rw.req_nr_segments
+#define req_bp		u.req_rw.req_bp
+#define req_data	u.req_rw.req_data
+#define req_sync	u.req_sync
 
 struct xbd_xenbus_softc {
 	device_t sc_dev;
@@ -104,6 +117,7 @@
 
 	struct xbd_req sc_reqs[XBD_RING_SIZE];
 	SLIST_HEAD(,xbd_req) sc_xbdreq_head; /* list of free requests */
+	bool sc_xbdreq_wait; /* special waiting on xbd_req */
 
 	int sc_backend_status; /* our status with backend */
 #define BLKIF_STATE_DISCONNECTED 0
@@ -119,6 +133,7 @@
 	uint64_t sc_xbdsize; /* size of disk in DEV_BSIZE */
 	u_long sc_info; /* VDISK_* */
 	u_long sc_handle; /* from backend */
+	int sc_cache_flush; /* backend supports BLKIF_OP_FLUSH_DISKCACHE */
 #if NRND > 0
 	rndsource_element_t     sc_rnd_source;
 #endif
@@ -518,6 +533,7 @@
 {
 	int err;
 	unsigned long long sectors;
+	u_long cache_flush;
 
 	err = xenbus_read_ul(NULL,
 	    sc->sc_xbusd->xbusd_path, "virtual-device", &sc->sc_handle, 10);
@@ -541,6 +557,14 @@
 	if (err)
 		panic("%s: can't read number from %s/sector-size\n", 
 		    device_xname(sc->sc_dev), sc->sc_xbusd->xbusd_otherend);
+	err = xenbus_read_ul(NULL, sc->sc_xbusd->xbusd_otherend,
+	    "feature-flush-cache", &cache_flush, 10);
+	if (err)
+		cache_flush = 0;
+	if (cache_flush > 0)
+		sc->sc_cache_flush = 1;
+	else
+		sc->sc_cache_flush = 0;
 
 	xenbus_switch_state(sc->sc_xbusd, NULL, XenbusStateConnected);
 }
@@ -564,9 +588,16 @@
 	for (i = sc->sc_ring.rsp_cons; i != resp_prod; i++) {
 		blkif_response_t *rep = RING_GET_RESPONSE(&sc->sc_ring, i);
 		struct xbd_req *xbdreq = &sc->sc_reqs[rep->id];
-		bp = xbdreq->req_bp;
 		DPRINTF(("xbd_handler(%p): b_bcount = %ld\n",
-		    bp, (long)bp->b_bcount));
+		    xbdreq->req_bp, (long)xbdreq->req_bp->b_bcount));
+		bp = xbdreq->req_bp;
+		if (rep->operation == BLKIF_OP_FLUSH_DISKCACHE) {
+			xbdreq->req_sync.s_error = rep->status;
+			xbdreq->req_sync.s_done = 1;
+			wakeup(xbdreq);
+			/* caller will free the req */
+			continue;
+		}
 		for (seg = xbdreq->req_nr_segments - 1; seg >= 0; seg--) {
 			if (__predict_false(
 			    xengnt_status(xbdreq->req_gntref[seg]))) {
@@ -608,13 +639,15 @@
 		biodone(bp);
 		SLIST_INSERT_HEAD(&sc->sc_xbdreq_head, xbdreq, req_next);
 	}
+done:
 	xen_rmb();
 	sc->sc_ring.rsp_cons = i;
 	RING_FINAL_CHECK_FOR_RESPONSES(&sc->sc_ring, more_to_do);
 	if (more_to_do)
 		goto again;
-done:
 	dk_iodone(sc->sc_di, &sc->sc_dksc);
+	if (sc->sc_xbdreq_wait)
+		wakeup(&sc->sc_xbdreq_wait);
 	return 1;
 }
 
@@ -717,6 +750,10 @@
 	struct	dk_softc *dksc;
 	int	error;
 	struct	disk *dk;
+	int s;
+	struct xbd_req *xbdreq;
+	blkif_request_t *req;
+	int notify;
 
 	DPRINTF(("xbdioctl(%d, %08lx, %p, %d, %p)\n",
 	    dev, cmd, data, flag, l));
@@ -731,6 +768,57 @@
 	case DIOCSSTRATEGY:
 		error = EOPNOTSUPP;
 		break;
+	case DIOCCACHESYNC:
+		if (sc->sc_cache_flush <= 0) {
+			if (sc->sc_cache_flush == 0) {
+				aprint_error_dev(sc->sc_dev,
+				    "WARNING: cache flush not supported "
+				    "by backend\n");
+				sc->sc_cache_flush = -1;
+			}
+			return EOPNOTSUPP;
+		}
+
+		s = splbio();
+
+		while (RING_FULL(&sc->sc_ring)) {
+			sc->sc_xbdreq_wait = 1;
+			tsleep(&sc->sc_xbdreq_wait, PRIBIO, "xbdreq", 0);
+		}
+		sc->sc_xbdreq_wait = 0;
+
+		xbdreq = SLIST_FIRST(&sc->sc_xbdreq_head);
+		if (__predict_false(xbdreq == NULL)) {
+			DPRINTF(("xbdioctl: no req\n"));
+			error = ENOMEM;
+		} else {
+			SLIST_REMOVE_HEAD(&sc->sc_xbdreq_head, req_next);
+			req = RING_GET_REQUEST(&sc->sc_ring,
+			    sc->sc_ring.req_prod_pvt);
+			req->id = xbdreq->req_id;
+			req->operation = BLKIF_OP_FLUSH_DISKCACHE;
+			req->handle = sc->sc_handle;
+			xbdreq->req_sync.s_done = 0;
+			sc->sc_ring.req_prod_pvt++;
+			RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&sc->sc_ring,
+			    notify);
+			if (notify)
+				hypervisor_notify_via_evtchn(sc->sc_evtchn);
+			/* request sent, now wait for completion */
+			while (xbdreq->req_sync.s_done == 0) {
+				tsleep(xbdreq, PRIBIO, "xbdsync", 0);
+			}
+			if (xbdreq->req_sync.s_error == BLKIF_RSP_EOPNOTSUPP)
+				error = EOPNOTSUPP;
+			else if (xbdreq->req_sync.s_error == BLKIF_RSP_OKAY)
+				error = 0;
+			else
+				error = EIO;
+			SLIST_INSERT_HEAD(&sc->sc_xbdreq_head, xbdreq,
+			    req_next);
+		}
+		splx(s);
+		break;
 	default:
 		error = dk_ioctl(sc->sc_di, dksc, dev, cmd, data, flag, l);
 		break;
@@ -788,7 +876,7 @@
 	}
 		
 
-	if (RING_FULL(&sc->sc_ring)) {
+	if (RING_FULL(&sc->sc_ring) || sc->sc_xbdreq_wait) {
 		DPRINTF(("xbdstart: ring_full\n"));
 		ret = -1;
 		goto out;

Index: src/sys/arch/xen/xen/xbdback_xenbus.c
diff -u src/sys/arch/xen/xen/xbdback_xenbus.c:1.24 src/sys/arch/xen/xen/xbdback_xenbus.c:1.25
--- src/sys/arch/xen/xen/xbdback_xenbus.c:1.24	Wed Jan 21 09:55:53 2009
+++ src/sys/arch/xen/xen/xbdback_xenbus.c	Wed Sep 23 17:48:55 2009
@@ -1,4 +1,4 @@
-/*      $NetBSD: xbdback_xenbus.c,v 1.24 2009/01/21 09:55:53 cegger Exp $      */
+/*      $NetBSD: xbdback_xenbus.c,v 1.25 2009/09/23 17:48:55 bouyer Exp $      */
 
 /*
  * Copyright (c) 2006 Manuel Bouyer.
@@ -31,7 +31,7 @@
  */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: xbdback_xenbus.c,v 1.24 2009/01/21 09:55:53 cegger Exp $");
+__KERNEL_RCSID(0, "$NetBSD: xbdback_xenbus.c,v 1.25 2009/09/23 17:48:55 bouyer Exp $");
 
 #include <sys/types.h>
 #include <sys/param.h>
@@ -91,6 +91,31 @@
  * it's finished, set xbdi->xbdi_cont (see below) to NULL and the return
  * doesn't matter.  Otherwise it's passed as the second parameter to
  * the new value of xbdi->xbdi_cont.
+ * Here's how the call graph is supposed to be for a single I/O:
+ * xbdback_co_main()   
+ *        |           |-> xbdback_co_cache_doflush() -> stall
+ *        |          xbdback_co_cache_flush2() <-  xbdback_co_flush_done() <-
+ *        |                              |                                   |
+ *        |              |-> xbdback_co_cache_flush() -> xbdback_co_flush() --
+ * xbdback_co_main_loop() -> xbdback_co_main_done() -> xbdback_co_flush()
+ *        |                              |                      |
+ *        |                  xbdback_co_main_done2() <- xbdback_co_flush_done()
+ *        |                              |
+ *        |                  xbdback_co_main() or NULL
+ *   xbdback_co_io() -> xbdback_co_main_incr() -> xbdback_co_main_loop()
+ *        |
+ *   xbdback_co_io_gotreq() -> xbdback_co_flush() -> xbdback_co_flush()
+ *        |                |                                |
+ *   xbdback_co_io_loop() ---        <---------------- xbdback_co_flush_done()
+ *        |                 |
+ *   xbdback_co_io_gotio()  |
+ *        |                 |
+ *   xbdback_co_io_gotio2()<-
+ *        |              |-------->  xbdback_co_io_gotfrag()
+ *        |                              |
+ *   xbdback_co_io_gotfrag2() <----------|
+ *        |                 |--> xbdback_co_io_loop()
+ *   xbdback_co_main_incr()
  */
 typedef void *(* xbdback_cont_t)(struct xbdback_instance *, void *);
 
@@ -144,6 +169,7 @@
 	grant_ref_t xbdi_thisgrt, xbdi_lastgrt; /* grants */
 	/* other state */
 	int xbdi_same_page; /* are we merging two segments on the same page? */
+	uint xbdi_pendingreqs; /* number of I/Os in flight */
 };
 /* Manipulation of the above reference count. */
 /* xxx...@panix.com: not MP-safe, and move the i386 asm elsewhere. */
@@ -180,16 +206,35 @@
  */
 struct xbdback_io {
 	struct work xio_work;
-	struct buf xio_buf; /* our I/O */
 	/* The instance pointer is duplicated for convenience. */
 	struct xbdback_instance *xio_xbdi; /* our xbd instance */
-	SLIST_HEAD(, xbdback_fragment) xio_rq; /* xbd requests involved */
-	vaddr_t xio_vaddr; /* the virtual address to map the request at */
-	grant_ref_t xio_gref[XENSHM_MAX_PAGES_PER_REQUEST]; /* grants to map */
-	grant_handle_t xio_gh[XENSHM_MAX_PAGES_PER_REQUEST];/* grants release */
-	uint16_t xio_nrma; /* number of guest pages */
-	uint16_t xio_mapped;
+	uint8_t xio_operation;
+	union {
+		struct {
+			struct buf xio_buf; /* our I/O */
+			/* xbd requests involved */
+			SLIST_HEAD(, xbdback_fragment) xio_rq;
+			/* the virtual address to map the request at */
+			vaddr_t xio_vaddr;
+			/* grants to map */
+			grant_ref_t xio_gref[XENSHM_MAX_PAGES_PER_REQUEST];
+			/* grants release */
+			grant_handle_t xio_gh[XENSHM_MAX_PAGES_PER_REQUEST];
+			uint16_t xio_nrma; /* number of guest pages */
+			uint16_t xio_mapped;
+		} xio_rw;
+		uint64_t xio_flush_id;
+	} u;
 };
+#define xio_buf		u.xio_rw.xio_buf
+#define xio_rq		u.xio_rw.xio_rq
+#define xio_vaddr	u.xio_rw.xio_vaddr
+#define xio_gref	u.xio_rw.xio_gref
+#define xio_gh		u.xio_rw.xio_gh
+#define xio_nrma	u.xio_rw.xio_nrma
+#define xio_mapped	u.xio_rw.xio_mapped
+
+#define xio_flush_id	u.xio_flush_id
 
 /*
  * Rather than have the xbdback_io keep an array of the
@@ -236,6 +281,10 @@
 static void *xbdback_co_main_done(struct xbdback_instance *, void *);
 static void *xbdback_co_main_done2(struct xbdback_instance *, void *);
 
+static void *xbdback_co_cache_flush(struct xbdback_instance *, void *);
+static void *xbdback_co_cache_flush2(struct xbdback_instance *, void *);
+static void *xbdback_co_cache_doflush(struct xbdback_instance *, void *);
+
 static void *xbdback_co_io(struct xbdback_instance *, void *);
 static void *xbdback_co_io_gotreq(struct xbdback_instance *, void *);
 static void *xbdback_co_io_loop(struct xbdback_instance *, void *);
@@ -742,6 +791,13 @@
 		    xbusd->xbusd_path, err);
 		goto abort;
 	}
+	err = xenbus_printf(xbt, xbusd->xbusd_path, "feature-flush-cache",
+	    "%u", 1);
+	if (err) {
+		printf("xbdback: failed to write %s/feature-flush-cache: %d\n",
+		    xbusd->xbusd_path, err);
+		goto abort;
+	}
 	err = xenbus_transaction_end(xbt, 0);
 	if (err == EAGAIN)
 		goto again;
@@ -861,6 +917,10 @@
 		case BLKIF_OP_WRITE:
 			xbdi->xbdi_cont = xbdback_co_io;
 			break;
+		case BLKIF_OP_FLUSH_DISKCACHE:
+			xbdi_get(xbdi);
+			xbdi->xbdi_cont = xbdback_co_cache_flush;
+			break;
 		default:
 			printf("xbdback_evthandler domain %d: unknown "
 			    "operation %d\n", xbdi->xbdi_domid, req->operation);
@@ -911,6 +971,50 @@
 }
 
 static void *
+xbdback_co_cache_flush(struct xbdback_instance *xbdi, void *obj)
+{
+	(void)obj;
+	XENPRINTF(("xbdback_co_cache_flush %p %p\n", xbdi, obj));
+	if (xbdi->xbdi_io != NULL) {
+		xbdi->xbdi_cont = xbdback_co_flush;
+		xbdi->xbdi_cont_aux = xbdback_co_cache_flush2;
+	} else {
+		xbdi->xbdi_cont = xbdback_co_cache_flush2;
+	}
+	return xbdi;
+}
+
+static void *
+xbdback_co_cache_flush2(struct xbdback_instance *xbdi, void *obj)
+{
+	(void)obj;
+	XENPRINTF(("xbdback_co_cache_flush2 %p %p\n", xbdi, obj));
+	if (xbdi->xbdi_pendingreqs > 0) {
+		/* event or iodone will restart processing */
+		xbdi->xbdi_cont = NULL;
+		return NULL;
+	}
+	xbdi->xbdi_cont = xbdback_co_cache_doflush;
+	return xbdback_pool_get(&xbdback_io_pool, xbdi);
+}
+
+static void *
+xbdback_co_cache_doflush(struct xbdback_instance *xbdi, void *obj)
+{
+	struct xbdback_io *xbd_io;
+
+	XENPRINTF(("xbdback_co_cache_doflush %p %p\n", xbdi, obj));
+	xbd_io = xbdi->xbdi_io = obj;
+	xbd_io->xio_xbdi = xbdi;
+	xbd_io->xio_operation = xbdi->xbdi_xen_req.operation;
+	xbd_io->xio_flush_id = xbdi->xbdi_xen_req.id;
+	workqueue_enqueue(xbdback_workqueue, &xbdi->xbdi_io->xio_work, NULL);
+	/* xbdback_do_io() will advance req pointer and restart processing */
+	xbdi->xbdi_cont = xbdback_co_cache_doflush;
+	return NULL;
+}
+
+static void *
 xbdback_co_io(struct xbdback_instance *xbdi, void *obj)
 {	
 	int error;
@@ -1051,7 +1155,6 @@
 		if (xbdi->xbdi_io == NULL) {
 			xbdi->xbdi_cont = xbdback_co_io_gotio;
 			xio = xbdback_pool_get(&xbdback_io_pool, xbdi);
-			buf_init(&xio->xio_buf);
 			return xio;
 		} else {
 			xbdi->xbdi_cont = xbdback_co_io_gotio2;
@@ -1073,12 +1176,15 @@
 	int buf_flags;
 
 	xbdi_get(xbdi);
+	atomic_inc_uint(&xbdi->xbdi_pendingreqs);
 	
 	xbd_io = xbdi->xbdi_io = obj;
+	buf_init(&xbd_io->xio_buf);
 	xbd_io->xio_xbdi = xbdi;
 	SLIST_INIT(&xbd_io->xio_rq);
 	xbd_io->xio_nrma = 0;
 	xbd_io->xio_mapped = 0;
+	xbd_io->xio_operation = xbdi->xbdi_xen_req.operation;
 
 	start_offset = xbdi->xbdi_this_fs * VBD_BSIZE;
 	
@@ -1208,6 +1314,33 @@
 	struct xbdback_io *xbd_io = (void *)wk;
 	KASSERT(&xbd_io->xio_work == wk);
 
+	if (xbd_io->xio_operation == BLKIF_OP_FLUSH_DISKCACHE) {
+		int error;
+		struct xbdback_instance *xbdi = xbd_io->xio_xbdi;
+
+		error = VOP_IOCTL(xbdi->xbdi_vp, DIOCCACHESYNC, NULL, FWRITE,
+		    kauth_cred_get());
+		if (error) {
+			aprint_error("xbdback %s: DIOCCACHESYNC returned %d\n",
+			    xbdi->xbdi_xbusd->xbusd_path, error);
+			if (error == EOPNOTSUPP || error == ENOTTY)
+				error = BLKIF_RSP_EOPNOTSUPP;
+			else
+				error = BLKIF_RSP_ERROR;
+		} else
+			error = BLKIF_RSP_OKAY;
+		xbdback_send_reply(xbdi, xbd_io->xio_flush_id,
+		    xbd_io->xio_operation, error);
+		xbdback_pool_put(&xbdback_io_pool, xbd_io);
+		xbdi_put(xbdi);
+		/* handle next IO */
+		xbdi->xbdi_io = NULL;
+		xbdi->xbdi_cont = xbdback_co_main_incr;
+		xbdback_trampoline(xbdi, xbdi);
+		return;
+	}
+
+	/* should be read or write */
 	xbd_io->xio_buf.b_data =
 	    (void *)((vaddr_t)xbd_io->xio_buf.b_data + xbd_io->xio_vaddr);
 #ifdef DIAGNOSTIC
@@ -1295,8 +1428,14 @@
 		xbdback_pool_put(&xbdback_request_pool, xbd_req);
 	}
 	xbdi_put(xbdi);
+	atomic_dec_uint(&xbdi->xbdi_pendingreqs);
 	buf_destroy(&xbd_io->xio_buf);
 	xbdback_pool_put(&xbdback_io_pool, xbd_io);
+	if (xbdi->xbdi_cont == NULL) {
+		/* check if there is more work to do */
+		xbdi->xbdi_cont = xbdback_co_main;
+		xbdback_trampoline(xbdi, xbdi);
+	}
 }
 
 /*
