Module Name:    src
Committed By:   jdolecek
Date:           Tue Apr 21 13:56:18 UTC 2020

Modified Files:
        src/sys/arch/xen/xen: xbdback_xenbus.c

Log Message:
add support for indirect segments, which makes it possible to pass
up to MAXPHYS (implementation limit, interface allows more) using
a single request

a request using indirect segments requires 1 extra copy hypercall per
request, but saves 2 shared memory hypercalls (map_grant/unmap_grant),
so it should be a net performance boost due to less TLB flushing

this also effectively doubles the disk queue size for xbd(4)

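For illustration, a minimal, hypothetical frontend-side sketch (not part of
this commit) of how an indirect request differs from a plain one: the real
operation moves into indirect_op, and the segment descriptors live on a
separate page passed via a single grant reference in indirect_grefs[], so a
request is no longer limited to the segments that fit inline in the ring
slot.  The helper name and its arguments are made up for the example and the
usual Xen blkif definitions are assumed to be in scope; the field names
follow the blkif_request_indirect_t layout used in the diff below.

/* Hypothetical example; not part of this commit. */
static void
fill_indirect_read(blkif_request_indirect_t *rin, uint64_t id,
    blkif_vdev_t handle, blkif_sector_t sector_number,
    grant_ref_t segpage_gref, uint16_t nr_segments)
{
	rin->operation = BLKIF_OP_INDIRECT;	/* outer operation */
	rin->indirect_op = BLKIF_OP_READ;	/* the actual I/O operation */
	rin->nr_segments = nr_segments;		/* may exceed the inline limit */
	rin->id = id;
	rin->handle = handle;
	rin->sector_number = sector_number;
	/*
	 * The array of struct blkif_request_segment (gref/first_sect/
	 * last_sect per segment) was written to a separate page by the
	 * caller; only that page's grant reference travels in the ring
	 * slot, and the backend fetches it with one GNTTABOP_copy.
	 */
	rin->indirect_grefs[0] = segpage_gref;
}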

To generate a diff of this commit:
cvs rdiff -u -r1.85 -r1.86 src/sys/arch/xen/xen/xbdback_xenbus.c

Please note that diffs are not public domain; they are subject to the
copyright notices on the relevant files.

Modified files:

Index: src/sys/arch/xen/xen/xbdback_xenbus.c
diff -u src/sys/arch/xen/xen/xbdback_xenbus.c:1.85 src/sys/arch/xen/xen/xbdback_xenbus.c:1.86
--- src/sys/arch/xen/xen/xbdback_xenbus.c:1.85	Mon Apr 20 19:29:09 2020
+++ src/sys/arch/xen/xen/xbdback_xenbus.c	Tue Apr 21 13:56:18 2020
@@ -1,4 +1,4 @@
-/*      $NetBSD: xbdback_xenbus.c,v 1.85 2020/04/20 19:29:09 jdolecek Exp $      */
+/*      $NetBSD: xbdback_xenbus.c,v 1.86 2020/04/21 13:56:18 jdolecek Exp $      */
 
 /*
  * Copyright (c) 2006 Manuel Bouyer.
@@ -26,7 +26,7 @@
  */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: xbdback_xenbus.c,v 1.85 2020/04/20 19:29:09 jdolecek Exp $");
+__KERNEL_RCSID(0, "$NetBSD: xbdback_xenbus.c,v 1.86 2020/04/21 13:56:18 jdolecek Exp $");
 
 #include <sys/atomic.h>
 #include <sys/buf.h>
@@ -73,6 +73,9 @@ __KERNEL_RCSID(0, "$NetBSD: xbdback_xenb
 
 /* Need to alloc one extra page to account for possible mapping offset */
 #define VBD_VA_SIZE	(MAXPHYS + PAGE_SIZE)
+#define VBD_MAX_INDIRECT_SEGMENTS	VBD_VA_SIZE >> PAGE_SHIFT
+
+CTASSERT(XENSHM_MAX_PAGES_PER_REQUEST >= VBD_MAX_INDIRECT_SEGMENTS);
 
 struct xbdback_io;
 struct xbdback_instance;
@@ -179,8 +182,10 @@ struct xbdback_instance {
 	RING_IDX xbdi_req_prod; /* limit on request indices */
 	xbdback_cont_t xbdi_cont;
 	/* _request state: track requests fetched from ring */
-	struct xbdback_request *xbdi_req; /* if NULL, ignore following */
 	blkif_request_t xbdi_xen_req;
+	struct blkif_request_segment xbdi_seg[VBD_MAX_INDIRECT_SEGMENTS];
+	bus_dmamap_t xbdi_seg_dmamap;
+	grant_ref_t xbdi_in_gntref;
 	/* _io state: I/O associated to this instance */
 	struct xbdback_io *xbdi_io;
 	/* other state */
@@ -221,9 +226,9 @@ struct xbdback_io {
 			struct xbdback_va *xio_xv;
 			vaddr_t xio_start_offset;	/* I/O start offset */
 			/* grants to map */
-			grant_ref_t xio_gref[XENSHM_MAX_PAGES_PER_REQUEST];
+			grant_ref_t xio_gref[VBD_MAX_INDIRECT_SEGMENTS];
 			/* grants release */
-			grant_handle_t xio_gh[XENSHM_MAX_PAGES_PER_REQUEST];
+			grant_handle_t xio_gh[VBD_MAX_INDIRECT_SEGMENTS];
 			uint16_t xio_nrma; /* number of guest pages */
 		} xio_rw;
 	} u;
@@ -374,6 +379,22 @@ xbdback_xenbus_create(struct xenbus_devi
 	xbusd->xbusd_otherend_changed = xbdback_frontend_changed;
 	xbdi->xbdi_xbusd = xbusd;
 
+	if (bus_dmamap_create(xbdi->xbdi_xbusd->xbusd_dmat, PAGE_SIZE,
+	    1, PAGE_SIZE, PAGE_SIZE, BUS_DMA_WAITOK | BUS_DMA_ALLOCNOW,
+	    &xbdi->xbdi_seg_dmamap) != 0) {
+		printf("%s: can't create dma map for indirect segments\n",
+		    xbdi->xbdi_name);
+		goto fail;
+	}
+	if (bus_dmamap_load(xbdi->xbdi_xbusd->xbusd_dmat,
+	    xbdi->xbdi_seg_dmamap, xbdi->xbdi_seg,
+	    sizeof(xbdi->xbdi_seg), NULL, BUS_DMA_WAITOK) != 0) {
+		printf("%s: can't load dma map for indirect segments\n",
+		    xbdi->xbdi_name);
+		goto fail;
+	}
+	KASSERT(xbdi->xbdi_seg_dmamap->dm_nsegs == 1);
+
 	SLIST_INIT(&xbdi->xbdi_va_free);
 	for (i = 0; i < BLKIF_RING_SIZE; i++) {
 		xbdi->xbdi_va[i].xv_vaddr = uvm_km_alloc(kernel_map,
@@ -457,6 +478,9 @@ xbdback_xenbus_destroy(void *arg)
 		}
 	}
 
+	bus_dmamap_unload(xbdi->xbdi_xbusd->xbusd_dmat, xbdi->xbdi_seg_dmamap);
+	bus_dmamap_destroy(xbdi->xbdi_xbusd->xbusd_dmat, xbdi->xbdi_seg_dmamap);
+
 	mutex_destroy(&xbdi->xbdi_lock);
 	cv_destroy(&xbdi->xbdi_cv);
 	kmem_free(xbdi, sizeof(*xbdi));
@@ -804,6 +828,13 @@ again:
 		    xbusd->xbusd_path, err);
 		goto abort;
 	}
+	err = xenbus_printf(xbt, xbusd->xbusd_path,
+	    "feature-max-indirect-segments", "%u", VBD_MAX_INDIRECT_SEGMENTS);
+	if (err) {
+		printf("xbdback: failed to write %s/feature-indirect: %d\n",
+		    xbusd->xbusd_path, err);
+		goto abort;
+	}
 	err = xenbus_transaction_end(xbt, 0);
 	if (err == EAGAIN)
 		goto again;
@@ -936,39 +967,35 @@ xbdback_co_main(struct xbdback_instance 
  * the ring.
  */
 static void *
-xbdback_co_main_loop(struct xbdback_instance *xbdi, void *obj) 
+xbdback_co_main_loop(struct xbdback_instance *xbdi, void *obj __unused) 
 {
-	blkif_request_t *req;
+	blkif_request_t *req, *reqn;
 	blkif_x86_32_request_t *req32;
 	blkif_x86_64_request_t *req64;
+	blkif_request_indirect_t *rin;
 
-	(void)obj;
-	req = &xbdi->xbdi_xen_req;
 	if (xbdi->xbdi_ring.ring_n.req_cons != xbdi->xbdi_req_prod) {
+		req = &xbdi->xbdi_xen_req;
+		memset(req, 0, sizeof(*req));
+
 		switch(xbdi->xbdi_proto) {
 		case XBDIP_NATIVE:
-			memcpy(req, RING_GET_REQUEST(&xbdi->xbdi_ring.ring_n,
-			    xbdi->xbdi_ring.ring_n.req_cons),
-			    sizeof(blkif_request_t));
+			reqn = RING_GET_REQUEST(&xbdi->xbdi_ring.ring_n,
+			    xbdi->xbdi_ring.ring_n.req_cons);
+			req->operation = reqn->operation;
+			req->id = reqn->id;
 			break;
 		case XBDIP_32:
 			req32 = RING_GET_REQUEST(&xbdi->xbdi_ring.ring_32,
 			    xbdi->xbdi_ring.ring_n.req_cons);
 			req->operation = req32->operation;
-			req->nr_segments = req32->nr_segments;
-			req->handle = req32->handle;
 			req->id = req32->id;
-			req->sector_number = req32->sector_number;
 			break;
-			    
 		case XBDIP_64:
 			req64 = RING_GET_REQUEST(&xbdi->xbdi_ring.ring_64,
 			    xbdi->xbdi_ring.ring_n.req_cons);
 			req->operation = req64->operation;
-			req->nr_segments = req64->nr_segments;
-			req->handle = req64->handle;
 			req->id = req64->id;
-			req->sector_number = req64->sector_number;
 			break;
 		}
 		__insn_barrier();
@@ -978,7 +1005,23 @@ xbdback_co_main_loop(struct xbdback_inst
 			xbdi->xbdi_req_prod,
 			xbdi->xbdi_ring.ring_n.rsp_prod_pvt,
 			req->id));
-		switch(req->operation) {
+		switch (req->operation) {
+		case BLKIF_OP_INDIRECT:
+			/* just check indirect_op, rest is handled later */
+			rin = (blkif_request_indirect_t *)
+			    RING_GET_REQUEST(&xbdi->xbdi_ring.ring_n,
+				xbdi->xbdi_ring.ring_n.req_cons);
+			if (rin->indirect_op != BLKIF_OP_READ &&
+			    rin->indirect_op != BLKIF_OP_WRITE) {
+				if (ratecheck(&xbdi->xbdi_lasterr_time,
+				    &xbdback_err_intvl)) {
+					printf("%s: unknown ind operation %d\n",
+					    xbdi->xbdi_name,
+					    rin->indirect_op);
+				}
+				goto fail;
+			}
+			/* FALLTHROUGH */
 		case BLKIF_OP_READ:
 		case BLKIF_OP_WRITE:
 			xbdi->xbdi_cont = xbdback_co_io;
@@ -993,6 +1036,7 @@ xbdback_co_main_loop(struct xbdback_inst
 				printf("%s: unknown operation %d\n",
 				    xbdi->xbdi_name, req->operation);
 			}
+fail:
 			xbdback_send_reply(xbdi, req->id, req->operation,
 			    BLKIF_RSP_ERROR);
 			xbdi->xbdi_cont = xbdback_co_main_incr;
@@ -1046,6 +1090,7 @@ xbdback_co_main_done2(struct xbdback_ins
 {
 	int work_to_do;
 
+	KASSERT(xbdi->xbdi_io == NULL);
 	RING_FINAL_CHECK_FOR_REQUESTS(&xbdi->xbdi_ring.ring_n, work_to_do);
 	if (work_to_do)
 		xbdi->xbdi_cont = xbdback_co_main;
@@ -1094,31 +1139,22 @@ xbdback_co_cache_doflush(struct xbdback_
  * then get the segment information directly from the ring request.
  */
 static void *
-xbdback_co_io(struct xbdback_instance *xbdi, void *obj)
+xbdback_co_io(struct xbdback_instance *xbdi, void *obj __unused)
 {	
 	int i, error;
-	blkif_request_t *req;
+	blkif_request_t *req, *reqn;
 	blkif_x86_32_request_t *req32;
 	blkif_x86_64_request_t *req64;
+	blkif_request_indirect_t *rinn;
+	blkif_x86_32_request_indirect_t *rin32;
+	blkif_x86_64_request_indirect_t *rin64;
 
-	(void)obj;
-
-	/* some sanity checks */
 	req = &xbdi->xbdi_xen_req;
-	if (req->nr_segments < 1 ||
-	    req->nr_segments > BLKIF_MAX_SEGMENTS_PER_REQUEST) {
-		if (ratecheck(&xbdi->xbdi_lasterr_time,
-		    &xbdback_err_intvl)) {
-			printf("%s: invalid number of segments: %d\n",
-			       xbdi->xbdi_name,
-			       xbdi->xbdi_xen_req.nr_segments);
-		}
-		error = EINVAL;
-		goto end;
-	}
 
+	/* some sanity checks */
 	KASSERT(req->operation == BLKIF_OP_READ ||
-	    req->operation == BLKIF_OP_WRITE);
+	    req->operation == BLKIF_OP_WRITE ||
+	    req->operation == BLKIF_OP_INDIRECT);
 	if (req->operation == BLKIF_OP_WRITE) {
 		if (xbdi->xbdi_ro) {
 			error = EROFS;
@@ -1127,28 +1163,90 @@ xbdback_co_io(struct xbdback_instance *x
 	}
 
 	/* copy request segments */
-	switch(xbdi->xbdi_proto) {
+	switch (xbdi->xbdi_proto) {
 	case XBDIP_NATIVE:
-		/* already copied in xbdback_co_main_loop */
+		reqn = RING_GET_REQUEST(&xbdi->xbdi_ring.ring_n,
+		    xbdi->xbdi_ring.ring_n.req_cons);
+		req->handle = reqn->handle;
+		req->sector_number = reqn->sector_number;
+		if (reqn->operation == BLKIF_OP_INDIRECT) {
+			rinn = (blkif_request_indirect_t *)reqn;
+			req->operation = rinn->indirect_op;
+			req->nr_segments = (uint8_t)rinn->nr_segments;
+			if (req->nr_segments > VBD_MAX_INDIRECT_SEGMENTS)
+				goto bad_nr_segments;
+			xbdi->xbdi_in_gntref = rinn->indirect_grefs[0];
+			/* first_sect and segment grefs fetched later */
+		} else {
+			req->nr_segments = reqn->nr_segments;
+			if (req->nr_segments > BLKIF_MAX_SEGMENTS_PER_REQUEST)
+				goto bad_nr_segments;
+			for (i = 0; i < req->nr_segments; i++)
+				xbdi->xbdi_seg[i] = reqn->seg[i];
+			xbdi->xbdi_in_gntref = 0;
+		}
 		break;
 	case XBDIP_32:
 		req32 = RING_GET_REQUEST(&xbdi->xbdi_ring.ring_32,
 		    xbdi->xbdi_ring.ring_n.req_cons);
-		for (i = 0; i < req->nr_segments; i++)
-			req->seg[i] = req32->seg[i];
+		req->handle = req32->handle;
+		req->sector_number = req32->sector_number;
+		if (req32->operation == BLKIF_OP_INDIRECT) {
+			rin32 = (blkif_x86_32_request_indirect_t *)req32;
+			req->operation = rin32->indirect_op;
+			req->nr_segments = (uint8_t)rin32->nr_segments;
+			if (req->nr_segments > VBD_MAX_INDIRECT_SEGMENTS)
+				goto bad_nr_segments;
+			xbdi->xbdi_in_gntref = rin32->indirect_grefs[0];
+			/* first_sect and segment grefs fetched later */
+		} else {
+			req->nr_segments = req32->nr_segments;
+			if (req->nr_segments > BLKIF_MAX_SEGMENTS_PER_REQUEST)
+				goto bad_nr_segments;
+			for (i = 0; i < req->nr_segments; i++)
+				xbdi->xbdi_seg[i] = req32->seg[i];
+			xbdi->xbdi_in_gntref = 0;
+		}
 		break;
 	case XBDIP_64:
 		req64 = RING_GET_REQUEST(&xbdi->xbdi_ring.ring_64,
 		    xbdi->xbdi_ring.ring_n.req_cons);
-		for (i = 0; i < req->nr_segments; i++)
-			req->seg[i] = req64->seg[i];
+		req->handle = req64->handle;
+		req->sector_number = req64->sector_number;
+		if (req64->operation == BLKIF_OP_INDIRECT) {
+			rin64 = (blkif_x86_64_request_indirect_t *)req64;
+			req->nr_segments = (uint8_t)rin64->nr_segments;
+			if (req->nr_segments > VBD_MAX_INDIRECT_SEGMENTS)
+				goto bad_nr_segments;
+			xbdi->xbdi_in_gntref = rin64->indirect_grefs[0];
+			/* first_sect and segment grefs fetched later */
+		} else {
+			req->nr_segments = req64->nr_segments;
+			if (req->nr_segments > BLKIF_MAX_SEGMENTS_PER_REQUEST)
+				goto bad_nr_segments;
+			for (i = 0; i < req->nr_segments; i++)
+				xbdi->xbdi_seg[i] = req64->seg[i];
+			xbdi->xbdi_in_gntref = 0;
+		}
 		break;
 	}
 
+	/* Max value checked already earlier */
+	if (req->nr_segments < 1)
+		goto bad_nr_segments;
+
 	KASSERT(xbdi->xbdi_io == NULL);
 	xbdi->xbdi_cont = xbdback_co_io_gotio;
 	return xbdback_pool_get(&xbdback_io_pool, xbdi);
 
+ bad_nr_segments:
+	if (ratecheck(&xbdi->xbdi_lasterr_time, &xbdback_err_intvl)) {
+		printf("%s: invalid number of segments: %d\n",
+		       xbdi->xbdi_name, req->nr_segments);
+	}
+	error = EINVAL;
+	/* FALLTHROUGH */
+
  end:
 	xbdback_send_reply(xbdi, xbdi->xbdi_xen_req.id,
 	    xbdi->xbdi_xen_req.operation,
@@ -1177,20 +1275,50 @@ xbdback_co_io_gotio(struct xbdback_insta
 	xbd_io->xio_operation = req->operation;
 	xbd_io->xio_id = req->id;
 
+	/* If segments are on an indirect page, copy them now */
+	if (xbdi->xbdi_in_gntref) {
+		gnttab_copy_t gop;
+		paddr_t ma;
+
+		gop.flags = GNTCOPY_source_gref;
+		gop.len = req->nr_segments
+		    * sizeof(struct blkif_request_segment);
+
+		gop.source.u.ref = xbdi->xbdi_in_gntref;
+		gop.source.offset = 0;
+		gop.source.domid = xbdi->xbdi_domid;
+
+		ma = xbdi->xbdi_seg_dmamap->dm_segs[0].ds_addr;
+		gop.dest.offset = ma & PAGE_MASK;
+		gop.dest.domid = DOMID_SELF;
+		gop.dest.u.gmfn = ma >> PAGE_SHIFT;
+
+		if (HYPERVISOR_grant_table_op(GNTTABOP_copy, &gop, 1) != 0) {
+			printf("%s: GNTTABOP_copy failed\n", xbdi->xbdi_name);
+			xbdback_send_reply(xbdi, xbdi->xbdi_xen_req.id,
+			    xbdi->xbdi_xen_req.operation,
+			    BLKIF_RSP_ERROR);
+			xbdi->xbdi_cont = xbdback_co_main_incr;
+			return NULL;
+		}
+	}
+
 	/* Process segments */
 	bcount = 0;
 	for (int i = 0; i < req->nr_segments; i++) {
-		xbd_io->xio_gref[i] = req->seg[i].gref;
-		bcount += (req->seg[i].last_sect - req->seg[i].first_sect + 1)
+		struct blkif_request_segment *seg = &xbdi->xbdi_seg[i];
+		xbd_io->xio_gref[i] = seg->gref;
+		bcount += (seg->last_sect - seg->first_sect + 1)
 			* VBD_BSIZE;
 	}
-	KASSERT(bcount <= MAXPHYS);
 	xbd_io->xio_nrma = req->nr_segments;
+	xbd_io->xio_start_offset = xbdi->xbdi_seg[0].first_sect * VBD_BSIZE;
 
-	xbd_io->xio_start_offset = req->seg[0].first_sect * VBD_BSIZE;
+	KASSERT(bcount <= MAXPHYS);
 	KASSERT(xbd_io->xio_start_offset < PAGE_SIZE);
 	KASSERT(bcount + xbd_io->xio_start_offset < VBD_VA_SIZE);
 
+	/* Fill-in the buf */
 	if (xbdi->xbdi_xen_req.operation == BLKIF_OP_WRITE) {
 		buf_flags = B_WRITE;
 	} else {
