This is another proposed patch for lguest block devices: it lets them use
a private mmap of the backing file instead of file I/O.

I use this in several ways:
1. Regression testing. I can boot a guest, have it mess around, and
literally rm -rf all of / with no harm done.
2. Lots of guests. This saves overhead, as lots of mmap'ed pages are shared.
3. Lockups. I'm getting a deadlock or lockup when I have lots of
guests doing block I/O; I want to see if this fixes it.

So, comments on this are welcome, and if somebody wants to do it better, I
have no problem with that.

ron
diff --git a/Documentation/lguest/lguest.c b/Documentation/lguest/lguest.c
index 3be8ab2..7ac7ca8 100644
--- a/Documentation/lguest/lguest.c
+++ b/Documentation/lguest/lguest.c
@@ -42,7 +42,6 @@
 #include "linux/virtio_blk.h"
 #include "linux/virtio_console.h"
 #include "linux/virtio_ring.h"
-#include "asm-x86/bootparam.h"
 /*L:110 We can ignore the 39 include files we need for this program, but I do
  * want to draw attention to the use of kernel-style types.
  *
@@ -55,6 +54,7 @@ typedef uint32_t u32;
 typedef uint16_t u16;
 typedef uint8_t u8;
 /*:*/
+#include "asm-x86/bootparam.h"
 
 #define PAGE_PRESENT 0x7 	/* Present, RW, Execute */
 #define NET_PEERNUM 1
@@ -1405,38 +1405,102 @@ struct vblk_info
 	/* IO thread writes to this file descriptor to mark it done, then
 	 * Launcher triggers interrupt to Guest. */
 	int done_fd;
+
+	/* If the file is mmap'ed, this is the pointer to the data */
+	void *map;
 };
 
 /*L:210
- * The Disk
+ * Mmap File IO
  *
- * Remember that the block device is handled by a separate I/O thread.  We head
- * straight into the core of that thread here:
+ * This block device uses a private mmap'ed area for the block device. 
+ * Very useful when you're running lots of guests, or don't want guest
+ * changes making their way back to the underlying file system. 
  */
-static bool service_io(struct device *dev)
+
+static int
+mmapfileio(struct vblk_info *vblk, struct virtio_blk_outhdr *out, struct iovec *iov, int out_num, int in_num, u8 *in)
 {
-	struct vblk_info *vblk = dev->priv;
-	unsigned int head, out_num, in_num, wlen;
-	int ret;
-	u8 *in;
-	struct virtio_blk_outhdr *out;
-	struct iovec iov[dev->vq->vring.num];
+	unsigned int wlen = sizeof(*in);
 	off64_t off;
+	u8 *data = vblk->map;
+	int i;
+	ssize_t iolen;
+	int iswrite = out->type & VIRTIO_BLK_T_OUT;
 
-	/* See if there's a request waiting.  If not, nothing to do. */
-	head = get_vq_desc(dev->vq, iov, &out_num, &in_num);
-	if (head == dev->vq->vring.num)
-		return false;
 
-	/* Every block request should contain at least one output buffer
-	 * (detailing the location on disk and the type of request) and one
-	 * input buffer (to hold the result). */
-	if (out_num == 0 || in_num == 0)
-		errx(1, "Bad virtblk cmd %u out=%u in=%u",
-		     head, out_num, in_num);
+	off = out->sector * 512;
+
+	/* We first check that the read or write is within limits */
+	if (off >= vblk->len)
+		err(1, "Bad offset %llu vs %llu", off, vblk->len);
+
+	data += off;
+
+	verbose("mmapfileio: %s at offset %llu(%p), ", iswrite ? "WRITE" : "READ", 
+			off, data);
+
+	/* if we do no IO whatsoever we'll call it an error */
+	*in = VIRTIO_BLK_S_IOERR;
+
+	if (iswrite) {
+		for(i = 1; i < out_num; i++){
+			iolen = iov[i].iov_len;
+			/* don't even THINK about writing past the end */
+			if ((off + iolen) >= vblk->len)
+				errx(1, "Write past end of private block %llu + %u\n", 
+				off, iolen);
+			verbose("%p, %p, %d, ", data, iov[i].iov_base, iolen);
+			memcpy(data, iov[i].iov_base, iolen);
+			off += iolen;
+			data += iolen;
+			wlen += iolen;
+			/* If you were out of range, you're dead before you get here */
+			*in = VIRTIO_BLK_S_OK;
+		}
+	} else {
+		for(i = 1; i < in_num; i++){
+			iolen = iov[i].iov_len;
+			/* read past end? 
+ 			 * Just trim the read iolen and we're done 
+ 			 */
+			if ((off + iolen) >= vblk->len){
+				warn("Read past end of private block %llu + %u\n", off, iolen);
+				iolen = vblk->len - off;
+			}
+			if (iolen <= 0)
+				break;
+			verbose("%p, %p, %d, ", iov[i].iov_base, data, iolen);
+			memcpy(iov[i].iov_base, data, iolen);
+			off += iolen;
+			data += iolen;
+			wlen += iolen;
+			/* if we do even one IO, we're happy */
+			*in = VIRTIO_BLK_S_OK;
+		}
+	}
+
+	verbose("mmapfileio returns %d\n", wlen);
+	return wlen;
+
+}
+/*L:210
+ * Block File IO
+ *
+ * This block device does direct file I/O to the file backing the virtual block device. 
+ * This IO is persistent, i.e. when the guest is done, the device is changed. 
+ * If two guests are running and one changes a block, and the other later reads it, 
+ * the other guest will see changes made by the first. This property is very exciting
+ * for metadata.
+ */
+
+static int
+blockfileio(struct vblk_info *vblk, struct virtio_blk_outhdr *out, struct iovec *iov, int out_num, int in_num, u8 *in)
+{
+	unsigned int wlen;
+	int ret;
+	off64_t off;
 
-	out = convert(&iov[0], struct virtio_blk_outhdr);
-	in = convert(&iov[out_num+in_num-1], u8);
 	off = out->sector * 512;
 
 	/* The block device implements "barriers", where the Guest indicates
@@ -1446,15 +1510,8 @@ static bool service_io(struct device *dev)
 	if (out->type & VIRTIO_BLK_T_BARRIER)
 		fdatasync(vblk->fd);
 
-	/* In general the virtio block driver is allowed to try SCSI commands.
-	 * It'd be nice if we supported eject, for example, but we don't. */
-	if (out->type & VIRTIO_BLK_T_SCSI_CMD) {
-		fprintf(stderr, "Scsi commands unsupported\n");
-		*in = VIRTIO_BLK_S_UNSUPP;
-		wlen = sizeof(*in);
-	} else if (out->type & VIRTIO_BLK_T_OUT) {
-		/* Write */
-
+	/* Write */
+	if (out->type & VIRTIO_BLK_T_OUT) {
 		/* Move to the right location in the block file.  This can fail
 		 * if they try to write past end. */
 		if (lseek64(vblk->fd, off, SEEK_SET) != off)
@@ -1492,7 +1549,51 @@ static bool service_io(struct device *dev)
 			*in = VIRTIO_BLK_S_IOERR;
 		}
 	}
+	printf("block io returns %d\n", wlen);
+	return wlen;
+}
+
+/*L:210
+ * The Disk
+ *
+ * Remember that the block device is handled by a separate I/O thread.  We head
+ * straight into the core of that thread here:
+ */
+static bool service_io(struct device *dev)
+{
+	struct vblk_info *vblk = dev->priv;
+	unsigned int head, out_num, in_num, wlen;
+	u8 *in;
+	struct virtio_blk_outhdr *out;
+	struct iovec iov[dev->vq->vring.num];
+
+	/* See if there's a request waiting.  If not, nothing to do. */
+	head = get_vq_desc(dev->vq, iov, &out_num, &in_num);
+	if (head == dev->vq->vring.num)
+		return false;
+
+	/* Every block request should contain at least one output buffer
+	 * (detailing the location on disk and the type of request) and one
+	 * input buffer (to hold the result). */
+	if (out_num == 0 || in_num == 0)
+		errx(1, "Bad virtblk cmd %u out=%u in=%u",
+		     head, out_num, in_num);
+
+	out = convert(&iov[0], struct virtio_blk_outhdr);
+	in = convert(&iov[out_num+in_num-1], u8);
 
+	/* In general the virtio block driver is allowed to try SCSI commands.
+	 * It'd be nice if we supported eject, for example, but we don't. */
+	if (out->type & VIRTIO_BLK_T_SCSI_CMD) {
+		fprintf(stderr, "Scsi commands unsupported\n");
+		*in = VIRTIO_BLK_S_UNSUPP;
+		wlen = sizeof(*in);
+	} else if (! vblk->map) {
+			wlen = blockfileio(vblk, out, iov, out_num, in_num, in);
+	} else {
+			wlen = mmapfileio(vblk, out, iov, out_num, in_num, in);
+	}
+	
 	/* We can't trigger an IRQ, because we're not the Launcher.  It does
 	 * that when we tell it we're done. */
 	add_used(dev->vq, head, wlen);
@@ -1554,7 +1655,7 @@ static void handle_virtblk_output(int fd, struct virtqueue *vq)
 }
 
 /*L:198 This actually sets up a virtual block device. */
-static void setup_block_file(const char *filename)
+static void setup_block_file(const char *filename, int mapfile)
 {
 	int p[2];
 	struct device *dev;
@@ -1578,6 +1679,22 @@ static void setup_block_file(const char *filename)
 	vblk->fd = open_or_die(filename, O_RDWR|O_LARGEFILE);
 	vblk->len = lseek64(vblk->fd, 0, SEEK_END);
 
+	/* If we've been asked to mmap it, do so. We're not going to close 
+	 * the fd, however, as it's always possible we'll want it later. 
+	 * Why mmap? 
+	 * 1. You're running lots and lots of guests, and want to share a lot. 
+	 * 2. You want to run the guest but not really modify the disk image. 
+	 */
+	if (mapfile) {
+		vblk->map = mmap(NULL, page_align(vblk->len),
+			    PROT_READ|PROT_WRITE, MAP_PRIVATE, vblk->fd, 0);
+		/* We don't want to fall back to IO if mmap fails. The semantics
+		 * are utterly different. 
+		*/
+		if (vblk->map == MAP_FAILED)
+			err(1, "Mmaping physical block device");
+	}
+
 	/* We support barriers. */
 	add_feature(dev, VIRTIO_BLK_F_BARRIER);
 
@@ -1683,6 +1800,7 @@ static struct option opts[] = {
 	{ "verbose", 0, NULL, 'v' },
 	{ "tunnet", 1, NULL, 't' },
 	{ "block", 1, NULL, 'b' },
+	{ "pblock", 1, NULL, 'p' },
 	{ "initrd", 1, NULL, 'i' },
 	{ NULL },
 };
@@ -1755,7 +1873,10 @@ int main(int argc, char *argv[])
 			setup_tun_net(optarg);
 			break;
 		case 'b':
-			setup_block_file(optarg);
+			setup_block_file(optarg, 0);
+			break;
+		case 'p':
+			setup_block_file(optarg, 1);
 			break;
 		case 'i':
 			initrd_name = optarg;
_______________________________________________
Lguest mailing list
[email protected]
https://ozlabs.org/mailman/listinfo/lguest

Reply via email to