+ } else if (errno != EINTR) {
+ fprintf (stderr, "Error: %s %s sector_num=%" PRId64
+ " nb_sectors=%d. Pause process %d for debugging...\n",
+ do_read ? "READ" : "WRITE", bs->filename, sector_num,
+ nb_sectors, getpid ());
+ fgetc (stdin);
+ return -errno;
+ }
+ }
+
+ return 0;
+}
+
+/*
+ * Synchronous read entry point of the driver: hands the request to do_io()
+ * with the read flag set and returns do_io()'s result unchanged.
+ */
+static int blksim_read (BlockDriverState * bs, int64_t sector_num,
+                        uint8_t * buf, int nb_sectors)
+{
+    const int is_read = TRUE;
+
+    return do_io (bs, sector_num, buf, nb_sectors, is_read);
+}
+
+/*
+ * Synchronous write entry point of the driver: hands the request to do_io()
+ * with the read flag cleared and returns do_io()'s result unchanged.
+ */
+static int blksim_write (BlockDriverState * bs, int64_t sector_num,
+                         const uint8_t * buf, int nb_sectors)
+{
+    /* do_io() takes a non-const buffer because it serves both directions. */
+    uint8_t *data = (uint8_t *) buf;
+
+    return do_io (bs, sector_num, data, nb_sectors, FALSE);
+}
+
+/*
+ * Stamp acb with a fresh uuid and link it into the global task list.
+ *
+ * With randomization disabled (rand_time <= 0) the task gets time 0 and is
+ * appended at the tail. Otherwise a task with a non-negative time has a
+ * random delay added and is placed so that the list stays sorted by ascending
+ * time (equal times are ordered by a coin flip), while a task with a negative
+ * time is placed at the front of the list.
+ *
+ * NOTE(review): the position scan has no explicit "pos != &head" test, so it
+ * presumably relies on head.time acting as a sentinel -- confirm.
+ */
+static void insert_in_list (SimAIOCB * acb)
+{
+    SimAIOCB *pos;
+    SimAIOCB *before;
+    int64_t new_id = sim_uuid++;
+
+    CHECK_TASK (new_id);
+    acb->uuid = new_id;
+
+    if (rand_time <= 0) {
+        /* Working with qemu-io.c without delay randomization: append the
+         * task at the tail. */
+        SimAIOCB *tail = head.prev;
+
+        acb->time = 0;
+        acb->prev = tail;
+        acb->next = &head;
+        tail->next = acb;
+        head.prev = acb;
+        return;
+    }
+
+    pos = head.next;
+    if (acb->time >= 0) {
+        /* A random delay helps to trigger rare race conditions. */
+        acb->time += random () % rand_time;
+
+        /* Advance to the first entry that has to follow acb. */
+        for (; pos->time <= acb->time; pos = pos->next) {
+            if (pos->time == acb->time && random () % 2 == 0) {
+                break;
+            }
+        }
+    }
+
+    /* Splice acb in directly ahead of pos. */
+    before = pos->prev;
+    acb->next = pos;
+    acb->prev = before;
+    before->next = acb;
+    pos->prev = acb;
+}
+
+/*
+ * Wrappers around qemu_aio_get()/qemu_aio_release(). The active branch maps
+ * them straight to the real functions. The alternative branch is a debugging
+ * aid for problems with reused task objects (already solved) and is compiled
+ * out.
+ */
+#if 1
+# define my_qemu_aio_get qemu_aio_get
+# define my_qemu_aio_release qemu_aio_release
+
+#else
+/* Return acb if it is currently linked into the task list, NULL otherwise. */
+static SimAIOCB *search_task_list (SimAIOCB * acb)
+{
+    SimAIOCB *cur = head.next;
+
+    while (cur != &head) {
+        if (cur == acb) {
+            return cur;
+        }
+        cur = cur->next;
+    }
+
+    return NULL;
+}
+
+/* Fetch a task from sim_aio_pool and assert that it is not still queued. */
+static inline void *my_qemu_aio_get (AIOPool * pool, BlockDriverState * bs,
+                                     BlockDriverCompletionFunc * cb,
+                                     void *opaque)
+{
+    SimAIOCB *task = (SimAIOCB *) qemu_aio_get (&sim_aio_pool, bs, cb, opaque);
+
+    QDEBUG ("SIM: qemu_aio_get reuse old task%" PRId64 "\n", task->uuid);
+    ASSERT (!search_task_list (task));
+    return task;
+}
+
+/* Log the release of a task and hand it back to qemu_aio_release(). */
+static inline void my_qemu_aio_release (SimAIOCB * acb)
+{
+    QDEBUG ("SIM: qemu_aio_release task%" PRId64 "\n", acb->uuid);
+    qemu_aio_release (acb);
+}
+#endif
+
+/*
+ * Create a simulated I/O task and queue it in the task list.
+ *
+ * op is SIM_READ, SIM_WRITE or SIM_FLUSH (the latter comes from
+ * blksim_aio_flush() with qiov == NULL and nb_sectors == 0). The task records
+ * the request parameters, takes the currently configured disk_io_return_code
+ * as its result and is scheduled relative to current_time.
+ *
+ * Returns the common AIOCB embedded in the new task, or NULL if no task
+ * object could be obtained. In interactive-print mode an op other than the
+ * three above terminates the process.
+ */
+static BlockDriverAIOCB *insert_task (int op, BlockDriverState * bs,
+                                      int64_t sector_num, QEMUIOVector * qiov,
+                                      int nb_sectors,
+                                      BlockDriverCompletionFunc * cb,
+                                      void *opaque)
+{
+    SimAIOCB *acb = my_qemu_aio_get (&sim_aio_pool, bs, cb, opaque);
+    if (!acb) {
+        return NULL;
+    }
+
+    acb->op = op;
+    acb->sector_num = sector_num;
+    acb->qiov = qiov;
+    acb->nb_sectors = nb_sectors;
+    acb->ret = disk_io_return_code;
+    acb->time = current_time;
+    insert_in_list (acb);
+
+    if (interactive_print) {
+        if (op == SIM_READ) {
+            printf ("Added READ uuid=%" PRId64 " filename=%s sector_num=%"
+                    PRId64 " nb_sectors=%d\n", acb->uuid,
+                    acb->common.bs->filename, acb->sector_num,
+                    acb->nb_sectors);
+        } else if (op == SIM_WRITE) {
+            printf ("Added WRITE uuid=%" PRId64 " filename=%s sector_num=%"
+                    PRId64 " nb_sectors=%d\n", acb->uuid,
+                    acb->common.bs->filename, acb->sector_num,
+                    acb->nb_sectors);
+        } else if (op == SIM_FLUSH) {
+            /* Flush requests are legitimate tasks too; without this branch
+             * an interactive session would exit on the first flush. */
+            printf ("Added FLUSH uuid=%" PRId64 " filename=%s\n",
+                    acb->uuid, acb->common.bs->filename);
+        } else {
+            fprintf (stderr, "Unknown op %d\n", op);
+            exit (1);
+        }
+    }
+
+    return &acb->common;
+}
+
+/*
+ * Re-queue a finished READ, WRITE or FLUSH task as its completion-callback
+ * stage: the task is scheduled at current_time, put back into the list, and
+ * its op is switched to the matching SIM_*_CALLBACK value. Any other op
+ * terminates the process.
+ */
+static void insert_aio_callback (SimAIOCB * acb)
+{
+    acb->time = current_time;
+    insert_in_list (acb);
+
+    if (acb->op == SIM_FLUSH) {
+        acb->op = SIM_FLUSH_CALLBACK;
+        if (interactive_print) {
+            printf ("Added FLUSH_CALLBACK uuid=%" PRId64 " filename=%s\n",
+                    acb->uuid, acb->common.bs->filename);
+        }
+        return;
+    }
+
+    if (acb->op == SIM_READ) {
+        acb->op = SIM_READ_CALLBACK;
+        if (interactive_print) {
+            printf ("Added READ_CALLBACK uuid=%" PRId64
+                    " filename=%s sector_num=%" PRId64 " nb_sectors=%d\n",
+                    acb->uuid, acb->common.bs->filename, acb->sector_num,
+                    acb->nb_sectors);
+        }
+        return;
+    }
+
+    if (acb->op == SIM_WRITE) {
+        acb->op = SIM_WRITE_CALLBACK;
+        if (interactive_print) {
+            printf ("Added WRITE_CALLBACK uuid=%" PRId64
+                    " filename=%s sector_num=%" PRId64 " nb_sectors=%d\n",
+                    acb->uuid, acb->common.bs->filename, acb->sector_num,
+                    acb->nb_sectors);
+        }
+        return;
+    }
+
+    /* Only the three I/O ops above have a callback stage. */
+    fprintf (stderr, "Wrong op %d\n", acb->op);
+    exit (1);
+}
+
+/*
+ * Print one line per queued task, in list order, to stdout.
+ *
+ * READ/WRITE tasks and their callback stages are shown with file name and
+ * sector range. FLUSH tasks, their callback stage and TIMER tasks can be in
+ * the list as well, so they are listed too instead of being treated as an
+ * error. Timers are created without a block device, so only their uuid and
+ * time are printed. A task with any other op terminates the process.
+ */
+void blksim_list_tasks (void)
+{
+    SimAIOCB *acb;
+
+    for (acb = head.next; acb != &head; acb = acb->next) {
+        if (acb->op == SIM_READ) {
+            printf ("uuid=%" PRId64 " READ file=%s sector_num=%"
+                    PRId64 " nb_sectors=%d\n", acb->uuid,
+                    acb->common.bs->filename, acb->sector_num,
+                    acb->nb_sectors);
+        } else if (acb->op == SIM_WRITE) {
+            printf ("uuid=%" PRId64 " WRITE file=%s sector_num=%"
+                    PRId64 " nb_sectors=%d\n", acb->uuid,
+                    acb->common.bs->filename, acb->sector_num,
+                    acb->nb_sectors);
+        } else if (acb->op == SIM_READ_CALLBACK) {
+            printf ("uuid=%" PRId64 " CALLBACK READ file=%s sector_num=%"
+                    PRId64 " nb_sectors=%d\n", acb->uuid,
+                    acb->common.bs->filename, acb->sector_num,
+                    acb->nb_sectors);
+        } else if (acb->op == SIM_WRITE_CALLBACK) {
+            printf ("uuid=%" PRId64 " CALLBACK WRITE file=%s sector_num=%"
+                    PRId64 " nb_sectors=%d\n", acb->uuid,
+                    acb->common.bs->filename, acb->sector_num,
+                    acb->nb_sectors);
+        } else if (acb->op == SIM_FLUSH) {
+            printf ("uuid=%" PRId64 " FLUSH file=%s\n", acb->uuid,
+                    acb->common.bs->filename);
+        } else if (acb->op == SIM_FLUSH_CALLBACK) {
+            printf ("uuid=%" PRId64 " CALLBACK FLUSH file=%s\n", acb->uuid,
+                    acb->common.bs->filename);
+        } else if (acb->op == SIM_TIMER) {
+            /* No bs here: blksim_new_timer() passes NULL for it. */
+            printf ("uuid=%" PRId64 " TIMER time=%" PRId64 "\n", acb->uuid,
+                    acb->time);
+        } else {
+            fprintf (stderr, "Wrong OP %d\n", acb->op);
+            exit (1);
+        }
+    }
+}
+
+/* Invoke the task's completion callback with its opaque pointer and result. */
+static inline void sim_callback (SimAIOCB * acb)
+{
+    BlockDriverCompletionFunc *done = acb->common.cb;
+
+    done (acb->common.opaque, acb->ret);
+}
+
+/* Return the simulator's current time (the value of current_time). */
+int64_t blksim_get_time (void)
+{
+ return current_time;
+}
+
+/*
+ * Allocate a simulated timer.
+ *
+ * The timer is a task object of kind SIM_TIMER with no block device attached;
+ * cb and opaque are stored for the later expiry call. It starts out unqueued
+ * (prev == NULL) until blksim_mod_timer() schedules it.
+ *
+ * Returns the timer handle, or NULL if no task object could be obtained
+ * (the same check insert_task() performs).
+ */
+void *blksim_new_timer (void *cb, void *opaque)
+{
+    SimAIOCB *acb = my_qemu_aio_get (&sim_aio_pool, NULL, cb, opaque);
+    if (!acb) {
+        return NULL;
+    }
+
+    acb->op = SIM_TIMER;
+    acb->prev = NULL;           /* Not in the task list yet. */
+    return acb;
+}
+
+/*
+ * (Re)schedule the timer ts to fire at expire_time. A timer that is already
+ * queued is unlinked first and then inserted again at its new position.
+ */
+void blksim_mod_timer (void *ts, int64_t expire_time)
+{
+    SimAIOCB *timer = ts;
+    SimAIOCB *before = timer->prev;
+
+    if (before) {
+        /* Currently queued: take it out of the list first. */
+        SimAIOCB *after = timer->next;
+
+        after->prev = before;
+        before->next = after;
+    }
+
+    timer->time = expire_time;
+    insert_in_list (timer);
+
+    if (interactive_print) {
+        printf ("Added TIMER uuid=%" PRId64 " expire_time=%"PRId64
+                " current_time=%"PRId64"\n",
+                timer->uuid, expire_time, current_time);
+    }
+}
+
+/*
+ * Release the timer ts.
+ *
+ * If the timer is still queued (prev != NULL) it is unlinked first, so that
+ * the task list never keeps a pointer to a released object.
+ */
+void blksim_free_timer (void *ts)
+{
+    SimAIOCB *acb = ts;
+
+    CHECK_TASK (acb->uuid);
+    if (acb->prev) {
+        /* Still scheduled: remove it from the list before releasing it. */
+        acb->next->prev = acb->prev;
+        acb->prev->next = acb->next;
+        acb->prev = NULL;
+    }
+    my_qemu_aio_release (acb);
+}
+
+/*
+ * Cancel a pending timer without releasing it. Does nothing if the timer is
+ * not queued.
+ */
+void blksim_del_timer (void *ts)
+{
+    SimAIOCB *timer = ts;
+
+    CHECK_TASK (timer->uuid);
+    if (!timer->prev) {
+        return;                 /* Not in the list: nothing to cancel. */
+    }
+
+    /* Unlink it from the list. */
+    timer->next->prev = timer->prev;
+    timer->prev->next = timer->next;
+
+    /* A NULL prev marks the timer as not queued. */
+    timer->prev = NULL;
+}
+
+/*
+ * Schedule the bottom half bh as a timer task: at time -1 when instant
+ * bottom halves are enabled, otherwise at current_time.
+ */
+void blksim_bh_schedule (void *bh)
+{
+    blksim_mod_timer (bh, instant_qemubh ? -1 : current_time);
+}
+
+/*
+ * Set the instant_qemubh flag, which blksim_bh_schedule() consults: non-zero
+ * schedules bottom halves at time -1, zero schedules them at current_time.
+ */
+void blksim_set_instant_qemubh (int instant)
+{
+ instant_qemubh = instant;
+}
+
+/*
+ * Set the result code that insert_task() stores in every task created from
+ * now on. run_task_by_acb() performs the real transfer only when a task's
+ * code is 0.
+ */
+void blksim_set_disk_io_return_code (int ret)
+{
+ disk_io_return_code = ret;
+}
+
+/*
+ * Remove acb from the task list and execute it.
+ *
+ * The simulated clock (current_time) is advanced to the task's time if that
+ * lies in the future. What happens next depends on acb->op:
+ *   - SIM_TIMER: the stored callback is invoked as a timer callback.
+ *   - SIM_READ / SIM_WRITE: if acb->ret is 0 the data is transferred
+ *     synchronously (an I/O vector with more than one element goes through a
+ *     temporary bounce buffer); the completion stage is then queued with
+ *     insert_aio_callback().
+ *   - SIM_FLUSH: no real flush is done; only the completion stage is queued.
+ *   - SIM_*_CALLBACK: the completion callback runs and the task is released.
+ * A failed transfer or an unknown op terminates the process.
+ */
+static void run_task_by_acb (SimAIOCB * acb)
+{
+    CHECK_TASK (acb->uuid);
+
+    /* Remove it from the list. */
+    acb->next->prev = acb->prev;
+    acb->prev->next = acb->next;
+    acb->prev = NULL;   /* Indicate that it is no longer in the list. */
+
+    if (acb->time > current_time) {
+        current_time = acb->time;
+    }
+
+    if (acb->op == SIM_TIMER) {
+        QDEBUG ("SIM: execute task%" PRId64 " time=%" PRId64 " TIMER \n",
+                acb->uuid, acb->time);
+
+        ((QEMUTimerCB *) acb->common.cb) (acb->common.opaque);
+        return;
+    }
+
+    BlockDriverState *bs = acb->common.bs;
+
+    if (acb->op == SIM_READ) {
+        QDEBUG ("SIM: execute task%" PRId64 " time=%" PRId64
+                " READ %s sector_num=%" PRId64 " nb_sectors=%d\n",
+                acb->uuid, acb->time, bs->filename, acb->sector_num,
+                acb->nb_sectors);
+
+        if (acb->ret == 0) {
+            if (acb->qiov->niov == 1) {
+                /* Single element: read straight into the caller's buffer. */
+                if (blksim_read (bs, acb->sector_num,
+                                 acb->qiov->iov->iov_base,
+                                 acb->nb_sectors) != 0) {
+                    fprintf (stderr, "Error in reading %s sector_num=%" PRId64
+                             " nb_sectors=%d\n", acb->common.bs->filename,
+                             acb->sector_num, acb->nb_sectors);
+                    exit (1);
+                }
+            } else {
+                /* Several elements: read into a bounce buffer and scatter. */
+                uint8_t *buf = qemu_blockalign (acb->common.bs,
+                                                acb->qiov->size);
+                if (blksim_read (bs, acb->sector_num, buf,
+                                 acb->nb_sectors) != 0) {
+                    fprintf (stderr, "Error in reading %s sector_num=%" PRId64
+                             " nb_sectors=%d\n", acb->common.bs->filename,
+                             acb->sector_num, acb->nb_sectors);
+                    exit (1);
+                }
+                qemu_iovec_from_buffer (acb->qiov, buf, acb->qiov->size);
+                qemu_vfree (buf);
+            }
+        }
+
+        insert_aio_callback (acb);
+    } else if (acb->op == SIM_WRITE) {
+        QDEBUG ("SIM: execute task%" PRId64 " time=%" PRId64
+                " WRITE %s sector_num=%" PRId64 " nb_sectors=%d\n",
+                acb->uuid, acb->time, bs->filename,
+                acb->sector_num, acb->nb_sectors);
+
+        if (acb->ret == 0) {
+            if (acb->qiov->niov == 1) {
+                /* Single element: write straight from the caller's buffer. */
+                if (blksim_write (bs, acb->sector_num,
+                                  acb->qiov->iov->iov_base,
+                                  acb->nb_sectors) != 0) {
+                    fprintf (stderr, "Error in writing %s sector_num=%" PRId64
+                             " nb_sectors=%d\n", acb->common.bs->filename,
+                             acb->sector_num, acb->nb_sectors);
+                    exit (1);
+                }
+            } else {
+                /* Several elements: gather into a bounce buffer first. */
+                uint8_t *buf = qemu_blockalign (acb->common.bs,
+                                                acb->qiov->size);
+                qemu_iovec_to_buffer (acb->qiov, buf);
+                if (blksim_write (bs, acb->sector_num, buf,
+                                  acb->nb_sectors) != 0) {
+                    fprintf (stderr, "Error in writing %s sector_num=%" PRId64
+                             " nb_sectors=%d\n", acb->common.bs->filename,
+                             acb->sector_num, acb->nb_sectors);
+                    exit (1);
+                }
+                qemu_vfree (buf);
+            }
+        }
+
+        insert_aio_callback (acb);
+    } else if (acb->op == SIM_FLUSH) {
+        QDEBUG ("SIM: execute task%" PRId64 " time=%" PRId64 " FLUSH %s\n",
+                acb->uuid, acb->time, bs->filename);
+        /* Real flushing (fdatasync) is skipped to speed up the simulation. */
+        insert_aio_callback (acb);
+    } else if (acb->op == SIM_WRITE_CALLBACK || acb->op == SIM_READ_CALLBACK
+               || acb->op == SIM_FLUSH_CALLBACK) {
+        QDEBUG ("SIM: execute task%" PRId64 " time=%" PRId64 " CALLBACK\n",
+                acb->uuid, acb->time);
+        sim_callback (acb);
+        CHECK_TASK (acb->uuid);
+        my_qemu_aio_release (acb);
+    } else {
+        fprintf (stderr, "Unknown op %d\n", acb->op);
+        exit (1);
+    }
+}
+
+/*
+ * Execute the first queued task whose uuid matches.
+ * Returns 0 if such a task was found and run, -1 if none is queued.
+ */
+int blksim_run_task_by_uuid (int64_t uuid)
+{
+    SimAIOCB *cur = head.next;
+
+    while (cur != &head && cur->uuid != uuid) {
+        cur = cur->next;
+    }
+
+    if (cur == &head) {
+        return -1;
+    }
+
+    run_task_by_acb (cur);
+    return 0;
+}
+
+/*
+ * Keep executing the task at the front of the list until the list is empty,
+ * including tasks that get queued while others run.
+ * Returns the number of tasks executed.
+ */
+int blksim_run_all_tasks (void)
+{
+    int executed;
+
+    for (executed = 0; head.next != &head; executed++) {
+        run_task_by_acb (head.next);
+    }
+
+    return executed;            /* No more tasks. */
+}
+
+/* Asynchronous vectored read: queue a SIM_READ task for the request. */
+static BlockDriverAIOCB *blksim_aio_readv (BlockDriverState * bs,
+                                           int64_t sector_num,
+                                           QEMUIOVector * qiov,
+                                           int nb_sectors,
+                                           BlockDriverCompletionFunc * cb,
+                                           void *opaque)
+{
+    BlockDriverAIOCB *task;
+
+    task = insert_task (SIM_READ, bs, sector_num, qiov, nb_sectors, cb,
+                        opaque);
+    return task;
+}
+
+/* Asynchronous vectored write: queue a SIM_WRITE task for the request. */
+static BlockDriverAIOCB *blksim_aio_writev (BlockDriverState * bs,
+                                            int64_t sector_num,
+                                            QEMUIOVector * qiov,
+                                            int nb_sectors,
+                                            BlockDriverCompletionFunc * cb,
+                                            void *opaque)
+{
+    BlockDriverAIOCB *task;
+
+    task = insert_task (SIM_WRITE, bs, sector_num, qiov, nb_sectors, cb,
+                        opaque);
+    return task;
+}
+
+/*
+ * Asynchronous flush: queue a SIM_FLUSH task. A flush carries no data, so the
+ * sector number, I/O vector and sector count are passed as 0/NULL/0.
+ */
+static BlockDriverAIOCB *blksim_aio_flush (BlockDriverState * bs,
+                                           BlockDriverCompletionFunc * cb,
+                                           void *opaque)
+{
+    BlockDriverAIOCB *task = insert_task (SIM_FLUSH, bs, 0, NULL, 0, cb,
+                                          opaque);
+
+    return task;
+}
+
+/*
+ * Cancel a queued task: unlink it from the task list and release it.
+ * Cancelling a task that is not queued is treated as fatal: the process
+ * reports the problem, waits for a key press so that a debugger can be
+ * attached, and then exits.
+ */
+static void sim_aio_cancel (BlockDriverAIOCB * blockacb)
+{
+    SimAIOCB *task = container_of (blockacb, SimAIOCB, common);
+
+    CHECK_TASK (task->uuid);
+    QDEBUG ("SIM: cancel task%" PRId64 "\n", task->uuid);
+
+    if (!task->prev) {
+        fprintf (stderr, "Error: cancel a blksim task that does not exist: "
+                 "uuid=%"PRId64". Halt process %d for debugging...\n",
+                 task->uuid, getpid());
+        fgetc (stdin);
+        exit (1);
+    }
+
+    task->next->prev = task->prev;
+    task->prev->next = task->next;
+    task->prev = NULL;
+    my_qemu_aio_release (task);
+}
+
+/*
+ * Open the file behind a blksim device.
+ *
+ * bdrv_flags selects the open mode: BDRV_O_RDWR gives read-write access
+ * (read-only otherwise), BDRV_O_NOCACHE adds O_DIRECT, and without
+ * BDRV_O_CACHE_WB the file is opened with O_DSYNC. An optional "blksim:"
+ * prefix on filename is skipped. On success the device size in 512-byte
+ * sectors is taken from the file length (0 if the length cannot be
+ * determined) and the device is marked growable.
+ *
+ * Returns 0 on success or a negative errno value if open() fails, so that
+ * callers can tell the failure reason apart.
+ */
+static int blksim_open (BlockDriverState * bs, const char *filename,
+                        int bdrv_flags)
+{
+    static const char prefix[] = "blksim:";
+    const size_t prefix_len = sizeof (prefix) - 1;
+    BDRVSimState *s = bs->opaque;
+    int open_flags = O_BINARY | O_LARGEFILE;
+    int64_t len;
+
+    /* Remember that blksim is in use; reported by using_blksim(). */
+    blksim_invoked = TRUE;
+
+    if ((bdrv_flags & BDRV_O_RDWR)) {
+        open_flags |= O_RDWR;
+    } else {
+        open_flags |= O_RDONLY;
+    }
+
+    if ((bdrv_flags & BDRV_O_NOCACHE)) {
+        open_flags |= O_DIRECT;
+    } else if (!(bdrv_flags & BDRV_O_CACHE_WB)) {
+        open_flags |= O_DSYNC;
+    }
+
+    /* Parse the "blksim:" prefix */
+    if (!strncmp (filename, prefix, prefix_len)) {
+        filename += prefix_len;
+    }
+
+    s->fd = open (filename, open_flags);
+    if (s->fd < 0) {
+        return -errno;
+    }
+
+    len = lseek (s->fd, 0, SEEK_END);
+    bs->total_sectors = (len >= 0) ? len / 512 : 0;
+
+    bs->growable = 1;
+    return 0;
+}
+
+/* Close the file descriptor that blksim_open() stored in the driver state. */
+static void blksim_close (BlockDriverState * bs)
+{
+    BDRVSimState *state = bs->opaque;
+    int fd = state->fd;
+
+    close (fd);
+}
+
+/*
+ * Synchronous flush entry point. Deliberately a no-op that always returns 0;
+ * bs is unused.
+ */
+static int blksim_flush (BlockDriverState * bs)
+{
+ /*
+ * Skip real flushing to speed up simulation.
+ * BDRVSimState *s = bs->opaque;
+ * fdatasync (s->fd);
+ */
+ return 0;
+}
+
+/*
+ * Returns 0 when the backing file is a block or character device and 1
+ * otherwise.
+ *
+ * The file type is queried with fstat() on the descriptor opened by
+ * blksim_open() rather than with stat() on bs->filename: blksim_open() skips
+ * the "blksim:" prefix only in its local copy of the name, so bs->filename
+ * may still carry it and would then not name an existing file.
+ * Terminates the process if the query fails.
+ */
+static int blksim_has_zero_init (BlockDriverState * bs)
+{
+    BDRVSimState *s = bs->opaque;
+    struct stat buf;
+
+    if (fstat (s->fd, &buf) != 0) {
+        fprintf (stderr, "Failed to stat() %s\n", bs->filename);
+        exit (1);
+    }
+
+    if (S_ISBLK (buf.st_mode) || S_ISCHR (buf.st_mode)) {
+        return 0;
+    }
+
+    return 1;
+}
+
+/*
+ * Resize the backing file to offset bytes.
+ * Returns 0 on success or a negative errno value on failure (ftruncate()
+ * itself only returns -1 and reports the reason through errno).
+ */
+static int blksim_truncate (BlockDriverState * bs, int64_t offset)
+{
+    BDRVSimState *s = bs->opaque;
+
+    if (ftruncate (s->fd, offset) < 0) {
+        return -errno;
+    }
+
+    return 0;
+}
+
+/* Driver description handed to bdrv_register() by bdrv_blksim_init(). */
+static BlockDriver bdrv_blksim = {
+    /* Identification and per-device state size. */
+    .format_name        = "blksim",
+    .protocol_name      = "blksim",
+    .instance_size      = sizeof (BDRVSimState),
+
+    /* Device life cycle and size handling. */
+    .bdrv_file_open     = blksim_open,
+    .bdrv_close         = blksim_close,
+    .bdrv_truncate      = blksim_truncate,
+    .bdrv_has_zero_init = blksim_has_zero_init,
+
+    /* Synchronous I/O. */
+    .bdrv_read          = blksim_read,
+    .bdrv_write         = blksim_write,
+    .bdrv_flush         = blksim_flush,
+
+    /* Asynchronous I/O, served through the simulated task list. */
+    .bdrv_aio_readv     = blksim_aio_readv,
+    .bdrv_aio_writev    = blksim_aio_writev,
+    .bdrv_aio_flush     = blksim_aio_flush,
+};
+
+/*
+ * Register the blksim driver description via bdrv_register().
+ * NOTE(review): block_init() presumably arranges for this function to run
+ * during start-up -- confirm against its definition.
+ */
+static void bdrv_blksim_init(void)
+{
+ bdrv_register(&bdrv_blksim);
+}
+block_init(bdrv_blksim_init);
+
+/*
+ * Configure the simulator.
+ *   print      - non-zero makes the simulator print a line for every task
+ *                it queues.
+ *   _rand_time - upper bound for the random delay added to queued tasks;
+ *                a value <= 0 disables delay randomization.
+ */
+void init_blksim (int print, int64_t _rand_time)
+{
+    rand_time = _rand_time;
+    interactive_print = print;
+}
+
+/*
+ * To work properly in the simulation mode, block device drivers that
+ * explicitly invoke qemu_aio_wait() should invoke blksim_qemu_aio_wait() if
+ * the block device is opened using blksim. Most block device drivers do not
+ * invoke qemu_aio_wait() and hence should not be concerned about this.
+ */
+int blksim_qemu_aio_wait (void)
+{
+    SimAIOCB *first = head.next;
+
+    /* Run the task at the front of the list, if any. The return value says
+     * whether a task was executed (1) or the list was empty (0). */
+    if (first != &head) {
+        run_task_by_acb (first);
+        return 1;
+    }
+
+    return 0;
+}
+
+/* Return non-zero if at least one task is queued in the task list. */
+int blksim_has_task (void)
+{
+    const int pending = (head.next != &head);
+
+    return pending;
+}
+
+/*
+ * Return the blksim_invoked flag, which blksim_open() sets to TRUE the first
+ * time a device is opened through this driver.
+ */
+int using_blksim (void)
+{
+ return blksim_invoked;
+}
diff --git a/block/blksim.h b/block/blksim.h
new file mode 100644
index 0000000..fa1e20d
--- /dev/null
+++ b/block/blksim.h
@@ -0,0 +1,35 @@
+/*
+ * Copyright (c) 2010-2011 IBM
+ *
+ * Authors:
+ * Chunqiang Tang<ct...@us.ibm.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.
+ * See the COPYING file in the top-level directory.
+ */
+
+/*=============================================================================
+ * A short description: this is the header of the simulated block device
+ * driver "blksim".
+
*============================================================================*/
+
+#ifndef __block_sim_h__
+#define __block_sim_h__