This patch adds the core functions for request-based dm: cloning incoming
requests and mapping them to targets (dm_request_fn(), clone_and_map_request()),
completion handling for the clones (clone_end_request()), and stopping/starting
the queue across suspend and resume.
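
As an illustration of how the dm_get_rq_mapinfo() helper exported below is
meant to be used, here is a minimal, hedged sketch of a request-based
target's rq_end_io_first hook.  The hook name and its (ti, clone, error,
map_context) signature follow the fields clone_end_request() dereferences;
example_rq_end_io_first() itself is hypothetical and not part of this patch.

#include <linux/blkdev.h>
#include <linux/device-mapper.h>
#include "dm.h"			/* dm_get_rq_mapinfo() */

static int example_rq_end_io_first(struct dm_target *ti, struct request *clone,
				   int error, union map_info *map_context)
{
	/* The same map_info that map_rq() received; the helper reads it
	 * back from clone->end_io_data. */
	union map_info *info = dm_get_rq_mapinfo(clone);

	WARN_ON(info != map_context);

	/*
	 * Returning DM_ENDIO_REQUEUE instead would ask the dm core to push
	 * the original request back (e.g. for noflush suspension).
	 */
	return 0;
}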

Signed-off-by: Kiyoshi Ueda <[EMAIL PROTECTED]>
Signed-off-by: Jun'ichi Nomura <[EMAIL PROTECTED]>
---
 drivers/md/dm.c |  452 +++++++++++++++++++++++++++++++++++++++++++++++++++++++-
 drivers/md/dm.h |    7 
 2 files changed, 456 insertions(+), 3 deletions(-)
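
For reference, a minimal, hedged sketch of the map_rq contract that
clone_and_map_request() expects from a request-based target, using the
dm_underlying_device_congested() helper added here.  The map_rq hook and the
DM_MAPIO_* return values are the ones this patch interprets; struct
example_tgt, example_map_rq() and the single-underlying-device layout are
hypothetical.

#include <linux/blkdev.h>
#include <linux/device-mapper.h>
#include "dm.h"			/* dm_underlying_device_congested() */

/* hypothetical per-target private data: one underlying device */
struct example_tgt {
	struct block_device *bdev;
};

static int example_map_rq(struct dm_target *ti, struct request *clone,
			  union map_info *map_context)
{
	struct example_tgt *t = ti->private;
	struct request_queue *q = bdev_get_queue(t->bdev);

	if (dm_underlying_device_congested(q))
		/* ask the dm core to requeue the original request */
		return DM_MAPIO_REQUEUE;

	/* remap the clone; the dm core dispatches it for us */
	clone->q = q;
	clone->rq_disk = t->bdev->bd_disk;
	map_context->ptr = NULL;	/* no per-request context in this sketch */

	return DM_MAPIO_REMAPPED;
}

A target that queues clones internally (as dm-multipath does) would instead
dispatch them itself with dm_dispatch_request() and return DM_MAPIO_SUBMITTED.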

Index: 2.6.25-rc1/drivers/md/dm.c
===================================================================
--- 2.6.25-rc1.orig/drivers/md/dm.c
+++ 2.6.25-rc1/drivers/md/dm.c
@@ -75,6 +75,14 @@ union map_info *dm_get_mapinfo(struct bi
        return NULL;
 }
 
+union map_info *dm_get_rq_mapinfo(struct request *rq)
+{
+       if (rq && rq->end_io_data)
+               return &((struct dm_rq_target_io *)rq->end_io_data)->info;
+       return NULL;
+}
+EXPORT_SYMBOL_GPL(dm_get_rq_mapinfo);
+
 #define MINOR_ALLOCED ((void *)-1)
 
 /*
@@ -86,6 +94,7 @@ union map_info *dm_get_mapinfo(struct bi
 #define DMF_FREEING 3
 #define DMF_DELETING 4
 #define DMF_NOFLUSH_SUSPENDING 5
+#define DMF_REQUEST_BASED 6
 
 /*
  * Work processed by per-device workqueue.
@@ -158,6 +167,9 @@ struct mapped_device {
 
        /* forced geometry settings */
        struct hd_geometry geometry;
+
+       /* Saved address of __make_request(), used by request-based dm */
+       make_request_fn *saved_make_request_fn;
 };
 
 #define MIN_IOS 256
@@ -395,6 +407,17 @@ static void free_tio(struct mapped_devic
        mempool_free(tio, md->tio_pool);
 }
 
+static inline struct dm_rq_target_io *alloc_rq_tio(struct mapped_device *md)
+{
+       return mempool_alloc(md->tio_pool, GFP_ATOMIC);
+}
+
+static inline void free_rq_tio(struct mapped_device *md,
+                              struct dm_rq_target_io *tio)
+{
+       mempool_free(tio, md->tio_pool);
+}
+
 static void start_io_acct(struct dm_io *io)
 {
        struct mapped_device *md = io->md;
@@ -583,6 +606,181 @@ static void clone_endio(struct bio *bio,
        free_tio(md, tio);
 }
 
+static void __requeue_request(struct request_queue *q, struct request *rq)
+{
+       if (elv_queue_empty(q))
+               blk_plug_device(q);
+       blk_requeue_request(q, rq);
+}
+
+static void requeue_request(struct request_queue *q, struct request *rq)
+{
+       unsigned long flags = 0UL;
+
+       spin_lock_irqsave(q->queue_lock, flags);
+       __requeue_request(q, rq);
+       spin_unlock_irqrestore(q->queue_lock, flags);
+}
+
+static void dec_rq_pending(struct dm_rq_target_io *tio)
+{
+       if (!atomic_dec_return(&tio->md->pending))
+               /* nudge anyone waiting on suspend queue */
+               wake_up(&tio->md->wait);
+}
+
+static void blk_update_cloned_rq(struct request *rq, struct request *clone)
+{
+       clone->nr_phys_segments = rq->nr_phys_segments;
+       clone->nr_hw_segments = rq->nr_hw_segments;
+       clone->current_nr_sectors = rq->current_nr_sectors;
+       clone->hard_cur_sectors = rq->hard_cur_sectors;
+       clone->hard_nr_sectors = rq->hard_nr_sectors;
+       clone->nr_sectors = rq->nr_sectors;
+       clone->hard_sector = rq->hard_sector;
+       clone->sector = rq->sector;
+       clone->data_len = rq->data_len;
+       clone->buffer = rq->buffer;
+       clone->data = rq->data;
+       clone->bio = rq->bio;
+       clone->biotail = rq->biotail;
+}
+
+static void finish_clone(struct request *clone)
+{
+       if (!clone->q)
+               /*
+                * The clone was never dispatched to an underlying device,
+                * which means the caller is not an underlying device driver
+                * but dm itself (e.g. dispatch_queued_ios() of
+                * dm-multipath).
+                * So there is nothing to do for this clone here.
+                */
+               return;
+
+       /*
+        * This only cleans up the bookkeeping of the queue into which the
+        * clone was dispatched.
+        * The clone is *NOT* actually freed here, because it was allocated
+        * from dm's own mempool and REQ_ALLOCED isn't set in clone->cmd_flags.
+        *
+        * The 'error' and 'nr_bytes' arguments of blk_end_io() don't matter
+        * because they aren't used for dm's clones.
+        */
+       if (blk_end_io(clone, 0, 0, 0, NULL))
+               DMWARN("dm ignores the immediate return request of callback.");
+}
+
+static void clean_clone(struct request *clone)
+{
+       finish_clone(clone);
+       clone->special = NULL;
+       clone->errors = 0;
+       clone->endio_error = 0;
+}
+
+/*
+ * Must be called without the queue lock held.
+ */
+static int clone_end_request(struct request *clone, int error,
+                            unsigned int nr_bytes, unsigned int bidi_bytes,
+                            int (drv_callback)(struct request *))
+{
+       int r = 0, rw = rq_data_dir(clone), requeued = 0;
+       struct dm_rq_target_io *tio = clone->end_io_data;
+       dm_request_endio_first_fn endio_first = tio->ti->type->rq_end_io_first;
+       dm_request_endio_fn endio = tio->ti->type->rq_end_io;
+       dm_request_queue_in_tgt_fn queue_in_tgt = tio->ti->type->queue_in_tgt;
+       struct request *orig = tio->orig;
+       struct request_queue *q_orig = orig->q;
+
+       if (blk_fs_request(clone) && clone->rq_disk)
+               disk_stat_add(clone->rq_disk, sectors[rw], nr_bytes >> 9);
+
+       if (endio_first) {
+               r = endio_first(tio->ti, clone, error, &tio->info);
+               switch (r) {
+               case 0:
+                       /* Succeeded */
+                       break;
+               case DM_ENDIO_INCOMPLETE:
+                       /*
+                        * The target wants to handle the I/O itself,
+                        * without it being unmapped here.
+                        *
+                        * The clone must be cleaned up before the target
+                        * takes it over, so that the target can dispatch it
+                        * to the same or another underlying device again.
+                        */
+                       clean_clone(clone);
+
+                       if (!queue_in_tgt) {
+                               DMERR("queue_in_tgt isn't implemented.");
+                               BUG();
+                       }
+                       queue_in_tgt(tio->ti, clone, &tio->info);
+                       blk_run_queue(q_orig);
+
+                       return 0;
+               case DM_ENDIO_REQUEUE:
+                       /*
+                        * The target wants to push back the I/O for noflush
+                        * suspension.
+                        * Don't invoke blk_run_queue() in this case so that
+                        * the requeued request won't be dispatched again soon.
+                        */
+                       requeue_request(q_orig, orig);
+                       requeued = 1;
+
+                       goto free_clone;
+               default:
+                       if (r >= 0) {
+                               DMWARN("unimplemented target endio return"
+                                      " value: %d", r);
+                               BUG();
+                       }
+
+                       /*
+                        * The target detected an error but didn't retry.
+                        * Pass the error up to the upper layer.
+                        */
+                       error = r;
+                       break;
+               }
+       }
+
+       /* Complete the original request's chunk */
+       r = blk_end_request(orig, error, nr_bytes);
+
+       /*
+        * Recopy the original request fields that were updated
+        * in blk_end_request() to the clone.
+        */
+       blk_update_cloned_rq(orig, clone);
+
+       if (r)
+               /* The original request still has data left to complete */
+               return 1;
+
+free_clone:
+       /*
+        * The original request has now been completed and freed, or requeued,
+        * so the clone is no longer needed.
+        */
+
+       if (endio)
+               endio(tio->ti, clone, error, &tio->info);
+
+       finish_clone(clone);
+
+       if (!requeued)
+               blk_run_queue(q_orig);
+
+       dec_rq_pending(tio);
+       free_rq_tio(tio->md, tio);
+
+       return 0;
+}
+
 static sector_t max_io_len(struct mapped_device *md,
                           sector_t sector, struct dm_target *ti)
 {
@@ -854,7 +1052,7 @@ static int __split_bio(struct mapped_dev
  * The request function that just remaps the bio built up by
  * dm_merge_bvec.
  */
-static int dm_request(struct request_queue *q, struct bio *bio)
+static int _dm_request(struct request_queue *q, struct bio *bio)
 {
        int r = -EIO;
        int rw = bio_data_dir(bio);
@@ -904,12 +1102,203 @@ out_req:
        return 0;
 }
 
+static int dm_make_request(struct request_queue *q, struct bio *bio)
+{
+       int r = 0;
+       struct mapped_device *md = q->queuedata;
+
+       if (unlikely(bio_barrier(bio))) {
+               bio_endio(bio, -EOPNOTSUPP);
+               return 0;
+       }
+
+       if (unlikely(!md->map)) {
+               bio_endio(bio, -EIO);
+               return 0;
+       }
+
+       r = md->saved_make_request_fn(q, bio); /* call __make_request() */
+
+       return r;
+}
+
+static int dm_request(struct request_queue *q, struct bio *bio)
+{
+       struct mapped_device *md = q->queuedata;
+
+       if (test_bit(DMF_REQUEST_BASED, &md->flags))
+               return dm_make_request(q, bio);
+       else
+               return _dm_request(q, bio);
+}
+
+static void setup_clone(struct request *clone, struct request *rq)
+{
+       INIT_LIST_HEAD(&clone->queuelist);
+       INIT_LIST_HEAD(&clone->donelist);
+       clone->q = NULL;
+       clone->cmd_flags = (rq_data_dir(rq) | REQ_NOMERGE | REQ_CLONED);
+       clone->cmd_type = rq->cmd_type;
+       clone->sector = rq->sector;
+       clone->hard_sector = rq->hard_sector;
+       clone->nr_sectors = rq->nr_sectors;
+       clone->hard_nr_sectors = rq->hard_nr_sectors;
+       clone->current_nr_sectors = rq->current_nr_sectors;
+       clone->hard_cur_sectors = rq->hard_cur_sectors;
+       clone->bio = rq->bio;
+       clone->biotail = rq->biotail;
+       INIT_HLIST_NODE(&clone->hash);
+/*     RB_CLEAR_NODE(&clone->rb_node);*/
+       clone->completion_data = NULL;
+       clone->elevator_private = NULL;
+       clone->elevator_private2 = NULL;
+       clone->rq_disk = NULL;
+       clone->start_time = jiffies;
+       clone->nr_phys_segments = rq->nr_phys_segments;
+       clone->nr_hw_segments = rq->nr_hw_segments;
+       clone->ioprio = rq->ioprio;
+       clone->special = NULL;
+       clone->buffer = rq->buffer;
+       clone->tag = -1;
+       clone->errors = 0;
+       clone->ref_count = 1;
+       clone->cmd_len = rq->cmd_len;
+       memcpy(clone->cmd, rq->cmd, sizeof(rq->cmd));
+       clone->data_len = rq->data_len;
+       clone->sense_len = rq->sense_len;
+       clone->data = rq->data;
+       clone->sense = rq->sense;
+       clone->timeout = 0;
+       clone->retries = 0;
+/*     clone->dtor = NULL;
+       clone->dtor_data = NULL;*/
+       clone->end_io = NULL;
+       clone->complete_io = clone_end_request;
+       clone->end_io_data = NULL;
+       clone->next_rq = NULL;
+       clone->endio_error = 0;
+}
+
+void dm_dispatch_request(struct request_queue *q, struct request *rq)
+{
+       rq->start_time = jiffies;
+       blk_submit_request(q, rq);
+}
+EXPORT_SYMBOL_GPL(dm_dispatch_request);
+
+static int clone_and_map_request(struct dm_target *ti, struct request *rq,
+                                struct mapped_device *md)
+{
+       int r;
+       struct request *clone;
+       struct dm_rq_target_io *tio;
+
+       tio = alloc_rq_tio(md); /* only one for each original request */
+       if (!tio)
+               /* -ENOMEM */
+               goto requeue;
+       tio->md = md;
+       tio->error = 0;
+       tio->orig = rq;
+       tio->ti = ti;
+       memset(&tio->info, 0, sizeof(tio->info));
+
+       clone = &tio->clone;
+       setup_clone(clone, rq);
+       clone->end_io_data = tio;
+
+       atomic_inc(&md->pending);
+       r = ti->type->map_rq(ti, clone, &tio->info);
+       switch (r) {
+       case DM_MAPIO_SUBMITTED:
+               /* the target has taken the request and will submit it itself */
+               break;
+       case DM_MAPIO_REMAPPED:
+               /* the clone has been remapped so dispatch it */
+               dm_dispatch_request(clone->q, clone);
+               break;
+       case DM_MAPIO_REQUEUE:
+               /* the target has requested to requeue the original request */
+               dec_rq_pending(tio);
+               free_rq_tio(md, tio);
+               goto requeue;
+       default:
+               if (r >= 0) {
+                       DMWARN("unimplemented target map return value: %d", r);
+                       BUG();
+               }
+
+               dec_rq_pending(tio);
+               free_rq_tio(md, tio);
+
+               /* Avoid printing "I/O error"; we didn't actually do any I/O */
+               rq->cmd_flags |= REQ_QUIET;
+               blk_end_request(rq, -EIO, blk_rq_bytes(rq));
+               break;
+       }
+
+       return 0;
+
+requeue:
+       /*
+        * The actual requeue is done in dm_request_fn() after the queue lock
+        * has been retaken, so we avoid taking the queue lock an extra time
+        * just for the requeue.
+        */
+       return 1;
+}
+
+int dm_underlying_device_congested(struct request_queue *q)
+{
+       return blk_lld_busy(q);
+}
+EXPORT_SYMBOL_GPL(dm_underlying_device_congested);
+
+/*
+ * q->request_fn for request-based dm.
+ * Called with q->queue_lock held.
+ */
+static void dm_request_fn(struct request_queue *q)
+{
+       int r;
+       struct mapped_device *md = q->queuedata;
+       struct dm_table *map = dm_get_table(md);
+       struct dm_target *ti;
+       dm_congested_fn congested;
+       struct request *rq;
+
+       while (!blk_queue_plugged(q) && !blk_queue_stopped(q)) {
+               rq = elv_next_request(q);
+               if (!rq)
+                       break;
+
+               ti = dm_table_find_target(map, rq->sector);
+               congested = ti->type->congested;
+               if (congested && congested(ti))
+                       break;
+
+               blkdev_dequeue_request(rq);
+               spin_unlock(q->queue_lock);
+               r = clone_and_map_request(ti, rq, md);
+               spin_lock_irq(q->queue_lock);
+
+               if (r)
+                       __requeue_request(q, rq);
+       }
+
+       dm_table_put(map);
+}
+
 static void dm_unplug_all(struct request_queue *q)
 {
        struct mapped_device *md = q->queuedata;
        struct dm_table *map = dm_get_table(md);
 
        if (map) {
+               if (test_bit(DMF_REQUEST_BASED, &md->flags))
+                       generic_unplug_device(q);
+
                dm_table_unplug_all(map);
                dm_table_put(map);
        }
@@ -923,6 +1312,9 @@ static int dm_any_congested(void *conges
 
        if (!map || test_bit(DMF_BLOCK_IO, &md->flags))
                r = bdi_bits;
+       else if (test_bit(DMF_REQUEST_BASED, &md->flags))
+               /* Request-based dm cares only about its own queue */
+               r = md->queue->backing_dev_info.state & bdi_bits;
        else
                r = dm_table_any_congested(map, bdi_bits);
 
@@ -1417,6 +1809,25 @@ out:
        return r;
 }
 
+static void stop_queue(struct request_queue *q)
+{
+       unsigned long flags;
+
+       spin_lock_irqsave(q->queue_lock, flags);
+       blk_stop_queue(q);
+       spin_unlock_irqrestore(q->queue_lock, flags);
+}
+
+static void start_queue(struct request_queue *q)
+{
+       unsigned long flags;
+
+       spin_lock_irqsave(q->queue_lock, flags);
+       if (blk_queue_stopped(q))
+               blk_start_queue(q);
+       spin_unlock_irqrestore(q->queue_lock, flags);
+}
+
 /*
  * Functions to lock and unlock any filesystem running on the
  * device.
@@ -1515,6 +1926,20 @@ int dm_suspend(struct mapped_device *md,
        add_wait_queue(&md->wait, &wait);
        up_write(&md->io_lock);
 
+       /*
+        * In request-based dm, stopping the request_queue prevents mapping.
+        * Even after the request_queue is stopped, requests submitted by the
+        * upper layer can still be inserted into it, so the original
+        * (unmapped) requests are kept in the request_queue during
+        * suspension.
+        *
+        * NOTE: To stop mapping correctly, dm_request_fn() must check the
+        *       queue-stopped status, because underlying device drivers
+        *       may call q->request_fn() directly through blk_run_queue().
+        */
+       if (test_bit(DMF_REQUEST_BASED, &md->flags))
+               stop_queue(md->queue);
+
        /* unplug */
        if (map)
                dm_table_unplug_all(map);
@@ -1527,14 +1952,23 @@ int dm_suspend(struct mapped_device *md,
        down_write(&md->io_lock);
        remove_wait_queue(&md->wait, &wait);
 
-       if (noflush)
-               __merge_pushback_list(md);
+       if (noflush) {
+               if (test_bit(DMF_REQUEST_BASED, &md->flags))
+                       /* Request-based dm uses md->queue for noflush */
+                       clear_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
+               else
+                       __merge_pushback_list(md);
+       }
        up_write(&md->io_lock);
 
        /* were we interrupted ? */
        if (r < 0) {
                dm_queue_flush(md, DM_WQ_FLUSH_DEFERRED, NULL);
 
+               if (test_bit(DMF_REQUEST_BASED, &md->flags))
+                       /* Request-based dm uses md->queue for deferred I/Os */
+                       start_queue(md->queue);
+
                unlock_fs(md);
                goto out; /* pushback list is already flushed, so skip flush */
        }
@@ -1573,6 +2007,18 @@ int dm_resume(struct mapped_device *md)
        if (r)
                goto out;
 
+       /*
+        * Flushing deferred I/Os must be done after the targets are resumed
+        * so that the targets can map them correctly.
+        *
+        * Resuming the request_queue before clearing DMF_BLOCK_IO means the
+        * deferred requests start being flushed before the upper layer starts
+        * submitting bios again.  That may be preferable, since the low-level
+        * drivers (LLDs) should be idle at this point and bio merging need
+        * not be waited for so strictly.
+        */
+       if (test_bit(DMF_REQUEST_BASED, &md->flags))
+               start_queue(md->queue);
+
        dm_queue_flush(md, DM_WQ_FLUSH_DEFERRED, NULL);
 
        unlock_fs(md);
Index: 2.6.25-rc1/drivers/md/dm.h
===================================================================
--- 2.6.25-rc1.orig/drivers/md/dm.h
+++ 2.6.25-rc1/drivers/md/dm.h
@@ -128,6 +128,12 @@ int dm_target_iterate(void (*iter_func)(
                                        void *param), void *param);
 
 /*-----------------------------------------------------------------
+ * Helper for block layer operations
+ *---------------------------------------------------------------*/
+void dm_dispatch_request(struct request_queue *q, struct request *rq);
+int dm_underlying_device_congested(struct request_queue *q);
+
+/*-----------------------------------------------------------------
  * Useful inlines.
  *---------------------------------------------------------------*/
 static inline int array_too_big(unsigned long fixed, unsigned long obj,
@@ -184,6 +190,7 @@ void dm_stripe_exit(void);
 
 void *dm_vcalloc(unsigned long nmemb, unsigned long elem_size);
 union map_info *dm_get_mapinfo(struct bio *bio);
+union map_info *dm_get_rq_mapinfo(struct request *rq);
 int dm_open_count(struct mapped_device *md);
 int dm_lock_for_deletion(struct mapped_device *md);
 