Re: [PATCH 0/15] copy offload patches

2015-12-14 Thread Mikulas Patocka


On Thu, 10 Dec 2015, Martin K. Petersen wrote:

> >>>>> "Mikulas" == Mikulas Patocka <mpato...@redhat.com> writes:
> 
> Mikulas,
> 
> Mikulas> This patch series adds copy offload (the XCOPY command) to the
> Mikulas> block layer, SCSI subsystems and device mapper.
> 
> Now that the VFS stuff appears to stabilize I agree it's a good time to
> revisit all this. I just merged the required VPD patches from Hannes so
> those will be in 4.5.
> 
> I have a bunch of changes to the SCSI code that I worked on over the
> spring/summer based on feedback from the array vendors after
> discussions we started at LSF/MM. Generally speaking, their comments
> didn't make things easier, nor prettier :( But your two bio approach is
> a requirement to accommodate those needs (token-based copy) so I'll work
> on consolidating your changes with mine.

Let me know when you make some progress with that.

Is there some software iSCSI implementation that supports token-based
copy, so that I could try to add support for it?

Mikulas

> That said, we still need Mike Christie's patches to go in first.
> 
> Mike: What's your status? I'm afraid I didn't get a chance to dig very
> deep in your series since it coincided with me scrambling to sort out
> SCSI for 4.4. Do you think there's a chance we could get your patches in
> shape for 4.5? Is there an up-to-date tree I can look at?
> 
> -- 
> Martin K. Petersen	Oracle Linux Engineering

Mikulas


[PATCH 14/15] dm kcopyd: change mutex to spinlock

2015-12-10 Thread Mikulas Patocka
job->lock is only taken for a finite amount of time and the process
doesn't block while holding it, so change it from a mutex to a spinlock.

This change is needed for the next patch that makes it possible to call
segment_complete from an interrupt. Taking mutexes inside an interrupt is
not allowed.
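
For reference, a minimal sketch of what this enables (based on the
conversion in patch 15 of this series, which makes segment_complete run
as a completion callback, possibly in interrupt context):

	unsigned long flags;

	/* a spinlock may be taken in interrupt context ... */
	spin_lock_irqsave(&job->lock, flags);
	/* ... update job->progress and the error fields here ... */
	spin_unlock_irqrestore(&job->lock, flags);

	/*
	 * mutex_lock(&job->lock) would be a bug here: mutex_lock() may
	 * sleep, and sleeping is not allowed in interrupt context.
	 */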

Signed-off-by: Mikulas Patocka <mpato...@redhat.com>

---
 drivers/md/dm-kcopyd.c |   10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

Index: linux-3.16-rc5/drivers/md/dm-kcopyd.c
===================================================================
--- linux-3.16-rc5.orig/drivers/md/dm-kcopyd.c	2014-07-15 19:20:34.000000000 +0200
+++ linux-3.16-rc5/drivers/md/dm-kcopyd.c	2014-07-15 19:24:20.000000000 +0200
@@ -21,7 +21,7 @@
 #include <linux/slab.h>
 #include <linux/vmalloc.h>
 #include <linux/workqueue.h>
-#include <linux/mutex.h>
+#include <linux/spinlock.h>
 #include <linux/delay.h>
 #include <linux/device-mapper.h>
 #include <linux/dm-kcopyd.h>
@@ -356,7 +356,7 @@ struct kcopyd_job {
 * These fields are only used if the job has been split
 * into more manageable parts.
 */
-   struct mutex lock;
+   spinlock_t lock;
atomic_t sub_jobs;
sector_t progress;
 
@@ -629,7 +629,7 @@ static void segment_complete(int read_er
struct kcopyd_job *job = sub_job->master_job;
struct dm_kcopyd_client *kc = job->kc;
 
-	mutex_lock(&job->lock);
+	spin_lock(&job->lock);
 
/* update the error */
if (read_err)
@@ -653,7 +653,7 @@ static void segment_complete(int read_er
job->progress += count;
}
}
-	mutex_unlock(&job->lock);
+	spin_unlock(&job->lock);
 
if (count) {
int i;
@@ -708,7 +708,7 @@ static void submit_job(struct kcopyd_job
if (job->source.count <= SUB_JOB_SIZE)
dispatch_job(job);
else {
-		mutex_init(&job->lock);
+		spin_lock_init(&job->lock);
job->progress = 0;
split_job(job);
}



[PATCH 13/15] dm kcopyd: support copy offload

2015-12-10 Thread Mikulas Patocka
This patch adds copy offload support to dm-kcopyd. If copy offload fails,
copying is performed using dm-io, just as before.

There is a module parameter "copy_offload" that can be set to enable or
disable this feature. Because the parameter is writable (S_IRUGO |
S_IWUSR), it can also be toggled at runtime via
/sys/module/dm_mod/parameters/copy_offload (dm-kcopyd is built into the
dm-mod module), which is useful for comparing performance with and
without copy offload.

Signed-off-by: Mikulas Patocka <mpato...@redhat.com>

---
 drivers/md/dm-kcopyd.c |   44 ++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 40 insertions(+), 4 deletions(-)

Index: linux-3.18-rc3/drivers/md/dm-kcopyd.c
===================================================================
--- linux-3.18-rc3.orig/drivers/md/dm-kcopyd.c	2014-11-05 18:09:23.000000000 +0100
+++ linux-3.18-rc3/drivers/md/dm-kcopyd.c	2014-11-05 18:13:04.000000000 +0100
@@ -96,6 +96,9 @@ static DEFINE_SPINLOCK(throttle_spinlock
  */
 #define MAX_SLEEPS 10
 
+static bool copy_offload = true;
+module_param(copy_offload, bool, S_IRUGO | S_IWUSR);
+
 static void io_job_start(struct dm_kcopyd_throttle *t)
 {
unsigned throttle, now, difference;
@@ -358,6 +361,8 @@ struct kcopyd_job {
sector_t progress;
 
struct kcopyd_job *master_job;
+
+   struct work_struct copy_work;
 };
 
 static struct kmem_cache *_job_cache;
@@ -709,6 +714,31 @@ static void submit_job(struct kcopyd_job
}
 }
 
+static void copy_offload_work(struct work_struct *work)
+{
+	struct kcopyd_job *job = container_of(work, struct kcopyd_job, copy_work);
+   sector_t copied;
+
+	blkdev_issue_copy(job->source.bdev, job->source.sector,
+			  job->dests[0].bdev, job->dests[0].sector,
+			  job->source.count,
+			  GFP_NOIO | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN,
+			  NULL, NULL, &copied);
+
+   job->source.sector += copied;
+   job->source.count -= copied;
+   job->dests[0].sector += copied;
+   job->dests[0].count -= copied;
+
+   submit_job(job);
+}
+
+static void try_copy_offload(struct kcopyd_job *job)
+{
+	INIT_WORK(&job->copy_work, copy_offload_work);
+	queue_work(job->kc->kcopyd_wq, &job->copy_work);
+}
+
 int dm_kcopyd_copy(struct dm_kcopyd_client *kc, struct dm_io_region *from,
   unsigned int num_dests, struct dm_io_region *dests,
   unsigned int flags, dm_kcopyd_notify_fn fn, void *context)
@@ -733,10 +763,20 @@ int dm_kcopyd_copy(struct dm_kcopyd_clie
job->num_dests = num_dests;
	memcpy(&job->dests, dests, sizeof(*dests) * num_dests);
 
+   job->fn = fn;
+   job->context = context;
+   job->master_job = job;
+
if (from) {
job->source = *from;
job->pages = NULL;
job->rw = READ;
+   if (copy_offload && num_dests == 1 &&
+   bdev_copy_offload(job->source.bdev) &&
+   bdev_copy_offload(job->dests[0].bdev)) {
+   try_copy_offload(job);
+   return 0;
+   }
} else {
		memset(&job->source, 0, sizeof job->source);
job->source.count = job->dests[0].count;
@@ -753,10 +793,6 @@ int dm_kcopyd_copy(struct dm_kcopyd_clie
}
}
 
-   job->fn = fn;
-   job->context = context;
-   job->master_job = job;
-
submit_job(job);
 
return 0;



[PATCH 12/15] dm kcopyd: introduce the function submit_job

2015-12-10 Thread Mikulas Patocka
We move some code into a new function, submit_job. It is needed by the
next patch, which calls submit_job from another place.

Signed-off-by: Mikulas Patocka <mpato...@redhat.com>

---
 drivers/md/dm-kcopyd.c |   19 ++++++++++++-------
 1 file changed, 12 insertions(+), 7 deletions(-)

Index: linux-3.16-rc5/drivers/md/dm-kcopyd.c
===================================================================
--- linux-3.16-rc5.orig/drivers/md/dm-kcopyd.c	2014-07-14 16:45:23.000000000 +0200
+++ linux-3.16-rc5/drivers/md/dm-kcopyd.c	2014-07-14 17:28:36.000000000 +0200
@@ -698,6 +698,17 @@ static void split_job(struct kcopyd_job 
}
 }
 
+static void submit_job(struct kcopyd_job *job)
+{
+   if (job->source.count <= SUB_JOB_SIZE)
+   dispatch_job(job);
+   else {
+		mutex_init(&job->lock);
+   job->progress = 0;
+   split_job(job);
+   }
+}
+
 int dm_kcopyd_copy(struct dm_kcopyd_client *kc, struct dm_io_region *from,
   unsigned int num_dests, struct dm_io_region *dests,
   unsigned int flags, dm_kcopyd_notify_fn fn, void *context)
@@ -746,13 +757,7 @@ int dm_kcopyd_copy(struct dm_kcopyd_clie
job->context = context;
job->master_job = job;
 
-   if (job->source.count <= SUB_JOB_SIZE)
-   dispatch_job(job);
-   else {
-		mutex_init(&job->lock);
-   job->progress = 0;
-   split_job(job);
-   }
+   submit_job(job);
 
return 0;
 }



[PATCH 11/15] dm stripe: support copy

2015-12-10 Thread Mikulas Patocka
Support the copy operation for the stripe target.

In stripe_merge, we verify that the underlying device supports copy. If it
doesn't, we can fail fast without any bio being constructed.

Signed-off-by: Mikulas Patocka <mpato...@redhat.com>

---
 drivers/md/dm-stripe.c |1 +
 1 file changed, 1 insertion(+)

Index: linux-4.3-rc1/drivers/md/dm-stripe.c
===================================================================
--- linux-4.3-rc1.orig/drivers/md/dm-stripe.c	2015-09-14 16:06:36.000000000 +0200
+++ linux-4.3-rc1/drivers/md/dm-stripe.c	2015-09-14 16:23:01.000000000 +0200
@@ -169,6 +169,7 @@ static int stripe_ctr(struct dm_target *
ti->num_flush_bios = stripes;
ti->num_discard_bios = stripes;
ti->num_write_same_bios = stripes;
+   ti->copy_supported = 1;
 
sc->chunk_size = chunk_size;
if (chunk_size & (chunk_size - 1))


[PATCH 15/15] dm kcopyd: call copy offload with asynchronous callback

2015-12-10 Thread Mikulas Patocka
Change dm kcopyd so that it calls blkdev_issue_copy with an asynchronous
callback. There can be a large number of pending kcopyd requests, and
holding a process context for each of them may put too much load on the
workqueue subsystem.

This patch changes it so that blkdev_issue_copy returns right after it has
submitted the requests, and copy_offload_callback is called when the copy
operation finishes.

Signed-off-by: Mikulas Patocka <mpato...@redhat.com>

---
 drivers/md/dm-kcopyd.c |   33 ++++++++++++++-------------------
 1 file changed, 14 insertions(+), 19 deletions(-)

Index: linux-3.16-rc5/drivers/md/dm-kcopyd.c
===================================================================
--- linux-3.16-rc5.orig/drivers/md/dm-kcopyd.c	2014-07-15 19:24:20.000000000 +0200
+++ linux-3.16-rc5/drivers/md/dm-kcopyd.c	2014-07-15 19:24:54.000000000 +0200
@@ -361,8 +361,6 @@ struct kcopyd_job {
sector_t progress;
 
struct kcopyd_job *master_job;
-
-   struct work_struct copy_work;
 };
 
 static struct kmem_cache *_job_cache;
@@ -628,8 +626,9 @@ static void segment_complete(int read_er
struct kcopyd_job *sub_job = (struct kcopyd_job *) context;
struct kcopyd_job *job = sub_job->master_job;
struct dm_kcopyd_client *kc = job->kc;
+   unsigned long flags;
 
-	spin_lock(&job->lock);
+	spin_lock_irqsave(&job->lock, flags);
 
/* update the error */
if (read_err)
@@ -653,7 +652,7 @@ static void segment_complete(int read_er
job->progress += count;
}
}
-	spin_unlock(&job->lock);
+	spin_unlock_irqrestore(&job->lock, flags);
 
if (count) {
int i;
@@ -714,29 +713,25 @@ static void submit_job(struct kcopyd_job
}
 }
 
-static void copy_offload_work(struct work_struct *work)
+static void copy_offload_callback(void *ptr, int error)
 {
-	struct kcopyd_job *job = container_of(work, struct kcopyd_job, copy_work);
-	sector_t copied;
+	struct kcopyd_job *job = ptr;
 
-	blkdev_issue_copy(job->source.bdev, job->source.sector,
-			  job->dests[0].bdev, job->dests[0].sector,
-			  job->source.count,
-			  GFP_NOIO | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN,
-			  NULL, NULL, &copied);
-
-   job->source.sector += copied;
-   job->source.count -= copied;
-   job->dests[0].sector += copied;
-   job->dests[0].count -= copied;
+   job->source.sector += job->progress;
+   job->source.count -= job->progress;
+   job->dests[0].sector += job->progress;
+   job->dests[0].count -= job->progress;
 
submit_job(job);
 }
 
 static void try_copy_offload(struct kcopyd_job *job)
 {
-	INIT_WORK(&job->copy_work, copy_offload_work);
-	queue_work(job->kc->kcopyd_wq, &job->copy_work);
+	blkdev_issue_copy(job->source.bdev, job->source.sector,
+			  job->dests[0].bdev, job->dests[0].sector,
+			  job->source.count,
+			  GFP_NOIO | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN,
+			  copy_offload_callback, job, &job->progress);
 }
 
 int dm_kcopyd_copy(struct dm_kcopyd_client *kc, struct dm_io_region *from,



[PATCH 1/15] block copy: initial XCOPY offload support

2015-12-10 Thread Mikulas Patocka
This is Martin Petersen's xcopy patch
(https://git.kernel.org/cgit/linux/kernel/git/mkp/linux.git/commit/?h=xcopy&id=0bdeed274e16b3038a851552188512071974eea8)
with some bug fixes, ported to the current kernel.

This patch makes it possible to use the SCSI XCOPY command.

We create a bio that has REQ_COPY flag in bi_rw and a bi_copy structure
that defines the source device. The target device is defined in the
bi_bdev and bi_iter.bi_sector.

There is a new BLKCOPY ioctl that makes it possible to use XCOPY from
userspace. The ioctl argument is a pointer to an array of four uint64_t
values.

The first value is the source byte offset, the second value is the
destination byte offset, and the third value is the byte length. The
fourth value is written by the kernel and it represents the number of
bytes that the kernel actually copied.
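
For illustration, a minimal userspace sketch of the ioctl described above
(the device name, offsets and length are only examples; the offsets
presumably have to be aligned to the device's logical block size):

#include <stdio.h>
#include <stdint.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/fs.h>	/* BLKCOPY, added by this patch */

int main(void)
{
	uint64_t arg[4] = {
		0,		/* source byte offset */
		1024 * 1024,	/* destination byte offset */
		1024 * 1024,	/* number of bytes to copy */
		0		/* written by the kernel: bytes actually copied */
	};
	int fd = open("/dev/sdX", O_RDWR);

	if (fd < 0)
		return 1;
	if (ioctl(fd, BLKCOPY, arg) < 0)
		perror("BLKCOPY");
	printf("copied %llu bytes\n", (unsigned long long)arg[3]);
	close(fd);
	return 0;
}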

Signed-off-by: Martin K. Petersen <martin.peter...@oracle.com>
Signed-off-by: Mikulas Patocka <mpato...@redhat.com>

---
 Documentation/ABI/testing/sysfs-block |9 +
 block/bio.c   |2 
 block/blk-core.c  |5 
 block/blk-lib.c   |   95 +++
 block/blk-merge.c |   11 -
 block/blk-settings.c  |   13 +
 block/blk-sysfs.c |   11 +
 block/compat_ioctl.c  |1 
 block/ioctl.c |   50 ++
 drivers/scsi/scsi.c   |   57 +++
 drivers/scsi/sd.c |  271 +-
 drivers/scsi/sd.h |4 
 include/linux/bio.h   |9 -
 include/linux/blk_types.h |   14 +
 include/linux/blkdev.h|   15 +
 include/scsi/scsi_device.h|3 
 include/uapi/linux/fs.h   |1 
 17 files changed, 557 insertions(+), 14 deletions(-)

Index: linux-4.4-rc4/Documentation/ABI/testing/sysfs-block
===================================================================
--- linux-4.4-rc4.orig/Documentation/ABI/testing/sysfs-block	2015-12-10 17:03:59.000000000 +0100
+++ linux-4.4-rc4/Documentation/ABI/testing/sysfs-block	2015-12-10 17:04:30.000000000 +0100
@@ -235,3 +235,12 @@ Description:
write_same_max_bytes is 0, write same is not supported
by the device.
 
+
+What:		/sys/block/<disk>/queue/copy_max_bytes
+Date:		January 2014
+Contact:	Martin K. Petersen <martin.peter...@oracle.com>
+Description:
+   Devices that support copy offloading will set this value
+   to indicate the maximum buffer size in bytes that can be
+   copied in one operation. If the copy_max_bytes is 0 the
+   device does not support copy offload.
Index: linux-4.4-rc4/block/blk-core.c
===================================================================
--- linux-4.4-rc4.orig/block/blk-core.c	2015-12-10 17:03:59.000000000 +0100
+++ linux-4.4-rc4/block/blk-core.c	2015-12-10 17:04:30.000000000 +0100
@@ -1957,6 +1957,11 @@ generic_make_request_checks(struct bio *
goto end_io;
}
 
+   if (bio->bi_rw & REQ_COPY && !bdev_copy_offload(bio->bi_bdev)) {
+   err = -EOPNOTSUPP;
+   goto end_io;
+   }
+
/*
 * Various block parts want %current->io_context and lazy ioc
 * allocation ends up trading a lot of pain for a small amount of
Index: linux-4.4-rc4/block/blk-lib.c
===================================================================
--- linux-4.4-rc4.orig/block/blk-lib.c	2015-12-10 17:03:59.000000000 +0100
+++ linux-4.4-rc4/block/blk-lib.c	2015-12-10 17:04:30.000000000 +0100
@@ -299,3 +299,98 @@ int blkdev_issue_zeroout(struct block_de
return __blkdev_issue_zeroout(bdev, sector, nr_sects, gfp_mask);
 }
 EXPORT_SYMBOL(blkdev_issue_zeroout);
+
+/**
+ * blkdev_issue_copy - queue a copy same operation
+ * @src_bdev:  source blockdev
+ * @src_sector:source sector
+ * @dst_bdev:  destination blockdev
+ * @dst_sector: destination sector
+ * @nr_sects:  number of sectors to copy
+ * @gfp_mask:  memory allocation flags (for bio_alloc)
+ *
+ * Description:
+ *Copy a block range from source device to target device.
+ */
+int blkdev_issue_copy(struct block_device *src_bdev, sector_t src_sector,
+ struct block_device *dst_bdev, sector_t dst_sector,
+ unsigned int nr_sects, gfp_t gfp_mask)
+{
+   DECLARE_COMPLETION_ONSTACK(wait);
+   struct request_queue *sq = bdev_get_queue(src_bdev);
+   struct request_queue *dq = bdev_get_queue(dst_bdev);
+   unsigned int max_copy_sectors;
+   struct bio_batch bb;
+   int ret = 0;
+
+   if (!sq || !dq)
+   return -ENXIO;
+
+   max_copy_sectors = min(sq->limits.max_copy_sectors,
+  dq->limits.max_copy_sectors);
+
+   if (max_copy_sectors == 0)
+		return -EOPNOTSUPP;

[PATCH 10/15] dm linear: support copy

2015-12-10 Thread Mikulas Patocka
Support copy operation in the linear target.

Signed-off-by: Mikulas Patocka <mpato...@redhat.com>

---
 drivers/md/dm-linear.c |1 +
 1 file changed, 1 insertion(+)

Index: linux-4.3-rc1/drivers/md/dm-linear.c
===================================================================
--- linux-4.3-rc1.orig/drivers/md/dm-linear.c	2015-09-14 16:06:36.000000000 +0200
+++ linux-4.3-rc1/drivers/md/dm-linear.c	2015-09-14 16:22:44.000000000 +0200
@@ -59,6 +59,7 @@ static int linear_ctr(struct dm_target *
ti->num_flush_bios = 1;
ti->num_discard_bios = 1;
ti->num_write_same_bios = 1;
+   ti->copy_supported = 1;
ti->private = lc;
return 0;
 



[PATCH 9/15] dm: implement copy

2015-12-10 Thread Mikulas Patocka
This patch implements basic copy support for device mapper core.
Individual targets can enable copy support by setting ti->copy_supported.

A device mapper device advertises copy support if at least one of its
targets supports copy and, for that target, at least one underlying
device supports copy.

Signed-off-by: Mikulas Patocka <mpato...@redhat.com>

---
 drivers/md/dm-table.c         |   16 ++++++++++++++++
 drivers/md/dm.c               |   27 +++++++++++++++++++++++++++
 include/linux/device-mapper.h |    5 +++++
 3 files changed, 48 insertions(+)

Index: linux-4.4-rc4/drivers/md/dm.c
===================================================================
--- linux-4.4-rc4.orig/drivers/md/dm.c	2015-12-10 17:04:05.000000000 +0100
+++ linux-4.4-rc4/drivers/md/dm.c	2015-12-10 17:05:48.000000000 +0100
@@ -1679,6 +1679,31 @@ static int __send_write_same(struct clon
return __send_changing_extent_only(ci, get_num_write_same_bios, NULL);
 }
 
+static int __send_copy(struct clone_info *ci)
+{
+   struct dm_target *ti;
+   sector_t bound;
+
+   ti = dm_table_find_target(ci->map, ci->sector);
+   if (!dm_target_is_valid(ti))
+   return -EIO;
+
+   if (!ti->copy_supported)
+   return -EOPNOTSUPP;
+
+   bound = max_io_len(ci->sector, ti);
+
+   if (unlikely(ci->sector_count > bound))
+   return -EOPNOTSUPP;
+
+   __clone_and_map_simple_bio(ci, ti, 0, NULL);
+
+   ci->sector += ci->sector_count;
+   ci->sector_count = 0;
+
+   return 0;
+}
+
 /*
  * Select the correct strategy for processing a non-flush bio.
  */
@@ -1692,6 +1717,8 @@ static int __split_and_process_non_flush
return __send_discard(ci);
else if (unlikely(bio->bi_rw & REQ_WRITE_SAME))
return __send_write_same(ci);
+   else if (unlikely(bio->bi_rw & REQ_COPY))
+   return __send_copy(ci);
 
ti = dm_table_find_target(ci->map, ci->sector);
if (!dm_target_is_valid(ti))
Index: linux-4.4-rc4/include/linux/device-mapper.h
===================================================================
--- linux-4.4-rc4.orig/include/linux/device-mapper.h	2015-12-10 17:04:05.000000000 +0100
+++ linux-4.4-rc4/include/linux/device-mapper.h	2015-12-10 17:04:59.000000000 +0100
@@ -271,6 +271,11 @@ struct dm_target {
 * Set if this target does not return zeroes on discarded blocks.
 */
bool discard_zeroes_data_unsupported:1;
+
+   /*
+* Set if the target supports XCOPY.
+*/
+   bool copy_supported:1;
 };
 
 /* Each target can link one of these into the table */
Index: linux-4.4-rc4/drivers/md/dm-table.c
===================================================================
--- linux-4.4-rc4.orig/drivers/md/dm-table.c	2015-12-10 17:03:51.000000000 +0100
+++ linux-4.4-rc4/drivers/md/dm-table.c	2015-12-10 17:04:59.000000000 +0100
@@ -286,6 +286,11 @@ static int device_area_is_invalid(struct
limits->logical_block_size >> SECTOR_SHIFT;
char b[BDEVNAME_SIZE];
 
+   if (ti->copy_supported)
+   limits->max_copy_sectors =
+   min_not_zero(limits->max_copy_sectors,
+   bdev_get_queue(bdev)->limits.max_copy_sectors);
+
/*
 * Some devices exist without request functions,
 * such as loop devices not yet bound to backing files.
@@ -1256,6 +1261,13 @@ int dm_calculate_queue_limits(struct dm_
 
ti = dm_table_get_target(table, i++);
 
+   if (ti->begin)
+   ti_limits.copy_boundary = min(ti_limits.copy_boundary,
+   (unsigned char)__ffs64(ti->begin));
+   if (ti->max_io_len)
+   ti_limits.copy_boundary = min(ti_limits.copy_boundary,
+   (unsigned char)__ffs(ti->max_io_len));
+
if (!ti->type->iterate_devices)
goto combine_limits;
 
@@ -1289,6 +1301,10 @@ combine_limits:
   dm_device_name(table->md),
   (unsigned long long) ti->begin,
   (unsigned long long) ti->len);
+
+   limits->max_copy_sectors =
+   min_not_zero(limits->max_copy_sectors,
+   ti_limits.max_copy_sectors);
}
 
return validate_hardware_logical_block_alignment(table, limits);



[PATCH 7/15] scsi xcopy: keep cache of failures

2015-12-10 Thread Mikulas Patocka
If an xcopy between two devices fails, it is pointless to send more xcopy
commands between these two devices, because they take time and will
likely also fail.

This patch keeps a cache of (source_device, destination_device) pairs
where copying failed and makes sure that no xcopy command is sent to such
a pair sooner than 30 seconds after the last failure.

Signed-off-by: Mikulas Patocka <mpato...@redhat.com>

---
 drivers/scsi/sd.c |   37 +++++++++++++++++++++++++++++++++++++
 1 file changed, 37 insertions(+)

Index: linux-4.4-rc4/drivers/scsi/sd.c
===================================================================
--- linux-4.4-rc4.orig/drivers/scsi/sd.c	2015-12-07 16:59:09.000000000 +0100
+++ linux-4.4-rc4/drivers/scsi/sd.c	2015-12-07 16:59:12.000000000 +0100
@@ -939,6 +939,26 @@ static void sd_config_copy(struct scsi_d
   (logical_block_size >> 9));
 }
 
+#define SD_COPY_DISABLED_CACHE_TIME	(HZ * 30)
+#define SD_COPY_DISABLED_CACHE_HASH_BITS	6
+#define SD_COPY_DISABLED_CACHE_HASH	(1 << SD_COPY_DISABLED_CACHE_HASH_BITS)
+
+struct sd_copy_disabled_cache_entry {
+   struct scsi_device *src;
+   struct scsi_device *dst;
+   unsigned long jiffies;
+};
+
+static struct sd_copy_disabled_cache_entry sd_copy_disabled_cache[SD_COPY_DISABLED_CACHE_HASH];
+
+static struct sd_copy_disabled_cache_entry *sd_copy_disabled_cache_hash(
+   struct scsi_device *src, struct scsi_device *dst)
+{
+	return &sd_copy_disabled_cache[
+		hash_long((unsigned long)src + ((unsigned long)dst >> 1),
+			  SD_COPY_DISABLED_CACHE_HASH_BITS)
+	];
+}
+
 static int sd_setup_copy_cmnd(struct scsi_cmnd *cmd)
 {
struct request *rq = cmd->request;
@@ -951,6 +971,7 @@ static int sd_setup_copy_cmnd(struct scs
struct bio *bio = rq->bio;
struct page *page;
unsigned char *buf;
+   struct sd_copy_disabled_cache_entry *e;
 
dst_sdp = scsi_disk(rq->rq_disk)->device;
dst_queue = rq->rq_disk->queue;
@@ -970,6 +991,12 @@ static int sd_setup_copy_cmnd(struct scs
if (src_sdp->sector_size != dst_sdp->sector_size)
return BLKPREP_KILL;
 
+   /* The copy failed in the past, so do not retry it for some time */
+   e = sd_copy_disabled_cache_hash(src_sdp, dst_sdp);
+	if (unlikely(jiffies - ACCESS_ONCE(e->jiffies) < SD_COPY_DISABLED_CACHE_TIME) &&
+	    likely(ACCESS_ONCE(e->src) == src_sdp) && likely(ACCESS_ONCE(e->dst) == dst_sdp))
+		return BLKPREP_KILL;
+
dst_lba = blk_rq_pos(rq) >> (ilog2(dst_sdp->sector_size) - 9);
	src_lba = bio->bi_copy->pair[0]->bi_iter.bi_sector >> (ilog2(src_sdp->sector_size) - 9);
nr_blocks = blk_rq_sectors(rq) >> (ilog2(dst_sdp->sector_size) - 9);
@@ -2003,6 +2030,16 @@ static int sd_done(struct scsi_cmnd *SCp
 */
case EXTENDED_COPY:
if ((SCpnt->cmnd[1] & 0x1f) == 0) {
+   struct sd_copy_disabled_cache_entry *e;
+   struct scsi_device *src_sdp, *dst_sdp;
+
+   dst_sdp = sdkp->device;
+			src_sdp = scsi_disk(req->bio->bi_copy->pair[0]->bi_bdev->bd_disk)->device;
+			e = sd_copy_disabled_cache_hash(src_sdp, dst_sdp);
+   ACCESS_ONCE(e->src) = src_sdp;
+   ACCESS_ONCE(e->dst) = dst_sdp;
+   ACCESS_ONCE(e->jiffies) = jiffies;
+
good_bytes = 0;
req->__data_len = blk_rq_bytes(req);
req->cmd_flags |= REQ_QUIET;


[PATCH 8/15] block copy: introduce "copy_boundary" limits

2015-12-10 Thread Mikulas Patocka
There is no way to split copy requests, so the creator of the requests
(the function blkdev_issue_copy) must make requests with the proper size.
Device mapper splits the requests at a boundary between targets or at
a boundary specified by each target driver. We must make sure that the
copy requests do not cross these boundaries.

This patch introduces a new queue limit "copy_boundary": the log2 of the
boundary, in sectors, that a request must not cross. Device mapper will
use this limit to propagate its requirements through the device stack.
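
As a sketch of the arithmetic this limit implies (the function name is
illustrative; blkdev_issue_copy presumably performs equivalent clamping
when it builds each chunk): with copy_boundary = 7, no copy request may
cross a multiple of 2^7 = 128 sectors, i.e. a 64 KiB boundary with
512-byte sectors.

static sector_t clamp_to_copy_boundary(sector_t sector, sector_t nr_sects,
				       unsigned char copy_boundary)
{
	/* size of the boundary interval, in sectors */
	sector_t boundary = (sector_t)1 << copy_boundary;
	/* distance from "sector" to the next boundary */
	sector_t to_boundary = boundary - (sector & (boundary - 1));

	return min(nr_sects, to_boundary);
}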

Signed-off-by: Mikulas Patocka <mpato...@redhat.com>

---
 block/blk-lib.c        |   21 ++++++++++++++++++++-
 block/blk-settings.c   |   17 +++++++++++++++++
 block/blk-sysfs.c      |   13 +++++++++++++
 include/linux/blkdev.h |    1 +
 4 files changed, 51 insertions(+), 1 deletion(-)

Index: linux-4.4-rc4/block/blk-settings.c
===================================================================
--- linux-4.4-rc4.orig/block/blk-settings.c	2015-12-10 17:04:30.000000000 +0100
+++ linux-4.4-rc4/block/blk-settings.c	2015-12-10 17:04:54.000000000 +0100
@@ -96,6 +96,7 @@ void blk_set_default_limits(struct queue
lim->chunk_sectors = 0;
lim->max_write_same_sectors = 0;
lim->max_copy_sectors = 0;
+   lim->copy_boundary = 63;
lim->max_discard_sectors = 0;
lim->max_hw_discard_sectors = 0;
lim->discard_granularity = 0;
@@ -311,6 +312,18 @@ void blk_queue_max_copy_sectors(struct r
 EXPORT_SYMBOL(blk_queue_max_copy_sectors);
 
 /**
+ * blk_queue_copy_boundary - set a boundary for copy operations. No copy
+ * operation may cross the boundary
+ * @q:  the request queue for the device
+ * @copy_boundary: log2 of the copy boundary in sectors
+ **/
+void blk_queue_copy_boundary(struct request_queue *q,
+unsigned char copy_boundary)
+{
+   q->limits.copy_boundary = copy_boundary;
+}
+
+/**
  * blk_queue_max_segments - set max hw segments for a request for this queue
  * @q:  the request queue for the device
  * @max_segments:  max number of segments
@@ -552,6 +565,10 @@ int blk_stack_limits(struct queue_limits
t->max_segment_size = min_not_zero(t->max_segment_size,
   b->max_segment_size);
 
+   t->copy_boundary = min(t->copy_boundary, b->copy_boundary);
+   if (start)
+		t->copy_boundary = min(t->copy_boundary, (unsigned char)__ffs64(start));
+
t->misaligned |= b->misaligned;
 
alignment = queue_limit_alignment_offset(b, start);
Index: linux-4.4-rc4/block/blk-sysfs.c
===================================================================
--- linux-4.4-rc4.orig/block/blk-sysfs.c	2015-12-10 17:04:30.000000000 +0100
+++ linux-4.4-rc4/block/blk-sysfs.c	2015-12-10 17:04:54.000000000 +0100
@@ -199,6 +199,13 @@ static ssize_t queue_copy_max_show(struc
(unsigned long long)q->limits.max_copy_sectors << 9);
 }
 
+static ssize_t queue_copy_boundary_show(struct request_queue *q, char *page)
+{
+   return sprintf(page, "%llu\n",
+   !q->limits.max_copy_sectors || q->limits.copy_boundary == 63 ?
+   0ULL : 512ULL << q->limits.copy_boundary);
+}
+
 static ssize_t
queue_max_sectors_store(struct request_queue *q, const char *page, size_t count)
 {
@@ -453,6 +460,11 @@ static struct queue_sysfs_entry queue_co
.show = queue_copy_max_show,
 };
 
+static struct queue_sysfs_entry queue_copy_boundary_entry = {
+   .attr = {.name = "copy_boundary_bytes", .mode = S_IRUGO },
+   .show = queue_copy_boundary_show,
+};
+
 static struct queue_sysfs_entry queue_nonrot_entry = {
.attr = {.name = "rotational", .mode = S_IRUGO | S_IWUSR },
.show = queue_show_nonrot,
@@ -509,6 +521,7 @@ static struct attribute *default_attrs[]
 	&queue_discard_zeroes_data_entry.attr,
 	&queue_write_same_max_entry.attr,
 	&queue_copy_max_entry.attr,
+	&queue_copy_boundary_entry.attr,
 	&queue_nonrot_entry.attr,
 	&queue_nomerges_entry.attr,
 	&queue_rq_affinity_entry.attr,
Index: linux-4.4-rc4/include/linux/blkdev.h
===================================================================
--- linux-4.4-rc4.orig/include/linux/blkdev.h	2015-12-10 17:04:46.000000000 +0100
+++ linux-4.4-rc4/include/linux/blkdev.h	2015-12-10 17:04:54.000000000 +0100
@@ -273,6 +273,7 @@ struct queue_limits {
unsigned short  max_segments;
unsigned short  max_integrity_segments;
 
+   unsigned char   copy_boundary;
unsigned char   misaligned;
unsigned char   discard_misaligned;
unsigned char   cluster;
Index: linux-4.4-rc4/block/blk-lib.c
===================================================================
--- linux-4.4-rc4.orig/block/blk-lib.c	2015-12-10 17:04:46.000000000 +0100
+

[PATCH 6/15] scsi xcopy: suppress error messages

2015-12-10 Thread Mikulas Patocka
This patch suppresses error messages when copying between two arrays that
each support XCOPY but cannot copy data between each other.

Signed-off-by: Mikulas Patocka <mpato...@redhat.com>

---
 drivers/scsi/sd.c |   14 ++
 1 file changed, 14 insertions(+)

Index: linux-4.4-rc4/drivers/scsi/sd.c
===================================================================
--- linux-4.4-rc4.orig/drivers/scsi/sd.c	2015-12-07 16:58:48.000000000 +0100
+++ linux-4.4-rc4/drivers/scsi/sd.c	2015-12-07 16:59:09.000000000 +0100
@@ -1995,6 +1995,20 @@ static int sd_done(struct scsi_cmnd *SCp
req->cmd_flags |= REQ_QUIET;
}
}
+   } else if (sshdr.asc == 0x26) {
+   switch (op) {
+   /*
+* Copying between two arrays that support XCOPY, but
+* cannot access each other.
+*/
+   case EXTENDED_COPY:
+   if ((SCpnt->cmnd[1] & 0x1f) == 0) {
+   good_bytes = 0;
+   req->__data_len = blk_rq_bytes(req);
+   req->cmd_flags |= REQ_QUIET;
+   }
+   break;
+   }
}
break;
default:



[PATCH 0/15] copy offload patches

2015-12-10 Thread Mikulas Patocka
Hi

This patch series adds copy offload (the XCOPY command) to the block 
layer, SCSI subsystems and device mapper.

The principle of operation is this:

We create two bios with REQ_COPY flag, one for read and one for write. The 
bios have no data pages and they both point to the same bio_copy 
structure. The bios may be submitted to the same logical volume or to 
different logical volumes. The bios go independently through the device 
mapper stack and when they both reach the physical device (the function 
blk_queue_bio), the bio pair is collected and the XCOPY request is sent to 
the SCSI device.
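
For orientation, the structure that pairs the two bios looks like this
(field list abridged from the diffs in patches 2 and 4; later patches add
fields for error reporting and a pairing timeout):

struct bio_copy {
	int error;		/* 1 while in flight, first bi_error otherwise */
	atomic_t in_flight;	/* 2 until both bios have completed */
	struct bio *pair[2];	/* [0] = read half, [1] = write half */
	void *private;
	spinlock_t spinlock;	/* protects error and pair[] */
};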

Mikulas


[PATCH 2/15] block copy: use two bios

2015-12-10 Thread Mikulas Patocka
This patch changes the architecture of xcopy so that two bios are used.

There used to be just one bio that held pointers to both the source and
destination block device. However, a bio with two block devices cannot
really be passed through block midlayer drivers (dm and md).

When we need to send the XCOPY command, we call the function
blkdev_issue_copy. This function creates two bios, the first with bi_rw
READ | REQ_COPY and the second WRITE | REQ_COPY. The bios have a pointer
to a common bi_copy structure.

These bios travel independently through the block device stack. When both
bios reach the physical disk driver (the function blk_queue_bio), they
are paired, a request is made and the request is sent to the SCSI disk
driver.

It is possible that one of the bios reaches a device that doesn't support
XCOPY, in that case both bios are aborted with an error.

Note that there is no guarantee that the XCOPY command will succeed. If it
doesn't succeed, the caller is supposed to perform the copy manually.

Signed-off-by: Mikulas Patocka <mpato...@redhat.com>

---
 block/bio.c   |   28 +++-
 block/blk-core.c  |   41 
 block/blk-lib.c   |   78 +-
 drivers/scsi/sd.c |7 +---
 include/linux/blk_types.h |   12 +--
 5 files changed, 142 insertions(+), 24 deletions(-)

Index: linux-4.4-rc4/block/blk-lib.c
===================================================================
--- linux-4.4-rc4.orig/block/blk-lib.c	2015-12-10 17:08:11.000000000 +0100
+++ linux-4.4-rc4/block/blk-lib.c	2015-12-10 17:59:40.000000000 +0100
@@ -300,6 +300,38 @@ int blkdev_issue_zeroout(struct block_de
 }
 EXPORT_SYMBOL(blkdev_issue_zeroout);
 
+static void bio_copy_end_io(struct bio *bio)
+{
+   struct bio_copy *bc = bio->bi_copy;
+   if (unlikely(bio->bi_error)) {
+   unsigned long flags;
+   int dir;
+   struct bio *other;
+
+   /* if the other bio is waiting for the pair, release it */
+		spin_lock_irqsave(&bc->spinlock, flags);
+   if (bc->error >= 0)
+   bc->error = bio->bi_error;
+   dir = bio_data_dir(bio);
+   other = bc->pair[dir ^ 1];
+   bc->pair[dir ^ 1] = NULL;
+		spin_unlock_irqrestore(&bc->spinlock, flags);
+   if (other) {
+   other->bi_error = bio->bi_error;
+   bio_endio(other);
+   }
+   }
+   bio_put(bio);
+	if (atomic_dec_and_test(&bc->in_flight)) {
+   struct bio_batch *bb = bc->private;
+   if (unlikely(bc->error < 0) && !ACCESS_ONCE(bb->error))
+   ACCESS_ONCE(bb->error) = bc->error;
+   kfree(bc);
+		if (atomic_dec_and_test(&bb->done))
+   complete(bb->wait);
+   }
+}
+
 /**
  * blkdev_issue_copy - queue a copy same operation
  * @src_bdev:  source blockdev
@@ -346,9 +378,9 @@ int blkdev_issue_copy(struct block_devic
	bb.wait = &wait;
 
while (nr_sects) {
-   struct bio *bio;
+   struct bio *read_bio, *write_bio;
struct bio_copy *bc;
-   unsigned int chunk;
+   unsigned int chunk = min(nr_sects, max_copy_sectors);
 
bc = kmalloc(sizeof(struct bio_copy), gfp_mask);
if (!bc) {
@@ -356,27 +388,43 @@ int blkdev_issue_copy(struct block_devic
break;
}
 
-   bio = bio_alloc(gfp_mask, 1);
-   if (!bio) {
+   read_bio = bio_alloc(gfp_mask, 1);
+   if (!read_bio) {
kfree(bc);
ret = -ENOMEM;
break;
}
 
-   chunk = min(nr_sects, max_copy_sectors);
-
-   bio->bi_iter.bi_sector = dst_sector;
-   bio->bi_iter.bi_size = chunk << 9;
-   bio->bi_end_io = bio_batch_end_io;
-   bio->bi_bdev = dst_bdev;
-	bio->bi_private = &bb;
-   bio->bi_copy = bc;
+   write_bio = bio_alloc(gfp_mask, 1);
+   if (!write_bio) {
+   bio_put(read_bio);
+   kfree(bc);
+   ret = -ENOMEM;
+   break;
+   }
 
-   bc->bic_bdev = src_bdev;
-   bc->bic_sector = src_sector;
+	atomic_set(&bc->in_flight, 2);
+	bc->error = 1;
+	bc->pair[0] = NULL;
+	bc->pair[1] = NULL;
+	bc->private = &bb;
+	spin_lock_init(&bc->spinlock);
+
+   read_bio->bi_iter.bi_sector = src_sector;
+   read_bio->bi_iter.bi_size = chunk << 9;
+ 

[PATCH 4/15] block copy: use a timer to fix a theoretical deadlock

2015-12-10 Thread Mikulas Patocka
The block layer creates two bios for each copy operation. The bios travel
independently through the storage stack and they are paired at the block
device.

There is a theoretical problem with this - the block device stack only
guarantees forward progress for a single bio. When two bios are sent, it
is possible (though very unlikely) that the first bio exhausts some
mempool and the second bio waits until there is free space in the mempool
(and thus it waits until the first bio finishes).

To avoid this deadlock, we introduce a timer. If the two bios are not
paired at the physical block device within 10 seconds, the copy operation
is aborted and the bio that waits to be paired is released with an error.

Note that there is no guarantee that any XCOPY operation will succeed, so
aborting an operation with an error shouldn't cause any problems - the
caller is supposed to perform the copy manually if XCOPY fails.

Signed-off-by: Mikulas Patocka <mpato...@redhat.com>

---
 block/blk-lib.c   |   31 +++
 include/linux/blk_types.h |2 ++
 2 files changed, 33 insertions(+)

Index: linux-4.3/block/blk-lib.c
===================================================================
--- linux-4.3.orig/block/blk-lib.c	2015-11-02 18:43:06.000000000 +0100
+++ linux-4.3/block/blk-lib.c	2015-11-02 18:43:10.000000000 +0100
@@ -300,6 +300,34 @@ int blkdev_issue_zeroout(struct block_de
 }
 EXPORT_SYMBOL(blkdev_issue_zeroout);
 
+#define BLK_COPY_TIMEOUT   (10 * HZ)
+
+static void blk_copy_timeout(unsigned long bc_)
+{
+   struct bio_copy *bc = (struct bio_copy *)bc_;
+   struct bio *bio0 = NULL, *bio1 = NULL;
+
+   WARN_ON(!irqs_disabled());
+
+	spin_lock(&bc->spinlock);	/* the timer is IRQSAFE */
+   if (bc->error == 1) {
+   bc->error = -ETIMEDOUT;
+   bio0 = bc->pair[0];
+   bio1 = bc->pair[1];
+   bc->pair[0] = bc->pair[1] = NULL;
+   }
+	spin_unlock(&bc->spinlock);
+
+   if (bio0) {
+   bio0->bi_error = -ETIMEDOUT;
+   bio_endio(bio0);
+   }
+   if (bio1) {
+   bio1->bi_error = -ETIMEDOUT;
+   bio_endio(bio1);
+   }
+}
+
 static void bio_copy_end_io(struct bio *bio)
 {
struct bio_copy *bc = bio->bi_copy;
@@ -335,6 +363,7 @@ static void bio_copy_end_io(struct bio *
} while (unlikely(atomic64_cmpxchg(bc->first_error,
first_error, bc->offset) != first_error));
}
+		del_timer_sync(&bc->timer);
 		kfree(bc);
 		if (atomic_dec_and_test(&bb->done))
 			complete(bb->wait);
@@ -425,6 +454,8 @@ int blkdev_issue_copy(struct block_devic
bc->first_error = _error;
bc->offset = offset;
spin_lock_init(>spinlock);
+		__setup_timer(&bc->timer, blk_copy_timeout, (unsigned long)bc, TIMER_IRQSAFE);
+		mod_timer(&bc->timer, jiffies + BLK_COPY_TIMEOUT);
 
read_bio->bi_iter.bi_sector = src_sector;
read_bio->bi_iter.bi_size = chunk << 9;
Index: linux-4.3/include/linux/blk_types.h
===================================================================
--- linux-4.3.orig/include/linux/blk_types.h	2015-11-02 18:43:06.000000000 +0100
+++ linux-4.3/include/linux/blk_types.h	2015-11-02 18:43:10.000000000 +0100
@@ -6,6 +6,7 @@
 #define __LINUX_BLK_TYPES_H
 
 #include <linux/types.h>
+#include <linux/timer.h>
 
 struct bio_set;
 struct bio;
@@ -52,6 +53,7 @@ struct bio_copy {
atomic64_t *first_error;
sector_t offset;
spinlock_t spinlock;
+   struct timer_list timer;
 };
 
 /*



[PATCH 3/15] block copy: report the amount of copied data

2015-12-10 Thread Mikulas Patocka
This patch changes blkdev_issue_copy so that it returns the number of
copied sectors in the variable "copied".

The kernel makes a best effort to copy as much data as possible, but
because of device mapper mappings, copying may fail at some stage. If we
just returned the error number, the caller wouldn't know whether all or
only part of the operation failed, and the caller would be required to
redo the whole copy operation.

We return the number of copied sectors so that the caller can skip these
sectors when doing the copy manually. On success (zero return code), the
number of copied sectors is equal to the number of requested sectors. On
error (negative return code), the number of copied sectors is smaller than
the number of requested sectors.

The number of copied bytes is returned as a fourth uint64_t argument in
the BLKCOPY ioctl.
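
A sketch of the intended fallback pattern on the caller's side (the
helper name is hypothetical; dm-kcopyd in the later patches of this
series follows the same scheme, using dm-io for the manual copy):

	sector_t copied;
	int r = blkdev_issue_copy(src_bdev, src_sector, dst_bdev, dst_sector,
				  nr_sects, GFP_KERNEL, &copied);

	/* skip the sectors that XCOPY already moved */
	src_sector += copied;
	dst_sector += copied;
	nr_sects -= copied;

	if (r)
		/* copy the remaining nr_sects manually */
		fall_back_to_manual_copy(src_bdev, src_sector,
					 dst_bdev, dst_sector, nr_sects);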

Signed-off-by: Mikulas Patocka <mpato...@redhat.com>

---
 block/blk-lib.c   |   30 +-
 block/ioctl.c |   25 -
 include/linux/blk_types.h |2 ++
 include/linux/blkdev.h|3 ++-
 4 files changed, 45 insertions(+), 15 deletions(-)

Index: linux-4.4-rc4/block/blk-lib.c
===================================================================
--- linux-4.4-rc4.orig/block/blk-lib.c	2015-12-10 17:04:38.000000000 +0100
+++ linux-4.4-rc4/block/blk-lib.c	2015-12-10 17:04:40.000000000 +0100
@@ -324,8 +324,17 @@ static void bio_copy_end_io(struct bio *
bio_put(bio);
	if (atomic_dec_and_test(&bc->in_flight)) {
struct bio_batch *bb = bc->private;
-   if (unlikely(bc->error < 0) && !ACCESS_ONCE(bb->error))
-   ACCESS_ONCE(bb->error) = bc->error;
+   if (unlikely(bc->error < 0)) {
+   u64 first_error;
+   if (!ACCESS_ONCE(bb->error))
+   ACCESS_ONCE(bb->error) = bc->error;
+   do {
+   first_error = atomic64_read(bc->first_error);
+   if (bc->offset >= first_error)
+   break;
+   } while (unlikely(atomic64_cmpxchg(bc->first_error,
+   first_error, bc->offset) != first_error));
+   }
kfree(bc);
		if (atomic_dec_and_test(&bb->done))
complete(bb->wait);
@@ -346,7 +355,7 @@ static void bio_copy_end_io(struct bio *
  */
 int blkdev_issue_copy(struct block_device *src_bdev, sector_t src_sector,
  struct block_device *dst_bdev, sector_t dst_sector,
- unsigned int nr_sects, gfp_t gfp_mask)
+ sector_t nr_sects, gfp_t gfp_mask, sector_t *copied)
 {
DECLARE_COMPLETION_ONSTACK(wait);
struct request_queue *sq = bdev_get_queue(src_bdev);
@@ -354,6 +363,11 @@ int blkdev_issue_copy(struct block_devic
unsigned int max_copy_sectors;
struct bio_batch bb;
int ret = 0;
+   atomic64_t first_error = ATOMIC64_INIT(nr_sects);
+   sector_t offset = 0;
+
+   if (copied)
+   *copied = 0;
 
if (!sq || !dq)
return -ENXIO;
@@ -377,10 +391,10 @@ int blkdev_issue_copy(struct block_devic
bb.error = 0;
	bb.wait = &wait;
 
-   while (nr_sects) {
+   while (nr_sects && !ACCESS_ONCE(bb.error)) {
struct bio *read_bio, *write_bio;
struct bio_copy *bc;
-   unsigned int chunk = min(nr_sects, max_copy_sectors);
+		unsigned chunk = (unsigned)min(nr_sects, (sector_t)max_copy_sectors);
 
bc = kmalloc(sizeof(struct bio_copy), gfp_mask);
if (!bc) {
@@ -408,6 +422,8 @@ int blkdev_issue_copy(struct block_devic
bc->pair[0] = NULL;
bc->pair[1] = NULL;
 		bc->private = &bb;
+		bc->first_error = &first_error;
+		bc->offset = offset;
 		spin_lock_init(&bc->spinlock);
 
read_bio->bi_iter.bi_sector = src_sector;
@@ -429,12 +445,16 @@ int blkdev_issue_copy(struct block_devic
src_sector += chunk;
dst_sector += chunk;
nr_sects -= chunk;
+   offset += chunk;
}
 
/* Wait for bios in-flight */
	if (!atomic_dec_and_test(&bb.done))
		wait_for_completion_io(&wait);
 
+   if (copied)
+		*copied = min((sector_t)atomic64_read(&first_error), offset);
+
if (likely(!ret))
ret = bb.error;
 
Index: linux-4.4-rc4/include/linux/blk_types.h
===================================================================
--- linux-4.4-rc4.orig/include/linux/blk_types.h	2015-12-10 17:04:38.000000000 +0100
+++ linux-4.4-rc4/include/linux/blk_types.h	2015-12-10 17:04:

[PATCH 5/15] block copy: use asynchronous notification

2015-12-10 Thread Mikulas Patocka
In the dm-snapshot target there may be a large number of copy requests in
progress. If every pending copy request consumed a process context, it
would put too much load on the system.

To avoid this load, we need asynchronous notification when a copy
finishes - we can pass a callback to the function blkdev_issue_copy. If
the callback is non-NULL, blkdev_issue_copy returns as soon as it has
submitted all the copy bios, and the callback is called when the copy
operation finishes.

With the callback mechanism, there can be a large number of in-progress
copy requests and we do not need a process context for each of them.
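
An illustrative caller (my_copy_done, my_request and complete_my_request
are hypothetical; dm-kcopyd in patch 15 is the real user of this
interface):

static void my_copy_done(void *data, int error)
{
	/* runs asynchronously once all copy bios have completed */
	struct my_request *req = data;

	complete_my_request(req, error);
}

	...
	/* returns right after submission; my_copy_done reports the result */
	blkdev_issue_copy(src_bdev, src_sector, dst_bdev, dst_sector,
			  nr_sects, GFP_NOIO, my_copy_done, req, NULL);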

Signed-off-by: Mikulas Patocka <mpato...@redhat.com>

---
 block/blk-lib.c   |  148 +-
 block/ioctl.c |2 
 include/linux/blk_types.h |5 -
 include/linux/blkdev.h|2 
 4 files changed, 112 insertions(+), 45 deletions(-)

Index: linux-4.4-rc4/block/blk-lib.c
===================================================================
--- linux-4.4-rc4.orig/block/blk-lib.c	2015-12-10 17:04:45.000000000 +0100
+++ linux-4.4-rc4/block/blk-lib.c	2015-12-10 17:04:46.000000000 +0100
@@ -300,6 +300,17 @@ int blkdev_issue_zeroout(struct block_de
 }
 EXPORT_SYMBOL(blkdev_issue_zeroout);
 
+struct bio_copy_batch {
+   atomic_long_t done;
+   int async_error;
+   int sync_error;
+   sector_t sync_copied;
+   atomic64_t first_error;
+   void (*callback)(void *data, int error);
+   void *data;
+   sector_t *copied;
+};
+
 #define BLK_COPY_TIMEOUT   (10 * HZ)
 
 static void blk_copy_timeout(unsigned long bc_)
@@ -328,6 +339,18 @@ static void blk_copy_timeout(unsigned lo
}
 }
 
+static void blk_copy_batch_finish(struct bio_copy_batch *batch)
+{
+   void (*fn)(void *, int) = batch->callback;
+   void *data = batch->data;
+	int error = unlikely(batch->sync_error) ? batch->sync_error : batch->async_error;
+	if (batch->copied)
+		*batch->copied = min(batch->sync_copied, (sector_t)atomic64_read(&batch->first_error));
+   kfree(batch);
+   if (fn)
+   fn(data, error);
+}
+
 static void bio_copy_end_io(struct bio *bio)
 {
struct bio_copy *bc = bio->bi_copy;
@@ -351,25 +374,37 @@ static void bio_copy_end_io(struct bio *
}
bio_put(bio);
	if (atomic_dec_and_test(&bc->in_flight)) {
-   struct bio_batch *bb = bc->private;
+   struct bio_copy_batch *batch = bc->batch;
if (unlikely(bc->error < 0)) {
u64 first_error;
-   if (!ACCESS_ONCE(bb->error))
-   ACCESS_ONCE(bb->error) = bc->error;
+   if (!ACCESS_ONCE(batch->async_error))
+   ACCESS_ONCE(batch->async_error) = bc->error;
do {
-   first_error = atomic64_read(bc->first_error);
+				first_error = atomic64_read(&batch->first_error);
if (bc->offset >= first_error)
break;
-   } while (unlikely(atomic64_cmpxchg(bc->first_error,
+			} while (unlikely(atomic64_cmpxchg(&batch->first_error,
first_error, bc->offset) != first_error));
}
		del_timer_sync(&bc->timer);
kfree(bc);
-		if (atomic_dec_and_test(&bb->done))
-			complete(bb->wait);
+		if (atomic_long_dec_and_test(&batch->done))
+   blk_copy_batch_finish(batch);
}
 }
 
+struct bio_copy_completion {
+   struct completion wait;
+   int error;
+};
+
+static void bio_copy_sync_callback(void *ptr, int error)
+{
+   struct bio_copy_completion *comp = ptr;
+   comp->error = error;
+	complete(&comp->wait);
+}
+
 /**
  * blkdev_issue_copy - queue a copy same operation
  * @src_bdev:  source blockdev
@@ -384,57 +419,83 @@ static void bio_copy_end_io(struct bio *
  */
 int blkdev_issue_copy(struct block_device *src_bdev, sector_t src_sector,
  struct block_device *dst_bdev, sector_t dst_sector,
- sector_t nr_sects, gfp_t gfp_mask, sector_t *copied)
+ sector_t nr_sects, gfp_t gfp_mask,
+ void (*callback)(void *, int), void *data,
+ sector_t *copied)
 {
DECLARE_COMPLETION_ONSTACK(wait);
struct request_queue *sq = bdev_get_queue(src_bdev);
struct request_queue *dq = bdev_get_queue(dst_bdev);
unsigned int max_copy_sectors;
-   struct bio_batch bb;
-   int ret = 0;
-   atomic64_t first_error = ATOMIC64_INIT(nr_sects);
-   sector_t offset = 0;
+   int ret;
+   struct bio_copy_batch *batch;
+   struct bio_copy_completion comp

[PATCH 4.1 v2] target-core: fix return without a value

2015-11-11 Thread Mikulas Patocka
Fix the warning drivers/target/target_core_pr.c:332:3: warning: 'return'
with no value, in function returning non-void [-Wreturn-type]

The patch 35afa65642a9a88c81913377b93a3a66220f8b9d committed to 4.1.11
adds a check if device_list is NULL. The patch adds a return statement
without a value to the function core_scsi3_pr_seq_non_holder that returns
int.

This patch has no upstream equivalent because the patch 35afa656 that 
introduced this bug has also no upstream equivalent. The code in upstream 
was already refactored, so neither 35afa656 nor this patch is needed 
there.

Signed-off-by: Mikulas Patocka <mpato...@redhat.com>

---
 drivers/target/target_core_pr.c |2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

Index: linux-4.1.13/drivers/target/target_core_pr.c
===================================================================
--- linux-4.1.13.orig/drivers/target/target_core_pr.c	2015-10-23 18:25:26.000000000 +0200
+++ linux-4.1.13/drivers/target/target_core_pr.c	2015-11-10 19:22:07.000000000 +0100
@@ -329,7 +329,7 @@ static int core_scsi3_pr_seq_non_holder(
 * RESERVATION CONFLICT on some CDBs */
 
if (!se_sess->se_node_acl->device_list)
-   return;
+   return -EINVAL;
 
se_deve = se_sess->se_node_acl->device_list[cmd->orig_fe_lun];
/*


Re: [PATCH 4.1] target-core: fix return without a value

2015-11-11 Thread Mikulas Patocka


On Tue, 10 Nov 2015, Greg Kroah-Hartman wrote:

> On Tue, Nov 10, 2015 at 06:31:47PM -0500, Mikulas Patocka wrote:
> > 
> > 
> > On Tue, 10 Nov 2015, Greg Kroah-Hartman wrote:
> > 
> > > On Tue, Nov 10, 2015 at 01:32:10PM -0500, Mikulas Patocka wrote:
> > > > Fix the warning drivers/target/target_core_pr.c:332:3: warning: 'return'
> > > > with no value, in function returning non-void [-Wreturn-type]
> > > > 
> > > > The patch 35afa65642a9a88c81913377b93a3a66220f8b9d committed to 4.1.11
> > > > adds a check if device_list is NULL. The patch adds a return statement
> > > > without a value to the function core_scsi3_pr_seq_non_holder that 
> > > > returns
> > > > int.
> > > > 
> > > > Signed-off-by: Mikulas Patocka <mpato...@redhat.com>
> > > > 
> > > > ---
> > > >  drivers/target/target_core_pr.c |2 +-
> > > >  1 file changed, 1 insertion(+), 1 deletion(-)
> > > 
> > > 
> > > 
> > > This is not the correct way to submit patches for inclusion in the
> > > stable kernel tree.  Please read Documentation/stable_kernel_rules.txt
> > > for how to do this properly.
> > > 
> > > 
> > 
> > This patch has no upstream equivalent (because the code in upstream was 
> > already refactored), so none of the rules in stable_kernel_rules.txt apply 
> > to it. The patch that broke it also has no upstream equivalent.
> >
> 
> Then you need to say that!
> 
> And the fact that a "please take this patch, it's correct but doesn't
> apply to upstream" patch is now broken is proof of why I didn't want to
> take it in the first place!
> 
> Ugh.  How about I just revert the original patch and then you send me
> backports of what is actually in Linus's tree so we don't have this
> problem anymore?

The original patch that caused this compilation warning fixes some 
crashes, so you should not revert it. You can work with subsystem 
maintainers on backporting the refactored code from upstream to 4.1 - I 
don't know how hard or possible it is.

Mikulas

> Oh, and you got the stable mailing list address wrong as well :(
> 
> Please fix all of this up and resend.
> 
> thanks,
> 
> greg k-h


[PATCH 4.1] target-core: fix return without a value

2015-11-10 Thread Mikulas Patocka
Fix the warning drivers/target/target_core_pr.c:332:3: warning: 'return'
with no value, in function returning non-void [-Wreturn-type]

The patch 35afa65642a9a88c81913377b93a3a66220f8b9d committed to 4.1.11
adds a check if device_list is NULL. The patch adds a return statement
without a value to the function core_scsi3_pr_seq_non_holder that returns
int.

Signed-off-by: Mikulas Patocka <mpato...@redhat.com>

---
 drivers/target/target_core_pr.c |2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

Index: linux-4.1.13/drivers/target/target_core_pr.c
===================================================================
--- linux-4.1.13.orig/drivers/target/target_core_pr.c	2015-10-23 18:25:26.000000000 +0200
+++ linux-4.1.13/drivers/target/target_core_pr.c	2015-11-10 19:22:07.000000000 +0100
@@ -329,7 +329,7 @@ static int core_scsi3_pr_seq_non_holder(
 * RESERVATION CONFLICT on some CDBs */
 
if (!se_sess->se_node_acl->device_list)
-   return;
+   return -EINVAL;
 
se_deve = se_sess->se_node_acl->device_list[cmd->orig_fe_lun];
/*


Re: [PATCH 4.1] target-core: fix return without a value

2015-11-10 Thread Mikulas Patocka


On Tue, 10 Nov 2015, Greg Kroah-Hartman wrote:

> On Tue, Nov 10, 2015 at 01:32:10PM -0500, Mikulas Patocka wrote:
> > Fix the warning drivers/target/target_core_pr.c:332:3: warning: 'return'
> > with no value, in function returning non-void [-Wreturn-type]
> > 
> > The patch 35afa65642a9a88c81913377b93a3a66220f8b9d committed to 4.1.11
> > adds a check if device_list is NULL. The patch adds a return statement
> > without a value to the function core_scsi3_pr_seq_non_holder that returns
> > int.
> > 
> > Signed-off-by: Mikulas Patocka <mpato...@redhat.com>
> > 
> > ---
> >  drivers/target/target_core_pr.c |2 +-
> >  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> 
> 
> This is not the correct way to submit patches for inclusion in the
> stable kernel tree.  Please read Documentation/stable_kernel_rules.txt
> for how to do this properly.
> 
> 

This patch has no upstream equivalent (because the code in upstream was 
already refactored), so none of the rules in stable_kernel_rules.txt apply 
to it. The patch that broke it also has no upstream equivalent.


Documentation/stable_kernel_rules.txt

   --- Option 1 ---

   To have the patch automatically included in the stable tree, add the tag
 Cc: sta...@vger.kernel.org
   in the sign-off area. Once the patch is merged it will be applied to
   the stable tree without anything else needing to be done by the author
   or subsystem maintainer.

   --- Option 2 ---

   After the patch has been merged to Linus' tree, send an email to
   sta...@vger.kernel.org containing the subject of the patch, the commit ID,
   why you think it should be applied, and what kernel version you wish it to
   be applied to.

   --- Option 3 ---

   Send the patch, after verifying that it follows the above rules, to
   sta...@vger.kernel.org.  You must note the upstream commit ID in the
   changelog of your submission, as well as the kernel version you wish
   it to be applied to.



[PATCH] sg: fix overflow in timeout calculation

2015-02-09 Thread Mikulas Patocka
USER_HZ may be greater than HZ, and in that case the arithmetic that
calculates the maximum accepted timeout overflows (for example, with
USER_HZ = 1000 and HZ = 250, MULDIV(INT_MAX, USER_HZ, HZ) is roughly four
times INT_MAX). We need to use INT_MAX in this case.

Signed-off-by: Mikulas Patocka <mpato...@redhat.com>
Cc: sta...@vger.kernel.org

---
 drivers/scsi/sg.c |   10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

Index: linux-3.17-rc6/drivers/scsi/sg.c
===================================================================
--- linux-3.17-rc6.orig/drivers/scsi/sg.c	2014-09-26 20:10:22.000000000 +0200
+++ linux-3.17-rc6/drivers/scsi/sg.c	2014-09-26 20:10:50.000000000 +0200
@@ -90,6 +90,12 @@ static void sg_proc_cleanup(void);
  */
 #define MULDIV(X,MUL,DIV) ((((X % DIV) * MUL) / DIV) + ((X / DIV) * MUL))
 
+#if USER_HZ < HZ
+#define MAX_USER_TIMEOUT MULDIV (INT_MAX, USER_HZ, HZ)
+#else
+#define MAX_USER_TIMEOUT INT_MAX
+#endif
+
 #define SG_DEFAULT_TIMEOUT MULDIV(SG_DEFAULT_TIMEOUT_USER, HZ, USER_HZ)
 
 int sg_big_buff = SG_DEF_RESERVED_SIZE;
@@ -892,8 +898,8 @@ sg_ioctl(struct file *filp, unsigned int
return result;
 	if (val < 0)
 		return -EIO;
-	if (val >= MULDIV (INT_MAX, USER_HZ, HZ))
-		val = MULDIV (INT_MAX, USER_HZ, HZ);
+	if (val >= MAX_USER_TIMEOUT)
+		val = MAX_USER_TIMEOUT;
 	sfp->timeout_user = val;
 	sfp->timeout = MULDIV (val, HZ, USER_HZ);
 


Re: [dm-devel] [PATCH 0/18] SCSI XCOPY support for the kernel and device mapper

2014-10-31 Thread Mikulas Patocka


On Sun, 26 Oct 2014, Bart Van Assche wrote:

> On 10/22/14 15:24, Mikulas Patocka wrote:
> > I uploaded the new version of SCSI/device-mapper XCOPY patches here:
> > http://people.redhat.com/~mpatocka/patches/kernel/xcopy/series.html
> >
> > I am also sending the patches in the following emails.
>
> Hello Mikulas,
>
> Which is the base commit of this patch series ? Patch 5/18 fails to apply on
> top of kernel v3.18-rc1 + patches 1..4.
>
> Thanks,
>
> Bart.

Hi

I have just tried it and it applies, on 3.18-rc1, 3.18-rc2 or current git 
head.

# quilt push
Applying patch scsi-xcopy-2bios.patch
patching file block/blk-lib.c
patching file include/linux/blk_types.h
patching file block/bio.c
Hunk #2 succeeded at 567 (offset 35 lines).
Hunk #3 succeeded at 1752 (offset 35 lines).
Hunk #4 succeeded at 1799 (offset 35 lines).
patching file block/blk-core.c
patching file drivers/scsi/sd.c

Now at patch scsi-xcopy-2bios.patch

Mikulas


Re: [PATCH 4/18] block copy: initial XCOPY offload support

2014-10-31 Thread Mikulas Patocka


On Wed, 22 Oct 2014, Douglas Gilbert wrote:

> See below ...
>
> Perhaps you are checking somewhere else ... EXTENDED_COPY
> opcode (0x83) has had service actions since SPC-4 rev 34
> (February 2012). By the time SPC-4 becomes standardized
> this command will most likely be called EXTENDED COPY(LID1)
> and will be opcode=0x83, service_action=0.
>
> To change the level of confusion, opcode 0x83 itself has
> been renamed Third-party Copy OUT. Not sure why T10 went
> for mixed capitalization here when most other opcode names
> are in upper case, perhaps to stress that it was an opcode
> shared by several commands.
>
> So please add a further check for ((cmd[1] & 0x1f) == 0)
> unless that has been done elsewhere. With that in place
> a COPY OPERATION ABORT [opcode=0x83, service_action=0x1c]
> issued from my ddptctl utility won't trigger this code.

OK, I changed that. BTW, what happens if that code path is executed as a
result of a command submitted via SG_IO? Is it correct to call sd_config_*
or modify req->__data_len in that case? The WRITE_SAME path modifies
req->__data_len too.

> There is a T10 proposal to drop EXTENDED COPY(LID1) in
> SPC-5 in favour of EXTENDED COPY(LID4) [opcode=0x83,
> service_action=0x1] which can be considered as a
> superset of the former. Something to think about for
> the future; perhaps a comment.
>
> And to push my own barrow here, have you considered
> token based copies based on POPULATE TOKEN and WRITE
> USING TOKEN? If not I can continue to use FreeBSD and
> FreeNAS as they have implemented them (plus the LID4
> equivalent of what is being presented here).

Is there some software iSCSI implementation that supports these commands? 
Target core doesn't seem to support them. If it doesn't support them, I 
can't test it.

> and 
> 
> > 	case UNMAP:
> > 		sd_config_discard(sdkp, SD_LBP_DISABLE);
> > 		break;
> > @@ -2745,6 +2910,105 @@ static void sd_read_write_same(struct sc
> > 	sdkp->ws10 = 1;
> >  }
> > 
> > +static void sd_read_copy_operations(struct scsi_disk *sdkp,
> > +				    unsigned char *buffer)
> > +{
> > +	struct scsi_device *sdev = sdkp->device;
> > +	struct scsi_sense_hdr sshdr;
> > +	unsigned char cdb[16];
> > +	unsigned int result, len, i;
> > +	bool b2b_desc = false, id_desc = false;
> > +
> > +	if (sdev->naa_len == 0)
> > +		return;
> > +
> > +	/* Verify that the device has 3PC set in INQUIRY response */
> > +	if (sdev->inquiry_len < 6 || (sdev->inquiry[5] & (1 << 3)) == 0)
> > +		return;
> > +
> > +	/* Receive Copy Operation Parameters */
> > +	memset(cdb, 0, 16);
> > +	cdb[0] = RECEIVE_COPY_RESULTS;
> 
> This is now the Third-party Copy IN opcode [0x84]. In this
> case RECEIVE_COPY_RESULTS is deceptive as this is _not_ the
> command being built.
> 
> > +	cdb[1] = 0x3;
> 
> with service action 0x3 which is the RECEIVE COPY OPERATION
> PARAMETERS command. Opcode 0x84 has had service actions for
> a lot longer than opcode 0x83, but the original naming lingers
> on.
> 
> The code is correct, the naming could be clearer.
> 
> Doug Gilbert

SPC-4 lists the command as RECEIVE_COPY_RESULTS and Linux already uses 
that name in the target-core driver.

Mikulas


[PATCH 0/18] SCSI XCOPY support for the kernel and device mapper

2014-10-22 Thread Mikulas Patocka
Hi

I uploaded the new version of SCSI/device-mapper XCOPY patches here: 
http://people.redhat.com/~mpatocka/patches/kernel/xcopy/series.html

I am also sending the patches in the following emails.

Mikulas


[PATCH 1/18] dm: remove num_write_bios

2014-10-22 Thread Mikulas Patocka
The target can set the function num_write_bios - dm will issue this
callback to ask the target how many bios it wants to receive.

This was intended for the dm-cache target, but it is not useable due to a
race condition (see the description of
e2e74d617eadc15f601983270c4f4a6935c5a943). num_write_bios is unused, so we
remove it.

Note that we deliberately leave the for loop in __clone_and_map_data_bio -
it will be used in the next patch.

Signed-off-by: Mikulas Patocka <mpato...@redhat.com>

---
 drivers/md/dm.c   |6 --
 include/linux/device-mapper.h |   15 ---
 2 files changed, 21 deletions(-)

Index: linux-3.18-rc1/drivers/md/dm.c
===
--- linux-3.18-rc1.orig/drivers/md/dm.c 2014-10-21 00:53:43.0 +0200
+++ linux-3.18-rc1/drivers/md/dm.c  2014-10-21 00:53:47.0 +0200
@@ -1433,12 +1433,6 @@ static void __clone_and_map_data_bio(str
unsigned target_bio_nr;
unsigned num_target_bios = 1;
 
-   /*
-* Does the target want to receive duplicate copies of the bio?
-*/
-	if (bio_data_dir(bio) == WRITE && ti->num_write_bios)
-		num_target_bios = ti->num_write_bios(ti, bio);
-
	for (target_bio_nr = 0; target_bio_nr < num_target_bios; target_bio_nr++) {
		tio = alloc_tio(ci, ti, target_bio_nr);
		tio->len_ptr = len;
Index: linux-3.18-rc1/include/linux/device-mapper.h
===
--- linux-3.18-rc1.orig/include/linux/device-mapper.h   2014-10-21 
00:53:43.0 +0200
+++ linux-3.18-rc1/include/linux/device-mapper.h2014-10-21 
00:53:47.0 +0200
@@ -184,14 +184,6 @@ struct target_type {
 #define DM_TARGET_IMMUTABLE0x0004
 #define dm_target_is_immutable(type)	((type)->features & DM_TARGET_IMMUTABLE)
 
-/*
- * Some targets need to be sent the same WRITE bio severals times so
- * that they can send copies of it to different devices.  This function
- * examines any supplied bio and returns the number of copies of it the
- * target requires.
- */
-typedef unsigned (*dm_num_write_bios_fn) (struct dm_target *ti, struct bio *bio);
-
 struct dm_target {
struct dm_table *table;
struct target_type *type;
@@ -231,13 +223,6 @@ struct dm_target {
 */
unsigned per_bio_data_size;
 
-   /*
-* If defined, this function is called to find out how many
-* duplicate bios should be sent to the target when writing
-* data.
-*/
-   dm_num_write_bios_fn num_write_bios;
-
/* target specific data */
void *private;
 



[PATCH 3/18] blk-lib: fix error reporting

2014-10-22 Thread Mikulas Patocka
The function bio_batch_end_io ignores -EOPNOTSUPP. It doesn't matter for
discard (the device isn't required to discard anything, so missing the
error code and reporting success shouldn't cause any trouble). However,
for WRITE SAME command, missing the error code is obviously wrong. It may
fool the user into thinking that the data were written while in fact they
weren't.

Note that in device mapper, devices may be dynamically reconfigured, so a
device that supports WRITE SAME may stop supporting it at any time and
return -EOPNOTSUPP. Ignoring -EOPNOTSUPP is wrong.

This patch changes bio_batch->flags to an error field and stores the last
error there - so that the error is reported accurately and it isn't
ignored.
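
For illustration, this is the kind of caller the fix matters for - a
hedged, hypothetical sketch (not part of this patch); without the
propagated error there is nothing to trigger the manual fallback:

/* hypothetical caller: fall back to writing zeroes by hand when the
 * device stops supporting WRITE SAME */
static int zero_range(struct block_device *bdev, sector_t sector,
		      sector_t nr_sects, gfp_t gfp_mask)
{
	int ret = blkdev_issue_write_same(bdev, sector, nr_sects, gfp_mask,
					  ZERO_PAGE(0));
	if (ret == -EOPNOTSUPP)
		ret = blkdev_issue_zeroout(bdev, sector, nr_sects, gfp_mask);
	return ret;
}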

Signed-off-by: Mikulas Patocka <mpato...@redhat.com>
Cc: sta...@vger.kernel.org

---
 block/blk-lib.c |   25 -
 1 file changed, 12 insertions(+), 13 deletions(-)

Index: linux-3.16-rc3/block/blk-lib.c
===
--- linux-3.16-rc3.orig/block/blk-lib.c 2014-07-03 18:17:21.0 +0200
+++ linux-3.16-rc3/block/blk-lib.c  2014-07-03 18:36:52.0 +0200
@@ -11,7 +11,7 @@
 
 struct bio_batch {
atomic_tdone;
-   unsigned long   flags;
+   int error;
struct completion   *wait;
 };
 
@@ -19,8 +19,8 @@ static void bio_batch_end_io(struct bio 
 {
	struct bio_batch *bb = bio->bi_private;

-	if (err && (err != -EOPNOTSUPP))
-		clear_bit(BIO_UPTODATE, &bb->flags);
+	if (unlikely(err))
+		ACCESS_ONCE(bb->error) = err;
	if (atomic_dec_and_test(&bb->done))
		complete(bb->wait);
bio_put(bio);
@@ -78,7 +78,7 @@ int blkdev_issue_discard(struct block_de
}
 
	atomic_set(&bb.done, 1);
-	bb.flags = 1 << BIO_UPTODATE;
+	bb.error = 0;
	bb.wait = &wait;

	blk_start_plug(&plug);
@@ -134,8 +134,8 @@ int blkdev_issue_discard(struct block_de
	if (!atomic_dec_and_test(&bb.done))
		wait_for_completion_io(&wait);

-	if (!test_bit(BIO_UPTODATE, &bb.flags))
-		ret = -EIO;
+	if (likely(!ret))
+		ret = bb.error;
 
return ret;
 }
@@ -172,7 +172,7 @@ int blkdev_issue_write_same(struct block
return -EOPNOTSUPP;
 
	atomic_set(&bb.done, 1);
-	bb.flags = 1 << BIO_UPTODATE;
+	bb.error = 0;
	bb.wait = &wait;

	while (nr_sects) {
@@ -208,8 +208,8 @@ int blkdev_issue_write_same(struct block
	if (!atomic_dec_and_test(&bb.done))
		wait_for_completion_io(&wait);

-	if (!test_bit(BIO_UPTODATE, &bb.flags))
-		ret = -ENOTSUPP;
+	if (likely(!ret))
+		ret = bb.error;
 
return ret;
 }
@@ -236,7 +236,7 @@ static int __blkdev_issue_zeroout(struct
DECLARE_COMPLETION_ONSTACK(wait);
 
	atomic_set(&bb.done, 1);
-	bb.flags = 1 << BIO_UPTODATE;
+	bb.error = 0;
	bb.wait = &wait;

	ret = 0;
@@ -270,9 +270,8 @@ static int __blkdev_issue_zeroout(struct
	if (!atomic_dec_and_test(&bb.done))
		wait_for_completion_io(&wait);

-	if (!test_bit(BIO_UPTODATE, &bb.flags))
-		/* One of bios in the batch was completed with error.*/
-		ret = -EIO;
+	if (likely(!ret))
+		ret = bb.error;
 
return ret;
 }



[PATCH 2/18] dm: introduce dm_ask_for_duplicate_bios

2014-10-22 Thread Mikulas Patocka
This function can be used if the target needs to receive another duplicate
of the current bio.
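
A hedged sketch of how a target's map function might use it (the target
and the helper functions are hypothetical, for illustration only):

/* hypothetical target: ask dm for one extra duplicate of each write so
 * that it can be sent to a second device; only legal from the map routine */
static int example_map(struct dm_target *ti, struct bio *bio)
{
	if (bio_data_dir(bio) == WRITE && example_needs_second_copy(ti, bio))
		dm_ask_for_duplicate_bios(bio, 1);

	bio->bi_bdev = example_pick_bdev(ti, bio);	/* hypothetical helper */
	return DM_MAPIO_REMAPPED;
}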

Signed-off-by: Mikulas Patocka <mpato...@redhat.com>

---
 drivers/md/dm.c   |   24 +++-
 include/linux/device-mapper.h |2 ++
 2 files changed, 21 insertions(+), 5 deletions(-)

Index: linux-3.18-rc1/drivers/md/dm.c
===
--- linux-3.18-rc1.orig/drivers/md/dm.c 2014-10-21 00:42:31.0 +0200
+++ linux-3.18-rc1/drivers/md/dm.c  2014-10-21 00:44:13.0 +0200
@@ -1284,9 +1284,9 @@ EXPORT_SYMBOL_GPL(dm_set_target_max_io_l
  *  to make it empty)
  * The target requires that region 3 is to be sent in the next bio.
  *
- * If the target wants to receive multiple copies of the bio (via num_*bios, 
etc),
- * the partially processed part (the sum of regions 1+2) must be the same for 
all
- * copies of the bio.
+ * If the target wants to receive multiple copies of the bio with num_*_bios or
+ * dm_ask_for_duplicate_bio, the partially processed part (the sum of regions
+ * 1+2) must be the same for all copies of the bio.
  */
 void dm_accept_partial_bio(struct bio *bio, unsigned n_sectors)
 {
@@ -1300,6 +1300,17 @@ void dm_accept_partial_bio(struct bio *b
 }
 EXPORT_SYMBOL_GPL(dm_accept_partial_bio);
 
+/*
+ * The target driver can call this function only from the map routine. The
+ * target driver requests that the dm sends more duplicates of the current bio.
+ */
+void dm_ask_for_duplicate_bios(struct bio *bio, unsigned n_duplicates)
+{
+	struct dm_target_io *tio = container_of(bio, struct dm_target_io, clone);
+	(*tio->num_bios) += n_duplicates;
+}
+EXPORT_SYMBOL_GPL(dm_ask_for_duplicate_bios);
+
 static void __map_bio(struct dm_target_io *tio)
 {
int r;
@@ -1390,12 +1401,14 @@ static struct dm_target_io *alloc_tio(st
 
 static void __clone_and_map_simple_bio(struct clone_info *ci,
   struct dm_target *ti,
-  unsigned target_bio_nr, unsigned *len)
+  unsigned target_bio_nr, unsigned *len,
+  unsigned *num_bios)
 {
	struct dm_target_io *tio = alloc_tio(ci, ti, target_bio_nr);
	struct bio *clone = &tio->clone;

	tio->len_ptr = len;
+	tio->num_bios = num_bios;

	__bio_clone_fast(clone, ci->bio);
if (len)
@@ -1410,7 +1423,7 @@ static void __send_duplicate_bios(struct
unsigned target_bio_nr;
 
	for (target_bio_nr = 0; target_bio_nr < num_bios; target_bio_nr++)
-		__clone_and_map_simple_bio(ci, ti, target_bio_nr, len);
+		__clone_and_map_simple_bio(ci, ti, target_bio_nr, len, &num_bios);
 }
 
 static int __send_empty_flush(struct clone_info *ci)
@@ -1436,6 +1449,7 @@ static void __clone_and_map_data_bio(str
	for (target_bio_nr = 0; target_bio_nr < num_target_bios; target_bio_nr++) {
		tio = alloc_tio(ci, ti, target_bio_nr);
		tio->len_ptr = len;
+		tio->num_bios = &num_target_bios;
clone_bio(tio, bio, sector, *len);
__map_bio(tio);
}
Index: linux-3.18-rc1/include/linux/device-mapper.h
===
--- linux-3.18-rc1.orig/include/linux/device-mapper.h   2014-10-21 
00:42:31.0 +0200
+++ linux-3.18-rc1/include/linux/device-mapper.h2014-10-21 
00:42:40.0 +0200
@@ -271,6 +271,7 @@ struct dm_target_io {
struct dm_target *ti;
unsigned target_bio_nr;
unsigned *len_ptr;
+   unsigned *num_bios;
struct bio clone;
 };
 
@@ -382,6 +383,7 @@ struct gendisk *dm_disk(struct mapped_de
 int dm_suspended(struct dm_target *ti);
 int dm_noflush_suspending(struct dm_target *ti);
 void dm_accept_partial_bio(struct bio *bio, unsigned n_sectors);
+void dm_ask_for_duplicate_bios(struct bio *bio, unsigned n_duplicates);
 union map_info *dm_get_rq_mapinfo(struct request *rq);
 
 struct queue_limits *dm_get_queue_limits(struct mapped_device *md);



[PATCH 5/18] block copy: use two bios

2014-10-22 Thread Mikulas Patocka
This patch changes the architecture of xcopy so that two bios are used.

There used to be just one bio that held pointers to both source and
destination block device. However a bio with two block devices cannot
really be passed through block midlayer drivers (dm and md).

When we need to send the XCOPY command, we call the function
blkdev_issue_copy. This function creates two bios, the first with bi_rw
READ | REQ_COPY and the second WRITE | REQ_COPY. The bios have a pointer
to a common bi_copy structure.

These bios travel independently through the block device stack. When both
the bios reach the physical disk driver (the function blk_queue_bio), they
are paired, a request is made and the request is sent to the SCSI disk
driver.

It is possible that one of the bios reaches a device that doesn't support
XCOPY, in that case both bios are aborted with an error.

Note that there is no guarantee that the XCOPY command will succeed. If it
doesn't succeed, the caller is supposed to perform the copy manually.
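
The pairing step at the bottom of the stack can be pictured with this
condensed, hedged sketch (field names follow the bio_copy structure used
in this patch; the real code in blk_queue_bio handles more cases):

/* simplified: record our half of the pair; a non-NULL return means both
 * halves have arrived and the caller can build a single XCOPY request */
static struct bio *bio_copy_try_pair(struct bio *bio)
{
	struct bio_copy *bc = bio->bi_copy;
	struct bio *other;
	unsigned long flags;
	int dir = bio_data_dir(bio);

	spin_lock_irqsave(&bc->spinlock, flags);
	bc->pair[dir] = bio;
	other = bc->pair[dir ^ 1];
	spin_unlock_irqrestore(&bc->spinlock, flags);

	return other;
}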

Signed-off-by: Mikulas Patocka <mpato...@redhat.com>

---
 block/bio.c   |   26 ++-
 block/blk-core.c  |   34 
 block/blk-lib.c   |   76 --
 drivers/scsi/sd.c |7 +---
 include/linux/blk_types.h |   12 ++-
 5 files changed, 131 insertions(+), 24 deletions(-)

Index: linux-3.18-rc1/block/blk-lib.c
===
--- linux-3.18-rc1.orig/block/blk-lib.c 2014-10-21 00:47:02.0 +0200
+++ linux-3.18-rc1/block/blk-lib.c  2014-10-21 00:48:51.0 +0200
@@ -305,6 +305,36 @@ int blkdev_issue_zeroout(struct block_de
 }
 EXPORT_SYMBOL(blkdev_issue_zeroout);
 
+static void bio_copy_end_io(struct bio *bio, int error)
+{
+	struct bio_copy *bc = bio->bi_copy;
+	if (unlikely(error)) {
+		unsigned long flags;
+		int dir;
+		struct bio *other;
+
+		/* if the other bio is waiting for the pair, release it */
+		spin_lock_irqsave(&bc->spinlock, flags);
+		if (bc->error >= 0)
+			bc->error = error;
+		dir = bio_data_dir(bio);
+		other = bc->pair[dir ^ 1];
+		bc->pair[dir ^ 1] = NULL;
+		spin_unlock_irqrestore(&bc->spinlock, flags);
+		if (other)
+			bio_endio(other, error);
+	}
+	bio_put(bio);
+	if (atomic_dec_and_test(&bc->in_flight)) {
+		struct bio_batch *bb = bc->private;
+		if (unlikely(bc->error < 0) && !ACCESS_ONCE(bb->error))
+			ACCESS_ONCE(bb->error) = bc->error;
+		kfree(bc);
+		if (atomic_dec_and_test(&bb->done))
+			complete(bb->wait);
+	}
+}
+
 /**
  * blkdev_issue_copy - queue a copy same operation
  * @src_bdev:  source blockdev
@@ -351,9 +381,9 @@ int blkdev_issue_copy(struct block_devic
bb.wait = wait;
 
while (nr_sects) {
-   struct bio *bio;
+   struct bio *read_bio, *write_bio;
struct bio_copy *bc;
-   unsigned int chunk;
+   unsigned int chunk = min(nr_sects, max_copy_sectors);
 
bc = kmalloc(sizeof(struct bio_copy), gfp_mask);
if (!bc) {
@@ -361,27 +391,43 @@ int blkdev_issue_copy(struct block_devic
break;
}
 
-		bio = bio_alloc(gfp_mask, 1);
-		if (!bio) {
+		read_bio = bio_alloc(gfp_mask, 1);
+		if (!read_bio) {
			kfree(bc);
			ret = -ENOMEM;
			break;
		}

-		chunk = min(nr_sects, max_copy_sectors);
-
-		bio->bi_iter.bi_sector = dst_sector;
-		bio->bi_iter.bi_size = chunk << 9;
-		bio->bi_end_io = bio_batch_end_io;
-		bio->bi_bdev = dst_bdev;
-		bio->bi_private = &bb;
-		bio->bi_copy = bc;
+		write_bio = bio_alloc(gfp_mask, 1);
+		if (!write_bio) {
+			bio_put(read_bio);
+			kfree(bc);
+			ret = -ENOMEM;
+			break;
+		}

-		bc->bic_bdev = src_bdev;
-		bc->bic_sector = src_sector;
+		atomic_set(&bc->in_flight, 2);
+		bc->error = 1;
+		bc->pair[0] = NULL;
+		bc->pair[1] = NULL;
+		bc->private = &bb;
+		spin_lock_init(&bc->spinlock);
+
+		read_bio->bi_iter.bi_sector = src_sector;
+		read_bio->bi_iter.bi_size = chunk << 9;
+		read_bio->bi_end_io = bio_copy_end_io;
+		read_bio->bi_bdev = src_bdev;
+		read_bio->bi_copy = bc;
+
+		write_bio->bi_iter.bi_sector = dst_sector;
+   write_bio

[PATCH 4/18] block copy: initial XCOPY offload support

2014-10-22 Thread Mikulas Patocka
This is Martin Petersen's xcopy patch
(https://git.kernel.org/cgit/linux/kernel/git/mkp/linux.git/commit/?h=xcopy&id=0bdeed274e16b3038a851552188512071974eea8)
with some bug fixes, ported to the current kernel.

This patch makes it possible to use the SCSI XCOPY command.

We create a bio that has REQ_COPY flag in bi_rw and a bi_copy structure
that defines the source device. The target device is defined in the
bi_bdev and bi_iter.bi_sector.

There is a new BLKCOPY ioctl that makes it possible to use XCOPY from
userspace. The ioctl argument is a pointer to an array of four uint64_t
values.

The first value is a source byte offset, the second value is a destination
byte offset, the third value is the byte length. The fourth value is written by
the kernel and it represents the number of bytes that the kernel actually
copied.
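
A hedged user-space sketch of the new ioctl (the device path is a
placeholder and error handling is minimal):

#include <stdint.h>
#include <stdio.h>
#include <fcntl.h>
#include <sys/ioctl.h>
#include <linux/fs.h>	/* defines BLKCOPY once this patch is applied */

int main(void)
{
	/* source offset, destination offset, length, bytes copied (out) */
	uint64_t args[4] = { 0, 1 << 20, 1 << 20, 0 };
	int fd = open("/dev/sdX", O_RDWR);	/* placeholder device */

	if (fd < 0 || ioctl(fd, BLKCOPY, args) < 0)
		perror("BLKCOPY");
	else
		printf("copied %llu bytes\n", (unsigned long long)args[3]);
	return 0;
}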

Signed-off-by: Martin K. Petersen <martin.peter...@oracle.com>
Signed-off-by: Mikulas Patocka <mpato...@redhat.com>

---
 Documentation/ABI/testing/sysfs-block |9 +
 block/bio.c   |2 
 block/blk-core.c  |5 
 block/blk-lib.c   |   95 
 block/blk-merge.c |7 
 block/blk-settings.c  |   13 +
 block/blk-sysfs.c |   10 +
 block/compat_ioctl.c  |1 
 block/ioctl.c |   49 ++
 drivers/scsi/scsi.c   |   57 +++
 drivers/scsi/sd.c |  269 +-
 drivers/scsi/sd.h |4 
 include/linux/bio.h   |9 -
 include/linux/blk_types.h |   15 +
 include/linux/blkdev.h|   15 +
 include/scsi/scsi_device.h|3 
 include/uapi/linux/fs.h   |1 
 17 files changed, 551 insertions(+), 13 deletions(-)

Index: linux-3.18-rc1/Documentation/ABI/testing/sysfs-block
===
--- linux-3.18-rc1.orig/Documentation/ABI/testing/sysfs-block   2014-10-21 
01:04:13.0 +0200
+++ linux-3.18-rc1/Documentation/ABI/testing/sysfs-block2014-10-21 
01:04:22.0 +0200
@@ -228,3 +228,12 @@ Description:
write_same_max_bytes is 0, write same is not supported
by the device.
 
+
+What:  /sys/block/<disk>/queue/copy_max_bytes
+Date:  January 2014
+Contact:   Martin K. Petersen <martin.peter...@oracle.com>
+Description:
+   Devices that support copy offloading will set this value
+   to indicate the maximum buffer size in bytes that can be
+   copied in one operation. If the copy_max_bytes is 0 the
+   device does not support copy offload.
Index: linux-3.18-rc1/block/blk-core.c
===
--- linux-3.18-rc1.orig/block/blk-core.c2014-10-21 01:04:14.0 
+0200
+++ linux-3.18-rc1/block/blk-core.c 2014-10-21 01:04:22.0 +0200
@@ -1828,6 +1828,11 @@ generic_make_request_checks(struct bio *
goto end_io;
}
 
+	if (bio->bi_rw & REQ_COPY && !bdev_copy_offload(bio->bi_bdev)) {
+   err = -EOPNOTSUPP;
+   goto end_io;
+   }
+
/*
	 * Various block parts want %current->io_context and lazy ioc
 * allocation ends up trading a lot of pain for a small amount of
Index: linux-3.18-rc1/block/blk-lib.c
===
--- linux-3.18-rc1.orig/block/blk-lib.c 2014-10-21 01:04:14.0 +0200
+++ linux-3.18-rc1/block/blk-lib.c  2014-10-21 01:04:22.0 +0200
@@ -304,3 +304,98 @@ int blkdev_issue_zeroout(struct block_de
return __blkdev_issue_zeroout(bdev, sector, nr_sects, gfp_mask);
 }
 EXPORT_SYMBOL(blkdev_issue_zeroout);
+
+/**
+ * blkdev_issue_copy - queue a copy same operation
+ * @src_bdev:  source blockdev
+ * @src_sector:source sector
+ * @dst_bdev:  destination blockdev
+ * @dst_sector: destination sector
+ * @nr_sects:  number of sectors to copy
+ * @gfp_mask:  memory allocation flags (for bio_alloc)
+ *
+ * Description:
+ *Copy a block range from source device to target device.
+ */
+int blkdev_issue_copy(struct block_device *src_bdev, sector_t src_sector,
+ struct block_device *dst_bdev, sector_t dst_sector,
+ unsigned int nr_sects, gfp_t gfp_mask)
+{
+   DECLARE_COMPLETION_ONSTACK(wait);
+   struct request_queue *sq = bdev_get_queue(src_bdev);
+   struct request_queue *dq = bdev_get_queue(dst_bdev);
+   unsigned int max_copy_sectors;
+   struct bio_batch bb;
+   int ret = 0;
+
+   if (!sq || !dq)
+   return -ENXIO;
+
+	max_copy_sectors = min(sq->limits.max_copy_sectors,
+			       dq->limits.max_copy_sectors);
+
+   if (max_copy_sectors == 0)
+   return -EOPNOTSUPP;
+
+   if (src_sector

[PATCH 7/18] block copy: use a timer to fix a theoretical deadlock

2014-10-22 Thread Mikulas Patocka
The block layer creates two bios for each copy operation. The bios travel
independently through the storage stack and they are paired at the block
device.

There is a theoretical problem with this - the block device stack only
guarantees forward progress for a single bio. When two bios are sent, it
is possible (though very unlikely) that the first bio exhausts some
mempool and the second bio waits until there is free space in the mempool
(and thus it waits until the first bio finishes).

To avoid this deadlock, we introduce a timer. If the two bios are not
paired at the physical block device within 10 seconds, the copy operation
is aborted and the bio that waits to be paired is released with an error.

Note that there is no guarantee that any XCOPY operation succeed, so
aborting an operation with an error shouldn't cause any problems - the
caller is supposed to perform the copy manually if XCOPY fails.

Signed-off-by: Mikulas Patocka <mpato...@redhat.com>

---
 block/blk-lib.c   |   27 +++
 include/linux/blk_types.h |2 ++
 2 files changed, 29 insertions(+)

Index: linux-3.16-rc5/block/blk-lib.c
===
--- linux-3.16-rc5.orig/block/blk-lib.c 2014-07-15 15:27:49.0 +0200
+++ linux-3.16-rc5/block/blk-lib.c  2014-07-15 15:27:51.0 +0200
@@ -305,6 +305,30 @@ int blkdev_issue_zeroout(struct block_de
 }
 EXPORT_SYMBOL(blkdev_issue_zeroout);
 
+#define BLK_COPY_TIMEOUT   (10 * HZ)
+
+static void blk_copy_timeout(unsigned long bc_)
+{
+   struct bio_copy *bc = (struct bio_copy *)bc_;
+   struct bio *bio0 = NULL, *bio1 = NULL;
+
+   WARN_ON(!irqs_disabled());
+
+	spin_lock(&bc->spinlock);	/* the timer is IRQSAFE */
+	if (bc->error == 1) {
+		bc->error = -ETIMEDOUT;
+		bio0 = bc->pair[0];
+		bio1 = bc->pair[1];
+		bc->pair[0] = bc->pair[1] = NULL;
+	}
+	spin_unlock(&bc->spinlock);
+
+   if (bio0)
+   bio_endio(bio0, -ETIMEDOUT);
+   if (bio1)
+   bio_endio(bio1, -ETIMEDOUT);
+}
+
 static void bio_copy_end_io(struct bio *bio, int error)
 {
	struct bio_copy *bc = bio->bi_copy;
@@ -338,6 +362,7 @@ static void bio_copy_end_io(struct bio *
		} while (unlikely(atomic64_cmpxchg(bc->first_error,
			first_error, bc->offset) != first_error));
	}
+	del_timer_sync(&bc->timer);
	kfree(bc);
	if (atomic_dec_and_test(&bb->done))
		complete(bb->wait);
@@ -428,6 +453,8 @@ int blkdev_issue_copy(struct block_devic
		bc->first_error = &first_error;
		bc->offset = offset;
		spin_lock_init(&bc->spinlock);
+		__setup_timer(&bc->timer, blk_copy_timeout, (unsigned long)bc, TIMER_IRQSAFE);
+		mod_timer(&bc->timer, jiffies + BLK_COPY_TIMEOUT);
 
		read_bio->bi_iter.bi_sector = src_sector;
		read_bio->bi_iter.bi_size = chunk << 9;
Index: linux-3.16-rc5/include/linux/blk_types.h
===
--- linux-3.16-rc5.orig/include/linux/blk_types.h   2014-07-15 
15:27:49.0 +0200
+++ linux-3.16-rc5/include/linux/blk_types.h2014-07-15 15:27:51.0 
+0200
@@ -6,6 +6,7 @@
 #define __LINUX_BLK_TYPES_H
 
 #include <linux/types.h>
+#include <linux/timer.h>
 
 struct bio_set;
 struct bio;
@@ -52,6 +53,7 @@ struct bio_copy {
atomic64_t *first_error;
sector_t offset;
spinlock_t spinlock;
+   struct timer_list timer;
 };
 
 /*



[PATCH 9/18] block copy: use asynchronous notification

2014-10-22 Thread Mikulas Patocka
In dm-snapshot target there may be large number of copy requests in
progress. If every pending copy request consumed a process context, it
would put too much load on the system.

To avoid this load, we need asynchronous notification when copy finishes -
we can pass a callback to the function blkdev_issue_copy, if the callback
is non-NULL, blkdev_issue_copy exits when it submits all the copy bios and
the callback is called when the copy operation finishes.

With the callback mechanism, there can be large number of in-progress copy
requests and we do not need process context for each of them.
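
A hedged caller sketch (the signature of blkdev_issue_copy is taken from
this patch; everything prefixed my_ is hypothetical):

static void my_copy_done(void *data, int error)
{
	struct my_request *rq = data;	/* hypothetical request structure */

	/* called exactly once, when the whole copy operation finishes */
	my_request_complete(rq, error);
}

static int my_start_copy(struct my_request *rq)
{
	return blkdev_issue_copy(rq->src_bdev, rq->src_sector,
				 rq->dst_bdev, rq->dst_sector,
				 rq->nr_sects, GFP_NOIO,
				 my_copy_done, rq, &rq->copied);
}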

Signed-off-by: Mikulas Patocka <mpato...@redhat.com>

---
 block/blk-lib.c   |  152 --
 block/ioctl.c |2 
 include/linux/blk_types.h |5 -
 include/linux/blkdev.h|2 
 4 files changed, 114 insertions(+), 47 deletions(-)

Index: linux-3.18-rc1/block/blk-lib.c
===
--- linux-3.18-rc1.orig/block/blk-lib.c 2014-10-21 00:49:04.0 +0200
+++ linux-3.18-rc1/block/blk-lib.c  2014-10-21 00:49:05.0 +0200
@@ -305,6 +305,17 @@ int blkdev_issue_zeroout(struct block_de
 }
 EXPORT_SYMBOL(blkdev_issue_zeroout);
 
+struct bio_copy_batch {
+   atomic_long_t done;
+   int async_error;
+   int sync_error;
+   sector_t sync_copied;
+   atomic64_t first_error;
+   void (*callback)(void *data, int error);
+   void *data;
+   sector_t *copied;
+};
+
 #define BLK_COPY_TIMEOUT   (10 * HZ)
 
 static void blk_copy_timeout(unsigned long bc_)
@@ -329,6 +340,18 @@ static void blk_copy_timeout(unsigned lo
bio_endio(bio1, -ETIMEDOUT);
 }
 
+static void blk_copy_batch_finish(struct bio_copy_batch *batch)
+{
+	void (*fn)(void *, int) = batch->callback;
+	void *data = batch->data;
+	int error = unlikely(batch->sync_error) ? batch->sync_error : batch->async_error;
+	if (batch->copied)
+		*batch->copied = min(batch->sync_copied, (sector_t)atomic64_read(&batch->first_error));
+	kfree(batch);
+	if (fn)
+		fn(data, error);
+}
+
 static void bio_copy_end_io(struct bio *bio, int error)
 {
	struct bio_copy *bc = bio->bi_copy;
@@ -350,22 +373,22 @@ static void bio_copy_end_io(struct bio *
}
bio_put(bio);
	if (atomic_dec_and_test(&bc->in_flight)) {
-		struct bio_batch *bb = bc->private;
+		struct bio_copy_batch *batch = bc->batch;
		if (unlikely(bc->error < 0)) {
			u64 first_error;
-			if (!ACCESS_ONCE(bb->error))
-				ACCESS_ONCE(bb->error) = bc->error;
+			if (!ACCESS_ONCE(batch->async_error))
+				ACCESS_ONCE(batch->async_error) = bc->error;
			do {
-				first_error = atomic64_read(bc->first_error);
+				first_error = atomic64_read(&batch->first_error);
				if (bc->offset >= first_error)
					break;
-			} while (unlikely(atomic64_cmpxchg(bc->first_error,
+			} while (unlikely(atomic64_cmpxchg(&batch->first_error,
				first_error, bc->offset) != first_error));
		}
		del_timer_sync(&bc->timer);
		kfree(bc);
-		if (atomic_dec_and_test(&bb->done))
-			complete(bb->wait);
+		if (atomic_long_dec_and_test(&batch->done))
+			blk_copy_batch_finish(batch);
}
 }
 
@@ -394,6 +417,18 @@ static unsigned blkdev_copy_merge(struct
}
 }
 
+struct bio_copy_completion {
+   struct completion wait;
+   int error;
+};
+
+static void bio_copy_sync_callback(void *ptr, int error)
+{
+	struct bio_copy_completion *comp = ptr;
+	comp->error = error;
+	complete(&comp->wait);
+}
+
 /**
  * blkdev_issue_copy - queue a copy same operation
  * @src_bdev:  source blockdev
@@ -408,69 +443,95 @@ static unsigned blkdev_copy_merge(struct
  */
 int blkdev_issue_copy(struct block_device *src_bdev, sector_t src_sector,
  struct block_device *dst_bdev, sector_t dst_sector,
- sector_t nr_sects, gfp_t gfp_mask, sector_t *copied)
+ sector_t nr_sects, gfp_t gfp_mask,
+ void (*callback)(void *, int), void *data,
+ sector_t *copied)
 {
DECLARE_COMPLETION_ONSTACK(wait);
struct request_queue *sq = bdev_get_queue(src_bdev);
struct request_queue *dq = bdev_get_queue(dst_bdev);
unsigned int max_copy_sectors;
-   struct bio_batch bb;
-   int ret = 0;
-   atomic64_t first_error = ATOMIC64_INIT(nr_sects);
-   sector_t offset = 0;
+   int ret;
+   struct bio_copy_batch *batch;
+   struct

[PATCH 8/18] block copy: use merge_bvec_fn for copies

2014-10-22 Thread Mikulas Patocka
We use merge_bvec_fn to make sure that copies do not split internal
boundaries of device mapper devices.

There is no possibility to split a copy bio (splitting would complicate
the design significantly), so we must use merge_bvec_fn to make sure that
the bios have appropriate size for the device mapper stack.

Signed-off-by: Mikulas Patocka <mpato...@redhat.com>

---
 block/blk-lib.c |   37 +
 1 file changed, 37 insertions(+)

Index: linux-3.16-rc5/block/blk-lib.c
===
--- linux-3.16-rc5.orig/block/blk-lib.c 2014-07-15 15:27:51.0 +0200
+++ linux-3.16-rc5/block/blk-lib.c  2014-07-15 15:27:59.0 +0200
@@ -369,6 +369,31 @@ static void bio_copy_end_io(struct bio *
}
 }
 
+static unsigned blkdev_copy_merge(struct block_device *bdev,
+ struct request_queue *q, unsigned long bi_rw,
+ sector_t sector, unsigned n)
+{
+	if (!q->merge_bvec_fn) {
+   return n;
+   } else {
+   unsigned m;
+   struct bvec_merge_data bvm = {
+   .bi_bdev = bdev,
+   .bi_sector = sector,
+   .bi_size = 0,
+   .bi_rw = bi_rw,
+   };
+   struct bio_vec vec = {
+   .bv_page = NULL,
+			.bv_len = likely(n <= UINT_MAX >> 9) ? n << 9 : UINT_MAX & ~511U,
+			.bv_offset = 0,
+		};
+		m = q->merge_bvec_fn(q, &bvm, &vec);
+		m >>= 9;
+		return min(m, n);
+   }
+}
+
 /**
  * blkdev_issue_copy - queue a copy same operation
  * @src_bdev:  source blockdev
@@ -424,6 +449,18 @@ int blkdev_issue_copy(struct block_devic
struct bio_copy *bc;
unsigned chunk = (unsigned)min(nr_sects, 
(sector_t)max_copy_sectors);
 
+   chunk = blkdev_copy_merge(src_bdev, sq, READ | REQ_COPY, 
src_sector, chunk);
+   if (!chunk) {
+   ret = -EOPNOTSUPP;
+   break;
+   }
+
+   chunk = blkdev_copy_merge(dst_bdev, dq, WRITE | REQ_COPY, 
dst_sector, chunk);
+   if (!chunk) {
+   ret = -EOPNOTSUPP;
+   break;
+   }
+
bc = kmalloc(sizeof(struct bio_copy), gfp_mask);
if (!bc) {
ret = -ENOMEM;



[PATCH 10/18] scsi xcopy: suppress error messages

2014-10-22 Thread Mikulas Patocka
This patch suppresses error messages when copying between two arrays that
each support XCOPY but cannot copy data to each other.

Signed-off-by: Mikulas Patocka <mpato...@redhat.com>

---
 drivers/scsi/sd.c |   12 
 1 file changed, 12 insertions(+)

Index: linux-3.18-rc1/drivers/scsi/sd.c
===
--- linux-3.18-rc1.orig/drivers/scsi/sd.c   2014-10-21 00:48:51.0 
+0200
+++ linux-3.18-rc1/drivers/scsi/sd.c2014-10-21 00:49:21.0 +0200
@@ -1935,6 +1935,18 @@ static int sd_done(struct scsi_cmnd *SCp
				req->cmd_flags |= REQ_QUIET;
}
}
+   } else if (sshdr.asc == 0x26) {
+   switch (op) {
+   /*
+* Copying between two arrays that support XCOPY, but
+* cannot access each other.
+*/
+   case EXTENDED_COPY:
+   good_bytes = 0;
+			req->__data_len = blk_rq_bytes(req);
+			req->cmd_flags |= REQ_QUIET;
+   break;
+   }
}
break;
default:



[PATCH 11/18] scsi xcopy: keep cache of failures

2014-10-22 Thread Mikulas Patocka
If xcopy between two devices fails, it is pointless to send more xcopy
commands between these two devices, because they take time and will
likely also fail.

This patch keeps a cache of (source_device,destination_device) pairs
where copying failed and makes sure that no xcopy command is sent sooner
than 30 seconds after the last failure.
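
The expiry check below uses the usual wrap-safe jiffies idiom; condensed
into one helper it reads (a sketch of the logic, not a verbatim quote of
the patch):

static bool sd_copy_recently_failed(struct sd_copy_disabled_cache_entry *e,
				    struct scsi_device *src,
				    struct scsi_device *dst)
{
	/* unsigned subtraction yields the elapsed ticks even across a wrap */
	return jiffies - ACCESS_ONCE(e->jiffies) < SD_COPY_DISABLED_CACHE_TIME &&
	       ACCESS_ONCE(e->src) == src && ACCESS_ONCE(e->dst) == dst;
}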

Signed-off-by: Mikulas Patocka <mpato...@redhat.com>

---
 drivers/scsi/sd.c |   35 +++
 1 file changed, 35 insertions(+)

Index: linux-3.18-rc1/drivers/scsi/sd.c
===
--- linux-3.18-rc1.orig/drivers/scsi/sd.c   2014-10-21 00:49:21.0 
+0200
+++ linux-3.18-rc1/drivers/scsi/sd.c2014-10-21 00:49:24.0 +0200
@@ -943,6 +943,26 @@ static void sd_config_copy(struct scsi_d
				   (logical_block_size >> 9));
 }
 
+#define SD_COPY_DISABLED_CACHE_TIME	(HZ * 30)
+#define SD_COPY_DISABLED_CACHE_HASH_BITS	6
+#define SD_COPY_DISABLED_CACHE_HASH	(1 << SD_COPY_DISABLED_CACHE_HASH_BITS)
+
+struct sd_copy_disabled_cache_entry {
+   struct scsi_device *src;
+   struct scsi_device *dst;
+   unsigned long jiffies;
+};
+
+static struct sd_copy_disabled_cache_entry 
sd_copy_disabled_cache[SD_COPY_DISABLED_CACHE_HASH];
+
+static struct sd_copy_disabled_cache_entry *sd_copy_disabled_cache_hash(
+   struct scsi_device *src, struct scsi_device *dst)
+{
+	return &sd_copy_disabled_cache[
+		hash_long((unsigned long)src + ((unsigned long)dst >> 1),
+			  SD_COPY_DISABLED_CACHE_HASH_BITS)
+	];
+}
+
 static int sd_setup_copy_cmnd(struct scsi_cmnd *cmd)
 {
	struct request *rq = cmd->request;
@@ -955,6 +975,7 @@ static int sd_setup_copy_cmnd(struct scs
	struct bio *bio = rq->bio;
struct page *page;
unsigned char *buf;
+   struct sd_copy_disabled_cache_entry *e;
 
	dst_sdp = scsi_disk(rq->rq_disk)->device;
	dst_queue = rq->rq_disk->queue;
@@ -974,6 +995,12 @@ static int sd_setup_copy_cmnd(struct scs
	if (src_sdp->sector_size != dst_sdp->sector_size)
return BLKPREP_KILL;
 
+	/* The copy failed in the past, so do not retry it for some time */
+	e = sd_copy_disabled_cache_hash(src_sdp, dst_sdp);
+	if (unlikely(jiffies - ACCESS_ONCE(e->jiffies) < SD_COPY_DISABLED_CACHE_TIME) &&
+	    likely(ACCESS_ONCE(e->src) == src_sdp) && likely(ACCESS_ONCE(e->dst) == dst_sdp))
+		return BLKPREP_KILL;
+
	dst_lba = blk_rq_pos(rq) >> (ilog2(dst_sdp->sector_size) - 9);
	src_lba = bio->bi_copy->pair[0]->bi_iter.bi_sector >> (ilog2(src_sdp->sector_size) - 9);
	nr_blocks = blk_rq_sectors(rq) >> (ilog2(dst_sdp->sector_size) - 9);
@@ -1937,11 +1964,19 @@ static int sd_done(struct scsi_cmnd *SCp
}
} else if (sshdr.asc == 0x26) {
switch (op) {
+   struct sd_copy_disabled_cache_entry *e;
+   struct scsi_device *src_sdp, *dst_sdp;
/*
 * Copying between two arrays that support XCOPY, but
 * cannot access each other.
 */
case EXTENDED_COPY:
+				dst_sdp = sdkp->device;
+				src_sdp = scsi_disk(req->bio->bi_copy->pair[0]->bi_bdev->bd_disk)->device;
+				e = sd_copy_disabled_cache_hash(src_sdp, dst_sdp);
+				ACCESS_ONCE(e->src) = src_sdp;
+				ACCESS_ONCE(e->dst) = dst_sdp;
+				ACCESS_ONCE(e->jiffies) = jiffies;
				good_bytes = 0;
				req->__data_len = blk_rq_bytes(req);
				req->cmd_flags |= REQ_QUIET;



[PATCH 12/18] dm: implement copy

2014-10-22 Thread Mikulas Patocka
This patch implements basic copy support for device mapper core.
Individual targets can enable copy support by setting ti->copy_supported.

Device mapper device advertises copy support if at least one target
supports copy and for this target, at least one underlying device supports
copy.
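
The bdev_copy_offload() test used here and in the following patches is,
in essence, the following (a hedged reconstruction from the
max_copy_sectors queue limit introduced earlier in the series, not a
verbatim quote):

/* plausible definition: a device offers copy offload iff its queue
 * advertises a non-zero max_copy_sectors limit */
static inline bool bdev_copy_offload(struct block_device *bdev)
{
	struct request_queue *q = bdev_get_queue(bdev);

	return q && q->limits.max_copy_sectors != 0;
}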

Signed-off-by: Mikulas Patocka <mpato...@redhat.com>

---
 drivers/md/dm-table.c |9 +
 drivers/md/dm.c   |   42 +++---
 include/linux/device-mapper.h |5 +
 3 files changed, 53 insertions(+), 3 deletions(-)

Index: linux-3.18-rc1/drivers/md/dm.c
===
--- linux-3.18-rc1.orig/drivers/md/dm.c 2014-10-21 00:46:30.0 +0200
+++ linux-3.18-rc1/drivers/md/dm.c  2014-10-21 00:49:09.0 +0200
@@ -1559,6 +1559,31 @@ static int __send_write_same(struct clon
return __send_changing_extent_only(ci, get_num_write_same_bios, NULL);
 }
 
+static int __send_copy(struct clone_info *ci)
+{
+   struct dm_target *ti;
+   sector_t bound;
+
+	ti = dm_table_find_target(ci->map, ci->sector);
+	if (!dm_target_is_valid(ti))
+		return -EIO;
+
+	if (!ti->copy_supported)
+		return -EOPNOTSUPP;
+
+	bound = max_io_len(ci->sector, ti);
+
+	if (unlikely(ci->sector_count > bound))
+		return -EOPNOTSUPP;
+
+	__clone_and_map_simple_bio(ci, ti, 0, NULL, NULL);
+
+	ci->sector += ci->sector_count;
+	ci->sector_count = 0;
+
+   return 0;
+}
+
 /*
  * Select the correct strategy for processing a non-flush bio.
  */
@@ -1572,6 +1597,8 @@ static int __split_and_process_non_flush
return __send_discard(ci);
	else if (unlikely(bio->bi_rw & REQ_WRITE_SAME))
		return __send_write_same(ci);
+	else if (unlikely(bio->bi_rw & REQ_COPY))
+		return __send_copy(ci);

	ti = dm_table_find_target(ci->map, ci->sector);
if (!dm_target_is_valid(ti))
@@ -1656,6 +1683,11 @@ static int dm_merge_bvec(struct request_
if (!dm_target_is_valid(ti))
goto out;
 
+	if (unlikely((bvm->bi_rw & REQ_COPY) != 0)) {
+		if (!ti->copy_supported)
+			goto out_ret_max_size;
+   }
+
/*
 * Find maximum amount of I/O that won't need splitting
 */
@@ -1679,17 +1711,21 @@ static int dm_merge_bvec(struct request_
 * entries.  So always set max_size to 0, and the code below allows
 * just one page.
 */
-	else if (queue_max_hw_sectors(q) <= PAGE_SIZE >> 9)
+	else if (likely(!(bvm->bi_rw & REQ_COPY)) &&
+		 queue_max_hw_sectors(q) <= PAGE_SIZE >> 9)
max_size = 0;
 
 out:
-	dm_put_live_table_fast(md);
	/*
	 * Always allow an entire first page
	 */
-	if (max_size <= biovec->bv_len && !(bvm->bi_size >> SECTOR_SHIFT))
+	if (likely(!(bvm->bi_rw & REQ_COPY)) &&
+	    max_size <= biovec->bv_len && !(bvm->bi_size >> SECTOR_SHIFT))
		max_size = biovec->bv_len;
 
+out_ret_max_size:
+   dm_put_live_table_fast(md);
+
return max_size;
 }
 
Index: linux-3.18-rc1/include/linux/device-mapper.h
===
--- linux-3.18-rc1.orig/include/linux/device-mapper.h   2014-10-21 
00:42:40.0 +0200
+++ linux-3.18-rc1/include/linux/device-mapper.h2014-10-21 
00:49:09.0 +0200
@@ -251,6 +251,11 @@ struct dm_target {
 * Set if this target does not return zeroes on discarded blocks.
 */
bool discard_zeroes_data_unsupported:1;
+
+   /*
+* Set if the target supports XCOPY.
+*/
+   bool copy_supported:1;
 };
 
 /* Each target can link one of these into the table */
Index: linux-3.18-rc1/drivers/md/dm-table.c
===
--- linux-3.18-rc1.orig/drivers/md/dm-table.c   2014-10-21 00:34:45.0 
+0200
+++ linux-3.18-rc1/drivers/md/dm-table.c2014-10-21 00:49:09.0 
+0200
@@ -443,6 +443,11 @@ static int dm_set_device_limits(struct d
   q-limits.alignment_offset,
   (unsigned long long) start  SECTOR_SHIFT);
 
+   if (ti-copy_supported)
+   limits-max_copy_sectors =
+   min_not_zero(limits-max_copy_sectors,
+   bdev_get_queue(bdev)-limits.max_copy_sectors);
+
/*
 * Check if merge fn is supported.
 * If not we'll force DM to use PAGE_SIZE or
@@ -1264,6 +1269,10 @@ combine_limits:
   dm_device_name(table-md),
   (unsigned long long) ti-begin,
   (unsigned long long) ti-len);
+
+   limits-max_copy_sectors =
+   min_not_zero(limits-max_copy_sectors

[PATCH 13/18] dm linear: support copy

2014-10-22 Thread Mikulas Patocka
Support copy operation in the linear target.

Signed-off-by: Mikulas Patocka <mpato...@redhat.com>

---
 drivers/md/dm-linear.c |1 +
 1 file changed, 1 insertion(+)

Index: linux-3.16-rc4/drivers/md/dm-linear.c
===
--- linux-3.16-rc4.orig/drivers/md/dm-linear.c  2014-07-11 22:20:27.0 
+0200
+++ linux-3.16-rc4/drivers/md/dm-linear.c   2014-07-11 22:22:20.0 
+0200
@@ -56,6 +56,7 @@ static int linear_ctr(struct dm_target *
	ti->num_flush_bios = 1;
	ti->num_discard_bios = 1;
	ti->num_write_same_bios = 1;
+	ti->copy_supported = 1;
	ti->private = lc;
return 0;
 



[PATCH 14/18] dm stripe: support copy

2014-10-22 Thread Mikulas Patocka
Support the copy operation for the stripe target.

In stripe_merge, we verify that the underlying device supports copy. If it
doesn't, we can fail fast without any bio being constructed.

Signed-off-by: Mikulas Patocka <mpato...@redhat.com>

---
 drivers/md/dm-stripe.c |   11 ++-
 1 file changed, 10 insertions(+), 1 deletion(-)

Index: linux-3.16-rc4/drivers/md/dm-stripe.c
===
--- linux-3.16-rc4.orig/drivers/md/dm-stripe.c  2014-07-11 22:20:25.0 
+0200
+++ linux-3.16-rc4/drivers/md/dm-stripe.c   2014-07-11 22:23:54.0 
+0200
@@ -165,6 +165,7 @@ static int stripe_ctr(struct dm_target *
	ti->num_flush_bios = stripes;
	ti->num_discard_bios = stripes;
	ti->num_write_same_bios = stripes;
+	ti->copy_supported = 1;

	sc->chunk_size = chunk_size;
	if (chunk_size & (chunk_size - 1))
@@ -416,11 +417,19 @@ static int stripe_merge(struct dm_target
	struct stripe_c *sc = ti->private;
	sector_t bvm_sector = bvm->bi_sector;
	uint32_t stripe;
+	struct block_device *bdev;
	struct request_queue *q;

	stripe_map_sector(sc, bvm_sector, &stripe, &bvm_sector);

-	q = bdev_get_queue(sc->stripe[stripe].dev->bdev);
+	bdev = sc->stripe[stripe].dev->bdev;
+
+	if (unlikely((bvm->bi_rw & REQ_COPY) != 0)) {
+		if (!bdev_copy_offload(bdev))
+			return 0;
+	}
+
+	q = bdev_get_queue(bdev);
	if (!q->merge_bvec_fn)
return max_size;
 



[PATCH 16/18] dm kcopyd: support copy offload

2014-10-22 Thread Mikulas Patocka
This patch adds copy offload support to dm-kcopyd. If copy offload fails,
copying is performed using dm-io, just like before.

There is a module parameter copy_offload that can be set to enable or
disable this feature. It can be used to test performance of copy offload.
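
The parameter should then be reachable through sysfs; a hedged sketch of
toggling it from user space (the module path assumes kcopyd is built into
dm_mod, which may differ per configuration):

#include <stdio.h>

int main(void)
{
	/* assumed path; adjust to whichever module carries dm-kcopyd */
	FILE *f = fopen("/sys/module/dm_mod/parameters/copy_offload", "w");

	if (!f || fputs("0\n", f) == EOF) {	/* "0" disables the offload */
		perror("copy_offload");
		return 1;
	}
	fclose(f);
	return 0;
}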

Signed-off-by: Mikulas Patocka <mpato...@redhat.com>

---
 drivers/md/dm-kcopyd.c |   38 +-
 1 file changed, 37 insertions(+), 1 deletion(-)

Index: linux-3.16-rc5/drivers/md/dm-kcopyd.c
===
--- linux-3.16-rc5.orig/drivers/md/dm-kcopyd.c  2014-07-15 16:18:15.0 
+0200
+++ linux-3.16-rc5/drivers/md/dm-kcopyd.c   2014-07-15 19:20:34.0 
+0200
@@ -96,6 +96,9 @@ static DEFINE_SPINLOCK(throttle_spinlock
  */
 #define MAX_SLEEPS 10
 
+static bool copy_offload = true;
+module_param(copy_offload, bool, S_IRUGO | S_IWUSR);
+
 static void io_job_start(struct dm_kcopyd_throttle *t)
 {
unsigned throttle, now, difference;
@@ -358,6 +361,8 @@ struct kcopyd_job {
sector_t progress;
 
struct kcopyd_job *master_job;
+
+   struct work_struct copy_work;
 };
 
 static struct kmem_cache *_job_cache;
@@ -709,6 +714,31 @@ static void submit_job(struct kcopyd_job
}
 }
 
+static void copy_offload_work(struct work_struct *work)
+{
+   struct kcopyd_job *job = container_of(work, struct kcopyd_job, 
copy_work);
+   sector_t copied;
+
+	blkdev_issue_copy(job->source.bdev, job->source.sector,
+			  job->dests[0].bdev, job->dests[0].sector,
+			  job->source.count,
+			  GFP_NOIO | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN,
+			  NULL, NULL, &copied);
+
+	job->source.sector += copied;
+	job->source.count -= copied;
+	job->dests[0].sector += copied;
+	job->dests[0].count -= copied;
+
+   submit_job(job);
+}
+
+static void try_copy_offload(struct kcopyd_job *job)
+{
+	INIT_WORK(&job->copy_work, copy_offload_work);
+	queue_work(job->kc->kcopyd_wq, &job->copy_work);
+}
+
 int dm_kcopyd_copy(struct dm_kcopyd_client *kc, struct dm_io_region *from,
   unsigned int num_dests, struct dm_io_region *dests,
   unsigned int flags, dm_kcopyd_notify_fn fn, void *context)
@@ -757,7 +787,13 @@ int dm_kcopyd_copy(struct dm_kcopyd_clie
	job->context = context;
	job->master_job = job;
 
-	submit_job(job);
+	if (copy_offload && num_dests == 1 &&
+	    bdev_copy_offload(job->source.bdev) &&
+	    bdev_copy_offload(job->dests[0].bdev)) {
+		try_copy_offload(job);
+	} else {
+		submit_job(job);
+	}
 
return 0;
 }



[PATCH 15/18] dm kcopyd: introduce the function submit_job

2014-10-22 Thread Mikulas Patocka
We move some code to a function submit_job. It is needed for the next
patch that calls submit_job from another place.

Signed-off-by: Mikulas Patocka <mpato...@redhat.com>

---
 drivers/md/dm-kcopyd.c |   19 ---
 1 file changed, 12 insertions(+), 7 deletions(-)

Index: linux-3.16-rc5/drivers/md/dm-kcopyd.c
===
--- linux-3.16-rc5.orig/drivers/md/dm-kcopyd.c  2014-07-14 16:45:23.0 
+0200
+++ linux-3.16-rc5/drivers/md/dm-kcopyd.c   2014-07-14 17:28:36.0 
+0200
@@ -698,6 +698,17 @@ static void split_job(struct kcopyd_job 
}
 }
 
+static void submit_job(struct kcopyd_job *job)
+{
+	if (job->source.count <= SUB_JOB_SIZE)
+		dispatch_job(job);
+	else {
+		mutex_init(&job->lock);
+		job->progress = 0;
+		split_job(job);
+	}
+}
+
 int dm_kcopyd_copy(struct dm_kcopyd_client *kc, struct dm_io_region *from,
   unsigned int num_dests, struct dm_io_region *dests,
   unsigned int flags, dm_kcopyd_notify_fn fn, void *context)
@@ -746,13 +757,7 @@ int dm_kcopyd_copy(struct dm_kcopyd_clie
	job->context = context;
	job->master_job = job;
 
-	if (job->source.count <= SUB_JOB_SIZE)
-		dispatch_job(job);
-	else {
-		mutex_init(&job->lock);
-		job->progress = 0;
-		split_job(job);
-	}
+   submit_job(job);
 
return 0;
 }



[PATCH 17/18] dm kcopyd: change mutex to spinlock

2014-10-22 Thread Mikulas Patocka
job-lock is only taken for a finite amount of time and the process
doesn't block while holding it, so change it from mutex to spinlock.

This change is needed for the next patch that makes it possible to call
segment_complete from an interrupt. Taking mutexes inside an interrupt is
not allowed.

Signed-off-by: Mikulas Patocka <mpato...@redhat.com>

---
 drivers/md/dm-kcopyd.c |   10 +-
 1 file changed, 5 insertions(+), 5 deletions(-)

Index: linux-3.16-rc5/drivers/md/dm-kcopyd.c
===
--- linux-3.16-rc5.orig/drivers/md/dm-kcopyd.c  2014-07-15 19:20:34.0 
+0200
+++ linux-3.16-rc5/drivers/md/dm-kcopyd.c   2014-07-15 19:24:20.0 
+0200
@@ -21,7 +21,7 @@
 #include <linux/slab.h>
 #include <linux/vmalloc.h>
 #include <linux/workqueue.h>
-#include <linux/mutex.h>
+#include <linux/spinlock.h>
 #include <linux/delay.h>
 #include <linux/device-mapper.h>
 #include <linux/dm-kcopyd.h>
@@ -356,7 +356,7 @@ struct kcopyd_job {
 * These fields are only used if the job has been split
 * into more manageable parts.
 */
-   struct mutex lock;
+   spinlock_t lock;
atomic_t sub_jobs;
sector_t progress;
 
@@ -629,7 +629,7 @@ static void segment_complete(int read_er
	struct kcopyd_job *job = sub_job->master_job;
	struct dm_kcopyd_client *kc = job->kc;
 
-	mutex_lock(&job->lock);
+	spin_lock(&job->lock);
 
/* update the error */
if (read_err)
@@ -653,7 +653,7 @@ static void segment_complete(int read_er
			job->progress += count;
}
}
-	mutex_unlock(&job->lock);
+	spin_unlock(&job->lock);
 
if (count) {
int i;
@@ -708,7 +708,7 @@ static void submit_job(struct kcopyd_job
	if (job->source.count <= SUB_JOB_SIZE)
		dispatch_job(job);
	else {
-		mutex_init(&job->lock);
+		spin_lock_init(&job->lock);
		job->progress = 0;
split_job(job);
}



[PATCH 18/18] dm kcopyd: call copy offload with asynchronous callback

2014-10-22 Thread Mikulas Patocka
Change dm kcopyd so that it calls blkdev_issue_copy with an asynchronous
callback. There can be large number of pending kcopyd requests and holding
a process context for each of them may put too much load on the workqueue
subsystem.

This patch changes it so that blkdev_issue_copy returns after it submitted
the requests and copy_offload_callback is called when the copy operation
finishes.

Signed-off-by: Mikulas Patocka <mpato...@redhat.com>

---
 drivers/md/dm-kcopyd.c |   33 ++---
 1 file changed, 14 insertions(+), 19 deletions(-)

Index: linux-3.16-rc5/drivers/md/dm-kcopyd.c
===
--- linux-3.16-rc5.orig/drivers/md/dm-kcopyd.c  2014-07-15 19:24:20.0 
+0200
+++ linux-3.16-rc5/drivers/md/dm-kcopyd.c   2014-07-15 19:24:54.0 
+0200
@@ -361,8 +361,6 @@ struct kcopyd_job {
sector_t progress;
 
struct kcopyd_job *master_job;
-
-   struct work_struct copy_work;
 };
 
 static struct kmem_cache *_job_cache;
@@ -628,8 +626,9 @@ static void segment_complete(int read_er
struct kcopyd_job *sub_job = (struct kcopyd_job *) context;
	struct kcopyd_job *job = sub_job->master_job;
	struct dm_kcopyd_client *kc = job->kc;
+   unsigned long flags;
 
-	spin_lock(&job->lock);
+	spin_lock_irqsave(&job->lock, flags);
 
/* update the error */
if (read_err)
@@ -653,7 +652,7 @@ static void segment_complete(int read_er
			job->progress += count;
}
}
-	spin_unlock(&job->lock);
+	spin_unlock_irqrestore(&job->lock, flags);
 
if (count) {
int i;
@@ -714,29 +713,25 @@ static void submit_job(struct kcopyd_job
}
 }
 
-static void copy_offload_work(struct work_struct *work)
+static void copy_offload_callback(void *ptr, int error)
 {
-	struct kcopyd_job *job = container_of(work, struct kcopyd_job, copy_work);
-	sector_t copied;
+   struct kcopyd_job *job = ptr;
 
-	blkdev_issue_copy(job->source.bdev, job->source.sector,
-			  job->dests[0].bdev, job->dests[0].sector,
-			  job->source.count,
-			  GFP_NOIO | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN,
-			  NULL, NULL, &copied);
-
-	job->source.sector += copied;
-	job->source.count -= copied;
-	job->dests[0].sector += copied;
-	job->dests[0].count -= copied;
+	job->source.sector += job->progress;
+	job->source.count -= job->progress;
+	job->dests[0].sector += job->progress;
+	job->dests[0].count -= job->progress;
 
submit_job(job);
 }
 
 static void try_copy_offload(struct kcopyd_job *job)
 {
-	INIT_WORK(&job->copy_work, copy_offload_work);
-	queue_work(job->kc->kcopyd_wq, &job->copy_work);
+	blkdev_issue_copy(job->source.bdev, job->source.sector,
+			  job->dests[0].bdev, job->dests[0].sector,
+			  job->source.count,
+			  GFP_NOIO | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN,
+			  copy_offload_callback, job, &job->progress);
 }
 
 int dm_kcopyd_copy(struct dm_kcopyd_client *kc, struct dm_io_region *from,



Re: [PATCH 1/15] block copy: initial XCOPY offload support

2014-08-05 Thread Mikulas Patocka


On Mon, 4 Aug 2014, Pavel Machek wrote:

> On Tue 2014-07-15 15:34:47, Mikulas Patocka wrote:
> > This is Martin Petersen's xcopy patch
> > (https://git.kernel.org/cgit/linux/kernel/git/mkp/linux.git/commit/?h=xcopy&id=0bdeed274e16b3038a851552188512071974eea8)
> > with some bug fixes, ported to the current kernel.
> > 
> > This patch makes it possible to use the SCSI XCOPY command.
> > 
> > We create a bio that has REQ_COPY flag in bi_rw and a bi_copy structure
> > that defines the source device. The target device is defined in the
> > bi_bdev and bi_iter.bi_sector.
> > 
> > There is a new BLKCOPY ioctl that makes it possible to use XCOPY from
> > userspace. The ioctl argument is a pointer to an array of four uint64_t
> > values.
> 
> But it is there only for block devices, right?
> 
> Is there a plan to enable tools such as /bin/cp to use XCOPY?
> 
> 								Pavel

It is an interesting idea, but it would be far from simple. You could make 
sendfile (or maybe splice) use XCOPY, but it needs to interact with the page 
cache.

Mikulas


Re: [PATCH 1/15] block copy: initial XCOPY offload support

2014-07-18 Thread Mikulas Patocka


On Fri, 18 Jul 2014, Tomas Henzl wrote:

> > +	if (src_sector + nr_sects < src_sector ||
> > +	    dst_sector + nr_sects < dst_sector)
> > +		return -EINVAL;
> 
> Hi Mikulas,
> is this^ meant as an overflow test, or what is the reason?
> Thanks, Tomas

Yes. It is a test for overflow.
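
Concretely (a user-space sketch with illustrative numbers; sector_t is
assumed to be a 64-bit unsigned type here):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t src_sector = UINT64_MAX - 7;	/* pathological input */
	uint64_t nr_sects = 16;

	/* unsigned addition wraps around, so the sum compares below the start */
	if (src_sector + nr_sects < src_sector)
		puts("overflow detected, would return -EINVAL");
	return 0;
}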

Mikulas


ANNOUNCE: SCSI XCOPY support for the kernel and device mapper

2014-07-15 Thread Mikulas Patocka
Hi

I announce that I released the first version of a patch set that makes it 
possible to use SCSI XCOPY offload for the block layer and device mapper.

The patchset is at 
http://people.redhat.com/~mpatocka/patches/kernel/xcopy/series.html It 
requires kernel version at least 3.16-rc4. It was tested on target-core 
iSCSI implementation.

It is based on Martin Petersen's work 
https://git.kernel.org/cgit/linux/kernel/git/mkp/linux.git/commit/?h=xcopy&id=0bdeed274e16b3038a851552188512071974eea8,
 
but it is changed significantly so that it is possible to propagate XCOPY 
bios through the device mapper stack.

The basic architecture is this: in the function blkdev_issue_copy we 
create two bios, one for read and one for write (with bi_rw READ|REQ_COPY 
and WRITE|REQ_COPY). Both bios have a pointer to the same bio_copy 
structure. These two bios travel independently through the device mapper 
stack - each bio can go through different device mapper devices. When both 
the bios reach the physical block device (in the function blk_queue_bio) 
the bio pair is collected and a XCOPY request is allocated and sent to the 
scsi disk driver.

Note that because device mapper mapping can dynamically change, there is no 
guarantee that the XCOPY command succeeds. If it ends with an error, the 
caller is supposed to perform the copying manually.

The dm-kcopyd subsystem is modified to use the XCOPY command, so device 
mapper targets that use it (mirror, snapshot, thin, cache) take advantage 
of copy offload automatically.

There is a new ioctl BLKCOPY that makes it possible to use copy offload 
from userspace.

Mikulas


Re: ANNOUNCE: SCSI XCOPY support for the kernel and device mapper

2014-07-15 Thread Mikulas Patocka


On Tue, 15 Jul 2014, Christoph Hellwig wrote:

 On Tue, Jul 15, 2014 at 02:33:13PM -0400, Mikulas Patocka wrote:
  Hi
  
  I annouce that I released the first version of a patch set that makes it 
  possible to use SCSI XCOPY offload for the block layer and device mapper.
  
  The patchset is at 
  http://people.redhat.com/~mpatocka/patches/kernel/xcopy/series.html It 
  requires kernel version at least 3.16-rc4. It was tested on target-core 
  iSCSI implementation.
 
 Can you send the series to the mainling list(s)?  That makes it a lot
 easier to do a quick review.

I'll post them to the lists.

> > The dm-kcopyd subsystem is modified to use the XCOPY command, so device 
> > mapper targets that use it (mirror, snapshot, thin, cache) take advantage 
> > of copy offload automatically.
> 
> That sounds good, do you have any benchmarking data that shows the
> advantage of using XCOPY?

dm-mirror resynchronization is 4 times faster with XCOPY offload, but it 
is just a benchmark on a network with iSCSI between two slow machines.

I hope that we get some benchmarks on a real hardware.

Mikulas


[PATCH 0/15] SCSI XCOPY support for the kernel and device mapper

2014-07-15 Thread Mikulas Patocka
This patch series makes it possible to use SCSI XCOPY offload for the 
block layer and device mapper.

It is based on Martin Petersen's work
https://git.kernel.org/cgit/linux/kernel/git/mkp/linux.git/commit/?h=xcopy&id=0bdeed274e16b3038a851552188512071974eea8,
but it is changed significantly so that it is possible to propagate XCOPY
bios through the device mapper stack.

The basic architecture is this: in the function blkdev_issue_copy we
create two bios, one for read and one for write (with bi_rw READ|REQ_COPY
and WRITE|REQ_COPY). Both bios have a pointer to the same bio_copy
structure. These two bios travel independently through the device mapper
stack - each bio can go through different device mapper devices. When both
the bios reach the physical block device (in the function blk_queue_bio)
the bio pair is collected and a XCOPY request is allocated and sent to the
scsi disk driver.

Note that because device mapper mapping can dynamically change, there is no
guarantee that the XCOPY command succeeds. If it ends with an error, the
caller is supposed to perform the copying manually.

The dm-kcopyd subsystem is modified to use the XCOPY command, so device
mapper targets that use it (mirror, snapshot, thin, cache) take advantage
of copy offload automatically.

There is a new ioctl BLKCOPY that makes it possible to use copy offload
from userspace.


[PATCH 2/15] block copy: use two bios

2014-07-15 Thread Mikulas Patocka
This patch changes the architecture of xcopy so that two bios are used.

There used to be just one bio that held pointers to both source and
destination block device. However a bio with two block devices cannot
really be passed through block midlayer drivers (dm and md).

When we need to send the XCOPY command, we call the function
blkdev_issue_copy. This function creates two bios, the first with bi_rw
READ | REQ_COPY and the second WRITE | REQ_COPY. The bios have a pointer
to a common bi_copy structure.

These bios travel independently through the block device stack. When both
the bios reach the physical disk driver (the function blk_queue_bio), they
are paired, a request is made and the request is sent to the SCSI disk
driver.

It is possible that one of the bios reaches a device that doesn't support
XCOPY, in that case both bios are aborted with an error.

Note that there is no guarantee that the XCOPY command will succeed. If it
doesn't succeed, the caller is supposed to perform the copy manually.
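
The pairing itself happens when both bios arrive at the physical device;
that code is not shown in the excerpt below. As an illustration only (the
bc->pair[] and bc->spinlock fields are the ones introduced by this patch;
the function name and everything else is a simplified assumption, not the
actual blk_queue_bio code), the rendezvous could look roughly like this:

	/* Illustrative sketch, not part of the patch: rendezvous of the
	 * two REQ_COPY bios at the bottom of the stack.  Returns the
	 * partner bio once both have arrived, NULL while still waiting.
	 */
	static struct bio *bio_copy_try_pair(struct bio *bio)
	{
		struct bio_copy *bc = bio->bi_copy;
		int dir = bio_data_dir(bio);	/* READ = 0, WRITE = 1 */
		struct bio *other = NULL;
		unsigned long flags;

		spin_lock_irqsave(&bc->spinlock, flags);
		if (bc->pair[dir ^ 1]) {
			/* the partner arrived first: take it and pair up */
			other = bc->pair[dir ^ 1];
			bc->pair[dir ^ 1] = NULL;
		} else {
			/* we are first: park this bio for the partner */
			bc->pair[dir] = bio;
		}
		spin_unlock_irqrestore(&bc->spinlock, flags);

		return other;	/* non-NULL: an XCOPY request can be built */
	}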

Signed-off-by: Mikulas Patocka <mpato...@redhat.com>

---
 block/bio.c   |   26 ++-
 block/blk-core.c  |   34 
 block/blk-lib.c   |   76 --
 drivers/scsi/sd.c |7 +---
 include/linux/blk_types.h |   12 ++-
 5 files changed, 131 insertions(+), 24 deletions(-)

Index: linux-3.16-rc5/block/blk-lib.c
===
--- linux-3.16-rc5.orig/block/blk-lib.c 2014-07-14 16:32:21.0 +0200
+++ linux-3.16-rc5/block/blk-lib.c  2014-07-15 15:26:33.0 +0200
@@ -305,6 +305,36 @@ int blkdev_issue_zeroout(struct block_de
 }
 EXPORT_SYMBOL(blkdev_issue_zeroout);
 
+static void bio_copy_end_io(struct bio *bio, int error)
+{
+	struct bio_copy *bc = bio->bi_copy;
+	if (unlikely(error)) {
+		unsigned long flags;
+		int dir;
+		struct bio *other;
+
+		/* if the other bio is waiting for the pair, release it */
+		spin_lock_irqsave(&bc->spinlock, flags);
+		if (bc->error >= 0)
+			bc->error = error;
+		dir = bio_data_dir(bio);
+		other = bc->pair[dir ^ 1];
+		bc->pair[dir ^ 1] = NULL;
+		spin_unlock_irqrestore(&bc->spinlock, flags);
+		if (other)
+			bio_endio(other, error);
+	}
+	bio_put(bio);
+	if (atomic_dec_and_test(&bc->in_flight)) {
+		struct bio_batch *bb = bc->private;
+		if (unlikely(bc->error < 0) && !ACCESS_ONCE(bb->error))
+			ACCESS_ONCE(bb->error) = bc->error;
+		kfree(bc);
+		if (atomic_dec_and_test(&bb->done))
+			complete(bb->wait);
+	}
+}
+
 /**
  * blkdev_issue_copy - queue a copy same operation
  * @src_bdev:  source blockdev
@@ -351,9 +381,9 @@ int blkdev_issue_copy(struct block_devic
 	bb.wait = &wait;
 
while (nr_sects) {
-   struct bio *bio;
+   struct bio *read_bio, *write_bio;
struct bio_copy *bc;
-   unsigned int chunk;
+   unsigned int chunk = min(nr_sects, max_copy_sectors);
 
bc = kmalloc(sizeof(struct bio_copy), gfp_mask);
if (!bc) {
@@ -361,27 +391,43 @@ int blkdev_issue_copy(struct block_devic
break;
}
 
-   bio = bio_alloc(gfp_mask, 1);
-   if (!bio) {
+   read_bio = bio_alloc(gfp_mask, 1);
+   if (!read_bio) {
kfree(bc);
ret = -ENOMEM;
break;
}
 
-   chunk = min(nr_sects, max_copy_sectors);
-
-		bio->bi_iter.bi_sector = dst_sector;
-		bio->bi_iter.bi_size = chunk << 9;
-		bio->bi_end_io = bio_batch_end_io;
-		bio->bi_bdev = dst_bdev;
-		bio->bi_private = &bb;
-		bio->bi_copy = bc;
+		write_bio = bio_alloc(gfp_mask, 1);
+		if (!write_bio) {
+			bio_put(read_bio);
+			kfree(bc);
+			ret = -ENOMEM;
+			break;
+		}
 
-		bc->bic_bdev = src_bdev;
-		bc->bic_sector = src_sector;
+		atomic_set(&bc->in_flight, 2);
+		bc->error = 1;
+		bc->pair[0] = NULL;
+		bc->pair[1] = NULL;
+		bc->private = &bb;
+		spin_lock_init(&bc->spinlock);
+
+		read_bio->bi_iter.bi_sector = src_sector;
+		read_bio->bi_iter.bi_size = chunk << 9;
+		read_bio->bi_end_io = bio_copy_end_io;
+		read_bio->bi_bdev = src_bdev;
+		read_bio->bi_copy = bc;
+
+		write_bio->bi_iter.bi_sector = dst_sector;
+   write_bio

[PATCH 1/15] block copy: initial XCOPY offload support

2014-07-15 Thread Mikulas Patocka
This is Martin Petersen's xcopy patch
(https://git.kernel.org/cgit/linux/kernel/git/mkp/linux.git/commit/?h=xcopy&id=0bdeed274e16b3038a851552188512071974eea8)
with some bug fixes, ported to the current kernel.

This patch makes it possible to use the SCSI XCOPY command.

We create a bio that has REQ_COPY flag in bi_rw and a bi_copy structure
that defines the source device. The target device is defined in the
bi_bdev and bi_iter.bi_sector.

There is a new BLKCOPY ioctl that makes it possible to use XCOPY from
userspace. The ioctl argument is a pointer to an array of four uint64_t
values.

The first value is a source byte offset, the second value is a destination
byte offset, the third value is byte length. The fourth value is written by
the kernel and it represents the number of bytes that the kernel actually
copied.
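
To illustrate the calling convention (a sketch, not part of the patch;
BLKCOPY is the request code this patch adds to <linux/fs.h>, and the
device path below is a placeholder):

	/* Sketch: copy 1 MiB from offset 0 to offset 1 MiB on one device.
	 * args[0] = source byte offset, args[1] = destination byte offset,
	 * args[2] = byte length, args[3] = bytes actually copied (written
	 * back by the kernel).
	 */
	#include <stdint.h>
	#include <stdio.h>
	#include <fcntl.h>
	#include <sys/ioctl.h>
	#include <linux/fs.h>

	int main(void)
	{
		uint64_t args[4] = { 0, 1 << 20, 1 << 20, 0 };
		int fd = open("/dev/sdx", O_RDWR);	/* placeholder device */

		if (fd < 0 || ioctl(fd, BLKCOPY, args) < 0) {
			perror("BLKCOPY");
			return 1;
		}
		printf("copied %llu bytes\n", (unsigned long long)args[3]);
		return 0;
	}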

Signed-off-by: Martin K. Petersen <martin.peter...@oracle.com>
Signed-off-by: Mikulas Patocka <mpato...@redhat.com>

---
 Documentation/ABI/testing/sysfs-block |9 +
 block/bio.c   |2 
 block/blk-core.c  |5 
 block/blk-lib.c   |   95 
 block/blk-merge.c |7 
 block/blk-settings.c  |   13 +
 block/blk-sysfs.c |   10 +
 block/compat_ioctl.c  |1 
 block/ioctl.c |   49 ++
 drivers/scsi/scsi.c   |   57 +++
 drivers/scsi/sd.c |  263 +-
 drivers/scsi/sd.h |4 
 include/linux/bio.h   |9 -
 include/linux/blk_types.h |   15 +
 include/linux/blkdev.h|   15 +
 include/scsi/scsi_device.h|3 
 include/uapi/linux/fs.h   |1 
 17 files changed, 545 insertions(+), 13 deletions(-)

Index: linux-3.16-rc5/Documentation/ABI/testing/sysfs-block
===
--- linux-3.16-rc5.orig/Documentation/ABI/testing/sysfs-block   2014-07-14 
15:17:07.0 +0200
+++ linux-3.16-rc5/Documentation/ABI/testing/sysfs-block2014-07-14 
16:26:44.0 +0200
@@ -220,3 +220,12 @@ Description:
write_same_max_bytes is 0, write same is not supported
by the device.
 
+
+What:		/sys/block/<disk>/queue/copy_max_bytes
+Date:		January 2014
+Contact:	Martin K. Petersen <martin.peter...@oracle.com>
+Description:
+   Devices that support copy offloading will set this value
+   to indicate the maximum buffer size in bytes that can be
+   copied in one operation. If the copy_max_bytes is 0 the
+   device does not support copy offload.
Index: linux-3.16-rc5/block/blk-core.c
===
--- linux-3.16-rc5.orig/block/blk-core.c2014-07-14 16:26:22.0 
+0200
+++ linux-3.16-rc5/block/blk-core.c 2014-07-14 16:26:44.0 +0200
@@ -1831,6 +1831,11 @@ generic_make_request_checks(struct bio *
goto end_io;
}
 
+	if (bio->bi_rw & REQ_COPY && !bdev_copy_offload(bio->bi_bdev)) {
+		err = -EOPNOTSUPP;
+		goto end_io;
+	}
 
 	/*
 	 * Various block parts want %current->io_context and lazy ioc
 * allocation ends up trading a lot of pain for a small amount of
Index: linux-3.16-rc5/block/blk-lib.c
===
--- linux-3.16-rc5.orig/block/blk-lib.c 2014-07-14 16:26:40.0 +0200
+++ linux-3.16-rc5/block/blk-lib.c  2014-07-14 16:32:21.0 +0200
@@ -304,3 +304,98 @@ int blkdev_issue_zeroout(struct block_de
return __blkdev_issue_zeroout(bdev, sector, nr_sects, gfp_mask);
 }
 EXPORT_SYMBOL(blkdev_issue_zeroout);
+
+/**
+ * blkdev_issue_copy - queue a copy same operation
+ * @src_bdev:  source blockdev
+ * @src_sector:source sector
+ * @dst_bdev:  destination blockdev
+ * @dst_sector: destination sector
+ * @nr_sects:  number of sectors to copy
+ * @gfp_mask:  memory allocation flags (for bio_alloc)
+ *
+ * Description:
+ *Copy a block range from source device to target device.
+ */
+int blkdev_issue_copy(struct block_device *src_bdev, sector_t src_sector,
+ struct block_device *dst_bdev, sector_t dst_sector,
+ unsigned int nr_sects, gfp_t gfp_mask)
+{
+   DECLARE_COMPLETION_ONSTACK(wait);
+   struct request_queue *sq = bdev_get_queue(src_bdev);
+   struct request_queue *dq = bdev_get_queue(dst_bdev);
+   unsigned int max_copy_sectors;
+   struct bio_batch bb;
+   int ret = 0;
+
+   if (!sq || !dq)
+   return -ENXIO;
+
+	max_copy_sectors = min(sq->limits.max_copy_sectors,
+			       dq->limits.max_copy_sectors);
+
+   if (max_copy_sectors == 0)
+   return -EOPNOTSUPP;
+
+   if (src_sector

[PATCH 4/15] block copy: use a timer to fix a theoretical deadlock

2014-07-15 Thread Mikulas Patocka
The block layer creates two bios for each copy operation. The bios travel
independently through the storage stack and they are paired at the block
device.

There is a theoretical problem with this - the block device stack only
guarantees forward progress for a single bio. When two bios are sent, it
is possible (though very unlikely) that the first bio exhausts some
mempool and the second bio waits until there is free space in the mempool
(and thus it waits until the first bio finishes).

To avoid this deadlock, we introduce a timer. If the two bios are not
paired at the physical block device within 10 seconds, the copy operation
is aborted and the bio that waits to be paired is released with an error.

Note that there is no guarantee that any XCOPY operation succeeds, so
aborting an operation with an error shouldn't cause any problems - the
caller is supposed to perform the copy manually if XCOPY fails.

Signed-off-by: Mikulas Patocka <mpato...@redhat.com>

---
 block/blk-lib.c   |   27 +++
 include/linux/blk_types.h |2 ++
 2 files changed, 29 insertions(+)

Index: linux-3.16-rc5/block/blk-lib.c
===
--- linux-3.16-rc5.orig/block/blk-lib.c 2014-07-15 15:27:49.0 +0200
+++ linux-3.16-rc5/block/blk-lib.c  2014-07-15 15:27:51.0 +0200
@@ -305,6 +305,30 @@ int blkdev_issue_zeroout(struct block_de
 }
 EXPORT_SYMBOL(blkdev_issue_zeroout);
 
+#define BLK_COPY_TIMEOUT   (10 * HZ)
+
+static void blk_copy_timeout(unsigned long bc_)
+{
+   struct bio_copy *bc = (struct bio_copy *)bc_;
+   struct bio *bio0 = NULL, *bio1 = NULL;
+
+   WARN_ON(!irqs_disabled());
+
+	spin_lock(&bc->spinlock);	/* the timer is IRQSAFE */
+	if (bc->error == 1) {
+		bc->error = -ETIMEDOUT;
+		bio0 = bc->pair[0];
+		bio1 = bc->pair[1];
+		bc->pair[0] = bc->pair[1] = NULL;
+	}
+	spin_unlock(&bc->spinlock);
+
+   if (bio0)
+   bio_endio(bio0, -ETIMEDOUT);
+   if (bio1)
+   bio_endio(bio1, -ETIMEDOUT);
+}
+
 static void bio_copy_end_io(struct bio *bio, int error)
 {
 	struct bio_copy *bc = bio->bi_copy;
@@ -338,6 +362,7 @@ static void bio_copy_end_io(struct bio *
 			} while (unlikely(atomic64_cmpxchg(bc->first_error,
 					first_error, bc->offset) != first_error));
 		}
+		del_timer_sync(&bc->timer);
 		kfree(bc);
 		if (atomic_dec_and_test(&bb->done))
 			complete(bb->wait);
@@ -428,6 +453,8 @@ int blkdev_issue_copy(struct block_devic
 		bc->first_error = &first_error;
 		bc->offset = offset;
 		spin_lock_init(&bc->spinlock);
+		__setup_timer(&bc->timer, blk_copy_timeout, (unsigned long)bc,
+			      TIMER_IRQSAFE);
+		mod_timer(&bc->timer, jiffies + BLK_COPY_TIMEOUT);
 
 		read_bio->bi_iter.bi_sector = src_sector;
 		read_bio->bi_iter.bi_size = chunk << 9;
Index: linux-3.16-rc5/include/linux/blk_types.h
===
--- linux-3.16-rc5.orig/include/linux/blk_types.h   2014-07-15 
15:27:49.0 +0200
+++ linux-3.16-rc5/include/linux/blk_types.h2014-07-15 15:27:51.0 
+0200
@@ -6,6 +6,7 @@
 #define __LINUX_BLK_TYPES_H
 
 #include <linux/types.h>
+#include <linux/timer.h>
 
 struct bio_set;
 struct bio;
@@ -52,6 +53,7 @@ struct bio_copy {
atomic64_t *first_error;
sector_t offset;
spinlock_t spinlock;
+   struct timer_list timer;
 };
 
 /*

--
To unsubscribe from this list: send the line unsubscribe linux-scsi in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 3/15] block copy: report the amount of copied data

2014-07-15 Thread Mikulas Patocka
This patch changes blkdev_issue_copy so that it returns the number of
copied sectors in the variable copied.

The kernel makes a best effort to copy as much data as possible, but because
of device mapper mappings, copying may fail at some stage. If we just
returned the error number, the caller wouldn't know if
all or part of the operation failed and the caller would be required to
redo the whole copy operation.

We return the number of copied sectors so that the caller can skip these
sectors when doing the copy manually. On success (zero return code), the
number of copied sectors is equal to the number of requested sectors. On
error (negative return code), the number of copied sectors is smaller than
the number of requested sectors.

The number of copied bytes is returned as a fourth uint64_t argument in
the BLKCOPY ioctl.
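
As an illustration of the intended use (a sketch, not from the patch;
copy_manually() stands in for the caller's fallback path):

	/* Sketch: offload what we can, fall back manually for the rest. */
	sector_t copied;
	int r = blkdev_issue_copy(src_bdev, src_sector, dst_bdev, dst_sector,
				  nr_sects, GFP_KERNEL, &copied);
	if (r) {
		/* the first "copied" sectors are already done, skip them */
		src_sector += copied;
		dst_sector += copied;
		nr_sects -= copied;
		copy_manually(src_bdev, src_sector, dst_bdev, dst_sector,
			      nr_sects);
	}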

Signed-off-by: Mikulas Patocka <mpato...@redhat.com>

---
 block/blk-lib.c   |   30 +-
 block/ioctl.c |   25 -
 include/linux/blk_types.h |2 ++
 include/linux/blkdev.h|3 ++-
 4 files changed, 45 insertions(+), 15 deletions(-)

Index: linux-3.16-rc5/block/blk-lib.c
===
--- linux-3.16-rc5.orig/block/blk-lib.c 2014-07-15 15:26:33.0 +0200
+++ linux-3.16-rc5/block/blk-lib.c  2014-07-15 15:27:25.0 +0200
@@ -327,8 +327,17 @@ static void bio_copy_end_io(struct bio *
 	bio_put(bio);
 	if (atomic_dec_and_test(&bc->in_flight)) {
 		struct bio_batch *bb = bc->private;
-		if (unlikely(bc->error < 0) && !ACCESS_ONCE(bb->error))
-			ACCESS_ONCE(bb->error) = bc->error;
+		if (unlikely(bc->error < 0)) {
+			u64 first_error;
+			if (!ACCESS_ONCE(bb->error))
+				ACCESS_ONCE(bb->error) = bc->error;
+			do {
+				first_error = atomic64_read(bc->first_error);
+				if (bc->offset >= first_error)
+					break;
+			} while (unlikely(atomic64_cmpxchg(bc->first_error,
+					first_error, bc->offset) != first_error));
+		}
 		kfree(bc);
 		if (atomic_dec_and_test(&bb->done))
 			complete(bb->wait);
@@ -349,7 +358,7 @@ static void bio_copy_end_io(struct bio *
  */
 int blkdev_issue_copy(struct block_device *src_bdev, sector_t src_sector,
  struct block_device *dst_bdev, sector_t dst_sector,
- unsigned int nr_sects, gfp_t gfp_mask)
+ sector_t nr_sects, gfp_t gfp_mask, sector_t *copied)
 {
DECLARE_COMPLETION_ONSTACK(wait);
struct request_queue *sq = bdev_get_queue(src_bdev);
@@ -357,6 +366,11 @@ int blkdev_issue_copy(struct block_devic
unsigned int max_copy_sectors;
struct bio_batch bb;
int ret = 0;
+   atomic64_t first_error = ATOMIC64_INIT(nr_sects);
+   sector_t offset = 0;
+
+   if (copied)
+   *copied = 0;
 
if (!sq || !dq)
return -ENXIO;
@@ -380,10 +394,10 @@ int blkdev_issue_copy(struct block_devic
bb.error = 0;
bb.wait = wait;
 
-	while (nr_sects) {
+	while (nr_sects && !ACCESS_ONCE(bb.error)) {
 		struct bio *read_bio, *write_bio;
 		struct bio_copy *bc;
-		unsigned int chunk = min(nr_sects, max_copy_sectors);
+		unsigned chunk = (unsigned)min(nr_sects, (sector_t)max_copy_sectors);
 
bc = kmalloc(sizeof(struct bio_copy), gfp_mask);
if (!bc) {
@@ -411,6 +425,8 @@ int blkdev_issue_copy(struct block_devic
 		bc->pair[0] = NULL;
 		bc->pair[1] = NULL;
 		bc->private = &bb;
+		bc->first_error = &first_error;
+		bc->offset = offset;
 		spin_lock_init(&bc->spinlock);
 
read_bio-bi_iter.bi_sector = src_sector;
@@ -432,12 +448,16 @@ int blkdev_issue_copy(struct block_devic
src_sector += chunk;
dst_sector += chunk;
nr_sects -= chunk;
+   offset += chunk;
}
 
/* Wait for bios in-flight */
 	if (!atomic_dec_and_test(&bb.done))
 		wait_for_completion_io(&wait);
 
+	if (copied)
+		*copied = min((sector_t)atomic64_read(&first_error), offset);
+
if (likely(!ret))
ret = bb.error;
 
Index: linux-3.16-rc5/include/linux/blk_types.h
===
--- linux-3.16-rc5.orig/include/linux/blk_types.h   2014-07-15 
15:26:05.0 +0200
+++ linux-3.16-rc5/include/linux/blk_types.h2014-07-15 15:27:12.0 
+0200
@@ -49,6 +49,8 @@ struct bio_copy {
atomic_t in_flight;
struct bio

[PATCH 5/15] block copy: use merge_bvec_fn for copies

2014-07-15 Thread Mikulas Patocka
We use merge_bvec_fn to make sure that copies do not split internal
boundaries of device mapper devices.

There is no possibility to split a copy bio (splitting would complicate
the design significantly), so we must use merge_bvec_fn to make sure that
the bios have appropriate size for the device mapper stack.

Signed-off-by: Mikulas Patocka <mpato...@redhat.com>

---
 block/blk-lib.c |   37 +
 1 file changed, 37 insertions(+)

Index: linux-3.16-rc5/block/blk-lib.c
===
--- linux-3.16-rc5.orig/block/blk-lib.c 2014-07-15 15:27:51.0 +0200
+++ linux-3.16-rc5/block/blk-lib.c  2014-07-15 15:27:59.0 +0200
@@ -369,6 +369,31 @@ static void bio_copy_end_io(struct bio *
}
 }
 
+static unsigned blkdev_copy_merge(struct block_device *bdev,
+ struct request_queue *q, unsigned long bi_rw,
+ sector_t sector, unsigned n)
+{
+	if (!q->merge_bvec_fn) {
+		return n;
+	} else {
+		unsigned m;
+		struct bvec_merge_data bvm = {
+			.bi_bdev = bdev,
+			.bi_sector = sector,
+			.bi_size = 0,
+			.bi_rw = bi_rw,
+		};
+		struct bio_vec vec = {
+			.bv_page = NULL,
+			.bv_len = likely(n <= UINT_MAX >> 9) ? n << 9 : UINT_MAX & ~511U,
+			.bv_offset = 0,
+		};
+		m = q->merge_bvec_fn(q, &bvm, &vec);
+		m >>= 9;
+		return min(m, n);
+	}
+}
+
 /**
  * blkdev_issue_copy - queue a copy same operation
  * @src_bdev:  source blockdev
@@ -424,6 +449,18 @@ int blkdev_issue_copy(struct block_devic
struct bio_copy *bc;
unsigned chunk = (unsigned)min(nr_sects, 
(sector_t)max_copy_sectors);
 
+   chunk = blkdev_copy_merge(src_bdev, sq, READ | REQ_COPY, 
src_sector, chunk);
+   if (!chunk) {
+   ret = -EOPNOTSUPP;
+   break;
+   }
+
+   chunk = blkdev_copy_merge(dst_bdev, dq, WRITE | REQ_COPY, 
dst_sector, chunk);
+   if (!chunk) {
+   ret = -EOPNOTSUPP;
+   break;
+   }
+
bc = kmalloc(sizeof(struct bio_copy), gfp_mask);
if (!bc) {
ret = -ENOMEM;

--
To unsubscribe from this list: send the line unsubscribe linux-scsi in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 6/15] block copy: use asynchronous notification

2014-07-15 Thread Mikulas Patocka
block copy: use asynchronous notification

In the dm-snapshot target there may be a large number of copy requests in
progress. If every pending copy request consumed a process context, it
would put too much load on the system.

To avoid this load, we need asynchronous notification when copy finishes -
we can pass a callback to the function blkdev_issue_copy. If the callback
is non-NULL, blkdev_issue_copy returns once it has submitted all the copy
bios, and the callback is called when the copy operation finishes.

With the callback mechanism, there can be a large number of in-progress copy
requests and we do not need a process context for each of them.
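
A sketch of the asynchronous convention (illustrative only; my_copy_done,
struct my_request and complete_my_request are placeholders, while the
blkdev_issue_copy arguments follow the prototype changed by this patch):

	static void my_copy_done(void *data, int error)
	{
		struct my_request *rq = data;

		complete_my_request(rq, error);	/* placeholder */
	}

	...
	/* returns as soon as the bios are submitted; my_copy_done runs
	 * later, when the copy operation finishes
	 */
	r = blkdev_issue_copy(src_bdev, src_sector, dst_bdev, dst_sector,
			      nr_sects, GFP_NOIO, my_copy_done, rq,
			      &rq->copied);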

Signed-off-by: Mikulas Patocka <mpato...@redhat.com>

---
 block/blk-lib.c   |  152 --
 block/ioctl.c |2 
 include/linux/blk_types.h |5 -
 include/linux/blkdev.h|2 
 4 files changed, 114 insertions(+), 47 deletions(-)

Index: linux-3.16-rc5/block/blk-lib.c
===
--- linux-3.16-rc5.orig/block/blk-lib.c 2014-07-15 15:27:59.0 +0200
+++ linux-3.16-rc5/block/blk-lib.c  2014-07-15 16:16:53.0 +0200
@@ -305,6 +305,17 @@ int blkdev_issue_zeroout(struct block_de
 }
 EXPORT_SYMBOL(blkdev_issue_zeroout);
 
+struct bio_copy_batch {
+   atomic_long_t done;
+   int async_error;
+   int sync_error;
+   sector_t sync_copied;
+   atomic64_t first_error;
+   void (*callback)(void *data, int error);
+   void *data;
+   sector_t *copied;
+};
+
 #define BLK_COPY_TIMEOUT   (10 * HZ)
 
 static void blk_copy_timeout(unsigned long bc_)
@@ -329,6 +340,18 @@ static void blk_copy_timeout(unsigned lo
bio_endio(bio1, -ETIMEDOUT);
 }
 
+static void blk_copy_batch_finish(struct bio_copy_batch *batch)
+{
+	void (*fn)(void *, int) = batch->callback;
+	void *data = batch->data;
+	int error = unlikely(batch->sync_error) ?
+		batch->sync_error : batch->async_error;
+	if (batch->copied)
+		*batch->copied = min(batch->sync_copied,
+			(sector_t)atomic64_read(&batch->first_error));
+	kfree(batch);
+	if (fn)
+		fn(data, error);
+}
+
 static void bio_copy_end_io(struct bio *bio, int error)
 {
 	struct bio_copy *bc = bio->bi_copy;
@@ -350,22 +373,22 @@ static void bio_copy_end_io(struct bio *
}
bio_put(bio);
 	if (atomic_dec_and_test(&bc->in_flight)) {
-		struct bio_batch *bb = bc->private;
+		struct bio_copy_batch *batch = bc->batch;
 		if (unlikely(bc->error < 0)) {
 			u64 first_error;
-			if (!ACCESS_ONCE(bb->error))
-				ACCESS_ONCE(bb->error) = bc->error;
+			if (!ACCESS_ONCE(batch->async_error))
+				ACCESS_ONCE(batch->async_error) = bc->error;
 			do {
-				first_error = atomic64_read(bc->first_error);
+				first_error = atomic64_read(&batch->first_error);
 				if (bc->offset >= first_error)
 					break;
-			} while (unlikely(atomic64_cmpxchg(bc->first_error,
+			} while (unlikely(atomic64_cmpxchg(&batch->first_error,
 					first_error, bc->offset) != first_error));
 		}
 		del_timer_sync(&bc->timer);
 		kfree(bc);
-		if (atomic_dec_and_test(&bb->done))
-			complete(bb->wait);
+		if (atomic_long_dec_and_test(&batch->done))
+			blk_copy_batch_finish(batch);
 	}
 }
 
@@ -394,6 +417,18 @@ static unsigned blkdev_copy_merge(struct
}
 }
 
+struct bio_copy_completion {
+   struct completion wait;
+   int error;
+};
+
+static void bio_copy_sync_callback(void *ptr, int error)
+{
+	struct bio_copy_completion *comp = ptr;
+	comp->error = error;
+	complete(&comp->wait);
+}
+
 /**
  * blkdev_issue_copy - queue a copy same operation
  * @src_bdev:  source blockdev
@@ -408,69 +443,95 @@ static unsigned blkdev_copy_merge(struct
  */
 int blkdev_issue_copy(struct block_device *src_bdev, sector_t src_sector,
  struct block_device *dst_bdev, sector_t dst_sector,
- sector_t nr_sects, gfp_t gfp_mask, sector_t *copied)
+ sector_t nr_sects, gfp_t gfp_mask,
+ void (*callback)(void *, int), void *data,
+ sector_t *copied)
 {
DECLARE_COMPLETION_ONSTACK(wait);
struct request_queue *sq = bdev_get_queue(src_bdev);
struct request_queue *dq = bdev_get_queue(dst_bdev);
unsigned int max_copy_sectors;
-   struct bio_batch bb;
-   int ret = 0;
-   atomic64_t first_error = ATOMIC64_INIT(nr_sects);
-   sector_t offset = 0;
+   int ret;
+   struct

[PATCH 8/15] dm: introduce dm_ask_for_duplicate_bios

2014-07-15 Thread Mikulas Patocka
[ this isn't connected to XCOPY, but it is required for the following
device mapper patches to apply cleanly ]

This function can be used if the target needs to receive another duplicate
of the current bio.
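
For illustration only (need_second_copy and my_dev are placeholders, not
part of the patch), a target's map function could use it like this:

	/* Sketch: ask for one extra duplicate of the bio being mapped. */
	static int my_target_map(struct dm_target *ti, struct bio *bio)
	{
		if (bio_data_dir(bio) == WRITE && need_second_copy(ti, bio))
			dm_ask_for_duplicate_bios(bio, 1);

		bio->bi_bdev = my_dev(ti)->bdev;
		return DM_MAPIO_REMAPPED;
	}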

Signed-off-by: Mikulas Patocka <mpato...@redhat.com>

---
 drivers/md/dm.c   |   24 +++-
 include/linux/device-mapper.h |2 ++
 2 files changed, 21 insertions(+), 5 deletions(-)

Index: linux-3.16-rc5/drivers/md/dm.c
===
--- linux-3.16-rc5.orig/drivers/md/dm.c 2014-07-14 16:25:29.0 +0200
+++ linux-3.16-rc5/drivers/md/dm.c  2014-07-14 16:25:31.0 +0200
@@ -1161,9 +1161,9 @@ EXPORT_SYMBOL_GPL(dm_set_target_max_io_l
  *  to make it empty)
  * The target requires that region 3 is to be sent in the next bio.
  *
- * If the target wants to receive multiple copies of the bio (via num_*bios, etc),
- * the partially processed part (the sum of regions 1+2) must be the same for all
- * copies of the bio.
+ * If the target wants to receive multiple copies of the bio with num_*_bios or
+ * dm_ask_for_duplicate_bio, the partially processed part (the sum of regions
+ * 1+2) must be the same for all copies of the bio.
  */
 void dm_accept_partial_bio(struct bio *bio, unsigned n_sectors)
 {
@@ -1177,6 +1177,17 @@ void dm_accept_partial_bio(struct bio *b
 }
 EXPORT_SYMBOL_GPL(dm_accept_partial_bio);
 
+/*
+ * The target driver can call this function only from the map routine. The
+ * target driver requests that the dm sends more duplicates of the current bio.
+ */
+void dm_ask_for_duplicate_bios(struct bio *bio, unsigned n_duplicates)
+{
+	struct dm_target_io *tio = container_of(bio, struct dm_target_io, clone);
+	(*tio->num_bios) += n_duplicates;
+}
+EXPORT_SYMBOL_GPL(dm_ask_for_duplicate_bios);
+
 static void __map_bio(struct dm_target_io *tio)
 {
int r;
@@ -1267,12 +1278,14 @@ static struct dm_target_io *alloc_tio(st
 
 static void __clone_and_map_simple_bio(struct clone_info *ci,
   struct dm_target *ti,
-  unsigned target_bio_nr, unsigned *len)
+  unsigned target_bio_nr, unsigned *len,
+  unsigned *num_bios)
 {
 	struct dm_target_io *tio = alloc_tio(ci, ti, ci->bio->bi_max_vecs,
 					     target_bio_nr);
 	struct bio *clone = &tio->clone;
 
 	tio->len_ptr = len;
+	tio->num_bios = num_bios;
 
/*
 * Discard requests require the bio's inline iovecs be initialized.
@@ -1292,7 +1305,7 @@ static void __send_duplicate_bios(struct
unsigned target_bio_nr;
 
 	for (target_bio_nr = 0; target_bio_nr < num_bios; target_bio_nr++)
-		__clone_and_map_simple_bio(ci, ti, target_bio_nr, len);
+		__clone_and_map_simple_bio(ci, ti, target_bio_nr, len, num_bios);
 }
 
 static int __send_empty_flush(struct clone_info *ci)
@@ -1318,6 +1331,7 @@ static void __clone_and_map_data_bio(str
 	for (target_bio_nr = 0; target_bio_nr < num_target_bios; target_bio_nr++) {
 		tio = alloc_tio(ci, ti, 0, target_bio_nr);
 		tio->len_ptr = len;
+		tio->num_bios = &num_target_bios;
 		clone_bio(tio, bio, sector, *len);
__map_bio(tio);
}
Index: linux-3.16-rc5/include/linux/device-mapper.h
===
--- linux-3.16-rc5.orig/include/linux/device-mapper.h   2014-07-14 
16:25:29.0 +0200
+++ linux-3.16-rc5/include/linux/device-mapper.h2014-07-14 
16:25:31.0 +0200
@@ -271,6 +271,7 @@ struct dm_target_io {
struct dm_target *ti;
unsigned target_bio_nr;
unsigned *len_ptr;
+   unsigned *num_bios;
struct bio clone;
 };
 
@@ -382,6 +383,7 @@ struct gendisk *dm_disk(struct mapped_de
 int dm_suspended(struct dm_target *ti);
 int dm_noflush_suspending(struct dm_target *ti);
 void dm_accept_partial_bio(struct bio *bio, unsigned n_sectors);
+void dm_ask_for_duplicate_bios(struct bio *bio, unsigned n_duplicates);
 union map_info *dm_get_rq_mapinfo(struct request *rq);
 
 struct queue_limits *dm_get_queue_limits(struct mapped_device *md);



[PATCH 7/15] dm: remove num_write_bios

2014-07-15 Thread Mikulas Patocka
[ this isn't connected to XCOPY, but it is required for the following
device mapper patches to apply cleanly ]

The target can set the function num_write_bios - dm will issue this
callback to ask the target how many bios it wants to receive.

This was intended for the dm-cache target, but it is not usable due to a
race condition (see the description of
e2e74d617eadc15f601983270c4f4a6935c5a943). num_write_bios is unused, so we
remove it.

Note that we deliberately leave the for loop in __clone_and_map_data_bio -
it will be used in the next patch.

Signed-off-by: Mikulas Patocka <mpato...@redhat.com>

---
 drivers/md/dm.c   |6 --
 include/linux/device-mapper.h |   15 ---
 2 files changed, 21 deletions(-)

Index: linux-3.16-rc5/drivers/md/dm.c
===
--- linux-3.16-rc5.orig/drivers/md/dm.c 2014-07-14 16:25:01.0 +0200
+++ linux-3.16-rc5/drivers/md/dm.c  2014-07-14 16:25:29.0 +0200
@@ -1315,12 +1315,6 @@ static void __clone_and_map_data_bio(str
unsigned target_bio_nr;
unsigned num_target_bios = 1;
 
-	/*
-	 * Does the target want to receive duplicate copies of the bio?
-	 */
-	if (bio_data_dir(bio) == WRITE && ti->num_write_bios)
-		num_target_bios = ti->num_write_bios(ti, bio);
-
 	for (target_bio_nr = 0; target_bio_nr < num_target_bios; target_bio_nr++) {
tio = alloc_tio(ci, ti, 0, target_bio_nr);
tio-len_ptr = len;
Index: linux-3.16-rc5/include/linux/device-mapper.h
===
--- linux-3.16-rc5.orig/include/linux/device-mapper.h   2014-07-14 
16:25:07.0 +0200
+++ linux-3.16-rc5/include/linux/device-mapper.h2014-07-14 
16:25:29.0 +0200
@@ -184,14 +184,6 @@ struct target_type {
 #define DM_TARGET_IMMUTABLE0x0004
 #define dm_target_is_immutable(type)	((type)->features & DM_TARGET_IMMUTABLE)
 
-/*
- * Some targets need to be sent the same WRITE bio severals times so
- * that they can send copies of it to different devices.  This function
- * examines any supplied bio and returns the number of copies of it the
- * target requires.
- */
-typedef unsigned (*dm_num_write_bios_fn) (struct dm_target *ti, struct bio *bio);
-
 struct dm_target {
struct dm_table *table;
struct target_type *type;
@@ -231,13 +223,6 @@ struct dm_target {
 */
unsigned per_bio_data_size;
 
-   /*
-* If defined, this function is called to find out how many
-* duplicate bios should be sent to the target when writing
-* data.
-*/
-   dm_num_write_bios_fn num_write_bios;
-
/* target specific data */
void *private;
 


[PATCH 9/15] dm: implement copy

2014-07-15 Thread Mikulas Patocka
This patch implements basic copy support for device mapper core.
Individual targets can enable copy support by setting ti-copy_supported.

Device mapper device advertises copy support if at least one target
supports copy and for this target, at least one underlying device supports
copy.

Signed-off-by: Mikulas Patocka <mpato...@redhat.com>

---
 drivers/md/dm-table.c |9 +
 drivers/md/dm.c   |   42 +++---
 include/linux/device-mapper.h |5 +
 3 files changed, 53 insertions(+), 3 deletions(-)

Index: linux-3.16-rc5/drivers/md/dm.c
===
--- linux-3.16-rc5.orig/drivers/md/dm.c 2014-07-14 16:26:24.0 +0200
+++ linux-3.16-rc5/drivers/md/dm.c  2014-07-14 16:41:15.0 +0200
@@ -1403,6 +1403,31 @@ static int __send_write_same(struct clon
return __send_changing_extent_only(ci, get_num_write_same_bios, NULL);
 }
 
+static int __send_copy(struct clone_info *ci)
+{
+   struct dm_target *ti;
+   sector_t bound;
+
+	ti = dm_table_find_target(ci->map, ci->sector);
+	if (!dm_target_is_valid(ti))
+		return -EIO;
+
+	if (!ti->copy_supported)
+		return -EOPNOTSUPP;
+
+	bound = max_io_len(ci->sector, ti);
+
+	if (unlikely(ci->sector_count > bound))
+		return -EOPNOTSUPP;
+
+	__clone_and_map_simple_bio(ci, ti, 0, NULL, NULL);
+
+	ci->sector += ci->sector_count;
+	ci->sector_count = 0;
+
+   return 0;
+}
+
 /*
  * Select the correct strategy for processing a non-flush bio.
  */
@@ -1416,6 +1441,8 @@ static int __split_and_process_non_flush
return __send_discard(ci);
 	else if (unlikely(bio->bi_rw & REQ_WRITE_SAME))
 		return __send_write_same(ci);
+	else if (unlikely(bio->bi_rw & REQ_COPY))
+		return __send_copy(ci);
 
 	ti = dm_table_find_target(ci->map, ci->sector);
if (!dm_target_is_valid(ti))
@@ -1500,6 +1527,11 @@ static int dm_merge_bvec(struct request_
if (!dm_target_is_valid(ti))
goto out;
 
+	if (unlikely((bvm->bi_rw & REQ_COPY) != 0)) {
+		if (!ti->copy_supported)
+			goto out_ret_max_size;
+	}
+
/*
 * Find maximum amount of I/O that won't need splitting
 */
@@ -1523,17 +1555,21 @@ static int dm_merge_bvec(struct request_
 * entries.  So always set max_size to 0, and the code below allows
 * just one page.
 */
-	else if (queue_max_hw_sectors(q) <= PAGE_SIZE >> 9)
+	else if (likely(!(bvm->bi_rw & REQ_COPY)) &&
+		 queue_max_hw_sectors(q) <= PAGE_SIZE >> 9)
max_size = 0;
 
 out:
-	dm_put_live_table_fast(md);
 	/*
 	 * Always allow an entire first page
 	 */
-	if (max_size <= biovec->bv_len && !(bvm->bi_size >> SECTOR_SHIFT))
+	if (likely(!(bvm->bi_rw & REQ_COPY)) &&
+	    max_size <= biovec->bv_len && !(bvm->bi_size >> SECTOR_SHIFT))
 		max_size = biovec->bv_len;
 
+out_ret_max_size:
+   dm_put_live_table_fast(md);
+
return max_size;
 }
 
Index: linux-3.16-rc5/include/linux/device-mapper.h
===
--- linux-3.16-rc5.orig/include/linux/device-mapper.h   2014-07-14 
16:26:24.0 +0200
+++ linux-3.16-rc5/include/linux/device-mapper.h2014-07-14 
16:41:15.0 +0200
@@ -251,6 +251,11 @@ struct dm_target {
 * Set if this target does not return zeroes on discarded blocks.
 */
bool discard_zeroes_data_unsupported:1;
+
+   /*
+* Set if the target supports XCOPY.
+*/
+   bool copy_supported:1;
 };
 
 /* Each target can link one of these into the table */
Index: linux-3.16-rc5/drivers/md/dm-table.c
===
--- linux-3.16-rc5.orig/drivers/md/dm-table.c   2014-07-14 16:26:25.0 
+0200
+++ linux-3.16-rc5/drivers/md/dm-table.c2014-07-14 16:41:15.0 
+0200
@@ -489,6 +489,11 @@ static int dm_set_device_limits(struct d
 		       q->limits.alignment_offset,
 		       (unsigned long long) start << SECTOR_SHIFT);
 
+	if (ti->copy_supported)
+		limits->max_copy_sectors =
+			min_not_zero(limits->max_copy_sectors,
+				bdev_get_queue(bdev)->limits.max_copy_sectors);
+
/*
 * Check if merge fn is supported.
 * If not we'll force DM to use PAGE_SIZE or
@@ -1298,6 +1303,10 @@ combine_limits:
 		       dm_device_name(table->md),
 		       (unsigned long long) ti->begin,
 		       (unsigned long long) ti->len);
+
+	limits->max_copy_sectors =
+		min_not_zero(limits->max_copy_sectors

[PATCH 10/15] dm linear: support copy

2014-07-15 Thread Mikulas Patocka
Support copy operation in the linear target.

Signed-off-by: Mikulas Patocka <mpato...@redhat.com>

---
 drivers/md/dm-linear.c |1 +
 1 file changed, 1 insertion(+)

Index: linux-3.16-rc4/drivers/md/dm-linear.c
===
--- linux-3.16-rc4.orig/drivers/md/dm-linear.c  2014-07-11 22:20:27.0 
+0200
+++ linux-3.16-rc4/drivers/md/dm-linear.c   2014-07-11 22:22:20.0 
+0200
@@ -56,6 +56,7 @@ static int linear_ctr(struct dm_target *
 	ti->num_flush_bios = 1;
 	ti->num_discard_bios = 1;
 	ti->num_write_same_bios = 1;
+	ti->copy_supported = 1;
 	ti->private = lc;
return 0;
 



[PATCH 11/15] dm stripe: support copy

2014-07-15 Thread Mikulas Patocka
Support the copy operation for the stripe target.

In stripe_merge, we verify that the underlying device supports copy. If it
doesn't, we can fail fast without any bio being constructed.

Signed-off-by: Mikulas Patocka <mpato...@redhat.com>

---
 drivers/md/dm-stripe.c |   11 ++-
 1 file changed, 10 insertions(+), 1 deletion(-)

Index: linux-3.16-rc4/drivers/md/dm-stripe.c
===
--- linux-3.16-rc4.orig/drivers/md/dm-stripe.c  2014-07-11 22:20:25.0 
+0200
+++ linux-3.16-rc4/drivers/md/dm-stripe.c   2014-07-11 22:23:54.0 
+0200
@@ -165,6 +165,7 @@ static int stripe_ctr(struct dm_target *
 	ti->num_flush_bios = stripes;
 	ti->num_discard_bios = stripes;
 	ti->num_write_same_bios = stripes;
+	ti->copy_supported = 1;
 
 	sc->chunk_size = chunk_size;
 	if (chunk_size & (chunk_size - 1))
@@ -416,11 +417,19 @@ static int stripe_merge(struct dm_target
 	struct stripe_c *sc = ti->private;
 	sector_t bvm_sector = bvm->bi_sector;
 	uint32_t stripe;
+	struct block_device *bdev;
 	struct request_queue *q;
 
 	stripe_map_sector(sc, bvm_sector, &stripe, &bvm_sector);
 
-	q = bdev_get_queue(sc->stripe[stripe].dev->bdev);
+	bdev = sc->stripe[stripe].dev->bdev;
+
+	if (unlikely((bvm->bi_rw & REQ_COPY) != 0)) {
+		if (!bdev_copy_offload(bdev))
+			return 0;
+	}
+
+	q = bdev_get_queue(bdev);
 	if (!q->merge_bvec_fn)
 		return max_size;
 



[PATCH 13/15] dm kcopyd: support copy offload

2014-07-15 Thread Mikulas Patocka
This patch adds copy offload support to dm-kcopyd. If copy offload fails,
copying is performed using dm-io, just like before.

There is a module parameter copy_offload that can be set to enable or
disable this feature. It can be used to test performance of copy offload.

Signed-off-by: Mikulas Patocka <mpato...@redhat.com>

---
 drivers/md/dm-kcopyd.c |   38 +-
 1 file changed, 37 insertions(+), 1 deletion(-)

Index: linux-3.16-rc5/drivers/md/dm-kcopyd.c
===
--- linux-3.16-rc5.orig/drivers/md/dm-kcopyd.c  2014-07-15 16:18:15.0 
+0200
+++ linux-3.16-rc5/drivers/md/dm-kcopyd.c   2014-07-15 19:20:34.0 
+0200
@@ -96,6 +96,9 @@ static DEFINE_SPINLOCK(throttle_spinlock
  */
 #define MAX_SLEEPS 10
 
+static bool copy_offload = true;
+module_param(copy_offload, bool, S_IRUGO | S_IWUSR);
+
 static void io_job_start(struct dm_kcopyd_throttle *t)
 {
unsigned throttle, now, difference;
@@ -358,6 +361,8 @@ struct kcopyd_job {
sector_t progress;
 
struct kcopyd_job *master_job;
+
+   struct work_struct copy_work;
 };
 
 static struct kmem_cache *_job_cache;
@@ -709,6 +714,31 @@ static void submit_job(struct kcopyd_job
}
 }
 
+static void copy_offload_work(struct work_struct *work)
+{
+	struct kcopyd_job *job = container_of(work, struct kcopyd_job, copy_work);
+	sector_t copied;
+
+	blkdev_issue_copy(job->source.bdev, job->source.sector,
+			  job->dests[0].bdev, job->dests[0].sector,
+			  job->source.count,
+			  GFP_NOIO | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN,
+			  NULL, NULL, &copied);
+
+	job->source.sector += copied;
+	job->source.count -= copied;
+	job->dests[0].sector += copied;
+	job->dests[0].count -= copied;
+
+	submit_job(job);
+}
+
+static void try_copy_offload(struct kcopyd_job *job)
+{
+	INIT_WORK(&job->copy_work, copy_offload_work);
+	queue_work(job->kc->kcopyd_wq, &job->copy_work);
+}
+
 int dm_kcopyd_copy(struct dm_kcopyd_client *kc, struct dm_io_region *from,
   unsigned int num_dests, struct dm_io_region *dests,
   unsigned int flags, dm_kcopyd_notify_fn fn, void *context)
@@ -757,7 +787,13 @@ int dm_kcopyd_copy(struct dm_kcopyd_clie
 	job->context = context;
 	job->master_job = job;
 
-	submit_job(job);
+	if (copy_offload && num_dests == 1 &&
+	    bdev_copy_offload(job->source.bdev) &&
+	    bdev_copy_offload(job->dests[0].bdev)) {
+		try_copy_offload(job);
+	} else {
+		submit_job(job);
+	}
 
return 0;
 }



[PATCH 14/15] dm kcopyd: change mutex to spinlock

2014-07-15 Thread Mikulas Patocka
job-lock is only taken for a finite amount of time and the process
doesn't block while holding it, so change it from mutex to spinlock.

This change is needed for the next patch that makes it possible to call
segment_complete from an interrupt. Taking mutexes inside an interrupt is
not allowed.

Signed-off-by: Mikulas Patocka <mpato...@redhat.com>

---
 drivers/md/dm-kcopyd.c |   10 +-
 1 file changed, 5 insertions(+), 5 deletions(-)

Index: linux-3.16-rc5/drivers/md/dm-kcopyd.c
===
--- linux-3.16-rc5.orig/drivers/md/dm-kcopyd.c  2014-07-15 19:20:34.0 
+0200
+++ linux-3.16-rc5/drivers/md/dm-kcopyd.c   2014-07-15 19:24:20.0 
+0200
@@ -21,7 +21,7 @@
 #include <linux/slab.h>
 #include <linux/vmalloc.h>
 #include <linux/workqueue.h>
-#include <linux/mutex.h>
+#include <linux/spinlock.h>
 #include <linux/delay.h>
 #include <linux/device-mapper.h>
 #include <linux/dm-kcopyd.h>
@@ -356,7 +356,7 @@ struct kcopyd_job {
 * These fields are only used if the job has been split
 * into more manageable parts.
 */
-   struct mutex lock;
+   spinlock_t lock;
atomic_t sub_jobs;
sector_t progress;
 
@@ -629,7 +629,7 @@ static void segment_complete(int read_er
 	struct kcopyd_job *job = sub_job->master_job;
 	struct dm_kcopyd_client *kc = job->kc;
 
-	mutex_lock(&job->lock);
+	spin_lock(&job->lock);
 
 	/* update the error */
 	if (read_err)
@@ -653,7 +653,7 @@ static void segment_complete(int read_er
 			job->progress += count;
 		}
 	}
-	mutex_unlock(&job->lock);
+	spin_unlock(&job->lock);
 
 	if (count) {
 		int i;
@@ -708,7 +708,7 @@ static void submit_job(struct kcopyd_job
 	if (job->source.count <= SUB_JOB_SIZE)
 		dispatch_job(job);
 	else {
-		mutex_init(&job->lock);
+		spin_lock_init(&job->lock);
 		job->progress = 0;
 		split_job(job);
 	}
}



[PATCH 12/15] dm kcopyd: introduce the function submit_job

2014-07-15 Thread Mikulas Patocka
We move some code into a new function, submit_job. It is needed by the next
patch, which calls submit_job from another place.

Signed-off-by: Mikulas Patocka <mpato...@redhat.com>

---
 drivers/md/dm-kcopyd.c |   19 ---
 1 file changed, 12 insertions(+), 7 deletions(-)

Index: linux-3.16-rc5/drivers/md/dm-kcopyd.c
===
--- linux-3.16-rc5.orig/drivers/md/dm-kcopyd.c  2014-07-14 16:45:23.0 
+0200
+++ linux-3.16-rc5/drivers/md/dm-kcopyd.c   2014-07-14 17:28:36.0 
+0200
@@ -698,6 +698,17 @@ static void split_job(struct kcopyd_job 
}
 }
 
+static void submit_job(struct kcopyd_job *job)
+{
+	if (job->source.count <= SUB_JOB_SIZE)
+		dispatch_job(job);
+	else {
+		mutex_init(&job->lock);
+		job->progress = 0;
+		split_job(job);
+	}
+}
+
 int dm_kcopyd_copy(struct dm_kcopyd_client *kc, struct dm_io_region *from,
   unsigned int num_dests, struct dm_io_region *dests,
   unsigned int flags, dm_kcopyd_notify_fn fn, void *context)
@@ -746,13 +757,7 @@ int dm_kcopyd_copy(struct dm_kcopyd_clie
 	job->context = context;
 	job->master_job = job;
 
-	if (job->source.count <= SUB_JOB_SIZE)
-		dispatch_job(job);
-	else {
-		mutex_init(&job->lock);
-		job->progress = 0;
-		split_job(job);
-	}
+	submit_job(job);
 
return 0;
 }



[PATCH 15/15] dm kcopyd: call copy offload with asynchronous callback

2014-07-15 Thread Mikulas Patocka
Change dm kcopyd so that it calls blkdev_issue_copy with an asynchronous
callback. There can be a large number of pending kcopyd requests and holding
a process context for each of them may put too much load on the workqueue
subsystem.

This patch changes it so that blkdev_issue_copy returns after it has
submitted the requests, and copy_offload_callback is called when the copy
operation finishes.

Signed-off-by: Mikulas Patocka <mpato...@redhat.com>

---
 drivers/md/dm-kcopyd.c |   33 ++---
 1 file changed, 14 insertions(+), 19 deletions(-)

Index: linux-3.16-rc5/drivers/md/dm-kcopyd.c
===
--- linux-3.16-rc5.orig/drivers/md/dm-kcopyd.c  2014-07-15 19:24:20.0 
+0200
+++ linux-3.16-rc5/drivers/md/dm-kcopyd.c   2014-07-15 19:24:54.0 
+0200
@@ -361,8 +361,6 @@ struct kcopyd_job {
sector_t progress;
 
struct kcopyd_job *master_job;
-
-   struct work_struct copy_work;
 };
 
 static struct kmem_cache *_job_cache;
@@ -628,8 +626,9 @@ static void segment_complete(int read_er
struct kcopyd_job *sub_job = (struct kcopyd_job *) context;
 	struct kcopyd_job *job = sub_job->master_job;
 	struct dm_kcopyd_client *kc = job->kc;
+	unsigned long flags;
 
-	spin_lock(&job->lock);
+	spin_lock_irqsave(&job->lock, flags);
 
/* update the error */
if (read_err)
@@ -653,7 +652,7 @@ static void segment_complete(int read_er
 			job->progress += count;
 		}
 	}
-	spin_unlock(&job->lock);
+	spin_unlock_irqrestore(&job->lock, flags);
 
if (count) {
int i;
@@ -714,29 +713,25 @@ static void submit_job(struct kcopyd_job
}
 }
 
-static void copy_offload_work(struct work_struct *work)
+static void copy_offload_callback(void *ptr, int error)
 {
-	struct kcopyd_job *job = container_of(work, struct kcopyd_job, copy_work);
-	sector_t copied;
+	struct kcopyd_job *job = ptr;
 
-	blkdev_issue_copy(job->source.bdev, job->source.sector,
-			  job->dests[0].bdev, job->dests[0].sector,
-			  job->source.count,
-			  GFP_NOIO | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN,
-			  NULL, NULL, &copied);
-
-	job->source.sector += copied;
-	job->source.count -= copied;
-	job->dests[0].sector += copied;
-	job->dests[0].count -= copied;
+	job->source.sector += job->progress;
+	job->source.count -= job->progress;
+	job->dests[0].sector += job->progress;
+	job->dests[0].count -= job->progress;
 
 	submit_job(job);
 }
 
 static void try_copy_offload(struct kcopyd_job *job)
 {
-	INIT_WORK(&job->copy_work, copy_offload_work);
-	queue_work(job->kc->kcopyd_wq, &job->copy_work);
+	blkdev_issue_copy(job->source.bdev, job->source.sector,
+			  job->dests[0].bdev, job->dests[0].sector,
+			  job->source.count,
+			  GFP_NOIO | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN,
+			  copy_offload_callback, job, &job->progress);
 }
 
 int dm_kcopyd_copy(struct dm_kcopyd_client *kc, struct dm_io_region *from,



Crash on WRITE SAME

2014-07-02 Thread Mikulas Patocka
Hi Sagi

Your commit d77e65350f2d82dfa0557707d505711f5a43c8fd causes a crash on the
SCSI WRITE SAME command (it can be triggered by issuing the BLKZEROOUT
ioctl). The crash happens in iscsi_tcp_segment_done because sg_next
returns NULL.

Before that commit, there was this code in iscsi_prep_scsi_cmd_pdu:
	unsigned out_len = scsi_out(sc)->length;
after the commit, there is this:
	transfer_length = scsi_transfer_length(sc);
... and scsi_transfer_length(struct scsi_cmnd *scmd) returns
blk_rq_bytes(scmd->request);

The problem is this: suppose that we have a WRITE SAME command that writes
two 512-byte sectors. blk_rq_bytes(scmd->request) returns 1024 (because
1024 bytes are written to the disk), but scsi_out(sc)->length contains the
value 512 (because only 512 bytes are transferred as data for the SCSI
command).

Your patch changes that from 512 to 1024 and it causes the crash in 
iscsi_tcp_segment_done due to mismatching size.



I'm not exactly sure how to fix this bug in order to not break something 
else.

I'd like to know what was your intention for the function 
scsi_transfer_length? Is it supposed to return the size of data 
transferred on the SCSI bus? What should it return for bidirectional 
commands?

scsi_transfer_length tries to add 8 for each transferred sector depending
on prot_op. What should it do with commands that transfer only part of a
sector? (for example the UNMAP command only transfers 24 bytes of data).
Should it add 8 for a partially transferred sector or not?

Mikulas


Re: Crash on WRITE SAME

2014-07-02 Thread Mikulas Patocka


On Wed, 2 Jul 2014, Christoph Hellwig wrote:

> On Wed, Jul 02, 2014 at 02:05:14PM -0400, Mikulas Patocka wrote:
> > Hi Sagi
> > 
> > Your commit d77e65350f2d82dfa0557707d505711f5a43c8fd causes a crash on SCSI
> > WRITE SAME command (it can be triggered by issuing the BLKZEROOUT ioctl).
> > The crash happens in iscsi_tcp_segment_done because sg_next returns NULL.
> 
> Martin already fixed this over a week ago, we're just waiting for James
> to send a pull request to Linus.
> 
> Get this fix from the core-for-3.16 branch of
> git.infradead.org/users/hch/scsi-queue.git for now.

I see.

And what about protection information for commands that transfer partial 
sectors? (for example, UNMAP transfers 24 bytes). Should
scsi_transfer_length return 24 or 32 in this case?

Mikulas


Re: [PATCH] target: fix deadlock on unload

2014-06-27 Thread Mikulas Patocka


On Thu, 26 Jun 2014, Nicholas A. Bellinger wrote:

> Hi Mikulas,
> 
> On Mon, 2014-06-23 at 13:42 -0400, Mikulas Patocka wrote:
> > target: fix deadlock on unload
> > 
> > On uniprocessor preemptible kernel, target core deadlocks on unload. The
> > following events happen:
> > * iscsit_del_np is called
> > * it calls send_sig(SIGINT, np->np_thread, 1);
> > * the scheduler switches to the np_thread
> > * the np_thread is woken up, it sees that kthread_should_stop() returns
> >   false, so it doesn't terminate
> > * the np_thread clears signals with flush_signals(current); and goes back
> >   to sleep in iscsit_accept_np
> > * the scheduler switches back to iscsit_del_np
> > * iscsit_del_np calls kthread_stop(np->np_thread);
> > * the np_thread is waiting in iscsit_accept_np and it doesn't respond to
> >   kthread_stop
> > 
> > The deadlock could be resolved if the administrator sends SIGINT signal to
> > the np_thread with "killall -INT iscsi_np"
> > 
> > The reproducible deadlock was introduced in commit
> > db6077fd0b7dd41dc6ff18329cec979379071f87, but the thread-stopping code was
> > racy even before.
> > 
> > This patch fixes the problem. Using kthread_should_stop to stop the
> > np_thread is unreliable, so we test np_thread_state instead. If
> > np_thread_state equals ISCSI_NP_THREAD_SHUTDOWN, the thread exits.
> > 
> > Signed-off-by: Mikulas Patocka <mpato...@redhat.com>
> > Cc: sta...@vger.kernel.org
> > 
> 
> Apologies for the delayed response..
> 
> Applied to target-pending/master and including in the next -rc3 PULL
> request.
> 
> Also FYI, I've added '3.12+' to the stable tag to match how far back
> commit db6077fd0 has been included in stable.
> 
> Thanks,
> 
> --nab

Hi

I think db6077fd0 should be backported to stable kernels beginning with 
3.10 (because they set np->np_thread = NULL in
__iscsi_target_login_thread). The current 3.10-stable branch misses this
patch.


This patch for unload deadlock should be backported to all stable kernels 
(because unload is racy there), but because of different code, we should 
make a different patch for old stable branches.

For example in 3.4.95, __iscsi_target_login_thread contains this code:
	spin_lock_bh(&np->np_thread_lock);
	if (np->np_thread_state == ISCSI_NP_THREAD_RESET) {
		np->np_thread_state = ISCSI_NP_THREAD_ACTIVE;
		complete(&np->np_restart_comp);
	} else {
		np->np_thread_state = ISCSI_NP_THREAD_ACTIVE;
	}
	spin_unlock_bh(&np->np_thread_lock);
If the state is ISCSI_NP_THREAD_SHUTDOWN, the above piece of code will 
change it to ISCSI_NP_THREAD_ACTIVE and open the same kthread_should_stop 
race.

Mikulas


[PATCH] target: fix deadlock on unload

2014-06-23 Thread Mikulas Patocka
target: fix deadlock on unload

On uniprocessor preemptible kernel, target core deadlocks on unload. The
following events happen:
* iscsit_del_np is called
* it calls send_sig(SIGINT, np->np_thread, 1);
* the scheduler switches to the np_thread
* the np_thread is woken up, it sees that kthread_should_stop() returns
  false, so it doesn't terminate
* the np_thread clears signals with flush_signals(current); and goes back
  to sleep in iscsit_accept_np
* the scheduler switches back to iscsit_del_np
* iscsit_del_np calls kthread_stop(np->np_thread);
* the np_thread is waiting in iscsit_accept_np and it doesn't respond to
  kthread_stop

The deadlock could be resolved if the administrator sends SIGINT signal to
the np_thread with "killall -INT iscsi_np"

The reproducible deadlock was introduced in commit
db6077fd0b7dd41dc6ff18329cec979379071f87, but the thread-stopping code was
racy even before.

This patch fixes the problem. Using kthread_should_stop to stop the
np_thread is unreliable, so we test np_thread_state instead. If
np_thread_state equals ISCSI_NP_THREAD_SHUTDOWN, the thread exits.

Signed-off-by: Mikulas Patocka <mpato...@redhat.com>
Cc: sta...@vger.kernel.org

---
 drivers/target/iscsi/iscsi_target_login.c |   13 +++--
 1 file changed, 7 insertions(+), 6 deletions(-)

Index: linux-3.14.8/drivers/target/iscsi/iscsi_target_login.c
===
--- linux-3.14.8.orig/drivers/target/iscsi/iscsi_target_login.c 2014-06-19 
00:10:44.634113370 +0200
+++ linux-3.14.8/drivers/target/iscsi/iscsi_target_login.c  2014-06-23 
19:05:44.887437402 +0200
@@ -1196,7 +1196,7 @@ old_sess_out:
 static int __iscsi_target_login_thread(struct iscsi_np *np)
 {
u8 *buffer, zero_tsih = 0;
-   int ret = 0, rc, stop;
+   int ret = 0, rc;
struct iscsi_conn *conn = NULL;
struct iscsi_login *login;
struct iscsi_portal_group *tpg = NULL;
@@ -1210,6 +1210,9 @@ static int __iscsi_target_login_thread(s
 	if (np->np_thread_state == ISCSI_NP_THREAD_RESET) {
 		np->np_thread_state = ISCSI_NP_THREAD_ACTIVE;
 		complete(&np->np_restart_comp);
+	} else if (np->np_thread_state == ISCSI_NP_THREAD_SHUTDOWN) {
+		spin_unlock_bh(&np->np_thread_lock);
+		goto exit;
 	} else {
 		np->np_thread_state = ISCSI_NP_THREAD_ACTIVE;
}
@@ -1402,10 +1405,8 @@ old_sess_out:
}
 
 out:
-	stop = kthread_should_stop();
-	/* Wait for another socket.. */
-	if (!stop)
-		return 1;
+	return 1;
+
 exit:
 	iscsi_stop_login_thread_timer(np);
 	spin_lock_bh(&np->np_thread_lock);
@@ -1422,7 +1423,7 @@ int iscsi_target_login_thread(void *arg)
 
allow_signal(SIGINT);
 
-   while (!kthread_should_stop()) {
+   while (1) {
ret = __iscsi_target_login_thread(np);
/*
 * We break and exit here unless another sock_accept() call


[PATCH] target: fix memory leak on XCOPY

2014-05-17 Thread Mikulas Patocka
On each processed XCOPY command, two kmalloc-512 memory objects are
leaked. These represent two allocations of struct xcopy_pt_cmd in
target_core_xcopy.c.

The reason for the memory leak is that the cmd_kref field is not
initialized (thus, it is zero because the allocations were done with
kzalloc). When we decrement zero kref in target_put_sess_cmd, the result
is not zero, thus target_release_cmd_kref is not called.

This patch fixes the bug by moving kref initialization from
target_get_sess_cmd to transport_init_se_cmd (this function is called from
target_core_xcopy.c, so it will correctly initialize cmd_kref). It can be
easily verified that all code that calls target_get_sess_cmd also calls
transport_init_se_cmd earlier, thus moving kref_init shouldn't introduce
any new problems.

Signed-off-by: Mikulas Patocka <mpato...@redhat.com>
Cc: sta...@vger.kernel.org  # 3.12+

---
 drivers/target/target_core_transport.c |2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

Index: linux-3.15-rc5/drivers/target/target_core_transport.c
===
--- linux-3.15-rc5.orig/drivers/target/target_core_transport.c  2014-04-14 
16:08:15.0 +0200
+++ linux-3.15-rc5/drivers/target/target_core_transport.c   2014-05-16 
18:24:57.0 +0200
@@ -1113,6 +1113,7 @@ void transport_init_se_cmd(
 	init_completion(&cmd->cmd_wait_comp);
 	init_completion(&cmd->task_stop_comp);
 	spin_lock_init(&cmd->t_state_lock);
+	kref_init(&cmd->cmd_kref);
 	cmd->transport_state = CMD_T_DEV_ACTIVE;
 
 	cmd->se_tfo = tfo;
@@ -2357,7 +2358,6 @@ int target_get_sess_cmd(struct se_sessio
 	unsigned long flags;
 	int ret = 0;
 
-	kref_init(&se_cmd->cmd_kref);
/*
 * Add a second kref if the fabric caller is expecting to handle
 * fabric acknowledgement that requires two target_put_sess_cmd()


[PATCH] kref: warn on uninitialized kref

2014-05-17 Thread Mikulas Patocka
I found a memory leak in iSCSI target that was caused by kref initialized
to zero (the memory object was allocated with kzalloc, kref_init was not
called and kref_put_spinlock_irqsave was called which changed 0 to -1
and didn't free the object).

Similar bugs may exist in other kernel areas, so I submit this patch that
adds a check to kref.h. If the value is zero or negative, we can assume
that it is uninitialized and we warn about it.
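
A minimal sketch of the bug pattern this catches (illustrative only; foo
and foo_release are placeholders):

	struct foo {
		struct kref kref;
	};

	struct foo *f = kzalloc(sizeof(*f), GFP_KERNEL);
	/* kref_init(&f->kref) forgotten here: refcount stays at 0 */
	kref_put(&f->kref, foo_release);
	/* refcount goes 0 -> -1, foo_release is never called and the
	 * object leaks; with this patch the put triggers WARN_ON_ONCE
	 */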

Signed-off-by: Mikulas Patocka <mpato...@redhat.com>

---
 include/linux/kref.h |4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

Index: linux-3.15-rc5/include/linux/kref.h
===
--- linux-3.15-rc5.orig/include/linux/kref.h2013-07-02 22:23:38.0 
+0200
+++ linux-3.15-rc5/include/linux/kref.h 2014-05-16 18:56:18.0 +0200
@@ -69,7 +69,7 @@ static inline int kref_sub(struct kref *
 void (*release)(struct kref *kref))
 {
 	WARN_ON(release == NULL);
-
+	WARN_ON_ONCE(atomic_read(&kref->refcount) <= 0);
 	if (atomic_sub_and_test((int) count, &kref->refcount)) {
 		release(kref);
 		return 1;
@@ -119,6 +119,7 @@ static inline int kref_put_spinlock_irqs
 	unsigned long flags;
 
 	WARN_ON(release == NULL);
+	WARN_ON_ONCE(atomic_read(&kref->refcount) <= 0);
 	if (atomic_add_unless(&kref->refcount, -1, 1))
 		return 0;
 	spin_lock_irqsave(lock, flags);
@@ -136,6 +137,7 @@ static inline int kref_put_mutex(struct 
 			     struct mutex *lock)
 {
 	WARN_ON(release == NULL);
+	WARN_ON_ONCE(atomic_read(&kref->refcount) <= 0);
 	if (unlikely(!atomic_add_unless(&kref->refcount, -1, 1))) {
 		mutex_lock(lock);
 		if (unlikely(!atomic_dec_and_test(&kref->refcount))) {


Re: [PATCH] kref: warn on uninitialized kref

2014-05-17 Thread Mikulas Patocka


On Sat, 17 May 2014, Mateusz Guzik wrote:

 On Sat, May 17, 2014 at 06:53:17AM -0400, Mikulas Patocka wrote:
  I found a memory leak in iSCSI target that was caused by kref initialized
  to zero (the memory object was allocated with kzalloc, kref_init was not
  called and kref_put_spinlock_irqsave was called which changed 0 to -1
  and didn't free the object).
  
  Similar bugs may exist in other kernel areas, so I submit this patch that
  adds a check to kref.h. If the value is zero or negative, we can assume
  that it is uninitialized and we warn about it.
  
  Signed-off-by: Mikulas Patocka mpato...@redhat.com
  
  ---
   include/linux/kref.h |4 +++-
   1 file changed, 3 insertions(+), 1 deletion(-)
  
  Index: linux-3.15-rc5/include/linux/kref.h
  ===
  --- linux-3.15-rc5.orig/include/linux/kref.h2013-07-02 
  22:23:38.0 +0200
  +++ linux-3.15-rc5/include/linux/kref.h 2014-05-16 18:56:18.0 
  +0200
  @@ -69,7 +69,7 @@ static inline int kref_sub(struct kref *
   void (*release)(struct kref *kref))
   {
  WARN_ON(release == NULL);
  -
  +   WARN_ON_ONCE(atomic_read(kref-refcount) = 0);
  if (atomic_sub_and_test((int) count, kref-refcount)) {
  release(kref);
  return 1;
  @@ -119,6 +119,7 @@ static inline int kref_put_spinlock_irqs
  unsigned long flags;
   
  WARN_ON(release == NULL);
  +   WARN_ON_ONCE(atomic_read(kref-refcount) = 0);
  if (atomic_add_unless(kref-refcount, -1, 1))
  return 0;
  spin_lock_irqsave(lock, flags);
  @@ -136,6 +137,7 @@ static inline int kref_put_mutex(struct 
   struct mutex *lock)
   {
  WARN_ON(release == NULL);
  +	WARN_ON_ONCE(atomic_read(&kref->refcount) <= 0);
  	if (unlikely(!atomic_add_unless(&kref->refcount, -1, 1))) {
  		mutex_lock(lock);
  		if (unlikely(!atomic_dec_and_test(&kref->refcount))) {
 
 This has a side effect of detecting some overputs, which is nice.
 
 However, could be made better if kref_sub checked that refs you want to
 take don't put the count below 0.
 
 i.e.
 WARN_ON(count > atomic_read(&kref->refcount));
 
 this also detects your original problem.
 
 -- 
 Mateusz Guzik

Good point. I'm sending the updated patch.

Mikulas


[PATCH v2] kref: warn on uninitialized kref

2014-05-17 Thread Mikulas Patocka
I found a memory leak in iSCSI target that was caused by kref initialized
to zero (the memory object was allocated with kzalloc, kref_init was not
called and kref_put_spinlock_irqsave was called which changed 0 to -1
and didn't free the object).

Similar bugs may exist in other kernel areas, so I submit this patch that
adds a check to kref.h. If the value is zero or negative, we can assume
that it is uninitialized and we warn about it.

Signed-off-by: Mikulas Patocka mpato...@redhat.com

---
 include/linux/kref.h |4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

Index: linux-3.15-rc5/include/linux/kref.h
===
--- linux-3.15-rc5.orig/include/linux/kref.h2014-05-16 19:00:18.0 
+0200
+++ linux-3.15-rc5/include/linux/kref.h 2014-05-17 13:19:31.0 +0200
@@ -69,7 +69,7 @@ static inline int kref_sub(struct kref *
 void (*release)(struct kref *kref))
 {
WARN_ON(release == NULL);
-
+	WARN_ON_ONCE(atomic_read(&kref->refcount) < (int)count);
	if (atomic_sub_and_test((int) count, &kref->refcount)) {
release(kref);
return 1;
@@ -119,6 +119,7 @@ static inline int kref_put_spinlock_irqs
unsigned long flags;
 
WARN_ON(release == NULL);
+	WARN_ON_ONCE(atomic_read(&kref->refcount) <= 0);
	if (atomic_add_unless(&kref->refcount, -1, 1))
return 0;
spin_lock_irqsave(lock, flags);
@@ -136,6 +137,7 @@ static inline int kref_put_mutex(struct 
 struct mutex *lock)
 {
WARN_ON(release == NULL);
+	WARN_ON_ONCE(atomic_read(&kref->refcount) <= 0);
	if (unlikely(!atomic_add_unless(&kref->refcount, -1, 1))) {
		mutex_lock(lock);
		if (unlikely(!atomic_dec_and_test(&kref->refcount))) {



Re: [PATCH v2] kref: warn on uninitialized kref

2014-05-17 Thread Mikulas Patocka


On Sat, 17 May 2014, Bart Van Assche wrote:

 On 05/17/14 14:38, Mikulas Patocka wrote:
  I found a memory leak in iSCSI target that was caused by kref initialized
  to zero (the memory object was allocated with kzalloc, kref_init was not
  called and kref_put_spinlock_irqsave was called which changed 0 to -1
  and didn't free the object).
  
  Similar bugs may exist in other kernel areas, so I submit this patch that
  adds a check to kref.h. If the value is zero or negative, we can assume
  that it is uninitialized and we warn about it.
  
  Signed-off-by: Mikulas Patocka mpato...@redhat.com
  
  ---
   include/linux/kref.h |4 +++-
   1 file changed, 3 insertions(+), 1 deletion(-)
  
  Index: linux-3.15-rc5/include/linux/kref.h
  ===
  --- linux-3.15-rc5.orig/include/linux/kref.h2014-05-16 
  19:00:18.0 +0200
  +++ linux-3.15-rc5/include/linux/kref.h 2014-05-17 13:19:31.0 
  +0200
  @@ -69,7 +69,7 @@ static inline int kref_sub(struct kref *
   void (*release)(struct kref *kref))
   {
  WARN_ON(release == NULL);
  -
  +	WARN_ON_ONCE(atomic_read(&kref->refcount) < (int)count);
  	if (atomic_sub_and_test((int) count, &kref->refcount)) {
  release(kref);
  return 1;
  @@ -119,6 +119,7 @@ static inline int kref_put_spinlock_irqs
  unsigned long flags;
   
  WARN_ON(release == NULL);
  +	WARN_ON_ONCE(atomic_read(&kref->refcount) <= 0);
  	if (atomic_add_unless(&kref->refcount, -1, 1))
  return 0;
  spin_lock_irqsave(lock, flags);
  @@ -136,6 +137,7 @@ static inline int kref_put_mutex(struct 
   struct mutex *lock)
   {
  WARN_ON(release == NULL);
  +	WARN_ON_ONCE(atomic_read(&kref->refcount) <= 0);
  	if (unlikely(!atomic_add_unless(&kref->refcount, -1, 1))) {
  		mutex_lock(lock);
  		if (unlikely(!atomic_dec_and_test(&kref->refcount))) {
 
 This patch adds two conditional branches and one atomic read to
 kref_sub(). What is the performance impact of this patch on kernel code
 that uses kref_put() in the hot path ? Has it been considered to enable
 the newly added code only if a CONFIG_DEBUG_* macro has been set ?
 
 Bart.

The atomic modification takes several tens of ticks, while the atomic read 
and compare is just one tick. I don't expect any performance problem with 
this.

BTW. if we talk about performance - what about replacing:

if (atomic_dec_and_test(&variable)) {
... release(object);
}

with this:

if (atomic_read(&variable) == 1 || atomic_dec_and_test(&variable)) {
barrier();
... release(object);
}

It avoids the heavy atomic instruction if there is just one reference. Is 
there any problem with this? At least on x86 we could do this always 
(there is no read reordering in hardware, so barrier() is sufficient to 
prevent reads from being reordered with atomic_read). On the architectures 
that reorder reads, we could do it only if the release method doesn't 
contain any reads of the object being released.
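
For clarity, the proposal as a self-contained helper (hypothetical, not an 
existing kernel API; it assumes the ordering constraints discussed above):

/* Put a reference, skipping the atomic RMW when we hold the last one. */
static inline void fast_put(atomic_t *refcount,
			    void (*release)(void *), void *object)
{
	if (atomic_read(refcount) == 1 || atomic_dec_and_test(refcount)) {
		barrier();	/* keep reads in release() after the check */
		release(object);
	}
}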

Mikulas


Re: [PATCH] block devices: validate block device capacity

2014-02-03 Thread Mikulas Patocka


On Mon, 3 Feb 2014, Christoph Hellwig wrote:

 On Fri, Jan 31, 2014 at 03:20:17AM -0500, Mikulas Patocka wrote:
  So if you think you can support 16TiB devices and leave pgoff_t 32-bit, 
  send a patch that does it.
  
  Until you make it, you should apply the patch that I sent, that prevents 
  kernel lockups or data corruption when the user uses 16TiB device on 
  32-bit kernel.
 
 Exactly.  I had actually looked into support for >16TiB devices for
 a NAS use case a while ago, but when explaining the effort involved
 the idea was dropped quickly.  The Linux block device is too deeply
 tied to the pagecache to make it easily feasible.

The memory management routines use pgoff_t, so we could define pgoff_t to 
be 64-bit type. But there is lib/radix_tree.c that uses unsigned long as 
an index into the radix tree - and pgoff_t is cast to unsigned long when 
calling the radix_tree routines - so we'd need to change lib/radix_tree to 
use pgoff_t.

Then, there may be other places where pgoff_t is cast to unsigned long and 
they are not trivial to find (one could enable some extra compiler 
warnings about truncating values when casting them, but I suppose, this 
would trigger a lot of false positives). This needs some deep review by 
people who designed the memory management code.

Mikulas


Re: [PATCH] block devices: validate block device capacity

2014-01-31 Thread Mikulas Patocka


On Thu, 30 Jan 2014, James Bottomley wrote:

  So, if you want 64-bit page offsets, you need to increase pgoff_t size, 
  and that will increase the limit for both files and block devices.
 
 No.  The point is the page cache mapping of the device uses a
 manufactured inode saved in the backing device. It looks fixable in the
 buffer code before the page cache gets involved.

So if you think you can support 16TiB devices and leave pgoff_t 32-bit, 
send a patch that does it.

Until you make it, you should apply the patch that I sent, that prevents 
kernel lockups or data corruption when the user uses 16TiB device on 
32-bit kernel.

Mikulas


[PATCH] block devices: validate block device capacity

2014-01-30 Thread Mikulas Patocka
When running the LVM2 testsuite on 32-bit kernel, there are unkillable
processes stuck in the kernel consuming 100% CPU:
blkid   R running  0  2005   1409 0x0004
ce009d00 0082 ffcf c11280ba 0060 560b5dfd 3111 00fe41cb
 ce009d00  d51cfeb0  001e 0002 
0002 c10748c1 0002 c106cca4    
Call Trace:
[<c11280ba>] ? radix_tree_next_chunk+0xda/0x2c0
[<c10748c1>] ? release_pages+0x61/0x160
[<c106cca4>] ? find_get_pages+0x84/0x100
[<c1251fbe>] ? _cond_resched+0x1e/0x40
[<c10758cb>] ? truncate_inode_pages_range+0x12b/0x440
[<c1075cb7>] ? truncate_inode_pages+0x17/0x20
[<c10cf2ba>] ? __blkdev_put+0x3a/0x140
[<c10d02db>] ? blkdev_close+0x1b/0x40
[<c10a60b2>] ? __fput+0x72/0x1c0
[<c1039461>] ? task_work_run+0x61/0xa0
[<c1253b6f>] ? work_notifysig+0x24/0x35

This is caused by the fact that the LVM2 testsuite creates a 64TB device.
The kernel uses unsigned long to index pages in files and block devices;
on a 64TB device, unsigned long overflows (it can address up to 16TB with
4k pages), causing the infinite loop.

On 32-bit architectures, we must limit block device size to
PAGE_SIZE*(2^32-1).

The bug with unvalidated device size is pervasive across the whole kernel: 
some drivers test that the device size fits in sector_t, but this test is 
not sufficient on 32-bit architectures. This patch introduces a new 
function validate_disk_capacity that tests if the disk capacity is OK for 
the current kernel and modifies the drivers brd, ide-gd, dm, sd to use it.

Signed-off-by: Mikulas Patocka mpato...@redhat.com

---
 block/genhd.c |   23 +++
 drivers/block/brd.c   |   15 +++
 drivers/ide/ide-gd.c  |8 
 drivers/md/dm-ioctl.c |3 +--
 drivers/md/dm-table.c |   14 +-
 drivers/scsi/sd.c |   20 +++-
 include/linux/device-mapper.h |2 +-
 include/linux/genhd.h |2 ++
 8 files changed, 70 insertions(+), 17 deletions(-)

Index: linux-2.6-compile/block/genhd.c
===
--- linux-2.6-compile.orig/block/genhd.c2014-01-30 17:23:15.0 
+0100
+++ linux-2.6-compile/block/genhd.c 2014-01-30 19:28:42.0 +0100
@@ -1835,3 +1835,26 @@ static void disk_release_events(struct g
	WARN_ON_ONCE(disk->ev && disk->ev->block != 1);
	kfree(disk->ev);
 }
+
+int validate_disk_capacity(u64 n_sectors, const char **reason)
+{
+   u64 n_pages;
+	if (n_sectors << 9 >> 9 != n_sectors) {
+		if (reason)
+			*reason = "The number of bytes is greater than 2^64.";
+		return -EOVERFLOW;
+	}
+	n_pages = (n_sectors + (1 << (PAGE_SHIFT - 9)) - 1) >> (PAGE_SHIFT - 9);
+	if (n_pages > ULONG_MAX) {
+		if (reason)
+			*reason = "Use 64-bit kernel.";
+		return -EFBIG;
+	}
+	if (n_sectors != (sector_t)n_sectors) {
+		if (reason)
+			*reason = "Use a kernel compiled with support for large block devices.";
+   return -ENOSPC;
+   }
+   return 0;
+}
+EXPORT_SYMBOL(validate_disk_capacity);
Index: linux-2.6-compile/drivers/block/brd.c
===
--- linux-2.6-compile.orig/drivers/block/brd.c  2014-01-30 17:23:15.0 
+0100
+++ linux-2.6-compile/drivers/block/brd.c   2014-01-30 19:26:51.0 
+0100
@@ -429,12 +429,12 @@ static const struct block_device_operati
  * And now the modules code and kernel interface.
  */
 static int rd_nr;
-int rd_size = CONFIG_BLK_DEV_RAM_SIZE;
+static unsigned rd_size = CONFIG_BLK_DEV_RAM_SIZE;
 static int max_part;
 static int part_shift;
 module_param(rd_nr, int, S_IRUGO);
 MODULE_PARM_DESC(rd_nr, "Maximum number of brd devices");
-module_param(rd_size, int, S_IRUGO);
+module_param(rd_size, uint, S_IRUGO);
 MODULE_PARM_DESC(rd_size, "Size of each RAM disk in kbytes.");
 module_param(max_part, int, S_IRUGO);
 MODULE_PARM_DESC(max_part, "Maximum number of partitions per RAM disk");
@@ -446,7 +446,7 @@ MODULE_ALIAS("rd");
 /* Legacy boot options - nonmodular */
 static int __init ramdisk_size(char *str)
 {
-   rd_size = simple_strtol(str, NULL, 0);
+   rd_size = simple_strtoul(str, NULL, 0);
return 1;
 }
 __setup("ramdisk_size=", ramdisk_size);
@@ -463,6 +463,13 @@ static struct brd_device *brd_alloc(int
 {
struct brd_device *brd;
struct gendisk *disk;
+   u64 capacity = (u64)rd_size * 2;
+   const char *reason;
+
+	if (validate_disk_capacity(capacity, &reason)) {
+		printk(KERN_ERR "brd: disk is too big: %s\n", reason);
+		goto out;
+   }
 
brd = kzalloc(sizeof(*brd), GFP_KERNEL);
if (!brd)
@@ -493,7 +500,7 @@ static struct brd_device *brd_alloc(int
	disk->queue = brd->brd_queue;
	disk->flags

Re: [PATCH] block devices: validate block device capacity

2014-01-30 Thread Mikulas Patocka


On Thu, 30 Jan 2014, James Bottomley wrote:

 Why is this?  the whole reason for CONFIG_LBDAF is supposed to be to
 allow 64 bit offsets for block devices on 32 bit.  It sounds like
 there's somewhere not using sector_t ... or using it wrongly which needs
 fixing.

The page cache uses unsigned long as a page index. Therefore, if unsigned 
long is 32-bit, the block device may have at most 2^32-1 pages.

  On 32-bit architectures, we must limit block device size to
  PAGE_SIZE*(2^32-1).
 
 So you're saying CONFIG_LBDAF can never work, why?
 
 James

CONFIG_LBDAF works, but it doesn't allow unlimited capacity: on x86, 
without CONFIG_LBDAF, the limit is 2TiB. With CONFIG_LBDAF, the limit is 
16TiB (4096*2^32).
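
The arithmetic behind those two limits, as a small userspace sketch:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	/* 32-bit sector_t: 2^32 sectors x 512 bytes = 2 TiB  */
	uint64_t no_lbdaf = (uint64_t)1 << 32 << 9;
	/* 32-bit pgoff_t: 2^32 pages x 4096 bytes = 16 TiB */
	uint64_t lbdaf = (uint64_t)1 << 32 << 12;

	printf("%llu TiB, %llu TiB\n",
	       (unsigned long long)(no_lbdaf >> 40),
	       (unsigned long long)(lbdaf >> 40)); /* prints: 2 TiB, 16 TiB */
	return 0;
}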

Mikulas


Re: [PATCH] block devices: validate block device capacity

2014-01-30 Thread Mikulas Patocka


On Thu, 30 Jan 2014, James Bottomley wrote:

 On Thu, 2014-01-30 at 18:10 -0500, Mikulas Patocka wrote:
  
  On Thu, 30 Jan 2014, James Bottomley wrote:
  
   Why is this?  the whole reason for CONFIG_LBDAF is supposed to be to
   allow 64 bit offsets for block devices on 32 bit.  It sounds like
   there's somewhere not using sector_t ... or using it wrongly which needs
   fixing.
  
  The page cache uses unsigned long as a page index. Therefore, if unsigned 
  long is 32-bit, the block device may have at most 2^32-1 pages.
 
 Um, that's the index into the mapping, not the device; a device can have
 multiple mappings and each mapping has a radix tree of pages.  For most
 filesystems a mapping is equivalent to a file, so we can have large
 filesystems, but they can't have files over actually 4GB on 32 bits
 otherwise mmap fails.

A device may be accessed directly (by opening /dev/sdX) and it creates a 
mapping too - thus, the size of a mapping limits the size of a block 
device.

The main problem is that pgoff_t has 4 bytes - changing it to 8 bytes may 
fix it - but there may be some hidden places where pgoff_t is converted to 
unsigned long - who knows if they exist or not?

 Are we running into a problems with struct address_space where we've
 assumed the inode belongs to the file and lvm is doing something where
 it's the whole device?

lvm creates a 64TiB device, udev runs blkid on that device and blkid opens 
the device and gets stuck because of unsigned long overflow.

On 32-bit architectures, we must limit block device size to
PAGE_SIZE*(2^32-1).
   
   So you're saying CONFIG_LBDAF can never work, why?
   
   James
  
  CONFIG_LBDAF works, but it doesn't allow unlimited capacity: on x86, 
  without CONFIG_LBDAF, the limit is 2TiB. With CONFIG_LBDAF, the limit is 
  16TiB (4096*2^32).
 
 I don't think the people who did the large block device work expected to
 gain only 3 bits for all their pain.
 
 James

One could change it to have three choices:
2TiB limit - 32-bit sector_t and 32-bit pgoff_t
16TiB limit - 64-bit sector_t and 32-bit pgoff_t
32PiB limit - 64-bit sector_t and 64-bit pgoff_t

Though, we need to know if the people who designed memory management agree 
with changing pgoff_t to 64 bits.

Mikulas


Re: [PATCH] block devices: validate block device capacity

2014-01-30 Thread Mikulas Patocka


On Thu, 30 Jan 2014, James Bottomley wrote:

  A device may be accessed direcly (by opening /dev/sdX) and it creates a 
  mapping too - thus, the size of a mapping limits the size of a block 
  device.
 
 Right, that's what I suspected below.  We can't damage large block
 support on filesystems just because of this corner case.

Devices larger than 16TiB never worked on 32-bit kernel, so this patch 
isn't damaging anything.

Note that if you attach a >16TiB block device, don't open it and mount it, 
it still won't work, because the buffer cache uses the page cache (see the 
function __find_get_block_slow and the variable pgoff_t index - that 
variable would overflow if the filesystem accessed a buffer beyond 16TiB).
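
A minimal userspace model of that overflow (pgoff_t compressed to a fixed 
32-bit type purely for illustration):

#include <stdio.h>
#include <stdint.h>

typedef uint64_t sector_t;	/* CONFIG_LBDAF: 64-bit block numbers */
typedef uint32_t pgoff_t;	/* models unsigned long on a 32-bit kernel */

int main(void)
{
	int blkbits = 12, page_shift = 12;	/* 4k blocks, 4k pages */
	sector_t block = (sector_t)1 << 32;	/* first 4k block past 16 TiB */
	pgoff_t index = block >> (page_shift - blkbits);

	/* index wraps to 0 - the lookup would hit the wrong page */
	printf("index = %u\n", (unsigned)index);
	return 0;
}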

  The main problem is that pgoff_t has 4 bytes - chaning it to 8 bytes may 
  fix it - but there may be some hidden places where pgoff is converted to 
  unsigned long - who knows, if they exist or not?
 
 I don't think we want to do that ... it will make struct page fatter and
 have knock on impacts in the radix tree code.  To fix this, we need to
 make the corner case (i.e. opening large block devices without a
 filesystem) bear the pain.  It sort of looks like we want to do a linear
 array of mappings of 64TB for the device so the page cache calculations
 don't overflow.

The code that reads and writes data to block devices and files is shared - 
the functions in mm/filemap.c work for both files and block devices.

So, if you want 64-bit page offsets, you need to increase pgoff_t size, 
and that will increase the limit for both files and block devices.

You shouldn't have separate functions for managing pages on files and 
separate functions for managing pages on block devices - that would 
increase code size and cause maintenance problems.

  Though, we need to know if the people who designed memory management agree 
  with changing pgoff_t to 64 bits.
 
 I don't think we can change the size of pgoff_t ... because it won't
 just be that, it will be other problems like the radix tree.

If we can't change it, then we must stay with the current 16TiB limit. 
There's no other way.

 However, you also have to bear in mind that truncating large block
 device support to <64TB on 32 bits is a technical ABI break.  Hopefully
 it is only technical because I don't know of any current consumer block
 device that is >64TB yet, but anyone who'd created a filesystem >64TB
 would find it no-longer mounted on 32 bits.
 James

It is not an ABI break, because block devices larger than 16TiB never worked 
on 32-bit architectures. So it's better to refuse them outright than to 
cause subtle lockups or data corruption.

Mikulas


[PATCH] sg: fix integer overflow

2014-01-24 Thread Mikulas Patocka
On alpha, USER_HZ may be higher than HZ. This results in integer overflow
in MULDIV.

Signed-off-by: Mikulas Patocka mpato...@redhat.com
Cc: sta...@vger.kernel.org

---
 drivers/scsi/sg.c |2 ++
 1 file changed, 2 insertions(+)

Index: linux-3.13/drivers/scsi/sg.c
===
--- linux-3.13.orig/drivers/scsi/sg.c   2014-01-20 21:44:02.0 +0100
+++ linux-3.13/drivers/scsi/sg.c2014-01-24 17:28:02.0 +0100
@@ -856,8 +856,10 @@ sg_ioctl(struct file *filp, unsigned int
return result;
	if (val < 0)
		return -EIO;
+#if USER_HZ > HZ
	if (val >= MULDIV (INT_MAX, USER_HZ, HZ))
		val = MULDIV (INT_MAX, USER_HZ, HZ);
+#endif
	sfp->timeout_user = val;
	sfp->timeout = MULDIV (val, HZ, USER_HZ);
 


Re: [PATCH] sg: fix integer overflow

2014-01-24 Thread Mikulas Patocka


On Fri, 24 Jan 2014, James Bottomley wrote:

 On Fri, 2014-01-24 at 11:34 -0500, Mikulas Patocka wrote:
  On alpha, USER_HZ may be higher than HZ. This results in integer overflow
  in MULDIV.
  
  Signed-off-by: Mikulas Patocka mpato...@redhat.com
  Cc: sta...@vger.kernel.org
  
  ---
   drivers/scsi/sg.c |2 ++
   1 file changed, 2 insertions(+)
  
  Index: linux-3.13/drivers/scsi/sg.c
  ===
  --- linux-3.13.orig/drivers/scsi/sg.c   2014-01-20 21:44:02.0 
  +0100
  +++ linux-3.13/drivers/scsi/sg.c2014-01-24 17:28:02.0 +0100
  @@ -856,8 +856,10 @@ sg_ioctl(struct file *filp, unsigned int
  return result;
  	if (val < 0)
  		return -EIO;
  +#if USER_HZ > HZ
  	if (val >= MULDIV (INT_MAX, USER_HZ, HZ))
  		val = MULDIV (INT_MAX, USER_HZ, HZ);
  +#endif
 
 Is this really a problem worth fixing?  Is USER_HZ ever greater than HZ?

Yes. The integer overflow results in an unpredictable value and the timeout 
is limited to that value.

  	sfp->timeout_user = val;
 
 If it can happen, this needs fixing as well:
 
  	sfp->timeout = MULDIV (val, HZ, USER_HZ);

This line is correct. Note that here the arguments HZ and USER_HZ are 
swapped. Here, you divide val by USER_HZ and multiply it by HZ, so there 
is no overflow if USER_HZ > HZ. There could be overflow if USER_HZ < HZ, 
but the above if condition avoids that.

 any val that would divide to less than 1 will now set the timeout to
 zero, which will cause the queue default timeout to be applied instead.
 
 I'd fix it in the macro rather than having code peppered with #ifs

You can't easily fix it in the macro: MULDIV (INT_MAX, USER_HZ, HZ) is an 
undefined value if USER_HZ > HZ, it is not specified what the macro should 
return in this case.
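
A userspace sketch of the overflow (the HZ/USER_HZ values are made up, 
chosen so that USER_HZ > HZ as on alpha):

#include <stdio.h>
#include <limits.h>

#define MULDIV(X,MUL,DIV) ((((X % DIV) * MUL) / DIV) + ((X / DIV) * MUL))

int main(void)
{
	long long user_hz = 1024, hz = 100;
	/* the exact value of MULDIV(INT_MAX, user_hz, hz), computed in
	   64 bits: it does not fit in int, so the int macro overflows */
	long long exact = MULDIV((long long)INT_MAX, user_hz, hz);

	printf("exact = %lld, INT_MAX = %d\n", exact, INT_MAX);
	return 0;
}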

 James

Mikulas


Re: [PATCH] sg: fix integer overflow

2014-01-24 Thread Mikulas Patocka


On Fri, 24 Jan 2014, Mikulas Patocka wrote:

 
 
 On Fri, 24 Jan 2014, James Bottomley wrote:
 
  On Fri, 2014-01-24 at 11:34 -0500, Mikulas Patocka wrote:
   On alpha, USER_HZ may be higher than HZ. This results in integer overflow
   in MULDIV.
   
   Signed-off-by: Mikulas Patocka mpato...@redhat.com
   Cc: sta...@vger.kernel.org
   
   ---
drivers/scsi/sg.c |2 ++
1 file changed, 2 insertions(+)
   
   Index: linux-3.13/drivers/scsi/sg.c
   ===
   --- linux-3.13.orig/drivers/scsi/sg.c 2014-01-20 21:44:02.0 
   +0100
   +++ linux-3.13/drivers/scsi/sg.c  2014-01-24 17:28:02.0 +0100
   @@ -856,8 +856,10 @@ sg_ioctl(struct file *filp, unsigned int
 return result;
  	if (val < 0)
  		return -EIO;
  +#if USER_HZ > HZ
  	if (val >= MULDIV (INT_MAX, USER_HZ, HZ))
  		val = MULDIV (INT_MAX, USER_HZ, HZ);
  +#endif
  
  Is this really a problem worth fixing?  Is USER_HZ ever greater than HZ?
 
 Yes. The integer overflow results in an unpredictable value and the timeout 
 is limited to that value.
 
 	sfp->timeout_user = val;
  
  If it can happen, this needs fixing as well:
  
 	sfp->timeout = MULDIV (val, HZ, USER_HZ);
 
 This line is correct. Note that here the arguments HZ and USER_HZ are 
 swapped. Here, you divide val by USER_HZ and multiply it by HZ, so there 
 is no overflow if USER_HZ > HZ. There could be overflow if USER_HZ < HZ, 
 but the above if condition avoids that.
 
  any val that would divide to less than 1 will now set the timeout to
  zero, which will cause the queue default timeout to be applied instead.
  
  I'd fix it in the macro rather than having code peppered with #ifs
 
 You can't easily fix it in the macro: MULDIV (INT_MAX, USER_HZ, HZ) is an 
 undefined value if USER_HZ > HZ, it is not specified what the macro should 
 return in this case.
 
  James
 
 Mikulas

BTW. if you want to refactor the code to keep #ifs out, you can use this. It 
does the same thing.



From: Mikulas Patocka mpato...@redhat.com
Subject: [PATCH] sg: fix integer overflow

On alpha, USER_HZ may be higher than HZ. This results in integer overflow 
in MULDIV.

Signed-off-by: Mikulas Patocka mpato...@redhat.com
Cc: sta...@vger.kernel.org


---
 drivers/scsi/sg.c |   10 --
 1 file changed, 8 insertions(+), 2 deletions(-)

Index: linux-3.13/drivers/scsi/sg.c
===
--- linux-3.13.orig/drivers/scsi/sg.c   2014-01-24 18:21:49.0 +0100
+++ linux-3.13/drivers/scsi/sg.c2014-01-24 18:24:38.0 +0100
@@ -85,6 +85,12 @@ static void sg_proc_cleanup(void);
  */
 #define MULDIV(X,MUL,DIV) ((((X % DIV) * MUL) / DIV) + ((X / DIV) * MUL))
 
+#if USER_HZ > HZ
+#define MAX_USER_TIMEOUT MULDIV (INT_MAX, USER_HZ, HZ))
+#else
+#define MAX_USER_TIMEOUT INT_MAX
+#endif
+
 #define SG_DEFAULT_TIMEOUT MULDIV(SG_DEFAULT_TIMEOUT_USER, HZ, USER_HZ)
 
 int sg_big_buff = SG_DEF_RESERVED_SIZE;
@@ -856,8 +862,8 @@ sg_ioctl(struct file *filp, unsigned int
return result;
	if (val < 0)
		return -EIO;
-	if (val >= MULDIV (INT_MAX, USER_HZ, HZ))
-		val = MULDIV (INT_MAX, USER_HZ, HZ);
+	if (val >= MAX_USER_TIMEOUT)
+		val = MAX_USER_TIMEOUT;
	sfp->timeout_user = val;
	sfp->timeout = MULDIV (val, HZ, USER_HZ);
 


Re: [PATCH] sg: fix integer overflow (fwd)

2014-01-24 Thread Mikulas Patocka


On Fri, 24 Jan 2014, Mikulas Patocka wrote:

 
 
 On Fri, 24 Jan 2014, James Bottomley wrote:
 
  On Fri, 2014-01-24 at 11:34 -0500, Mikulas Patocka wrote:
   On alpha, USER_HZ may be higher than HZ. This results in integer overflow
   in MULDIV.
   
   Signed-off-by: Mikulas Patocka mpato...@redhat.com
   Cc: sta...@vger.kernel.org
   
   ---
drivers/scsi/sg.c |2 ++
1 file changed, 2 insertions(+)
   
   Index: linux-3.13/drivers/scsi/sg.c
   ===
   --- linux-3.13.orig/drivers/scsi/sg.c 2014-01-20 21:44:02.0 
   +0100
   +++ linux-3.13/drivers/scsi/sg.c  2014-01-24 17:28:02.0 +0100
   @@ -856,8 +856,10 @@ sg_ioctl(struct file *filp, unsigned int
 return result;
  	if (val < 0)
  		return -EIO;
  +#if USER_HZ > HZ
  	if (val >= MULDIV (INT_MAX, USER_HZ, HZ))
  		val = MULDIV (INT_MAX, USER_HZ, HZ);
  +#endif
  
  Is this really a problem worth fixing?  Is USER_HZ ever greater than HZ?
 
 Yes. The integer overflow results in an unpredictable value and the timeout 
 is limited to that value.
 
 	sfp->timeout_user = val;
  
  If it can happen, this needs fixing as well:
  
 	sfp->timeout = MULDIV (val, HZ, USER_HZ);
 
 This line is correct. Note that here the arguments HZ and USER_HZ are 
 swapped. Here, you divide val by USER_HZ and multiply it by HZ, so there 
 is no overflow if USER_HZ > HZ. There could be overflow if USER_HZ < HZ, 
 but the above if condition avoids that.
 
  any val that would divide to less than 1 will now set the timeout to
  zero, which will cause the queue default timeout to be applied instead.
  
  I'd fix it in the macro rather than having code peppered with #ifs
 
 You can't easily fix it in the macro: MULDIV (INT_MAX, USER_HZ, HZ) is an 
 undefined value if USER_HZ > HZ, it is not specified what the macro should 
 return in this case.
 
  James
 
 Mikulas

 BTW. if you want to refactor the code to keep #ifs out, you can use this. 
 It does the same thing.

Oh, I had an extra parenthesis in the definition of MAX_USER_TIMEOUT. So I'm 
sending the patch again, corrected.



From: Mikulas Patocka mpato...@redhat.com
Subject: [PATCH v2] sg: fix integer overflow

On alpha, USER_HZ may be higher than HZ. This results in integer overflow 
in MULDIV.

Signed-off-by: Mikulas Patocka mpato...@redhat.com
Cc: sta...@vger.kernel.org


---
 drivers/scsi/sg.c |   10 --
 1 file changed, 8 insertions(+), 2 deletions(-)

Index: linux-3.13/drivers/scsi/sg.c
===
--- linux-3.13.orig/drivers/scsi/sg.c   2014-01-24 18:21:49.0 +0100
+++ linux-3.13/drivers/scsi/sg.c2014-01-24 18:24:38.0 +0100
@@ -85,6 +85,12 @@ static void sg_proc_cleanup(void);
  */
 #define MULDIV(X,MUL,DIV) ((((X % DIV) * MUL) / DIV) + ((X / DIV) * MUL))
 
+#if USER_HZ > HZ
+#define MAX_USER_TIMEOUT MULDIV (INT_MAX, USER_HZ, HZ)
+#else
+#define MAX_USER_TIMEOUT INT_MAX
+#endif
+
 #define SG_DEFAULT_TIMEOUT MULDIV(SG_DEFAULT_TIMEOUT_USER, HZ, USER_HZ)
 
 int sg_big_buff = SG_DEF_RESERVED_SIZE;
@@ -856,8 +862,8 @@ sg_ioctl(struct file *filp, unsigned int
return result;
	if (val < 0)
		return -EIO;
-	if (val >= MULDIV (INT_MAX, USER_HZ, HZ))
-		val = MULDIV (INT_MAX, USER_HZ, HZ);
+	if (val >= MAX_USER_TIMEOUT)
+		val = MAX_USER_TIMEOUT;
	sfp->timeout_user = val;
	sfp->timeout = MULDIV (val, HZ, USER_HZ);
 


[PATCH] sym53c8xx_2: Set DID_REQUEUE return code when aborting squeue.

2014-01-23 Thread Mikulas Patocka
When the controller encounters an error (including QUEUE FULL or BUSY 
status), it aborts all not yet submitted requests in the function 
sym_dequeue_from_squeue.

This function aborts them with DID_SOFT_ERROR.

If the disk has a full tag queue, the request that caused the overflow is 
aborted with QUEUE FULL status (and the scsi midlayer properly retries it 
until it is accepted by the disk), but other requests are aborted with 
DID_SOFT_ERROR --- for them, the midlayer does just a few retries and then 
signals the error up to sd.

The result is that a disk returning QUEUE FULL causes request failures.

The error was reproduced on 53c895 with a COMPAQ BD03685A24 disk (rebranded 
ST336607LC) with a command queue of 48 or 64 tags. The disk has 64 tags, but 
under some access patterns it returns QUEUE FULL when there are fewer than 
64 pending tags. The SCSI specification allows returning QUEUE FULL 
at any time and it is up to the host to retry.

Signed-off-by: Mikulas Patocka mpato...@redhat.com
Cc: sta...@vger.kernel.org

---
 drivers/scsi/sym53c8xx_2/sym_hipd.c |4 
 1 file changed, 4 insertions(+)

Index: linux-2.6.36-rc5-fast/drivers/scsi/sym53c8xx_2/sym_hipd.c
===
--- linux-2.6.36-rc5-fast.orig/drivers/scsi/sym53c8xx_2/sym_hipd.c  
2010-09-27 10:25:59.0 +0200
+++ linux-2.6.36-rc5-fast/drivers/scsi/sym53c8xx_2/sym_hipd.c   2010-09-27 
10:26:27.0 +0200
@@ -3000,7 +3000,11 @@ sym_dequeue_from_squeue(struct sym_hcb *
		if ((target == -1 || cp->target == target) &&
		    (lun    == -1 || cp->lun    == lun)    &&
		    (task   == -1 || cp->tag    == task)) {
+#ifdef SYM_OPT_HANDLE_DEVICE_QUEUEING
			sym_set_cam_status(cp->cmd, DID_SOFT_ERROR);
+#else
+			sym_set_cam_status(cp->cmd, DID_REQUEUE);
+#endif
			sym_remque(&cp->link_ccbq);
			sym_insque_tail(&cp->link_ccbq, &np->comp_ccbq);
}


[PATCH 4/5] ia64 simscsi: fix race condition and simplify the code

2014-01-23 Thread Mikulas Patocka
The simscsi driver processes the requests in the request routine and then
offloads the completion callback to a tasklet. This is buggy because there
is parallel unsynchronized access to the completion queue from the request
routine and from the tasklet.

With the current SCSI architecture, requests can be completed directly from
the request routine. So I removed the tasklet code.

Signed-off-by: Mikulas Patocka mpato...@redhat.com

---
 arch/ia64/hp/sim/simscsi.c |   34 ++
 1 file changed, 2 insertions(+), 32 deletions(-)

Index: linux-2.6-ia64/arch/ia64/hp/sim/simscsi.c
===
--- linux-2.6-ia64.orig/arch/ia64/hp/sim/simscsi.c  2014-01-24 
01:23:08.0 +0100
+++ linux-2.6-ia64/arch/ia64/hp/sim/simscsi.c   2014-01-24 01:26:16.0 
+0100
@@ -47,9 +47,6 @@
 
 static struct Scsi_Host *host;
 
-static void simscsi_interrupt (unsigned long val);
-static DECLARE_TASKLET(simscsi_tasklet, simscsi_interrupt, 0);
-
 struct disk_req {
unsigned long addr;
unsigned len;
@@ -64,13 +61,6 @@ static int desc[16] = {
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1
 };
 
-static struct queue_entry {
-   struct scsi_cmnd *sc;
-} queue[SIMSCSI_REQ_QUEUE_LEN];
-
-static int rd, wr;
-static atomic_t num_reqs = ATOMIC_INIT(0);
-
 /* base name for default disks */
 static char *simscsi_root = DEFAULT_SIMSCSI_ROOT;
 
@@ -95,21 +85,6 @@ simscsi_setup (char *s)
 
 __setup(simscsi=, simscsi_setup);
 
-static void
-simscsi_interrupt (unsigned long val)
-{
-   struct scsi_cmnd *sc;
-
-   while ((sc = queue[rd].sc) != NULL) {
-		atomic_dec(&num_reqs);
-		queue[rd].sc = NULL;
-		if (DBG)
-			printk("simscsi_interrupt: done with %ld\n",
-			       sc->serial_number);
-		(*sc->scsi_done)(sc);
-   rd = (rd + 1) % SIMSCSI_REQ_QUEUE_LEN;
-   }
-}
-
 static int
 simscsi_biosparam (struct scsi_device *sdev, struct block_device *n,
sector_t capacity, int ip[])
@@ -315,14 +290,9 @@ simscsi_queuecommand_lck (struct scsi_cm
		sc->sense_buffer[0] = 0x70;
		sc->sense_buffer[2] = 0x00;
}
-	if (atomic_read(&num_reqs) >= SIMSCSI_REQ_QUEUE_LEN) {
-		panic("Attempt to queue command while command is pending!!");
-	}
-	atomic_inc(&num_reqs);
-	queue[wr].sc = sc;
-	wr = (wr + 1) % SIMSCSI_REQ_QUEUE_LEN;
 
-	tasklet_schedule(&simscsi_tasklet);
+	(*sc->scsi_done)(sc);
+
return 0;
 }
 



Re: [dm-devel] SCSI's heuristics for enabling WRITE SAME still need work [was: dm mpath: disable WRITE SAME if it fails]

2013-09-24 Thread Mikulas Patocka


On Fri, 20 Sep 2013, Mike Snitzer wrote:

 On Thu, Sep 19 2013 at 12:13pm -0400,
 Mike Snitzer snit...@redhat.com wrote:
 
  Workaround the SCSI layer's problematic WRITE SAME heuristics by
  disabling WRITE SAME in the DM multipath device's queue_limits if an
  underlying device disabled it.
 
 ...
 
  This fix doesn't help configurations that have additional devices
  stacked ontop of the mpath device (e.g. LVM created linear DM devices
  ontop).  A proper fix that restacks all the queue_limits from the bottom
  of the device stack up will need to be explored if SCSI will continue to
  use this model of optimistically allowing op codes and then disabling
  them after they fail for the first time.

Propagating the limits up the tree won't work anyway because of a race 
condition. Think of this case:

1. We test queue limits and find out that the device supports WRITE_SAME
2. We build a WRITE_SAME bio to be submitted.
3. The user reconfigures a device so that it no longer supports WRITE_SAME 
(for example, he adds a snapshot or converts linear volume to a mirror).
4. We submit the bio that we built in step 2.
5. The bio fails.


I think the correct solution is to specify that a WRITE SAME request may 
fail at any time and that the caller is responsible for retrying with a 
normal WRITE request if WRITE SAME fails.

If the caller needs to send WRITE SAME, it should try to send it and if it 
fails, set a flag that the device doesn't support WRITE SAME and not send 
WRITE SAME in the future. (we could also reset that flag after some long 
time, in case the device was reconfigured to support WRITE SAME).
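
A sketch of that caller-side policy (the no-WRITE-SAME flag and the helper 
are hypothetical; the two blkdev_issue_* helpers are the 3.x block-layer 
ones):

/* Try WRITE SAME once; on failure remember it and fall back to WRITE. */
static int zero_sectors(struct block_device *bdev, sector_t sector,
			sector_t nr_sects, unsigned long *no_write_same)
{
	if (!test_bit(0, no_write_same)) {
		if (!blkdev_issue_write_same(bdev, sector, nr_sects,
					     GFP_NOIO, ZERO_PAGE(0)))
			return 0;
		set_bit(0, no_write_same);	/* don't try again */
	}
	/* plain WRITEs of zeroed pages always work */
	return blkdev_issue_zeroout(bdev, sector, nr_sects, GFP_NOIO);
}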


 I really don't think we can afford to keep SCSI's current heuristics for
 enabling WRITE SAME, re-stating them for the benefit of others:
 1) check if WRITE SAME is supported by sending REPORT SUPPORTED
OPERATION CODES to the device
 2a) if REPORT SUPPORTED OPERATION CODES shows WRITE SAME is supported,
 enable WRITE SAME
 2b) if REPORT SUPPORTED OPERATION CODES shows WRITE SAME is not
 supported, disable WRITE SAME
 2c) if REPORT SUPPORTED OPERATION CODES isn't supported _and_ the device
 doesn't have an ATA Information VPD page: enable WRITE SAME
 - if/when WRITE SAME does fail, disable it on the failing device
 
 AFAIK the reason for these heuristics is: devices that do support WRITE
 SAME cannot properly report as much because they don't support REPORT
 SUPPORTED OPERATION CODES -- this lack of RSOC support is apparently
 very common?
 
 I can appreciate the idea behind the current heuristics but I think the
 prevelence of the other side of the spectrum (SCSI devices that don't
 support RSOC or WRITE SAME) was underestimated.  As such we're seeing a
 fair amount of WRITE SAME error noise on these systems -- that noise is
 itself considered a bug to many.
 
 Also, please see my comment in a related Fedora 19 BZ:
 https://bugzilla.redhat.com/show_bug.cgi?id=995271#c23
 
 As I say in that comment:
 A proper fix could be to make SCSI's default be to disable WRITE SAME
 for devices that don't properly report they support it.  And possibly
 have a whitelist to opt-in to enabling WRITE SAME for select targets.
 
 I'm open to any ideas you might have.
 
 Thanks,
 Mike

What happens if we send WRITE SAME to a disk that doesn't support it? 
If we only get "Invalid field in CDB" or a similar sense code, we could just 
propagate it up and let the caller retry.

If we could get something worse (firmware crash), we must blacklist it.



Another idea: we could kill the REQ_WRITE_SAME flag entirely - if some 
subsystem needs to clear some area of a disk, it builds a bio with all 
vector entries pointing to the zero page. Drivers that don't support WRITE 
SAME (such as IDE) will implement it by doing DMA from the zero page. Drivers 
that support WRITE SAME can detect this special case and convert a normal 
WRITE request to WRITE SAME. The request is converted to WRITE SAME at the 
lowest level (in the SCSI driver), and it passes as a normal WRITE request 
through the dm and md midlayers - thus, we magically get WRITE SAME support in 
all md and dm drivers without writing additional code for it.
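
A sketch of building such a bio (3.x-era bio fields assumed; error handling 
trimmed):

/* Build a WRITE bio whose every vector points at the shared zero page;
   a low-level driver could recognize this and issue WRITE SAME instead. */
static struct bio *alloc_zero_bio(struct block_device *bdev,
				  sector_t sector, unsigned int nr_pages)
{
	unsigned int i;
	struct bio *bio = bio_alloc(GFP_NOIO, nr_pages);

	if (!bio)
		return NULL;
	bio->bi_bdev = bdev;
	bio->bi_sector = sector;	/* bi_iter.bi_sector on newer kernels */
	for (i = 0; i < nr_pages; i++)
		if (!bio_add_page(bio, ZERO_PAGE(0), PAGE_SIZE, 0))
			break;		/* hit a queue limit; send what fits */
	return bio;
}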

Mikulas


Re: [PATCH 2/2] dm mpath: attach scsi_dh during table resume

2013-04-25 Thread Mikulas Patocka


On Mon, 22 Apr 2013, Mike Snitzer wrote:

 I spoke with Hannes at LSF, to address the potential crashes in the
 endio path (e.g. stpg_endio) we'd have to bump the scsi_dh_data kref
 where appropriate (e.g. for ALUA kref_get in submit_stpg and kref_put in
 stpg_endio).
 
 But that is just the tip of the iceberg relative to scsi_dh lifetime.
 Seems we've been playing it pretty fast and loose with scsi_dh issued
 requests vs detach for quite some time.
 
 I'm now inclined to not care about this issue.  Take away is: don't
 switch the device handler (attach the correct one from the start).

I did a patch that disables device handler switching and it was NACKed by 
Hannes. The problem that he pointed out was - when we load SCSI device 
handler modules, they attach automatically to SCSI devices they think they 
belong to. The user then can't set the desired device handler in multipath 
configuration because a different handler is already attached.

So we need a functionality to change device handlers.

(or maybe stop the scsi device handlers from attaching automatically, but 
it would surely generate a lot of other regressions)

Mikulas


Re: [PATCH 2/2] dm mpath: attach scsi_dh during table resume

2013-04-25 Thread Mikulas Patocka


On Thu, 25 Apr 2013, Mike Snitzer wrote:

 On Thu, Apr 25 2013 at  9:48am -0400,
 Mikulas Patocka mpato...@redhat.com wrote:
 
  
  
  On Mon, 22 Apr 2013, Mike Snitzer wrote:
  
   I spoke with Hannes at LSF, to address the potential crashes in the
   endio path (e.g. stpg_endio) we'd have to bump the scsi_dh_data kref
   where appropriate (e.g. for ALUA kref_get in submit_stpg and kref_put in
   stpg_endio).
   
   But that is just the tip of the iceberg relative to scsi_dh lifetime.
   Seems we've been playing it pretty fast and loose with scsi_dh issued
   requests vs detach for quite some time.
   
   I'm now inclined to not care about this issue.  Take away is: don't
   switch the device handler (attach the correct one from the start).
  
  I did a patch that disables device handler switching and it was NACKed by 
  Hannes. The problem that he pointed out was - when we load SCSI device 
  handler modules, they attach automatically to SCSI devices they think they 
  belong to. The user then can't set the desired device handler in multipath 
  configuration because a different handler is already attached.
 
 The handler that is automatically attached _should_ be the correct
 handler.  We now have the .match() hook for scsi_dh and it has made for
 reliable scsi_dh attachment of the correct handler.

The EMC devices work with both ALUA and EMC handlers - so there is no one 
correct handler, the correct handler is the one that the user specified 
in multipath configuration.

  So we need a functionality to change device handlers.
 
 I really cannot think of a sequence where the scsi_dh .match() will
 attach the incorrect handler.  This is why I added the
 retain_attached_hw_handler feature to mpath (commit a58a935d5).

The automatic handler assignment can't change an existing handler.

But if one handler was automatically selected and the user selects a 
different handler in the multipath configuration, the handler is changed.

  (or maybe stop the scsi device handlers from attaching automatically, but 
  it would surely generate a lot of other regressions)
 
 The need to support changing device handlers (via multipath table load)
 is overblown/historic.

So - do you mean that we make retain_attached_hw_handler the default 
option and don't allow the user to change the existing device handler in 
the multipath configuration?

That's what my patch did and it was NACKed by Hannes. The problem there is 
that the behavior depends on module loading order - if you activate multipath 
with the "EMC" option, it activates the EMC handler. If you load the ALUA 
module and activate multipath with the "EMC" option, it stays with the ALUA 
handler.

Mikulas


Re: [dm-devel] [PATCH] scsi-dh-emc: fix activate vs set_params race

2013-04-05 Thread Mikulas Patocka


On Thu, 4 Apr 2013, Laurence Oberman wrote:

 I can test it. I have a clarion Cx3
 Will get to it next week, traveling tomorrow
 Laurence
 
 Sent from my iPhone

OK.

So, enable most debug options in kernel configuration and try the patch.

Mikulas

 On Apr 4, 2013, at 7:11 PM, Mike Christie micha...@cs.wisc.edu wrote:
 
  On 04/02/2013 07:09 PM, Mikulas Patocka wrote:
  Hi
  
  This fixes a possible race in scsi_dh_emc. It is untested because I don't 
  have the hardware. It could happen when we reload a multipath device and 
  path failure happens at the same time.
  
  
  I think this patch is ok. I do not have the hw to test it anymore.
  
  If you wanted to test just to make sure it is safe you should bug Rob
  Evers. He can help you find a machine in the westford lab that has it
  --
  To unsubscribe from this list: send the line unsubscribe linux-scsi in
  the body of a message to majord...@vger.kernel.org
  More majordomo info at  http://vger.kernel.org/majordomo-info.html
 


[PATCH] scsi-dh-emc: fix activate vs set_params race

2013-04-02 Thread Mikulas Patocka
Hi

This fixes a possible race in scsi_dh_emc. It is untested because I don't 
have the hardware. It could happen when we reload a multipath device and a 
path failure happens at the same time.

This is a theoretical race condition (there are no reports of it 
happening), but a different bug was already triggered under the same 
conditions in QA testing.

Mikulas

---

scsi-dh-emc: fix activate vs set_params race

The activate routine may be called concurrently with the set_params
routine. This can happen when a dm-multipath device is reloaded. When we
reload a device mapper device, the new target instance is being created
(that could call the set_params method) while the old target instance is
still active (that could call the activate method).

The activate and set_params routines share the same data and sense buffer,
so it could result in incorrect behaviour if they are called concurrently.
This patch adds a mutex to fix the race.

Signed-off-by: Mikulas Patocka mpato...@redhat.com

---
 drivers/scsi/device_handler/scsi_dh_emc.c |   11 +++
 1 file changed, 11 insertions(+)

Index: linux-3.9-rc5-fast/drivers/scsi/device_handler/scsi_dh_emc.c
===
--- linux-3.9-rc5-fast.orig/drivers/scsi/device_handler/scsi_dh_emc.c   
2013-04-03 01:32:16.0 +0200
+++ linux-3.9-rc5-fast/drivers/scsi/device_handler/scsi_dh_emc.c
2013-04-03 01:45:13.0 +0200
@@ -111,6 +111,8 @@ struct clariion_dh_data {
 * path's mapped LUN
 */
int current_sp;
+
+   struct mutex mtx;
 };
 
 static inline struct clariion_dh_data
@@ -536,6 +538,8 @@ static int clariion_activate(struct scsi
struct clariion_dh_data *csdev = get_clariion_data(sdev);
int result;
 
+	mutex_lock(&csdev->mtx);
+
result = clariion_send_inquiry(sdev, csdev);
if (result != SCSI_DH_OK)
goto done;
@@ -562,6 +566,8 @@ done:
		csdev->port, lun_state[csdev->lun_state],
		csdev->default_sp + 'A');
 
+	mutex_unlock(&csdev->mtx);
+
if (fn)
fn(data, result);
return 0;
@@ -602,6 +608,8 @@ static int clariion_set_params(struct sc
else
		csdev->flags &= ~CLARIION_HONOR_RESERVATIONS;
 
+	mutex_lock(&csdev->mtx);
+
/*
 * If this path is owned, we have to send a trespass command
 * with the new parameters. If not, simply return. Next trespass
@@ -618,6 +626,8 @@ static int clariion_set_params(struct sc
/* Update status */
result = clariion_send_inquiry(sdev, csdev);
 
+	mutex_unlock(&csdev->mtx);
+
 done:
return result;
 }
@@ -683,6 +693,7 @@ static int clariion_bus_attach(struct sc
	h->lun_state = CLARIION_LUN_UNINITIALIZED;
	h->default_sp = CLARIION_UNBOUND_LU;
	h->current_sp = CLARIION_UNBOUND_LU;
+	mutex_init(&h->mtx);
 
err = clariion_std_inquiry(sdev, h);
if (err != SCSI_DH_OK)


Re: [PATCH] dm: do not replace bioset for request-based dm

2013-02-26 Thread Mikulas Patocka
Acked-by: Mikulas Patocka mpato...@redhat.com

On Tue, 26 Feb 2013, Jun'ichi Nomura wrote:

 This patch fixes a regression introduced in v3.8, which causes oops
 like this when dm-multipath is used:
 
 general protection fault:  [#1] SMP
 RIP: 0010:[<810fe754>]  [<810fe754>] mempool_free+0x24/0xb0
 Call Trace:
   <IRQ>
   [<81187417>] bio_put+0x97/0xc0
   [<a02247a5>] end_clone_bio+0x35/0x90 [dm_mod]
   [<81185efd>] bio_endio+0x1d/0x30
   [<811f03a3>] req_bio_endio.isra.51+0xa3/0xe0
   [<811f2f68>] blk_update_request+0x118/0x520
   [<811f3397>] blk_update_bidi_request+0x27/0xa0
   [<811f343c>] blk_end_bidi_request+0x2c/0x80
   [<811f34d0>] blk_end_request+0x10/0x20
   [<a000b32b>] scsi_io_completion+0xfb/0x6c0 [scsi_mod]
   [<a000107d>] scsi_finish_command+0xbd/0x120 [scsi_mod]
   [<a000b12f>] scsi_softirq_done+0x13f/0x160 [scsi_mod]
   [<811f9fd0>] blk_done_softirq+0x80/0xa0
   [<81044551>] __do_softirq+0xf1/0x250
   [<8142ee8c>] call_softirq+0x1c/0x30
   [<8100420d>] do_softirq+0x8d/0xc0
   [<81044885>] irq_exit+0xd5/0xe0
   [<8142f3e3>] do_IRQ+0x63/0xe0
   [<814257af>] common_interrupt+0x6f/0x6f
   <EOI>
   [<a021737c>] srp_queuecommand+0x8c/0xcb0 [ib_srp]
   [<a0002f18>] scsi_dispatch_cmd+0x148/0x310 [scsi_mod]
   [<a000a38e>] scsi_request_fn+0x31e/0x520 [scsi_mod]
   [<811f1e57>] __blk_run_queue+0x37/0x50
   [<811f1f69>] blk_delay_work+0x29/0x40
   [<81059003>] process_one_work+0x1c3/0x5c0
   [<8105b22e>] worker_thread+0x15e/0x440
   [<8106164b>] kthread+0xdb/0xe0
   [<8142db9c>] ret_from_fork+0x7c/0xb0
 
 The regression was introduced by the change
 c0820cf5 ("dm: introduce per_bio_data"), where dm started to replace
 the bioset during table replacement.
 For bio-based dm, it is good because clone bios do not exist during the
 table replacement.
 For request-based dm, however, (not-yet-mapped) clone bios may stay in
 request queue and survive during the table replacement.
 So freeing the old bioset could cause the oops in bio_put().
 
 Since the size of front_pad may change only with bio-based dm,
 it is not necessary to replace bioset for request-based dm.
 
 Reported-by: Bart Van Assche bvanass...@acm.org
 Tested-by: Bart Van Assche bvanass...@acm.org
 Signed-off-by: Jun'ichi Nomura j-nom...@ce.jp.nec.com
 Cc: Alasdair G Kergon a...@redhat.com
 Cc: Mikulas Patocka mpato...@redhat.com
 Cc: Mike Snitzer snit...@redhat.com
 Cc: sta...@vger.kernel.org
 ---
  drivers/md/dm.c |   30 +-
  1 file changed, 21 insertions(+), 9 deletions(-)
 
 diff --git a/drivers/md/dm.c b/drivers/md/dm.c
 index 314a0e2..51fefb5 100644
 --- a/drivers/md/dm.c
 +++ b/drivers/md/dm.c
 @@ -1973,15 +1973,27 @@ static void __bind_mempools(struct mapped_device *md, 
 struct dm_table *t)
  {
   struct dm_md_mempools *p = dm_table_get_md_mempools(t);
  
 -	if (md->io_pool && (md->tio_pool || dm_table_get_type(t) == DM_TYPE_BIO_BASED) && md->bs) {
 -		/*
 -		 * The md already has necessary mempools. Reload just the
 -		 * bioset because front_pad may have changed because
 -		 * a different table was loaded.
 -		 */
 -		bioset_free(md->bs);
 -		md->bs = p->bs;
 -		p->bs = NULL;
 +	if (md->io_pool && md->bs) {
 +		/* The md already has necessary mempools. */
 +		if (dm_table_get_type(t) == DM_TYPE_BIO_BASED) {
 +			/*
 +			 * Reload bioset because front_pad may have changed
 +			 * because a different table was loaded.
 +			 */
 +			bioset_free(md->bs);
 +			md->bs = p->bs;
 +			p->bs = NULL;
 +		} else if (dm_table_get_type(t) == DM_TYPE_REQUEST_BASED) {
 +			BUG_ON(!md->tio_pool);
 + /*
 +  * No need to reload in case of request-based dm
 +  * because of fixed size front_pad.
 +  * Note for future: if you are to reload bioset,
 +  * prep-ed requests in queue may have reference
 +  * to bio from the old bioset.
 +  * So you must walk through the queue to unprep.
 +  */
 + }
   goto out;
   }
  
 