[PATCH V4 1/1] block: reject I/O for same fd if block size changed

2021-01-05 Thread Minwoo Im
This patch fixes I/O errors during BLKRRPART ioctl() behavior right
after format operation that changed logical block size of the block
device with a same file descriptor opened.

This issue can be easily reproduced with a single format command in case
of NVMe (logical block size 512B to 4096B).

nvme format /dev/nvme0n1 --lbaf=1 --force

This is because the application, nvme-cli format subcommand issues an
admin command followed by BLKRRPART ioctl to re-read partition
information without closing the file descriptor.  If file descriptor
stays opened, __blkdev_get() will not be invoked at all even though the
logical block size has been changed.

It will cause I/O errors with invalid Read operations during the
BLKRRPART ioctl due to i_blkbits mismatch. The invalid operations in
BLKRRPART happens with under-flowed Number of LBA(NLB) values
0xffff (65535) because i_blkbits is still set to 9 even though the logical
block size has been updated to 4096.  The BLKRRPART will lead buffer_head to
hold 512B data which is less than the logical block size of the block
device.

The root cause, which is because i_blkbits of inode of the block device
is not updated, can be solved easily by re-opening file descriptor
again from application.  But, that's just for application's business
and kernel should reject invalid Read operations during the BLKRRPART
ioctl.

This patch rejects I/O from the path of add_partitions() to avoid
issuing invalid Read operations to device.  It sets a flag to
request_queue in blk_queue_logical_block_size to minimize caller-side
updates.

Signed-off-by: Minwoo Im 
---
 block/blk-settings.c|  3 +++
 block/partitions/core.c | 12 
 fs/block_dev.c  |  8 
 include/linux/blkdev.h  |  1 +
 4 files changed, 24 insertions(+)

diff --git a/block/blk-settings.c b/block/blk-settings.c
index 43990b1d148b..48a6fc7bb5f5 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -329,6 +329,9 @@ void blk_queue_logical_block_size(struct request_queue *q, 
unsigned int size)
 {
struct queue_limits *limits = &q->limits;
 
+   if (limits->logical_block_size != size)
+   blk_queue_flag_set(QUEUE_FLAG_LBSZ_CHANGED, q);
+
limits->logical_block_size = size;
 
if (limits->physical_block_size < size)
diff --git a/block/partitions/core.c b/block/partitions/core.c
index e7d776db803b..6f175ea18ff3 100644
--- a/block/partitions/core.c
+++ b/block/partitions/core.c
@@ -612,12 +612,24 @@ static bool blk_add_partition(struct gendisk *disk, 
struct block_device *bdev,
 
 int blk_add_partitions(struct gendisk *disk, struct block_device *bdev)
 {
+   struct request_queue *q = bdev_get_queue(bdev);
struct parsed_partitions *state;
int ret = -EAGAIN, p, highest;
 
if (!disk_part_scan_enabled(disk))
return 0;
 
+   /*
+* Reject to check partition information if block size has been changed
+* in the runtime.  If block size of a block device has been changed,
+* the file descriptor should be opened again to update the blkbits.
+*/
+   if (test_bit(QUEUE_FLAG_LBSZ_CHANGED, &q->queue_flags)) {
+   pr_warn("%s: rejecting checking partition. fd should be opened 
again.\n",
+   disk->disk_name);
+   return -EBADFD;
+   }
+
state = check_partition(disk, bdev);
if (!state)
return 0;
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 9293045e128c..8056a412a3d1 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -130,7 +130,15 @@ EXPORT_SYMBOL(truncate_bdev_range);
 
 static void set_init_blocksize(struct block_device *bdev)
 {
+   struct request_queue *q = bdev_get_queue(bdev);
+
bdev->bd_inode->i_blkbits = blksize_bits(bdev_logical_block_size(bdev));
+
+   /*
+* Allow I/O commands for this block device.  We can say that this
+* block device has proper blkbits updated.
+*/
+   blk_queue_flag_clear(QUEUE_FLAG_LBSZ_CHANGED, q);
 }
 
 int set_blocksize(struct block_device *bdev, int size)
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 070de09425ad..6d0542434be6 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -625,6 +625,7 @@ struct request_queue {
 #define QUEUE_FLAG_RQ_ALLOC_TIME 27/* record rq->alloc_time_ns */
 #define QUEUE_FLAG_HCTX_ACTIVE 28  /* at least one blk-mq hctx is active */
 #define QUEUE_FLAG_NOWAIT   29 /* device supports NOWAIT */
+#define QUEUE_FLAG_LBSZ_CHANGED30  /* logical block size changed */
 
 #define QUEUE_FLAG_MQ_DEFAULT  ((1 << QUEUE_FLAG_IO_STAT) |\
 (1 << QUEUE_FLAG_SAME_COMP) |  \
-- 
2.17.1



[PATCH V4 0/1] block: fix I/O errors in BLKRRPART

2021-01-05 Thread Minwoo Im
Hello,

  This patch fixes I/O errors during BLKRRPART ioctl() behavior right
after format operation that changed logical block size of the block
device with a same file descriptor opened.

Testcase:

  The following testcase is a case of NVMe namespace with the following
conditions:

  - Current LBA format is lbaf=0 (512 bytes logical block size)
  - LBA Format(lbaf=1) has 4096 bytes logical block size

  # Format block device logical block size 512B to 4096B



   
  nvme format /dev/nvme0n1 --lbaf=1 --force

  This will cause I/O errors because BLKRRPART ioctl() happened right after
the format command with same file descriptor opened in application
(e.g., nvme-cli) like:

  fd = open("/dev/nvme0n1", O_RDONLY);

  nvme_format(fd, ...);
  if (ioctl(fd, BLKRRPART) < 0)
...

Errors:

  We can see the Read command with Number of LBA(NLB) 0xffff (65535) which
was under-flowed because BLKRRPART operation requested request size based
on i_blkbits of the block device which is 9 via buffer_head.

  [dmesg-snip]
[   10.771740] blk_update_request: operation not supported error, dev 
nvme0n1, sector 0 op 0x0:(READ) flags 0x0 phys_seg 1 prio class 0
[   10.780262] Buffer I/O error on dev nvme0n1, logical block 0, async page 
read

  [event-snip]
kworker/0:1H-56  [000]    913.456922: nvme_setup_cmd: nvme0: 
disk=nvme0n1, qid=1, cmdid=216, nsid=1, flags=0x0, meta=0x0, cmd=(nvme_cmd_read 
slba=0, len=65535, ctrl=0x0, dsmgmt=0, reftag=0)
 ksoftirqd/0-9   [000] .Ns.   916.566351: nvme_complete_rq: nvme0: 
disk=nvme0n1, qid=1, cmdid=216, res=0x0, retries=0, flags=0x0, status=0x4002

  The patch below fixes the I/O errors by rejecting I/O requests from the
block layer with setting a flag to request_queue until the file descriptor
re-opened to be updated by __blkdev_get().  This is based on the previous
discussion [1].

Since V3(RFC):
  - Move flag from gendisk to request_queue for future clean-ups.
(Christoph, [3])

Since V2(RFC):
  - Cover letter with testcase and error logs attached. Removed un-related
changes: empty line. (Chaitanya, [2])
  - Put blkdev with blkdev_put_no_open().

Since V1(RFC):
  - Updated patch to reject I/O rather than updating i_blkbits of the
block device's inode directly from driver. (Christoph, [1])

[1] 
https://lore.kernel.org/linux-nvme/20201223183143.GB13354@localhost.localdomain/T/#t
[2] 
https://lore.kernel.org/linux-nvme/20201230140504.GB7917@localhost.localdomain/T/#t
[3] 
https://lore.kernel.org/linux-block/20210105101202.GA9970@localhost.localdomain/T/#u

Thanks,

Minwoo Im (1):
  block: reject I/O for same fd if block size changed

 block/blk-settings.c|  3 +++
 block/partitions/core.c | 12 
 fs/block_dev.c  |  8 
 include/linux/blkdev.h  |  1 +
 4 files changed, 24 insertions(+)

-- 
2.17.1



Re: [RFC PATCH V3 1/1] block: reject I/O for same fd if block size changed

2021-01-05 Thread Minwoo Im
Hello,

On 21-01-05 08:50:09, Christoph Hellwig wrote:
> On Tue, Jan 05, 2021 at 10:04:56AM +0900, Minwoo Im wrote:
> > It was a point that I really would like to ask by RFC whether we can
> > have backpointer to the gendisk from the request_queue.  And I'd like to
> > have it to simplify this routine and for future usages also.
> 
> I think it is the right thing to do, at least mid-term, although I
> don't want to enforce the burden on you right now.
> 
> > I will restrict this one by checking GENHD_FL_UP flag from the gendisk
> > for the next patch.
> > 
> > > 
> > > Alternatively we could make this request_queue QUEUE* flag for now.
> > 
> > As this patch rejects I/O from the block layer partition code, can we
> > have this flag in gendisk rather than request_queue ?
> 
> For now we can as the request_queue is required.  I have some plans to
> clean up this area, but just using a request_queue flag for now is
> probably the simplest, even if it means more work for me later.

Please let me prepare the next quick fix for this issue with request_queue
flag.

Thanks!


Re: [RFC PATCH V3 1/1] block: reject I/O for same fd if block size changed

2021-01-04 Thread Minwoo Im
Hello Christoph,

Thanks for your review.

On 21-01-04 18:11:41, Christoph Hellwig wrote:
> On Mon, Jan 04, 2021 at 06:11:08PM +0100, Christoph Hellwig wrote:
> > On Mon, Jan 04, 2021 at 10:06:59PM +0900, Minwoo Im wrote:
> > > + if (q->backing_dev_info && q->backing_dev_info->owner &&
> > > + limits->logical_block_size != size) {
> > > + bdev = blkdev_get_no_open(q->backing_dev_info->owner->devt);
> > > + bdev->bd_disk->flags |= GENHD_FL_BLOCK_SIZE_CHANGED;
> > > + blkdev_put_no_open(bdev);
> > > + }
> > 
> > We really need the backpointer from the queue to the gendisk I've wanted
> > to add for a while.  Can we at least restrict this to a live gendisk?

It was a point that I really would like to ask by RFC whether we can
have backpointer to the gendisk from the request_queue.  And I'd like to
have it to simplify this routine and for future usages also.

I will restrict this one by checking GENHD_FL_UP flag from the gendisk
for the next patch.

> 
> Alternatively we could make this request_queue QUEUE* flag for now.

As this patch rejects I/O from the block layer partition code, can we
have this flag in gendisk rather than request_queue ?

Thanks,


[RFC PATCH V3 1/1] block: reject I/O for same fd if block size changed

2021-01-04 Thread Minwoo Im
This patch fixes I/O errors during BLKRRPART ioctl() behavior right
after format operation that changed logical block size of the block
device with a same file descriptor opened.

This issue can be easily reproduced with a single format command in case
of NVMe (logical block size 512B to 4096B).

nvme format /dev/nvme0n1 --lbaf=1 --force

This is because the application, nvme-cli format subcommand issues an
admin command followed by BLKRRPART ioctl to re-read partition
information without closing the file descriptor.  If file descriptor
stays opened, __blkdev_get() will not be invoked at all even though the
logical block size has been changed.

It will cause I/O errors with invalid Read operations during the
BLKRRPART ioctl due to i_blkbits mismatch. The invalid operations in
BLKRRPART happens with under-flowed Number of LBA(NLB) values
0xffff (65535) because i_blkbits is still set to 9 even though the logical
block size has been updated to 4096.  The BLKRRPART will lead buffer_head to
hold 512B data which is less than the logical block size of the block
device.

The root cause, which is because i_blkbits of inode of the block device
is not updated, can be solved easily by re-opening file descriptor
again from application.  But, that's just for application's business
and kernel should reject invalid Read operations during the BLKRRPART
ioctl.

This patch rejects I/O from the path of add_partitions() to avoid
issuing invalid Read operations to device.  It also sets a flag to
gendisk in blk_queue_logical_block_size to minimize caller-side updates.

Signed-off-by: Minwoo Im 
---
 block/blk-settings.c|  8 
 block/partitions/core.c | 11 +++
 fs/block_dev.c  |  6 ++
 include/linux/genhd.h   |  1 +
 4 files changed, 26 insertions(+)

diff --git a/block/blk-settings.c b/block/blk-settings.c
index 43990b1d148b..84136ea4e2a4 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -328,6 +328,14 @@ EXPORT_SYMBOL(blk_queue_max_segment_size);
 void blk_queue_logical_block_size(struct request_queue *q, unsigned int size)
 {
struct queue_limits *limits = &q->limits;
+   struct block_device *bdev;
+
+   if (q->backing_dev_info && q->backing_dev_info->owner &&
+   limits->logical_block_size != size) {
+   bdev = blkdev_get_no_open(q->backing_dev_info->owner->devt);
+   bdev->bd_disk->flags |= GENHD_FL_BLOCK_SIZE_CHANGED;
+   blkdev_put_no_open(bdev);
+   }
 
limits->logical_block_size = size;
 
diff --git a/block/partitions/core.c b/block/partitions/core.c
index e7d776db803b..5a0330c1b6f9 100644
--- a/block/partitions/core.c
+++ b/block/partitions/core.c
@@ -618,6 +618,17 @@ int blk_add_partitions(struct gendisk *disk, struct 
block_device *bdev)
if (!disk_part_scan_enabled(disk))
return 0;
 
+   /*
+* Reject to check partition information if block size has been changed
+* in the runtime.  If block size of a block device has been changed,
+* the file descriptor should be opened again to update the blkbits.
+*/
+   if (disk->flags & GENHD_FL_BLOCK_SIZE_CHANGED) {
+   pr_warn("%s: rejecting checking partition. fd should be opened 
again.\n",
+   disk->disk_name);
+   return -EBADFD;
+   }
+
state = check_partition(disk, bdev);
if (!state)
return 0;
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 9293045e128c..c996de3d6084 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -131,6 +131,12 @@ EXPORT_SYMBOL(truncate_bdev_range);
 static void set_init_blocksize(struct block_device *bdev)
 {
bdev->bd_inode->i_blkbits = blksize_bits(bdev_logical_block_size(bdev));
+
+   /*
+* Allow I/O commands for this block device.  We can say that this
+* block device has been set to a proper block size.
+*/
+   bdev->bd_disk->flags &= ~GENHD_FL_BLOCK_SIZE_CHANGED;
 }
 
 int set_blocksize(struct block_device *bdev, int size)
diff --git a/include/linux/genhd.h b/include/linux/genhd.h
index 809aaa32d53c..0e0e24917003 100644
--- a/include/linux/genhd.h
+++ b/include/linux/genhd.h
@@ -103,6 +103,7 @@ struct partition_meta_info {
 #define GENHD_FL_BLOCK_EVENTS_ON_EXCL_WRITE0x0100
 #define GENHD_FL_NO_PART_SCAN  0x0200
 #define GENHD_FL_HIDDEN0x0400
+#define GENHD_FL_BLOCK_SIZE_CHANGED0x0800
 
 enum {
DISK_EVENT_MEDIA_CHANGE = 1 << 0, /* media changed */
-- 
2.17.1



[RFC PATCH V3 0/1] block: fix I/O errors in BLKRRPART

2021-01-04 Thread Minwoo Im
Hello,

  This patch fixes I/O errors during BLKRRPART ioctl() behavior right
after format operation that changed logical block size of the block
device with a same file descriptor opened.

Testcase:

  The following testcase is a case of NVMe namespace with the following
conditions:

  - Current LBA format is lbaf=0 (512 bytes logical block size)
  - LBA Format(lbaf=1) has 4096 bytes logical block size

  # Format block device logical block size 512B to 4096B
  nvme format /dev/nvme0n1 --lbaf=1 --force

  This will cause I/O errors because BLKRRPART ioctl() happened right after
the format command with same file descriptor opened in application
(e.g., nvme-cli) like:

  fd = open("/dev/nvme0n1", O_RDONLY);

  nvme_format(fd, ...);
  if (ioctl(fd, BLKRRPART) < 0)
...

Errors:

  We can see the Read command with Number of LBA(NLB) 0xffff (65535) which
was under-flowed because BLKRRPART operation requested request size based
on i_blkbits of the block device which is 9 via buffer_head.

  [dmesg-snip]  

[   10.771740] blk_update_request: operation not supported error, dev 
nvme0n1, sector 0 op 0x0:(READ) flags 0x0 phys_seg 1 prio class 0
[   10.780262] Buffer I/O error on dev nvme0n1, logical block 0, async page 
read

  [event-snip]  

 
kworker/0:1H-56  [000]    913.456922: nvme_setup_cmd: nvme0: 
disk=nvme0n1, qid=1, cmdid=216, nsid=1, flags=0x0, meta=0x0, cmd=(nvme_cmd_read 
slba=0, len=65535, ctrl=0x0, dsmgmt=0, reftag=0)
 ksoftirqd/0-9   [000] .Ns.   916.566351: nvme_complete_rq: nvme0: 
disk=nvme0n1, qid=1, cmdid=216, res=0x0, retries=0, flags=0x0, status=0x4002

  The patch below fixes the I/O errors by rejecting I/O requests from the
block layer with setting a flag to gendisk until the file descriptor
re-opened to be updated by __blkdev_get().  This is based on the previous
discussion [1].

Since V2:
  - Cover letter with testcase and error logs attached. Removed un-related
changes: empty line. (Chaitanya, [2])
  - Put blkdev with blkdev_put_no_open().

Since V1:
  - Updated patch to reject I/O rather than updating i_blkbits of the
block device's inode directly from driver. (Christoph, [1])

[1] 
https://lore.kernel.org/linux-nvme/20201223183143.GB13354@localhost.localdomain/T/#t
[2] 
https://lore.kernel.org/linux-nvme/20201230140504.GB7917@localhost.localdomain/T/#t

Thanks,

Minwoo Im (1):
  block: reject I/O for same fd if block size changed

 block/blk-settings.c|  8 
 block/partitions/core.c | 11 +++
 fs/block_dev.c  |  6 ++
 include/linux/genhd.h   |  1 +
 4 files changed, 26 insertions(+)

-- 
2.17.1



Re: [RFC V2] block: reject I/O for same fd if block size changed

2021-01-03 Thread Minwoo Im
On 21-01-04 00:57:07, Chaitanya Kulkarni wrote:
> On 12/30/20 08:03, Minwoo Im wrote:
> > Let's say, for example of NVMe device, Format command to change out
> > LBA format to another logical block size and BLKRRPART to re-read
> > partition information with a same file descriptor like:
> >
> > fd = open("/dev/nvme0n1", O_RDONLY);
> >
> > nvme_format(fd, ...);
> > if (ioctl(fd, BLKRRPART) < 0)
> > ..
> >
> > In this case, ioctl causes invalid Read operations which are triggered
> > by buffer_head I/O path to re-read partition information.  This is
> > because it's still playing around with i_blksize and i_blkbits.  So,
> > 512 -> 4096 -> 512 logical block size changes will cause an under-flowed
> > length of Read operations.
> >
> > Case for NVMe:
> >   (LBAF 1 512B, LBAF 0 4K logical block size)
> >
> >   nvme format /dev/nvme0n1 --lbaf=1 --force  # to 512B LBA
> >   nvme format /dev/nvme0n1 --lbaf=0 --force  # to 4096B LBA
> >
> > [dmesg-snip]
> >   [   10.771740] blk_update_request: operation not supported error, dev 
> > nvme0n1, sector 0 op 0x0:(READ) flags 0x0 phys_seg 1 prio class 0
> >   [   10.780262] Buffer I/O error on dev nvme0n1, logical block 0, async 
> > page read
> >
> > [event-snip]
> >   kworker/0:1H-56  [000]    913.456922: nvme_setup_cmd: nvme0: 
> > disk=nvme0n1, qid=1, cmdid=216, nsid=1, flags=0x0, meta=0x0, 
> > cmd=(nvme_cmd_read slba=0, len=65535, ctrl=0x0, dsmgmt=0, reftag=0)
> >ksoftirqd/0-9   [000] .Ns.   916.566351: nvme_complete_rq: nvme0: 
> > disk=nvme0n1, qid=1, cmdid=216, res=0x0, retries=0, flags=0x0, status=0x4002
> >
> > As the previous discussion [1], this patch introduced a gendisk flag
> > to indicate that block size has been changed in the runtime.  This flag
> > is set when logical block size is changed in the runtime in the block
> > layer.  It will be cleared when the file descriptor for the
block device is opened again through __blkdev_get() which updates the block
> > size via set_init_blocksize().
> >
> > This patch rejects I/O from the path of add_partitions() to avoid
> > issuing invalid Read operations to device.  It also sets a flag to
> > gendisk in blk_queue_logical_block_size to minimize caller-side updates.
> >
> > [1] 
> > https://lore.kernel.org/linux-nvme/20201223183143.GB13354@localhost.localdomain/T/#t
> >
> > Signed-off-by: Minwoo Im 
> Rewrite the change-log similar to what we have in the repo and fix
> the spelling mistakes. Add a cover-letter to explain the testcase
> and the execution effect, also I'd move discussion link into
> cover-letter.

Thanks for your time.  Will prepare V3 with proper change logs and cover
letter to explain issue and testcase.

Thanks,


[RFC V2] block: reject I/O for same fd if block size changed

2020-12-30 Thread Minwoo Im
Let's say, for example of NVMe device, Format command to change out
LBA format to another logical block size and BLKRRPART to re-read
partition information with a same file descriptor like:

fd = open("/dev/nvme0n1", O_RDONLY);

nvme_format(fd, ...);
if (ioctl(fd, BLKRRPART) < 0)
..

In this case, ioctl causes invalid Read operations which are triggered
by buffer_head I/O path to re-read partition information.  This is
because it's still playing around with i_blksize and i_blkbits.  So,
512 -> 4096 -> 512 logical block size changes will cause an under-flowed
length of Read operations.

Case for NVMe:
  (LBAF 1 512B, LBAF 0 4K logical block size)

  nvme format /dev/nvme0n1 --lbaf=1 --force  # to 512B LBA
  nvme format /dev/nvme0n1 --lbaf=0 --force  # to 4096B LBA

[dmesg-snip]
  [   10.771740] blk_update_request: operation not supported error, dev 
nvme0n1, sector 0 op 0x0:(READ) flags 0x0 phys_seg 1 prio class 0
  [   10.780262] Buffer I/O error on dev nvme0n1, logical block 0, async page 
read

[event-snip]
  kworker/0:1H-56  [000]    913.456922: nvme_setup_cmd: nvme0: 
disk=nvme0n1, qid=1, cmdid=216, nsid=1, flags=0x0, meta=0x0, cmd=(nvme_cmd_read 
slba=0, len=65535, ctrl=0x0, dsmgmt=0, reftag=0)
   ksoftirqd/0-9   [000] .Ns.   916.566351: nvme_complete_rq: nvme0: 
disk=nvme0n1, qid=1, cmdid=216, res=0x0, retries=0, flags=0x0, status=0x4002

As the previous discussion [1], this patch introduced a gendisk flag
to indicate that block size has been changed in the runtime.  This flag
is set when logical block size is changed in the runtime in the block
layer.  It will be cleared when the file descriptor for the
block device is opened again through __blkdev_get() which updates the block
size via set_init_blocksize().

This patch rejects I/O from the path of add_partitions() to avoid
issuing invalid Read operations to device.  It also sets a flag to
gendisk in blk_queue_logical_block_size to minimize caller-side updates.

[1] 
https://lore.kernel.org/linux-nvme/20201223183143.GB13354@localhost.localdomain/T/#t

Signed-off-by: Minwoo Im 
---
 block/blk-settings.c|  7 +++
 block/genhd.c   |  1 +
 block/partitions/core.c | 11 +++
 fs/block_dev.c  |  6 ++
 include/linux/genhd.h   |  1 +
 5 files changed, 26 insertions(+)

diff --git a/block/blk-settings.c b/block/blk-settings.c
index 43990b1d148b..205822ffcaef 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -328,6 +328,13 @@ EXPORT_SYMBOL(blk_queue_max_segment_size);
 void blk_queue_logical_block_size(struct request_queue *q, unsigned int size)
 {
struct queue_limits *limits = &q->limits;
+   struct block_device *bdev;
+
+   if (q->backing_dev_info && q->backing_dev_info->owner &&
+   limits->logical_block_size != size) {
+   bdev = blkdev_get_no_open(q->backing_dev_info->owner->devt);
+   bdev->bd_disk->flags |= GENHD_FL_BLOCK_SIZE_CHANGED;
+   }
 
limits->logical_block_size = size;
 
diff --git a/block/genhd.c b/block/genhd.c
index 73faec438e49..c3a73cba7c88 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -81,6 +81,7 @@ bool set_capacity_and_notify(struct gendisk *disk, sector_t 
size)
 */
if (!capacity || !size)
return false;
+
kobject_uevent_env(&disk_to_dev(disk)->kobj, KOBJ_CHANGE, envp);
return true;
 }
diff --git a/block/partitions/core.c b/block/partitions/core.c
index e7d776db803b..5a0330c1b6f9 100644
--- a/block/partitions/core.c
+++ b/block/partitions/core.c
@@ -618,6 +618,17 @@ int blk_add_partitions(struct gendisk *disk, struct 
block_device *bdev)
if (!disk_part_scan_enabled(disk))
return 0;
 
+   /*
+* Reject to check partition information if block size has been changed
+* in the runtime.  If block size of a block device has been changed,
+* the file descriptor should be opened again to update the blkbits.
+*/
+   if (disk->flags & GENHD_FL_BLOCK_SIZE_CHANGED) {
+   pr_warn("%s: rejecting checking partition. fd should be opened 
again.\n",
+   disk->disk_name);
+   return -EBADFD;
+   }
+
state = check_partition(disk, bdev);
if (!state)
return 0;
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 9293045e128c..c996de3d6084 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -131,6 +131,12 @@ EXPORT_SYMBOL(truncate_bdev_range);
 static void set_init_blocksize(struct block_device *bdev)
 {
bdev->bd_inode->i_blkbits = blksize_bits(bdev_logical_block_size(bdev));
+
+   /*
+* Allow I/O commands for this block device.  We can say that this
+* block device has been set to a proper block size.
+*/
+   bdev->bd_disk->flags

Re: [RFC] block: reject I/O in BLKRRPART if block size changed

2020-12-30 Thread Minwoo Im
On 20-12-27 03:02:32, Minwoo Im wrote:
> Background:
>   Let's say we have 2 LBA format for 4096B and 512B LBA size for a
> NVMe namespace.  Assume that current LBA format is 4096B and in case
> we convert namespace to 512B and 4096B back again:
> 
>   nvme format /dev/nvme0n1 --lbaf=1 --force  # to 512B LBA
>   nvme format /dev/nvme0n1 --lbaf=0 --force  # to 4096B LBA
> 
>   Then we can see the following errors during the BLKRRPART ioctl from
> the nvme-cli format subcommand:
> 
>   [   10.771740] blk_update_request: operation not supported error, dev 
> nvme0n1, sector 0 op 0x0:(READ) flags 0x0 phys_seg 1 prio class 0
>   [   10.780262] Buffer I/O error on dev nvme0n1, logical block 0, async page 
> read
>   ...
> 
>   Also, we can see the Read commands followed by the Format command due
> to BLKRRPART ioctl with Number of LBAs to 65535 (0xffff) which is
> under-flowed because the request for the Read commands are coming with
> 512B and this is because it's playing around with i_blkbits from the
> block_device inode which needs to be avoided as [1].
> 
>   kworker/0:1H-56  [000]    913.456922: nvme_setup_cmd: nvme0: 
> disk=nvme0n1, qid=1, cmdid=216, nsid=1, flags=0x0, meta=0x0, 
> cmd=(nvme_cmd_read slba=0, len=65535, ctrl=0x0, dsmgmt=0, reftag=0)
>   ksoftirqd/0-9   [000] .Ns.   916.566351: nvme_complete_rq: nvme0: 
> disk=nvme0n1, qid=1, cmdid=216, res=0x0, retries=0, flags=0x0, status=0x4002
>   ...
> 
>   Before we have commit 5ff9f19231a0 ("block: simplify
> set_init_blocksize"), block size used to be bumped up to the
> 4K(PAGE_SIZE) in this example and we have not seen these errors.  But
> with this patch, we have to make sure that bdev->bd_inode->i_blkbits to
> make sure that BLKRRPART ioctl pass proper request length based on the
> changed logical block size.
> 
> Description:
>   As the previous discussion [1], this patch introduced a gendisk flag
> to indicate that block size has been changed in the runtime.  This flag
> is set when logical block size is changed in the runtime with sector
> capacity itself.  It will be cleared when the file descriptor for the
> block device is opened again which means __blkdev_get() updates the block
> size via set_init_blocksize().
>   This patch rejects I/O from the path of add_partitions() and
> application should open the file descriptor again to update the block
> size of the block device inode.
> 
> [1] 
> https://lore.kernel.org/linux-nvme/20201223183143.GB13354@localhost.localdomain/T/#t
> 
> Signed-off-by: Minwoo Im 

Hello,

Sorry for the noises here.  Please ignore this patch.  Will try to
prepare a new one for this issue.

Thanks,


[RFC] block: reject I/O in BLKRRPART if block size changed

2020-12-26 Thread Minwoo Im
Background:
  Let's say we have 2 LBA format for 4096B and 512B LBA size for a
NVMe namespace.  Assume that current LBA format is 4096B and in case
we convert namespace to 512B and 4096B back again:

  nvme format /dev/nvme0n1 --lbaf=1 --force  # to 512B LBA
  nvme format /dev/nvme0n1 --lbaf=0 --force  # to 4096B LBA

  Then we can see the following errors during the BLKRRPART ioctl from
the nvme-cli format subcommand:

  [   10.771740] blk_update_request: operation not supported error, dev 
nvme0n1, sector 0 op 0x0:(READ) flags 0x0 phys_seg 1 prio class 0
  [   10.780262] Buffer I/O error on dev nvme0n1, logical block 0, async page 
read
  ...

  Also, we can see the Read commands followed by the Format command due
to BLKRRPART ioctl with Number of LBAs to 65535 (0xffff) which is
under-flowed because the request for the Read commands are coming with
512B and this is because it's playing around with i_blkbits from the
block_device inode which needs to be avoided as [1].

  kworker/0:1H-56  [000]    913.456922: nvme_setup_cmd: nvme0: 
disk=nvme0n1, qid=1, cmdid=216, nsid=1, flags=0x0, meta=0x0, cmd=(nvme_cmd_read 
slba=0, len=65535, ctrl=0x0, dsmgmt=0, reftag=0)
  ksoftirqd/0-9   [000] .Ns.   916.566351: nvme_complete_rq: nvme0: 
disk=nvme0n1, qid=1, cmdid=216, res=0x0, retries=0, flags=0x0, status=0x4002
  ...

  Before we have commit 5ff9f19231a0 ("block: simplify
set_init_blocksize"), block size used to be bumped up to the
4K(PAGE_SIZE) in this example and we have not seen these errors.  But
with this patch, we have to make sure that bdev->bd_inode->i_blkbits to
make sure that BLKRRPART ioctl pass proper request length based on the
changed logical block size.

Description:
  As the previous discussion [1], this patch introduced a gendisk flag
to indicate that block size has been changed in the runtime.  This flag
is set when logical block size is changed in the runtime with sector
capacity itself.  It will be cleared when the file descriptor for the
block device is opened again which means __blkdev_get() updates the block
size via set_init_blocksize().
  This patch rejects I/O from the path of add_partitions() and
application should open the file descriptor again to update the block
size of the block device inode.

[1] 
https://lore.kernel.org/linux-nvme/20201223183143.GB13354@localhost.localdomain/T/#t

Signed-off-by: Minwoo Im 
---
 block/genhd.c   |  3 +++
 block/partitions/core.c | 11 +++
 fs/block_dev.c  |  6 ++
 include/linux/genhd.h   |  1 +
 4 files changed, 21 insertions(+)

diff --git a/block/genhd.c b/block/genhd.c
index b84b8671e627..1f64907fac3d 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -79,6 +79,9 @@ bool set_capacity_and_notify(struct gendisk *disk, sector_t 
size)
 */
if (!capacity || !size)
return false;
+
+   disk->flags |= GENHD_FL_BLOCK_SIZE_CHANGED;
+
kobject_uevent_env(&disk_to_dev(disk)->kobj, KOBJ_CHANGE, envp);
return true;
 }
diff --git a/block/partitions/core.c b/block/partitions/core.c
index deca253583bd..7dfcda96be9e 100644
--- a/block/partitions/core.c
+++ b/block/partitions/core.c
@@ -617,6 +617,17 @@ int blk_add_partitions(struct gendisk *disk, struct 
block_device *bdev)
if (!disk_part_scan_enabled(disk))
return 0;
 
+   /*
+* Reject to check partition information if block size has been changed
+* in the runtime.  If block size of a block device has been changed,
+* the file descriptor should be opened again to update the blkbits.
+*/
+   if (disk->flags & GENHD_FL_BLOCK_SIZE_CHANGED) {
+   pr_warn("%s: rejecting checking partition. fd should be opened 
again.\n",
+   disk->disk_name);
+   return -EBADFD;
+   }
+
state = check_partition(disk, bdev);
if (!state)
return 0;
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 9e56ee1f2652..813361ad77c1 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -132,6 +132,12 @@ EXPORT_SYMBOL(truncate_bdev_range);
 static void set_init_blocksize(struct block_device *bdev)
 {
bdev->bd_inode->i_blkbits = blksize_bits(bdev_logical_block_size(bdev));
+
+   /*
+* Allow I/O commands for this block device.  We can say that this
+* block device has been set to a proper block size.
+*/
+   bdev->bd_disk->flags &= ~GENHD_FL_BLOCK_SIZE_CHANGED;
 }
 
 int set_blocksize(struct block_device *bdev, int size)
diff --git a/include/linux/genhd.h b/include/linux/genhd.h
index 809aaa32d53c..0e0e24917003 100644
--- a/include/linux/genhd.h
+++ b/include/linux/genhd.h
@@ -103,6 +103,7 @@ struct partition_meta_info {
 #define GENHD_FL_BLOCK_EVENTS_ON_EXCL_WRITE0x0100
 #define GENHD_FL_NO_PART_SCAN  0x0200
 #define GENHD_FL_HIDDEN 

Re: [PATCH] vfs: remove comment for deleted argument 'cred'

2020-12-11 Thread Minwoo Im
On 20-12-02 21:52:32, Minwoo Im wrote:
> Removed credential argument comment with no functional changes.
> 
> Signed-off-by: Minwoo Im 
> ---
>  fs/open.c | 1 -
>  1 file changed, 1 deletion(-)
> 
> diff --git a/fs/open.c b/fs/open.c
> index 9af548fb841b..85a532af0946 100644
> --- a/fs/open.c
> +++ b/fs/open.c
> @@ -923,7 +923,6 @@ EXPORT_SYMBOL(file_path);
>   * vfs_open - open the file at the given path
>   * @path: path to open
>   * @file: newly allocated file with f_flag initialized
> - * @cred: credentials to use
>   */
>  int vfs_open(const struct path *path, struct file *file)
>  {
> -- 
> 2.17.1
> 

Hello,

Gentle ping,

Thanks!


Re: [PATCH 0/3] blk-mq: trivial helper and fixes comments

2020-12-11 Thread Minwoo Im
On 20-12-05 00:20:52, Minwoo Im wrote:
> Hello,
> 
> This patch set contains:
>   - Introduce a helper to allocate tagset tags for the first time
> without 'realloc' keyword that used to be taken.
>   - Fixes for comments need to be updated.
> 
> Please have a look.
> 
> Thanks,
> 
> 
> Minwoo Im (3):
>   blk-mq: add helper allocating tagset->tags
>   blk-mq: update arg in comment of blk_mq_map_queue
>   blk-mq: fix msec comment from micro to milli seconds
> 
>  block/blk-mq.c | 14 ++
>  block/blk-mq.h |  2 +-
>  2 files changed, 11 insertions(+), 5 deletions(-)
> 
> -- 
> 2.17.1
> 

Hello,

Gentle ping.

Thanks!


[PATCH 2/3] blk-mq: update arg in comment of blk_mq_map_queue

2020-12-04 Thread Minwoo Im
Update the mis-named argument description of blk_mq_map_queue().  This patch
also updates the description of that argument to the software queue per-cpu context.

Signed-off-by: Minwoo Im 
---
 block/blk-mq.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/block/blk-mq.h b/block/blk-mq.h
index c696515766c7..c1458d9502f1 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -99,7 +99,7 @@ static inline struct blk_mq_hw_ctx 
*blk_mq_map_queue_type(struct request_queue *
  * blk_mq_map_queue() - map (cmd_flags,type) to hardware queue
  * @q: request queue
  * @flags: request command flags
- * @cpu: cpu ctx
+ * @ctx: software queue cpu ctx
  */
 static inline struct blk_mq_hw_ctx *blk_mq_map_queue(struct request_queue *q,
 unsigned int flags,
-- 
2.17.1



[PATCH 3/3] blk-mq: fix msec comment from micro to milli seconds

2020-12-04 Thread Minwoo Im
The delay to wait before running the queue is in millisecond units; it is
passed to the delayed work via msecs_to_jiffies(), which converts
milliseconds to jiffies.

Signed-off-by: Minwoo Im 
---
 block/blk-mq.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/block/blk-mq.c b/block/blk-mq.c
index 69771ba18f1f..284d103bd0e7 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1594,7 +1594,7 @@ static int blk_mq_hctx_next_cpu(struct blk_mq_hw_ctx 
*hctx)
  * __blk_mq_delay_run_hw_queue - Run (or schedule to run) a hardware queue.
  * @hctx: Pointer to the hardware queue to run.
  * @async: If we want to run the queue asynchronously.
- * @msecs: Microseconds of delay to wait before running the queue.
+ * @msecs: Milliseconds of delay to wait before running the queue.
  *
  * If !@async, try to run the queue now. Else, run the queue asynchronously and
  * with a delay of @msecs.
@@ -1623,7 +1623,7 @@ static void __blk_mq_delay_run_hw_queue(struct 
blk_mq_hw_ctx *hctx, bool async,
 /**
  * blk_mq_delay_run_hw_queue - Run a hardware queue asynchronously.
  * @hctx: Pointer to the hardware queue to run.
- * @msecs: Microseconds of delay to wait before running the queue.
+ * @msecs: Milliseconds of delay to wait before running the queue.
  *
  * Run a hardware queue asynchronously with a delay of @msecs.
  */
@@ -1687,7 +1687,7 @@ EXPORT_SYMBOL(blk_mq_run_hw_queues);
 /**
  * blk_mq_delay_run_hw_queues - Run all hardware queues asynchronously.
  * @q: Pointer to the request queue to run.
- * @msecs: Microseconds of delay to wait before running the queues.
+ * @msecs: Milliseconds of delay to wait before running the queues.
  */
 void blk_mq_delay_run_hw_queues(struct request_queue *q, unsigned long msecs)
 {
-- 
2.17.1



[PATCH 1/3] blk-mq: add helper allocating tagset->tags

2020-12-04 Thread Minwoo Im
tagset->tags is allocated from blk_mq_alloc_tag_set() rather than being
reallocated.  This patch adds a helper to make its meaning explicit:
it allocates rather than reallocates.

Signed-off-by: Minwoo Im 
---
 block/blk-mq.c | 8 +++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/block/blk-mq.c b/block/blk-mq.c
index 37c682855a63..69771ba18f1f 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -3377,6 +3377,12 @@ static int blk_mq_realloc_tag_set_tags(struct 
blk_mq_tag_set *set,
return 0;
 }
 
+static int blk_mq_alloc_tag_set_tags(struct blk_mq_tag_set *set,
+   int new_nr_hw_queues)
+{
+   return blk_mq_realloc_tag_set_tags(set, 0, new_nr_hw_queues);
+}
+
 /*
  * Alloc a tag set to be associated with one or more request queues.
  * May fail with EINVAL for various error conditions. May adjust the
@@ -3430,7 +3436,7 @@ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set)
if (set->nr_maps == 1 && set->nr_hw_queues > nr_cpu_ids)
set->nr_hw_queues = nr_cpu_ids;
 
-   if (blk_mq_realloc_tag_set_tags(set, 0, set->nr_hw_queues) < 0)
+   if (blk_mq_alloc_tag_set_tags(set, set->nr_hw_queues) < 0)
return -ENOMEM;
 
ret = -ENOMEM;
-- 
2.17.1



[PATCH 0/3] blk-mq: trivial helper and fixes comments

2020-12-04 Thread Minwoo Im
Hello,

This patch set contains:
  - Introduce a helper to allocate tagset tags for the first time
without 'realloc' keyword that used to be taken.
  - Fixes for comments need to be updated.

Please have a look.

Thanks,


Minwoo Im (3):
  blk-mq: add helper allocating tagset->tags
  blk-mq: update arg in comment of blk_mq_map_queue
  blk-mq: fix msec comment from micro to milli seconds

 block/blk-mq.c | 14 ++
 block/blk-mq.h |  2 +-
 2 files changed, 11 insertions(+), 5 deletions(-)

-- 
2.17.1



[PATCH] vfs: remove comment for deleted argument 'cred'

2020-12-02 Thread Minwoo Im
Removed credential argument comment with no functional changes.

Signed-off-by: Minwoo Im 
---
 fs/open.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/fs/open.c b/fs/open.c
index 9af548fb841b..85a532af0946 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -923,7 +923,6 @@ EXPORT_SYMBOL(file_path);
  * vfs_open - open the file at the given path
  * @path: path to open
  * @file: newly allocated file with f_flag initialized
- * @cred: credentials to use
  */
 int vfs_open(const struct path *path, struct file *file)
 {
-- 
2.17.1



Re: [PATCHv2] nvme: Assign subsy instance from first ctrl

2019-09-06 Thread Minwoo Im
> Subject: [PATCHv2] nvme: Assign subsy instance from first ctrl

I'm not sure, but I have not seen the term 'subsy' before.  Maybe
s/subsy/subsys/ ?

> 
> The namespace disk names must be unique for the lifetime of the
> subsystem. This was accomplished by using their parent subsystems'
> instances which were allocated independently from the controllers
> connected to that subsystem. This allowed name prefixes assigned to
> namespaces to match a controller from an unrelated subsystem, and has
> created confusion among users examining device nodes.
> 
> Ensure a namespace's subsystem instance never clashes with a controller
> instance of another subsystem by transferring the instance ownership
> to the parent subsystem from the first controller discovered in that
> subsystem.
> 
> Reviewed-by: Logan Gunthorpe 
> Signed-off-by: Keith Busch 

Keith, thanks for this patch.  I really like this concept, which can
help avoid instance mistakes.

Otherwise looks good to me.

Reviewed-by: Minwoo Im 


[PATCH 2/2] lightnvm: print error when target is not found

2019-07-27 Thread Minwoo Im
If userspace requests a target to be removed, nvm_remove_tgt() will
iterate the nvm_devices list to find the given target; if it is not
found, it should print out an error.

Cc: Matias Bjørling 
Cc: Javier González 
Signed-off-by: Minwoo Im 
---
 drivers/lightnvm/core.c | 5 -
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/drivers/lightnvm/core.c b/drivers/lightnvm/core.c
index 4c7b48f72e80..4c89a2420a51 100644
--- a/drivers/lightnvm/core.c
+++ b/drivers/lightnvm/core.c
@@ -494,8 +494,11 @@ static int nvm_remove_tgt(struct nvm_ioctl_remove *remove)
}
up_read(&nvm_lock);
 
-   if (!t)
+   if (!t) {
+   pr_err("failed to remove target %s not found\n",
+   remove->tgtname);
return 1;
+   }
 
__nvm_remove_target(t, true);
kref_put(&dev->ref, nvm_free);
-- 
2.17.1



Re: [PATCH] nvme-pci: Use dev_get_drvdata where possible

2019-07-25 Thread Minwoo Im
On 19-07-24 20:22:35, Chuhong Yuan wrote:
> Instead of using to_pci_dev + pci_get_drvdata,
> use dev_get_drvdata to make code simpler.
> 
> Signed-off-by: Chuhong Yuan 

This looks good to me.

Reviewed-by: Minwoo Im 


Re: [PATCH] firmware: qcom_scm: fix error for incompatible pointer

2019-07-23 Thread Minwoo Im
> > > > We just can cast phys_addr_t to dma_addr_t here.
> > > 
> > > IME, casting is rarely a proper solution.
> > 
> > *nod*
> > 
> > ptr_phys probably should be a dma_addr_t.  Unless this driver is so
> > magic that it really wants a physical and not a dma address, in which
> > case it needs to use alloc_pages instead of dma_alloc_coherent
> > and then call page_to_phys on the returned page, and a very big comment
> > explaining why it is so special.
> 
> The scm call takes physical addresses (which happens to be 1:1 with DMA
> addresses for this driver).
> 
> This allocation started off (downstream) as a simple kmalloc(), but
> while the scm call is being executed an access from Linux will cause a
> security violation (that's not handled gracefully). The properties of
> dma_alloc is closer, so that's where the code is today.
> 
> Optimally this should be something like alloc_pages() and some mechanism
> for unmapping the pages during the call. But no one has come up with a
> suitable patch for that.
> 
> 
> But there's a patch from Stephen for this already (not doing a
> typecast).  Apparently I missed merging this, so I'll do that.
> 
> https://lore.kernel.org/linux-arm-msm/20190517210923.202131-2-swb...@chromium.org/

Bjron,

I appreciate for checking this.  And also thanks all you guys for the
comments here!

Thanks,


[PATCH] firmware: qcom_scm: fix error for incompatible pointer

2019-07-19 Thread Minwoo Im
The following error can happen when trying to build it:

```
drivers/firmware/qcom_scm.c: In function ‘qcom_scm_assign_mem’:
drivers/firmware/qcom_scm.c:460:47: error: passing argument 3 of 
‘dma_alloc_coherent’ from incompatible pointer type 
[-Werror=incompatible-pointer-types]
  ptr = dma_alloc_coherent(__scm->dev, ptr_sz, &ptr_phys, GFP_KERNEL);
   ^
In file included from drivers/firmware/qcom_scm.c:12:0:
./include/linux/dma-mapping.h:636:21: note: expected ‘dma_addr_t * {aka long 
long unsigned int *}’ but argument is of type ‘phys_addr_t * {aka unsigned int 
*}’
 static inline void *dma_alloc_coherent(struct device *dev, size_t size,
 ^~
```

We can just cast phys_addr_t to dma_addr_t here.

Cc: Andy Gross 
Cc: David Brown 
Signed-off-by: Minwoo Im 
---
 drivers/firmware/qcom_scm.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/firmware/qcom_scm.c b/drivers/firmware/qcom_scm.c
index 2ddc118dba1b..7f6c841fa200 100644
--- a/drivers/firmware/qcom_scm.c
+++ b/drivers/firmware/qcom_scm.c
@@ -457,7 +457,8 @@ int qcom_scm_assign_mem(phys_addr_t mem_addr, size_t mem_sz,
ptr_sz = ALIGN(src_sz, SZ_64) + ALIGN(mem_to_map_sz, SZ_64) +
ALIGN(dest_sz, SZ_64);
 
-   ptr = dma_alloc_coherent(__scm->dev, ptr_sz, &ptr_phys, GFP_KERNEL);
+   ptr = dma_alloc_coherent(__scm->dev, ptr_sz, (dma_addr_t *) &ptr_phys,
+   GFP_KERNEL);
if (!ptr)
return -ENOMEM;
 
-- 
2.17.1



Re: [PATCH v2 3/3] nvme-pci: Add support for Apple 2018+ models

2019-07-17 Thread Minwoo Im
On 19-07-17 10:45:27, Benjamin Herrenschmidt wrote:
> Based on reverse engineering and original patch by
> 
> Paul Pawlowski 
> 
> This adds support for Apple weird implementation of NVME in their
> 2018 or later machines. It accounts for the twice-as-big SQ entries
> for the IO queues, and the fact that only interrupt vector 0 appears
> to function properly.
> 
> Signed-off-by: Benjamin Herrenschmidt 
> 
> # Conflicts:
> # drivers/nvme/host/core.c
> ---
>  drivers/nvme/host/nvme.h | 10 ++
>  drivers/nvme/host/pci.c  | 21 -
>  2 files changed, 30 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
> index 716a876119c8..ced0e0a7e039 100644
> --- a/drivers/nvme/host/nvme.h
> +++ b/drivers/nvme/host/nvme.h
> @@ -92,6 +92,16 @@ enum nvme_quirks {
>* Broken Write Zeroes.
>*/
>   NVME_QUIRK_DISABLE_WRITE_ZEROES = (1 << 9),
> +
> + /*
> +  * Use only one interrupt vector for all queues
> +  */
> + NVME_QUIRK_SINGLE_VECTOR= (1 << 10),
> +
> + /*
> +  * Use non-standard 128 bytes SQEs.
> +  */
> + NVME_QUIRK_128_BYTES_SQES   = (1 << 11),
>  };
>  
>  /*
> diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
> index 1637677afb78..7088971d4c42 100644
> --- a/drivers/nvme/host/pci.c
> +++ b/drivers/nvme/host/pci.c
> @@ -2081,6 +2081,13 @@ static int nvme_setup_irqs(struct nvme_dev *dev, 
> unsigned int nr_io_queues)
>   dev->io_queues[HCTX_TYPE_DEFAULT] = 1;
>   dev->io_queues[HCTX_TYPE_READ] = 0;
>  
> + /*
> +  * Some Apple controllers require all queues to use the
> +  * first vector.
> +  */
> + if (dev->ctrl.quirks & NVME_QUIRK_SINGLE_VECTOR)
> + irq_queues = 1;
> +
>   return pci_alloc_irq_vectors_affinity(pdev, 1, irq_queues,
> PCI_IRQ_ALL_TYPES | PCI_IRQ_AFFINITY, &affd);
>  }
> @@ -2322,7 +2329,16 @@ static int nvme_pci_enable(struct nvme_dev *dev)
>   io_queue_depth);
>   dev->db_stride = 1 << NVME_CAP_STRIDE(dev->ctrl.cap);
>   dev->dbs = dev->bar + 4096;
> - dev->io_sqes = NVME_NVM_IOSQES;
> +
> + /*
> +  * Some Apple controllers require a non-standard SQE size.
> +  * Interestingly they also seem to ignore the CC:IOSQES register
> +  * so we don't bother updating it here.
> +  */

That is really interesting.

This also looks good to me.

Reviewed-by: Minwoo Im 

Thanks,


Re: [PATCH v2 2/3] nvme-pci: Add support for variable IO SQ element size

2019-07-17 Thread Minwoo Im
On 19-07-17 10:45:26, Benjamin Herrenschmidt wrote:
> The size of a submission queue element should always be 6 (64 bytes)
> by spec.
> 
> However some controllers such as Apple's are not properly implementing
> the standard and require a different size.
> 
> This provides the ground work for the subsequent quirks for these
> controllers.
> 
> Signed-off-by: Benjamin Herrenschmidt 
> ---
>  drivers/nvme/host/pci.c | 11 ---
>  include/linux/nvme.h|  1 +
>  2 files changed, 9 insertions(+), 3 deletions(-)
> 
> diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
> index 8f006638452b..1637677afb78 100644
> --- a/drivers/nvme/host/pci.c
> +++ b/drivers/nvme/host/pci.c
> @@ -28,7 +28,7 @@
>  #include "trace.h"
>  #include "nvme.h"
>  
> -#define SQ_SIZE(q)   ((q)->q_depth * sizeof(struct nvme_command))
> +#define SQ_SIZE(q)   ((q)->q_depth << (q)->sqes)
>  #define CQ_SIZE(q)   ((q)->q_depth * sizeof(struct nvme_completion))
>  
>  #define SGES_PER_PAGE(PAGE_SIZE / sizeof(struct nvme_sgl_desc))
> @@ -100,6 +100,7 @@ struct nvme_dev {
>   unsigned io_queues[HCTX_MAX_TYPES];
>   unsigned int num_vecs;
>   int q_depth;
> + int io_sqes;
>   u32 db_stride;
>   void __iomem *bar;
>   unsigned long bar_mapped_size;
> @@ -162,7 +163,7 @@ static inline struct nvme_dev *to_nvme_dev(struct 
> nvme_ctrl *ctrl)
>  struct nvme_queue {
>   struct nvme_dev *dev;
>   spinlock_t sq_lock;
> - struct nvme_command *sq_cmds;
> + void *sq_cmds;

It would be great if it can remain the existing data type for the
SQEs...  But I'm fine with this also.

It looks good to me.

Reviewed-by: Minwoo Im 

Thanks,


Re: [PATCH v2 1/3] nvme-pci: Pass the queue to SQ_SIZE/CQ_SIZE macros

2019-07-17 Thread Minwoo Im
On 19-07-17 10:45:25, Benjamin Herrenschmidt wrote:
> This will make it easier to handle variable queue entry sizes
> later. No functional change.
> 
> Signed-off-by: Benjamin Herrenschmidt 
> Reviewed-by: Christoph Hellwig 

Reviewed-by: Minwoo Im 


Re: linux-next: Fixes tag needs some work in the block tree

2019-07-12 Thread Minwoo Im
On 19-07-11 16:03:22, Jens Axboe wrote:
> On 7/11/19 3:35 PM, Stephen Rothwell wrote:
> > Hi all,
> > 
> > In commit
> > 
> >8f3858763d33 ("nvme: fix NULL deref for fabrics options")
> > 
> > Fixes tag
> > 
> >Fixes: 958f2a0f8 ("nvme-tcp: set the STABLE_WRITES flag when data digests
> > 
> > has these problem(s):
> > 
> >- SHA1 should be at least 12 digits long
> >  Can be fixed by setting core.abbrev to 12 (or more) or (for git v2.11
> >  or later) just making sure it is not set (or set to "auto").
> >- Subject has leading but no trailing parentheses
> >- Subject has leading but no trailing quotes
> > 
> > Please do not split Fixes tags over more than one line.  Also do not
> > include blank lines among the tags.

I'm sorry for noises here.  I will keep that in mind.

Thanks Stephen,

> 
> I should have caught that. Since it's top-of-tree and recent, I'll
> amend it.

Jens, I will do it from next time.  Thanks for the amend.


Re: [PATCH -next] nvme-pci: Make nvme_dev_pm_ops static

2019-06-26 Thread Minwoo Im
On 19-06-26 10:09:02, YueHaibing wrote:
> Fix sparse warning:
> 
> drivers/nvme/host/pci.c:2926:25: warning:
>  symbol 'nvme_dev_pm_ops' was not declared. Should it be static?
> 
> Reported-by: Hulk Robot 
> Signed-off-by: YueHaibing 
> ---
>  drivers/nvme/host/pci.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
> index 1893520..f500133 100644
> --- a/drivers/nvme/host/pci.c
> +++ b/drivers/nvme/host/pci.c
> @@ -2923,7 +2923,7 @@ static int nvme_simple_resume(struct device *dev)
>   return 0;
>  }
>  
> -const struct dev_pm_ops nvme_dev_pm_ops = {
> +static const struct dev_pm_ops nvme_dev_pm_ops = {
>   .suspend= nvme_suspend,
>   .resume = nvme_resume,
>   .freeze     = nvme_simple_suspend,

IMHO, it should be in static.

Reviewed-by: Minwoo Im 


Re: [PATCH][next] nvme-trace: fix spelling mistake "spcecific" -> "specific"

2019-06-26 Thread Minwoo Im
On 19-06-26 13:43:23, Colin King wrote:
> From: Colin Ian King 
> 
> There are two spelling mistakes in trace_seq_printf messages, fix these.
> 
> Signed-off-by: Colin Ian King 
> ---
>  drivers/nvme/host/trace.c   | 2 +-
>  drivers/nvme/target/trace.c | 2 +-
>  2 files changed, 2 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/nvme/host/trace.c b/drivers/nvme/host/trace.c
> index f01ad0fd60bb..6980ab827233 100644
> --- a/drivers/nvme/host/trace.c
> +++ b/drivers/nvme/host/trace.c
> @@ -178,7 +178,7 @@ static const char *nvme_trace_fabrics_common(struct 
> trace_seq *p, u8 *spc)
>  {
>   const char *ret = trace_seq_buffer_ptr(p);
>  
> - trace_seq_printf(p, "spcecific=%*ph", 24, spc);
> + trace_seq_printf(p, "specific=%*ph", 24, spc);
>   trace_seq_putc(p, 0);
>   return ret;
>  }
> diff --git a/drivers/nvme/target/trace.c b/drivers/nvme/target/trace.c
> index cdcdd14c6408..6af11d493271 100644
> --- a/drivers/nvme/target/trace.c
> +++ b/drivers/nvme/target/trace.c
> @@ -146,7 +146,7 @@ static const char *nvmet_trace_fabrics_common(struct 
> trace_seq *p, u8 *spc)
>  {
>   const char *ret = trace_seq_buffer_ptr(p);
>  
> - trace_seq_printf(p, "spcecific=%*ph", 24, spc);
> + trace_seq_printf(p, "specific=%*ph", 24, spc);
>   trace_seq_putc(p, 0);
>   return ret;
>  }
> -- 
> 2.20.1
> 
> 
> ___
> Linux-nvme mailing list
> linux-n...@lists.infradead.org
> http://lists.infradead.org/mailman/listinfo/linux-nvme

*facepalm*..

Thanks for fixing this, Colin!

Reviewed-by: Minwoo Im 


Re: [RFC PATCH] mpt3sas: support target smid for [abort|query] task

2019-06-13 Thread Minwoo Im
On 19-06-13 16:54:02, Minwoo Im wrote:
> We can request task management IOCTL command(MPI2_FUNCTION_SCSI_TASK_MGMT)
> to /dev/mpt3ctl.  If the given task_type is either abort task or query
> task, it may need a field named "Initiator Port Transfer Tag to Manage"
> in the IU.
> 
> Current code does not support to check target IPTT tag from the
> tm_request.  This patch introduces to check TaskMID given from the
> userspace as a target tag.  We have a rule of relationship between
> (struct request *req->tag) and smid in mpt3sas_base.c:
> 
> 3318 u16
> 3319 mpt3sas_base_get_smid_scsiio(struct MPT3SAS_ADAPTER *ioc, u8 cb_idx,
> 3320 struct scsi_cmnd *scmd)
> 3321 {
> 3322 struct scsiio_tracker *request = scsi_cmd_priv(scmd);
> 3323 unsigned int tag = scmd->request->tag;
> 3324 u16 smid;
> 3325
> 3326 smid = tag + 1;
> 
> So if we want to abort a request tagged #X, then we can pass (X + 1) to
> this IOCTL handler.
> 
> Signed-off-by: Minwoo Im 
> ---

Sorry for the duplicated patches on the same mailing list.  Please ignore
the first two of them.

Thanks,


[RFC PATCH] mpt3sas: support target smid for [abort|query] task

2019-06-13 Thread Minwoo Im
We can request task management IOCTL command(MPI2_FUNCTION_SCSI_TASK_MGMT)
to /dev/mpt3ctl.  If the given task_type is either abort task or query
task, it may need a field named "Initiator Port Transfer Tag to Manage"
in the IU.

Current code does not support to check target IPTT tag from the
tm_request.  This patch introduces to check TaskMID given from the
userspace as a target tag.  We have a rule of relationship between
(struct request *req->tag) and smid in mpt3sas_base.c:

3318 u16
3319 mpt3sas_base_get_smid_scsiio(struct MPT3SAS_ADAPTER *ioc, u8 cb_idx,
3320 struct scsi_cmnd *scmd)
3321 {
3322 struct scsiio_tracker *request = scsi_cmd_priv(scmd);
3323 unsigned int tag = scmd->request->tag;
3324 u16 smid;
3325
3326 smid = tag + 1;

So if we want to abort a request tagged #X, then we can pass (X + 1) to
this IOCTL handler.

Signed-off-by: Minwoo Im 
---
 drivers/scsi/mpt3sas/mpt3sas_ctl.c | 10 ++
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/drivers/scsi/mpt3sas/mpt3sas_ctl.c 
b/drivers/scsi/mpt3sas/mpt3sas_ctl.c
index b2bb47c14d35..5c7539dae713 100644
--- a/drivers/scsi/mpt3sas/mpt3sas_ctl.c
+++ b/drivers/scsi/mpt3sas/mpt3sas_ctl.c
@@ -596,15 +596,17 @@ _ctl_set_task_mid(struct MPT3SAS_ADAPTER *ioc, struct 
mpt3_ioctl_command *karg,
if (priv_data->sas_target->handle != handle)
continue;
st = scsi_cmd_priv(scmd);
-   tm_request->TaskMID = cpu_to_le16(st->smid);
-   found = 1;
+   if (tm_request->TaskMID == st->smid) {
+   tm_request->TaskMID = cpu_to_le16(st->smid);
+   found = 1;
+   }
}
 
if (!found) {
dctlprintk(ioc,
-  ioc_info(ioc, "%s: handle(0x%04x), lun(%d), no 
active mid!!\n",
+  ioc_info(ioc, "%s: handle(0x%04x), lun(%d), no 
matched mid(%d)!!\n",
desc, le16_to_cpu(tm_request->DevHandle),
-   lun));
+   lun, tm_request->TaskMID));
tm_reply = ioc->ctl_cmds.reply;
tm_reply->DevHandle = tm_request->DevHandle;
tm_reply->Function = MPI2_FUNCTION_SCSI_TASK_MGMT;
-- 
2.16.1



[RFC PATCH] mpt3sas: support target smid for [abort|query] task

2019-06-13 Thread Minwoo Im
We can request task management IOCTL command(MPI2_FUNCTION_SCSI_TASK_MGMT)
to /dev/mpt3ctl.  If the given task_type is either abort task or query
task, it may need a field named "Initiator Port Transfer Tag to Manage"
in the IU.

Current code does not support to check target IPTT tag from the
tm_request.  This patch introduces to check TaskMID given from the
userspace as a target tag.  We have a rule of relationship between
(struct request *req->tag) and smid in mpt3sas_base.c:

3318 u16
3319 mpt3sas_base_get_smid_scsiio(struct MPT3SAS_ADAPTER *ioc, u8 cb_idx,
3320 struct scsi_cmnd *scmd)
3321 {
3322 struct scsiio_tracker *request = scsi_cmd_priv(scmd);
3323 unsigned int tag = scmd->request->tag;
3324 u16 smid;
3325
3326 smid = tag + 1;

So if we want to abort a request tagged #X, then we can pass (X + 1) to
this IOCTL handler.

Signed-off-by: Minwoo Im 
---
 drivers/scsi/mpt3sas/mpt3sas_ctl.c | 10 ++
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/drivers/scsi/mpt3sas/mpt3sas_ctl.c 
b/drivers/scsi/mpt3sas/mpt3sas_ctl.c
index b2bb47c14d35..5c7539dae713 100644
--- a/drivers/scsi/mpt3sas/mpt3sas_ctl.c
+++ b/drivers/scsi/mpt3sas/mpt3sas_ctl.c
@@ -596,15 +596,17 @@ _ctl_set_task_mid(struct MPT3SAS_ADAPTER *ioc, struct 
mpt3_ioctl_command *karg,
if (priv_data->sas_target->handle != handle)
continue;
st = scsi_cmd_priv(scmd);
-   tm_request->TaskMID = cpu_to_le16(st->smid);
-   found = 1;
+   if (tm_request->TaskMID == st->smid) {
+   tm_request->TaskMID = cpu_to_le16(st->smid);
+   found = 1;
+   }
}
 
if (!found) {
dctlprintk(ioc,
-  ioc_info(ioc, "%s: handle(0x%04x), lun(%d), no 
active mid!!\n",
+  ioc_info(ioc, "%s: handle(0x%04x), lun(%d), no 
matched mid(%d)!!\n",
desc, le16_to_cpu(tm_request->DevHandle),
-   lun));
+   lun, tm_request->TaskMID));
tm_reply = ioc->ctl_cmds.reply;
tm_reply->DevHandle = tm_request->DevHandle;
tm_reply->Function = MPI2_FUNCTION_SCSI_TASK_MGMT;
-- 
2.16.1



[RFC PATCH] mpt3sas: support target smid for [abort|query] task

2019-06-13 Thread Minwoo Im
We can request task management IOCTL command(MPI2_FUNCTION_SCSI_TASK_MGMT)
to /dev/mpt3ctl.  If the given task_type is either abort task or query
task, it may need a field named "Initiator Port Transfer Tag to Manage"
in the IU.

Current code does not support to check target IPTT tag from the
tm_request.  This patch introduces to check TaskMID given from the
userspace as a target tag.  We have a rule of relationship between
(struct request *req->tag) and smid in mpt3sas_base.c:

3318 u16
3319 mpt3sas_base_get_smid_scsiio(struct MPT3SAS_ADAPTER *ioc, u8 cb_idx,
3320 struct scsi_cmnd *scmd)
3321 {
3322 struct scsiio_tracker *request = scsi_cmd_priv(scmd);
3323 unsigned int tag = scmd->request->tag;
3324 u16 smid;
3325
3326 smid = tag + 1;

So if we want to abort a request tagged #X, then we can pass (X + 1) to
this IOCTL handler.

Signed-off-by: Minwoo Im 
---
 drivers/scsi/mpt3sas/mpt3sas_ctl.c | 10 ++
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/drivers/scsi/mpt3sas/mpt3sas_ctl.c 
b/drivers/scsi/mpt3sas/mpt3sas_ctl.c
index b2bb47c14d35..5c7539dae713 100644
--- a/drivers/scsi/mpt3sas/mpt3sas_ctl.c
+++ b/drivers/scsi/mpt3sas/mpt3sas_ctl.c
@@ -596,15 +596,17 @@ _ctl_set_task_mid(struct MPT3SAS_ADAPTER *ioc, struct 
mpt3_ioctl_command *karg,
if (priv_data->sas_target->handle != handle)
continue;
st = scsi_cmd_priv(scmd);
-   tm_request->TaskMID = cpu_to_le16(st->smid);
-   found = 1;
+   if (tm_request->TaskMID == st->smid) {
+   tm_request->TaskMID = cpu_to_le16(st->smid);
+   found = 1;
+   }
}
 
if (!found) {
dctlprintk(ioc,
-  ioc_info(ioc, "%s: handle(0x%04x), lun(%d), no 
active mid!!\n",
+  ioc_info(ioc, "%s: handle(0x%04x), lun(%d), no 
matched mid(%d)!!\n",
desc, le16_to_cpu(tm_request->DevHandle),
-   lun));
+   lun, tm_request->TaskMID));
tm_reply = ioc->ctl_cmds.reply;
tm_reply->DevHandle = tm_request->DevHandle;
tm_reply->Function = MPI2_FUNCTION_SCSI_TASK_MGMT;
-- 
2.16.1



[tip:irq/core] genirq/affinity: Remove unused argument from [__]irq_build_affinity_masks()

2019-06-12 Thread tip-bot for Minwoo Im
Commit-ID:  0e51833042fccfe882ef3e85a346252550d26c22
Gitweb: https://git.kernel.org/tip/0e51833042fccfe882ef3e85a346252550d26c22
Author: Minwoo Im 
AuthorDate: Sun, 2 Jun 2019 20:21:17 +0900
Committer:  Thomas Gleixner 
CommitDate: Wed, 12 Jun 2019 10:52:45 +0200

genirq/affinity: Remove unused argument from [__]irq_build_affinity_masks()

The *affd argument is neither used in irq_build_affinity_masks() nor
__irq_build_affinity_masks(). Remove it.

Signed-off-by: Minwoo Im 
Signed-off-by: Thomas Gleixner 
Reviewed-by: Ming Lei 
Cc: Minwoo Im 
Cc: linux-bl...@vger.kernel.org
Link: https://lkml.kernel.org/r/20190602112117.31839-1-minwoo.im@gmail.com

---
 kernel/irq/affinity.c | 12 +---
 1 file changed, 5 insertions(+), 7 deletions(-)

diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c
index f18cd5aa33e8..4352b08ae48d 100644
--- a/kernel/irq/affinity.c
+++ b/kernel/irq/affinity.c
@@ -94,8 +94,7 @@ static int get_nodes_in_cpumask(cpumask_var_t 
*node_to_cpumask,
return nodes;
 }
 
-static int __irq_build_affinity_masks(const struct irq_affinity *affd,
- unsigned int startvec,
+static int __irq_build_affinity_masks(unsigned int startvec,
  unsigned int numvecs,
  unsigned int firstvec,
  cpumask_var_t *node_to_cpumask,
@@ -171,8 +170,7 @@ static int __irq_build_affinity_masks(const struct 
irq_affinity *affd,
  * 1) spread present CPU on these vectors
  * 2) spread other possible CPUs on these vectors
  */
-static int irq_build_affinity_masks(const struct irq_affinity *affd,
-   unsigned int startvec, unsigned int numvecs,
+static int irq_build_affinity_masks(unsigned int startvec, unsigned int 
numvecs,
unsigned int firstvec,
struct irq_affinity_desc *masks)
 {
@@ -197,7 +195,7 @@ static int irq_build_affinity_masks(const struct 
irq_affinity *affd,
build_node_to_cpumask(node_to_cpumask);
 
/* Spread on present CPUs starting from affd->pre_vectors */
-   nr_present = __irq_build_affinity_masks(affd, curvec, numvecs,
+   nr_present = __irq_build_affinity_masks(curvec, numvecs,
firstvec, node_to_cpumask,
cpu_present_mask, nmsk, masks);
 
@@ -212,7 +210,7 @@ static int irq_build_affinity_masks(const struct 
irq_affinity *affd,
else
curvec = firstvec + nr_present;
cpumask_andnot(npresmsk, cpu_possible_mask, cpu_present_mask);
-   nr_others = __irq_build_affinity_masks(affd, curvec, numvecs,
+   nr_others = __irq_build_affinity_masks(curvec, numvecs,
   firstvec, node_to_cpumask,
   npresmsk, nmsk, masks);
put_online_cpus();
@@ -295,7 +293,7 @@ irq_create_affinity_masks(unsigned int nvecs, struct 
irq_affinity *affd)
unsigned int this_vecs = affd->set_size[i];
int ret;
 
-   ret = irq_build_affinity_masks(affd, curvec, this_vecs,
+   ret = irq_build_affinity_masks(curvec, this_vecs,
   curvec, masks);
if (ret) {
kfree(masks);


Re: [PATCH V3 0/5] nvme-trace: Add support for fabrics command

2019-05-17 Thread Minwoo Im
On 19-05-12 16:34:08, Minwoo Im wrote:
> Hi,
> 
> Here's a third patchset to support fabrics command tracing.  The first
> patch moves the host/trace module outside of the host directory to provide
> common interfaces for both host and target.  The second one adds support for
> tracing fabrics commands from the host side.  The third is a trivial clean-up
> providing a helper function to figure out whether a given command structure
> is for fabrics or not.
> 
> The fourth and fifth are the major change points of this patchset.  The 4th
> patch adds request tracing from the target side.  The 5th, of course, updates
> the completion of the given request.
> 
> Please review.
> Thanks,
> 
> Changes to V2:
>   - Provide a common code for both host and target. (Chaitanya)
>   - Add support for tracing requests in target-side (Chaitanya)
>   - Make it simple in trace.h without branch out from nvme core module
> (Christoph)
> 
> Changes to V1:
>   - fabrics commands should also be decoded, not just showing that it's
> a fabrics command. (Christoph)
>   - do not make it within nvme admin commands (Chaitanya)
> 
> Minwoo Im (5):
>   nvme: Make trace common for host and target both
>   nvme-trace: Support tracing fabrics commands from host-side
>   nvme: Introduce nvme_is_fabrics to check fabrics cmd
>   nvme-trace: Add tracing for req_init in target
>   nvme-trace: Add tracing for req_comp in target

Ping :)


Re: [PATCH v3 5/7] nvme-pci: add device coredump infrastructure

2019-05-13 Thread Minwoo Im
> +static int nvme_get_telemetry_log_blocks(struct nvme_ctrl *ctrl, void *buf,
> +  size_t bytes, loff_t offset)
> +{
> + loff_t pos = 0;
> + u32 chunk_size;
> +
> + if (check_mul_overflow(ctrl->max_hw_sectors, 512u, &chunk_size))
> + chunk_size = UINT_MAX;
> +
> + while (pos < bytes) {
> + size_t size = min_t(size_t, bytes - pos, chunk_size);
> + int ret;
> +
> + ret = nvme_get_log(ctrl, NVME_NSID_ALL,
> NVME_LOG_TELEMETRY_CTRL,
> +0, buf + pos, size, offset + pos);
> + if (ret)
> + return ret;
> +
> + pos += size;
> + }
> +
> + return 0;
> +}
> +
> +static int nvme_get_telemetry_log(struct nvme_ctrl *ctrl,
> +   struct sg_table *table, size_t bytes)
> +{
> + int n = sg_nents(table->sgl);
> + struct scatterlist *sg;
> + size_t offset = 0;
> + int i;
> +
> + for_each_sg(table->sgl, sg, n, i) {
> + struct page *page = sg_page(sg);
> + size_t size = min_t(int, bytes - offset, sg->length);
> + int ret;
> +
> + ret = nvme_get_telemetry_log_blocks(ctrl,
> page_address(page),
> + size, offset);
> + if (ret)
> + return ret;
> +
> + offset += size;
> + }
> +
> + return 0;
> +}

Can we have those two in nvme-core module instead of being in pci module?


Re: [PATCH v3 6/7] nvme-pci: trigger device coredump on command timeout

2019-05-13 Thread Minwoo Im
> -static void __maybe_unused nvme_coredump_init(struct nvme_dev *dev);
> -static void __maybe_unused nvme_coredump_logs(struct nvme_dev *dev);
> -static void __maybe_unused nvme_coredump_complete(struct nvme_dev
> *dev);
> +static void nvme_coredump_init(struct nvme_dev *dev);
> +static void nvme_coredump_logs(struct nvme_dev *dev);
> +static void nvme_coredump_complete(struct nvme_dev *dev);

You just have added those three prototypes in previous patch.  Did I miss
something here?


Re: [PATCH] nvme/pci: Use host managed power state for suspend

2019-05-12 Thread Minwoo Im
> > +   union nvme_result res;
> > +   int ret;
> > +
> > +   if (!result)
> > +   return -EINVAL;
> > +
> > +   memset(&c, 0, sizeof(c));
> > +   c.features.opcode = nvme_admin_get_features;
> > +   c.features.fid = cpu_to_le32(NVME_FEAT_POWER_MGMT);
> > +
> > +   ret = __nvme_submit_sync_cmd(ctrl->admin_q, &c, &res,
> > +   NULL, 0, 0, NVME_QID_ANY, 0, 0, false);
> > +   if (ret >= 0)
> > +   *result = le32_to_cpu(res.u32);
> Maybe add a check for result here in the above if before dereferencing the pointer:
>   if (ret >= 0 && result)
> 

'result' has already been checked a few lines above.


[PATCH V3 3/5] nvme: Introduce nvme_is_fabrics to check fabrics cmd

2019-05-12 Thread Minwoo Im
This patch introduces an nvme_is_fabrics() inline function to check
whether or not the given command structure is for fabrics.

Cc: Keith Busch 
Cc: Jens Axboe 
Cc: Christoph Hellwig 
Cc: Sagi Grimberg 
Cc: James Smart 
Signed-off-by: Minwoo Im 
---
 drivers/nvme/host/fabrics.c   | 2 +-
 drivers/nvme/target/core.c| 2 +-
 drivers/nvme/target/fabrics-cmd.c | 2 +-
 drivers/nvme/target/fc.c  | 2 +-
 include/linux/nvme.h  | 7 ++-
 5 files changed, 10 insertions(+), 5 deletions(-)

diff --git a/drivers/nvme/host/fabrics.c b/drivers/nvme/host/fabrics.c
index 592d1e61ef7e..931995f2dbc3 100644
--- a/drivers/nvme/host/fabrics.c
+++ b/drivers/nvme/host/fabrics.c
@@ -578,7 +578,7 @@ bool __nvmf_check_ready(struct nvme_ctrl *ctrl, struct 
request *rq,
switch (ctrl->state) {
case NVME_CTRL_NEW:
case NVME_CTRL_CONNECTING:
-   if (req->cmd->common.opcode == nvme_fabrics_command &&
+   if (nvme_is_fabrics(req->cmd) &&
req->cmd->fabrics.fctype == nvme_fabrics_type_connect)
return true;
break;
diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c
index 7734a6acff85..da2ea97042af 100644
--- a/drivers/nvme/target/core.c
+++ b/drivers/nvme/target/core.c
@@ -871,7 +871,7 @@ bool nvmet_req_init(struct nvmet_req *req, struct nvmet_cq 
*cq,
status = nvmet_parse_connect_cmd(req);
else if (likely(req->sq->qid != 0))
status = nvmet_parse_io_cmd(req);
-   else if (req->cmd->common.opcode == nvme_fabrics_command)
+   else if (nvme_is_fabrics(req->cmd))
status = nvmet_parse_fabrics_cmd(req);
else if (req->sq->ctrl->subsys->type == NVME_NQN_DISC)
status = nvmet_parse_discovery_cmd(req);
diff --git a/drivers/nvme/target/fabrics-cmd.c 
b/drivers/nvme/target/fabrics-cmd.c
index 3b9f79aba98f..d16b55ffe79f 100644
--- a/drivers/nvme/target/fabrics-cmd.c
+++ b/drivers/nvme/target/fabrics-cmd.c
@@ -268,7 +268,7 @@ u16 nvmet_parse_connect_cmd(struct nvmet_req *req)
 {
struct nvme_command *cmd = req->cmd;
 
-   if (cmd->common.opcode != nvme_fabrics_command) {
+   if (!nvme_is_fabrics(cmd)) {
pr_err("invalid command 0x%x on unconnected queue.\n",
cmd->fabrics.opcode);
req->error_loc = offsetof(struct nvme_common_command, opcode);
diff --git a/drivers/nvme/target/fc.c b/drivers/nvme/target/fc.c
index 508661af0f50..a59c5a013a5c 100644
--- a/drivers/nvme/target/fc.c
+++ b/drivers/nvme/target/fc.c
@@ -1806,7 +1806,7 @@ nvmet_fc_prep_fcp_rsp(struct nvmet_fc_tgtport *tgtport,
 */
rspcnt = atomic_inc_return(&fod->queue->zrspcnt);
if (!(rspcnt % fod->queue->ersp_ratio) ||
-   sqe->opcode == nvme_fabrics_command ||
+   nvme_is_fabrics((struct nvme_command *) sqe) ||
xfr_length != fod->req.transfer_len ||
(le16_to_cpu(cqe->status) & 0xFFFE) || cqewd[0] || cqewd[1] ||
(sqe->flags & (NVME_CMD_FUSE_FIRST | NVME_CMD_FUSE_SECOND)) ||
diff --git a/include/linux/nvme.h b/include/linux/nvme.h
index c40720cb59ac..ab5e9392b42d 100644
--- a/include/linux/nvme.h
+++ b/include/linux/nvme.h
@@ -1165,6 +1165,11 @@ struct nvme_command {
};
 };
 
+static inline bool nvme_is_fabrics(struct nvme_command *cmd)
+{
+   return cmd->common.opcode == nvme_fabrics_command;
+}
+
 struct nvme_error_slot {
__le64  error_count;
__le16  sqid;
@@ -1186,7 +1191,7 @@ static inline bool nvme_is_write(struct nvme_command *cmd)
 *
 * Why can't we simply have a Fabrics In and Fabrics out command?
 */
-   if (unlikely(cmd->common.opcode == nvme_fabrics_command))
+   if (unlikely(nvme_is_fabrics(cmd)))
return cmd->fabrics.fctype & 1;
return cmd->common.opcode & 1;
 }
-- 
2.17.1



[PATCH V3 5/5] nvme-trace: Add tracing for req_comp in target

2019-05-12 Thread Minwoo Im
We can have the common tracing code with different event entries.
  - nvme_complete_rq
  - nvmet_req_complete

This patch updates existing TRACE_EVENT to a template to provide a
common tracing interface.

We can have it as a common code because most of the fields need to be
printed out for both host and target system.

Cc: Keith Busch 
Cc: Jens Axboe 
Cc: Christoph Hellwig 
Cc: Sagi Grimberg 
Cc: James Smart 
Signed-off-by: Minwoo Im 
---
 drivers/nvme/host/core.c   |  2 +-
 drivers/nvme/target/core.c |  3 +++
 drivers/nvme/trace.c   |  1 +
 drivers/nvme/trace.h   | 51 ++
 4 files changed, 45 insertions(+), 12 deletions(-)

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 39e49e9948c3..f377ed039a83 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -260,7 +260,7 @@ void nvme_complete_rq(struct request *req)
 {
blk_status_t status = nvme_error_status(req);
 
-   trace_nvme_complete_rq(req);
+   trace_nvme_complete_rq(NVME_TRACE_HOST, req);
 
if (nvme_req(req)->ctrl->kas)
nvme_req(req)->ctrl->comp_seen = true;
diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c
index 10b3b3767f91..0f184abe432f 100644
--- a/drivers/nvme/target/core.c
+++ b/drivers/nvme/target/core.c
@@ -690,6 +690,9 @@ static void __nvmet_req_complete(struct nvmet_req *req, u16 
status)
 
if (unlikely(status))
nvmet_set_error(req, status);
+
+   trace_nvmet_req_complete(NVME_TRACE_TARGET, req);
+
if (req->ns)
nvmet_put_namespace(req->ns);
req->ops->queue_response(req);
diff --git a/drivers/nvme/trace.c b/drivers/nvme/trace.c
index 8fe2dcee6a42..8071b60ec71d 100644
--- a/drivers/nvme/trace.c
+++ b/drivers/nvme/trace.c
@@ -222,3 +222,4 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(nvme_async_event);
 EXPORT_TRACEPOINT_SYMBOL_GPL(nvme_sq);
 
 EXPORT_TRACEPOINT_SYMBOL_GPL(nvmet_req_init);
+EXPORT_TRACEPOINT_SYMBOL_GPL(nvmet_req_complete);
diff --git a/drivers/nvme/trace.h b/drivers/nvme/trace.h
index afda9c2ab4a1..0674bb85ac66 100644
--- a/drivers/nvme/trace.h
+++ b/drivers/nvme/trace.h
@@ -181,9 +181,9 @@ DEFINE_EVENT(nvme__cmd_begin, nvmet_req_init,
TP_ARGS(type, req, cmd)
 );
 
-TRACE_EVENT(nvme_complete_rq,
-   TP_PROTO(struct request *req),
-   TP_ARGS(req),
+DECLARE_EVENT_CLASS(nvme__cmd_end,
+   TP_PROTO(enum nvme_trace_type type, void *req),
+   TP_ARGS(type, req),
TP_STRUCT__entry(
__array(char, disk, DISK_NAME_LEN)
__field(int, ctrl_id)
@@ -195,20 +195,49 @@ TRACE_EVENT(nvme_complete_rq,
__field(u16, status)
),
TP_fast_assign(
-   __entry->ctrl_id = nvme_req(req)->ctrl->instance;
-   __entry->qid = nvme_req_qid(req);
-   __entry->cid = req->tag;
-   __entry->result = le64_to_cpu(nvme_req(req)->result.u64);
-   __entry->retries = nvme_req(req)->retries;
-   __entry->flags = nvme_req(req)->flags;
-   __entry->status = nvme_req(req)->status;
-   __assign_disk_name(__entry->disk, req->rq_disk);
+   if (type != NVME_TRACE_TARGET) {
+   struct request *req = (struct request *) req;
+
+   __entry->ctrl_id = nvme_req(req)->ctrl->instance;
+   __entry->qid = nvme_req_qid(req);
+   __entry->cid = req->tag;
+   __entry->result =
+   le64_to_cpu(nvme_req(req)->result.u64);
+   __entry->retries = nvme_req(req)->retries;
+   __entry->flags = nvme_req(req)->flags;
+   __entry->status = nvme_req(req)->status;
+   __assign_disk_name(__entry->disk, req->rq_disk);
+   } else {
+   struct nvmet_ctrl *ctrl = nvmet_req_to_ctrl(req);
+   struct nvmet_cq *cq = ((struct nvmet_req *) req)->cq;
+   struct nvme_completion *cqe =
+   ((struct nvmet_req *) req)->cqe;
+   struct nvmet_ns *ns = ((struct nvmet_req *) req)->ns;
+
+   __entry->ctrl_id = ctrl ? ctrl->cntlid : 0;
+   __entry->qid = cq->qid;
+   __entry->cid = cqe->command_id;
+   __entry->result = cqe->result.u64;
+   __entry->flags = 0;
+   __entry->status = cqe->status >> 1;
+   __assign_disk_name(__entry->disk, ns ?
+   ns->bdev->bd_disk : NULL);
+   }
   

[PATCH V3 4/5] nvme-trace: Add tracing for req_init in target

2019-05-12 Thread Minwoo Im
We can have the common tracing code with different event entries.
  - nvme_setup_cmd
  - nvmet_req_init

This patch updates existing TRACE_EVENT to a template to provide a
common tracing interface.  Additionally, the nvme_setup_cmd entry point
has been defined as an event referring to the template introduced here.

It also introduces tracing at the point of request creation for the
target system.  This might be useful to figure out what kind of
request has been received in the target.

Here's the example of target tracing introduced with RDMA:
kworker/0:1H-1043  [000]    276.785946: nvmet_req_init: nvme0: qid=0, 
cmdid=0, nsid=1, flags=0x40, meta=0x0, cmd=(nvme_fabrics_type_connect recfmt=0, 
qid=0, sqsize=31, cattr=0, kato=0)
kworker/0:1H-1043  [000]    276.789893: nvmet_req_init: nvme1: qid=0, 
cmdid=10, nsid=4, flags=0x40, meta=0x0, cmd=(nvme_fabrics_type_property_get 
attrib=1, ofst=0x0)
kworker/0:1H-1043  [000]    276.791781: nvmet_req_init: nvme1: qid=0, 
cmdid=11, nsid=0, flags=0x40, meta=0x0, cmd=(nvme_fabrics_type_property_set 
attrib=0, ofst=0x14, value=0x460001)
kworker/0:1H-1043  [000]    276.794799: nvmet_req_init: nvme1: qid=0, 
cmdid=12, nsid=4, flags=0x40, meta=0x0, cmd=(nvme_fabrics_type_property_get 
attrib=0, ofst=0x1c)
kworker/0:1H-1043  [000]    276.796804: nvmet_req_init: nvme1: qid=0, 
cmdid=13, nsid=4, flags=0x40, meta=0x0, cmd=(nvme_fabrics_type_property_get 
attrib=0, ofst=0x8)
kworker/0:1H-1043  [000]    276.799163: nvmet_req_init: nvme1: qid=0, 
cmdid=10, nsid=4, flags=0x40, meta=0x0, cmd=(nvme_fabrics_type_property_get 
attrib=1, ofst=0x0)
kworker/0:1H-1043  [000]    276.801070: nvmet_req_init: nvme1: qid=0, 
cmdid=11, nsid=0, flags=0x40, meta=0x0, cmd=(nvme_admin_identify cns=1, 
ctrlid=0)
kworker/0:1H-1043  [000]    276.817592: nvmet_req_init: nvme1: qid=0, 
cmdid=12, nsid=0, flags=0x40, meta=0x0, cmd=(nvme_admin_get_log_page cdw10=70 
00 03 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00)
kworker/0:1H-1043  [000]    276.822963: nvmet_req_init: nvme1: qid=0, 
cmdid=13, nsid=0, flags=0x40, meta=0x0, cmd=(nvme_admin_get_log_page cdw10=70 
00 ff 01 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00)
kworker/0:1H-1043  [000]    276.831908: nvmet_req_init: nvme1: qid=0, 
cmdid=10, nsid=0, flags=0x40, meta=0x0, cmd=(nvme_admin_get_log_page cdw10=70 
00 03 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00)

Cc: Keith Busch 
Cc: Jens Axboe 
Cc: Christoph Hellwig 
Cc: Sagi Grimberg 
Cc: James Smart 
Signed-off-by: Minwoo Im 
---
 drivers/nvme/host/core.c|  2 +-
 drivers/nvme/target/core.c  |  3 +++
 drivers/nvme/target/nvmet.h |  9 +++
 drivers/nvme/trace.c|  2 ++
 drivers/nvme/trace.h| 50 -
 5 files changed, 59 insertions(+), 7 deletions(-)

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index ae76c0b78a5f..39e49e9948c3 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -732,7 +732,7 @@ blk_status_t nvme_setup_cmd(struct nvme_ns *ns, struct 
request *req,
}
 
cmd->common.command_id = req->tag;
-   trace_nvme_setup_cmd(req, cmd);
+   trace_nvme_setup_cmd(NVME_TRACE_HOST, req, cmd);
return ret;
 }
 EXPORT_SYMBOL_GPL(nvme_setup_cmd);
diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c
index da2ea97042af..10b3b3767f91 100644
--- a/drivers/nvme/target/core.c
+++ b/drivers/nvme/target/core.c
@@ -10,6 +10,7 @@
 #include 
 #include 
 
+#include "../trace.h"
 #include "nvmet.h"
 
 struct workqueue_struct *buffered_io_wq;
@@ -848,6 +849,8 @@ bool nvmet_req_init(struct nvmet_req *req, struct nvmet_cq 
*cq,
req->error_loc = NVMET_NO_ERROR_LOC;
req->error_slba = 0;
 
+   trace_nvmet_req_init(NVME_TRACE_TARGET, req, req->cmd);
+
/* no support for fused commands yet */
if (unlikely(flags & (NVME_CMD_FUSE_FIRST | NVME_CMD_FUSE_SECOND))) {
req->error_loc = offsetof(struct nvme_common_command, flags);
diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h
index c25d88fc9dec..2d569a1dc3f4 100644
--- a/drivers/nvme/target/nvmet.h
+++ b/drivers/nvme/target/nvmet.h
@@ -318,6 +318,15 @@ struct nvmet_req {
u64 error_slba;
 };
 
+static inline struct nvmet_ctrl *nvmet_req_to_ctrl(struct nvmet_req *req)
+{
+   struct nvmet_sq *sq = req->sq;
+
+   if (sq)
+   return sq->ctrl;
+   return NULL;
+}
+
 extern struct workqueue_struct *buffered_io_wq;
 
 static inline void nvmet_set_result(struct nvmet_req *req, u32 result)
diff --git a/drivers/nvme/trace.c b/drivers/nvme/trace.c
index 88dfadb68b92..8fe2dcee6a42 100644
--- a/drivers/nvme/trace.c
+++ b/drivers/nvme/trace.c
@@ -220,3 +220,5 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(nvme_setup_cmd);
 EXPORT_TRACEPOINT_SYMBOL_GPL(nvme_complete_rq);
 EXPORT_TRACEPOINT_SYMBOL_GPL(nvm

[PATCH V3 2/5] nvme-trace: Support tracing fabrics commands from host-side

2019-05-12 Thread Minwoo Im
This patch adds support for tracing fabrics commands detected from the
host side.  We can have trace_nvme_setup_cmd for both nvme and nvme
fabrics, even though a fabrics submission queue entry does not have
valid fields where nvme's fields are valid.

This patch modifies existing parse_nvme_cmd to have fctype as an
argument to support fabrics command which will be ignored in case of
nvme common case.

Cc: Keith Busch 
Cc: Jens Axboe 
Cc: Christoph Hellwig 
Cc: Sagi Grimberg 
Cc: James Smart 
Signed-off-by: Minwoo Im 
---
 drivers/nvme/trace.c | 67 
 drivers/nvme/trace.h | 41 +--
 2 files changed, 100 insertions(+), 8 deletions(-)

diff --git a/drivers/nvme/trace.c b/drivers/nvme/trace.c
index a2c8186d122d..88dfadb68b92 100644
--- a/drivers/nvme/trace.c
+++ b/drivers/nvme/trace.c
@@ -137,6 +137,73 @@ const char *nvme_trace_parse_nvm_cmd(struct trace_seq *p,
}
 }
 
+static const char *nvme_trace_fabrics_property_set(struct trace_seq *p, u8 
*spc)
+{
+   const char *ret = trace_seq_buffer_ptr(p);
+   u8 attrib = spc[0];
+   u32 ofst = get_unaligned_le32(spc + 4);
+   u64 value = get_unaligned_le64(spc + 8);
+
+   trace_seq_printf(p, "attrib=%u, ofst=0x%x, value=0x%llx",
+   attrib, ofst, value);
+   trace_seq_putc(p, 0);
+
+   return ret;
+}
+
+static const char *nvme_trace_fabrics_connect(struct trace_seq *p, u8 *spc)
+{
+   const char *ret = trace_seq_buffer_ptr(p);
+   u16 recfmt = get_unaligned_le16(spc);
+   u16 qid = get_unaligned_le16(spc + 2);
+   u16 sqsize = get_unaligned_le16(spc + 4);
+   u8 cattr = spc[6];
+   u32 kato = get_unaligned_le32(spc + 8);
+
+   trace_seq_printf(p, "recfmt=%u, qid=%u, sqsize=%u, cattr=%u, kato=%u",
+   recfmt, qid, sqsize, cattr, kato);
+   trace_seq_putc(p, 0);
+
+   return ret;
+}
+
+static const char *nvme_trace_fabrics_property_get(struct trace_seq *p, u8 
*spc)
+{
+   const char *ret = trace_seq_buffer_ptr(p);
+   u8 attrib = spc[0];
+   u32 ofst = get_unaligned_le32(spc + 4);
+
+   trace_seq_printf(p, "attrib=%u, ofst=0x%x", attrib, ofst);
+   trace_seq_putc(p, 0);
+
+   return ret;
+}
+
+static const char *nvme_trace_fabrics_common(struct trace_seq *p, u8 *spc)
+{
+   const char *ret = trace_seq_buffer_ptr(p);
+
+   trace_seq_printf(p, "spcecific=%*ph", 24, spc);
+   trace_seq_putc(p, 0);
+
+   return ret;
+}
+
+const char *nvme_trace_parse_fabrics_cmd(struct trace_seq *p,
+u8 fctype, u8 *spc)
+{
+   switch (fctype) {
+   case nvme_fabrics_type_property_set:
+   return nvme_trace_fabrics_property_set(p, spc);
+   case nvme_fabrics_type_connect:
+   return nvme_trace_fabrics_connect(p, spc);
+   case nvme_fabrics_type_property_get:
+   return nvme_trace_fabrics_property_get(p, spc);
+   default:
+   return nvme_trace_fabrics_common(p, spc);
+   }
+}
+
 const char *nvme_trace_disk_name(struct trace_seq *p, char *name)
 {
const char *ret = trace_seq_buffer_ptr(p);
diff --git a/drivers/nvme/trace.h b/drivers/nvme/trace.h
index acf10c7a5bef..7fbfd6cb815c 100644
--- a/drivers/nvme/trace.h
+++ b/drivers/nvme/trace.h
@@ -57,18 +57,39 @@
nvme_opcode_name(nvme_cmd_resv_acquire),\
nvme_opcode_name(nvme_cmd_resv_release))
 
-#define show_opcode_name(qid, opcode)  \
-   (qid ? show_nvm_opcode_name(opcode) : show_admin_opcode_name(opcode))
+#define nvme_fabrics_type_name(type)   { type, #type }
+#define show_fabrics_type_name(type)   \
+   __print_symbolic(type,  \
+   nvme_fabrics_type_name(nvme_fabrics_type_property_set), \
+   nvme_fabrics_type_name(nvme_fabrics_type_connect),  \
+   nvme_fabrics_type_name(nvme_fabrics_type_property_get))
+
+/*
+ * If not fabrics, fctype will be ignored.
+ */
+#define show_opcode_name(qid, opcode, fctype)  \
+   ((opcode != nvme_fabrics_command) ? \
+   (qid ?  \
+   show_nvm_opcode_name(opcode) :  \
+   show_admin_opcode_name(opcode)) :   \
+   show_fabrics_type_name(fctype))
 
 const char *nvme_trace_parse_admin_cmd(struct trace_seq *p, u8 opcode,
u8 *cdw10);
 const char *nvme_trace_parse_nvm_cmd(struct trace_seq *p, u8 opcode,
u8 *cdw10);
+const char *nvme_trace_parse_fabrics_cmd(struct trace_seq *p,
+   u8 fctype, u8 *spc);
 
-#define parse_nvme_cmd(qid, opco

[PATCH V3 1/5] nvme: Make trace common for host and target both

2019-05-12 Thread Minwoo Im
To support target-side tracing, nvme-trace should be made common for
both host and target.  Of course, not every single tracepoint is needed
by both of them, but we don't need to have more than one trace module
for host and target.

Cc: Keith Busch 
Cc: Jens Axboe 
Cc: Christoph Hellwig 
Cc: Sagi Grimberg 
Cc: James Smart 
Signed-off-by: Minwoo Im 
---
 MAINTAINERS | 2 ++
 drivers/nvme/Makefile   | 3 +++
 drivers/nvme/host/Makefile  | 1 -
 drivers/nvme/host/core.c| 3 +--
 drivers/nvme/host/pci.c | 2 +-
 drivers/nvme/{host => }/trace.c | 5 +
 drivers/nvme/{host => }/trace.h | 2 +-
 7 files changed, 13 insertions(+), 5 deletions(-)
 rename drivers/nvme/{host => }/trace.c (95%)
 rename drivers/nvme/{host => }/trace.h (99%)

diff --git a/MAINTAINERS b/MAINTAINERS
index 09f43f1bdd15..5974cadafcf7 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -11084,6 +11084,7 @@ L:  linux-n...@lists.infradead.org
 T: git://git.infradead.org/nvme.git
 W: http://git.infradead.org/nvme.git
 S: Supported
+F: drivers/nvme/
 F: drivers/nvme/host/
 F: include/linux/nvme.h
 F: include/uapi/linux/nvme_ioctl.h
@@ -11105,6 +11106,7 @@ L:  linux-n...@lists.infradead.org
 T: git://git.infradead.org/nvme.git
 W: http://git.infradead.org/nvme.git
 S: Supported
+F: drivers/nvme/
 F: drivers/nvme/target/
 
 NVMEM FRAMEWORK
diff --git a/drivers/nvme/Makefile b/drivers/nvme/Makefile
index 0096a7fd1431..12f193502d46 100644
--- a/drivers/nvme/Makefile
+++ b/drivers/nvme/Makefile
@@ -1,3 +1,6 @@
 
+ccflags-y  += -I$(src)
+obj-$(CONFIG_TRACING)  += trace.o
+
 obj-y  += host/
 obj-y  += target/
diff --git a/drivers/nvme/host/Makefile b/drivers/nvme/host/Makefile
index 8a4b671c5f0c..46453352bfa0 100644
--- a/drivers/nvme/host/Makefile
+++ b/drivers/nvme/host/Makefile
@@ -10,7 +10,6 @@ obj-$(CONFIG_NVME_FC) += nvme-fc.o
 obj-$(CONFIG_NVME_TCP) += nvme-tcp.o
 
 nvme-core-y:= core.o
-nvme-core-$(CONFIG_TRACING)+= trace.o
 nvme-core-$(CONFIG_NVME_MULTIPATH) += multipath.o
 nvme-core-$(CONFIG_NVM)+= lightnvm.o
 nvme-core-$(CONFIG_FAULT_INJECTION_DEBUG_FS)   += fault_inject.o
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index eebaeadaa800..ae76c0b78a5f 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -21,8 +21,7 @@
 #include 
 #include 
 
-#define CREATE_TRACE_POINTS
-#include "trace.h"
+#include "../trace.h"
 
 #include "nvme.h"
 #include "fabrics.h"
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 2a8708c9ac18..d90b66543d25 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -24,7 +24,7 @@
 #include 
 #include 
 
-#include "trace.h"
+#include "../trace.h"
 #include "nvme.h"
 
 #define SQ_SIZE(depth) (depth * sizeof(struct nvme_command))
diff --git a/drivers/nvme/host/trace.c b/drivers/nvme/trace.c
similarity index 95%
rename from drivers/nvme/host/trace.c
rename to drivers/nvme/trace.c
index 5f24ea7a28eb..a2c8186d122d 100644
--- a/drivers/nvme/host/trace.c
+++ b/drivers/nvme/trace.c
@@ -5,6 +5,8 @@
  */
 
 #include 
+
+#define CREATE_TRACE_POINTS
 #include "trace.h"
 
 static const char *nvme_trace_create_sq(struct trace_seq *p, u8 *cdw10)
@@ -147,4 +149,7 @@ const char *nvme_trace_disk_name(struct trace_seq *p, char 
*name)
 }
 EXPORT_SYMBOL_GPL(nvme_trace_disk_name);
 
+EXPORT_TRACEPOINT_SYMBOL_GPL(nvme_setup_cmd);
+EXPORT_TRACEPOINT_SYMBOL_GPL(nvme_complete_rq);
+EXPORT_TRACEPOINT_SYMBOL_GPL(nvme_async_event);
 EXPORT_TRACEPOINT_SYMBOL_GPL(nvme_sq);
diff --git a/drivers/nvme/host/trace.h b/drivers/nvme/trace.h
similarity index 99%
rename from drivers/nvme/host/trace.h
rename to drivers/nvme/trace.h
index 97d3c77365b8..acf10c7a5bef 100644
--- a/drivers/nvme/host/trace.h
+++ b/drivers/nvme/trace.h
@@ -14,7 +14,7 @@
 #include 
 #include 
 
-#include "nvme.h"
+#include "host/nvme.h"
 
 #define nvme_admin_opcode_name(opcode) { opcode, #opcode }
 #define show_admin_opcode_name(val)\
-- 
2.17.1



[PATCH V3 0/5] nvme-trace: Add support for fabrics command

2019-05-12 Thread Minwoo Im
Hi,

Here's a third patchset to support fabrics command tracing.  The first
patch moves the host/trace module outside of the host directory to provide
common interfaces for both host and target.  The second one adds support for
tracing fabrics commands from the host side.  The third is a trivial clean-up
providing a helper function to figure out whether a given command structure
is for fabrics or not.

The fourth and fifth are the major change points of this patchset.  The 4th
patch adds request tracing from the target side.  The 5th, of course, updates
the completion of the given request.

Please review.
Thanks,

Changes to V2:
  - Provide a common code for both host and target. (Chaitanya)
  - Add support for tracing requests in target-side (Chaitanya)
  - Make it simple in trace.h without branch out from nvme core module
(Christoph)

Changes to V1:
  - fabrics commands should also be decoded, not just showing that it's
a fabrics command. (Christoph)
  - do not make it within nvme admin commands (Chaitanya)

Minwoo Im (5):
  nvme: Make trace common for host and target both
  nvme-trace: Support tracing fabrics commands from host-side
  nvme: Introduce nvme_is_fabrics to check fabrics cmd
  nvme-trace: Add tracing for req_init in target
  nvme-trace: Add tracing for req_comp in target

 MAINTAINERS   |   2 +
 drivers/nvme/Makefile |   3 +
 drivers/nvme/host/Makefile|   1 -
 drivers/nvme/host/core.c  |   7 +-
 drivers/nvme/host/fabrics.c   |   2 +-
 drivers/nvme/host/pci.c   |   2 +-
 drivers/nvme/target/core.c|   8 +-
 drivers/nvme/target/fabrics-cmd.c |   2 +-
 drivers/nvme/target/fc.c  |   2 +-
 drivers/nvme/target/nvmet.h   |   9 ++
 drivers/nvme/{host => }/trace.c   |  75 
 drivers/nvme/{host => }/trace.h   | 144 --
 include/linux/nvme.h  |   7 +-
 13 files changed, 227 insertions(+), 37 deletions(-)
 rename drivers/nvme/{host => }/trace.c (65%)
 rename drivers/nvme/{host => }/trace.h (59%)

-- 
2.17.1



Re: [PATCH v2 6/7] nvme-pci: add device coredump support

2019-05-07 Thread Minwoo Im
> This is a bit of a mine field. The shutdown_lock is held when reclaiming
> requests that didn't see a response. If you're holding it here and your
> telemetry log page times out, we're going to deadlock. And since the
> controller is probably in a buggered state when you try to retrieve one,
> I would guess an unrecoverable timeout is the most likely outcome.

Akinobu,

I actually agree with Keith's point.  In my experience, there was always an
internal error inside the device when a timeout occurred in the nvme driver,
which means the following command might not be completed due to a lack of
response from the device.


Re: [PATCH 3/4] nvme-pci: add device coredump support

2019-05-04 Thread Minwoo Im

On 5/4/19 11:38 PM, Minwoo Im wrote:

On 5/4/19 11:26 PM, Akinobu Mita wrote:

2019年5月4日(土) 19:04 Minwoo Im :



+ { NVME_REG_INTMS,   "intms",    32 },
+ { NVME_REG_INTMC,   "intmc",    32 },
+ { NVME_REG_CC,  "cc",   32 },
+ { NVME_REG_CSTS,    "csts", 32 },
+ { NVME_REG_NSSR,    "nssr", 32 },
+ { NVME_REG_AQA, "aqa",  32 },
+ { NVME_REG_ASQ, "asq",  64 },
+ { NVME_REG_ACQ, "acq",  64 },
+ { NVME_REG_CMBLOC,  "cmbloc",   32 },
+ { NVME_REG_CMBSZ,   "cmbsz",    32 },


If it's going to support optional registers also, then we can have
BP-related things (BPINFO, BPRSEL, BPMBL) here also.


I'm going to change the register dump in binary format just like
'nvme show-regs -o binary' does.  So we'll have registers from 00h to 
4Fh.




Got it.

And now I can see those two commands `nvme show-regs` and
`nvme show-regs -o binary` have different results for the register
range.  The binary output covers just 0x50 size, but it shows all the
registers including BP-related things in normal && json format.

Anyway, I'll prepare a patch for nvme-cli to support binary output
format to cover BP things also.

Thanks, for your reply.


My bad, I misunderstood what you have said above.  Please ignore
what I mentioned. BP things are located from 40h. to 4Fh.

Sorry for making noises here. ;)


Re: [PATCH 3/4] nvme-pci: add device coredump support

2019-05-04 Thread Minwoo Im

On 5/4/19 11:26 PM, Akinobu Mita wrote:

2019年5月4日(土) 19:04 Minwoo Im :


Hi, Akinobu,

Regardless to reply of the cover, few nits here.

On 5/2/19 5:59 PM, Akinobu Mita wrote:

+
+static const struct nvme_reg nvme_regs[] = {
+ { NVME_REG_CAP, "cap",  64 },
+ { NVME_REG_VS,  "version",  32 },


Why don't we just go with "vs" instead of full name of it just like
the others.


I tried to imitate the output of 'nvme show-regs'.


Okay.




+ { NVME_REG_INTMS,   "intms",32 },
+ { NVME_REG_INTMC,   "intmc",32 },
+ { NVME_REG_CC,  "cc",   32 },
+ { NVME_REG_CSTS,"csts", 32 },
+ { NVME_REG_NSSR,"nssr", 32 },
+ { NVME_REG_AQA, "aqa",  32 },
+ { NVME_REG_ASQ, "asq",  64 },
+ { NVME_REG_ACQ, "acq",  64 },
+ { NVME_REG_CMBLOC,  "cmbloc",   32 },
+ { NVME_REG_CMBSZ,   "cmbsz",32 },


If it's going to support optional registers also, then we can have
BP-related things (BPINFO, BPRSEL, BPMBL) here also.


I'm going to change the register dump in binary format just like
'nvme show-regs -o binary' does.  So we'll have registers from 00h to 4Fh.



Got it.

And now I can see those two commands `nvme show-regs` and
`nvme show-regs -o binary` have different results for the register
range.  The binary output covers just 0x50 size, but it shows all the
registers including BP-related things in normal && json format.

Anyway, I'll prepare a patch for nvme-cli to support binary output
format to cover BP things also.

Thanks, for your reply.


Re: [PATCH 3/4] nvme-pci: add device coredump support

2019-05-04 Thread Minwoo Im

Hi, Akinobu,

Regardless to reply of the cover, few nits here.

On 5/2/19 5:59 PM, Akinobu Mita wrote:

+
+static const struct nvme_reg nvme_regs[] = {
+   { NVME_REG_CAP, "cap",64 },
+   { NVME_REG_VS,  "version",32 },


Why don't we just go with "vs" instead of full name of it just like
the others.


+   { NVME_REG_INTMS,   "intms",  32 },
+   { NVME_REG_INTMC,   "intmc",  32 },
+   { NVME_REG_CC,  "cc", 32 },
+   { NVME_REG_CSTS,"csts",   32 },
+   { NVME_REG_NSSR,"nssr",   32 },
+   { NVME_REG_AQA, "aqa",32 },
+   { NVME_REG_ASQ, "asq",64 },
+   { NVME_REG_ACQ, "acq",64 },
+   { NVME_REG_CMBLOC,  "cmbloc", 32 },
+   { NVME_REG_CMBSZ,   "cmbsz",  32 },


If it's going to support optional registers also, then we can have
BP-related things (BPINFO, BPRSEL, BPMBL) here also.

Thanks,


Re: [PATCH 0/4] nvme-pci: support device coredump

2019-05-04 Thread Minwoo Im

Hi Akinobu,

On 5/4/19 1:20 PM, Akinobu Mita wrote:

2019年5月3日(金) 21:20 Christoph Hellwig :


On Fri, May 03, 2019 at 06:12:32AM -0600, Keith Busch wrote:

Could you actually explain how the rest is useful? I personally have
never encountered an issue where knowing these values would have helped:
every device timeout always needed device specific internal firmware
logs in my experience.


I agree that the device specific internal logs like telemetry are the most
useful.  The memory dump of command queues and completion queues is not
that powerful but helps to know what commands have been submitted before
the controller goes wrong (IOW, it's sometimes not enough to know
which commands are actually failed), and it can be parsed without vendor
specific knowledge.


I'm not sure I can say that the memory dump of the queues is useless at all.

As you mentioned, sometimes it's not enough to know which command has
actually been failed because we might want to know what happened before and
after the actual failure.

But, the information of commands handled from device inside would be much
more useful to figure out what happened because in case of multiple queues,
the arbitration among them could not be represented by this memory dump.



If the issue is reproducible, the nvme trace is the most powerful for this
kind of information.  The memory dump of the queues is not that powerful,
but it can always be enabled by default.


If the memory dump is a key to reproduce some issues, then it will be powerful
to hand it to a vendor to solve it.  But I'm afraid the dump might not be able
to give relative submitted times among the commands in queues.




Yes.  Also note that NVMe now has the 'device initiated telemetry'
feature, which is just a weird name for device coredump.  Wiring that
up so that we can easily provide that data to the device vendor would
actually be pretty useful.


This version of nvme coredump captures controller registers and each queue.
So before resetting the controller is a suitable time to capture these.
If we'll capture other log pages in this mechanism, the coredump procedure
will be split into two phases (before resetting the controller and after
resetting, as soon as the admin queue is available).


I agree with that it would be nice if we have a information that might not
be that powerful rather than nothing.

But, could we request the controller-initiated telemetry log page, if supported
by the controller, to get the internal information at the point of failure such
as a reset?  If the dump is generated with the telemetry log page, I think it
would be a great clue to solve the issue.

Thanks,


RE: Re: [PATCH] nvmet: fix ptr_ret.cocci warnings

2019-04-29 Thread Minwoo Im
Sure, I will.

Thanks,

> -Original Message-
> From: Linux-nvme [mailto:linux-nvme-boun...@lists.infradead.org] On
> Behalf Of Chaitanya Kulkarni
> Sent: Tuesday, April 30, 2019 7:14 AM
> To: Minwoo Im
> Cc: linux-kernel@vger.kernel.org; Christoph Hellwig; linux-
> n...@lists.infradead.org; Sagi Grimberg
> Subject: Re: [PATCH] nvmet: fix ptr_ret.cocci warnings
> 
> Hi Minwoo,
> 
> Can you please resend this patch with the suggested change ?
> 
> On 04/29/2019 10:58 AM, kbuild test robot wrote:
> > From: kbuild test robot 
> >
> > drivers/nvme/target/discovery.c:375:1-3: WARNING: PTR_ERR_OR_ZERO
> can be used
> >
> >
> >   Use PTR_ERR_OR_ZERO rather than if(IS_ERR(...)) + PTR_ERR
> >
> > Generated by: scripts/coccinelle/api/ptr_ret.cocci
> >
> > Fixes: 6b7e631b927c ("nvmet: return a specified error it subsys_alloc 
> > fails")
> > CC: Minwoo Im 
> > Signed-off-by: kbuild test robot 
> > ---
> >
> > tree:   https://git.kernel.org/pub/scm/linux/kernel/git/next/linux-next.git
> master
> > head:   3d17a1de96a233cf89bfbb5a77ebb1a05c420681
> > commit: 6b7e631b927ca1266b2695307ab71ed7764af75e [9188/10649]
> nvmet: return a specified error it subsys_alloc fails
> >
> >   discovery.c |4 +---
> >   1 file changed, 1 insertion(+), 3 deletions(-)
> >
> > --- a/drivers/nvme/target/discovery.c
> > +++ b/drivers/nvme/target/discovery.c
> > @@ -372,9 +372,7 @@ int __init nvmet_init_discovery(void)
> >   {
> > nvmet_disc_subsys =
> > nvmet_subsys_alloc(NVME_DISC_SUBSYS_NAME,
> NVME_NQN_DISC);
> > -   if (IS_ERR(nvmet_disc_subsys))
> > -   return PTR_ERR(nvmet_disc_subsys);
> > -   return 0;
> > +   return PTR_ERR_OR_ZERO(nvmet_disc_subsys);
> >   }
> >
> >   void nvmet_exit_discovery(void)
> >
> 
> 
> ___
> Linux-nvme mailing list
> linux-n...@lists.infradead.org
> http://lists.infradead.org/mailman/listinfo/linux-nvme


RE: [PATCH 1/3] drivers: nvme: target: core: fix build break

2019-04-24 Thread Minwoo Im
This looks good to me. 

Reviewed-by: Minwoo Im 

> -Original Message-
> From: Linux-nvme [mailto:linux-nvme-boun...@lists.infradead.org] On Behalf
> Of Enrico Weigelt, metux IT consult
> Sent: Wednesday, April 24, 2019 7:35 PM
> To: linux-kernel@vger.kernel.org
> Cc: ax...@kernel.dk; linux-...@vger.kernel.org; linux-
> n...@lists.infradead.org
> Subject: [PATCH 1/3] drivers: nvme: target: core: fix build break
> 
> Build breaks:
> 
> drivers/nvme/target/core.c: In function 'nvmet_req_alloc_sgl':
> drivers/nvme/target/core.c:939:12: error: implicit declaration of \
> function 'sgl_alloc'; did you mean 'bio_alloc'? \
> [-Werror=implicit-function-declaration]
>   req->sg = sgl_alloc(req->transfer_len, GFP_KERNEL, &req->sg_cnt);
> ^
> bio_alloc
> drivers/nvme/target/core.c:939:10: warning: assignment makes pointer \
> from integer without a cast [-Wint-conversion]
>   req->sg = sgl_alloc(req->transfer_len, GFP_KERNEL, &req->sg_cnt);
>   ^
> drivers/nvme/target/core.c: In function 'nvmet_req_free_sgl':
> drivers/nvme/target/core.c:952:3: error: implicit declaration of \
> function 'sgl_free'; did you mean 'ida_free'? [-Werror=implicit-function-
> declaration]
>sgl_free(req->sg);
>^~~~
>ida_free
> 
> Cause:
> 
> 1. missing include to 
> 2. SGL_ALLOC needs to be enabled
> 
> Therefore adding the missing include, as well as Kconfig dependency.
> 
> Signed-off-by: Enrico Weigelt, metux IT consult 
> ---
>  drivers/nvme/target/Kconfig | 1 +
>  drivers/nvme/target/core.c  | 1 +
>  2 files changed, 2 insertions(+)
> 
> diff --git a/drivers/nvme/target/Kconfig b/drivers/nvme/target/Kconfig
> index d94f25c..3ef0a4e 100644
> --- a/drivers/nvme/target/Kconfig
> +++ b/drivers/nvme/target/Kconfig
> @@ -3,6 +3,7 @@ config NVME_TARGET
>   tristate "NVMe Target support"
>   depends on BLOCK
>   depends on CONFIGFS_FS
> + select SGL_ALLOC
>   help
> This enabled target side support for the NVMe protocol, that is
> it allows the Linux kernel to implement NVMe subsystems and
> diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c
> index b3e765a..08851f5 100644
> --- a/drivers/nvme/target/core.c
> +++ b/drivers/nvme/target/core.c
> @@ -8,6 +8,7 @@
>  #include 
>  #include 
>  #include 
> +#include 
> 
>  #include "nvmet.h"
> 
> --
> 1.9.1
> 
> 
> ___
> Linux-nvme mailing list
> linux-n...@lists.infradead.org
> http://lists.infradead.org/mailman/listinfo/linux-nvme


Re: [PATCH] nvme: determine the number of IO queues

2019-04-18 Thread Minwoo Im

On 4/18/19 10:38 PM, Aaron Ma wrote:



On 4/18/19 9:33 PM, Minwoo Im wrote:

If the controller returns error for that command, how can we assure that
the controller would support a single I/O queue ?


Make sense, I will keep *count = 0 in V2.


IMHO, If you would like to set *count to 0, then what's gonna be V2?
I guess if some device is failed, we can make it as a quirk.



Thanks,
Aaron





Re: [PATCH] nvme: determine the number of IO queues

2019-04-18 Thread Minwoo Im

On 4/18/19 9:52 PM, Aaron Ma wrote:



On 4/18/19 8:13 PM, Minwoo Im wrote:

Yes the IO queues number is 0's based, but driver would return error and
remove the nvme device as dead.


IMHO, if a controller indicates an error with this set_feature command,
then
we need to figure out why the controller was returning the error to host.

If you really want to use at least a single queue to see an alive I/O
queue,
controller should not return the error because as you mentioned above,
NCQA, NSQA will be returned as 0-based.  If an error is there, that could
mean that controller may not able to provide even a single queue for I/O.


I was thinking about try to set 1 I/O queue in driver to try to probe
NVME device.
If it works, at least system can bootup to debug instead of just remove
NVME device and kernel boot hang at loading rootfs.


If the controller returns error for that command, how can we assure that
the controller would support a single I/O queue ?



If you still concern this 1 I/O queue I can still set it as
*count = 0;

At least we try all count, NVME device still failed to respond.

Regards,
Aaron



Thanks,
Minwoo Im




Re: [PATCH] nvme: determine the number of IO queues

2019-04-18 Thread Minwoo Im

On 4/18/19 3:21 PM, Aaron Ma wrote:

On 4/18/19 1:33 AM, Maxim Levitsky wrote:

On Wed, 2019-04-17 at 20:32 +0300, Maxim Levitsky wrote:

On Wed, 2019-04-17 at 22:12 +0800, Aaron Ma wrote:

Some controllers support limited IO queues, when over set
the number, it will return invalid field error.
Then NVME will be removed by driver.

Find the max number of IO queues that controller supports.
When it still got invalid result, set 1 IO queue at least to
bring NVME online.

To be honest a spec compliant device should not need this.
The spec states:

"Number of I/O Completion Queues Requested (NCQR): Indicates the number of I/O
Completion
Queues requested by software. This number does not include the Admin
Completion
Queue. A
minimum of one queue shall be requested, reflecting that the minimum support
is
for one I/O
Completion Queue. This is a 0’s based value. The maximum value that may be
specified is 65,534
(i.e., 65,535 I/O Completion Queues). If the value specified is 65,535, the
controller should return
an error of Invalid Field in Command."


This implies that you can ask for any value and the controller must not
respond
with an error, but rather indicate how many queues it supports.

Maybe its better to add a quirk for the broken device, which needs this?


Adding quirk only makes the code more complicated.
This patch doesn't change the default behavior.
Only handle the NVME error code.

Yes the IO queues number is 0's based, but driver would return error and
remove the nvme device as dead.


IMHO, if a controller indicates an error with this set_feature command, then
we need to figure out why the controller was returning the error to host.

If you really want to use at least a single queue to see an alive I/O queue,
controller should not return the error because as you mentioned above,
NCQA, NSQA will be returned as 0-based.  If an error is there, that could
mean that controller may not able to provide even a single queue for I/O.

Thanks,
Minwoo Im



So set it as 1 at least the NVME can be probed properly.

Regards,
Aaron


I forgot to add the relevant paragraph:

"Note: The value allocated may be smaller or larger than the number of queues
requested, often in virtualized
implementations. The controller may not have as many queues to allocate as are
requested. Alternatively,
the controller may have an allocation unit of queues (e.g. power of two) and may
supply more queues to
host software to satisfy its allocation unit."


Best regards,
Maxim Levitsky



___
Linux-nvme mailing list
linux-n...@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-nvme





Re: [PATCH v3 2/3] PCI: Samsung SM961/PM961 NVMe disable before FLR quirk

2018-07-24 Thread Minwoo Im
Hi Alex,

On Tue, 2018-07-24 at 10:14 -0600, Alex Williamson wrote:
> The Samsung SM961/PM961 (960 EVO) sometimes fails to return from FLR
> with the PCI config space reading back as -1.  A reproducible instance
> of this behavior is resolved by clearing the enable bit in the NVMe
> configuration register and waiting for the ready status to clear
> (disabling the NVMe controller) prior to FLR.
> 
> Signed-off-by: Alex Williamson 
> ---
>  drivers/pci/quirks.c |   83
> ++
>  1 file changed, 83 insertions(+)
> 
> diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c
> index e72c8742aafa..3899cdd2514b 100644
> --- a/drivers/pci/quirks.c
> +++ b/drivers/pci/quirks.c
> @@ -28,6 +28,7 @@
>  #include 
>  #include 
>  #include 
> +#include 
>  #include  /* isa_dma_bridge_buggy */
>  #include "pci.h"
>  
> @@ -3669,6 +3670,87 @@ static int reset_chelsio_generic_dev(struct pci_dev
> *dev, int probe)
>  #define PCI_DEVICE_ID_INTEL_IVB_M_VGA  0x0156
>  #define PCI_DEVICE_ID_INTEL_IVB_M2_VGA 0x0166
>  
> +/*
> + * The Samsung SM961/PM961 controller can sometimes enter a fatal state after
> + * FLR where config space reads from the device return -1.  We seem to be
> + * able to avoid this condition if we disable the NVMe controller prior to
> + * FLR.  This quirk is generic for any NVMe class device requiring similar
> + * assistance to quiesce the device prior to FLR.
> + *
> + * NVMe specification: https://nvmexpress.org/resources/specifications/
> + * Revision 1.0e:

That seems like a very old version of the NVMe specification.  Do you have any
special reason to reference version 1.0 instead of 1.3 or something newer?

> + *Chapter 2: Required and optional PCI config registers
> + *Chapter 3: NVMe control registers
> + *Chapter 7.3: Reset behavior
> + */
> +static int nvme_disable_and_flr(struct pci_dev *dev, int probe)

The name of this function seems able to be started with 'reset_' prefix just
like other quirks for reset.
What about reset_samsung_pm961 or something?

> +{
> + void __iomem *bar;
> + u16 cmd;
> + u32 cfg;
> +
> + if (dev->class != PCI_CLASS_STORAGE_EXPRESS ||
> + !pcie_has_flr(dev) || !pci_resource_start(dev, 0))
> + return -ENOTTY;
> +
> + if (probe)
> + return 0;
> +
> + bar = pci_iomap(dev, 0, NVME_REG_CC + sizeof(cfg));
> + if (!bar)
> + return -ENOTTY;
> +
> + pci_read_config_word(dev, PCI_COMMAND, &cmd);
> + pci_write_config_word(dev, PCI_COMMAND, cmd | PCI_COMMAND_MEMORY);
> +
> + cfg = readl(bar + NVME_REG_CC);
> +
> + /* Disable controller if enabled */
> + if (cfg & NVME_CC_ENABLE) {
> + u64 cap = readq(bar + NVME_REG_CAP);
> + unsigned long timeout;
> +
> + /*
> +  * Per nvme_disable_ctrl() skip shutdown notification as it
> +  * could complete commands to the admin queue.  We only
> intend
> +  * to quiesce the device before reset.
> +  */
> + cfg &= ~(NVME_CC_SHN_MASK | NVME_CC_ENABLE);
> +
> + writel(cfg, bar + NVME_REG_CC);
> +
> + /*
> +  * Some controllers require an additional delay here, see
> +  * NVME_QUIRK_DELAY_BEFORE_CHK_RDY.  None of those are yet
> +  * supported by this quirk.
> +  */
> +
> + /* Cap register provides max timeout in 500ms increments */
> + timeout = ((NVME_CAP_TIMEOUT(cap) + 1) * HZ / 2) + jiffies;
> +
> + for (;;) {
> + u32 status = readl(bar + NVME_REG_CSTS);
> +
> + /* Ready status becomes zero on disable complete */
> + if (!(status & NVME_CSTS_RDY))
> + break;
> +
> + msleep(100);
> +
> + if (time_after(jiffies, timeout)) {
> + pci_warn(dev, "Timeout waiting for NVMe ready
> status to clear after disable\n");
> + break;
> + }
> + }
> + }
> +
> + pci_iounmap(dev, bar);
> +
> + pcie_flr(dev);
> +
> + return 0;
> +}
> +
>  static const struct pci_dev_reset_methods pci_dev_reset_methods[] = {
>   { PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82599_SFP_VF,
>    reset_intel_82599_sfp_virtfn },
> @@ -3676,6 +3758,7 @@ static const struct pci_dev_reset_methods
> pci_dev_reset_methods[] = {
>   reset_ivb_igd },
>   { PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IVB_M2_VGA,
>   reset_ivb_igd },
> + { PCI_VENDOR_ID_SAMSUNG, 0xa804, nvme_disable_and_flr },

Why don't we just define a macro just like other DEVICE_IDs. (e.g.
PCIE_DEVICE_ID_INTEL_82599_SFP_VF).

#define PCI_DEVICE_ID_SAMSUNG_PM961  0xa804

>   { PCI_VENDOR_ID_CHELSIO, PCI_ANY_ID,
>   reset_chelsio_generic_dev },
>   { 0 }
> 
> 
> _