[PATCH V4 1/1] block: reject I/O for same fd if block size changed
This patch fixes I/O errors during BLKRRPART ioctl() behavior right after format operation that changed logical block size of the block device with a same file descriptor opened. This issue can be easily reproduced with a single format command in case of NVMe (logical block size 512B to 4096B). nvme format /dev/nvme0n1 --lbaf=1 --force This is because the application, nvme-cli format subcommand issues an admin command followed by BLKRRPART ioctl to re-read partition information without closing the file descriptor. If file descriptor stays opened, __blkdev_get() will not be invoked at all even logical block size has been changed. It will cause I/O errors with invalid Read operations during the BLKRRPART ioctl due to i_blkbits mismatch. The invalid operations in BLKRRPART happens with under-flowed Number of LBA(NLB) values 0x(65535) because i_blkbits is still set to 9 even the logical block size has been updated to 4096. The BLKRRPART will lead buffer_head to hold 512B data which is less than the logical lock size of the block device. The root cause, which is because i_blkbits of inode of the block device is not updated, can be solved easily by re-opening file descriptor again from application. But, that's just for application's business and kernel should reject invalid Read operations during the BLKRRPART ioctl. This patch rejects I/O from the path of add_partitions() to avoid issuing invalid Read operations to device. It sets a flag to request_queue in blk_queue_logical_block_size to minimize caller-side updates. Signed-off-by: Minwoo Im --- block/blk-settings.c| 3 +++ block/partitions/core.c | 12 fs/block_dev.c | 8 include/linux/blkdev.h | 1 + 4 files changed, 24 insertions(+) diff --git a/block/blk-settings.c b/block/blk-settings.c index 43990b1d148b..48a6fc7bb5f5 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -329,6 +329,9 @@ void blk_queue_logical_block_size(struct request_queue *q, unsigned int size) { struct queue_limits *limits = &q->limits; + if (limits->logical_block_size != size) + blk_queue_flag_set(QUEUE_FLAG_LBSZ_CHANGED, q); + limits->logical_block_size = size; if (limits->physical_block_size < size) diff --git a/block/partitions/core.c b/block/partitions/core.c index e7d776db803b..6f175ea18ff3 100644 --- a/block/partitions/core.c +++ b/block/partitions/core.c @@ -612,12 +612,24 @@ static bool blk_add_partition(struct gendisk *disk, struct block_device *bdev, int blk_add_partitions(struct gendisk *disk, struct block_device *bdev) { + struct request_queue *q = bdev_get_queue(bdev); struct parsed_partitions *state; int ret = -EAGAIN, p, highest; if (!disk_part_scan_enabled(disk)) return 0; + /* +* Reject to check partition information if block size has been changed +* in the runtime. If block size of a block device has been changed, +* the file descriptor should be opened agian to update the blkbits. +*/ + if (test_bit(QUEUE_FLAG_LBSZ_CHANGED, &q->queue_flags)) { + pr_warn("%s: rejecting checking partition. fd should be opened again.\n", + disk->disk_name); + return -EBADFD; + } + state = check_partition(disk, bdev); if (!state) return 0; diff --git a/fs/block_dev.c b/fs/block_dev.c index 9293045e128c..8056a412a3d1 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -130,7 +130,15 @@ EXPORT_SYMBOL(truncate_bdev_range); static void set_init_blocksize(struct block_device *bdev) { + struct request_queue *q = bdev_get_queue(bdev); + bdev->bd_inode->i_blkbits = blksize_bits(bdev_logical_block_size(bdev)); + + /* +* Allow I/O commands for this block device. We can say that this +* block device has proper blkbits updated. +*/ + blk_queue_flag_clear(QUEUE_FLAG_LBSZ_CHANGED, q); } int set_blocksize(struct block_device *bdev, int size) diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 070de09425ad..6d0542434be6 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -625,6 +625,7 @@ struct request_queue { #define QUEUE_FLAG_RQ_ALLOC_TIME 27/* record rq->alloc_time_ns */ #define QUEUE_FLAG_HCTX_ACTIVE 28 /* at least one blk-mq hctx is active */ #define QUEUE_FLAG_NOWAIT 29 /* device supports NOWAIT */ +#define QUEUE_FLAG_LBSZ_CHANGED30 /* logical block size changed */ #define QUEUE_FLAG_MQ_DEFAULT ((1 << QUEUE_FLAG_IO_STAT) |\ (1 << QUEUE_FLAG_SAME_COMP) | \ -- 2.17.1
[PATCH V4 0/1] block: fix I/O errors in BLKRRPART
Hello, This patch fixes I/O errors during BLKRRPART ioctl() behavior right after format operation that changed logical block size of the block device with a same file descriptor opened. Testcase: The following testcase is a case of NVMe namespace with the following conditions: - Current LBA format is lbaf=0 (512 bytes logical block size) - LBA Format(lbaf=1) has 4096 bytes logical block size # Format block device logical block size 512B to 4096B nvme format /dev/nvme0n1 --lbaf=1 --force This will cause I/O errors because BLKRRPART ioctl() happened right after the format command with same file descriptor opened in application (e.g., nvme-cli) like: fd = open("/dev/nvme0n1", O_RDONLY); nvme_format(fd, ...); if (ioctl(fd, BLKRRPART) < 0) ... Errors: We can see the Read command with Number of LBA(NLB) 0x(65535) which was under-flowed because BLKRRPART operation requested request size based on i_blkbits of the block device which is 9 via buffer_head. [dmesg-snip] [ 10.771740] blk_update_request: operation not supported error, dev nvme0n1, sector 0 op 0x0:(READ) flags 0x0 phys_seg 1 prio class 0 [ 10.780262] Buffer I/O error on dev nvme0n1, logical block 0, async page read [event-snip] kworker/0:1H-56 [000] 913.456922: nvme_setup_cmd: nvme0: disk=nvme0n1, qid=1, cmdid=216, nsid=1, flags=0x0, meta=0x0, cmd=(nvme_cmd_read slba=0, len=65535, ctrl=0x0, dsmgmt=0, reftag=0) ksoftirqd/0-9 [000] .Ns. 916.566351: nvme_complete_rq: nvme0: disk=nvme0n1, qid=1, cmdid=216, res=0x0, retries=0, flags=0x0, status=0x4002 The patch below fixes the I/O errors by rejecting I/O requests from the block layer with setting a flag to request_queue until the file descriptor re-opened to be updated by __blkdev_get(). This is based on the previous discussion [1]. Since V3(RFC): - Move flag from gendisk to request_queue for future clean-ups. (Christoph, [3]) Since V2(RFC): - Cover letter with testcase and error logs attached. Removed un-related changes: empty line. (Chaitanya, [2]) - Put blkdev with blkdev_put_no_open(). Since V1(RFC): - Updated patch to reject I/O rather than updating i_blkbits of the block device's inode directly from driver. (Christoph, [1]) [1] https://lore.kernel.org/linux-nvme/20201223183143.GB13354@localhost.localdomain/T/#t [2] https://lore.kernel.org/linux-nvme/20201230140504.GB7917@localhost.localdomain/T/#t [3] https://lore.kernel.org/linux-block/20210105101202.GA9970@localhost.localdomain/T/#u Thanks, Minwoo Im (1): block: reject I/O for same fd if block size changed block/blk-settings.c| 3 +++ block/partitions/core.c | 12 fs/block_dev.c | 8 include/linux/blkdev.h | 1 + 4 files changed, 24 insertions(+) -- 2.17.1
Re: [RFC PATCH V3 1/1] block: reject I/O for same fd if block size changed
Hello, On 21-01-05 08:50:09, Christoph Hellwig wrote: > On Tue, Jan 05, 2021 at 10:04:56AM +0900, Minwoo Im wrote: > > It was a point that I really would like to ask by RFC whether we can > > have backpointer to the gendisk from the request_queue. And I'd like to > > have it to simplify this routine and for future usages also. > > I think it is the right thing to do, at least mid-term, although I > don't want to enforce the burden on you right now. > > > I will restrict this one by checking GENHD_FL_UP flag from the gendisk > > for the next patch. > > > > > > > > Alternatively we could make this request_queue QUEUE* flag for now. > > > > As this patch rejects I/O from the block layer partition code, can we > > have this flag in gendisk rather than request_queue ? > > For now we can as the request_queue is required. I have some plans to > clean up this area, but just using a request_queue flag for now is > probably the simplest, even if it means more work for me later. Please let me prepare the next quick fix for this issue with request_queue flag. Thanks!
Re: [RFC PATCH V3 1/1] block: reject I/O for same fd if block size changed
Hello Christoph, Thanks for your review. On 21-01-04 18:11:41, Christoph Hellwig wrote: > On Mon, Jan 04, 2021 at 06:11:08PM +0100, Christoph Hellwig wrote: > > On Mon, Jan 04, 2021 at 10:06:59PM +0900, Minwoo Im wrote: > > > + if (q->backing_dev_info && q->backing_dev_info->owner && > > > + limits->logical_block_size != size) { > > > + bdev = blkdev_get_no_open(q->backing_dev_info->owner->devt); > > > + bdev->bd_disk->flags |= GENHD_FL_BLOCK_SIZE_CHANGED; > > > + blkdev_put_no_open(bdev); > > > + } > > > > We really need the backpointer from the queue to the gendisk I've wanted > > to add for a while. Can we at least restrict this to a live gendisk? It was a point that I really would like to ask by RFC whether we can have backpointer to the gendisk from the request_queue. And I'd like to have it to simplify this routine and for future usages also. I will restrict this one by checking GENHD_FL_UP flag from the gendisk for the next patch. > > Alternatively we could make this request_queue QUEUE* flag for now. As this patch rejects I/O from the block layer partition code, can we have this flag in gendisk rather than request_queue ? Thanks,
[RFC PATCH V3 1/1] block: reject I/O for same fd if block size changed
This patch fixes I/O errors during BLKRRPART ioctl() behavior right after format operation that changed logical block size of the block device with a same file descriptor opened. This issue can be easily reproduced with a single format command in case of NVMe (logical block size 512B to 4096B). nvme format /dev/nvme0n1 --lbaf=1 --force This is because the application, nvme-cli format subcommand issues an admin command followed by BLKRRPART ioctl to re-read partition information without closing the file descriptor. If file descriptor stays opened, __blkdev_get() will not be invoked at all even logical block size has been changed. It will cause I/O errors with invalid Read operations during the BLKRRPART ioctl due to i_blkbits mismatch. The invalid operations in BLKRRPART happens with under-flowed Number of LBA(NLB) values 0x(65535) because i_blkbits is still set to 9 even the logical block size has been updated to 4096. The BLKRRPART will lead buffer_head to hold 512B data which is less than the logical lock size of the block device. The root cause, which is because i_blkbits of inode of the block device is not updated, can be solved easily by re-opening file descriptor again from application. But, that's just for application's business and kernel should reject invalid Read operations during the BLKRRPART ioctl. This patch rejects I/O from the path of add_partitions() to avoid issuing invalid Read operations to device. It also sets a flag to gendisk in blk_queue_logical_block_size to minimize caller-side updates. Signed-off-by: Minwoo Im --- block/blk-settings.c| 8 block/partitions/core.c | 11 +++ fs/block_dev.c | 6 ++ include/linux/genhd.h | 1 + 4 files changed, 26 insertions(+) diff --git a/block/blk-settings.c b/block/blk-settings.c index 43990b1d148b..84136ea4e2a4 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -328,6 +328,14 @@ EXPORT_SYMBOL(blk_queue_max_segment_size); void blk_queue_logical_block_size(struct request_queue *q, unsigned int size) { struct queue_limits *limits = &q->limits; + struct block_device *bdev; + + if (q->backing_dev_info && q->backing_dev_info->owner && + limits->logical_block_size != size) { + bdev = blkdev_get_no_open(q->backing_dev_info->owner->devt); + bdev->bd_disk->flags |= GENHD_FL_BLOCK_SIZE_CHANGED; + blkdev_put_no_open(bdev); + } limits->logical_block_size = size; diff --git a/block/partitions/core.c b/block/partitions/core.c index e7d776db803b..5a0330c1b6f9 100644 --- a/block/partitions/core.c +++ b/block/partitions/core.c @@ -618,6 +618,17 @@ int blk_add_partitions(struct gendisk *disk, struct block_device *bdev) if (!disk_part_scan_enabled(disk)) return 0; + /* +* Reject to check partition information if block size has been changed +* in the runtime. If block size of a block device has been changed, +* the file descriptor should be opened agian to update the blkbits. +*/ + if (disk->flags & GENHD_FL_BLOCK_SIZE_CHANGED) { + pr_warn("%s: rejecting checking partition. fd should be opened again.\n", + disk->disk_name); + return -EBADFD; + } + state = check_partition(disk, bdev); if (!state) return 0; diff --git a/fs/block_dev.c b/fs/block_dev.c index 9293045e128c..c996de3d6084 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -131,6 +131,12 @@ EXPORT_SYMBOL(truncate_bdev_range); static void set_init_blocksize(struct block_device *bdev) { bdev->bd_inode->i_blkbits = blksize_bits(bdev_logical_block_size(bdev)); + + /* +* Allow I/O commands for this block device. We can say that this +* block device has been set to a proper block size. +*/ + bdev->bd_disk->flags &= ~GENHD_FL_BLOCK_SIZE_CHANGED; } int set_blocksize(struct block_device *bdev, int size) diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 809aaa32d53c..0e0e24917003 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -103,6 +103,7 @@ struct partition_meta_info { #define GENHD_FL_BLOCK_EVENTS_ON_EXCL_WRITE0x0100 #define GENHD_FL_NO_PART_SCAN 0x0200 #define GENHD_FL_HIDDEN0x0400 +#define GENHD_FL_BLOCK_SIZE_CHANGED0x0800 enum { DISK_EVENT_MEDIA_CHANGE = 1 << 0, /* media changed */ -- 2.17.1
[RFC PATCH V3 0/1] block: fix I/O errors in BLKRRPART
Hello, This patch fixes I/O errors during BLKRRPART ioctl() behavior right after format operation that changed logical block size of the block device with a same file descriptor opened. Testcase: The following testcase is a case of NVMe namespace with the following conditions: - Current LBA format is lbaf=0 (512 bytes logical block size) - LBA Format(lbaf=1) has 4096 bytes logical block size # Format block device logical block size 512B to 4096B nvme format /dev/nvme0n1 --lbaf=1 --force This will cause I/O errors because BLKRRPART ioctl() happened right after the format command with same file descriptor opened in application (e.g., nvme-cli) like: fd = open("/dev/nvme0n1", O_RDONLY); nvme_format(fd, ...); if (ioctl(fd, BLKRRPART) < 0) ... Errors: We can see the Read command with Number of LBA(NLB) 0x(65535) which was under-flowed because BLKRRPART operation requested request size based on i_blkbits of the block device which is 9 via buffer_head. [dmesg-snip] [ 10.771740] blk_update_request: operation not supported error, dev nvme0n1, sector 0 op 0x0:(READ) flags 0x0 phys_seg 1 prio class 0 [ 10.780262] Buffer I/O error on dev nvme0n1, logical block 0, async page read [event-snip] kworker/0:1H-56 [000] 913.456922: nvme_setup_cmd: nvme0: disk=nvme0n1, qid=1, cmdid=216, nsid=1, flags=0x0, meta=0x0, cmd=(nvme_cmd_read slba=0, len=65535, ctrl=0x0, dsmgmt=0, reftag=0) ksoftirqd/0-9 [000] .Ns. 916.566351: nvme_complete_rq: nvme0: disk=nvme0n1, qid=1, cmdid=216, res=0x0, retries=0, flags=0x0, status=0x4002 The patch below fixes the I/O errors by rejecting I/O requests from the block layer with setting a flag to gendisk until the file descriptor re-opened to be updated by __blkdev_get(). This is based on the previous discussion [1]. Since V2: - Cover letter with testcase and error logs attached. Removed un-related changes: empty line. (Chaitanya, [2]) - Put blkdev with blkdev_put_no_open(). Since V1: - Updated patch to reject I/O rather than updating i_blkbits of the block device's inode directly from driver. (Christoph, [1]) [1] https://lore.kernel.org/linux-nvme/20201223183143.GB13354@localhost.localdomain/T/#t [2] https://lore.kernel.org/linux-nvme/20201230140504.GB7917@localhost.localdomain/T/#t Thanks, Minwoo Im (1): block: reject I/O for same fd if block size changed block/blk-settings.c| 8 block/partitions/core.c | 11 +++ fs/block_dev.c | 6 ++ include/linux/genhd.h | 1 + 4 files changed, 26 insertions(+) -- 2.17.1
Re: [RFC V2] block: reject I/O for same fd if block size changed
On 21-01-04 00:57:07, Chaitanya Kulkarni wrote: > On 12/30/20 08:03, Minwoo Im wrote: > > Let's say, for example of NVMe device, Format command to change out > > LBA format to another logical block size and BLKRRPART to re-read > > partition information with a same file descriptor like: > > > > fd = open("/dev/nvme0n1", O_RDONLY); > > > > nvme_format(fd, ...); > > if (ioctl(fd, BLKRRPART) < 0) > > .. > > > > In this case, ioctl causes invalid Read operations which are triggered > > by buffer_head I/O path to re-read partition information. This is > > because it's still playing around with i_blksize and i_blkbits. So, > > 512 -> 4096 -> 512 logical block size changes will cause an under-flowed > > length of Read operations. > > > > Case for NVMe: > > (LBAF 1 512B, LBAF 0 4K logical block size) > > > > nvme format /dev/nvme0n1 --lbaf=1 --force # to 512B LBA > > nvme format /dev/nvme0n1 --lbaf=0 --force # to 4096B LBA > > > > [dmesg-snip] > > [ 10.771740] blk_update_request: operation not supported error, dev > > nvme0n1, sector 0 op 0x0:(READ) flags 0x0 phys_seg 1 prio class 0 > > [ 10.780262] Buffer I/O error on dev nvme0n1, logical block 0, async > > page read > > > > [event-snip] > > kworker/0:1H-56 [000] 913.456922: nvme_setup_cmd: nvme0: > > disk=nvme0n1, qid=1, cmdid=216, nsid=1, flags=0x0, meta=0x0, > > cmd=(nvme_cmd_read slba=0, len=65535, ctrl=0x0, dsmgmt=0, reftag=0) > >ksoftirqd/0-9 [000] .Ns. 916.566351: nvme_complete_rq: nvme0: > > disk=nvme0n1, qid=1, cmdid=216, res=0x0, retries=0, flags=0x0, status=0x4002 > > > > As the previous discussion [1], this patch introduced a gendisk flag > > to indicate that block size has been changed in the runtime. This flag > > is set when logical block size is changed in the runtime in the block > > layer. It will be cleared when the file descriptor for the > > block devie is opened again through __blkdev_get() which updates the block > > size via set_init_blocksize(). > > > > This patch rejects I/O from the path of add_partitions() to avoid > > issuing invalid Read operations to device. It also sets a flag to > > gendisk in blk_queue_logical_block_size to minimize caller-side updates. > > > > [1] > > https://lore.kernel.org/linux-nvme/20201223183143.GB13354@localhost.localdomain/T/#t > > > > Signed-off-by: Minwoo Im > Rewrite the change-log similar to what we have in the repo and fix > the spelling mistakes. Add a cover-letter to explain the testcase > and the execution effect, also I'd move discussion link into > cover-letter. Thanks for your time. Will prepare V3 with proper change logs and cover letter to explain issue and testcase. Thanks,
[RFC V2] block: reject I/O for same fd if block size changed
Let's say, for example of NVMe device, Format command to change out LBA format to another logical block size and BLKRRPART to re-read partition information with a same file descriptor like: fd = open("/dev/nvme0n1", O_RDONLY); nvme_format(fd, ...); if (ioctl(fd, BLKRRPART) < 0) .. In this case, ioctl causes invalid Read operations which are triggered by buffer_head I/O path to re-read partition information. This is because it's still playing around with i_blksize and i_blkbits. So, 512 -> 4096 -> 512 logical block size changes will cause an under-flowed length of Read operations. Case for NVMe: (LBAF 1 512B, LBAF 0 4K logical block size) nvme format /dev/nvme0n1 --lbaf=1 --force # to 512B LBA nvme format /dev/nvme0n1 --lbaf=0 --force # to 4096B LBA [dmesg-snip] [ 10.771740] blk_update_request: operation not supported error, dev nvme0n1, sector 0 op 0x0:(READ) flags 0x0 phys_seg 1 prio class 0 [ 10.780262] Buffer I/O error on dev nvme0n1, logical block 0, async page read [event-snip] kworker/0:1H-56 [000] 913.456922: nvme_setup_cmd: nvme0: disk=nvme0n1, qid=1, cmdid=216, nsid=1, flags=0x0, meta=0x0, cmd=(nvme_cmd_read slba=0, len=65535, ctrl=0x0, dsmgmt=0, reftag=0) ksoftirqd/0-9 [000] .Ns. 916.566351: nvme_complete_rq: nvme0: disk=nvme0n1, qid=1, cmdid=216, res=0x0, retries=0, flags=0x0, status=0x4002 As the previous discussion [1], this patch introduced a gendisk flag to indicate that block size has been changed in the runtime. This flag is set when logical block size is changed in the runtime in the block layer. It will be cleared when the file descriptor for the block devie is opened again through __blkdev_get() which updates the block size via set_init_blocksize(). This patch rejects I/O from the path of add_partitions() to avoid issuing invalid Read operations to device. It also sets a flag to gendisk in blk_queue_logical_block_size to minimize caller-side updates. [1] https://lore.kernel.org/linux-nvme/20201223183143.GB13354@localhost.localdomain/T/#t Signed-off-by: Minwoo Im --- block/blk-settings.c| 7 +++ block/genhd.c | 1 + block/partitions/core.c | 11 +++ fs/block_dev.c | 6 ++ include/linux/genhd.h | 1 + 5 files changed, 26 insertions(+) diff --git a/block/blk-settings.c b/block/blk-settings.c index 43990b1d148b..205822ffcaef 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -328,6 +328,13 @@ EXPORT_SYMBOL(blk_queue_max_segment_size); void blk_queue_logical_block_size(struct request_queue *q, unsigned int size) { struct queue_limits *limits = &q->limits; + struct block_device *bdev; + + if (q->backing_dev_info && q->backing_dev_info->owner && + limits->logical_block_size != size) { + bdev = blkdev_get_no_open(q->backing_dev_info->owner->devt); + bdev->bd_disk->flags |= GENHD_FL_BLOCK_SIZE_CHANGED; + } limits->logical_block_size = size; diff --git a/block/genhd.c b/block/genhd.c index 73faec438e49..c3a73cba7c88 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -81,6 +81,7 @@ bool set_capacity_and_notify(struct gendisk *disk, sector_t size) */ if (!capacity || !size) return false; + kobject_uevent_env(&disk_to_dev(disk)->kobj, KOBJ_CHANGE, envp); return true; } diff --git a/block/partitions/core.c b/block/partitions/core.c index e7d776db803b..5a0330c1b6f9 100644 --- a/block/partitions/core.c +++ b/block/partitions/core.c @@ -618,6 +618,17 @@ int blk_add_partitions(struct gendisk *disk, struct block_device *bdev) if (!disk_part_scan_enabled(disk)) return 0; + /* +* Reject to check partition information if block size has been changed +* in the runtime. If block size of a block device has been changed, +* the file descriptor should be opened agian to update the blkbits. +*/ + if (disk->flags & GENHD_FL_BLOCK_SIZE_CHANGED) { + pr_warn("%s: rejecting checking partition. fd should be opened again.\n", + disk->disk_name); + return -EBADFD; + } + state = check_partition(disk, bdev); if (!state) return 0; diff --git a/fs/block_dev.c b/fs/block_dev.c index 9293045e128c..c996de3d6084 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -131,6 +131,12 @@ EXPORT_SYMBOL(truncate_bdev_range); static void set_init_blocksize(struct block_device *bdev) { bdev->bd_inode->i_blkbits = blksize_bits(bdev_logical_block_size(bdev)); + + /* +* Allow I/O commands for this block device. We can say that this +* block device has been set to a proper block size. +*/ + bdev->bd_disk->flags
Re: [RFC] block: reject I/O in BLKRRPART if block size changed
On 20-12-27 03:02:32, Minwoo Im wrote: > Background: > Let's say we have 2 LBA format for 4096B and 512B LBA size for a > NVMe namespace. Assume that current LBA format is 4096B and in case > we convert namespace to 512B and 4096B back again: > > nvme format /dev/nvme0n1 --lbaf=1 --force # to 512B LBA > nvme format /dev/nvme0n1 --lbaf=0 --force # to 4096B LBA > > Then we can see the following errors during the BLKRRPART ioctl from > the nvme-cli format subcommand: > > [ 10.771740] blk_update_request: operation not supported error, dev > nvme0n1, sector 0 op 0x0:(READ) flags 0x0 phys_seg 1 prio class 0 > [ 10.780262] Buffer I/O error on dev nvme0n1, logical block 0, async page > read > ... > > Also, we can see the Read commands followed by the Format command due > to BLKRRPART ioctl with Number of LBAs to 65535(0x) which is > under-flowed because the request for the Read commands are coming with > 512B and this is because it's playing around with i_blkbits from the > block_device inode which needs to be avoided as [1]. > > kworker/0:1H-56 [000] 913.456922: nvme_setup_cmd: nvme0: > disk=nvme0n1, qid=1, cmdid=216, nsid=1, flags=0x0, meta=0x0, > cmd=(nvme_cmd_read slba=0, len=65535, ctrl=0x0, dsmgmt=0, reftag=0) > ksoftirqd/0-9 [000] .Ns. 916.566351: nvme_complete_rq: nvme0: > disk=nvme0n1, qid=1, cmdid=216, res=0x0, retries=0, flags=0x0, status=0x4002 > ... > > Before we have commit 5ff9f19231a0 ("block: simplify > set_init_blocksize"), block size used to be bumped up to the > 4K(PAGE_SIZE) in this example and we have not seen these errors. But > with this patch, we have to make sure that bdev->bd_inode->i_blkbits to > make sure that BLKRRPART ioctl pass proper request length based on the > changed logical block size. > > Description: > As the previous discussion [1], this patch introduced a gendisk flag > to indicate that block size has been changed in the runtime. This flag > is set when logical block size is changed in the runtime with sector > capacity itself. It will be cleared when the file descriptor for the > block devie is opened again which means __blkdev_get() updates the block > size via set_init_blocksize(). > This patch rejects I/O from the path of add_partitions() and > application should open the file descriptor again to update the block > size of the block device inode. > > [1] > https://lore.kernel.org/linux-nvme/20201223183143.GB13354@localhost.localdomain/T/#t > > Signed-off-by: Minwoo Im Hello, Sorry for the noises here. Please ignore this patch. Will try to prepare a new one for this issue. Thanks,
[RFC] block: reject I/O in BLKRRPART if block size changed
Background: Let's say we have 2 LBA format for 4096B and 512B LBA size for a NVMe namespace. Assume that current LBA format is 4096B and in case we convert namespace to 512B and 4096B back again: nvme format /dev/nvme0n1 --lbaf=1 --force # to 512B LBA nvme format /dev/nvme0n1 --lbaf=0 --force # to 4096B LBA Then we can see the following errors during the BLKRRPART ioctl from the nvme-cli format subcommand: [ 10.771740] blk_update_request: operation not supported error, dev nvme0n1, sector 0 op 0x0:(READ) flags 0x0 phys_seg 1 prio class 0 [ 10.780262] Buffer I/O error on dev nvme0n1, logical block 0, async page read ... Also, we can see the Read commands followed by the Format command due to BLKRRPART ioctl with Number of LBAs to 65535(0x) which is under-flowed because the request for the Read commands are coming with 512B and this is because it's playing around with i_blkbits from the block_device inode which needs to be avoided as [1]. kworker/0:1H-56 [000] 913.456922: nvme_setup_cmd: nvme0: disk=nvme0n1, qid=1, cmdid=216, nsid=1, flags=0x0, meta=0x0, cmd=(nvme_cmd_read slba=0, len=65535, ctrl=0x0, dsmgmt=0, reftag=0) ksoftirqd/0-9 [000] .Ns. 916.566351: nvme_complete_rq: nvme0: disk=nvme0n1, qid=1, cmdid=216, res=0x0, retries=0, flags=0x0, status=0x4002 ... Before we have commit 5ff9f19231a0 ("block: simplify set_init_blocksize"), block size used to be bumped up to the 4K(PAGE_SIZE) in this example and we have not seen these errors. But with this patch, we have to make sure that bdev->bd_inode->i_blkbits to make sure that BLKRRPART ioctl pass proper request length based on the changed logical block size. Description: As the previous discussion [1], this patch introduced a gendisk flag to indicate that block size has been changed in the runtime. This flag is set when logical block size is changed in the runtime with sector capacity itself. It will be cleared when the file descriptor for the block devie is opened again which means __blkdev_get() updates the block size via set_init_blocksize(). This patch rejects I/O from the path of add_partitions() and application should open the file descriptor again to update the block size of the block device inode. [1] https://lore.kernel.org/linux-nvme/20201223183143.GB13354@localhost.localdomain/T/#t Signed-off-by: Minwoo Im --- block/genhd.c | 3 +++ block/partitions/core.c | 11 +++ fs/block_dev.c | 6 ++ include/linux/genhd.h | 1 + 4 files changed, 21 insertions(+) diff --git a/block/genhd.c b/block/genhd.c index b84b8671e627..1f64907fac3d 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -79,6 +79,9 @@ bool set_capacity_and_notify(struct gendisk *disk, sector_t size) */ if (!capacity || !size) return false; + + disk->flags |= GENHD_FL_BLOCK_SIZE_CHANGED; + kobject_uevent_env(&disk_to_dev(disk)->kobj, KOBJ_CHANGE, envp); return true; } diff --git a/block/partitions/core.c b/block/partitions/core.c index deca253583bd..7dfcda96be9e 100644 --- a/block/partitions/core.c +++ b/block/partitions/core.c @@ -617,6 +617,17 @@ int blk_add_partitions(struct gendisk *disk, struct block_device *bdev) if (!disk_part_scan_enabled(disk)) return 0; + /* +* Reject to check partition information if block size has been changed +* in the runtime. If block size of a block device has been changed, +* the file descriptor should be opened agian to update the blkbits. +*/ + if (disk->flags & GENHD_FL_BLOCK_SIZE_CHANGED) { + pr_warn("%s: rejecting checking partition. fd should be opened again.\n", + disk->disk_name); + return -EBADFD; + } + state = check_partition(disk, bdev); if (!state) return 0; diff --git a/fs/block_dev.c b/fs/block_dev.c index 9e56ee1f2652..813361ad77c1 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -132,6 +132,12 @@ EXPORT_SYMBOL(truncate_bdev_range); static void set_init_blocksize(struct block_device *bdev) { bdev->bd_inode->i_blkbits = blksize_bits(bdev_logical_block_size(bdev)); + + /* +* Allow I/O commands for this block device. We can say that this +* block device has been set to a proper block size. +*/ + bdev->bd_disk->flags &= ~GENHD_FL_BLOCK_SIZE_CHANGED; } int set_blocksize(struct block_device *bdev, int size) diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 809aaa32d53c..0e0e24917003 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -103,6 +103,7 @@ struct partition_meta_info { #define GENHD_FL_BLOCK_EVENTS_ON_EXCL_WRITE0x0100 #define GENHD_FL_NO_PART_SCAN 0x0200 #define GENHD_FL_HIDDEN
Re: [PATCH] vfs: remove comment for deleted argument 'cred'
On 20-12-02 21:52:32, Minwoo Im wrote: > Removed credential argument comment with no functional changes. > > Signed-off-by: Minwoo Im > --- > fs/open.c | 1 - > 1 file changed, 1 deletion(-) > > diff --git a/fs/open.c b/fs/open.c > index 9af548fb841b..85a532af0946 100644 > --- a/fs/open.c > +++ b/fs/open.c > @@ -923,7 +923,6 @@ EXPORT_SYMBOL(file_path); > * vfs_open - open the file at the given path > * @path: path to open > * @file: newly allocated file with f_flag initialized > - * @cred: credentials to use > */ > int vfs_open(const struct path *path, struct file *file) > { > -- > 2.17.1 > Hello, Gentle ping, Thanks!
Re: [PATCH 0/3] blk-mq: trivial helper and fixes comments
On 20-12-05 00:20:52, Minwoo Im wrote: > Hello, > > This patch set contains: > - Introduce a helper to allocate tagset tags for the first time > without 'realloc' keyword that used to be taken. > - Fixes for comments need to be updated. > > Please have a look. > > Thanks, > > > Minwoo Im (3): > blk-mq: add helper allocating tagset->tags > blk-mq: update arg in comment of blk_mq_map_queue > blk-mq: fix msec comment from micro to milli seconds > > block/blk-mq.c | 14 ++ > block/blk-mq.h | 2 +- > 2 files changed, 11 insertions(+), 5 deletions(-) > > -- > 2.17.1 > Hello, Gentle ping. Thanks!
[PATCH 2/3] blk-mq: update arg in comment of blk_mq_map_queue
Update mis-named argument description of blk_mq_map_queue(). This patch also updates description that argument to software queue percpu context. Signed-off-by: Minwoo Im --- block/blk-mq.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/blk-mq.h b/block/blk-mq.h index c696515766c7..c1458d9502f1 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -99,7 +99,7 @@ static inline struct blk_mq_hw_ctx *blk_mq_map_queue_type(struct request_queue * * blk_mq_map_queue() - map (cmd_flags,type) to hardware queue * @q: request queue * @flags: request command flags - * @cpu: cpu ctx + * @ctx: software queue cpu ctx */ static inline struct blk_mq_hw_ctx *blk_mq_map_queue(struct request_queue *q, unsigned int flags, -- 2.17.1
[PATCH 3/3] blk-mq: fix msec comment from micro to milli seconds
Delay to wait for queue running is milli second unit which is passed to delayed work via msecs_to_jiffies() which is to convert milliseconds to jiffies. Signed-off-by: Minwoo Im --- block/blk-mq.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index 69771ba18f1f..284d103bd0e7 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1594,7 +1594,7 @@ static int blk_mq_hctx_next_cpu(struct blk_mq_hw_ctx *hctx) * __blk_mq_delay_run_hw_queue - Run (or schedule to run) a hardware queue. * @hctx: Pointer to the hardware queue to run. * @async: If we want to run the queue asynchronously. - * @msecs: Microseconds of delay to wait before running the queue. + * @msecs: Milliseconds of delay to wait before running the queue. * * If !@async, try to run the queue now. Else, run the queue asynchronously and * with a delay of @msecs. @@ -1623,7 +1623,7 @@ static void __blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async, /** * blk_mq_delay_run_hw_queue - Run a hardware queue asynchronously. * @hctx: Pointer to the hardware queue to run. - * @msecs: Microseconds of delay to wait before running the queue. + * @msecs: Milliseconds of delay to wait before running the queue. * * Run a hardware queue asynchronously with a delay of @msecs. */ @@ -1687,7 +1687,7 @@ EXPORT_SYMBOL(blk_mq_run_hw_queues); /** * blk_mq_delay_run_hw_queues - Run all hardware queues asynchronously. * @q: Pointer to the request queue to run. - * @msecs: Microseconds of delay to wait before running the queues. + * @msecs: Milliseconds of delay to wait before running the queues. */ void blk_mq_delay_run_hw_queues(struct request_queue *q, unsigned long msecs) { -- 2.17.1
[PATCH 1/3] blk-mq: add helper allocating tagset->tags
tagset->set is allocated from blk_mq_alloc_tag_set() rather than being reallocated. This patch added a helper to make its meaning explicitly which is to allocate rather than to reallocate. Signed-off-by: Minwoo Im --- block/blk-mq.c | 8 +++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index 37c682855a63..69771ba18f1f 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -3377,6 +3377,12 @@ static int blk_mq_realloc_tag_set_tags(struct blk_mq_tag_set *set, return 0; } +static int blk_mq_alloc_tag_set_tags(struct blk_mq_tag_set *set, + int new_nr_hw_queues) +{ + return blk_mq_realloc_tag_set_tags(set, 0, new_nr_hw_queues); +} + /* * Alloc a tag set to be associated with one or more request queues. * May fail with EINVAL for various error conditions. May adjust the @@ -3430,7 +3436,7 @@ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set) if (set->nr_maps == 1 && set->nr_hw_queues > nr_cpu_ids) set->nr_hw_queues = nr_cpu_ids; - if (blk_mq_realloc_tag_set_tags(set, 0, set->nr_hw_queues) < 0) + if (blk_mq_alloc_tag_set_tags(set, set->nr_hw_queues) < 0) return -ENOMEM; ret = -ENOMEM; -- 2.17.1
[PATCH 0/3] blk-mq: trivial helper and fixes comments
Hello, This patch set contains: - Introduce a helper to allocate tagset tags for the first time without 'realloc' keyword that used to be taken. - Fixes for comments need to be updated. Please have a look. Thanks, Minwoo Im (3): blk-mq: add helper allocating tagset->tags blk-mq: update arg in comment of blk_mq_map_queue blk-mq: fix msec comment from micro to milli seconds block/blk-mq.c | 14 ++ block/blk-mq.h | 2 +- 2 files changed, 11 insertions(+), 5 deletions(-) -- 2.17.1
[PATCH] vfs: remove comment for deleted argument 'cred'
Removed credential argument comment with no functional changes. Signed-off-by: Minwoo Im --- fs/open.c | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/open.c b/fs/open.c index 9af548fb841b..85a532af0946 100644 --- a/fs/open.c +++ b/fs/open.c @@ -923,7 +923,6 @@ EXPORT_SYMBOL(file_path); * vfs_open - open the file at the given path * @path: path to open * @file: newly allocated file with f_flag initialized - * @cred: credentials to use */ int vfs_open(const struct path *path, struct file *file) { -- 2.17.1
Re: [PATCHv2] nvme: Assign subsy instance from first ctrl
> Subject: [PATCHv2] nvme: Assign subsy instance from first ctrl I'm not sure but, I have not seen 'subsy' thing before. Maybe s/sybsy/subsys/ ? > > The namespace disk names must be unique for the lifetime of the > subsystem. This was accomplished by using their parent subsystems' > instances which were allocated independently from the controllers > connected to that subsystem. This allowed name prefixes assigned to > namespaces to match a controller from an unrelated subsystem, and has > created confusion among users examining device nodes. > > Ensure a namespace's subsystem instance never clashes with a controller > instance of another subsystem by transferring the instance ownership > to the parent subsystem from the first controller discovered in that > subsystem. > > Reviewed-by: Logan Gunthorpe > Signed-off-by: Keith Busch Keith, Thanks for this patch. I really like this concept which can avoid from instance mistakes. Otherwise looks good to me. Reviewed-by: Minwoo Im
[PATCH 2/2] lightnvm: print error when target is not found
If userspace requests target to be removed, nvm_remove_tgt() will iterate the nvm_devices to find out the given target, but if not found, then it should print out the error. Cc: Matias Bjørling Cc: Javier González Signed-off-by: Minwoo Im --- drivers/lightnvm/core.c | 5 - 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/lightnvm/core.c b/drivers/lightnvm/core.c index 4c7b48f72e80..4c89a2420a51 100644 --- a/drivers/lightnvm/core.c +++ b/drivers/lightnvm/core.c @@ -494,8 +494,11 @@ static int nvm_remove_tgt(struct nvm_ioctl_remove *remove) } up_read(&nvm_lock); - if (!t) + if (!t) { + pr_err("failed to remove target %s not found\n", + remove->tgtname); return 1; + } __nvm_remove_target(t, true); kref_put(&dev->ref, nvm_free); -- 2.17.1
Re: [PATCH] nvme-pci: Use dev_get_drvdata where possible
On 19-07-24 20:22:35, Chuhong Yuan wrote: > Instead of using to_pci_dev + pci_get_drvdata, > use dev_get_drvdata to make code simpler. > > Signed-off-by: Chuhong Yuan This looks good to me. Reviewed-by: Minwoo Im
Re: [PATCH] firmware: qcom_scm: fix error for incompatible pointer
> > > > We just can cast phys_addr_t to dma_addr_t here. > > > > > > IME, casting is rarely a proper solution. > > > > *nod* > > > > ptr_phys probably should be a dma_addr_t. Unless this driver is so > > magic that it really wants a physical and not a dma address, in which > > case it needs to use alloc_pages instead of dma_alloc_coherent > > and then call page_to_phys on the returned page, and a very big comment > > explaining why it is so special. > > The scm call takes physical addresses (which happens to be 1:1 with DMA > addresses for this driver). > > This allocation started off (downstream) as a simple kmalloc(), but > while the scm call is being executed an access from Linux will cause a > security violation (that's not handled gracefully). The properties of > dma_alloc is closer, so that's where the code is today. > > Optimally this should be something like alloc_pages() and some mechanism > for unmapping the pages during the call. But no one has come up with a > suitable patch for that. > > > But there's a patch from Stephen for this already (not doing a > typecast). Apparently I missed merging this, so I'll do that. > > https://lore.kernel.org/linux-arm-msm/20190517210923.202131-2-swb...@chromium.org/ Bjron, I appreciate for checking this. And also thanks all you guys for the comments here! Thanks,
[PATCH] firmware: qcom_scm: fix error for incompatible pointer
The following error can happen when trying to build it: ``` drivers/firmware/qcom_scm.c: In function ‘qcom_scm_assign_mem’: drivers/firmware/qcom_scm.c:460:47: error: passing argument 3 of ‘dma_alloc_coherent’ from incompatible pointer type [-Werror=incompatible-pointer-types] ptr = dma_alloc_coherent(__scm->dev, ptr_sz, &ptr_phys, GFP_KERNEL); ^ In file included from drivers/firmware/qcom_scm.c:12:0: ./include/linux/dma-mapping.h:636:21: note: expected ‘dma_addr_t * {aka long long unsigned int *}’ but argument is of type ‘phys_addr_t * {aka unsigned int *}’ static inline void *dma_alloc_coherent(struct device *dev, size_t size, ^~ ``` We just can cast phys_addr_t to dma_addr_t here. Cc: Andy Gross Cc: David Brown Signed-off-by: Minwoo Im --- drivers/firmware/qcom_scm.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/firmware/qcom_scm.c b/drivers/firmware/qcom_scm.c index 2ddc118dba1b..7f6c841fa200 100644 --- a/drivers/firmware/qcom_scm.c +++ b/drivers/firmware/qcom_scm.c @@ -457,7 +457,8 @@ int qcom_scm_assign_mem(phys_addr_t mem_addr, size_t mem_sz, ptr_sz = ALIGN(src_sz, SZ_64) + ALIGN(mem_to_map_sz, SZ_64) + ALIGN(dest_sz, SZ_64); - ptr = dma_alloc_coherent(__scm->dev, ptr_sz, &ptr_phys, GFP_KERNEL); + ptr = dma_alloc_coherent(__scm->dev, ptr_sz, (dma_addr_t *) &ptr_phys, + GFP_KERNEL); if (!ptr) return -ENOMEM; -- 2.17.1
Re: [PATCH v2 3/3] nvme-pci: Add support for Apple 2018+ models
On 19-07-17 10:45:27, Benjamin Herrenschmidt wrote: > Based on reverse engineering and original patch by > > Paul Pawlowski > > This adds support for Apple weird implementation of NVME in their > 2018 or later machines. It accounts for the twice-as-big SQ entries > for the IO queues, and the fact that only interrupt vector 0 appears > to function properly. > > Signed-off-by: Benjamin Herrenschmidt > > # Conflicts: > # drivers/nvme/host/core.c > --- > drivers/nvme/host/nvme.h | 10 ++ > drivers/nvme/host/pci.c | 21 - > 2 files changed, 30 insertions(+), 1 deletion(-) > > diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h > index 716a876119c8..ced0e0a7e039 100644 > --- a/drivers/nvme/host/nvme.h > +++ b/drivers/nvme/host/nvme.h > @@ -92,6 +92,16 @@ enum nvme_quirks { >* Broken Write Zeroes. >*/ > NVME_QUIRK_DISABLE_WRITE_ZEROES = (1 << 9), > + > + /* > + * Use only one interrupt vector for all queues > + */ > + NVME_QUIRK_SINGLE_VECTOR= (1 << 10), > + > + /* > + * Use non-standard 128 bytes SQEs. > + */ > + NVME_QUIRK_128_BYTES_SQES = (1 << 11), > }; > > /* > diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c > index 1637677afb78..7088971d4c42 100644 > --- a/drivers/nvme/host/pci.c > +++ b/drivers/nvme/host/pci.c > @@ -2081,6 +2081,13 @@ static int nvme_setup_irqs(struct nvme_dev *dev, > unsigned int nr_io_queues) > dev->io_queues[HCTX_TYPE_DEFAULT] = 1; > dev->io_queues[HCTX_TYPE_READ] = 0; > > + /* > + * Some Apple controllers require all queues to use the > + * first vector. > + */ > + if (dev->ctrl.quirks & NVME_QUIRK_SINGLE_VECTOR) > + irq_queues = 1; > + > return pci_alloc_irq_vectors_affinity(pdev, 1, irq_queues, > PCI_IRQ_ALL_TYPES | PCI_IRQ_AFFINITY, &affd); > } > @@ -2322,7 +2329,16 @@ static int nvme_pci_enable(struct nvme_dev *dev) > io_queue_depth); > dev->db_stride = 1 << NVME_CAP_STRIDE(dev->ctrl.cap); > dev->dbs = dev->bar + 4096; > - dev->io_sqes = NVME_NVM_IOSQES; > + > + /* > + * Some Apple controllers require a non-standard SQE size. > + * Interestingly they also seem to ignore the CC:IOSQES register > + * so we don't bother updating it here. > + */ That is really interesting. This also looks good to me. Reviewed-by: Minwoo Im Thanks,
Re: [PATCH v2 2/3] nvme-pci: Add support for variable IO SQ element size
On 19-07-17 10:45:26, Benjamin Herrenschmidt wrote: > The size of a submission queue element should always be 6 (64 bytes) > by spec. > > However some controllers such as Apple's are not properly implementing > the standard and require a different size. > > This provides the ground work for the subsequent quirks for these > controllers. > > Signed-off-by: Benjamin Herrenschmidt > --- > drivers/nvme/host/pci.c | 11 --- > include/linux/nvme.h| 1 + > 2 files changed, 9 insertions(+), 3 deletions(-) > > diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c > index 8f006638452b..1637677afb78 100644 > --- a/drivers/nvme/host/pci.c > +++ b/drivers/nvme/host/pci.c > @@ -28,7 +28,7 @@ > #include "trace.h" > #include "nvme.h" > > -#define SQ_SIZE(q) ((q)->q_depth * sizeof(struct nvme_command)) > +#define SQ_SIZE(q) ((q)->q_depth << (q)->sqes) > #define CQ_SIZE(q) ((q)->q_depth * sizeof(struct nvme_completion)) > > #define SGES_PER_PAGE(PAGE_SIZE / sizeof(struct nvme_sgl_desc)) > @@ -100,6 +100,7 @@ struct nvme_dev { > unsigned io_queues[HCTX_MAX_TYPES]; > unsigned int num_vecs; > int q_depth; > + int io_sqes; > u32 db_stride; > void __iomem *bar; > unsigned long bar_mapped_size; > @@ -162,7 +163,7 @@ static inline struct nvme_dev *to_nvme_dev(struct > nvme_ctrl *ctrl) > struct nvme_queue { > struct nvme_dev *dev; > spinlock_t sq_lock; > - struct nvme_command *sq_cmds; > + void *sq_cmds; It would be great if it can remain the existing data type for the SQEs... But I'm fine with this also. It looks good to me. Reviewed-by: Minwoo Im Thanks,
Re: [PATCH v2 1/3] nvme-pci: Pass the queue to SQ_SIZE/CQ_SIZE macros
On 19-07-17 10:45:25, Benjamin Herrenschmidt wrote: > This will make it easier to handle variable queue entry sizes > later. No functional change. > > Signed-off-by: Benjamin Herrenschmidt > Reviewed-by: Christoph Hellwig Reviewed-by: Minwoo Im
Re: linux-next: Fixes tag needs some work in the block tree
On 19-07-11 16:03:22, Jens Axboe wrote: > On 7/11/19 3:35 PM, Stephen Rothwell wrote: > > Hi all, > > > > In commit > > > >8f3858763d33 ("nvme: fix NULL deref for fabrics options") > > > > Fixes tag > > > >Fixes: 958f2a0f8 ("nvme-tcp: set the STABLE_WRITES flag when data digests > > > > has these problem(s): > > > >- SHA1 should be at least 12 digits long > > Can be fixed by setting core.abbrev to 12 (or more) or (for git v2.11 > > or later) just making sure it is not set (or set to "auto"). > >- Subject has leading but no trailing parentheses > >- Subject has leading but no trailing quotes > > > > Please do not split Fixes tags over more than one line. Also do not > > include blank lines among the tags. I'm sorry for noises here. I will keep that in mind. Thanks Stephen, > > I should have caught that. Since it's top-of-tree and recent, I'll > amend it. Jens, I will do it from the next time. Thanks for ammend.
Re: [PATCH -next] nvme-pci: Make nvme_dev_pm_ops static
On 19-06-26 10:09:02, YueHaibing wrote: > Fix sparse warning: > > drivers/nvme/host/pci.c:2926:25: warning: > symbol 'nvme_dev_pm_ops' was not declared. Should it be static? > > Reported-by: Hulk Robot > Signed-off-by: YueHaibing > --- > drivers/nvme/host/pci.c | 2 +- > 1 file changed, 1 insertion(+), 1 deletion(-) > > diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c > index 1893520..f500133 100644 > --- a/drivers/nvme/host/pci.c > +++ b/drivers/nvme/host/pci.c > @@ -2923,7 +2923,7 @@ static int nvme_simple_resume(struct device *dev) > return 0; > } > > -const struct dev_pm_ops nvme_dev_pm_ops = { > +static const struct dev_pm_ops nvme_dev_pm_ops = { > .suspend= nvme_suspend, > .resume = nvme_resume, > .freeze = nvme_simple_suspend, IMHO, it should be in static. Reviewed-by: Minwoo Im
Re: [PATCH][next] nvme-trace: fix spelling mistake "spcecific" -> "specific"
On 19-06-26 13:43:23, Colin King wrote: > From: Colin Ian King > > There are two spelling mistakes in trace_seq_printf messages, fix these. > > Signed-off-by: Colin Ian King > --- > drivers/nvme/host/trace.c | 2 +- > drivers/nvme/target/trace.c | 2 +- > 2 files changed, 2 insertions(+), 2 deletions(-) > > diff --git a/drivers/nvme/host/trace.c b/drivers/nvme/host/trace.c > index f01ad0fd60bb..6980ab827233 100644 > --- a/drivers/nvme/host/trace.c > +++ b/drivers/nvme/host/trace.c > @@ -178,7 +178,7 @@ static const char *nvme_trace_fabrics_common(struct > trace_seq *p, u8 *spc) > { > const char *ret = trace_seq_buffer_ptr(p); > > - trace_seq_printf(p, "spcecific=%*ph", 24, spc); > + trace_seq_printf(p, "specific=%*ph", 24, spc); > trace_seq_putc(p, 0); > return ret; > } > diff --git a/drivers/nvme/target/trace.c b/drivers/nvme/target/trace.c > index cdcdd14c6408..6af11d493271 100644 > --- a/drivers/nvme/target/trace.c > +++ b/drivers/nvme/target/trace.c > @@ -146,7 +146,7 @@ static const char *nvmet_trace_fabrics_common(struct > trace_seq *p, u8 *spc) > { > const char *ret = trace_seq_buffer_ptr(p); > > - trace_seq_printf(p, "spcecific=%*ph", 24, spc); > + trace_seq_printf(p, "specific=%*ph", 24, spc); > trace_seq_putc(p, 0); > return ret; > } > -- > 2.20.1 > > > ___ > Linux-nvme mailing list > linux-n...@lists.infradead.org > http://lists.infradead.org/mailman/listinfo/linux-nvme *facepalm*.. Thanks for fixing this, Colin! Reviewed-by: Minwoo Im
Re: [RFC PATCH] mpt3sas: support target smid for [abort|query] task
On 19-06-13 16:54:02, Minwoo Im wrote: > We can request task management IOCTL command(MPI2_FUNCTION_SCSI_TASK_MGMT) > to /dev/mpt3ctl. If the given task_type is either abort task or query > task, it may need a field named "Initiator Port Transfer Tag to Manage" > in the IU. > > Current code does not support to check target IPTT tag from the > tm_request. This patch introduces to check TaskMID given from the > userspace as a target tag. We have a rule of relationship between > (struct request *req->tag) and smid in mpt3sas_base.c: > > 3318 u16 > 3319 mpt3sas_base_get_smid_scsiio(struct MPT3SAS_ADAPTER *ioc, u8 cb_idx, > 3320 struct scsi_cmnd *scmd) > 3321 { > 3322 struct scsiio_tracker *request = scsi_cmd_priv(scmd); > 3323 unsigned int tag = scmd->request->tag; > 3324 u16 smid; > 3325 > 3326 smid = tag + 1; > > So if we want to abort a request tagged #X, then we can pass (X + 1) to > this IOCTL handler. > > Signed-off-by: Minwoo Im > --- Sorry for the duplicated patches on the same mailing list. Please ignore the first two of it. Thanks,
[RFC PATCH] mpt3sas: support target smid for [abort|query] task
We can request task management IOCTL command(MPI2_FUNCTION_SCSI_TASK_MGMT) to /dev/mpt3ctl. If the given task_type is either abort task or query task, it may need a field named "Initiator Port Transfer Tag to Manage" in the IU. Current code does not support to check target IPTT tag from the tm_request. This patch introduces to check TaskMID given from the userspace as a target tag. We have a rule of relationship between (struct request *req->tag) and smid in mpt3sas_base.c: 3318 u16 3319 mpt3sas_base_get_smid_scsiio(struct MPT3SAS_ADAPTER *ioc, u8 cb_idx, 3320 struct scsi_cmnd *scmd) 3321 { 3322 struct scsiio_tracker *request = scsi_cmd_priv(scmd); 3323 unsigned int tag = scmd->request->tag; 3324 u16 smid; 3325 3326 smid = tag + 1; So if we want to abort a request tagged #X, then we can pass (X + 1) to this IOCTL handler. Signed-off-by: Minwoo Im --- drivers/scsi/mpt3sas/mpt3sas_ctl.c | 10 ++ 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/drivers/scsi/mpt3sas/mpt3sas_ctl.c b/drivers/scsi/mpt3sas/mpt3sas_ctl.c index b2bb47c14d35..5c7539dae713 100644 --- a/drivers/scsi/mpt3sas/mpt3sas_ctl.c +++ b/drivers/scsi/mpt3sas/mpt3sas_ctl.c @@ -596,15 +596,17 @@ _ctl_set_task_mid(struct MPT3SAS_ADAPTER *ioc, struct mpt3_ioctl_command *karg, if (priv_data->sas_target->handle != handle) continue; st = scsi_cmd_priv(scmd); - tm_request->TaskMID = cpu_to_le16(st->smid); - found = 1; + if (tm_request->TaskMID == st->smid) { + tm_request->TaskMID = cpu_to_le16(st->smid); + found = 1; + } } if (!found) { dctlprintk(ioc, - ioc_info(ioc, "%s: handle(0x%04x), lun(%d), no active mid!!\n", + ioc_info(ioc, "%s: handle(0x%04x), lun(%d), no matched mid(%d)!!\n", desc, le16_to_cpu(tm_request->DevHandle), - lun)); + lun, tm_request->TaskMID)); tm_reply = ioc->ctl_cmds.reply; tm_reply->DevHandle = tm_request->DevHandle; tm_reply->Function = MPI2_FUNCTION_SCSI_TASK_MGMT; -- 2.16.1
[RFC PATCH] mpt3sas: support target smid for [abort|query] task
We can request task management IOCTL command(MPI2_FUNCTION_SCSI_TASK_MGMT) to /dev/mpt3ctl. If the given task_type is either abort task or query task, it may need a field named "Initiator Port Transfer Tag to Manage" in the IU. Current code does not support to check target IPTT tag from the tm_request. This patch introduces to check TaskMID given from the userspace as a target tag. We have a rule of relationship between (struct request *req->tag) and smid in mpt3sas_base.c: 3318 u16 3319 mpt3sas_base_get_smid_scsiio(struct MPT3SAS_ADAPTER *ioc, u8 cb_idx, 3320 struct scsi_cmnd *scmd) 3321 { 3322 struct scsiio_tracker *request = scsi_cmd_priv(scmd); 3323 unsigned int tag = scmd->request->tag; 3324 u16 smid; 3325 3326 smid = tag + 1; So if we want to abort a request tagged #X, then we can pass (X + 1) to this IOCTL handler. Signed-off-by: Minwoo Im --- drivers/scsi/mpt3sas/mpt3sas_ctl.c | 10 ++ 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/drivers/scsi/mpt3sas/mpt3sas_ctl.c b/drivers/scsi/mpt3sas/mpt3sas_ctl.c index b2bb47c14d35..5c7539dae713 100644 --- a/drivers/scsi/mpt3sas/mpt3sas_ctl.c +++ b/drivers/scsi/mpt3sas/mpt3sas_ctl.c @@ -596,15 +596,17 @@ _ctl_set_task_mid(struct MPT3SAS_ADAPTER *ioc, struct mpt3_ioctl_command *karg, if (priv_data->sas_target->handle != handle) continue; st = scsi_cmd_priv(scmd); - tm_request->TaskMID = cpu_to_le16(st->smid); - found = 1; + if (tm_request->TaskMID == st->smid) { + tm_request->TaskMID = cpu_to_le16(st->smid); + found = 1; + } } if (!found) { dctlprintk(ioc, - ioc_info(ioc, "%s: handle(0x%04x), lun(%d), no active mid!!\n", + ioc_info(ioc, "%s: handle(0x%04x), lun(%d), no matched mid(%d)!!\n", desc, le16_to_cpu(tm_request->DevHandle), - lun)); + lun, tm_request->TaskMID)); tm_reply = ioc->ctl_cmds.reply; tm_reply->DevHandle = tm_request->DevHandle; tm_reply->Function = MPI2_FUNCTION_SCSI_TASK_MGMT; -- 2.16.1
[RFC PATCH] mpt3sas: support target smid for [abort|query] task
We can request task management IOCTL command(MPI2_FUNCTION_SCSI_TASK_MGMT) to /dev/mpt3ctl. If the given task_type is either abort task or query task, it may need a field named "Initiator Port Transfer Tag to Manage" in the IU. Current code does not support to check target IPTT tag from the tm_request. This patch introduces to check TaskMID given from the userspace as a target tag. We have a rule of relationship between (struct request *req->tag) and smid in mpt3sas_base.c: 3318 u16 3319 mpt3sas_base_get_smid_scsiio(struct MPT3SAS_ADAPTER *ioc, u8 cb_idx, 3320 struct scsi_cmnd *scmd) 3321 { 3322 struct scsiio_tracker *request = scsi_cmd_priv(scmd); 3323 unsigned int tag = scmd->request->tag; 3324 u16 smid; 3325 3326 smid = tag + 1; So if we want to abort a request tagged #X, then we can pass (X + 1) to this IOCTL handler. Signed-off-by: Minwoo Im --- drivers/scsi/mpt3sas/mpt3sas_ctl.c | 10 ++ 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/drivers/scsi/mpt3sas/mpt3sas_ctl.c b/drivers/scsi/mpt3sas/mpt3sas_ctl.c index b2bb47c14d35..5c7539dae713 100644 --- a/drivers/scsi/mpt3sas/mpt3sas_ctl.c +++ b/drivers/scsi/mpt3sas/mpt3sas_ctl.c @@ -596,15 +596,17 @@ _ctl_set_task_mid(struct MPT3SAS_ADAPTER *ioc, struct mpt3_ioctl_command *karg, if (priv_data->sas_target->handle != handle) continue; st = scsi_cmd_priv(scmd); - tm_request->TaskMID = cpu_to_le16(st->smid); - found = 1; + if (tm_request->TaskMID == st->smid) { + tm_request->TaskMID = cpu_to_le16(st->smid); + found = 1; + } } if (!found) { dctlprintk(ioc, - ioc_info(ioc, "%s: handle(0x%04x), lun(%d), no active mid!!\n", + ioc_info(ioc, "%s: handle(0x%04x), lun(%d), no matched mid(%d)!!\n", desc, le16_to_cpu(tm_request->DevHandle), - lun)); + lun, tm_request->TaskMID)); tm_reply = ioc->ctl_cmds.reply; tm_reply->DevHandle = tm_request->DevHandle; tm_reply->Function = MPI2_FUNCTION_SCSI_TASK_MGMT; -- 2.16.1
[tip:irq/core] genirq/affinity: Remove unused argument from [__]irq_build_affinity_masks()
Commit-ID: 0e51833042fccfe882ef3e85a346252550d26c22 Gitweb: https://git.kernel.org/tip/0e51833042fccfe882ef3e85a346252550d26c22 Author: Minwoo Im AuthorDate: Sun, 2 Jun 2019 20:21:17 +0900 Committer: Thomas Gleixner CommitDate: Wed, 12 Jun 2019 10:52:45 +0200 genirq/affinity: Remove unused argument from [__]irq_build_affinity_masks() The *affd argument is neither used in irq_build_affinity_masks() nor __irq_build_affinity_masks(). Remove it. Signed-off-by: Minwoo Im Signed-off-by: Thomas Gleixner Reviewed-by: Ming Lei Cc: Minwoo Im Cc: linux-bl...@vger.kernel.org Link: https://lkml.kernel.org/r/20190602112117.31839-1-minwoo.im@gmail.com --- kernel/irq/affinity.c | 12 +--- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c index f18cd5aa33e8..4352b08ae48d 100644 --- a/kernel/irq/affinity.c +++ b/kernel/irq/affinity.c @@ -94,8 +94,7 @@ static int get_nodes_in_cpumask(cpumask_var_t *node_to_cpumask, return nodes; } -static int __irq_build_affinity_masks(const struct irq_affinity *affd, - unsigned int startvec, +static int __irq_build_affinity_masks(unsigned int startvec, unsigned int numvecs, unsigned int firstvec, cpumask_var_t *node_to_cpumask, @@ -171,8 +170,7 @@ static int __irq_build_affinity_masks(const struct irq_affinity *affd, * 1) spread present CPU on these vectors * 2) spread other possible CPUs on these vectors */ -static int irq_build_affinity_masks(const struct irq_affinity *affd, - unsigned int startvec, unsigned int numvecs, +static int irq_build_affinity_masks(unsigned int startvec, unsigned int numvecs, unsigned int firstvec, struct irq_affinity_desc *masks) { @@ -197,7 +195,7 @@ static int irq_build_affinity_masks(const struct irq_affinity *affd, build_node_to_cpumask(node_to_cpumask); /* Spread on present CPUs starting from affd->pre_vectors */ - nr_present = __irq_build_affinity_masks(affd, curvec, numvecs, + nr_present = __irq_build_affinity_masks(curvec, numvecs, firstvec, node_to_cpumask, cpu_present_mask, nmsk, masks); @@ -212,7 +210,7 @@ static int irq_build_affinity_masks(const struct irq_affinity *affd, else curvec = firstvec + nr_present; cpumask_andnot(npresmsk, cpu_possible_mask, cpu_present_mask); - nr_others = __irq_build_affinity_masks(affd, curvec, numvecs, + nr_others = __irq_build_affinity_masks(curvec, numvecs, firstvec, node_to_cpumask, npresmsk, nmsk, masks); put_online_cpus(); @@ -295,7 +293,7 @@ irq_create_affinity_masks(unsigned int nvecs, struct irq_affinity *affd) unsigned int this_vecs = affd->set_size[i]; int ret; - ret = irq_build_affinity_masks(affd, curvec, this_vecs, + ret = irq_build_affinity_masks(curvec, this_vecs, curvec, masks); if (ret) { kfree(masks);
Re: [PATCH V3 0/5] nvme-trace: Add support for fabrics command
On 19-05-12 16:34:08, Minwoo Im wrote: > Hi, > > Here's a third patchset to support fabrics command tracing. The first > patch updated host/trace module to a outside of it to provide common > interfaces for host and target both. The second one adds support for > tracing fabrics command from host-side. The third is a trivial clean-up > for providing a helper function to figure out given command structure is > for fabrics or not. > > The fourth and fifth are the major change points of this patchset. 4th > patch adds request tracing from the target-side. 5th updated, of course, > completion of given request. > > Please review. > Thanks, > > Changes to V2: > - Provide a common code for both host and target. (Chaitanya) > - Add support for tracing requests in target-side (Chaitanya) > - Make it simple in trace.h without branch out from nvme core module > (Christoph) > > Changes to V1: > - fabrics commands should also be decoded, not just showing that it's > a fabrics command. (Christoph) > - do not make it within nvme admin commands (Chaitanya) > > Minwoo Im (5): > nvme: Make trace common for host and target both > nvme-trace: Support tracing fabrics commands from host-side > nvme: Introduce nvme_is_fabrics to check fabrics cmd > nvme-trace: Add tracing for req_init in trarget > nvme-trace: Add tracing for req_comp in target Ping :)
Re: [PATCH v3 5/7] nvme-pci: add device coredump infrastructure
> +static int nvme_get_telemetry_log_blocks(struct nvme_ctrl *ctrl, void *buf, > + size_t bytes, loff_t offset) > +{ > + loff_t pos = 0; > + u32 chunk_size; > + > + if (check_mul_overflow(ctrl->max_hw_sectors, 512u, &chunk_size)) > + chunk_size = UINT_MAX; > + > + while (pos < bytes) { > + size_t size = min_t(size_t, bytes - pos, chunk_size); > + int ret; > + > + ret = nvme_get_log(ctrl, NVME_NSID_ALL, > NVME_LOG_TELEMETRY_CTRL, > +0, buf + pos, size, offset + pos); > + if (ret) > + return ret; > + > + pos += size; > + } > + > + return 0; > +} > + > +static int nvme_get_telemetry_log(struct nvme_ctrl *ctrl, > + struct sg_table *table, size_t bytes) > +{ > + int n = sg_nents(table->sgl); > + struct scatterlist *sg; > + size_t offset = 0; > + int i; > + > + for_each_sg(table->sgl, sg, n, i) { > + struct page *page = sg_page(sg); > + size_t size = min_t(int, bytes - offset, sg->length); > + int ret; > + > + ret = nvme_get_telemetry_log_blocks(ctrl, > page_address(page), > + size, offset); > + if (ret) > + return ret; > + > + offset += size; > + } > + > + return 0; > +} Can we have those two in nvme-core module instead of being in pci module?
Re: [PATCH v3 6/7] nvme-pci: trigger device coredump on command timeout
> -static void __maybe_unused nvme_coredump_init(struct nvme_dev *dev); > -static void __maybe_unused nvme_coredump_logs(struct nvme_dev *dev); > -static void __maybe_unused nvme_coredump_complete(struct nvme_dev > *dev); > +static void nvme_coredump_init(struct nvme_dev *dev); > +static void nvme_coredump_logs(struct nvme_dev *dev); > +static void nvme_coredump_complete(struct nvme_dev *dev); You just have added those three prototypes in previous patch. Did I miss something here?
Re: [PATCH] nvme/pci: Use host managed power state for suspend
> > + union nvme_result res; > > + int ret; > > + > > + if (!result) > > + return -EINVAL; > > + > > + memset(&c, 0, sizeof(c)); > > + c.features.opcode = nvme_admin_get_features; > > + c.features.fid = cpu_to_le32(NVME_FEAT_POWER_MGMT); > > + > > + ret = __nvme_submit_sync_cmd(ctrl->admin_q, &c, &res, > > + NULL, 0, 0, NVME_QID_ANY, 0, 0, false); > > + if (ret >= 0) > > + *result = le32_to_cpu(res.u32); > May be add a check for result here in above if before deref pointer :- > if (ret >= 0 && result) > 'result' already has been checked in a few lines above.
[PATCH V3 3/5] nvme: Introduce nvme_is_fabrics to check fabrics cmd
This patch introduce a nvme_is_fabrics() inline function to check whether or not the given command structure is for fabrics. Cc: Keith Busch Cc: Jens Axboe Cc: Christoph Hellwig Cc: Sagi Grimberg Cc: James Smart Signed-off-by: Minwoo Im --- drivers/nvme/host/fabrics.c | 2 +- drivers/nvme/target/core.c| 2 +- drivers/nvme/target/fabrics-cmd.c | 2 +- drivers/nvme/target/fc.c | 2 +- include/linux/nvme.h | 7 ++- 5 files changed, 10 insertions(+), 5 deletions(-) diff --git a/drivers/nvme/host/fabrics.c b/drivers/nvme/host/fabrics.c index 592d1e61ef7e..931995f2dbc3 100644 --- a/drivers/nvme/host/fabrics.c +++ b/drivers/nvme/host/fabrics.c @@ -578,7 +578,7 @@ bool __nvmf_check_ready(struct nvme_ctrl *ctrl, struct request *rq, switch (ctrl->state) { case NVME_CTRL_NEW: case NVME_CTRL_CONNECTING: - if (req->cmd->common.opcode == nvme_fabrics_command && + if (nvme_is_fabrics(req->cmd) && req->cmd->fabrics.fctype == nvme_fabrics_type_connect) return true; break; diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c index 7734a6acff85..da2ea97042af 100644 --- a/drivers/nvme/target/core.c +++ b/drivers/nvme/target/core.c @@ -871,7 +871,7 @@ bool nvmet_req_init(struct nvmet_req *req, struct nvmet_cq *cq, status = nvmet_parse_connect_cmd(req); else if (likely(req->sq->qid != 0)) status = nvmet_parse_io_cmd(req); - else if (req->cmd->common.opcode == nvme_fabrics_command) + else if (nvme_is_fabrics(req->cmd)) status = nvmet_parse_fabrics_cmd(req); else if (req->sq->ctrl->subsys->type == NVME_NQN_DISC) status = nvmet_parse_discovery_cmd(req); diff --git a/drivers/nvme/target/fabrics-cmd.c b/drivers/nvme/target/fabrics-cmd.c index 3b9f79aba98f..d16b55ffe79f 100644 --- a/drivers/nvme/target/fabrics-cmd.c +++ b/drivers/nvme/target/fabrics-cmd.c @@ -268,7 +268,7 @@ u16 nvmet_parse_connect_cmd(struct nvmet_req *req) { struct nvme_command *cmd = req->cmd; - if (cmd->common.opcode != nvme_fabrics_command) { + if (!nvme_is_fabrics(cmd)) { pr_err("invalid command 0x%x on unconnected queue.\n", cmd->fabrics.opcode); req->error_loc = offsetof(struct nvme_common_command, opcode); diff --git a/drivers/nvme/target/fc.c b/drivers/nvme/target/fc.c index 508661af0f50..a59c5a013a5c 100644 --- a/drivers/nvme/target/fc.c +++ b/drivers/nvme/target/fc.c @@ -1806,7 +1806,7 @@ nvmet_fc_prep_fcp_rsp(struct nvmet_fc_tgtport *tgtport, */ rspcnt = atomic_inc_return(&fod->queue->zrspcnt); if (!(rspcnt % fod->queue->ersp_ratio) || - sqe->opcode == nvme_fabrics_command || + nvme_is_fabrics((struct nvme_command *) sqe) || xfr_length != fod->req.transfer_len || (le16_to_cpu(cqe->status) & 0xFFFE) || cqewd[0] || cqewd[1] || (sqe->flags & (NVME_CMD_FUSE_FIRST | NVME_CMD_FUSE_SECOND)) || diff --git a/include/linux/nvme.h b/include/linux/nvme.h index c40720cb59ac..ab5e9392b42d 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -1165,6 +1165,11 @@ struct nvme_command { }; }; +static inline bool nvme_is_fabrics(struct nvme_command *cmd) +{ + return cmd->common.opcode == nvme_fabrics_command; +} + struct nvme_error_slot { __le64 error_count; __le16 sqid; @@ -1186,7 +1191,7 @@ static inline bool nvme_is_write(struct nvme_command *cmd) * * Why can't we simply have a Fabrics In and Fabrics out command? */ - if (unlikely(cmd->common.opcode == nvme_fabrics_command)) + if (unlikely(nvme_is_fabrics(cmd))) return cmd->fabrics.fctype & 1; return cmd->common.opcode & 1; } -- 2.17.1
[PATCH V3 5/5] nvme-trace: Add tracing for req_comp in target
We can have the common tracing code with different event entries. - nvme_complete_rq - nvmet_req_complete This patch updates existing TRACE_EVENT to a template to provide a common tracing interface. We can have it as a common code because most of the fields need to be printed out for both host and target system. Cc: Keith Busch Cc: Jens Axboe Cc: Christoph Hellwig Cc: Sagi Grimberg Cc: James Smart Signed-off-by: Minwoo Im --- drivers/nvme/host/core.c | 2 +- drivers/nvme/target/core.c | 3 +++ drivers/nvme/trace.c | 1 + drivers/nvme/trace.h | 51 ++ 4 files changed, 45 insertions(+), 12 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 39e49e9948c3..f377ed039a83 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -260,7 +260,7 @@ void nvme_complete_rq(struct request *req) { blk_status_t status = nvme_error_status(req); - trace_nvme_complete_rq(req); + trace_nvme_complete_rq(NVME_TRACE_HOST, req); if (nvme_req(req)->ctrl->kas) nvme_req(req)->ctrl->comp_seen = true; diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c index 10b3b3767f91..0f184abe432f 100644 --- a/drivers/nvme/target/core.c +++ b/drivers/nvme/target/core.c @@ -690,6 +690,9 @@ static void __nvmet_req_complete(struct nvmet_req *req, u16 status) if (unlikely(status)) nvmet_set_error(req, status); + + trace_nvmet_req_complete(NVME_TRACE_TARGET, req); + if (req->ns) nvmet_put_namespace(req->ns); req->ops->queue_response(req); diff --git a/drivers/nvme/trace.c b/drivers/nvme/trace.c index 8fe2dcee6a42..8071b60ec71d 100644 --- a/drivers/nvme/trace.c +++ b/drivers/nvme/trace.c @@ -222,3 +222,4 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(nvme_async_event); EXPORT_TRACEPOINT_SYMBOL_GPL(nvme_sq); EXPORT_TRACEPOINT_SYMBOL_GPL(nvmet_req_init); +EXPORT_TRACEPOINT_SYMBOL_GPL(nvmet_req_complete); diff --git a/drivers/nvme/trace.h b/drivers/nvme/trace.h index afda9c2ab4a1..0674bb85ac66 100644 --- a/drivers/nvme/trace.h +++ b/drivers/nvme/trace.h @@ -181,9 +181,9 @@ DEFINE_EVENT(nvme__cmd_begin, nvmet_req_init, TP_ARGS(type, req, cmd) ); -TRACE_EVENT(nvme_complete_rq, - TP_PROTO(struct request *req), - TP_ARGS(req), +DECLARE_EVENT_CLASS(nvme__cmd_end, + TP_PROTO(enum nvme_trace_type type, void *req), + TP_ARGS(type, req), TP_STRUCT__entry( __array(char, disk, DISK_NAME_LEN) __field(int, ctrl_id) @@ -195,20 +195,49 @@ TRACE_EVENT(nvme_complete_rq, __field(u16, status) ), TP_fast_assign( - __entry->ctrl_id = nvme_req(req)->ctrl->instance; - __entry->qid = nvme_req_qid(req); - __entry->cid = req->tag; - __entry->result = le64_to_cpu(nvme_req(req)->result.u64); - __entry->retries = nvme_req(req)->retries; - __entry->flags = nvme_req(req)->flags; - __entry->status = nvme_req(req)->status; - __assign_disk_name(__entry->disk, req->rq_disk); + if (type != NVME_TRACE_TARGET) { + struct request *req = (struct request *) req; + + __entry->ctrl_id = nvme_req(req)->ctrl->instance; + __entry->qid = nvme_req_qid(req); + __entry->cid = req->tag; + __entry->result = + le64_to_cpu(nvme_req(req)->result.u64); + __entry->retries = nvme_req(req)->retries; + __entry->flags = nvme_req(req)->flags; + __entry->status = nvme_req(req)->status; + __assign_disk_name(__entry->disk, req->rq_disk); + } else { + struct nvmet_ctrl *ctrl = nvmet_req_to_ctrl(req); + struct nvmet_cq *cq = ((struct nvmet_req *) req)->cq; + struct nvme_completion *cqe = + ((struct nvmet_req *) req)->cqe; + struct nvmet_ns *ns = ((struct nvmet_req *) req)->ns; + + __entry->ctrl_id = ctrl ? ctrl->cntlid : 0; + __entry->qid = cq->qid; + __entry->cid = cqe->command_id; + __entry->result = cqe->result.u64; + __entry->flags = 0; + __entry->status = cqe->status >> 1; + __assign_disk_name(__entry->disk, ns ? + ns->bdev->bd_disk : NULL); + }
[PATCH V3 4/5] nvme-trace: Add tracing for req_init in trarget
We can have the common tracing code with different event entries. - nvme_setup_cmd - nvmet_req_init This patch updates existing TRACE_EVENT to a template to provide a common tracing interface. More than that, nvme_setup_cmd entry point has been defined as an event referring template made. It also introduces tracing at the point of request creation for the target system. This might be useful to figure out what kind of request has been received in the target. Here's the example of target tracing introduced with RDMA: kworker/0:1H-1043 [000] 276.785946: nvmet_req_init: nvme0: qid=0, cmdid=0, nsid=1, flags=0x40, meta=0x0, cmd=(nvme_fabrics_type_connect recfmt=0, qid=0, sqsize=31, cattr=0, kato=0) kworker/0:1H-1043 [000] 276.789893: nvmet_req_init: nvme1: qid=0, cmdid=10, nsid=4, flags=0x40, meta=0x0, cmd=(nvme_fabrics_type_property_get attrib=1, ofst=0x0) kworker/0:1H-1043 [000] 276.791781: nvmet_req_init: nvme1: qid=0, cmdid=11, nsid=0, flags=0x40, meta=0x0, cmd=(nvme_fabrics_type_property_set attrib=0, ofst=0x14, value=0x460001) kworker/0:1H-1043 [000] 276.794799: nvmet_req_init: nvme1: qid=0, cmdid=12, nsid=4, flags=0x40, meta=0x0, cmd=(nvme_fabrics_type_property_get attrib=0, ofst=0x1c) kworker/0:1H-1043 [000] 276.796804: nvmet_req_init: nvme1: qid=0, cmdid=13, nsid=4, flags=0x40, meta=0x0, cmd=(nvme_fabrics_type_property_get attrib=0, ofst=0x8) kworker/0:1H-1043 [000] 276.799163: nvmet_req_init: nvme1: qid=0, cmdid=10, nsid=4, flags=0x40, meta=0x0, cmd=(nvme_fabrics_type_property_get attrib=1, ofst=0x0) kworker/0:1H-1043 [000] 276.801070: nvmet_req_init: nvme1: qid=0, cmdid=11, nsid=0, flags=0x40, meta=0x0, cmd=(nvme_admin_identify cns=1, ctrlid=0) kworker/0:1H-1043 [000] 276.817592: nvmet_req_init: nvme1: qid=0, cmdid=12, nsid=0, flags=0x40, meta=0x0, cmd=(nvme_admin_get_log_page cdw10=70 00 03 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00) kworker/0:1H-1043 [000] 276.822963: nvmet_req_init: nvme1: qid=0, cmdid=13, nsid=0, flags=0x40, meta=0x0, cmd=(nvme_admin_get_log_page cdw10=70 00 ff 01 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00) kworker/0:1H-1043 [000] 276.831908: nvmet_req_init: nvme1: qid=0, cmdid=10, nsid=0, flags=0x40, meta=0x0, cmd=(nvme_admin_get_log_page cdw10=70 00 03 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00) Cc: Keith Busch Cc: Jens Axboe Cc: Christoph Hellwig Cc: Sagi Grimberg Cc: James Smart Signed-off-by: Minwoo Im --- drivers/nvme/host/core.c| 2 +- drivers/nvme/target/core.c | 3 +++ drivers/nvme/target/nvmet.h | 9 +++ drivers/nvme/trace.c| 2 ++ drivers/nvme/trace.h| 50 - 5 files changed, 59 insertions(+), 7 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index ae76c0b78a5f..39e49e9948c3 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -732,7 +732,7 @@ blk_status_t nvme_setup_cmd(struct nvme_ns *ns, struct request *req, } cmd->common.command_id = req->tag; - trace_nvme_setup_cmd(req, cmd); + trace_nvme_setup_cmd(NVME_TRACE_HOST, req, cmd); return ret; } EXPORT_SYMBOL_GPL(nvme_setup_cmd); diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c index da2ea97042af..10b3b3767f91 100644 --- a/drivers/nvme/target/core.c +++ b/drivers/nvme/target/core.c @@ -10,6 +10,7 @@ #include #include +#include "../trace.h" #include "nvmet.h" struct workqueue_struct *buffered_io_wq; @@ -848,6 +849,8 @@ bool nvmet_req_init(struct nvmet_req *req, struct nvmet_cq *cq, req->error_loc = NVMET_NO_ERROR_LOC; req->error_slba = 0; + trace_nvmet_req_init(NVME_TRACE_TARGET, req, req->cmd); + /* no support for fused commands yet */ if (unlikely(flags & (NVME_CMD_FUSE_FIRST | NVME_CMD_FUSE_SECOND))) { req->error_loc = offsetof(struct nvme_common_command, flags); diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h index c25d88fc9dec..2d569a1dc3f4 100644 --- a/drivers/nvme/target/nvmet.h +++ b/drivers/nvme/target/nvmet.h @@ -318,6 +318,15 @@ struct nvmet_req { u64 error_slba; }; +static inline struct nvmet_ctrl *nvmet_req_to_ctrl(struct nvmet_req *req) +{ + struct nvmet_sq *sq = req->sq; + + if (sq) + return sq->ctrl; + return NULL; +} + extern struct workqueue_struct *buffered_io_wq; static inline void nvmet_set_result(struct nvmet_req *req, u32 result) diff --git a/drivers/nvme/trace.c b/drivers/nvme/trace.c index 88dfadb68b92..8fe2dcee6a42 100644 --- a/drivers/nvme/trace.c +++ b/drivers/nvme/trace.c @@ -220,3 +220,5 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(nvme_setup_cmd); EXPORT_TRACEPOINT_SYMBOL_GPL(nvme_complete_rq); EXPORT_TRACEPOINT_SYMBOL_GPL(nvm
[PATCH V3 2/5] nvme-trace: Support tracing fabrics commands from host-side
This patch adds support for tracing fabrics commands detected from host-side. We can have trace_nvme_setup_cmd for nvme and nvme fabrics both even fabrics submission queue entry does not have valid fields in where nvme's fields are valid. This patch modifies existing parse_nvme_cmd to have fctype as an argument to support fabrics command which will be ignored in case of nvme common case. Cc: Keith Busch Cc: Jens Axboe Cc: Christoph Hellwig Cc: Sagi Grimberg Cc: James Smart Signed-off-by: Minwoo Im --- drivers/nvme/trace.c | 67 drivers/nvme/trace.h | 41 +-- 2 files changed, 100 insertions(+), 8 deletions(-) diff --git a/drivers/nvme/trace.c b/drivers/nvme/trace.c index a2c8186d122d..88dfadb68b92 100644 --- a/drivers/nvme/trace.c +++ b/drivers/nvme/trace.c @@ -137,6 +137,73 @@ const char *nvme_trace_parse_nvm_cmd(struct trace_seq *p, } } +static const char *nvme_trace_fabrics_property_set(struct trace_seq *p, u8 *spc) +{ + const char *ret = trace_seq_buffer_ptr(p); + u8 attrib = spc[0]; + u32 ofst = get_unaligned_le32(spc + 4); + u64 value = get_unaligned_le64(spc + 8); + + trace_seq_printf(p, "attrib=%u, ofst=0x%x, value=0x%llx", + attrib, ofst, value); + trace_seq_putc(p, 0); + + return ret; +} + +static const char *nvme_trace_fabrics_connect(struct trace_seq *p, u8 *spc) +{ + const char *ret = trace_seq_buffer_ptr(p); + u16 recfmt = get_unaligned_le16(spc); + u16 qid = get_unaligned_le16(spc + 2); + u16 sqsize = get_unaligned_le16(spc + 4); + u8 cattr = spc[6]; + u32 kato = get_unaligned_le32(spc + 8); + + trace_seq_printf(p, "recfmt=%u, qid=%u, sqsize=%u, cattr=%u, kato=%u", + recfmt, qid, sqsize, cattr, kato); + trace_seq_putc(p, 0); + + return ret; +} + +static const char *nvme_trace_fabrics_property_get(struct trace_seq *p, u8 *spc) +{ + const char *ret = trace_seq_buffer_ptr(p); + u8 attrib = spc[0]; + u32 ofst = get_unaligned_le32(spc + 4); + + trace_seq_printf(p, "attrib=%u, ofst=0x%x", attrib, ofst); + trace_seq_putc(p, 0); + + return ret; +} + +static const char *nvme_trace_fabrics_common(struct trace_seq *p, u8 *spc) +{ + const char *ret = trace_seq_buffer_ptr(p); + + trace_seq_printf(p, "spcecific=%*ph", 24, spc); + trace_seq_putc(p, 0); + + return ret; +} + +const char *nvme_trace_parse_fabrics_cmd(struct trace_seq *p, +u8 fctype, u8 *spc) +{ + switch (fctype) { + case nvme_fabrics_type_property_set: + return nvme_trace_fabrics_property_set(p, spc); + case nvme_fabrics_type_connect: + return nvme_trace_fabrics_connect(p, spc); + case nvme_fabrics_type_property_get: + return nvme_trace_fabrics_property_get(p, spc); + default: + return nvme_trace_fabrics_common(p, spc); + } +} + const char *nvme_trace_disk_name(struct trace_seq *p, char *name) { const char *ret = trace_seq_buffer_ptr(p); diff --git a/drivers/nvme/trace.h b/drivers/nvme/trace.h index acf10c7a5bef..7fbfd6cb815c 100644 --- a/drivers/nvme/trace.h +++ b/drivers/nvme/trace.h @@ -57,18 +57,39 @@ nvme_opcode_name(nvme_cmd_resv_acquire),\ nvme_opcode_name(nvme_cmd_resv_release)) -#define show_opcode_name(qid, opcode) \ - (qid ? show_nvm_opcode_name(opcode) : show_admin_opcode_name(opcode)) +#define nvme_fabrics_type_name(type) { type, #type } +#define show_fabrics_type_name(type) \ + __print_symbolic(type, \ + nvme_fabrics_type_name(nvme_fabrics_type_property_set), \ + nvme_fabrics_type_name(nvme_fabrics_type_connect), \ + nvme_fabrics_type_name(nvme_fabrics_type_property_get)) + +/* + * If not fabrics, fctype will be ignored. + */ +#define show_opcode_name(qid, opcode, fctype) \ + ((opcode != nvme_fabrics_command) ? \ + (qid ? \ + show_nvm_opcode_name(opcode) : \ + show_admin_opcode_name(opcode)) : \ + show_fabrics_type_name(fctype)) const char *nvme_trace_parse_admin_cmd(struct trace_seq *p, u8 opcode, u8 *cdw10); const char *nvme_trace_parse_nvm_cmd(struct trace_seq *p, u8 opcode, u8 *cdw10); +const char *nvme_trace_parse_fabrics_cmd(struct trace_seq *p, + u8 fctype, u8 *spc); -#define parse_nvme_cmd(qid, opco
[PATCH V3 1/5] nvme: Make trace common for host and target both
To support target-side trace, nvme-trace should be commonized for host and target both. Of course, not every single tracepoints are necessary by both of them, but we don't need to have more than one trace module for host and target. Cc: Keith Busch Cc: Jens Axboe Cc: Christoph Hellwig Cc: Sagi Grimberg Cc: James Smart Signed-off-by: Minwoo Im --- MAINTAINERS | 2 ++ drivers/nvme/Makefile | 3 +++ drivers/nvme/host/Makefile | 1 - drivers/nvme/host/core.c| 3 +-- drivers/nvme/host/pci.c | 2 +- drivers/nvme/{host => }/trace.c | 5 + drivers/nvme/{host => }/trace.h | 2 +- 7 files changed, 13 insertions(+), 5 deletions(-) rename drivers/nvme/{host => }/trace.c (95%) rename drivers/nvme/{host => }/trace.h (99%) diff --git a/MAINTAINERS b/MAINTAINERS index 09f43f1bdd15..5974cadafcf7 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -11084,6 +11084,7 @@ L: linux-n...@lists.infradead.org T: git://git.infradead.org/nvme.git W: http://git.infradead.org/nvme.git S: Supported +F: drivers/nvme/ F: drivers/nvme/host/ F: include/linux/nvme.h F: include/uapi/linux/nvme_ioctl.h @@ -11105,6 +11106,7 @@ L: linux-n...@lists.infradead.org T: git://git.infradead.org/nvme.git W: http://git.infradead.org/nvme.git S: Supported +F: drivers/nvme/ F: drivers/nvme/target/ NVMEM FRAMEWORK diff --git a/drivers/nvme/Makefile b/drivers/nvme/Makefile index 0096a7fd1431..12f193502d46 100644 --- a/drivers/nvme/Makefile +++ b/drivers/nvme/Makefile @@ -1,3 +1,6 @@ +ccflags-y += -I$(src) +obj-$(CONFIG_TRACING) += trace.o + obj-y += host/ obj-y += target/ diff --git a/drivers/nvme/host/Makefile b/drivers/nvme/host/Makefile index 8a4b671c5f0c..46453352bfa0 100644 --- a/drivers/nvme/host/Makefile +++ b/drivers/nvme/host/Makefile @@ -10,7 +10,6 @@ obj-$(CONFIG_NVME_FC) += nvme-fc.o obj-$(CONFIG_NVME_TCP) += nvme-tcp.o nvme-core-y:= core.o -nvme-core-$(CONFIG_TRACING)+= trace.o nvme-core-$(CONFIG_NVME_MULTIPATH) += multipath.o nvme-core-$(CONFIG_NVM)+= lightnvm.o nvme-core-$(CONFIG_FAULT_INJECTION_DEBUG_FS) += fault_inject.o diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index eebaeadaa800..ae76c0b78a5f 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -21,8 +21,7 @@ #include #include -#define CREATE_TRACE_POINTS -#include "trace.h" +#include "../trace.h" #include "nvme.h" #include "fabrics.h" diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 2a8708c9ac18..d90b66543d25 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -24,7 +24,7 @@ #include #include -#include "trace.h" +#include "../trace.h" #include "nvme.h" #define SQ_SIZE(depth) (depth * sizeof(struct nvme_command)) diff --git a/drivers/nvme/host/trace.c b/drivers/nvme/trace.c similarity index 95% rename from drivers/nvme/host/trace.c rename to drivers/nvme/trace.c index 5f24ea7a28eb..a2c8186d122d 100644 --- a/drivers/nvme/host/trace.c +++ b/drivers/nvme/trace.c @@ -5,6 +5,8 @@ */ #include + +#define CREATE_TRACE_POINTS #include "trace.h" static const char *nvme_trace_create_sq(struct trace_seq *p, u8 *cdw10) @@ -147,4 +149,7 @@ const char *nvme_trace_disk_name(struct trace_seq *p, char *name) } EXPORT_SYMBOL_GPL(nvme_trace_disk_name); +EXPORT_TRACEPOINT_SYMBOL_GPL(nvme_setup_cmd); +EXPORT_TRACEPOINT_SYMBOL_GPL(nvme_complete_rq); +EXPORT_TRACEPOINT_SYMBOL_GPL(nvme_async_event); EXPORT_TRACEPOINT_SYMBOL_GPL(nvme_sq); diff --git a/drivers/nvme/host/trace.h b/drivers/nvme/trace.h similarity index 99% rename from drivers/nvme/host/trace.h rename to drivers/nvme/trace.h index 97d3c77365b8..acf10c7a5bef 100644 --- a/drivers/nvme/host/trace.h +++ b/drivers/nvme/trace.h @@ -14,7 +14,7 @@ #include #include -#include "nvme.h" +#include "host/nvme.h" #define nvme_admin_opcode_name(opcode) { opcode, #opcode } #define show_admin_opcode_name(val)\ -- 2.17.1
[PATCH V3 0/5] nvme-trace: Add support for fabrics command
Hi, Here's a third patchset to support fabrics command tracing. The first patch updated host/trace module to a outside of it to provide common interfaces for host and target both. The second one adds support for tracing fabrics command from host-side. The third is a trivial clean-up for providing a helper function to figure out given command structure is for fabrics or not. The fourth and fifth are the major change points of this patchset. 4th patch adds request tracing from the target-side. 5th updated, of course, completion of given request. Please review. Thanks, Changes to V2: - Provide a common code for both host and target. (Chaitanya) - Add support for tracing requests in target-side (Chaitanya) - Make it simple in trace.h without branch out from nvme core module (Christoph) Changes to V1: - fabrics commands should also be decoded, not just showing that it's a fabrics command. (Christoph) - do not make it within nvme admin commands (Chaitanya) Minwoo Im (5): nvme: Make trace common for host and target both nvme-trace: Support tracing fabrics commands from host-side nvme: Introduce nvme_is_fabrics to check fabrics cmd nvme-trace: Add tracing for req_init in trarget nvme-trace: Add tracing for req_comp in target MAINTAINERS | 2 + drivers/nvme/Makefile | 3 + drivers/nvme/host/Makefile| 1 - drivers/nvme/host/core.c | 7 +- drivers/nvme/host/fabrics.c | 2 +- drivers/nvme/host/pci.c | 2 +- drivers/nvme/target/core.c| 8 +- drivers/nvme/target/fabrics-cmd.c | 2 +- drivers/nvme/target/fc.c | 2 +- drivers/nvme/target/nvmet.h | 9 ++ drivers/nvme/{host => }/trace.c | 75 drivers/nvme/{host => }/trace.h | 144 -- include/linux/nvme.h | 7 +- 13 files changed, 227 insertions(+), 37 deletions(-) rename drivers/nvme/{host => }/trace.c (65%) rename drivers/nvme/{host => }/trace.h (59%) -- 2.17.1
Re: [PATCH v2 6/7] nvme-pci: add device coredump support
> This is a bit of a mine field. The shutdown_lock is held when reclaiming > requests that didn't see a response. If you're holding it here and your > telemetry log page times out, we're going to deadlock. And since the > controller is probably in a buggered state when you try to retrieve one, > I would guess an unrecoverable timeout is the most likely outcome. Akinobu, I actually agree with Keith's one. In my experience, there was always internal error inside device when timeout occurs in nvme driver which means the following command might not be completed due to lack of response from device.
Re: [PATCH 3/4] nvme-pci: add device coredump support
On 5/4/19 11:38 PM, Minwoo Im wrote: On 5/4/19 11:26 PM, Akinobu Mita wrote: 2019年5月4日(土) 19:04 Minwoo Im : + { NVME_REG_INTMS, "intms", 32 }, + { NVME_REG_INTMC, "intmc", 32 }, + { NVME_REG_CC, "cc", 32 }, + { NVME_REG_CSTS, "csts", 32 }, + { NVME_REG_NSSR, "nssr", 32 }, + { NVME_REG_AQA, "aqa", 32 }, + { NVME_REG_ASQ, "asq", 64 }, + { NVME_REG_ACQ, "acq", 64 }, + { NVME_REG_CMBLOC, "cmbloc", 32 }, + { NVME_REG_CMBSZ, "cmbsz", 32 }, If it's going to support optional registers also, then we can have BP-related things (BPINFO, BPRSEL, BPMBL) here also. I'm going to change the register dump in binary format just like 'nvme show-regs -o binary' does. So we'll have registers from 00h to 4Fh. Got it. And now I can see those two commands `nvme show-regs` and `nvme show-regs -o binary` have different results for the register range. The binary output covers just 0x50 size, but it shows all the registers including BP-related things in normal && json format. Anyway, I'll prepare a patch for nvme-cli to support binary output format to cover BP things also. Thanks, for your reply. My bad, I misunderstood what you have said above. Please ignore what I mentioned. BP things are located from 40h. to 4Fh. Sorry for making noises here. ;)
Re: [PATCH 3/4] nvme-pci: add device coredump support
On 5/4/19 11:26 PM, Akinobu Mita wrote: 2019年5月4日(土) 19:04 Minwoo Im : Hi, Akinobu, Regardless to reply of the cover, few nits here. On 5/2/19 5:59 PM, Akinobu Mita wrote: + +static const struct nvme_reg nvme_regs[] = { + { NVME_REG_CAP, "cap", 64 }, + { NVME_REG_VS, "version", 32 }, Why don't we just go with "vs" instead of full name of it just like the others. I tried to imitate the output of 'nvme show-regs'. Okay. + { NVME_REG_INTMS, "intms",32 }, + { NVME_REG_INTMC, "intmc",32 }, + { NVME_REG_CC, "cc", 32 }, + { NVME_REG_CSTS,"csts", 32 }, + { NVME_REG_NSSR,"nssr", 32 }, + { NVME_REG_AQA, "aqa", 32 }, + { NVME_REG_ASQ, "asq", 64 }, + { NVME_REG_ACQ, "acq", 64 }, + { NVME_REG_CMBLOC, "cmbloc", 32 }, + { NVME_REG_CMBSZ, "cmbsz",32 }, If it's going to support optional registers also, then we can have BP-related things (BPINFO, BPRSEL, BPMBL) here also. I'm going to change the register dump in binary format just like 'nvme show-regs -o binary' does. So we'll have registers from 00h to 4Fh. Got it. And now I can see those two commands `nvme show-regs` and `nvme show-regs -o binary` have different results for the register range. The binary output covers just 0x50 size, but it shows all the registers including BP-related things in normal && json format. Anyway, I'll prepare a patch for nvme-cli to support binary output format to cover BP things also. Thanks, for your reply.
Re: [PATCH 3/4] nvme-pci: add device coredump support
Hi, Akinobu, Regardless to reply of the cover, few nits here. On 5/2/19 5:59 PM, Akinobu Mita wrote: + +static const struct nvme_reg nvme_regs[] = { + { NVME_REG_CAP, "cap",64 }, + { NVME_REG_VS, "version",32 }, Why don't we just go with "vs" instead of full name of it just like the others. + { NVME_REG_INTMS, "intms", 32 }, + { NVME_REG_INTMC, "intmc", 32 }, + { NVME_REG_CC, "cc", 32 }, + { NVME_REG_CSTS,"csts", 32 }, + { NVME_REG_NSSR,"nssr", 32 }, + { NVME_REG_AQA, "aqa",32 }, + { NVME_REG_ASQ, "asq",64 }, + { NVME_REG_ACQ, "acq",64 }, + { NVME_REG_CMBLOC, "cmbloc", 32 }, + { NVME_REG_CMBSZ, "cmbsz", 32 }, If it's going to support optional registers also, then we can have BP-related things (BPINFO, BPRSEL, BPMBL) here also. Thanks,
Re: [PATCH 0/4] nvme-pci: support device coredump
Hi Akinobu, On 5/4/19 1:20 PM, Akinobu Mita wrote: 2019年5月3日(金) 21:20 Christoph Hellwig : On Fri, May 03, 2019 at 06:12:32AM -0600, Keith Busch wrote: Could you actually explain how the rest is useful? I personally have never encountered an issue where knowing these values would have helped: every device timeout always needed device specific internal firmware logs in my experience. I agree that the device specific internal logs like telemetry are the most useful. The memory dump of command queues and completion queues is not that powerful but helps to know what commands have been submitted before the controller goes wrong (IOW, it's sometimes not enough to know which commands are actually failed), and it can be parsed without vendor specific knowledge. I'm not pretty sure I can say that memory dump of queues are useless at all. As you mentioned, sometimes it's not enough to know which command has actually been failed because we might want to know what happened before and after the actual failure. But, the information of commands handled from device inside would be much more useful to figure out what happened because in case of multiple queues, the arbitration among them could not be represented by this memory dump. If the issue is reproducible, the nvme trace is the most powerful for this kind of information. The memory dump of the queues is not that powerful, but it can always be enabled by default. If the memory dump is a key to reproduce some issues, then it will be powerful to hand it to a vendor to solve it. But I'm afraid of it because the dump might not be able to give relative submitted times among the commands in queues. Yes. Also not that NVMe now has the 'device initiated telemetry' feauture, which is just a wired name for device coredump. Wiring that up so that we can easily provide that data to the device vendor would actually be pretty useful. This version of nvme coredump captures controller registers and each queue. So before resetting controller is a suitable time to capture these. If we'll capture other log pages in this mechanism, the coredump procedure will be splitted into two phases (before resetting controller and after resetting as soon as admin queue is available). I agree with that it would be nice if we have a information that might not be that powerful rather than nothing. But, could we request controller-initiated telemetry log page if supported by the controller to get the internal information at the point of failure like reset? If the dump is generated with the telemetry log page, I think it would be great to be a clue to solve the issue. Thanks,
RE: Re: [PATCH] nvmet: fix ptr_ret.cocci warnings
Sure, I will. Thanks, > -Original Message- > From: Linux-nvme [mailto:linux-nvme-boun...@lists.infradead.org] On > Behalf Of Chaitanya Kulkarni > Sent: Tuesday, April 30, 2019 7:14 AM > To: Minwoo Im > Cc: linux-kernel@vger.kernel.org; Christoph Hellwig; linux- > n...@lists.infradead.org; Sagi Grimberg > Subject: Re: [PATCH] nvmet: fix ptr_ret.cocci warnings > > Hi Minwoo, > > Can you please resend this patch with the suggested change ? > > On 04/29/2019 10:58 AM, kbuild test robot wrote: > > From: kbuild test robot > > > > drivers/nvme/target/discovery.c:375:1-3: WARNING: PTR_ERR_OR_ZERO > can be used > > > > > > Use PTR_ERR_OR_ZERO rather than if(IS_ERR(...)) + PTR_ERR > > > > Generated by: scripts/coccinelle/api/ptr_ret.cocci > > > > Fixes: 6b7e631b927c ("nvmet: return a specified error it subsys_alloc > > fails") > > CC: Minwoo Im > > Signed-off-by: kbuild test robot > > --- > > > > tree: https://git.kernel.org/pub/scm/linux/kernel/git/next/linux-next.git > master > > head: 3d17a1de96a233cf89bfbb5a77ebb1a05c420681 > > commit: 6b7e631b927ca1266b2695307ab71ed7764af75e [9188/10649] > nvmet: return a specified error it subsys_alloc fails > > > > discovery.c |4 +--- > > 1 file changed, 1 insertion(+), 3 deletions(-) > > > > --- a/drivers/nvme/target/discovery.c > > +++ b/drivers/nvme/target/discovery.c > > @@ -372,9 +372,7 @@ int __init nvmet_init_discovery(void) > > { > > nvmet_disc_subsys = > > nvmet_subsys_alloc(NVME_DISC_SUBSYS_NAME, > NVME_NQN_DISC); > > - if (IS_ERR(nvmet_disc_subsys)) > > - return PTR_ERR(nvmet_disc_subsys); > > - return 0; > > + return PTR_ERR_OR_ZERO(nvmet_disc_subsys); > > } > > > > void nvmet_exit_discovery(void) > > > > > ___ > Linux-nvme mailing list > linux-n...@lists.infradead.org > http://lists.infradead.org/mailman/listinfo/linux-nvme
RE: [PATCH 1/3] drivers: nvme: target: core: fix build break
This looks good to me. Reviewed-by: Minwoo Im > -Original Message- > From: Linux-nvme [mailto:linux-nvme-boun...@lists.infradead.org] On Behalf > Of Enrico Weigelt, metux IT consult > Sent: Wednesday, April 24, 2019 7:35 PM > To: linux-kernel@vger.kernel.org > Cc: ax...@kernel.dk; linux-...@vger.kernel.org; linux- > n...@lists.infradead.org > Subject: [PATCH 1/3] drivers: nvme: target: core: fix build break > > Build breaks: > > drivers/nvme/target/core.c: In function 'nvmet_req_alloc_sgl': > drivers/nvme/target/core.c:939:12: error: implicit declaration of \ > function 'sgl_alloc'; did you mean 'bio_alloc'? \ > [-Werror=implicit-function-declaration] > req->sg = sgl_alloc(req->transfer_len, GFP_KERNEL, &req->sg_cnt); > ^ > bio_alloc > drivers/nvme/target/core.c:939:10: warning: assignment makes pointer \ > from integer without a cast [-Wint-conversion] > req->sg = sgl_alloc(req->transfer_len, GFP_KERNEL, &req->sg_cnt); > ^ > drivers/nvme/target/core.c: In function 'nvmet_req_free_sgl': > drivers/nvme/target/core.c:952:3: error: implicit declaration of \ > function 'sgl_free'; did you mean 'ida_free'? [-Werror=implicit-function- > declaration] >sgl_free(req->sg); >^~~~ >ida_free > > Cause: > > 1. missing include to > 2. SGL_ALLOC needs to be enabled > > Therefore adding the missing include, as well as Kconfig dependency. > > Signed-off-by: Enrico Weigelt, metux IT consult > --- > drivers/nvme/target/Kconfig | 1 + > drivers/nvme/target/core.c | 1 + > 2 files changed, 2 insertions(+) > > diff --git a/drivers/nvme/target/Kconfig b/drivers/nvme/target/Kconfig > index d94f25c..3ef0a4e 100644 > --- a/drivers/nvme/target/Kconfig > +++ b/drivers/nvme/target/Kconfig > @@ -3,6 +3,7 @@ config NVME_TARGET > tristate "NVMe Target support" > depends on BLOCK > depends on CONFIGFS_FS > + select SGL_ALLOC > help > This enabled target side support for the NVMe protocol, that is > it allows the Linux kernel to implement NVMe subsystems and > diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c > index b3e765a..08851f5 100644 > --- a/drivers/nvme/target/core.c > +++ b/drivers/nvme/target/core.c > @@ -8,6 +8,7 @@ > #include > #include > #include > +#include > > #include "nvmet.h" > > -- > 1.9.1 > > > ___ > Linux-nvme mailing list > linux-n...@lists.infradead.org > http://lists.infradead.org/mailman/listinfo/linux-nvme
Re: [PATCH] nvme: determine the number of IO queues
On 4/18/19 10:38 PM, Aaron Ma wrote: On 4/18/19 9:33 PM, Minwoo Im wrote: If the controller returns error for that command, how can we assure that the controller would support a single I/O queue ? Make sense, I will keep *count = 0 in V2. IMHO, If you would like to set *count to 0, then what's gonna be V2? I guess if some device is failed, we can make it as a quirk. Thanks, Aaron
Re: [PATCH] nvme: determine the number of IO queues
On 4/18/19 9:52 PM, Aaron Ma wrote: On 4/18/19 8:13 PM, Minwoo Im wrote: Yes the IO queues number is 0's based, but driver would return error and remove the nvme device as dead. IMHO, if a controller indicates an error with this set_feature command, then we need to figure out why the controller was returning the error to host. If you really want to use at least a single queue to see an alive I/O queue, controller should not return the error because as you mentioned above, NCQA, NSQA will be returned as 0-based. If an error is there, that could mean that controller may not able to provide even a single queue for I/O. I was thinking about try to set 1 I/O queue in driver to try to probe NVME device. If it works, at least system can bootup to debug instead of just remove NVME device and kernel boot hang at loading rootfs. If the controller returns error for that command, how can we assure that the controller would support a single I/O queue ? If you still concern this 1 I/O queue I can still set it as *count = 0; At least we try all count, NVME device still failed to respond. Regards, Aaron Thanks, Minwoo Im
Re: [PATCH] nvme: determine the number of IO queues
On 4/18/19 3:21 PM, Aaron Ma wrote: On 4/18/19 1:33 AM, Maxim Levitsky wrote: On Wed, 2019-04-17 at 20:32 +0300, Maxim Levitsky wrote: On Wed, 2019-04-17 at 22:12 +0800, Aaron Ma wrote: Some controllers support limited IO queues, when over set the number, it will return invalid field error. Then NVME will be removed by driver. Find the max number of IO queues that controller supports. When it still got invalid result, set 1 IO queue at least to bring NVME online. To be honest a spec compliant device should not need this. The spec states: "Number of I/O Completion Queues Requested (NCQR): Indicates the number of I/O Completion Queues requested by software. This number does not include the Admin Completion Queue. A minimum of one queue shall be requested, reflecting that the minimum support is for one I/O Completion Queue. This is a 0’s based value. The maximum value that may be specified is 65,534 (i.e., 65,535 I/O Completion Queues). If the value specified is 65,535, the controller should return an error of Invalid Field in Command." This implies that you can ask for any value and the controller must not respond with an error, but rather indicate how many queues it supports. Maybe its better to add a quirk for the broken device, which needs this? Adding quirk only makes the code more complicated. This patch doesn't change the default behavior. Only handle the NVME error code. Yes the IO queues number is 0's based, but driver would return error and remove the nvme device as dead. IMHO, if a controller indicates an error with this set_feature command, then we need to figure out why the controller was returning the error to host. If you really want to use at least a single queue to see an alive I/O queue, controller should not return the error because as you mentioned above, NCQA, NSQA will be returned as 0-based. If an error is there, that could mean that controller may not able to provide even a single queue for I/O. Thanks, Minwoo Im So set it as 1 at least the NVME can be probed properly. Regards, Aaron I forgot to add the relevant paragraph: "Note: The value allocated may be smaller or larger than the number of queues requested, often in virtualized implementations. The controller may not have as many queues to allocate as are requested. Alternatively, the controller may have an allocation unit of queues (e.g. power of two) and may supply more queues to host software to satisfy its allocation unit." Best regards, Maxim Levitsky ___ Linux-nvme mailing list linux-n...@lists.infradead.org http://lists.infradead.org/mailman/listinfo/linux-nvme
Re: [PATCH v3 2/3] PCI: Samsung SM961/PM961 NVMe disable before FLR quirk
Hi Alex, On Tue, 2018-07-24 at 10:14 -0600, Alex Williamson wrote: > The Samsung SM961/PM961 (960 EVO) sometimes fails to return from FLR > with the PCI config space reading back as -1. A reproducible instance > of this behavior is resolved by clearing the enable bit in the NVMe > configuration register and waiting for the ready status to clear > (disabling the NVMe controller) prior to FLR. > > Signed-off-by: Alex Williamson > --- > drivers/pci/quirks.c | 83 > ++ > 1 file changed, 83 insertions(+) > > diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c > index e72c8742aafa..3899cdd2514b 100644 > --- a/drivers/pci/quirks.c > +++ b/drivers/pci/quirks.c > @@ -28,6 +28,7 @@ > #include > #include > #include > +#include > #include /* isa_dma_bridge_buggy */ > #include "pci.h" > > @@ -3669,6 +3670,87 @@ static int reset_chelsio_generic_dev(struct pci_dev > *dev, int probe) > #define PCI_DEVICE_ID_INTEL_IVB_M_VGA 0x0156 > #define PCI_DEVICE_ID_INTEL_IVB_M2_VGA 0x0166 > > +/* > + * The Samsung SM961/PM961 controller can sometimes enter a fatal state after > + * FLR where config space reads from the device return -1. We seem to be > + * able to avoid this condition if we disable the NVMe controller prior to > + * FLR. This quirk is generic for any NVMe class device requiring similar > + * assistance to quiesce the device prior to FLR. > + * > + * NVMe specification: https://nvmexpress.org/resources/specifications/ > + * Revision 1.0e: It seems too old version of NVMe specification. Do you have any special reason to comment the specified 1.0 version instead of 1.3 or something newer? > + *Chapter 2: Required and optional PCI config registers > + *Chapter 3: NVMe control registers > + *Chapter 7.3: Reset behavior > + */ > +static int nvme_disable_and_flr(struct pci_dev *dev, int probe) The name of this function seems able to be started with 'reset_' prefix just like other quirks for reset. What about reset_samsung_pm961 or something? > +{ > + void __iomem *bar; > + u16 cmd; > + u32 cfg; > + > + if (dev->class != PCI_CLASS_STORAGE_EXPRESS || > + !pcie_has_flr(dev) || !pci_resource_start(dev, 0)) > + return -ENOTTY; > + > + if (probe) > + return 0; > + > + bar = pci_iomap(dev, 0, NVME_REG_CC + sizeof(cfg)); > + if (!bar) > + return -ENOTTY; > + > + pci_read_config_word(dev, PCI_COMMAND, &cmd); > + pci_write_config_word(dev, PCI_COMMAND, cmd | PCI_COMMAND_MEMORY); > + > + cfg = readl(bar + NVME_REG_CC); > + > + /* Disable controller if enabled */ > + if (cfg & NVME_CC_ENABLE) { > + u64 cap = readq(bar + NVME_REG_CAP); > + unsigned long timeout; > + > + /* > + * Per nvme_disable_ctrl() skip shutdown notification as it > + * could complete commands to the admin queue. We only > intend > + * to quiesce the device before reset. > + */ > + cfg &= ~(NVME_CC_SHN_MASK | NVME_CC_ENABLE); > + > + writel(cfg, bar + NVME_REG_CC); > + > + /* > + * Some controllers require an additional delay here, see > + * NVME_QUIRK_DELAY_BEFORE_CHK_RDY. None of those are yet > + * supported by this quirk. > + */ > + > + /* Cap register provides max timeout in 500ms increments */ > + timeout = ((NVME_CAP_TIMEOUT(cap) + 1) * HZ / 2) + jiffies; > + > + for (;;) { > + u32 status = readl(bar + NVME_REG_CSTS); > + > + /* Ready status becomes zero on disable complete */ > + if (!(status & NVME_CSTS_RDY)) > + break; > + > + msleep(100); > + > + if (time_after(jiffies, timeout)) { > + pci_warn(dev, "Timeout waiting for NVMe ready > status to clear after disable\n"); > + break; > + } > + } > + } > + > + pci_iounmap(dev, bar); > + > + pcie_flr(dev); > + > + return 0; > +} > + > static const struct pci_dev_reset_methods pci_dev_reset_methods[] = { > { PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82599_SFP_VF, > reset_intel_82599_sfp_virtfn }, > @@ -3676,6 +3758,7 @@ static const struct pci_dev_reset_methods > pci_dev_reset_methods[] = { > reset_ivb_igd }, > { PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IVB_M2_VGA, > reset_ivb_igd }, > + { PCI_VENDOR_ID_SAMSUNG, 0xa804, nvme_disable_and_flr }, Why don't we just define a macro just like other DEVICE_IDs. (e.g. PCIE_DEVICE_ID_INTEL_82599_SFP_VF). #define PCI_DEVICE_ID_SAMSUNG_PM961 0xa804 > { PCI_VENDOR_ID_CHELSIO, PCI_ANY_ID, > reset_chelsio_generic_dev }, > { 0 } > > > _