Re: [PATCH] vhost: correct misleading printing information

2024-03-15 Thread Xianting Tian

It is a very minor fix; I think it can be applied.

On 2024/3/11 at 4:21 PM, Xianting Tian wrote:

It is the guest's avail index, not the used index, that has moved when we
need to print the error for '(vq->avail_idx - last_avail_idx) > vq->num',
so fix the message.

Signed-off-by: Xianting Tian 
---
  drivers/vhost/vhost.c | 2 +-
  1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index 045f666b4f12..1f3604c79394 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -2515,7 +2515,7 @@ int vhost_get_vq_desc(struct vhost_virtqueue *vq,
vq->avail_idx = vhost16_to_cpu(vq, avail_idx);
  
  		if (unlikely((u16)(vq->avail_idx - last_avail_idx) > vq->num)) {

-   vq_err(vq, "Guest moved used index from %u to %u",
+   vq_err(vq, "Guest moved avail index from %u to %u",
last_avail_idx, vq->avail_idx);
return -EFAULT;
}




[PATCH] vhost: correct misleading printing information

2024-03-11 Thread Xianting Tian
It is the guest's avail index, not the used index, that has moved when we
need to print the error for '(vq->avail_idx - last_avail_idx) > vq->num',
so fix the message.
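
For reference, the check relies on unsigned 16-bit wraparound arithmetic, so
it stays valid even after the index wraps past 65535. A minimal userspace
sketch of the same idiom (the values below are illustrative, not taken from
the driver):

	#include <stdint.h>
	#include <stdio.h>

	int main(void)
	{
		uint16_t last_avail_idx = 65530;  /* index recorded before the wrap */
		uint16_t avail_idx = 5;           /* guest advanced it past 65535 */
		uint16_t num = 256;               /* ring size */

		/* (u16)(avail - last_avail) is the distance even across the wrap */
		if ((uint16_t)(avail_idx - last_avail_idx) > num)
			printf("error: guest moved avail index too far\n");
		else
			printf("guest advanced avail index by %u\n",
			       (uint16_t)(avail_idx - last_avail_idx));
		return 0;
	}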

Signed-off-by: Xianting Tian 
---
 drivers/vhost/vhost.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index 045f666b4f12..1f3604c79394 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -2515,7 +2515,7 @@ int vhost_get_vq_desc(struct vhost_virtqueue *vq,
vq->avail_idx = vhost16_to_cpu(vq, avail_idx);
 
if (unlikely((u16)(vq->avail_idx - last_avail_idx) > vq->num)) {
-   vq_err(vq, "Guest moved used index from %u to %u",
+   vq_err(vq, "Guest moved avail index from %u to %u",
last_avail_idx, vq->avail_idx);
return -EFAULT;
}
-- 
2.17.1




[PATCH] virtio: remove export for virtio_config_{enable, disable}

2021-02-21 Thread Xianting Tian
virtio_config_enable() and virtio_config_disable() are only used inside
drivers/virtio/virtio.c, so there is no need to export the symbols.

Signed-off-by: Xianting Tian 
---
 drivers/virtio/virtio.c | 6 ++
 include/linux/virtio.h  | 2 --
 2 files changed, 2 insertions(+), 6 deletions(-)

diff --git a/drivers/virtio/virtio.c b/drivers/virtio/virtio.c
index 42e09cc..4b15c00 100644
--- a/drivers/virtio/virtio.c
+++ b/drivers/virtio/virtio.c
@@ -141,15 +141,14 @@ void virtio_config_changed(struct virtio_device *dev)
 }
 EXPORT_SYMBOL_GPL(virtio_config_changed);
 
-void virtio_config_disable(struct virtio_device *dev)
+static void virtio_config_disable(struct virtio_device *dev)
 {
spin_lock_irq(&dev->config_lock);
dev->config_enabled = false;
spin_unlock_irq(&dev->config_lock);
 }
-EXPORT_SYMBOL_GPL(virtio_config_disable);
 
-void virtio_config_enable(struct virtio_device *dev)
+static void virtio_config_enable(struct virtio_device *dev)
 {
spin_lock_irq(&dev->config_lock);
dev->config_enabled = true;
@@ -158,7 +157,6 @@ void virtio_config_enable(struct virtio_device *dev)
dev->config_change_pending = false;
spin_unlock_irq(&dev->config_lock);
 }
-EXPORT_SYMBOL_GPL(virtio_config_enable);
 
 void virtio_add_status(struct virtio_device *dev, unsigned int status)
 {
diff --git a/include/linux/virtio.h b/include/linux/virtio.h
index 55ea329..b1894e0 100644
--- a/include/linux/virtio.h
+++ b/include/linux/virtio.h
@@ -132,8 +132,6 @@ static inline struct virtio_device *dev_to_virtio(struct 
device *_dev)
 void virtio_break_device(struct virtio_device *dev);
 
 void virtio_config_changed(struct virtio_device *dev);
-void virtio_config_disable(struct virtio_device *dev);
-void virtio_config_enable(struct virtio_device *dev);
 int virtio_finalize_features(struct virtio_device *dev);
 #ifdef CONFIG_PM_SLEEP
 int virtio_device_freeze(struct virtio_device *dev);
-- 
1.8.3.1



[PATCH] virtio: remove export for virtio_config_{enable, disable}

2021-02-20 Thread Xianting Tian
virtio_config_enable() and virtio_config_disable() are only used inside
drivers/virtio/virtio.c, so there is no need to export the symbols.

Signed-off-by: Xianting Tian 
---
 drivers/virtio/virtio.c | 6 ++
 include/linux/virtio.h  | 2 --
 2 files changed, 2 insertions(+), 6 deletions(-)

diff --git a/drivers/virtio/virtio.c b/drivers/virtio/virtio.c
index 42e09cc..4b15c00 100644
--- a/drivers/virtio/virtio.c
+++ b/drivers/virtio/virtio.c
@@ -141,15 +141,14 @@ void virtio_config_changed(struct virtio_device *dev)
 }
 EXPORT_SYMBOL_GPL(virtio_config_changed);
 
-void virtio_config_disable(struct virtio_device *dev)
+static void virtio_config_disable(struct virtio_device *dev)
 {
spin_lock_irq(&dev->config_lock);
dev->config_enabled = false;
spin_unlock_irq(&dev->config_lock);
 }
-EXPORT_SYMBOL_GPL(virtio_config_disable);
 
-void virtio_config_enable(struct virtio_device *dev)
+static void virtio_config_enable(struct virtio_device *dev)
 {
spin_lock_irq(&dev->config_lock);
dev->config_enabled = true;
@@ -158,7 +157,6 @@ void virtio_config_enable(struct virtio_device *dev)
dev->config_change_pending = false;
spin_unlock_irq(&dev->config_lock);
 }
-EXPORT_SYMBOL_GPL(virtio_config_enable);
 
 void virtio_add_status(struct virtio_device *dev, unsigned int status)
 {
diff --git a/include/linux/virtio.h b/include/linux/virtio.h
index 55ea329..b1894e0 100644
--- a/include/linux/virtio.h
+++ b/include/linux/virtio.h
@@ -132,8 +132,6 @@ static inline struct virtio_device *dev_to_virtio(struct 
device *_dev)
 void virtio_break_device(struct virtio_device *dev);
 
 void virtio_config_changed(struct virtio_device *dev);
-void virtio_config_disable(struct virtio_device *dev);
-void virtio_config_enable(struct virtio_device *dev);
 int virtio_finalize_features(struct virtio_device *dev);
 #ifdef CONFIG_PM_SLEEP
 int virtio_device_freeze(struct virtio_device *dev);
-- 
1.8.3.1



[PATCH] virtio_mmio: fix one typo

2021-02-06 Thread Xianting Tian
Fix the typo 'there is are' to 'there are'.

Signed-off-by: Xianting Tian 
---
 drivers/virtio/virtio_mmio.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/virtio/virtio_mmio.c b/drivers/virtio/virtio_mmio.c
index 238383f..a286d22 100644
--- a/drivers/virtio/virtio_mmio.c
+++ b/drivers/virtio/virtio_mmio.c
@@ -126,7 +126,7 @@ static int vm_finalize_features(struct virtio_device *vdev)
/* Give virtio_ring a chance to accept features. */
vring_transport_features(vdev);
 
-   /* Make sure there is are no mixed devices */
+   /* Make sure there are no mixed devices */
if (vm_dev->version == 2 &&
!__virtio_test_bit(vdev, VIRTIO_F_VERSION_1)) {
dev_err(&vdev->dev, "New virtio-mmio devices (version 2) must 
provide VIRTIO_F_VERSION_1 feature!\n");
-- 
1.8.3.1



[PATCH] [v3] blk-mq-tag: make blk_mq_tag_busy() return void

2020-12-09 Thread Xianting Tian
No one cares about the return value of blk_mq_tag_busy() and
__blk_mq_tag_busy(), so make them return void.

Another change is to simplify blk_mq_tag_idle().

Signed-off-by: Xianting Tian 
Reviewed-by: Ming Lei 
---
 block/blk-mq-tag.c |  4 +---
 block/blk-mq-tag.h | 16 ++--
 2 files changed, 7 insertions(+), 13 deletions(-)

diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
index 9c92053e7..01c0bb1fb 100644
--- a/block/blk-mq-tag.c
+++ b/block/blk-mq-tag.c
@@ -21,7 +21,7 @@
  * to get tag when first time, the other shared-tag users could reserve
  * budget for it.
  */
-bool __blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx)
+void __blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx)
 {
if (blk_mq_is_sbitmap_shared(hctx->flags)) {
struct request_queue *q = hctx->queue;
@@ -35,8 +35,6 @@ bool __blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx)
!test_and_set_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state))
atomic_inc(&hctx->tags->active_queues);
}
-
-   return true;
 }
 
 /*
diff --git a/block/blk-mq-tag.h b/block/blk-mq-tag.h
index 7d3e6b333..4b4ccd794 100644
--- a/block/blk-mq-tag.h
+++ b/block/blk-mq-tag.h
@@ -60,23 +60,19 @@ enum {
BLK_MQ_TAG_MAX  = BLK_MQ_NO_TAG - 1,
 };
 
-extern bool __blk_mq_tag_busy(struct blk_mq_hw_ctx *);
+extern void __blk_mq_tag_busy(struct blk_mq_hw_ctx *);
 extern void __blk_mq_tag_idle(struct blk_mq_hw_ctx *);
 
-static inline bool blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx)
+static inline void blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx)
 {
-   if (!(hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED))
-   return false;
-
-   return __blk_mq_tag_busy(hctx);
+   if (hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED)
+   __blk_mq_tag_busy(hctx);
 }
 
 static inline void blk_mq_tag_idle(struct blk_mq_hw_ctx *hctx)
 {
-   if (!(hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED))
-   return;
-
-   __blk_mq_tag_idle(hctx);
+   if (hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED)
+   __blk_mq_tag_idle(hctx);
 }
 
 static inline bool blk_mq_tag_is_reserved(struct blk_mq_tags *tags,
-- 
2.17.1



[PATCH] [v2] blk-mq-tag: make blk_mq_tag_busy() return void

2020-12-09 Thread Xianting Tian
No one cares about the return value of blk_mq_tag_busy() and
__blk_mq_tag_busy(), so make them return void.

Another change is to simplify blk_mq_tag_idle().

Signed-off-by: Xianting Tian 
Reviewed-by: Ming Lei 
---
 block/blk-mq-tag.c |  4 ++--
 block/blk-mq-tag.h | 16 ++--
 2 files changed, 8 insertions(+), 12 deletions(-)

diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
index 9c92053e7..21ff7d156 100644
--- a/block/blk-mq-tag.c
+++ b/block/blk-mq-tag.c
@@ -21,7 +21,7 @@
  * to get tag when first time, the other shared-tag users could reserve
  * budget for it.
  */
-bool __blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx)
+void __blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx)
 {
if (blk_mq_is_sbitmap_shared(hctx->flags)) {
struct request_queue *q = hctx->queue;
@@ -36,7 +36,7 @@ bool __blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx)
atomic_inc(&hctx->tags->active_queues);
}
 
-   return true;
+   return;
 }
 
 /*
diff --git a/block/blk-mq-tag.h b/block/blk-mq-tag.h
index 7d3e6b333..4b4ccd794 100644
--- a/block/blk-mq-tag.h
+++ b/block/blk-mq-tag.h
@@ -60,23 +60,19 @@ enum {
BLK_MQ_TAG_MAX  = BLK_MQ_NO_TAG - 1,
 };
 
-extern bool __blk_mq_tag_busy(struct blk_mq_hw_ctx *);
+extern void __blk_mq_tag_busy(struct blk_mq_hw_ctx *);
 extern void __blk_mq_tag_idle(struct blk_mq_hw_ctx *);
 
-static inline bool blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx)
+static inline void blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx)
 {
-   if (!(hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED))
-   return false;
-
-   return __blk_mq_tag_busy(hctx);
+   if (hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED)
+   __blk_mq_tag_busy(hctx);
 }
 
 static inline void blk_mq_tag_idle(struct blk_mq_hw_ctx *hctx)
 {
-   if (!(hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED))
-   return;
-
-   __blk_mq_tag_idle(hctx);
+   if (hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED)
+   __blk_mq_tag_idle(hctx);
 }
 
 static inline bool blk_mq_tag_is_reserved(struct blk_mq_tags *tags,
-- 
2.17.1



[PATCH] blk-mq-tag: make blk_mq_tag_busy() return void

2020-12-08 Thread Xianting Tian
No one cares about the return value of blk_mq_tag_busy() and
__blk_mq_tag_busy(), so make them return void.

Signed-off-by: Xianting Tian 
---
 block/blk-mq-tag.c | 4 ++--
 block/blk-mq-tag.h | 8 
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
index 9c92053e7..21ff7d156 100644
--- a/block/blk-mq-tag.c
+++ b/block/blk-mq-tag.c
@@ -21,7 +21,7 @@
  * to get tag when first time, the other shared-tag users could reserve
  * budget for it.
  */
-bool __blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx)
+void __blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx)
 {
if (blk_mq_is_sbitmap_shared(hctx->flags)) {
struct request_queue *q = hctx->queue;
@@ -36,7 +36,7 @@ bool __blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx)
atomic_inc(&hctx->tags->active_queues);
}
 
-   return true;
+   return;
 }
 
 /*
diff --git a/block/blk-mq-tag.h b/block/blk-mq-tag.h
index 7d3e6b333..dd80e5a85 100644
--- a/block/blk-mq-tag.h
+++ b/block/blk-mq-tag.h
@@ -60,15 +60,15 @@ enum {
BLK_MQ_TAG_MAX  = BLK_MQ_NO_TAG - 1,
 };
 
-extern bool __blk_mq_tag_busy(struct blk_mq_hw_ctx *);
+extern void __blk_mq_tag_busy(struct blk_mq_hw_ctx *);
 extern void __blk_mq_tag_idle(struct blk_mq_hw_ctx *);
 
-static inline bool blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx)
+static inline void blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx)
 {
if (!(hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED))
-   return false;
+   return;
 
-   return __blk_mq_tag_busy(hctx);
+   __blk_mq_tag_busy(hctx);
 }
 
 static inline void blk_mq_tag_idle(struct blk_mq_hw_ctx *hctx)
-- 
2.17.1



[PATCH] sched/rt: Print curr when RT throttling activated

2020-12-03 Thread Xianting Tian
We may hit the issue that one RT thread occupies the CPU for 950ms out of
every 1s. The RT thread may be a business thread or some other unknown thread.

Currently, only the message "sched: RT throttling activated" is printed when
RT throttling happens. That makes it hard to know which RT thread is
responsible, and for further analysis we need to print more information.

This patch prints the current RT task when RT throttling is activated, which
helps identify the offending RT thread right away.
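
As a hypothetical reproducer (not part of this patch), a SCHED_FIFO task that
busy-loops will consume the RT runtime budget (950ms per 1s with the default
sched_rt_runtime_us/sched_rt_period_us settings) and trigger the message
above. A minimal sketch, to be run as root:

	#include <sched.h>
	#include <stdio.h>
	#include <string.h>

	int main(void)
	{
		struct sched_param sp;

		memset(&sp, 0, sizeof(sp));
		sp.sched_priority = 50;

		/* make this task a real-time FIFO task */
		if (sched_setscheduler(0, SCHED_FIFO, &sp)) {
			perror("sched_setscheduler");
			return 1;
		}

		/* busy loop: eats the RT runtime, RT throttling then kicks in */
		for (;;)
			;

		return 0;
	}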

Signed-off-by: Xianting Tian 
---
 kernel/sched/rt.c | 7 ---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index f215eea6a..8913f38cb 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -946,7 +946,7 @@ static inline int rt_se_prio(struct sched_rt_entity *rt_se)
return rt_task_of(rt_se)->prio;
 }
 
-static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq)
+static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq, struct task_struct 
*curr)
 {
u64 runtime = sched_rt_runtime(rt_rq);
 
@@ -970,7 +970,8 @@ static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq)
 */
if (likely(rt_b->rt_runtime)) {
rt_rq->rt_throttled = 1;
-   printk_deferred_once("sched: RT throttling 
activated\n");
+   printk_deferred_once("sched: RT throttling activated 
(curr: pid %d, comm %s)\n",
+   curr->pid, curr->comm);
} else {
/*
 * In case we did anyway, make it go away,
@@ -1026,7 +1027,7 @@ static void update_curr_rt(struct rq *rq)
if (sched_rt_runtime(rt_rq) != RUNTIME_INF) {
raw_spin_lock(&rt_rq->rt_runtime_lock);
rt_rq->rt_time += delta_exec;
-   if (sched_rt_runtime_exceeded(rt_rq))
+   if (sched_rt_runtime_exceeded(rt_rq, curr))
resched_curr(rq);
raw_spin_unlock(&rt_rq->rt_runtime_lock);
}
-- 
2.17.1



[PATCH] scsi: megaraid_sas: use spin_lock() in hard IRQ

2020-10-20 Thread Xianting Tian
Since we are already in hard IRQ context when running megasas_isr(), using
spin_lock() is enough, and it is faster than spin_lock_irqsave().
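
A kernel-context sketch of the pattern (illustrative only, not a standalone
program; the demo_dev structure and demo_isr name are hypothetical): inside a
handler that runs in hard-IRQ context, local interrupts are already disabled,
so saving and restoring the flags is redundant.

	static irqreturn_t demo_isr(int irq, void *devp)
	{
		struct demo_dev *dev = devp;	/* hypothetical device structure */

		/*
		 * Running in hard-IRQ context: local interrupts are already off,
		 * so spin_lock() is enough; spin_lock_irqsave() would only add
		 * the cost of saving and restoring the flags.
		 */
		spin_lock(&dev->lock);
		/* ... acknowledge and process the hardware event ... */
		spin_unlock(&dev->lock);

		return IRQ_HANDLED;
	}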

Signed-off-by: Xianting Tian 
---
 drivers/scsi/megaraid/megaraid_sas_base.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/drivers/scsi/megaraid/megaraid_sas_base.c 
b/drivers/scsi/megaraid/megaraid_sas_base.c
index 2b7e7b5f3..bd186254d 100644
--- a/drivers/scsi/megaraid/megaraid_sas_base.c
+++ b/drivers/scsi/megaraid/megaraid_sas_base.c
@@ -3977,15 +3977,14 @@ static irqreturn_t megasas_isr(int irq, void *devp)
 {
struct megasas_irq_context *irq_context = devp;
struct megasas_instance *instance = irq_context->instance;
-   unsigned long flags;
irqreturn_t rc;
 
if (atomic_read(&instance->fw_reset_no_pci_access))
return IRQ_HANDLED;
 
-   spin_lock_irqsave(&instance->hba_lock, flags);
+   spin_lock(&instance->hba_lock);
rc = megasas_deplete_reply_queue(instance, DID_OK);
-   spin_unlock_irqrestore(&instance->hba_lock, flags);
+   spin_unlock(&instance->hba_lock);
 
return rc;
 }
-- 
2.17.1



[PATCH] mm: bio_alloc never fails when set GFP_NOIO, GFP_KERNEL

2020-10-20 Thread Xianting Tian
bio_alloc() with __GFP_DIRECT_RECLAIM (which is included in GFP_NOIO and
GFP_KERNEL) never fails, as stated in the comments of bio_alloc_bioset().

So we can remove multiple unneeded NULL checks of bio_alloc() and simplify
the code.

The same has already been done in fs/ext4/readpage.c, fs/ext4/page-io.c,
fs/direct-io.c, and so forth.

Signed-off-by: Xianting Tian 
---
 mm/page_io.c | 31 +++
 1 file changed, 11 insertions(+), 20 deletions(-)

diff --git a/mm/page_io.c b/mm/page_io.c
index e485a6e8a..9215bb356 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -30,18 +30,20 @@ static struct bio *get_swap_bio(gfp_t gfp_flags,
struct page *page, bio_end_io_t end_io)
 {
struct bio *bio;
+   struct block_device *bdev;
 
+   /*
+* bio_alloc will _always_ be able to allocate a bio if
+* __GFP_DIRECT_RECLAIM is set, see comments for bio_alloc_bioset().
+*/
bio = bio_alloc(gfp_flags, 1);
-   if (bio) {
-   struct block_device *bdev;
+   bio->bi_iter.bi_sector = map_swap_page(page, &bdev);
+   bio_set_dev(bio, bdev);
+   bio->bi_iter.bi_sector <<= PAGE_SHIFT - 9;
+   bio->bi_end_io = end_io;
 
-   bio->bi_iter.bi_sector = map_swap_page(page, &bdev);
-   bio_set_dev(bio, bdev);
-   bio->bi_iter.bi_sector <<= PAGE_SHIFT - 9;
-   bio->bi_end_io = end_io;
+   bio_add_page(bio, page, thp_size(page), 0);
 
-   bio_add_page(bio, page, thp_size(page), 0);
-   }
return bio;
 }
 
@@ -351,19 +353,13 @@ int __swap_writepage(struct page *page, struct 
writeback_control *wbc,
 
ret = 0;
bio = get_swap_bio(GFP_NOIO, page, end_write_func);
-   if (bio == NULL) {
-   set_page_dirty(page);
-   unlock_page(page);
-   ret = -ENOMEM;
-   goto out;
-   }
bio->bi_opf = REQ_OP_WRITE | REQ_SWAP | wbc_to_write_flags(wbc);
bio_associate_blkg_from_page(bio, page);
count_swpout_vm_event(page);
set_page_writeback(page);
unlock_page(page);
submit_bio(bio);
-out:
+
return ret;
 }
 
@@ -416,11 +412,6 @@ int swap_readpage(struct page *page, bool synchronous)
 
ret = 0;
bio = get_swap_bio(GFP_KERNEL, page, end_swap_bio_read);
-   if (bio == NULL) {
-   unlock_page(page);
-   ret = -ENOMEM;
-   goto out;
-   }
disk = bio->bi_disk;
/*
 * Keep this task valid during swap readpage because the oom killer may
-- 
2.17.1



[PATCH] ext4: remove the null check of bio_vec page

2020-10-20 Thread Xianting Tian
bv_page can't be NULL in a valid bio_vec, so we can remove the NULL check,
as we did in other places when calling bio_for_each_segment_all() to go
through all the bio_vecs of a bio.

Signed-off-by: Xianting Tian 
---
 fs/ext4/page-io.c | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index defd2e10d..cb135a944 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -111,9 +111,6 @@ static void ext4_finish_bio(struct bio *bio)
unsigned under_io = 0;
unsigned long flags;
 
-   if (!page)
-   continue;
-
if (fscrypt_is_bounce_page(page)) {
bounce_page = page;
page = fscrypt_pagecache_page(bounce_page);
-- 
2.17.1



[PATCH] blk-mq: remove the calling of local_memory_node()

2020-10-19 Thread Xianting Tian
We don't need to check whether the node is a memoryless NUMA node before
calling the allocator interface. SLUB (and SLAB, SLOB) relies on the page
allocator to pick a node. The page allocator deals with memoryless nodes
just fine: it has zonelists constructed for each possible node and
automatically falls back to the node closest to the requested one, as long
as __GFP_THISNODE is not enforced, of course.

The code comment of kmem_cache_alloc_node() in SLAB also states this:
 * Fallback to other node is possible if __GFP_THISNODE is not set.

The blk-mq code doesn't set __GFP_THISNODE, so we can remove the call to
local_memory_node().
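
A kernel-context sketch of the difference (illustrative only, not a
standalone program; size and cpu are placeholders):

	int nid = cpu_to_node(cpu);	/* may be a memoryless node, e.g. on powerpc */

	/*
	 * Without __GFP_THISNODE the page allocator walks the node's zonelist
	 * and falls back to the nearest node that actually has memory.
	 */
	void *p = kzalloc_node(size, GFP_KERNEL, nid);

	/*
	 * With __GFP_THISNODE no fallback is allowed, so the allocation can
	 * fail on a memoryless node. blk-mq does not pass this flag.
	 */
	void *q = kzalloc_node(size, GFP_KERNEL | __GFP_THISNODE, nid);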

Fixes: bffed457160ab ("blk-mq: Avoid memoryless numa node encoded in hctx numa_node")

Signed-off-by: Xianting Tian 
---
 block/blk-mq-cpumap.c | 2 +-
 block/blk-mq.c| 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/block/blk-mq-cpumap.c b/block/blk-mq-cpumap.c
index 0157f2b34..3db84d319 100644
--- a/block/blk-mq-cpumap.c
+++ b/block/blk-mq-cpumap.c
@@ -89,7 +89,7 @@ int blk_mq_hw_queue_to_node(struct blk_mq_queue_map *qmap, 
unsigned int index)
 
for_each_possible_cpu(i) {
if (index == qmap->mq_map[i])
-   return local_memory_node(cpu_to_node(i));
+   return cpu_to_node(i);
}
 
return NUMA_NO_NODE;
diff --git a/block/blk-mq.c b/block/blk-mq.c
index cdced4aca..48f8366b2 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -2737,7 +2737,7 @@ static void blk_mq_init_cpu_queues(struct request_queue 
*q,
for (j = 0; j < set->nr_maps; j++) {
hctx = blk_mq_map_queue_type(q, j, i);
if (nr_hw_queues > 1 && hctx->numa_node == NUMA_NO_NODE)
-   hctx->numa_node = 
local_memory_node(cpu_to_node(i));
+   hctx->numa_node = cpu_to_node(i);
}
}
 }
-- 
2.17.1



[PATCH] gfs2: use helper macro abs()

2020-10-18 Thread Xianting Tian
Use the helper macro abs() to simplify the "x >= y || x <= -y" comparison.
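
For non-negative y the two forms are equivalent; a small userspace sketch
with illustrative values (the kernel's abs() handles the s64 type, here
llabs() stands in):

	#include <stdio.h>
	#include <stdlib.h>

	int main(void)
	{
		long long x = -120, y = 100;	/* illustrative values only */

		int old_form = (x >= y || x <= -y);
		int new_form = (llabs(x) >= y);

		/* for y >= 0 both forms evaluate identically */
		printf("old=%d new=%d\n", old_form, new_form);
		return 0;
	}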

Signed-off-by: Xianting Tian 
---
 fs/gfs2/super.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index 9f4d9e7be..05eb709de 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -304,7 +304,7 @@ void gfs2_statfs_change(struct gfs2_sbd *sdp, s64 total, 
s64 free,
if (sdp->sd_args.ar_statfs_percent) {
x = 100 * l_sc->sc_free;
y = m_sc->sc_free * sdp->sd_args.ar_statfs_percent;
-   if (x >= y || x <= -y)
+   if (abs(x) >= y)
need_sync = 1;
}
spin_unlock(&sdp->sd_statfs_spin);
-- 
2.17.1



[PATCH] mm: vmscan: avoid a unnecessary reschedule in shrink_slab()

2020-10-15 Thread Xianting Tian
In shrink_slab(), we go directly to the 'out' label only when we fail to
take shrinker_rwsem. In that case no real slab shrinking work is done, so
there is no need to trigger a reschedule with cond_resched().

Signed-off-by: Xianting Tian 
---
 mm/vmscan.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/mm/vmscan.c b/mm/vmscan.c
index 466fc3144..676e97b28 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -687,8 +687,9 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
}
 
up_read(&shrinker_rwsem);
-out:
+
cond_resched();
+out:
return freed;
 }
 
-- 
2.17.1



[PATCH] blk-mq: add helper function to test hctx inactive

2020-10-14 Thread Xianting Tian
Introduce the helper function blk_mq_hctx_inactive() to test
BLK_MQ_S_INACTIVE, as we have already done for BLK_MQ_S_STOPPED.

Signed-off-by: Xianting Tian 
---
 block/blk-mq-tag.c | 2 +-
 block/blk-mq.h | 5 +
 2 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
index 32d82e23b..3119572bc 100644
--- a/block/blk-mq-tag.c
+++ b/block/blk-mq-tag.c
@@ -153,7 +153,7 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)
 * Give up this allocation if the hctx is inactive.  The caller will
 * retry on an active hctx.
 */
-   if (unlikely(test_bit(BLK_MQ_S_INACTIVE, &data->hctx->state))) {
+   if (unlikely(blk_mq_hctx_inactive(data->hctx))) {
blk_mq_put_tag(tags, data->ctx, tag + tag_offset);
return BLK_MQ_NO_TAG;
}
diff --git a/block/blk-mq.h b/block/blk-mq.h
index 863a2f334..9813269c2 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -171,6 +171,11 @@ static inline bool blk_mq_hctx_stopped(struct 
blk_mq_hw_ctx *hctx)
return test_bit(BLK_MQ_S_STOPPED, &hctx->state);
 }
 
+static inline bool blk_mq_hctx_inactive(struct blk_mq_hw_ctx *hctx)
+{
+   return test_bit(BLK_MQ_S_INACTIVE, &hctx->state);
+}
+
 static inline bool blk_mq_hw_queue_mapped(struct blk_mq_hw_ctx *hctx)
 {
return hctx->nr_ctx && hctx->tags;
-- 
2.17.1



[PATCH] mm: Make allocator take care of memoryless numa node

2020-10-12 Thread Xianting Tian
On architectures like powerpc, we can have CPUs without any local memory
attached. In such cases the node does not have real memory.

In many places the current kernel code does not check whether the node is a
memoryless NUMA node before calling the allocator interface.

This patch uses local_memory_node(), which is guaranteed to have memory, in
the allocator interface. local_memory_node() is a no-op on architectures
that don't support memoryless nodes.

The call path is:
alloc_pages_node
    __alloc_pages_node
        __alloc_pages_nodemask
Since __alloc_pages_node() and __alloc_pages_nodemask() may also be called
directly, local_memory_node() is only added in __alloc_pages_nodemask().

Signed-off-by: Xianting Tian 
---
 include/linux/slab.h |  3 +++
 mm/page_alloc.c  |  1 +
 mm/slab.c|  6 +-
 mm/slob.c|  1 +
 mm/slub.c| 10 --
 5 files changed, 18 insertions(+), 3 deletions(-)

diff --git a/include/linux/slab.h b/include/linux/slab.h
index 24df2393e..527e811e0 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -574,6 +574,7 @@ static __always_inline void *kmalloc_node(size_t size, 
gfp_t flags, int node)
flags, node, size);
}
 #endif
+   node = local_memory_node(node);
return __kmalloc_node(size, flags, node);
 }
 
@@ -626,6 +627,8 @@ static inline void *kmalloc_array_node(size_t n, size_t 
size, gfp_t flags,
return NULL;
if (__builtin_constant_p(n) && __builtin_constant_p(size))
return kmalloc_node(bytes, flags, node);
+
+   node = local_memory_node(node);
return __kmalloc_node(bytes, flags, node);
 }
 
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 6866533de..be63c62c2 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -4878,6 +4878,7 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int 
order, int preferred_nid,
return NULL;
}
 
+   preferred_nid = local_memory_node(preferred_nid);
gfp_mask &= gfp_allowed_mask;
alloc_mask = gfp_mask;
if (!prepare_alloc_pages(gfp_mask, order, preferred_nid, nodemask, &ac, 
&alloc_mask, &alloc_flags))
diff --git a/mm/slab.c b/mm/slab.c
index f658e86ec..263c2f2e1 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -3575,7 +3575,10 @@ EXPORT_SYMBOL(kmem_cache_alloc_trace);
  */
 void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid)
 {
-   void *ret = slab_alloc_node(cachep, flags, nodeid, _RET_IP_);
+   void *ret;
+
+   nodeid = local_memory_node(nodeid);
+   ret = slab_alloc_node(cachep, flags, nodeid, _RET_IP_);
 
trace_kmem_cache_alloc_node(_RET_IP_, ret,
cachep->object_size, cachep->size,
@@ -3593,6 +3596,7 @@ void *kmem_cache_alloc_node_trace(struct kmem_cache 
*cachep,
 {
void *ret;
 
+   nodeid = local_memory_node(nodeid);
ret = slab_alloc_node(cachep, flags, nodeid, _RET_IP_);
 
ret = kasan_kmalloc(cachep, ret, size, flags);
diff --git a/mm/slob.c b/mm/slob.c
index 7cc9805c8..1f1c25e06 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -636,6 +636,7 @@ EXPORT_SYMBOL(__kmalloc_node);
 
 void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t gfp, int node)
 {
+   node = local_memory_node(node);
return slob_alloc_node(cachep, gfp, node);
 }
 EXPORT_SYMBOL(kmem_cache_alloc_node);
diff --git a/mm/slub.c b/mm/slub.c
index 6d3574013..6e5e12b04 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -2921,7 +2921,10 @@ EXPORT_SYMBOL(kmem_cache_alloc_trace);
 #ifdef CONFIG_NUMA
 void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags, int node)
 {
-   void *ret = slab_alloc_node(s, gfpflags, node, _RET_IP_);
+   void *ret;
+
+   node = local_memory_node(node);
+   ret = slab_alloc_node(s, gfpflags, node, _RET_IP_);
 
trace_kmem_cache_alloc_node(_RET_IP_, ret,
s->object_size, s->size, gfpflags, node);
@@ -2935,7 +2938,10 @@ void *kmem_cache_alloc_node_trace(struct kmem_cache *s,
gfp_t gfpflags,
int node, size_t size)
 {
-   void *ret = slab_alloc_node(s, gfpflags, node, _RET_IP_);
+   void *ret;
+
+   node = local_memory_node(node);
+   ret = slab_alloc_node(s, gfpflags, node, _RET_IP_);
 
trace_kmalloc_node(_RET_IP_, ret,
   size, s->size, gfpflags, node);
-- 
2.17.1



[PATCH] net: Avoid allocing memory on memoryless numa node

2020-10-10 Thread Xianting Tian
On architectures like powerpc, we can have CPUs without any local memory
attached. In such cases the node does not have real memory.

Use local_memory_node(), which is guaranteed to have memory.
local_memory_node() is a no-op on architectures that do not support
memoryless nodes.

Signed-off-by: Xianting Tian 
---
 net/core/dev.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/core/dev.c b/net/core/dev.c
index 266073e30..dcb4533ef 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2590,7 +2590,7 @@ static struct xps_map *expand_xps_map(struct xps_map 
*map, int attr_index,
new_map = kzalloc(XPS_MAP_SIZE(alloc_len), GFP_KERNEL);
else
new_map = kzalloc_node(XPS_MAP_SIZE(alloc_len), GFP_KERNEL,
-  cpu_to_node(attr_index));
+  
local_memory_node(cpu_to_node(attr_index)));
if (!new_map)
return NULL;
 
-- 
2.17.1



[PATCH] ext2: Remove unnecessary blank

2020-10-10 Thread Xianting Tian
Remove the unnecessary space before the opening parenthesis when calling
kmalloc_array().

Signed-off-by: Xianting Tian 
---
 fs/ext2/super.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index 7fab2b3b5..551e69755 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -1070,7 +1070,7 @@ static int ext2_fill_super(struct super_block *sb, void 
*data, int silent)
/ EXT2_BLOCKS_PER_GROUP(sb)) + 1;
db_count = (sbi->s_groups_count + EXT2_DESC_PER_BLOCK(sb) - 1) /
   EXT2_DESC_PER_BLOCK(sb);
-   sbi->s_group_desc = kmalloc_array (db_count,
+   sbi->s_group_desc = kmalloc_array(db_count,
   sizeof(struct buffer_head *),
   GFP_KERNEL);
if (sbi->s_group_desc == NULL) {
-- 
2.17.1



[PATCH] IB/hfi1: Avoid allocing memory on memoryless numa node

2020-10-10 Thread Xianting Tian
On architectures like powerpc, we can have CPUs without any local memory
attached. In such cases the node does not have real memory.

Use local_memory_node(), which is guaranteed to have memory.
local_memory_node() is a no-op on architectures that do not support
memoryless nodes.

Signed-off-by: Xianting Tian 
---
 drivers/infiniband/hw/hfi1/file_ops.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/infiniband/hw/hfi1/file_ops.c 
b/drivers/infiniband/hw/hfi1/file_ops.c
index 8ca51e43c..79fa22cc7 100644
--- a/drivers/infiniband/hw/hfi1/file_ops.c
+++ b/drivers/infiniband/hw/hfi1/file_ops.c
@@ -965,7 +965,7 @@ static int allocate_ctxt(struct hfi1_filedata *fd, struct 
hfi1_devdata *dd,
 */
fd->rec_cpu_num = hfi1_get_proc_affinity(dd->node);
if (fd->rec_cpu_num != -1)
-   numa = cpu_to_node(fd->rec_cpu_num);
+   numa = local_memory_node(cpu_to_node(fd->rec_cpu_num));
else
numa = numa_node_id();
ret = hfi1_create_ctxtdata(dd->pport, numa, &uctxt);
-- 
2.17.1



[PATCH] bpf: Avoid allocing memory on memoryless numa node

2020-10-10 Thread Xianting Tian
On architectures like powerpc, we can have CPUs without any local memory
attached. In such cases the node does not have real memory.

Use local_memory_node(), which is guaranteed to have memory.
local_memory_node() is a no-op on architectures that do not support
memoryless nodes.

Signed-off-by: Xianting Tian 
---
 kernel/bpf/cpumap.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c
index 6386b7bb9..2c885c00a 100644
--- a/kernel/bpf/cpumap.c
+++ b/kernel/bpf/cpumap.c
@@ -423,7 +423,7 @@ __cpu_map_entry_alloc(struct bpf_cpumap_val *value, u32 
cpu, int map_id)
struct xdp_bulk_queue *bq;
 
/* Have map->numa_node, but choose node of redirect target CPU */
-   numa = cpu_to_node(cpu);
+   numa = local_memory_node(cpu_to_node(cpu));
 
rcpu = kzalloc_node(sizeof(*rcpu), gfp, numa);
if (!rcpu)
-- 
2.17.1



[tip: sched/core] sched/fair: Remove the force parameter of update_tg_load_avg()

2020-09-29 Thread tip-bot2 for Xianting Tian
The following commit has been merged into the sched/core branch of tip:

Commit-ID: fe7491580d7c56152ea8d9d3124201191617435d
Gitweb:
https://git.kernel.org/tip/fe7491580d7c56152ea8d9d3124201191617435d
Author:Xianting Tian 
AuthorDate:Thu, 24 Sep 2020 09:47:55 +08:00
Committer: Peter Zijlstra 
CommitterDate: Fri, 25 Sep 2020 14:23:25 +02:00

sched/fair: Remove the force parameter of update_tg_load_avg()

In the file fair.c, sometimes update_tg_load_avg(cfs_rq, 0) is used and
sometimes update_tg_load_avg(cfs_rq, false) is used.
update_tg_load_avg() has a 'force' parameter, but the current code never
passes 1 or true for it, so remove the force parameter.

Signed-off-by: Xianting Tian 
Signed-off-by: Peter Zijlstra (Intel) 
Link: https://lkml.kernel.org/r/20200924014755.36253-1-tian.xiant...@h3c.com
---
 kernel/sched/fair.c | 19 +--
 1 file changed, 9 insertions(+), 10 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 9613e5d..b56276a 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -831,7 +831,7 @@ void init_entity_runnable_average(struct sched_entity *se)
 void post_init_entity_util_avg(struct task_struct *p)
 {
 }
-static void update_tg_load_avg(struct cfs_rq *cfs_rq, int force)
+static void update_tg_load_avg(struct cfs_rq *cfs_rq)
 {
 }
 #endif /* CONFIG_SMP */
@@ -3293,7 +3293,6 @@ static inline void cfs_rq_util_change(struct cfs_rq 
*cfs_rq, int flags)
 /**
  * update_tg_load_avg - update the tg's load avg
  * @cfs_rq: the cfs_rq whose avg changed
- * @force: update regardless of how small the difference
  *
  * This function 'ensures': tg->load_avg := \Sum tg->cfs_rq[]->avg.load.
  * However, because tg->load_avg is a global value there are performance
@@ -3305,7 +3304,7 @@ static inline void cfs_rq_util_change(struct cfs_rq 
*cfs_rq, int flags)
  *
  * Updating tg's load_avg is necessary before update_cfs_share().
  */
-static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force)
+static inline void update_tg_load_avg(struct cfs_rq *cfs_rq)
 {
long delta = cfs_rq->avg.load_avg - cfs_rq->tg_load_avg_contrib;
 
@@ -3315,7 +3314,7 @@ static inline void update_tg_load_avg(struct cfs_rq 
*cfs_rq, int force)
if (cfs_rq->tg == &root_task_group)
return;
 
-   if (force || abs(delta) > cfs_rq->tg_load_avg_contrib / 64) {
+   if (abs(delta) > cfs_rq->tg_load_avg_contrib / 64) {
atomic_long_add(delta, &cfs_rq->tg->load_avg);
cfs_rq->tg_load_avg_contrib = cfs_rq->avg.load_avg;
}
@@ -3617,7 +3616,7 @@ static inline bool skip_blocked_update(struct 
sched_entity *se)
 
 #else /* CONFIG_FAIR_GROUP_SCHED */
 
-static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) {}
+static inline void update_tg_load_avg(struct cfs_rq *cfs_rq) {}
 
 static inline int propagate_entity_load_avg(struct sched_entity *se)
 {
@@ -3805,13 +3804,13 @@ static inline void update_load_avg(struct cfs_rq 
*cfs_rq, struct sched_entity *s
 * IOW we're enqueueing a task on a new CPU.
 */
attach_entity_load_avg(cfs_rq, se);
-   update_tg_load_avg(cfs_rq, 0);
+   update_tg_load_avg(cfs_rq);
 
} else if (decayed) {
cfs_rq_util_change(cfs_rq, 0);
 
if (flags & UPDATE_TG)
-   update_tg_load_avg(cfs_rq, 0);
+   update_tg_load_avg(cfs_rq);
}
 }
 
@@ -7898,7 +7897,7 @@ static bool __update_blocked_fair(struct rq *rq, bool 
*done)
struct sched_entity *se;
 
if (update_cfs_rq_load_avg(cfs_rq_clock_pelt(cfs_rq), cfs_rq)) {
-   update_tg_load_avg(cfs_rq, 0);
+   update_tg_load_avg(cfs_rq);
 
if (cfs_rq == &rq->cfs)
decayed = true;
@@ -10797,7 +10796,7 @@ static void detach_entity_cfs_rq(struct sched_entity 
*se)
/* Catch up with the cfs_rq and remove our load when we leave */
update_load_avg(cfs_rq, se, 0);
detach_entity_load_avg(cfs_rq, se);
-   update_tg_load_avg(cfs_rq, false);
+   update_tg_load_avg(cfs_rq);
propagate_entity_cfs_rq(se);
 }
 
@@ -10816,7 +10815,7 @@ static void attach_entity_cfs_rq(struct sched_entity 
*se)
/* Synchronize entity with its cfs_rq */
update_load_avg(cfs_rq, se, sched_feat(ATTACH_AGE_LOAD) ? 0 : 
SKIP_AGE_LOAD);
attach_entity_load_avg(cfs_rq, se);
-   update_tg_load_avg(cfs_rq, false);
+   update_tg_load_avg(cfs_rq);
propagate_entity_cfs_rq(se);
 }
 


[PATCH] [v3] blk-mq: add cond_resched() in __blk_mq_alloc_rq_maps()

2020-09-25 Thread Xianting Tian
We found that blk_mq_alloc_rq_maps() takes more time in kernel space when
testing nvme device hot-plugging. The test and analysis are as below.

Debug code,
1, blk_mq_alloc_rq_maps():
u64 start, end;
depth = set->queue_depth;
start = ktime_get_ns();
pr_err("[%d:%s switch:%ld,%ld] queue depth %d, nr_hw_queues %d\n",
current->pid, current->comm, current->nvcsw, 
current->nivcsw,
set->queue_depth, set->nr_hw_queues);
do {
err = __blk_mq_alloc_rq_maps(set);
if (!err)
break;

set->queue_depth >>= 1;
if (set->queue_depth < set->reserved_tags + BLK_MQ_TAG_MIN) {
err = -ENOMEM;
break;
}
} while (set->queue_depth);
end = ktime_get_ns();
pr_err("[%d:%s switch:%ld,%ld] all hw queues init cost time %lld ns\n",
current->pid, current->comm,
current->nvcsw, current->nivcsw, end - start);

2, __blk_mq_alloc_rq_maps():
u64 start, end;
for (i = 0; i < set->nr_hw_queues; i++) {
start = ktime_get_ns();
if (!__blk_mq_alloc_rq_map(set, i))
goto out_unwind;
end = ktime_get_ns();
pr_err("hw queue %d init cost time %lld ns\n", i, end - start);
}

Testing nvme hot-plugging with the above debug code, we found it costs more
than 3ms in total in kernel space, without being scheduled out, to allocate
rqs for all 16 hw queues with depth 1023; each hw queue costs about
140-250us. The cost grows as the hw queue number and queue depth increase.
And in an extreme case, if __blk_mq_alloc_rq_maps() returns -ENOMEM, it will
retry with "queue_depth >>= 1" and even more time will be consumed.
[  428.428771] nvme nvme0: pci function 1:01:00.0
[  428.428798] nvme 1:01:00.0: enabling device ( -> 0002)
[  428.428806] pcieport 1:00:00.0: can't derive routing for PCI INT 
A
[  428.428809] nvme 1:01:00.0: PCI INT A: no GSI
[  432.593374] [4688:kworker/u33:8 switch:663,2] queue depth 30, 
nr_hw_queues 1
[  432.593404] hw queue 0 init cost time 22883 ns
[  432.593408] [4688:kworker/u33:8 switch:663,2] all hw queues init 
cost time 35960 ns
[  432.595953] nvme nvme0: 16/0/0 default/read/poll queues
[  432.595958] [4688:kworker/u33:8 switch:700,2] queue depth 1023, 
nr_hw_queues 16
[  432.596203] hw queue 0 init cost time 242630 ns
[  432.596441] hw queue 1 init cost time 235913 ns
[  432.596659] hw queue 2 init cost time 216461 ns
[  432.596877] hw queue 3 init cost time 215851 ns
[  432.597107] hw queue 4 init cost time 228406 ns
[  432.597336] hw queue 5 init cost time 227298 ns
[  432.597564] hw queue 6 init cost time 224633 ns
[  432.597785] hw queue 7 init cost time 219954 ns
[  432.597937] hw queue 8 init cost time 150930 ns
[  432.598082] hw queue 9 init cost time 143496 ns
[  432.598231] hw queue 10 init cost time 147261 ns
[  432.598397] hw queue 11 init cost time 164522 ns
[  432.598542] hw queue 12 init cost time 143401 ns
[  432.598692] hw queue 13 init cost time 148934 ns
[  432.598841] hw queue 14 init cost time 147194 ns
[  432.598991] hw queue 15 init cost time 148942 ns
[  432.598993] [4688:kworker/u33:8 switch:700,2] all hw queues init 
cost time 3035099 ns
[  432.602611]  nvme0n1: p1

So use this patch to trigger a schedule between each hw queue init, to avoid
other threads getting stuck. We are not in atomic context when executing
__blk_mq_alloc_rq_maps(), so it is safe to call cond_resched().

Signed-off-by: Xianting Tian 
---
 block/blk-mq.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/block/blk-mq.c b/block/blk-mq.c
index b3d2785ee..62d152d03 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -3256,9 +3256,11 @@ static int __blk_mq_alloc_rq_maps(struct blk_mq_tag_set 
*set)
 {
int i;
 
-   for (i = 0; i < set->nr_hw_queues; i++)
+   for (i = 0; i < set->nr_hw_queues; i++) {
if (!__blk_mq_alloc_map_and_request(set, i))
goto out_unwind;
+   cond_resched();
+   }
 
return 0;
 
-- 
2.17.1



[PATCH] sched/fair: Remove the force parameter of update_tg_load_avg()

2020-09-23 Thread Xianting Tian
In the file fair.c, sometimes update_tg_load_avg(cfs_rq, 0) is used and
sometimes update_tg_load_avg(cfs_rq, false) is used.
update_tg_load_avg() has a 'force' parameter, but the current code never
passes 1 or true for it, so remove the force parameter.

Signed-off-by: Xianting Tian 
---
 kernel/sched/fair.c | 19 +--
 1 file changed, 9 insertions(+), 10 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 1a68a0536..7056fa97f 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -831,7 +831,7 @@ void init_entity_runnable_average(struct sched_entity *se)
 void post_init_entity_util_avg(struct task_struct *p)
 {
 }
-static void update_tg_load_avg(struct cfs_rq *cfs_rq, int force)
+static void update_tg_load_avg(struct cfs_rq *cfs_rq)
 {
 }
 #endif /* CONFIG_SMP */
@@ -3288,7 +3288,6 @@ static inline void cfs_rq_util_change(struct cfs_rq 
*cfs_rq, int flags)
 /**
  * update_tg_load_avg - update the tg's load avg
  * @cfs_rq: the cfs_rq whose avg changed
- * @force: update regardless of how small the difference
  *
  * This function 'ensures': tg->load_avg := \Sum tg->cfs_rq[]->avg.load.
  * However, because tg->load_avg is a global value there are performance
@@ -3300,7 +3299,7 @@ static inline void cfs_rq_util_change(struct cfs_rq 
*cfs_rq, int flags)
  *
  * Updating tg's load_avg is necessary before update_cfs_share().
  */
-static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force)
+static inline void update_tg_load_avg(struct cfs_rq *cfs_rq)
 {
long delta = cfs_rq->avg.load_avg - cfs_rq->tg_load_avg_contrib;
 
@@ -3310,7 +3309,7 @@ static inline void update_tg_load_avg(struct cfs_rq 
*cfs_rq, int force)
if (cfs_rq->tg == &root_task_group)
return;
 
-   if (force || abs(delta) > cfs_rq->tg_load_avg_contrib / 64) {
+   if (abs(delta) > cfs_rq->tg_load_avg_contrib / 64) {
atomic_long_add(delta, &cfs_rq->tg->load_avg);
cfs_rq->tg_load_avg_contrib = cfs_rq->avg.load_avg;
}
@@ -3612,7 +3611,7 @@ static inline bool skip_blocked_update(struct 
sched_entity *se)
 
 #else /* CONFIG_FAIR_GROUP_SCHED */
 
-static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) {}
+static inline void update_tg_load_avg(struct cfs_rq *cfs_rq) {}
 
 static inline int propagate_entity_load_avg(struct sched_entity *se)
 {
@@ -3800,13 +3799,13 @@ static inline void update_load_avg(struct cfs_rq 
*cfs_rq, struct sched_entity *s
 * IOW we're enqueueing a task on a new CPU.
 */
attach_entity_load_avg(cfs_rq, se);
-   update_tg_load_avg(cfs_rq, 0);
+   update_tg_load_avg(cfs_rq);
 
} else if (decayed) {
cfs_rq_util_change(cfs_rq, 0);
 
if (flags & UPDATE_TG)
-   update_tg_load_avg(cfs_rq, 0);
+   update_tg_load_avg(cfs_rq);
}
 }
 
@@ -7887,7 +7886,7 @@ static bool __update_blocked_fair(struct rq *rq, bool 
*done)
struct sched_entity *se;
 
if (update_cfs_rq_load_avg(cfs_rq_clock_pelt(cfs_rq), cfs_rq)) {
-   update_tg_load_avg(cfs_rq, 0);
+   update_tg_load_avg(cfs_rq);
 
if (cfs_rq == &rq->cfs)
decayed = true;
@@ -10786,7 +10785,7 @@ static void detach_entity_cfs_rq(struct sched_entity 
*se)
/* Catch up with the cfs_rq and remove our load when we leave */
update_load_avg(cfs_rq, se, 0);
detach_entity_load_avg(cfs_rq, se);
-   update_tg_load_avg(cfs_rq, false);
+   update_tg_load_avg(cfs_rq);
propagate_entity_cfs_rq(se);
 }
 
@@ -10805,7 +10804,7 @@ static void attach_entity_cfs_rq(struct sched_entity 
*se)
/* Synchronize entity with its cfs_rq */
update_load_avg(cfs_rq, se, sched_feat(ATTACH_AGE_LOAD) ? 0 : 
SKIP_AGE_LOAD);
attach_entity_load_avg(cfs_rq, se);
-   update_tg_load_avg(cfs_rq, false);
+   update_tg_load_avg(cfs_rq);
propagate_entity_cfs_rq(se);
 }
 
-- 
2.17.1



[PATCH] sched/fair: Use bool parameter for update_tg_load_avg()

2020-09-23 Thread Xianting Tian
In the file fair.c, sometimes update_tg_load_avg(cfs_rq, 0) is used and
sometimes update_tg_load_avg(cfs_rq, false) is used, so change it to use a
bool parameter.

Signed-off-by: Xianting Tian 
---
 kernel/sched/fair.c | 12 ++--
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 1a68a0536..61dac1c58 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -831,7 +831,7 @@ void init_entity_runnable_average(struct sched_entity *se)
 void post_init_entity_util_avg(struct task_struct *p)
 {
 }
-static void update_tg_load_avg(struct cfs_rq *cfs_rq, int force)
+static void update_tg_load_avg(struct cfs_rq *cfs_rq, bool force)
 {
 }
 #endif /* CONFIG_SMP */
@@ -3300,7 +3300,7 @@ static inline void cfs_rq_util_change(struct cfs_rq 
*cfs_rq, int flags)
  *
  * Updating tg's load_avg is necessary before update_cfs_share().
  */
-static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force)
+static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, bool force)
 {
long delta = cfs_rq->avg.load_avg - cfs_rq->tg_load_avg_contrib;
 
@@ -3612,7 +3612,7 @@ static inline bool skip_blocked_update(struct 
sched_entity *se)
 
 #else /* CONFIG_FAIR_GROUP_SCHED */
 
-static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) {}
+static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, bool force) {}
 
 static inline int propagate_entity_load_avg(struct sched_entity *se)
 {
@@ -3800,13 +3800,13 @@ static inline void update_load_avg(struct cfs_rq 
*cfs_rq, struct sched_entity *s
 * IOW we're enqueueing a task on a new CPU.
 */
attach_entity_load_avg(cfs_rq, se);
-   update_tg_load_avg(cfs_rq, 0);
+   update_tg_load_avg(cfs_rq, false);
 
} else if (decayed) {
cfs_rq_util_change(cfs_rq, 0);
 
if (flags & UPDATE_TG)
-   update_tg_load_avg(cfs_rq, 0);
+   update_tg_load_avg(cfs_rq, false);
}
 }
 
@@ -7887,7 +7887,7 @@ static bool __update_blocked_fair(struct rq *rq, bool 
*done)
struct sched_entity *se;
 
if (update_cfs_rq_load_avg(cfs_rq_clock_pelt(cfs_rq), cfs_rq)) {
-   update_tg_load_avg(cfs_rq, 0);
+   update_tg_load_avg(cfs_rq, false);
 
if (cfs_rq == &rq->cfs)
decayed = true;
-- 
2.17.1



[PATCH] [v2] nvme: replace meaningless judgement by checking whether req is null

2020-09-21 Thread Xianting Tian
Currently we use nvmeq->q_depth as the upper limit for a valid tag in
nvme_handle_cqe(), which is not correct, because the number of available
tags is recorded in the tagset and is not equal to nvmeq->q_depth.

The nvme driver registers interrupts for queues before initializing the
tagset, because it uses the number of successful request_irq() calls to
configure the tagset parameters. This allows a race condition with the
current tag validity check if the controller happens to produce an
interrupt with a corrupted CQE before the tagset is initialized.

Replace the driver's indirect tag check with the one already provided by
the block layer. With this patch, we can avoid a NULL pointer dereference
as below.

[ 1124.256246] nvme nvme5: pci function :e1:00.0
[ 1124.256323] nvme :e1:00.0: enabling device ( -> 0002)
[ 1125.720859] nvme nvme5: 96/0/0 default/read/poll queues
[ 1125.732483]  nvme5n1: p1 p2 p3
[ 1125.788049] BUG: unable to handle kernel NULL pointer dereference at 
0130
[ 1125.788054] PGD 0 P4D 0
[ 1125.788057] Oops: 0002 [#1] SMP NOPTI
[ 1125.788059] CPU: 50 PID: 0 Comm: swapper/50 Kdump: loaded Tainted: G
--- -t - 
4.18.0-147.el8.x86_64 #1
[ 1125.788065] RIP: 0010:nvme_irq+0xe8/0x240 [nvme]
[ 1125.788068] RSP: 0018:916b8ec83ed0 EFLAGS: 00010813
[ 1125.788069] RAX:  RBX: 918ae9211b00 RCX: 

[ 1125.788070] RDX: 400b RSI:  RDI: 

[ 1125.788071] RBP: 918ae887 R08: 0004 R09: 
918ae887
[ 1125.788072] R10:  R11:  R12: 

[ 1125.788073] R13: 0001 R14:  R15: 
0001
[ 1125.788075] FS:  () GS:916b8ec8() 
knlGS:
[ 1125.788075] CS:  0010 DS:  ES:  CR0: 80050033
[ 1125.788076] CR2: 0130 CR3: 001768f0 CR4: 
00340ee0
[ 1125.788077] Call Trace:
[ 1125.788080]  
[ 1125.788085]  __handle_irq_event_percpu+0x40/0x180
[ 1125.788087]  handle_irq_event_percpu+0x30/0x80
[ 1125.788089]  handle_irq_event+0x36/0x53
[ 1125.788090]  handle_edge_irq+0x82/0x190
[ 1125.788094]  handle_irq+0xbf/0x100
[ 1125.788098]  do_IRQ+0x49/0xd0
[ 1125.788100]  common_interrupt+0xf/0xf

Signed-off-by: Xianting Tian 
---
 drivers/nvme/host/pci.c | 14 +++---
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 899d2f4d7..f7cf01fc7 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -940,13 +940,6 @@ static inline void nvme_handle_cqe(struct nvme_queue 
*nvmeq, u16 idx)
struct nvme_completion *cqe = &nvmeq->cqes[idx];
struct request *req;
 
-   if (unlikely(cqe->command_id >= nvmeq->q_depth)) {
-   dev_warn(nvmeq->dev->ctrl.device,
-   "invalid id %d completed on queue %d\n",
-   cqe->command_id, le16_to_cpu(cqe->sq_id));
-   return;
-   }
-
/*
 * AEN requests are special as they don't time out and can
 * survive any kind of queue freeze and often don't respond to
@@ -960,6 +953,13 @@ static inline void nvme_handle_cqe(struct nvme_queue 
*nvmeq, u16 idx)
}
 
req = blk_mq_tag_to_rq(nvme_queue_tagset(nvmeq), cqe->command_id);
+   if (unlikely(!req)) {
+   dev_warn(nvmeq->dev->ctrl.device,
+   "invalid id %d completed on queue %d\n",
+   cqe->command_id, le16_to_cpu(cqe->sq_id));
+   return;
+   }
+
trace_nvme_sq(req, cqe->sq_head, nvmeq->sq_tail);
if (!nvme_try_complete_req(req, cqe->status, cqe->result))
nvme_pci_complete_rq(req);
-- 
2.17.1



[PATCH] nvme: replace meaningless judgement by checking whether req is null

2020-09-20 Thread Xianting Tian
The check "if (unlikely(cqe->command_id >= nvmeq->q_depth))"
in nvme_handle_cqe() is useless: we can still get a NULL pointer, which is
returned by blk_mq_tag_to_rq():
[   16.649973] nvme nvme0: command_id 968 completed on queue 13,
               nvmeq q_depth 1024, nvme tagset q_depth 1023, admin q_depth 30
[   16.649974] tag, nr_tags:968 1023

In blk_mq_alloc_map_and_requests(), the tagset depth is halved with
'set->queue_depth >>= 1' if there is not enough memory for the rqs. If this
happens, the real number of available tags (nr_tags) is much smaller than
nvmeq->q_depth, so the check "if (unlikely(cqe->command_id >= nvmeq->q_depth))"
is really meaningless.

The nvme admin queue has the same issue: its nvmeq->q_depth is 32, but the
tagset depth is 30:
[7.489345] nvme nvme2: command id 24 completed on queue 0,
nvmeq q_depth 32, nvme tagset q_depth 0, admin q_depth 
30
[7.489347] tag, nr_tags:24 30

This patch removes the meaningless check: instead we check whether the
returned req is NULL and, if it is, return directly. With this patch we can
avoid a potential NULL pointer dereference.

Signed-off-by: Xianting Tian 
---
 drivers/nvme/host/pci.c | 14 +++---
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 899d2f4d7..18a857b59 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -940,13 +940,6 @@ static inline void nvme_handle_cqe(struct nvme_queue 
*nvmeq, u16 idx)
struct nvme_completion *cqe = &nvmeq->cqes[idx];
struct request *req;
 
-   if (unlikely(cqe->command_id >= nvmeq->q_depth)) {
-   dev_warn(nvmeq->dev->ctrl.device,
-   "invalid id %d completed on queue %d\n",
-   cqe->command_id, le16_to_cpu(cqe->sq_id));
-   return;
-   }
-
/*
 * AEN requests are special as they don't time out and can
 * survive any kind of queue freeze and often don't respond to
@@ -960,6 +953,13 @@ static inline void nvme_handle_cqe(struct nvme_queue 
*nvmeq, u16 idx)
}
 
req = blk_mq_tag_to_rq(nvme_queue_tagset(nvmeq), cqe->command_id);
+   if (unlikely(!req)) {
+   dev_warn(nvmeq->dev->ctrl.device,
+   "req is null for tag %d completed on queue %d\n",
+   cqe->command_id, le16_to_cpu(cqe->sq_id));
+   return;
+   }
+
trace_nvme_sq(req, cqe->sq_head, nvmeq->sq_tail);
if (!nvme_try_complete_req(req, cqe->status, cqe->result))
nvme_pci_complete_rq(req);
-- 
2.17.1



[PATCH] [v2] nvme: use correct upper limit for tag in nvme_handle_cqe()

2020-09-18 Thread Xianting Tian
The check "if (unlikely(cqe->command_id >= nvmeq->q_depth))"
in nvme_handle_cqe() is useless: we can still get a NULL pointer, which is
returned by blk_mq_tag_to_rq():
[   16.649973] nvme nvme0: command_id 968 completed on queue 13, nvmeq
               q_depth 1024, nvme tagset q_depth 1023, admin q_depth 30
[   16.649974] tag, nr_tags:968 1023

The nvme admin queue has the same issue:
[7.489345] nvme nvme2: command id 24 completed on queue 0, nvmeq 
q_depth 32, nvme tagset q_depth 0, admin q_depth 30
[7.489347] tag, nr_tags:24 30

This patch checks command_id against its correct upper limit,
'nvmeq->dev->tagset.queue_depth' or 'nvmeq->dev->admin_tagset.queue_depth',
not nvmeq->q_depth. With this patch we can avoid a potential NULL pointer
dereference.

Signed-off-by: Xianting Tian 
---
 drivers/nvme/host/pci.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 899d2f4d7..c0ae9d511 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -940,7 +940,9 @@ static inline void nvme_handle_cqe(struct nvme_queue 
*nvmeq, u16 idx)
struct nvme_completion *cqe = &nvmeq->cqes[idx];
struct request *req;
 
-   if (unlikely(cqe->command_id >= nvmeq->q_depth)) {
+   if (unlikely(cqe->command_id >=
+   nvmeq->qid ? nvmeq->dev->tagset.queue_depth :
+   nvmeq->dev->admin_tagset.queue_depth)) {
dev_warn(nvmeq->dev->ctrl.device,
"invalid id %d completed on queue %d\n",
cqe->command_id, le16_to_cpu(cqe->sq_id));
-- 
2.17.1



[PATCH] nvme: use correct upper limit for tag in nvme_handle_cqe()

2020-09-18 Thread Xianting Tian
The check against nvmeq->q_depth in nvme_handle_cqe() is useless: we can
still get a NULL pointer, which is returned by blk_mq_tag_to_rq():
[   16.649973] nvme nvme0: command_id 968 completed on queue 13, nvmeq
               q_depth 1024, nvme tagset q_depth 1023
[   16.649974] tag, nr_tags:968 1023

This patch checks command_id against its correct upper limit,
'nvmeq->dev->tagset.queue_depth', not nvmeq->q_depth. So even if we get a
command_id of 1023, we can avoid a NULL pointer dereference.

Signed-off-by: Xianting Tian 
---
 drivers/nvme/host/pci.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 899d2f4d7..c681e26d0 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -940,7 +940,7 @@ static inline void nvme_handle_cqe(struct nvme_queue 
*nvmeq, u16 idx)
struct nvme_completion *cqe = &nvmeq->cqes[idx];
struct request *req;
 
-   if (unlikely(cqe->command_id >= nvmeq->q_depth)) {
+   if (unlikely(cqe->command_id >= nvmeq->dev->tagset.queue_depth)) {
dev_warn(nvmeq->dev->ctrl.device,
"invalid id %d completed on queue %d\n",
cqe->command_id, le16_to_cpu(cqe->sq_id));
-- 
2.17.1



[PATCH] [v2] blk-mq: add cond_resched() in __blk_mq_alloc_rq_maps()

2020-09-17 Thread Xianting Tian
We found that blk_mq_alloc_rq_maps() takes more time in kernel space when
testing nvme hot-plugging. The test and analysis are as below.

Debug code,
1, blk_mq_alloc_rq_maps():
u64 start, end;
depth = set->queue_depth;
start = ktime_get_ns();
pr_err("[%d:%s switch:%ld,%ld] queue depth %d, nr_hw_queues %d\n",
current->pid, current->comm, current->nvcsw, 
current->nivcsw,
set->queue_depth, set->nr_hw_queues);
do {
err = __blk_mq_alloc_rq_maps(set);
if (!err)
break;

set->queue_depth >>= 1;
if (set->queue_depth < set->reserved_tags + BLK_MQ_TAG_MIN) {
err = -ENOMEM;
break;
}
} while (set->queue_depth);
end = ktime_get_ns();
pr_err("[%d:%s switch:%ld,%ld] all hw queues init cost time %lld ns\n",
current->pid, current->comm,
current->nvcsw, current->nivcsw, end - start);

2, __blk_mq_alloc_rq_maps():
u64 start, end;
for (i = 0; i < set->nr_hw_queues; i++) {
start = ktime_get_ns();
if (!__blk_mq_alloc_rq_map(set, i))
goto out_unwind;
end = ktime_get_ns();
pr_err("hw queue %d init cost time %lld\n", i, end - start);
}

Testing nvme hot-plugging with the above debug code, we found it costs more
than 3ms in total in kernel space, without being scheduled out, to allocate
rqs for all 16 hw queues with depth 1024; each hw queue costs about
140-250us. The cost grows as the hw queue number and queue depth increase.
And if __blk_mq_alloc_rq_maps() returns -ENOMEM, it will retry with
"queue_depth >>= 1" and more time will be consumed.
[  428.428771] nvme nvme0: pci function 1:01:00.0
[  428.428798] nvme 1:01:00.0: enabling device ( -> 0002)
[  428.428806] pcieport 1:00:00.0: can't derive routing for PCI INT 
A
[  428.428809] nvme 1:01:00.0: PCI INT A: no GSI
[  432.593374] [4688:kworker/u33:8 switch:663,2] queue depth 30, 
nr_hw_queues 1
[  432.593404] hw queue 0 init cost time 22883 ns
[  432.593408] [4688:kworker/u33:8 switch:663,2] all hw queues init 
cost time 35960 ns
[  432.595953] nvme nvme0: 16/0/0 default/read/poll queues
[  432.595958] [4688:kworker/u33:8 switch:700,2] queue depth 1023, 
nr_hw_queues 16
[  432.596203] hw queue 0 init cost time 242630 ns
[  432.596441] hw queue 1 init cost time 235913 ns
[  432.596659] hw queue 2 init cost time 216461 ns
[  432.596877] hw queue 3 init cost time 215851 ns
[  432.597107] hw queue 4 init cost time 228406 ns
[  432.597336] hw queue 5 init cost time 227298 ns
[  432.597564] hw queue 6 init cost time 224633 ns
[  432.597785] hw queue 7 init cost time 219954 ns
[  432.597937] hw queue 8 init cost time 150930 ns
[  432.598082] hw queue 9 init cost time 143496 ns
[  432.598231] hw queue 10 init cost time 147261 ns
[  432.598397] hw queue 11 init cost time 164522 ns
[  432.598542] hw queue 12 init cost time 143401 ns
[  432.598692] hw queue 13 init cost time 148934 ns
[  432.598841] hw queue 14 init cost time 147194 ns
[  432.598991] hw queue 15 init cost time 148942 ns
[  432.598993] [4688:kworker/u33:8 switch:700,2] all hw queues init 
cost time 3035099 ns
[  432.602611]  nvme0n1: p1

So use this patch to trigger schedule between each hw queue init, to avoid
other threads getting stuck. We call cond_resched() only when
"queue depth >= 512". We are not in atomic context when executing
__blk_mq_alloc_rq_maps(), so it is safe to call cond_resched().
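As an aside, not part of the patch: the general shape of such a voluntary
preemption point in a long, sleepable loop is sketched below; process_item()
is a hypothetical stand-in for the per-queue allocation work.

	/*
	 * Illustrative sketch only: cond_resched() yields the CPU only if a
	 * reschedule is already pending, and must not be called from atomic
	 * context.
	 */
	for (i = 0; i < nr_items; i++) {
		if (!process_item(i))
			goto out_unwind;

		cond_resched();
	}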

Signed-off-by: Xianting Tian 
---
 block/blk-mq.c | 7 ++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/block/blk-mq.c b/block/blk-mq.c
index b3d2785ee..5a71fe53a 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -3255,11 +3255,16 @@ void blk_mq_exit_queue(struct request_queue *q)
 static int __blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set)
 {
int i;
+   unsigned int depth = set->queue_depth;
 
-   for (i = 0; i < set->nr_hw_queues; i++)
+   for (i = 0; i < set->nr_hw_queues; i++) {
if (!__blk_mq_alloc_map_and_request(set, i))
goto out_unwind;
 
+   if (depth >= 512)
+   cond_resched();
+   }
+
return 0;
 
 out_unwind:
-- 
2.17.1



[PATCH] blk-mq: add cond_resched() in __blk_mq_alloc_rq_maps()

2020-09-16 Thread Xianting Tian
We found blk_mq_alloc_rq_maps() takes more time in kernel space when
testing nvme hot-plugging. The test and analysis are as below.

Debug code,
1, blk_mq_alloc_rq_maps():
u64 start, end;
depth = set->queue_depth;
start = ktime_get_ns();
pr_err("[%d:%s switch:%ld,%ld] queue depth %d, nr_hw_queues %d\n",
current->pid, current->comm, current->nvcsw, 
current->nivcsw,
set->queue_depth, set->nr_hw_queues);
do {
err = __blk_mq_alloc_rq_maps(set);
if (!err)
break;

set->queue_depth >>= 1;
if (set->queue_depth < set->reserved_tags + BLK_MQ_TAG_MIN) {
err = -ENOMEM;
break;
}
} while (set->queue_depth);
end = ktime_get_ns();
pr_err("[%d:%s switch:%ld,%ld] all hw queues init cost time %lld ns\n",
current->pid, current->comm,
current->nvcsw, current->nivcsw, end - start);

2, __blk_mq_alloc_rq_maps():
u64 start, end;
for (i = 0; i < set->nr_hw_queues; i++) {
start = ktime_get_ns();
if (!__blk_mq_alloc_rq_map(set, i))
goto out_unwind;
end = ktime_get_ns();
pr_err("hw queue %d init cost time %lld\n", i, end - start);
}

Testing nvme hot-plugging with the above debug code, we found it totally costs
more than 3ms in kernel space without being scheduled out when allocating rqs
for all hw queues; each hw queue costs about 140-250us. The time cost increases
as the hw queue number increases. And if __blk_mq_alloc_rq_maps() returns
-ENOMEM, it will retry with "queue_depth >>= 1", which costs even more time.
[  428.428771] nvme nvme0: pci function 1:01:00.0
[  428.428798] nvme 1:01:00.0: enabling device ( -> 0002)
[  428.428806] pcieport 1:00:00.0: can't derive routing for PCI INT 
A
[  428.428809] nvme 1:01:00.0: PCI INT A: no GSI
[  432.593374] [4688:kworker/u33:8 switch:663,2] queue depth 30, 
nr_hw_queues 1
[  432.593404] hw queue 0 init cost time 22883 ns
[  432.593408] [4688:kworker/u33:8 switch:663,2] all hw queues init 
cost time 35960 ns
[  432.595953] nvme nvme0: 16/0/0 default/read/poll queues
[  432.595958] [4688:kworker/u33:8 switch:700,2] queue depth 1023, 
nr_hw_queues 16
[  432.596203] hw queue 0 init cost time 242630 ns
[  432.596441] hw queue 1 init cost time 235913 ns
[  432.596659] hw queue 2 init cost time 216461 ns
[  432.596877] hw queue 3 init cost time 215851 ns
[  432.597107] hw queue 4 init cost time 228406 ns
[  432.597336] hw queue 5 init cost time 227298 ns
[  432.597564] hw queue 6 init cost time 224633 ns
[  432.597785] hw queue 7 init cost time 219954 ns
[  432.597937] hw queue 8 init cost time 150930 ns
[  432.598082] hw queue 9 init cost time 143496 ns
[  432.598231] hw queue 10 init cost time 147261 ns
[  432.598397] hw queue 11 init cost time 164522 ns
[  432.598542] hw queue 12 init cost time 143401 ns
[  432.598692] hw queue 13 init cost time 148934 ns
[  432.598841] hw queue 14 init cost time 147194 ns
[  432.598991] hw queue 15 init cost time 148942 ns
[  432.598993] [4688:kworker/u33:8 switch:700,2] all hw queues init 
cost time 3035099 ns
[  432.602611]  nvme0n1: p1

So use this patch to trigger a reschedule between each hw queue init, to avoid
other tasks getting stuck.

Signed-off-by: Xianting Tian 
---
 block/blk-mq.c | 8 +++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/block/blk-mq.c b/block/blk-mq.c
index b3d2785ee..12418ecf5 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -3256,10 +3256,13 @@ static int __blk_mq_alloc_rq_maps(struct blk_mq_tag_set 
*set)
 {
int i;
 
-   for (i = 0; i < set->nr_hw_queues; i++)
+   for (i = 0; i < set->nr_hw_queues; i++) {
if (!__blk_mq_alloc_map_and_request(set, i))
goto out_unwind;
 
+   cond_resched();
+   }
+
return 0;
 
 out_unwind:
@@ -3292,6 +3295,9 @@ static int blk_mq_alloc_map_and_requests(struct 
blk_mq_tag_set *set)
}
} while (set->queue_depth);
 
+
+
+
if (!set->queue_depth || err) {
pr_err("blk-mq: failed to allocate request map\n");
return -ENOMEM;
-- 
2.17.1



[PATCH] ipmi: add retry in try_get_dev_id()

2020-09-16 Thread Xianting Tian
Use a retry mechanism to give the device more opportunities to respond
correctly to the kernel when we receive specific completion codes.

This is similar to what we do in __get_device_id().

Signed-off-by: Xianting Tian 
---
 drivers/char/ipmi/ipmi_msghandler.c |  2 --
 drivers/char/ipmi/ipmi_si_intf.c| 17 +
 include/uapi/linux/ipmi.h   |  2 ++
 3 files changed, 19 insertions(+), 2 deletions(-)

diff --git a/drivers/char/ipmi/ipmi_msghandler.c 
b/drivers/char/ipmi/ipmi_msghandler.c
index b9685093e..75cb7e062 100644
--- a/drivers/char/ipmi/ipmi_msghandler.c
+++ b/drivers/char/ipmi/ipmi_msghandler.c
@@ -62,8 +62,6 @@ enum ipmi_panic_event_op {
 #define IPMI_PANIC_DEFAULT IPMI_SEND_PANIC_EVENT_NONE
 #endif
 
-#define GET_DEVICE_ID_MAX_RETRY 5
-
 static enum ipmi_panic_event_op ipmi_send_panic_event = IPMI_PANIC_DEFAULT;
 
 static int panic_op_write_handler(const char *val,
diff --git a/drivers/char/ipmi/ipmi_si_intf.c b/drivers/char/ipmi/ipmi_si_intf.c
index 77b8d551a..beeb705f1 100644
--- a/drivers/char/ipmi/ipmi_si_intf.c
+++ b/drivers/char/ipmi/ipmi_si_intf.c
@@ -1316,6 +1316,7 @@ static int try_get_dev_id(struct smi_info *smi_info)
unsigned char *resp;
unsigned long resp_len;
int   rv = 0;
+   unsigned int  retry_count = 0;
 
resp = kmalloc(IPMI_MAX_MSG_LENGTH, GFP_KERNEL);
if (!resp)
@@ -1327,6 +1328,8 @@ static int try_get_dev_id(struct smi_info *smi_info)
 */
msg[0] = IPMI_NETFN_APP_REQUEST << 2;
msg[1] = IPMI_GET_DEVICE_ID_CMD;
+
+retry:
smi_info->handlers->start_transaction(smi_info->si_sm, msg, 2);
 
rv = wait_for_msg_done(smi_info);
@@ -1339,6 +1342,20 @@ static int try_get_dev_id(struct smi_info *smi_info)
/* Check and record info from the get device id, in case we need it. */
rv = ipmi_demangle_device_id(resp[0] >> 2, resp[1],
resp + 2, resp_len - 2, &smi_info->device_id);
+   if (rv) {
+   /* record completion code */
+   char cc = *(resp + 2);
+
+   if ((cc == IPMI_DEVICE_IN_FW_UPDATE_ERR
+   || cc == IPMI_DEVICE_IN_INIT_ERR
+   || cc == IPMI_NOT_IN_MY_STATE_ERR)
+   && ++retry_count <= GET_DEVICE_ID_MAX_RETRY) {
+   dev_warn(smi_info->io.dev,
+   "retry to get device id as completion code 0x%x\n",
+cc);
+   goto retry;
+   }
+   }
 
 out:
kfree(resp);
diff --git a/include/uapi/linux/ipmi.h b/include/uapi/linux/ipmi.h
index 32d148309..bc57f07e3 100644
--- a/include/uapi/linux/ipmi.h
+++ b/include/uapi/linux/ipmi.h
@@ -426,4 +426,6 @@ struct ipmi_timing_parms {
 #define IPMICTL_GET_MAINTENANCE_MODE_CMD   _IOR(IPMI_IOC_MAGIC, 30, int)
 #define IPMICTL_SET_MAINTENANCE_MODE_CMD   _IOW(IPMI_IOC_MAGIC, 31, int)
 
+#define GET_DEVICE_ID_MAX_RETRY 5
+
 #endif /* _UAPI__LINUX_IPMI_H */
-- 
2.17.1



[PATCH] ipmi: print current state when error

2020-09-15 Thread Xianting Tian
Print the current state before returning IPMI_NOT_IN_MY_STATE_ERR, so we
can know where this issue comes from and possibly fix the state machine.

Signed-off-by: Xianting Tian 
---
 drivers/char/ipmi/ipmi_bt_sm.c   | 4 +++-
 drivers/char/ipmi/ipmi_kcs_sm.c  | 4 +++-
 drivers/char/ipmi/ipmi_smic_sm.c | 4 +++-
 3 files changed, 9 insertions(+), 3 deletions(-)

diff --git a/drivers/char/ipmi/ipmi_bt_sm.c b/drivers/char/ipmi/ipmi_bt_sm.c
index f3f216cdf..2de0c6c30 100644
--- a/drivers/char/ipmi/ipmi_bt_sm.c
+++ b/drivers/char/ipmi/ipmi_bt_sm.c
@@ -213,8 +213,10 @@ static int bt_start_transaction(struct si_sm_data *bt,
if (bt->state == BT_STATE_LONG_BUSY)
return IPMI_NODE_BUSY_ERR;
 
-   if (bt->state != BT_STATE_IDLE)
+   if (bt->state != BT_STATE_IDLE) {
+   dev_warn(bt->io->dev, "BT is now in the state %d\n", bt->state);
return IPMI_NOT_IN_MY_STATE_ERR;
+   }
 
if (bt_debug & BT_DEBUG_MSG) {
dev_dbg(bt->io->dev, "+ New command\n");
diff --git a/drivers/char/ipmi/ipmi_kcs_sm.c b/drivers/char/ipmi/ipmi_kcs_sm.c
index 2e7cda08b..49ece4ba3 100644
--- a/drivers/char/ipmi/ipmi_kcs_sm.c
+++ b/drivers/char/ipmi/ipmi_kcs_sm.c
@@ -268,8 +268,10 @@ static int start_kcs_transaction(struct si_sm_data *kcs, 
unsigned char *data,
if (size > MAX_KCS_WRITE_SIZE)
return IPMI_REQ_LEN_EXCEEDED_ERR;
 
-   if ((kcs->state != KCS_IDLE) && (kcs->state != KCS_HOSED))
+   if ((kcs->state != KCS_IDLE) && (kcs->state != KCS_HOSED)) {
+   printk(KERN_WARNING "KCS is now in the state %d\n", kcs->state);
return IPMI_NOT_IN_MY_STATE_ERR;
+   }
 
if (kcs_debug & KCS_DEBUG_MSG) {
printk(KERN_DEBUG "start_kcs_transaction -");
diff --git a/drivers/char/ipmi/ipmi_smic_sm.c b/drivers/char/ipmi/ipmi_smic_sm.c
index b6225bba2..690d4f53e 100644
--- a/drivers/char/ipmi/ipmi_smic_sm.c
+++ b/drivers/char/ipmi/ipmi_smic_sm.c
@@ -126,8 +126,10 @@ static int start_smic_transaction(struct si_sm_data *smic,
if (size > MAX_SMIC_WRITE_SIZE)
return IPMI_REQ_LEN_EXCEEDED_ERR;
 
-   if ((smic->state != SMIC_IDLE) && (smic->state != SMIC_HOSED))
+   if ((smic->state != SMIC_IDLE) && (smic->state != SMIC_HOSED)) {
+   printk(KERN_WARNING "SMIC is now in the state %d\n", smic->state);
return IPMI_NOT_IN_MY_STATE_ERR;
+   }
 
if (smic_debug & SMIC_DEBUG_MSG) {
printk(KERN_DEBUG "start_smic_transaction -");
-- 
2.17.1



[PATCH] [v3] ipmi: retry to get device id when error

2020-09-15 Thread Xianting Tian
With low probability, we fail to get the bmc's device id when loading the
ipmi driver, which causes bmc device registration to fail. When this issue happened, we got
below kernel prints:
[Wed Sep  9 19:52:03 2020] ipmi_si IPI0001:00: IPMI message handler: 
device id demangle failed: -22
[Wed Sep  9 19:52:03 2020] IPMI BT: using default values
[Wed Sep  9 19:52:03 2020] IPMI BT: req2rsp=5 secs retries=2
[Wed Sep  9 19:52:03 2020] ipmi_si IPI0001:00: Unable to get the device 
id: -5
[Wed Sep  9 19:52:04 2020] ipmi_si IPI0001:00: Unable to register 
device: error -5

When this issue happened, we want to manually unload the driver and try to
load it again, but it can't be unloaded by 'rmmod' as it is already 'in use'.

We add below 'printk' in handle_one_recv_msg(), when this issue happened,
the msg we received is "Recv: 1c 01 d5", which means the data_len is 1,
data[0] is 0xd5(completion code), which means "bmc cannot execute command.
Command, or request parameter(s), not supported in present state".
Debug code:
static int handle_one_recv_msg(struct ipmi_smi *intf,
   struct ipmi_smi_msg *msg) {
printk("Recv: %*ph\n", msg->rsp_size, msg->rsp);
... ...
}
Then in ipmi_demangle_device_id(), it returned '-EINVAL' as 'data_len < 7'
and 'data[0] != 0'.

We used this patch to retry getting the device id when the error happens. We
reproduced this issue again and the retry succeeded on the first try; we
finally got the correct msg and then all is ok:
Recv: 1c 01 00 01 81 05 84 02 af db 07 00 01 00 b9 00 10 00

So use a retry mechanism in this patch to give the bmc more opportunity to
respond correctly to the kernel when we receive specific completion codes.

Signed-off-by: Xianting Tian 
---
 drivers/char/ipmi/ipmi_msghandler.c | 29 +
 include/uapi/linux/ipmi_msgdefs.h   |  2 ++
 2 files changed, 27 insertions(+), 4 deletions(-)

diff --git a/drivers/char/ipmi/ipmi_msghandler.c 
b/drivers/char/ipmi/ipmi_msghandler.c
index 737c0b6b2..b9685093e 100644
--- a/drivers/char/ipmi/ipmi_msghandler.c
+++ b/drivers/char/ipmi/ipmi_msghandler.c
@@ -34,6 +34,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #define IPMI_DRIVER_VERSION "39.2"
 
@@ -60,6 +61,9 @@ enum ipmi_panic_event_op {
 #else
 #define IPMI_PANIC_DEFAULT IPMI_SEND_PANIC_EVENT_NONE
 #endif
+
+#define GET_DEVICE_ID_MAX_RETRY 5
+
 static enum ipmi_panic_event_op ipmi_send_panic_event = IPMI_PANIC_DEFAULT;
 
 static int panic_op_write_handler(const char *val,
@@ -317,6 +321,7 @@ struct bmc_device {
intdyn_guid_set;
struct krefusecount;
struct work_struct remove_work;
+   char   cc; /* completion code */
 };
 #define to_bmc_device(x) container_of((x), struct bmc_device, pdev.dev)
 
@@ -2381,6 +2386,8 @@ static void bmc_device_id_handler(struct ipmi_smi *intf,
msg->msg.data, msg->msg.data_len, &intf->bmc->fetch_id);
if (rv) {
dev_warn(intf->si_dev, "device id demangle failed: %d\n", rv);
+   /* record completion code when error */
+   intf->bmc->cc = msg->msg.data[0];
intf->bmc->dyn_id_set = 0;
} else {
/*
@@ -2426,19 +2433,34 @@ send_get_device_id_cmd(struct ipmi_smi *intf)
 static int __get_device_id(struct ipmi_smi *intf, struct bmc_device *bmc)
 {
int rv;
-
-   bmc->dyn_id_set = 2;
+   unsigned int retry_count = 0;
 
intf->null_user_handler = bmc_device_id_handler;
 
+retry:
+   bmc->cc = 0;
+   bmc->dyn_id_set = 2;
+
rv = send_get_device_id_cmd(intf);
if (rv)
return rv;
 
wait_event(intf->waitq, bmc->dyn_id_set != 2);
 
-   if (!bmc->dyn_id_set)
+   if (!bmc->dyn_id_set) {
+   if ((bmc->cc == IPMI_DEVICE_IN_FW_UPDATE_ERR
+|| bmc->cc ==  IPMI_DEVICE_IN_INIT_ERR
+|| bmc->cc ==  IPMI_NOT_IN_MY_STATE_ERR)
+&& ++retry_count <= GET_DEVICE_ID_MAX_RETRY) {
+   msleep(500);
+   dev_warn(intf->si_dev,
+   "retry to get bmc device id as completion code 0x%x\n",
+   bmc->cc);
+   goto retry;
+   }
+
rv = -EIO; /* Something went wrong in the fetch. */
+   }
 
/* dyn_id_set makes the id data available. */
smp_rmb();
@@ -3245,7 +3267,6 @@ channel_handler(struct ipmi_smi *intf, struct 
ipmi_recv_msg *msg)
/* It's the one we want */
if (msg->msg.data[0] != 0) {
/* Got an err

[PATCH] [v2] ipmi: retry to get device id when error

2020-09-14 Thread Xianting Tian
With low probability, we fail to get the bmc's device id when loading the
ipmi driver, which causes bmc device registration to fail. When this issue happened, we got
below kernel printks:
[Wed Sep  9 19:52:03 2020] ipmi_si IPI0001:00: IPMI message handler: 
device id demangle failed: -22
[Wed Sep  9 19:52:03 2020] IPMI BT: using default values
[Wed Sep  9 19:52:03 2020] IPMI BT: req2rsp=5 secs retries=2
[Wed Sep  9 19:52:03 2020] ipmi_si IPI0001:00: Unable to get the device 
id: -5
[Wed Sep  9 19:52:04 2020] ipmi_si IPI0001:00: Unable to register 
device: error -5

When this issue happened, we want to manually unload the driver and try to
load it again, but it can't be unloaded by 'rmmod' as it is already 'in use'.

We add below 'printk' in handle_one_recv_msg(), when this issue happened,
the msg we received is "Recv: 1c 01 d5", which means the data_len is 1,
data[0] is 0xd5(completion code), which means "bmc cannot execute command.
Command, or request parameter(s), not supported in present state".
Debug code:
static int handle_one_recv_msg(struct ipmi_smi *intf,
   struct ipmi_smi_msg *msg) {
printk("Recv: %*ph\n", msg->rsp_size, msg->rsp);
... ...
}
Then in ipmi_demangle_device_id(), it returned '-EINVAL' as 'data_len < 7'
and 'data[0] != 0'.

We used this patch to retry getting the device id when the error happens. We
reproduced this issue again and the retry succeeded on the first try; we
finally got the correct msg and then all is ok:
Recv: 1c 01 00 01 81 05 84 02 af db 07 00 01 00 b9 00 10 00

So use a retry mechanism in this patch to give the bmc more opportunity to
respond correctly to the kernel when we receive a specific completion code.

Signed-off-by: Xianting Tian 
---
 drivers/char/ipmi/ipmi_msghandler.c | 29 +
 include/uapi/linux/ipmi_msgdefs.h   |  2 ++
 2 files changed, 27 insertions(+), 4 deletions(-)

diff --git a/drivers/char/ipmi/ipmi_msghandler.c 
b/drivers/char/ipmi/ipmi_msghandler.c
index 737c0b6b2..07d5be2cd 100644
--- a/drivers/char/ipmi/ipmi_msghandler.c
+++ b/drivers/char/ipmi/ipmi_msghandler.c
@@ -34,6 +34,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #define IPMI_DRIVER_VERSION "39.2"
 
@@ -60,6 +61,9 @@ enum ipmi_panic_event_op {
 #else
 #define IPMI_PANIC_DEFAULT IPMI_SEND_PANIC_EVENT_NONE
 #endif
+
+#define GET_DEVICE_ID_MAX_RETRY 5
+
 static enum ipmi_panic_event_op ipmi_send_panic_event = IPMI_PANIC_DEFAULT;
 
 static int panic_op_write_handler(const char *val,
@@ -317,6 +321,7 @@ struct bmc_device {
intdyn_guid_set;
struct krefusecount;
struct work_struct remove_work;
+   char   cc; /* completion code */
 };
 #define to_bmc_device(x) container_of((x), struct bmc_device, pdev.dev)
 
@@ -2381,6 +2386,8 @@ static void bmc_device_id_handler(struct ipmi_smi *intf,
msg->msg.data, msg->msg.data_len, &intf->bmc->fetch_id);
if (rv) {
dev_warn(intf->si_dev, "device id demangle failed: %d\n", rv);
+   /* record completion code when error */
+   intf->bmc->cc = msg->msg.data[0];
intf->bmc->dyn_id_set = 0;
} else {
/*
@@ -2426,19 +2433,34 @@ send_get_device_id_cmd(struct ipmi_smi *intf)
 static int __get_device_id(struct ipmi_smi *intf, struct bmc_device *bmc)
 {
int rv;
-
-   bmc->dyn_id_set = 2;
+   unsigned int retry_count = 0;
 
intf->null_user_handler = bmc_device_id_handler;
 
+retry:
+   bmc->dyn_id_set = 2;
+
rv = send_get_device_id_cmd(intf);
if (rv)
return rv;
 
wait_event(intf->waitq, bmc->dyn_id_set != 2);
 
-   if (!bmc->dyn_id_set)
+   if (!bmc->dyn_id_set) {
+   if ((bmc->cc == IPMI_NOT_IN_MY_STATE_ERR
+|| bmc->cc == IPMI_NOT_IN_MY_STATE_ERR_1
+|| bmc->cc == IPMI_NOT_IN_MY_STATE_ERR_2)
+&& ++retry_count <= GET_DEVICE_ID_MAX_RETRY) {
+   msleep(500);
+   dev_warn(intf->si_dev,
+   "retry to get bmc device id as completion code 0x%x\n",
+   bmc->cc);
+   bmc->cc = 0;
+   goto retry;
+   }
+
rv = -EIO; /* Something went wrong in the fetch. */
+   }
 
/* dyn_id_set makes the id data available. */
smp_rmb();
@@ -3245,7 +3267,6 @@ channel_handler(struct ipmi_smi *intf, struct 
ipmi_recv_msg *msg)
/* It's the one we want */
if (msg->msg.data[0] != 0) {
 

[PATCH] ipmi: retry to get device id when error

2020-09-13 Thread Xianting Tian
With low probability, we fail to get the bmc's device id when loading the
ipmi driver, which causes bmc device registration to fail. This issue may be
caused by bad lpc signal quality. When this issue happened, we got below
kernel printks:
[Wed Sep  9 19:52:03 2020] ipmi_si IPI0001:00: IPMI message handler: 
device id demangle failed: -22
[Wed Sep  9 19:52:03 2020] IPMI BT: using default values
[Wed Sep  9 19:52:03 2020] IPMI BT: req2rsp=5 secs retries=2
[Wed Sep  9 19:52:03 2020] ipmi_si IPI0001:00: Unable to get the device 
id: -5
[Wed Sep  9 19:52:04 2020] ipmi_si IPI0001:00: Unable to register 
device: error -5

When this issue happened, we want to manually unload the driver and try to
load it again, but it can't be unloaded by 'rmmod' as it is already 'in use'.

We add below 'printk' in handle_one_recv_msg(), when this issue happened,
the msg we received is "Recv: 1c 01 d5", which means the data_len is 1,
data[0] is 0xd5.
Debug code:
static int handle_one_recv_msg(struct ipmi_smi *intf,
   struct ipmi_smi_msg *msg) {
printk("Recv: %*ph\n", msg->rsp_size, msg->rsp);
... ...
}
Then in ipmi_demangle_device_id(), it returned '-EINVAL' as 'data_len < 7'
and 'data[0] != 0'.

We used this patch to retry getting the device id when the error happens. We
reproduced this issue again and the retry succeeded on the first try; we
finally got the correct msg and then all is ok:
Recv: 1c 01 00 01 81 05 84 02 af db 07 00 01 00 b9 00 10 00

So use a retry mechanism in this patch to give the bmc more opportunity to
respond correctly to the kernel.

Signed-off-by: Xianting Tian 
---
 drivers/char/ipmi/ipmi_msghandler.c | 17 ++---
 1 file changed, 14 insertions(+), 3 deletions(-)

diff --git a/drivers/char/ipmi/ipmi_msghandler.c 
b/drivers/char/ipmi/ipmi_msghandler.c
index 737c0b6b2..bfb2de77a 100644
--- a/drivers/char/ipmi/ipmi_msghandler.c
+++ b/drivers/char/ipmi/ipmi_msghandler.c
@@ -34,6 +34,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #define IPMI_DRIVER_VERSION "39.2"
 
@@ -60,6 +61,9 @@ enum ipmi_panic_event_op {
 #else
 #define IPMI_PANIC_DEFAULT IPMI_SEND_PANIC_EVENT_NONE
 #endif
+
+#define GET_DEVICE_ID_MAX_RETRY 5
+
 static enum ipmi_panic_event_op ipmi_send_panic_event = IPMI_PANIC_DEFAULT;
 
 static int panic_op_write_handler(const char *val,
@@ -2426,19 +2430,26 @@ send_get_device_id_cmd(struct ipmi_smi *intf)
 static int __get_device_id(struct ipmi_smi *intf, struct bmc_device *bmc)
 {
int rv;
-
-   bmc->dyn_id_set = 2;
+   unsigned int retry_count = 0;
 
intf->null_user_handler = bmc_device_id_handler;
 
+retry:
+   bmc->dyn_id_set = 2;
+
rv = send_get_device_id_cmd(intf);
if (rv)
return rv;
 
wait_event(intf->waitq, bmc->dyn_id_set != 2);
 
-   if (!bmc->dyn_id_set)
+   if (!bmc->dyn_id_set) {
+   msleep(1000);
+   if (++retry_count <= GET_DEVICE_ID_MAX_RETRY)
+   goto retry;
+
rv = -EIO; /* Something went wrong in the fetch. */
+   }
 
/* dyn_id_set makes the id data available. */
smp_rmb();
-- 
2.17.1



[PATCH] [v2] blkcg: add plugging support for punt bio

2020-09-09 Thread Xianting Tian
art,size:13544544,8 count=0 
plug?0
[50861.355468] [kworker/u66:19:32376] bio start,size:13544552,8 count=0 
plug?0
[50861.355499] [kworker/u66:19:32376] bio start,size:13544560,8 count=0 
plug?0
[50861.355532] [kworker/u66:19:32376] bio start,size:13544568,8 count=0 
plug?0
[50861.355575] [kworker/u66:19:32376] bio start,size:13544576,8 count=0 
plug?0
[50861.355618] [kworker/u66:19:32376] bio start,size:13544584,8 count=0 
plug?0
[50861.355659] [kworker/u66:19:32376] bio start,size:13544592,8 count=0 
plug?0
[50861.355740] [kworker/u66:0:32346] bio start,size:13544600,8 count=0 
plug?1
[50861.355748] [kworker/u66:0:32346] bio start,size:13544608,8 count=1 
plug?1
[50861.355962] [kworker/u66:2:32347] bio start,size:13544616,8 count=0 
plug?0
[50861.356272] [kworker/u66:7:31962] bio start,size:13544624,8 count=0 
plug?0
[50861.356446] [kworker/u66:7:31962] bio start,size:13544632,8 count=0 
plug?0
[50861.356567] [kworker/u66:7:31962] bio start,size:13544640,8 count=0 
plug?0
[50861.356707] [kworker/u66:19:32376] bio start,size:13544648,8 count=0 
plug?0
[50861.356748] [kworker/u66:15:32355] bio start,size:13544656,8 count=0 
plug?0
[50861.356825] [kworker/u66:17:31970] bio start,size:13544664,8 count=0 
plug?0

Analysis of above 3 test results with different system load:
From the above tests, we can see more and more continuous bios can be plugged
with system load increasing. When run "stress -c 64 &", 310 continuous
bios are plugged; When run "stress -c 32 &", 260 continuous bios are
plugged; When don't run stress, at most only 2 continuous bios are
plugged, in most cases, bio_list only contains one single bio.

How to explain the above phenomenon:
We know that in submit_bio(), if the bio is a REQ_CGROUP_PUNT io, it will
queue a work item to the workqueue blkcg_punt_bio_wq. But when that work
runs depends on the system load. When system load is low, the work will be
scheduled quickly, and the bios in bio_list will be quickly processed in
blkg_async_bio_workfn(), so there is less chance that the same io submit
thread can add multiple continuous bios to bio_list before the work is
scheduled to run. This analysis aligns with test "3" above.
When system load is high, there is some delay before the work can be
scheduled to run; the higher the system load, the greater the delay. So
there is more chance that the same io submit thread can add multiple
continuous bios to bio_list. Then when the work is scheduled to run, there
are more continuous bios in bio_list, which will be processed in
blkg_async_bio_workfn(). This analysis aligns with tests "1" and "2" above.
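For reference, the punt path that feeds blkg->async_bios looks roughly like
the sketch below (simplified from block/blk-cgroup.c of this era; the helper
name and details are illustrative, not verbatim):

	/* A REQ_CGROUP_PUNT bio is parked on blkg->async_bios and handed to
	 * the blkcg_punt_bio_wq workqueue; bios keep piling up on the list
	 * until blkg_async_bio_workfn() finally runs.
	 */
	static bool blkcg_punt_bio_sketch(struct bio *bio)
	{
		struct blkcg_gq *blkg = bio->bi_blkg;

		bio->bi_opf &= ~REQ_CGROUP_PUNT;	/* consume the flag */

		spin_lock_bh(&blkg->async_bio_lock);
		bio_list_add(&blkg->async_bios, bio);
		spin_unlock_bh(&blkg->async_bio_lock);

		queue_work(blkcg_punt_bio_wq, &blkg->async_bio_work);
		return true;
	}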

According to the tests, io performance is improved with this patch,
especially when system load is higher. Another optimization is to use the
plug only when bio_list contains at least 2 bios.

Signed-off-by: Xianting Tian 
---
 block/blk-cgroup.c | 9 +
 1 file changed, 9 insertions(+)

diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index c195365c9..f35a205d5 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -119,6 +119,8 @@ static void blkg_async_bio_workfn(struct work_struct *work)
 async_bio_work);
struct bio_list bios = BIO_EMPTY_LIST;
struct bio *bio;
+   struct blk_plug plug;
+   bool need_plug = false;
 
/* as long as there are pending bios, @blkg can't go away */
spin_lock_bh(&blkg->async_bio_lock);
@@ -126,8 +128,15 @@ static void blkg_async_bio_workfn(struct work_struct *work)
bio_list_init(&blkg->async_bios);
spin_unlock_bh(&blkg->async_bio_lock);
 
+   /* start plug only when bio_list contains at least 2 bios */
+   if (bios.head && bios.head->bi_next) {
+   need_plug = true;
+   blk_start_plug(&plug);
+   }
while ((bio = bio_list_pop(&bios)))
submit_bio(bio);
+   if (need_plug)
+   blk_finish_plug(&plug);
 }
 
 /**
-- 
2.17.1



[PATCH] block: remove redundant empty check of mq_list

2020-09-08 Thread Xianting Tian
blk_mq_flush_plug_list() itself does the empty check of mq_list,
so remove such a check in blk_flush_plug_list().
Actually, mq_list is normally not empty when blk_flush_plug_list()
is called.

Signed-off-by: Xianting Tian 
---
 block/blk-core.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/block/blk-core.c b/block/blk-core.c
index 10c08ac50..dda301610 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1864,8 +1864,7 @@ void blk_flush_plug_list(struct blk_plug *plug, bool 
from_schedule)
 {
flush_plug_callbacks(plug, from_schedule);
 
-   if (!list_empty(&plug->mq_list))
-   blk_mq_flush_plug_list(plug, from_schedule);
+   blk_mq_flush_plug_list(plug, from_schedule);
 }
 
 /**
-- 
2.17.1



[PATCH] fs: use correct parameter in notes of generic_file_llseek_size()

2020-09-05 Thread Xianting Tian
Fix warning when compiling with W=1:
fs/read_write.c:88: warning: Function parameter or member 'maxsize' not 
described in 'generic_file_llseek_size'
fs/read_write.c:88: warning: Excess function parameter 'size' description in 
'generic_file_llseek_size'

Signed-off-by: Xianting Tian 
---
 fs/read_write.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/read_write.c b/fs/read_write.c
index 5db58b8c7..058563ee2 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -71,7 +71,7 @@ EXPORT_SYMBOL(vfs_setpos);
  * @file:  file structure to seek on
  * @offset:file offset to seek to
  * @whence:type of seek
- * @size:  max size of this file in file system
+ * @maxsize:   max size of this file in file system
  * @eof:   offset used for SEEK_END position
  *
  * This is a variant of generic_file_llseek that allows passing in a custom
-- 
2.17.1



[PATCH] nvme: use kobj_to_dev() to get device

2020-09-04 Thread Xianting Tian
We already have the interface kobj_to_dev(), which can be used to get a
'struct device *' from a kobj, so use it.
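For reference, kobj_to_dev() (include/linux/device.h) is simply a typed
wrapper around the container_of() pattern being replaced:

	static inline struct device *kobj_to_dev(struct kobject *kobj)
	{
		return container_of(kobj, struct device, kobj);
	}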

Signed-off-by: Xianting Tian 
---
 drivers/nvme/host/core.c | 4 ++--
 drivers/nvme/host/lightnvm.c | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 154942fc6..eb50615ba 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -3422,7 +3422,7 @@ static struct attribute *nvme_ns_id_attrs[] = {
 static umode_t nvme_ns_id_attrs_are_visible(struct kobject *kobj,
struct attribute *a, int n)
 {
-   struct device *dev = container_of(kobj, struct device, kobj);
+   struct device *dev = kobj_to_dev(kobj);
struct nvme_ns_ids *ids = &dev_to_ns_head(dev)->ids;
 
if (a == &dev_attr_uuid.attr) {
@@ -3665,7 +3665,7 @@ static struct attribute *nvme_dev_attrs[] = {
 static umode_t nvme_dev_attrs_are_visible(struct kobject *kobj,
struct attribute *a, int n)
 {
-   struct device *dev = container_of(kobj, struct device, kobj);
+   struct device *dev = kobj_to_dev(kobj);
struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
 
if (a == &dev_attr_delete_controller.attr && !ctrl->ops->delete_ctrl)
diff --git a/drivers/nvme/host/lightnvm.c b/drivers/nvme/host/lightnvm.c
index 8e562d0f2..70c87ff0d 100644
--- a/drivers/nvme/host/lightnvm.c
+++ b/drivers/nvme/host/lightnvm.c
@@ -1241,7 +1241,7 @@ static struct attribute *nvm_dev_attrs[] = {
 static umode_t nvm_dev_attrs_visible(struct kobject *kobj,
 struct attribute *attr, int index)
 {
-   struct device *dev = container_of(kobj, struct device, kobj);
+   struct device *dev = kobj_to_dev(kobj);
struct gendisk *disk = dev_to_disk(dev);
struct nvme_ns *ns = disk->private_data;
struct nvm_dev *ndev = ns->ndev;
-- 
2.17.1



[PATCH] clocksource: return negative error code

2020-08-31 Thread Xianting Tian
A negative error code should be returned.

Signed-off-by: Xianting Tian 
---
 drivers/clocksource/h8300_timer8.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/clocksource/h8300_timer8.c 
b/drivers/clocksource/h8300_timer8.c
index 1d740a8c4..47114c2a7 100644
--- a/drivers/clocksource/h8300_timer8.c
+++ b/drivers/clocksource/h8300_timer8.c
@@ -169,7 +169,7 @@ static int __init h8300_8timer_init(struct device_node 
*node)
return PTR_ERR(clk);
}
 
-   ret = ENXIO;
+   ret = -ENXIO;
base = of_iomap(node, 0);
if (!base) {
pr_err("failed to map registers for clockevent\n");
-- 
2.17.1



[PATCH] [v2] nvme-pci: check req to prevent crash in nvme_handle_cqe()

2020-08-31 Thread Xianting Tian
We met a crash issue when hot-inserting a nvme device: blk_mq_tag_to_rq()
returned null (req=null), then the crash happened in nvme_end_request():
struct nvme_request *rq = nvme_req(req);
rq->result = result;  <==crash here

The test env is a server configured with 2 backplanes, each backplane
supporting 8 nvme devices; this crash happened when hot-inserting a nvme
device into the second backplane. We measured the signal that is sent out
of the cpu to ack the nvme interrupt; the signal is very weak by the time
it reaches the second backplane, so the device can't recognize it as an
ack signal and therefore can't clear the interrupt flag.
After updating the related driver, the signal sent out of the cpu to the
second backplane is good, and the crash issue disappeared.

As blk_mq_tag_to_rq() may return null, we should check whether the result
is null before using it, to prevent a crash.
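To illustrate why the check is needed, blk_mq_tag_to_rq() looks roughly
like the sketch below (simplified): an out-of-range tag, or a tag slot
with no request attached, yields NULL.

	struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags, unsigned int tag)
	{
		if (tag < tags->nr_tags)
			return tags->rqs[tag];	/* may itself be NULL */

		return NULL;
	}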

[ 1124.256246] nvme nvme5: pci function :e1:00.0
[ 1124.256323] nvme :e1:00.0: enabling device ( -> 0002)
[ 1125.720859] nvme nvme5: 96/0/0 default/read/poll queues
[ 1125.732483]  nvme5n1: p1 p2 p3
[ 1125.788049] BUG: unable to handle kernel NULL pointer dereference at 
0130
[ 1125.788054] PGD 0 P4D 0
[ 1125.788057] Oops: 0002 [#1] SMP NOPTI
[ 1125.788059] CPU: 50 PID: 0 Comm: swapper/50 Kdump: loaded Tainted: G 
  --- -t - 4.18.0-147.el8.x86_64 #1
[ 1125.788065] RIP: 0010:nvme_irq+0xe8/0x240 [nvme]
[ 1125.788068] RSP: 0018:916b8ec83ed0 EFLAGS: 00010813
[ 1125.788069] RAX:  RBX: 918ae9211b00 RCX: 

[ 1125.788070] RDX: 400b RSI:  RDI: 

[ 1125.788071] RBP: 918ae887 R08: 0004 R09: 
918ae887
[ 1125.788072] R10:  R11:  R12: 

[ 1125.788073] R13: 0001 R14:  R15: 
0001
[ 1125.788075] FS:  () GS:916b8ec8() 
knlGS:
[ 1125.788075] CS:  0010 DS:  ES:  CR0: 80050033
[ 1125.788076] CR2: 0130 CR3: 001768f0 CR4: 
00340ee0
[ 1125.788077] Call Trace:
[ 1125.788080]  
[ 1125.788085]  __handle_irq_event_percpu+0x40/0x180
[ 1125.788087]  handle_irq_event_percpu+0x30/0x80
[ 1125.788089]  handle_irq_event+0x36/0x53
[ 1125.788090]  handle_edge_irq+0x82/0x190
[ 1125.788094]  handle_irq+0xbf/0x100
[ 1125.788098]  do_IRQ+0x49/0xd0
[ 1125.788100]  common_interrupt+0xf/0xf

Signed-off-by: Xianting Tian 
---
 drivers/nvme/host/pci.c | 7 +++
 1 file changed, 7 insertions(+)

diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index ba725ae47..5f1c51a43 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -960,6 +960,13 @@ static inline void nvme_handle_cqe(struct nvme_queue 
*nvmeq, u16 idx)
}
 
req = blk_mq_tag_to_rq(nvme_queue_tagset(nvmeq), cqe->command_id);
+   if (unlikely(!req)) {
+   dev_warn(nvmeq->dev->ctrl.device,
+   "req is null(tag:%d) on queue %d\n",
+   cqe->command_id, le16_to_cpu(cqe->sq_id));
+   return;
+   }
+
trace_nvme_sq(req, cqe->sq_head, nvmeq->sq_tail);
if (!nvme_end_request(req, cqe->status, cqe->result))
nvme_pci_complete_rq(req);
-- 
2.17.1



[PATCH] nvme-pci: check req to prevent crash in nvme_handle_cqe()

2020-08-31 Thread Xianting Tian
We met a crash issue when hot-inserting a nvme device: blk_mq_tag_to_rq()
returned null (req=null), then the crash happened in nvme_end_request():
struct nvme_request *rq = nvme_req(req);
rq->result = result;  <==crash here

The test env is a server configured with 2 backplanes, each backplane
supporting 8 nvme devices; this crash happened when hot-inserting a nvme
device into the second backplane. We measured the signal that is sent out
of the cpu to ack the nvme interrupt; the signal is very weak by the time
it reaches the second backplane, so the device can't recognize it as an
ack signal and therefore can't clear the interrupt flag.
After updating the related driver, the signal sent out of the cpu to the
second backplane is good, and the crash issue disappeared.

As blk_mq_tag_to_rq() may return null, we should check whether the result
is null before using it, to prevent a crash.

[ 1124.256246] nvme nvme5: pci function :e1:00.0
[ 1124.256323] nvme :e1:00.0: enabling device ( -> 0002)
[ 1125.720859] nvme nvme5: 96/0/0 default/read/poll queues
[ 1125.732483]  nvme5n1: p1 p2 p3
[ 1125.788049] BUG: unable to handle kernel NULL pointer dereference at 
0130
[ 1125.788054] PGD 0 P4D 0
[ 1125.788057] Oops: 0002 [#1] SMP NOPTI
[ 1125.788059] CPU: 50 PID: 0 Comm: swapper/50 Kdump: loaded Tainted: G 
  - -t - 4.18.0-147.el8.x86_64 #1
[ 1125.788065] RIP: 0010:nvme_irq+0xe8/0x240 [nvme]
[ 1125.788068] RSP: 0018:916b8ec83ed0 EFLAGS: 00010813
[ 1125.788069] RAX:  RBX: 918ae9211b00 RCX: 

[ 1125.788070] RDX: 400b RSI:  RDI: 

[ 1125.788071] RBP: 918ae887 R08: 0004 R09: 
918ae887
[ 1125.788072] R10:  R11:  R12: 

[ 1125.788073] R13: 0001 R14:  R15: 
0001
[ 1125.788075] FS:  () GS:916b8ec8() 
knlGS:
[ 1125.788075] CS:  0010 DS:  ES:  CR0: 80050033
[ 1125.788076] CR2: 0130 CR3: 001768f0 CR4: 
00340ee0
[ 1125.788077] Call Trace:
[ 1125.788080]  
[ 1125.788085]  __handle_irq_event_percpu+0x40/0x180
[ 1125.788087]  handle_irq_event_percpu+0x30/0x80
[ 1125.788089]  handle_irq_event+0x36/0x53
[ 1125.788090]  handle_edge_irq+0x82/0x190
[ 1125.788094]  handle_irq+0xbf/0x100
[ 1125.788098]  do_IRQ+0x49/0xd0
[ 1125.788100]  common_interrupt+0xf/0xf

Signed-off-by: Xianting Tian 
---
 drivers/nvme/host/pci.c | 11 ++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index ba725ae47..32712a41c 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -939,6 +939,7 @@ static inline void nvme_handle_cqe(struct nvme_queue 
*nvmeq, u16 idx)
 {
struct nvme_completion *cqe = &nvmeq->cqes[idx];
struct request *req;
+   struct blk_mq_tags *tags;
 
if (unlikely(cqe->command_id >= nvmeq->q_depth)) {
dev_warn(nvmeq->dev->ctrl.device,
@@ -959,7 +960,15 @@ static inline void nvme_handle_cqe(struct nvme_queue 
*nvmeq, u16 idx)
return;
}
 
-   req = blk_mq_tag_to_rq(nvme_queue_tagset(nvmeq), cqe->command_id);
+   tags = nvme_queue_tagset(nvmeq);
+   req = blk_mq_tag_to_rq(tags, cqe->command_id);
+   if (unlikely(!req)) {
+   dev_warn(nvmeq->dev->ctrl.device,
+   "req is null(tag:%d nr_tags:%d) on queue %d\n",
+   cqe->command_id, tags->nr_tags, le16_to_cpu(cqe->sq_id));
+   return;
+   }
+
trace_nvme_sq(req, cqe->sq_head, nvmeq->sq_tail);
if (!nvme_end_request(req, cqe->status, cqe->result))
nvme_pci_complete_rq(req);
-- 
2.17.1



[PATCH] scsi: qla2xxx: Fix the return value

2020-08-29 Thread Xianting Tian
A negative error code should be returned.

Signed-off-by: Xianting Tian 
---
 drivers/scsi/qla2xxx/qla_target.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/scsi/qla2xxx/qla_target.c 
b/drivers/scsi/qla2xxx/qla_target.c
index fbb80a043..612e001cc 100644
--- a/drivers/scsi/qla2xxx/qla_target.c
+++ b/drivers/scsi/qla2xxx/qla_target.c
@@ -3781,7 +3781,7 @@ int qlt_abort_cmd(struct qla_tgt_cmd *cmd)
"multiple abort. %p transport_state %x, t_state %x, "
"se_cmd_flags %x\n", cmd, cmd->se_cmd.transport_state,
cmd->se_cmd.t_state, cmd->se_cmd.se_cmd_flags);
-   return EIO;
+   return -EIO;
}
cmd->aborted = 1;
cmd->trc_flags |= TRC_ABORT;
-- 
2.17.1



[PATCH] aio: make aio wait path to account iowait time

2020-08-27 Thread Xianting Tian
g,   0 stopped,   0 zombie
%Cpu(s):  0.4 us,  0.9 sy,  0.0 ni, 98.6 id,  0.0 wa,  0.0 hi,  0.0 si, 
 0.0 st
KiB Mem : 19668915+total, 19513945+free,   879652 used,   670040 
buff/cache
KiB Swap:  4194300 total,  4194300 free,0 used. 19448636+avail 
Mem

  PID USER  PR  NIVIRTRESSHR S  %CPU %MEM TIME+ 
COMMAND
16243 root  20   0  294092  63736  63068 S   1.7  0.0   0:03.06 fio
16277 root  20   0  272336   3568   1868 S   1.7  0.0   0:03.59 fio
16287 root  20   0  272376   3564   1864 S   1.7  0.0   0:03.64 fio
16291 root  20   0  272392   3620   1868 S   1.7  0.0   0:03.63 fio
16298 root  20   0  272420   3564   1868 S   1.7  0.0   0:03.61 fio
16302 root  20   0  272436   3560   1868 S   1.7  0.0   0:03.61 fio
16303 root  20   0  272440   3552   1800 S   1.7  0.0   0:03.62 fio
16308 root  20   0  272460   3568   1864 S   1.7  0.0   0:03.60 fio
16278 root  20   0  272340   3568   1868 S   1.3  0.0   0:03.59 fio
16279 root  20   0  272344   3508   1800 S   1.3  0.0   0:03.60 fio
16280 root  20   0  272348   3564   1864 S   1.3  0.0   0:03.60 fio
16281 root  20   0  272352   3624   1872 S   1.3  0.0   0:03.57 fio
16283 root  20   0  272360   3612   1860 S   1.3  0.0   0:03.60 fio
16285 root  20   0  272368   3592   1840 S   1.3  0.0   0:03.62 fio
16286 root  20   0  272372   3580   1828 S   1.3  0.0   0:03.61 fio
16288 root  20   0  272380   3620   1868 S   1.3  0.0   0:03.55 fio
16289 root  20   0  272384   3564   1868 S   1.3  0.0   0:03.59 fio
16292 root  20   0  272396   3536   1836 S   1.3  0.0   0:03.62 fio
16293 root  20   0  272400   3624   1872 S   1.3  0.0   0:03.63 fio
16295 root  20   0  272408   3620   1868 S   1.3  0.0   0:03.61 fio
16297 root  20   0  272416   3568   1868 S   1.3  0.0   0:03.62 fio
16300 root  20   0  272428   3564   1864 R   1.3  0.0   0:03.61 fio
16304 root  20   0  272444   3564   1864 S   1.3  0.0   0:03.59 fio
16305 root  20   0  272448   3456   1760 S   1.3  0.0   0:03.65 fio
16307 root  20   0  272456   3568   1864 S   1.3  0.0   0:03.64 fio
16282 root  20   0  272356   3556   1860 S   1.0  0.0   0:03.55 fio
16284 root  20   0  272364   3612   1860 S   1.0  0.0   0:03.57 fio
16290 root  20   0  272388   3616   1864 S   1.0  0.0   0:03.54 fio
16294 root  20   0  272404   3624   1872 S   1.0  0.0   0:03.60 fio
16296 root  20   0  272412   3564   1864 S   1.0  0.0   0:03.60 fio
16299 root  20   0  272424   3540   1840 S   1.0  0.0   0:03.62 fio
16301 root  20   0  272432   3568   1868 S   1.0  0.0   0:03.63 fio
16306 root  20   0  272452   3624   1872 S   1.0  0.0   0:03.60 fio

Signed-off-by: Xianting Tian 
---
 fs/aio.c |  2 +-
 include/linux/wait.h | 26 ++
 2 files changed, 23 insertions(+), 5 deletions(-)

diff --git a/fs/aio.c b/fs/aio.c
index 5736bff48..8d00548e0 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -1290,7 +1290,7 @@ static long read_events(struct kioctx *ctx, long min_nr, 
long nr,
if (until == 0)
aio_read_events(ctx, min_nr, nr, event, &ret);
else
-   wait_event_interruptible_hrtimeout(ctx->wait,
+   io_wait_event_hrtimeout(ctx->wait,
aio_read_events(ctx, min_nr, nr, event, &ret),
until);
return ret;
diff --git a/include/linux/wait.h b/include/linux/wait.h
index 898c890fc..fb5902a25 100644
--- a/include/linux/wait.h
+++ b/include/linux/wait.h
@@ -312,6 +312,13 @@ do {   
\
(void)___wait_event(wq_head, condition, TASK_UNINTERRUPTIBLE, 0, 0, 
\
io_schedule())
 
+#define __io_wait_event_hrtimeout(wq_head, condition, timeout) \
+({ \
+   int __ret = 0; \
+   __ret = __wait_event_hrtimeout(wq_head, condition, timeout, \
+   TASK_UNINTERRUPTIBLE, io_schedule()); \
+})
+
 /*
  * io_wait_event() -- like wait_event() but with io_schedule()
  */
@@ -323,6 +330,15 @@ do {   
\
__io_wait_event(wq_head, condition);
\
 } while (0)
 
+
+#define io_wait_event_hrtimeout(wq_head, condition, timeout) \
+do {

[PATCH] [v3] blk-mq: use BLK_MQ_NO_TAG for no tag

2020-08-27 Thread Xianting Tian
Replace various magic -1 constants for tags with BLK_MQ_NO_TAG.

Signed-off-by: Xianting Tian 
---
 block/blk-core.c | 4 ++--
 block/blk-mq-sched.c | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/block/blk-core.c b/block/blk-core.c
index d9d632639..c7eaf7504 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -116,8 +116,8 @@ void blk_rq_init(struct request_queue *q, struct request 
*rq)
rq->__sector = (sector_t) -1;
INIT_HLIST_NODE(&rq->hash);
RB_CLEAR_NODE(&rq->rb_node);
-   rq->tag = -1;
-   rq->internal_tag = -1;
+   rq->tag = BLK_MQ_NO_TAG;
+   rq->internal_tag = BLK_MQ_NO_TAG;
rq->start_time_ns = ktime_get_ns();
rq->part = NULL;
refcount_set(&rq->ref, 1);
diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c
index a19cdf159..439481f59 100644
--- a/block/blk-mq-sched.c
+++ b/block/blk-mq-sched.c
@@ -522,7 +522,7 @@ void blk_mq_sched_insert_request(struct request *rq, bool 
at_head,
goto run;
}
 
-   WARN_ON(e && (rq->tag != -1));
+   WARN_ON(e && (rq->tag != BLK_MQ_NO_TAG));
 
if (blk_mq_sched_bypass_insert(hctx, !!e, rq)) {
/*
-- 
2.17.1



[PATCH] [v2] blk-mq: use BLK_MQ_NO_TAG for no tag

2020-08-25 Thread Xianting Tian
Replace various magic -1 constants for tags with BLK_MQ_NO_TAG.
And move the definition of BLK_MQ_NO_TAG from 'block/blk-mq-tag.h'
to 'include/linux/blk-mq.h'

Signed-off-by: Xianting Tian 
---
 block/blk-core.c   | 4 ++--
 block/blk-mq-sched.c   | 2 +-
 block/blk-mq-tag.h | 6 --
 include/linux/blk-mq.h | 8 +++-
 4 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/block/blk-core.c b/block/blk-core.c
index d9d632639..c7eaf7504 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -116,8 +116,8 @@ void blk_rq_init(struct request_queue *q, struct request 
*rq)
rq->__sector = (sector_t) -1;
INIT_HLIST_NODE(&rq->hash);
RB_CLEAR_NODE(&rq->rb_node);
-   rq->tag = -1;
-   rq->internal_tag = -1;
+   rq->tag = BLK_MQ_NO_TAG;
+   rq->internal_tag = BLK_MQ_NO_TAG;
rq->start_time_ns = ktime_get_ns();
rq->part = NULL;
refcount_set(&rq->ref, 1);
diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c
index a19cdf159..439481f59 100644
--- a/block/blk-mq-sched.c
+++ b/block/blk-mq-sched.c
@@ -522,7 +522,7 @@ void blk_mq_sched_insert_request(struct request *rq, bool 
at_head,
goto run;
}
 
-   WARN_ON(e && (rq->tag != -1));
+   WARN_ON(e && (rq->tag != BLK_MQ_NO_TAG));
 
if (blk_mq_sched_bypass_insert(hctx, !!e, rq)) {
/*
diff --git a/block/blk-mq-tag.h b/block/blk-mq-tag.h
index b1acac518..8fc48aa72 100644
--- a/block/blk-mq-tag.h
+++ b/block/blk-mq-tag.h
@@ -45,12 +45,6 @@ static inline struct sbq_wait_state *bt_wait_ptr(struct 
sbitmap_queue *bt,
return sbq_wait_ptr(bt, &hctx->wait_index);
 }
 
-enum {
-   BLK_MQ_NO_TAG   = -1U,
-   BLK_MQ_TAG_MIN  = 1,
-   BLK_MQ_TAG_MAX  = BLK_MQ_NO_TAG - 1,
-};
-
 extern bool __blk_mq_tag_busy(struct blk_mq_hw_ctx *);
 extern void __blk_mq_tag_idle(struct blk_mq_hw_ctx *);
 
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 9d2d5ad36..2499d8aae 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -209,6 +209,12 @@ enum hctx_type {
HCTX_MAX_TYPES,
 };
 
+enum {
+   BLK_MQ_NO_TAG   = -1U,
+   BLK_MQ_TAG_MIN  = 1,
+   BLK_MQ_TAG_MAX  = BLK_MQ_NO_TAG - 1,
+};
+
 /**
  * struct blk_mq_tag_set - tag set that can be shared between request queues
  * @map:  One or more ctx -> hctx mappings. One map exists for each
@@ -569,7 +575,7 @@ static inline void *blk_mq_rq_to_pdu(struct request *rq)
 static inline blk_qc_t request_to_qc_t(struct blk_mq_hw_ctx *hctx,
struct request *rq)
 {
-   if (rq->tag != -1)
+   if (rq->tag != BLK_MQ_NO_TAG)
return rq->tag | (hctx->queue_num << BLK_QC_T_SHIFT);
 
return rq->internal_tag | (hctx->queue_num << BLK_QC_T_SHIFT) |
-- 
2.17.1



[PATCH] blk-mq: use BLK_MQ_NO_TAG for no tag

2020-08-23 Thread Xianting Tian
Replace various magic -1 constants for tags with BLK_MQ_NO_TAG.

Signed-off-by: Xianting Tian 
---
 block/blk-core.c   | 4 ++--
 block/blk-mq-sched.c   | 2 +-
 include/linux/blk-mq.h | 2 +-
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/block/blk-core.c b/block/blk-core.c
index d9d632639..c7eaf7504 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -116,8 +116,8 @@ void blk_rq_init(struct request_queue *q, struct request 
*rq)
rq->__sector = (sector_t) -1;
INIT_HLIST_NODE(&rq->hash);
RB_CLEAR_NODE(&rq->rb_node);
-   rq->tag = -1;
-   rq->internal_tag = -1;
+   rq->tag = BLK_MQ_NO_TAG;
+   rq->internal_tag = BLK_MQ_NO_TAG;
rq->start_time_ns = ktime_get_ns();
rq->part = NULL;
refcount_set(&rq->ref, 1);
diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c
index a19cdf159..439481f59 100644
--- a/block/blk-mq-sched.c
+++ b/block/blk-mq-sched.c
@@ -522,7 +522,7 @@ void blk_mq_sched_insert_request(struct request *rq, bool 
at_head,
goto run;
}
 
-   WARN_ON(e && (rq->tag != -1));
+   WARN_ON(e && (rq->tag != BLK_MQ_NO_TAG));
 
if (blk_mq_sched_bypass_insert(hctx, !!e, rq)) {
/*
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 9d2d5ad36..161d8a0e6 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -569,7 +569,7 @@ static inline void *blk_mq_rq_to_pdu(struct request *rq)
 static inline blk_qc_t request_to_qc_t(struct blk_mq_hw_ctx *hctx,
struct request *rq)
 {
-   if (rq->tag != -1)
+   if (rq->tag != BLK_MQ_NO_TAG)
return rq->tag | (hctx->queue_num << BLK_QC_T_SHIFT);
 
return rq->internal_tag | (hctx->queue_num << BLK_QC_T_SHIFT) |
-- 
2.17.1



[PATCH] btrfs: prevent hung check firing during long sync IO

2020-08-19 Thread Xianting Tian
For sync and flush io, it may take a long time to complete.
So it's better to use wait_for_completion_io_timeout() in a
while loop to prevent the hung task check from firing and crashing
the system (when /proc/sys/kernel/hung_task_panic is set).

This is similar to how the hung task check is prevented in
submit_bio_wait() and blk_execute_rq().

Signed-off-by: Xianting Tian 
---
 fs/btrfs/disk-io.c | 12 +++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 9ae25f632..1eb560de0 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -17,6 +17,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include "ctree.h"
@@ -3699,12 +3700,21 @@ static void write_dev_flush(struct btrfs_device *device)
 static blk_status_t wait_dev_flush(struct btrfs_device *device)
 {
struct bio *bio = device->flush_bio;
+   unsigned long hang_check;
 
if (!test_bit(BTRFS_DEV_STATE_FLUSH_SENT, &device->dev_state))
return BLK_STS_OK;
 
clear_bit(BTRFS_DEV_STATE_FLUSH_SENT, &device->dev_state);
-   wait_for_completion_io(&device->flush_wait);
+
+   /* Prevent hang_check timer from firing at us during very long I/O */
+   hang_check = sysctl_hung_task_timeout_secs;
+   if (hang_check)
+   while (!wait_for_completion_io_timeout(&device->flush_wait,
+   hang_check * (HZ/2)))
+   ;
+   else
+   wait_for_completion_io(&device->flush_wait);
 
return bio->bi_status;
 }
-- 
2.17.1



[PATCH] mm/memory-failure: do pgoff calculation before for_each_process()

2020-08-18 Thread Xianting Tian
There is no need to calculate pgoff in each iteration of for_each_process(),
so move it to before the for_each_process() loop, which can save some
CPU cycles.

Signed-off-by: Xianting Tian 
---
 mm/memory-failure.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 47b8ccb1f..7dc2c9d3b 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -484,11 +484,12 @@ static void collect_procs_file(struct page *page, struct 
list_head *to_kill,
struct vm_area_struct *vma;
struct task_struct *tsk;
struct address_space *mapping = page->mapping;
+   pgoff_t pgoff;
 
i_mmap_lock_read(mapping);
read_lock(&tasklist_lock);
+   pgoff = page_to_pgoff(page);
for_each_process(tsk) {
-   pgoff_t pgoff = page_to_pgoff(page);
struct task_struct *t = task_early_kill(tsk, force_early);
 
if (!t)
-- 
2.17.1



[PATCH] md: only calculate blocksize once and use i_blocksize()

2020-08-17 Thread Xianting Tian
We already have the interface i_blocksize(), which can be used
to get the blocksize, so use it.
Only calculate the blocksize once and use it within read_page().

Signed-off-by: Xianting Tian 
---
 drivers/md/md-bitmap.c | 7 ---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c
index 95a5f3757..0d5544868 100644
--- a/drivers/md/md-bitmap.c
+++ b/drivers/md/md-bitmap.c
@@ -357,11 +357,12 @@ static int read_page(struct file *file, unsigned long 
index,
struct inode *inode = file_inode(file);
struct buffer_head *bh;
sector_t block, blk_cur;
+   unsigned long blocksize = i_blocksize(inode);
 
pr_debug("read bitmap file (%dB @ %llu)\n", (int)PAGE_SIZE,
 (unsigned long long)index << PAGE_SHIFT);
 
-   bh = alloc_page_buffers(page, 1<i_blkbits, false);
+   bh = alloc_page_buffers(page, blocksize, false);
if (!bh) {
ret = -ENOMEM;
goto out;
@@ -383,10 +384,10 @@ static int read_page(struct file *file, unsigned long 
index,
 
bh->b_blocknr = block;
bh->b_bdev = inode->i_sb->s_bdev;
-   if (count < (1<i_blkbits))
+   if (count < blocksize)
count = 0;
else
-   count -= (1<i_blkbits);
+   count -= blocksize;
 
bh->b_end_io = end_bitmap_write;
bh->b_private = bitmap;
-- 
2.17.1



[PATCH] exfat: use i_blocksize() to get blocksize

2020-08-15 Thread Xianting Tian
We already have the interface i_blocksize() to get the blocksize,
so use it.
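For reference, i_blocksize() (include/linux/fs.h) is a helper for exactly
this open-coded shift:

	static inline unsigned int i_blocksize(const struct inode *node)
	{
		return (1 << node->i_blkbits);
	}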

Signed-off-by: Xianting Tian 
---
 fs/exfat/file.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/exfat/file.c b/fs/exfat/file.c
index a6a063830..163b599db 100644
--- a/fs/exfat/file.c
+++ b/fs/exfat/file.c
@@ -226,7 +226,7 @@ void exfat_truncate(struct inode *inode, loff_t size)
 {
struct super_block *sb = inode->i_sb;
struct exfat_sb_info *sbi = EXFAT_SB(sb);
-   unsigned int blocksize = 1 << inode->i_blkbits;
+   unsigned int blocksize = i_blocksize(inode);
loff_t aligned_size;
int err;
 
-- 
2.17.1



[PATCH] sched: Remove useless settings when 'tg == d->tg'

2020-08-14 Thread Xianting Tian
If 'tg == d->tg' is true, the previous settings for period and runtime
are unnecessary. So move the settings to the 'else' branch.

Signed-off-by: Xianting Tian 
---
 kernel/sched/rt.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index f395ddb75..8b5505735 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -2504,12 +2504,12 @@ static int tg_rt_schedulable(struct task_group *tg, 
void *data)
unsigned long total, sum = 0;
u64 period, runtime;
 
-   period = ktime_to_ns(tg->rt_bandwidth.rt_period);
-   runtime = tg->rt_bandwidth.rt_runtime;
-
if (tg == d->tg) {
period = d->rt_period;
runtime = d->rt_runtime;
+   } else {
+   period = ktime_to_ns(tg->rt_bandwidth.rt_period);
+   runtime = tg->rt_bandwidth.rt_runtime;
}
 
/*
-- 
2.17.1



[PATCH] tracing: use __this_cpu_read() in trace_buffered_event_enable()

2020-08-13 Thread Xianting Tian
The code is executed with preemption disabled, so it's
safe to use __this_cpu_read().
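As a minimal illustration (my_counter is a hypothetical per-cpu variable):
the this_cpu_*() accessors provide their own protection against preemption,
while the __this_cpu_*() variants assume the caller has already disabled it,
which is why they are the cheaper choice here.

	DEFINE_PER_CPU(int, my_counter);	/* hypothetical example variable */

	static void read_counter_example(void)
	{
		int val;

		preempt_disable();			/* caller provides protection */
		val = __this_cpu_read(my_counter);	/* no extra preemption handling */
		preempt_enable();

		val = this_cpu_read(my_counter);	/* safe with preemption enabled */
		(void)val;
	}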

Signed-off-by: Xianting Tian 
---
 kernel/trace/trace.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index bb6226972..7d0d71ce9 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -2514,7 +2514,7 @@ void trace_buffered_event_enable(void)
 
preempt_disable();
if (cpu == smp_processor_id() &&
-   this_cpu_read(trace_buffered_event) !=
+   __this_cpu_read(trace_buffered_event) !=
per_cpu(trace_buffered_event, cpu))
WARN_ON_ONCE(1);
preempt_enable();
-- 
2.17.1



[PATCH] block: don't read block device if it's invalid

2020-08-11 Thread Xianting Tian
We found several processes in 'D' state after an nvme device was hot-removed.
From the call traces below, we can see process 848 got the lock
'bdev->bd_mutex' in blkdev_reread_part(), but was scheduled out waiting for
IO to be done. The IO won't be completed as the device is hot-removed, so
the lock 'bdev->bd_mutex' can never be unlocked. As a result, other
processes which need to take the same lock 'bdev->bd_mutex' are blocked
on it.

When an nvme device is hot-removed, the kernel starts a thread to handle
the task of removing the nvme device, as the call trace of process 504
below shows. I listed the call trace of nvme_kill_queues() in detail below;
we can see 'NVME_NS_DEAD' is set, then when executing
nvme_revalidate_disk(), it finds 'NVME_NS_DEAD' is set and
'set_capacity(disk, 0)' is called to set the disk capacity to 0.
nvme_kill_queues()
if (test_and_set_bit(NVME_NS_DEAD, &ns->flags)) return;
revalidate_disk(disk)
disk->fops->revalidate_disk(disk) <== for nvme device, revalidate_disk=nvme_revalidate_disk()
 mutex_lock(&bdev->bd_mutex)

This patch is to reduce the probability of such a problem. Before taking
the lock 'bdev->bd_mutex' in blkdev_reread_part(), add code to check
whether the capacity of the disk is 0 and, if so, just return. Then we can
avoid the issue happening as follows:
the nvme device is hot-removed and its capacity is already set to 0; then
if there is a process like 848 that wants to read the device, it will
return directly in blkdev_reread_part() and will not take the lock
"bdev->bd_mutex", which could otherwise never be unlocked by the process
itself as the IO can't be completed.

cat /proc/848/stack
[] io_schedule+0x16/0x40
[] do_read_cache_page+0x3ee/0x5e0
[] read_cache_page+0x15/0x20
[] read_dev_sector+0x2d/0xa0
[] read_lba+0x104/0x1c0
[] find_valid_gpt+0xfa/0x720
[] efi_partition+0x89/0x430
[] check_partition+0x100/0x1f0
[] rescan_partitions+0xb4/0x360
[] __blkdev_reread_part+0x64/0x70
[] blkdev_reread_part+0x23/0x40  
<<==mutex_lock(&bdev->bd_mutex);
[] blkdev_ioctl+0x44b/0x8e0
[] block_ioctl+0x41/0x50
[] do_vfs_ioctl+0xa7/0x5e0
[] SyS_ioctl+0x79/0x90
[] entry_SYSCALL_64_fastpath+0x1f/0xb9
[] 0x

cat /proc/504/stack
[] revalidate_disk+0x49/0x80  <<==mutex_lock(&bdev->bd_mutex);
[] nvme_kill_queues+0x52/0x80 [nvme_core]
[] nvme_remove_namespaces+0x44/0x50 [nvme_core]
[] nvme_remove+0x85/0x130 [nvme]
[] pci_device_remove+0x39/0xc0
[] device_release_driver_internal+0x141/0x210
[] device_release_driver+0x12/0x20
[] pci_stop_bus_device+0x8c/0xa0
[] pci_stop_and_remove_bus_device+0x12/0x20
[] pciehp_unconfigure_device+0x7a/0x1e0
[] pciehp_disable_slot+0x52/0xd0
[] pciehp_power_thread+0x8a/0xb0
[] process_one_work+0x14e/0x370
[] worker_thread+0x4d/0x3f0
[] kthread+0x109/0x140
[] ret_from_fork+0x2a/0x40
[] 0x

cat /proc/1197767/stack
[] __blkdev_get+0x6e/0x450  
<<==mutex_lock_nested(&bdev->bd_mutex, for_part);
[] blkdev_get+0x1a4/0x300
[] blkdev_open+0x7a/0xa0
[] do_dentry_open+0x20f/0x330
[] vfs_open+0x50/0x70
[] path_openat+0x548/0x13b0
[] do_filp_open+0x91/0x100
[] do_sys_open+0x124/0x210
[] SyS_open+0x1e/0x20
[] do_syscall_64+0x6c/0x1b0
[] entry_SYSCALL64_slow_path+0x25/0x25
[] 0x

ps -eo pid,comm,state | grep '  D'
848 systemd-udevd   D
504 kworker/10:1D
1197767 isdct   D
1198830 isdct   D
1580322 xxd D
1616804 kworker/10:0D
1626264 isdct   D
1734726 kworker/10:2D
2197993 isdct   D
2662117 xxd D
3083718 xxd D
3189834 xxd D

Signed-off-by: Xianting Tian 
---
 block/ioctl.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/block/ioctl.c b/block/ioctl.c
index bdb3bbb..159bceb 100644
--- a/block/ioctl.c
+++ b/block/ioctl.c
@@ -94,6 +94,9 @@ static int blkdev_reread_part(struct block_device *bdev)
 {
int ret;
 
+   if (unlikely(!get_capacity(bdev->bd_disk)))
+   return -EIO;
+
if (!disk_part_scan_enabled(bdev->bd_disk) || bdev != bdev->bd_contains)
return -EINVAL;
if (!capable(CAP_SYS_ADMIN))
-- 
1.8.3.1



[PATCH] ext4: use kmemdup_nul() instead of kstrndup()

2020-08-08 Thread Xianting Tian
kmemdup_nul() is more efficient than kstrndup() if
the size is known exactly.

The description of kstrndup() already suggested:
Note: Use kmemdup_nul() instead if the size is known exactly.
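A simplified comparison of the two helpers (modelled on mm/util.c, not
verbatim; the _sketch suffix is added here to avoid confusion with the real
functions): kstrndup() must first scan the source with strnlen(), while
kmemdup_nul() copies exactly the given number of bytes and terminates it.

	char *kstrndup_sketch(const char *s, size_t max, gfp_t gfp)
	{
		size_t len = strnlen(s, max);	/* extra scan of the source */
		char *buf = kmalloc(len + 1, gfp);

		if (buf) {
			memcpy(buf, s, len);
			buf[len] = '\0';
		}
		return buf;
	}

	char *kmemdup_nul_sketch(const char *s, size_t len, gfp_t gfp)
	{
		char *buf = kmalloc(len + 1, gfp);	/* size already known */

		if (buf) {
			memcpy(buf, s, len);
			buf[len] = '\0';
		}
		return buf;
	}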

Signed-off-by: Xianting Tian 
---
 fs/ext4/super.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 330957e..be37556 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -4016,7 +4016,7 @@ static int ext4_fill_super(struct super_block *sb, void 
*data, int silent)
}
 
if (sbi->s_es->s_mount_opts[0]) {
-   char *s_mount_opts = kstrndup(sbi->s_es->s_mount_opts,
+   char *s_mount_opts = kmemdup_nul(sbi->s_es->s_mount_opts,
  sizeof(sbi->s_es->s_mount_opts),
  GFP_KERNEL);
if (!s_mount_opts)
-- 
1.8.3.1



[PATCH] blkcg: add plugging support for punt bio

2020-08-06 Thread Xianting Tian
Try to merge continuous bios into the current task's plug first.

Signed-off-by: Xianting Tian 
---
 block/blk-cgroup.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 0ecc897..fe5d361 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -122,6 +122,7 @@ static void blkg_async_bio_workfn(struct work_struct *work)
 async_bio_work);
struct bio_list bios = BIO_EMPTY_LIST;
struct bio *bio;
+   struct blk_plug plug;
 
/* as long as there are pending bios, @blkg can't go away */
spin_lock_bh(&blkg->async_bio_lock);
@@ -129,8 +130,10 @@ static void blkg_async_bio_workfn(struct work_struct *work)
bio_list_init(&blkg->async_bios);
spin_unlock_bh(&blkg->async_bio_lock);
 
+   blk_start_plug(&plug);
while ((bio = bio_list_pop(&bios)))
submit_bio(bio);
+   blk_finish_plug(&plug);
 }
 
 /**
-- 
1.8.3.1



[PATCH] aio: use wait_for_completion_io() when waiting for completion of io

2020-08-05 Thread Xianting Tian
When waiting for the completion of io, we need to account iowait time.
wait_for_completion() calls schedule_timeout(), which doesn't account
iowait time, while wait_for_completion_io() calls io_schedule_timeout(),
which does.

So use wait_for_completion_io() instead of wait_for_completion()
when waiting for the completion of io before exit_aio and io_destroy.
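
For reference, the accounting difference comes from the underlying sleep
primitive; a simplified sketch of the mainline helper in kernel/sched/core.c
(not part of this patch):

        long io_schedule_timeout(long timeout)
        {
                int token;
                long ret;

                token = io_schedule_prepare();  /* sets current->in_iowait = 1 */
                ret = schedule_timeout(timeout);
                io_schedule_finish(token);      /* restores the previous in_iowait */

                return ret;
        }

wait_for_completion_io() sleeps through this path, so the time is charged to
iowait; wait_for_completion() sleeps through plain schedule_timeout() and the
time shows up as ordinary sleep.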

Signed-off-by: Xianting Tian 
---
 fs/aio.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/fs/aio.c b/fs/aio.c
index 91e7cc4..498b8a0 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -892,7 +892,7 @@ void exit_aio(struct mm_struct *mm)
 
if (!atomic_sub_and_test(skipped, &wait.count)) {
/* Wait until all IO for the context are done. */
-   wait_for_completion(&wait.comp);
+   wait_for_completion_io(&wait.comp);
}
 
RCU_INIT_POINTER(mm->ioctx_table, NULL);
@@ -1400,7 +1400,7 @@ static long read_events(struct kioctx *ctx, long min_nr, 
long nr,
 * is destroyed.
 */
if (!ret)
-   wait_for_completion(&wait.comp);
+   wait_for_completion_io(&wait.comp);
 
return ret;
}
-- 
1.8.3.1



[PATCH] mm: use blk_io_schedule() for avoiding task hung in sync io

2020-08-03 Thread Xianting Tian
swap_readpage() does synchronous io for one page. The io is not big and
normally finishes quickly, but it may take a long time or wait forever
in case of io failure or discard.
This patch uses blk_io_schedule() instead of io_schedule() to avoid a
task hung and crash (when /proc/sys/kernel/hung_task_panic is set) if the
above exception occurs.
Task hungs are already prevented this way in submit_bio_wait(),
blk_execute_rq() and __blkdev_direct_IO().
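
For context, blk_io_schedule() bounds each sleep to stay below the hung-task
timeout, which is why it cannot trip the detector; a simplified sketch of
block/blk-core.c in recent kernels:

        void blk_io_schedule(void)
        {
                /* Prevent hang_check timer from firing at us during very long I/O */
                unsigned long timeout = sysctl_hung_task_timeout_secs * HZ / 2;

                if (timeout)
                        io_schedule_timeout(timeout);
                else
                        io_schedule();
        }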

Signed-off-by: Xianting Tian 
---
 mm/page_io.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mm/page_io.c b/mm/page_io.c
index e8726f3..5d52f7b 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -424,7 +424,7 @@ int swap_readpage(struct page *page, bool synchronous)
break;
 
if (!blk_poll(disk->queue, qc, true))
-   io_schedule();
+   blk_io_schedule();
}
__set_current_state(TASK_RUNNING);
bio_put(bio);
-- 
1.8.3.1



[PATCH] ext4: move buffer_mapped() to proper position

2020-07-31 Thread Xianting Tian
As you know, commit a17712c8 added the code below to avoid a
crash ('BUG_ON(!buffer_mapped(bh))' in submit_bh_wbc()) when a
device is hot-removed (a physical device is unplugged from a pcie slot
or an nbd device's network is shut down).
static int ext4_commit_super():
if (!sbh || block_device_ejected(sb))
return error;
+
+   /*
+* The superblock bh should be mapped, but it might not be if the
+* device was hot-removed. Not much we can do but fail the I/O.
+*/
+   if (!buffer_mapped(sbh))
+   return error;

And the call trace which leads to the crash is as below:
ext4_commit_super()
  __sync_dirty_buffer()
submit_bh()
  submit_bh_wbc()
BUG_ON(!buffer_mapped(bh));

But recently we met the same crash (with very low probability) when a
device was hot-removed, even though the kernel already contained the
above protection code. Still, the crash is caused by
'BUG_ON(!buffer_mapped(bh))' in submit_bh_wbc(), with the same
call trace as below.

As I understand it, and as the code path below shows, quite a bit of
code still runs between the 'buffer_mapped(sbh)' check (added by
commit a17712c8) and 'BUG_ON(!buffer_mapped(bh))' in
submit_bh_wbc(); in particular, lock_buffer() is called twice (and
sometimes it takes even longer to get the lock). So when testing
device hot-remove, there is a low probability that sbh is mapped
when the 'buffer_mapped(sbh)' check (added by commit a17712c8) runs,
but is no longer mapped when 'BUG_ON(!buffer_mapped(bh))' runs
in submit_bh_wbc().
Code path:
ext4_commit_super
judge if 'buffer_mapped(sbh)' is false, return <== commit a17712c8
  lock_buffer(sbh)
  ...
  unlock_buffer(sbh)
   __sync_dirty_buffer(sbh,...
lock_buffer(sbh)
judge if 'buffer_mapped(sbh)' is false, return <== added by this patch
submit_bh(...,sbh)
submit_bh_wbc(...,sbh,...)

This patch is to move the check of 'buffer_mapped(sbh)' to the place just
before calling 'BUG_ON(!buffer_mapped(bh))' in submit_bh_wbc().

[100722.966497] kernel BUG at fs/buffer.c:3095! <== 'BUG_ON(!buffer_mapped(bh))' in submit_bh_wbc()
[100722.966503] invalid opcode:  [#1] SMP
[100722.966566] task: 8817e15a9e40 task.stack: c90024744000
[100722.966574] RIP: 0010:submit_bh_wbc+0x180/0x190
[100722.966575] RSP: 0018:c90024747a90 EFLAGS: 00010246
[100722.966576] RAX: 00620005 RBX: 8818a80603a8 RCX: 

[100722.966576] RDX: 8818a80603a8 RSI: 00020800 RDI: 
0001
[100722.966577] RBP: c90024747ac0 R08:  R09: 
88207f94170d
[100722.966578] R10: 000437c8 R11: 0001 R12: 
00020800
[100722.966578] R13: 0001 R14: 0bf9a438 R15: 
88195f333000
[100722.966580] FS:  7fa2eee27700() GS:88203d84() 
knlGS:
[100722.966580] CS:  0010 DS:  ES:  CR0: 80050033
[100722.966581] CR2: 00f0b008 CR3: 00201a622003 CR4: 
007606e0
[100722.966582] DR0:  DR1:  DR2: 

[100722.966583] DR3:  DR6: fffe0ff0 DR7: 
0400
[100722.966583] PKRU: 5554
[100722.966583] Call Trace:
[100722.966588]  __sync_dirty_buffer+0x6e/0xd0
[100722.966614]  ext4_commit_super+0x1d8/0x290 [ext4]
[100722.966626]  __ext4_std_error+0x78/0x100 [ext4]
[100722.966635]  ? __ext4_journal_get_write_access+0xca/0x120 [ext4]
[100722.966646]  ext4_reserve_inode_write+0x58/0xb0 [ext4]
[100722.966655]  ? ext4_dirty_inode+0x48/0x70 [ext4]
[100722.93]  ext4_mark_inode_dirty+0x53/0x1e0 [ext4]
[100722.966671]  ? __ext4_journal_start_sb+0x6d/0xf0 [ext4]
[100722.966679]  ext4_dirty_inode+0x48/0x70 [ext4]
[100722.966682]  __mark_inode_dirty+0x17f/0x350
[100722.966686]  generic_update_time+0x87/0xd0
[100722.966687]  touch_atime+0xa9/0xd0
[100722.966690]  generic_file_read_iter+0xa09/0xcd0
[100722.966694]  ? page_cache_tree_insert+0xb0/0xb0
[100722.966704]  ext4_file_read_iter+0x4a/0x100 [ext4]
[100722.966707]  ? __inode_security_revalidate+0x4f/0x60
[100722.966709]  __vfs_read+0xec/0x160
[100722.966711]  vfs_read+0x8c/0x130
[100722.966712]  SyS_pread64+0x87/0xb0
[100722.966716]  do_syscall_64+0x67/0x1b0
[100722.966719]  entry_SYSCALL64_slow_path+0x25/0x25

Signed-off-by: Xianting Tian 
---
 fs/buffer.c | 9 +
 fs/ext4/super.c | 7 ---
 2 files changed, 9 insertions(+), 7 deletions(-)

diff --git a/fs/buffer.c b/fs/buffer.c
index 64fe82e..75a8849 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -3160,6 +3160,15 @@ int __sync_dirty_buffer(struct buffer_head *bh, int 
op_flags)
WARN_ON(atomic_read(&bh->b_count) < 1);
lock_buffer(bh);
if (test_clear_buffer_dirty(b

[PATCH] ext4: check superblock mapped prior to get write access

2020-07-28 Thread Xianting Tian
One crash issue happened when directly bringing down the network
interface that an nbd device is connected to. The kernel version is
4.14.0-115.
According to the debug log and call trace, the buffer of the ext4
superblock was already unmapped after the network of the nbd device went
down, but the code continued to run until it crashed.
I checked the latest kernel code (5.8-rc7) based on the call trace:
no function checks whether the buffer of the ext4 superblock is unmapped.
The patch is similar to commit 742b06b; it aims to check that the
superblock is mapped prior to getting write access.

The crash reason is described below:
struct journal_head *jbd2_journal_add_journal_head(struct buffer_head *bh)
{
... ...
jbd_lock_bh_journal_head(bh);
if (buffer_jbd(bh)) {
jh = bh2jh(bh); <<== jh is null!!!
} else {
... ...
}
jh->b_jcount++; <<==crash here
jbd_unlock_bh_journal_head(bh);
... ...
}

Debug code added to __ext4_journal_get_write_access:
int __ext4_journal_get_write_access(const char *where, unsigned int line,
handle_t *handle, struct buffer_head *bh)
{
int err = 0;

might_sleep();

if (ext4_handle_valid(handle)) {
struct super_block *sb;
struct buffer_head *sbh;

sb = handle->h_transaction->t_journal->j_private;
if (unlikely(ext4_forced_shutdown(EXT4_SB(sb {
jbd2_journal_abort_handle(handle);
return -EIO;
}

sbh = EXT4_SB(sb)->s_sbh;
if (!buffer_mapped(sbh)) {
ext4 sb bh not mapped\n");  <<==debug code
}

err = jbd2_journal_get_write_access(handle, bh);
if (err)
ext4_journal_abort_handle(where, line, __func__, bh,
  handle, err);
}
return err;
}

Call trace of crash:
[ 1715.669527] print_req_error: I/O error, dev nbd3, sector 42211904

[ 1715.674940] ext4 sb bh not mapped   <<== debug log, added and printed by the function "__ext4_journal_get_write_access"

[ 1715.674946] BUG: unable to handle kernel NULL pointer dereference at 
0008
[ 1715.674955] IP: jbd2_journal_add_journal_head+0x9d/0x110 [jbd2]
[ 1715.674956] PGD 2010004067 P4D 2010004067 PUD 201000b067 PMD 0
[ 1715.674961] Oops: 0002 [#1] SMP
[ 1715.675020] task: 8808a4d3dac0 task.stack: c9002e78c000
[ 1715.675024] RIP: 0010:jbd2_journal_add_journal_head+0x9d/0x110 [jbd2] <== the crash is caused here
[ 1715.675025] RSP: 0018:c9002e78fb50 EFLAGS: 00010206
[ 1715.675026] RAX:  RBX: 8816b71cad00 RCX: 
[ 1715.675026] RDX:  RSI: 8816b71cad00 RDI: 8816b71cad00
[ 1715.675027] RBP: c9002e78fb58 R08: 001b R09: 88207f82fe07
[ 1715.675028] R10: 113d R11:  R12: 8820223a5ab0
[ 1715.675028] R13:  R14: 8816b71cad00 R15: 88196053d930
[ 1715.675029] FS:  7fc2ce9e9700() GS:88203d74() 
knlGS:
[ 1715.675030] CS:  0010 DS:  ES:  CR0: 80050033
[ 1715.675031] CR2: 0008 CR3: 002016d2c004 CR4: 007606e0
[ 1715.675033] DR0:  DR1:  DR2: 
[ 1715.675034] DR3:  DR6: fffe0ff0 DR7: 0400
[ 1715.675034] PKRU: 5554
[ 1715.675035] Call Trace:
[ 1715.675041]  jbd2_journal_get_write_access+0x6c/0xc0 [jbd2]
[ 1715.675057]  __ext4_journal_get_write_access+0x8f/0x120 [ext4]
[ 1715.675069]  ext4_reserve_inode_write+0x7b/0xb0 [ext4]
[ 1715.675079]  ? ext4_dirty_inode+0x48/0x70 [ext4]
[ 1715.675088]  ext4_mark_inode_dirty+0x53/0x1e0 [ext4]
[ 1715.675096]  ? __ext4_journal_start_sb+0x6d/0xf0 [ext4]
[ 1715.675104]  ext4_dirty_inode+0x48/0x70 [ext4]
[ 1715.675111]  __mark_inode_dirty+0x17f/0x350
[ 1715.675116]  generic_update_time+0x87/0xd0
[ 1715.675119]  file_update_time+0xbc/0x110
[ 1715.675122]  ? try_to_wake_up+0x59/0x470
[ 1715.675125]  __generic_file_write_iter+0x9d/0x1e0
[ 1715.675134]  ext4_file_write_iter+0xca/0x420 [ext4]
[ 1715.675136]  __vfs_write+0xf3/0x170
[ 1715.675138]  vfs_write+0xb2/0x1b0
[ 1715.675141]  ? syscall_trace_enter+0x1d0/0x2b0
[ 1715.675142]  SyS_write+0x55/0xc0
[ 1715.675144]  do_syscall_64+0x67/0x1b0
[ 1715.675147]  entry_SYSCALL64_slow_path+0x25/0x25

Signed-off-by: Xianting Tian 
---
 fs/ext4/ext4_jbd2.c | 9 +
 1 file changed, 9 insertions(+)

diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c
index 0c76cdd..9a60ca7 100644
--- a/fs/ext4/ext4_jbd2.c
+++ b/fs/ext4/ext4_jbd2.c
@@ -203,6 +203,15 @@ int __ext4_journal_get_write_access(const char *where, 
unsigned int line,
might_sleep();
 
if (ext4_handl

[PATCH] jbd2: check bh2jh() before accessing it

2020-07-18 Thread Xianting Tian
We met a crash issue when testing an nbd device on kernel 4.14.0-115;
the scenario of the issue is "nbd device disconnected before unmounting
the ext4 filesystem".
The call trace of the crash is as below:
[346961.426274] block nbd2: Connection timed out
[346961.426943] EXT4-fs warning (device nbd2): ext4_end_bio:323: I/O error 10 
writing to inode 5768758
(offset 155926528 size 8192 starting block 8998070)
[346961.426957] Aborting journal on device nbd2-8.
[346961.427027] EXT4-fs error (device nbd2) in __ext4_new_inode:927: Readonly 
filesystem
 ... ...
[346961.437288] Buffer I/O error on dev nbd2, logical block 13139968, lost sync 
page write
[346961.437878] JBD2: Error -5 detected when updating journal superblock for 
nbd2-8.
[346961.438478] BUG: unable to handle kernel NULL pointer dereference at 
0008
[346961.452495] RIP: 0010:jbd2_journal_grab_journal_head+0x1e/0x40 [jbd2]  <== crash code offset is 0x1e (30)
[346961.453457] RSP: 0018:c9000ffbbca8 EFLAGS: 00010206
[346961.454414] RAX:  RBX: 881dafe04960 RCX: 
881aee5b0ac8
[346961.455378] RDX: 881df7768690 RSI: 88100a5e9800 RDI: 
880a22593d40
[346961.456360] RBP: c9000ffbbca8 R08: 881dafe04960 R09: 
00018040001c
[346961.457332] R10: 2fe92601 R11: 88202fe90700 R12: 
88100a5e9800
[346961.458302] R13:  R14: 880a22593d40 R15: 
881dafe04960
[346961.459269] FS:  () GS:88103e5c() 
knlGS:
[346961.460250] CS:  0010 DS:  ES:  CR0: 80050033
[346961.461216] CR2: 0008 CR3: 01c09004 CR4: 
007606e0
[346961.462201] DR0:  DR1:  DR2: 

[346961.463164] DR3:  DR6: fffe0ff0 DR7: 
0400
[346961.465047] Call Trace:
[346961.465981]  __jbd2_journal_insert_checkpoint+0x28/0x80 [jbd2]
[346961.466907]  jbd2_journal_commit_transaction+0x1185/0x1a20 [jbd2]
[346961.467862]  ? lock_timer_base+0x7d/0xa0
[346961.468794]  kjournald2+0xd2/0x260 [jbd2]
[346961.469717]  ? remove_wait_queue+0x60/0x60
[346961.470630]  kthread+0x109/0x140
[346961.471533]  ? commit_timeout+0x10/0x10 [jbd2]
[346961.472438]  ? kthread_park+0x60/0x60
[346961.473521]  ? do_syscall_64+0x182/0x1b0
[346961.474546]  ret_from_fork+0x25/0x30

Analysis of the crash code as below:
struct journal_head *jbd2_journal_grab_journal_head(struct buffer_head *bh)
{
struct journal_head *jh = NULL;

jbd_lock_bh_journal_head(bh);
if (buffer_jbd(bh)) {
jh = bh2jh(bh); <== jh is NULL (bh->b_private = NULL)
jh->b_jcount++; <== crash here!!! (b_jcount offset in 'struct journal_head' is 0x8)
}
jbd_unlock_bh_journal_head(bh);
return jh;
}

crash> dis -l jbd2_journal_grab_journal_head
0xa00b6050 :nopl   0x0(%rax,%rax,1) [FTRACE NOP]
0xa00b6055 :  push   %rbp
0xa00b6056 :  mov    %rsp,%rbp
0xa00b6059 :  lock btsl $0x18,(%rdi)
0xa00b605e : jb     0xa00b6079
0xa00b6060 : mov    (%rdi),%rax
0xa00b6063 : test   $0x2,%eax
0xa00b6068 : je     0xa00b6087
0xa00b606a : mov    0x40(%rdi),%rax  <== jh is NULL (b_private's offset in 'struct buffer_head' is 0x40)
0xa00b606e : addl   $0x1,0x8(%rax)  <== "jh->b_jcount++" crash!!!

According to the logic in the code above, the buffer_head has an attached
journal_head ("buffer_jbd(bh)" is true), but the buffer_head doesn't record
it (bh->b_private is NULL).
So testing whether "buffer_jbd(bh)" is true can't guarantee that "bh->b_private"
is not NULL in this abnormal test case.
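
(For reference, bh2jh() is only an accessor for the b_private field; roughly,
from include/linux/jbd2.h:

        static inline struct journal_head *bh2jh(struct buffer_head *bh)
        {
                return bh->b_private;
        }

so a NULL b_private directly becomes the NULL jh that is dereferenced above.)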

Signed-off-by: Xianting Tian 
---
 fs/jbd2/journal.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index e494443..cb661d4 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -2535,7 +2535,7 @@ struct journal_head 
*jbd2_journal_grab_journal_head(struct buffer_head *bh)
struct journal_head *jh = NULL;
 
jbd_lock_bh_journal_head(bh);
-   if (buffer_jbd(bh)) {
+   if (buffer_jbd(bh) && bh2jh(bh)) {
jh = bh2jh(bh);
jh->b_jcount++;
}
-- 
1.8.3.1



[PATCH] jbd2: fix incorrect code style

2020-07-18 Thread Xianting Tian
Remove the unnecessary blank between function names and the opening parenthesis.

Signed-off-by: Xianting Tian 
---
 fs/jbd2/journal.c | 12 ++--
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index e494443..5eccf8c 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -1285,7 +1285,7 @@ journal_t *jbd2_journal_init_inode(struct inode *inode)
  * superblock as being NULL to prevent the journal destroy from writing
  * back a bogus superblock.
  */
-static void journal_fail_superblock (journal_t *journal)
+static void journal_fail_superblock(journal_t *journal)
 {
struct buffer_head *bh = journal->j_sb_buffer;
brelse(bh);
@@ -1815,7 +1815,7 @@ int jbd2_journal_destroy(journal_t *journal)
 
 
 /**
- *int jbd2_journal_check_used_features () - Check if features specified are 
used.
+ *int jbd2_journal_check_used_features() - Check if features specified are 
used.
  * @journal: Journal to check.
  * @compat: bitmask of compatible features
  * @ro: bitmask of features that force read-only mount
@@ -1825,7 +1825,7 @@ int jbd2_journal_destroy(journal_t *journal)
  * features.  Return true (non-zero) if it does.
  **/
 
-int jbd2_journal_check_used_features (journal_t *journal, unsigned long compat,
+int jbd2_journal_check_used_features(journal_t *journal, unsigned long compat,
 unsigned long ro, unsigned long incompat)
 {
journal_superblock_t *sb;
@@ -1860,7 +1860,7 @@ int jbd2_journal_check_used_features (journal_t *journal, 
unsigned long compat,
  * all of a given set of features on this journal.  Return true
  * (non-zero) if it can. */
 
-int jbd2_journal_check_available_features (journal_t *journal, unsigned long 
compat,
+int jbd2_journal_check_available_features(journal_t *journal, unsigned long 
compat,
  unsigned long ro, unsigned long incompat)
 {
if (!compat && !ro && !incompat)
@@ -1882,7 +1882,7 @@ int jbd2_journal_check_available_features (journal_t 
*journal, unsigned long com
 }
 
 /**
- * int jbd2_journal_set_features () - Mark a given journal feature in the 
superblock
+ * int jbd2_journal_set_features() - Mark a given journal feature in the 
superblock
  * @journal: Journal to act on.
  * @compat: bitmask of compatible features
  * @ro: bitmask of features that force read-only mount
@@ -1893,7 +1893,7 @@ int jbd2_journal_check_available_features (journal_t 
*journal, unsigned long com
  *
  */
 
-int jbd2_journal_set_features (journal_t *journal, unsigned long compat,
+int jbd2_journal_set_features(journal_t *journal, unsigned long compat,
  unsigned long ro, unsigned long incompat)
 {
 #define INCOMPAT_FEATURE_ON(f) \
-- 
1.8.3.1



[PATCH] scsi: virtio_scsi: remove unnecessary condition check

2020-07-09 Thread Xianting Tian
kmem_cache_destroy() and mempool_destroy() correctly handle a NULL
pointer parameter, so there is no need to check whether the parameter
is NULL before calling kmem_cache_destroy() and mempool_destroy().
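
For reference, both destructors begin with an explicit NULL check; simplified
sketches of mm/slab_common.c and mm/mempool.c:

        void kmem_cache_destroy(struct kmem_cache *s)
        {
                if (unlikely(!s))       /* NULL is silently accepted */
                        return;
                /* ... actual teardown follows ... */
        }

        void mempool_destroy(mempool_t *pool)
        {
                if (unlikely(!pool))    /* NULL is silently accepted */
                        return;
                mempool_exit(pool);
                kfree(pool);
        }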

Signed-off-by: Xianting Tian 
---
 drivers/scsi/virtio_scsi.c | 12 
 1 file changed, 4 insertions(+), 8 deletions(-)

diff --git a/drivers/scsi/virtio_scsi.c b/drivers/scsi/virtio_scsi.c
index bfec84a..54ac83e 100644
--- a/drivers/scsi/virtio_scsi.c
+++ b/drivers/scsi/virtio_scsi.c
@@ -1003,14 +1003,10 @@ static int __init init(void)
return 0;
 
 error:
-   if (virtscsi_cmd_pool) {
-   mempool_destroy(virtscsi_cmd_pool);
-   virtscsi_cmd_pool = NULL;
-   }
-   if (virtscsi_cmd_cache) {
-   kmem_cache_destroy(virtscsi_cmd_cache);
-   virtscsi_cmd_cache = NULL;
-   }
+   mempool_destroy(virtscsi_cmd_pool);
+   virtscsi_cmd_pool = NULL;
+   kmem_cache_destroy(virtscsi_cmd_cache);
+   virtscsi_cmd_cache = NULL;
return ret;
 }
 
-- 
1.8.3.1



[PATCH] scsi: virtio_scsi: remove unnecessary condition check

2020-07-09 Thread Xianting Tian
kmem_cache_destroy() correctly handles a NULL pointer parameter,
so there is no need to check whether the parameter is NULL before
calling kmem_cache_destroy().

Signed-off-by: Xianting Tian 
---
 drivers/scsi/virtio_scsi.c | 6 ++
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/drivers/scsi/virtio_scsi.c b/drivers/scsi/virtio_scsi.c
index bfec84a..5bc288f 100644
--- a/drivers/scsi/virtio_scsi.c
+++ b/drivers/scsi/virtio_scsi.c
@@ -1007,10 +1007,8 @@ static int __init init(void)
mempool_destroy(virtscsi_cmd_pool);
virtscsi_cmd_pool = NULL;
}
-   if (virtscsi_cmd_cache) {
-   kmem_cache_destroy(virtscsi_cmd_cache);
-   virtscsi_cmd_cache = NULL;
-   }
+   kmem_cache_destroy(virtscsi_cmd_cache);
+   virtscsi_cmd_cache = NULL;
return ret;
 }
 
-- 
1.8.3.1



[PATCH] direct-io: pass correct argument to dio_complete

2020-06-08 Thread Xianting Tian
When submitting an async direct-io write operation in
do_blockdev_direct_IO(), 'struct dio' records the info of all bios.
The initial value of dio->refcount is set to 1, 'dio->refcount++' is
executed in dio_bio_submit() when one bio is submitted, and 'dio->refcount--'
is executed in the bio completion handler dio_bio_end_aio().

do_blockdev_direct_IO() also calls drop_refcount() to do
'dio->refcount--' and then checks whether dio->refcount is 0; if so, it
calls dio_complete() to complete the dio:
if (drop_refcount(dio) == 0) {
  retval = dio_complete(dio, retval, DIO_COMPLETE_INVALIDATE);
} else

dio_bio_end_aio() and drop_refcount() race to observe dio->refcount
reaching 0:
1, if dio_bio_end_aio finds dio->refcount is 0, it will queue work if
   defer_completion is set, work handler
   dio_aio_complete_work->dio_complete will be called:
  dio_complete(dio, 0,
DIO_COMPLETE_ASYNC | DIO_COMPLETE_INVALIDATE);
   if defer_completion not set, it will call:
  dio_complete(dio, 0, DIO_COMPLETE_ASYNC);
   In above two cases, because DIO_COMPLETE_ASYNC is passed to
   dio_complete. So in dio_complete, it will call aio completion handler:
  dio->iocb->ki_complete(dio->iocb, ret, 0);
   As ki_complete is set to aio_complete for async io, which will fill
   an event to ring buffer, then user can use io_getevents to get this
   event.
2, if drop_refcount finds dio->refcount is 0, it will call:
  dio_complete(dio, retval, DIO_COMPLETE_INVALIDATE);
   As no DIO_COMPLETE_ASYNC is passed to dio_complete. So in dio_complete,
   ki_complete(aio_complete) will not be called. Eventually, no one fills
   the completion event to ring buffer, so user can't get the completion
   event via io_getevents.

Currently we don't hit the above issue with the existing kernel code.
I think this is because do_blockdev_direct_IO() is called in the bio
submission path and, in almost all cases, finishes before all the async
bios complete, so when drop_refcount() runs it finds dio->refcount is
not 0 after 'dio->refcount--'. But when the last bio completes,
dio_bio_end_aio() is called, finds dio->refcount is 0, and then the code
below is executed, filling the async event ring buffer:
  dio_complete(dio, 0, DIO_COMPLETE_ASYNC | DIO_COMPLETE_INVALIDATE);
  or
  dio_complete(dio, 0, DIO_COMPLETE_ASYNC);

Make the logic explicit with this patch and cover the above scenario.

Signed-off-by: Xianting Tian 
---
 fs/direct-io.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/fs/direct-io.c b/fs/direct-io.c
index 1543b5a..552459f 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -1345,7 +1345,9 @@ static inline int drop_refcount(struct dio *dio)
dio_await_completion(dio);
 
if (drop_refcount(dio) == 0) {
-   retval = dio_complete(dio, retval, DIO_COMPLETE_INVALIDATE);
+   retval = dio_complete(dio, retval, dio->is_async ?
+   DIO_COMPLETE_ASYNC | DIO_COMPLETE_INVALIDATE :
+   DIO_COMPLETE_INVALIDATE);
} else
BUG_ON(retval != -EIOCBQUEUED);
 
-- 
1.8.3.1



[PATCH] timers: Use set_current_state macro

2020-05-14 Thread Xianting Tian
Use the set_current_state() macro instead of current->state = TASK_RUNNING.
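
For reference, in kernels of this vintage the macro expands to a store with a
full memory barrier (include/linux/sched.h), roughly:

        #define set_current_state(state_value)                          \
                smp_store_mb(current->state, (state_value))

so the state update is ordered against subsequent loads, and using the helper
keeps the code consistent with the rest of the scheduler.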

Signed-off-by: Xianting Tian 
---
 kernel/time/timer.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index 4820823..b9ecf87 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -1882,7 +1882,7 @@ signed long __sched schedule_timeout(signed long timeout)
printk(KERN_ERR "schedule_timeout: wrong timeout "
"value %lx\n", timeout);
dump_stack();
-   current->state = TASK_RUNNING;
+   set_current_state(TASK_RUNNING);
goto out;
}
}
-- 
1.8.3.1



[PATCH] timers: use set_current_state macro

2020-05-07 Thread Xianting Tian
Use set_current_state macro instead of current->state = TASK_RUNNING.

Signed-off-by: Xianting Tian 
---
 kernel/time/timer.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index a5221abb4..7c6d42755 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -1885,7 +1885,7 @@ signed long __sched schedule_timeout(signed long timeout)
printk(KERN_ERR "schedule_timeout: wrong timeout "
"value %lx\n", timeout);
dump_stack();
-   current->state = TASK_RUNNING;
+   set_current_state(TASK_RUNNING);
goto out;
}
}
--
2.17.1



[PATCH] sched/fair: Fix typo in comment

2020-05-06 Thread Xianting Tian
check_prempt_curr() -> check_preempt_curr()

Signed-off-by: Xianting Tian 
---
 kernel/sched/fair.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 02f323b85..458ab5521 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -6858,7 +6858,7 @@ static void check_preempt_wakeup(struct rq *rq, struct 
task_struct *p, int wake_

/*
 * This is possible from callers such as attach_tasks(), in which we
-* unconditionally check_prempt_curr() after an enqueue (which may have
+* unconditionally check_preempt_curr() after an enqueue (which may have
 * lead to a throttle).  This both saves work and prevents false
 * next-buddy nomination below.
 */
--
2.17.1
