[PATCH v4 2/2] staging: vt6655: add handling memory leak on vnt_start()

2018-03-29 Thread Ji-Hun Kim
There was no code to handle memory leaks from device_init_rings() and
request_irq(). The memory allocated by device_init_rings() needs to be
freed when request_irq() fails. Add freeing sequences for the IRQ and the
device init rings.

Signed-off-by: Ji-Hun Kim 
---
This is an additional memory leak handling patch on top of
[PATCH v4 1/2] staging: vt6655: check for memory allocation failures
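
For reference, the resulting unwind order in vnt_start() can be sketched as
follows (reconstructed from the hunks below; the rd0/rd1/td0/td1 ring setup
between request_irq() and the labels is abbreviated):

static int vnt_start(struct ieee80211_hw *hw)
{
	struct vnt_private *priv = hw->priv;
	int ret;

	ret = (int)device_init_rings(priv);
	if (!ret) {
		ret = -ENOMEM;
		goto err_init_rings;		/* nothing to undo yet */
	}

	ret = request_irq(priv->pcid->irq, vnt_interrupt,
			  IRQF_SHARED, "vt6655", priv);
	if (ret)
		goto err_irq;			/* undo device_init_rings() only */

	/* ... rd0/rd1/td0/td1 ring init; a failure there jumps to the
	 * matching err_init_* label and falls through the labels below ... */

	return 0;

err_init_rd0_ring:
	free_irq(priv->pcid->irq, priv);
err_irq:
	device_free_rings(priv);
err_init_rings:
	return ret;
}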

 drivers/staging/vt6655/device_main.c | 13 ++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/drivers/staging/vt6655/device_main.c b/drivers/staging/vt6655/device_main.c
index c9752df..3604f2d 100644
--- a/drivers/staging/vt6655/device_main.c
+++ b/drivers/staging/vt6655/device_main.c
@@ -1194,14 +1194,17 @@ static int vnt_start(struct ieee80211_hw *hw)
int ret;
 
priv->rx_buf_sz = PKT_BUF_SZ;
-   if (!device_init_rings(priv))
-   return -ENOMEM;
+   ret = (int)device_init_rings(priv);
+   if (!ret) {
+   ret = -ENOMEM;
+   goto err_init_rings;
+   }
 
ret = request_irq(priv->pcid->irq, vnt_interrupt,
  IRQF_SHARED, "vt6655", priv);
if (ret) {
dev_dbg(&priv->pcid->dev, "failed to start irq\n");
-   return ret;
+   goto err_irq;
}
 
dev_dbg(&priv->pcid->dev, "call device init rd0 ring\n");
@@ -1234,6 +1237,10 @@ static int vnt_start(struct ieee80211_hw *hw)
 err_init_rd1_ring:
device_free_rd0_ring(priv);
 err_init_rd0_ring:
+   free_irq(priv->pcid->irq, priv);
+err_irq:
+   device_free_rings(priv);
+err_init_rings:
return ret;
 }
 
-- 
1.9.1



[PATCH] f2fs: remain written times to update inode during fsync

2018-03-29 Thread Jaegeuk Kim
This fixes xfstests/generic/392.

Signed-off-by: Jaegeuk Kim 
---
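As a rough sketch of how the two hunks below work together (not code from
the patch, just the intended flow):

/*
 * update_inode() snapshots the timestamps that were just written to the
 * node page:
 *
 *	F2FS_I(inode)->i_disk_time[0..3] = atime, ctime, mtime, crtime;
 *
 * f2fs_skip_inode_update() may then skip writing the inode again during
 * fsync only while every in-memory time still matches that snapshot:
 *
 *	if (!time_equal(F2FS_I(inode)->i_disk_time[n], <in-memory time n>))
 *		return false;	/* inode is dirty, must be written */
 */
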
 fs/f2fs/f2fs.h  | 15 +++
 fs/f2fs/inode.c |  4 
 2 files changed, 19 insertions(+)

diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index 000f93f6767e..675c39d85111 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -664,6 +664,7 @@ struct f2fs_inode_info {
kprojid_t i_projid; /* id for project quota */
int i_inline_xattr_size;/* inline xattr size */
struct timespec i_crtime;   /* inode creation time */
+   struct timespec i_disk_time[4]; /* inode disk times */
 };
 
 static inline void get_extent_info(struct extent_info *ext,
@@ -2457,6 +2458,11 @@ static inline void clear_file(struct inode *inode, int type)
f2fs_mark_inode_dirty_sync(inode, true);
 }
 
+static inline bool time_equal(struct timespec a, struct timespec b)
+{
+   return (a.tv_sec == b.tv_sec) && (a.tv_nsec == b.tv_nsec);
+}
+
 static inline bool f2fs_skip_inode_update(struct inode *inode, int dsync)
 {
bool ret;
@@ -2474,6 +2480,15 @@ static inline bool f2fs_skip_inode_update(struct inode *inode, int dsync)
i_size_read(inode) & ~PAGE_MASK)
return false;
 
+   if (!time_equal(F2FS_I(inode)->i_disk_time[0], inode->i_atime))
+   return false;
+   if (!time_equal(F2FS_I(inode)->i_disk_time[1], inode->i_ctime))
+   return false;
+   if (!time_equal(F2FS_I(inode)->i_disk_time[2], inode->i_mtime))
+   return false;
+   if (!time_equal(F2FS_I(inode)->i_disk_time[3], F2FS_I(inode)->i_crtime))
+   return false;
+
down_read(&F2FS_I(inode)->i_sem);
ret = F2FS_I(inode)->last_disk_size == i_size_read(inode);
up_read(&F2FS_I(inode)->i_sem);
diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c
index 401f09ccce7e..70aba580f4b0 100644
--- a/fs/f2fs/inode.c
+++ b/fs/f2fs/inode.c
@@ -444,6 +444,10 @@ void update_inode(struct inode *inode, struct page *node_page)
if (inode->i_nlink == 0)
clear_inline_node(node_page);
 
+   F2FS_I(inode)->i_disk_time[0] = inode->i_atime;
+   F2FS_I(inode)->i_disk_time[1] = inode->i_ctime;
+   F2FS_I(inode)->i_disk_time[2] = inode->i_mtime;
+   F2FS_I(inode)->i_disk_time[3] = F2FS_I(inode)->i_crtime;
 }
 
 void update_inode_page(struct inode *inode)
-- 
2.15.0.531.g2ccb3012c9-goog



[PATCH v4 1/2] staging: vt6655: check for memory allocation failures

2018-03-29 Thread Ji-Hun Kim
There is no null pointer check on the rd_info and td_info values which
are allocated by kzalloc(), so there are potential null pointer
dereferences. Implement error handling code in device_init_rd*,
device_init_td* and vnt_start() for the allocation failures.

Signed-off-by: Ji-Hun Kim 
---
Changes v2:
- Delete WARN_ON, which can cause crashes on some machines.
- Instead of returning directly, goto the freeing function to free memory
  previously allocated in the for loop after kzalloc() fails.
- In the freeing function, add an if statement so that only allocated
  values are freed.

Changes v3:
- Modify the return type of device_init_rd* and device_init_td*, and return
  error codes from those functions and from vnt_start() as well.

Changes v4:
- Fix potential memory leaks from error handling code of device init
  functions in vnt_start().

 drivers/staging/vt6655/device_main.c | 121 ++-
 1 file changed, 89 insertions(+), 32 deletions(-)

diff --git a/drivers/staging/vt6655/device_main.c b/drivers/staging/vt6655/device_main.c
index fbc4bc6..c9752df 100644
--- a/drivers/staging/vt6655/device_main.c
+++ b/drivers/staging/vt6655/device_main.c
@@ -124,10 +124,10 @@
 static void device_free_info(struct vnt_private *priv);
 static void device_print_info(struct vnt_private *priv);
 
-static void device_init_rd0_ring(struct vnt_private *priv);
-static void device_init_rd1_ring(struct vnt_private *priv);
-static void device_init_td0_ring(struct vnt_private *priv);
-static void device_init_td1_ring(struct vnt_private *priv);
+static int device_init_rd0_ring(struct vnt_private *priv);
+static int device_init_rd1_ring(struct vnt_private *priv);
+static int device_init_td0_ring(struct vnt_private *priv);
+static int device_init_td1_ring(struct vnt_private *priv);
 
 static int  device_rx_srv(struct vnt_private *priv, unsigned int idx);
 static int  device_tx_srv(struct vnt_private *priv, unsigned int idx);
@@ -528,18 +528,22 @@ static void device_free_rings(struct vnt_private *priv)
  priv->tx0_bufs, priv->tx_bufs_dma0);
 }
 
-static void device_init_rd0_ring(struct vnt_private *priv)
+static int device_init_rd0_ring(struct vnt_private *priv)
 {
int i;
dma_addr_t  curr = priv->rd0_pool_dma;
struct vnt_rx_desc *desc;
+   int ret = 0;
 
/* Init the RD0 ring entries */
for (i = 0; i < priv->opts.rx_descs0;
 i ++, curr += sizeof(struct vnt_rx_desc)) {
desc = &priv->aRD0Ring[i];
desc->rd_info = kzalloc(sizeof(*desc->rd_info), GFP_KERNEL);
-
+   if (!desc->rd_info) {
+   ret = -ENOMEM;
+   goto error;
+   }
if (!device_alloc_rx_buf(priv, desc))
dev_err(&priv->pcid->dev, "can not alloc rx bufs\n");
 
@@ -550,20 +554,29 @@ static void device_init_rd0_ring(struct vnt_private *priv)
if (i > 0)
priv->aRD0Ring[i-1].next_desc = cpu_to_le32(priv->rd0_pool_dma);
priv->pCurrRD[0] = &priv->aRD0Ring[0];
+
+   return 0;
+error:
+   device_free_rd0_ring(priv);
+   return ret;
 }
 
-static void device_init_rd1_ring(struct vnt_private *priv)
+static int device_init_rd1_ring(struct vnt_private *priv)
 {
int i;
dma_addr_t  curr = priv->rd1_pool_dma;
struct vnt_rx_desc *desc;
+   int ret = 0;
 
/* Init the RD1 ring entries */
for (i = 0; i < priv->opts.rx_descs1;
 i ++, curr += sizeof(struct vnt_rx_desc)) {
desc = &priv->aRD1Ring[i];
desc->rd_info = kzalloc(sizeof(*desc->rd_info), GFP_KERNEL);
-
+   if (!desc->rd_info) {
+   ret = -ENOMEM;
+   goto error;
+   }
if (!device_alloc_rx_buf(priv, desc))
dev_err(&priv->pcid->dev, "can not alloc rx bufs\n");
 
@@ -574,6 +587,11 @@ static void device_init_rd1_ring(struct vnt_private *priv)
if (i > 0)
priv->aRD1Ring[i-1].next_desc = cpu_to_le32(priv->rd1_pool_dma);
priv->pCurrRD[1] = &priv->aRD1Ring[0];
+
+   return 0;
+error:
+   device_free_rd1_ring(priv);
+   return ret;
 }
 
 static void device_free_rd0_ring(struct vnt_private *priv)
@@ -584,12 +602,12 @@ static void device_free_rd0_ring(struct vnt_private *priv)
struct vnt_rx_desc *desc = &priv->aRD0Ring[i];
struct vnt_rd_info *rd_info = desc->rd_info;
 
-   dma_unmap_single(&priv->pcid->dev, rd_info->skb_dma,
-priv->rx_buf_sz, DMA_FROM_DEVICE);
-
-   dev_kfree_skb(rd_info->skb);
-
-   kfree(desc->rd_info);
+   if (rd_info) {
+   dma_unmap_single(&priv->pcid->dev, rd_info->skb_dma,
+   priv->rx_buf_sz, DMA_FROM_DEVICE);
+   dev_kfree_skb(rd_info->skb);
+   kfree(desc->rd_info);

Re: General protection fault with use_blk_mq=1.

2018-03-29 Thread Zephaniah E. Loss-Cutler-Hull
On 03/29/2018 02:12 AM, Zephaniah E. Loss-Cutler-Hull wrote:
> On 03/28/2018 10:13 PM, Paolo Valente wrote:
>> In addition, the outcome of your attempt without
>> CONFIG_DEBUG_BLK_CGROUP would give us useful bisection information:
>> - if no failure occurs, then the issue is likely to be confined in
>> that debugging code (which, on the bright side, is likely to be of
>> occasional interest, for only a handful of developers)
>> - if the issue still shows up, then we may have new hints on this odd
>> failure
>>
>> Finally, consider that this issue has been reported to disappear from
>> 4.16 [1], and, as a plus, that the service quality of BFQ had a
>> further boost exactly from 4.16.
> 
> I look forward to that either way then.
>>
>> Looking forward to your feedback, in case you try BFQ without
>> CONFIG_DEBUG_BLK_CGROUP,
> 
> I'm running that now, judging from the past if it survives until
> tomorrow evening then we're good, so I should hopefully know in the next
> day.

Alright, I now have an uptime of over 20 hours, with
scsi_mod.use_blk_mq=1 dm_mod.use_blk_mq=1

I did upgrade from 4.15.13 to 4.15.14 in the process, but a quick look
at the changes doesn't have anything jump out at me as impacting this.

So I'm reasonably comfortable stating that disabling
CONFIG_DEBUG_BLK_CGROUP was sufficient to render this stable.

Regards,
Zephaniah E. Loss-Cutler-Hull.


Re: [PATCH 0/1] cover-letter/lz4: Implement lz4 with dynamic offset length.

2018-03-29 Thread Sergey Senozhatsky
On (03/29/18 15:56), Maninder Singh wrote:
> Hello Nick/Sergey,
> 
> Any suggestion or comments, so that we can change code and resend the patch?

Well... there were no replies to
 https://marc.info/?l=linux-kernel&m=152161450026771&w=2
and
 https://marc.info/?l=linux-kernel&m=152161860627974&w=2

-ss


Re: [PATCH v2 04/21] kconfig: reference environments directly and remove 'option env=' syntax

2018-03-29 Thread Masahiro Yamada
2018-03-29 11:19 GMT+09:00 Ulf Magnusson :.

>>  %%
>> +static void expand_string(const char *in)
>> +{
>> +   char *p, *q;
>> +
>> +   p = expand_string_value(in);
>> +
>> +   q = p + strlen(p);
>> +   while (q > p)
>> +   unput(*--q);
>> +
>> +   free(p);
>> +}
>> +
>
> I like the simplicity of this approach, but I suspect it might be too simple.
>
> For example, the following breaks with a syntax error if $ENV has any
> double quotes in its value:
>
> config FOO
> string "foo"
> default "$ENV"
>
> The following will only work as expected if $ENV expands to a valid
> Kconfig symbol name. If it doesn't, random stuff will happen (most
> likely a syntax error).
>
> config FOO
> string "foo"
> default $ENV
>
> The reason it works if $ENV expands to a valid symbol name is that
> undefined symbols get their name as their (string) value. If the
> symbol happens to be defined, it will be referenced, which seems
> confusing too.
>
> In general, that reinterpretation of expanded values feels a bit icky
> to me, and as something that might add complexity to Kconfig for
> little value. If $ENV outside of quotes absolutely must be supported,
> I think it should be a shorthand for "$ENV" (which means "constant
> value" in Kconfig speak).
>

You are right.  This implementation is too lazy, and bad.
I will change the implementation.

Thanks.




-- 
Best Regards
Masahiro Yamada


Re: [GIT PULL] sound fixes for 4.16-final

2018-03-29 Thread Linus Torvalds
On Thu, Mar 29, 2018 at 4:23 AM, Takashi Iwai  wrote:
>
>   git://git.kernel.org/pub/scm/linux/kernel/git/tiwai/sound.git 
> tags/sound-4.16

No such tag. Forgot to push?

There's the "for-linus" branch that matches the commit id you stated,
but no tags that point to it..

Hmm?

  Linus


Re: [RFC PATCH ghak32 V2 01/13] audit: add container id

2018-03-29 Thread Richard Guy Briggs
On 2018-03-29 07:03, Jonathan Corbet wrote:
> On Thu, 29 Mar 2018 05:01:32 -0400
> Richard Guy Briggs  wrote:
> 
> > > A little detail, but still...  
> > 
> > I am understanding that you would prefer more context (as opposed to
> > operational detail) in the description, laying out the use case for this
> > patch(set)?
> 
> No, sorry, "a little detail" was referring to my comment.  The use case,
> I believe, has been well described.

Ah!  "A minor nit".  :-)

> jon

- RGB

--
Richard Guy Briggs 
Sr. S/W Engineer, Kernel Security, Base Operating Systems
Remote, Ottawa, Red Hat Canada
IRC: rgb, SunRaycer
Voice: +1.647.777.2635, Internal: (81) 32635


[PATCH] mm: limit a process RSS

2018-03-29 Thread Li RongQing
We cannot limit a process's RSS even though ulimit -m exists; it is not
clear why or since when ulimit -m stopped working, so make it work.

similar requirement:
https://stackoverflow.com/questions/3360348/why-ulimit-cant-limit-resident-memory-successfully-and-how

Signed-off-by: Li RongQing 
---
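For context, a minimal userspace sketch of setting the RSS limit that this
check would enforce (illustration only; the 100 MB value is arbitrary):

#include <stdio.h>
#include <sys/resource.h>

int main(void)
{
	/* Ask for an RSS limit of ~100 MB for this process. */
	struct rlimit rl = {
		.rlim_cur = 100UL << 20,
		.rlim_max = 100UL << 20,
	};

	if (setrlimit(RLIMIT_RSS, &rl) != 0) {
		perror("setrlimit");
		return 1;
	}

	/* With this patch applied, a page fault that pushes the process
	 * RSS past rlim_cur would kill the process group with SIGKILL. */
	return 0;
}
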
 mm/memory.c | 14 ++
 1 file changed, 14 insertions(+)

diff --git a/mm/memory.c b/mm/memory.c
index 5fcfc24904d1..50cf9399477c 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -4140,6 +4140,9 @@ int handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
ret = __handle_mm_fault(vma, address, flags);
 
if (flags & FAULT_FLAG_USER) {
+   unsigned long total_rss = get_mm_rss(current->mm);
+   u64 rlimit;
+
mem_cgroup_oom_disable();
/*
 * The task may have entered a memcg OOM situation but
@@ -4149,6 +4152,17 @@ int handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
 */
if (task_in_memcg_oom(current) && !(ret & VM_FAULT_OOM))
mem_cgroup_oom_synchronize(false);
+
+   rlimit = current->signal->rlim[RLIMIT_RSS].rlim_cur;
+
+   if (unlikely(total_rss > (rlimit >> PAGE_SHIFT)) &&
+   (current->pid != 1)) {
+
+   pr_info("kill process %s rsslimit[%lluK] rss[%luK]\n",
+   current->comm, (rlimit >> 10),
+   total_rss << (PAGE_SHIFT - 10));
+   do_group_exit(SIGKILL);
+   }
}
 
return ret;
-- 
2.11.0



[PATCH v5 3/3] perf script: extend misc field decoding with switch out event type

2018-03-29 Thread Alexey Budankov

Append a 'p' sign to the 'S' tag designating the type of the context switch
out event, so 'Sp' means a preemption context switch. Documentation is
extended to cover the new presentation changes.

perf script --show-switch-events -F +misc -I -i perf.data:

  hdparm  4073 [004] U   762.198265: 380194 cycles:ppp:  
7faf727f5a23 strchr (/usr/lib64/ld-2.26.so)
  hdparm  4073 [004] K   762.198366: 441572 cycles:ppp:  
b9218435 alloc_set_pte (/lib/modules/4.16.0-rc6+/build/vmlinux)
  hdparm  4073 [004] S   762.198391: PERF_RECORD_SWITCH_CPU_WIDE 
OUT  next pid/tid: 0/0
 swapper 0 [004] 762.198392: PERF_RECORD_SWITCH_CPU_WIDE IN 
  prev pid/tid:  4073/4073 
 swapper 0 [004] Sp  762.198477: PERF_RECORD_SWITCH_CPU_WIDE 
OUT preempt  next pid/tid:  4073/4073 
  hdparm  4073 [004] 762.198478: PERF_RECORD_SWITCH_CPU_WIDE IN 
  prev pid/tid: 0/0
 swapper 0 [007] K   762.198514:2303073 cycles:ppp:  
b98b0c66 intel_idle (/lib/modules/4.16.0-rc6+/build/vmlinux)
 swapper 0 [007] Sp  762.198561: PERF_RECORD_SWITCH_CPU_WIDE 
OUT preempt  next pid/tid:  1134/1134 
  kworker/u16:18  1134 [007] 762.198562: PERF_RECORD_SWITCH_CPU_WIDE IN 
  prev pid/tid: 0/0
  kworker/u16:18  1134 [007] S   762.198567: PERF_RECORD_SWITCH_CPU_WIDE 
OUT  next pid/tid: 0/0

Signed-off-by: Alexey Budankov 
---
 tools/perf/Documentation/perf-script.txt | 17 +
 tools/perf/builtin-script.c  |  5 -
 2 files changed, 13 insertions(+), 9 deletions(-)

diff --git a/tools/perf/Documentation/perf-script.txt b/tools/perf/Documentation/perf-script.txt
index 36ec0257f8d3..afdafe2110a1 100644
--- a/tools/perf/Documentation/perf-script.txt
+++ b/tools/perf/Documentation/perf-script.txt
@@ -228,14 +228,15 @@ OPTIONS
For sample events it's possible to display misc field with -F +misc 
option,
following letters are displayed for each bit:
 
- PERF_RECORD_MISC_KERNEL        K
- PERF_RECORD_MISC_USER          U
- PERF_RECORD_MISC_HYPERVISOR    H
- PERF_RECORD_MISC_GUEST_KERNEL  G
- PERF_RECORD_MISC_GUEST_USER    g
- PERF_RECORD_MISC_MMAP_DATA*    M
- PERF_RECORD_MISC_COMM_EXEC     E
- PERF_RECORD_MISC_SWITCH_OUT    S
+ PERF_RECORD_MISC_KERNEL   K
+ PERF_RECORD_MISC_USER U
+ PERF_RECORD_MISC_HYPERVISOR   H
+ PERF_RECORD_MISC_GUEST_KERNEL G
+ PERF_RECORD_MISC_GUEST_USER   g
+ PERF_RECORD_MISC_MMAP_DATA*   M
+ PERF_RECORD_MISC_COMM_EXECE
+ PERF_RECORD_MISC_SWITCH_OUT   S
+ PERF_RECORD_MISC_SWITCH_OUT_PREEMPT   Sp
 
  $ perf script -F +misc ...
   sched-messaging  1414 K 28690.636582:   4590 cycles ...
diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c
index 313c42423393..0f916e8e1835 100644
--- a/tools/perf/builtin-script.c
+++ b/tools/perf/builtin-script.c
@@ -657,8 +657,11 @@ static int perf_sample__fprintf_start(struct perf_sample *sample,
break;
case PERF_RECORD_SWITCH:
case PERF_RECORD_SWITCH_CPU_WIDE:
-   if (has(SWITCH_OUT))
+   if (has(SWITCH_OUT)) {
ret += fprintf(fp, "S");
+   if (sample->misc & 
PERF_RECORD_MISC_SWITCH_OUT_PREEMPT)
+   ret += fprintf(fp, "p");
+   }
default:
break;
}



[PATCH v5 2/3] perf report: extend raw dump (-D) out with switch out event type

2018-03-29 Thread Alexey Budankov

Print an additional 'preempt' tag for PERF_RECORD_SWITCH[_CPU_WIDE] OUT
records when the event header misc field contains the
PERF_RECORD_MISC_SWITCH_OUT_PREEMPT bit set, designating a preemption
context switch out event:

tools/perf/perf report -D -i perf.data | grep _SWITCH

0 768361415226 0x27f076 [0x28]: PERF_RECORD_SWITCH_CPU_WIDE IN   prev 
pid/tid: 8/8
4 768362216813 0x28f45e [0x28]: PERF_RECORD_SWITCH_CPU_WIDE OUT  next 
pid/tid: 0/0
4 768362217824 0x28f486 [0x28]: PERF_RECORD_SWITCH_CPU_WIDE IN   prev 
pid/tid:  4073/4073 
0 768362414027 0x27f0ce [0x28]: PERF_RECORD_SWITCH_CPU_WIDE OUT preempt  next 
pid/tid: 8/8
0 768362414367 0x27f0f6 [0x28]: PERF_RECORD_SWITCH_CPU_WIDE IN   prev 
pid/tid: 0/0

Signed-off-by: Alexey Budankov 
---
 tools/perf/util/event.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/tools/perf/util/event.c b/tools/perf/util/event.c
index f0a6cbd033cc..98ff3a6a3d50 100644
--- a/tools/perf/util/event.c
+++ b/tools/perf/util/event.c
@@ -1421,7 +1421,9 @@ size_t perf_event__fprintf_itrace_start(union perf_event *event, FILE *fp)
 size_t perf_event__fprintf_switch(union perf_event *event, FILE *fp)
 {
bool out = event->header.misc & PERF_RECORD_MISC_SWITCH_OUT;
-   const char *in_out = out ? "OUT" : "IN ";
+   const char *in_out = !out ? "IN " :
+   !(event->header.misc & PERF_RECORD_MISC_SWITCH_OUT_PREEMPT) ?
+   "OUT" : "OUT preempt";
 
if (event->header.type == PERF_RECORD_SWITCH)
return fprintf(fp, " %s\n", in_out);



[RFC PATCH] lib/ioremap: Avoid triggering BUG_ON when end is not PAGE_ALIGN

2018-03-29 Thread Yisheng Xie
Zhou reported a bug on Hisilicon arm64 D06 platform with 64KB page size:

 [2.470908] kernel BUG at lib/ioremap.c:72!
 [2.475079] Internal error: Oops - BUG: 0 [#1] PREEMPT SMP
 [2.480551] Modules linked in:
 [2.483594] CPU: 1 PID: 1 Comm: swapper/0 Not tainted 
4.16.0-rc7-00062-g0b41260-dirty #23
 [2.491756] Hardware name: Huawei D06/D06, BIOS Hisilicon D06 UEFI Nemo 2.0 
RC0 - B120 03/23/2018
 [2.500614] pstate: 80c9 (Nzcv daif +PAN +UAO)
 [2.505395] pc : ioremap_page_range+0x268/0x36c
 [2.509912] lr : pci_remap_iospace+0xe4/0x100
 [...]
 [2.603733] Call trace:
 [2.606168]  ioremap_page_range+0x268/0x36c
 [2.610337]  pci_remap_iospace+0xe4/0x100
 [2.614334]  acpi_pci_probe_root_resources+0x1d4/0x214
 [2.619460]  pci_acpi_root_prepare_resources+0x18/0xa8
 [2.624585]  acpi_pci_root_create+0x98/0x214
 [2.628843]  pci_acpi_scan_root+0x124/0x20c
 [2.633013]  acpi_pci_root_add+0x224/0x494
 [2.637096]  acpi_bus_attach+0xf8/0x200
 [2.640918]  acpi_bus_attach+0x98/0x200
 [2.644740]  acpi_bus_attach+0x98/0x200
 [2.648562]  acpi_bus_scan+0x48/0x9c
 [2.652125]  acpi_scan_init+0x104/0x268
 [2.655948]  acpi_init+0x308/0x374
 [2.659337]  do_one_initcall+0x48/0x14c
 [2.663160]  kernel_init_freeable+0x19c/0x250
 [2.667504]  kernel_init+0x10/0x100
 [2.670979]  ret_from_fork+0x10/0x18

The cause is that the PCI IO resource is 32KB in size, which is 4KB aligned
but not 64KB aligned, so when ioremap_pte_range() runs, its incoming end is
not page-aligned on a 64KB page size system. ioremap_pte_range() advances
addr by PAGE_SIZE, so addr steps over end without ever being equal to it,
the loop keeps going, and the BUG_ON eventually triggers.

This patch introduces pte_addr_end(addr, end) to resolve this problem, just
as pmd_addr_end() does. When end is not page-aligned, it returns end instead
of addr + PAGE_SIZE, so ioremap_pte_range() can break out of the loop when
the real end is reached.

Reported-by: Zhou Wang 
Tested-by: Xiaojun Tan 
Signed-off-by: Yisheng Xie 
---
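A worked example of the new macro under the conditions from the report
(assuming a 64KB PAGE_SIZE and a 32KB, 64KB-aligned mapping; the address is
illustrative):

/*
 *	addr       = 0xffff000008000000		(64KB aligned)
 *	end        = addr + SZ_32K		(resource is only 32KB)
 *	PAGE_SIZE  = SZ_64K
 *
 *	__boundary = ((addr) + PAGE_SIZE) & PAGE_MASK = addr + SZ_64K
 *
 *	__boundary - 1 < (end) - 1 is false, so pte_addr_end(addr, end)
 *	returns end; ioremap_pte_range() then sets addr = next = end and
 *	its do/while loop terminates instead of stepping past end and
 *	hitting the BUG_ON.
 */
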
 include/asm-generic/pgtable.h | 7 +++
 lib/ioremap.c | 5 -
 2 files changed, 11 insertions(+), 1 deletion(-)

diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h
index bfbb44a..7d5ee84 100644
--- a/include/asm-generic/pgtable.h
+++ b/include/asm-generic/pgtable.h
@@ -478,6 +478,13 @@ static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot)
 })
 #endif
 
+#ifndef pte_addr_end
+#define pte_addr_end(addr, end)					\
+({ unsigned long __boundary = ((addr) + PAGE_SIZE) & PAGE_MASK;\
+   (__boundary - 1 < (end) - 1) ? __boundary : (end);  \
+})
+#endif
+
 /*
  * When walking page tables, we usually want to skip any p?d_none entries;
  * and any p?d_bad entries - reporting the error before resetting to none.
diff --git a/lib/ioremap.c b/lib/ioremap.c
index 54e5bba..82c8502 100644
--- a/lib/ioremap.c
+++ b/lib/ioremap.c
@@ -63,16 +63,19 @@ static int ioremap_pte_range(pmd_t *pmd, unsigned long addr,
 {
pte_t *pte;
u64 pfn;
+   unsigned long next;
 
pfn = phys_addr >> PAGE_SHIFT;
pte = pte_alloc_kernel(pmd, addr);
if (!pte)
return -ENOMEM;
do {
+   next = pte_addr_end(addr, end);
+
BUG_ON(!pte_none(*pte));
set_pte_at(&init_mm, addr, pte, pfn_pte(pfn, prot));
pfn++;
-   } while (pte++, addr += PAGE_SIZE, addr != end);
+   } while (pte++, addr = next, addr != end);
return 0;
 }
 
-- 
1.7.12.4



[PATCH v5 1/3] perf/core: store context switch out type into Perf trace

2018-03-29 Thread Alexey Budankov

Store the preempting context switch out event into the Perf trace as a part
of the PERF_RECORD_SWITCH[_CPU_WIDE] record.

The percentage of preempting and non-preempting context switches helps in
understanding the nature of the workloads (CPU or IO bound) that are running
on a machine.

The event is treated as a preemption one when the task->state value of the
thread being switched out is TASK_RUNNING. Event type encoding is
implemented using the PERF_RECORD_MISC_SWITCH_OUT_PREEMPT bit.

Signed-off-by: Alexey Budankov 
---
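For consumers of the trace, decoding the new bit from a record header might
look roughly like this (a sketch; this is essentially what the tool-side
patches 2/3 and 3/3 of this series do):

#include <linux/perf_event.h>

/* Classify a PERF_RECORD_SWITCH[_CPU_WIDE] record from its misc field. */
static const char *switch_kind(__u16 misc)
{
	if (!(misc & PERF_RECORD_MISC_SWITCH_OUT))
		return "switch in";
	if (misc & PERF_RECORD_MISC_SWITCH_OUT_PREEMPT)
		return "switch out (preempted while TASK_RUNNING)";
	return "switch out";
}
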
 include/uapi/linux/perf_event.h   | 18 +++---
 kernel/events/core.c  |  4 
 tools/include/uapi/linux/perf_event.h | 18 +++---
 3 files changed, 34 insertions(+), 6 deletions(-)

diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
index 912b85b52344..b8e288a1f740 100644
--- a/include/uapi/linux/perf_event.h
+++ b/include/uapi/linux/perf_event.h
@@ -650,11 +650,23 @@ struct perf_event_mmap_page {
 #define PERF_RECORD_MISC_COMM_EXEC (1 << 13)
 #define PERF_RECORD_MISC_SWITCH_OUT		(1 << 13)
 /*
- * Indicates that the content of PERF_SAMPLE_IP points to
- * the actual instruction that triggered the event. See also
- * perf_event_attr::precise_ip.
+ * These PERF_RECORD_MISC_* flags below are safely reused
+ * for the following events:
+ *
+ *   PERF_RECORD_MISC_EXACT_IP   - PERF_RECORD_SAMPLE of precise events
+ *   PERF_RECORD_MISC_SWITCH_OUT_PREEMPT - PERF_RECORD_SWITCH* events
+ *
+ *
+ * PERF_RECORD_MISC_EXACT_IP:
+ *   Indicates that the content of PERF_SAMPLE_IP points to
+ *   the actual instruction that triggered the event. See also
+ *   perf_event_attr::precise_ip.
+ *
+ * PERF_RECORD_MISC_SWITCH_OUT_PREEMPT:
+ *   Indicates that thread was preempted in TASK_RUNNING state.
  */
 #define PERF_RECORD_MISC_EXACT_IP  (1 << 14)
+#define PERF_RECORD_MISC_SWITCH_OUT_PREEMPT	(1 << 14)
 /*
  * Reserve the last bit to indicate some extended misc field
  */
diff --git a/kernel/events/core.c b/kernel/events/core.c
index fc1c330c6bd6..872a5aaa77eb 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -7584,6 +7584,10 @@ static void perf_event_switch(struct task_struct *task,
},
};
 
+   if (!sched_in && task->state == TASK_RUNNING)
+   switch_event.event_id.header.misc |=
+   PERF_RECORD_MISC_SWITCH_OUT_PREEMPT;
+
perf_iterate_sb(perf_event_switch_output,
   &switch_event,
   NULL);
diff --git a/tools/include/uapi/linux/perf_event.h b/tools/include/uapi/linux/perf_event.h
index 912b85b52344..b8e288a1f740 100644
--- a/tools/include/uapi/linux/perf_event.h
+++ b/tools/include/uapi/linux/perf_event.h
@@ -650,11 +650,23 @@ struct perf_event_mmap_page {
 #define PERF_RECORD_MISC_COMM_EXEC (1 << 13)
 #define PERF_RECORD_MISC_SWITCH_OUT		(1 << 13)
 /*
- * Indicates that the content of PERF_SAMPLE_IP points to
- * the actual instruction that triggered the event. See also
- * perf_event_attr::precise_ip.
+ * These PERF_RECORD_MISC_* flags below are safely reused
+ * for the following events:
+ *
+ *   PERF_RECORD_MISC_EXACT_IP   - PERF_RECORD_SAMPLE of precise events
+ *   PERF_RECORD_MISC_SWITCH_OUT_PREEMPT - PERF_RECORD_SWITCH* events
+ *
+ *
+ * PERF_RECORD_MISC_EXACT_IP:
+ *   Indicates that the content of PERF_SAMPLE_IP points to
+ *   the actual instruction that triggered the event. See also
+ *   perf_event_attr::precise_ip.
+ *
+ * PERF_RECORD_MISC_SWITCH_OUT_PREEMPT:
+ *   Indicates that thread was preempted in TASK_RUNNING state.
  */
 #define PERF_RECORD_MISC_EXACT_IP  (1 << 14)
+#define PERF_RECORD_MISC_SWITCH_OUT_PREEMPT	(1 << 14)
 /*
  * Reserve the last bit to indicate some extended misc field
  */



Re: [PATCH] nvme-multipath: implement active-active round-robin path selector

2018-03-29 Thread Baegjae Sung
2018-03-29 4:47 GMT+09:00 Keith Busch :
> On Wed, Mar 28, 2018 at 10:06:46AM +0200, Christoph Hellwig wrote:
>> For PCIe devices the right policy is not a round robin but to use
>> the pcie device closer to the node.  I did a prototype for that
>> long ago and the concept can work.  Can you look into that and
>> also make that policy used automatically for PCIe devices?
>
> Yeah, that is especially true if you've multiple storage accessing
> threads scheduled on different nodes. On the other hand, round-robin
> may still benefit if both paths are connected to different root ports
> on the same node (who would do that?!).
>
> But I wasn't aware people use dual-ported PCIe NVMe connected to a
> single host (single path from two hosts seems more common). If that's a
> thing, we should get some numa awareness. I couldn't find your prototype,
> though. I had one stashed locally from a while back and hope it resembles
> what you had in mind:

Our prototype uses dual-ported PCIe NVMe connected to a single host. The
host's HBA is connected to two switches, and the two switches are connected
to a dual-port NVMe SSD. In this environment, active-active round-robin path
selection is good to utilize the full performance of a dual-port NVMe SSD.
You can also fail over a single switch failure. You can see the prototype
in link below.
https://youtu.be/u_ou-AQsvOs?t=307 (presentation in OCP Summit 2018)

I agree that active-standby closer-path selection is the right policy if
multiple nodes attempt to access the storage system through multiple paths.
However, I believe that NVMe multipath needs to provide multiple policies
for path selection. Some people may want to use multiple paths
simultaneously (active-active) if they use a small number of nodes and want
to utilize the full capability. If the capability of the paths is the same,
round-robin can be the right policy. If the capability of the paths differs,
a more adaptive method would be needed (e.g., checking path condition to
balance IO).
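
For illustration, a naive round-robin selector written in the same style as
the NUMA-aware prototype quoted below might look roughly like this (a sketch
only, not code from our prototype; head->last_path is a hypothetical field
used just for the example):

struct nvme_ns *nvme_find_path_rr(struct nvme_ns_head *head)
{
	struct nvme_ns *ns, *first = NULL, *found = NULL;
	struct nvme_ns *prev = head->last_path;	/* hypothetical per-head state */
	bool take_next = (prev == NULL);

	list_for_each_entry_rcu(ns, &head->list, siblings) {
		if (ns->ctrl->state != NVME_CTRL_LIVE)
			continue;
		if (!first)
			first = ns;		/* remember for wrap-around */
		if (take_next) {
			found = ns;
			break;
		}
		if (ns == prev)
			take_next = true;	/* take the next live path */
	}
	if (!found)
		found = first;			/* wrap around, or NULL if no live path */
	head->last_path = found;
	return found;
}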

We are moving to NVMe fabrics for our next prototype, so I think we will
have a chance to discuss this policy issue in more detail. I will continue
to follow this issue.

> ---
> struct nvme_ns *nvme_find_path_numa(struct nvme_ns_head *head)
> {
> int distance, current = INT_MAX, node = 
> cpu_to_node(smp_processor_id());
> struct nvme_ns *ns, *path = NULL;
>
> list_for_each_entry_rcu(ns, &head->list, siblings) {
> if (ns->ctrl->state != NVME_CTRL_LIVE)
> continue;
> if (ns->disk->node_id == node)
> return ns;
>
> distance = node_distance(node, ns->disk->node_id);
> if (distance < current) {
> current = distance;
> path = ns;
> }
> }
> return path;
> }
> --


[PATCH v5 0/3] perf/core: expose thread context switch out event type to user space

2018-03-29 Thread Alexey Budankov

Implement the preempting context switch out event as a part of the
PERF_RECORD_SWITCH[_CPU_WIDE] record. The event is treated as a preemption
when the task->state value of the thread being switched out is TASK_RUNNING.

The percentage of preempting vs. non-preempting context switches helps in
understanding the nature of the workloads (CPU or IO bound) running on
the machine.

Event type encoding is implemented using a new
PERF_RECORD_MISC_SWITCH_OUT_PREEMPT bit in misc field of event record header;

Perf tool report and script commands have been extended to decode 
new PREEMPT bit and the updated output looks like this:

tools/perf/perf report -D -i perf.data | grep _SWITCH

0 768361415226 0x27f076 [0x28]: PERF_RECORD_SWITCH_CPU_WIDE IN   prev 
pid/tid: 8/8
4 768362216813 0x28f45e [0x28]: PERF_RECORD_SWITCH_CPU_WIDE OUT  next 
pid/tid: 0/0
4 768362217824 0x28f486 [0x28]: PERF_RECORD_SWITCH_CPU_WIDE IN   prev 
pid/tid:  4073/4073 
0 768362414027 0x27f0ce [0x28]: PERF_RECORD_SWITCH_CPU_WIDE OUT preempt  next 
pid/tid: 8/8
0 768362414367 0x27f0f6 [0x28]: PERF_RECORD_SWITCH_CPU_WIDE IN   prev 
pid/tid: 0/0

perf script --show-switch-events -F +misc -I -i perf.data:

  hdparm  4073 [004] U   762.198265: 380194 cycles:ppp:  
7faf727f5a23 strchr (/usr/lib64/ld-2.26.so)
  hdparm  4073 [004] K   762.198366: 441572 cycles:ppp:  
b9218435 alloc_set_pte (/lib/modules/4.16.0-rc6+/build/vmlinux)
  hdparm  4073 [004] S   762.198391: PERF_RECORD_SWITCH_CPU_WIDE 
OUT  next pid/tid: 0/0
 swapper 0 [004] 762.198392: PERF_RECORD_SWITCH_CPU_WIDE IN 
  prev pid/tid:  4073/4073 
 swapper 0 [004] Sp  762.198477: PERF_RECORD_SWITCH_CPU_WIDE 
OUT preempt  next pid/tid:  4073/4073 
  hdparm  4073 [004] 762.198478: PERF_RECORD_SWITCH_CPU_WIDE IN 
  prev pid/tid: 0/0
 swapper 0 [007] K   762.198514:2303073 cycles:ppp:  
b98b0c66 intel_idle (/lib/modules/4.16.0-rc6+/build/vmlinux)
 swapper 0 [007] Sp  762.198561: PERF_RECORD_SWITCH_CPU_WIDE 
OUT preempt  next pid/tid:  1134/1134 
  kworker/u16:18  1134 [007] 762.198562: PERF_RECORD_SWITCH_CPU_WIDE IN 
  prev pid/tid: 0/0
  kworker/u16:18  1134 [007] S   762.198567: PERF_RECORD_SWITCH_CPU_WIDE 
OUT  next pid/tid: 0/0

The documentation has been updated to mention PREEMPT switch out events 
and its decoding symbols in perf script output.
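
As an illustration of the encoding described above, a tool consuming the
event stream could classify switch-out records by testing the misc bits
roughly like this (a sketch; PERF_RECORD_MISC_SWITCH_OUT already exists in
the uapi header, while the numeric value of the new PREEMPT bit comes from
patch 1/3 of this series, which is not shown here):

#include <linux/perf_event.h>
#include <stdbool.h>

/* Sketch: classify a context-switch record from the perf ring buffer. */
static bool is_preempt_switch_out(const struct perf_event_header *hdr)
{
	if (hdr->type != PERF_RECORD_SWITCH &&
	    hdr->type != PERF_RECORD_SWITCH_CPU_WIDE)
		return false;
	if (!(hdr->misc & PERF_RECORD_MISC_SWITCH_OUT))
		return false;	/* a switch-in record */
	return hdr->misc & PERF_RECORD_MISC_SWITCH_OUT_PREEMPT;
}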

The changes have been manually tested on Fedora 27 with the patched kernel:
git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git perf/core

---
 Alexey Budankov (3):
perf/core: store context switch out type into Perf trace
perf report: extend raw dump (-D) out with switch out event type
perf script: extend misc field decoding with switch out event type

 include/uapi/linux/perf_event.h  | 18 +++---
 kernel/events/core.c |  4 
 tools/include/uapi/linux/perf_event.h| 18 +++---
 tools/perf/Documentation/perf-script.txt | 17 +
 tools/perf/builtin-script.c  |  5 -
 tools/perf/util/event.c  |  4 +++-
 6 files changed, 50 insertions(+), 16 deletions(-)


[PATCH] RDMA/ucma: reject AF_IB ip multicast requests

2018-03-29 Thread Greg Thelen
syzbot discovered that ucma_join_ip_multicast() mishandles AF_IB request
addresses.  If an RDMA_USER_CM_CMD_JOIN_IP_MCAST request has
cmd.addr.sa_family=AF_IB then ucma_join_ip_multicast() reads beyond the
end of its cmd.addr.

Reject non IP RDMA_USER_CM_CMD_JOIN_IP_MCAST requests.
RDMA_USER_CM_CMD_JOIN_MCAST is the interface for AF_IB multicast.

Also add a buffer length safety check.

Fixes: 5bc2b7b397b0 ("RDMA/ucma: Allow user space to specify AF_IB when joining 
multicast")
Signed-off-by: Greg Thelen 
---
 drivers/infiniband/core/ucma.c | 10 +-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/drivers/infiniband/core/ucma.c b/drivers/infiniband/core/ucma.c
index e5a1e7d81326..e410e03940ff 100644
--- a/drivers/infiniband/core/ucma.c
+++ b/drivers/infiniband/core/ucma.c
@@ -1423,11 +1423,19 @@ static ssize_t ucma_join_ip_multicast(struct ucma_file 
*file,
if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
return -EFAULT;
 
+   switch (cmd.addr.sin6_family) {
+   case AF_INET:
+   case AF_INET6:
+   break;
+   default:
+   return -EINVAL;
+   }
+
join_cmd.response = cmd.response;
join_cmd.uid = cmd.uid;
join_cmd.id = cmd.id;
join_cmd.addr_size = rdma_addr_size((struct sockaddr *) &cmd.addr);
-   if (!join_cmd.addr_size)
+   if (!join_cmd.addr_size || join_cmd.addr_size > sizeof(cmd.addr))
return -EINVAL;
 
join_cmd.join_flags = RDMA_MC_JOIN_FLAG_FULLMEMBER;
-- 
2.17.0.rc1.321.gba9d0f2565-goog



[PATCH v3] kbuild: use -fmacro-prefix-map to make __FILE__ a relative path

2018-03-29 Thread Masahiro Yamada
The __FILE__ macro is used everywhere in the kernel to locate the file
printing the log message, such as WARN_ON(), etc.  If the kernel is
built out of tree, this can be a long absolute path, like this:

  WARNING: CPU: 1 PID: 1 at /path/to/build/directory/arch/arm64/kernel/foo.c:...

This is because Kbuild runs in the objtree instead of the srctree,
then __FILE__ is expanded to a file path prefixed with $(srctree)/.

Commit 9da0763bdd82 ("kbuild: Use relative path when building in a
subdir of the source tree") improved this to some extent; $(srctree)
becomes ".." if the objtree is a child of the srctree.

For other cases of out-of-tree build, __FILE__ is still the absolute
path.  It also means the kernel image depends on where it was built.

A brand-new option from GCC, -fmacro-prefix-map, solves this problem.
If your compiler supports it, __FILE__ is the relative path from the
srctree regardless of O= option.  This provides more readable log and
more reproducible builds.

Please note __FILE__ is always an absolute path for external modules.

Signed-off-by: Masahiro Yamada 
---

I tested this on GCC 8.
(not released yet, but you can get the source code from the trunk.)


Changes in v3:
 - It is harmless to always set this option.
   Remove ifneq ifneq ($(KBUILD_SRC),)

Changes in v2:
 - Comment-in the ifeq

 Makefile | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/Makefile b/Makefile
index 7ba478a..83d25c9 100644
--- a/Makefile
+++ b/Makefile
@@ -856,6 +856,9 @@ KBUILD_CFLAGS   += $(call 
cc-option,-Werror=incompatible-pointer-types)
 # Require designated initializers for all marked structures
 KBUILD_CFLAGS   += $(call cc-option,-Werror=designated-init)
 
+# change __FILE__ to the relative path from the srctree
+KBUILD_CFLAGS  += $(call cc-option,-fmacro-prefix-map=$(srctree)/=)
+
 # use the deterministic mode of AR if available
 KBUILD_ARFLAGS := $(call ar-option,D)
 
-- 
2.7.4
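
To see the effect of the option outside of Kbuild, a trivial test program
can be used (paths below are illustrative):

/* test.c -- minimal illustration of -fmacro-prefix-map
 *
 *   gcc /home/user/src/test.c -o test
 *       -> prints /home/user/src/test.c
 *   gcc -fmacro-prefix-map=/home/user/src/= /home/user/src/test.c -o test
 *       -> prints test.c
 */
#include <stdio.h>

int main(void)
{
	printf("%s\n", __FILE__);
	return 0;
}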



Re: [PATCH 2/2] iommu/amd: Add basic debugfs infrastructure for AMD IOMMU

2018-03-29 Thread Tom Lendacky
On 3/29/2018 5:54 PM, Gary R Hook wrote:
> Implement a skeleton framework for debugfs support in the
> AMD IOMMU.
> 
> 
> Signed-off-by: Gary R Hook 
> ---
>  drivers/iommu/Kconfig |6 ++---
>  drivers/iommu/Makefile|2 +-
>  drivers/iommu/amd_iommu_debugfs.c |   47 
> +
>  drivers/iommu/amd_iommu_init.c|9 ---
>  drivers/iommu/amd_iommu_proto.h   |6 +
>  drivers/iommu/amd_iommu_types.h   |3 ++
>  include/linux/iommu.h |8 +++---
>  7 files changed, 69 insertions(+), 12 deletions(-)
>  create mode 100644 drivers/iommu/amd_iommu_debugfs.c
> 
> diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig
> index d40248446214..8d50151d5bf4 100644
> --- a/drivers/iommu/Kconfig
> +++ b/drivers/iommu/Kconfig
> @@ -65,9 +65,9 @@ config IOMMU_DEBUG
>   depends on DEBUG_FS
>   default n
>   help
> -   Base enablement for access to any IOMMU device. Allows individual
> -   drivers to populate debugfs for access to IOMMU registers and
> -   data structures.
> +   Enable exposure of IOMMU device internals. Allow devices to
> +   populate debugfs for access to IOMMU registers and data
> +   structures.

This help text shouldn't change just because a driver is making use of
the interface that was created in the previous patch.

>  
>  config IOMMU_IOVA
>   tristate
> diff --git a/drivers/iommu/Makefile b/drivers/iommu/Makefile
> index 5e5c3339681d..0ca250f626d9 100644
> --- a/drivers/iommu/Makefile
> +++ b/drivers/iommu/Makefile
> @@ -11,7 +11,7 @@ obj-$(CONFIG_IOMMU_IOVA) += iova.o
>  obj-$(CONFIG_OF_IOMMU)   += of_iommu.o
>  obj-$(CONFIG_MSM_IOMMU) += msm_iommu.o
>  obj-$(CONFIG_AMD_IOMMU) += amd_iommu.o amd_iommu_init.o
> -obj-$(CONFIG_AMD_IOMMU_DEBUG) += amd_iommu_debugfs.o
> +obj-$(CONFIG_IOMMU_DEBUG) += amd_iommu_debugfs.o
>  obj-$(CONFIG_AMD_IOMMU_V2) += amd_iommu_v2.o
>  obj-$(CONFIG_ARM_SMMU) += arm-smmu.o
>  obj-$(CONFIG_ARM_SMMU_V3) += arm-smmu-v3.o
> diff --git a/drivers/iommu/amd_iommu_debugfs.c 
> b/drivers/iommu/amd_iommu_debugfs.c
> new file mode 100644
> index ..3547ad3339c1
> --- /dev/null
> +++ b/drivers/iommu/amd_iommu_debugfs.c
> @@ -0,0 +1,47 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/*
> + * AMD IOMMU driver
> + *
> + * Copyright (C) 2018 Advanced Micro Devices, Inc.
> + *
> + * Author: Gary R Hook 
> + */
> +
> +#ifdef   CONFIG_IOMMU_DEBUG

Since the module won't be built unless this is defined, you don't
need this.

> +
> +#include 
> +#include 
> +#include 
> +#include "amd_iommu_proto.h"
> +#include "amd_iommu_types.h"
> +
> +static struct dentry *iommu_debugfs_dir;
> +static DEFINE_RWLOCK(iommu_debugfs_lock);

Use amd_iommu_...

Also, didn't you run into an issue with the CCP and debugfs creation
during probe using a RWLOCK?  Should probably make this a mutex.

> +
> +#define  MAX_NAME_LEN20
> +
> +void amd_iommu_debugfs_setup(struct amd_iommu *iommu)
> +{
> + char name[MAX_NAME_LEN + 1];
> + struct dentry *d_top;
> + unsigned long flags;
> +
> + if (!debugfs_initialized())
> + return;
> +
> + write_lock_irqsave(&iommu_debugfs_lock, flags);
> + if (!iommu_debugfs_dir && (d_top = iommu_debugfs_setup()))
> + iommu_debugfs_dir = debugfs_create_dir("amd", d_top);
> + write_unlock_irqrestore(&iommu_debugfs_lock, flags);
> + if (iommu_debugfs_dir) {
> + snprintf(name, MAX_NAME_LEN, "iommu%02d", iommu->index);
> + iommu->debugfs_instance = debugfs_create_dir(name,
> +  iommu_debugfs_dir);
> + if (!iommu->debugfs_instance)
> + debugfs_remove_recursive(iommu_debugfs_dir);

So this might cause an error.  You remove it, but don't set the variable
to NULL, so the next IOMMU that is probed could try to use it.  But why
are you deleting it anyway?

> + }
> +
> + return;
> +}
> +
> +#endif
> diff --git a/drivers/iommu/amd_iommu_init.c b/drivers/iommu/amd_iommu_init.c
> index 99d48c42a12f..43856c7f4ea1 100644
> --- a/drivers/iommu/amd_iommu_init.c
> +++ b/drivers/iommu/amd_iommu_init.c
> @@ -89,6 +89,7 @@
>  #define ACPI_DEVFLAG_ATSDIS 0x1000
>  
>  #define LOOP_TIMEOUT 10
> +

Spurious newline.

>  /*
>   * ACPI table definitions
>   *
> @@ -2720,6 +2721,7 @@ int __init amd_iommu_enable_faulting(void)
>   */
>  static int __init amd_iommu_init(void)
>  {
> + struct amd_iommu *iommu;
>   int ret;
>  
>   ret = iommu_go_to_state(IOMMU_INITIALIZED);
> @@ -2729,14 +2731,15 @@ static int __init amd_iommu_init(void)
>   disable_iommus();
>   free_iommu_resources();
>   } else {
> - struct amd_iommu *iommu;
> -
>   uninit_device_table_dma();
>   for_each_iommu(iommu)
>   iommu_flush_all_caches(iommu);
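
For illustration, a rough sketch of the setup path with the review comments
above applied (a mutex instead of the rwlock, driver-prefixed names, and no
teardown of the shared directory on failure) could look like this; it is a
sketch only, not the code under review:

static struct dentry *amd_iommu_debugfs_dir;
static DEFINE_MUTEX(amd_iommu_debugfs_lock);

void amd_iommu_debugfs_setup(struct amd_iommu *iommu)
{
	char name[MAX_NAME_LEN + 1];
	struct dentry *d_top;

	if (!debugfs_initialized())
		return;

	mutex_lock(&amd_iommu_debugfs_lock);
	if (!amd_iommu_debugfs_dir) {
		d_top = iommu_debugfs_setup();
		if (d_top)
			amd_iommu_debugfs_dir = debugfs_create_dir("amd", d_top);
	}
	mutex_unlock(&amd_iommu_debugfs_lock);

	if (!amd_iommu_debugfs_dir)
		return;

	snprintf(name, MAX_NAME_LEN, "iommu%02d", iommu->index);
	iommu->debugfs_instance = debugfs_create_dir(name,
						     amd_iommu_debugfs_dir);
}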

Re: linux-next 20180327 - "SELinux: (dev dm-3, type ext4) getxattr errno 34"

2018-03-29 Thread valdis . kletnieks
On Thu, 29 Mar 2018 21:32:21 -0400, "Theodore Y. Ts'o" said:

> Yes, the breakage is my fault; my apologies.  The new version of the
> patch is already posted in bugzilla (and on linux-ext4).  I'll be
> pushing out a refreshed ext4.git branch shortly.

Confirming that reverting de57a63ea4389e39b1cdd1cef15e1ec9b58a964c
and applying the new version from the bugzilla results in a clean boot.



pgpviSZYDm_s9.pgp
Description: PGP signature


Re: [PATCH 1/2] iommu - Enable debugfs exposure of the IOMMU

2018-03-29 Thread Tom Lendacky
On 3/29/2018 5:54 PM, Gary R Hook wrote:
> Provide base enablement for using debugfs to expose internal data of
> an IOMMU driver. When enabled, create the /sys/kernel/debug/iommu

So this can't actually create anything yet since nothing invokes the
function.  Maybe describe how it should be used by other drivers (and
probably include that description as a comment for the function) to
create the directory.

> directory.  Emit a strong warning at boot time to indicate that this
> feature is enabled.
> 
> Signed-off-by: Gary R Hook 
> ---
>  drivers/iommu/Kconfig  |   11 +++
>  drivers/iommu/Makefile |2 ++
>  drivers/iommu/amd_iommu_init.c |2 ++
>  drivers/iommu/iommu-debugfs.c  |   32 
>  include/linux/iommu.h  |4 
>  5 files changed, 51 insertions(+)
>  create mode 100644 drivers/iommu/iommu-debugfs.c
> 
> diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig
> index f3a21343e636..c1e39dabfec2 100644
> --- a/drivers/iommu/Kconfig
> +++ b/drivers/iommu/Kconfig
> @@ -60,6 +60,17 @@ config IOMMU_IO_PGTABLE_ARMV7S_SELFTEST
>  
>  endmenu
>  
> +config IOMMU_DEBUG
> + bool "Enable IOMMU internals in DebugFS"
> + depends on DEBUG_FS
> + default n
> + help
> +   Allows exposure of IOMMU device internals. This option enables
> +   the use of debugfs by IOMMU drivers as required. Devices can,
> +   at initialization time, cause the IOMMU code to create a top-level
> +   debug/iommu directory, and then populate a subdirectory with
> +   entries as required.
> +
>  config IOMMU_IOVA
>   tristate
>  
> diff --git a/drivers/iommu/Makefile b/drivers/iommu/Makefile
> index 1fb695854809..5e5c3339681d 100644
> --- a/drivers/iommu/Makefile
> +++ b/drivers/iommu/Makefile
> @@ -2,6 +2,7 @@
>  obj-$(CONFIG_IOMMU_API) += iommu.o
>  obj-$(CONFIG_IOMMU_API) += iommu-traces.o
>  obj-$(CONFIG_IOMMU_API) += iommu-sysfs.o
> +obj-$(CONFIG_IOMMU_DEBUG) += iommu-debugfs.o
>  obj-$(CONFIG_IOMMU_DMA) += dma-iommu.o
>  obj-$(CONFIG_IOMMU_IO_PGTABLE) += io-pgtable.o
>  obj-$(CONFIG_IOMMU_IO_PGTABLE_ARMV7S) += io-pgtable-arm-v7s.o
> @@ -10,6 +11,7 @@ obj-$(CONFIG_IOMMU_IOVA) += iova.o
>  obj-$(CONFIG_OF_IOMMU)   += of_iommu.o
>  obj-$(CONFIG_MSM_IOMMU) += msm_iommu.o
>  obj-$(CONFIG_AMD_IOMMU) += amd_iommu.o amd_iommu_init.o
> +obj-$(CONFIG_AMD_IOMMU_DEBUG) += amd_iommu_debugfs.o

Where is this CONFIG defined?

>  obj-$(CONFIG_AMD_IOMMU_V2) += amd_iommu_v2.o
>  obj-$(CONFIG_ARM_SMMU) += arm-smmu.o
>  obj-$(CONFIG_ARM_SMMU_V3) += arm-smmu-v3.o
> diff --git a/drivers/iommu/amd_iommu_init.c b/drivers/iommu/amd_iommu_init.c
> index 6fe2d0346073..99d48c42a12f 100644
> --- a/drivers/iommu/amd_iommu_init.c
> +++ b/drivers/iommu/amd_iommu_init.c
> @@ -2783,6 +2783,8 @@ int __init amd_iommu_detect(void)
>   iommu_detected = 1;
>   x86_init.iommu.iommu_init = amd_iommu_init;
>  
> +dump_stack();
> +

This definitely shouldn't be here.

>   return 1;
>  }
>  
> diff --git a/drivers/iommu/iommu-debugfs.c b/drivers/iommu/iommu-debugfs.c
> new file mode 100644
> index ..94c9acc63b65
> --- /dev/null
> +++ b/drivers/iommu/iommu-debugfs.c
> @@ -0,0 +1,32 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/*
> + * AMD IOMMU driver

This isn't the AMD IOMMU driver.

> + *
> + * Copyright (C) 2018 Advanced Micro Devices, Inc.
> + *
> + * Author: Gary R Hook 
> + */
> +
> +#include 
> +#include 
> +#include 
> +
> +static struct dentry *iommu_debugfs_dir;
> +
> +#define  MAX_NAME_LEN20

This isn't used anywhere.

> +
> +/* Return a zero on failure; 1 on successful setup */

Return NULL on failure, pointer to IOMMU debugfs dentry on success.

> +struct dentry *iommu_debugfs_setup(void)
> +{
> + if (!debugfs_initialized())
> + return NULL;
> +
> + if (!iommu_debugfs_dir)
> + iommu_debugfs_dir = debugfs_create_dir("iommu", NULL);
> +
> + if (iommu_debugfs_dir)
> + pr_warn("WARNING: IOMMU DEBUGFS SUPPORT HAS BEEN ENABLED IN 
> THIS KERNEL\n");
> +
> + return iommu_debugfs_dir;
> +}
> +EXPORT_SYMBOL_GPL(iommu_debugfs_setup);
> diff --git a/include/linux/iommu.h b/include/linux/iommu.h
> index 41b8c5757859..cb2957dac43b 100644
> --- a/include/linux/iommu.h
> +++ b/include/linux/iommu.h
> @@ -696,6 +696,10 @@ const struct iommu_ops *iommu_ops_from_fwnode(struct 
> fwnode_handle *fwnode)
>   return NULL;
>  }
>  
> +#ifdef   CONFIG_IOMMU_DEBUG

Should be a space between the #ifdef and the CONFIG_..., not a tab.

Thanks,
Tom

> +struct dentry *iommu_debugfs_setup(void);
> +#endif
> +
>  #endif /* CONFIG_IOMMU_API */
>  
>  #endif /* __LINUX_IOMMU_H */
> 
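
Regarding the first comment above about documenting the intended use, a
hypothetical driver-side caller of iommu_debugfs_setup() might look roughly
like this (illustrative only; the my_iommu structure and function name are
made up):

#include <linux/debugfs.h>

struct my_iommu {			/* made-up example device struct */
	struct dentry *debugfs;
};

static void my_iommu_init_debugfs(struct my_iommu *iommu)
{
	struct dentry *top;

	/* Creates /sys/kernel/debug/iommu on first use, or returns it. */
	top = iommu_debugfs_setup();
	if (!top)
		return;

	/* Each driver then owns its own subdirectory below it. */
	iommu->debugfs = debugfs_create_dir("my-iommu", top);
}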


[PATCH 0/2] add thermal monitor support for UniPhier PXs3 SoC

2018-03-29 Thread Kunihiko Hayashi
Add support for thermal monitor implemented on UniPhier PXs3 SoC.

Kunihiko Hayashi (2):
  dt-bindings: thermal: uniphier: add a compatible string for PXs3
  thermal: uniphier: add UniPhier PXs3 support

 Documentation/devicetree/bindings/thermal/uniphier-thermal.txt | 1 +
 drivers/thermal/uniphier_thermal.c | 4 
 2 files changed, 5 insertions(+)

-- 
2.7.4



[PATCH 2/2] thermal: uniphier: add UniPhier PXs3 support

2018-03-29 Thread Kunihiko Hayashi
Add support for UniPhier PXs3 SoC. It is equivalent to LD20.

Signed-off-by: Kunihiko Hayashi 
---
 drivers/thermal/uniphier_thermal.c | 4 
 1 file changed, 4 insertions(+)

diff --git a/drivers/thermal/uniphier_thermal.c 
b/drivers/thermal/uniphier_thermal.c
index 9570473..55477d7 100644
--- a/drivers/thermal/uniphier_thermal.c
+++ b/drivers/thermal/uniphier_thermal.c
@@ -365,6 +365,10 @@ static const struct of_device_id uniphier_tm_dt_ids[] = {
.compatible = "socionext,uniphier-ld20-thermal",
.data   = &uniphier_ld20_tm_data,
},
+   {
+   .compatible = "socionext,uniphier-pxs3-thermal",
+   .data   = &uniphier_ld20_tm_data,
+   },
{ /* sentinel */ }
 };
 MODULE_DEVICE_TABLE(of, uniphier_tm_dt_ids);
-- 
2.7.4
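
For context, the .data pointer added to the match table is what the probe
path uses to pick up the SoC-specific parameters; a simplified sketch of
that lookup is below (illustrative only; the struct name uniphier_tm_soc_data
is assumed from the driver, and the real probe function does more than this):

#include <linux/of_device.h>
#include <linux/platform_device.h>

static int uniphier_tm_probe_sketch(struct platform_device *pdev)
{
	const struct uniphier_tm_soc_data *data;

	/* For a "socionext,uniphier-pxs3-thermal" node this resolves to the
	 * same LD20 parameter set, which is why no new data is required. */
	data = of_device_get_match_data(&pdev->dev);
	if (!data)
		return -EINVAL;

	return 0;
}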



[PATCH 1/2] dt-bindings: thermal: uniphier: add a compatible string for PXs3

2018-03-29 Thread Kunihiko Hayashi
Add a compatible string for thermal monitor implemented on
UniPhier PXs3 SoC.

Signed-off-by: Kunihiko Hayashi 
---
 Documentation/devicetree/bindings/thermal/uniphier-thermal.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/Documentation/devicetree/bindings/thermal/uniphier-thermal.txt 
b/Documentation/devicetree/bindings/thermal/uniphier-thermal.txt
index 686c0b4..ceb92a9 100644
--- a/Documentation/devicetree/bindings/thermal/uniphier-thermal.txt
+++ b/Documentation/devicetree/bindings/thermal/uniphier-thermal.txt
@@ -8,6 +8,7 @@ Required properties:
 - compatible :
   - "socionext,uniphier-pxs2-thermal" : For UniPhier PXs2 SoC
   - "socionext,uniphier-ld20-thermal" : For UniPhier LD20 SoC
+  - "socionext,uniphier-pxs3-thermal" : For UniPhier PXs3 SoC
 - interrupts : IRQ for the temperature alarm
 - #thermal-sensor-cells : Should be 0. See ./thermal.txt for details.
 
-- 
2.7.4



Re: [PATCH v3] staging: vt6655: check for memory allocation failures

2018-03-29 Thread Jia-Ju Bai



On 2018/3/30 11:39, Ji-Hun Kim wrote:

On Fri, Mar 30, 2018 at 11:15:03AM +0800, Jia-Ju Bai wrote:


On 2018/3/30 10:44, Ji-Hun Kim wrote:

@@ -1165,10 +1205,18 @@ static int vnt_start(struct ieee80211_hw *hw)
}
dev_dbg(&priv->pcid->dev, "call device init rd0 ring\n");
-   device_init_rd0_ring(priv);
-   device_init_rd1_ring(priv);
-   device_init_td0_ring(priv);
-   device_init_td1_ring(priv);
+   ret = device_init_rd0_ring(priv);
+   if (ret)
+   goto error;
+   ret = device_init_rd1_ring(priv);
+   if (ret)
+   goto error;
+   ret = device_init_td0_ring(priv);
+   if (ret)
+   goto error;
+   ret = device_init_td1_ring(priv);
+   if (ret)
+   goto error;
device_init_registers(priv);
@@ -1178,6 +1226,8 @@ static int vnt_start(struct ieee80211_hw *hw)
ieee80211_wake_queues(hw);
return 0;
+error:
+   return ret;
  }

This code will lead to memory leaks when device_init_rd1_ring()
fails, because the memory allocated by device_init_rd0_ring() is not
freed.

I think this one will be better:
 ret = device_init_rd0_ring(priv);
 if (ret)
 goto error_init_rd0_ring;
 ret = device_init_rd1_ring(priv);
 if (ret)
 goto error_init_rd1_ring;
 ret = device_init_td0_ring(priv);
 if (ret)
 goto error_init_td0_ring;
 ret = device_init_td1_ring(priv);
 if (ret)
 goto error_init_td1_ring;
 ..
error_init_td1_ring:
 device_free_td0_ring(priv);
error_init_td0_ring:
 device_free_rd1_ring(priv);
error_init_rd1_ring:
 device_free_rd0_ring(priv);
error_init_rd0_ring:
 return ret;


Best wishes,
Jia-Ju Bai



But those freeing functions are already placed in each device_init
function for allocation failures, like below.


I think it is okay.
I suppose that, for each device_init function, you only free the 
resources allocated in this function, and do not handle the resources in 
other functions.



@@ -550,20 +554,29 @@ static void device_init_rd0_ring(struct vnt_private *priv)
+   return 0;
+error:
+   device_free_rd0_ring(priv);
+   return ret;


This code is needed to free the resources allocated in this function.


Best wishes,
Jia-Ju Bai


Re: [PATCH v3] staging: vt6655: check for memory allocation failures

2018-03-29 Thread Ji-Hun Kim
On Fri, Mar 30, 2018 at 11:15:03AM +0800, Jia-Ju Bai wrote:
> 
> 
> On 2018/3/30 10:44, Ji-Hun Kim wrote:
> >@@ -1165,10 +1205,18 @@ static int vnt_start(struct ieee80211_hw *hw)
> > }
> > dev_dbg(&priv->pcid->dev, "call device init rd0 ring\n");
> >-device_init_rd0_ring(priv);
> >-device_init_rd1_ring(priv);
> >-device_init_td0_ring(priv);
> >-device_init_td1_ring(priv);
> >+ret = device_init_rd0_ring(priv);
> >+if (ret)
> >+goto error;
> >+ret = device_init_rd1_ring(priv);
> >+if (ret)
> >+goto error;
> >+ret = device_init_td0_ring(priv);
> >+if (ret)
> >+goto error;
> >+ret = device_init_td1_ring(priv);
> >+if (ret)
> >+goto error;
> > device_init_registers(priv);
> >@@ -1178,6 +1226,8 @@ static int vnt_start(struct ieee80211_hw *hw)
> > ieee80211_wake_queues(hw);
> > return 0;
> >+error:
> >+return ret;
> >  }
> 
> This code will lead to memory leaks when device_init_rd1_ring()
> fails, because the memory allocated by device_init_rd0_ring() is not
> freed.
> 
> I think this one will be better:
> ret = device_init_rd0_ring(priv);
> if (ret)
> goto error_init_rd0_ring;
> ret = device_init_rd1_ring(priv);
> if (ret)
> goto error_init_rd1_ring;
> ret = device_init_td0_ring(priv);
> if (ret)
> goto error_init_td0_ring;
> ret = device_init_td1_ring(priv);
> if (ret)
> goto error_init_td1_ring;
> ..
> error_init_td1_ring:
> device_free_td0_ring(priv);
> error_init_td0_ring:
> device_free_rd1_ring(priv);
> error_init_rd1_ring:
> device_free_rd0_ring(priv);
> error_init_rd0_ring:
> return ret;
> 
> 
> Best wishes,
> Jia-Ju Bai
> 
> 
Oh, sorry, I got what you said. Yes, you are right. I am going to make patch 
v4. Thanks!

Best regards,
Ji-Hun


Re: [PATCH v3] staging: vt6655: check for memory allocation failures

2018-03-29 Thread Ji-Hun Kim
On Fri, Mar 30, 2018 at 11:15:03AM +0800, Jia-Ju Bai wrote:
> 
> 
> On 2018/3/30 10:44, Ji-Hun Kim wrote:
> >@@ -1165,10 +1205,18 @@ static int vnt_start(struct ieee80211_hw *hw)
> > }
> > dev_dbg(&priv->pcid->dev, "call device init rd0 ring\n");
> >-device_init_rd0_ring(priv);
> >-device_init_rd1_ring(priv);
> >-device_init_td0_ring(priv);
> >-device_init_td1_ring(priv);
> >+ret = device_init_rd0_ring(priv);
> >+if (ret)
> >+goto error;
> >+ret = device_init_rd1_ring(priv);
> >+if (ret)
> >+goto error;
> >+ret = device_init_td0_ring(priv);
> >+if (ret)
> >+goto error;
> >+ret = device_init_td1_ring(priv);
> >+if (ret)
> >+goto error;
> > device_init_registers(priv);
> >@@ -1178,6 +1226,8 @@ static int vnt_start(struct ieee80211_hw *hw)
> > ieee80211_wake_queues(hw);
> > return 0;
> >+error:
> >+return ret;
> >  }
> 
> This code will lead to memory leaks when device_init_rd1_ring()
> fails, because the memory allocated by device_init_rd0_ring() is not
> freed.
> 
> I think this one will be better:
> ret = device_init_rd0_ring(priv);
> if (ret)
> goto error_init_rd0_ring;
> ret = device_init_rd1_ring(priv);
> if (ret)
> goto error_init_rd1_ring;
> ret = device_init_td0_ring(priv);
> if (ret)
> goto error_init_td0_ring;
> ret = device_init_td1_ring(priv);
> if (ret)
> goto error_init_td1_ring;
> ..
> error_init_td1_ring:
> device_free_td0_ring(priv);
> error_init_td0_ring:
> device_free_rd1_ring(priv);
> error_init_rd1_ring:
> device_free_rd0_ring(priv);
> error_init_rd0_ring:
> return ret;
> 
> 
> Best wishes,
> Jia-Ju Bai
> 
> 
But those freeing functions are already placed in each device_init
function for allocation failures, like below.

@@ -550,20 +554,29 @@ static void device_init_rd0_ring(struct vnt_private *priv)
+   return 0;
+error:
+   device_free_rd0_ring(priv);
+   return ret;

Would freeing in vnt_start() be better, instead of freeing in the
device_init_rd0_ring() function?

Best regards,
Ji-Hun


Re: [PATCH v1] kernel/trace:check the val against the available mem

2018-03-29 Thread Zhaoyang Huang
On Fri, Mar 30, 2018 at 12:05 AM, Steven Rostedt  wrote:
> On Thu, 29 Mar 2018 18:41:44 +0800
> Zhaoyang Huang  wrote:
>
>> It is reported that some user app would like to echo a huge
>> number to "/sys/kernel/debug/tracing/buffer_size_kb" regardless
>> of the available memory, which will cause the subsequent page
>> allocation to fail and introduce OOM. This commit checks the val
>> against the available memory first to avoid the doomed allocation.
>>
>
> One of my tests is to stress buffer_size_kb, and it fails nicely if you
> try to get too much. Although, it may cause an OOM, but that's expected.
>
> The application should do the test (try "free" on the command line).
> This isn't something that the kernel should be responsible for. If
> someone wants to allocate all memory for tracing, that's their
> prerogative.
>
> -- Steve
Steve, thanks for your quick feedback. The original purpose is to avoid this
kind of OOM, since the kernel cannot define the behavior of the application.
I think there is no need to go through the alloc->free process if we already
know that the number of pages will definitely lead to failure.
Furthermore, the app which screws things up usually escapes the OOM killer,
and an innocent process gets sacrificed instead.
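
For reference, the kind of up-front check being discussed could be sketched
as follows (illustrative only, not the submitted patch; si_mem_available()
is the existing helper that estimates allocatable memory, and where exactly
such a check would sit in the ring buffer resize path is left open):

static int ring_buffer_check_mem(unsigned long nr_pages)
{
	long avail = si_mem_available();	/* estimate of allocatable pages */

	if (avail <= 0 || nr_pages > (unsigned long)avail)
		return -ENOMEM;

	return 0;
}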


linux-next: Signed-off-by missing for commits in the spi-nor tree

2018-03-29 Thread Stephen Rothwell
Hi Cyrille,

Commits

  3efa50903ff3 ("mtd: fsl-quadspi: Distinguish the mtd device names")
  85da6da22843 ("dt-bindings: fsl-quadspi: Add the example of two SPI NOR")

are missing a Signed-off-by from their committer.

P.S. should I be changing the contact I have for the spi-nor trees?

-- 
Cheers,
Stephen Rothwell


pgpk_7oFJnJmr.pgp
Description: OpenPGP digital signature


Re: [PATCH v3] x86,sched: allow topologies where NUMA nodes share an LLC

2018-03-29 Thread kbuild test robot
Hi Alison,

Thank you for the patch! Perhaps something to improve:

[auto build test WARNING on v4.16-rc7]
[also build test WARNING on next-20180329]
[cannot apply to tip/x86/core]
[if your patch is applied to the wrong git tree, please drop us a note to help 
improve the system]

url:
https://github.com/0day-ci/linux/commits/Alison-Schofield/x86-sched-allow-topologies-where-NUMA-nodes-share-an-LLC/20180330-070743
config: x86_64-allyesdebian (attached as .config)
compiler: gcc-7 (Debian 7.3.0-1) 7.3.0
reproduce:
# save the attached .config to linux build tree
make ARCH=x86_64 

All warnings (new ones prefixed by >>):

>> WARNING: vmlinux.o(.text+0x9601e): Section mismatch in reference from the 
>> function set_cpu_sibling_map() to the variable .init.rodata:snc_cpu
   The function set_cpu_sibling_map() references
   the variable __initconst snc_cpu.
   This is often because set_cpu_sibling_map lacks a __initconst
   annotation or the annotation of snc_cpu is wrong.
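
For readers unfamiliar with this class of warning, the flagged pattern looks
roughly like the following (illustrative, not the actual patch code): data
marked __initconst lives in a section that is freed after boot, so
referencing it from a function that is not __init is unsafe. The usual fix
is to drop __initconst or to make the referencing path __init.

#include <asm/cpu_device_id.h>
#include <linux/init.h>

/* snc_cpu ends up in .init.rodata and is freed after boot ... */
static const struct x86_cpu_id snc_cpu[] __initconst = {
	{ X86_VENDOR_INTEL, 6, 0x55 },	/* family/model values illustrative */
	{}
};

/* ... but set_cpu_sibling_map() is not __init and can run later,
 * so the reference triggers the section mismatch warning. */
void set_cpu_sibling_map(int cpu)
{
	if (x86_match_cpu(snc_cpu))
		;	/* NUMA-node-shares-LLC handling would go here */
}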

---
0-DAY kernel test infrastructureOpen Source Technology Center
https://lists.01.org/pipermail/kbuild-all   Intel Corporation


.config.gz
Description: application/gzip


Re: [PATCH] MAINTAINERS: update entry for entry for ARM/berlin

2018-03-29 Thread Jisheng Zhang
Hi,

On Thu, 29 Mar 2018 15:08:04 +0200 Andrew Lunn wrote:

> On Thu, Mar 29, 2018 at 04:42:40PM +0800, Jisheng Zhang wrote:
> > 
> > Change the entry name and move it to its alphabetical location.
> > We move to ARM/Synaptics instead of ARM/Marvell.  
> 
> Hi Jisheng
> 
> Please could you add some comments about this Marvell->Synaptics
> change.
> 
> Has Marvell sold the Berlin chip group to Synaptics?
> 

Thanks for the review. Synaptics has acquired the Multimedia Solutions
Business of Marvell.

https://www.synaptics.com/company/news/conexant-marvell

I have sent out v2.

Thanks


[PATCH v4 2/6] doc: Add documentation for Coresight panic kdump

2018-03-29 Thread Leo Yan
Add detailed documentation for Coresight panic kdump. It explains why
Coresight panic kdump is needed and introduces the implementation of the
Coresight panic kdump framework; the last section explains its usage.

Credits to Mathieu Poirier for many suggestions since the review of the
first version of this patch.  The suggestions include using an array to
manage dump-related info, which makes the code scalable to more CPUs; the
Coresight kdump driver and the integration of the kdump flow with other
Coresight devices also incorporate many ideas from Mathieu.

Suggested-by: Mathieu Poirier 
Signed-off-by: Leo Yan 
---
 .../trace/coresight/coresight-panic-kdump.txt  | 130 +
 MAINTAINERS|   1 +
 2 files changed, 131 insertions(+)
 create mode 100644 Documentation/trace/coresight/coresight-panic-kdump.txt

diff --git a/Documentation/trace/coresight/coresight-panic-kdump.txt 
b/Documentation/trace/coresight/coresight-panic-kdump.txt
new file mode 100644
index 000..c02e520
--- /dev/null
+++ b/Documentation/trace/coresight/coresight-panic-kdump.txt
@@ -0,0 +1,130 @@
+   Coresight Panic Kdump
+   =
+
+   Author:   Leo Yan 
+   Date: March 29th, 2018
+
+Introduction
+
+
+Coresight has different sinks for trace data, the trace data is quite useful
+for postmortem debugging.  Embedded Trace Buffer (ETB) is one type sink which
+provides on-chip storage of trace data, usually uses SRAM as the buffer with
+several KBs size; if the SoC designs to support 'Local ETF' (ARM DDI 0461B,
+chapter 1.2.7), every CPU has one local ETB buffer so the per CPU trace data
+can avoid being overwritten by each other.  Trace Memory Controller (TMC) is
+another kind sink designed as a successor to the CoreSight ETB to capture trace
+into DRAM.
+
+After Linux kernel panic has occurred, the trace data keeps the last execution
+flow before issues happen.  We could consider the trace data is quite useful 
for
+postmortem debugging, especially when we can save trace data into DRAM and 
rely on
+kdump to preserve them into vmcore file; at the end, we can retrieve trace data
+from vmcore file and "offline" to analyze the execution flow.
+
+
+Implementation
+--
+
+Coresight panic kdump is a simple framework to support Coresight dump
+functionality when panic happens, it maintains an array for the dump, every 
array
+item is dedicated to one specific CPU by using CPU number as an index.  For
+'offline' recovery and analysis Coresight tracing data, except should to 
recovery
+tracing data for sinks, we also need to know CPU tracer configurations; for 
this
+purpose, the array item is a structure which combines source and sink device
+handlers, the device handler points to Coresight device structure which 
contains
+dump info: dump buffer base address and buffer size.  Below diagram is to
+present data structures relationship:
+
+  array: coresight_kdump_nodes
+  +--+--+--+
+  | CPU0 | CPU1 |   ...|
+  +--+--+--+
+ |
+ V
+  coresight_kdump_node  coresight_device
+  +---+ +---+
+  |source_csdev   | --> |kdump_buf  |
+  +---+  /  +---+
+  |sink_csdev | '   |kdump_buf_sz   |
+  +---+ +---+
+
+Every CPU has its own tracer, we need save tracer registers for tracer ID and
+configuration related information as metadata, the metadata is used by tracing
+decoder.  But the tracer has the different configuration at the different 
phase,
+below diagram explains tracer configurations for different time points: at the
+system boot phase, the tracer is disabled so its registers have not been set;
+after tracer has been enabled or when panic happens, tracer registers have been
+configured, but we need to consider if there has CPU is locked up at panic 
phase
+then this dead CPU has no chance to handle inter-processor interrupt for panic
+dump; thus we choose tracer enabling phase to save tracer metadata.  Coresight
+panic kdump provides API coresight_kdump_source() as helper function for
+recording tracer metadata.
+
+ Boot Enabling Panic
+
+  Timeline:  --->|--->|--->|--->
+
+  Tracer:Disabled Configured   Configured
+  Sink:  Disabled Enabled  Enabled with tracing data
+||
+|`--> Tracing data dump
+|
+`--> Tracer metadata dump
+
+After enabling Coresight sink device, function coresight_kdump_sink() is used 
to
+set sink device handler for related CPU; sink device 

[PATCH v4 2/6] doc: Add documentation for Coresight panic kdump

2018-03-29 Thread Leo Yan
Add detailed documentation for Coresight panic kdump.  It explains why
Coresight panic kdump is needed, introduces the implementation of the
Coresight panic kdump framework, and ends with a section describing its
usage.

Credits to Mathieu Poirier for many suggestions since the review of the
first version of this patch.  The suggestions include using an array to
manage dump related info, which makes the code scale to more CPUs; the
Coresight kdump driver and the integration of the kdump flow with other
Coresight devices also draw many ideas from Mathieu.

Suggested-by: Mathieu Poirier 
Signed-off-by: Leo Yan 
---
 .../trace/coresight/coresight-panic-kdump.txt  | 130 +
 MAINTAINERS|   1 +
 2 files changed, 131 insertions(+)
 create mode 100644 Documentation/trace/coresight/coresight-panic-kdump.txt

diff --git a/Documentation/trace/coresight/coresight-panic-kdump.txt 
b/Documentation/trace/coresight/coresight-panic-kdump.txt
new file mode 100644
index 000..c02e520
--- /dev/null
+++ b/Documentation/trace/coresight/coresight-panic-kdump.txt
@@ -0,0 +1,130 @@
+   Coresight Panic Kdump
+   =
+
+   Author:   Leo Yan 
+   Date: March 29th, 2018
+
+Introduction
+
+
+Coresight has different sinks for trace data, the trace data is quite useful
+for postmortem debugging.  Embedded Trace Buffer (ETB) is one type sink which
+provides on-chip storage of trace data, usually uses SRAM as the buffer with
+several KBs size; if the SoC designs to support 'Local ETF' (ARM DDI 0461B,
+chapter 1.2.7), every CPU has one local ETB buffer so the per CPU trace data
+can avoid being overwritten by each other.  Trace Memory Controller (TMC) is
+another kind sink designed as a successor to the CoreSight ETB to capture trace
+into DRAM.
+
+After Linux kernel panic has occurred, the trace data keeps the last execution
+flow before issues happen.  We could consider the trace data is quite useful 
for
+postmortem debugging, especially when we can save trace data into DRAM and 
rely on
+kdump to preserve them into vmcore file; at the end, we can retrieve trace data
+from vmcore file and "offline" to analyze the execution flow.
+
+
+Implementation
+--
+
+Coresight panic kdump is a simple framework to support Coresight dump
+functionality when panic happens, it maintains an array for the dump, every 
array
+item is dedicated to one specific CPU by using CPU number as an index.  For
+'offline' recovery and analysis Coresight tracing data, except should to 
recovery
+tracing data for sinks, we also need to know CPU tracer configurations; for 
this
+purpose, the array item is a structure which combines source and sink device
+handlers, the device handler points to Coresight device structure which 
contains
+dump info: dump buffer base address and buffer size.  Below diagram is to
+present data structures relationship:
+
+  array: coresight_kdump_nodes
+  +--+--+--+
+  | CPU0 | CPU1 |   ...|
+  +--+--+--+
+ |
+ V
+  coresight_kdump_node  coresight_device
+  +---+ +---+
+  |source_csdev   | --> |kdump_buf  |
+  +---+  /  +---+
+  |sink_csdev | '   |kdump_buf_sz   |
+  +---+ +---+
+
+Every CPU has its own tracer, we need save tracer registers for tracer ID and
+configuration related information as metadata, the metadata is used by tracing
+decoder.  But the tracer has the different configuration at the different 
phase,
+below diagram explains tracer configurations for different time points: at the
+system boot phase, the tracer is disabled so its registers have not been set;
+after tracer has been enabled or when panic happens, tracer registers have been
+configured, but we need to consider if there has CPU is locked up at panic 
phase
+then this dead CPU has no chance to handle inter-processor interrupt for panic
+dump; thus we choose tracer enabling phase to save tracer metadata.  Coresight
+panic kdump provides API coresight_kdump_source() as helper function for
+recording tracer metadata.
+
+ Boot Enabling Panic
+
+  Timeline:  --->|--->|--->|--->
+
+  Tracer:Disabled Configured   Configured
+  Sink:  Disabled Enabled  Enabled with tracing data
+||
+|`--> Tracing data dump
+|
+`--> Tracer metadata dump
+
+After enabling Coresight sink device, function coresight_kdump_sink() is used 
to
+set sink device handler for related CPU; sink device handler points to 
Coresight
+device structure, furthermore we can 
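
As a reading aid, here is a rough sketch (not code from the series) of
the per-CPU lookup the diagram above describes, written as it might
appear inside the framework's coresight-panic-kdump.c.  The structure
and field names follow the "coresight: Support panic kdump
functionality" patch; coresight_kdump_cpu() itself is a hypothetical
helper used only for illustration.

    static void coresight_kdump_cpu(int cpu)
    {
            struct coresight_kdump_node *node = &coresight_kdump_nodes[cpu];
            struct coresight_device *source = node->source_csdev;
            struct coresight_device *sink = node->sink_csdev;

            /* Tracer metadata recorded at enabling time (see timeline above). */
            if (source)
                    pr_debug("cpu%d: metadata at %p (%u bytes)\n", cpu,
                             source->kdump_buf,
                             (unsigned int)source->kdump_buf_sz);

            /* Trace data flushed by the sink's panic callback into kdump_buf. */
            if (sink)
                    pr_debug("cpu%d: trace data at %p (%u bytes)\n", cpu,
                             sink->kdump_buf,
                             (unsigned int)sink->kdump_buf_sz);
    }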

Re: [PATCH V3 0/4] genirq/affinity: irq vector spread among online CPUs as far as possible

2018-03-29 Thread Ming Lei
Hi Thomas,

On Fri, Mar 09, 2018 at 04:08:19PM +0100, Thomas Gleixner wrote:
> On Fri, 9 Mar 2018, Ming Lei wrote:
> > On Fri, Mar 09, 2018 at 11:08:54AM +0100, Thomas Gleixner wrote:
> > > > > So my understanding is that these irq patches are enhancements and 
> > > > > not bug
> > > > > fixes. I'll queue them for 4.17 then.
> > > > 
> > > > Wrt. this IO hang issue, these patches shouldn't be bug fix, but they 
> > > > may
> > > > fix performance regression[1] for some systems caused by 84676c1f21 
> > > > ("genirq/affinity:
> > > > assign vectors to all possible CPUs").
> > > > 
> > > > [1] https://marc.info/?l=linux-block=152050347831149=2
> > > 
> > > Hmm. The patches are rather large for urgent and evtl. backporting. Is
> > > there a simpler way to address that performance issue?
> > 
> > Not thought of a simpler solution. The problem is that number of active 
> > msix vector
> > is decreased a lot by commit 84676c1f21.
> 
> It's reduced in cases where the number of possible CPUs is way larger than
> the number of online CPUs.
> 
> Now, if you look at the number of present CPUs on such systems it's
> probably the same as the number of online CPUs.
> 
> It only differs on machines which support physical hotplug, but that's not
> the normal case. Those systems are more special and less wide spread.
> 
> So the obvious simple fix for this regression issue is to spread out the
> vectors accross present CPUs and not accross possible CPUs.
> 
> I'm not sure if there is a clear indicator whether physcial hotplug is
> supported or not, but the ACPI folks (x86) and architecture maintainers
> should be able to answer that question. I have a machine which says:
> 
>smpboot: Allowing 128 CPUs, 96 hotplug CPUs
> 
> There is definitely no way to hotplug anything on that machine and sure the
> existing spread algorithm will waste vectors to no end.

Percpu variables may waste space too if the possible CPU number provided
by ACPI is not accurate.

> 
> Sure then there is virt, which can pretend to have a gazillion of possible
> hotpluggable CPUs, but virt is an insanity on its own. Though someone might
> come up with reasonable heuristics for that as well.

There is also IBM s390, on which physical CPU hotplug is a normal use
case.

I have not seen any other solution posted for virt, and re-introducing a
CPU hotplug handler for blk-mq may cause complicated queue dependency
issues.

> 
> Thoughts?

Given this patchset doesn't affect normal machines that don't support
physical CPU hotplug, it can fix the performance regression on machines
which might support physical CPU hotplug (cpu_present_mask !=
cpu_possible_mask), at the cost of some extra memory allocation.

So is there any chance to make it in v4.17?

Thanks,
Ming
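
For readers following the present-vs-possible distinction in this
thread, here is a trivial sketch (not part of the patch set) of the
three masks involved.  On machines that advertise many hotplug-only
slots, cpu_possible_mask is much wider than cpu_present_mask, which is
why spreading vectors across possible CPUs leaves vectors idle:

    #include <linux/cpumask.h>
    #include <linux/printk.h>

    static void report_spread_width(void)
    {
            pr_info("possible=%u present=%u online=%u\n",
                    cpumask_weight(cpu_possible_mask),
                    cpumask_weight(cpu_present_mask),
                    cpumask_weight(cpu_online_mask));
    }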


[PATCH v4 5/6] coresight: Set and clear sink device handler for kdump node

2018-03-29 Thread Leo Yan
If a Coresight path is enabled for a specific CPU, the sink device
handler needs to be set in the kdump node; conversely, the sink device
handler needs to be cleared when the path is disabled.

This patch sets the sink device handler in the kdump node for the two
separate Coresight enabling modes, CS_MODE_SYSFS and CS_MODE_PERF, and
clears the handler when Coresight is disabled.

Signed-off-by: Leo Yan 
---
 drivers/hwtracing/coresight/coresight-etm-perf.c |  5 +
 drivers/hwtracing/coresight/coresight.c  | 16 ++--
 2 files changed, 19 insertions(+), 2 deletions(-)

diff --git a/drivers/hwtracing/coresight/coresight-etm-perf.c 
b/drivers/hwtracing/coresight/coresight-etm-perf.c
index 8a0ad77..f8b159c 100644
--- a/drivers/hwtracing/coresight/coresight-etm-perf.c
+++ b/drivers/hwtracing/coresight/coresight-etm-perf.c
@@ -139,6 +139,8 @@ static void free_event_data(struct work_struct *work)
for_each_cpu(cpu, mask) {
if (!(IS_ERR_OR_NULL(event_data->path[cpu])))
coresight_release_path(event_data->path[cpu]);
+
+   coresight_kdump_sink(cpu, NULL);
}
 
kfree(event_data->path);
@@ -238,6 +240,9 @@ static void *etm_setup_aux(int event_cpu, void **pages,
event_data->path[cpu] = coresight_build_path(csdev, sink);
if (IS_ERR(event_data->path[cpu]))
goto err;
+
+   if (coresight_kdump_sink(cpu, sink))
+   goto err;
}
 
if (!sink_ops(sink)->alloc_buffer)
diff --git a/drivers/hwtracing/coresight/coresight.c 
b/drivers/hwtracing/coresight/coresight.c
index 389c4ba..483a1f7 100644
--- a/drivers/hwtracing/coresight/coresight.c
+++ b/drivers/hwtracing/coresight/coresight.c
@@ -272,6 +272,7 @@ static int coresight_enable_source(struct coresight_device 
*csdev, u32 mode)
 static bool coresight_disable_source(struct coresight_device *csdev)
 {
if (atomic_dec_return(csdev->refcnt) == 0) {
+
if (source_ops(csdev)->disable)
source_ops(csdev)->disable(csdev, NULL);
csdev->enable = false;
@@ -612,6 +613,13 @@ int coresight_enable(struct coresight_device *csdev)
if (ret)
goto err_source;
 
+   cpu = source_ops(csdev)->cpu_id(csdev);
+
+   /* Set sink device handler into kdump node */
+   ret = coresight_kdump_sink(cpu, sink);
+   if (ret)
+   goto err_kdump;
+
switch (subtype) {
case CORESIGHT_DEV_SUBTYPE_SOURCE_PROC:
/*
@@ -621,7 +629,6 @@ int coresight_enable(struct coresight_device *csdev)
 * be a single session per tracer (when working from sysFS)
 * a per-cpu variable will do just fine.
 */
-   cpu = source_ops(csdev)->cpu_id(csdev);
per_cpu(tracer_path, cpu) = path;
break;
case CORESIGHT_DEV_SUBTYPE_SOURCE_SOFTWARE:
@@ -636,6 +643,9 @@ int coresight_enable(struct coresight_device *csdev)
mutex_unlock(_mutex);
return ret;
 
+err_kdump:
+   coresight_disable_source(csdev);
+
 err_source:
coresight_disable_path(path);
 
@@ -659,9 +669,10 @@ void coresight_disable(struct coresight_device *csdev)
if (!csdev->enable || !coresight_disable_source(csdev))
goto out;
 
+   cpu = source_ops(csdev)->cpu_id(csdev);
+
switch (csdev->subtype.source_subtype) {
case CORESIGHT_DEV_SUBTYPE_SOURCE_PROC:
-   cpu = source_ops(csdev)->cpu_id(csdev);
path = per_cpu(tracer_path, cpu);
per_cpu(tracer_path, cpu) = NULL;
break;
@@ -674,6 +685,7 @@ void coresight_disable(struct coresight_device *csdev)
break;
}
 
+   coresight_kdump_sink(cpu, NULL);
coresight_disable_path(path);
coresight_release_path(path);
 
-- 
2.7.4
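
Since coresight.c and coresight-etm-perf.c now call
coresight_kdump_sink() unconditionally, the series presumably also
provides no-op fallbacks for builds without CONFIG_CORESIGHT_PANIC_KDUMP.
The coresight-priv.h hunk is not shown in this archive, so the following
is only a guess at what such fallbacks could look like:

    #ifdef CONFIG_CORESIGHT_PANIC_KDUMP
    int coresight_kdump_source(int cpu, struct coresight_device *csdev,
                               char *data, unsigned int data_sz);
    int coresight_kdump_sink(int cpu, struct coresight_device *csdev);
    #else
    static inline int coresight_kdump_source(int cpu,
                                             struct coresight_device *csdev,
                                             char *data, unsigned int data_sz)
    { return 0; }
    static inline int coresight_kdump_sink(int cpu,
                                           struct coresight_device *csdev)
    { return 0; }
    #endif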




[PATCH v4 3/6] coresight: Support panic kdump functionality

2018-03-29 Thread Leo Yan
After a kernel panic happens, the Coresight tracing data contains much
useful info that can be used for analysis.  For example, the trace info
from ETB RAM can be used to check the CPU execution flow before the
crash.  So we can save the tracing data from sink devices, rely on kdump
to save the DDR content, and use the "crash" tool to extract the
Coresight dump from the vmcore file.

This patch adds a simple framework to support panic dump functionality;
it registers a panic notifier and provides the helper functions
coresight_kdump_source()/coresight_kdump_sink() so Coresight source and
sink devices can be recorded in the Coresight kdump node for kernel
panic kdump.

When a kernel panic happens, the notifier iterates over the dump array
and invokes the callback function to dump tracing data.  Later the
tracing data can be used to reconstruct the execution flow before the
kernel panic.

Signed-off-by: Leo Yan 
---
 drivers/hwtracing/coresight/Kconfig|   9 +
 drivers/hwtracing/coresight/Makefile   |   1 +
 .../hwtracing/coresight/coresight-panic-kdump.c| 199 +
 drivers/hwtracing/coresight/coresight-priv.h   |  12 ++
 include/linux/coresight.h  |   4 +
 5 files changed, 225 insertions(+)
 create mode 100644 drivers/hwtracing/coresight/coresight-panic-kdump.c

diff --git a/drivers/hwtracing/coresight/Kconfig 
b/drivers/hwtracing/coresight/Kconfig
index ef9cb3c..3089abf 100644
--- a/drivers/hwtracing/coresight/Kconfig
+++ b/drivers/hwtracing/coresight/Kconfig
@@ -103,4 +103,13 @@ config CORESIGHT_CPU_DEBUG
  properly, please refer Documentation/trace/coresight-cpu-debug.txt
  for detailed description and the example for usage.
 
+config CORESIGHT_PANIC_KDUMP
+   bool "CoreSight Panic Kdump driver"
+   depends on ARM || ARM64
+   help
+ This driver provides panic kdump functionality for CoreSight devices.
+ When kernel panic happen Coresight device supplied callback function
+ is to dump trace data to memory. From then on, kdump can be used to
+ extract the trace data from kernel dump file.
+
 endif
diff --git a/drivers/hwtracing/coresight/Makefile 
b/drivers/hwtracing/coresight/Makefile
index 61db9dd..946fe19 100644
--- a/drivers/hwtracing/coresight/Makefile
+++ b/drivers/hwtracing/coresight/Makefile
@@ -18,3 +18,4 @@ obj-$(CONFIG_CORESIGHT_SOURCE_ETM4X) += coresight-etm4x.o \
 obj-$(CONFIG_CORESIGHT_DYNAMIC_REPLICATOR) += coresight-dynamic-replicator.o
 obj-$(CONFIG_CORESIGHT_STM) += coresight-stm.o
 obj-$(CONFIG_CORESIGHT_CPU_DEBUG) += coresight-cpu-debug.o
+obj-$(CONFIG_CORESIGHT_PANIC_KDUMP) += coresight-panic-kdump.o
diff --git a/drivers/hwtracing/coresight/coresight-panic-kdump.c 
b/drivers/hwtracing/coresight/coresight-panic-kdump.c
new file mode 100644
index 000..f4589e9
--- /dev/null
+++ b/drivers/hwtracing/coresight/coresight-panic-kdump.c
@@ -0,0 +1,199 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (c) 2017~2018 Linaro Limited.
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include "coresight-priv.h"
+
+/**
+ * struct coresight_kdump_node - Node information for dump
+ * @source_csdev:  Handler for source coresight device
+ * @sink_csdev:Handler for sink coresight device
+ */
+struct coresight_kdump_node {
+   struct coresight_device *source_csdev;
+   struct coresight_device *sink_csdev;
+};
+
+static DEFINE_SPINLOCK(coresight_kdump_lock);
+static struct coresight_kdump_node *coresight_kdump_nodes;
+static struct notifier_block coresight_kdump_nb;
+
+/**
+ * coresight_kdump_source - Set source dump info for specific CPU
+ * @cpu:   CPU ID
+ * @csdev: Source device structure handler
+ * @data:  Pointer for source device metadata buffer
+ * @data_sz:   Size of source device metadata buffer
+ *
+ * This function is a helper function which is used to set/clear source device
+ * handler and metadata when the tracer is enabled; and it can be used to clear
+ * source device related info when the tracer is disabled.
+ *
+ * Returns:0 on success, negative errno otherwise.
+ */
+int coresight_kdump_source(int cpu, struct coresight_device *csdev,
+  char *data, unsigned int data_sz)
+{
+   struct coresight_kdump_node *node;
+   unsigned long flags;
+
+   if (!coresight_kdump_nodes)
+   return -EPROBE_DEFER;
+
+   spin_lock_irqsave(_kdump_lock, flags);
+
+   node = _kdump_nodes[cpu];
+   node->source_csdev = csdev;
+
+   csdev->kdump_buf = data;
+   csdev->kdump_buf_sz = data_sz;
+
+   spin_unlock_irqrestore(_kdump_lock, flags);
+   return 0;
+}
+
+/**
+ * coresight_kdump_sink - Set sink device handler for specific CPU
+ * @cpu:   CPU ID
+ * @csdev: Sink device structure handler
+ *
+ * This function is a helper function which is used to set sink device handler
+ * when the Coresight path has been enabled 
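
The notifier body itself is cut off in this archive.  Purely as a rough
sketch (reusing the hypothetical coresight_kdump_cpu() helper sketched
earlier in this digest, and following the cover letter's note that the
panicking CPU is dumped first), the panic-time path can be imagined
along these lines:

    static int coresight_kdump_notify(struct notifier_block *nb,
                                      unsigned long action, void *data)
    {
            int panic_cpu = raw_smp_processor_id();
            int cpu;

            /* Dump the panicking CPU first, then the remaining online CPUs. */
            coresight_kdump_cpu(panic_cpu);
            for_each_online_cpu(cpu) {
                    if (cpu != panic_cpu)
                            coresight_kdump_cpu(cpu);
            }

            return NOTIFY_DONE;
    }

    /*
     * Registered from an initcall, e.g.:
     *   atomic_notifier_chain_register(&panic_notifier_list,
     *                                  &coresight_kdump_nb);
     */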

[PATCH v4 1/6] doc: Add Coresight documentation directory

2018-03-29 Thread Leo Yan
To ease management and make it friendlier to add more Coresight
documentation, this commit creates a new directory:
Documentation/trace/coresight.

This commit also moves the Coresight related docs into the new directory
and updates the MAINTAINERS file to reflect the move.

Signed-off-by: Leo Yan 
---
 Documentation/trace/coresight-cpu-debug.txt| 187 --
 Documentation/trace/coresight.txt  | 383 -
 .../trace/coresight/coresight-cpu-debug.txt| 187 ++
 Documentation/trace/coresight/coresight.txt| 383 +
 MAINTAINERS|   4 +-
 5 files changed, 572 insertions(+), 572 deletions(-)
 delete mode 100644 Documentation/trace/coresight-cpu-debug.txt
 delete mode 100644 Documentation/trace/coresight.txt
 create mode 100644 Documentation/trace/coresight/coresight-cpu-debug.txt
 create mode 100644 Documentation/trace/coresight/coresight.txt

diff --git a/Documentation/trace/coresight-cpu-debug.txt 
b/Documentation/trace/coresight-cpu-debug.txt
deleted file mode 100644
index 2b9b51c..000
--- a/Documentation/trace/coresight-cpu-debug.txt
+++ /dev/null
@@ -1,187 +0,0 @@
-   Coresight CPU Debug Module
-   ==
-
-   Author:   Leo Yan 
-   Date: April 5th, 2017
-
-Introduction
-
-
-Coresight CPU debug module is defined in ARMv8-a architecture reference manual
-(ARM DDI 0487A.k) Chapter 'Part H: External debug', the CPU can integrate
-debug module and it is mainly used for two modes: self-hosted debug and
-external debug. Usually the external debug mode is well known as the external
-debugger connects with SoC from JTAG port; on the other hand the program can
-explore debugging method which rely on self-hosted debug mode, this document
-is to focus on this part.
-
-The debug module provides sample-based profiling extension, which can be used
-to sample CPU program counter, secure state and exception level, etc; usually
-every CPU has one dedicated debug module to be connected. Based on self-hosted
-debug mechanism, Linux kernel can access these related registers from mmio
-region when the kernel panic happens. The callback notifier for kernel panic
-will dump related registers for every CPU; finally this is good for assistant
-analysis for panic.
-
-
-Implementation
---
-
-- During driver registration, it uses EDDEVID and EDDEVID1 - two device ID
-  registers to decide if sample-based profiling is implemented or not. On some
-  platforms this hardware feature is fully or partially implemented; and if
-  this feature is not supported then registration will fail.
-
-- At the time this documentation was written, the debug driver mainly relies on
-  information gathered by the kernel panic callback notifier from three
-  sampling registers: EDPCSR, EDVIDSR and EDCIDSR: from EDPCSR we can get
-  program counter; EDVIDSR has information for secure state, exception level,
-  bit width, etc; EDCIDSR is context ID value which contains the sampled value
-  of CONTEXTIDR_EL1.
-
-- The driver supports a CPU running in either AArch64 or AArch32 mode. The
-  registers naming convention is a bit different between them, AArch64 uses
-  'ED' for register prefix (ARM DDI 0487A.k, chapter H9.1) and AArch32 uses
-  'DBG' as prefix (ARM DDI 0487A.k, chapter G5.1). The driver is unified to
-  use AArch64 naming convention.
-
-- ARMv8-a (ARM DDI 0487A.k) and ARMv7-a (ARM DDI 0406C.b) have different
-  register bits definition. So the driver consolidates two difference:
-
-  If PCSROffset=0b, on ARMv8-a the feature of EDPCSR is not implemented;
-  but ARMv7-a defines "PCSR samples are offset by a value that depends on the
-  instruction set state". For ARMv7-a, the driver checks furthermore if CPU
-  runs with ARM or thumb instruction set and calibrate PCSR value, the
-  detailed description for offset is in ARMv7-a ARM (ARM DDI 0406C.b) chapter
-  C11.11.34 "DBGPCSR, Program Counter Sampling Register".
-
-  If PCSROffset=0b0010, ARMv8-a defines "EDPCSR implemented, and samples have
-  no offset applied and do not sample the instruction set state in AArch32
-  state". So on ARMv8 if EDDEVID1.PCSROffset is 0b0010 and the CPU operates
-  in AArch32 state, EDPCSR is not sampled; when the CPU operates in AArch64
-  state EDPCSR is sampled and no offset are applied.
-
-
-Clock and power domain
---
-
-Before accessing debug registers, we should ensure the clock and power domain
-have been enabled properly. In ARMv8-a ARM (ARM DDI 0487A.k) chapter 'H9.1
-Debug registers', the debug registers are spread into two domains: the debug
-domain and the CPU domain.
-
-+---+
-|   |
-|   |
- +--+--+|
-dbg_clock -->|  |**||<-- cpu_clock
- 


[PATCH v4 4/6] coresight: tmc: Hook callback for panic kdump

2018-03-29 Thread Leo Yan
Now that the Coresight panic kdump functionality is ready, this patch
hooks the panic callback function into the ETB/ETF driver.  The driver
data structure already allocates a buffer when the session starts, so
simply save the tracing data into this buffer when a panic happens and
update the buffer related info for kdump.

Signed-off-by: Leo Yan 
---
 drivers/hwtracing/coresight/coresight-tmc-etf.c | 30 +
 1 file changed, 30 insertions(+)

diff --git a/drivers/hwtracing/coresight/coresight-tmc-etf.c 
b/drivers/hwtracing/coresight/coresight-tmc-etf.c
index e2513b7..d20d546 100644
--- a/drivers/hwtracing/coresight/coresight-tmc-etf.c
+++ b/drivers/hwtracing/coresight/coresight-tmc-etf.c
@@ -504,6 +504,35 @@ static void tmc_update_etf_buffer(struct coresight_device 
*csdev,
CS_LOCK(drvdata->base);
 }
 
+static void tmc_panic_cb(void *data)
+{
+   struct coresight_device *csdev = (struct coresight_device *)data;
+   struct tmc_drvdata *drvdata = dev_get_drvdata(csdev->dev.parent);
+   unsigned long flags;
+
+   if (WARN_ON_ONCE(drvdata->config_type != TMC_CONFIG_TYPE_ETB &&
+drvdata->config_type != TMC_CONFIG_TYPE_ETF))
+   return;
+
+   if (drvdata->mode == CS_MODE_DISABLED)
+   return;
+
+   spin_lock_irqsave(>spinlock, flags);
+
+   CS_UNLOCK(drvdata->base);
+
+   tmc_flush_and_stop(drvdata);
+   tmc_etb_dump_hw(drvdata);
+
+   CS_LOCK(drvdata->base);
+
+   /* Update buffer info for panic dump */
+   csdev->kdump_buf = drvdata->buf;
+   csdev->kdump_buf_sz = drvdata->len;
+
+   spin_unlock_irqrestore(>spinlock, flags);
+}
+
 static const struct coresight_ops_sink tmc_etf_sink_ops = {
.enable = tmc_enable_etf_sink,
.disable= tmc_disable_etf_sink,
@@ -512,6 +541,7 @@ static const struct coresight_ops_sink tmc_etf_sink_ops = {
.set_buffer = tmc_set_etf_buffer,
.reset_buffer   = tmc_reset_etf_buffer,
.update_buffer  = tmc_update_etf_buffer,
+   .panic_cb   = tmc_panic_cb,
 };
 
 static const struct coresight_ops_link tmc_etf_link_ops = {
-- 
2.7.4
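
To connect the dots: the kdump framework is what ends up calling the
.panic_cb hook wired up above, dispatching through sink_ops() and
passing the coresight_device itself as the opaque pointer that
tmc_panic_cb() casts back.  A sketch of such a call site (not taken
from the series) might be:

    static void coresight_kdump_flush_sink(struct coresight_device *sink)
    {
            if (sink && sink_ops(sink)->panic_cb)
                    sink_ops(sink)->panic_cb(sink);
    }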



[PATCH v4 6/6] coresight: etm4x: Support panic kdump

2018-03-29 Thread Leo Yan
ETMv4 hardware information and configuration needs to be saved as
metadata; the metadata format should be compatible with the 'perf' tool
and is ultimately used by the tracing data decoder.  ETMv4 works as a
per-CPU tracer, and we cannot wait to gather the ETM info after a CPU
panic has happened, because a locked-up CPU cannot respond to the
inter-processor interrupt used for the dump operation; so it is more
reliable to gather the tracer metadata while all of the CPUs are alive.

This patch saves the ETMv4 metadata, using a different method for
different registers.  Since the values in TRCIDR{0, 1, 2, 8} and
TRCAUTHSTATUS are read-only and won't change afterwards, those register
values are filled into the metadata structure when the tracers are
instantiated.  The configuration and control registers TRCCONFIGR and
TRCTRACEIDR are dynamically configured, so their values are recorded
during the tracer enabling phase.

To avoid unnecessary overhead from set/clear operations when updating
the kdump node, the ETMv4 metadata info for the corresponding kdump node
is set once at initialization and is never cleared.

Suggested-by: Mathieu Poirier 
Signed-off-by: Leo Yan 
---
 drivers/hwtracing/coresight/coresight-etm4x.c | 27 +++
 drivers/hwtracing/coresight/coresight-etm4x.h | 15 +++
 2 files changed, 42 insertions(+)

diff --git a/drivers/hwtracing/coresight/coresight-etm4x.c 
b/drivers/hwtracing/coresight/coresight-etm4x.c
index cf364a5..88b1e19 100644
--- a/drivers/hwtracing/coresight/coresight-etm4x.c
+++ b/drivers/hwtracing/coresight/coresight-etm4x.c
@@ -288,6 +288,8 @@ static int etm4_enable(struct coresight_device *csdev,
int ret;
u32 val;
struct etmv4_drvdata *drvdata = dev_get_drvdata(csdev->dev.parent);
+   struct etmv4_config *config = >config;
+   struct etmv4_metadata *metadata = >metadata;
 
val = local_cmpxchg(>mode, CS_MODE_DISABLED, mode);
 
@@ -306,6 +308,10 @@ static int etm4_enable(struct coresight_device *csdev,
ret = -EINVAL;
}
 
+   /* Update tracer meta data after tracer configuration */
+   metadata->trcconfigr = config->cfg;
+   metadata->trctraceidr = drvdata->trcid;
+
/* The tracer didn't start */
if (ret)
local_set(>mode, CS_MODE_DISABLED);
@@ -438,6 +444,7 @@ static void etm4_init_arch_data(void *info)
u32 etmidr4;
u32 etmidr5;
struct etmv4_drvdata *drvdata = info;
+   struct etmv4_metadata *metadata = >metadata;
 
/* Make sure all registers are accessible */
etm4_os_unlock(drvdata);
@@ -590,6 +597,16 @@ static void etm4_init_arch_data(void *info)
drvdata->nrseqstate = BMVAL(etmidr5, 25, 27);
/* NUMCNTR, bits[30:28] number of counters available for tracing */
drvdata->nr_cntr = BMVAL(etmidr5, 28, 30);
+
+   /* Update metadata */
+   metadata->magic = ETM4_METADATA_MAGIC;
+   metadata->cpu = drvdata->cpu;
+   metadata->trcidr0 = readl_relaxed(drvdata->base + TRCIDR0);
+   metadata->trcidr1 = readl_relaxed(drvdata->base + TRCIDR1);
+   metadata->trcidr2 = readl_relaxed(drvdata->base + TRCIDR2);
+   metadata->trcidr8 = readl_relaxed(drvdata->base + TRCIDR8);
+   metadata->trcauthstatus = readl_relaxed(drvdata->base + TRCAUTHSTATUS);
+
CS_LOCK(drvdata->base);
 }
 
@@ -957,6 +974,7 @@ static int etm4_probe(struct amba_device *adev, const 
struct amba_id *id)
struct device *dev = >dev;
struct coresight_platform_data *pdata = NULL;
struct etmv4_drvdata *drvdata;
+   struct etmv4_metadata *metadata;
struct resource *res = >res;
struct coresight_desc desc = { 0 };
struct device_node *np = adev->dev.of_node;
@@ -1027,6 +1045,15 @@ static int etm4_probe(struct amba_device *adev, const 
struct amba_id *id)
goto err_arch_supported;
}
 
+   /* Set source device handler and metadata into kdump node */
+   metadata = >metadata;
+   ret = coresight_kdump_source(drvdata->cpu, drvdata->csdev,
+(char *)metadata, sizeof(*metadata));
+   if (ret) {
+   coresight_unregister(drvdata->csdev);
+   goto err_arch_supported;
+   }
+
ret = etm_perf_symlink(drvdata->csdev, true);
if (ret) {
coresight_unregister(drvdata->csdev);
diff --git a/drivers/hwtracing/coresight/coresight-etm4x.h 
b/drivers/hwtracing/coresight/coresight-etm4x.h
index b3b5ea7..08dc8b7 100644
--- a/drivers/hwtracing/coresight/coresight-etm4x.h
+++ b/drivers/hwtracing/coresight/coresight-etm4x.h
@@ -198,6 +198,20 @@
 #define ETM_EXLEVEL_NS_HYP BIT(14)
 #define ETM_EXLEVEL_NS_NA  BIT(15)
 
+#define ETM4_METADATA_MAGIC0x4040404040404040ULL
+
+struct etmv4_metadata {
+   u64 magic;
+   u64 cpu;
+   u64 trcconfigr;
+   u64 trctraceidr;
+   u64 trcidr0;
+   u64 trcidr1;
+   
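
The structure definition is cut off above.  Purely as an illustration of
how the magic number and the fields shown so far might be consumed
offline (the read path from the vmcore to user space is assumed, and the
tail of the structure is omitted), a decoder-side sanity check could
look like this:

    #include <stdint.h>
    #include <stdio.h>

    #define ETM4_METADATA_MAGIC 0x4040404040404040ULL

    struct etmv4_metadata {
            uint64_t magic;
            uint64_t cpu;
            uint64_t trcconfigr;
            uint64_t trctraceidr;
            uint64_t trcidr0;
            uint64_t trcidr1;
            /* trcidr2, trcidr8, trcauthstatus, ... omitted in this sketch */
    };

    static int etm4_metadata_check(const struct etmv4_metadata *md)
    {
            if (md->magic != ETM4_METADATA_MAGIC)
                    return -1;      /* not an ETMv4 metadata record */

            printf("cpu %llu: TRCCONFIGR=%#llx TRCTRACEIDR=%#llx\n",
                   (unsigned long long)md->cpu,
                   (unsigned long long)md->trcconfigr,
                   (unsigned long long)md->trctraceidr);
            return 0;
    }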


Re: [PATCH v3] staging: vt6655: check for memory allocation failures

2018-03-29 Thread Jia-Ju Bai



On 2018/3/30 10:44, Ji-Hun Kim wrote:

@@ -1165,10 +1205,18 @@ static int vnt_start(struct ieee80211_hw *hw)
}
  
  	dev_dbg(&priv->pcid->dev, "call device init rd0 ring\n");

-   device_init_rd0_ring(priv);
-   device_init_rd1_ring(priv);
-   device_init_td0_ring(priv);
-   device_init_td1_ring(priv);
+   ret = device_init_rd0_ring(priv);
+   if (ret)
+   goto error;
+   ret = device_init_rd1_ring(priv);
+   if (ret)
+   goto error;
+   ret = device_init_td0_ring(priv);
+   if (ret)
+   goto error;
+   ret = device_init_td1_ring(priv);
+   if (ret)
+   goto error;
  
  	device_init_registers(priv);
  
@@ -1178,6 +1226,8 @@ static int vnt_start(struct ieee80211_hw *hw)

ieee80211_wake_queues(hw);
  
  	return 0;

+error:
+   return ret;
  }


This code will lead to memory leaks when device_init_rd1_ring() fails, 
because the memory allocated by device_init_rd0_ring() is not freed.


I think this one will be better:
ret = device_init_rd0_ring(priv);
if (ret)
	goto error_init_rd0_ring;
ret = device_init_rd1_ring(priv);
if (ret)
	goto error_init_rd1_ring;
ret = device_init_td0_ring(priv);
if (ret)
	goto error_init_td0_ring;
ret = device_init_td1_ring(priv);
if (ret)
	goto error_init_td1_ring;
..
error_init_td1_ring:
	device_free_td0_ring(priv);
error_init_td0_ring:
	device_free_rd1_ring(priv);
error_init_rd1_ring:
	device_free_rd0_ring(priv);
error_init_rd0_ring:
	return ret;


Best wishes,
Jia-Ju Bai
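
As a side note, the suggestion above is the usual kernel error-unwind
idiom: each label undoes exactly what succeeded before the failing
step, in reverse order.  A small self-contained sketch of the idiom
follows; the allocation steps are hypothetical stand-ins for the
ring-init helpers, not the driver's code.

#include <errno.h>
#include <stdlib.h>

static void *step_a, *step_b, *step_c;

static int example_start(void)
{
	step_a = malloc(32);
	if (!step_a)
		goto err_a;
	step_b = malloc(32);
	if (!step_b)
		goto err_b;
	step_c = malloc(32);
	if (!step_c)
		goto err_c;
	return 0;

	/* Unwind in reverse order: only what was set up gets torn down. */
err_c:
	free(step_b);
err_b:
	free(step_a);
err_a:
	return -ENOMEM;
}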


[PATCH v2] MAINTAINERS: update entry for ARM/berlin

2018-03-29 Thread Jisheng Zhang
Synaptics has acquired the Multimedia Solutions Business of Marvell [1],
so change the Berlin entry name and move it to its alphabetical
location. We move to ARM/Synaptics instead of ARM/Marvell.

This patch also updates my email address from marvell to synaptics.

[1] https://www.synaptics.com/company/news/conexant-marvell

Signed-off-by: Jisheng Zhang 
---
Since v1:
 - add some comments about this change. Thanks to Andrew.

 MAINTAINERS | 18 +-
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/MAINTAINERS b/MAINTAINERS
index 6e950b8b4a41..0a0da6a8e607 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -1578,15 +1578,6 @@ ARM/MAGICIAN MACHINE SUPPORT
 M: Philipp Zabel 
 S: Maintained
 
-ARM/Marvell Berlin SoC support
-M: Jisheng Zhang 
-M: Sebastian Hesselbarth 
-L: linux-arm-ker...@lists.infradead.org (moderated for non-subscribers)
-S: Maintained
-F: arch/arm/mach-berlin/
-F: arch/arm/boot/dts/berlin*
-F: arch/arm64/boot/dts/marvell/berlin*
-
 ARM/Marvell Dove/MV78xx0/Orion SOC support
 M: Jason Cooper 
 M: Andrew Lunn 
@@ -2006,6 +1997,15 @@ F:   arch/arm/boot/dts/stm32*
 F: arch/arm/mach-stm32/
 F: drivers/clocksource/armv7m_systick.c
 
+ARM/Synaptics Berlin SoC support
+M: Jisheng Zhang 
+M: Sebastian Hesselbarth 
+L: linux-arm-ker...@lists.infradead.org (moderated for non-subscribers)
+S: Maintained
+F: arch/arm/mach-berlin/
+F: arch/arm/boot/dts/berlin*
+F: arch/arm64/boot/dts/marvell/berlin*
+
 ARM/TANGO ARCHITECTURE
 M: Marc Gonzalez 
 M: Mans Rullgard 
-- 
2.16.3



Re: [PATCH] xfs: always free inline data before resetting inode fork during ifree

2018-03-29 Thread Sasha Levin
On Thu, Mar 29, 2018 at 10:05:35AM +1100, Dave Chinner wrote:
>On Wed, Mar 28, 2018 at 07:30:06PM +, Sasha Levin wrote:
>> This is actually something I want maintainers to dictate. What sort of
>> testing would make the XFS folks happy here? Right now I'm doing
>> "./check 'xfs/*'" with xfstests. Is it sufficient? Anything else you'd like 
>> to see?
>
>... and you're doing it wrong. This is precisely why it matters to be
>able to discover /exactly/ what you are testing and to browse the test
>results, so we can find out if tests passed when a user reports a bug
>on a stable kernel.
>
>The way you are running fstests skips more than half the test suite.
>It also runs tests that are considered dangerous because they are
>likely to cause the test run to fail in some way (i.e. trigger an
>oops, hang the machine, leave a filesystem in an unmountable state,
>etc) and hence not complete a full pass.
>
>"./check -g auto" runs the full "expected to pass" regression test
>suite for all configured test configurations. (i.e. all config
>sections listed in the configs/.config file)

Great! With information from Darrick and yourself I've modified tests to
be more relevant. Right now I run 4 configs for each stable kernel, but
can add more or remove any - depends on what helps people analyse the
results.

The complete VM serial logs as well as results/ from xfstests are also
available and are linked from the email.

Here's an example of such an email:

> From: Sasha Levin 
> To: Sasha Levin 
> To: linux-...@vger.kernel.org, "Darrick J . Wong" 
> Cc: Brian Foster , linux-kernel@vger.kernel.org
> Subject: Re: [PATCH] xfs: Correctly invert xfs_buftarg LRU isolation logic
> In-Reply-To: <20180306102638.25322-1-vben...@redhat.com>
> References: <20180306102638.25322-1-vben...@redhat.com>
>
> Hi Vratislav Bendel,
>
> [This is an automated email]
>
> This commit has been processed by the -stable helper bot and determined
> to be a high probability candidate for -stable trees. (score: 6.4845)
>
> The bot has tested the following trees: v4.15.12, v4.14.29, v4.9.89, 
> v4.4.123, v4.1.50, v3.18.101.
>
> v4.15.12: Build OK!
> v4.14.29: Build OK!
> v4.9.89: Build OK!
> v4.4.123: Build OK!
> v4.1.50: Build OK!
> v3.18.101: Build OK!
>
> XFS Specific tests:
>
> v4.15.12 (http://stable-bot.westus2.cloudapp.azure.com/test/v4.15.12/tests/):
> No tests completed!
> v4.14.29 (http://stable-bot.westus2.cloudapp.azure.com/test/v4.14.29/tests/):
> No tests completed!
> v4.9.89 (http://stable-bot.westus2.cloudapp.azure.com/test/v4.9.89/tests/):
> No tests completed!
> v4.4.123 (http://stable-bot.westus2.cloudapp.azure.com/test/v4.4.123/tests/):
> v4:
> Thu Mar 29 21:23:57 UTC 2018
> Interrupted!
> Passed all 0 tests
> v4_reflink:
> Thu Mar 29 21:24:37 UTC 2018
> Interrupted!
> Passed all 0 tests
> v4.1.50 (http://stable-bot.westus2.cloudapp.azure.com/test/v4.1.50/tests/):
> No tests completed!
> v3.18.101 
> (http://stable-bot.westus2.cloudapp.azure.com/test/v3.18.101/tests/):
> v4:
> Thu Mar 29 21:30:40 UTC 2018
> Interrupted!
> Passed all 0 tests
> v4_reflink:
> Thu Mar 29 21:25:14 UTC 2018
> Interrupted!
> Passed all 0 tests
>
> Please let us know if you'd like to have this patch included in a stable tree.
>
> --
> Thanks,
> Sasha

Let me know if this would be good enough for now, and if there's
anything else to add that'll be useful.

This brings me to the sad part of this mail: not a single stable kernel
survived a run. Most panicked, some are hanging, and some were killed
because of KASan.

All have hit various warnings in fs/iomap.c, and kernels across several
versions hit the BUG at fs/xfs/xfs_message.c:113 (+-1 line).

4.15.12 is hitting a use-after-free in xfs_efi_release().
4.14.29 and 4.9.89 seem to end up with corrupted memory (KASAN
warnings) at or before generic/027.
And finally, 3.18.101 is pretty unhappy with sleeping functions called
from atomic context.

--
Thanks,
Sasha

Re: [RFC PATCH V2 8/8] vhost: event suppression for packed ring

2018-03-29 Thread Jason Wang



On 2018年03月30日 10:05, Tiwei Bie wrote:

On Mon, Mar 26, 2018 at 11:38:53AM +0800, Jason Wang wrote:

This patch introduces basic support for event suppression aka driver
and device area. Compile tested only.

Signed-off-by: Jason Wang 
---

[...]

+
+static bool vhost_notify_packed(struct vhost_dev *dev,
+   struct vhost_virtqueue *vq)
+{
+   __virtio16 event_off_wrap, event_flags;
+   __u16 old, new;
+   bool v, wrap;
+   int off;
+
+   /* Flush out used descriptors updates. This is paired
+* with the barrier that the Guest executes when enabling
+* interrupts.
+*/
+   smp_mb();
+
+   if (vhost_get_avail(vq, event_flags,
+  &vq->driver_event->desc_event_flags) < 0) {
+   vq_err(vq, "Failed to get driver desc_event_flags");
+   return true;
+   }
+
+   if (!(event_flags & cpu_to_vhost16(vq, RING_EVENT_FLAGS_DESC)))
+   return event_flags ==
+  cpu_to_vhost16(vq, RING_EVENT_FLAGS_ENABLE);

Maybe it would be better not to use '&' here, because these flags
are not defined as bits which can be ORed or ANDed. Instead, they
are defined as values:

0x0  enable
0x1  disable
0x2  desc
0x3  reserved


Yes the code seems tricky. Let me fix it in next version.
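
To make the point concrete, here is a tiny sketch that treats the field
as a 2-bit value instead of a bitmask.  The constants mirror the spec
excerpt above, but the helper itself is purely illustrative and is not
the vhost code.

#include <stdbool.h>
#include <stdint.h>

#define EXAMPLE_EVENT_FLAGS_ENABLE	0x0
#define EXAMPLE_EVENT_FLAGS_DISABLE	0x1
#define EXAMPLE_EVENT_FLAGS_DESC	0x2	/* 0x3 is reserved */

/* Decide whether a notification may be sent for the given flags value. */
static bool example_event_allowed(uint16_t desc_event_flags)
{
	switch (desc_event_flags & 0x3) {	/* only the low two bits are defined */
	case EXAMPLE_EVENT_FLAGS_ENABLE:
		return true;			/* always notify */
	case EXAMPLE_EVENT_FLAGS_DESC:
		return true;			/* notify according to off/wrap */
	case EXAMPLE_EVENT_FLAGS_DISABLE:
	default:
		return false;			/* disabled or reserved */
	}
}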




+
+   /* Read desc event flags before event_off and event_wrap */
+   smp_rmb();
+
+   if (vhost_get_avail(vq, event_off_wrap,
+   &vq->driver_event->desc_event_off_warp) < 0) {
+   vq_err(vq, "Failed to get driver desc_event_off/wrap");
+   return true;
+   }
+
+   off = vhost16_to_cpu(vq, event_off_wrap);
+
+   wrap = off & 0x1;
+   off >>= 1;

Based on the below definitions in spec, wrap counter is
the most significant bit.

struct pvirtq_event_suppress {
le16 {
desc_event_off : 15; /* Descriptor Ring Change Event Offset */
desc_event_wrap : 1; /* Descriptor Ring Change Event Wrap 
Counter */
} desc; /* If desc_event_flags set to RING_EVENT_FLAGS_DESC */
le16 {
desc_event_flags : 2, /* Descriptor Ring Change Event Flags */
reserved : 14; /* Reserved, set to 0 */
} flags;
};


Will fix this in next version.
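
For clarity, a sketch of splitting the off/wrap word according to the
layout quoted above, with the wrap counter taken from the most
significant bit.  The structure and function are illustrative only.

#include <stdbool.h>
#include <stdint.h>

struct example_off_wrap {
	uint16_t off;	/* desc_event_off: bits 0..14 */
	bool wrap;	/* desc_event_wrap: bit 15 (MSB) */
};

static struct example_off_wrap example_parse_off_wrap(uint16_t off_wrap)
{
	struct example_off_wrap e;

	e.off  = off_wrap & 0x7fff;	/* Descriptor Ring Change Event Offset */
	e.wrap = off_wrap >> 15;	/* Descriptor Ring Change Event Wrap Counter */
	return e;
}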




+
+
+   old = vq->signalled_used;
+   v = vq->signalled_used_valid;
+   new = vq->signalled_used = vq->last_used_idx;
+   vq->signalled_used_valid = true;
+
+   if (unlikely(!v))
+   return true;
+
+   return vhost_vring_packed_need_event(vq, new, old, off) &&
+  wrap == vq->used_wrap_counter;
+}
+
+static bool vhost_notify(struct vhost_dev *dev, struct vhost_virtqueue *vq)
+{
+   if (vhost_has_feature(vq, VIRTIO_F_RING_PACKED))
+   return vhost_notify_packed(dev, vq);
+   else
+   return vhost_notify_split(dev, vq);
+}
+
  /* This actually signals the guest, using eventfd. */
  void vhost_signal(struct vhost_dev *dev, struct vhost_virtqueue *vq)
  {
@@ -2789,7 +2911,17 @@ static bool vhost_enable_notify_packed(struct vhost_dev 
*dev,
__virtio16 flags;
int ret;
  
-	/* FIXME: disable notification through device area */

+   if (!(vq->used_flags & VRING_USED_F_NO_NOTIFY))
+   return false;
+   vq->used_flags &= ~VRING_USED_F_NO_NOTIFY;
+
+   flags = cpu_to_vhost16(vq, RING_EVENT_FLAGS_ENABLE);
+   ret = vhost_update_device_flags(vq, flags);
+   if (ret) {
+   vq_err(vq, "Failed to enable notification at %p: %d\n",
+  &vq->device_event->desc_event_flags, ret);
+   return false;
+   }
  
  	/* They could have slipped one in as we were doing that: make

 * sure it's written, then check again. */
@@ -2855,7 +2987,18 @@ EXPORT_SYMBOL_GPL(vhost_enable_notify);
  static void vhost_disable_notify_packed(struct vhost_dev *dev,
struct vhost_virtqueue *vq)
  {
-   /* FIXME: disable notification through device area */
+   __virtio16 flags;
+   int r;
+
+   if (vq->used_flags & VRING_USED_F_NO_NOTIFY)
+   return;
+   vq->used_flags |= VRING_USED_F_NO_NOTIFY;
+
+   flags = cpu_to_vhost16(vq, RING_EVENT_FLAGS_DISABLE);
+   r = vhost_update_device_flags(vq, flags);
+   if (r)
+   vq_err(vq, "Failed to enable notification at %p: %d\n",
+  &vq->device_event->desc_event_flags, r);
  }
  
  static void vhost_disable_notify_split(struct vhost_dev *dev,

diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
index 8a9df4f..02d7a36 100644
--- a/drivers/vhost/vhost.h
+++ b/drivers/vhost/vhost.h
@@ -96,8 +96,14 @@ struct vhost_virtqueue {
struct vring_desc __user *desc;
struct vring_desc_packed __user *desc_packed;

Do you think it'd be better to name the desc type as
struct vring_packed_desc?


Ok. Will do.


And it will 

[PATCH v3] staging: vt6655: check for memory allocation failures

2018-03-29 Thread Ji-Hun Kim
There is no null pointer check on the rd_info and td_info values which
are allocated by kzalloc, so there are potential null pointer
dereference issues. Implement error handling code in device_init_rd*,
device_init_td* and vnt_start for the allocation failures.

Signed-off-by: Ji-Hun Kim 
---
Changes v2:

- Delete WARN_ON, which can cause crashes on some machines.
- Instead of returning directly, goto the freeing function to free the
  memory previously allocated in the for loop after kzalloc() fails.
- In the freeing function, add an if statement so that only allocated
  values are freed.

Changes v3:

- Modify the return type of device_init_rd* and device_init_td*, and
  return error codes from those functions and from vnt_start as well.

 drivers/staging/vt6655/device_main.c | 114 +--
 1 file changed, 82 insertions(+), 32 deletions(-)

diff --git a/drivers/staging/vt6655/device_main.c 
b/drivers/staging/vt6655/device_main.c
index fbc4bc6..0d55f34 100644
--- a/drivers/staging/vt6655/device_main.c
+++ b/drivers/staging/vt6655/device_main.c
@@ -124,10 +124,10 @@
 static void device_free_info(struct vnt_private *priv);
 static void device_print_info(struct vnt_private *priv);
 
-static void device_init_rd0_ring(struct vnt_private *priv);
-static void device_init_rd1_ring(struct vnt_private *priv);
-static void device_init_td0_ring(struct vnt_private *priv);
-static void device_init_td1_ring(struct vnt_private *priv);
+static int device_init_rd0_ring(struct vnt_private *priv);
+static int device_init_rd1_ring(struct vnt_private *priv);
+static int device_init_td0_ring(struct vnt_private *priv);
+static int device_init_td1_ring(struct vnt_private *priv);
 
 static int  device_rx_srv(struct vnt_private *priv, unsigned int idx);
 static int  device_tx_srv(struct vnt_private *priv, unsigned int idx);
@@ -528,18 +528,22 @@ static void device_free_rings(struct vnt_private *priv)
  priv->tx0_bufs, priv->tx_bufs_dma0);
 }
 
-static void device_init_rd0_ring(struct vnt_private *priv)
+static int device_init_rd0_ring(struct vnt_private *priv)
 {
int i;
dma_addr_t  curr = priv->rd0_pool_dma;
struct vnt_rx_desc *desc;
+   int ret = 0;
 
/* Init the RD0 ring entries */
for (i = 0; i < priv->opts.rx_descs0;
 i ++, curr += sizeof(struct vnt_rx_desc)) {
		desc = &priv->aRD0Ring[i];
desc->rd_info = kzalloc(sizeof(*desc->rd_info), GFP_KERNEL);
-
+   if (!desc->rd_info) {
+   ret = -ENOMEM;
+   goto error;
+   }
if (!device_alloc_rx_buf(priv, desc))
			dev_err(&priv->pcid->dev, "can not alloc rx bufs\n");
 
@@ -550,20 +554,29 @@ static void device_init_rd0_ring(struct vnt_private *priv)
if (i > 0)
priv->aRD0Ring[i-1].next_desc = cpu_to_le32(priv->rd0_pool_dma);
	priv->pCurrRD[0] = &priv->aRD0Ring[0];
+
+   return 0;
+error:
+   device_free_rd0_ring(priv);
+   return ret;
 }
 
-static void device_init_rd1_ring(struct vnt_private *priv)
+static int device_init_rd1_ring(struct vnt_private *priv)
 {
int i;
dma_addr_t  curr = priv->rd1_pool_dma;
struct vnt_rx_desc *desc;
+   int ret = 0;
 
/* Init the RD1 ring entries */
for (i = 0; i < priv->opts.rx_descs1;
 i ++, curr += sizeof(struct vnt_rx_desc)) {
		desc = &priv->aRD1Ring[i];
desc->rd_info = kzalloc(sizeof(*desc->rd_info), GFP_KERNEL);
-
+   if (!desc->rd_info) {
+   ret = -ENOMEM;
+   goto error;
+   }
if (!device_alloc_rx_buf(priv, desc))
			dev_err(&priv->pcid->dev, "can not alloc rx bufs\n");
 
@@ -574,6 +587,11 @@ static void device_init_rd1_ring(struct vnt_private *priv)
if (i > 0)
priv->aRD1Ring[i-1].next_desc = cpu_to_le32(priv->rd1_pool_dma);
	priv->pCurrRD[1] = &priv->aRD1Ring[0];
+
+   return 0;
+error:
+   device_free_rd1_ring(priv);
+   return ret;
 }
 
 static void device_free_rd0_ring(struct vnt_private *priv)
@@ -584,12 +602,12 @@ static void device_free_rd0_ring(struct vnt_private *priv)
		struct vnt_rx_desc *desc = &priv->aRD0Ring[i];
struct vnt_rd_info *rd_info = desc->rd_info;
 
-   dma_unmap_single(&priv->pcid->dev, rd_info->skb_dma,
-priv->rx_buf_sz, DMA_FROM_DEVICE);
-
-   dev_kfree_skb(rd_info->skb);
-
-   kfree(desc->rd_info);
+   if (rd_info) {
+   dma_unmap_single(&priv->pcid->dev, rd_info->skb_dma,
+   priv->rx_buf_sz, DMA_FROM_DEVICE);
+   dev_kfree_skb(rd_info->skb);
+   kfree(desc->rd_info);
+   }
}
 }
 
@@ -601,27 +619,31 @@ static void device_free_rd1_ring(struct vnt_private 

Re: [PATCH v5] ANDROID: binder: change down_write to down_read

2018-03-29 Thread Minchan Kim
On Fri, Mar 30, 2018 at 09:39:03AM +0800, Ganesh Mahendran wrote:
> 2018-03-30 9:29 GMT+08:00 Minchan Kim :
> > Hi Ganesh,
> >
> > On Fri, Mar 30, 2018 at 09:21:55AM +0800, Ganesh Mahendran wrote:
> >> 2018-03-29 14:54 GMT+08:00 Minchan Kim :
> >> > binder_update_page_range needs down_write of mmap_sem because
> >> > vm_insert_page need to change vma->vm_flags to VM_MIXEDMAP unless
> >> > it is set. However, when I profile binder working, it seems
> >> > every binder buffers should be mapped in advance by binder_mmap.
> >> > It means we could set VM_MIXEDMAP in binder_mmap time which is
> >> > already hold a mmap_sem as down_write so binder_update_page_range
> >> > doesn't need to hold a mmap_sem as down_write.
> >> >
> >> > Android suffers from mmap_sem contention so let's reduce mmap_sem
> >> > down_write.
> >>
> >> Hi, Minchan:
> >>
> >> It seems there is performance regression of this patch.
> >
> > You mean "This patch aims for solving performance regression" not "This 
> > patch
> > makes performance regression"?
> 
> After applying this patch in our devices, app launch time increases
> about 15% in average.
> "This patch makes performance regression", yes, from the results, it
> is like this.
> 
> I will do more test of this patch.

Thanks for the testing. Does every app's launch time increase?
I would appreciate it if you could confirm the problem again.

I guess the reason is rw_semaphore's current preference for the writer
at up_write, so if binder uses down_read instead of down_write, binder
has to compete with other down_read callers at wake-up time from
up_write and could lose its wakeup chance if it is low priority.

Anyway, I think it is an abuse to use down_write just to get the lock
held as soon as possible where down_read is enough. I guess there is a
tradeoff based on binder's priority.
If we go to speculative page faults or a range lock instead of
mmap_sem, the situation would be much improved.
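
To make the shared vs. exclusive distinction concrete, here is a
minimal sketch of the idea under discussion.  It is not the binder code
itself, only an illustration of setting vm_flags once under the
exclusive lock already held on the mmap path, so that later page-range
updates can take mmap_sem shared; the example_* functions are
hypothetical.

#include <linux/mm.h>

/* mmap() path: the caller already holds down_write(&mm->mmap_sem),
 * so this is the one place where vm_flags are modified.
 */
static int example_mmap(struct vm_area_struct *vma)
{
	vma->vm_flags |= VM_DONTCOPY | VM_MIXEDMAP;
	return 0;
}

/* Later updates only populate pages; vm_flags no longer change, so the
 * semaphore can be taken shared instead of exclusively.
 */
static int example_update_page_range(struct mm_struct *mm)
{
	down_read(&mm->mmap_sem);
	/* ... vm_insert_page() into the already-VM_MIXEDMAP vma ... */
	up_read(&mm->mmap_sem);
	return 0;
}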

Anyway, currently I have no idea how to overcome such a priority
inversion problem from rw_semaphore, and I believe that generally
changing from down_write to down_read is a good thing to prevent
contention.

Ccing maintainers.


Re: [PATCH v3 1/1] security: Add mechanism to safely (un)load LSMs after boot time

2018-03-29 Thread Sargun Dhillon
On Thu, Mar 29, 2018 at 02:37:10PM -0700, Casey Schaufler wrote:
> On 3/29/2018 2:14 PM, Sargun Dhillon wrote:
> > This patch introduces a mechanism to add mutable hooks and immutable
> > hooks to the callback chain. It adds an intermediary item to the
> > chain which separates mutable and immutable hooks. Immutable hooks
> > are then marked as read-only, as well as the hook heads. This does
> > not preclude some hooks being able to be mutated (removed).
> >
> > It also wraps the hook unloading, and execution with an SRCU. One
> > SRCU is used across all hooks, as the SRCU struct can be memory
> > intensive, and hook execution time in general should be relatively
> > short.
> >
> > Signed-off-by: Sargun Dhillon 
> > Signed-off-by: Tetsuo Handa 
> > ---
> >  include/linux/lsm_hooks.h  |  23 ++---
> >  security/Kconfig   |   2 +-
> >  security/apparmor/lsm.c|   2 +-
> >  security/commoncap.c   |   2 +-
> >  security/security.c| 210 
> > ++---
> >  security/selinux/hooks.c   |   5 +-
> >  security/smack/smack_lsm.c |   3 +-
> >  security/tomoyo/tomoyo.c   |   3 +-
> >  security/yama/yama_lsm.c   |   2 +-
> >  9 files changed, 196 insertions(+), 56 deletions(-)
> >
> > diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h
> > index 09bc60fb35f1..689e5e72fb38 100644
> > --- a/include/linux/lsm_hooks.h
> > +++ b/include/linux/lsm_hooks.h
> > @@ -1981,9 +1981,12 @@ extern struct security_hook_heads 
> > security_hook_heads;
> >  extern char *lsm_names;
> >  
> >  extern void security_add_hooks(struct security_hook_list *hooks, int count,
> > -   char *lsm);
> > +   char *lsm, bool is_mutable);
> >  
> > -#ifdef CONFIG_SECURITY_SELINUX_DISABLE
> > +#define __lsm_ro_after_init__ro_after_init
> > +/* Currently required to handle SELinux runtime hook disable. */
> > +#ifdef CONFIG_SECURITY_WRITABLE_HOOKS
> > +#define __lsm_mutable_after_init
> >  /*
> >   * Assuring the safety of deleting a security module is up to
> >   * the security module involved. This may entail ordering the
> > @@ -1996,21 +1999,9 @@ extern void security_add_hooks(struct 
> > security_hook_list *hooks, int count,
> >   * disabling their module is a good idea needs to be at least as
> >   * careful as the SELinux team.
> >   */
> > -static inline void security_delete_hooks(struct security_hook_list *hooks,
> > -   int count)
> > -{
> > -   int i;
> > -
> > -   for (i = 0; i < count; i++)
> > -   hlist_del_rcu(&hooks[i].list);
> > -}
> > -#endif /* CONFIG_SECURITY_SELINUX_DISABLE */
> > -
> > -/* Currently required to handle SELinux runtime hook disable. */
> > -#ifdef CONFIG_SECURITY_WRITABLE_HOOKS
> > -#define __lsm_ro_after_init
> > +extern void security_delete_hooks(struct security_hook_list *hooks, int 
> > count);
> >  #else
> > -#define __lsm_ro_after_init__ro_after_init
> > +#define __lsm_mutable_after_init __ro_after_init
> >  #endif /* CONFIG_SECURITY_WRITABLE_HOOKS */
> >  
> >  extern int __init security_module_enable(const char *module);
> > diff --git a/security/Kconfig b/security/Kconfig
> > index c4302067a3ad..a3b8b1142e6f 100644
> > --- a/security/Kconfig
> > +++ b/security/Kconfig
> > @@ -32,7 +32,7 @@ config SECURITY
> >   If you are unsure how to answer this question, answer N.
> >  
> >  config SECURITY_WRITABLE_HOOKS
> > -   depends on SECURITY
> > +   depends on SECURITY && SRCU
> > bool
> > default n
> >  
> > diff --git a/security/apparmor/lsm.c b/security/apparmor/lsm.c
> > index 9a65eeaf7dfa..d6cca8169df0 100644
> > --- a/security/apparmor/lsm.c
> > +++ b/security/apparmor/lsm.c
> > @@ -1155,7 +1155,7 @@ static int __init apparmor_init(void)
> > goto buffers_out;
> > }
> > security_add_hooks(apparmor_hooks, ARRAY_SIZE(apparmor_hooks),
> > -   "apparmor");
> > +   "apparmor", false);
> >  
> > /* Report that AppArmor successfully initialized */
> > apparmor_initialized = 1;
> > diff --git a/security/commoncap.c b/security/commoncap.c
> > index 48620c93d697..fe4b0d9d44ce 100644
> > --- a/security/commoncap.c
> > +++ b/security/commoncap.c
> > @@ -1363,7 +1363,7 @@ struct security_hook_list capability_hooks[] 
> > __lsm_ro_after_init = {
> >  void __init capability_add_hooks(void)
> >  {
> > security_add_hooks(capability_hooks, ARRAY_SIZE(capability_hooks),
> > -   "capability");
> > +   "capability", false);
> >  }
> >  
> >  #endif /* CONFIG_SECURITY */
> > diff --git a/security/security.c b/security/security.c
> > index 3cafff61b049..2ddb64864e3e 100644
> > --- a/security/security.c
> > +++ b/security/security.c
> > @@ -29,6 +29,11 @@
> >  #include 
> >  #include 
> >  #include 
> > +#include 
> > +#include 
> > +
> > +#define SECURITY_HOOK_COUNT \
> > +   (sizeof(security_hook_heads) / sizeof(struct 

Re: [PATCH net] vhost: validate log when IOTLB is enabled

2018-03-29 Thread Jason Wang



On 2018年03月29日 22:44, Michael S. Tsirkin wrote:

On Thu, Mar 29, 2018 at 04:00:04PM +0800, Jason Wang wrote:

Vq log_base is the userspace address of the bitmap, which has nothing
to do with IOTLB. So it needs to be validated unconditionally;
otherwise we may try to use 0 as log_base, which may lead to pinning
pages and an unexpected result (e.g. triggering the BUG_ON() in
set_bit_to_user()).

Fixes: 6b1e6cc7855b0 ("vhost: new device IOTLB API")
Reported-by:syzbot+6304bf97ef436580f...@syzkaller.appspotmail.com
Signed-off-by: Jason Wang

One follow-up question:

We still observe that get user pages returns 0 sometimes. While I agree
we should not pass in unvalidated addresses, isn't this worth
documenting?




Looking at get_user_pages_fast(), it has:

    if (unlikely(!access_ok(write ? VERIFY_WRITE : VERIFY_READ,
                    (void __user *)start, len)))
        return 0;

So this is expected I think.
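
As a rough sketch of what "validated unconditionally" means here, the
check below runs whether or not an IOTLB is in use.  The helper name
and the log_size argument are illustrative, not the actual vhost code;
note that access_ok() in this kernel era takes an access type, a user
address and a length.

#include <linux/errno.h>
#include <linux/types.h>
#include <linux/uaccess.h>

static int example_check_log_base(u64 log_base, size_t log_size)
{
	if (!access_ok(VERIFY_WRITE,
		       (void __user *)(unsigned long)log_base, log_size))
		return -EFAULT;
	return 0;
}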

Thanks


[PATCH -next] drm/amdkfd: Make function kfd_dev_is_large_bar() static

2018-03-29 Thread Wei Yongjun
Fixes the following sparse warning:

drivers/gpu/drm/amd/amdkfd/kfd_chardev.c:1150:6: warning:
 symbol 'kfd_dev_is_large_bar' was not declared. Should it be static?

Signed-off-by: Wei Yongjun 
---
 drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
index cd679cf..fb5d997 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@@ -1147,7 +1147,7 @@ static int kfd_ioctl_acquire_vm(struct file *filep, 
struct kfd_process *p,
return ret;
 }
 
-bool kfd_dev_is_large_bar(struct kfd_dev *dev)
+static bool kfd_dev_is_large_bar(struct kfd_dev *dev)
 {
struct kfd_local_mem_info mem_info;



[PATCH -next] drm/amdkfd: Fix the error return code in kfd_ioctl_unmap_memory_from_gpu()

2018-03-29 Thread Wei Yongjun
Passing a NULL pointer to PTR_ERR will result in a return value of 0,
indicating success, which is clearly not what is intended here.
This patch returns -EINVAL instead.
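
For context, a small sketch of the two failure conventions involved:
PTR_ERR() only recovers an errno from an ERR_PTR()-encoded pointer, so
a plain NULL return has to be mapped to an error code explicitly.  The
lookup helpers below are hypothetical.

#include <linux/err.h>
#include <linux/errno.h>

extern void *lookup_returns_null(void);		/* returns NULL on failure */
extern void *lookup_returns_err_ptr(void);	/* returns ERR_PTR(-E...) on failure */

static int example(void)
{
	void *a, *b;

	a = lookup_returns_null();
	if (!a)
		return -EINVAL;		/* NULL carries no errno: pick one explicitly */

	b = lookup_returns_err_ptr();
	if (IS_ERR(b))
		return PTR_ERR(b);	/* the errno is encoded in the pointer itself */

	return 0;
}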

Fixes: 5ec7e02854b3 ("drm/amdkfd: Add ioctls for GPUVM memory management")
Signed-off-by: Wei Yongjun 
---
 drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
index cd679cf..c32a341 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@@ -1421,7 +1421,7 @@ static int kfd_ioctl_unmap_memory_from_gpu(struct file 
*filep,
 
pdd = kfd_get_process_device_data(dev, p);
if (!pdd) {
-   err = PTR_ERR(pdd);
+   err = -EINVAL;
goto bind_process_to_device_failed;
}



Re: [PATCH v3] ANDROID: binder: change down_write to down_read

2018-03-29 Thread Fengguang Wu

On Fri, Mar 30, 2018 at 09:37:36AM +0900, Minchan Kim wrote:

On Fri, Mar 30, 2018 at 07:42:37AM +0800, kbuild test robot wrote:

Hi Minchan,

I love your patch! Yet something to improve:


Glad to hear.
It's first time someone loves my patch. ;-)


FYI, that message originates from Linus. :-)


[auto build test ERROR on staging/staging-testing]
[also build test ERROR on v4.16-rc7 next-20180329]
[if your patch is applied to the wrong git tree, please drop us a note to help 
improve the system]

url:
https://github.com/0day-ci/linux/commits/Minchan-Kim/ANDROID-binder-change-down_write-to-down_read/20180330-043057
config: x86_64-randconfig-x014-201812 (attached as .config)
compiler: gcc-7 (Debian 7.3.0-1) 7.3.0
reproduce:
# save the attached .config to linux build tree
make ARCH=x86_64

All errors (new ones prefixed by >>):

   drivers//android/binder.c: In function 'binder_mmap':
>> drivers//android/binder.c:4725:24: error: 'struct vm_area_struct' has no 
member named 'flags'; did you mean 'vm_flags'?
 vma->vm_flags = (vma->flags | VM_DONTCOPY | VM_MIXEDMAP) &
   ^
   vm_flags


http://lkml.kernel.org/r/<20180329065424.203172-1-minc...@kernel.org>

This time, I was little bit fast. :)


Hurrah, quick hands! :)

Cheers,
Fengguang
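
For what it's worth, the robot's error is a plain member-name typo: struct
vm_area_struct keeps its flags in ->vm_flags, as the compiler hints. A
hedged sketch of the intended single-assignment form follows; the
~VM_MAYWRITE mask is assumed from mainline binder_mmap() and is not taken
from this patch, and the helper name is made up.

#include <linux/mm.h>

/* hypothetical helper, illustration only */
static void set_binder_mmap_flags(struct vm_area_struct *vma)
{
	/* read the current flags from ->vm_flags, not the non-existent ->flags */
	vma->vm_flags = (vma->vm_flags | VM_DONTCOPY | VM_MIXEDMAP) &
			~VM_MAYWRITE;
}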


Re: [PATCH v3 2/5] mm: page_alloc: reduce unnecessary binary search in memblock_next_valid_pfn()

2018-03-29 Thread Jia He



On 3/30/2018 9:43 AM, Wei Yang Wrote:

On Thu, Mar 29, 2018 at 04:06:38PM +0800, Jia He wrote:


On 3/28/2018 5:26 PM, Wei Yang Wrote:

On Sun, Mar 25, 2018 at 08:02:16PM -0700, Jia He wrote:

Commit b92df1de5d28 ("mm: page_alloc: skip over regions of invalid pfns
where possible") optimized the loop in memmap_init_zone(). But there is
still some room for improvement. E.g. if pfn and pfn+1 are in the same
memblock region, we can simply pfn++ instead of doing the binary search
in memblock_next_valid_pfn. This patch only works when
CONFIG_HAVE_ARCH_PFN_VALID is enable.

Signed-off-by: Jia He 
---
include/linux/memblock.h |  2 +-
mm/memblock.c| 73 +---
mm/page_alloc.c  |  3 +-
3 files changed, 47 insertions(+), 31 deletions(-)

diff --git a/include/linux/memblock.h b/include/linux/memblock.h
index efbbe4b..a8fb2ab 100644
--- a/include/linux/memblock.h
+++ b/include/linux/memblock.h
@@ -204,7 +204,7 @@ void __next_mem_pfn_range(int *idx, int nid, unsigned long 
*out_start_pfn,
#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */

#ifdef CONFIG_HAVE_ARCH_PFN_VALID
-unsigned long memblock_next_valid_pfn(unsigned long pfn);
+unsigned long memblock_next_valid_pfn(unsigned long pfn, int *idx);
#endif

/**
diff --git a/mm/memblock.c b/mm/memblock.c
index bea5a9c..06c1a08 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -1102,35 +1102,6 @@ void __init_memblock __next_mem_pfn_range(int *idx, int 
nid,
*out_nid = r->nid;
}

-#ifdef CONFIG_HAVE_ARCH_PFN_VALID
-unsigned long __init_memblock memblock_next_valid_pfn(unsigned long pfn)
-{
-   struct memblock_type *type = &memblock.memory;
-   unsigned int right = type->cnt;
-   unsigned int mid, left = 0;
-   phys_addr_t addr = PFN_PHYS(++pfn);
-
-   do {
-   mid = (right + left) / 2;
-
-   if (addr < type->regions[mid].base)
-   right = mid;
-   else if (addr >= (type->regions[mid].base +
- type->regions[mid].size))
-   left = mid + 1;
-   else {
-   /* addr is within the region, so pfn is valid */
-   return pfn;
-   }
-   } while (left < right);
-
-   if (right == type->cnt)
-   return -1UL;
-   else
-   return PHYS_PFN(type->regions[right].base);
-}
-#endif /*CONFIG_HAVE_ARCH_PFN_VALID*/
-
/**
   * memblock_set_node - set node ID on memblock regions
   * @base: base of area to set node ID for
@@ -1162,6 +1133,50 @@ int __init_memblock memblock_set_node(phys_addr_t base, 
phys_addr_t size,
}
#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */

+#ifdef CONFIG_HAVE_ARCH_PFN_VALID
+unsigned long __init_memblock memblock_next_valid_pfn(unsigned long pfn,
+   int *last_idx)
+{
+   struct memblock_type *type = &memblock.memory;
+   unsigned int right = type->cnt;
+   unsigned int mid, left = 0;
+   unsigned long start_pfn, end_pfn;
+   phys_addr_t addr = PFN_PHYS(++pfn);
+
+   /* fast path, return pfh+1 if next pfn is in the same region */

   ^^^  pfn

Thanks

+   if (*last_idx != -1) {
+   start_pfn = PFN_DOWN(type->regions[*last_idx].base);

To me, it should be PFN_UP().

hmm.., seems all the base of memory region is pfn aligned (0x1 aligned). So
PFN_UP is the same as PFN_DOWN here?
I got this logic from memblock_search_pfn_nid()

Ok, I guess here hide some buggy code.

When you look at __next_mem_pfn_range(), it uses PFN_UP() for base. The reason
is try to clip some un-page-aligned memory. While PFN_DOWN() will introduce
some unavailable memory to system.

Even mostly those address are page-aligned, we need to be careful for this.

Let me drop a patch to fix the original one.
Ok, please cc me, I will change the related code when your patch is 
accepted. ;-)

Cheers,
Jia


+   end_pfn = PFN_DOWN(type->regions[*last_idx].base +
+   type->regions[*last_idx].size);
+
+   if (pfn < end_pfn && pfn > start_pfn)

Could be (pfn < end_pfn && pfn >= start_pfn)?

pfn == start_pfn is also a valid address.

No, pfn=pfn+1 at the beginning, so pfn != start_pfn

This is a little bit tricky.

There is no requirement to pass a valid pfn to memblock_next_valid_pfn(). So
suppose we have memory layout like this:

 [0x100, 0x1ff]
 [0x300, 0x3ff]

And I call memblock_next_valid_pfn(0x2ff, 1), would this fit the fast path
logic?

Well, since memblock_next_valid_pfn() is only used in memmap_init_zone(), the
situation I mentioned will probably not happen.

Even so, I suggest changing this, otherwise your logic in slow path and
fast path differs. In the case above, your slow path returns 0x300 at last.
Ok, looks like it does no harm, even though I thought the code would
guarantee it to skip from 0x1ff to 0x300 directly.
I will change it after functional test.

--
Cheers,
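
A rough consolidated sketch of the approach under discussion (not Jia He's
final patch; the function name is made up): the index of the region that
satisfied the previous lookup is cached so consecutive pfns skip the binary
search, the cached region's base is rounded up with PFN_UP() so a partially
covered leading page is not reported as valid, and the fast-path test uses
">= start_pfn", per Wei Yang's review.

#include <linux/memblock.h>
#include <linux/pfn.h>

unsigned long next_valid_pfn_cached(unsigned long pfn, int *last_idx)
{
	struct memblock_type *type = &memblock.memory;
	unsigned long start_pfn, end_pfn;
	unsigned int mid, left = 0, right = type->cnt;
	phys_addr_t addr = PFN_PHYS(++pfn);

	/* fast path: pfn + 1 still falls inside the previously hit region */
	if (*last_idx != -1) {
		start_pfn = PFN_UP(type->regions[*last_idx].base);
		end_pfn = PFN_DOWN(type->regions[*last_idx].base +
				   type->regions[*last_idx].size);
		if (pfn >= start_pfn && pfn < end_pfn)
			return pfn;
	}

	/* slow path: binary search over the memblock region array */
	do {
		mid = (right + left) / 2;
		if (addr < type->regions[mid].base)
			right = mid;
		else if (addr >= (type->regions[mid].base +
				  type->regions[mid].size))
			left = mid + 1;
		else {
			*last_idx = mid;	/* addr is inside this region */
			return pfn;
		}
	} while (left < right);

	if (right == type->cnt)
		return -1UL;			/* past the last region */
	*last_idx = right;
	/* round up, so a partial leading page is skipped */
	return PFN_UP(type->regions[right].base);
}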
