[PATCH] drm/kms/mode: added a new helper for calculating videomode from crtc's display mode

2018-05-02 Thread Satendra Singh Thakur
1.
-Added a new helper drm_display_mode_crtc_to_videomode
-This helper calculates mode parameters like
  horizontal front_porch, back_porch, sync length
  vertical front_porch, back_porch, sync length
  using crtc_* fields of struct drm_display_mode
-It uses following fields of crtc mode
 horizontal sync start/end, active and total length
 vertical sync start/end, active and total length
2.
-Most of the drivers use user-supplied mode for calculating videomode
-However, a few drivers use HW (crtc) mode for calculating videomode
-This helper will be useful for such drivers
3.
-Currently following drivers will be using this new helper
-arm hdlcd
-atmel hlcdc
-exynos 5433 decon
-exynos7 decon
-exynos fimd
4.
-This patch removes related duplicate code from above mentioned drivers

Signed-off-by: Satendra Singh Thakur 
Cc: Madhur Verma 
Cc: Hemanshu Srivastava 
---
 drivers/gpu/drm/arm/hdlcd_crtc.c   |  8 +---
 drivers/gpu/drm/atmel-hlcdc/atmel_hlcdc_crtc.c |  7 +--
 drivers/gpu/drm/drm_modes.c| 20 
 drivers/gpu/drm/exynos/exynos5433_drm_decon.c  | 22 ++
 drivers/gpu/drm/exynos/exynos7_drm_decon.c | 23 ++-
 drivers/gpu/drm/exynos/exynos_drm_fimd.c   | 22 +-
 include/drm/drm_modes.h|  2 ++
 7 files changed, 53 insertions(+), 51 deletions(-)

diff --git a/drivers/gpu/drm/arm/hdlcd_crtc.c b/drivers/gpu/drm/arm/hdlcd_crtc.c
index cf5cbd6..d20e471 100644
--- a/drivers/gpu/drm/arm/hdlcd_crtc.c
+++ b/drivers/gpu/drm/arm/hdlcd_crtc.c
@@ -130,13 +130,7 @@ static void hdlcd_crtc_mode_set_nofb(struct drm_crtc *crtc)
struct videomode vm;
unsigned int polarities, err;
 
-   vm.vfront_porch = m->crtc_vsync_start - m->crtc_vdisplay;
-   vm.vback_porch = m->crtc_vtotal - m->crtc_vsync_end;
-   vm.vsync_len = m->crtc_vsync_end - m->crtc_vsync_start;
-   vm.hfront_porch = m->crtc_hsync_start - m->crtc_hdisplay;
-   vm.hback_porch = m->crtc_htotal - m->crtc_hsync_end;
-   vm.hsync_len = m->crtc_hsync_end - m->crtc_hsync_start;
-
+   drm_display_mode_crtc_to_videomode(m, &vm);
polarities = HDLCD_POLARITY_DATAEN | HDLCD_POLARITY_DATA;
 
if (m->flags & DRM_MODE_FLAG_PHSYNC)
diff --git a/drivers/gpu/drm/atmel-hlcdc/atmel_hlcdc_crtc.c 
b/drivers/gpu/drm/atmel-hlcdc/atmel_hlcdc_crtc.c
index d732810..bafcef6 100644
--- a/drivers/gpu/drm/atmel-hlcdc/atmel_hlcdc_crtc.c
+++ b/drivers/gpu/drm/atmel-hlcdc/atmel_hlcdc_crtc.c
@@ -81,12 +81,7 @@ static void atmel_hlcdc_crtc_mode_set_nofb(struct drm_crtc 
*c)
unsigned int cfg;
int div;
 
-   vm.vfront_porch = adj->crtc_vsync_start - adj->crtc_vdisplay;
-   vm.vback_porch = adj->crtc_vtotal - adj->crtc_vsync_end;
-   vm.vsync_len = adj->crtc_vsync_end - adj->crtc_vsync_start;
-   vm.hfront_porch = adj->crtc_hsync_start - adj->crtc_hdisplay;
-   vm.hback_porch = adj->crtc_htotal - adj->crtc_hsync_end;
-   vm.hsync_len = adj->crtc_hsync_end - adj->crtc_hsync_start;
+   drm_display_mode_crtc_to_videomode(adj, &vm);
 
regmap_write(regmap, ATMEL_HLCDC_CFG(1),
 (vm.hsync_len - 1) | ((vm.vsync_len - 1) << 16));
diff --git a/drivers/gpu/drm/drm_modes.c b/drivers/gpu/drm/drm_modes.c
index e82b61e..a406749 100644
--- a/drivers/gpu/drm/drm_modes.c
+++ b/drivers/gpu/drm/drm_modes.c
@@ -654,6 +654,26 @@ void drm_display_mode_to_videomode(const struct 
drm_display_mode *dmode,
vm->flags |= DISPLAY_FLAGS_DOUBLECLK;
 }
 EXPORT_SYMBOL_GPL(drm_display_mode_to_videomode);
+/**
+ * drm_display_mode_crtc_to_videomode - fill in @vm using crtc fields of @dmode
+ * @dmode: drm_display_mode structure to use as source
+ * @vm: videomode structure to use as destination
+ *
+ * Fills out @vm using the crtc display mode specified in @dmode.
+ */
+void drm_display_mode_crtc_to_videomode(const struct drm_display_mode *dmode,
+  struct videomode *vm)
+{
+   vm->hfront_porch = dmode->crtc_hsync_start - dmode->crtc_hdisplay;
+   vm->hsync_len = dmode->crtc_hsync_end - dmode->crtc_hsync_start;
+   vm->hback_porch = dmode->crtc_htotal - dmode->crtc_hsync_end;
+
+   vm->vfront_porch = dmode->crtc_vsync_start - dmode->crtc_vdisplay;
+   vm->vsync_len = dmode->crtc_vsync_end - dmode->crtc_vsync_start;
+   vm->vback_porch = dmode->crtc_vtotal - dmode->crtc_vsync_end;
+
+}
+EXPORT_SYMBOL_GPL(drm_display_mode_crtc_to_videomode);
 
 /**
  * drm_bus_flags_from_videomode - extract information about pixelclk and
diff --git a/drivers/gpu/drm/exynos/exynos5433_drm_decon.c 
b/drivers/gpu/drm/exynos/exynos5433_drm_decon.c
index 1c330f2..1ba73a8 100644
--- a/drivers/gpu/drm/exynos/exynos5433_drm_decon.c
+++ b/drivers/gpu/drm/exynos/exynos5433_drm_decon.c
@@ -20,6 +20,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include 

[PATCH] drm/kms/mode: added a new helper for calculating videomode from crtc's display mode

2018-05-02 Thread Satendra Singh Thakur
1.
-Added a new helper drm_display_mode_crtc_to_videomode
-This helper calculates mode parameters like
  horizontal front_porch, back_porch, sync length
  vertical front_porch, back_porch, sync length
  using crtc_* fields of struct drm_display_mode
-It uses following fields of crtc mode
 horizontal sync start/end, active and total length
 vertical sync start/end, active and total length
2.
-Most of the drivers use user-supplied mode for calculating videomode
-However, a few drivers use HW (crtc) mode for calculating videomode
-This helper will be useful for such drivers
3.
-Currently following drivers will be using this new helper
-arm hdlcd
-atmel hlcdc
-exynos 5433 decon
-exynos7 decon
-exynos fimd
4.
-This patch removes related duplicate code from above mentioned drivers

Signed-off-by: Satendra Singh Thakur 
Cc: Madhur Verma 
Cc: Hemanshu Srivastava 
---
 drivers/gpu/drm/arm/hdlcd_crtc.c   |  8 +---
 drivers/gpu/drm/atmel-hlcdc/atmel_hlcdc_crtc.c |  7 +--
 drivers/gpu/drm/drm_modes.c| 20 
 drivers/gpu/drm/exynos/exynos5433_drm_decon.c  | 22 ++
 drivers/gpu/drm/exynos/exynos7_drm_decon.c | 23 ++-
 drivers/gpu/drm/exynos/exynos_drm_fimd.c   | 22 +-
 include/drm/drm_modes.h|  2 ++
 7 files changed, 53 insertions(+), 51 deletions(-)

diff --git a/drivers/gpu/drm/arm/hdlcd_crtc.c b/drivers/gpu/drm/arm/hdlcd_crtc.c
index cf5cbd6..d20e471 100644
--- a/drivers/gpu/drm/arm/hdlcd_crtc.c
+++ b/drivers/gpu/drm/arm/hdlcd_crtc.c
@@ -130,13 +130,7 @@ static void hdlcd_crtc_mode_set_nofb(struct drm_crtc *crtc)
struct videomode vm;
unsigned int polarities, err;
 
-   vm.vfront_porch = m->crtc_vsync_start - m->crtc_vdisplay;
-   vm.vback_porch = m->crtc_vtotal - m->crtc_vsync_end;
-   vm.vsync_len = m->crtc_vsync_end - m->crtc_vsync_start;
-   vm.hfront_porch = m->crtc_hsync_start - m->crtc_hdisplay;
-   vm.hback_porch = m->crtc_htotal - m->crtc_hsync_end;
-   vm.hsync_len = m->crtc_hsync_end - m->crtc_hsync_start;
-
+   drm_display_mode_crtc_to_videomode(m, &vm);
polarities = HDLCD_POLARITY_DATAEN | HDLCD_POLARITY_DATA;
 
if (m->flags & DRM_MODE_FLAG_PHSYNC)
diff --git a/drivers/gpu/drm/atmel-hlcdc/atmel_hlcdc_crtc.c 
b/drivers/gpu/drm/atmel-hlcdc/atmel_hlcdc_crtc.c
index d732810..bafcef6 100644
--- a/drivers/gpu/drm/atmel-hlcdc/atmel_hlcdc_crtc.c
+++ b/drivers/gpu/drm/atmel-hlcdc/atmel_hlcdc_crtc.c
@@ -81,12 +81,7 @@ static void atmel_hlcdc_crtc_mode_set_nofb(struct drm_crtc 
*c)
unsigned int cfg;
int div;
 
-   vm.vfront_porch = adj->crtc_vsync_start - adj->crtc_vdisplay;
-   vm.vback_porch = adj->crtc_vtotal - adj->crtc_vsync_end;
-   vm.vsync_len = adj->crtc_vsync_end - adj->crtc_vsync_start;
-   vm.hfront_porch = adj->crtc_hsync_start - adj->crtc_hdisplay;
-   vm.hback_porch = adj->crtc_htotal - adj->crtc_hsync_end;
-   vm.hsync_len = adj->crtc_hsync_end - adj->crtc_hsync_start;
+   drm_display_mode_crtc_to_videomode(adj, &vm);
 
regmap_write(regmap, ATMEL_HLCDC_CFG(1),
 (vm.hsync_len - 1) | ((vm.vsync_len - 1) << 16));
diff --git a/drivers/gpu/drm/drm_modes.c b/drivers/gpu/drm/drm_modes.c
index e82b61e..a406749 100644
--- a/drivers/gpu/drm/drm_modes.c
+++ b/drivers/gpu/drm/drm_modes.c
@@ -654,6 +654,26 @@ void drm_display_mode_to_videomode(const struct 
drm_display_mode *dmode,
vm->flags |= DISPLAY_FLAGS_DOUBLECLK;
 }
 EXPORT_SYMBOL_GPL(drm_display_mode_to_videomode);
+/**
+ * drm_display_mode_crtc_to_videomode - fill in @vm using crtc fields of @dmode
+ * @dmode: drm_display_mode structure to use as source
+ * @vm: videomode structure to use as destination
+ *
+ * Fills out @vm using the crtc display mode specified in @dmode.
+ */
+void drm_display_mode_crtc_to_videomode(const struct drm_display_mode *dmode,
+  struct videomode *vm)
+{
+   vm->hfront_porch = dmode->crtc_hsync_start - dmode->crtc_hdisplay;
+   vm->hsync_len = dmode->crtc_hsync_end - dmode->crtc_hsync_start;
+   vm->hback_porch = dmode->crtc_htotal - dmode->crtc_hsync_end;
+
+   vm->vfront_porch = dmode->crtc_vsync_start - dmode->crtc_vdisplay;
+   vm->vsync_len = dmode->crtc_vsync_end - dmode->crtc_vsync_start;
+   vm->vback_porch = dmode->crtc_vtotal - dmode->crtc_vsync_end;
+
+}
+EXPORT_SYMBOL_GPL(drm_display_mode_crtc_to_videomode);
 
 /**
  * drm_bus_flags_from_videomode - extract information about pixelclk and
diff --git a/drivers/gpu/drm/exynos/exynos5433_drm_decon.c 
b/drivers/gpu/drm/exynos/exynos5433_drm_decon.c
index 1c330f2..1ba73a8 100644
--- a/drivers/gpu/drm/exynos/exynos5433_drm_decon.c
+++ b/drivers/gpu/drm/exynos/exynos5433_drm_decon.c
@@ -20,6 +20,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include "exynos_drm_drv.h"
 #include "exynos_drm_crtc.h"
@@ -225,26 +226,23 @@ static 

Re: [PATCH v2 4/9] x86, memcpy_mcsafe: add write-protection-fault handling

2018-05-02 Thread Mika Penttilä
On 05/03/2018 07:59 AM, Dan Williams wrote:
> In preparation for using memcpy_mcsafe() to handle user copies it needs
> to be able to handle write-protection faults while writing user pages. Add
> MMU-fault handlers alongside the machine-check exception handlers.
> 
> Note that the machine check fault exception handling makes assumptions
> about source buffer alignment and poison alignment. In the write fault
> case, given the destination buffer is arbitrarily aligned, it needs a
> separate / additional fault handling approach. The mcsafe_handle_tail()
> helper is reused. The @limit argument is set to @len since there is no
> safety concern about retriggering an MMU fault, and this simplifies the
> assembly.
> 

> diff --git a/arch/x86/lib/usercopy_64.c b/arch/x86/lib/usercopy_64.c
> index 75d3776123cc..9787f5ee0cf9 100644
> --- a/arch/x86/lib/usercopy_64.c
> +++ b/arch/x86/lib/usercopy_64.c
> @@ -75,6 +75,23 @@ copy_user_handle_tail(char *to, char *from, unsigned len)
>   return len;
>  }
>  
> +/*
> + * Similar to copy_user_handle_tail, probe for the write fault point,
> + * but reuse __memcpy_mcsafe in case a new read error is encountered.
> + * clac() is handled in _copy_to_iter_mcsafe().
> + */
> +__visible unsigned long
> +mcsafe_handle_tail(char *to, char *from, unsigned len)
> +{
> + for (; len; --len, to++) {
> + unsigned long rem = memcpy_mcsafe(to, from, 1);
> +


Hmm why not 
for (; len; --len, from++, to++)



> + if (rem)
> + break;
> + }
> + return len;
> +}


--Mika



Re: [PATCH v2 4/9] x86, memcpy_mcsafe: add write-protection-fault handling

2018-05-02 Thread Mika Penttilä
On 05/03/2018 07:59 AM, Dan Williams wrote:
> In preparation for using memcpy_mcsafe() to handle user copies it needs
> to be able to handle write-protection faults while writing user pages. Add
> MMU-fault handlers alongside the machine-check exception handlers.
> 
> Note that the machine check fault exception handling makes assumptions
> about source buffer alignment and poison alignment. In the write fault
> case, given the destination buffer is arbitrarily aligned, it needs a
> separate / additional fault handling approach. The mcsafe_handle_tail()
> helper is reused. The @limit argument is set to @len since there is no
> safety concern about retriggering an MMU fault, and this simplifies the
> assembly.
> 

> diff --git a/arch/x86/lib/usercopy_64.c b/arch/x86/lib/usercopy_64.c
> index 75d3776123cc..9787f5ee0cf9 100644
> --- a/arch/x86/lib/usercopy_64.c
> +++ b/arch/x86/lib/usercopy_64.c
> @@ -75,6 +75,23 @@ copy_user_handle_tail(char *to, char *from, unsigned len)
>   return len;
>  }
>  
> +/*
> + * Similar to copy_user_handle_tail, probe for the write fault point,
> + * but reuse __memcpy_mcsafe in case a new read error is encountered.
> + * clac() is handled in _copy_to_iter_mcsafe().
> + */
> +__visible unsigned long
> +mcsafe_handle_tail(char *to, char *from, unsigned len)
> +{
> + for (; len; --len, to++) {
> + unsigned long rem = memcpy_mcsafe(to, from, 1);
> +


Hmm why not 
for (; len; --len, from++, to++)



> + if (rem)
> + break;
> + }
> + return len;
> +}


--Mika



[PATCH v2] NFC: fdp: Remove __func__ from dev_dbg()

2018-05-02 Thread Amit Pundir
Remove redundant __func__ parameter from dev_dbg() calls.

v2:
Deleted empty dev_dbg() trace calls, which are redundant if
function tracer is enabled.

Signed-off-by: Amit Pundir 
---
 drivers/nfc/fdp/fdp.c | 18 +++---
 drivers/nfc/fdp/i2c.c | 17 -
 2 files changed, 7 insertions(+), 28 deletions(-)

diff --git a/drivers/nfc/fdp/fdp.c b/drivers/nfc/fdp/fdp.c
index d5784a4..f64a6fd 100644
--- a/drivers/nfc/fdp/fdp.c
+++ b/drivers/nfc/fdp/fdp.c
@@ -249,8 +249,6 @@ static int fdp_nci_open(struct nci_dev *ndev)
struct fdp_nci_info *info = nci_get_drvdata(ndev);
struct device *dev = >phy->i2c_dev->dev;
 
-   dev_dbg(dev, "%s\n", __func__);
-
r = info->phy_ops->enable(info->phy);
 
return r;
@@ -261,7 +259,6 @@ static int fdp_nci_close(struct nci_dev *ndev)
struct fdp_nci_info *info = nci_get_drvdata(ndev);
struct device *dev = >phy->i2c_dev->dev;
 
-   dev_dbg(dev, "%s\n", __func__);
return 0;
 }
 
@@ -270,8 +267,6 @@ static int fdp_nci_send(struct nci_dev *ndev, struct 
sk_buff *skb)
struct fdp_nci_info *info = nci_get_drvdata(ndev);
struct device *dev = >phy->i2c_dev->dev;
 
-   dev_dbg(dev, "%s\n", __func__);
-
if (atomic_dec_and_test(>data_pkt_counter))
info->data_pkt_counter_cb(ndev);
 
@@ -283,7 +278,6 @@ int fdp_nci_recv_frame(struct nci_dev *ndev, struct sk_buff 
*skb)
struct fdp_nci_info *info = nci_get_drvdata(ndev);
struct device *dev = >phy->i2c_dev->dev;
 
-   dev_dbg(dev, "%s\n", __func__);
return nci_recv_frame(ndev, skb);
 }
 EXPORT_SYMBOL(fdp_nci_recv_frame);
@@ -498,8 +492,6 @@ static int fdp_nci_setup(struct nci_dev *ndev)
int r;
u8 patched = 0;
 
-   dev_dbg(dev, "%s\n", __func__);
-
r = nci_core_init(ndev);
if (r)
goto error;
@@ -609,7 +601,6 @@ static int fdp_nci_core_reset_ntf_packet(struct nci_dev 
*ndev,
struct fdp_nci_info *info = nci_get_drvdata(ndev);
struct device *dev = >phy->i2c_dev->dev;
 
-   dev_dbg(dev, "%s\n", __func__);
info->setup_reset_ntf = 1;
wake_up(>setup_wq);
 
@@ -622,7 +613,6 @@ static int fdp_nci_prop_patch_ntf_packet(struct nci_dev 
*ndev,
struct fdp_nci_info *info = nci_get_drvdata(ndev);
struct device *dev = >phy->i2c_dev->dev;
 
-   dev_dbg(dev, "%s\n", __func__);
info->setup_patch_ntf = 1;
info->setup_patch_status = skb->data[0];
wake_up(>setup_wq);
@@ -637,7 +627,7 @@ static int fdp_nci_prop_patch_rsp_packet(struct nci_dev 
*ndev,
struct device *dev = >phy->i2c_dev->dev;
u8 status = skb->data[0];
 
-   dev_dbg(dev, "%s: status 0x%x\n", __func__, status);
+   dev_dbg(dev, "status 0x%x\n", status);
nci_req_complete(ndev, status);
 
return 0;
@@ -650,7 +640,7 @@ static int 
fdp_nci_prop_set_production_data_rsp_packet(struct nci_dev *ndev,
struct device *dev = >phy->i2c_dev->dev;
u8 status = skb->data[0];
 
-   dev_dbg(dev, "%s: status 0x%x\n", __func__, status);
+   dev_dbg(dev, "status 0x%x\n", status);
nci_req_complete(ndev, status);
 
return 0;
@@ -695,7 +685,7 @@ static int fdp_nci_core_get_config_rsp_packet(struct 
nci_dev *ndev,
dev_dbg(dev, "OTP version %d\n", info->otp_version);
dev_dbg(dev, "RAM version %d\n", info->ram_version);
dev_dbg(dev, "key index %d\n", info->key_index);
-   dev_dbg(dev, "%s: status 0x%x\n", __func__, rsp->status);
+   dev_dbg(dev, "status 0x%x\n", rsp->status);
 
nci_req_complete(ndev, rsp->status);
 
@@ -798,8 +788,6 @@ void fdp_nci_remove(struct nci_dev *ndev)
struct fdp_nci_info *info = nci_get_drvdata(ndev);
struct device *dev = >phy->i2c_dev->dev;
 
-   dev_dbg(dev, "%s\n", __func__);
-
nci_unregister_device(ndev);
nci_free_device(ndev);
 }
diff --git a/drivers/nfc/fdp/i2c.c b/drivers/nfc/fdp/i2c.c
index c4da50e..f355ab2 100644
--- a/drivers/nfc/fdp/i2c.c
+++ b/drivers/nfc/fdp/i2c.c
@@ -57,7 +57,6 @@ static int fdp_nci_i2c_enable(void *phy_id)
 {
struct fdp_i2c_phy *phy = phy_id;
 
-   dev_dbg(>i2c_dev->dev, "%s\n", __func__);
fdp_nci_i2c_reset(phy);
 
return 0;
@@ -67,7 +66,6 @@ static void fdp_nci_i2c_disable(void *phy_id)
 {
struct fdp_i2c_phy *phy = phy_id;
 
-   dev_dbg(>i2c_dev->dev, "%s\n", __func__);
fdp_nci_i2c_reset(phy);
 }
 
@@ -113,8 +111,8 @@ static int fdp_nci_i2c_write(void *phy_id, struct sk_buff 
*skb)
}
 
if (r < 0 || r != skb->len)
-   dev_dbg(>dev, "%s: error err=%d len=%d\n",
-   __func__, r, skb->len);
+   dev_dbg(>dev, "error err=%d len=%d\n",
+   r, skb->len);
 
if (r >= 0) {
if (r != skb->len) {
@@ -152,8 +150,7 @@ static int fdp_nci_i2c_read(struct fdp_i2c_phy *phy, struct 
sk_buff 

[PATCH v2] NFC: fdp: Remove __func__ from dev_dbg()

2018-05-02 Thread Amit Pundir
Remove redundant __func__ parameter from dev_dbg() calls.

v2:
Deleted empty dev_dbg() trace calls, which are redundant if
function tracer is enabled.

Signed-off-by: Amit Pundir 
---
 drivers/nfc/fdp/fdp.c | 18 +++---
 drivers/nfc/fdp/i2c.c | 17 -
 2 files changed, 7 insertions(+), 28 deletions(-)

diff --git a/drivers/nfc/fdp/fdp.c b/drivers/nfc/fdp/fdp.c
index d5784a4..f64a6fd 100644
--- a/drivers/nfc/fdp/fdp.c
+++ b/drivers/nfc/fdp/fdp.c
@@ -249,8 +249,6 @@ static int fdp_nci_open(struct nci_dev *ndev)
struct fdp_nci_info *info = nci_get_drvdata(ndev);
struct device *dev = >phy->i2c_dev->dev;
 
-   dev_dbg(dev, "%s\n", __func__);
-
r = info->phy_ops->enable(info->phy);
 
return r;
@@ -261,7 +259,6 @@ static int fdp_nci_close(struct nci_dev *ndev)
struct fdp_nci_info *info = nci_get_drvdata(ndev);
struct device *dev = >phy->i2c_dev->dev;
 
-   dev_dbg(dev, "%s\n", __func__);
return 0;
 }
 
@@ -270,8 +267,6 @@ static int fdp_nci_send(struct nci_dev *ndev, struct 
sk_buff *skb)
struct fdp_nci_info *info = nci_get_drvdata(ndev);
struct device *dev = >phy->i2c_dev->dev;
 
-   dev_dbg(dev, "%s\n", __func__);
-
if (atomic_dec_and_test(>data_pkt_counter))
info->data_pkt_counter_cb(ndev);
 
@@ -283,7 +278,6 @@ int fdp_nci_recv_frame(struct nci_dev *ndev, struct sk_buff 
*skb)
struct fdp_nci_info *info = nci_get_drvdata(ndev);
struct device *dev = >phy->i2c_dev->dev;
 
-   dev_dbg(dev, "%s\n", __func__);
return nci_recv_frame(ndev, skb);
 }
 EXPORT_SYMBOL(fdp_nci_recv_frame);
@@ -498,8 +492,6 @@ static int fdp_nci_setup(struct nci_dev *ndev)
int r;
u8 patched = 0;
 
-   dev_dbg(dev, "%s\n", __func__);
-
r = nci_core_init(ndev);
if (r)
goto error;
@@ -609,7 +601,6 @@ static int fdp_nci_core_reset_ntf_packet(struct nci_dev 
*ndev,
struct fdp_nci_info *info = nci_get_drvdata(ndev);
struct device *dev = >phy->i2c_dev->dev;
 
-   dev_dbg(dev, "%s\n", __func__);
info->setup_reset_ntf = 1;
wake_up(>setup_wq);
 
@@ -622,7 +613,6 @@ static int fdp_nci_prop_patch_ntf_packet(struct nci_dev 
*ndev,
struct fdp_nci_info *info = nci_get_drvdata(ndev);
struct device *dev = >phy->i2c_dev->dev;
 
-   dev_dbg(dev, "%s\n", __func__);
info->setup_patch_ntf = 1;
info->setup_patch_status = skb->data[0];
wake_up(>setup_wq);
@@ -637,7 +627,7 @@ static int fdp_nci_prop_patch_rsp_packet(struct nci_dev 
*ndev,
struct device *dev = >phy->i2c_dev->dev;
u8 status = skb->data[0];
 
-   dev_dbg(dev, "%s: status 0x%x\n", __func__, status);
+   dev_dbg(dev, "status 0x%x\n", status);
nci_req_complete(ndev, status);
 
return 0;
@@ -650,7 +640,7 @@ static int 
fdp_nci_prop_set_production_data_rsp_packet(struct nci_dev *ndev,
struct device *dev = >phy->i2c_dev->dev;
u8 status = skb->data[0];
 
-   dev_dbg(dev, "%s: status 0x%x\n", __func__, status);
+   dev_dbg(dev, "status 0x%x\n", status);
nci_req_complete(ndev, status);
 
return 0;
@@ -695,7 +685,7 @@ static int fdp_nci_core_get_config_rsp_packet(struct 
nci_dev *ndev,
dev_dbg(dev, "OTP version %d\n", info->otp_version);
dev_dbg(dev, "RAM version %d\n", info->ram_version);
dev_dbg(dev, "key index %d\n", info->key_index);
-   dev_dbg(dev, "%s: status 0x%x\n", __func__, rsp->status);
+   dev_dbg(dev, "status 0x%x\n", rsp->status);
 
nci_req_complete(ndev, rsp->status);
 
@@ -798,8 +788,6 @@ void fdp_nci_remove(struct nci_dev *ndev)
struct fdp_nci_info *info = nci_get_drvdata(ndev);
struct device *dev = >phy->i2c_dev->dev;
 
-   dev_dbg(dev, "%s\n", __func__);
-
nci_unregister_device(ndev);
nci_free_device(ndev);
 }
diff --git a/drivers/nfc/fdp/i2c.c b/drivers/nfc/fdp/i2c.c
index c4da50e..f355ab2 100644
--- a/drivers/nfc/fdp/i2c.c
+++ b/drivers/nfc/fdp/i2c.c
@@ -57,7 +57,6 @@ static int fdp_nci_i2c_enable(void *phy_id)
 {
struct fdp_i2c_phy *phy = phy_id;
 
-   dev_dbg(>i2c_dev->dev, "%s\n", __func__);
fdp_nci_i2c_reset(phy);
 
return 0;
@@ -67,7 +66,6 @@ static void fdp_nci_i2c_disable(void *phy_id)
 {
struct fdp_i2c_phy *phy = phy_id;
 
-   dev_dbg(>i2c_dev->dev, "%s\n", __func__);
fdp_nci_i2c_reset(phy);
 }
 
@@ -113,8 +111,8 @@ static int fdp_nci_i2c_write(void *phy_id, struct sk_buff 
*skb)
}
 
if (r < 0 || r != skb->len)
-   dev_dbg(>dev, "%s: error err=%d len=%d\n",
-   __func__, r, skb->len);
+   dev_dbg(>dev, "error err=%d len=%d\n",
+   r, skb->len);
 
if (r >= 0) {
if (r != skb->len) {
@@ -152,8 +150,7 @@ static int fdp_nci_i2c_read(struct fdp_i2c_phy *phy, struct 
sk_buff **skb)
 
   

Re: [PATCH 2/2] drivers core: multi-threading device shutdown

2018-05-02 Thread Tobin C. Harding
This code was a pleasure to read, super clean.

On Wed, May 02, 2018 at 11:59:31PM -0400, Pavel Tatashin wrote:
> When system is rebooted, halted or kexeced device_shutdown() is
> called.
> 
> This function shuts down every single device by calling either:
>   dev->bus->shutdown(dev)
>   dev->driver->shutdown(dev)
> 
> Even on a machine just with a moderate amount of devices, device_shutdown()
> may take multiple seconds to complete. Because many devices require a
> specific delays to perform this operation.
> 
> Here is sample analysis of time it takes to call device_shutdown() on
> two socket Intel(R) Xeon(R) CPU E5-2630 v4 @ 2.20GHz machine.
> 
> device_shutdown   2.95s
>  mlx4_shutdown1.14s
>  megasas_shutdown 0.24s
>  ixgbe_shutdown   0.37s x 4 (four ixgbe devices on my machine).
>  the rest 0.09s
> 
> In mlx4 we spent the most time, but that is because there is a 1 second
> sleep:
> mlx4_shutdown
>  mlx4_unload_one
>   mlx4_free_ownership
>msleep(1000)
> 
> With megasas we spend a quarter of a second, but sometimes longer (up to 0.5s)
> in this path:
> 
> megasas_shutdown
>   megasas_flush_cache
> megasas_issue_blocked_cmd
>   wait_event_timeout
> 
> Finally, with ixgbe_shutdown() it takes 0.37 for each device, but that time
> is spread all over the place, with bigger offenders:
> 
> ixgbe_shutdown
>   __ixgbe_shutdown
> ixgbe_close_suspend
>   ixgbe_down
> ixgbe_init_hw_generic
>   ixgbe_reset_hw_X540
> msleep(100);0.104483472
> ixgbe_get_san_mac_addr_generic  0.048414851
> ixgbe_get_wwn_prefix_generic0.048409893
>   ixgbe_start_hw_X540
> ixgbe_start_hw_generic
>   ixgbe_clear_hw_cntrs_generic  0.048581502
>   ixgbe_setup_fc_generic0.024225800
> 
> All the ixgbe_*generic functions end-up calling:
> ixgbe_read_eerd_X540()
>   ixgbe_acquire_swfw_sync_X540
> usleep_range(5000, 6000);
>   ixgbe_release_swfw_sync_X540
> usleep_range(5000, 6000);
> 
> While these are short sleeps, they end-up calling them over 24 times!
> 24 * 0.0055s = 0.132s. Adding-up to 0.528s for four devices.
> 
> While we should keep optimizing the individual device drivers, in some
> cases this is simply a hardware property that forces a specific delay, and
> we must wait.
> 
> So, the solution for this problem is to shutdown devices in parallel.
> However, we must shutdown children before shutting down parents, so parent
> device must wait for its children to finish.
> 
> With this patch, on the same machine devices_shutdown() takes 1.142s, and
> without mlx4 one second delay only 0.38s
> 
> Signed-off-by: Pavel Tatashin 
> ---
>  drivers/base/core.c | 238 +++-
>  1 file changed, 189 insertions(+), 49 deletions(-)
> 
> diff --git a/drivers/base/core.c b/drivers/base/core.c
> index b610816eb887..f370369a303b 100644
> --- a/drivers/base/core.c
> +++ b/drivers/base/core.c
> @@ -25,6 +25,7 @@
>  #include 
>  #include 
>  #include 
> +#include 
>  
>  #include "base.h"
>  #include "power/power.h"
> @@ -2102,6 +2103,59 @@ const char *device_get_devnode(struct device *dev,
>   return *tmp = s;
>  }
>  
> +/**
> + * device_children_count - device children count
> + * @parent: parent struct device.
> + *
> + * Returns number of children for this device or 0 if none.
> + */
> +static int device_children_count(struct device *parent)
> +{
> + struct klist_iter i;
> + int children = 0;
> +
> + if (!parent->p)
> + return 0;
> +
> + klist_iter_init(>p->klist_children, );
> + while (next_device())
> + children++;
> + klist_iter_exit();
> +
> + return children;
> +}
> +
> +/**
> + * device_get_child_by_index - Return child using the provided index.
> + * @parent: parent struct device.
> + * @index:  Index of the child, where 0 is the first child in the children 
> list,
> + * and so on.
> + *
> + * Returns child or NULL if child with this index is not present.
> + */
> +static struct device *
> +device_get_child_by_index(struct device *parent, int index)
> +{
> + struct klist_iter i;
> + struct device *dev = NULL, *d;
> + int child_index = 0;
> +
> + if (!parent->p || index < 0)
> + return NULL;
> +
> + klist_iter_init(>p->klist_children, );
> + while ((d = next_device()) != NULL) {

perhaps:
while ((d = next_device())) {

> + if (child_index == index) {
> + dev = d;
> + break;
> + }
> + child_index++;
> + }
> + klist_iter_exit();
> +
> + return dev;
> +}
> +
>  /**
>   * device_for_each_child - device child iterator.
>   * @parent: parent struct device.
> @@ -2765,71 

Re: [PATCH 2/2] drivers core: multi-threading device shutdown

2018-05-02 Thread Tobin C. Harding
This code was a pleasure to read, super clean.

On Wed, May 02, 2018 at 11:59:31PM -0400, Pavel Tatashin wrote:
> When system is rebooted, halted or kexeced device_shutdown() is
> called.
> 
> This function shuts down every single device by calling either:
>   dev->bus->shutdown(dev)
>   dev->driver->shutdown(dev)
> 
> Even on a machine just with a moderate amount of devices, device_shutdown()
> may take multiple seconds to complete. Because many devices require a
> specific delays to perform this operation.
> 
> Here is sample analysis of time it takes to call device_shutdown() on
> two socket Intel(R) Xeon(R) CPU E5-2630 v4 @ 2.20GHz machine.
> 
> device_shutdown   2.95s
>  mlx4_shutdown1.14s
>  megasas_shutdown 0.24s
>  ixgbe_shutdown   0.37s x 4 (four ixgbe devices on my machine).
>  the rest 0.09s
> 
> In mlx4 we spent the most time, but that is because there is a 1 second
> sleep:
> mlx4_shutdown
>  mlx4_unload_one
>   mlx4_free_ownership
>msleep(1000)
> 
> With megasas we spend a quarter of a second, but sometimes longer (up to 0.5s)
> in this path:
> 
> megasas_shutdown
>   megasas_flush_cache
> megasas_issue_blocked_cmd
>   wait_event_timeout
> 
> Finally, with ixgbe_shutdown() it takes 0.37 for each device, but that time
> is spread all over the place, with bigger offenders:
> 
> ixgbe_shutdown
>   __ixgbe_shutdown
> ixgbe_close_suspend
>   ixgbe_down
> ixgbe_init_hw_generic
>   ixgbe_reset_hw_X540
> msleep(100);0.104483472
> ixgbe_get_san_mac_addr_generic  0.048414851
> ixgbe_get_wwn_prefix_generic0.048409893
>   ixgbe_start_hw_X540
> ixgbe_start_hw_generic
>   ixgbe_clear_hw_cntrs_generic  0.048581502
>   ixgbe_setup_fc_generic0.024225800
> 
> All the ixgbe_*generic functions end-up calling:
> ixgbe_read_eerd_X540()
>   ixgbe_acquire_swfw_sync_X540
> usleep_range(5000, 6000);
>   ixgbe_release_swfw_sync_X540
> usleep_range(5000, 6000);
> 
> While these are short sleeps, they end-up calling them over 24 times!
> 24 * 0.0055s = 0.132s. Adding-up to 0.528s for four devices.
> 
> While we should keep optimizing the individual device drivers, in some
> cases this is simply a hardware property that forces a specific delay, and
> we must wait.
> 
> So, the solution for this problem is to shutdown devices in parallel.
> However, we must shutdown children before shutting down parents, so parent
> device must wait for its children to finish.
> 
> With this patch, on the same machine devices_shutdown() takes 1.142s, and
> without mlx4 one second delay only 0.38s
> 
> Signed-off-by: Pavel Tatashin 
> ---
>  drivers/base/core.c | 238 +++-
>  1 file changed, 189 insertions(+), 49 deletions(-)
> 
> diff --git a/drivers/base/core.c b/drivers/base/core.c
> index b610816eb887..f370369a303b 100644
> --- a/drivers/base/core.c
> +++ b/drivers/base/core.c
> @@ -25,6 +25,7 @@
>  #include 
>  #include 
>  #include 
> +#include 
>  
>  #include "base.h"
>  #include "power/power.h"
> @@ -2102,6 +2103,59 @@ const char *device_get_devnode(struct device *dev,
>   return *tmp = s;
>  }
>  
> +/**
> + * device_children_count - device children count
> + * @parent: parent struct device.
> + *
> + * Returns number of children for this device or 0 if none.
> + */
> +static int device_children_count(struct device *parent)
> +{
> + struct klist_iter i;
> + int children = 0;
> +
> + if (!parent->p)
> + return 0;
> +
> + klist_iter_init(>p->klist_children, );
> + while (next_device())
> + children++;
> + klist_iter_exit();
> +
> + return children;
> +}
> +
> +/**
> + * device_get_child_by_index - Return child using the provided index.
> + * @parent: parent struct device.
> + * @index:  Index of the child, where 0 is the first child in the children 
> list,
> + * and so on.
> + *
> + * Returns child or NULL if child with this index is not present.
> + */
> +static struct device *
> +device_get_child_by_index(struct device *parent, int index)
> +{
> + struct klist_iter i;
> + struct device *dev = NULL, *d;
> + int child_index = 0;
> +
> + if (!parent->p || index < 0)
> + return NULL;
> +
> + klist_iter_init(>p->klist_children, );
> + while ((d = next_device()) != NULL) {

perhaps:
while ((d = next_device())) {

> + if (child_index == index) {
> + dev = d;
> + break;
> + }
> + child_index++;
> + }
> + klist_iter_exit();
> +
> + return dev;
> +}
> +
>  /**
>   * device_for_each_child - device child iterator.
>   * @parent: parent struct device.
> @@ -2765,71 +2819,157 @@ int 

[PATCH] drm/atomic: Handling the case when setting old crtc for plane

2018-05-02 Thread Satendra Singh Thakur
In the func drm_atomic_set_crtc_for_plane, with the current code,
if crtc of the plane_state and crtc passed as argument to the func
are the same, the entire func will be executed in vain.
It will get state of crtc and clear and set the bits in plane_mask.
All these steps are not required for same old crtc.
Ideally, we should do nothing in this case, this patch handles the same,
and causes the program to return without doing anything in such scenario.

Signed-off-by: Satendra Singh Thakur 
Cc: Madhur Verma 
Cc: Hemanshu Srivastava 
---
 drivers/gpu/drm/drm_atomic.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/drm_atomic.c b/drivers/gpu/drm/drm_atomic.c
index 7d25c42..5bd3365 100644
--- a/drivers/gpu/drm/drm_atomic.c
+++ b/drivers/gpu/drm/drm_atomic.c
@@ -1421,7 +1421,9 @@ drm_atomic_set_crtc_for_plane(struct drm_plane_state 
*plane_state,
 {
struct drm_plane *plane = plane_state->plane;
struct drm_crtc_state *crtc_state;
-
+   /* Nothing to do for same crtc*/
+   if (plane_state->crtc == crtc)
+   return 0;
if (plane_state->crtc) {
crtc_state = drm_atomic_get_crtc_state(plane_state->state,
   plane_state->crtc);
-- 
2.7.4



[PATCH] drm/atomic: Handling the case when setting old crtc for plane

2018-05-02 Thread Satendra Singh Thakur
In the func drm_atomic_set_crtc_for_plane, with the current code,
if the crtc of the plane_state and the crtc passed as argument to the func
are the same, the entire func will be executed in vain.
It will get the state of the crtc and clear and set the bits in plane_mask.
None of these steps is required for the same old crtc.
Ideally, we should do nothing in this case; this patch handles that
and causes the function to return without doing anything in such a scenario.

Signed-off-by: Satendra Singh Thakur 
Cc: Madhur Verma 
Cc: Hemanshu Srivastava 
---
 drivers/gpu/drm/drm_atomic.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/drm_atomic.c b/drivers/gpu/drm/drm_atomic.c
index 7d25c42..5bd3365 100644
--- a/drivers/gpu/drm/drm_atomic.c
+++ b/drivers/gpu/drm/drm_atomic.c
@@ -1421,7 +1421,9 @@ drm_atomic_set_crtc_for_plane(struct drm_plane_state 
*plane_state,
 {
struct drm_plane *plane = plane_state->plane;
struct drm_crtc_state *crtc_state;
-
+   /* Nothing to do for same crtc*/
+   if (plane_state->crtc == crtc)
+   return 0;
if (plane_state->crtc) {
crtc_state = drm_atomic_get_crtc_state(plane_state->state,
   plane_state->crtc);
-- 
2.7.4



[PATCH v2] efi/capsule-loader: Don't output reset log when reset flags are not set

2018-05-02 Thread Shunyong Yang
When the reset flags in the capsule header are not set, the firmware
attempts to immediately process or launch the capsule, so no reset is
needed. The current code nevertheless outputs a log message indicating a reset.

This patch adds a branch to avoid reset log output when the flags are not
set.

Cc: Joey Zheng 
Signed-off-by: Shunyong Yang 
---

Changes in v2:
  *Add EFI_CAPSULE_PERSIST_ACROSS_RESET check according to Ard's
   suggestion.
  
---
 drivers/firmware/efi/capsule-loader.c | 13 +
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/drivers/firmware/efi/capsule-loader.c 
b/drivers/firmware/efi/capsule-loader.c
index e456f4602df1..344785ef8539 100644
--- a/drivers/firmware/efi/capsule-loader.c
+++ b/drivers/firmware/efi/capsule-loader.c
@@ -134,10 +134,15 @@ static ssize_t efi_capsule_submit_update(struct 
capsule_info *cap_info)
 
/* Indicate capsule binary uploading is done */
cap_info->index = NO_FURTHER_WRITE_ACTION;
-   pr_info("Successfully upload capsule file with reboot type '%s'\n",
-   !cap_info->reset_type ? "RESET_COLD" :
-   cap_info->reset_type == 1 ? "RESET_WARM" :
-   "RESET_SHUTDOWN");
+
+   if (cap_info->header.flags & EFI_CAPSULE_PERSIST_ACROSS_RESET)
+   pr_info("Successfully upload capsule file with reboot type 
'%s'\n",
+   !cap_info->reset_type ? "RESET_COLD" :
+   cap_info->reset_type == 1 ? "RESET_WARM" :
+   "RESET_SHUTDOWN");
+   else
+   pr_info("Successfully upload, process and launch capsule 
file\n");
+
return 0;
 }
 
-- 
1.8.3.1



[PATCH v2] efi/capsule-loader: Don't output reset log when reset flags are not set

2018-05-02 Thread Shunyong Yang
When the reset flags in the capsule header are not set, the firmware
attempts to immediately process or launch the capsule, so no reset is
needed. The current code nevertheless outputs a log message indicating a reset.

This patch adds a branch to avoid reset log output when the flags are not
set.

Cc: Joey Zheng 
Signed-off-by: Shunyong Yang 
---

Changes in v2:
  *Add EFI_CAPSULE_PERSIST_ACROSS_RESET check according to Ard's
   suggestion.
  
---
 drivers/firmware/efi/capsule-loader.c | 13 +
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/drivers/firmware/efi/capsule-loader.c 
b/drivers/firmware/efi/capsule-loader.c
index e456f4602df1..344785ef8539 100644
--- a/drivers/firmware/efi/capsule-loader.c
+++ b/drivers/firmware/efi/capsule-loader.c
@@ -134,10 +134,15 @@ static ssize_t efi_capsule_submit_update(struct 
capsule_info *cap_info)
 
/* Indicate capsule binary uploading is done */
cap_info->index = NO_FURTHER_WRITE_ACTION;
-   pr_info("Successfully upload capsule file with reboot type '%s'\n",
-   !cap_info->reset_type ? "RESET_COLD" :
-   cap_info->reset_type == 1 ? "RESET_WARM" :
-   "RESET_SHUTDOWN");
+
+   if (cap_info->header.flags & EFI_CAPSULE_PERSIST_ACROSS_RESET)
+   pr_info("Successfully upload capsule file with reboot type 
'%s'\n",
+   !cap_info->reset_type ? "RESET_COLD" :
+   cap_info->reset_type == 1 ? "RESET_WARM" :
+   "RESET_SHUTDOWN");
+   else
+   pr_info("Successfully upload, process and launch capsule 
file\n");
+
return 0;
 }
 
-- 
1.8.3.1



Re: INFO: rcu detected stall in __schedule

2018-05-02 Thread Tetsuo Handa
I'm not sure whether this is a PPP bug.

As of uptime = 484, RCU says that it stalled for 125 seconds.

--
[  484.407032] INFO: rcu_sched self-detected stall on CPU
[  484.412488]  0-...!: (125000 ticks this GP) idle=f3e/1/4611686018427387906 
softirq=112858/112858 fqs=0 
[  484.422300]   (t=125000 jiffies g=61626 c=61625 q=1534)
[  484.427663] rcu_sched kthread starved for 125000 jiffies! g61626 c61625 f0x0 
RCU_GP_WAIT_FQS(3) ->state=0x402 ->cpu=0
--

484 - 125 = 359, which was about to start SND related fuzzing in that log.

--
2033/05/18 03:36:31 executing program 1:
r0 = socket(0x4a, 0x5, 0x7)
setsockopt$inet_int(r0, 0x0, 0x18, &(0x7f00)=0x200, 0x4)
bind$inet6(r0, &(0x7fc0)={0xa, 0x0, 0x0, @loopback={0x0, 0x1}}, 0x1c)
perf_event_open(&(0x7f40)={0x2, 0x70, 0x3e5}, 0x0, 0x, 
0x, 0x0)
timer_create(0x0, &(0x7f0001c0)={0x0, 0x15, 0x0, @thr={&(0x7f000440), 
&(0x7f000540)}}, &(0x7f000200))
timer_getoverrun(0x0)
perf_event_open(&(0x7f25c000)={0x2, 0x78, 0x3e3}, 0x0, 0x0, 
0x, 0x0)
r1 = syz_open_dev$sndctrl(&(0x7f000200)='/dev/snd/controlC#\x00', 0x2, 0x0)
perf_event_open(&(0x7f001000)={0x0, 0x70, 0x0, 0x0, 0x0, 0x0, 0x0, 0x8ce, 
0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x7, 0x0, 0x0, 0x0, 
0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0xfff8, 0x0, 0x0, 0x0, 0x0, 
0x0, 0x0, 0x0, 0x0, @perf_bp={&(0x7f005000), 0x2}, 0x10c}, 0x0, 
0x0, 0x, 0x0)
ioctl$SNDRV_CTL_IOCTL_SUBSCRIBE_EVENTS(r1, 0xc0045516, &(0x7fc0)=0x1)
r2 = syz_open_dev$sndpcmp(&(0x7f000100)='/dev/snd/pcmC#D#p\x00', 0x1, 
0x4000)
ioctl$SNDRV_SEQ_IOCTL_GET_QUEUE_CLIENT(r2, 0xc04c5349, 
&(0x7f000240)={0x200, 0xfcdc, 0x1})
syz_open_dev$tun(&(0x7f0003c0)='/dev/net/tun\x00', 0x0, 0x20402)
ioctl$SNDRV_CTL_IOCTL_PVERSION(r1, 0xc1105517, &(0x7f001000)=""/250)
ioctl$SNDRV_CTL_IOCTL_SUBSCRIBE_EVENTS(r1, 0xc0045516, &(0x7f00))

2033/05/18 03:36:31 executing program 4:
syz_emit_ethernet(0x3e, &(0x7fc0)={@broadcast=[0xff, 0xff, 0xff, 0xff, 
0xff, 0xff], @empty=[0x0, 0x0, 0xb00], [], {@ipv4={0x800, {{0x5, 
0x4, 0x0, 0x0, 0x30, 0x0, 0x0, 0x0, 0x1, 0x0, @remote={0xac, 0x14, 0x14, 0xbb}, 
@dev={0xac, 0x14, 0x14}}, @icmp=@parameter_prob={0x5, 0x4, 0x0, 0x0, 0x0, 0x0, 
{0x5, 0x4, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, @local={0xac, 0x223, 0x14, 
0xaa}, @dev={0xac, 0x14, 0x14}}}, &(0x7f00)={0x0, 0x2, [0x0, 
0x2e6]})

2033/05/18 03:36:31 executing program 1:
r0 = socket$pppoe(0x18, 0x1, 0x0)
connect$pppoe(r0, &(0x7fc0)={0x18, 0x0, {0x1, @broadcast=[0xff, 0xff, 
0xff, 0xff, 0xff, 0xff], 'ip6_vti0\x00'}}, 0x1e)
r1 = socket(0x3, 0xb, 0x8001)
setsockopt$inet_sctp6_SCTP_ADAPTATION_LAYER(r1, 0x84, 0x7, 
&(0x7f000100)={0x2}, 0x4)
ioctl$sock_inet_SIOCGIFADDR(r0, 0x8915, 
&(0x7f40)={'veth1_to_bridge\x00', {0x2, 0x4e21}})
r2 = syz_open_dev$admmidi(&(0x7f00)='/dev/admmidi#\x00', 0x6, 0x8000)
setsockopt$SO_VM_SOCKETS_BUFFER_MAX_SIZE(r2, 0x28, 0x2, 
&(0x7f80)=0xff00, 0x8)

[  359.306427] snd_virmidi snd_virmidi.0: control 112:0:0:�:0 is already 
present
--


Re: INFO: rcu detected stall in __schedule

2018-05-02 Thread Tetsuo Handa
I'm not sure whether this is a PPP bug.

As of uptime = 484, RCU says that it stalled for 125 seconds.

--
[  484.407032] INFO: rcu_sched self-detected stall on CPU
[  484.412488]  0-...!: (125000 ticks this GP) idle=f3e/1/4611686018427387906 
softirq=112858/112858 fqs=0 
[  484.422300]   (t=125000 jiffies g=61626 c=61625 q=1534)
[  484.427663] rcu_sched kthread starved for 125000 jiffies! g61626 c61625 f0x0 
RCU_GP_WAIT_FQS(3) ->state=0x402 ->cpu=0
--

484 - 125 = 359, which was about to start SND related fuzzing in that log.

--
2033/05/18 03:36:31 executing program 1:
r0 = socket(0x4a, 0x5, 0x7)
setsockopt$inet_int(r0, 0x0, 0x18, &(0x7f00)=0x200, 0x4)
bind$inet6(r0, &(0x7fc0)={0xa, 0x0, 0x0, @loopback={0x0, 0x1}}, 0x1c)
perf_event_open(&(0x7f40)={0x2, 0x70, 0x3e5}, 0x0, 0x, 
0x, 0x0)
timer_create(0x0, &(0x7f0001c0)={0x0, 0x15, 0x0, @thr={&(0x7f000440), 
&(0x7f000540)}}, &(0x7f000200))
timer_getoverrun(0x0)
perf_event_open(&(0x7f25c000)={0x2, 0x78, 0x3e3}, 0x0, 0x0, 
0x, 0x0)
r1 = syz_open_dev$sndctrl(&(0x7f000200)='/dev/snd/controlC#\x00', 0x2, 0x0)
perf_event_open(&(0x7f001000)={0x0, 0x70, 0x0, 0x0, 0x0, 0x0, 0x0, 0x8ce, 
0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x7, 0x0, 0x0, 0x0, 
0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0xfff8, 0x0, 0x0, 0x0, 0x0, 
0x0, 0x0, 0x0, 0x0, @perf_bp={&(0x7f005000), 0x2}, 0x10c}, 0x0, 
0x0, 0x, 0x0)
ioctl$SNDRV_CTL_IOCTL_SUBSCRIBE_EVENTS(r1, 0xc0045516, &(0x7fc0)=0x1)
r2 = syz_open_dev$sndpcmp(&(0x7f000100)='/dev/snd/pcmC#D#p\x00', 0x1, 
0x4000)
ioctl$SNDRV_SEQ_IOCTL_GET_QUEUE_CLIENT(r2, 0xc04c5349, 
&(0x7f000240)={0x200, 0xfcdc, 0x1})
syz_open_dev$tun(&(0x7f0003c0)='/dev/net/tun\x00', 0x0, 0x20402)
ioctl$SNDRV_CTL_IOCTL_PVERSION(r1, 0xc1105517, &(0x7f001000)=""/250)
ioctl$SNDRV_CTL_IOCTL_SUBSCRIBE_EVENTS(r1, 0xc0045516, &(0x7f00))

2033/05/18 03:36:31 executing program 4:
syz_emit_ethernet(0x3e, &(0x7fc0)={@broadcast=[0xff, 0xff, 0xff, 0xff, 
0xff, 0xff], @empty=[0x0, 0x0, 0xb00], [], {@ipv4={0x800, {{0x5, 
0x4, 0x0, 0x0, 0x30, 0x0, 0x0, 0x0, 0x1, 0x0, @remote={0xac, 0x14, 0x14, 0xbb}, 
@dev={0xac, 0x14, 0x14}}, @icmp=@parameter_prob={0x5, 0x4, 0x0, 0x0, 0x0, 0x0, 
{0x5, 0x4, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, @local={0xac, 0x223, 0x14, 
0xaa}, @dev={0xac, 0x14, 0x14}}}, &(0x7f00)={0x0, 0x2, [0x0, 
0x2e6]})

2033/05/18 03:36:31 executing program 1:
r0 = socket$pppoe(0x18, 0x1, 0x0)
connect$pppoe(r0, &(0x7fc0)={0x18, 0x0, {0x1, @broadcast=[0xff, 0xff, 
0xff, 0xff, 0xff, 0xff], 'ip6_vti0\x00'}}, 0x1e)
r1 = socket(0x3, 0xb, 0x8001)
setsockopt$inet_sctp6_SCTP_ADAPTATION_LAYER(r1, 0x84, 0x7, 
&(0x7f000100)={0x2}, 0x4)
ioctl$sock_inet_SIOCGIFADDR(r0, 0x8915, 
&(0x7f40)={'veth1_to_bridge\x00', {0x2, 0x4e21}})
r2 = syz_open_dev$admmidi(&(0x7f00)='/dev/admmidi#\x00', 0x6, 0x8000)
setsockopt$SO_VM_SOCKETS_BUFFER_MAX_SIZE(r2, 0x28, 0x2, 
&(0x7f80)=0xff00, 0x8)

[  359.306427] snd_virmidi snd_virmidi.0: control 112:0:0:�:0 is already 
present
--


Re: [PATCH V3 10/10] ASoC: amd: dma driver changes for bt i2s instance

2018-05-02 Thread Daniel Kurtz
Some checkpatch nits below...

On Tue, May 1, 2018 at 2:53 PM Vijendar Mukunda 
wrote:

> With in ACP, There are three I2S controllers can be
> configured/enabled ( I2S SP, I2S MICSP, I2S BT).
> Default enabled I2S controller instance is I2S SP.
> This patch provides required changes to support I2S BT
> controller Instance.

> Signed-off-by: Vijendar Mukunda 
> ---
> v1->v2: defined i2s instance macros in acp header file
> v2->v3: sqaushed previous patch series and spilt changes
>   into multiple patches (acp dma driver code cleanup
>   patches and bt i2s instance specific changes)
>sound/soc/amd/acp-da7219-max98357a.c |  23 
>sound/soc/amd/acp-pcm-dma.c  | 256
+++
>sound/soc/amd/acp.h  |  40 ++
>3 files changed, 262 insertions(+), 57 deletions(-)

> diff --git a/sound/soc/amd/acp-da7219-max98357a.c
b/sound/soc/amd/acp-da7219-max98357a.c
> index 133139d..b3184ab 100644
> --- a/sound/soc/amd/acp-da7219-max98357a.c
> +++ b/sound/soc/amd/acp-da7219-max98357a.c
> @@ -36,6 +36,7 @@
>#include 
>#include 

> +#include "acp.h"
>#include "../codecs/da7219.h"
>#include "../codecs/da7219-aad.h"

> @@ -44,6 +45,7 @@

>static struct snd_soc_jack cz_jack;
>static struct clk *da7219_dai_clk;
> +extern int bt_pad_enable;

WARNING: externs should be avoided in .c files


>static int cz_da7219_init(struct snd_soc_pcm_runtime *rtd)
>{
> @@ -132,6 +134,9 @@ static const struct snd_pcm_hw_constraint_list
constraints_channels = {
>static int cz_da7219_startup(struct snd_pcm_substream *substream)
>{
>   struct snd_pcm_runtime *runtime = substream->runtime;
> +   struct snd_soc_pcm_runtime *rtd = substream->private_data;
> +   struct snd_soc_card *card = rtd->card;
> +   struct acp_platform_info *machine =
snd_soc_card_get_drvdata(card);

>   /*
>* On this platform for PCM device we support stereo
> @@ -143,6 +148,7 @@ static int cz_da7219_startup(struct snd_pcm_substream
*substream)
>   snd_pcm_hw_constraint_list(runtime, 0, SNDRV_PCM_HW_PARAM_RATE,
>  _rates);

> +   machine->i2s_instance = I2S_BT_INSTANCE;

I'm not a big fan of this approach, but I don't know any other way to tell
a single "platform" driver (acp-pcm-dma) which of two channels (ST/BT) to
use via the pcm_open() callback.

Mark, can you recommend any other way of doing this?

>   return da7219_clk_enable(substream);
>}

> @@ -153,6 +159,11 @@ static void cz_da7219_shutdown(struct
snd_pcm_substream *substream)

>static int cz_max_startup(struct snd_pcm_substream *substream)
>{
> +   struct snd_soc_pcm_runtime *rtd = substream->private_data;
> +   struct snd_soc_card *card = rtd->card;
> +   struct acp_platform_info *machine =
snd_soc_card_get_drvdata(card);
> +
> +   machine->i2s_instance = I2S_SP_INSTANCE;
>   return da7219_clk_enable(substream);
>}

> @@ -163,6 +174,11 @@ static void cz_max_shutdown(struct snd_pcm_substream
*substream)

>static int cz_dmic_startup(struct snd_pcm_substream *substream)
>{
> +   struct snd_soc_pcm_runtime *rtd = substream->private_data;
> +   struct snd_soc_card *card = rtd->card;
> +   struct acp_platform_info *machine =
snd_soc_card_get_drvdata(card);
> +
> +   machine->i2s_instance = I2S_SP_INSTANCE;
>   return da7219_clk_enable(substream);
>}

> @@ -266,10 +282,16 @@ static int cz_probe(struct platform_device *pdev)
>{
>   int ret;
>   struct snd_soc_card *card;
> +   struct acp_platform_info *machine;

> +   machine = devm_kzalloc(>dev, sizeof(struct
acp_platform_info),
> +  GFP_KERNEL);
> +   if (!machine)
> +   return -ENOMEM;
>   card = _card;
>   cz_card.dev = >dev;
>   platform_set_drvdata(pdev, card);
> +   snd_soc_card_set_drvdata(card, machine);
>   ret = devm_snd_soc_register_card(>dev, _card);
>   if (ret) {
>   dev_err(>dev,
> @@ -277,6 +299,7 @@ static int cz_probe(struct platform_device *pdev)
>   cz_card.name, ret);
>   return ret;
>   }
> +   bt_pad_enable = device_property_read_bool(>dev,
"bt-pad-enable");
>   return 0;
>}

> diff --git a/sound/soc/amd/acp-pcm-dma.c b/sound/soc/amd/acp-pcm-dma.c
> index ec9cab3..2ea103a 100644
> --- a/sound/soc/amd/acp-pcm-dma.c
> +++ b/sound/soc/amd/acp-pcm-dma.c
> @@ -37,12 +37,14 @@
>#define MAX_BUFFER (PLAYBACK_MAX_PERIOD_SIZE * PLAYBACK_MAX_NUM_PERIODS)
>#define MIN_BUFFER MAX_BUFFER

> -#define ST_PLAYBACK_MAX_PERIOD_SIZE 8192
> +#define ST_PLAYBACK_MAX_PERIOD_SIZE 4096
>#define ST_CAPTURE_MAX_PERIOD_SIZE  ST_PLAYBACK_MAX_PERIOD_SIZE
>#define ST_MAX_BUFFER (ST_PLAYBACK_MAX_PERIOD_SIZE *

Re: [PATCH V3 10/10] ASoC: amd: dma driver changes for bt i2s instance

2018-05-02 Thread Daniel Kurtz
Some checkpatch nits below...

On Tue, May 1, 2018 at 2:53 PM Vijendar Mukunda 
wrote:

> With in ACP, There are three I2S controllers can be
> configured/enabled ( I2S SP, I2S MICSP, I2S BT).
> Default enabled I2S controller instance is I2S SP.
> This patch provides required changes to support I2S BT
> controller Instance.

> Signed-off-by: Vijendar Mukunda 
> ---
> v1->v2: defined i2s instance macros in acp header file
> v2->v3: sqaushed previous patch series and spilt changes
>   into multiple patches (acp dma driver code cleanup
>   patches and bt i2s instance specific changes)
>sound/soc/amd/acp-da7219-max98357a.c |  23 
>sound/soc/amd/acp-pcm-dma.c  | 256
+++
>sound/soc/amd/acp.h  |  40 ++
>3 files changed, 262 insertions(+), 57 deletions(-)

> diff --git a/sound/soc/amd/acp-da7219-max98357a.c
b/sound/soc/amd/acp-da7219-max98357a.c
> index 133139d..b3184ab 100644
> --- a/sound/soc/amd/acp-da7219-max98357a.c
> +++ b/sound/soc/amd/acp-da7219-max98357a.c
> @@ -36,6 +36,7 @@
>#include 
>#include 

> +#include "acp.h"
>#include "../codecs/da7219.h"
>#include "../codecs/da7219-aad.h"

> @@ -44,6 +45,7 @@

>static struct snd_soc_jack cz_jack;
>static struct clk *da7219_dai_clk;
> +extern int bt_pad_enable;

WARNING: externs should be avoided in .c files


>static int cz_da7219_init(struct snd_soc_pcm_runtime *rtd)
>{
> @@ -132,6 +134,9 @@ static const struct snd_pcm_hw_constraint_list
constraints_channels = {
>static int cz_da7219_startup(struct snd_pcm_substream *substream)
>{
>   struct snd_pcm_runtime *runtime = substream->runtime;
> +   struct snd_soc_pcm_runtime *rtd = substream->private_data;
> +   struct snd_soc_card *card = rtd->card;
> +   struct acp_platform_info *machine =
snd_soc_card_get_drvdata(card);

>   /*
>* On this platform for PCM device we support stereo
> @@ -143,6 +148,7 @@ static int cz_da7219_startup(struct snd_pcm_substream
*substream)
>   snd_pcm_hw_constraint_list(runtime, 0, SNDRV_PCM_HW_PARAM_RATE,
>  _rates);

> +   machine->i2s_instance = I2S_BT_INSTANCE;

I'm not a big fan of this approach, but I don't know any other way to tell
a single "platform" driver (acp-pcm-dma) which of two channels (ST/BT) to
use via the pcm_open() callback.

Mark, can you recommend any other way of doing this?

>   return da7219_clk_enable(substream);
>}

> @@ -153,6 +159,11 @@ static void cz_da7219_shutdown(struct
snd_pcm_substream *substream)

>static int cz_max_startup(struct snd_pcm_substream *substream)
>{
> +   struct snd_soc_pcm_runtime *rtd = substream->private_data;
> +   struct snd_soc_card *card = rtd->card;
> +   struct acp_platform_info *machine =
snd_soc_card_get_drvdata(card);
> +
> +   machine->i2s_instance = I2S_SP_INSTANCE;
>   return da7219_clk_enable(substream);
>}

> @@ -163,6 +174,11 @@ static void cz_max_shutdown(struct snd_pcm_substream
*substream)

>static int cz_dmic_startup(struct snd_pcm_substream *substream)
>{
> +   struct snd_soc_pcm_runtime *rtd = substream->private_data;
> +   struct snd_soc_card *card = rtd->card;
> +   struct acp_platform_info *machine =
snd_soc_card_get_drvdata(card);
> +
> +   machine->i2s_instance = I2S_SP_INSTANCE;
>   return da7219_clk_enable(substream);
>}

> @@ -266,10 +282,16 @@ static int cz_probe(struct platform_device *pdev)
>{
>   int ret;
>   struct snd_soc_card *card;
> +   struct acp_platform_info *machine;

> +   machine = devm_kzalloc(>dev, sizeof(struct
acp_platform_info),
> +  GFP_KERNEL);
> +   if (!machine)
> +   return -ENOMEM;
>   card = _card;
>   cz_card.dev = >dev;
>   platform_set_drvdata(pdev, card);
> +   snd_soc_card_set_drvdata(card, machine);
>   ret = devm_snd_soc_register_card(>dev, _card);
>   if (ret) {
>   dev_err(>dev,
> @@ -277,6 +299,7 @@ static int cz_probe(struct platform_device *pdev)
>   cz_card.name, ret);
>   return ret;
>   }
> +   bt_pad_enable = device_property_read_bool(>dev,
"bt-pad-enable");
>   return 0;
>}

> diff --git a/sound/soc/amd/acp-pcm-dma.c b/sound/soc/amd/acp-pcm-dma.c
> index ec9cab3..2ea103a 100644
> --- a/sound/soc/amd/acp-pcm-dma.c
> +++ b/sound/soc/amd/acp-pcm-dma.c
> @@ -37,12 +37,14 @@
>#define MAX_BUFFER (PLAYBACK_MAX_PERIOD_SIZE * PLAYBACK_MAX_NUM_PERIODS)
>#define MIN_BUFFER MAX_BUFFER

> -#define ST_PLAYBACK_MAX_PERIOD_SIZE 8192
> +#define ST_PLAYBACK_MAX_PERIOD_SIZE 4096
>#define ST_CAPTURE_MAX_PERIOD_SIZE  ST_PLAYBACK_MAX_PERIOD_SIZE
>#define ST_MAX_BUFFER (ST_PLAYBACK_MAX_PERIOD_SIZE *
PLAYBACK_MAX_NUM_PERIODS)
>#define ST_MIN_BUFFER ST_MAX_BUFFER


Re: [PATCH] net/xfrm: Fix lookups for states with spi == 0

2018-05-02 Thread Herbert Xu
On Wed, May 02, 2018 at 01:41:36PM +0100, Dmitry Safonov wrote:
>
> But still it's possible to create ipsec with zero SPI.
> And it seems not making sense to search for a state with SPI hash if
> request has zero SPI.

Fair enough.  In fact a zero SPI is legal and defined for IPcomp.

The bug arose from this patch:

commit 7b4dc3600e4877178ba94c7fbf7e520421378aa6
Author: Masahide NAKAMURA 
Date:   Wed Sep 27 22:21:52 2006 -0700

[XFRM]: Do not add a state whose SPI is zero to the SPI hash.

SPI=0 is used for acquired IPsec SA and MIPv6 RO state.
Such state should not be added to the SPI hash
because we do not care about it on deleting path.

Signed-off-by: Masahide NAKAMURA 
Signed-off-by: YOSHIFUJI Hideaki 

I think it would be better to revert this.

Cheers,
-- 
Email: Herbert Xu 
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt


Re: [PATCH] net/xfrm: Fix lookups for states with spi == 0

2018-05-02 Thread Herbert Xu
On Wed, May 02, 2018 at 01:41:36PM +0100, Dmitry Safonov wrote:
>
> But still it's possible to create ipsec with zero SPI.
> And it seems not making sense to search for a state with SPI hash if
> request has zero SPI.

Fair enough.  In fact a zero SPI is legal and defined for IPcomp.

The bug arose from this patch:

commit 7b4dc3600e4877178ba94c7fbf7e520421378aa6
Author: Masahide NAKAMURA 
Date:   Wed Sep 27 22:21:52 2006 -0700

[XFRM]: Do not add a state whose SPI is zero to the SPI hash.

SPI=0 is used for acquired IPsec SA and MIPv6 RO state.
Such state should not be added to the SPI hash
because we do not care about it on deleting path.

Signed-off-by: Masahide NAKAMURA 
Signed-off-by: YOSHIFUJI Hideaki 

I think it would be better to revert this.

Cheers,
-- 
Email: Herbert Xu 
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt


Re: [PATCH] mtd: rawnand: marvell: pass ms delay to wait_op

2018-05-02 Thread Chris Packham
On 03/05/18 14:21, Chris Packham wrote:
> marvell_nfc_wait_op() expects the delay to be expressed in milliseconds
> but nand_sdr_timings uses picoseconds. Use PSEC_TO_MSEC when passing
> tPROG_max to marvell_nfc_wait_op().
> 
> Fixes: 02f26ecf8c772 ("mtd: nand: add reworked Marvell NAND controller 
> driver")
> Cc: sta...@vger.kernel.org
> Signed-off-by: Chris Packham 
> ---
>   drivers/mtd/nand/raw/marvell_nand.c | 4 ++--
>   1 file changed, 2 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/mtd/nand/raw/marvell_nand.c 
> b/drivers/mtd/nand/raw/marvell_nand.c
> index 1d779a35ac8e..e4b964fd40d8 100644
> --- a/drivers/mtd/nand/raw/marvell_nand.c
> +++ b/drivers/mtd/nand/raw/marvell_nand.c
> @@ -1074,7 +1074,7 @@ static int marvell_nfc_hw_ecc_hmg_do_write_page(struct 
> nand_chip *chip,
>   return ret;
>   
>   ret = marvell_nfc_wait_op(chip,
> -   chip->data_interface.timings.sdr.tPROG_max);
> +   
> PSEC_TO_MSEC(chip->data_interface.timings.sdr.tPROG_max));
>   return ret;
>   }
>   
> @@ -1494,7 +1494,7 @@ static int marvell_nfc_hw_ecc_bch_write_page(struct 
> mtd_info *mtd,
>   }
>   
>   ret = marvell_nfc_wait_op(chip,
> -   chip->data_interface.timings.sdr.tPROG_max);
> +   
> PSEC_TO_MSEC(chip->data_interface.timings.sdr.tPROG_max));
>   
>   marvell_nfc_disable_hw_ecc(chip);
>   

Actually I'm not so sure about this patch. While passing the pico-second 
value for tPROG_max is clearly wrong and leads to seemingly indefinite 
hangs on some systems, converting the times to milliseconds leaves us 
with delays that are far too short.

The old pxa3xx driver had hard coded 200ms delays. These delays now work 
out to 1ms which seems every bit as wrong as 6ms.



Re: [PATCH] mtd: rawnand: marvell: pass ms delay to wait_op

2018-05-02 Thread Chris Packham
On 03/05/18 14:21, Chris Packham wrote:
> marvell_nfc_wait_op() expects the delay to be expressed in milliseconds
> but nand_sdr_timings uses picoseconds. Use PSEC_TO_MSEC when passing
> tPROG_max to marvell_nfc_wait_op().
> 
> Fixes: 02f26ecf8c772 ("mtd: nand: add reworked Marvell NAND controller 
> driver")
> Cc: sta...@vger.kernel.org
> Signed-off-by: Chris Packham 
> ---
>   drivers/mtd/nand/raw/marvell_nand.c | 4 ++--
>   1 file changed, 2 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/mtd/nand/raw/marvell_nand.c 
> b/drivers/mtd/nand/raw/marvell_nand.c
> index 1d779a35ac8e..e4b964fd40d8 100644
> --- a/drivers/mtd/nand/raw/marvell_nand.c
> +++ b/drivers/mtd/nand/raw/marvell_nand.c
> @@ -1074,7 +1074,7 @@ static int marvell_nfc_hw_ecc_hmg_do_write_page(struct 
> nand_chip *chip,
>   return ret;
>   
>   ret = marvell_nfc_wait_op(chip,
> -   chip->data_interface.timings.sdr.tPROG_max);
> +   
> PSEC_TO_MSEC(chip->data_interface.timings.sdr.tPROG_max));
>   return ret;
>   }
>   
> @@ -1494,7 +1494,7 @@ static int marvell_nfc_hw_ecc_bch_write_page(struct 
> mtd_info *mtd,
>   }
>   
>   ret = marvell_nfc_wait_op(chip,
> -   chip->data_interface.timings.sdr.tPROG_max);
> +   
> PSEC_TO_MSEC(chip->data_interface.timings.sdr.tPROG_max));
>   
>   marvell_nfc_disable_hw_ecc(chip);
>   

Actually I'm not so sure about this patch. While passing the pico-second 
value for tPROG_max is clearly wrong and leads to seemingly indefinite 
hangs on some systems, converting the times to milliseconds leaves us 
with delays that are far too short.

The old pxa3xx driver had hard coded 200ms delays. These delays now work 
out to 1ms which seems every bit as wrong as 6ms.



linux-next: Tree for May 3

2018-05-02 Thread Stephen Rothwell
Hi all,

Changes since 20180502:

Removed tree: idr (finished with)

The rockchip tree gained a conflict against the renesas tree.

Non-merge commits (relative to Linus' tree): 3788
 3632 files changed, 146396 insertions(+), 66409 deletions(-)



I have created today's linux-next tree at
git://git.kernel.org/pub/scm/linux/kernel/git/next/linux-next.git
(patches at http://www.kernel.org/pub/linux/kernel/next/ ).  If you
are tracking the linux-next tree using git, you should not use "git pull"
to do so as that will try to merge the new linux-next release with the
old one.  You should use "git fetch" and checkout or reset to the new
master.

You can see which trees have been included by looking in the Next/Trees
file in the source.  There are also quilt-import.log and merge.log
files in the Next directory.  Between each merge, the tree was built
with a ppc64_defconfig for powerpc, an allmodconfig for x86_64, a
multi_v7_defconfig for arm and a native build of tools/perf. After
the final fixups (if any), I do an x86_64 modules_install followed by
builds for x86_64 allnoconfig, powerpc allnoconfig (32 and 64 bit),
ppc44x_defconfig, allyesconfig and pseries_le_defconfig and i386, sparc
and sparc64 defconfig. And finally, a simple boot test of the powerpc
pseries_le_defconfig kernel in qemu (with and without kvm enabled).

Below is a summary of the state of the merge.

I am currently merging 257 trees (counting Linus' and 44 trees of bug
fix patches pending for the current merge release).

Stats about the size of the tree over time can be seen at
http://neuling.org/linux-next-size.html .

Status of my local build tests will be at
http://kisskb.ellerman.id.au/linux-next .  If maintainers want to give
advice about cross compilers/configs that work, we are always open to add
more builds.

Thanks to Randy Dunlap for doing many randconfig builds.  And to Paul
Gortmaker for triage and bug fixes.

-- 
Cheers,
Stephen Rothwell

$ git checkout master
$ git reset --hard stable
Merging origin/master (2d618bdf7163 Merge branch 'for-linus' of 
git://git.kernel.org/pub/scm/linux/kernel/git/rkuo/linux-hexagon-kernel)
Merging fixes/master (147a89bc71e7 Merge tag 'kconfig-v4.17' of 
git://git.kernel.org/pub/scm/linux/kernel/git/masahiroy/linux-kbuild)
Merging kbuild-current/fixes (6d08b06e67cd Linux 4.17-rc2)
Merging arc-current/for-curr (661e50bc8532 Linux 4.16-rc4)
Merging arm-current/fixes (30cfae461581 ARM: replace unnecessary perl with sed 
and the shell $(( )) operator)
Merging arm64-fixes/for-next/fixes (3789c122d0a0 arm64: avoid instrumenting 
atomic_ll_sc.o)
Merging m68k-current/for-linus (ecd685580c8f m68k/mac: Remove bogus "FIXME" 
comment)
Merging powerpc-fixes/fixes (b2d7ecbe3556 powerpc/kvm/booke: Fix altivec 
related build break)
Merging sparc/master (fff75eb2a08c Merge tag 'errseq-v4.17' of 
git://git.kernel.org/pub/scm/linux/kernel/git/jlayton/linux)
Merging fscrypt-current/for-stable (ae64f9bd1d36 Linux 4.15-rc2)
Merging net/master (7df40c2673a1 net_sched: fq: take care of throttled flows 
before reuse)
Merging bpf/master (0f58e58e2803 Merge branch 'x86-bpf-jit-fixes')
Merging ipsec/master (b4331a681822 vti6: Change minimum MTU to IPV4_MIN_MTU, 
vti6 can carry IPv4 too)
Merging netfilter/master (2f99aa31cd7a netfilter: nf_tables: skip 
synchronize_rcu if transaction log is empty)
Merging ipvs/master (765cca91b895 netfilter: conntrack: include kmemleak.h for 
kmemleak_not_leak())
Merging wireless-drivers/master (af8a41cccf8f rtlwifi: cleanup 8723be ant_sel 
definition)
Merging mac80211/master (2f0605a697f4 nl80211: Free connkeys on external 
authentication failure)
Merging rdma-fixes/for-rc (db82476f3741 IB/core: Make ib_mad_client_id atomic)
Merging sound-current/for-linus (f13876e2c33a ALSA: pcm: Check PCM state at 
xfern compat ioctl)
Merging pci-current/for-linus (0cf22d6b317c PCI: Add "PCIe" to 
pcie_print_link_status() messages)
Merging driver-core.current/driver-core-linus (6da6c0db5316 Linux v4.17-rc3)
Merging tty.current/tty-linus (6da6c0db5316 Linux v4.17-rc3)
Merging usb.current/usb-linus (9aea9b6cc78d usb: musb: trace: fix NULL pointer 
dereference in musb_g_tx())
Merging usb-gadget-fixes/fixes (ed769520727e usb: gadget: composite Allow for 
larger configuration descriptors)
Merging usb-serial-fixes/usb-linus (4842ed5bfcb9 USB: serial: visor: handle 
potential invalid device configuration)
Merging usb-chipidea-fixes/ci-for-usb-stable (964728f9f407 USB: chipidea: msm: 
fix ulpi-node lookup)
Merging phy/fixes (60cc43fc8884 Linux 4.17-rc1)
Merging staging.current/staging-linus (6da6c0db5316 Linux v4.17-rc3)
Merging char-misc.current/char-misc-linus (6da6c0db5316 Linux v4.17-rc3)
Merging input-current/for-linus (f6eeb9e54857 Input: atmel_mxt_ts - add missing 
compatible strings to OF device table)
Merging crypto-current/master (eea0d3ea7546 crypto: drbg - set freed buffers to 
NULL)
Mergi

linux-next: Tree for May 3

2018-05-02 Thread Stephen Rothwell
Hi all,

Changes since 20180502:

Removed tree: idr (finished with)

The rockchip tree gained a conflict against the renesas tree.

Non-merge commits (relative to Linus' tree): 3788
 3632 files changed, 146396 insertions(+), 66409 deletions(-)



I have created today's linux-next tree at
git://git.kernel.org/pub/scm/linux/kernel/git/next/linux-next.git
(patches at http://www.kernel.org/pub/linux/kernel/next/ ).  If you
are tracking the linux-next tree using git, you should not use "git pull"
to do so as that will try to merge the new linux-next release with the
old one.  You should use "git fetch" and checkout or reset to the new
master.

You can see which trees have been included by looking in the Next/Trees
file in the source.  There are also quilt-import.log and merge.log
files in the Next directory.  Between each merge, the tree was built
with a ppc64_defconfig for powerpc, an allmodconfig for x86_64, a
multi_v7_defconfig for arm and a native build of tools/perf. After
the final fixups (if any), I do an x86_64 modules_install followed by
builds for x86_64 allnoconfig, powerpc allnoconfig (32 and 64 bit),
ppc44x_defconfig, allyesconfig and pseries_le_defconfig and i386, sparc
and sparc64 defconfig. And finally, a simple boot test of the powerpc
pseries_le_defconfig kernel in qemu (with and without kvm enabled).

Below is a summary of the state of the merge.

I am currently merging 257 trees (counting Linus' and 44 trees of bug
fix patches pending for the current merge release).

Stats about the size of the tree over time can be seen at
http://neuling.org/linux-next-size.html .

Status of my local build tests will be at
http://kisskb.ellerman.id.au/linux-next .  If maintainers want to give
advice about cross compilers/configs that work, we are always open to add
more builds.

Thanks to Randy Dunlap for doing many randconfig builds.  And to Paul
Gortmaker for triage and bug fixes.

-- 
Cheers,
Stephen Rothwell

$ git checkout master
$ git reset --hard stable
Merging origin/master (2d618bdf7163 Merge branch 'for-linus' of 
git://git.kernel.org/pub/scm/linux/kernel/git/rkuo/linux-hexagon-kernel)
Merging fixes/master (147a89bc71e7 Merge tag 'kconfig-v4.17' of 
git://git.kernel.org/pub/scm/linux/kernel/git/masahiroy/linux-kbuild)
Merging kbuild-current/fixes (6d08b06e67cd Linux 4.17-rc2)
Merging arc-current/for-curr (661e50bc8532 Linux 4.16-rc4)
Merging arm-current/fixes (30cfae461581 ARM: replace unnecessary perl with sed 
and the shell $(( )) operator)
Merging arm64-fixes/for-next/fixes (3789c122d0a0 arm64: avoid instrumenting 
atomic_ll_sc.o)
Merging m68k-current/for-linus (ecd685580c8f m68k/mac: Remove bogus "FIXME" 
comment)
Merging powerpc-fixes/fixes (b2d7ecbe3556 powerpc/kvm/booke: Fix altivec 
related build break)
Merging sparc/master (fff75eb2a08c Merge tag 'errseq-v4.17' of 
git://git.kernel.org/pub/scm/linux/kernel/git/jlayton/linux)
Merging fscrypt-current/for-stable (ae64f9bd1d36 Linux 4.15-rc2)
Merging net/master (7df40c2673a1 net_sched: fq: take care of throttled flows 
before reuse)
Merging bpf/master (0f58e58e2803 Merge branch 'x86-bpf-jit-fixes')
Merging ipsec/master (b4331a681822 vti6: Change minimum MTU to IPV4_MIN_MTU, 
vti6 can carry IPv4 too)
Merging netfilter/master (2f99aa31cd7a netfilter: nf_tables: skip 
synchronize_rcu if transaction log is empty)
Merging ipvs/master (765cca91b895 netfilter: conntrack: include kmemleak.h for 
kmemleak_not_leak())
Merging wireless-drivers/master (af8a41cccf8f rtlwifi: cleanup 8723be ant_sel 
definition)
Merging mac80211/master (2f0605a697f4 nl80211: Free connkeys on external 
authentication failure)
Merging rdma-fixes/for-rc (db82476f3741 IB/core: Make ib_mad_client_id atomic)
Merging sound-current/for-linus (f13876e2c33a ALSA: pcm: Check PCM state at 
xfern compat ioctl)
Merging pci-current/for-linus (0cf22d6b317c PCI: Add "PCIe" to 
pcie_print_link_status() messages)
Merging driver-core.current/driver-core-linus (6da6c0db5316 Linux v4.17-rc3)
Merging tty.current/tty-linus (6da6c0db5316 Linux v4.17-rc3)
Merging usb.current/usb-linus (9aea9b6cc78d usb: musb: trace: fix NULL pointer 
dereference in musb_g_tx())
Merging usb-gadget-fixes/fixes (ed769520727e usb: gadget: composite Allow for 
larger configuration descriptors)
Merging usb-serial-fixes/usb-linus (4842ed5bfcb9 USB: serial: visor: handle 
potential invalid device configuration)
Merging usb-chipidea-fixes/ci-for-usb-stable (964728f9f407 USB: chipidea: msm: 
fix ulpi-node lookup)
Merging phy/fixes (60cc43fc8884 Linux 4.17-rc1)
Merging staging.current/staging-linus (6da6c0db5316 Linux v4.17-rc3)
Merging char-misc.current/char-misc-linus (6da6c0db5316 Linux v4.17-rc3)
Merging input-current/for-linus (f6eeb9e54857 Input: atmel_mxt_ts - add missing 
compatible strings to OF device table)
Merging crypto-current/master (eea0d3ea7546 crypto: drbg - set freed buffers to 
NULL)
Mergi

Re: [kernel-team] [PATCH 1/3] staging: Android: vsoc: Create wc kernel mapping for region shm.

2018-05-02 Thread Joel Fernandes
On Wed, May 2, 2018 at 9:45 PM 'Alistair Strachan' via kernel-team <
kernel-t...@android.com> wrote:

> Map the region shm as write-combining instead of uncachable.


I think a more detailed commit message is needed. Why is this done, what does it fix,
etc. It's hard to know what the improvement is without a cover-letter either.

I am assuming the improvement is in performance by using the
write-combining mode.

thanks,

- Joel


Re: [kernel-team] [PATCH 1/3] staging: Android: vsoc: Create wc kernel mapping for region shm.

2018-05-02 Thread Joel Fernandes
On Wed, May 2, 2018 at 9:45 PM 'Alistair Strachan' via kernel-team <
kernel-t...@android.com> wrote:

> Map the region shm as write-combining instead of uncachable.


I think a more detailed commit message is needed. Why is this done, what does it fix,
etc. It's hard to know what the improvement is without a cover-letter either.

I am assuming the improvement is in performance by using the
write-combining mode.

thanks,

- Joel


[PATCH v2 3/9] x86, memcpy_mcsafe: return bytes remaining

2018-05-02 Thread Dan Williams
Machine check safe memory copies are currently deployed in the pmem
driver whenever reading from persistent memory media, so that -EIO is
returned rather than triggering a kernel panic. While this protects most
pmem accesses, it is not complete in the filesystem-dax case. When
filesystem-dax is enabled reads may bypass the block layer and the
driver via dax_iomap_actor() and its usage of copy_to_iter().

In preparation for creating a copy_to_iter() variant that can handle
machine checks, teach memcpy_mcsafe() to return the number of bytes
remaining rather than -EFAULT when an exception occurs.

Cc: 
Cc: Ingo Molnar 
Cc: Borislav Petkov 
Cc: Tony Luck 
Cc: Al Viro 
Cc: Thomas Gleixner 
Cc: Andy Lutomirski 
Cc: Peter Zijlstra 
Cc: Andrew Morton 
Cc: Linus Torvalds 
Co-developed-by: Tony Luck 
Signed-off-by: Dan Williams 
---
 arch/x86/include/asm/string_64.h |8 +---
 arch/x86/lib/memcpy_64.S |   20 ++--
 drivers/nvdimm/claim.c   |3 ++-
 drivers/nvdimm/pmem.c|6 +++---
 include/linux/string.h   |4 ++--
 5 files changed, 26 insertions(+), 15 deletions(-)

diff --git a/arch/x86/include/asm/string_64.h b/arch/x86/include/asm/string_64.h
index 4752f8984923..d33f92b9fa22 100644
--- a/arch/x86/include/asm/string_64.h
+++ b/arch/x86/include/asm/string_64.h
@@ -116,7 +116,8 @@ int strcmp(const char *cs, const char *ct);
 #endif
 
 #define __HAVE_ARCH_MEMCPY_MCSAFE 1
-__must_check int __memcpy_mcsafe(void *dst, const void *src, size_t cnt);
+__must_check unsigned long __memcpy_mcsafe(void *dst, const void *src,
+   size_t cnt);
 DECLARE_STATIC_KEY_FALSE(mcsafe_key);
 
 /**
@@ -131,9 +132,10 @@ DECLARE_STATIC_KEY_FALSE(mcsafe_key);
  * actually do machine check recovery. Everyone else can just
  * use memcpy().
  *
- * Return 0 for success, -EFAULT for fail
+ * Return 0 for success, or number of bytes not copied if there was an
+ * exception.
  */
-static __always_inline __must_check int
+static __always_inline __must_check unsigned long
 memcpy_mcsafe(void *dst, const void *src, size_t cnt)
 {
 #ifdef CONFIG_X86_MCE
diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S
index 5709f3ec22a4..f01a88391c98 100644
--- a/arch/x86/lib/memcpy_64.S
+++ b/arch/x86/lib/memcpy_64.S
@@ -252,14 +252,22 @@ ENDPROC(__memcpy_mcsafe)
 EXPORT_SYMBOL_GPL(__memcpy_mcsafe)
 
.section .fixup, "ax"
-   /* Return -EFAULT for any failure */
-.L_memcpy_mcsafe_fail:
-   mov $-EFAULT, %rax
+   /*
+* Return number of bytes not copied for any failure. Note that
+* there is no "tail" handling since the source buffer is 8-byte
+* aligned and poison is cacheline aligned.
+*/
+.E_read_words:
+   shll$3, %ecx
+.E_leading_bytes:
+   addl%edx, %ecx
+.E_trailing_bytes:
+   mov %ecx, %eax
ret
 
.previous
 
-   _ASM_EXTABLE_FAULT(.L_read_leading_bytes, .L_memcpy_mcsafe_fail)
-   _ASM_EXTABLE_FAULT(.L_read_words, .L_memcpy_mcsafe_fail)
-   _ASM_EXTABLE_FAULT(.L_read_trailing_bytes, .L_memcpy_mcsafe_fail)
+   _ASM_EXTABLE_FAULT(.L_read_leading_bytes, .E_leading_bytes)
+   _ASM_EXTABLE_FAULT(.L_read_words, .E_read_words)
+   _ASM_EXTABLE_FAULT(.L_read_trailing_bytes, .E_trailing_bytes)
 #endif
diff --git a/drivers/nvdimm/claim.c b/drivers/nvdimm/claim.c
index 30852270484f..2e96b34bc936 100644
--- a/drivers/nvdimm/claim.c
+++ b/drivers/nvdimm/claim.c
@@ -276,7 +276,8 @@ static int nsio_rw_bytes(struct nd_namespace_common *ndns,
if (rw == READ) {
if (unlikely(is_bad_pmem(>bb, sector, sz_align)))
return -EIO;
-   return memcpy_mcsafe(buf, nsio->addr + offset, size);
+   if (memcpy_mcsafe(buf, nsio->addr + offset, size) != 0)
+   return -EIO;
}
 
if (unlikely(is_bad_pmem(>bb, sector, sz_align))) {
diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c
index 9d714926ecf5..e023d6aa22b5 100644
--- a/drivers/nvdimm/pmem.c
+++ b/drivers/nvdimm/pmem.c
@@ -101,15 +101,15 @@ static blk_status_t read_pmem(struct page *page, unsigned 
int off,
void *pmem_addr, unsigned int len)
 {
unsigned int chunk;
-   int rc;
+   unsigned long rem;
void *mem;
 
while (len) {
mem = kmap_atomic(page);
chunk = min_t(unsigned int, len, PAGE_SIZE);
-   rc = memcpy_mcsafe(mem + off, pmem_addr, chunk);
+   rem = memcpy_mcsafe(mem + off, pmem_addr, chunk);
kunmap_atomic(mem);
-   if (rc)
+   if (rem)
return BLK_STS_IOERR;
len -= chunk;
 

你可以回复我吗/////// ,

2018-05-02 Thread Hannah Justin



[PATCH v2 3/9] x86, memcpy_mcsafe: return bytes remaining

2018-05-02 Thread Dan Williams
Machine check safe memory copies are currently deployed in the pmem
driver whenever reading from persistent memory media, so that -EIO is
returned rather than triggering a kernel panic. While this protects most
pmem accesses, it is not complete in the filesystem-dax case. When
filesystem-dax is enabled reads may bypass the block layer and the
driver via dax_iomap_actor() and its usage of copy_to_iter().

In preparation for creating a copy_to_iter() variant that can handle
machine checks, teach memcpy_mcsafe() to return the number of bytes
remaining rather than -EFAULT when an exception occurs.

Cc: 
Cc: Ingo Molnar 
Cc: Borislav Petkov 
Cc: Tony Luck 
Cc: Al Viro 
Cc: Thomas Gleixner 
Cc: Andy Lutomirski 
Cc: Peter Zijlstra 
Cc: Andrew Morton 
Cc: Linus Torvalds 
Co-developed-by: Tony Luck 
Signed-off-by: Dan Williams 
---
 arch/x86/include/asm/string_64.h |8 +---
 arch/x86/lib/memcpy_64.S |   20 ++--
 drivers/nvdimm/claim.c   |3 ++-
 drivers/nvdimm/pmem.c|6 +++---
 include/linux/string.h   |4 ++--
 5 files changed, 26 insertions(+), 15 deletions(-)

diff --git a/arch/x86/include/asm/string_64.h b/arch/x86/include/asm/string_64.h
index 4752f8984923..d33f92b9fa22 100644
--- a/arch/x86/include/asm/string_64.h
+++ b/arch/x86/include/asm/string_64.h
@@ -116,7 +116,8 @@ int strcmp(const char *cs, const char *ct);
 #endif
 
 #define __HAVE_ARCH_MEMCPY_MCSAFE 1
-__must_check int __memcpy_mcsafe(void *dst, const void *src, size_t cnt);
+__must_check unsigned long __memcpy_mcsafe(void *dst, const void *src,
+   size_t cnt);
 DECLARE_STATIC_KEY_FALSE(mcsafe_key);
 
 /**
@@ -131,9 +132,10 @@ DECLARE_STATIC_KEY_FALSE(mcsafe_key);
  * actually do machine check recovery. Everyone else can just
  * use memcpy().
  *
- * Return 0 for success, -EFAULT for fail
+ * Return 0 for success, or number of bytes not copied if there was an
+ * exception.
  */
-static __always_inline __must_check int
+static __always_inline __must_check unsigned long
 memcpy_mcsafe(void *dst, const void *src, size_t cnt)
 {
 #ifdef CONFIG_X86_MCE
diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S
index 5709f3ec22a4..f01a88391c98 100644
--- a/arch/x86/lib/memcpy_64.S
+++ b/arch/x86/lib/memcpy_64.S
@@ -252,14 +252,22 @@ ENDPROC(__memcpy_mcsafe)
 EXPORT_SYMBOL_GPL(__memcpy_mcsafe)
 
.section .fixup, "ax"
-   /* Return -EFAULT for any failure */
-.L_memcpy_mcsafe_fail:
-   mov $-EFAULT, %rax
+   /*
+* Return number of bytes not copied for any failure. Note that
+* there is no "tail" handling since the source buffer is 8-byte
+* aligned and poison is cacheline aligned.
+*/
+.E_read_words:
+   shll$3, %ecx
+.E_leading_bytes:
+   addl%edx, %ecx
+.E_trailing_bytes:
+   mov %ecx, %eax
ret
 
.previous
 
-   _ASM_EXTABLE_FAULT(.L_read_leading_bytes, .L_memcpy_mcsafe_fail)
-   _ASM_EXTABLE_FAULT(.L_read_words, .L_memcpy_mcsafe_fail)
-   _ASM_EXTABLE_FAULT(.L_read_trailing_bytes, .L_memcpy_mcsafe_fail)
+   _ASM_EXTABLE_FAULT(.L_read_leading_bytes, .E_leading_bytes)
+   _ASM_EXTABLE_FAULT(.L_read_words, .E_read_words)
+   _ASM_EXTABLE_FAULT(.L_read_trailing_bytes, .E_trailing_bytes)
 #endif
diff --git a/drivers/nvdimm/claim.c b/drivers/nvdimm/claim.c
index 30852270484f..2e96b34bc936 100644
--- a/drivers/nvdimm/claim.c
+++ b/drivers/nvdimm/claim.c
@@ -276,7 +276,8 @@ static int nsio_rw_bytes(struct nd_namespace_common *ndns,
if (rw == READ) {
if (unlikely(is_bad_pmem(>bb, sector, sz_align)))
return -EIO;
-   return memcpy_mcsafe(buf, nsio->addr + offset, size);
+   if (memcpy_mcsafe(buf, nsio->addr + offset, size) != 0)
+   return -EIO;
}
 
if (unlikely(is_bad_pmem(>bb, sector, sz_align))) {
diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c
index 9d714926ecf5..e023d6aa22b5 100644
--- a/drivers/nvdimm/pmem.c
+++ b/drivers/nvdimm/pmem.c
@@ -101,15 +101,15 @@ static blk_status_t read_pmem(struct page *page, unsigned 
int off,
void *pmem_addr, unsigned int len)
 {
unsigned int chunk;
-   int rc;
+   unsigned long rem;
void *mem;
 
while (len) {
mem = kmap_atomic(page);
chunk = min_t(unsigned int, len, PAGE_SIZE);
-   rc = memcpy_mcsafe(mem + off, pmem_addr, chunk);
+   rem = memcpy_mcsafe(mem + off, pmem_addr, chunk);
kunmap_atomic(mem);
-   if (rc)
+   if (rem)
return BLK_STS_IOERR;
len -= chunk;
off = 0;
diff --git a/include/linux/string.h b/include/linux/string.h
index dd39a690c841..4a5a0eb7df51 100644
--- a/include/linux/string.h
+++ b/include/linux/string.h
@@ -147,8 +147,8 @@ extern int memcmp(const void *,const void 

你可以回复我吗/////// ,

2018-05-02 Thread Hannah Justin



[PATCH v2 6/9] dax: introduce a ->copy_to_iter dax operation

2018-05-02 Thread Dan Williams
Similar to the ->copy_from_iter() operation, a platform may want to
deploy an architecture or device specific routine for handling reads
from a dax_device like /dev/pmemX. On x86 this routine will point to a
machine check safe version of copy_to_iter(). For now, add the plumbing
to device-mapper and the dax core.

Cc: Ross Zwisler 
Cc: Mike Snitzer 
Cc: Christoph Hellwig 
Signed-off-by: Dan Williams 
---
 drivers/dax/super.c   |   10 ++
 drivers/md/dm-linear.c|   16 
 drivers/md/dm-log-writes.c|   15 +++
 drivers/md/dm-stripe.c|   21 +
 drivers/md/dm.c   |   25 +
 drivers/nvdimm/pmem.c |7 +++
 drivers/s390/block/dcssblk.c  |7 +++
 fs/dax.c  |3 ++-
 include/linux/dax.h   |5 +
 include/linux/device-mapper.h |5 +++--
 10 files changed, 111 insertions(+), 3 deletions(-)

diff --git a/drivers/dax/super.c b/drivers/dax/super.c
index 2b2332b605e4..31b839113399 100644
--- a/drivers/dax/super.c
+++ b/drivers/dax/super.c
@@ -282,6 +282,16 @@ size_t dax_copy_from_iter(struct dax_device *dax_dev, 
pgoff_t pgoff, void *addr,
 }
 EXPORT_SYMBOL_GPL(dax_copy_from_iter);
 
+size_t dax_copy_to_iter(struct dax_device *dax_dev, pgoff_t pgoff, void *addr,
+   size_t bytes, struct iov_iter *i)
+{
+   if (!dax_alive(dax_dev))
+   return 0;
+
+   return dax_dev->ops->copy_to_iter(dax_dev, pgoff, addr, bytes, i);
+}
+EXPORT_SYMBOL_GPL(dax_copy_to_iter);
+
 #ifdef CONFIG_ARCH_HAS_PMEM_API
 void arch_wb_cache_pmem(void *addr, size_t size);
 void dax_flush(struct dax_device *dax_dev, void *addr, size_t size)
diff --git a/drivers/md/dm-linear.c b/drivers/md/dm-linear.c
index 775c06d953b7..d10964d41fd7 100644
--- a/drivers/md/dm-linear.c
+++ b/drivers/md/dm-linear.c
@@ -185,9 +185,24 @@ static size_t linear_dax_copy_from_iter(struct dm_target 
*ti, pgoff_t pgoff,
return dax_copy_from_iter(dax_dev, pgoff, addr, bytes, i);
 }
 
+static size_t linear_dax_copy_to_iter(struct dm_target *ti, pgoff_t pgoff,
+   void *addr, size_t bytes, struct iov_iter *i)
+{
+   struct linear_c *lc = ti->private;
+   struct block_device *bdev = lc->dev->bdev;
+   struct dax_device *dax_dev = lc->dev->dax_dev;
+   sector_t dev_sector, sector = pgoff * PAGE_SECTORS;
+
+   dev_sector = linear_map_sector(ti, sector);
+   if (bdev_dax_pgoff(bdev, dev_sector, ALIGN(bytes, PAGE_SIZE), ))
+   return 0;
+   return dax_copy_to_iter(dax_dev, pgoff, addr, bytes, i);
+}
+
 #else
 #define linear_dax_direct_access NULL
 #define linear_dax_copy_from_iter NULL
+#define linear_dax_copy_to_iter NULL
 #endif
 
 static struct target_type linear_target = {
@@ -204,6 +219,7 @@ static struct target_type linear_target = {
.iterate_devices = linear_iterate_devices,
.direct_access = linear_dax_direct_access,
.dax_copy_from_iter = linear_dax_copy_from_iter,
+   .dax_copy_to_iter = linear_dax_copy_to_iter,
 };
 
 int __init dm_linear_init(void)
diff --git a/drivers/md/dm-log-writes.c b/drivers/md/dm-log-writes.c
index c90c7c08a77f..9ea2b0291f20 100644
--- a/drivers/md/dm-log-writes.c
+++ b/drivers/md/dm-log-writes.c
@@ -962,9 +962,23 @@ static size_t log_writes_dax_copy_from_iter(struct 
dm_target *ti,
 dax_copy:
return dax_copy_from_iter(lc->dev->dax_dev, pgoff, addr, bytes, i);
 }
+
+static size_t log_writes_dax_copy_to_iter(struct dm_target *ti,
+ pgoff_t pgoff, void *addr, size_t 
bytes,
+ struct iov_iter *i)
+{
+   struct log_writes_c *lc = ti->private;
+   sector_t sector = pgoff * PAGE_SECTORS;
+
+   if (bdev_dax_pgoff(lc->dev->bdev, sector, ALIGN(bytes, PAGE_SIZE), 
))
+   return 0;
+   return dax_copy_to_iter(lc->dev->dax_dev, pgoff, addr, bytes, i);
+}
+
 #else
 #define log_writes_dax_direct_access NULL
 #define log_writes_dax_copy_from_iter NULL
+#define log_writes_dax_copy_to_iter NULL
 #endif
 
 static struct target_type log_writes_target = {
@@ -982,6 +996,7 @@ static struct target_type log_writes_target = {
.io_hints = log_writes_io_hints,
.direct_access = log_writes_dax_direct_access,
.dax_copy_from_iter = log_writes_dax_copy_from_iter,
+   .dax_copy_to_iter = log_writes_dax_copy_to_iter,
 };
 
 static int __init dm_log_writes_init(void)
diff --git a/drivers/md/dm-stripe.c b/drivers/md/dm-stripe.c
index fe7fb9b1aec3..8547d7594338 100644
--- a/drivers/md/dm-stripe.c
+++ b/drivers/md/dm-stripe.c
@@ -354,9 +354,29 @@ static size_t stripe_dax_copy_from_iter(struct dm_target 
*ti, pgoff_t pgoff,
return dax_copy_from_iter(dax_dev, pgoff, addr, bytes, i);
 }
 
+static size_t stripe_dax_copy_to_iter(struct dm_target *ti, pgoff_t pgoff,
+   

[PATCH v2 6/9] dax: introduce a ->copy_to_iter dax operation

2018-05-02 Thread Dan Williams
Similar to the ->copy_from_iter() operation, a platform may want to
deploy an architecture or device specific routine for handling reads
from a dax_device like /dev/pmemX. On x86 this routine will point to a
machine check safe version of copy_to_iter(). For now, add the plumbing
to device-mapper and the dax core.

Cc: Ross Zwisler 
Cc: Mike Snitzer 
Cc: Christoph Hellwig 
Signed-off-by: Dan Williams 
---
 drivers/dax/super.c   |   10 ++
 drivers/md/dm-linear.c|   16 
 drivers/md/dm-log-writes.c|   15 +++
 drivers/md/dm-stripe.c|   21 +
 drivers/md/dm.c   |   25 +
 drivers/nvdimm/pmem.c |7 +++
 drivers/s390/block/dcssblk.c  |7 +++
 fs/dax.c  |3 ++-
 include/linux/dax.h   |5 +
 include/linux/device-mapper.h |5 +++--
 10 files changed, 111 insertions(+), 3 deletions(-)

diff --git a/drivers/dax/super.c b/drivers/dax/super.c
index 2b2332b605e4..31b839113399 100644
--- a/drivers/dax/super.c
+++ b/drivers/dax/super.c
@@ -282,6 +282,16 @@ size_t dax_copy_from_iter(struct dax_device *dax_dev, 
pgoff_t pgoff, void *addr,
 }
 EXPORT_SYMBOL_GPL(dax_copy_from_iter);
 
+size_t dax_copy_to_iter(struct dax_device *dax_dev, pgoff_t pgoff, void *addr,
+   size_t bytes, struct iov_iter *i)
+{
+   if (!dax_alive(dax_dev))
+   return 0;
+
+   return dax_dev->ops->copy_to_iter(dax_dev, pgoff, addr, bytes, i);
+}
+EXPORT_SYMBOL_GPL(dax_copy_to_iter);
+
 #ifdef CONFIG_ARCH_HAS_PMEM_API
 void arch_wb_cache_pmem(void *addr, size_t size);
 void dax_flush(struct dax_device *dax_dev, void *addr, size_t size)
diff --git a/drivers/md/dm-linear.c b/drivers/md/dm-linear.c
index 775c06d953b7..d10964d41fd7 100644
--- a/drivers/md/dm-linear.c
+++ b/drivers/md/dm-linear.c
@@ -185,9 +185,24 @@ static size_t linear_dax_copy_from_iter(struct dm_target 
*ti, pgoff_t pgoff,
return dax_copy_from_iter(dax_dev, pgoff, addr, bytes, i);
 }
 
+static size_t linear_dax_copy_to_iter(struct dm_target *ti, pgoff_t pgoff,
+   void *addr, size_t bytes, struct iov_iter *i)
+{
+   struct linear_c *lc = ti->private;
+   struct block_device *bdev = lc->dev->bdev;
+   struct dax_device *dax_dev = lc->dev->dax_dev;
+   sector_t dev_sector, sector = pgoff * PAGE_SECTORS;
+
+   dev_sector = linear_map_sector(ti, sector);
+   if (bdev_dax_pgoff(bdev, dev_sector, ALIGN(bytes, PAGE_SIZE), ))
+   return 0;
+   return dax_copy_to_iter(dax_dev, pgoff, addr, bytes, i);
+}
+
 #else
 #define linear_dax_direct_access NULL
 #define linear_dax_copy_from_iter NULL
+#define linear_dax_copy_to_iter NULL
 #endif
 
 static struct target_type linear_target = {
@@ -204,6 +219,7 @@ static struct target_type linear_target = {
.iterate_devices = linear_iterate_devices,
.direct_access = linear_dax_direct_access,
.dax_copy_from_iter = linear_dax_copy_from_iter,
+   .dax_copy_to_iter = linear_dax_copy_to_iter,
 };
 
 int __init dm_linear_init(void)
diff --git a/drivers/md/dm-log-writes.c b/drivers/md/dm-log-writes.c
index c90c7c08a77f..9ea2b0291f20 100644
--- a/drivers/md/dm-log-writes.c
+++ b/drivers/md/dm-log-writes.c
@@ -962,9 +962,23 @@ static size_t log_writes_dax_copy_from_iter(struct 
dm_target *ti,
 dax_copy:
return dax_copy_from_iter(lc->dev->dax_dev, pgoff, addr, bytes, i);
 }
+
+static size_t log_writes_dax_copy_to_iter(struct dm_target *ti,
+ pgoff_t pgoff, void *addr, size_t 
bytes,
+ struct iov_iter *i)
+{
+   struct log_writes_c *lc = ti->private;
+   sector_t sector = pgoff * PAGE_SECTORS;
+
+   if (bdev_dax_pgoff(lc->dev->bdev, sector, ALIGN(bytes, PAGE_SIZE), 
))
+   return 0;
+   return dax_copy_to_iter(lc->dev->dax_dev, pgoff, addr, bytes, i);
+}
+
 #else
 #define log_writes_dax_direct_access NULL
 #define log_writes_dax_copy_from_iter NULL
+#define log_writes_dax_copy_to_iter NULL
 #endif
 
 static struct target_type log_writes_target = {
@@ -982,6 +996,7 @@ static struct target_type log_writes_target = {
.io_hints = log_writes_io_hints,
.direct_access = log_writes_dax_direct_access,
.dax_copy_from_iter = log_writes_dax_copy_from_iter,
+   .dax_copy_to_iter = log_writes_dax_copy_to_iter,
 };
 
 static int __init dm_log_writes_init(void)
diff --git a/drivers/md/dm-stripe.c b/drivers/md/dm-stripe.c
index fe7fb9b1aec3..8547d7594338 100644
--- a/drivers/md/dm-stripe.c
+++ b/drivers/md/dm-stripe.c
@@ -354,9 +354,29 @@ static size_t stripe_dax_copy_from_iter(struct dm_target 
*ti, pgoff_t pgoff,
return dax_copy_from_iter(dax_dev, pgoff, addr, bytes, i);
 }
 
+static size_t stripe_dax_copy_to_iter(struct dm_target *ti, pgoff_t pgoff,
+   void *addr, size_t bytes, struct iov_iter *i)
+{
+   sector_t 

[PATCH v2 8/9] pmem: switch to copy_to_iter_mcsafe()

2018-05-02 Thread Dan Williams
Use the machine check safe version of copy_to_iter() for the
->copy_to_iter() operation published by the pmem driver.

Signed-off-by: Dan Williams 
---
 drivers/nvdimm/pmem.c |2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c
index 1b8ab48365de..6d3da8c92868 100644
--- a/drivers/nvdimm/pmem.c
+++ b/drivers/nvdimm/pmem.c
@@ -267,7 +267,7 @@ static size_t pmem_copy_from_iter(struct dax_device 
*dax_dev, pgoff_t pgoff,
 static size_t pmem_copy_to_iter(struct dax_device *dax_dev, pgoff_t pgoff,
void *addr, size_t bytes, struct iov_iter *i)
 {
-   return copy_to_iter(addr, bytes, i);
+   return copy_to_iter_mcsafe(addr, bytes, i);
 }
 
 static const struct dax_operations pmem_dax_ops = {



[PATCH v2 8/9] pmem: switch to copy_to_iter_mcsafe()

2018-05-02 Thread Dan Williams
Use the machine check safe version of copy_to_iter() for the
->copy_to_iter() operation published by the pmem driver.

Signed-off-by: Dan Williams 
---
 drivers/nvdimm/pmem.c |2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c
index 1b8ab48365de..6d3da8c92868 100644
--- a/drivers/nvdimm/pmem.c
+++ b/drivers/nvdimm/pmem.c
@@ -267,7 +267,7 @@ static size_t pmem_copy_from_iter(struct dax_device 
*dax_dev, pgoff_t pgoff,
 static size_t pmem_copy_to_iter(struct dax_device *dax_dev, pgoff_t pgoff,
void *addr, size_t bytes, struct iov_iter *i)
 {
-   return copy_to_iter(addr, bytes, i);
+   return copy_to_iter_mcsafe(addr, bytes, i);
 }
 
 static const struct dax_operations pmem_dax_ops = {



[PATCH v2 9/9] x86, nfit_test: unit test for memcpy_mcsafe()

2018-05-02 Thread Dan Williams
Given the fact that the ACPI "EINJ" (error injection) facility is not
universally available, implement software infrastructure to validate the
memcpy_mcsafe() exception handling implementation.

For each potential read exception point in memcpy_mcsafe(), inject an
emulated exception point at the address identified by the 'mcsafe_inject'
variable. With this infrastructure implement a test to validate that the
'bytes remaining' calculation is correct for a range of various source
buffer alignments.

This code is compiled out by default. The CONFIG_MCSAFE_DEBUG
configuration symbol needs to be manually enabled by editing
Kconfig.debug. I.e. this functionality can not be accidentally enabled
by a user / distro, it's only for development.

Cc: 
Cc: Ingo Molnar 
Cc: Borislav Petkov 
Cc: Tony Luck 
Cc: Al Viro 
Cc: Thomas Gleixner 
Cc: Andy Lutomirski 
Cc: Peter Zijlstra 
Cc: Andrew Morton 
Cc: Linus Torvalds 
Reported-by: Tony Luck 
Signed-off-by: Dan Williams 
---
 arch/x86/Kconfig.debug  |3 ++
 arch/x86/include/asm/mcsafe_debug.h |   50 +++
 arch/x86/lib/memcpy_64.S|7 +
 tools/testing/nvdimm/test/nfit.c|   48 ++
 4 files changed, 108 insertions(+)
 create mode 100644 arch/x86/include/asm/mcsafe_debug.h

diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug
index 192e4d2f9efc..8bdec78a405f 100644
--- a/arch/x86/Kconfig.debug
+++ b/arch/x86/Kconfig.debug
@@ -72,6 +72,9 @@ config EARLY_PRINTK_USB_XDBC
  You should normally say N here, unless you want to debug early
  crashes or need a very simple printk logging facility.
 
+config MCSAFE_DEBUG
+   def_bool n
+
 config X86_PTDUMP_CORE
def_bool n
 
diff --git a/arch/x86/include/asm/mcsafe_debug.h 
b/arch/x86/include/asm/mcsafe_debug.h
new file mode 100644
index ..0f85d24b46c5
--- /dev/null
+++ b/arch/x86/include/asm/mcsafe_debug.h
@@ -0,0 +1,50 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _MCSAFE_DEBUG_H_
+#define _MCSAFE_DEBUG_H_
+
+#ifndef __ASSEMBLY__
+#ifdef CONFIG_MCSAFE_DEBUG
+extern unsigned long mcsafe_inject;
+
+static inline void set_mcsafe_inject(void *addr)
+{
+   if (addr)
+   mcsafe_inject = (unsigned long) addr;
+   else
+   mcsafe_inject = ~0UL;
+}
+#else /* CONFIG_MCSAFE_DEBUG */
+static inline void set_mcsafe_inject(void *addr)
+{
+}
+#endif /* CONFIG_MCSAFE_DEBUG */
+
+#else /* __ASSEMBLY__ */
+#include 
+
+#ifdef CONFIG_MCSAFE_DEBUG
+.macro MCSAFE_DEBUG_CTL
+   .pushsection .data
+   .align 8
+   .globl mcsafe_inject
+   mcsafe_inject:
+   .quad 0
+   EXPORT_SYMBOL_GPL(mcsafe_inject)
+   .popsection
+.endm
+
+.macro MCSAFE_DEBUG offset reg count target
+   leaq \offset(\reg), %r9
+   addq \count, %r9
+   cmp mcsafe_inject, %r9
+   jg \target
+.endm
+#else
+.macro MCSAFE_DEBUG_CTL
+.endm
+
+.macro MCSAFE_DEBUG offset reg count target
+.endm
+#endif /* CONFIG_MCSAFE_DEBUG */
+#endif /* __ASSEMBLY__ */
+#endif /* _MCSAFE_DEBUG_H_ */
diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S
index c3b527a9f95d..e5f489b2c6ea 100644
--- a/arch/x86/lib/memcpy_64.S
+++ b/arch/x86/lib/memcpy_64.S
@@ -3,6 +3,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 
@@ -183,6 +184,9 @@ ENTRY(memcpy_orig)
 ENDPROC(memcpy_orig)
 
 #ifndef CONFIG_UML
+
+MCSAFE_DEBUG_CTL
+
 /*
  * __memcpy_mcsafe - memory copy with machine check exception handling
  * Note that we only catch machine checks when reading the source addresses.
@@ -205,6 +209,7 @@ ENTRY(__memcpy_mcsafe)
negl %ecx
subl %ecx, %edx
 .L_read_leading_bytes:
+   MCSAFE_DEBUG 0 %rsi $1 .E_leading_bytes
movb (%rsi), %al
 .L_write_leading_bytes:
movb %al, (%rdi)
@@ -220,6 +225,7 @@ ENTRY(__memcpy_mcsafe)
jz .L_no_whole_words
 
 .L_read_words:
+   MCSAFE_DEBUG 0 %rsi $8 .E_read_words
movq (%rsi), %r8
 .L_write_words:
movq %r8, (%rdi)
@@ -236,6 +242,7 @@ ENTRY(__memcpy_mcsafe)
/* Copy trailing bytes */
movl %edx, %ecx
 .L_read_trailing_bytes:
+   MCSAFE_DEBUG 0 %rsi $1 .E_trailing_bytes
movb (%rsi), %al
 .L_write_trailing_bytes:
movb %al, (%rdi)
diff --git a/tools/testing/nvdimm/test/nfit.c b/tools/testing/nvdimm/test/nfit.c
index 4ea385be528f..db04ff658971 100644
--- a/tools/testing/nvdimm/test/nfit.c
+++ b/tools/testing/nvdimm/test/nfit.c
@@ -29,6 +29,8 @@
 #include "nfit_test.h"
 #include "../watermark.h"
 
+#include 
+
 /*
  * Generate an NFIT table to describe the following topology:
  *
@@ -2681,6 +2683,51 @@ static struct platform_driver nfit_test_driver = {
.id_table = 

[PATCH v2 9/9] x86, nfit_test: unit test for memcpy_mcsafe()

2018-05-02 Thread Dan Williams
Given the fact that the ACPI "EINJ" (error injection) facility is not
universally available, implement software infrastructure to validate the
memcpy_mcsafe() exception handling implementation.

For each potential read exception point in memcpy_mcsafe(), inject an
emulated exception point at the address identified by the 'mcsafe_inject'
variable. With this infrastructure implement a test to validate that the
'bytes remaining' calculation is correct for a range of various source
buffer alignments.

This code is compiled out by default. The CONFIG_MCSAFE_DEBUG
configuration symbol needs to be manually enabled by editing
Kconfig.debug. I.e. this functionality can not be accidentally enabled
by a user / distro, it's only for development.

Cc: 
Cc: Ingo Molnar 
Cc: Borislav Petkov 
Cc: Tony Luck 
Cc: Al Viro 
Cc: Thomas Gleixner 
Cc: Andy Lutomirski 
Cc: Peter Zijlstra 
Cc: Andrew Morton 
Cc: Linus Torvalds 
Reported-by: Tony Luck 
Signed-off-by: Dan Williams 
---
 arch/x86/Kconfig.debug  |3 ++
 arch/x86/include/asm/mcsafe_debug.h |   50 +++
 arch/x86/lib/memcpy_64.S|7 +
 tools/testing/nvdimm/test/nfit.c|   48 ++
 4 files changed, 108 insertions(+)
 create mode 100644 arch/x86/include/asm/mcsafe_debug.h

diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug
index 192e4d2f9efc..8bdec78a405f 100644
--- a/arch/x86/Kconfig.debug
+++ b/arch/x86/Kconfig.debug
@@ -72,6 +72,9 @@ config EARLY_PRINTK_USB_XDBC
  You should normally say N here, unless you want to debug early
  crashes or need a very simple printk logging facility.
 
+config MCSAFE_DEBUG
+   def_bool n
+
 config X86_PTDUMP_CORE
def_bool n
 
diff --git a/arch/x86/include/asm/mcsafe_debug.h 
b/arch/x86/include/asm/mcsafe_debug.h
new file mode 100644
index ..0f85d24b46c5
--- /dev/null
+++ b/arch/x86/include/asm/mcsafe_debug.h
@@ -0,0 +1,50 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _MCSAFE_DEBUG_H_
+#define _MCSAFE_DEBUG_H_
+
+#ifndef __ASSEMBLY__
+#ifdef CONFIG_MCSAFE_DEBUG
+extern unsigned long mcsafe_inject;
+
+static inline void set_mcsafe_inject(void *addr)
+{
+   if (addr)
+   mcsafe_inject = (unsigned long) addr;
+   else
+   mcsafe_inject = ~0UL;
+}
+#else /* CONFIG_MCSAFE_DEBUG */
+static inline void set_mcsafe_inject(void *addr)
+{
+}
+#endif /* CONFIG_MCSAFE_DEBUG */
+
+#else /* __ASSEMBLY__ */
+#include 
+
+#ifdef CONFIG_MCSAFE_DEBUG
+.macro MCSAFE_DEBUG_CTL
+   .pushsection .data
+   .align 8
+   .globl mcsafe_inject
+   mcsafe_inject:
+   .quad 0
+   EXPORT_SYMBOL_GPL(mcsafe_inject)
+   .popsection
+.endm
+
+.macro MCSAFE_DEBUG offset reg count target
+   leaq \offset(\reg), %r9
+   addq \count, %r9
+   cmp mcsafe_inject, %r9
+   jg \target
+.endm
+#else
+.macro MCSAFE_DEBUG_CTL
+.endm
+
+.macro MCSAFE_DEBUG offset reg count target
+.endm
+#endif /* CONFIG_MCSAFE_DEBUG */
+#endif /* __ASSEMBLY__ */
+#endif /* _MCSAFE_DEBUG_H_ */
diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S
index c3b527a9f95d..e5f489b2c6ea 100644
--- a/arch/x86/lib/memcpy_64.S
+++ b/arch/x86/lib/memcpy_64.S
@@ -3,6 +3,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 
@@ -183,6 +184,9 @@ ENTRY(memcpy_orig)
 ENDPROC(memcpy_orig)
 
 #ifndef CONFIG_UML
+
+MCSAFE_DEBUG_CTL
+
 /*
  * __memcpy_mcsafe - memory copy with machine check exception handling
  * Note that we only catch machine checks when reading the source addresses.
@@ -205,6 +209,7 @@ ENTRY(__memcpy_mcsafe)
negl %ecx
subl %ecx, %edx
 .L_read_leading_bytes:
+   MCSAFE_DEBUG 0 %rsi $1 .E_leading_bytes
movb (%rsi), %al
 .L_write_leading_bytes:
movb %al, (%rdi)
@@ -220,6 +225,7 @@ ENTRY(__memcpy_mcsafe)
jz .L_no_whole_words
 
 .L_read_words:
+   MCSAFE_DEBUG 0 %rsi $8 .E_read_words
movq (%rsi), %r8
 .L_write_words:
movq %r8, (%rdi)
@@ -236,6 +242,7 @@ ENTRY(__memcpy_mcsafe)
/* Copy trailing bytes */
movl %edx, %ecx
 .L_read_trailing_bytes:
+   MCSAFE_DEBUG 0 %rsi $1 .E_trailing_bytes
movb (%rsi), %al
 .L_write_trailing_bytes:
movb %al, (%rdi)
diff --git a/tools/testing/nvdimm/test/nfit.c b/tools/testing/nvdimm/test/nfit.c
index 4ea385be528f..db04ff658971 100644
--- a/tools/testing/nvdimm/test/nfit.c
+++ b/tools/testing/nvdimm/test/nfit.c
@@ -29,6 +29,8 @@
 #include "nfit_test.h"
 #include "../watermark.h"
 
+#include 
+
 /*
  * Generate an NFIT table to describe the following topology:
  *
@@ -2681,6 +2683,51 @@ static struct platform_driver nfit_test_driver = {
.id_table = nfit_test_id,
 };
 
+static char mcsafe_buf[PAGE_SIZE] __attribute__((__aligned__(PAGE_SIZE)));
+
+void mcsafe_test(void)
+{
+   bool do_inject = false;
+   int i;
+
+   if (IS_ENABLED(CONFIG_MCSAFE_DEBUG)) {
+   pr_info("%s: run...\n", __func__);
+

Re: [PATCH v2 4/4] vsprintf: Add command line option debug_early_boot

2018-05-02 Thread Tobin C. Harding
On Wed, May 02, 2018 at 09:57:57PM -0700, Kees Cook wrote:
> On Wed, May 2, 2018 at 3:50 PM, Tobin C. Harding  wrote:
> > Currently printing [hashed] pointers requires either a hw RNG or enough
> > entropy to be available.  Early in the boot sequence these conditions
> > may not be met resulting in a dummy string '(ptrval)' being
> > printed.  This makes debugging the early boot sequence difficult.  We
> > can relax the requirement to use cryptographically secure hashing during
> > debugging.  This enables debugging while keeping development/production
> > kernel behaviour the same.
> >
> > If new command line option debug_early_boot is enabled use
> > cryptographically insecure hashing and hash pointer value immediately.
> >
> > Signed-off-by: Tobin C. Harding 
> > ---
> >  Documentation/admin-guide/kernel-parameters.txt |  8 
> >  lib/vsprintf.c  | 18 ++
> >  2 files changed, 26 insertions(+)
> >
> > diff --git a/Documentation/admin-guide/kernel-parameters.txt 
> > b/Documentation/admin-guide/kernel-parameters.txt
> > index b8d1379aa039..ab619c4ccbf2 100644
> > --- a/Documentation/admin-guide/kernel-parameters.txt
> > +++ b/Documentation/admin-guide/kernel-parameters.txt
> > @@ -748,6 +748,14 @@
> >
> > debug   [KNL] Enable kernel debugging (events log level).
> >
> > +   debug_early_boot
> > +   [KNL] Enable debugging early in the boot sequence.  
> > If
> > +   enabled, we use a weak hash instead of siphash to 
> > hash
> > +   pointers.  Use this option if you need to see 
> > pointer
> > +   values during early boot (i.e you are seeing 
> > instances
> > +   of '(___ptrval___)') - cryptographically insecure,
> > +   please do not use on production kernels.
> > +
> > debug_locks_verbose=
> > [KNL] verbose self-tests
> > Format=<0|1>
> > diff --git a/lib/vsprintf.c b/lib/vsprintf.c
> > index 3697a19c2b25..6c139b442267 100644
> > --- a/lib/vsprintf.c
> > +++ b/lib/vsprintf.c
> > @@ -1654,6 +1654,18 @@ char *device_node_string(char *buf, char *end, 
> > struct device_node *dn,
> > return widen_string(buf, buf - buf_start, end, spec);
> >  }
> >
> > +/* Make pointers available for printing early in the boot sequence. */
> > +static int debug_early_boot;
> 
> Please make this __ro_after_init too.

Good suggestion.  I forgot, we are supposed to be closing security
holes, not opening them :)

thanks,
Tobin.


[PATCH v2 7/9] dax: report bytes remaining in dax_iomap_actor()

2018-05-02 Thread Dan Williams
In preparation for protecting the dax read(2) path from media errors
with copy_to_iter_mcsafe() (via dax_copy_to_iter()), convert the
implementation to report the bytes successfully transferred.

Cc: 
Cc: Ingo Molnar 
Cc: Borislav Petkov 
Cc: Tony Luck 
Cc: Al Viro 
Cc: Thomas Gleixner 
Cc: Andy Lutomirski 
Cc: Peter Zijlstra 
Cc: Andrew Morton 
Cc: Linus Torvalds 
Signed-off-by: Dan Williams 
---
 fs/dax.c |   20 +++-
 1 file changed, 11 insertions(+), 9 deletions(-)

diff --git a/fs/dax.c b/fs/dax.c
index a64afdf7ec0d..34a2d435ae4b 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -991,6 +991,7 @@ dax_iomap_actor(struct inode *inode, loff_t pos, loff_t 
length, void *data,
struct iov_iter *iter = data;
loff_t end = pos + length, done = 0;
ssize_t ret = 0;
+   size_t xfer;
int id;
 
if (iov_iter_rw(iter) == READ) {
@@ -1054,19 +1055,20 @@ dax_iomap_actor(struct inode *inode, loff_t pos, loff_t 
length, void *data,
 * vfs_write(), depending on which operation we are doing.
 */
if (iov_iter_rw(iter) == WRITE)
-   map_len = dax_copy_from_iter(dax_dev, pgoff, kaddr,
+   xfer = dax_copy_from_iter(dax_dev, pgoff, kaddr,
map_len, iter);
else
-   map_len = dax_copy_to_iter(dax_dev, pgoff, kaddr,
+   xfer = dax_copy_to_iter(dax_dev, pgoff, kaddr,
map_len, iter);
-   if (map_len <= 0) {
-   ret = map_len ? map_len : -EFAULT;
-   break;
-   }
 
-   pos += map_len;
-   length -= map_len;
-   done += map_len;
+   pos += xfer;
+   length -= xfer;
+   done += xfer;
+
+   if (xfer == 0)
+   ret = -EFAULT;
+   if (xfer < map_len)
+   break;
}
dax_read_unlock(id);
 



Re: [PATCH v2 4/4] vsprintf: Add command line option debug_early_boot

2018-05-02 Thread Tobin C. Harding
On Wed, May 02, 2018 at 09:57:57PM -0700, Kees Cook wrote:
> On Wed, May 2, 2018 at 3:50 PM, Tobin C. Harding  wrote:
> > Currently printing [hashed] pointers requires either a hw RNG or enough
> > entropy to be available.  Early in the boot sequence these conditions
> > may not be met resulting in a dummy string '(ptrval)' being
> > printed.  This makes debugging the early boot sequence difficult.  We
> > can relax the requirement to use cryptographically secure hashing during
> > debugging.  This enables debugging while keeping development/production
> > kernel behaviour the same.
> >
> > If new command line option debug_early_boot is enabled use
> > cryptographically insecure hashing and hash pointer value immediately.
> >
> > Signed-off-by: Tobin C. Harding 
> > ---
> >  Documentation/admin-guide/kernel-parameters.txt |  8 
> >  lib/vsprintf.c  | 18 ++
> >  2 files changed, 26 insertions(+)
> >
> > diff --git a/Documentation/admin-guide/kernel-parameters.txt 
> > b/Documentation/admin-guide/kernel-parameters.txt
> > index b8d1379aa039..ab619c4ccbf2 100644
> > --- a/Documentation/admin-guide/kernel-parameters.txt
> > +++ b/Documentation/admin-guide/kernel-parameters.txt
> > @@ -748,6 +748,14 @@
> >
> > debug   [KNL] Enable kernel debugging (events log level).
> >
> > +   debug_early_boot
> > +   [KNL] Enable debugging early in the boot sequence.  
> > If
> > +   enabled, we use a weak hash instead of siphash to 
> > hash
> > +   pointers.  Use this option if you need to see 
> > pointer
> > +   values during early boot (i.e you are seeing 
> > instances
> > +   of '(___ptrval___)') - cryptographically insecure,
> > +   please do not use on production kernels.
> > +
> > debug_locks_verbose=
> > [KNL] verbose self-tests
> > Format=<0|1>
> > diff --git a/lib/vsprintf.c b/lib/vsprintf.c
> > index 3697a19c2b25..6c139b442267 100644
> > --- a/lib/vsprintf.c
> > +++ b/lib/vsprintf.c
> > @@ -1654,6 +1654,18 @@ char *device_node_string(char *buf, char *end, 
> > struct device_node *dn,
> > return widen_string(buf, buf - buf_start, end, spec);
> >  }
> >
> > +/* Make pointers available for printing early in the boot sequence. */
> > +static int debug_early_boot;
> 
> Please make this __ro_after_init too.

Good suggestion.  I forgot, we are supposed to be closing security
holes, not opening them :)

thanks,
Tobin.


[PATCH v2 7/9] dax: report bytes remaining in dax_iomap_actor()

2018-05-02 Thread Dan Williams
In preparation for protecting the dax read(2) path from media errors
with copy_to_iter_mcsafe() (via dax_copy_to_iter()), convert the
implementation to report the bytes successfully transferred.

Cc: 
Cc: Ingo Molnar 
Cc: Borislav Petkov 
Cc: Tony Luck 
Cc: Al Viro 
Cc: Thomas Gleixner 
Cc: Andy Lutomirski 
Cc: Peter Zijlstra 
Cc: Andrew Morton 
Cc: Linus Torvalds 
Signed-off-by: Dan Williams 
---
 fs/dax.c |   20 +++-
 1 file changed, 11 insertions(+), 9 deletions(-)

diff --git a/fs/dax.c b/fs/dax.c
index a64afdf7ec0d..34a2d435ae4b 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -991,6 +991,7 @@ dax_iomap_actor(struct inode *inode, loff_t pos, loff_t 
length, void *data,
struct iov_iter *iter = data;
loff_t end = pos + length, done = 0;
ssize_t ret = 0;
+   size_t xfer;
int id;
 
if (iov_iter_rw(iter) == READ) {
@@ -1054,19 +1055,20 @@ dax_iomap_actor(struct inode *inode, loff_t pos, loff_t 
length, void *data,
 * vfs_write(), depending on which operation we are doing.
 */
if (iov_iter_rw(iter) == WRITE)
-   map_len = dax_copy_from_iter(dax_dev, pgoff, kaddr,
+   xfer = dax_copy_from_iter(dax_dev, pgoff, kaddr,
map_len, iter);
else
-   map_len = dax_copy_to_iter(dax_dev, pgoff, kaddr,
+   xfer = dax_copy_to_iter(dax_dev, pgoff, kaddr,
map_len, iter);
-   if (map_len <= 0) {
-   ret = map_len ? map_len : -EFAULT;
-   break;
-   }
 
-   pos += map_len;
-   length -= map_len;
-   done += map_len;
+   pos += xfer;
+   length -= xfer;
+   done += xfer;
+
+   if (xfer == 0)
+   ret = -EFAULT;
+   if (xfer < map_len)
+   break;
}
dax_read_unlock(id);
 



[PATCH v2 4/9] x86, memcpy_mcsafe: add write-protection-fault handling

2018-05-02 Thread Dan Williams
In preparation for using memcpy_mcsafe() to handle user copies it needs
to be able to handle write-protection faults while writing user pages. Add
MMU-fault handlers alongside the machine-check exception handlers.

Note that the machine check fault exception handling makes assumptions
about source buffer alignment and poison alignment. In the write fault
case, given the destination buffer is arbitrarily aligned, it needs a
separate / additional fault handling approach. The mcsafe_handle_tail()
helper is reused. The @limit argument is set to @len since there is no
safety concern about retriggering an MMU fault, and this simplifies the
assembly.

Cc: 
Cc: Ingo Molnar 
Cc: Borislav Petkov 
Cc: Tony Luck 
Cc: Al Viro 
Cc: Thomas Gleixner 
Cc: Andy Lutomirski 
Cc: Peter Zijlstra 
Cc: Andrew Morton 
Cc: Linus Torvalds 
Co-developed-by: Tony Luck 
Signed-off-by: Dan Williams 
---
 arch/x86/include/asm/uaccess_64.h |3 +++
 arch/x86/lib/memcpy_64.S  |   14 ++
 arch/x86/lib/usercopy_64.c|   17 +
 3 files changed, 34 insertions(+)

diff --git a/arch/x86/include/asm/uaccess_64.h 
b/arch/x86/include/asm/uaccess_64.h
index 62546b3a398e..c63efc07891f 100644
--- a/arch/x86/include/asm/uaccess_64.h
+++ b/arch/x86/include/asm/uaccess_64.h
@@ -194,4 +194,7 @@ __copy_from_user_flushcache(void *dst, const void __user 
*src, unsigned size)
 unsigned long
 copy_user_handle_tail(char *to, char *from, unsigned len);
 
+unsigned long
+mcsafe_handle_tail(char *to, char *from, unsigned len);
+
 #endif /* _ASM_X86_UACCESS_64_H */
diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S
index f01a88391c98..c3b527a9f95d 100644
--- a/arch/x86/lib/memcpy_64.S
+++ b/arch/x86/lib/memcpy_64.S
@@ -265,9 +265,23 @@ EXPORT_SYMBOL_GPL(__memcpy_mcsafe)
mov %ecx, %eax
ret
 
+   /*
+* For write fault handling, given the destination is unaligned,
+* we handle faults on multi-byte writes with a byte-by-byte
+* copy up to the write-protected page.
+*/
+.E_write_words:
+   shll$3, %ecx
+   addl%edx, %ecx
+   movl%ecx, %edx
+   jmp mcsafe_handle_tail
+
.previous
 
_ASM_EXTABLE_FAULT(.L_read_leading_bytes, .E_leading_bytes)
_ASM_EXTABLE_FAULT(.L_read_words, .E_read_words)
_ASM_EXTABLE_FAULT(.L_read_trailing_bytes, .E_trailing_bytes)
+   _ASM_EXTABLE(.L_write_leading_bytes, .E_leading_bytes)
+   _ASM_EXTABLE(.L_write_words, .E_write_words)
+   _ASM_EXTABLE(.L_write_trailing_bytes, .E_trailing_bytes)
 #endif
diff --git a/arch/x86/lib/usercopy_64.c b/arch/x86/lib/usercopy_64.c
index 75d3776123cc..9787f5ee0cf9 100644
--- a/arch/x86/lib/usercopy_64.c
+++ b/arch/x86/lib/usercopy_64.c
@@ -75,6 +75,23 @@ copy_user_handle_tail(char *to, char *from, unsigned len)
return len;
 }
 
+/*
+ * Similar to copy_user_handle_tail, probe for the write fault point,
+ * but reuse __memcpy_mcsafe in case a new read error is encountered.
+ * clac() is handled in _copy_to_iter_mcsafe().
+ */
+__visible unsigned long
+mcsafe_handle_tail(char *to, char *from, unsigned len)
+{
+   for (; len; --len, to++) {
+   unsigned long rem = memcpy_mcsafe(to, from, 1);
+
+   if (rem)
+   break;
+   }
+   return len;
+}
+
 #ifdef CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE
 /**
  * clean_cache_range - write back a cache range with CLWB



[PATCH v2 5/9] x86, memcpy_mcsafe: define copy_to_iter_mcsafe()

2018-05-02 Thread Dan Williams
Use the updated memcpy_mcsafe() implementation to define
copy_to_user_mcsafe() and copy_to_iter_mcsafe(). The most significant
difference from typical copy_to_iter() is that the ITER_KVEC and
ITER_BVEC iterator types can fail to complete a full transfer.

Cc: 
Cc: Ingo Molnar 
Cc: Borislav Petkov 
Cc: Tony Luck 
Cc: Al Viro 
Cc: Thomas Gleixner 
Cc: Andy Lutomirski 
Cc: Peter Zijlstra 
Cc: Andrew Morton 
Cc: Linus Torvalds 
Signed-off-by: Dan Williams 
---
 arch/x86/Kconfig  |1 +
 arch/x86/include/asm/uaccess_64.h |   11 +++
 include/linux/uio.h   |   15 +
 lib/iov_iter.c|   61 +
 4 files changed, 88 insertions(+)

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index c07f492b871a..6ca22706cd64 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -60,6 +60,7 @@ config X86
select ARCH_HAS_PMEM_APIif X86_64
select ARCH_HAS_REFCOUNT
select ARCH_HAS_UACCESS_FLUSHCACHE  if X86_64
+   select ARCH_HAS_UACCESS_MCSAFE  if X86_64
select ARCH_HAS_SET_MEMORY
select ARCH_HAS_SG_CHAIN
select ARCH_HAS_STRICT_KERNEL_RWX
diff --git a/arch/x86/include/asm/uaccess_64.h 
b/arch/x86/include/asm/uaccess_64.h
index c63efc07891f..62acb613114b 100644
--- a/arch/x86/include/asm/uaccess_64.h
+++ b/arch/x86/include/asm/uaccess_64.h
@@ -47,6 +47,17 @@ copy_user_generic(void *to, const void *from, unsigned len)
 }
 
 static __always_inline __must_check unsigned long
+copy_to_user_mcsafe(void *to, const void *from, unsigned len)
+{
+   unsigned long ret;
+
+   __uaccess_begin();
+   ret = memcpy_mcsafe(to, from, len);
+   __uaccess_end();
+   return ret;
+}
+
+static __always_inline __must_check unsigned long
 raw_copy_from_user(void *dst, const void __user *src, unsigned long size)
 {
int ret = 0;
diff --git a/include/linux/uio.h b/include/linux/uio.h
index e67e12adb136..f5766e853a77 100644
--- a/include/linux/uio.h
+++ b/include/linux/uio.h
@@ -154,6 +154,12 @@ size_t _copy_from_iter_flushcache(void *addr, size_t 
bytes, struct iov_iter *i);
 #define _copy_from_iter_flushcache _copy_from_iter_nocache
 #endif
 
+#ifdef CONFIG_ARCH_HAS_UACCESS_MCSAFE
+size_t _copy_to_iter_mcsafe(void *addr, size_t bytes, struct iov_iter *i);
+#else
+#define _copy_to_iter_mcsafe _copy_to_iter
+#endif
+
 static __always_inline __must_check
 size_t copy_from_iter_flushcache(void *addr, size_t bytes, struct iov_iter *i)
 {
@@ -163,6 +169,15 @@ size_t copy_from_iter_flushcache(void *addr, size_t bytes, 
struct iov_iter *i)
return _copy_from_iter_flushcache(addr, bytes, i);
 }
 
+static __always_inline __must_check
+size_t copy_to_iter_mcsafe(void *addr, size_t bytes, struct iov_iter *i)
+{
+   if (unlikely(!check_copy_size(addr, bytes, false)))
+   return 0;
+   else
+   return _copy_to_iter_mcsafe(addr, bytes, i);
+}
+
 size_t iov_iter_zero(size_t bytes, struct iov_iter *);
 unsigned long iov_iter_alignment(const struct iov_iter *i);
 unsigned long iov_iter_gap_alignment(const struct iov_iter *i);
diff --git a/lib/iov_iter.c b/lib/iov_iter.c
index 970212670b6a..70ebc8ede143 100644
--- a/lib/iov_iter.c
+++ b/lib/iov_iter.c
@@ -573,6 +573,67 @@ size_t _copy_to_iter(const void *addr, size_t bytes, 
struct iov_iter *i)
 }
 EXPORT_SYMBOL(_copy_to_iter);
 
+#ifdef CONFIG_ARCH_HAS_UACCESS_MCSAFE
+static int copyout_mcsafe(void __user *to, const void *from, size_t n)
+{
+   if (access_ok(VERIFY_WRITE, to, n)) {
+   kasan_check_read(from, n);
+   n = copy_to_user_mcsafe((__force void *) to, from, n);
+   }
+   return n;
+}
+
+static unsigned long memcpy_mcsafe_to_page(struct page *page, size_t offset,
+   const char *from, size_t len)
+{
+   unsigned long ret;
+   char *to;
+
+   to = kmap_atomic(page);
+   ret = memcpy_mcsafe(to + offset, from, len);
+   kunmap_atomic(to);
+
+   return ret;
+}
+
+size_t _copy_to_iter_mcsafe(const void *addr, size_t bytes, struct iov_iter *i)
+{
+   const char *from = addr;
+   unsigned long rem, curr_addr, s_addr = (unsigned long) addr;
+
+   if (unlikely(i->type & ITER_PIPE)) {
+   WARN_ON(1);
+   return 0;
+   }
+   if (iter_is_iovec(i))
+   might_fault();
+   iterate_and_advance(i, bytes, v,
+   copyout_mcsafe(v.iov_base, (from += v.iov_len) - v.iov_len, 
v.iov_len),
+   ({
+   rem = memcpy_mcsafe_to_page(v.bv_page, v.bv_offset,
+   (from += v.bv_len) - v.bv_len, v.bv_len);
+   if (rem) {
+   curr_addr = 

[PATCH v2 5/9] x86, memcpy_mcsafe: define copy_to_iter_mcsafe()

2018-05-02 Thread Dan Williams
Use the updated memcpy_mcsafe() implementation to define
copy_to_user_mcsafe() and copy_to_iter_mcsafe(). The most significant
difference from typical copy_to_iter() is that the ITER_KVEC and
ITER_BVEC iterator types can fail to complete a full transfer.

Cc: 
Cc: Ingo Molnar 
Cc: Borislav Petkov 
Cc: Tony Luck 
Cc: Al Viro 
Cc: Thomas Gleixner 
Cc: Andy Lutomirski 
Cc: Peter Zijlstra 
Cc: Andrew Morton 
Cc: Linus Torvalds 
Signed-off-by: Dan Williams 
---
 arch/x86/Kconfig  |1 +
 arch/x86/include/asm/uaccess_64.h |   11 +++
 include/linux/uio.h   |   15 +
 lib/iov_iter.c|   61 +
 4 files changed, 88 insertions(+)

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index c07f492b871a..6ca22706cd64 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -60,6 +60,7 @@ config X86
select ARCH_HAS_PMEM_APIif X86_64
select ARCH_HAS_REFCOUNT
select ARCH_HAS_UACCESS_FLUSHCACHE  if X86_64
+   select ARCH_HAS_UACCESS_MCSAFE  if X86_64
select ARCH_HAS_SET_MEMORY
select ARCH_HAS_SG_CHAIN
select ARCH_HAS_STRICT_KERNEL_RWX
diff --git a/arch/x86/include/asm/uaccess_64.h 
b/arch/x86/include/asm/uaccess_64.h
index c63efc07891f..62acb613114b 100644
--- a/arch/x86/include/asm/uaccess_64.h
+++ b/arch/x86/include/asm/uaccess_64.h
@@ -47,6 +47,17 @@ copy_user_generic(void *to, const void *from, unsigned len)
 }
 
 static __always_inline __must_check unsigned long
+copy_to_user_mcsafe(void *to, const void *from, unsigned len)
+{
+   unsigned long ret;
+
+   __uaccess_begin();
+   ret = memcpy_mcsafe(to, from, len);
+   __uaccess_end();
+   return ret;
+}
+
+static __always_inline __must_check unsigned long
 raw_copy_from_user(void *dst, const void __user *src, unsigned long size)
 {
int ret = 0;
diff --git a/include/linux/uio.h b/include/linux/uio.h
index e67e12adb136..f5766e853a77 100644
--- a/include/linux/uio.h
+++ b/include/linux/uio.h
@@ -154,6 +154,12 @@ size_t _copy_from_iter_flushcache(void *addr, size_t 
bytes, struct iov_iter *i);
 #define _copy_from_iter_flushcache _copy_from_iter_nocache
 #endif
 
+#ifdef CONFIG_ARCH_HAS_UACCESS_MCSAFE
+size_t _copy_to_iter_mcsafe(void *addr, size_t bytes, struct iov_iter *i);
+#else
+#define _copy_to_iter_mcsafe _copy_to_iter
+#endif
+
 static __always_inline __must_check
 size_t copy_from_iter_flushcache(void *addr, size_t bytes, struct iov_iter *i)
 {
@@ -163,6 +169,15 @@ size_t copy_from_iter_flushcache(void *addr, size_t bytes, 
struct iov_iter *i)
return _copy_from_iter_flushcache(addr, bytes, i);
 }
 
+static __always_inline __must_check
+size_t copy_to_iter_mcsafe(void *addr, size_t bytes, struct iov_iter *i)
+{
+   if (unlikely(!check_copy_size(addr, bytes, false)))
+   return 0;
+   else
+   return _copy_to_iter_mcsafe(addr, bytes, i);
+}
+
 size_t iov_iter_zero(size_t bytes, struct iov_iter *);
 unsigned long iov_iter_alignment(const struct iov_iter *i);
 unsigned long iov_iter_gap_alignment(const struct iov_iter *i);
diff --git a/lib/iov_iter.c b/lib/iov_iter.c
index 970212670b6a..70ebc8ede143 100644
--- a/lib/iov_iter.c
+++ b/lib/iov_iter.c
@@ -573,6 +573,67 @@ size_t _copy_to_iter(const void *addr, size_t bytes, 
struct iov_iter *i)
 }
 EXPORT_SYMBOL(_copy_to_iter);
 
+#ifdef CONFIG_ARCH_HAS_UACCESS_MCSAFE
+static int copyout_mcsafe(void __user *to, const void *from, size_t n)
+{
+   if (access_ok(VERIFY_WRITE, to, n)) {
+   kasan_check_read(from, n);
+   n = copy_to_user_mcsafe((__force void *) to, from, n);
+   }
+   return n;
+}
+
+static unsigned long memcpy_mcsafe_to_page(struct page *page, size_t offset,
+   const char *from, size_t len)
+{
+   unsigned long ret;
+   char *to;
+
+   to = kmap_atomic(page);
+   ret = memcpy_mcsafe(to + offset, from, len);
+   kunmap_atomic(to);
+
+   return ret;
+}
+
+size_t _copy_to_iter_mcsafe(const void *addr, size_t bytes, struct iov_iter *i)
+{
+   const char *from = addr;
+   unsigned long rem, curr_addr, s_addr = (unsigned long) addr;
+
+   if (unlikely(i->type & ITER_PIPE)) {
+   WARN_ON(1);
+   return 0;
+   }
+   if (iter_is_iovec(i))
+   might_fault();
+   iterate_and_advance(i, bytes, v,
+   copyout_mcsafe(v.iov_base, (from += v.iov_len) - v.iov_len, 
v.iov_len),
+   ({
+   rem = memcpy_mcsafe_to_page(v.bv_page, v.bv_offset,
+   (from += v.bv_len) - v.bv_len, v.bv_len);
+   if (rem) {
+   curr_addr = (unsigned long) from;
+   bytes = curr_addr - s_addr - rem;
+   return bytes;
+   }
+   }),
+   ({
+   rem = memcpy_mcsafe(v.iov_base, (from += v.iov_len) - 

[PATCH v2 4/9] x86, memcpy_mcsafe: add write-protection-fault handling

2018-05-02 Thread Dan Williams
In preparation for using memcpy_mcsafe() to handle user copies it needs
to be able to handle write-protection faults while writing user pages. Add
MMU-fault handlers alongside the machine-check exception handlers.

Note that the machine check fault exception handling makes assumptions
about source buffer alignment and poison alignment. In the write fault
case, given the destination buffer is arbitrarily aligned, it needs a
separate / additional fault handling approach. The mcsafe_handle_tail()
helper is reused. The @limit argument is set to @len since there is no
safety concern about retriggering an MMU fault, and this simplifies the
assembly.

Cc: 
Cc: Ingo Molnar 
Cc: Borislav Petkov 
Cc: Tony Luck 
Cc: Al Viro 
Cc: Thomas Gleixner 
Cc: Andy Lutomirski 
Cc: Peter Zijlstra 
Cc: Andrew Morton 
Cc: Linus Torvalds 
Co-developed-by: Tony Luck 
Signed-off-by: Dan Williams 
---
 arch/x86/include/asm/uaccess_64.h |3 +++
 arch/x86/lib/memcpy_64.S  |   14 ++
 arch/x86/lib/usercopy_64.c|   17 +
 3 files changed, 34 insertions(+)

diff --git a/arch/x86/include/asm/uaccess_64.h 
b/arch/x86/include/asm/uaccess_64.h
index 62546b3a398e..c63efc07891f 100644
--- a/arch/x86/include/asm/uaccess_64.h
+++ b/arch/x86/include/asm/uaccess_64.h
@@ -194,4 +194,7 @@ __copy_from_user_flushcache(void *dst, const void __user 
*src, unsigned size)
 unsigned long
 copy_user_handle_tail(char *to, char *from, unsigned len);
 
+unsigned long
+mcsafe_handle_tail(char *to, char *from, unsigned len);
+
 #endif /* _ASM_X86_UACCESS_64_H */
diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S
index f01a88391c98..c3b527a9f95d 100644
--- a/arch/x86/lib/memcpy_64.S
+++ b/arch/x86/lib/memcpy_64.S
@@ -265,9 +265,23 @@ EXPORT_SYMBOL_GPL(__memcpy_mcsafe)
mov %ecx, %eax
ret
 
+   /*
+* For write fault handling, given the destination is unaligned,
+* we handle faults on multi-byte writes with a byte-by-byte
+* copy up to the write-protected page.
+*/
+.E_write_words:
+   shll$3, %ecx
+   addl%edx, %ecx
+   movl%ecx, %edx
+   jmp mcsafe_handle_tail
+
.previous
 
_ASM_EXTABLE_FAULT(.L_read_leading_bytes, .E_leading_bytes)
_ASM_EXTABLE_FAULT(.L_read_words, .E_read_words)
_ASM_EXTABLE_FAULT(.L_read_trailing_bytes, .E_trailing_bytes)
+   _ASM_EXTABLE(.L_write_leading_bytes, .E_leading_bytes)
+   _ASM_EXTABLE(.L_write_words, .E_write_words)
+   _ASM_EXTABLE(.L_write_trailing_bytes, .E_trailing_bytes)
 #endif
diff --git a/arch/x86/lib/usercopy_64.c b/arch/x86/lib/usercopy_64.c
index 75d3776123cc..9787f5ee0cf9 100644
--- a/arch/x86/lib/usercopy_64.c
+++ b/arch/x86/lib/usercopy_64.c
@@ -75,6 +75,23 @@ copy_user_handle_tail(char *to, char *from, unsigned len)
return len;
 }
 
+/*
+ * Similar to copy_user_handle_tail, probe for the write fault point,
+ * but reuse __memcpy_mcsafe in case a new read error is encountered.
+ * clac() is handled in _copy_to_iter_mcsafe().
+ */
+__visible unsigned long
+mcsafe_handle_tail(char *to, char *from, unsigned len)
+{
+   for (; len; --len, to++) {
+   unsigned long rem = memcpy_mcsafe(to, from, 1);
+
+   if (rem)
+   break;
+   }
+   return len;
+}
+
 #ifdef CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE
 /**
  * clean_cache_range - write back a cache range with CLWB



[PATCH v2 1/9] x86, memcpy_mcsafe: remove loop unrolling

2018-05-02 Thread Dan Williams
In preparation for teaching memcpy_mcsafe() to return 'bytes remaining'
rather than pass / fail, simplify the implementation to remove loop
unrolling. The unrolling complicates the fault handling for negligible
benefit given modern CPUs perform loop stream detection.

Cc: 
Cc: Ingo Molnar 
Cc: Borislav Petkov 
Cc: Tony Luck 
Cc: Al Viro 
Cc: Thomas Gleixner 
Cc: Andy Lutomirski 
Cc: Peter Zijlstra 
Cc: Andrew Morton 
Reported-by: Linus Torvalds 
Signed-off-by: Dan Williams 
---
 arch/x86/include/asm/string_64.h |4 +--
 arch/x86/lib/memcpy_64.S |   59 ++
 2 files changed, 12 insertions(+), 51 deletions(-)

diff --git a/arch/x86/include/asm/string_64.h b/arch/x86/include/asm/string_64.h
index 533f74c300c2..4752f8984923 100644
--- a/arch/x86/include/asm/string_64.h
+++ b/arch/x86/include/asm/string_64.h
@@ -116,7 +116,7 @@ int strcmp(const char *cs, const char *ct);
 #endif
 
 #define __HAVE_ARCH_MEMCPY_MCSAFE 1
-__must_check int memcpy_mcsafe_unrolled(void *dst, const void *src, size_t 
cnt);
+__must_check int __memcpy_mcsafe(void *dst, const void *src, size_t cnt);
 DECLARE_STATIC_KEY_FALSE(mcsafe_key);
 
 /**
@@ -138,7 +138,7 @@ memcpy_mcsafe(void *dst, const void *src, size_t cnt)
 {
 #ifdef CONFIG_X86_MCE
if (static_branch_unlikely(_key))
-   return memcpy_mcsafe_unrolled(dst, src, cnt);
+   return __memcpy_mcsafe(dst, src, cnt);
else
 #endif
memcpy(dst, src, cnt);
diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S
index 9a53a06e5a3e..54c971892db5 100644
--- a/arch/x86/lib/memcpy_64.S
+++ b/arch/x86/lib/memcpy_64.S
@@ -184,11 +184,11 @@ ENDPROC(memcpy_orig)
 
 #ifndef CONFIG_UML
 /*
- * memcpy_mcsafe_unrolled - memory copy with machine check exception handling
+ * __memcpy_mcsafe - memory copy with machine check exception handling
  * Note that we only catch machine checks when reading the source addresses.
  * Writes to target are posted and don't generate machine checks.
  */
-ENTRY(memcpy_mcsafe_unrolled)
+ENTRY(__memcpy_mcsafe)
cmpl $8, %edx
/* Less than 8 bytes? Go to byte copy loop */
jb .L_no_whole_words
@@ -213,49 +213,18 @@ ENTRY(memcpy_mcsafe_unrolled)
jnz .L_copy_leading_bytes
 
 .L_8byte_aligned:
-   /* Figure out how many whole cache lines (64-bytes) to copy */
-   movl %edx, %ecx
-   andl $63, %edx
-   shrl $6, %ecx
-   jz .L_no_whole_cache_lines
-
-   /* Loop copying whole cache lines */
-.L_cache_w0: movq (%rsi), %r8
-.L_cache_w1: movq 1*8(%rsi), %r9
-.L_cache_w2: movq 2*8(%rsi), %r10
-.L_cache_w3: movq 3*8(%rsi), %r11
-   movq %r8, (%rdi)
-   movq %r9, 1*8(%rdi)
-   movq %r10, 2*8(%rdi)
-   movq %r11, 3*8(%rdi)
-.L_cache_w4: movq 4*8(%rsi), %r8
-.L_cache_w5: movq 5*8(%rsi), %r9
-.L_cache_w6: movq 6*8(%rsi), %r10
-.L_cache_w7: movq 7*8(%rsi), %r11
-   movq %r8, 4*8(%rdi)
-   movq %r9, 5*8(%rdi)
-   movq %r10, 6*8(%rdi)
-   movq %r11, 7*8(%rdi)
-   leaq 64(%rsi), %rsi
-   leaq 64(%rdi), %rdi
-   decl %ecx
-   jnz .L_cache_w0
-
-   /* Are there any trailing 8-byte words? */
-.L_no_whole_cache_lines:
movl %edx, %ecx
andl $7, %edx
shrl $3, %ecx
jz .L_no_whole_words
 
-   /* Copy trailing words */
-.L_copy_trailing_words:
+.L_copy_words:
movq (%rsi), %r8
-   mov %r8, (%rdi)
-   leaq 8(%rsi), %rsi
-   leaq 8(%rdi), %rdi
+   movq %r8, (%rdi)
+   addq $8, %rsi
+   addq $8, %rdi
decl %ecx
-   jnz .L_copy_trailing_words
+   jnz .L_copy_words
 
/* Any trailing bytes? */
 .L_no_whole_words:
@@ -276,8 +245,8 @@ ENTRY(memcpy_mcsafe_unrolled)
 .L_done_memcpy_trap:
xorq %rax, %rax
ret
-ENDPROC(memcpy_mcsafe_unrolled)
-EXPORT_SYMBOL_GPL(memcpy_mcsafe_unrolled)
+ENDPROC(__memcpy_mcsafe)
+EXPORT_SYMBOL_GPL(__memcpy_mcsafe)
 
.section .fixup, "ax"
/* Return -EFAULT for any failure */
@@ -288,14 +257,6 @@ EXPORT_SYMBOL_GPL(memcpy_mcsafe_unrolled)
.previous
 
_ASM_EXTABLE_FAULT(.L_copy_leading_bytes, .L_memcpy_mcsafe_fail)
-   _ASM_EXTABLE_FAULT(.L_cache_w0, .L_memcpy_mcsafe_fail)
-   _ASM_EXTABLE_FAULT(.L_cache_w1, .L_memcpy_mcsafe_fail)
-   _ASM_EXTABLE_FAULT(.L_cache_w2, .L_memcpy_mcsafe_fail)
-   _ASM_EXTABLE_FAULT(.L_cache_w3, .L_memcpy_mcsafe_fail)
-   _ASM_EXTABLE_FAULT(.L_cache_w4, .L_memcpy_mcsafe_fail)
-   _ASM_EXTABLE_FAULT(.L_cache_w5, .L_memcpy_mcsafe_fail)
-   _ASM_EXTABLE_FAULT(.L_cache_w6, .L_memcpy_mcsafe_fail)
-   _ASM_EXTABLE_FAULT(.L_cache_w7, .L_memcpy_mcsafe_fail)
-   _ASM_EXTABLE_FAULT(.L_copy_trailing_words, .L_memcpy_mcsafe_fail)
+   

[PATCH v2 2/9] x86, memcpy_mcsafe: add labels for write fault handling

2018-05-02 Thread Dan Williams
The memcpy_mcsafe() implementation handles CPU exceptions when reading
from the source address. Before it can be used for user copies it needs
to grow support for handling write faults. In preparation for adding
that exception handling, split each copy-loop label into a separate read
label (.L_read_*) and write label (.L_write_*).

Cc: 
Cc: Ingo Molnar 
Cc: Borislav Petkov 
Cc: Al Viro 
Cc: Thomas Gleixner 
Cc: Andy Lutomirski 
Cc: Peter Zijlstra 
Cc: Andrew Morton 
Cc: Linus Torvalds 
Reported-by: Tony Luck 
Signed-off-by: Dan Williams 
---
 arch/x86/lib/memcpy_64.S |   21 -
 1 file changed, 12 insertions(+), 9 deletions(-)

diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S
index 54c971892db5..5709f3ec22a4 100644
--- a/arch/x86/lib/memcpy_64.S
+++ b/arch/x86/lib/memcpy_64.S
@@ -204,13 +204,14 @@ ENTRY(__memcpy_mcsafe)
subl $8, %ecx
negl %ecx
subl %ecx, %edx
-.L_copy_leading_bytes:
+.L_read_leading_bytes:
movb (%rsi), %al
+.L_write_leading_bytes:
movb %al, (%rdi)
incq %rsi
incq %rdi
decl %ecx
-   jnz .L_copy_leading_bytes
+   jnz .L_read_leading_bytes
 
 .L_8byte_aligned:
movl %edx, %ecx
@@ -218,13 +219,14 @@ ENTRY(__memcpy_mcsafe)
shrl $3, %ecx
jz .L_no_whole_words
 
-.L_copy_words:
+.L_read_words:
movq (%rsi), %r8
+.L_write_words:
movq %r8, (%rdi)
addq $8, %rsi
addq $8, %rdi
decl %ecx
-   jnz .L_copy_words
+   jnz .L_read_words
 
/* Any trailing bytes? */
 .L_no_whole_words:
@@ -233,13 +235,14 @@ ENTRY(__memcpy_mcsafe)
 
/* Copy trailing bytes */
movl %edx, %ecx
-.L_copy_trailing_bytes:
+.L_read_trailing_bytes:
movb (%rsi), %al
+.L_write_trailing_bytes:
movb %al, (%rdi)
incq %rsi
incq %rdi
decl %ecx
-   jnz .L_copy_trailing_bytes
+   jnz .L_read_trailing_bytes
 
/* Copy successful. Return zero */
 .L_done_memcpy_trap:
@@ -256,7 +259,7 @@ EXPORT_SYMBOL_GPL(__memcpy_mcsafe)
 
.previous
 
-   _ASM_EXTABLE_FAULT(.L_copy_leading_bytes, .L_memcpy_mcsafe_fail)
-   _ASM_EXTABLE_FAULT(.L_copy_words, .L_memcpy_mcsafe_fail)
-   _ASM_EXTABLE_FAULT(.L_copy_trailing_bytes, .L_memcpy_mcsafe_fail)
+   _ASM_EXTABLE_FAULT(.L_read_leading_bytes, .L_memcpy_mcsafe_fail)
+   _ASM_EXTABLE_FAULT(.L_read_words, .L_memcpy_mcsafe_fail)
+   _ASM_EXTABLE_FAULT(.L_read_trailing_bytes, .L_memcpy_mcsafe_fail)
 #endif



[PATCH v2 1/9] x86, memcpy_mcsafe: remove loop unrolling

2018-05-02 Thread Dan Williams
In preparation for teaching memcpy_mcsafe() to return 'bytes remaining'
rather than pass / fail, simplify the implementation to remove loop
unrolling. The unrolling complicates the fault handling for negligible
benefit given modern CPUs perform loop stream detection.

Cc: 
Cc: Ingo Molnar 
Cc: Borislav Petkov 
Cc: Tony Luck 
Cc: Al Viro 
Cc: Thomas Gleixner 
Cc: Andy Lutomirski 
Cc: Peter Zijlstra 
Cc: Andrew Morton 
Reported-by: Linus Torvalds 
Signed-off-by: Dan Williams 
---
 arch/x86/include/asm/string_64.h |4 +--
 arch/x86/lib/memcpy_64.S |   59 ++
 2 files changed, 12 insertions(+), 51 deletions(-)

diff --git a/arch/x86/include/asm/string_64.h b/arch/x86/include/asm/string_64.h
index 533f74c300c2..4752f8984923 100644
--- a/arch/x86/include/asm/string_64.h
+++ b/arch/x86/include/asm/string_64.h
@@ -116,7 +116,7 @@ int strcmp(const char *cs, const char *ct);
 #endif
 
 #define __HAVE_ARCH_MEMCPY_MCSAFE 1
-__must_check int memcpy_mcsafe_unrolled(void *dst, const void *src, size_t cnt);
+__must_check int __memcpy_mcsafe(void *dst, const void *src, size_t cnt);
 DECLARE_STATIC_KEY_FALSE(mcsafe_key);
 
 /**
@@ -138,7 +138,7 @@ memcpy_mcsafe(void *dst, const void *src, size_t cnt)
 {
 #ifdef CONFIG_X86_MCE
if (static_branch_unlikely(&mcsafe_key))
-   return memcpy_mcsafe_unrolled(dst, src, cnt);
+   return __memcpy_mcsafe(dst, src, cnt);
else
 #endif
memcpy(dst, src, cnt);
diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S
index 9a53a06e5a3e..54c971892db5 100644
--- a/arch/x86/lib/memcpy_64.S
+++ b/arch/x86/lib/memcpy_64.S
@@ -184,11 +184,11 @@ ENDPROC(memcpy_orig)
 
 #ifndef CONFIG_UML
 /*
- * memcpy_mcsafe_unrolled - memory copy with machine check exception handling
+ * __memcpy_mcsafe - memory copy with machine check exception handling
  * Note that we only catch machine checks when reading the source addresses.
  * Writes to target are posted and don't generate machine checks.
  */
-ENTRY(memcpy_mcsafe_unrolled)
+ENTRY(__memcpy_mcsafe)
cmpl $8, %edx
/* Less than 8 bytes? Go to byte copy loop */
jb .L_no_whole_words
@@ -213,49 +213,18 @@ ENTRY(memcpy_mcsafe_unrolled)
jnz .L_copy_leading_bytes
 
 .L_8byte_aligned:
-   /* Figure out how many whole cache lines (64-bytes) to copy */
-   movl %edx, %ecx
-   andl $63, %edx
-   shrl $6, %ecx
-   jz .L_no_whole_cache_lines
-
-   /* Loop copying whole cache lines */
-.L_cache_w0: movq (%rsi), %r8
-.L_cache_w1: movq 1*8(%rsi), %r9
-.L_cache_w2: movq 2*8(%rsi), %r10
-.L_cache_w3: movq 3*8(%rsi), %r11
-   movq %r8, (%rdi)
-   movq %r9, 1*8(%rdi)
-   movq %r10, 2*8(%rdi)
-   movq %r11, 3*8(%rdi)
-.L_cache_w4: movq 4*8(%rsi), %r8
-.L_cache_w5: movq 5*8(%rsi), %r9
-.L_cache_w6: movq 6*8(%rsi), %r10
-.L_cache_w7: movq 7*8(%rsi), %r11
-   movq %r8, 4*8(%rdi)
-   movq %r9, 5*8(%rdi)
-   movq %r10, 6*8(%rdi)
-   movq %r11, 7*8(%rdi)
-   leaq 64(%rsi), %rsi
-   leaq 64(%rdi), %rdi
-   decl %ecx
-   jnz .L_cache_w0
-
-   /* Are there any trailing 8-byte words? */
-.L_no_whole_cache_lines:
movl %edx, %ecx
andl $7, %edx
shrl $3, %ecx
jz .L_no_whole_words
 
-   /* Copy trailing words */
-.L_copy_trailing_words:
+.L_copy_words:
movq (%rsi), %r8
-   mov %r8, (%rdi)
-   leaq 8(%rsi), %rsi
-   leaq 8(%rdi), %rdi
+   movq %r8, (%rdi)
+   addq $8, %rsi
+   addq $8, %rdi
decl %ecx
-   jnz .L_copy_trailing_words
+   jnz .L_copy_words
 
/* Any trailing bytes? */
 .L_no_whole_words:
@@ -276,8 +245,8 @@ ENTRY(memcpy_mcsafe_unrolled)
 .L_done_memcpy_trap:
xorq %rax, %rax
ret
-ENDPROC(memcpy_mcsafe_unrolled)
-EXPORT_SYMBOL_GPL(memcpy_mcsafe_unrolled)
+ENDPROC(__memcpy_mcsafe)
+EXPORT_SYMBOL_GPL(__memcpy_mcsafe)
 
.section .fixup, "ax"
/* Return -EFAULT for any failure */
@@ -288,14 +257,6 @@ EXPORT_SYMBOL_GPL(memcpy_mcsafe_unrolled)
.previous
 
_ASM_EXTABLE_FAULT(.L_copy_leading_bytes, .L_memcpy_mcsafe_fail)
-   _ASM_EXTABLE_FAULT(.L_cache_w0, .L_memcpy_mcsafe_fail)
-   _ASM_EXTABLE_FAULT(.L_cache_w1, .L_memcpy_mcsafe_fail)
-   _ASM_EXTABLE_FAULT(.L_cache_w2, .L_memcpy_mcsafe_fail)
-   _ASM_EXTABLE_FAULT(.L_cache_w3, .L_memcpy_mcsafe_fail)
-   _ASM_EXTABLE_FAULT(.L_cache_w4, .L_memcpy_mcsafe_fail)
-   _ASM_EXTABLE_FAULT(.L_cache_w5, .L_memcpy_mcsafe_fail)
-   _ASM_EXTABLE_FAULT(.L_cache_w6, .L_memcpy_mcsafe_fail)
-   _ASM_EXTABLE_FAULT(.L_cache_w7, .L_memcpy_mcsafe_fail)
-   _ASM_EXTABLE_FAULT(.L_copy_trailing_words, .L_memcpy_mcsafe_fail)
+   _ASM_EXTABLE_FAULT(.L_copy_words, .L_memcpy_mcsafe_fail)
_ASM_EXTABLE_FAULT(.L_copy_trailing_bytes, .L_memcpy_mcsafe_fail)
 #endif



[PATCH v2 2/9] x86, memcpy_mcsafe: add labels for write fault handling

2018-05-02 Thread Dan Williams
The memcpy_mcsafe() implementation handles CPU exceptions when reading
from the source address. Before it can be used for user copies it needs
to grow support for handling write faults. In preparation for adding
that exception handling update the labels for the read cache word X case
(.L_cache_rX) and write cache word X case (.L_cache_wX).

Cc: 
Cc: Ingo Molnar 
Cc: Borislav Petkov 
Cc: Al Viro 
Cc: Thomas Gleixner 
Cc: Andy Lutomirski 
Cc: Peter Zijlstra 
Cc: Andrew Morton 
Cc: Linus Torvalds 
Reported-by: Tony Luck 
Signed-off-by: Dan Williams 
---
 arch/x86/lib/memcpy_64.S |   21 -
 1 file changed, 12 insertions(+), 9 deletions(-)

diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S
index 54c971892db5..5709f3ec22a4 100644
--- a/arch/x86/lib/memcpy_64.S
+++ b/arch/x86/lib/memcpy_64.S
@@ -204,13 +204,14 @@ ENTRY(__memcpy_mcsafe)
subl $8, %ecx
negl %ecx
subl %ecx, %edx
-.L_copy_leading_bytes:
+.L_read_leading_bytes:
movb (%rsi), %al
+.L_write_leading_bytes:
movb %al, (%rdi)
incq %rsi
incq %rdi
decl %ecx
-   jnz .L_copy_leading_bytes
+   jnz .L_read_leading_bytes
 
 .L_8byte_aligned:
movl %edx, %ecx
@@ -218,13 +219,14 @@ ENTRY(__memcpy_mcsafe)
shrl $3, %ecx
jz .L_no_whole_words
 
-.L_copy_words:
+.L_read_words:
movq (%rsi), %r8
+.L_write_words:
movq %r8, (%rdi)
addq $8, %rsi
addq $8, %rdi
decl %ecx
-   jnz .L_copy_words
+   jnz .L_read_words
 
/* Any trailing bytes? */
 .L_no_whole_words:
@@ -233,13 +235,14 @@ ENTRY(__memcpy_mcsafe)
 
/* Copy trailing bytes */
movl %edx, %ecx
-.L_copy_trailing_bytes:
+.L_read_trailing_bytes:
movb (%rsi), %al
+.L_write_trailing_bytes:
movb %al, (%rdi)
incq %rsi
incq %rdi
decl %ecx
-   jnz .L_copy_trailing_bytes
+   jnz .L_read_trailing_bytes
 
/* Copy successful. Return zero */
 .L_done_memcpy_trap:
@@ -256,7 +259,7 @@ EXPORT_SYMBOL_GPL(__memcpy_mcsafe)
 
.previous
 
-   _ASM_EXTABLE_FAULT(.L_copy_leading_bytes, .L_memcpy_mcsafe_fail)
-   _ASM_EXTABLE_FAULT(.L_copy_words, .L_memcpy_mcsafe_fail)
-   _ASM_EXTABLE_FAULT(.L_copy_trailing_bytes, .L_memcpy_mcsafe_fail)
+   _ASM_EXTABLE_FAULT(.L_read_leading_bytes, .L_memcpy_mcsafe_fail)
+   _ASM_EXTABLE_FAULT(.L_read_words, .L_memcpy_mcsafe_fail)
+   _ASM_EXTABLE_FAULT(.L_read_trailing_bytes, .L_memcpy_mcsafe_fail)
 #endif



[PATCH v2 0/9] use memcpy_mcsafe() for copy_to_iter()

2018-05-02 Thread Dan Williams
Changes since v1 [1]:

* Remove the loop unrolling in the assembly implementation since it
  significantly complicates the exception handling (Linus)

* Introduce a ->copy_to_iter() dax operation for symmetry with the
  existing ->copy_from_iter() operation to allow platform /
  device-specific implementations.

[1]: https://lists.01.org/pipermail/linux-nvdimm/2018-May/015548.html

---

Currently memcpy_mcsafe() is only deployed in the pmem driver when
reading through a /dev/pmemX block device. However, a filesystem in dax
mode mounted on a /dev/pmemX block device will bypass the block layer
and the driver for reads. The filesystem-dax (fsdax) read case uses
dax_direct_access() and copy_to_iter() to bypass the block layer.

The result of the bypass is that the kernel treats machine checks during
read as system fatal (reboot) when they could simply be flagged as an
I/O error, similar to performing reads through the pmem driver. Prevent
this fatal condition by deploying memcpy_mcsafe() in the fsdax read
path.

The main differences between this copy_to_user_mcsafe() and
copy_user_generic_unrolled() are:

* Typical tail/residue handling after a fault retries the copy
  byte-by-byte until the fault happens again. Re-triggering machine
  checks is potentially fatal so the implementation uses source alignment
  and poison alignment assumptions to avoid re-triggering machine
  checks.

* SMAP coordination is handled external to the assembly with
  __uaccess_begin() and __uaccess_end().

* ITER_KVEC and ITER_BVEC can now end prematurely with an error.

The new MCSAFE_DEBUG facility is proposed as a way to unit test the
exception handling without requiring an ACPI EINJ capable platform.

---

Dan Williams (9):
  x86, memcpy_mcsafe: remove loop unrolling
  x86, memcpy_mcsafe: add labels for write fault handling
  x86, memcpy_mcsafe: return bytes remaining
  x86, memcpy_mcsafe: add write-protection-fault handling
  x86, memcpy_mcsafe: define copy_to_iter_mcsafe()
  dax: introduce a ->copy_to_iter dax operation
  dax: report bytes remaining in dax_iomap_actor()
  pmem: switch to copy_to_iter_mcsafe()
  x86, nfit_test: unit test for memcpy_mcsafe()


 arch/x86/Kconfig|1 
 arch/x86/Kconfig.debug  |3 +
 arch/x86/include/asm/mcsafe_debug.h |   50 
 arch/x86/include/asm/string_64.h|   10 ++-
 arch/x86/include/asm/uaccess_64.h   |   14 
 arch/x86/lib/memcpy_64.S|  109 ---
 arch/x86/lib/usercopy_64.c  |   17 +
 drivers/dax/super.c |   10 +++
 drivers/md/dm-linear.c  |   16 +
 drivers/md/dm-log-writes.c  |   15 +
 drivers/md/dm-stripe.c  |   21 +++
 drivers/md/dm.c |   25 
 drivers/nvdimm/claim.c  |3 +
 drivers/nvdimm/pmem.c   |   13 +++-
 drivers/s390/block/dcssblk.c|7 ++
 fs/dax.c|   21 ---
 include/linux/dax.h |5 ++
 include/linux/device-mapper.h   |5 +-
 include/linux/string.h  |4 +
 include/linux/uio.h |   15 +
 lib/iov_iter.c  |   61 
 tools/testing/nvdimm/test/nfit.c|   48 +++
 22 files changed, 394 insertions(+), 79 deletions(-)
 create mode 100644 arch/x86/include/asm/mcsafe_debug.h


[PATCH v2 0/9] use memcpy_mcsafe() for copy_to_iter()

2018-05-02 Thread Dan Williams
Changes since v1 [1]:

* Remove the loop unrolling in the assembly implementation since it
  significantly complicates the exception handling (Linus)

* Introduce a ->copy_to_iter() dax operation for symmetry with the
  existing ->copy_from_iter() operation to allow platform /
  device-specific implementations.

[1]: https://lists.01.org/pipermail/linux-nvdimm/2018-May/015548.html

---

Currently memcpy_mcsafe() is only deployed in the pmem driver when
reading through a /dev/pmemX block device. However, a filesystem in dax
mode mounted on a /dev/pmemX block device will bypass the block layer
and the driver for reads. The filesystem-dax (fsdax) read case uses
dax_direct_access() and copy_to_iter() to bypass the block layer.

The result of the bypass is that the kernel treats machine checks during
read as system fatal (reboot) when they could simply be flagged as an
I/O error, similar to performing reads through the pmem driver. Prevent
this fatal condition by deploying memcpy_mcsafe() in the fsdax read
path.

The main differences between this copy_to_user_mcsafe() and
copy_user_generic_unrolled() are:

* Typical tail/residue handling after a fault retries the copy
  byte-by-byte until the fault happens again. Re-triggering machine
  checks is potentially fatal so the implementation uses source alignment
  and poison alignment assumptions to avoid re-triggering machine
  checks.

* SMAP coordination is handled external to the assembly with
  __uaccess_begin() and __uaccess_end().

* ITER_KVEC and ITER_BVEC can now end prematurely with an error.

The new MCSAFE_DEBUG facility is proposed as a way to unit test the
exception handling without requiring an ACPI EINJ capable platform.

---

Dan Williams (9):
  x86, memcpy_mcsafe: remove loop unrolling
  x86, memcpy_mcsafe: add labels for write fault handling
  x86, memcpy_mcsafe: return bytes remaining
  x86, memcpy_mcsafe: add write-protection-fault handling
  x86, memcpy_mcsafe: define copy_to_iter_mcsafe()
  dax: introduce a ->copy_to_iter dax operation
  dax: report bytes remaining in dax_iomap_actor()
  pmem: switch to copy_to_iter_mcsafe()
  x86, nfit_test: unit test for memcpy_mcsafe()


 arch/x86/Kconfig|1 
 arch/x86/Kconfig.debug  |3 +
 arch/x86/include/asm/mcsafe_debug.h |   50 
 arch/x86/include/asm/string_64.h|   10 ++-
 arch/x86/include/asm/uaccess_64.h   |   14 
 arch/x86/lib/memcpy_64.S|  109 ---
 arch/x86/lib/usercopy_64.c  |   17 +
 drivers/dax/super.c |   10 +++
 drivers/md/dm-linear.c  |   16 +
 drivers/md/dm-log-writes.c  |   15 +
 drivers/md/dm-stripe.c  |   21 +++
 drivers/md/dm.c |   25 
 drivers/nvdimm/claim.c  |3 +
 drivers/nvdimm/pmem.c   |   13 +++-
 drivers/s390/block/dcssblk.c|7 ++
 fs/dax.c|   21 ---
 include/linux/dax.h |5 ++
 include/linux/device-mapper.h   |5 +-
 include/linux/string.h  |4 +
 include/linux/uio.h |   15 +
 lib/iov_iter.c  |   61 
 tools/testing/nvdimm/test/nfit.c|   48 +++
 22 files changed, 394 insertions(+), 79 deletions(-)
 create mode 100644 arch/x86/include/asm/mcsafe_debug.h


[PATCH v15 5/9] PCI/AER: Factor out error reporting from AER

2018-05-02 Thread Oza Pawandeep
This patch factors out error reporting callbacks, which are currently
tightly coupled with AER.

DPC should be able to register callbacks and attempt recovery when DPC
trigger event occurs.

Signed-off-by: Oza Pawandeep 

diff --git a/drivers/pci/pcie/Makefile b/drivers/pci/pcie/Makefile
index 800e1d4..03f4e0b 100644
--- a/drivers/pci/pcie/Makefile
+++ b/drivers/pci/pcie/Makefile
@@ -2,7 +2,7 @@
 #
 # Makefile for PCI Express features and port driver
 
-pcieportdrv-y  := portdrv_core.o portdrv_pci.o
+pcieportdrv-y  := portdrv_core.o portdrv_pci.o err.o
 
 obj-$(CONFIG_PCIEPORTBUS)  += pcieportdrv.o
 
diff --git a/drivers/pci/pcie/aer/aerdrv.h b/drivers/pci/pcie/aer/aerdrv.h
index 08b4584..b4c9506 100644
--- a/drivers/pci/pcie/aer/aerdrv.h
+++ b/drivers/pci/pcie/aer/aerdrv.h
@@ -76,36 +76,6 @@ struct aer_rpc {
 */
 };
 
-struct aer_broadcast_data {
-   enum pci_channel_state state;
-   enum pci_ers_result result;
-};
-
-static inline pci_ers_result_t merge_result(enum pci_ers_result orig,
-   enum pci_ers_result new)
-{
-   if (new == PCI_ERS_RESULT_NO_AER_DRIVER)
-   return PCI_ERS_RESULT_NO_AER_DRIVER;
-
-   if (new == PCI_ERS_RESULT_NONE)
-   return orig;
-
-   switch (orig) {
-   case PCI_ERS_RESULT_CAN_RECOVER:
-   case PCI_ERS_RESULT_RECOVERED:
-   orig = new;
-   break;
-   case PCI_ERS_RESULT_DISCONNECT:
-   if (new == PCI_ERS_RESULT_NEED_RESET)
-   orig = PCI_ERS_RESULT_NEED_RESET;
-   break;
-   default:
-   break;
-   }
-
-   return orig;
-}
-
 extern struct bus_type pcie_port_bus_type;
 void aer_isr(struct work_struct *work);
 void aer_print_error(struct pci_dev *dev, struct aer_err_info *info);
diff --git a/drivers/pci/pcie/aer/aerdrv_core.c b/drivers/pci/pcie/aer/aerdrv_core.c
index be4ee3b..51515d1 100644
--- a/drivers/pci/pcie/aer/aerdrv_core.c
+++ b/drivers/pci/pcie/aer/aerdrv_core.c
@@ -228,191 +228,6 @@ static bool find_source_device(struct pci_dev *parent,
return true;
 }
 
-static int report_error_detected(struct pci_dev *dev, void *data)
-{
-   pci_ers_result_t vote;
-   const struct pci_error_handlers *err_handler;
-   struct aer_broadcast_data *result_data;
-   result_data = (struct aer_broadcast_data *) data;
-
-   device_lock(&dev->dev);
-   dev->error_state = result_data->state;
-
-   if (!dev->driver ||
-   !dev->driver->err_handler ||
-   !dev->driver->err_handler->error_detected) {
-   if (result_data->state == pci_channel_io_frozen &&
-   dev->hdr_type != PCI_HEADER_TYPE_BRIDGE) {
-   /*
-* In case of fatal recovery, if one of down-
-* stream device has no driver. We might be
-* unable to recover because a later insmod
-* of a driver for this device is unaware of
-* its hw state.
-*/
-   pci_printk(KERN_DEBUG, dev, "device has %s\n",
-  dev->driver ?
-  "no AER-aware driver" : "no driver");
-   }
-
-   /*
-* If there's any device in the subtree that does not
-* have an error_detected callback, returning
-* PCI_ERS_RESULT_NO_AER_DRIVER prevents calling of
-* the subsequent mmio_enabled/slot_reset/resume
-* callbacks of "any" device in the subtree. All the
-* devices in the subtree are left in the error state
-* without recovery.
-*/
-
-   if (dev->hdr_type != PCI_HEADER_TYPE_BRIDGE)
-   vote = PCI_ERS_RESULT_NO_AER_DRIVER;
-   else
-   vote = PCI_ERS_RESULT_NONE;
-   } else {
-   err_handler = dev->driver->err_handler;
-   vote = err_handler->error_detected(dev, result_data->state);
-   pci_uevent_ers(dev, PCI_ERS_RESULT_NONE);
-   }
-
-   result_data->result = merge_result(result_data->result, vote);
-   device_unlock(&dev->dev);
-   return 0;
-}
-
-static int report_mmio_enabled(struct pci_dev *dev, void *data)
-{
-   pci_ers_result_t vote;
-   const struct pci_error_handlers *err_handler;
-   struct aer_broadcast_data *result_data;
-   result_data = (struct aer_broadcast_data *) data;
-
-   device_lock(&dev->dev);
-   if (!dev->driver ||
-   !dev->driver->err_handler ||
-   !dev->driver->err_handler->mmio_enabled)
-   goto out;
-
-   err_handler = dev->driver->err_handler;
-   vote = err_handler->mmio_enabled(dev);
-   result_data->result = 

[PATCH v15 5/9] PCI/AER: Factor out error reporting from AER

2018-05-02 Thread Oza Pawandeep
This patch factors out error reporting callbacks, which are currently
tightly coupled with AER.

DPC should be able to register callbacks and attempt recovery when DPC
trigger event occurs.

Signed-off-by: Oza Pawandeep 

diff --git a/drivers/pci/pcie/Makefile b/drivers/pci/pcie/Makefile
index 800e1d4..03f4e0b 100644
--- a/drivers/pci/pcie/Makefile
+++ b/drivers/pci/pcie/Makefile
@@ -2,7 +2,7 @@
 #
 # Makefile for PCI Express features and port driver
 
-pcieportdrv-y  := portdrv_core.o portdrv_pci.o
+pcieportdrv-y  := portdrv_core.o portdrv_pci.o err.o
 
 obj-$(CONFIG_PCIEPORTBUS)  += pcieportdrv.o
 
diff --git a/drivers/pci/pcie/aer/aerdrv.h b/drivers/pci/pcie/aer/aerdrv.h
index 08b4584..b4c9506 100644
--- a/drivers/pci/pcie/aer/aerdrv.h
+++ b/drivers/pci/pcie/aer/aerdrv.h
@@ -76,36 +76,6 @@ struct aer_rpc {
 */
 };
 
-struct aer_broadcast_data {
-   enum pci_channel_state state;
-   enum pci_ers_result result;
-};
-
-static inline pci_ers_result_t merge_result(enum pci_ers_result orig,
-   enum pci_ers_result new)
-{
-   if (new == PCI_ERS_RESULT_NO_AER_DRIVER)
-   return PCI_ERS_RESULT_NO_AER_DRIVER;
-
-   if (new == PCI_ERS_RESULT_NONE)
-   return orig;
-
-   switch (orig) {
-   case PCI_ERS_RESULT_CAN_RECOVER:
-   case PCI_ERS_RESULT_RECOVERED:
-   orig = new;
-   break;
-   case PCI_ERS_RESULT_DISCONNECT:
-   if (new == PCI_ERS_RESULT_NEED_RESET)
-   orig = PCI_ERS_RESULT_NEED_RESET;
-   break;
-   default:
-   break;
-   }
-
-   return orig;
-}
-
 extern struct bus_type pcie_port_bus_type;
 void aer_isr(struct work_struct *work);
 void aer_print_error(struct pci_dev *dev, struct aer_err_info *info);
diff --git a/drivers/pci/pcie/aer/aerdrv_core.c b/drivers/pci/pcie/aer/aerdrv_core.c
index be4ee3b..51515d1 100644
--- a/drivers/pci/pcie/aer/aerdrv_core.c
+++ b/drivers/pci/pcie/aer/aerdrv_core.c
@@ -228,191 +228,6 @@ static bool find_source_device(struct pci_dev *parent,
return true;
 }
 
-static int report_error_detected(struct pci_dev *dev, void *data)
-{
-   pci_ers_result_t vote;
-   const struct pci_error_handlers *err_handler;
-   struct aer_broadcast_data *result_data;
-   result_data = (struct aer_broadcast_data *) data;
-
-   device_lock(&dev->dev);
-   dev->error_state = result_data->state;
-
-   if (!dev->driver ||
-   !dev->driver->err_handler ||
-   !dev->driver->err_handler->error_detected) {
-   if (result_data->state == pci_channel_io_frozen &&
-   dev->hdr_type != PCI_HEADER_TYPE_BRIDGE) {
-   /*
-* In case of fatal recovery, if one of down-
-* stream device has no driver. We might be
-* unable to recover because a later insmod
-* of a driver for this device is unaware of
-* its hw state.
-*/
-   pci_printk(KERN_DEBUG, dev, "device has %s\n",
-  dev->driver ?
-  "no AER-aware driver" : "no driver");
-   }
-
-   /*
-* If there's any device in the subtree that does not
-* have an error_detected callback, returning
-* PCI_ERS_RESULT_NO_AER_DRIVER prevents calling of
-* the subsequent mmio_enabled/slot_reset/resume
-* callbacks of "any" device in the subtree. All the
-* devices in the subtree are left in the error state
-* without recovery.
-*/
-
-   if (dev->hdr_type != PCI_HEADER_TYPE_BRIDGE)
-   vote = PCI_ERS_RESULT_NO_AER_DRIVER;
-   else
-   vote = PCI_ERS_RESULT_NONE;
-   } else {
-   err_handler = dev->driver->err_handler;
-   vote = err_handler->error_detected(dev, result_data->state);
-   pci_uevent_ers(dev, PCI_ERS_RESULT_NONE);
-   }
-
-   result_data->result = merge_result(result_data->result, vote);
-   device_unlock(&dev->dev);
-   return 0;
-}
-
-static int report_mmio_enabled(struct pci_dev *dev, void *data)
-{
-   pci_ers_result_t vote;
-   const struct pci_error_handlers *err_handler;
-   struct aer_broadcast_data *result_data;
-   result_data = (struct aer_broadcast_data *) data;
-
-   device_lock(&dev->dev);
-   if (!dev->driver ||
-   !dev->driver->err_handler ||
-   !dev->driver->err_handler->mmio_enabled)
-   goto out;
-
-   err_handler = dev->driver->err_handler;
-   vote = err_handler->mmio_enabled(dev);
-   result_data->result = merge_result(result_data->result, vote);

Re: [PATCH] kernel/exit.c: pointer sighand could be uninitialized

2018-05-02 Thread Al Viro
On Wed, May 02, 2018 at 06:48:57PM -0700, Yizhuo Zhai wrote:
> Variable 'sighand' could be uninitialized if probe_kernel_address fails
> (-EFAULT). The later use in the if statement may lead to undefined behavior.

Excuse me, but that's nonsense.  The value *copied* into it (in case
probe_kernel_address() has not failed) may be just as uninitialized.
If mere "compare uninitialized pointer value to NULL" can cause nasal demons to fly,
* we are screwed anyway
* the piece of crap compiler should be printed on sandpaper and used to
polish its authors.

Read the comments in there, please.  Especially the one regarding the second case.


[PATCH v15 6/9] PCI/PORTDRV: Implement generic find service

2018-05-02 Thread Oza Pawandeep
This patch implements generic pcie_port_find_service() routine.

Signed-off-by: Oza Pawandeep 
Reviewed-by: Keith Busch 

diff --git a/drivers/pci/pcie/aer/aerdrv_core.c b/drivers/pci/pcie/aer/aerdrv_core.c
index 51515d1..a525296 100644
--- a/drivers/pci/pcie/aer/aerdrv_core.c
+++ b/drivers/pci/pcie/aer/aerdrv_core.c
@@ -228,32 +228,6 @@ static bool find_source_device(struct pci_dev *parent,
return true;
 }
 
-static int find_aer_service_iter(struct device *device, void *data)
-{
-   struct pcie_port_service_driver *service_driver, **drv;
-
-   drv = (struct pcie_port_service_driver **) data;
-
-   if (device->bus == &pcie_port_bus_type && device->driver) {
-   service_driver = to_service_driver(device->driver);
-   if (service_driver->service == PCIE_PORT_SERVICE_AER) {
-   *drv = service_driver;
-   return 1;
-   }
-   }
-
-   return 0;
-}
-
-struct pcie_port_service_driver *find_aer_service(struct pci_dev *dev)
-{
-   struct pcie_port_service_driver *drv = NULL;
-
-   device_for_each_child(&dev->dev, &drv, find_aer_service_iter);
-
-   return drv;
-}
-
 /**
  * handle_error_source - handle logging error into an event log
  * @aerdev: pointer to pcie_device data structure of the root port
diff --git a/drivers/pci/pcie/err.c b/drivers/pci/pcie/err.c
index 55df974..877785d 100644
--- a/drivers/pci/pcie/err.c
+++ b/drivers/pci/pcie/err.c
@@ -195,10 +195,8 @@ static pci_ers_result_t reset_link(struct pci_dev *dev)
udev = dev->bus->self;
}
 
-#if IS_ENABLED(CONFIG_PCIEAER)
/* Use the aer driver of the component firstly */
-   driver = find_aer_service(udev);
-#endif
+   driver = pcie_port_find_service(udev, PCIE_PORT_SERVICE_AER);
 
if (driver && driver->reset_link) {
status = driver->reset_link(udev);
diff --git a/drivers/pci/pcie/portdrv.h b/drivers/pci/pcie/portdrv.h
index 47c9824..ba6c963 100644
--- a/drivers/pci/pcie/portdrv.h
+++ b/drivers/pci/pcie/portdrv.h
@@ -112,5 +112,6 @@ static inline bool pcie_pme_no_msi(void) { return false; }
 static inline void pcie_pme_interrupt_enable(struct pci_dev *dev, bool en) {}
 #endif /* !CONFIG_PCIE_PME */
 
-struct pcie_port_service_driver *find_aer_service(struct pci_dev *dev);
+struct pcie_port_service_driver *pcie_port_find_service(struct pci_dev *dev,
+   u32 service);
 #endif /* _PORTDRV_H_ */
diff --git a/drivers/pci/pcie/portdrv_core.c b/drivers/pci/pcie/portdrv_core.c
index c9c0663..d843055 100644
--- a/drivers/pci/pcie/portdrv_core.c
+++ b/drivers/pci/pcie/portdrv_core.c
@@ -18,6 +18,10 @@
 
 #include "../pci.h"
 #include "portdrv.h"
+struct portdrv_service_data {
+   struct pcie_port_service_driver *drv;
+   u32 service;
+};
 
 /**
  * release_pcie_device - free PCI Express port service device structure
@@ -398,6 +402,46 @@ static int remove_iter(struct device *dev, void *data)
return 0;
 }
 
+static int find_service_iter(struct device *device, void *data)
+{
+   struct pcie_port_service_driver *service_driver;
+   struct portdrv_service_data *pdrvs;
+   u32 service;
+
+   pdrvs = (struct portdrv_service_data *) data;
+   service = pdrvs->service;
+
+   if (device->bus == &pcie_port_bus_type && device->driver) {
+   service_driver = to_service_driver(device->driver);
+   if (service_driver->service == service) {
+   pdrvs->drv = service_driver;
+   return 1;
+   }
+   }
+
+   return 0;
+}
+/**
+ * pcie_port_find_service - find the service driver
+ * @dev: PCI Express port the service devices associated with
+ * @service: Service to find
+ *
+ * Find PCI Express port service driver associated with given service
+ */
+struct pcie_port_service_driver *pcie_port_find_service(struct pci_dev *dev,
+   u32 service)
+{
+   struct pcie_port_service_driver *drv;
+   struct portdrv_service_data pdrvs;
+
+   pdrvs.drv = NULL;
+   pdrvs.service = service;
+   device_for_each_child(&dev->dev, &pdrvs, find_service_iter);
+
+   drv = pdrvs.drv;
+   return drv;
+}
+
 /**
  * pcie_port_device_remove - unregister PCI Express port service devices
  * @dev: PCI Express port the service devices to unregister are associated with
-- 
2.7.4



Re: [PATCH] kernel/exit.c: pointer sighand could be uninitialized

2018-05-02 Thread Al Viro
On Wed, May 02, 2018 at 06:48:57PM -0700, Yizhuo Zhai wrote:
> Variable 'sighand' could be uninitialized if probe_kernel_address fails
> (-EFAULT). The later use in the if statement may lead to undefined behavior.

Excuse me, but that's nonsense.  The value *copied* into it (in case
probe_kernel_address() has not failed) may be just as uninitialized.
If mere "compare uninitialized pointer value to NULL" can cause nasal demons to fly,
* we are screwed anyway
* the piece of crap compiler should be printed on sandpaper and used to
polish its authors.

Read the comments in there, please.  Especially the one regarding the second case.


[PATCH v15 6/9] PCI/PORTDRV: Implement generic find service

2018-05-02 Thread Oza Pawandeep
This patch implements generic pcie_port_find_service() routine.

Signed-off-by: Oza Pawandeep 
Reviewed-by: Keith Busch 

diff --git a/drivers/pci/pcie/aer/aerdrv_core.c b/drivers/pci/pcie/aer/aerdrv_core.c
index 51515d1..a525296 100644
--- a/drivers/pci/pcie/aer/aerdrv_core.c
+++ b/drivers/pci/pcie/aer/aerdrv_core.c
@@ -228,32 +228,6 @@ static bool find_source_device(struct pci_dev *parent,
return true;
 }
 
-static int find_aer_service_iter(struct device *device, void *data)
-{
-   struct pcie_port_service_driver *service_driver, **drv;
-
-   drv = (struct pcie_port_service_driver **) data;
-
-   if (device->bus == &pcie_port_bus_type && device->driver) {
-   service_driver = to_service_driver(device->driver);
-   if (service_driver->service == PCIE_PORT_SERVICE_AER) {
-   *drv = service_driver;
-   return 1;
-   }
-   }
-
-   return 0;
-}
-
-struct pcie_port_service_driver *find_aer_service(struct pci_dev *dev)
-{
-   struct pcie_port_service_driver *drv = NULL;
-
-   device_for_each_child(&dev->dev, &drv, find_aer_service_iter);
-
-   return drv;
-}
-
 /**
  * handle_error_source - handle logging error into an event log
  * @aerdev: pointer to pcie_device data structure of the root port
diff --git a/drivers/pci/pcie/err.c b/drivers/pci/pcie/err.c
index 55df974..877785d 100644
--- a/drivers/pci/pcie/err.c
+++ b/drivers/pci/pcie/err.c
@@ -195,10 +195,8 @@ static pci_ers_result_t reset_link(struct pci_dev *dev)
udev = dev->bus->self;
}
 
-#if IS_ENABLED(CONFIG_PCIEAER)
/* Use the aer driver of the component firstly */
-   driver = find_aer_service(udev);
-#endif
+   driver = pcie_port_find_service(udev, PCIE_PORT_SERVICE_AER);
 
if (driver && driver->reset_link) {
status = driver->reset_link(udev);
diff --git a/drivers/pci/pcie/portdrv.h b/drivers/pci/pcie/portdrv.h
index 47c9824..ba6c963 100644
--- a/drivers/pci/pcie/portdrv.h
+++ b/drivers/pci/pcie/portdrv.h
@@ -112,5 +112,6 @@ static inline bool pcie_pme_no_msi(void) { return false; }
 static inline void pcie_pme_interrupt_enable(struct pci_dev *dev, bool en) {}
 #endif /* !CONFIG_PCIE_PME */
 
-struct pcie_port_service_driver *find_aer_service(struct pci_dev *dev);
+struct pcie_port_service_driver *pcie_port_find_service(struct pci_dev *dev,
+   u32 service);
 #endif /* _PORTDRV_H_ */
diff --git a/drivers/pci/pcie/portdrv_core.c b/drivers/pci/pcie/portdrv_core.c
index c9c0663..d843055 100644
--- a/drivers/pci/pcie/portdrv_core.c
+++ b/drivers/pci/pcie/portdrv_core.c
@@ -18,6 +18,10 @@
 
 #include "../pci.h"
 #include "portdrv.h"
+struct portdrv_service_data {
+   struct pcie_port_service_driver *drv;
+   u32 service;
+};
 
 /**
  * release_pcie_device - free PCI Express port service device structure
@@ -398,6 +402,46 @@ static int remove_iter(struct device *dev, void *data)
return 0;
 }
 
+static int find_service_iter(struct device *device, void *data)
+{
+   struct pcie_port_service_driver *service_driver;
+   struct portdrv_service_data *pdrvs;
+   u32 service;
+
+   pdrvs = (struct portdrv_service_data *) data;
+   service = pdrvs->service;
+
+   if (device->bus == &pcie_port_bus_type && device->driver) {
+   service_driver = to_service_driver(device->driver);
+   if (service_driver->service == service) {
+   pdrvs->drv = service_driver;
+   return 1;
+   }
+   }
+
+   return 0;
+}
+/**
+ * pcie_port_find_service - find the service driver
+ * @dev: PCI Express port the service devices associated with
+ * @service: Service to find
+ *
+ * Find PCI Express port service driver associated with given service
+ */
+struct pcie_port_service_driver *pcie_port_find_service(struct pci_dev *dev,
+   u32 service)
+{
+   struct pcie_port_service_driver *drv;
+   struct portdrv_service_data pdrvs;
+
+   pdrvs.drv = NULL;
+   pdrvs.service = service;
+   device_for_each_child(&dev->dev, &pdrvs, find_service_iter);
+
+   drv = pdrvs.drv;
+   return drv;
+}
+
 /**
  * pcie_port_device_remove - unregister PCI Express port service devices
  * @dev: PCI Express port the service devices to unregister are associated with
-- 
2.7.4



[PATCH v15 7/9] PCI/PORTDRV: Implement generic find device

2018-05-02 Thread Oza Pawandeep
This patch implements generic pcie_port_find_device() routine.

Signed-off-by: Oza Pawandeep 
Reviewed-by: Keith Busch 

diff --git a/drivers/pci/pcie/portdrv.h b/drivers/pci/pcie/portdrv.h
index ba6c963..896608a 100644
--- a/drivers/pci/pcie/portdrv.h
+++ b/drivers/pci/pcie/portdrv.h
@@ -114,4 +114,6 @@ static inline void pcie_pme_interrupt_enable(struct pci_dev 
*dev, bool en) {}
 
 struct pcie_port_service_driver *pcie_port_find_service(struct pci_dev *dev,
u32 service);
+struct device *pcie_port_find_device(struct pci_dev *dev,
+u32 service);
 #endif /* _PORTDRV_H_ */
diff --git a/drivers/pci/pcie/portdrv_core.c b/drivers/pci/pcie/portdrv_core.c
index d843055..c6147c4 100644
--- a/drivers/pci/pcie/portdrv_core.c
+++ b/drivers/pci/pcie/portdrv_core.c
@@ -20,6 +20,7 @@
 #include "portdrv.h"
 struct portdrv_service_data {
struct pcie_port_service_driver *drv;
+   struct device *dev;
u32 service;
 };
 
@@ -415,6 +416,7 @@ static int find_service_iter(struct device *device, void 
*data)
service_driver = to_service_driver(device->driver);
if (service_driver->service == service) {
pdrvs->drv = service_driver;
+   pdrvs->dev = device;
return 1;
}
}
@@ -443,6 +445,27 @@ struct pcie_port_service_driver 
*pcie_port_find_service(struct pci_dev *dev,
 }
 
 /**
+ * pcie_port_find_device - find the struct device
+ * @dev: PCI Express port the service devices associated with
+ * @service: For the service to find
+ *
+ * Find the struct device of the PCI Express port service matching @service
+ */
+struct  device *pcie_port_find_device(struct pci_dev *dev,
+ u32 service)
+{
+   struct device *device;
+   struct portdrv_service_data pdrvs;
+
+   pdrvs.dev = NULL;
+   pdrvs.service = service;
+   device_for_each_child(&dev->dev, &pdrvs, find_service_iter);
+
+   device = pdrvs.dev;
+   return device;
+}
+
+/**
  * pcie_port_device_remove - unregister PCI Express port service devices
  * @dev: PCI Express port the service devices to unregister are associated with
  *
-- 
2.7.4



[PATCH v15 7/9] PCI/PORTDRV: Implement generic find device

2018-05-02 Thread Oza Pawandeep
This patch implements generic pcie_port_find_device() routine.

Signed-off-by: Oza Pawandeep 
Reviewed-by: Keith Busch 

diff --git a/drivers/pci/pcie/portdrv.h b/drivers/pci/pcie/portdrv.h
index ba6c963..896608a 100644
--- a/drivers/pci/pcie/portdrv.h
+++ b/drivers/pci/pcie/portdrv.h
@@ -114,4 +114,6 @@ static inline void pcie_pme_interrupt_enable(struct pci_dev 
*dev, bool en) {}
 
 struct pcie_port_service_driver *pcie_port_find_service(struct pci_dev *dev,
u32 service);
+struct device *pcie_port_find_device(struct pci_dev *dev,
+u32 service);
 #endif /* _PORTDRV_H_ */
diff --git a/drivers/pci/pcie/portdrv_core.c b/drivers/pci/pcie/portdrv_core.c
index d843055..c6147c4 100644
--- a/drivers/pci/pcie/portdrv_core.c
+++ b/drivers/pci/pcie/portdrv_core.c
@@ -20,6 +20,7 @@
 #include "portdrv.h"
 struct portdrv_service_data {
struct pcie_port_service_driver *drv;
+   struct device *dev;
u32 service;
 };
 
@@ -415,6 +416,7 @@ static int find_service_iter(struct device *device, void 
*data)
service_driver = to_service_driver(device->driver);
if (service_driver->service == service) {
pdrvs->drv = service_driver;
+   pdrvs->dev = device;
return 1;
}
}
@@ -443,6 +445,27 @@ struct pcie_port_service_driver 
*pcie_port_find_service(struct pci_dev *dev,
 }
 
 /**
+ * pcie_port_find_device - find the struct device
+ * @dev: PCI Express port the service devices associated with
+ * @service: For the service to find
+ *
+ * Find the struct device of the PCI Express port service matching @service
+ */
+struct  device *pcie_port_find_device(struct pci_dev *dev,
+ u32 service)
+{
+   struct device *device;
+   struct portdrv_service_data pdrvs;
+
+   pdrvs.dev = NULL;
+   pdrvs.service = service;
+   device_for_each_child(&dev->dev, &pdrvs, find_service_iter);
+
+   device = pdrvs.dev;
+   return device;
+}
+
+/**
  * pcie_port_device_remove - unregister PCI Express port service devices
  * @dev: PCI Express port the service devices to unregister are associated with
  *
-- 
2.7.4



[PATCH v15 8/9] PCI/DPC: Unify and plumb error handling into DPC

2018-05-02 Thread Oza Pawandeep
The current DPC driver does not perform recovery, e.g. calling the endpoint
drivers' callbacks, which sanitize the software state.

The DPC driver now implements the reset_link callback and calls
pcie_do_recovery().

Signed-off-by: Oza Pawandeep 

diff --git a/drivers/pci/pcie/dpc.c b/drivers/pci/pcie/dpc.c
index 80ec384..aed7c9f 100644
--- a/drivers/pci/pcie/dpc.c
+++ b/drivers/pci/pcie/dpc.c
@@ -73,29 +73,21 @@ static void dpc_wait_link_inactive(struct dpc_dev *dpc)
pcie_wait_for_link(pdev, false);
 }
 
-static void dpc_work(struct work_struct *work)
+static pci_ers_result_t dpc_reset_link(struct pci_dev *pdev)
 {
-   struct dpc_dev *dpc = container_of(work, struct dpc_dev, work);
-   struct pci_dev *dev, *temp, *pdev = dpc->dev->port;
-   struct pci_bus *parent = pdev->subordinate;
-   u16 cap = dpc->cap_pos, ctl;
-
-   pci_lock_rescan_remove();
-   list_for_each_entry_safe_reverse(dev, temp, >devices,
-bus_list) {
-   pci_dev_get(dev);
-   pci_dev_set_disconnected(dev, NULL);
-   if (pci_has_subordinate(dev))
-   pci_walk_bus(dev->subordinate,
-pci_dev_set_disconnected, NULL);
-   pci_stop_and_remove_bus_device(dev);
-   pci_dev_put(dev);
-   }
-   pci_unlock_rescan_remove();
+   struct dpc_dev *dpc;
+   struct pcie_device *pciedev;
+   struct device *devdpc;
+   u16 cap, ctl;
+
+   devdpc = pcie_port_find_device(pdev, PCIE_PORT_SERVICE_DPC);
+   pciedev = to_pcie_device(devdpc);
+   dpc = get_service_data(pciedev);
+   cap = dpc->cap_pos;
 
dpc_wait_link_inactive(dpc);
if (dpc->rp_extensions && dpc_wait_rp_inactive(dpc))
-   return;
+   return PCI_ERS_RESULT_DISCONNECT;
if (dpc->rp_extensions && dpc->rp_pio_status) {
pci_write_config_dword(pdev, cap + PCI_EXP_DPC_RP_PIO_STATUS,
   dpc->rp_pio_status);
@@ -108,6 +100,17 @@ static void dpc_work(struct work_struct *work)
pci_read_config_word(pdev, cap + PCI_EXP_DPC_CTL, );
pci_write_config_word(pdev, cap + PCI_EXP_DPC_CTL,
  ctl | PCI_EXP_DPC_CTL_INT_EN);
+
+   return PCI_ERS_RESULT_RECOVERED;
+}
+
+static void dpc_work(struct work_struct *work)
+{
+   struct dpc_dev *dpc = container_of(work, struct dpc_dev, work);
+   struct pci_dev *pdev = dpc->dev->port;
+
+   /* From DPC point of view error is always FATAL. */
+   pcie_do_recovery(pdev, DPC_FATAL);
 }
 
 static void dpc_process_rp_pio_error(struct dpc_dev *dpc)
@@ -288,6 +291,7 @@ static struct pcie_port_service_driver dpcdriver = {
.service= PCIE_PORT_SERVICE_DPC,
.probe  = dpc_probe,
.remove = dpc_remove,
+   .reset_link = dpc_reset_link,
 };
 
 static int __init dpc_service_init(void)
diff --git a/drivers/pci/pcie/err.c b/drivers/pci/pcie/err.c
index 877785d..526aba8 100644
--- a/drivers/pci/pcie/err.c
+++ b/drivers/pci/pcie/err.c
@@ -181,11 +181,12 @@ static pci_ers_result_t default_reset_link(struct pci_dev 
*dev)
return PCI_ERS_RESULT_RECOVERED;
 }
 
-static pci_ers_result_t reset_link(struct pci_dev *dev)
+static pci_ers_result_t reset_link(struct pci_dev *dev, int severity)
 {
struct pci_dev *udev;
pci_ers_result_t status;
struct pcie_port_service_driver *driver;
+   u32 service;
 
if (dev->hdr_type == PCI_HEADER_TYPE_BRIDGE) {
/* Reset this port for all subordinates */
@@ -196,7 +197,12 @@ static pci_ers_result_t reset_link(struct pci_dev *dev)
}
 
/* Use the aer driver of the component firstly */
-   driver = pcie_port_find_service(udev, PCIE_PORT_SERVICE_AER);
+   if (severity == DPC_FATAL)
+   service = PCIE_PORT_SERVICE_DPC;
+   else
+   service = PCIE_PORT_SERVICE_AER;
+
+   driver = pcie_port_find_service(udev, service);
 
if (driver && driver->reset_link) {
status = driver->reset_link(udev);
@@ -302,7 +308,7 @@ static pci_ers_result_t do_fatal_recovery(struct pci_dev 
*dev, int severity)
pci_dev_put(pdev);
}
 
-   result = reset_link(udev);
+   result = reset_link(udev, severity);
if (result == PCI_ERS_RESULT_RECOVERED)
if (pcie_wait_for_link(udev, true))
pci_rescan_bus(udev->bus);
@@ -326,7 +332,8 @@ void pcie_do_recovery(struct pci_dev *dev, int severity)
pci_ers_result_t status;
enum pci_channel_state state;
 
-   if (severity == AER_FATAL) {
+   if ((severity == AER_FATAL) ||
+  (severity == DPC_FATAL)) {
status = do_fatal_recovery(dev, severity);
if (status != PCI_ERS_RESULT_RECOVERED)
goto failed;
diff --git a/include/linux/aer.h 

[PATCH v15 8/9] PCI/DPC: Unify and plumb error handling into DPC

2018-05-02 Thread Oza Pawandeep
The current DPC driver does not perform recovery, e.g. calling the endpoint
drivers' callbacks, which sanitize the software state.

The DPC driver now implements the reset_link callback and calls
pcie_do_recovery().

Signed-off-by: Oza Pawandeep 

diff --git a/drivers/pci/pcie/dpc.c b/drivers/pci/pcie/dpc.c
index 80ec384..aed7c9f 100644
--- a/drivers/pci/pcie/dpc.c
+++ b/drivers/pci/pcie/dpc.c
@@ -73,29 +73,21 @@ static void dpc_wait_link_inactive(struct dpc_dev *dpc)
pcie_wait_for_link(pdev, false);
 }
 
-static void dpc_work(struct work_struct *work)
+static pci_ers_result_t dpc_reset_link(struct pci_dev *pdev)
 {
-   struct dpc_dev *dpc = container_of(work, struct dpc_dev, work);
-   struct pci_dev *dev, *temp, *pdev = dpc->dev->port;
-   struct pci_bus *parent = pdev->subordinate;
-   u16 cap = dpc->cap_pos, ctl;
-
-   pci_lock_rescan_remove();
-   list_for_each_entry_safe_reverse(dev, temp, >devices,
-bus_list) {
-   pci_dev_get(dev);
-   pci_dev_set_disconnected(dev, NULL);
-   if (pci_has_subordinate(dev))
-   pci_walk_bus(dev->subordinate,
-pci_dev_set_disconnected, NULL);
-   pci_stop_and_remove_bus_device(dev);
-   pci_dev_put(dev);
-   }
-   pci_unlock_rescan_remove();
+   struct dpc_dev *dpc;
+   struct pcie_device *pciedev;
+   struct device *devdpc;
+   u16 cap, ctl;
+
+   devdpc = pcie_port_find_device(pdev, PCIE_PORT_SERVICE_DPC);
+   pciedev = to_pcie_device(devdpc);
+   dpc = get_service_data(pciedev);
+   cap = dpc->cap_pos;
 
dpc_wait_link_inactive(dpc);
if (dpc->rp_extensions && dpc_wait_rp_inactive(dpc))
-   return;
+   return PCI_ERS_RESULT_DISCONNECT;
if (dpc->rp_extensions && dpc->rp_pio_status) {
pci_write_config_dword(pdev, cap + PCI_EXP_DPC_RP_PIO_STATUS,
   dpc->rp_pio_status);
@@ -108,6 +100,17 @@ static void dpc_work(struct work_struct *work)
pci_read_config_word(pdev, cap + PCI_EXP_DPC_CTL, );
pci_write_config_word(pdev, cap + PCI_EXP_DPC_CTL,
  ctl | PCI_EXP_DPC_CTL_INT_EN);
+
+   return PCI_ERS_RESULT_RECOVERED;
+}
+
+static void dpc_work(struct work_struct *work)
+{
+   struct dpc_dev *dpc = container_of(work, struct dpc_dev, work);
+   struct pci_dev *pdev = dpc->dev->port;
+
+   /* From DPC point of view error is always FATAL. */
+   pcie_do_recovery(pdev, DPC_FATAL);
 }
 
 static void dpc_process_rp_pio_error(struct dpc_dev *dpc)
@@ -288,6 +291,7 @@ static struct pcie_port_service_driver dpcdriver = {
.service= PCIE_PORT_SERVICE_DPC,
.probe  = dpc_probe,
.remove = dpc_remove,
+   .reset_link = dpc_reset_link,
 };
 
 static int __init dpc_service_init(void)
diff --git a/drivers/pci/pcie/err.c b/drivers/pci/pcie/err.c
index 877785d..526aba8 100644
--- a/drivers/pci/pcie/err.c
+++ b/drivers/pci/pcie/err.c
@@ -181,11 +181,12 @@ static pci_ers_result_t default_reset_link(struct pci_dev 
*dev)
return PCI_ERS_RESULT_RECOVERED;
 }
 
-static pci_ers_result_t reset_link(struct pci_dev *dev)
+static pci_ers_result_t reset_link(struct pci_dev *dev, int severity)
 {
struct pci_dev *udev;
pci_ers_result_t status;
struct pcie_port_service_driver *driver;
+   u32 service;
 
if (dev->hdr_type == PCI_HEADER_TYPE_BRIDGE) {
/* Reset this port for all subordinates */
@@ -196,7 +197,12 @@ static pci_ers_result_t reset_link(struct pci_dev *dev)
}
 
/* Use the aer driver of the component firstly */
-   driver = pcie_port_find_service(udev, PCIE_PORT_SERVICE_AER);
+   if (severity == DPC_FATAL)
+   service = PCIE_PORT_SERVICE_DPC;
+   else
+   service = PCIE_PORT_SERVICE_AER;
+
+   driver = pcie_port_find_service(udev, service);
 
if (driver && driver->reset_link) {
status = driver->reset_link(udev);
@@ -302,7 +308,7 @@ static pci_ers_result_t do_fatal_recovery(struct pci_dev 
*dev, int severity)
pci_dev_put(pdev);
}
 
-   result = reset_link(udev);
+   result = reset_link(udev, severity);
if (result == PCI_ERS_RESULT_RECOVERED)
if (pcie_wait_for_link(udev, true))
pci_rescan_bus(udev->bus);
@@ -326,7 +332,8 @@ void pcie_do_recovery(struct pci_dev *dev, int severity)
pci_ers_result_t status;
enum pci_channel_state state;
 
-   if (severity == AER_FATAL) {
+   if ((severity == AER_FATAL) ||
+  (severity == DPC_FATAL)) {
status = do_fatal_recovery(dev, severity);
if (status != PCI_ERS_RESULT_RECOVERED)
goto failed;
diff --git a/include/linux/aer.h b/include/linux/aer.h
index 

[PATCH v15 2/9] pci-error-recovery: Add AER_FATAL handling

2018-05-02 Thread Oza Pawandeep
It adds description on AER_FATAL error handling.

Signed-off-by: Oza Pawandeep 

diff --git a/Documentation/PCI/pci-error-recovery.txt 
b/Documentation/PCI/pci-error-recovery.txt
index 0b6bb3e..688b691 100644
--- a/Documentation/PCI/pci-error-recovery.txt
+++ b/Documentation/PCI/pci-error-recovery.txt
@@ -110,7 +110,7 @@ The actual steps taken by a platform to recover from a PCI 
error
 event will be platform-dependent, but will follow the general
 sequence described below.
 
-STEP 0: Error Event
+STEP 0: Error Event: ERR_NONFATAL
 ---
 A PCI bus error is detected by the PCI hardware.  On powerpc, the slot
 is isolated, in that all I/O is blocked: all reads return 0xffffffff,
@@ -228,13 +228,7 @@ proceeds to either STEP3 (Link Reset) or to STEP 5 (Resume 
Operations).
 If any driver returned PCI_ERS_RESULT_NEED_RESET, then the platform
 proceeds to STEP 4 (Slot Reset)
 
-STEP 3: Link Reset
---
-The platform resets the link.  This is a PCI-Express specific step
-and is done whenever a fatal error has been detected that can be
-"solved" by resetting the link.
-
-STEP 4: Slot Reset
+STEP 3: Slot Reset
 --
 
 In response to a return value of PCI_ERS_RESULT_NEED_RESET, the
@@ -320,7 +314,7 @@ Failure).
 >>> However, it probably should.
 
 
-STEP 5: Resume Operations
+STEP 4: Resume Operations
 -
 The platform will call the resume() callback on all affected device
 drivers if all drivers on the segment have returned
@@ -332,7 +326,7 @@ a result code.
 At this point, if a new error happens, the platform will restart
 a new error recovery sequence.
 
-STEP 6: Permanent Failure
+STEP 5: Permanent Failure
 -
 A "permanent failure" has occurred, and the platform cannot recover
 the device.  The platform will call error_detected() with a
@@ -355,6 +349,27 @@ errors. See the discussion in 
powerpc/eeh-pci-error-recovery.txt
 for additional detail on real-life experience of the causes of
 software errors.
 
+STEP 0: Error Event: ERR_FATAL
+---
+PCI bus error is detected by the PCI hardware. On powerpc, the slot is
+isolated, in that all I/O is blocked: all reads return 0xffffffff, all
+writes are ignored.
+
+STEP 1: Remove devices
+
+Platform removes the devices depending on the error agent, it could be
+this port for all subordinates or upstream component (likely downstream
+port)
+
+STEP 2: Reset link
+
+The platform resets the link.  This is a PCI-Express specific step and is
+done whenever a fatal error has been detected that can be "solved" by
+resetting the link.
+
+STEP 3: Re-enumerate the devices
+
+Initiates the re-enumeration.
 
 Conclusion; General Remarks
 ---
-- 
2.7.4



[PATCH v15 9/9] PCI/DPC: Disable ERR_NONFATAL and enable ERR_FATAL for DPC

2018-05-02 Thread Oza Pawandeep
This patch disables ERR_NONFATAL trigger for DPC, so now DPC
handles only ERR_FATAL.

Signed-off-by: Oza Pawandeep 

diff --git a/drivers/pci/pcie/dpc.c b/drivers/pci/pcie/dpc.c
index aed7c9f..6966e00 100644
--- a/drivers/pci/pcie/dpc.c
+++ b/drivers/pci/pcie/dpc.c
@@ -263,7 +263,7 @@ static int dpc_probe(struct pcie_device *dev)
}
}
 
-   ctl = (ctl & 0xfff4) | PCI_EXP_DPC_CTL_EN_NONFATAL | 
PCI_EXP_DPC_CTL_INT_EN;
+   ctl = (ctl & 0xfff4) | PCI_EXP_DPC_CTL_EN_FATAL | 
PCI_EXP_DPC_CTL_INT_EN;
pci_write_config_word(pdev, dpc->cap_pos + PCI_EXP_DPC_CTL, ctl);
 
dev_info(device, "DPC error containment capabilities: Int Msg #%d, 
RPExt%c PoisonedTLP%c SwTrigger%c RP PIO Log %d, DL_ActiveErr%c\n",
@@ -281,7 +281,7 @@ static void dpc_remove(struct pcie_device *dev)
u16 ctl;
 
pci_read_config_word(pdev, dpc->cap_pos + PCI_EXP_DPC_CTL, );
-   ctl &= ~(PCI_EXP_DPC_CTL_EN_NONFATAL | PCI_EXP_DPC_CTL_INT_EN);
+   ctl &= ~(PCI_EXP_DPC_CTL_EN_FATAL | PCI_EXP_DPC_CTL_INT_EN);
pci_write_config_word(pdev, dpc->cap_pos + PCI_EXP_DPC_CTL, ctl);
 }
 
diff --git a/include/uapi/linux/pci_regs.h b/include/uapi/linux/pci_regs.h
index 103ba79..86f1cc2 100644
--- a/include/uapi/linux/pci_regs.h
+++ b/include/uapi/linux/pci_regs.h
@@ -981,6 +981,7 @@
 #define  PCI_EXP_DPC_CAP_DL_ACTIVE 0x1000  /* ERR_COR signal on DL_Active 
supported */
 
 #define PCI_EXP_DPC_CTL6   /* DPC control */
+#define PCI_EXP_DPC_CTL_EN_FATAL   0x0001  /* Enable trigger on 
ERR_FATAL message */
 #define  PCI_EXP_DPC_CTL_EN_NONFATAL   0x0002  /* Enable trigger on 
ERR_NONFATAL message */
 #define  PCI_EXP_DPC_CTL_INT_EN0x0008  /* DPC Interrupt Enable */
 
-- 
2.7.4



[PATCH v15 2/9] pci-error-recovery: Add AER_FATAL handling

2018-05-02 Thread Oza Pawandeep
It adds description on AER_FATAL error handling.

Signed-off-by: Oza Pawandeep 

diff --git a/Documentation/PCI/pci-error-recovery.txt 
b/Documentation/PCI/pci-error-recovery.txt
index 0b6bb3e..688b691 100644
--- a/Documentation/PCI/pci-error-recovery.txt
+++ b/Documentation/PCI/pci-error-recovery.txt
@@ -110,7 +110,7 @@ The actual steps taken by a platform to recover from a PCI 
error
 event will be platform-dependent, but will follow the general
 sequence described below.
 
-STEP 0: Error Event
+STEP 0: Error Event: ERR_NONFATAL
 ---
 A PCI bus error is detected by the PCI hardware.  On powerpc, the slot
 is isolated, in that all I/O is blocked: all reads return 0xffffffff,
@@ -228,13 +228,7 @@ proceeds to either STEP3 (Link Reset) or to STEP 5 (Resume 
Operations).
 If any driver returned PCI_ERS_RESULT_NEED_RESET, then the platform
 proceeds to STEP 4 (Slot Reset)
 
-STEP 3: Link Reset
---
-The platform resets the link.  This is a PCI-Express specific step
-and is done whenever a fatal error has been detected that can be
-"solved" by resetting the link.
-
-STEP 4: Slot Reset
+STEP 3: Slot Reset
 --
 
 In response to a return value of PCI_ERS_RESULT_NEED_RESET, the
@@ -320,7 +314,7 @@ Failure).
 >>> However, it probably should.
 
 
-STEP 5: Resume Operations
+STEP 4: Resume Operations
 -
 The platform will call the resume() callback on all affected device
 drivers if all drivers on the segment have returned
@@ -332,7 +326,7 @@ a result code.
 At this point, if a new error happens, the platform will restart
 a new error recovery sequence.
 
-STEP 6: Permanent Failure
+STEP 5: Permanent Failure
 -
 A "permanent failure" has occurred, and the platform cannot recover
 the device.  The platform will call error_detected() with a
@@ -355,6 +349,27 @@ errors. See the discussion in 
powerpc/eeh-pci-error-recovery.txt
 for additional detail on real-life experience of the causes of
 software errors.
 
+STEP 0: Error Event: ERR_FATAL
+---
+PCI bus error is detected by the PCI hardware. On powerpc, the slot is
+isolated, in that all I/O is blocked: all reads return 0xffffffff, all
+writes are ignored.
+
+STEP 1: Remove devices
+
+Platform removes the devices depending on the error agent, it could be
+this port for all subordinates or upstream component (likely downstream
+port)
+
+STEP 2: Reset link
+
+The platform resets the link.  This is a PCI-Express specific step and is
+done whenever a fatal error has been detected that can be "solved" by
+resetting the link.
+
+STEP 3: Re-enumerate the devices
+
+Initiates the re-enumeration.
 
 Conclusion; General Remarks
 ---
-- 
2.7.4



[PATCH v15 9/9] PCI/DPC: Disable ERR_NONFATAL and enable ERR_FATAL for DPC

2018-05-02 Thread Oza Pawandeep
This patch disables ERR_NONFATAL trigger for DPC, so now DPC
handles only ERR_FATAL.

Signed-off-by: Oza Pawandeep 

diff --git a/drivers/pci/pcie/dpc.c b/drivers/pci/pcie/dpc.c
index aed7c9f..6966e00 100644
--- a/drivers/pci/pcie/dpc.c
+++ b/drivers/pci/pcie/dpc.c
@@ -263,7 +263,7 @@ static int dpc_probe(struct pcie_device *dev)
}
}
 
-   ctl = (ctl & 0xfff4) | PCI_EXP_DPC_CTL_EN_NONFATAL | 
PCI_EXP_DPC_CTL_INT_EN;
+   ctl = (ctl & 0xfff4) | PCI_EXP_DPC_CTL_EN_FATAL | 
PCI_EXP_DPC_CTL_INT_EN;
pci_write_config_word(pdev, dpc->cap_pos + PCI_EXP_DPC_CTL, ctl);
 
dev_info(device, "DPC error containment capabilities: Int Msg #%d, 
RPExt%c PoisonedTLP%c SwTrigger%c RP PIO Log %d, DL_ActiveErr%c\n",
@@ -281,7 +281,7 @@ static void dpc_remove(struct pcie_device *dev)
u16 ctl;
 
pci_read_config_word(pdev, dpc->cap_pos + PCI_EXP_DPC_CTL, );
-   ctl &= ~(PCI_EXP_DPC_CTL_EN_NONFATAL | PCI_EXP_DPC_CTL_INT_EN);
+   ctl &= ~(PCI_EXP_DPC_CTL_EN_FATAL | PCI_EXP_DPC_CTL_INT_EN);
pci_write_config_word(pdev, dpc->cap_pos + PCI_EXP_DPC_CTL, ctl);
 }
 
diff --git a/include/uapi/linux/pci_regs.h b/include/uapi/linux/pci_regs.h
index 103ba79..86f1cc2 100644
--- a/include/uapi/linux/pci_regs.h
+++ b/include/uapi/linux/pci_regs.h
@@ -981,6 +981,7 @@
 #define  PCI_EXP_DPC_CAP_DL_ACTIVE 0x1000  /* ERR_COR signal on DL_Active 
supported */
 
 #define PCI_EXP_DPC_CTL6   /* DPC control */
+#define PCI_EXP_DPC_CTL_EN_FATAL   0x0001  /* Enable trigger on 
ERR_FATAL message */
 #define  PCI_EXP_DPC_CTL_EN_NONFATAL   0x0002  /* Enable trigger on 
ERR_NONFATAL message */
 #define  PCI_EXP_DPC_CTL_INT_EN0x0008  /* DPC Interrupt Enable */
 
-- 
2.7.4



[PATCH v15 1/9] PCI: Unify wait for link active into generic PCI

2018-05-02 Thread Oza Pawandeep
Clients such as HP, DPC are using pcie_wait_link_active(), which waits
till the link becomes active or inactive.

Made generic function and moved it to drivers/pci/pci.c

Signed-off-by: Oza Pawandeep 

diff --git a/drivers/pci/hotplug/pciehp_hpc.c b/drivers/pci/hotplug/pciehp_hpc.c
index 18a42f8..e0c2b8e 100644
--- a/drivers/pci/hotplug/pciehp_hpc.c
+++ b/drivers/pci/hotplug/pciehp_hpc.c
@@ -231,25 +231,11 @@ bool pciehp_check_link_active(struct controller *ctrl)
return ret;
 }
 
-static void __pcie_wait_link_active(struct controller *ctrl, bool active)
-{
-   int timeout = 1000;
-
-   if (pciehp_check_link_active(ctrl) == active)
-   return;
-   while (timeout > 0) {
-   msleep(10);
-   timeout -= 10;
-   if (pciehp_check_link_active(ctrl) == active)
-   return;
-   }
-   ctrl_dbg(ctrl, "Data Link Layer Link Active not %s in 1000 msec\n",
-   active ? "set" : "cleared");
-}
-
 static void pcie_wait_link_active(struct controller *ctrl)
 {
-   __pcie_wait_link_active(ctrl, true);
+   struct pci_dev *pdev = ctrl_dev(ctrl);
+
+   pcie_wait_for_link(pdev, true);
 }
 
 static bool pci_bus_check_dev(struct pci_bus *bus, int devfn)
diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
index e597655..2e4d1e4 100644
--- a/drivers/pci/pci.c
+++ b/drivers/pci/pci.c
@@ -4138,6 +4138,35 @@ static int pci_pm_reset(struct pci_dev *dev, int probe)
 
return pci_dev_wait(dev, "PM D3->D0", PCIE_RESET_READY_POLL_MS);
 }
+/**
+ * pcie_wait_for_link - Wait for link till it's active/inactive
+ * @pdev: Bridge device
+ * @active: waiting for active or inactive ?
+ *
+ * Use this to wait till link becomes active or inactive.
+ */
+bool pcie_wait_for_link(struct pci_dev *pdev, bool active)
+{
+   int timeout = 1000;
+   bool ret;
+   u16 lnk_status;
+
+   for (;;) {
+   pcie_capability_read_word(pdev, PCI_EXP_LNKSTA, &lnk_status);
+   ret = !!(lnk_status & PCI_EXP_LNKSTA_DLLLA);
+   if (ret == active)
+   return true;
+   if (timeout <= 0)
+   break;
+   msleep(10);
+   timeout -= 10;
+   }
+
+   pci_info(pdev, "Data Link Layer Link Active not %s in 1000 msec\n",
+active ? "set" : "cleared");
+
+   return false;
+}
 
 void pci_reset_secondary_bus(struct pci_dev *dev)
 {
diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h
index 023f7cf..cec9d8c 100644
--- a/drivers/pci/pci.h
+++ b/drivers/pci/pci.h
@@ -353,6 +353,7 @@ static inline resource_size_t pci_resource_alignment(struct 
pci_dev *dev,
 
 void pci_enable_acs(struct pci_dev *dev);
 
+bool pcie_wait_for_link(struct pci_dev *pdev, bool active);
 #ifdef CONFIG_PCIEASPM
 void pcie_aspm_init_link_state(struct pci_dev *pdev);
 void pcie_aspm_exit_link_state(struct pci_dev *pdev);
diff --git a/drivers/pci/pcie/dpc.c b/drivers/pci/pcie/dpc.c
index 8c57d60..80ec384 100644
--- a/drivers/pci/pcie/dpc.c
+++ b/drivers/pci/pcie/dpc.c
@@ -68,19 +68,9 @@ static int dpc_wait_rp_inactive(struct dpc_dev *dpc)
 
 static void dpc_wait_link_inactive(struct dpc_dev *dpc)
 {
-   unsigned long timeout = jiffies + HZ;
struct pci_dev *pdev = dpc->dev->port;
-   struct device *dev = >dev->device;
-   u16 lnk_status;
 
-   pcie_capability_read_word(pdev, PCI_EXP_LNKSTA, _status);
-   while (lnk_status & PCI_EXP_LNKSTA_DLLLA &&
-   !time_after(jiffies, timeout)) {
-   msleep(10);
-   pcie_capability_read_word(pdev, PCI_EXP_LNKSTA, _status);
-   }
-   if (lnk_status & PCI_EXP_LNKSTA_DLLLA)
-   dev_warn(dev, "Link state not disabled for DPC event\n");
+   pcie_wait_for_link(pdev, false);
 }
 
 static void dpc_work(struct work_struct *work)
-- 
2.7.4



[PATCH v15 1/9] PCI: Unify wait for link active into generic PCI

2018-05-02 Thread Oza Pawandeep
Clients such as HP, DPC are using pcie_wait_link_active(), which waits
till the link becomes active or inactive.

Made generic function and moved it to drivers/pci/pci.c

Signed-off-by: Oza Pawandeep 

diff --git a/drivers/pci/hotplug/pciehp_hpc.c b/drivers/pci/hotplug/pciehp_hpc.c
index 18a42f8..e0c2b8e 100644
--- a/drivers/pci/hotplug/pciehp_hpc.c
+++ b/drivers/pci/hotplug/pciehp_hpc.c
@@ -231,25 +231,11 @@ bool pciehp_check_link_active(struct controller *ctrl)
return ret;
 }
 
-static void __pcie_wait_link_active(struct controller *ctrl, bool active)
-{
-   int timeout = 1000;
-
-   if (pciehp_check_link_active(ctrl) == active)
-   return;
-   while (timeout > 0) {
-   msleep(10);
-   timeout -= 10;
-   if (pciehp_check_link_active(ctrl) == active)
-   return;
-   }
-   ctrl_dbg(ctrl, "Data Link Layer Link Active not %s in 1000 msec\n",
-   active ? "set" : "cleared");
-}
-
 static void pcie_wait_link_active(struct controller *ctrl)
 {
-   __pcie_wait_link_active(ctrl, true);
+   struct pci_dev *pdev = ctrl_dev(ctrl);
+
+   pcie_wait_for_link(pdev, true);
 }
 
 static bool pci_bus_check_dev(struct pci_bus *bus, int devfn)
diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
index e597655..2e4d1e4 100644
--- a/drivers/pci/pci.c
+++ b/drivers/pci/pci.c
@@ -4138,6 +4138,35 @@ static int pci_pm_reset(struct pci_dev *dev, int probe)
 
return pci_dev_wait(dev, "PM D3->D0", PCIE_RESET_READY_POLL_MS);
 }
+/**
+ * pcie_wait_for_link - Wait for link till it's active/inactive
+ * @pdev: Bridge device
+ * @active: waiting for active or inactive ?
+ *
+ * Use this to wait till link becomes active or inactive.
+ */
+bool pcie_wait_for_link(struct pci_dev *pdev, bool active)
+{
+   int timeout = 1000;
+   bool ret;
+   u16 lnk_status;
+
+   for (;;) {
+   pcie_capability_read_word(pdev, PCI_EXP_LNKSTA, &lnk_status);
+   ret = !!(lnk_status & PCI_EXP_LNKSTA_DLLLA);
+   if (ret == active)
+   return true;
+   if (timeout <= 0)
+   break;
+   msleep(10);
+   timeout -= 10;
+   }
+
+   pci_info(pdev, "Data Link Layer Link Active not %s in 1000 msec\n",
+active ? "set" : "cleared");
+
+   return false;
+}
 
 void pci_reset_secondary_bus(struct pci_dev *dev)
 {
diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h
index 023f7cf..cec9d8c 100644
--- a/drivers/pci/pci.h
+++ b/drivers/pci/pci.h
@@ -353,6 +353,7 @@ static inline resource_size_t pci_resource_alignment(struct 
pci_dev *dev,
 
 void pci_enable_acs(struct pci_dev *dev);
 
+bool pcie_wait_for_link(struct pci_dev *pdev, bool active);
 #ifdef CONFIG_PCIEASPM
 void pcie_aspm_init_link_state(struct pci_dev *pdev);
 void pcie_aspm_exit_link_state(struct pci_dev *pdev);
diff --git a/drivers/pci/pcie/dpc.c b/drivers/pci/pcie/dpc.c
index 8c57d60..80ec384 100644
--- a/drivers/pci/pcie/dpc.c
+++ b/drivers/pci/pcie/dpc.c
@@ -68,19 +68,9 @@ static int dpc_wait_rp_inactive(struct dpc_dev *dpc)
 
 static void dpc_wait_link_inactive(struct dpc_dev *dpc)
 {
-   unsigned long timeout = jiffies + HZ;
struct pci_dev *pdev = dpc->dev->port;
-   struct device *dev = >dev->device;
-   u16 lnk_status;
 
-   pcie_capability_read_word(pdev, PCI_EXP_LNKSTA, _status);
-   while (lnk_status & PCI_EXP_LNKSTA_DLLLA &&
-   !time_after(jiffies, timeout)) {
-   msleep(10);
-   pcie_capability_read_word(pdev, PCI_EXP_LNKSTA, _status);
-   }
-   if (lnk_status & PCI_EXP_LNKSTA_DLLLA)
-   dev_warn(dev, "Link state not disabled for DPC event\n");
+   pcie_wait_for_link(pdev, false);
 }
 
 static void dpc_work(struct work_struct *work)
-- 
2.7.4



[PATCH v15 3/9] PCI/AER: Handle ERR_FATAL with removal and re-enumeration of devices

2018-05-02 Thread Oza Pawandeep
This patch alters the behavior of handling of ERR_FATAL, where removal
of devices is initiated, followed by reset link, followed by
re-enumeration.

So the errors are handled in a different way as follows:
ERR_NONFATAL => call driver recovery entry points
ERR_FATAL=> remove and re-enumerate

please refer to Documentation/PCI/pci-error-recovery.txt for more details.

Signed-off-by: Oza Pawandeep 

diff --git a/drivers/pci/pcie/aer/aerdrv.c b/drivers/pci/pcie/aer/aerdrv.c
index 779b387..206f590 100644
--- a/drivers/pci/pcie/aer/aerdrv.c
+++ b/drivers/pci/pcie/aer/aerdrv.c
@@ -330,6 +330,13 @@ static pci_ers_result_t aer_root_reset(struct pci_dev *dev)
reg32 |= ROOT_PORT_INTR_ON_MESG_MASK;
pci_write_config_dword(dev, pos + PCI_ERR_ROOT_COMMAND, reg32);
 
+   /*
+* This function is called only on ERR_FATAL now, and since
+* the pci_report_resume is called only in ERR_NONFATAL case,
+* the clearing part has to be taken care here.
+*/
+   aer_error_resume(dev);
+
return PCI_ERS_RESULT_RECOVERED;
 }
 
diff --git a/drivers/pci/pcie/aer/aerdrv_core.c 
b/drivers/pci/pcie/aer/aerdrv_core.c
index 0ea5acc..655d4e8 100644
--- a/drivers/pci/pcie/aer/aerdrv_core.c
+++ b/drivers/pci/pcie/aer/aerdrv_core.c
@@ -20,6 +20,7 @@
 #include 
 #include 
 #include "aerdrv.h"
+#include "../../pci.h"
 
 #definePCI_EXP_AER_FLAGS   (PCI_EXP_DEVCTL_CERE | 
PCI_EXP_DEVCTL_NFERE | \
 PCI_EXP_DEVCTL_FERE | PCI_EXP_DEVCTL_URRE)
@@ -474,6 +475,44 @@ static pci_ers_result_t reset_link(struct pci_dev *dev)
return status;
 }
 
+static pci_ers_result_t do_fatal_recovery(struct pci_dev *dev, int severity)
+{
+   struct pci_dev *udev;
+   struct pci_bus *parent;
+   struct pci_dev *pdev, *temp;
+   pci_ers_result_t result = PCI_ERS_RESULT_RECOVERED;
+
+   if (severity == AER_FATAL)
+   pci_cleanup_aer_uncorrect_error_status(dev);
+
+   if (dev->hdr_type == PCI_HEADER_TYPE_BRIDGE)
+   udev = dev;
+   else
+   udev = dev->bus->self;
+
+   parent = udev->subordinate;
+   pci_lock_rescan_remove();
+   list_for_each_entry_safe_reverse(pdev, temp, >devices,
+bus_list) {
+   pci_dev_get(pdev);
+   pci_dev_set_disconnected(pdev, NULL);
+   if (pci_has_subordinate(pdev))
+   pci_walk_bus(pdev->subordinate,
+pci_dev_set_disconnected, NULL);
+   pci_stop_and_remove_bus_device(pdev);
+   pci_dev_put(pdev);
+   }
+
+   result = reset_link(udev);
+   if (result == PCI_ERS_RESULT_RECOVERED)
+   if (pcie_wait_for_link(udev, true))
+   pci_rescan_bus(udev->bus);
+
+   pci_unlock_rescan_remove();
+
+   return result;
+}
+
 /**
  * do_recovery - handle nonfatal/fatal error recovery process
  * @dev: pointer to a pci_dev data structure of agent detecting an error
@@ -485,11 +524,15 @@ static pci_ers_result_t reset_link(struct pci_dev *dev)
  */
 static void do_recovery(struct pci_dev *dev, int severity)
 {
-   pci_ers_result_t status, result = PCI_ERS_RESULT_RECOVERED;
+   pci_ers_result_t status;
enum pci_channel_state state;
 
-   if (severity == AER_FATAL)
-   state = pci_channel_io_frozen;
+   if (severity == AER_FATAL) {
+   status = do_fatal_recovery(dev, severity);
+   if (status != PCI_ERS_RESULT_RECOVERED)
+   goto failed;
+   return;
+   }
else
state = pci_channel_io_normal;
 
@@ -498,12 +541,6 @@ static void do_recovery(struct pci_dev *dev, int severity)
"error_detected",
report_error_detected);
 
-   if (severity == AER_FATAL) {
-   result = reset_link(dev);
-   if (result != PCI_ERS_RESULT_RECOVERED)
-   goto failed;
-   }
-
if (status == PCI_ERS_RESULT_CAN_RECOVER)
status = broadcast_error_message(dev,
state,
-- 
2.7.4



[PATCH v15 0/9] Address error and recovery for AER and DPC

2018-05-02 Thread Oza Pawandeep
This patch set brings in error handling support for DPC

The current implementation of AER and error message broadcasting to the
EP driver is tightly coupled and limited to AER service driver.
It is important to factor out broadcasting and other link handling
callbacks. So that not only when AER gets triggered, but also when DPC get
triggered (for e.g. ERR_FATAL), callbacks are handled appropriately.

The goal of the patch-set is:
DPC should handle the error handling and recovery similar to AER, because 
finally both are attempting recovery in some or the other way,
and for that error handling and recovery framework has to be loosely
coupled.

It achieves uniformity and transparency to the error handling agents such
as AER, DPC, with respect to recovery and error handling.

So, this patch-set tries to unify lot of things between error agents and
make them behave in a well defined way. (be it error (FATAL, NON_FATAL)
handling or recovery).

The FATAL error handling is handled with remove/reset_link/re-enumerate
sequence while the NON_FATAL follows the default path.
Documentation/PCI/pci-error-recovery.txt talks more on that.

Changes since v14:
Bjorn's comments addressed
> simplified the patch set, and moved AER_FATAL handling in the beginning.
> rebase the code to 4.17-rc1.
Changes since v13:
Bjorn's comments addressed
> handle FATAL errors with remove devices followed by re-enumeration.
> changes in AER and DPC along with required Documentation.
Changes since v12:
Bjorn's and Keith's Comments addressed.
> Made DPC and AER error handling identical 
> handled cases for hotplug enabled system differently.
Changes since v11:
Bjorn's comments addressed.
> rename pcie-err.c to err.c
> removed EXPORT_SYMBOL
> made generic find_service function in port driver.
> removed mutex patch as no need to have mutex in pcie_do_recovery
> brought in DPC_FATAL in aer.h
> so now all the error codes (AER and DPC) are unified in aer.h
Changes since v10:
Christoph Hellwig's, David Laight's and Randy Dunlap's
comments addressed.
> renamed pci_do_recovery to pcie_do_recovery
> removed inner braces in conditional statements.
> restructured the code in pci_wait_for_link
> EXPORT_SYMBOL_GPL
Changes since v9:
Sinan's comments addressed.
> bool active = true; unnecessary variable removed.
Changes since v8:
Fixed Kbuild errors.
Changes since v7:
Rebased the code on pci master
> https://kernel.googlesource.com/pub/scm/linux/kernel/git/helgaas/pci
Changes since v6:
Sinan's and Stefan's comments implemented.
> reordered patch 6 and 7
> cleaned up
Changes since v5:
Sinan's and Keith's comments incorporated.
> made separate patch for mutex
> unified error reporting codes into driver/pci/pci.h
> got rid of wait link active/inactive and
  made generic function in driver/pci/pci.c
Changes since v4:
Bjorn's comments incorporated.
> Renamed only do_recovery.
> moved the things more locally to drivers/pci/pci.h
Changes since v3:
Bjorn's comments incorporated.
> Made separate patch renaming generic pci_err.c
> Introduce pci_err.h to contain all the error types and recovery
> removed all the dependencies on pci.h
Changes since v2:
Based on feedback from Keith:
"
When DPC is triggered due to receipt of an uncorrectable error Message,
the Requester ID from the Message is recorded in the DPC Error
Source ID register and that Message is discarded and not forwarded Upstream.
"
Removed the patch where AER checks if DPC service is active
Changes since v1:
Kbuild errors fixed:
> pci_find_dpc_dev made static
> ras_event.h updated
> pci_find_aer_service call with CONFIG check
> pci_find_dpc_service call with CONFIG check

Oza Pawandeep (9):
  PCI: Unify wait for link active into generic PCI
  pci-error-recovery: Add AER_FATAL handling
  PCI/AER: Handle ERR_FATAL with removal and re-enumeration of devices
  PCI/AER: Rename error recovery to generic PCI naming
  PCI/AER: Factor out error reporting from AER
  PCI/PORTDRV: Implement generic find service
  PCI/PORTDRV: Implement generic find device
  PCI/DPC: Unify and plumb error handling into DPC
  PCI/DPC: Disable ERR_NONFATAL and enable ERR_FATAL for DPC

 Documentation/PCI/pci-error-recovery.txt |  35 ++-
 drivers/pci/hotplug/pciehp_hpc.c |  20 +-
 drivers/pci/pci.c|  29 +++
 drivers/pci/pci.h|   4 +
 drivers/pci/pcie/Makefile|   2 +-
 drivers/pci/pcie/aer/aerdrv.c|   2 +
 drivers/pci/pcie/aer/aerdrv.h|  30 ---
 drivers/pci/pcie/aer/aerdrv_core.c   | 317 +-
 drivers/pci/pcie/dpc.c   |  58 +++--
 drivers/pci/pcie/err.c   | 374 +++
 

[PATCH v15 3/9] PCI/AER: Handle ERR_FATAL with removal and re-enumeration of devices

2018-05-02 Thread Oza Pawandeep
This patch alters the behavior of handling of ERR_FATAL, where removal
of devices is initiated, followed by reset link, followed by
re-enumeration.

So the errors are handled in a different way as follows:
ERR_NONFATAL => call driver recovery entry points
ERR_FATAL    => remove and re-enumerate

please refer to Documentation/PCI/pci-error-recovery.txt for more details.

Signed-off-by: Oza Pawandeep 

diff --git a/drivers/pci/pcie/aer/aerdrv.c b/drivers/pci/pcie/aer/aerdrv.c
index 779b387..206f590 100644
--- a/drivers/pci/pcie/aer/aerdrv.c
+++ b/drivers/pci/pcie/aer/aerdrv.c
@@ -330,6 +330,13 @@ static pci_ers_result_t aer_root_reset(struct pci_dev *dev)
reg32 |= ROOT_PORT_INTR_ON_MESG_MASK;
pci_write_config_dword(dev, pos + PCI_ERR_ROOT_COMMAND, reg32);
 
+   /*
+* This function is called only on ERR_FATAL now, and since
+* the pci_report_resume is called only in ERR_NONFATAL case,
+* the clearing part has to be taken care here.
+*/
+   aer_error_resume(dev);
+
return PCI_ERS_RESULT_RECOVERED;
 }
 
diff --git a/drivers/pci/pcie/aer/aerdrv_core.c 
b/drivers/pci/pcie/aer/aerdrv_core.c
index 0ea5acc..655d4e8 100644
--- a/drivers/pci/pcie/aer/aerdrv_core.c
+++ b/drivers/pci/pcie/aer/aerdrv_core.c
@@ -20,6 +20,7 @@
 #include 
 #include 
 #include "aerdrv.h"
+#include "../../pci.h"
 
 #definePCI_EXP_AER_FLAGS   (PCI_EXP_DEVCTL_CERE | 
PCI_EXP_DEVCTL_NFERE | \
 PCI_EXP_DEVCTL_FERE | PCI_EXP_DEVCTL_URRE)
@@ -474,6 +475,44 @@ static pci_ers_result_t reset_link(struct pci_dev *dev)
return status;
 }
 
+static pci_ers_result_t do_fatal_recovery(struct pci_dev *dev, int severity)
+{
+   struct pci_dev *udev;
+   struct pci_bus *parent;
+   struct pci_dev *pdev, *temp;
+   pci_ers_result_t result = PCI_ERS_RESULT_RECOVERED;
+
+   if (severity == AER_FATAL)
+   pci_cleanup_aer_uncorrect_error_status(dev);
+
+   if (dev->hdr_type == PCI_HEADER_TYPE_BRIDGE)
+   udev = dev;
+   else
+   udev = dev->bus->self;
+
+   parent = udev->subordinate;
+   pci_lock_rescan_remove();
+   list_for_each_entry_safe_reverse(pdev, temp, &parent->devices,
+bus_list) {
+   pci_dev_get(pdev);
+   pci_dev_set_disconnected(pdev, NULL);
+   if (pci_has_subordinate(pdev))
+   pci_walk_bus(pdev->subordinate,
+pci_dev_set_disconnected, NULL);
+   pci_stop_and_remove_bus_device(pdev);
+   pci_dev_put(pdev);
+   }
+
+   result = reset_link(udev);
+   if (result == PCI_ERS_RESULT_RECOVERED)
+   if (pcie_wait_for_link(udev, true))
+   pci_rescan_bus(udev->bus);
+
+   pci_unlock_rescan_remove();
+
+   return result;
+}
+
 /**
  * do_recovery - handle nonfatal/fatal error recovery process
  * @dev: pointer to a pci_dev data structure of agent detecting an error
@@ -485,11 +524,15 @@ static pci_ers_result_t reset_link(struct pci_dev *dev)
  */
 static void do_recovery(struct pci_dev *dev, int severity)
 {
-   pci_ers_result_t status, result = PCI_ERS_RESULT_RECOVERED;
+   pci_ers_result_t status;
enum pci_channel_state state;
 
-   if (severity == AER_FATAL)
-   state = pci_channel_io_frozen;
+   if (severity == AER_FATAL) {
+   status = do_fatal_recovery(dev, severity);
+   if (status != PCI_ERS_RESULT_RECOVERED)
+   goto failed;
+   return;
+   }
else
state = pci_channel_io_normal;
 
@@ -498,12 +541,6 @@ static void do_recovery(struct pci_dev *dev, int severity)
"error_detected",
report_error_detected);
 
-   if (severity == AER_FATAL) {
-   result = reset_link(dev);
-   if (result != PCI_ERS_RESULT_RECOVERED)
-   goto failed;
-   }
-
if (status == PCI_ERS_RESULT_CAN_RECOVER)
status = broadcast_error_message(dev,
state,
-- 
2.7.4



[PATCH v15 0/9] Address error and recovery for AER and DPC

2018-05-02 Thread Oza Pawandeep
This patch set brings in error handling support for DPC

The current implementation of AER and error message broadcasting to the
EP driver is tightly coupled and limited to AER service driver.
It is important to factor out broadcasting and other link handling
callbacks. So that not only when AER gets triggered, but also when DPC get
triggered (for e.g. ERR_FATAL), callbacks are handled appropriately.

The goal of the patch-set is:
DPC should handle the error handling and recovery similar to AER, because 
finally both are attempting recovery in some or the other way,
and for that error handling and recovery framework has to be loosely
coupled.

It achieves uniformity and transparency to the error handling agents such
as AER, DPC, with respect to recovery and error handling.

So, this patch-set tries to unify lot of things between error agents and
make them behave in a well defined way. (be it error (FATAL, NON_FATAL)
handling or recovery).

The FATAL error handling is handled with remove/reset_link/re-enumerate
sequence while the NON_FATAL follows the default path.
Documentation/PCI/pci-error-recovery.txt talks more on that.

Changes since v14:
Bjorn's comments addressed
> simplified the patch set, and moved AER_FATAL handling in the beginning.
> rebase the code to 4.17-rc1.
Changes since v13:
Bjorn's comments addressed
> handle FATAL errors with remove devices followed by re-enumeration.
> changes in AER and DPC along with required Documentation.
Changes since v12:
Bjorn's and Keith's Comments addressed.
> Made DPC and AER error handling identical 
> handled cases for hotplug enabled system differently.
Changes since v11:
Bjorn's comments addressed.
> rename pcie-err.c to err.c
> removed EXPORT_SYMBOL
> made generic find_service function in port driver.
> removed mutex patch as no need to have mutex in pcie_do_recovery
> brought in DPC_FATAL in aer.h
> so now all the error codes (AER and DPC) are unified in aer.h
Changes since v10:
Christoph Hellwig's, David Laight's and Randy Dunlap's
comments addressed.
> renamed pci_do_recovery to pcie_do_recovery
> removed inner braces in conditional statements.
> restructured the code in pci_wait_for_link
> EXPORT_SYMBOL_GPL
Changes since v9:
Sinan's comments addressed.
> bool active = true; unnecessary variable removed.
Changes since v8:
Fixed Kbuild errors.
Changes since v7:
Rebased the code on pci master
> https://kernel.googlesource.com/pub/scm/linux/kernel/git/helgaas/pci
Changes since v6:
Sinan's and Stefan's comments implemented.
> reordered patch 6 and 7
> cleaned up
Changes since v5:
Sinan's and Keith's comments incorporated.
> made separate patch for mutex
> unified error reporting codes into driver/pci/pci.h
> got rid of wait link active/inactive and
  made generic function in driver/pci/pci.c
Changes since v4:
Bjorn's comments incorporated.
> Renamed only do_recovery.
> moved the things more locally to drivers/pci/pci.h
Changes since v3:
Bjorn's comments incorporated.
> Made separate patch renaming generic pci_err.c
> Introduce pci_err.h to contain all the error types and recovery
> removed all the dependencies on pci.h
Changes since v2:
Based on feedback from Keith:
"
When DPC is triggered due to receipt of an uncorrectable error Message,
the Requester ID from the Message is recorded in the DPC Error
Source ID register and that Message is discarded and not forwarded Upstream.
"
Removed the patch where AER checks if DPC service is active
Changes since v1:
Kbuild errors fixed:
> pci_find_dpc_dev made static
> ras_event.h updated
> pci_find_aer_service call with CONFIG check
> pci_find_dpc_service call with CONFIG check

Oza Pawandeep (9):
  PCI: Unify wait for link active into generic PCI
  pci-error-recovery: Add AER_FATAL handling
  PCI/AER: Handle ERR_FATAL with removal and re-enumeration of devices
  PCI/AER: Rename error recovery to generic PCI naming
  PCI/AER: Factor out error reporting from AER
  PCI/PORTDRV: Implement generic find service
  PCI/PORTDRV: Implement generic find device
  PCI/DPC: Unify and plumb error handling into DPC
  PCI/DPC: Disable ERR_NONFATAL and enable ERR_FATAL for DPC

 Documentation/PCI/pci-error-recovery.txt |  35 ++-
 drivers/pci/hotplug/pciehp_hpc.c |  20 +-
 drivers/pci/pci.c|  29 +++
 drivers/pci/pci.h|   4 +
 drivers/pci/pcie/Makefile|   2 +-
 drivers/pci/pcie/aer/aerdrv.c|   2 +
 drivers/pci/pcie/aer/aerdrv.h|  30 ---
 drivers/pci/pcie/aer/aerdrv_core.c   | 317 +-
 drivers/pci/pcie/dpc.c   |  58 +++--
 drivers/pci/pcie/err.c   | 374 +++
 

[PATCH v15 4/9] PCI/AER: Rename error recovery to generic PCI naming

2018-05-02 Thread Oza Pawandeep
This patch renames error recovery to generic name with pcie prefix

Signed-off-by: Oza Pawandeep 
Reviewed-by: Keith Busch 

diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h
index cec9d8c..22a9589 100644
--- a/drivers/pci/pci.h
+++ b/drivers/pci/pci.h
@@ -353,6 +353,9 @@ static inline resource_size_t pci_resource_alignment(struct 
pci_dev *dev,
 
 void pci_enable_acs(struct pci_dev *dev);
 
+/* PCI error reporting and recovery */
+void pcie_do_recovery(struct pci_dev *dev, int severity);
+
 bool pcie_wait_for_link(struct pci_dev *pdev, bool active);
 #ifdef CONFIG_PCIEASPM
 void pcie_aspm_init_link_state(struct pci_dev *pdev);
diff --git a/drivers/pci/pcie/aer/aerdrv_core.c 
b/drivers/pci/pcie/aer/aerdrv_core.c
index 655d4e8..be4ee3b 100644
--- a/drivers/pci/pcie/aer/aerdrv_core.c
+++ b/drivers/pci/pcie/aer/aerdrv_core.c
@@ -475,7 +475,7 @@ static pci_ers_result_t reset_link(struct pci_dev *dev)
return status;
 }
 
-static pci_ers_result_t do_fatal_recovery(struct pci_dev *dev, int severity)
+static pci_ers_result_t pcie_do_fatal_recovery(struct pci_dev *dev, int 
severity)
 {
struct pci_dev *udev;
struct pci_bus *parent;
@@ -514,7 +514,7 @@ static pci_ers_result_t do_fatal_recovery(struct pci_dev 
*dev, int severity)
 }
 
 /**
- * do_recovery - handle nonfatal/fatal error recovery process
+ * pcie_do_recovery - handle nonfatal/fatal error recovery process
  * @dev: pointer to a pci_dev data structure of agent detecting an error
  * @severity: error severity type
  *
@@ -522,13 +522,13 @@ static pci_ers_result_t do_fatal_recovery(struct pci_dev 
*dev, int severity)
  * error detected message to all downstream drivers within a hierarchy in
  * question and return the returned code.
  */
-static void do_recovery(struct pci_dev *dev, int severity)
+void pcie_do_recovery(struct pci_dev *dev, int severity)
 {
pci_ers_result_t status;
enum pci_channel_state state;
 
if (severity == AER_FATAL) {
-   status = do_fatal_recovery(dev, severity);
+   status = pcie_do_fatal_recovery(dev, severity);
if (status != PCI_ERS_RESULT_RECOVERED)
goto failed;
return;
@@ -600,7 +600,7 @@ static void handle_error_source(struct pcie_device *aerdev,
pci_write_config_dword(dev, pos + PCI_ERR_COR_STATUS,
info->status);
} else
-   do_recovery(dev, info->severity);
+   pcie_do_recovery(dev, info->severity);
 }
 
 #ifdef CONFIG_ACPI_APEI_PCIEAER
@@ -665,7 +665,7 @@ static void aer_recover_work_func(struct work_struct *work)
}
cper_print_aer(pdev, entry.severity, entry.regs);
if (entry.severity != AER_CORRECTABLE)
-   do_recovery(pdev, entry.severity);
+   pcie_do_recovery(pdev, entry.severity);
pci_dev_put(pdev);
}
 }
-- 
2.7.4



[PATCH v15 4/9] PCI/AER: Rename error recovery to generic PCI naming

2018-05-02 Thread Oza Pawandeep
This patch renames error recovery to generic name with pcie prefix

Signed-off-by: Oza Pawandeep 
Reviewed-by: Keith Busch 

diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h
index cec9d8c..22a9589 100644
--- a/drivers/pci/pci.h
+++ b/drivers/pci/pci.h
@@ -353,6 +353,9 @@ static inline resource_size_t pci_resource_alignment(struct 
pci_dev *dev,
 
 void pci_enable_acs(struct pci_dev *dev);
 
+/* PCI error reporting and recovery */
+void pcie_do_recovery(struct pci_dev *dev, int severity);
+
 bool pcie_wait_for_link(struct pci_dev *pdev, bool active);
 #ifdef CONFIG_PCIEASPM
 void pcie_aspm_init_link_state(struct pci_dev *pdev);
diff --git a/drivers/pci/pcie/aer/aerdrv_core.c 
b/drivers/pci/pcie/aer/aerdrv_core.c
index 655d4e8..be4ee3b 100644
--- a/drivers/pci/pcie/aer/aerdrv_core.c
+++ b/drivers/pci/pcie/aer/aerdrv_core.c
@@ -475,7 +475,7 @@ static pci_ers_result_t reset_link(struct pci_dev *dev)
return status;
 }
 
-static pci_ers_result_t do_fatal_recovery(struct pci_dev *dev, int severity)
+static pci_ers_result_t pcie_do_fatal_recovery(struct pci_dev *dev, int 
severity)
 {
struct pci_dev *udev;
struct pci_bus *parent;
@@ -514,7 +514,7 @@ static pci_ers_result_t do_fatal_recovery(struct pci_dev 
*dev, int severity)
 }
 
 /**
- * do_recovery - handle nonfatal/fatal error recovery process
+ * pcie_do_recovery - handle nonfatal/fatal error recovery process
  * @dev: pointer to a pci_dev data structure of agent detecting an error
  * @severity: error severity type
  *
@@ -522,13 +522,13 @@ static pci_ers_result_t do_fatal_recovery(struct pci_dev 
*dev, int severity)
  * error detected message to all downstream drivers within a hierarchy in
  * question and return the returned code.
  */
-static void do_recovery(struct pci_dev *dev, int severity)
+void pcie_do_recovery(struct pci_dev *dev, int severity)
 {
pci_ers_result_t status;
enum pci_channel_state state;
 
if (severity == AER_FATAL) {
-   status = do_fatal_recovery(dev, severity);
+   status = pcie_do_fatal_recovery(dev, severity);
if (status != PCI_ERS_RESULT_RECOVERED)
goto failed;
return;
@@ -600,7 +600,7 @@ static void handle_error_source(struct pcie_device *aerdev,
pci_write_config_dword(dev, pos + PCI_ERR_COR_STATUS,
info->status);
} else
-   do_recovery(dev, info->severity);
+   pcie_do_recovery(dev, info->severity);
 }
 
 #ifdef CONFIG_ACPI_APEI_PCIEAER
@@ -665,7 +665,7 @@ static void aer_recover_work_func(struct work_struct *work)
}
cper_print_aer(pdev, entry.severity, entry.regs);
if (entry.severity != AER_CORRECTABLE)
-   do_recovery(pdev, entry.severity);
+   pcie_do_recovery(pdev, entry.severity);
pci_dev_put(pdev);
}
 }
-- 
2.7.4



Re: [v2 PATCH 1/1] tg3: fix meaningless hw_stats reading after tg3_halt memset 0 hw_stats

2018-05-02 Thread Michael Chan
On Wed, May 2, 2018 at 5:30 PM, Zumeng Chen  wrote:
> On 2018年05月03日 01:32, Michael Chan wrote:
>>
>> On Wed, May 2, 2018 at 3:27 AM, Zumeng Chen  wrote:
>>>
>>> On 2018年05月02日 13:12, Michael Chan wrote:

 On Tue, May 1, 2018 at 5:42 PM, Zumeng Chen 
 wrote:

> diff --git a/drivers/net/ethernet/broadcom/tg3.h
> b/drivers/net/ethernet/broadcom/tg3.h
> index 3b5e98e..c61d83c 100644
> --- a/drivers/net/ethernet/broadcom/tg3.h
> +++ b/drivers/net/ethernet/broadcom/tg3.h
> @@ -3102,6 +3102,7 @@ enum TG3_FLAGS {
>   TG3_FLAG_ROBOSWITCH,
>   TG3_FLAG_ONE_DMA_AT_ONCE,
>   TG3_FLAG_RGMII_MODE,
> +   TG3_FLAG_HALT,

 I think you should be able to use the existing INIT_COMPLETE flag
>>>
>>>
>>> No,  it will bring the uncertain factors into the existed complicate
>>> logic
>>> of INIT_COMPLETE.
>>> And I think it's very simple logic here to fix the meaningless hw_stats
>>> reading and the problem
>>> of commit f5992b72. I even suspect if you have read INIT_COMPLETE related
>>> codes carefully.
>>>
>> We should use an existing flag whenever appropriate
>
>
> I disagree. This is sort of blahblah...
>>

I don't want to see another flag added that is practically the same as
!INIT_COMPLETE.  The driver already has close to one hundred flags.
Adding a new flag that is similar to an existing flag will just make
the code more difficult to understand and maintain.

If you don't want to fix it the cleaner way, Siva or I will fix it.


Re: [v2 PATCH 1/1] tg3: fix meaningless hw_stats reading after tg3_halt memset 0 hw_stats

2018-05-02 Thread Michael Chan
On Wed, May 2, 2018 at 5:30 PM, Zumeng Chen  wrote:
> On 2018年05月03日 01:32, Michael Chan wrote:
>>
>> On Wed, May 2, 2018 at 3:27 AM, Zumeng Chen  wrote:
>>>
>>> On 2018年05月02日 13:12, Michael Chan wrote:

 On Tue, May 1, 2018 at 5:42 PM, Zumeng Chen 
 wrote:

> diff --git a/drivers/net/ethernet/broadcom/tg3.h
> b/drivers/net/ethernet/broadcom/tg3.h
> index 3b5e98e..c61d83c 100644
> --- a/drivers/net/ethernet/broadcom/tg3.h
> +++ b/drivers/net/ethernet/broadcom/tg3.h
> @@ -3102,6 +3102,7 @@ enum TG3_FLAGS {
>   TG3_FLAG_ROBOSWITCH,
>   TG3_FLAG_ONE_DMA_AT_ONCE,
>   TG3_FLAG_RGMII_MODE,
> +   TG3_FLAG_HALT,

 I think you should be able to use the existing INIT_COMPLETE flag
>>>
>>>
>>> No,  it will bring the uncertain factors into the existed complicate
>>> logic
>>> of INIT_COMPLETE.
>>> And I think it's very simple logic here to fix the meaningless hw_stats
>>> reading and the problem
>>> of commit f5992b72. I even suspect if you have read INIT_COMPLETE related
>>> codes carefully.
>>>
>> We should use an existing flag whenever appropriate
>
>
> I disagree. This is sort of blahblah...
>>

I don't want to see another flag added that is practically the same as
!INIT_COMPLETE.  The driver already has close to one hundred flags.
Adding a new flag that is similar to an existing flag will just make
the code more difficult to understand and maintain.

If you don't want to fix it the cleaner way, Siva or I will fix it.


Re: [PATCH v2 4/4] vsprintf: Add command line option debug_early_boot

2018-05-02 Thread Kees Cook
On Wed, May 2, 2018 at 3:50 PM, Tobin C. Harding  wrote:
> Currently printing [hashed] pointers requires either a hw RNG or enough
> entropy to be available.  Early in the boot sequence these conditions
> may not be met resulting in a dummy string '(ptrval)' being
> printed.  This makes debugging the early boot sequence difficult.  We
> can relax the requirement to use cryptographically secure hashing during
> debugging.  This enables debugging while keeping development/production
> kernel behaviour the same.
>
> If new command line option debug_early_boot is enabled use
> cryptographically insecure hashing and hash pointer value immediately.
>
> Signed-off-by: Tobin C. Harding 
> ---
>  Documentation/admin-guide/kernel-parameters.txt |  8 
>  lib/vsprintf.c  | 18 ++
>  2 files changed, 26 insertions(+)
>
> diff --git a/Documentation/admin-guide/kernel-parameters.txt 
> b/Documentation/admin-guide/kernel-parameters.txt
> index b8d1379aa039..ab619c4ccbf2 100644
> --- a/Documentation/admin-guide/kernel-parameters.txt
> +++ b/Documentation/admin-guide/kernel-parameters.txt
> @@ -748,6 +748,14 @@
>
> debug   [KNL] Enable kernel debugging (events log level).
>
> +   debug_early_boot
> +   [KNL] Enable debugging early in the boot sequence.  If
> +   enabled, we use a weak hash instead of siphash to hash
> +   pointers.  Use this option if you need to see pointer
> +   values during early boot (i.e you are seeing instances
> +   of '(___ptrval___)') - cryptographically insecure,
> +   please do not use on production kernels.
> +
> debug_locks_verbose=
> [KNL] verbose self-tests
> Format=<0|1>
> diff --git a/lib/vsprintf.c b/lib/vsprintf.c
> index 3697a19c2b25..6c139b442267 100644
> --- a/lib/vsprintf.c
> +++ b/lib/vsprintf.c
> @@ -1654,6 +1654,18 @@ char *device_node_string(char *buf, char *end, struct 
> device_node *dn,
> return widen_string(buf, buf - buf_start, end, spec);
>  }
>
> +/* Make pointers available for printing early in the boot sequence. */
> +static int debug_early_boot;

Please make this __ro_after_init too.

-Kees

> +EXPORT_SYMBOL(debug_early_boot);
> +
> +static int __init debug_early_boot_enable(char *str)
> +{
> +   debug_early_boot = 1;
> +   pr_info("debug_early_boot enabled\n");
> +   return 0;
> +}
> +early_param("debug_early_boot", debug_early_boot_enable);
> +
>  static bool have_filled_random_ptr_key __read_mostly;
>  static siphash_key_t ptr_key __read_mostly;
>
> @@ -1707,6 +1719,12 @@ static char *ptr_to_id(char *buf, char *end, void 
> *ptr, struct printf_spec spec)
> const char *str = sizeof(ptr) == 8 ? "(ptrval)" : "(ptrval)";
> unsigned long hashval;
>
> +   /* When debugging early boot use non-cryptographically secure hash */
> +   if (unlikely(debug_early_boot)) {
> +   hashval = hash_long((unsigned long)ptr, 32);
> +   return pointer_string(buf, end, (const void *)hashval, spec);
> +   }
> +
> if (unlikely(!have_filled_random_ptr_key)) {
> spec.field_width = 2 * sizeof(ptr);
> /* string length must be less than default_width */
> --
> 2.7.4
>



-- 
Kees Cook
Pixel Security


Re: [PATCH v2 4/4] vsprintf: Add command line option debug_early_boot

2018-05-02 Thread Kees Cook
On Wed, May 2, 2018 at 3:50 PM, Tobin C. Harding  wrote:
> Currently printing [hashed] pointers requires either a hw RNG or enough
> entropy to be available.  Early in the boot sequence these conditions
> may not be met resulting in a dummy string '(ptrval)' being
> printed.  This makes debugging the early boot sequence difficult.  We
> can relax the requirement to use cryptographically secure hashing during
> debugging.  This enables debugging while keeping development/production
> kernel behaviour the same.
>
> If new command line option debug_early_boot is enabled use
> cryptographically insecure hashing and hash pointer value immediately.
>
> Signed-off-by: Tobin C. Harding 
> ---
>  Documentation/admin-guide/kernel-parameters.txt |  8 
>  lib/vsprintf.c  | 18 ++
>  2 files changed, 26 insertions(+)
>
> diff --git a/Documentation/admin-guide/kernel-parameters.txt 
> b/Documentation/admin-guide/kernel-parameters.txt
> index b8d1379aa039..ab619c4ccbf2 100644
> --- a/Documentation/admin-guide/kernel-parameters.txt
> +++ b/Documentation/admin-guide/kernel-parameters.txt
> @@ -748,6 +748,14 @@
>
> debug   [KNL] Enable kernel debugging (events log level).
>
> +   debug_early_boot
> +   [KNL] Enable debugging early in the boot sequence.  If
> +   enabled, we use a weak hash instead of siphash to hash
> +   pointers.  Use this option if you need to see pointer
> +   values during early boot (i.e you are seeing instances
> +   of '(___ptrval___)') - cryptographically insecure,
> +   please do not use on production kernels.
> +
> debug_locks_verbose=
> [KNL] verbose self-tests
> Format=<0|1>
> diff --git a/lib/vsprintf.c b/lib/vsprintf.c
> index 3697a19c2b25..6c139b442267 100644
> --- a/lib/vsprintf.c
> +++ b/lib/vsprintf.c
> @@ -1654,6 +1654,18 @@ char *device_node_string(char *buf, char *end, struct 
> device_node *dn,
> return widen_string(buf, buf - buf_start, end, spec);
>  }
>
> +/* Make pointers available for printing early in the boot sequence. */
> +static int debug_early_boot;

Please make this __ro_after_init too.

-Kees

> +EXPORT_SYMBOL(debug_early_boot);
> +
> +static int __init debug_early_boot_enable(char *str)
> +{
> +   debug_early_boot = 1;
> +   pr_info("debug_early_boot enabled\n");
> +   return 0;
> +}
> +early_param("debug_early_boot", debug_early_boot_enable);
> +
>  static bool have_filled_random_ptr_key __read_mostly;
>  static siphash_key_t ptr_key __read_mostly;
>
> @@ -1707,6 +1719,12 @@ static char *ptr_to_id(char *buf, char *end, void 
> *ptr, struct printf_spec spec)
> const char *str = sizeof(ptr) == 8 ? "(ptrval)" : "(ptrval)";
> unsigned long hashval;
>
> +   /* When debugging early boot use non-cryptographically secure hash */
> +   if (unlikely(debug_early_boot)) {
> +   hashval = hash_long((unsigned long)ptr, 32);
> +   return pointer_string(buf, end, (const void *)hashval, spec);
> +   }
> +
> if (unlikely(!have_filled_random_ptr_key)) {
> spec.field_width = 2 * sizeof(ptr);
> /* string length must be less than default_width */
> --
> 2.7.4
>



-- 
Kees Cook
Pixel Security


Re: [PATCH] kernel/exit.c: pointer sighand could be uninitialized

2018-05-02 Thread Kees Cook
On Wed, May 2, 2018 at 6:48 PM, Yizhuo Zhai  wrote:
> Variable 'sighand' could be uninitialized if probe_kernel_address fails
> (-EFAULT). The later use in the if statement may lead to undefined behavior.
>
> Signed-off-by: yzhai...@ucr.edu 
> ---
>  kernel/exit.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
>
> diff --git a/kernel/exit.c b/kernel/exit.c
> index f6cad39..a353bd1 100644
> --- a/kernel/exit.c
> +++ b/kernel/exit.c
> @@ -232,7 +232,7 @@ void release_task(struct task_struct *p)
>   */
>  struct task_struct *task_rcu_dereference(struct task_struct **ptask)
>  {
> - struct sighand_struct *sighand;
> + struct sighand_struct *sighand = NULL;
>   struct task_struct *task;
>
>   /*

Better would probably be to check the return of probe_kernel_address()
and take appropriate action...

-Kees

-- 
Kees Cook
Pixel Security


Re: [PATCH] kernel/exit.c: pointer sighand could be uninitialized

2018-05-02 Thread Kees Cook
On Wed, May 2, 2018 at 6:48 PM, Yizhuo Zhai  wrote:
> Variable 'sighand' could be uninitialized if probe_kernel_address fails
> (-EFAULT). The later use in the if statement may lead to undefined behavior.
>
> Signed-off-by: yzhai...@ucr.edu 
> ---
>  kernel/exit.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
>
> diff --git a/kernel/exit.c b/kernel/exit.c
> index f6cad39..a353bd1 100644
> --- a/kernel/exit.c
> +++ b/kernel/exit.c
> @@ -232,7 +232,7 @@ void release_task(struct task_struct *p)
>   */
>  struct task_struct *task_rcu_dereference(struct task_struct **ptask)
>  {
> - struct sighand_struct *sighand;
> + struct sighand_struct *sighand = NULL;
>   struct task_struct *task;
>
>   /*

Better would probably be to check the return of probe_kernel_address()
and take appropriate action...

-Kees

-- 
Kees Cook
Pixel Security


[PATCH 1/3] staging: Android: vsoc: Create wc kernel mapping for region shm.

2018-05-02 Thread Alistair Strachan
Map the region shm as write-combining instead of uncachable.

Cc: Greg Kroah-Hartman 
Cc: Arve Hjønnevåg 
Cc: Todd Kjos 
Cc: Martijn Coenen 
Cc: Greg Hartman 
Cc: de...@driverdev.osuosl.org
Cc: kernel-t...@android.com
Signed-off-by: Alistair Strachan 
---
 drivers/staging/android/TODO   | 1 -
 drivers/staging/android/vsoc.c | 4 +---
 2 files changed, 1 insertion(+), 4 deletions(-)

diff --git a/drivers/staging/android/TODO b/drivers/staging/android/TODO
index 2ea6f97b8f0f..ebd6ba3ae02e 100644
--- a/drivers/staging/android/TODO
+++ b/drivers/staging/android/TODO
@@ -18,7 +18,6 @@ vsoc.c, uapi/vsoc_shm.h
waiting threads. We should eventually use multiple queues and select the
queue based on the region.
  - Add debugfs support for examining the permissions of regions.
- - Use ioremap_wc instead of ioremap_nocache.
  - Remove VSOC_WAIT_FOR_INCOMING_INTERRUPT ioctl. This functionality has been
superseded by the futex and is there for legacy reasons.
 
diff --git a/drivers/staging/android/vsoc.c b/drivers/staging/android/vsoc.c
index 587c66d709b9..794137b7751f 100644
--- a/drivers/staging/android/vsoc.c
+++ b/drivers/staging/android/vsoc.c
@@ -802,9 +802,7 @@ static int vsoc_probe_device(struct pci_dev *pdev,
 
dev_info(&pdev->dev, "shared memory @ DMA %p size=0x%zx\n",
 (void *)vsoc_dev.shm_phys_start, vsoc_dev.shm_size);
-   /* TODO(ghartman): ioremap_wc should work here */
-   vsoc_dev.kernel_mapped_shm = ioremap_nocache(
-   vsoc_dev.shm_phys_start, vsoc_dev.shm_size);
+   vsoc_dev.kernel_mapped_shm = pci_iomap_wc(pdev, SHARED_MEMORY_BAR, 0);
if (!vsoc_dev.kernel_mapped_shm) {
dev_err(&vsoc_dev.dev->dev, "cannot iomap region\n");
vsoc_remove_device(pdev);


[PATCH 1/3] staging: Android: vsoc: Create wc kernel mapping for region shm.

2018-05-02 Thread Alistair Strachan
Map the region shm as write-combining instead of uncacheable.

Cc: Greg Kroah-Hartman 
Cc: Arve Hjønnevåg 
Cc: Todd Kjos 
Cc: Martijn Coenen 
Cc: Greg Hartman 
Cc: de...@driverdev.osuosl.org
Cc: kernel-t...@android.com
Signed-off-by: Alistair Strachan 
---
 drivers/staging/android/TODO   | 1 -
 drivers/staging/android/vsoc.c | 4 +---
 2 files changed, 1 insertion(+), 4 deletions(-)

diff --git a/drivers/staging/android/TODO b/drivers/staging/android/TODO
index 2ea6f97b8f0f..ebd6ba3ae02e 100644
--- a/drivers/staging/android/TODO
+++ b/drivers/staging/android/TODO
@@ -18,7 +18,6 @@ vsoc.c, uapi/vsoc_shm.h
waiting threads. We should eventually use multiple queues and select the
queue based on the region.
  - Add debugfs support for examining the permissions of regions.
- - Use ioremap_wc instead of ioremap_nocache.
  - Remove VSOC_WAIT_FOR_INCOMING_INTERRUPT ioctl. This functionality has been
superseded by the futex and is there for legacy reasons.
 
diff --git a/drivers/staging/android/vsoc.c b/drivers/staging/android/vsoc.c
index 587c66d709b9..794137b7751f 100644
--- a/drivers/staging/android/vsoc.c
+++ b/drivers/staging/android/vsoc.c
@@ -802,9 +802,7 @@ static int vsoc_probe_device(struct pci_dev *pdev,
 
dev_info(&pdev->dev, "shared memory @ DMA %p size=0x%zx\n",
 (void *)vsoc_dev.shm_phys_start, vsoc_dev.shm_size);
-   /* TODO(ghartman): ioremap_wc should work here */
-   vsoc_dev.kernel_mapped_shm = ioremap_nocache(
-   vsoc_dev.shm_phys_start, vsoc_dev.shm_size);
+   vsoc_dev.kernel_mapped_shm = pci_iomap_wc(pdev, SHARED_MEMORY_BAR, 0);
if (!vsoc_dev.kernel_mapped_shm) {
dev_err(&vsoc_dev.dev->dev, "cannot iomap region\n");
vsoc_remove_device(pdev);


[PATCH 2/3] staging: Android: vsoc: Fix a i386-randconfig warning.

2018-05-02 Thread Alistair Strachan
Fix "warning: cast to pointer from integer of different size" when
printing the region shm physical address. Use the %pa conversion
specifier and pass the resource by reference.

Cc: Greg Kroah-Hartman 
Cc: Arve Hjønnevåg 
Cc: Todd Kjos 
Cc: Martijn Coenen 
Cc: Greg Hartman 
Cc: de...@driverdev.osuosl.org
Cc: kernel-t...@android.com
Signed-off-by: Alistair Strachan 
---
 drivers/staging/android/vsoc.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/staging/android/vsoc.c b/drivers/staging/android/vsoc.c
index 794137b7751f..3e6e4af7d6a1 100644
--- a/drivers/staging/android/vsoc.c
+++ b/drivers/staging/android/vsoc.c
@@ -800,8 +800,8 @@ static int vsoc_probe_device(struct pci_dev *pdev,
vsoc_dev.shm_phys_start = pci_resource_start(pdev, SHARED_MEMORY_BAR);
vsoc_dev.shm_size = pci_resource_len(pdev, SHARED_MEMORY_BAR);
 
-   dev_info(&pdev->dev, "shared memory @ DMA %p size=0x%zx\n",
-(void *)vsoc_dev.shm_phys_start, vsoc_dev.shm_size);
+   dev_info(&pdev->dev, "shared memory @ DMA %pa size=0x%zx\n",
+&vsoc_dev.shm_phys_start, vsoc_dev.shm_size);
vsoc_dev.kernel_mapped_shm = pci_iomap_wc(pdev, SHARED_MEMORY_BAR, 0);
if (!vsoc_dev.kernel_mapped_shm) {
dev_err(&vsoc_dev.dev->dev, "cannot iomap region\n");


[PATCH 2/3] staging: Android: vsoc: Fix a i386-randconfig warning.

2018-05-02 Thread Alistair Strachan
Fix "warning: cast to pointer from integer of different size" when
printing the region shm physical address. Use the %pa conversion
specifier and pass the resource by reference.

Cc: Greg Kroah-Hartman 
Cc: Arve Hjønnevåg 
Cc: Todd Kjos 
Cc: Martijn Coenen 
Cc: Greg Hartman 
Cc: de...@driverdev.osuosl.org
Cc: kernel-t...@android.com
Signed-off-by: Alistair Strachan 
---
 drivers/staging/android/vsoc.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/staging/android/vsoc.c b/drivers/staging/android/vsoc.c
index 794137b7751f..3e6e4af7d6a1 100644
--- a/drivers/staging/android/vsoc.c
+++ b/drivers/staging/android/vsoc.c
@@ -800,8 +800,8 @@ static int vsoc_probe_device(struct pci_dev *pdev,
vsoc_dev.shm_phys_start = pci_resource_start(pdev, SHARED_MEMORY_BAR);
vsoc_dev.shm_size = pci_resource_len(pdev, SHARED_MEMORY_BAR);
 
-   dev_info(&pdev->dev, "shared memory @ DMA %p size=0x%zx\n",
-(void *)vsoc_dev.shm_phys_start, vsoc_dev.shm_size);
+   dev_info(&pdev->dev, "shared memory @ DMA %pa size=0x%zx\n",
+&vsoc_dev.shm_phys_start, vsoc_dev.shm_size);
vsoc_dev.kernel_mapped_shm = pci_iomap_wc(pdev, SHARED_MEMORY_BAR, 0);
if (!vsoc_dev.kernel_mapped_shm) {
dev_err(&vsoc_dev.dev->dev, "cannot iomap region\n");


[PATCH 3/3] staging: Android: Fix sparse warnings in vsoc driver.

2018-05-02 Thread Alistair Strachan
Cc: Greg Kroah-Hartman 
Cc: Arve Hjønnevåg 
Cc: Todd Kjos 
Cc: Martijn Coenen 
Cc: Greg Hartman 
Cc: de...@driverdev.osuosl.org
Cc: kernel-t...@android.com
Signed-off-by: Alistair Strachan 
---
 drivers/staging/android/vsoc.c | 100 -
 1 file changed, 49 insertions(+), 51 deletions(-)

diff --git a/drivers/staging/android/vsoc.c b/drivers/staging/android/vsoc.c
index 3e6e4af7d6a1..954ed2c5d807 100644
--- a/drivers/staging/android/vsoc.c
+++ b/drivers/staging/android/vsoc.c
@@ -81,8 +81,8 @@ struct vsoc_region_data {
atomic_t *incoming_signalled;
/* Flag indicating the guest has signalled the host. */
atomic_t *outgoing_signalled;
-   int irq_requested;
-   int device_created;
+   bool irq_requested;
+   bool device_created;
 };
 
 struct vsoc_device {
@@ -91,7 +91,7 @@ struct vsoc_device {
/* Physical address of SHARED_MEMORY_BAR. */
phys_addr_t shm_phys_start;
/* Kernel virtual address of SHARED_MEMORY_BAR. */
-   void *kernel_mapped_shm;
+   void __iomem *kernel_mapped_shm;
/* Size of the entire shared memory window in bytes. */
size_t shm_size;
/*
@@ -116,22 +116,23 @@ struct vsoc_device {
 * vsoc_region_data because the kernel deals with them as an array.
 */
struct msix_entry *msix_entries;
-   /*
-* Flags that indicate what we've initialzied. These are used to do an
-* orderly cleanup of the device.
-*/
-   char enabled_device;
-   char requested_regions;
-   char cdev_added;
-   char class_added;
-   char msix_enabled;
/* Mutex that protectes the permission list */
struct mutex mtx;
/* Major number assigned by the kernel */
int major;
-
+   /* Character device assigned by the kernel */
struct cdev cdev;
+   /* Device class assigned by the kernel */
struct class *class;
+   /*
+* Flags that indicate what we've initialized. These are used to do an
+* orderly cleanup of the device.
+*/
+   bool enabled_device;
+   bool requested_regions;
+   bool cdev_added;
+   bool class_added;
+   bool msix_enabled;
 };
 
 static struct vsoc_device vsoc_dev;
@@ -153,13 +154,13 @@ static long vsoc_ioctl(struct file *, unsigned int, 
unsigned long);
 static int vsoc_mmap(struct file *, struct vm_area_struct *);
 static int vsoc_open(struct inode *, struct file *);
 static int vsoc_release(struct inode *, struct file *);
-static ssize_t vsoc_read(struct file *, char *, size_t, loff_t *);
-static ssize_t vsoc_write(struct file *, const char *, size_t, loff_t *);
+static ssize_t vsoc_read(struct file *, char __user *, size_t, loff_t *);
+static ssize_t vsoc_write(struct file *, const char __user *, size_t, loff_t 
*);
 static loff_t vsoc_lseek(struct file *filp, loff_t offset, int origin);
 static int do_create_fd_scoped_permission(
struct vsoc_device_region *region_p,
struct fd_scoped_permission_node *np,
-   struct fd_scoped_permission_arg *__user arg);
+   struct fd_scoped_permission_arg __user *arg);
 static void do_destroy_fd_scoped_permission(
struct vsoc_device_region *owner_region_p,
struct fd_scoped_permission *perm);
@@ -198,7 +199,7 @@ inline int vsoc_validate_filep(struct file *filp)
 /* Converts from shared memory offset to virtual address */
 static inline void *shm_off_to_virtual_addr(__u32 offset)
 {
-   return vsoc_dev.kernel_mapped_shm + offset;
+   return (void __force *)vsoc_dev.kernel_mapped_shm + offset;
 }
 
 /* Converts from shared memory offset to physical address */
@@ -261,7 +262,7 @@ static struct pci_driver vsoc_pci_driver = {
 static int do_create_fd_scoped_permission(
struct vsoc_device_region *region_p,
struct fd_scoped_permission_node *np,
-   struct fd_scoped_permission_arg *__user arg)
+   struct fd_scoped_permission_arg __user *arg)
 {
struct file *managed_filp;
s32 managed_fd;
@@ -632,11 +633,11 @@ static long vsoc_ioctl(struct file *filp, unsigned int 
cmd, unsigned long arg)
return 0;
 }
 
-static ssize_t vsoc_read(struct file *filp, char *buffer, size_t len,
+static ssize_t vsoc_read(struct file *filp, char __user *buffer, size_t len,
 loff_t *poffset)
 {
__u32 area_off;
-   void *area_p;
+   const void *area_p;
ssize_t area_len;
int retval = vsoc_validate_filep(filp);
 
@@ -706,7 +707,7 @@ static loff_t vsoc_lseek(struct file *filp, loff_t offset, 
int origin)
return offset;
 }
 
-static ssize_t vsoc_write(struct file *filp, const char *buffer,
+static ssize_t vsoc_write(struct file *filp, const char __user *buffer,
  size_t len, loff_t *poffset)
 {
__u32 area_off;
@@ 

[PATCH 3/3] staging: Android: Fix sparse warnings in vsoc driver.

2018-05-02 Thread Alistair Strachan
Cc: Greg Kroah-Hartman 
Cc: Arve Hjønnevåg 
Cc: Todd Kjos 
Cc: Martijn Coenen 
Cc: Greg Hartman 
Cc: de...@driverdev.osuosl.org
Cc: kernel-t...@android.com
Signed-off-by: Alistair Strachan 
---
 drivers/staging/android/vsoc.c | 100 -
 1 file changed, 49 insertions(+), 51 deletions(-)

diff --git a/drivers/staging/android/vsoc.c b/drivers/staging/android/vsoc.c
index 3e6e4af7d6a1..954ed2c5d807 100644
--- a/drivers/staging/android/vsoc.c
+++ b/drivers/staging/android/vsoc.c
@@ -81,8 +81,8 @@ struct vsoc_region_data {
atomic_t *incoming_signalled;
/* Flag indicating the guest has signalled the host. */
atomic_t *outgoing_signalled;
-   int irq_requested;
-   int device_created;
+   bool irq_requested;
+   bool device_created;
 };
 
 struct vsoc_device {
@@ -91,7 +91,7 @@ struct vsoc_device {
/* Physical address of SHARED_MEMORY_BAR. */
phys_addr_t shm_phys_start;
/* Kernel virtual address of SHARED_MEMORY_BAR. */
-   void *kernel_mapped_shm;
+   void __iomem *kernel_mapped_shm;
/* Size of the entire shared memory window in bytes. */
size_t shm_size;
/*
@@ -116,22 +116,23 @@ struct vsoc_device {
 * vsoc_region_data because the kernel deals with them as an array.
 */
struct msix_entry *msix_entries;
-   /*
-* Flags that indicate what we've initialzied. These are used to do an
-* orderly cleanup of the device.
-*/
-   char enabled_device;
-   char requested_regions;
-   char cdev_added;
-   char class_added;
-   char msix_enabled;
/* Mutex that protectes the permission list */
struct mutex mtx;
/* Major number assigned by the kernel */
int major;
-
+   /* Character device assigned by the kernel */
struct cdev cdev;
+   /* Device class assigned by the kernel */
struct class *class;
+   /*
+* Flags that indicate what we've initialized. These are used to do an
+* orderly cleanup of the device.
+*/
+   bool enabled_device;
+   bool requested_regions;
+   bool cdev_added;
+   bool class_added;
+   bool msix_enabled;
 };
 
 static struct vsoc_device vsoc_dev;
@@ -153,13 +154,13 @@ static long vsoc_ioctl(struct file *, unsigned int, 
unsigned long);
 static int vsoc_mmap(struct file *, struct vm_area_struct *);
 static int vsoc_open(struct inode *, struct file *);
 static int vsoc_release(struct inode *, struct file *);
-static ssize_t vsoc_read(struct file *, char *, size_t, loff_t *);
-static ssize_t vsoc_write(struct file *, const char *, size_t, loff_t *);
+static ssize_t vsoc_read(struct file *, char __user *, size_t, loff_t *);
+static ssize_t vsoc_write(struct file *, const char __user *, size_t, loff_t 
*);
 static loff_t vsoc_lseek(struct file *filp, loff_t offset, int origin);
 static int do_create_fd_scoped_permission(
struct vsoc_device_region *region_p,
struct fd_scoped_permission_node *np,
-   struct fd_scoped_permission_arg *__user arg);
+   struct fd_scoped_permission_arg __user *arg);
 static void do_destroy_fd_scoped_permission(
struct vsoc_device_region *owner_region_p,
struct fd_scoped_permission *perm);
@@ -198,7 +199,7 @@ inline int vsoc_validate_filep(struct file *filp)
 /* Converts from shared memory offset to virtual address */
 static inline void *shm_off_to_virtual_addr(__u32 offset)
 {
-   return vsoc_dev.kernel_mapped_shm + offset;
+   return (void __force *)vsoc_dev.kernel_mapped_shm + offset;
 }
 
 /* Converts from shared memory offset to physical address */
@@ -261,7 +262,7 @@ static struct pci_driver vsoc_pci_driver = {
 static int do_create_fd_scoped_permission(
struct vsoc_device_region *region_p,
struct fd_scoped_permission_node *np,
-   struct fd_scoped_permission_arg *__user arg)
+   struct fd_scoped_permission_arg __user *arg)
 {
struct file *managed_filp;
s32 managed_fd;
@@ -632,11 +633,11 @@ static long vsoc_ioctl(struct file *filp, unsigned int 
cmd, unsigned long arg)
return 0;
 }
 
-static ssize_t vsoc_read(struct file *filp, char *buffer, size_t len,
+static ssize_t vsoc_read(struct file *filp, char __user *buffer, size_t len,
 loff_t *poffset)
 {
__u32 area_off;
-   void *area_p;
+   const void *area_p;
ssize_t area_len;
int retval = vsoc_validate_filep(filp);
 
@@ -706,7 +707,7 @@ static loff_t vsoc_lseek(struct file *filp, loff_t offset, 
int origin)
return offset;
 }
 
-static ssize_t vsoc_write(struct file *filp, const char *buffer,
+static ssize_t vsoc_write(struct file *filp, const char __user *buffer,
  size_t len, loff_t *poffset)
 {
__u32 area_off;
@@ -772,14 +773,14 @@ static int vsoc_probe_device(struct pci_dev *pdev,
pci_name(pdev), result);
  

[PATCH RFC v2 net-next 3/4] bpfilter: add iptable get/set parsing

2018-05-02 Thread Alexei Starovoitov
From: "David S. Miller" 

Parse iptable binary blobs into bpfilter internal data structures.
bpfilter.ko only passes the [gs]etsockopt commands from the kernel to the umh;
all parsing is done inside the umh.

Signed-off-by: David S. Miller 
Signed-off-by: Alexei Starovoitov 
---
 include/uapi/linux/bpfilter.h | 179 ++
 net/bpfilter/Makefile |   2 +-
 net/bpfilter/bpfilter_mod.h   |  96 ++
 net/bpfilter/ctor.c   |  80 +++
 net/bpfilter/init.c   |  33 
 net/bpfilter/main.c   |  51 
 net/bpfilter/sockopt.c| 153 
 net/bpfilter/tables.c |  70 +
 net/bpfilter/targets.c|  51 
 net/bpfilter/tgts.c   |  25 ++
 10 files changed, 739 insertions(+), 1 deletion(-)
 create mode 100644 net/bpfilter/bpfilter_mod.h
 create mode 100644 net/bpfilter/ctor.c
 create mode 100644 net/bpfilter/init.c
 create mode 100644 net/bpfilter/sockopt.c
 create mode 100644 net/bpfilter/tables.c
 create mode 100644 net/bpfilter/targets.c
 create mode 100644 net/bpfilter/tgts.c

diff --git a/include/uapi/linux/bpfilter.h b/include/uapi/linux/bpfilter.h
index 2ec3cc99ea4c..38d54e9947a1 100644
--- a/include/uapi/linux/bpfilter.h
+++ b/include/uapi/linux/bpfilter.h
@@ -18,4 +18,183 @@ enum {
BPFILTER_IPT_GET_MAX,
 };
 
+enum {
+   BPFILTER_XT_TABLE_MAXNAMELEN = 32,
+};
+
+enum {
+   BPFILTER_NF_DROP = 0,
+   BPFILTER_NF_ACCEPT = 1,
+   BPFILTER_NF_STOLEN = 2,
+   BPFILTER_NF_QUEUE = 3,
+   BPFILTER_NF_REPEAT = 4,
+   BPFILTER_NF_STOP = 5,
+   BPFILTER_NF_MAX_VERDICT = BPFILTER_NF_STOP,
+};
+
+enum {
+   BPFILTER_INET_HOOK_PRE_ROUTING  = 0,
+   BPFILTER_INET_HOOK_LOCAL_IN = 1,
+   BPFILTER_INET_HOOK_FORWARD  = 2,
+   BPFILTER_INET_HOOK_LOCAL_OUT= 3,
+   BPFILTER_INET_HOOK_POST_ROUTING = 4,
+   BPFILTER_INET_HOOK_MAX,
+};
+
+enum {
+   BPFILTER_PROTO_UNSPEC   = 0,
+   BPFILTER_PROTO_INET = 1,
+   BPFILTER_PROTO_IPV4 = 2,
+   BPFILTER_PROTO_ARP  = 3,
+   BPFILTER_PROTO_NETDEV   = 5,
+   BPFILTER_PROTO_BRIDGE   = 7,
+   BPFILTER_PROTO_IPV6 = 10,
+   BPFILTER_PROTO_DECNET   = 12,
+   BPFILTER_PROTO_NUMPROTO,
+};
+
+#ifndef INT_MAX
+#define INT_MAX((int)(~0U>>1))
+#endif
+#ifndef INT_MIN
+#define INT_MIN (-INT_MAX - 1)
+#endif
+
+enum {
+   BPFILTER_IP_PRI_FIRST   = INT_MIN,
+   BPFILTER_IP_PRI_CONNTRACK_DEFRAG= -400,
+   BPFILTER_IP_PRI_RAW = -300,
+   BPFILTER_IP_PRI_SELINUX_FIRST   = -225,
+   BPFILTER_IP_PRI_CONNTRACK   = -200,
+   BPFILTER_IP_PRI_MANGLE  = -150,
+   BPFILTER_IP_PRI_NAT_DST = -100,
+   BPFILTER_IP_PRI_FILTER  = 0,
+   BPFILTER_IP_PRI_SECURITY= 50,
+   BPFILTER_IP_PRI_NAT_SRC = 100,
+   BPFILTER_IP_PRI_SELINUX_LAST= 225,
+   BPFILTER_IP_PRI_CONNTRACK_HELPER= 300,
+   BPFILTER_IP_PRI_CONNTRACK_CONFIRM   = INT_MAX,
+   BPFILTER_IP_PRI_LAST= INT_MAX,
+};
+
+#define BPFILTER_FUNCTION_MAXNAMELEN   30
+#define BPFILTER_EXTENSION_MAXNAMELEN  29
+#define BPFILTER_TABLE_MAXNAMELEN  32
+
+struct bpfilter_match;
+struct bpfilter_entry_match {
+   union {
+   struct {
+   __u16   match_size;
+   charname[BPFILTER_EXTENSION_MAXNAMELEN];
+   __u8revision;
+   } user;
+   struct {
+   __u16   match_size;
+   struct bpfilter_match   *match;
+   } kernel;
+   __u16   match_size;
+   } u;
+   unsigned char   data[0];
+};
+
+struct bpfilter_target;
+struct bpfilter_entry_target {
+   union {
+   struct {
+   __u16   target_size;
+   charname[BPFILTER_EXTENSION_MAXNAMELEN];
+   __u8revision;
+   } user;
+   struct {
+   __u16   target_size;
+   struct bpfilter_target  *target;
+   } kernel;
+   __u16   target_size;
+   } u;
+   unsigned char   data[0];
+};
+
+struct bpfilter_standard_target {
+   struct bpfilter_entry_targettarget;
+   int verdict;
+};
+
+struct bpfilter_error_target {
+   struct bpfilter_entry_targettarget;
+   char
error_name[BPFILTER_FUNCTION_MAXNAMELEN];
+};
+
+#define __ALIGN_KERNEL(x, a)__ALIGN_KERNEL_MASK(x, (typeof(x))(a) 
- 1)

[PATCH RFC v2 net-next 3/4] bpfilter: add iptable get/set parsing

2018-05-02 Thread Alexei Starovoitov
From: "David S. Miller" 

Parse iptable binary blobs into bpfilter internal data structures.
bpfilter.ko only passes the [gs]etsockopt commands from the kernel to the umh;
all parsing is done inside the umh.

Signed-off-by: David S. Miller 
Signed-off-by: Alexei Starovoitov 
---
 include/uapi/linux/bpfilter.h | 179 ++
 net/bpfilter/Makefile |   2 +-
 net/bpfilter/bpfilter_mod.h   |  96 ++
 net/bpfilter/ctor.c   |  80 +++
 net/bpfilter/init.c   |  33 
 net/bpfilter/main.c   |  51 
 net/bpfilter/sockopt.c| 153 
 net/bpfilter/tables.c |  70 +
 net/bpfilter/targets.c|  51 
 net/bpfilter/tgts.c   |  25 ++
 10 files changed, 739 insertions(+), 1 deletion(-)
 create mode 100644 net/bpfilter/bpfilter_mod.h
 create mode 100644 net/bpfilter/ctor.c
 create mode 100644 net/bpfilter/init.c
 create mode 100644 net/bpfilter/sockopt.c
 create mode 100644 net/bpfilter/tables.c
 create mode 100644 net/bpfilter/targets.c
 create mode 100644 net/bpfilter/tgts.c

diff --git a/include/uapi/linux/bpfilter.h b/include/uapi/linux/bpfilter.h
index 2ec3cc99ea4c..38d54e9947a1 100644
--- a/include/uapi/linux/bpfilter.h
+++ b/include/uapi/linux/bpfilter.h
@@ -18,4 +18,183 @@ enum {
BPFILTER_IPT_GET_MAX,
 };
 
+enum {
+   BPFILTER_XT_TABLE_MAXNAMELEN = 32,
+};
+
+enum {
+   BPFILTER_NF_DROP = 0,
+   BPFILTER_NF_ACCEPT = 1,
+   BPFILTER_NF_STOLEN = 2,
+   BPFILTER_NF_QUEUE = 3,
+   BPFILTER_NF_REPEAT = 4,
+   BPFILTER_NF_STOP = 5,
+   BPFILTER_NF_MAX_VERDICT = BPFILTER_NF_STOP,
+};
+
+enum {
+   BPFILTER_INET_HOOK_PRE_ROUTING  = 0,
+   BPFILTER_INET_HOOK_LOCAL_IN = 1,
+   BPFILTER_INET_HOOK_FORWARD  = 2,
+   BPFILTER_INET_HOOK_LOCAL_OUT= 3,
+   BPFILTER_INET_HOOK_POST_ROUTING = 4,
+   BPFILTER_INET_HOOK_MAX,
+};
+
+enum {
+   BPFILTER_PROTO_UNSPEC   = 0,
+   BPFILTER_PROTO_INET = 1,
+   BPFILTER_PROTO_IPV4 = 2,
+   BPFILTER_PROTO_ARP  = 3,
+   BPFILTER_PROTO_NETDEV   = 5,
+   BPFILTER_PROTO_BRIDGE   = 7,
+   BPFILTER_PROTO_IPV6 = 10,
+   BPFILTER_PROTO_DECNET   = 12,
+   BPFILTER_PROTO_NUMPROTO,
+};
+
+#ifndef INT_MAX
+#define INT_MAX((int)(~0U>>1))
+#endif
+#ifndef INT_MIN
+#define INT_MIN (-INT_MAX - 1)
+#endif
+
+enum {
+   BPFILTER_IP_PRI_FIRST   = INT_MIN,
+   BPFILTER_IP_PRI_CONNTRACK_DEFRAG= -400,
+   BPFILTER_IP_PRI_RAW = -300,
+   BPFILTER_IP_PRI_SELINUX_FIRST   = -225,
+   BPFILTER_IP_PRI_CONNTRACK   = -200,
+   BPFILTER_IP_PRI_MANGLE  = -150,
+   BPFILTER_IP_PRI_NAT_DST = -100,
+   BPFILTER_IP_PRI_FILTER  = 0,
+   BPFILTER_IP_PRI_SECURITY= 50,
+   BPFILTER_IP_PRI_NAT_SRC = 100,
+   BPFILTER_IP_PRI_SELINUX_LAST= 225,
+   BPFILTER_IP_PRI_CONNTRACK_HELPER= 300,
+   BPFILTER_IP_PRI_CONNTRACK_CONFIRM   = INT_MAX,
+   BPFILTER_IP_PRI_LAST= INT_MAX,
+};
+
+#define BPFILTER_FUNCTION_MAXNAMELEN   30
+#define BPFILTER_EXTENSION_MAXNAMELEN  29
+#define BPFILTER_TABLE_MAXNAMELEN  32
+
+struct bpfilter_match;
+struct bpfilter_entry_match {
+   union {
+   struct {
+   __u16   match_size;
+   charname[BPFILTER_EXTENSION_MAXNAMELEN];
+   __u8revision;
+   } user;
+   struct {
+   __u16   match_size;
+   struct bpfilter_match   *match;
+   } kernel;
+   __u16   match_size;
+   } u;
+   unsigned char   data[0];
+};
+
+struct bpfilter_target;
+struct bpfilter_entry_target {
+   union {
+   struct {
+   __u16   target_size;
+   charname[BPFILTER_EXTENSION_MAXNAMELEN];
+   __u8revision;
+   } user;
+   struct {
+   __u16   target_size;
+   struct bpfilter_target  *target;
+   } kernel;
+   __u16   target_size;
+   } u;
+   unsigned char   data[0];
+};
+
+struct bpfilter_standard_target {
+   struct bpfilter_entry_targettarget;
+   int verdict;
+};
+
+struct bpfilter_error_target {
+   struct bpfilter_entry_targettarget;
+   char
error_name[BPFILTER_FUNCTION_MAXNAMELEN];
+};
+
+#define __ALIGN_KERNEL(x, a)__ALIGN_KERNEL_MASK(x, (typeof(x))(a) 
- 1)
+#define __ALIGN_KERNEL_MASK(x, mask)(((x) + (mask)) & 

[PATCH v2 net-next 1/4] umh: introduce fork_usermode_blob() helper

2018-05-02 Thread Alexei Starovoitov
Introduce helper:
int fork_usermode_blob(void *data, size_t len, struct umh_info *info);
struct umh_info {
   struct file *pipe_to_umh;
   struct file *pipe_from_umh;
   pid_t pid;
};

that GPLed kernel modules (signed or unsigned) can use to execute part
of their own data as a swappable user mode process.

The kernel will do:
- mount "tmpfs"
- allocate a unique file in tmpfs
- populate that file with [data, data + len] bytes
- user-mode-helper code will do_execve that file and, before the process
  starts, the kernel will create two unix pipes for bidirectional
  communication between kernel module and umh
- close tmpfs file, effectively deleting it
- the fork_usermode_blob will return zero on success and populate
  'struct umh_info' with two unix pipes and the pid of the user process

As the first step in the development of the bpfilter project
the fork_usermode_blob() helper is introduced to allow user mode code
to be invoked from a kernel module. The idea is that user mode code plus
normal kernel module code are built as part of the kernel build
and installed as traditional kernel module into distro specified location,
such that from a distribution point of view, there is
no difference between regular kernel modules and kernel modules + umh code.
Such modules can be signed, modprobed, rmmod, etc. The use of this new helper
by a kernel module doesn't make it any special from kernel and user space
tooling point of view.

Such approach enables kernel to delegate functionality traditionally done
by the kernel modules into the user space processes (either root or !root) and
reduces security attack surface of the new code. The buggy umh code would crash
the user process, but not the kernel. Another advantage is that umh code
of the kernel module can be debugged and tested out of user space
(e.g. opening the possibility to run clang sanitizers, fuzzers or
user space test suites on the umh code).
In case of the bpfilter project such architecture allows complex control plane
to be done in the user space while bpf based data plane stays in the kernel.

Since a umh can crash, be oom-killed by the kernel, or be killed by the admin,
the kernel module that uses it (like bpfilter) needs to manage the lifetime
of the umh on its own via two unix pipes and the pid of the umh.

The exit code of such kernel module should kill the umh it started,
so that rmmod of the kernel module will cleanup the corresponding umh.
Just as a kernel module that calls kmalloc() should kfree() the memory in its
exit code.

Signed-off-by: Alexei Starovoitov 
---
 fs/exec.c   |  38 ---
 include/linux/binfmts.h |   1 +
 include/linux/umh.h |  12 
 kernel/umh.c| 176 +++-
 4 files changed, 215 insertions(+), 12 deletions(-)

diff --git a/fs/exec.c b/fs/exec.c
index 183059c427b9..30a36c2a39bf 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1706,14 +1706,13 @@ static int exec_binprm(struct linux_binprm *bprm)
 /*
  * sys_execve() executes a new program.
  */
-static int do_execveat_common(int fd, struct filename *filename,
- struct user_arg_ptr argv,
- struct user_arg_ptr envp,
- int flags)
+static int __do_execve_file(int fd, struct filename *filename,
+   struct user_arg_ptr argv,
+   struct user_arg_ptr envp,
+   int flags, struct file *file)
 {
char *pathbuf = NULL;
struct linux_binprm *bprm;
-   struct file *file;
struct files_struct *displaced;
int retval;
 
@@ -1752,7 +1751,8 @@ static int do_execveat_common(int fd, struct filename 
*filename,
check_unsafe_exec(bprm);
current->in_execve = 1;
 
-   file = do_open_execat(fd, filename, flags);
+   if (!file)
+   file = do_open_execat(fd, filename, flags);
retval = PTR_ERR(file);
if (IS_ERR(file))
goto out_unmark;
@@ -1760,7 +1760,9 @@ static int do_execveat_common(int fd, struct filename 
*filename,
sched_exec();
 
bprm->file = file;
-   if (fd == AT_FDCWD || filename->name[0] == '/') {
+   if (!filename) {
+   bprm->filename = "none";
+   } else if (fd == AT_FDCWD || filename->name[0] == '/') {
bprm->filename = filename->name;
} else {
if (filename->name[0] == '\0')
@@ -1826,7 +1828,8 @@ static int do_execveat_common(int fd, struct filename 
*filename,
task_numa_free(current);
free_bprm(bprm);
kfree(pathbuf);
-   putname(filename);
+   if (filename)
+   putname(filename);
if (displaced)
put_files_struct(displaced);
return retval;
@@ -1849,10 +1852,27 @@ static int do_execveat_common(int fd, struct filename 
*filename,
if (displaced)
reset_files_struct(displaced);
 out_ret:
-  

[PATCH v2 net-next 1/4] umh: introduce fork_usermode_blob() helper

2018-05-02 Thread Alexei Starovoitov
Introduce helper:
int fork_usermode_blob(void *data, size_t len, struct umh_info *info);
struct umh_info {
   struct file *pipe_to_umh;
   struct file *pipe_from_umh;
   pid_t pid;
};

that GPLed kernel modules (signed or unsigned) can use to execute part
of their own data as a swappable user mode process.

The kernel will do:
- mount "tmpfs"
- allocate a unique file in tmpfs
- populate that file with [data, data + len] bytes
- user-mode-helper code will do_execve that file and, before the process
  starts, the kernel will create two unix pipes for bidirectional
  communication between kernel module and umh
- close tmpfs file, effectively deleting it
- the fork_usermode_blob will return zero on success and populate
  'struct umh_info' with two unix pipes and the pid of the user process

As the first step in the development of the bpfilter project
the fork_usermode_blob() helper is introduced to allow user mode code
to be invoked from a kernel module. The idea is that user mode code plus
normal kernel module code are built as part of the kernel build
and installed as traditional kernel module into distro specified location,
such that from a distribution point of view, there is
no difference between regular kernel modules and kernel modules + umh code.
Such modules can be signed, modprobed, rmmod, etc. The use of this new helper
by a kernel module doesn't make it any special from kernel and user space
tooling point of view.

Such approach enables kernel to delegate functionality traditionally done
by the kernel modules into the user space processes (either root or !root) and
reduces security attack surface of the new code. The buggy umh code would crash
the user process, but not the kernel. Another advantage is that umh code
of the kernel module can be debugged and tested out of user space
(e.g. opening the possibility to run clang sanitizers, fuzzers or
user space test suites on the umh code).
In case of the bpfilter project such architecture allows complex control plane
to be done in the user space while bpf based data plane stays in the kernel.

Since a umh can crash, be oom-killed by the kernel, or be killed by the admin,
the kernel module that uses it (like bpfilter) needs to manage the lifetime
of the umh on its own via two unix pipes and the pid of the umh.

The exit code of such kernel module should kill the umh it started,
so that rmmod of the kernel module will cleanup the corresponding umh.
Just as a kernel module that calls kmalloc() should kfree() the memory in its
exit code.

Signed-off-by: Alexei Starovoitov 
---
 fs/exec.c   |  38 ---
 include/linux/binfmts.h |   1 +
 include/linux/umh.h |  12 
 kernel/umh.c| 176 +++-
 4 files changed, 215 insertions(+), 12 deletions(-)

diff --git a/fs/exec.c b/fs/exec.c
index 183059c427b9..30a36c2a39bf 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1706,14 +1706,13 @@ static int exec_binprm(struct linux_binprm *bprm)
 /*
  * sys_execve() executes a new program.
  */
-static int do_execveat_common(int fd, struct filename *filename,
- struct user_arg_ptr argv,
- struct user_arg_ptr envp,
- int flags)
+static int __do_execve_file(int fd, struct filename *filename,
+   struct user_arg_ptr argv,
+   struct user_arg_ptr envp,
+   int flags, struct file *file)
 {
char *pathbuf = NULL;
struct linux_binprm *bprm;
-   struct file *file;
struct files_struct *displaced;
int retval;
 
@@ -1752,7 +1751,8 @@ static int do_execveat_common(int fd, struct filename 
*filename,
check_unsafe_exec(bprm);
current->in_execve = 1;
 
-   file = do_open_execat(fd, filename, flags);
+   if (!file)
+   file = do_open_execat(fd, filename, flags);
retval = PTR_ERR(file);
if (IS_ERR(file))
goto out_unmark;
@@ -1760,7 +1760,9 @@ static int do_execveat_common(int fd, struct filename 
*filename,
sched_exec();
 
bprm->file = file;
-   if (fd == AT_FDCWD || filename->name[0] == '/') {
+   if (!filename) {
+   bprm->filename = "none";
+   } else if (fd == AT_FDCWD || filename->name[0] == '/') {
bprm->filename = filename->name;
} else {
if (filename->name[0] == '\0')
@@ -1826,7 +1828,8 @@ static int do_execveat_common(int fd, struct filename 
*filename,
task_numa_free(current);
free_bprm(bprm);
kfree(pathbuf);
-   putname(filename);
+   if (filename)
+   putname(filename);
if (displaced)
put_files_struct(displaced);
return retval;
@@ -1849,10 +1852,27 @@ static int do_execveat_common(int fd, struct filename 
*filename,
if (displaced)
reset_files_struct(displaced);
 out_ret:
-   

[PATCH RFC v2 net-next 4/4] bpfilter: rough bpfilter codegen example hack

2018-05-02 Thread Alexei Starovoitov
From: Daniel Borkmann 

Signed-off-by: Daniel Borkmann 
---
 net/bpfilter/Makefile   |   2 +-
 net/bpfilter/bpfilter_mod.h | 285 ++-
 net/bpfilter/ctor.c |  57 +
 net/bpfilter/gen.c  | 290 
 net/bpfilter/init.c |  11 +-
 net/bpfilter/main.c |  15 ++-
 net/bpfilter/sockopt.c  | 137 -
 net/bpfilter/tables.c   |   5 +-
 net/bpfilter/tgts.c |   1 +
 9 files changed, 737 insertions(+), 66 deletions(-)
 create mode 100644 net/bpfilter/gen.c

diff --git a/net/bpfilter/Makefile b/net/bpfilter/Makefile
index bec6181de995..3796651c76cb 100644
--- a/net/bpfilter/Makefile
+++ b/net/bpfilter/Makefile
@@ -4,7 +4,7 @@
 #
 
 hostprogs-y := bpfilter_umh
-bpfilter_umh-objs := main.o tgts.o targets.o tables.o init.o ctor.o sockopt.o
+bpfilter_umh-objs := main.o tgts.o targets.o tables.o init.o ctor.o sockopt.o 
gen.o
 HOSTCFLAGS += -I. -Itools/include/
 
 # a bit of elf magic to convert bpfilter_umh binary into a binary blob
diff --git a/net/bpfilter/bpfilter_mod.h b/net/bpfilter/bpfilter_mod.h
index f0de41b20793..b4209985efff 100644
--- a/net/bpfilter/bpfilter_mod.h
+++ b/net/bpfilter/bpfilter_mod.h
@@ -21,8 +21,8 @@ struct bpfilter_table_info {
unsigned intinitial_entries;
unsigned inthook_entry[BPFILTER_INET_HOOK_MAX];
unsigned intunderflow[BPFILTER_INET_HOOK_MAX];
-   unsigned intstacksize;
-   void***jumpstack;
+// unsigned intstacksize;
+// void***jumpstack;
unsigned char   entries[0] __aligned(8);
 };
 
@@ -64,22 +64,55 @@ struct bpfilter_ipt_error {
 
 struct bpfilter_target {
struct list_headall_target_list;
-   const char  name[BPFILTER_EXTENSION_MAXNAMELEN];
+   charname[BPFILTER_EXTENSION_MAXNAMELEN];
unsigned intsize;
int hold;
u16 family;
u8  rev;
 };
 
+struct bpfilter_gen_ctx {
+   struct bpf_insn *img;
+   u32 len_cur;
+   u32 len_max;
+   u32 default_verdict;
+   int fd;
+   int ifindex;
+   booloffloaded;
+};
+
+union bpf_attr;
+int sys_bpf(int cmd, union bpf_attr *attr, unsigned int size);
+
+int bpfilter_gen_init(struct bpfilter_gen_ctx *ctx);
+int bpfilter_gen_prologue(struct bpfilter_gen_ctx *ctx);
+int bpfilter_gen_epilogue(struct bpfilter_gen_ctx *ctx);
+int bpfilter_gen_append(struct bpfilter_gen_ctx *ctx,
+   struct bpfilter_ipt_ip *ent, int verdict);
+int bpfilter_gen_commit(struct bpfilter_gen_ctx *ctx);
+void bpfilter_gen_destroy(struct bpfilter_gen_ctx *ctx);
+
 struct bpfilter_target *bpfilter_target_get_by_name(const char *name);
 void bpfilter_target_put(struct bpfilter_target *tgt);
 int bpfilter_target_add(struct bpfilter_target *tgt);
 
-struct bpfilter_table_info *bpfilter_ipv4_table_ctor(struct bpfilter_table 
*tbl);
+struct bpfilter_table_info *
+bpfilter_ipv4_table_alloc(struct bpfilter_table *tbl, __u32 size_ents);
+struct bpfilter_table_info *
+bpfilter_ipv4_table_finalize(struct bpfilter_table *tbl,
+struct bpfilter_table_info *info,
+__u32 size_ents, __u32 num_ents);
+struct bpfilter_table_info *
+bpfilter_ipv4_table_finalize2(struct bpfilter_table *tbl,
+ struct bpfilter_table_info *info,
+ __u32 size_ents, __u32 num_ents);
+
 int bpfilter_ipv4_register_targets(void);
 void bpfilter_tables_init(void);
 int bpfilter_get_info(void *addr, int len);
 int bpfilter_get_entries(void *cmd, int len);
+int bpfilter_set_replace(void *cmd, int len);
+int bpfilter_set_add_counters(void *cmd, int len);
 int bpfilter_ipv4_init(void);
 
 int copy_from_user(void *dst, void *addr, int len);
@@ -93,4 +126,248 @@ extern int pid;
 extern int debug_fd;
 #define ENOTSUPP524
 
+/* Helper macros for filter block array initializers. */
+
+/* ALU ops on registers, bpf_add|sub|...: dst_reg += src_reg */
+
+#define BPF_ALU64_REG(OP, DST, SRC)\
+   ((struct bpf_insn) {\
+   .code  = BPF_ALU64 | BPF_OP(OP) | BPF_X,\
+   .dst_reg = DST, \
+   .src_reg = SRC, \
+   .off   = 0, \
+   .imm   = 0 })
+
+#define BPF_ALU32_REG(OP, DST, SRC)\
+   ((struct bpf_insn) {\
+   .code  = BPF_ALU | BPF_OP(OP) | 

[PATCH v2 net-next 0/4] bpfilter

2018-05-02 Thread Alexei Starovoitov
Hi All,

v1->v2:
this patch set is almost a full rewrite of the earlier umh modules approach
The v1 of patches and follow up discussion was covered by LWN:
https://lwn.net/Articles/749108/

I believe the v2 addresses all issues brought up by Andy and others.
Mainly there are zero changes to kernel/module.c
Instead of teaching module loading logic to recognize special
umh module, let normal kernel modules execute part of its own
.init.rodata as a new user space process (Andy's idea)
Patch 1 introduces this new helper:
int fork_usermode_blob(void *data, size_t len, struct umh_info *info);
Input:
  data + len == executable file
Output:
  struct umh_info {
   struct file *pipe_to_umh;
   struct file *pipe_from_umh;
   pid_t pid;
  };

Advantages vs v1:
- the embedded user mode executable is stored as .init.rodata inside
  normal kernel module. These pages are freed when .ko finishes loading
- the elf file is copied into tmpfs file. The user mode process is swappable.
- the communication between user mode process and 'parent' kernel module
  is done via two unix pipes, hence protocol is not exposed to
  user space
- impossible to launch umh on its own (that was the main issue of v1)
  and impossible to be man-in-the-middle due to pipes
- bpfilter.ko consists of tiny kernel part that passes the data
  between kernel and umh via pipes and a much bigger umh part that
  does all the work
- 'lsmod' shows bpfilter.ko as usual.
  'rmmod bpfilter' removes kernel module and kills corresponding umh
- signed bpfilter.ko covers the whole image including umh code

Few issues:
- architecturally bpfilter.ko can be builtin, but doesn't work yet.
  Still debugging. Kinda cool to have user mode executables
  to be part of vmlinux
- the user can still attach to the process and debug it with
  'gdb /proc/pid/exe pid', but 'gdb -p pid' doesn't work.
  (a bit worse compared to v1)
- tinyconfig will notice a small increase in .text
  +766 | TEXT | 7c8b94806bec umh: introduce fork_usermode_blob() helper

More details in patches 1 and 2 that are ready to land.
Patches 3 and 4 are still rough. They were mainly used for
testing and to demonstrate how bpfilter is building on top.
The patch 4 approach of converting one iptable rule to few bpf
instructions will certainly change in the future, since it doesn't
scale to thousands of rules.

Alexei Starovoitov (2):
  umh: introduce fork_usermode_blob() helper
  net: add skeleton of bpfilter kernel module

Daniel Borkmann (1):
  bpfilter: rough bpfilter codegen example hack

David S. Miller (1):
  bpfilter: add iptable get/set parsing

 fs/exec.c |  38 -
 include/linux/binfmts.h   |   1 +
 include/linux/bpfilter.h  |  15 ++
 include/linux/umh.h   |  12 ++
 include/uapi/linux/bpfilter.h | 200 ++
 kernel/umh.c  | 176 +++-
 net/Kconfig   |   2 +
 net/Makefile  |   1 +
 net/bpfilter/Kconfig  |  17 ++
 net/bpfilter/Makefile |  24 +++
 net/bpfilter/bpfilter_kern.c  |  93 +++
 net/bpfilter/bpfilter_mod.h   | 373 ++
 net/bpfilter/ctor.c   |  91 +++
 net/bpfilter/gen.c| 290 
 net/bpfilter/init.c   |  36 
 net/bpfilter/main.c   | 117 +
 net/bpfilter/msgfmt.h |  17 ++
 net/bpfilter/sockopt.c| 236 ++
 net/bpfilter/tables.c |  73 +
 net/bpfilter/targets.c|  51 ++
 net/bpfilter/tgts.c   |  26 +++
 net/ipv4/Makefile |   2 +
 net/ipv4/bpfilter/Makefile|   2 +
 net/ipv4/bpfilter/sockopt.c   |  42 +
 net/ipv4/ip_sockglue.c|  17 ++
 25 files changed, 1940 insertions(+), 12 deletions(-)
 create mode 100644 include/linux/bpfilter.h
 create mode 100644 include/uapi/linux/bpfilter.h
 create mode 100644 net/bpfilter/Kconfig
 create mode 100644 net/bpfilter/Makefile
 create mode 100644 net/bpfilter/bpfilter_kern.c
 create mode 100644 net/bpfilter/bpfilter_mod.h
 create mode 100644 net/bpfilter/ctor.c
 create mode 100644 net/bpfilter/gen.c
 create mode 100644 net/bpfilter/init.c
 create mode 100644 net/bpfilter/main.c
 create mode 100644 net/bpfilter/msgfmt.h
 create mode 100644 net/bpfilter/sockopt.c
 create mode 100644 net/bpfilter/tables.c
 create mode 100644 net/bpfilter/targets.c
 create mode 100644 net/bpfilter/tgts.c
 create mode 100644 net/ipv4/bpfilter/Makefile
 create mode 100644 net/ipv4/bpfilter/sockopt.c

-- 
2.9.5



[PATCH RFC v2 net-next 4/4] bpfilter: rough bpfilter codegen example hack

2018-05-02 Thread Alexei Starovoitov
From: Daniel Borkmann 

Signed-off-by: Daniel Borkmann 
---
 net/bpfilter/Makefile   |   2 +-
 net/bpfilter/bpfilter_mod.h | 285 ++-
 net/bpfilter/ctor.c |  57 +
 net/bpfilter/gen.c  | 290 
 net/bpfilter/init.c |  11 +-
 net/bpfilter/main.c |  15 ++-
 net/bpfilter/sockopt.c  | 137 -
 net/bpfilter/tables.c   |   5 +-
 net/bpfilter/tgts.c |   1 +
 9 files changed, 737 insertions(+), 66 deletions(-)
 create mode 100644 net/bpfilter/gen.c

diff --git a/net/bpfilter/Makefile b/net/bpfilter/Makefile
index bec6181de995..3796651c76cb 100644
--- a/net/bpfilter/Makefile
+++ b/net/bpfilter/Makefile
@@ -4,7 +4,7 @@
 #
 
 hostprogs-y := bpfilter_umh
-bpfilter_umh-objs := main.o tgts.o targets.o tables.o init.o ctor.o sockopt.o
+bpfilter_umh-objs := main.o tgts.o targets.o tables.o init.o ctor.o sockopt.o 
gen.o
 HOSTCFLAGS += -I. -Itools/include/
 
 # a bit of elf magic to convert bpfilter_umh binary into a binary blob
diff --git a/net/bpfilter/bpfilter_mod.h b/net/bpfilter/bpfilter_mod.h
index f0de41b20793..b4209985efff 100644
--- a/net/bpfilter/bpfilter_mod.h
+++ b/net/bpfilter/bpfilter_mod.h
@@ -21,8 +21,8 @@ struct bpfilter_table_info {
unsigned intinitial_entries;
unsigned inthook_entry[BPFILTER_INET_HOOK_MAX];
unsigned intunderflow[BPFILTER_INET_HOOK_MAX];
-   unsigned intstacksize;
-   void***jumpstack;
+// unsigned intstacksize;
+// void***jumpstack;
unsigned char   entries[0] __aligned(8);
 };
 
@@ -64,22 +64,55 @@ struct bpfilter_ipt_error {
 
 struct bpfilter_target {
struct list_headall_target_list;
-   const char  name[BPFILTER_EXTENSION_MAXNAMELEN];
+   charname[BPFILTER_EXTENSION_MAXNAMELEN];
unsigned intsize;
int hold;
u16 family;
u8  rev;
 };
 
+struct bpfilter_gen_ctx {
+   struct bpf_insn *img;
+   u32 len_cur;
+   u32 len_max;
+   u32 default_verdict;
+   int fd;
+   int ifindex;
+   booloffloaded;
+};
+
+union bpf_attr;
+int sys_bpf(int cmd, union bpf_attr *attr, unsigned int size);
+
+int bpfilter_gen_init(struct bpfilter_gen_ctx *ctx);
+int bpfilter_gen_prologue(struct bpfilter_gen_ctx *ctx);
+int bpfilter_gen_epilogue(struct bpfilter_gen_ctx *ctx);
+int bpfilter_gen_append(struct bpfilter_gen_ctx *ctx,
+   struct bpfilter_ipt_ip *ent, int verdict);
+int bpfilter_gen_commit(struct bpfilter_gen_ctx *ctx);
+void bpfilter_gen_destroy(struct bpfilter_gen_ctx *ctx);
+
 struct bpfilter_target *bpfilter_target_get_by_name(const char *name);
 void bpfilter_target_put(struct bpfilter_target *tgt);
 int bpfilter_target_add(struct bpfilter_target *tgt);
 
-struct bpfilter_table_info *bpfilter_ipv4_table_ctor(struct bpfilter_table 
*tbl);
+struct bpfilter_table_info *
+bpfilter_ipv4_table_alloc(struct bpfilter_table *tbl, __u32 size_ents);
+struct bpfilter_table_info *
+bpfilter_ipv4_table_finalize(struct bpfilter_table *tbl,
+struct bpfilter_table_info *info,
+__u32 size_ents, __u32 num_ents);
+struct bpfilter_table_info *
+bpfilter_ipv4_table_finalize2(struct bpfilter_table *tbl,
+ struct bpfilter_table_info *info,
+ __u32 size_ents, __u32 num_ents);
+
 int bpfilter_ipv4_register_targets(void);
 void bpfilter_tables_init(void);
 int bpfilter_get_info(void *addr, int len);
 int bpfilter_get_entries(void *cmd, int len);
+int bpfilter_set_replace(void *cmd, int len);
+int bpfilter_set_add_counters(void *cmd, int len);
 int bpfilter_ipv4_init(void);
 
 int copy_from_user(void *dst, void *addr, int len);
@@ -93,4 +126,248 @@ extern int pid;
 extern int debug_fd;
 #define ENOTSUPP524
 
+/* Helper macros for filter block array initializers. */
+
+/* ALU ops on registers, bpf_add|sub|...: dst_reg += src_reg */
+
+#define BPF_ALU64_REG(OP, DST, SRC)\
+   ((struct bpf_insn) {\
+   .code  = BPF_ALU64 | BPF_OP(OP) | BPF_X,\
+   .dst_reg = DST, \
+   .src_reg = SRC, \
+   .off   = 0, \
+   .imm   = 0 })
+
+#define BPF_ALU32_REG(OP, DST, SRC)\
+   ((struct bpf_insn) {\
+   .code  = BPF_ALU | BPF_OP(OP) | BPF_X,  \
+   .dst_reg = 

[PATCH v2 net-next 0/4] bpfilter

2018-05-02 Thread Alexei Starovoitov
Hi All,

v1->v2:
this patch set is almost a full rewrite of the earlier umh modules approach
The v1 of patches and follow up discussion was covered by LWN:
https://lwn.net/Articles/749108/

I believe the v2 addresses all issues brought up by Andy and others.
Mainly there are zero changes to kernel/module.c
Instead of teaching module loading logic to recognize special
umh module, let normal kernel modules execute part of its own
.init.rodata as a new user space process (Andy's idea)
Patch 1 introduces this new helper:
int fork_usermode_blob(void *data, size_t len, struct umh_info *info);
Input:
  data + len == executable file
Output:
  struct umh_info {
   struct file *pipe_to_umh;
   struct file *pipe_from_umh;
   pid_t pid;
  };

Advantages vs v1:
- the embedded user mode executable is stored as .init.rodata inside
  normal kernel module. These pages are freed when .ko finishes loading
- the elf file is copied into tmpfs file. The user mode process is swappable.
- the communication between user mode process and 'parent' kernel module
  is done via two unix pipes, hence protocol is not exposed to
  user space
- impossible to launch umh on its own (that was the main issue of v1)
  and impossible to be man-in-the-middle due to pipes
- bpfilter.ko consists of tiny kernel part that passes the data
  between kernel and umh via pipes and a much bigger umh part that
  does all the work
- 'lsmod' shows bpfilter.ko as usual.
  'rmmod bpfilter' removes kernel module and kills corresponding umh
- signed bpfilter.ko covers the whole image including umh code

Few issues:
- architecturally bpfilter.ko can be builtin, but doesn't work yet.
  Still debugging. Kinda cool to have user mode executables
  to be part of vmlinux
- the user can still attach to the process and debug it with
  'gdb /proc/pid/exe pid', but 'gdb -p pid' doesn't work.
  (a bit worse compared to v1)
- tinyconfig will notice a small increase in .text
  +766 | TEXT | 7c8b94806bec umh: introduce fork_usermode_blob() helper

More details in patches 1 and 2 that are ready to land.
Patches 3 and 4 are still rough. They were mainly used for
testing and to demonstrate how bpfilter is building on top.
The patch 4 approach of converting one iptable rule to few bpf
instructions will certainly change in the future, since it doesn't
scale to thousands of rules.

Alexei Starovoitov (2):
  umh: introduce fork_usermode_blob() helper
  net: add skeleton of bpfilter kernel module

Daniel Borkmann (1):
  bpfilter: rough bpfilter codegen example hack

David S. Miller (1):
  bpfilter: add iptable get/set parsing

 fs/exec.c |  38 -
 include/linux/binfmts.h   |   1 +
 include/linux/bpfilter.h  |  15 ++
 include/linux/umh.h   |  12 ++
 include/uapi/linux/bpfilter.h | 200 ++
 kernel/umh.c  | 176 +++-
 net/Kconfig   |   2 +
 net/Makefile  |   1 +
 net/bpfilter/Kconfig  |  17 ++
 net/bpfilter/Makefile |  24 +++
 net/bpfilter/bpfilter_kern.c  |  93 +++
 net/bpfilter/bpfilter_mod.h   | 373 ++
 net/bpfilter/ctor.c   |  91 +++
 net/bpfilter/gen.c| 290 
 net/bpfilter/init.c   |  36 
 net/bpfilter/main.c   | 117 +
 net/bpfilter/msgfmt.h |  17 ++
 net/bpfilter/sockopt.c| 236 ++
 net/bpfilter/tables.c |  73 +
 net/bpfilter/targets.c|  51 ++
 net/bpfilter/tgts.c   |  26 +++
 net/ipv4/Makefile |   2 +
 net/ipv4/bpfilter/Makefile|   2 +
 net/ipv4/bpfilter/sockopt.c   |  42 +
 net/ipv4/ip_sockglue.c|  17 ++
 25 files changed, 1940 insertions(+), 12 deletions(-)
 create mode 100644 include/linux/bpfilter.h
 create mode 100644 include/uapi/linux/bpfilter.h
 create mode 100644 net/bpfilter/Kconfig
 create mode 100644 net/bpfilter/Makefile
 create mode 100644 net/bpfilter/bpfilter_kern.c
 create mode 100644 net/bpfilter/bpfilter_mod.h
 create mode 100644 net/bpfilter/ctor.c
 create mode 100644 net/bpfilter/gen.c
 create mode 100644 net/bpfilter/init.c
 create mode 100644 net/bpfilter/main.c
 create mode 100644 net/bpfilter/msgfmt.h
 create mode 100644 net/bpfilter/sockopt.c
 create mode 100644 net/bpfilter/tables.c
 create mode 100644 net/bpfilter/targets.c
 create mode 100644 net/bpfilter/tgts.c
 create mode 100644 net/ipv4/bpfilter/Makefile
 create mode 100644 net/ipv4/bpfilter/sockopt.c

-- 
2.9.5



[PATCH v2 net-next 2/4] net: add skeleton of bpfilter kernel module

2018-05-02 Thread Alexei Starovoitov
bpfilter.ko consists of bpfilter_kern.c (normal kernel module code)
and user mode helper code that is embedded into bpfilter.ko

The steps to build bpfilter.ko are the following:
- main.c is compiled by HOSTCC into the bpfilter_umh elf executable file
- with quite a bit of objcopy and Makefile magic the bpfilter_umh elf file
  is converted into bpfilter_umh.o object file
  with _binary_net_bpfilter_bpfilter_umh_start and _end symbols
  Example:
  $ nm ./bld_x64/net/bpfilter/bpfilter_umh.o
  4cf8 T _binary_net_bpfilter_bpfilter_umh_end
  4cf8 A _binary_net_bpfilter_bpfilter_umh_size
   T _binary_net_bpfilter_bpfilter_umh_start
- bpfilter_umh.o and bpfilter_kern.o are linked together into bpfilter.ko

bpfilter_kern.c is a normal kernel module code that calls
the fork_usermode_blob() helper to execute part of its own data
as a user mode process.

Notice that _binary_net_bpfilter_bpfilter_umh_start - end
is placed into .init.rodata section, so it's freed as soon as __init
function of bpfilter.ko is finished.
As part of __init, bpfilter.ko performs a first request/reply exchange
via the two unix pipes provided by the fork_usermode_blob() helper to
make sure that umh is healthy. If it is not, bpfilter.ko kills it via its pid.

Later bpfilter_process_sockopt() will be called from bpfilter hooks
in get/setsockopt() to pass iptable commands into umh via bpfilter.ko

If the admin does 'rmmod bpfilter', the __exit code of bpfilter.ko will
kill umh as well.

Signed-off-by: Alexei Starovoitov 
---
 include/linux/bpfilter.h  | 15 +++
 include/uapi/linux/bpfilter.h | 21 ++
 net/Kconfig   |  2 +
 net/Makefile  |  1 +
 net/bpfilter/Kconfig  | 17 
 net/bpfilter/Makefile | 24 +++
 net/bpfilter/bpfilter_kern.c  | 93 +++
 net/bpfilter/main.c   | 63 +
 net/bpfilter/msgfmt.h | 17 
 net/ipv4/Makefile |  2 +
 net/ipv4/bpfilter/Makefile|  2 +
 net/ipv4/bpfilter/sockopt.c   | 42 +++
 net/ipv4/ip_sockglue.c| 17 
 13 files changed, 316 insertions(+)
 create mode 100644 include/linux/bpfilter.h
 create mode 100644 include/uapi/linux/bpfilter.h
 create mode 100644 net/bpfilter/Kconfig
 create mode 100644 net/bpfilter/Makefile
 create mode 100644 net/bpfilter/bpfilter_kern.c
 create mode 100644 net/bpfilter/main.c
 create mode 100644 net/bpfilter/msgfmt.h
 create mode 100644 net/ipv4/bpfilter/Makefile
 create mode 100644 net/ipv4/bpfilter/sockopt.c

diff --git a/include/linux/bpfilter.h b/include/linux/bpfilter.h
new file mode 100644
index ..687b1760bb9f
--- /dev/null
+++ b/include/linux/bpfilter.h
@@ -0,0 +1,15 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_BPFILTER_H
+#define _LINUX_BPFILTER_H
+
+#include 
+
+struct sock;
+int bpfilter_ip_set_sockopt(struct sock *sk, int optname, char *optval,
+   unsigned int optlen);
+int bpfilter_ip_get_sockopt(struct sock *sk, int optname, char *optval,
+   int *optlen);
+extern int (*bpfilter_process_sockopt)(struct sock *sk, int optname,
+  char __user *optval,
+  unsigned int optlen, bool is_set);
+#endif
diff --git a/include/uapi/linux/bpfilter.h b/include/uapi/linux/bpfilter.h
new file mode 100644
index ..2ec3cc99ea4c
--- /dev/null
+++ b/include/uapi/linux/bpfilter.h
@@ -0,0 +1,21 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _UAPI_LINUX_BPFILTER_H
+#define _UAPI_LINUX_BPFILTER_H
+
+#include 
+
+enum {
+   BPFILTER_IPT_SO_SET_REPLACE = 64,
+   BPFILTER_IPT_SO_SET_ADD_COUNTERS = 65,
+   BPFILTER_IPT_SET_MAX,
+};
+
+enum {
+   BPFILTER_IPT_SO_GET_INFO = 64,
+   BPFILTER_IPT_SO_GET_ENTRIES = 65,
+   BPFILTER_IPT_SO_GET_REVISION_MATCH = 66,
+   BPFILTER_IPT_SO_GET_REVISION_TARGET = 67,
+   BPFILTER_IPT_GET_MAX,
+};
+
+#endif /* _UAPI_LINUX_BPFILTER_H */
diff --git a/net/Kconfig b/net/Kconfig
index b62089fb1332..ed6368b306fa 100644
--- a/net/Kconfig
+++ b/net/Kconfig
@@ -201,6 +201,8 @@ source "net/bridge/netfilter/Kconfig"
 
 endif
 
+source "net/bpfilter/Kconfig"
+
 source "net/dccp/Kconfig"
 source "net/sctp/Kconfig"
 source "net/rds/Kconfig"
diff --git a/net/Makefile b/net/Makefile
index a6147c61b174..7f982b7682bd 100644
--- a/net/Makefile
+++ b/net/Makefile
@@ -20,6 +20,7 @@ obj-$(CONFIG_TLS) += tls/
 obj-$(CONFIG_XFRM) += xfrm/
 obj-$(CONFIG_UNIX) += unix/
 obj-$(CONFIG_NET)  += ipv6/
+obj-$(CONFIG_BPFILTER) += bpfilter/
 obj-$(CONFIG_PACKET)   += packet/
 obj-$(CONFIG_NET_KEY)  += key/
 obj-$(CONFIG_BRIDGE)   += bridge/
diff --git a/net/bpfilter/Kconfig b/net/bpfilter/Kconfig
new file mode 100644
index ..782a732b9a5c
--- /dev/null
+++ b/net/bpfilter/Kconfig
@@ -0,0 +1,17 @@

[PATCH v2 net-next 2/4] net: add skeleton of bpfilter kernel module

2018-05-02 Thread Alexei Starovoitov
bpfilter.ko consists of bpfilter_kern.c (normal kernel module code)
and user mode helper code that is embedded into bpfilter.ko

The steps to build bpfilter.ko are the following:
- main.c is compiled by HOSTCC into the bpfilter_umh elf executable file
- with quite a bit of objcopy and Makefile magic the bpfilter_umh elf file
  is converted into bpfilter_umh.o object file
  with _binary_net_bpfilter_bpfilter_umh_start and _end symbols
  Example:
  $ nm ./bld_x64/net/bpfilter/bpfilter_umh.o
  4cf8 T _binary_net_bpfilter_bpfilter_umh_end
  4cf8 A _binary_net_bpfilter_bpfilter_umh_size
   T _binary_net_bpfilter_bpfilter_umh_start
- bpfilter_umh.o and bpfilter_kern.o are linked together into bpfilter.ko

bpfilter_kern.c is a normal kernel module code that calls
the fork_usermode_blob() helper to execute part of its own data
as a user mode process.

Notice that _binary_net_bpfilter_bpfilter_umh_start - end
is placed into .init.rodata section, so it's freed as soon as __init
function of bpfilter.ko is finished.
As part of __init, bpfilter.ko performs a first request/reply exchange
via the two unix pipes provided by the fork_usermode_blob() helper to
make sure that umh is healthy. If it is not, bpfilter.ko kills it via its pid.

Later bpfilter_process_sockopt() will be called from bpfilter hooks
in get/setsockopt() to pass iptable commands into umh via bpfilter.ko

If the admin does 'rmmod bpfilter', the __exit code of bpfilter.ko will
kill umh as well.

Signed-off-by: Alexei Starovoitov 
---
 include/linux/bpfilter.h  | 15 +++
 include/uapi/linux/bpfilter.h | 21 ++
 net/Kconfig   |  2 +
 net/Makefile  |  1 +
 net/bpfilter/Kconfig  | 17 
 net/bpfilter/Makefile | 24 +++
 net/bpfilter/bpfilter_kern.c  | 93 +++
 net/bpfilter/main.c   | 63 +
 net/bpfilter/msgfmt.h | 17 
 net/ipv4/Makefile |  2 +
 net/ipv4/bpfilter/Makefile|  2 +
 net/ipv4/bpfilter/sockopt.c   | 42 +++
 net/ipv4/ip_sockglue.c| 17 
 13 files changed, 316 insertions(+)
 create mode 100644 include/linux/bpfilter.h
 create mode 100644 include/uapi/linux/bpfilter.h
 create mode 100644 net/bpfilter/Kconfig
 create mode 100644 net/bpfilter/Makefile
 create mode 100644 net/bpfilter/bpfilter_kern.c
 create mode 100644 net/bpfilter/main.c
 create mode 100644 net/bpfilter/msgfmt.h
 create mode 100644 net/ipv4/bpfilter/Makefile
 create mode 100644 net/ipv4/bpfilter/sockopt.c

diff --git a/include/linux/bpfilter.h b/include/linux/bpfilter.h
new file mode 100644
index ..687b1760bb9f
--- /dev/null
+++ b/include/linux/bpfilter.h
@@ -0,0 +1,15 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_BPFILTER_H
+#define _LINUX_BPFILTER_H
+
+#include 
+
+struct sock;
+int bpfilter_ip_set_sockopt(struct sock *sk, int optname, char *optval,
+   unsigned int optlen);
+int bpfilter_ip_get_sockopt(struct sock *sk, int optname, char *optval,
+   int *optlen);
+extern int (*bpfilter_process_sockopt)(struct sock *sk, int optname,
+  char __user *optval,
+  unsigned int optlen, bool is_set);
+#endif
diff --git a/include/uapi/linux/bpfilter.h b/include/uapi/linux/bpfilter.h
new file mode 100644
index ..2ec3cc99ea4c
--- /dev/null
+++ b/include/uapi/linux/bpfilter.h
@@ -0,0 +1,21 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _UAPI_LINUX_BPFILTER_H
+#define _UAPI_LINUX_BPFILTER_H
+
+#include 
+
+enum {
+   BPFILTER_IPT_SO_SET_REPLACE = 64,
+   BPFILTER_IPT_SO_SET_ADD_COUNTERS = 65,
+   BPFILTER_IPT_SET_MAX,
+};
+
+enum {
+   BPFILTER_IPT_SO_GET_INFO = 64,
+   BPFILTER_IPT_SO_GET_ENTRIES = 65,
+   BPFILTER_IPT_SO_GET_REVISION_MATCH = 66,
+   BPFILTER_IPT_SO_GET_REVISION_TARGET = 67,
+   BPFILTER_IPT_GET_MAX,
+};
+
+#endif /* _UAPI_LINUX_BPFILTER_H */
diff --git a/net/Kconfig b/net/Kconfig
index b62089fb1332..ed6368b306fa 100644
--- a/net/Kconfig
+++ b/net/Kconfig
@@ -201,6 +201,8 @@ source "net/bridge/netfilter/Kconfig"
 
 endif
 
+source "net/bpfilter/Kconfig"
+
 source "net/dccp/Kconfig"
 source "net/sctp/Kconfig"
 source "net/rds/Kconfig"
diff --git a/net/Makefile b/net/Makefile
index a6147c61b174..7f982b7682bd 100644
--- a/net/Makefile
+++ b/net/Makefile
@@ -20,6 +20,7 @@ obj-$(CONFIG_TLS) += tls/
 obj-$(CONFIG_XFRM) += xfrm/
 obj-$(CONFIG_UNIX) += unix/
 obj-$(CONFIG_NET)  += ipv6/
+obj-$(CONFIG_BPFILTER) += bpfilter/
 obj-$(CONFIG_PACKET)   += packet/
 obj-$(CONFIG_NET_KEY)  += key/
 obj-$(CONFIG_BRIDGE)   += bridge/
diff --git a/net/bpfilter/Kconfig b/net/bpfilter/Kconfig
new file mode 100644
index ..782a732b9a5c
--- /dev/null
+++ b/net/bpfilter/Kconfig
@@ -0,0 +1,17 @@
+menuconfig 

[PATCH net] macsonic: Set platform device coherent_dma_mask

2018-05-02 Thread Finn Thain
Set the device's coherent_dma_mask to avoid a WARNING splat.
Please see commit 205e1b7f51e4 ("dma-mapping: warn when there is
no coherent_dma_mask").

Cc: linux-m...@lists.linux-m68k.org
Signed-off-by: Finn Thain 
---
 drivers/net/ethernet/natsemi/macsonic.c | 4 
 1 file changed, 4 insertions(+)

diff --git a/drivers/net/ethernet/natsemi/macsonic.c 
b/drivers/net/ethernet/natsemi/macsonic.c
index 0937fc2a928e..37b1ffa8bb61 100644
--- a/drivers/net/ethernet/natsemi/macsonic.c
+++ b/drivers/net/ethernet/natsemi/macsonic.c
@@ -523,6 +523,10 @@ static int mac_sonic_platform_probe(struct platform_device 
*pdev)
struct sonic_local *lp;
int err;
 
+   err = dma_coerce_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(32));
+   if (err)
+   return err;
+
dev = alloc_etherdev(sizeof(struct sonic_local));
if (!dev)
return -ENOMEM;
-- 
2.16.1



[PATCH net] macsonic: Set platform device coherent_dma_mask

2018-05-02 Thread Finn Thain
Set the device's coherent_dma_mask to avoid a WARNING splat.
Please see commit 205e1b7f51e4 ("dma-mapping: warn when there is
no coherent_dma_mask").

Cc: linux-m...@lists.linux-m68k.org
Signed-off-by: Finn Thain 
---
 drivers/net/ethernet/natsemi/macsonic.c | 4 
 1 file changed, 4 insertions(+)

diff --git a/drivers/net/ethernet/natsemi/macsonic.c 
b/drivers/net/ethernet/natsemi/macsonic.c
index 0937fc2a928e..37b1ffa8bb61 100644
--- a/drivers/net/ethernet/natsemi/macsonic.c
+++ b/drivers/net/ethernet/natsemi/macsonic.c
@@ -523,6 +523,10 @@ static int mac_sonic_platform_probe(struct platform_device 
*pdev)
struct sonic_local *lp;
int err;
 
+   err = dma_coerce_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(32));
+   if (err)
+   return err;
+
dev = alloc_etherdev(sizeof(struct sonic_local));
if (!dev)
return -ENOMEM;
-- 
2.16.1



[PATCH net] macmace: Set platform device coherent_dma_mask

2018-05-02 Thread Finn Thain
Set the device's coherent_dma_mask to avoid a WARNING splat.
Please see commit 205e1b7f51e4 ("dma-mapping: warn when there is
no coherent_dma_mask").

Cc: linux-m...@lists.linux-m68k.org
Tested-by: Stan Johnson 
Signed-off-by: Finn Thain 
---
 drivers/net/ethernet/apple/macmace.c | 4 
 1 file changed, 4 insertions(+)

diff --git a/drivers/net/ethernet/apple/macmace.c 
b/drivers/net/ethernet/apple/macmace.c
index 137cbb470af2..98292c49ecf0 100644
--- a/drivers/net/ethernet/apple/macmace.c
+++ b/drivers/net/ethernet/apple/macmace.c
@@ -203,6 +203,10 @@ static int mace_probe(struct platform_device *pdev)
unsigned char checksum = 0;
int err;
 
+   err = dma_coerce_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(32));
+   if (err)
+   return err;
+
dev = alloc_etherdev(PRIV_BYTES);
if (!dev)
return -ENOMEM;
-- 
2.16.1



[PATCH net] macmace: Set platform device coherent_dma_mask

2018-05-02 Thread Finn Thain
Set the device's coherent_dma_mask to avoid a WARNING splat.
Please see commit 205e1b7f51e4 ("dma-mapping: warn when there is
no coherent_dma_mask").

Cc: linux-m...@lists.linux-m68k.org
Tested-by: Stan Johnson 
Signed-off-by: Finn Thain 
---
 drivers/net/ethernet/apple/macmace.c | 4 
 1 file changed, 4 insertions(+)

diff --git a/drivers/net/ethernet/apple/macmace.c 
b/drivers/net/ethernet/apple/macmace.c
index 137cbb470af2..98292c49ecf0 100644
--- a/drivers/net/ethernet/apple/macmace.c
+++ b/drivers/net/ethernet/apple/macmace.c
@@ -203,6 +203,10 @@ static int mace_probe(struct platform_device *pdev)
unsigned char checksum = 0;
int err;
 
+   err = dma_coerce_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(32));
+   if (err)
+   return err;
+
dev = alloc_etherdev(PRIV_BYTES);
if (!dev)
return -ENOMEM;
-- 
2.16.1



[PATCH] clk: qcom: Add support for RCG to register for DFS

2018-05-02 Thread Taniya Das
An RCG that requires Dynamic Frequency Switch (DFS) support needs to
register for it. Registration reads the clock perf level registers at
runtime to identify the supported frequencies and updates the frequency
table accordingly.

Signed-off-by: Taniya Das 
---
 drivers/clk/qcom/clk-rcg.h  |   7 +-
 drivers/clk/qcom/clk-rcg2.c | 172 
 drivers/clk/qcom/common.c   |  23 +-
 drivers/clk/qcom/common.h   |  14 +++-
 4 files changed, 213 insertions(+), 3 deletions(-)

diff --git a/drivers/clk/qcom/clk-rcg.h b/drivers/clk/qcom/clk-rcg.h
index 2a7489a..06de69f 100644
--- a/drivers/clk/qcom/clk-rcg.h
+++ b/drivers/clk/qcom/clk-rcg.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2013, The Linux Foundation. All rights reserved.
+ * Copyright (c) 2013, 2018, The Linux Foundation. All rights reserved.
  *
  * This software is licensed under the terms of the GNU General Public
  * License version 2, as published by the Free Software Foundation, and
@@ -144,6 +144,7 @@ struct clk_dyn_rcg {
  * @cmd_rcgr: corresponds to *_CMD_RCGR
  * @mnd_width: number of bits in m/n/d values
  * @hid_width: number of bits in half integer divider
+ * @flags: additional flag parameters for the RCG
  * @parent_map: map from software's parent index to hardware's src_sel field
  * @freq_tbl: frequency table
  * @clkr: regmap clock handle
@@ -153,6 +154,8 @@ struct clk_rcg2 {
u32 cmd_rcgr;
u8  mnd_width;
u8  hid_width;
+   u8  flags;
+#define DFS_ENABLE_RCG BIT(0)
const struct parent_map *parent_map;
const struct freq_tbl   *freq_tbl;
struct clk_regmap   clkr;
@@ -168,4 +171,6 @@ struct clk_rcg2 {
 extern const struct clk_ops clk_pixel_ops;
 extern const struct clk_ops clk_gfx3d_ops;

+extern int clk_rcg2_get_dfs_clock_rate(struct clk_rcg2 *clk,
+   struct device *dev);
 #endif
diff --git a/drivers/clk/qcom/clk-rcg2.c b/drivers/clk/qcom/clk-rcg2.c
index e63db10..7c35bca 100644
--- a/drivers/clk/qcom/clk-rcg2.c
+++ b/drivers/clk/qcom/clk-rcg2.c
@@ -18,6 +18,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 

@@ -48,6 +49,14 @@
 #define N_REG  0xc
 #define D_REG  0x10

+/* Dynamic Frequency Scaling */
+#define MAX_PERF_LEVEL 16
+#define SE_CMD_DFSR_OFFSET 0x14
+#define SE_CMD_DFS_EN  BIT(0)
+#define SE_PERF_DFSR(level)(0x1c + 0x4 * (level))
+#define SE_PERF_M_DFSR(level)  (0x5c + 0x4 * (level))
+#define SE_PERF_N_DFSR(level)  (0x9c + 0x4 * (level))
+
 enum freq_policy {
FLOOR,
CEIL,
@@ -122,6 +131,10 @@ static int clk_rcg2_set_parent(struct clk_hw *hw, u8 index)
int ret;
u32 cfg = rcg->parent_map[index].cfg << CFG_SRC_SEL_SHIFT;

+   /* In DFS mode skip updating the RCG CFG */
+   if (rcg->flags & DFS_ENABLE_RCG)
+   return 0;
+
ret = regmap_update_bits(rcg->clkr.regmap, rcg->cmd_rcgr + CFG_REG,
 CFG_SRC_SEL_MASK, cfg);
if (ret)
@@ -296,6 +309,9 @@ static int __clk_rcg2_set_rate(struct clk_hw *hw, unsigned 
long rate,
struct clk_rcg2 *rcg = to_clk_rcg2(hw);
const struct freq_tbl *f;

+   if (rcg->flags & DFS_ENABLE_RCG)
+   return -EPERM;
+
switch (policy) {
case FLOOR:
f = qcom_find_freq_floor(rcg->freq_tbl, rate);
@@ -790,3 +806,159 @@ static int clk_gfx3d_set_rate(struct clk_hw *hw, unsigned 
long rate,
.determine_rate = clk_gfx3d_determine_rate,
 };
 EXPORT_SYMBOL_GPL(clk_gfx3d_ops);
+
+/* Common APIs to be used for DFS based RCGR */
+static u8 clk_parent_index_pre_div_and_mode(struct clk_hw *hw, u32 offset,
+   u32 *mode, u32 *pre_div)
+{
+   struct clk_rcg2 *rcg;
+   int num_parents;
+   u32 cfg, mask;
+   int i, ret;
+
+   if (!hw)
+   return -EINVAL;
+
+   num_parents = clk_hw_get_num_parents(hw);
+
+   rcg = to_clk_rcg2(hw);
+
+   ret = regmap_read(rcg->clkr.regmap, rcg->cmd_rcgr + offset, &cfg);
+   if (ret)
+   goto err;
+
+   mask = BIT(rcg->hid_width) - 1;
+   *pre_div = cfg & mask ? (cfg & mask) : 1;
+
+   *mode = cfg & CFG_MODE_MASK;
+   *mode >>= CFG_MODE_SHIFT;
+
+   cfg &= CFG_SRC_SEL_MASK;
+   cfg >>= CFG_SRC_SEL_SHIFT;
+
+   for (i = 0; i < num_parents; i++)
+   if (cfg == rcg->parent_map[i].cfg)
+   return i;
+err:
+   return 0;
+}
+
+static int calculate_m_and_n(struct clk_hw *hw, u32 m_offset, u32 n_offset,
+   u32 mode, u32 *m, u32 *n)
+{
+   struct clk_rcg2 *rcg = to_clk_rcg2(hw);
+   u32 val, mask;
+   int ret = 0;
+
+   if (!hw)
+   return -EINVAL;
+
+   *m = *n = 0;
+
+   if (mode) {
+   /* Calculate M & N values */
+   mask = 

[PATCH] clk: qcom: Add support for RCG to register for DFS

2018-05-02 Thread Taniya Das
In cases where an RCG requires Dynamic Frequency Switch (DFS) support, it
needs to be registered so that, at runtime, the clock perf level registers
are read to identify the supported frequencies and the frequency table is
updated accordingly.

Signed-off-by: Taniya Das 
---
 drivers/clk/qcom/clk-rcg.h  |   7 +-
 drivers/clk/qcom/clk-rcg2.c | 172 
 drivers/clk/qcom/common.c   |  23 +-
 drivers/clk/qcom/common.h   |  14 +++-
 4 files changed, 213 insertions(+), 3 deletions(-)

diff --git a/drivers/clk/qcom/clk-rcg.h b/drivers/clk/qcom/clk-rcg.h
index 2a7489a..06de69f 100644
--- a/drivers/clk/qcom/clk-rcg.h
+++ b/drivers/clk/qcom/clk-rcg.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2013, The Linux Foundation. All rights reserved.
+ * Copyright (c) 2013, 2018, The Linux Foundation. All rights reserved.
  *
  * This software is licensed under the terms of the GNU General Public
  * License version 2, as published by the Free Software Foundation, and
@@ -144,6 +144,7 @@ struct clk_dyn_rcg {
  * @cmd_rcgr: corresponds to *_CMD_RCGR
  * @mnd_width: number of bits in m/n/d values
  * @hid_width: number of bits in half integer divider
+ * @flags: additional flag parameters for the RCG
  * @parent_map: map from software's parent index to hardware's src_sel field
  * @freq_tbl: frequency table
  * @clkr: regmap clock handle
@@ -153,6 +154,8 @@ struct clk_rcg2 {
u32 cmd_rcgr;
u8  mnd_width;
u8  hid_width;
+   u8  flags;
+#define DFS_ENABLE_RCG BIT(0)
const struct parent_map *parent_map;
const struct freq_tbl   *freq_tbl;
struct clk_regmap   clkr;
@@ -168,4 +171,6 @@ struct clk_rcg2 {
 extern const struct clk_ops clk_pixel_ops;
 extern const struct clk_ops clk_gfx3d_ops;

+extern int clk_rcg2_get_dfs_clock_rate(struct clk_rcg2 *clk,
+   struct device *dev);
 #endif
diff --git a/drivers/clk/qcom/clk-rcg2.c b/drivers/clk/qcom/clk-rcg2.c
index e63db10..7c35bca 100644
--- a/drivers/clk/qcom/clk-rcg2.c
+++ b/drivers/clk/qcom/clk-rcg2.c
@@ -18,6 +18,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 

@@ -48,6 +49,14 @@
 #define N_REG  0xc
 #define D_REG  0x10

+/* Dynamic Frequency Scaling */
+#define MAX_PERF_LEVEL 16
+#define SE_CMD_DFSR_OFFSET 0x14
+#define SE_CMD_DFS_EN  BIT(0)
+#define SE_PERF_DFSR(level)(0x1c + 0x4 * (level))
+#define SE_PERF_M_DFSR(level)  (0x5c + 0x4 * (level))
+#define SE_PERF_N_DFSR(level)  (0x9c + 0x4 * (level))
+
 enum freq_policy {
FLOOR,
CEIL,
@@ -122,6 +131,10 @@ static int clk_rcg2_set_parent(struct clk_hw *hw, u8 index)
int ret;
u32 cfg = rcg->parent_map[index].cfg << CFG_SRC_SEL_SHIFT;

+   /* In DFS mode skip updating the RCG CFG */
+   if (rcg->flags & DFS_ENABLE_RCG)
+   return 0;
+
ret = regmap_update_bits(rcg->clkr.regmap, rcg->cmd_rcgr + CFG_REG,
 CFG_SRC_SEL_MASK, cfg);
if (ret)
@@ -296,6 +309,9 @@ static int __clk_rcg2_set_rate(struct clk_hw *hw, unsigned 
long rate,
struct clk_rcg2 *rcg = to_clk_rcg2(hw);
const struct freq_tbl *f;

+   if (rcg->flags & DFS_ENABLE_RCG)
+   return -EPERM;
+
switch (policy) {
case FLOOR:
f = qcom_find_freq_floor(rcg->freq_tbl, rate);
@@ -790,3 +806,159 @@ static int clk_gfx3d_set_rate(struct clk_hw *hw, unsigned 
long rate,
.determine_rate = clk_gfx3d_determine_rate,
 };
 EXPORT_SYMBOL_GPL(clk_gfx3d_ops);
+
+/* Common APIs to be used for DFS based RCGR */
+static u8 clk_parent_index_pre_div_and_mode(struct clk_hw *hw, u32 offset,
+   u32 *mode, u32 *pre_div)
+{
+   struct clk_rcg2 *rcg;
+   int num_parents;
+   u32 cfg, mask;
+   int i, ret;
+
+   if (!hw)
+   return -EINVAL;
+
+   num_parents = clk_hw_get_num_parents(hw);
+
+   rcg = to_clk_rcg2(hw);
+
+   ret = regmap_read(rcg->clkr.regmap, rcg->cmd_rcgr + offset, &cfg);
+   if (ret)
+   goto err;
+
+   mask = BIT(rcg->hid_width) - 1;
+   *pre_div = cfg & mask ? (cfg & mask) : 1;
+
+   *mode = cfg & CFG_MODE_MASK;
+   *mode >>= CFG_MODE_SHIFT;
+
+   cfg &= CFG_SRC_SEL_MASK;
+   cfg >>= CFG_SRC_SEL_SHIFT;
+
+   for (i = 0; i < num_parents; i++)
+   if (cfg == rcg->parent_map[i].cfg)
+   return i;
+err:
+   return 0;
+}
+
+static int calculate_m_and_n(struct clk_hw *hw, u32 m_offset, u32 n_offset,
+   u32 mode, u32 *m, u32 *n)
+{
+   struct clk_rcg2 *rcg = to_clk_rcg2(hw);
+   u32 val, mask;
+   int ret = 0;
+
+   if (!hw)
+   return -EINVAL;
+
+   *m = *n = 0;
+
+   if (mode) {
+   /* Calculate M & N values */
+   mask = BIT(rcg->mnd_width) - 

Re: [PATCH V4 5/8] soc: mediatek: pwrap: add pwrap for mt6797 SoCs

2018-05-02 Thread Sean Wang
Hi, Argus

On Wed, 2018-05-02 at 17:21 +0800, argus@mediatek.com wrote:
> From: Argus Lin 
> 
> mt6797 is a highly integrated SoCs, it uses mt6351 for power
> management. We need to add pwrap support to access mt6351.
> Pwrap of mt6797 support new feature include starvation and channel
> request exception interrupt, dynamic starvation priority
> adjustment mechanism.

suggest line wrapping closely at 75 columns

> 
> Signed-off-by: Argus Lin 
> ---
>  drivers/soc/mediatek/mtk-pmic-wrap.c | 110 
> ---
>  1 file changed, 102 insertions(+), 8 deletions(-)
> 
> diff --git a/drivers/soc/mediatek/mtk-pmic-wrap.c 
> b/drivers/soc/mediatek/mtk-pmic-wrap.c
> index a6366f147b79..0d4a2dae6912 100644
> --- a/drivers/soc/mediatek/mtk-pmic-wrap.c
> +++ b/drivers/soc/mediatek/mtk-pmic-wrap.c
> @@ -284,6 +284,12 @@ enum pwrap_regs {
>   PWRAP_DVFS_WDATA7,
>   PWRAP_SPMINF_STA,
>   PWRAP_CIPHER_EN,
> +
> + /* MT6797 series regs */
> + PWRAP_INT1_EN,
> + PWRAP_INT1_FLG_RAW,
> + PWRAP_INT1_FLG,
> + PWRAP_INT1_CLR,
>  };
>  
>  static int mt2701_regs[] = {
> @@ -372,6 +378,43 @@ static int mt2701_regs[] = {
>   [PWRAP_ADC_RDATA_ADDR2] =   0x154,
>  };
>  
> +static int mt6797_regs[] = {
> + [PWRAP_MUX_SEL] =   0x0,
> + [PWRAP_WRAP_EN] =   0x4,
> + [PWRAP_DIO_EN] =0x8,
> + [PWRAP_SIDLY] = 0xC,
> + [PWRAP_RDDMY] = 0x10,
> + [PWRAP_CSHEXT_WRITE] =  0x18,
> + [PWRAP_CSHEXT_READ] =   0x1C,
> + [PWRAP_CSLEXT_START] =  0x20,
> + [PWRAP_CSLEXT_END] =0x24,
> + [PWRAP_STAUPD_PRD] =0x28,
> + [PWRAP_HARB_HPRIO] =0x50,
> + [PWRAP_HIPRIO_ARB_EN] = 0x54,
> + [PWRAP_MAN_EN] =0x60,
> + [PWRAP_MAN_CMD] =   0x64,
> + [PWRAP_WACS0_EN] =  0x70,
> + [PWRAP_WACS1_EN] =  0x84,
> + [PWRAP_WACS2_EN] =  0x98,
> + [PWRAP_INIT_DONE2] =0x9C,
> + [PWRAP_WACS2_CMD] = 0xA0,
> + [PWRAP_WACS2_RDATA] =   0xA4,
> + [PWRAP_WACS2_VLDCLR] =  0xA8,
> + [PWRAP_INT_EN] =0xC0,
> + [PWRAP_INT_FLG_RAW] =   0xC4,
> + [PWRAP_INT_FLG] =   0xC8,
> + [PWRAP_INT_CLR] =   0xCC,
> + [PWRAP_INT1_EN] =   0xD0,
> + [PWRAP_INT1_FLG_RAW] =  0xD4,
> + [PWRAP_INT1_FLG] =  0xD8,
> + [PWRAP_INT1_CLR] =  0xDC,
> + [PWRAP_TIMER_EN] =  0xF4,
> + [PWRAP_WDT_UNIT] =  0xFC,
> + [PWRAP_WDT_SRC_EN] =0x100,
> + [PWRAP_DCM_EN] =0x1CC,
> + [PWRAP_DCM_DBC_PRD] =   0x1D4,
> +};
> +

trim unused registers if any

>  static int mt7622_regs[] = {
>   [PWRAP_MUX_SEL] =   0x0,
>   [PWRAP_WRAP_EN] =   0x4,
> @@ -647,6 +690,7 @@ enum pmic_type {
>  
>  enum pwrap_type {
>   PWRAP_MT2701,
> + PWRAP_MT6797,
>   PWRAP_MT7622,
>   PWRAP_MT8135,
>   PWRAP_MT8173,
> @@ -1006,6 +1050,12 @@ static void pwrap_init_chip_select_ext(struct 
> pmic_wrapper *wrp, u8 hext_write,
>  static int pwrap_common_init_reg_clock(struct pmic_wrapper *wrp)
>  {
>   switch (wrp->master->type) {
> + case PWRAP_MT6797:
> + pwrap_writel(wrp, 0x8, PWRAP_RDDMY);
> + pwrap_write(wrp, wrp->slave->dew_regs[PWRAP_DEW_RDDMY_NO],
> + 0x8);
> + pwrap_init_chip_select_ext(wrp, 0x88, 0x55, 3, 0);
> + break;

the timing setup is very similar to that of mt2701 + mt6323

so we can merge both into one piece of logic, and then hopefully eliminate the
specific pwrap_mt2701_init_reg_clock entirely

>   case PWRAP_MT8173:
>   pwrap_init_chip_select_ext(wrp, 0, 4, 2, 2);
>   break;
> @@ -1076,11 +1126,14 @@ static int pwrap_init_cipher(struct pmic_wrapper *wrp)
>   break;
>   case PWRAP_MT2701:
>   case PWRAP_MT8173:
> + case PWRAP_MT6797:

need to be listed in alphabetical order

>   pwrap_writel(wrp, 1, PWRAP_CIPHER_EN);
>   break;
>   case PWRAP_MT7622:
>   pwrap_writel(wrp, 0, PWRAP_CIPHER_EN);
>   break;
> + default:
> + break;
>   }
>  
>   /* Config cipher mode @PMIC */
> @@ -1325,6 +1378,15 @@ static irqreturn_t pwrap_interrupt(int irqno, void 
> *dev_id)
>  
>   pwrap_writel(wrp, 0xffffffff, PWRAP_INT_CLR);
>  
> + /* If we support INT1 interrupt, we also need to clear it */
> + if (HAS_CAP(wrp->master->caps, PWRAP_CAP_INT1_EN)) {
> + rdata = pwrap_readl(wrp, PWRAP_INT1_FLG);
> +
> + dev_err(wrp->dev, "unexpected interrupt int1=0x%x\n", rdata);
> +
> + pwrap_writel(wrp, rdata, PWRAP_INT1_CLR);
> + }
> +

it 

Re: [PATCH V4 5/8] soc: mediatek: pwrap: add pwrap for mt6797 SoCs

2018-05-02 Thread Sean Wang
Hi, Argus

On Wed, 2018-05-02 at 17:21 +0800, argus@mediatek.com wrote:
> From: Argus Lin 
> 
> mt6797 is a highly integrated SoCs, it uses mt6351 for power
> management. We need to add pwrap support to access mt6351.
> Pwrap of mt6797 support new feature include starvation and channel
> request exception interrupt, dynamic starvation priority
> adjustment mechanism.

suggest line wrapping closely at 75 columns

> 
> Signed-off-by: Argus Lin 
> ---
>  drivers/soc/mediatek/mtk-pmic-wrap.c | 110 
> ---
>  1 file changed, 102 insertions(+), 8 deletions(-)
> 
> diff --git a/drivers/soc/mediatek/mtk-pmic-wrap.c 
> b/drivers/soc/mediatek/mtk-pmic-wrap.c
> index a6366f147b79..0d4a2dae6912 100644
> --- a/drivers/soc/mediatek/mtk-pmic-wrap.c
> +++ b/drivers/soc/mediatek/mtk-pmic-wrap.c
> @@ -284,6 +284,12 @@ enum pwrap_regs {
>   PWRAP_DVFS_WDATA7,
>   PWRAP_SPMINF_STA,
>   PWRAP_CIPHER_EN,
> +
> + /* MT6797 series regs */
> + PWRAP_INT1_EN,
> + PWRAP_INT1_FLG_RAW,
> + PWRAP_INT1_FLG,
> + PWRAP_INT1_CLR,
>  };
>  
>  static int mt2701_regs[] = {
> @@ -372,6 +378,43 @@ static int mt2701_regs[] = {
>   [PWRAP_ADC_RDATA_ADDR2] =   0x154,
>  };
>  
> +static int mt6797_regs[] = {
> + [PWRAP_MUX_SEL] =   0x0,
> + [PWRAP_WRAP_EN] =   0x4,
> + [PWRAP_DIO_EN] =0x8,
> + [PWRAP_SIDLY] = 0xC,
> + [PWRAP_RDDMY] = 0x10,
> + [PWRAP_CSHEXT_WRITE] =  0x18,
> + [PWRAP_CSHEXT_READ] =   0x1C,
> + [PWRAP_CSLEXT_START] =  0x20,
> + [PWRAP_CSLEXT_END] =0x24,
> + [PWRAP_STAUPD_PRD] =0x28,
> + [PWRAP_HARB_HPRIO] =0x50,
> + [PWRAP_HIPRIO_ARB_EN] = 0x54,
> + [PWRAP_MAN_EN] =0x60,
> + [PWRAP_MAN_CMD] =   0x64,
> + [PWRAP_WACS0_EN] =  0x70,
> + [PWRAP_WACS1_EN] =  0x84,
> + [PWRAP_WACS2_EN] =  0x98,
> + [PWRAP_INIT_DONE2] =0x9C,
> + [PWRAP_WACS2_CMD] = 0xA0,
> + [PWRAP_WACS2_RDATA] =   0xA4,
> + [PWRAP_WACS2_VLDCLR] =  0xA8,
> + [PWRAP_INT_EN] =0xC0,
> + [PWRAP_INT_FLG_RAW] =   0xC4,
> + [PWRAP_INT_FLG] =   0xC8,
> + [PWRAP_INT_CLR] =   0xCC,
> + [PWRAP_INT1_EN] =   0xD0,
> + [PWRAP_INT1_FLG_RAW] =  0xD4,
> + [PWRAP_INT1_FLG] =  0xD8,
> + [PWRAP_INT1_CLR] =  0xDC,
> + [PWRAP_TIMER_EN] =  0xF4,
> + [PWRAP_WDT_UNIT] =  0xFC,
> + [PWRAP_WDT_SRC_EN] =0x100,
> + [PWRAP_DCM_EN] =0x1CC,
> + [PWRAP_DCM_DBC_PRD] =   0x1D4,
> +};
> +

trim unused registers if any

>  static int mt7622_regs[] = {
>   [PWRAP_MUX_SEL] =   0x0,
>   [PWRAP_WRAP_EN] =   0x4,
> @@ -647,6 +690,7 @@ enum pmic_type {
>  
>  enum pwrap_type {
>   PWRAP_MT2701,
> + PWRAP_MT6797,
>   PWRAP_MT7622,
>   PWRAP_MT8135,
>   PWRAP_MT8173,
> @@ -1006,6 +1050,12 @@ static void pwrap_init_chip_select_ext(struct 
> pmic_wrapper *wrp, u8 hext_write,
>  static int pwrap_common_init_reg_clock(struct pmic_wrapper *wrp)
>  {
>   switch (wrp->master->type) {
> + case PWRAP_MT6797:
> + pwrap_writel(wrp, 0x8, PWRAP_RDDMY);
> + pwrap_write(wrp, wrp->slave->dew_regs[PWRAP_DEW_RDDMY_NO],
> + 0x8);
> + pwrap_init_chip_select_ext(wrp, 0x88, 0x55, 3, 0);
> + break;

the timing setup is very similar to that of mt2701 + mt6323

so we can merge both into one piece of logic, and then hopefully eliminate the
specific pwrap_mt2701_init_reg_clock entirely

>   case PWRAP_MT8173:
>   pwrap_init_chip_select_ext(wrp, 0, 4, 2, 2);
>   break;
> @@ -1076,11 +1126,14 @@ static int pwrap_init_cipher(struct pmic_wrapper *wrp)
>   break;
>   case PWRAP_MT2701:
>   case PWRAP_MT8173:
> + case PWRAP_MT6797:

need to be listed in alphabetical order

>   pwrap_writel(wrp, 1, PWRAP_CIPHER_EN);
>   break;
>   case PWRAP_MT7622:
>   pwrap_writel(wrp, 0, PWRAP_CIPHER_EN);
>   break;
> + default:
> + break;
>   }
>  
>   /* Config cipher mode @PMIC */
> @@ -1325,6 +1378,15 @@ static irqreturn_t pwrap_interrupt(int irqno, void 
> *dev_id)
>  
>   pwrap_writel(wrp, 0xffffffff, PWRAP_INT_CLR);
>  
> + /* If we support INT1 interrupt, we also need to clear it */
> + if (HAS_CAP(wrp->master->caps, PWRAP_CAP_INT1_EN)) {
> + rdata = pwrap_readl(wrp, PWRAP_INT1_FLG);
> +
> + dev_err(wrp->dev, "unexpected interrupt int1=0x%x\n", rdata);
> +
> + pwrap_writel(wrp, rdata, PWRAP_INT1_CLR);
> + }
> +

it seems not required to add PWRAP_CAP_INT1_EN:

the CAP 

  1   2   3   4   5   6   7   8   9   10   >