[PATCH v2 3/4] watchdog: iTCO_wdt: Use pdev for platform device and pci_dev for pci device

2017-01-03 Thread Guenter Roeck
Use pdev for struct platform_device, pci_dev for struct pci_dev, and dev
for struct device variables to improve consistency.

Remove 'struct platform_device *dev;' from struct iTCO_wdt_private since
it was unused.

Reviewed-by: Andy Shevchenko 
Signed-off-by: Guenter Roeck 
---
v2: Named pci device variable pci_dev (from pcidev)
Added Reviewed-by:

 drivers/watchdog/iTCO_wdt.c | 53 ++---
 1 file changed, 26 insertions(+), 27 deletions(-)

diff --git a/drivers/watchdog/iTCO_wdt.c b/drivers/watchdog/iTCO_wdt.c
index eed1dee6de19..9b60f4201c26 100644
--- a/drivers/watchdog/iTCO_wdt.c
+++ b/drivers/watchdog/iTCO_wdt.c
@@ -102,9 +102,8 @@ struct iTCO_wdt_private {
unsigned long __iomem *gcs_pmc;
/* the lock for io operations */
spinlock_t io_lock;
-   struct platform_device *dev;
/* the PCI-device */
-   struct pci_dev *pdev;
+   struct pci_dev *pci_dev;
/* whether or not the watchdog has been suspended */
bool suspended;
 };
@@ -181,9 +180,9 @@ static void iTCO_wdt_set_NO_REBOOT_bit(struct 
iTCO_wdt_private *p)
val32 |= no_reboot_bit(p);
writel(val32, p->gcs_pmc);
} else if (p->iTCO_version == 1) {
-   pci_read_config_dword(p->pdev, 0xd4, );
+   pci_read_config_dword(p->pci_dev, 0xd4, );
val32 |= no_reboot_bit(p);
-   pci_write_config_dword(p->pdev, 0xd4, val32);
+   pci_write_config_dword(p->pci_dev, 0xd4, val32);
}
 }
 
@@ -200,11 +199,11 @@ static int iTCO_wdt_unset_NO_REBOOT_bit(struct 
iTCO_wdt_private *p)
 
val32 = readl(p->gcs_pmc);
} else if (p->iTCO_version == 1) {
-   pci_read_config_dword(p->pdev, 0xd4, );
+   pci_read_config_dword(p->pci_dev, 0xd4, );
val32 &= ~enable_bit;
-   pci_write_config_dword(p->pdev, 0xd4, val32);
+   pci_write_config_dword(p->pci_dev, 0xd4, val32);
 
-   pci_read_config_dword(p->pdev, 0xd4, );
+   pci_read_config_dword(p->pci_dev, 0xd4, );
}
 
if (val32 & enable_bit)
@@ -401,9 +400,10 @@ static const struct watchdog_ops iTCO_wdt_ops = {
  * Init & exit routines
  */
 
-static int iTCO_wdt_probe(struct platform_device *dev)
+static int iTCO_wdt_probe(struct platform_device *pdev)
 {
-   struct itco_wdt_platform_data *pdata = dev_get_platdata(>dev);
+   struct device *dev = >dev;
+   struct itco_wdt_platform_data *pdata = dev_get_platdata(dev);
struct iTCO_wdt_private *p;
unsigned long val32;
int ret;
@@ -411,33 +411,32 @@ static int iTCO_wdt_probe(struct platform_device *dev)
if (!pdata)
return -ENODEV;
 
-   p = devm_kzalloc(>dev, sizeof(*p), GFP_KERNEL);
+   p = devm_kzalloc(dev, sizeof(*p), GFP_KERNEL);
if (!p)
return -ENOMEM;
 
spin_lock_init(>io_lock);
 
-   p->tco_res = platform_get_resource(dev, IORESOURCE_IO, ICH_RES_IO_TCO);
+   p->tco_res = platform_get_resource(pdev, IORESOURCE_IO, ICH_RES_IO_TCO);
if (!p->tco_res)
return -ENODEV;
 
-   p->smi_res = platform_get_resource(dev, IORESOURCE_IO, ICH_RES_IO_SMI);
+   p->smi_res = platform_get_resource(pdev, IORESOURCE_IO, ICH_RES_IO_SMI);
if (!p->smi_res)
return -ENODEV;
 
p->iTCO_version = pdata->version;
-   p->dev = dev;
-   p->pdev = to_pci_dev(dev->dev.parent);
+   p->pci_dev = to_pci_dev(dev->parent);
 
/*
 * Get the Memory-Mapped GCS or PMC register, we need it for the
 * NO_REBOOT flag (TCO v2 and v3).
 */
if (p->iTCO_version >= 2) {
-   p->gcs_pmc_res = platform_get_resource(dev,
+   p->gcs_pmc_res = platform_get_resource(pdev,
   IORESOURCE_MEM,
   ICH_RES_MEM_GCS_PMC);
-   p->gcs_pmc = devm_ioremap_resource(>dev, p->gcs_pmc_res);
+   p->gcs_pmc = devm_ioremap_resource(dev, p->gcs_pmc_res);
if (IS_ERR(p->gcs_pmc))
return PTR_ERR(p->gcs_pmc);
}
@@ -453,9 +452,9 @@ static int iTCO_wdt_probe(struct platform_device *dev)
iTCO_wdt_set_NO_REBOOT_bit(p);
 
/* The TCO logic uses the TCO_EN bit in the SMI_EN register */
-   if (!devm_request_region(>dev, p->smi_res->start,
+   if (!devm_request_region(dev, p->smi_res->start,
 resource_size(p->smi_res),
-dev->name)) {
+pdev->name)) {
pr_err("I/O address 0x%04llx already in use, device disabled\n",
   (u64)SMI_EN(p));
return -EBUSY;
@@ -470,9 +469,9 @@ static int iTCO_wdt_probe(struct 

[PATCH v2 1/4] watchdog: iTCO_wdt: Use allocated data structures

2017-01-03 Thread Guenter Roeck
Allocate private data and the watchdog device to avoid having
to clear it on remove and to enable subsequent simplifications.

Reviewed-by: Andy Shevchenko 
Signed-off-by: Guenter Roeck 
---
v2: Fixed typo in description
Added Reviewed-by:

 drivers/watchdog/iTCO_wdt.c | 402 ++--
 1 file changed, 205 insertions(+), 197 deletions(-)

diff --git a/drivers/watchdog/iTCO_wdt.c b/drivers/watchdog/iTCO_wdt.c
index 06fcb6c8c917..a35a9164ccd0 100644
--- a/drivers/watchdog/iTCO_wdt.c
+++ b/drivers/watchdog/iTCO_wdt.c
@@ -72,22 +72,24 @@
 
 /* Address definitions for the TCO */
 /* TCO base address */
-#define TCOBASE(iTCO_wdt_private.tco_res->start)
+#define TCOBASE(p) ((p)->tco_res->start)
 /* SMI Control and Enable Register */
-#define SMI_EN (iTCO_wdt_private.smi_res->start)
-
-#define TCO_RLD(TCOBASE + 0x00) /* TCO Timer Reload and Curr. 
Value */
-#define TCOv1_TMR  (TCOBASE + 0x01) /* TCOv1 Timer Initial Value   */
-#define TCO_DAT_IN (TCOBASE + 0x02) /* TCO Data In Register*/
-#define TCO_DAT_OUT(TCOBASE + 0x03) /* TCO Data Out Register   */
-#define TCO1_STS   (TCOBASE + 0x04) /* TCO1 Status Register*/
-#define TCO2_STS   (TCOBASE + 0x06) /* TCO2 Status Register*/
-#define TCO1_CNT   (TCOBASE + 0x08) /* TCO1 Control Register   */
-#define TCO2_CNT   (TCOBASE + 0x0a) /* TCO2 Control Register   */
-#define TCOv2_TMR  (TCOBASE + 0x12) /* TCOv2 Timer Initial Value   */
+#define SMI_EN(p)  ((p)->smi_res->start)
+
+#define TCO_RLD(p) (TCOBASE(p) + 0x00) /* TCO Timer Reload/Curr. Value */
+#define TCOv1_TMR(p)   (TCOBASE(p) + 0x01) /* TCOv1 Timer Initial Value*/
+#define TCO_DAT_IN(p)  (TCOBASE(p) + 0x02) /* TCO Data In Register */
+#define TCO_DAT_OUT(p) (TCOBASE(p) + 0x03) /* TCO Data Out Register*/
+#define TCO1_STS(p)(TCOBASE(p) + 0x04) /* TCO1 Status Register */
+#define TCO2_STS(p)(TCOBASE(p) + 0x06) /* TCO2 Status Register */
+#define TCO1_CNT(p)(TCOBASE(p) + 0x08) /* TCO1 Control Register*/
+#define TCO2_CNT(p)(TCOBASE(p) + 0x0a) /* TCO2 Control Register*/
+#define TCOv2_TMR(p)   (TCOBASE(p) + 0x12) /* TCOv2 Timer Initial Value*/
 
 /* internal variables */
-static struct {/* this is private data for the iTCO_wdt device 
*/
+struct iTCO_wdt_private {
+   struct watchdog_device wddev;
+
/* TCO version/generation */
unsigned int iTCO_version;
struct resource *tco_res;
@@ -105,7 +107,7 @@ static struct { /* this is private data for the 
iTCO_wdt device */
struct pci_dev *pdev;
/* whether or not the watchdog has been suspended */
bool suspended;
-} iTCO_wdt_private;
+};
 
 /* module parameters */
 #define WATCHDOG_TIMEOUT 30/* 30 sec default heartbeat */
@@ -135,21 +137,23 @@ MODULE_PARM_DESC(turn_SMI_watchdog_clear_off,
  * every 0.6 seconds.  v3's internal timer is stored as seconds (some
  * datasheets incorrectly state 0.6 seconds).
  */
-static inline unsigned int seconds_to_ticks(int secs)
+static inline unsigned int seconds_to_ticks(struct iTCO_wdt_private *p,
+   int secs)
 {
-   return iTCO_wdt_private.iTCO_version == 3 ? secs : (secs * 10) / 6;
+   return p->iTCO_version == 3 ? secs : (secs * 10) / 6;
 }
 
-static inline unsigned int ticks_to_seconds(int ticks)
+static inline unsigned int ticks_to_seconds(struct iTCO_wdt_private *p,
+   int ticks)
 {
-   return iTCO_wdt_private.iTCO_version == 3 ? ticks : (ticks * 6) / 10;
+   return p->iTCO_version == 3 ? ticks : (ticks * 6) / 10;
 }
 
-static inline u32 no_reboot_bit(void)
+static inline u32 no_reboot_bit(struct iTCO_wdt_private *p)
 {
u32 enable_bit;
 
-   switch (iTCO_wdt_private.iTCO_version) {
+   switch (p->iTCO_version) {
case 5:
case 3:
enable_bit = 0x0010;
@@ -167,40 +171,40 @@ static inline u32 no_reboot_bit(void)
return enable_bit;
 }
 
-static void iTCO_wdt_set_NO_REBOOT_bit(void)
+static void iTCO_wdt_set_NO_REBOOT_bit(struct iTCO_wdt_private *p)
 {
u32 val32;
 
/* Set the NO_REBOOT bit: this disables reboots */
-   if (iTCO_wdt_private.iTCO_version >= 2) {
-   val32 = readl(iTCO_wdt_private.gcs_pmc);
-   val32 |= no_reboot_bit();
-   writel(val32, iTCO_wdt_private.gcs_pmc);
-   } else if (iTCO_wdt_private.iTCO_version == 1) {
-   pci_read_config_dword(iTCO_wdt_private.pdev, 0xd4, );
-   val32 |= no_reboot_bit();
-   pci_write_config_dword(iTCO_wdt_private.pdev, 0xd4, val32);
+   if (p->iTCO_version >= 2) {
+   val32 = readl(p->gcs_pmc);
+   val32 |= no_reboot_bit(p);
+   writel(val32, p->gcs_pmc);
+   } else if 

Re: [PATCH net 9/9] virtio-net: XDP support for small buffers

2017-01-03 Thread Jason Wang



On 2017年01月04日 00:40, John Fastabend wrote:

On 17-01-02 10:16 PM, Jason Wang wrote:


On 2017年01月03日 06:43, John Fastabend wrote:

On 16-12-23 06:37 AM, Jason Wang wrote:

Commit f600b6905015 ("virtio_net: Add XDP support") leaves the case of
small receive buffer untouched. This will confuse the user who want to
set XDP but use small buffers. Other than forbid XDP in small buffer
mode, let's make it work. XDP then can only work at skb->data since
virtio-net create skbs during refill, this is sub optimal which could
be optimized in the future.

Cc: John Fastabend 
Signed-off-by: Jason Wang 
---
   drivers/net/virtio_net.c | 112 
---
   1 file changed, 87 insertions(+), 25 deletions(-)


Hi Jason,

I was doing some more testing on this what do you think about doing this
so that free_unused_bufs() handles the buffer free with dev_kfree_skb()
instead of put_page in small receive mode. Seems more correct to me.


diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index 783e842..27ff76c 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -1898,6 +1898,10 @@ static void free_receive_page_frags(struct virtnet_info
*vi)

   static bool is_xdp_queue(struct virtnet_info *vi, int q)
   {
+   /* For small receive mode always use kfree_skb variants */
+   if (!vi->mergeable_rx_bufs)
+   return false;
+
  if (q < (vi->curr_queue_pairs - vi->xdp_queue_pairs))
  return false;
  else if (q < vi->curr_queue_pairs)


patch is untested just spotted doing code review.

Thanks,
John

We probably need a better name for this function.

Acked-by: Jason Wang 


How about is_xdp_raw_buffer_queue()?

I'll submit a proper patch today.


Sounds good, thanks.



[PATCH v2 3/4] watchdog: iTCO_wdt: Use pdev for platform device and pci_dev for pci device

2017-01-03 Thread Guenter Roeck
Use pdev for struct platform_device, pci_dev for struct pci_dev, and dev
for struct device variables to improve consistency.

Remove 'struct platform_device *dev;' from struct iTCO_wdt_private since
it was unused.

Reviewed-by: Andy Shevchenko 
Signed-off-by: Guenter Roeck 
---
v2: Named pci device variable pci_dev (from pcidev)
Added Reviewed-by:

 drivers/watchdog/iTCO_wdt.c | 53 ++---
 1 file changed, 26 insertions(+), 27 deletions(-)

diff --git a/drivers/watchdog/iTCO_wdt.c b/drivers/watchdog/iTCO_wdt.c
index eed1dee6de19..9b60f4201c26 100644
--- a/drivers/watchdog/iTCO_wdt.c
+++ b/drivers/watchdog/iTCO_wdt.c
@@ -102,9 +102,8 @@ struct iTCO_wdt_private {
unsigned long __iomem *gcs_pmc;
/* the lock for io operations */
spinlock_t io_lock;
-   struct platform_device *dev;
/* the PCI-device */
-   struct pci_dev *pdev;
+   struct pci_dev *pci_dev;
/* whether or not the watchdog has been suspended */
bool suspended;
 };
@@ -181,9 +180,9 @@ static void iTCO_wdt_set_NO_REBOOT_bit(struct 
iTCO_wdt_private *p)
val32 |= no_reboot_bit(p);
writel(val32, p->gcs_pmc);
} else if (p->iTCO_version == 1) {
-   pci_read_config_dword(p->pdev, 0xd4, );
+   pci_read_config_dword(p->pci_dev, 0xd4, );
val32 |= no_reboot_bit(p);
-   pci_write_config_dword(p->pdev, 0xd4, val32);
+   pci_write_config_dword(p->pci_dev, 0xd4, val32);
}
 }
 
@@ -200,11 +199,11 @@ static int iTCO_wdt_unset_NO_REBOOT_bit(struct 
iTCO_wdt_private *p)
 
val32 = readl(p->gcs_pmc);
} else if (p->iTCO_version == 1) {
-   pci_read_config_dword(p->pdev, 0xd4, );
+   pci_read_config_dword(p->pci_dev, 0xd4, );
val32 &= ~enable_bit;
-   pci_write_config_dword(p->pdev, 0xd4, val32);
+   pci_write_config_dword(p->pci_dev, 0xd4, val32);
 
-   pci_read_config_dword(p->pdev, 0xd4, );
+   pci_read_config_dword(p->pci_dev, 0xd4, );
}
 
if (val32 & enable_bit)
@@ -401,9 +400,10 @@ static const struct watchdog_ops iTCO_wdt_ops = {
  * Init & exit routines
  */
 
-static int iTCO_wdt_probe(struct platform_device *dev)
+static int iTCO_wdt_probe(struct platform_device *pdev)
 {
-   struct itco_wdt_platform_data *pdata = dev_get_platdata(>dev);
+   struct device *dev = >dev;
+   struct itco_wdt_platform_data *pdata = dev_get_platdata(dev);
struct iTCO_wdt_private *p;
unsigned long val32;
int ret;
@@ -411,33 +411,32 @@ static int iTCO_wdt_probe(struct platform_device *dev)
if (!pdata)
return -ENODEV;
 
-   p = devm_kzalloc(>dev, sizeof(*p), GFP_KERNEL);
+   p = devm_kzalloc(dev, sizeof(*p), GFP_KERNEL);
if (!p)
return -ENOMEM;
 
spin_lock_init(>io_lock);
 
-   p->tco_res = platform_get_resource(dev, IORESOURCE_IO, ICH_RES_IO_TCO);
+   p->tco_res = platform_get_resource(pdev, IORESOURCE_IO, ICH_RES_IO_TCO);
if (!p->tco_res)
return -ENODEV;
 
-   p->smi_res = platform_get_resource(dev, IORESOURCE_IO, ICH_RES_IO_SMI);
+   p->smi_res = platform_get_resource(pdev, IORESOURCE_IO, ICH_RES_IO_SMI);
if (!p->smi_res)
return -ENODEV;
 
p->iTCO_version = pdata->version;
-   p->dev = dev;
-   p->pdev = to_pci_dev(dev->dev.parent);
+   p->pci_dev = to_pci_dev(dev->parent);
 
/*
 * Get the Memory-Mapped GCS or PMC register, we need it for the
 * NO_REBOOT flag (TCO v2 and v3).
 */
if (p->iTCO_version >= 2) {
-   p->gcs_pmc_res = platform_get_resource(dev,
+   p->gcs_pmc_res = platform_get_resource(pdev,
   IORESOURCE_MEM,
   ICH_RES_MEM_GCS_PMC);
-   p->gcs_pmc = devm_ioremap_resource(>dev, p->gcs_pmc_res);
+   p->gcs_pmc = devm_ioremap_resource(dev, p->gcs_pmc_res);
if (IS_ERR(p->gcs_pmc))
return PTR_ERR(p->gcs_pmc);
}
@@ -453,9 +452,9 @@ static int iTCO_wdt_probe(struct platform_device *dev)
iTCO_wdt_set_NO_REBOOT_bit(p);
 
/* The TCO logic uses the TCO_EN bit in the SMI_EN register */
-   if (!devm_request_region(>dev, p->smi_res->start,
+   if (!devm_request_region(dev, p->smi_res->start,
 resource_size(p->smi_res),
-dev->name)) {
+pdev->name)) {
pr_err("I/O address 0x%04llx already in use, device disabled\n",
   (u64)SMI_EN(p));
return -EBUSY;
@@ -470,9 +469,9 @@ static int iTCO_wdt_probe(struct platform_device *dev)
outl(val32, 

[PATCH v2 1/4] watchdog: iTCO_wdt: Use allocated data structures

2017-01-03 Thread Guenter Roeck
Allocate private data and the watchdog device to avoid having
to clear it on remove and to enable subsequent simplifications.

Reviewed-by: Andy Shevchenko 
Signed-off-by: Guenter Roeck 
---
v2: Fixed typo in description
Added Reviewed-by:

 drivers/watchdog/iTCO_wdt.c | 402 ++--
 1 file changed, 205 insertions(+), 197 deletions(-)

diff --git a/drivers/watchdog/iTCO_wdt.c b/drivers/watchdog/iTCO_wdt.c
index 06fcb6c8c917..a35a9164ccd0 100644
--- a/drivers/watchdog/iTCO_wdt.c
+++ b/drivers/watchdog/iTCO_wdt.c
@@ -72,22 +72,24 @@
 
 /* Address definitions for the TCO */
 /* TCO base address */
-#define TCOBASE(iTCO_wdt_private.tco_res->start)
+#define TCOBASE(p) ((p)->tco_res->start)
 /* SMI Control and Enable Register */
-#define SMI_EN (iTCO_wdt_private.smi_res->start)
-
-#define TCO_RLD(TCOBASE + 0x00) /* TCO Timer Reload and Curr. 
Value */
-#define TCOv1_TMR  (TCOBASE + 0x01) /* TCOv1 Timer Initial Value   */
-#define TCO_DAT_IN (TCOBASE + 0x02) /* TCO Data In Register*/
-#define TCO_DAT_OUT(TCOBASE + 0x03) /* TCO Data Out Register   */
-#define TCO1_STS   (TCOBASE + 0x04) /* TCO1 Status Register*/
-#define TCO2_STS   (TCOBASE + 0x06) /* TCO2 Status Register*/
-#define TCO1_CNT   (TCOBASE + 0x08) /* TCO1 Control Register   */
-#define TCO2_CNT   (TCOBASE + 0x0a) /* TCO2 Control Register   */
-#define TCOv2_TMR  (TCOBASE + 0x12) /* TCOv2 Timer Initial Value   */
+#define SMI_EN(p)  ((p)->smi_res->start)
+
+#define TCO_RLD(p) (TCOBASE(p) + 0x00) /* TCO Timer Reload/Curr. Value */
+#define TCOv1_TMR(p)   (TCOBASE(p) + 0x01) /* TCOv1 Timer Initial Value*/
+#define TCO_DAT_IN(p)  (TCOBASE(p) + 0x02) /* TCO Data In Register */
+#define TCO_DAT_OUT(p) (TCOBASE(p) + 0x03) /* TCO Data Out Register*/
+#define TCO1_STS(p)(TCOBASE(p) + 0x04) /* TCO1 Status Register */
+#define TCO2_STS(p)(TCOBASE(p) + 0x06) /* TCO2 Status Register */
+#define TCO1_CNT(p)(TCOBASE(p) + 0x08) /* TCO1 Control Register*/
+#define TCO2_CNT(p)(TCOBASE(p) + 0x0a) /* TCO2 Control Register*/
+#define TCOv2_TMR(p)   (TCOBASE(p) + 0x12) /* TCOv2 Timer Initial Value*/
 
 /* internal variables */
-static struct {/* this is private data for the iTCO_wdt device 
*/
+struct iTCO_wdt_private {
+   struct watchdog_device wddev;
+
/* TCO version/generation */
unsigned int iTCO_version;
struct resource *tco_res;
@@ -105,7 +107,7 @@ static struct { /* this is private data for the 
iTCO_wdt device */
struct pci_dev *pdev;
/* whether or not the watchdog has been suspended */
bool suspended;
-} iTCO_wdt_private;
+};
 
 /* module parameters */
 #define WATCHDOG_TIMEOUT 30/* 30 sec default heartbeat */
@@ -135,21 +137,23 @@ MODULE_PARM_DESC(turn_SMI_watchdog_clear_off,
  * every 0.6 seconds.  v3's internal timer is stored as seconds (some
  * datasheets incorrectly state 0.6 seconds).
  */
-static inline unsigned int seconds_to_ticks(int secs)
+static inline unsigned int seconds_to_ticks(struct iTCO_wdt_private *p,
+   int secs)
 {
-   return iTCO_wdt_private.iTCO_version == 3 ? secs : (secs * 10) / 6;
+   return p->iTCO_version == 3 ? secs : (secs * 10) / 6;
 }
 
-static inline unsigned int ticks_to_seconds(int ticks)
+static inline unsigned int ticks_to_seconds(struct iTCO_wdt_private *p,
+   int ticks)
 {
-   return iTCO_wdt_private.iTCO_version == 3 ? ticks : (ticks * 6) / 10;
+   return p->iTCO_version == 3 ? ticks : (ticks * 6) / 10;
 }
 
-static inline u32 no_reboot_bit(void)
+static inline u32 no_reboot_bit(struct iTCO_wdt_private *p)
 {
u32 enable_bit;
 
-   switch (iTCO_wdt_private.iTCO_version) {
+   switch (p->iTCO_version) {
case 5:
case 3:
enable_bit = 0x0010;
@@ -167,40 +171,40 @@ static inline u32 no_reboot_bit(void)
return enable_bit;
 }
 
-static void iTCO_wdt_set_NO_REBOOT_bit(void)
+static void iTCO_wdt_set_NO_REBOOT_bit(struct iTCO_wdt_private *p)
 {
u32 val32;
 
/* Set the NO_REBOOT bit: this disables reboots */
-   if (iTCO_wdt_private.iTCO_version >= 2) {
-   val32 = readl(iTCO_wdt_private.gcs_pmc);
-   val32 |= no_reboot_bit();
-   writel(val32, iTCO_wdt_private.gcs_pmc);
-   } else if (iTCO_wdt_private.iTCO_version == 1) {
-   pci_read_config_dword(iTCO_wdt_private.pdev, 0xd4, );
-   val32 |= no_reboot_bit();
-   pci_write_config_dword(iTCO_wdt_private.pdev, 0xd4, val32);
+   if (p->iTCO_version >= 2) {
+   val32 = readl(p->gcs_pmc);
+   val32 |= no_reboot_bit(p);
+   writel(val32, p->gcs_pmc);
+   } else if (p->iTCO_version == 1) {
+   

Re: [PATCH net 9/9] virtio-net: XDP support for small buffers

2017-01-03 Thread Jason Wang



On 2017年01月04日 00:40, John Fastabend wrote:

On 17-01-02 10:16 PM, Jason Wang wrote:


On 2017年01月03日 06:43, John Fastabend wrote:

On 16-12-23 06:37 AM, Jason Wang wrote:

Commit f600b6905015 ("virtio_net: Add XDP support") leaves the case of
small receive buffer untouched. This will confuse the user who want to
set XDP but use small buffers. Other than forbid XDP in small buffer
mode, let's make it work. XDP then can only work at skb->data since
virtio-net create skbs during refill, this is sub optimal which could
be optimized in the future.

Cc: John Fastabend 
Signed-off-by: Jason Wang 
---
   drivers/net/virtio_net.c | 112 
---
   1 file changed, 87 insertions(+), 25 deletions(-)


Hi Jason,

I was doing some more testing on this what do you think about doing this
so that free_unused_bufs() handles the buffer free with dev_kfree_skb()
instead of put_page in small receive mode. Seems more correct to me.


diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index 783e842..27ff76c 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -1898,6 +1898,10 @@ static void free_receive_page_frags(struct virtnet_info
*vi)

   static bool is_xdp_queue(struct virtnet_info *vi, int q)
   {
+   /* For small receive mode always use kfree_skb variants */
+   if (!vi->mergeable_rx_bufs)
+   return false;
+
  if (q < (vi->curr_queue_pairs - vi->xdp_queue_pairs))
  return false;
  else if (q < vi->curr_queue_pairs)


patch is untested just spotted doing code review.

Thanks,
John

We probably need a better name for this function.

Acked-by: Jason Wang 


How about is_xdp_raw_buffer_queue()?

I'll submit a proper patch today.


Sounds good, thanks.



[PATCH v20 0/4] Mediatek MT8173 CMDQ support

2017-01-03 Thread HS Liao

Hi,

This is Mediatek MT8173 Command Queue(CMDQ) driver. The CMDQ is used
to help write registers with critical time limitation, such as
updating display configuration during the vblank. It controls Global
Command Engine (GCE) hardware to achieve this requirement.

These patches have a build dependency on top of v4.10-rc2.

Changes since v19:
 - rebase to v4.10-rc2

Best regards,
HS Liao

HS Liao (4):
  dt-bindings: soc: Add documentation for the MediaTek GCE unit
  mailbox: mediatek: Add Mediatek CMDQ driver
  arm64: dts: mt8173: Add GCE node
  soc: mediatek: Add Mediatek CMDQ helper

 .../devicetree/bindings/mailbox/mtk-gce.txt|  43 ++
 arch/arm64/boot/dts/mediatek/mt8173.dtsi   |  10 +
 drivers/mailbox/Kconfig|  10 +
 drivers/mailbox/Makefile   |   2 +
 drivers/mailbox/mtk-cmdq-mailbox.c | 596 +
 drivers/soc/mediatek/Kconfig   |  12 +
 drivers/soc/mediatek/Makefile  |   1 +
 drivers/soc/mediatek/mtk-cmdq-helper.c | 310 +++
 include/linux/mailbox/mtk-cmdq-mailbox.h   |  75 +++
 include/linux/soc/mediatek/mtk-cmdq.h  | 174 ++
 10 files changed, 1233 insertions(+)
 create mode 100644 Documentation/devicetree/bindings/mailbox/mtk-gce.txt
 create mode 100644 drivers/mailbox/mtk-cmdq-mailbox.c
 create mode 100644 drivers/soc/mediatek/mtk-cmdq-helper.c
 create mode 100644 include/linux/mailbox/mtk-cmdq-mailbox.h
 create mode 100644 include/linux/soc/mediatek/mtk-cmdq.h

-- 
1.9.1



[PATCH v20 0/4] Mediatek MT8173 CMDQ support

2017-01-03 Thread HS Liao

Hi,

This is Mediatek MT8173 Command Queue(CMDQ) driver. The CMDQ is used
to help write registers with critical time limitation, such as
updating display configuration during the vblank. It controls Global
Command Engine (GCE) hardware to achieve this requirement.

These patches have a build dependency on top of v4.10-rc2.

Changes since v19:
 - rebase to v4.10-rc2

Best regards,
HS Liao

HS Liao (4):
  dt-bindings: soc: Add documentation for the MediaTek GCE unit
  mailbox: mediatek: Add Mediatek CMDQ driver
  arm64: dts: mt8173: Add GCE node
  soc: mediatek: Add Mediatek CMDQ helper

 .../devicetree/bindings/mailbox/mtk-gce.txt|  43 ++
 arch/arm64/boot/dts/mediatek/mt8173.dtsi   |  10 +
 drivers/mailbox/Kconfig|  10 +
 drivers/mailbox/Makefile   |   2 +
 drivers/mailbox/mtk-cmdq-mailbox.c | 596 +
 drivers/soc/mediatek/Kconfig   |  12 +
 drivers/soc/mediatek/Makefile  |   1 +
 drivers/soc/mediatek/mtk-cmdq-helper.c | 310 +++
 include/linux/mailbox/mtk-cmdq-mailbox.h   |  75 +++
 include/linux/soc/mediatek/mtk-cmdq.h  | 174 ++
 10 files changed, 1233 insertions(+)
 create mode 100644 Documentation/devicetree/bindings/mailbox/mtk-gce.txt
 create mode 100644 drivers/mailbox/mtk-cmdq-mailbox.c
 create mode 100644 drivers/soc/mediatek/mtk-cmdq-helper.c
 create mode 100644 include/linux/mailbox/mtk-cmdq-mailbox.h
 create mode 100644 include/linux/soc/mediatek/mtk-cmdq.h

-- 
1.9.1



Re: taint/module: Fix problems when out-of-kernel driver defines true or false

2017-01-03 Thread Jessica Yu

+++ Larry Finger [01/01/17 20:25 -0600]:

Commit 7fd8329ba502 ("taint/module: Clean up global and module taint
flags handling") used the key words true and false as character members
of a new struct. These names cause problems when out-of-kernel modules
such as VirtualBox include their own definitions of true and false.

Fixes: 7fd8329ba502 ("taint/module: Clean up global and module taint flags 
handling")
Signed-off-by: Larry Finger 
Cc: Petr Mladek 
Cc: Jessica Yu 
Cc: Rusty Russell 


Applied, thanks.

Jessica


Re: taint/module: Fix problems when out-of-kernel driver defines true or false

2017-01-03 Thread Jessica Yu

+++ Larry Finger [01/01/17 20:25 -0600]:

Commit 7fd8329ba502 ("taint/module: Clean up global and module taint
flags handling") used the key words true and false as character members
of a new struct. These names cause problems when out-of-kernel modules
such as VirtualBox include their own definitions of true and false.

Fixes: 7fd8329ba502 ("taint/module: Clean up global and module taint flags 
handling")
Signed-off-by: Larry Finger 
Cc: Petr Mladek 
Cc: Jessica Yu 
Cc: Rusty Russell 


Applied, thanks.

Jessica


Re: [LKP] [lkp-developer] [sched/fair] 4e5160766f: +149% ftq.noise.50% regression

2017-01-03 Thread Huang, Ying
Vincent Guittot  writes:

> Hi Dietmar and Ying,
>
> Le Tuesday 03 Jan 2017  11:38:39 (+0100), Dietmar Eggemann a crit :
>> Hi Vincent and Ying,
>> 
>> On 01/02/2017 04:42 PM, Vincent Guittot wrote:
>> >Hi Ying,
>> >
>> >On 28 December 2016 at 09:17, Huang, Ying  wrote:
>> >>Vincent Guittot  writes:
>> >>
>> >>>Le Tuesday 13 Dec 2016 . 09:47:30 (+0800), Huang, Ying a .crit :
>> Hi, Vincent,
>> 
>> Vincent Guittot  writes:
>> 
>> [...]
>>
>
> [snip]
>
>> >>
>> >>The test result is as follow,
>> >>
>> >>=
>> >>compiler/cpufreq_governor/freq/kconfig/nr_task/rootfs/samples/tbox_group/test/testcase:
>> >>  
>> >> gcc-6/powersave/20/x86_64-rhel-7.2/100%/debian-x86_64-2016-08-31.cgz/6000ss/lkp-hsw-d01/cache/ftq
>> >>
>> >>commit:
>> >>  4e5160766fcc9f41bbd38bac11f92dce993644aa: first bad commit
>> >>  09a43ace1f986b003c118fdf6ddf1fd685692d49: parent of first bad commit
>> >>  0613870ea53a7a279d8d37f2a3ce40aafc155fc8: debug commit with above patch
>> >>
>> >>4e5160766fcc9f41 09a43ace1f986b003c118fdf6d 0613870ea53a7a279d8d37f2a3
>> >> -- --
>> >> %stddev %change %stddev %change %stddev
>> >> \  |\  |\
>> >> 61670 228% -96.5%   2148  11% -94.7%   3281  58%  
>> >> ftq.noise.25%
>> >>  3463  10% -60.0%   1386  19% -26.3%   2552  58%  
>> >> ftq.noise.50%
>> >>  1116  23% -72.6% 305.99  30% -35.8% 716.15  64%  
>> >> ftq.noise.75%
>> >>   3843815   3%  +3.1%3963589   1% -49.6%1938221 100%  
>> >> ftq.time.involuntary_context_switches
>> >>  5.33  30% +21.4%   6.46  14% -71.7%   1.50 108%  
>> >> time.system_time
>> >>
>> >>
>> >>It appears that the system_time and involuntary_context_switches reduced
>> >>much after applied the debug patch, which is good from noise point of
>> >>view.  ftq.noise.50% reduced compared with the first bad commit, but
>> >>have not restored to that of the parent of the first bad commit.
>> >
>> >Thanks for testing. I will try to improve it a bit but not sure that I
>> >can reduce more.
>> 
>> Is this a desktop system where this regression comes from autogroups (1
>> level taskgroups) or a server system with systemd (2 level taskgroups)?
>> 
>> Since the PELT rewrite (v4.2) I have ~60 autogroups per cpu
>> (>leaf_cfs_rq_list) on my Ubuntu desktop system permanently (Intel
>> i7-4750HQ) whereas in v4.1 there were 0 - 10.
>> 
>> $ for i in `seq 0 7`; do cat /proc/sched_debug | grep
>> "cfs_rq\[$i\]:/autogroup-" | wc -l; done
>> 58
>> 61
>> 63
>> 65
>> 60
>> 59
>> 62
>> 56
>> 
>> Couldn't we still remove these autogroups by if (!cfs_rq->nr_running &&
>> !se->avg.load_avg && !se->avg.util_avg) in update_blocked_averages()?
>> 
>> Vincent, like we discussed in September last year, the proper fix would
>> probably be a cfs-rq->nr_attached which IMHO is not doable w/o being an
>> atomic because of migrate_task_rq_fair()->remove_entity_load_avg() not
>> holding the rq lock.
>
> I remember the discussion and even if I agree that a large number of taskgroup
> increases the number of loop in update_blocked_averages() and as a result the
> time spent in the update, I don't think that this is the root cause of
> this regression because the patch "sched/fair: Propagate asynchrous detach"
> doesn't add more loops to update_blocked_averages but it adds more thing to do
> per loop.
>
> Then, I think I'm still too conservative in the condition for calling 
> update_load_avg(cfs_rq->tg->se[cpu], 0). This call has been added to
> propagate gcfs_rq->propagate_avg flag to parent so we don't need to call it 
> even if load_avg is not null but only when propagate_avg flag is set. The
> patch below should improve thing compare to the previous version because
> it will call update_load_avg(cfs_rq->tg->se[cpu], 0) only if an asynchrounous
> detach happened (propagate_avg is set).
>
> Ying, could you test the patch below instead of the previous one ?
>
> ---
>  kernel/sched/fair.c | 8 +---
>  1 file changed, 5 insertions(+), 3 deletions(-)
>
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 6559d19..a4f5c35 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -6915,6 +6915,7 @@ static void update_blocked_averages(int cpu)
>  {
>   struct rq *rq = cpu_rq(cpu);
>   struct cfs_rq *cfs_rq;
> + struct sched_entity *se;
>   unsigned long flags;
>  
>   raw_spin_lock_irqsave(>lock, flags);
> @@ -6932,9 +6933,10 @@ static void update_blocked_averages(int cpu)
>   if (update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq, 
> true))
>   update_tg_load_avg(cfs_rq, 0);
>  
> - /* 

[PATCH v20 4/4] soc: mediatek: Add Mediatek CMDQ helper

2017-01-03 Thread HS Liao
Add Mediatek CMDQ helper to create CMDQ packet and assemble GCE op code.

Signed-off-by: HS Liao 
---
 drivers/soc/mediatek/Kconfig   |  12 ++
 drivers/soc/mediatek/Makefile  |   1 +
 drivers/soc/mediatek/mtk-cmdq-helper.c | 310 +
 include/linux/soc/mediatek/mtk-cmdq.h  | 174 ++
 4 files changed, 497 insertions(+)
 create mode 100644 drivers/soc/mediatek/mtk-cmdq-helper.c
 create mode 100644 include/linux/soc/mediatek/mtk-cmdq.h

diff --git a/drivers/soc/mediatek/Kconfig b/drivers/soc/mediatek/Kconfig
index 609bb34..2f145d8 100644
--- a/drivers/soc/mediatek/Kconfig
+++ b/drivers/soc/mediatek/Kconfig
@@ -1,6 +1,18 @@
 #
 # MediaTek SoC drivers
 #
+config MTK_CMDQ
+   bool "MediaTek CMDQ Support"
+   depends on ARM64 && ( ARCH_MEDIATEK || COMPILE_TEST )
+   select MAILBOX
+   select MTK_CMDQ_MBOX
+   select MTK_INFRACFG
+   help
+ Say yes here to add support for the MediaTek Command Queue (CMDQ)
+ driver. The CMDQ is used to help read/write registers with critical
+ time limitation, such as updating display configuration during the
+ vblank.
+
 config MTK_INFRACFG
bool "MediaTek INFRACFG Support"
depends on ARCH_MEDIATEK || COMPILE_TEST
diff --git a/drivers/soc/mediatek/Makefile b/drivers/soc/mediatek/Makefile
index 12998b0..64ce5ee 100644
--- a/drivers/soc/mediatek/Makefile
+++ b/drivers/soc/mediatek/Makefile
@@ -1,3 +1,4 @@
+obj-$(CONFIG_MTK_CMDQ) += mtk-cmdq-helper.o
 obj-$(CONFIG_MTK_INFRACFG) += mtk-infracfg.o
 obj-$(CONFIG_MTK_PMIC_WRAP) += mtk-pmic-wrap.o
 obj-$(CONFIG_MTK_SCPSYS) += mtk-scpsys.o
diff --git a/drivers/soc/mediatek/mtk-cmdq-helper.c 
b/drivers/soc/mediatek/mtk-cmdq-helper.c
new file mode 100644
index 000..7809e65
--- /dev/null
+++ b/drivers/soc/mediatek/mtk-cmdq-helper.c
@@ -0,0 +1,310 @@
+/*
+ * Copyright (c) 2015 MediaTek Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+
+#include 
+#include 
+#include 
+#include 
+
+#define CMDQ_SUBSYS_SHIFT  16
+#define CMDQ_ARG_A_WRITE_MASK  0x
+#define CMDQ_WRITE_ENABLE_MASK BIT(0)
+#define CMDQ_EOC_IRQ_ENBIT(0)
+#define CMDQ_EOC_CMD   ((u64)((CMDQ_CODE_EOC << CMDQ_OP_CODE_SHIFT)) \
+   << 32 | CMDQ_EOC_IRQ_EN)
+
+struct cmdq_subsys {
+   u32 base;
+   int id;
+};
+
+static const struct cmdq_subsys gce_subsys[] = {
+   {0x1400, 1},
+   {0x1401, 2},
+   {0x1402, 3},
+};
+
+static int cmdq_subsys_base_to_id(u32 base)
+{
+   int i;
+
+   for (i = 0; i < ARRAY_SIZE(gce_subsys); i++)
+   if (gce_subsys[i].base == base)
+   return gce_subsys[i].id;
+   return -EFAULT;
+}
+
+static int cmdq_pkt_realloc_cmd_buffer(struct cmdq_pkt *pkt, size_t size)
+{
+   void *new_buf;
+
+   new_buf = krealloc(pkt->va_base, size, GFP_KERNEL | __GFP_ZERO);
+   if (!new_buf)
+   return -ENOMEM;
+   pkt->va_base = new_buf;
+   pkt->buf_size = size;
+   return 0;
+}
+
+struct cmdq_base *cmdq_register_device(struct device *dev)
+{
+   struct cmdq_base *cmdq_base;
+   struct resource res;
+   int subsys;
+   u32 base;
+
+   if (of_address_to_resource(dev->of_node, 0, ))
+   return NULL;
+   base = (u32)res.start;
+
+   subsys = cmdq_subsys_base_to_id(base >> 16);
+   if (subsys < 0)
+   return NULL;
+
+   cmdq_base = devm_kmalloc(dev, sizeof(*cmdq_base), GFP_KERNEL);
+   if (!cmdq_base)
+   return NULL;
+   cmdq_base->subsys = subsys;
+   cmdq_base->base = base;
+
+   return cmdq_base;
+}
+EXPORT_SYMBOL(cmdq_register_device);
+
+struct cmdq_client *cmdq_mbox_create(struct device *dev, int index)
+{
+   struct cmdq_client *client;
+
+   client = kzalloc(sizeof(*client), GFP_KERNEL);
+   client->client.dev = dev;
+   client->client.tx_block = false;
+   client->chan = mbox_request_channel(>client, index);
+   return client;
+}
+EXPORT_SYMBOL(cmdq_mbox_create);
+
+void cmdq_mbox_destroy(struct cmdq_client *client)
+{
+   mbox_free_channel(client->chan);
+   kfree(client);
+}
+EXPORT_SYMBOL(cmdq_mbox_destroy);
+
+int cmdq_pkt_create(struct cmdq_pkt **pkt_ptr)
+{
+   struct cmdq_pkt *pkt;
+   int err;
+
+   pkt = kzalloc(sizeof(*pkt), GFP_KERNEL);
+   if (!pkt)
+   return -ENOMEM;
+   err = cmdq_pkt_realloc_cmd_buffer(pkt, PAGE_SIZE);
+   if (err < 0) {
+   kfree(pkt);
+   

[PATCH v20 2/4] mailbox: mediatek: Add Mediatek CMDQ driver

2017-01-03 Thread HS Liao
This patch is first version of Mediatek Command Queue(CMDQ) driver. The
CMDQ is used to help write registers with critical time limitation,
such as updating display configuration during the vblank. It controls
Global Command Engine (GCE) hardware to achieve this requirement.
Currently, CMDQ only supports display related hardwares, but we expect
it can be extended to other hardwares for future requirements.

Signed-off-by: HS Liao 
Signed-off-by: CK Hu 
---
 drivers/mailbox/Kconfig  |  10 +
 drivers/mailbox/Makefile |   2 +
 drivers/mailbox/mtk-cmdq-mailbox.c   | 596 +++
 include/linux/mailbox/mtk-cmdq-mailbox.h |  75 
 4 files changed, 683 insertions(+)
 create mode 100644 drivers/mailbox/mtk-cmdq-mailbox.c
 create mode 100644 include/linux/mailbox/mtk-cmdq-mailbox.h

diff --git a/drivers/mailbox/Kconfig b/drivers/mailbox/Kconfig
index ceff415..9108dd4 100644
--- a/drivers/mailbox/Kconfig
+++ b/drivers/mailbox/Kconfig
@@ -152,4 +152,14 @@ config BCM_PDC_MBOX
  Mailbox implementation for the Broadcom PDC ring manager,
  which provides access to various offload engines on Broadcom
  SoCs. Say Y here if you want to use the Broadcom PDC.
+
+config MTK_CMDQ_MBOX
+   bool "MediaTek CMDQ Mailbox Support"
+   depends on ARM64 && ( ARCH_MEDIATEK || COMPILE_TEST )
+   select MTK_INFRACFG
+   help
+ Say yes here to add support for the MediaTek Command Queue (CMDQ)
+ mailbox driver. The CMDQ is used to help read/write registers with
+ critical time limitation, such as updating display configuration
+ during the vblank.
 endif
diff --git a/drivers/mailbox/Makefile b/drivers/mailbox/Makefile
index 7dde4f6..fad8965 100644
--- a/drivers/mailbox/Makefile
+++ b/drivers/mailbox/Makefile
@@ -31,3 +31,5 @@ obj-$(CONFIG_HI6220_MBOX) += hi6220-mailbox.o
 obj-$(CONFIG_BCM_PDC_MBOX) += bcm-pdc-mailbox.o
 
 obj-$(CONFIG_TEGRA_HSP_MBOX)   += tegra-hsp.o
+
+obj-$(CONFIG_MTK_CMDQ_MBOX)+= mtk-cmdq-mailbox.o
diff --git a/drivers/mailbox/mtk-cmdq-mailbox.c 
b/drivers/mailbox/mtk-cmdq-mailbox.c
new file mode 100644
index 000..747bcd3
--- /dev/null
+++ b/drivers/mailbox/mtk-cmdq-mailbox.c
@@ -0,0 +1,596 @@
+/*
+ * Copyright (c) 2015 MediaTek Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#define CMDQ_THR_MAX_COUNT 3 /* main, sub, general(misc) */
+#define CMDQ_OP_CODE_MASK  (0xff << CMDQ_OP_CODE_SHIFT)
+#define CMDQ_TIMEOUT_MS1000
+#define CMDQ_IRQ_MASK  0x
+#define CMDQ_NUM_CMD(t)(t->cmd_buf_size / 
CMDQ_INST_SIZE)
+
+#define CMDQ_CURR_IRQ_STATUS   0x10
+#define CMDQ_THR_SLOT_CYCLES   0x30
+
+#define CMDQ_THR_BASE  0x100
+#define CMDQ_THR_SIZE  0x80
+#define CMDQ_THR_WARM_RESET0x00
+#define CMDQ_THR_ENABLE_TASK   0x04
+#define CMDQ_THR_SUSPEND_TASK  0x08
+#define CMDQ_THR_CURR_STATUS   0x0c
+#define CMDQ_THR_IRQ_STATUS0x10
+#define CMDQ_THR_IRQ_ENABLE0x14
+#define CMDQ_THR_CURR_ADDR 0x20
+#define CMDQ_THR_END_ADDR  0x24
+#define CMDQ_THR_WAIT_TOKEN0x30
+
+#define CMDQ_THR_ENABLED   0x1
+#define CMDQ_THR_DISABLED  0x0
+#define CMDQ_THR_SUSPEND   0x1
+#define CMDQ_THR_RESUME0x0
+#define CMDQ_THR_STATUS_SUSPENDED  BIT(1)
+#define CMDQ_THR_DO_WARM_RESET BIT(0)
+#define CMDQ_THR_ACTIVE_SLOT_CYCLES0x3200
+#define CMDQ_THR_IRQ_DONE  0x1
+#define CMDQ_THR_IRQ_ERROR 0x12
+#define CMDQ_THR_IRQ_EN(CMDQ_THR_IRQ_ERROR | 
CMDQ_THR_IRQ_DONE)
+#define CMDQ_THR_IS_WAITINGBIT(31)
+
+#define CMDQ_JUMP_BY_OFFSET0x1000
+#define CMDQ_JUMP_BY_PA0x1001
+
+struct cmdq_thread {
+   struct mbox_chan*chan;
+   void __iomem*base;
+   struct list_headtask_busy_list;
+   struct timer_list   timeout;
+   boolatomic_exec;
+};
+
+struct cmdq_task {
+   struct cmdq *cmdq;
+   struct list_headlist_entry;
+   dma_addr_t  pa_base;
+   struct cmdq_thread  *thread;
+   struct cmdq_pkt *pkt; /* the packet sent from 

Re: [LKP] [lkp-developer] [sched/fair] 4e5160766f: +149% ftq.noise.50% regression

2017-01-03 Thread Huang, Ying
Vincent Guittot  writes:

> Hi Dietmar and Ying,
>
> Le Tuesday 03 Jan 2017  11:38:39 (+0100), Dietmar Eggemann a crit :
>> Hi Vincent and Ying,
>> 
>> On 01/02/2017 04:42 PM, Vincent Guittot wrote:
>> >Hi Ying,
>> >
>> >On 28 December 2016 at 09:17, Huang, Ying  wrote:
>> >>Vincent Guittot  writes:
>> >>
>> >>>Le Tuesday 13 Dec 2016 . 09:47:30 (+0800), Huang, Ying a .crit :
>> Hi, Vincent,
>> 
>> Vincent Guittot  writes:
>> 
>> [...]
>>
>
> [snip]
>
>> >>
>> >>The test result is as follow,
>> >>
>> >>=
>> >>compiler/cpufreq_governor/freq/kconfig/nr_task/rootfs/samples/tbox_group/test/testcase:
>> >>  
>> >> gcc-6/powersave/20/x86_64-rhel-7.2/100%/debian-x86_64-2016-08-31.cgz/6000ss/lkp-hsw-d01/cache/ftq
>> >>
>> >>commit:
>> >>  4e5160766fcc9f41bbd38bac11f92dce993644aa: first bad commit
>> >>  09a43ace1f986b003c118fdf6ddf1fd685692d49: parent of first bad commit
>> >>  0613870ea53a7a279d8d37f2a3ce40aafc155fc8: debug commit with above patch
>> >>
>> >>4e5160766fcc9f41 09a43ace1f986b003c118fdf6d 0613870ea53a7a279d8d37f2a3
>> >> -- --
>> >> %stddev %change %stddev %change %stddev
>> >> \  |\  |\
>> >> 61670 228% -96.5%   2148  11% -94.7%   3281  58%  
>> >> ftq.noise.25%
>> >>  3463  10% -60.0%   1386  19% -26.3%   2552  58%  
>> >> ftq.noise.50%
>> >>  1116  23% -72.6% 305.99  30% -35.8% 716.15  64%  
>> >> ftq.noise.75%
>> >>   3843815   3%  +3.1%3963589   1% -49.6%1938221 100%  
>> >> ftq.time.involuntary_context_switches
>> >>  5.33  30% +21.4%   6.46  14% -71.7%   1.50 108%  
>> >> time.system_time
>> >>
>> >>
>> >>It appears that the system_time and involuntary_context_switches reduced
>> >>much after applied the debug patch, which is good from noise point of
>> >>view.  ftq.noise.50% reduced compared with the first bad commit, but
>> >>have not restored to that of the parent of the first bad commit.
>> >
>> >Thanks for testing. I will try to improve it a bit but not sure that I
>> >can reduce more.
>> 
>> Is this a desktop system where this regression comes from autogroups (1
>> level taskgroups) or a server system with systemd (2 level taskgroups)?
>> 
>> Since the PELT rewrite (v4.2) I have ~60 autogroups per cpu
>> (>leaf_cfs_rq_list) on my Ubuntu desktop system permanently (Intel
>> i7-4750HQ) whereas in v4.1 there were 0 - 10.
>> 
>> $ for i in `seq 0 7`; do cat /proc/sched_debug | grep
>> "cfs_rq\[$i\]:/autogroup-" | wc -l; done
>> 58
>> 61
>> 63
>> 65
>> 60
>> 59
>> 62
>> 56
>> 
>> Couldn't we still remove these autogroups by if (!cfs_rq->nr_running &&
>> !se->avg.load_avg && !se->avg.util_avg) in update_blocked_averages()?
>> 
>> Vincent, like we discussed in September last year, the proper fix would
>> probably be a cfs-rq->nr_attached which IMHO is not doable w/o being an
>> atomic because of migrate_task_rq_fair()->remove_entity_load_avg() not
>> holding the rq lock.
>
> I remember the discussion and even if I agree that a large number of taskgroup
> increases the number of loop in update_blocked_averages() and as a result the
> time spent in the update, I don't think that this is the root cause of
> this regression because the patch "sched/fair: Propagate asynchrous detach"
> doesn't add more loops to update_blocked_averages but it adds more thing to do
> per loop.
>
> Then, I think I'm still too conservative in the condition for calling 
> update_load_avg(cfs_rq->tg->se[cpu], 0). This call has been added to
> propagate gcfs_rq->propagate_avg flag to parent so we don't need to call it 
> even if load_avg is not null but only when propagate_avg flag is set. The
> patch below should improve thing compare to the previous version because
> it will call update_load_avg(cfs_rq->tg->se[cpu], 0) only if an asynchrounous
> detach happened (propagate_avg is set).
>
> Ying, could you test the patch below instead of the previous one ?
>
> ---
>  kernel/sched/fair.c | 8 +---
>  1 file changed, 5 insertions(+), 3 deletions(-)
>
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 6559d19..a4f5c35 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -6915,6 +6915,7 @@ static void update_blocked_averages(int cpu)
>  {
>   struct rq *rq = cpu_rq(cpu);
>   struct cfs_rq *cfs_rq;
> + struct sched_entity *se;
>   unsigned long flags;
>  
>   raw_spin_lock_irqsave(>lock, flags);
> @@ -6932,9 +6933,10 @@ static void update_blocked_averages(int cpu)
>   if (update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq, 
> true))
>   update_tg_load_avg(cfs_rq, 0);
>  
> - /* Propagate pending load changes to the parent */
> - if (cfs_rq->tg->se[cpu])
> -   

[PATCH v20 4/4] soc: mediatek: Add Mediatek CMDQ helper

2017-01-03 Thread HS Liao
Add Mediatek CMDQ helper to create CMDQ packet and assemble GCE op code.

Signed-off-by: HS Liao 
---
 drivers/soc/mediatek/Kconfig   |  12 ++
 drivers/soc/mediatek/Makefile  |   1 +
 drivers/soc/mediatek/mtk-cmdq-helper.c | 310 +
 include/linux/soc/mediatek/mtk-cmdq.h  | 174 ++
 4 files changed, 497 insertions(+)
 create mode 100644 drivers/soc/mediatek/mtk-cmdq-helper.c
 create mode 100644 include/linux/soc/mediatek/mtk-cmdq.h

diff --git a/drivers/soc/mediatek/Kconfig b/drivers/soc/mediatek/Kconfig
index 609bb34..2f145d8 100644
--- a/drivers/soc/mediatek/Kconfig
+++ b/drivers/soc/mediatek/Kconfig
@@ -1,6 +1,18 @@
 #
 # MediaTek SoC drivers
 #
+config MTK_CMDQ
+   bool "MediaTek CMDQ Support"
+   depends on ARM64 && ( ARCH_MEDIATEK || COMPILE_TEST )
+   select MAILBOX
+   select MTK_CMDQ_MBOX
+   select MTK_INFRACFG
+   help
+ Say yes here to add support for the MediaTek Command Queue (CMDQ)
+ driver. The CMDQ is used to help read/write registers with critical
+ time limitation, such as updating display configuration during the
+ vblank.
+
 config MTK_INFRACFG
bool "MediaTek INFRACFG Support"
depends on ARCH_MEDIATEK || COMPILE_TEST
diff --git a/drivers/soc/mediatek/Makefile b/drivers/soc/mediatek/Makefile
index 12998b0..64ce5ee 100644
--- a/drivers/soc/mediatek/Makefile
+++ b/drivers/soc/mediatek/Makefile
@@ -1,3 +1,4 @@
+obj-$(CONFIG_MTK_CMDQ) += mtk-cmdq-helper.o
 obj-$(CONFIG_MTK_INFRACFG) += mtk-infracfg.o
 obj-$(CONFIG_MTK_PMIC_WRAP) += mtk-pmic-wrap.o
 obj-$(CONFIG_MTK_SCPSYS) += mtk-scpsys.o
diff --git a/drivers/soc/mediatek/mtk-cmdq-helper.c 
b/drivers/soc/mediatek/mtk-cmdq-helper.c
new file mode 100644
index 000..7809e65
--- /dev/null
+++ b/drivers/soc/mediatek/mtk-cmdq-helper.c
@@ -0,0 +1,310 @@
+/*
+ * Copyright (c) 2015 MediaTek Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+
+#include 
+#include 
+#include 
+#include 
+
+#define CMDQ_SUBSYS_SHIFT  16
+#define CMDQ_ARG_A_WRITE_MASK  0x
+#define CMDQ_WRITE_ENABLE_MASK BIT(0)
+#define CMDQ_EOC_IRQ_ENBIT(0)
+#define CMDQ_EOC_CMD   ((u64)((CMDQ_CODE_EOC << CMDQ_OP_CODE_SHIFT)) \
+   << 32 | CMDQ_EOC_IRQ_EN)
+
+struct cmdq_subsys {
+   u32 base;
+   int id;
+};
+
+static const struct cmdq_subsys gce_subsys[] = {
+   {0x1400, 1},
+   {0x1401, 2},
+   {0x1402, 3},
+};
+
+static int cmdq_subsys_base_to_id(u32 base)
+{
+   int i;
+
+   for (i = 0; i < ARRAY_SIZE(gce_subsys); i++)
+   if (gce_subsys[i].base == base)
+   return gce_subsys[i].id;
+   return -EFAULT;
+}
+
+static int cmdq_pkt_realloc_cmd_buffer(struct cmdq_pkt *pkt, size_t size)
+{
+   void *new_buf;
+
+   new_buf = krealloc(pkt->va_base, size, GFP_KERNEL | __GFP_ZERO);
+   if (!new_buf)
+   return -ENOMEM;
+   pkt->va_base = new_buf;
+   pkt->buf_size = size;
+   return 0;
+}
+
+struct cmdq_base *cmdq_register_device(struct device *dev)
+{
+   struct cmdq_base *cmdq_base;
+   struct resource res;
+   int subsys;
+   u32 base;
+
+   if (of_address_to_resource(dev->of_node, 0, ))
+   return NULL;
+   base = (u32)res.start;
+
+   subsys = cmdq_subsys_base_to_id(base >> 16);
+   if (subsys < 0)
+   return NULL;
+
+   cmdq_base = devm_kmalloc(dev, sizeof(*cmdq_base), GFP_KERNEL);
+   if (!cmdq_base)
+   return NULL;
+   cmdq_base->subsys = subsys;
+   cmdq_base->base = base;
+
+   return cmdq_base;
+}
+EXPORT_SYMBOL(cmdq_register_device);
+
+struct cmdq_client *cmdq_mbox_create(struct device *dev, int index)
+{
+   struct cmdq_client *client;
+
+   client = kzalloc(sizeof(*client), GFP_KERNEL);
+   client->client.dev = dev;
+   client->client.tx_block = false;
+   client->chan = mbox_request_channel(>client, index);
+   return client;
+}
+EXPORT_SYMBOL(cmdq_mbox_create);
+
+void cmdq_mbox_destroy(struct cmdq_client *client)
+{
+   mbox_free_channel(client->chan);
+   kfree(client);
+}
+EXPORT_SYMBOL(cmdq_mbox_destroy);
+
+int cmdq_pkt_create(struct cmdq_pkt **pkt_ptr)
+{
+   struct cmdq_pkt *pkt;
+   int err;
+
+   pkt = kzalloc(sizeof(*pkt), GFP_KERNEL);
+   if (!pkt)
+   return -ENOMEM;
+   err = cmdq_pkt_realloc_cmd_buffer(pkt, PAGE_SIZE);
+   if (err < 0) {
+   kfree(pkt);
+   return err;
+   }
+ 

[PATCH v20 2/4] mailbox: mediatek: Add Mediatek CMDQ driver

2017-01-03 Thread HS Liao
This patch is first version of Mediatek Command Queue(CMDQ) driver. The
CMDQ is used to help write registers with critical time limitation,
such as updating display configuration during the vblank. It controls
Global Command Engine (GCE) hardware to achieve this requirement.
Currently, CMDQ only supports display related hardwares, but we expect
it can be extended to other hardwares for future requirements.

Signed-off-by: HS Liao 
Signed-off-by: CK Hu 
---
 drivers/mailbox/Kconfig  |  10 +
 drivers/mailbox/Makefile |   2 +
 drivers/mailbox/mtk-cmdq-mailbox.c   | 596 +++
 include/linux/mailbox/mtk-cmdq-mailbox.h |  75 
 4 files changed, 683 insertions(+)
 create mode 100644 drivers/mailbox/mtk-cmdq-mailbox.c
 create mode 100644 include/linux/mailbox/mtk-cmdq-mailbox.h

diff --git a/drivers/mailbox/Kconfig b/drivers/mailbox/Kconfig
index ceff415..9108dd4 100644
--- a/drivers/mailbox/Kconfig
+++ b/drivers/mailbox/Kconfig
@@ -152,4 +152,14 @@ config BCM_PDC_MBOX
  Mailbox implementation for the Broadcom PDC ring manager,
  which provides access to various offload engines on Broadcom
  SoCs. Say Y here if you want to use the Broadcom PDC.
+
+config MTK_CMDQ_MBOX
+   bool "MediaTek CMDQ Mailbox Support"
+   depends on ARM64 && ( ARCH_MEDIATEK || COMPILE_TEST )
+   select MTK_INFRACFG
+   help
+ Say yes here to add support for the MediaTek Command Queue (CMDQ)
+ mailbox driver. The CMDQ is used to help read/write registers with
+ critical time limitation, such as updating display configuration
+ during the vblank.
 endif
diff --git a/drivers/mailbox/Makefile b/drivers/mailbox/Makefile
index 7dde4f6..fad8965 100644
--- a/drivers/mailbox/Makefile
+++ b/drivers/mailbox/Makefile
@@ -31,3 +31,5 @@ obj-$(CONFIG_HI6220_MBOX) += hi6220-mailbox.o
 obj-$(CONFIG_BCM_PDC_MBOX) += bcm-pdc-mailbox.o
 
 obj-$(CONFIG_TEGRA_HSP_MBOX)   += tegra-hsp.o
+
+obj-$(CONFIG_MTK_CMDQ_MBOX)+= mtk-cmdq-mailbox.o
diff --git a/drivers/mailbox/mtk-cmdq-mailbox.c 
b/drivers/mailbox/mtk-cmdq-mailbox.c
new file mode 100644
index 000..747bcd3
--- /dev/null
+++ b/drivers/mailbox/mtk-cmdq-mailbox.c
@@ -0,0 +1,596 @@
+/*
+ * Copyright (c) 2015 MediaTek Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#define CMDQ_THR_MAX_COUNT 3 /* main, sub, general(misc) */
+#define CMDQ_OP_CODE_MASK  (0xff << CMDQ_OP_CODE_SHIFT)
+#define CMDQ_TIMEOUT_MS1000
+#define CMDQ_IRQ_MASK  0x
+#define CMDQ_NUM_CMD(t)(t->cmd_buf_size / 
CMDQ_INST_SIZE)
+
+#define CMDQ_CURR_IRQ_STATUS   0x10
+#define CMDQ_THR_SLOT_CYCLES   0x30
+
+#define CMDQ_THR_BASE  0x100
+#define CMDQ_THR_SIZE  0x80
+#define CMDQ_THR_WARM_RESET0x00
+#define CMDQ_THR_ENABLE_TASK   0x04
+#define CMDQ_THR_SUSPEND_TASK  0x08
+#define CMDQ_THR_CURR_STATUS   0x0c
+#define CMDQ_THR_IRQ_STATUS0x10
+#define CMDQ_THR_IRQ_ENABLE0x14
+#define CMDQ_THR_CURR_ADDR 0x20
+#define CMDQ_THR_END_ADDR  0x24
+#define CMDQ_THR_WAIT_TOKEN0x30
+
+#define CMDQ_THR_ENABLED   0x1
+#define CMDQ_THR_DISABLED  0x0
+#define CMDQ_THR_SUSPEND   0x1
+#define CMDQ_THR_RESUME0x0
+#define CMDQ_THR_STATUS_SUSPENDED  BIT(1)
+#define CMDQ_THR_DO_WARM_RESET BIT(0)
+#define CMDQ_THR_ACTIVE_SLOT_CYCLES0x3200
+#define CMDQ_THR_IRQ_DONE  0x1
+#define CMDQ_THR_IRQ_ERROR 0x12
+#define CMDQ_THR_IRQ_EN(CMDQ_THR_IRQ_ERROR | 
CMDQ_THR_IRQ_DONE)
+#define CMDQ_THR_IS_WAITINGBIT(31)
+
+#define CMDQ_JUMP_BY_OFFSET0x1000
+#define CMDQ_JUMP_BY_PA0x1001
+
+struct cmdq_thread {
+   struct mbox_chan*chan;
+   void __iomem*base;
+   struct list_headtask_busy_list;
+   struct timer_list   timeout;
+   boolatomic_exec;
+};
+
+struct cmdq_task {
+   struct cmdq *cmdq;
+   struct list_headlist_entry;
+   dma_addr_t  pa_base;
+   struct cmdq_thread  *thread;
+   struct cmdq_pkt *pkt; /* the packet sent from mailbox client */
+};
+
+struct cmdq {
+   

[PATCH v20 1/4] dt-bindings: soc: Add documentation for the MediaTek GCE unit

2017-01-03 Thread HS Liao
This adds documentation for the MediaTek Global Command Engine (GCE) unit
found in MT8173 SoCs.

Signed-off-by: HS Liao 
Acked-by: Rob Herring 
---
 .../devicetree/bindings/mailbox/mtk-gce.txt| 43 ++
 1 file changed, 43 insertions(+)
 create mode 100644 Documentation/devicetree/bindings/mailbox/mtk-gce.txt

diff --git a/Documentation/devicetree/bindings/mailbox/mtk-gce.txt 
b/Documentation/devicetree/bindings/mailbox/mtk-gce.txt
new file mode 100644
index 000..d2d3ccb
--- /dev/null
+++ b/Documentation/devicetree/bindings/mailbox/mtk-gce.txt
@@ -0,0 +1,43 @@
+MediaTek GCE
+===
+
+The Global Command Engine (GCE) is used to help read/write registers with
+critical time limitation, such as updating display configuration during the
+vblank. The GCE can be used to implement the Command Queue (CMDQ) driver.
+
+CMDQ driver uses mailbox framework for communication. Please refer to
+mailbox.txt for generic information about mailbox device-tree bindings.
+
+Required properties:
+- compatible: Must be "mediatek,mt8173-gce"
+- reg: Address range of the GCE unit
+- interrupts: The interrupt signal from the GCE block
+- clock: Clocks according to the common clock binding
+- clock-names: Must be "gce" to stand for GCE clock
+- #mbox-cells: Should be 2
+
+Required properties for a client device:
+- mboxes: client use mailbox to communicate with GCE, it should have this
+  property and list of phandle, mailbox channel specifiers, and atomic
+  execution flag.
+
+Example:
+
+   gce: gce@10212000 {
+   compatible = "mediatek,mt8173-gce";
+   reg = <0 0x10212000 0 0x1000>;
+   interrupts = ;
+   clocks = < CLK_INFRA_GCE>;
+   clock-names = "gce";
+
+   #mbox-cells = <2>;
+   };
+
+Example for a client device:
+
+   mmsys: clock-controller@1400 {
+   compatible = "mediatek,mt8173-mmsys";
+   mboxes = < 0 1 /* main display with atomic execution */
+  1 1>; /* sub display with atomic execution */
+   ...
+   };
-- 
1.9.1



[PATCH v20 3/4] arm64: dts: mt8173: Add GCE node

2017-01-03 Thread HS Liao
This patch adds the device node of the GCE hardware for CMDQ module.

Signed-off-by: HS Liao 
---
 arch/arm64/boot/dts/mediatek/mt8173.dtsi | 10 ++
 1 file changed, 10 insertions(+)

diff --git a/arch/arm64/boot/dts/mediatek/mt8173.dtsi 
b/arch/arm64/boot/dts/mediatek/mt8173.dtsi
index 12e7027..9f93447 100644
--- a/arch/arm64/boot/dts/mediatek/mt8173.dtsi
+++ b/arch/arm64/boot/dts/mediatek/mt8173.dtsi
@@ -422,6 +422,16 @@
status = "disabled";
};
 
+   gce: gce@10212000 {
+   compatible = "mediatek,mt8173-gce";
+   reg = <0 0x10212000 0 0x1000>;
+   interrupts = ;
+   clocks = < CLK_INFRA_GCE>;
+   clock-names = "gce";
+
+   #mbox-cells = <2>;
+   };
+
mipi_tx0: mipi-dphy@10215000 {
compatible = "mediatek,mt8173-mipi-tx";
reg = <0 0x10215000 0 0x1000>;
-- 
1.9.1



[PATCH v20 1/4] dt-bindings: soc: Add documentation for the MediaTek GCE unit

2017-01-03 Thread HS Liao
This adds documentation for the MediaTek Global Command Engine (GCE) unit
found in MT8173 SoCs.

Signed-off-by: HS Liao 
Acked-by: Rob Herring 
---
 .../devicetree/bindings/mailbox/mtk-gce.txt| 43 ++
 1 file changed, 43 insertions(+)
 create mode 100644 Documentation/devicetree/bindings/mailbox/mtk-gce.txt

diff --git a/Documentation/devicetree/bindings/mailbox/mtk-gce.txt 
b/Documentation/devicetree/bindings/mailbox/mtk-gce.txt
new file mode 100644
index 000..d2d3ccb
--- /dev/null
+++ b/Documentation/devicetree/bindings/mailbox/mtk-gce.txt
@@ -0,0 +1,43 @@
+MediaTek GCE
+===
+
+The Global Command Engine (GCE) is used to help read/write registers with
+critical time limitation, such as updating display configuration during the
+vblank. The GCE can be used to implement the Command Queue (CMDQ) driver.
+
+CMDQ driver uses mailbox framework for communication. Please refer to
+mailbox.txt for generic information about mailbox device-tree bindings.
+
+Required properties:
+- compatible: Must be "mediatek,mt8173-gce"
+- reg: Address range of the GCE unit
+- interrupts: The interrupt signal from the GCE block
+- clock: Clocks according to the common clock binding
+- clock-names: Must be "gce" to stand for GCE clock
+- #mbox-cells: Should be 2
+
+Required properties for a client device:
+- mboxes: client use mailbox to communicate with GCE, it should have this
+  property and list of phandle, mailbox channel specifiers, and atomic
+  execution flag.
+
+Example:
+
+   gce: gce@10212000 {
+   compatible = "mediatek,mt8173-gce";
+   reg = <0 0x10212000 0 0x1000>;
+   interrupts = ;
+   clocks = < CLK_INFRA_GCE>;
+   clock-names = "gce";
+
+   #mbox-cells = <2>;
+   };
+
+Example for a client device:
+
+   mmsys: clock-controller@1400 {
+   compatible = "mediatek,mt8173-mmsys";
+   mboxes = < 0 1 /* main display with atomic execution */
+  1 1>; /* sub display with atomic execution */
+   ...
+   };
-- 
1.9.1



[PATCH v20 3/4] arm64: dts: mt8173: Add GCE node

2017-01-03 Thread HS Liao
This patch adds the device node of the GCE hardware for CMDQ module.

Signed-off-by: HS Liao 
---
 arch/arm64/boot/dts/mediatek/mt8173.dtsi | 10 ++
 1 file changed, 10 insertions(+)

diff --git a/arch/arm64/boot/dts/mediatek/mt8173.dtsi 
b/arch/arm64/boot/dts/mediatek/mt8173.dtsi
index 12e7027..9f93447 100644
--- a/arch/arm64/boot/dts/mediatek/mt8173.dtsi
+++ b/arch/arm64/boot/dts/mediatek/mt8173.dtsi
@@ -422,6 +422,16 @@
status = "disabled";
};
 
+   gce: gce@10212000 {
+   compatible = "mediatek,mt8173-gce";
+   reg = <0 0x10212000 0 0x1000>;
+   interrupts = ;
+   clocks = < CLK_INFRA_GCE>;
+   clock-names = "gce";
+
+   #mbox-cells = <2>;
+   };
+
mipi_tx0: mipi-dphy@10215000 {
compatible = "mediatek,mt8173-mipi-tx";
reg = <0 0x10215000 0 0x1000>;
-- 
1.9.1



Re: [PATCH net-next V2 3/3] tun: rx batching

2017-01-03 Thread Jason Wang



On 2017年01月03日 21:33, Stefan Hajnoczi wrote:

On Wed, Dec 28, 2016 at 04:09:31PM +0800, Jason Wang wrote:

+static int tun_rx_batched(struct tun_file *tfile, struct sk_buff *skb,
+ int more)
+{
+   struct sk_buff_head *queue = >sk.sk_write_queue;
+   struct sk_buff_head process_queue;
+   int qlen;
+   bool rcv = false;
+
+   spin_lock(>lock);

Should this be spin_lock_bh()?  Below and in tun_get_user() there are
explicit local_bh_disable() calls so I guess BHs can interrupt us here
and this would deadlock.


sk_write_queue were accessed only in this function which runs under 
process context, so no need for spin_lock_bh() here.


Re: [PATCH net-next V2 3/3] tun: rx batching

2017-01-03 Thread Jason Wang



On 2017年01月03日 21:33, Stefan Hajnoczi wrote:

On Wed, Dec 28, 2016 at 04:09:31PM +0800, Jason Wang wrote:

+static int tun_rx_batched(struct tun_file *tfile, struct sk_buff *skb,
+ int more)
+{
+   struct sk_buff_head *queue = >sk.sk_write_queue;
+   struct sk_buff_head process_queue;
+   int qlen;
+   bool rcv = false;
+
+   spin_lock(>lock);

Should this be spin_lock_bh()?  Below and in tun_get_user() there are
explicit local_bh_disable() calls so I guess BHs can interrupt us here
and this would deadlock.


sk_write_queue were accessed only in this function which runs under 
process context, so no need for spin_lock_bh() here.


[PATCH v2] KVM: x86: fix NULL deref in vcpu_scan_ioapic

2017-01-03 Thread Wanpeng Li
From: Wanpeng Li 

Reported by syzkaller:

BUG: unable to handle kernel NULL pointer dereference at 01b0
IP: _raw_spin_lock+0xc/0x30
PGD 3e28eb067
PUD 3f0ac6067
PMD 0
Oops: 0002 [#1] SMP
CPU: 0 PID: 2431 Comm: test Tainted: G   OE   4.10.0-rc1+ #3
Call Trace:
 ? kvm_ioapic_scan_entry+0x3e/0x110 [kvm]
 kvm_arch_vcpu_ioctl_run+0x10a8/0x15f0 [kvm]
 ? pick_next_task_fair+0xe1/0x4e0
 ? kvm_arch_vcpu_load+0xea/0x260 [kvm]
 kvm_vcpu_ioctl+0x33a/0x600 [kvm]
 ? hrtimer_try_to_cancel+0x29/0x130
 ? do_nanosleep+0x97/0xf0
 do_vfs_ioctl+0xa1/0x5d0
 ? __hrtimer_init+0x90/0x90
 ? do_nanosleep+0x5b/0xf0
 SyS_ioctl+0x79/0x90
 do_syscall_64+0x6e/0x180
 entry_SYSCALL64_slow_path+0x25/0x25
RIP: _raw_spin_lock+0xc/0x30 RSP: a43688973cc0

The syzkaller folks reported a NULL pointer dereference that due to 
ENABLE_CAP doesn't fail without an irqchip, and synic is activated 
which results in a rescan ioapic request has been set although it 
should never been set for that vCPU.

#include  
#include  
#include  
#include  
#include  
#include  
#include  
#include  
#include  
#include  

#ifndef KVM_CAP_HYPERV_SYNIC 
#define KVM_CAP_HYPERV_SYNIC 123 
#endif 

void* thr(void* arg) 
{ 
struct kvm_enable_cap cap; 
cap.flags = 0; 
cap.cap = KVM_CAP_HYPERV_SYNIC; 
ioctl((long)arg, KVM_ENABLE_CAP, ); 
return 0; 
} 

int main() 
{ 
void *host_mem = mmap(0, 0x1000, PROT_READ|PROT_WRITE, 
MAP_PRIVATE|MAP_ANONYMOUS, -1, 0); 
int kvmfd = open("/dev/kvm", 0); 
int vmfd = ioctl(kvmfd, KVM_CREATE_VM, 0); 
struct kvm_userspace_memory_region memreg; 
memreg.slot = 0; 
memreg.flags = 0; 
memreg.guest_phys_addr = 0; 
memreg.memory_size = 0x1000; 
memreg.userspace_addr = (unsigned long)host_mem; 
memcpy(host_mem, 
"\x1a\xb5\x00\x04\x65\x26\x64\x26\xf0\x82\xaa\x00" 
"\x02\x00\x3e\x75\x64\xf3\xf3\xf0\x18\x35\x66\xb9" 
"\x99\x00\x00\x40\x46\xb8\xc0\x40\x00\x00\x66\xba" 
"\x00\x00\x00\x00\x0f\x30\x0f\xfc\xda\xdb\xca\x36" 
"\x26\x3e\x67\x3e\x67\xcf\x66\xb9\x4a\x0a\x00\x00" 
"\x0f\x32\x2e\x26\x65\x0f\x0f\x01\xa0", 
69); 
ioctl(vmfd, KVM_SET_USER_MEMORY_REGION, ); 
int cpufd = ioctl(vmfd, KVM_CREATE_VCPU, 0); 
struct kvm_sregs sregs; 
ioctl(cpufd, KVM_GET_SREGS, ); 
sregs.cr0 = 0; 
sregs.cr4 = 0; 
sregs.efer = 0; 
sregs.cs.selector = 0; 
sregs.cs.base = 0; 
ioctl(cpufd, KVM_SET_SREGS, ); 
struct kvm_regs regs; 
memset(, 0, sizeof(regs)); 
regs.rflags = 2; 
ioctl(cpufd, KVM_SET_REGS, ); 
ioctl(vmfd, KVM_CREATE_IRQCHIP, 0); 
pthread_t th; 
pthread_create(, 0, thr, (void*)(long)cpufd); 
usleep(rand() % 1); 
ioctl(cpufd, KVM_RUN, 0); 
pthread_join(th, 0); 
return 0; 
} 

This patch fix it by failing ENABLE_CAP if without an irqchip.

Reported-by: Dmitry Vyukov 
Cc: Paolo Bonzini 
Cc: Radim Krčmář 
Cc: Dmitry Vyukov 
Signed-off-by: Wanpeng Li 
---
v1 -> v2:
 * fail ENABLE_CAP if without an irqchip

 arch/x86/kvm/x86.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 5f6e288..2ada44a 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3345,6 +3345,8 @@ static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu 
*vcpu,
 
switch (cap->cap) {
case KVM_CAP_HYPERV_SYNIC:
+   if (!irqchip_in_kernel(vcpu->kvm))
+   return -EINVAL;
return kvm_hv_activate_synic(vcpu);
default:
return -EINVAL;
-- 
2.7.4



[PATCH v2] KVM: x86: fix NULL deref in vcpu_scan_ioapic

2017-01-03 Thread Wanpeng Li
From: Wanpeng Li 

Reported by syzkaller:

BUG: unable to handle kernel NULL pointer dereference at 01b0
IP: _raw_spin_lock+0xc/0x30
PGD 3e28eb067
PUD 3f0ac6067
PMD 0
Oops: 0002 [#1] SMP
CPU: 0 PID: 2431 Comm: test Tainted: G   OE   4.10.0-rc1+ #3
Call Trace:
 ? kvm_ioapic_scan_entry+0x3e/0x110 [kvm]
 kvm_arch_vcpu_ioctl_run+0x10a8/0x15f0 [kvm]
 ? pick_next_task_fair+0xe1/0x4e0
 ? kvm_arch_vcpu_load+0xea/0x260 [kvm]
 kvm_vcpu_ioctl+0x33a/0x600 [kvm]
 ? hrtimer_try_to_cancel+0x29/0x130
 ? do_nanosleep+0x97/0xf0
 do_vfs_ioctl+0xa1/0x5d0
 ? __hrtimer_init+0x90/0x90
 ? do_nanosleep+0x5b/0xf0
 SyS_ioctl+0x79/0x90
 do_syscall_64+0x6e/0x180
 entry_SYSCALL64_slow_path+0x25/0x25
RIP: _raw_spin_lock+0xc/0x30 RSP: a43688973cc0

The syzkaller folks reported a NULL pointer dereference that due to 
ENABLE_CAP doesn't fail without an irqchip, and synic is activated 
which results in a rescan ioapic request has been set although it 
should never been set for that vCPU.

#include  
#include  
#include  
#include  
#include  
#include  
#include  
#include  
#include  
#include  

#ifndef KVM_CAP_HYPERV_SYNIC 
#define KVM_CAP_HYPERV_SYNIC 123 
#endif 

void* thr(void* arg) 
{ 
struct kvm_enable_cap cap; 
cap.flags = 0; 
cap.cap = KVM_CAP_HYPERV_SYNIC; 
ioctl((long)arg, KVM_ENABLE_CAP, ); 
return 0; 
} 

int main() 
{ 
void *host_mem = mmap(0, 0x1000, PROT_READ|PROT_WRITE, 
MAP_PRIVATE|MAP_ANONYMOUS, -1, 0); 
int kvmfd = open("/dev/kvm", 0); 
int vmfd = ioctl(kvmfd, KVM_CREATE_VM, 0); 
struct kvm_userspace_memory_region memreg; 
memreg.slot = 0; 
memreg.flags = 0; 
memreg.guest_phys_addr = 0; 
memreg.memory_size = 0x1000; 
memreg.userspace_addr = (unsigned long)host_mem; 
memcpy(host_mem, 
"\x1a\xb5\x00\x04\x65\x26\x64\x26\xf0\x82\xaa\x00" 
"\x02\x00\x3e\x75\x64\xf3\xf3\xf0\x18\x35\x66\xb9" 
"\x99\x00\x00\x40\x46\xb8\xc0\x40\x00\x00\x66\xba" 
"\x00\x00\x00\x00\x0f\x30\x0f\xfc\xda\xdb\xca\x36" 
"\x26\x3e\x67\x3e\x67\xcf\x66\xb9\x4a\x0a\x00\x00" 
"\x0f\x32\x2e\x26\x65\x0f\x0f\x01\xa0", 
69); 
ioctl(vmfd, KVM_SET_USER_MEMORY_REGION, ); 
int cpufd = ioctl(vmfd, KVM_CREATE_VCPU, 0); 
struct kvm_sregs sregs; 
ioctl(cpufd, KVM_GET_SREGS, ); 
sregs.cr0 = 0; 
sregs.cr4 = 0; 
sregs.efer = 0; 
sregs.cs.selector = 0; 
sregs.cs.base = 0; 
ioctl(cpufd, KVM_SET_SREGS, ); 
struct kvm_regs regs; 
memset(, 0, sizeof(regs)); 
regs.rflags = 2; 
ioctl(cpufd, KVM_SET_REGS, ); 
ioctl(vmfd, KVM_CREATE_IRQCHIP, 0); 
pthread_t th; 
pthread_create(, 0, thr, (void*)(long)cpufd); 
usleep(rand() % 1); 
ioctl(cpufd, KVM_RUN, 0); 
pthread_join(th, 0); 
return 0; 
} 

This patch fix it by failing ENABLE_CAP if without an irqchip.

Reported-by: Dmitry Vyukov 
Cc: Paolo Bonzini 
Cc: Radim Krčmář 
Cc: Dmitry Vyukov 
Signed-off-by: Wanpeng Li 
---
v1 -> v2:
 * fail ENABLE_CAP if without an irqchip

 arch/x86/kvm/x86.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 5f6e288..2ada44a 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3345,6 +3345,8 @@ static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu 
*vcpu,
 
switch (cap->cap) {
case KVM_CAP_HYPERV_SYNIC:
+   if (!irqchip_in_kernel(vcpu->kvm))
+   return -EINVAL;
return kvm_hv_activate_synic(vcpu);
default:
return -EINVAL;
-- 
2.7.4



Re: kmod: add a sanity check on module loading

2017-01-03 Thread Jessica Yu

+++ Luis R. Rodriguez [08/12/16 11:49 -0800]:

kmod has an optimization in place whereby if a some kernel code
uses request_module() on a module already loaded we never bother
userspace as the module already is loaded. This is not true for
get_fs_type() though as it uses aliases.

Additionally kmod <= v19 was broken -- it returns 0 to modprobe calls,
assuming the kernel module is built-in, where really we have a race as
the module starts forming. kmod <= v19 has incorrect userspace heuristics,
a userspace kmod fix is available for it:

http://git.kernel.org/cgit/utils/kernel/kmod/kmod.git/commit/libkmod/libkmod-module.c?id=fd44a98ae2eb5eb32161088954ab21e58e19dfc4

This changes kmod to address both:

o Provides the alias optimization for get_fs_type() so modules already
  loaded do not get re-requested.

o Provides a sanity test to verify modprobe's work

This is important given how any get_fs_type() users assert success
means we're ready to go, and tests with the new test_kmod stress driver
reveal that request_module() and get_fs_type() might fail for a few
other reasons. You don't need old kmod to fail on request_module() or
get_fs_type(), with the right system setup, these calls *can* fail
today.

Although this does get us in the business of keeping alias maps in
kernel, the the work to support and maintain this is trivial.
Aditionally, since it may be important get_fs_type() should not fail on
certain systems, this tightens things up a bit more.

The TL;DR:

kmod <= v19 will return 0 on modprobe calls if you are built-in,
however its heuristics for checking if you are built-in were broken.

It assumed that having the directory /sys/module/module-name
but not having the file /sys/module/module-name/initstate
is sufficient to assume a module is built-in.

The kernel loads the inittstate attribute *after* it creates the
directory. This is an issue when modprobe returns 0 for kernel calls
which assumes a return of 0 on request_module() can give you the
right to assert the module is loaded and live.

We cannot trust returns of modprobe as 0 in the kernel, we need to
verify that modules are live if modprobe return 0 but only if modules
*are* modules. The kernel heuristic we use to determine if a module is
built-in is that if modprobe returns 0 we know we must be built-in or
a module, but if we are a module clearly we must have a lingering kmod
dangling on our linked list. If there is no modules there we are *somewhat*
certain the module must be built in.

This is not enough though... we cannot easily work around this since the
kernel can use aliases to userspace for modules calls. For instance
fs/namespace.c uses fs-modulename for filesystesms on get_fs_type(), so
these need to be taken into consideration as well.

Using kmod <= 19 will give you a NULL get_fs_type() return even though
the module was loaded... That is a corner case, there are other failures
for request_module() though -- the other failures are not easy to
reproduce though but fortunately we have a stress test driver to help
with that now. Use the following tests:

# tools/testing/selftests/kmod/kmod.sh -t 0008
# tools/testing/selftests/kmod/kmod.sh -t 0009

You can more easily see this error if you have kmod <= v19 installed.

You will need to install kmod <= v19, be sure to install its modprobe
into /sbin/ as by default the 'make install' target does not replace
your own.

This test helps cure test_kmod cases 0008 0009 so enable them.

Reported-by: Martin Wilck 
Reported-by: Randy Wright 
Signed-off-by: Luis R. Rodriguez 


Back from travel today, apologies for the delay. Will be able to give
this a proper look this week.

Jessica




Re: kmod: add a sanity check on module loading

2017-01-03 Thread Jessica Yu

+++ Luis R. Rodriguez [08/12/16 11:49 -0800]:

kmod has an optimization in place whereby if a some kernel code
uses request_module() on a module already loaded we never bother
userspace as the module already is loaded. This is not true for
get_fs_type() though as it uses aliases.

Additionally kmod <= v19 was broken -- it returns 0 to modprobe calls,
assuming the kernel module is built-in, where really we have a race as
the module starts forming. kmod <= v19 has incorrect userspace heuristics,
a userspace kmod fix is available for it:

http://git.kernel.org/cgit/utils/kernel/kmod/kmod.git/commit/libkmod/libkmod-module.c?id=fd44a98ae2eb5eb32161088954ab21e58e19dfc4

This changes kmod to address both:

o Provides the alias optimization for get_fs_type() so modules already
  loaded do not get re-requested.

o Provides a sanity test to verify modprobe's work

This is important given how any get_fs_type() users assert success
means we're ready to go, and tests with the new test_kmod stress driver
reveal that request_module() and get_fs_type() might fail for a few
other reasons. You don't need old kmod to fail on request_module() or
get_fs_type(), with the right system setup, these calls *can* fail
today.

Although this does get us in the business of keeping alias maps in
kernel, the the work to support and maintain this is trivial.
Aditionally, since it may be important get_fs_type() should not fail on
certain systems, this tightens things up a bit more.

The TL;DR:

kmod <= v19 will return 0 on modprobe calls if you are built-in,
however its heuristics for checking if you are built-in were broken.

It assumed that having the directory /sys/module/module-name
but not having the file /sys/module/module-name/initstate
is sufficient to assume a module is built-in.

The kernel loads the inittstate attribute *after* it creates the
directory. This is an issue when modprobe returns 0 for kernel calls
which assumes a return of 0 on request_module() can give you the
right to assert the module is loaded and live.

We cannot trust returns of modprobe as 0 in the kernel, we need to
verify that modules are live if modprobe return 0 but only if modules
*are* modules. The kernel heuristic we use to determine if a module is
built-in is that if modprobe returns 0 we know we must be built-in or
a module, but if we are a module clearly we must have a lingering kmod
dangling on our linked list. If there is no modules there we are *somewhat*
certain the module must be built in.

This is not enough though... we cannot easily work around this since the
kernel can use aliases to userspace for modules calls. For instance
fs/namespace.c uses fs-modulename for filesystesms on get_fs_type(), so
these need to be taken into consideration as well.

Using kmod <= 19 will give you a NULL get_fs_type() return even though
the module was loaded... That is a corner case, there are other failures
for request_module() though -- the other failures are not easy to
reproduce though but fortunately we have a stress test driver to help
with that now. Use the following tests:

# tools/testing/selftests/kmod/kmod.sh -t 0008
# tools/testing/selftests/kmod/kmod.sh -t 0009

You can more easily see this error if you have kmod <= v19 installed.

You will need to install kmod <= v19, be sure to install its modprobe
into /sbin/ as by default the 'make install' target does not replace
your own.

This test helps cure test_kmod cases 0008 0009 so enable them.

Reported-by: Martin Wilck 
Reported-by: Randy Wright 
Signed-off-by: Luis R. Rodriguez 


Back from travel today, apologies for the delay. Will be able to give
this a proper look this week.

Jessica




Re: [PATCH 08/29] efi: Allow drivers to reserve boot services forever

2017-01-03 Thread Dan Williams
On Fri, Sep 9, 2016 at 8:18 AM, Matt Fleming  wrote:
> Today, it is not possible for drivers to reserve EFI boot services for
> access after efi_free_boot_services() has been called on x86. For
> ARM/arm64 it can be done simply by calling memblock_reserve().
>
> Having this ability for all three architectures is desirable for a
> couple of reasons,
>
>   1) It saves drivers copying data out of those regions
>   2) kexec reboot can now make use of things like ESRT
>
> Instead of using the standard memblock_reserve() which is insufficient
> to reserve the region on x86 (see efi_reserve_boot_services()), a new
> API is introduced in this patch; efi_mem_reserve().
>
> efi.memmap now always represents which EFI memory regions are
> available. On x86 the EFI boot services regions that have not been
> reserved via efi_mem_reserve() will be removed from efi.memmap during
> efi_free_boot_services().
>
> This has implications for kexec, since it is not possible for a newly
> kexec'd kernel to access the same boot services regions that the
> initial boot kernel had access to unless they are reserved by every
> kexec kernel in the chain.
>
> Tested-by: Dave Young  [kexec/kdump]
> Tested-by: Ard Biesheuvel  [arm]
> Acked-by: Ard Biesheuvel 
> Cc: Leif Lindholm 
> Cc: Peter Jones 
> Cc: Borislav Petkov 
> Cc: Mark Rutland 
> Signed-off-by: Matt Fleming 

This commit appears to cause a boot regression between v4.8 and v4.9.

BUG: unable to handle kernel paging request at 8830281bf1c8
IP: [] __next_mem_range_rev+0x13a/0x1d6
PGD 3193067 PUD 3196067 PTE 8030281bf060
Oops:  1 SMP DEBUG_PAGEALLOC
Modules linked in:
CPU: 0 PID: 0 Comm: swapper/0 Not tainted 4.9.0+ #2
task: 82011540 task.stack: 8200
RIP: 0010:[] []
__next_mem_range_rev+0x13a/0x1d6
RSP: :82003dd8 EFLAGS: 00010202
RAX: 8830281bf1e0 RBX: 82003e60 RCX: 82167490
RDX:  RSI:  RDI: 00184000
RBP: 82003e18 R08: 821674b0 R09: 008f
R10: 008f R11: 82011cf0 R12: 0004
R13: 00304000 R14:  R15: 0001
FS: () GS:8817e080() knlGS:
CS: 0010 DS:  ES:  CR0: 80050033
CR2: 8830281bf1c8 CR3: 0200a000 CR4: 007406f0
DR0:  DR1:  DR2: 
DR3:  DR6: fffe0ff0 DR7: 0400
PKRU: 
Stack:
ea001700 82003e50 
10304000 82003e58 0180
ffc0
82003e98 81a21395 82003e58 
Call Trace:
[] memblock_find_in_range_node+0x93/0x13a
[] memblock_alloc_range_nid+0x1b/0x3e
[] __memblock_alloc_base+0x15/0x17
[] memblock_alloc_base+0x12/0x2e
[] memblock_alloc+0xb/0xd
[] efi_free_boot_services+0x46/0x180
[] start_kernel+0x4a1/0x4cc
[] ? set_init_arg+0x55/0x55
[] ? early_idt_handler_array+0x120/0x120
[] x86_64_start_reservations+0x2a/0x2c
[] x86_64_start_kernel+0x14c/0x16f
Code: 18 44 89 38 41 8d 44 24 ff 49 c1 e1 20 4c 09 c8 48 89 03 e9 a0
00 00 00 4d 63 d1 4c 89 d0 48 c1 e0 05 49 03 40 18 45 85 c9 74 28 <48>
8b 50 e8 48 03 50 e0 49 83 cb ff 4d 3b 10 73 03 4c 8b 18 49 ^M
RIP [] __next_mem_range_rev+0x13a/0x1d6

I also see that Petr may have run into it as well [1]? Petr is this
the same signature you are seeing? Can you post a boot log with
"efi=debug" on the kernel command line?

It also fails on 4.10-rc2.  However, if I revert the following commits
it boots fine:

4bc9f92e64c8 x86/efi-bgrt: Use efi_mem_reserve() to avoid copying image data
8e80632fb23f efi/esrt: Use efi_mem_reserve() and avoid a kmalloc()
816e76129ed5 efi: Allow drivers to reserve boot services forever

[1]: https://lkml.org/lkml/2016/12/21/197


Re: [PATCH 08/29] efi: Allow drivers to reserve boot services forever

2017-01-03 Thread Dan Williams
On Fri, Sep 9, 2016 at 8:18 AM, Matt Fleming  wrote:
> Today, it is not possible for drivers to reserve EFI boot services for
> access after efi_free_boot_services() has been called on x86. For
> ARM/arm64 it can be done simply by calling memblock_reserve().
>
> Having this ability for all three architectures is desirable for a
> couple of reasons,
>
>   1) It saves drivers copying data out of those regions
>   2) kexec reboot can now make use of things like ESRT
>
> Instead of using the standard memblock_reserve() which is insufficient
> to reserve the region on x86 (see efi_reserve_boot_services()), a new
> API is introduced in this patch; efi_mem_reserve().
>
> efi.memmap now always represents which EFI memory regions are
> available. On x86 the EFI boot services regions that have not been
> reserved via efi_mem_reserve() will be removed from efi.memmap during
> efi_free_boot_services().
>
> This has implications for kexec, since it is not possible for a newly
> kexec'd kernel to access the same boot services regions that the
> initial boot kernel had access to unless they are reserved by every
> kexec kernel in the chain.
>
> Tested-by: Dave Young  [kexec/kdump]
> Tested-by: Ard Biesheuvel  [arm]
> Acked-by: Ard Biesheuvel 
> Cc: Leif Lindholm 
> Cc: Peter Jones 
> Cc: Borislav Petkov 
> Cc: Mark Rutland 
> Signed-off-by: Matt Fleming 

This commit appears to cause a boot regression between v4.8 and v4.9.

BUG: unable to handle kernel paging request at 8830281bf1c8
IP: [] __next_mem_range_rev+0x13a/0x1d6
PGD 3193067 PUD 3196067 PTE 8030281bf060
Oops:  1 SMP DEBUG_PAGEALLOC
Modules linked in:
CPU: 0 PID: 0 Comm: swapper/0 Not tainted 4.9.0+ #2
task: 82011540 task.stack: 8200
RIP: 0010:[] []
__next_mem_range_rev+0x13a/0x1d6
RSP: :82003dd8 EFLAGS: 00010202
RAX: 8830281bf1e0 RBX: 82003e60 RCX: 82167490
RDX:  RSI:  RDI: 00184000
RBP: 82003e18 R08: 821674b0 R09: 008f
R10: 008f R11: 82011cf0 R12: 0004
R13: 00304000 R14:  R15: 0001
FS: () GS:8817e080() knlGS:
CS: 0010 DS:  ES:  CR0: 80050033
CR2: 8830281bf1c8 CR3: 0200a000 CR4: 007406f0
DR0:  DR1:  DR2: 
DR3:  DR6: fffe0ff0 DR7: 0400
PKRU: 
Stack:
ea001700 82003e50 
10304000 82003e58 0180
ffc0
82003e98 81a21395 82003e58 
Call Trace:
[] memblock_find_in_range_node+0x93/0x13a
[] memblock_alloc_range_nid+0x1b/0x3e
[] __memblock_alloc_base+0x15/0x17
[] memblock_alloc_base+0x12/0x2e
[] memblock_alloc+0xb/0xd
[] efi_free_boot_services+0x46/0x180
[] start_kernel+0x4a1/0x4cc
[] ? set_init_arg+0x55/0x55
[] ? early_idt_handler_array+0x120/0x120
[] x86_64_start_reservations+0x2a/0x2c
[] x86_64_start_kernel+0x14c/0x16f
Code: 18 44 89 38 41 8d 44 24 ff 49 c1 e1 20 4c 09 c8 48 89 03 e9 a0
00 00 00 4d 63 d1 4c 89 d0 48 c1 e0 05 49 03 40 18 45 85 c9 74 28 <48>
8b 50 e8 48 03 50 e0 49 83 cb ff 4d 3b 10 73 03 4c 8b 18 49 ^M
RIP [] __next_mem_range_rev+0x13a/0x1d6

I also see that Petr may have run into it as well [1]? Petr is this
the same signature you are seeing? Can you post a boot log with
"efi=debug" on the kernel command line?

It also fails on 4.10-rc2.  However, if I revert the following commits
it boots fine:

4bc9f92e64c8 x86/efi-bgrt: Use efi_mem_reserve() to avoid copying image data
8e80632fb23f efi/esrt: Use efi_mem_reserve() and avoid a kmalloc()
816e76129ed5 efi: Allow drivers to reserve boot services forever

[1]: https://lkml.org/lkml/2016/12/21/197


[PATCH V3] arm64: dts: ls1046a: Add TMU device tree support

2017-01-03 Thread Jia Hongtao
Also add nodes and properties for thermal management support.

Signed-off-by: Jia Hongtao 
---
Changes for V3:
* Update the subject title according to Shawn Guo's comment.
* Fix some style issue.

Changes for V2:
* Update the subject title according to Shawn Guo's comment.
* Add comments for calibration data groups.
* Update "thermal-zones" property in a unified style with platform dts.

 arch/arm64/boot/dts/freescale/fsl-ls1046a.dtsi | 80 ++
 1 file changed, 80 insertions(+)

diff --git a/arch/arm64/boot/dts/freescale/fsl-ls1046a.dtsi 
b/arch/arm64/boot/dts/freescale/fsl-ls1046a.dtsi
index 38806ca..4a164b8 100644
--- a/arch/arm64/boot/dts/freescale/fsl-ls1046a.dtsi
+++ b/arch/arm64/boot/dts/freescale/fsl-ls1046a.dtsi
@@ -45,6 +45,7 @@
  */
 
 #include 
+#include 
 
 / {
compatible = "fsl,ls1046a";
@@ -67,6 +68,7 @@
clocks = < 1 0>;
next-level-cache = <>;
cpu-idle-states = <_PH20>;
+   #cooling-cells = <2>;
};
 
cpu1: cpu@1 {
@@ -279,6 +281,84 @@
clocks = <>;
};
 
+   tmu: tmu@1f0 {
+   compatible = "fsl,qoriq-tmu";
+   reg = <0x0 0x1f0 0x0 0x1>;
+   interrupts = <0 33 0x4>;
+   fsl,tmu-range = <0xb 0x9002a 0x6004c 0x30062>;
+   fsl,tmu-calibration =
+   /* Calibration data group 1 */
+   <0x 0x0026
+   0x0001 0x002d
+   0x0002 0x0032
+   0x0003 0x0039
+   0x0004 0x003f
+   0x0005 0x0046
+   0x0006 0x004d
+   0x0007 0x0054
+   0x0008 0x005a
+   0x0009 0x0061
+   0x000a 0x006a
+   0x000b 0x0071
+   /* Calibration data group 2 */
+   0x0001 0x0025
+   0x00010001 0x002c
+   0x00010002 0x0035
+   0x00010003 0x003d
+   0x00010004 0x0045
+   0x00010005 0x004e
+   0x00010006 0x0057
+   0x00010007 0x0061
+   0x00010008 0x006b
+   0x00010009 0x0076
+   /* Calibration data group 3 */
+   0x0002 0x0029
+   0x00020001 0x0033
+   0x00020002 0x003d
+   0x00020003 0x0049
+   0x00020004 0x0056
+   0x00020005 0x0061
+   0x00020006 0x006d
+   /* Calibration data group 4 */
+   0x0003 0x0021
+   0x00030001 0x002a
+   0x00030002 0x003c
+   0x00030003 0x004e>;
+   big-endian;
+   #thermal-sensor-cells = <1>;
+   };
+
+   thermal-zones {
+   cpu_thermal: cpu-thermal {
+   polling-delay-passive = <1000>;
+   polling-delay = <5000>;
+   thermal-sensors = < 3>;
+
+   trips {
+   cpu_alert: cpu-alert {
+   temperature = <85000>;
+   hysteresis = <2000>;
+   type = "passive";
+   };
+
+   cpu_crit: cpu-crit {
+   temperature = <95000>;
+   hysteresis = <2000>;
+   type = "critical";
+   };
+   };
+
+   cooling-maps {
+   map0 {
+   trip = <_alert>;
+   cooling-device =
+   < THERMAL_NO_LIMIT
+   THERMAL_NO_LIMIT>;

[PATCH V3] arm64: dts: ls1046a: Add TMU device tree support

2017-01-03 Thread Jia Hongtao
Also add nodes and properties for thermal management support.

Signed-off-by: Jia Hongtao 
---
Changes for V3:
* Update the subject title according to Shawn Guo's comment.
* Fix some style issue.

Changes for V2:
* Update the subject title according to Shawn Guo's comment.
* Add comments for calibration data groups.
* Update "thermal-zones" property in a unified style with platform dts.

 arch/arm64/boot/dts/freescale/fsl-ls1046a.dtsi | 80 ++
 1 file changed, 80 insertions(+)

diff --git a/arch/arm64/boot/dts/freescale/fsl-ls1046a.dtsi 
b/arch/arm64/boot/dts/freescale/fsl-ls1046a.dtsi
index 38806ca..4a164b8 100644
--- a/arch/arm64/boot/dts/freescale/fsl-ls1046a.dtsi
+++ b/arch/arm64/boot/dts/freescale/fsl-ls1046a.dtsi
@@ -45,6 +45,7 @@
  */
 
 #include 
+#include 
 
 / {
compatible = "fsl,ls1046a";
@@ -67,6 +68,7 @@
clocks = < 1 0>;
next-level-cache = <>;
cpu-idle-states = <_PH20>;
+   #cooling-cells = <2>;
};
 
cpu1: cpu@1 {
@@ -279,6 +281,84 @@
clocks = <>;
};
 
+   tmu: tmu@1f0 {
+   compatible = "fsl,qoriq-tmu";
+   reg = <0x0 0x1f0 0x0 0x1>;
+   interrupts = <0 33 0x4>;
+   fsl,tmu-range = <0xb 0x9002a 0x6004c 0x30062>;
+   fsl,tmu-calibration =
+   /* Calibration data group 1 */
+   <0x 0x0026
+   0x0001 0x002d
+   0x0002 0x0032
+   0x0003 0x0039
+   0x0004 0x003f
+   0x0005 0x0046
+   0x0006 0x004d
+   0x0007 0x0054
+   0x0008 0x005a
+   0x0009 0x0061
+   0x000a 0x006a
+   0x000b 0x0071
+   /* Calibration data group 2 */
+   0x0001 0x0025
+   0x00010001 0x002c
+   0x00010002 0x0035
+   0x00010003 0x003d
+   0x00010004 0x0045
+   0x00010005 0x004e
+   0x00010006 0x0057
+   0x00010007 0x0061
+   0x00010008 0x006b
+   0x00010009 0x0076
+   /* Calibration data group 3 */
+   0x0002 0x0029
+   0x00020001 0x0033
+   0x00020002 0x003d
+   0x00020003 0x0049
+   0x00020004 0x0056
+   0x00020005 0x0061
+   0x00020006 0x006d
+   /* Calibration data group 4 */
+   0x0003 0x0021
+   0x00030001 0x002a
+   0x00030002 0x003c
+   0x00030003 0x004e>;
+   big-endian;
+   #thermal-sensor-cells = <1>;
+   };
+
+   thermal-zones {
+   cpu_thermal: cpu-thermal {
+   polling-delay-passive = <1000>;
+   polling-delay = <5000>;
+   thermal-sensors = < 3>;
+
+   trips {
+   cpu_alert: cpu-alert {
+   temperature = <85000>;
+   hysteresis = <2000>;
+   type = "passive";
+   };
+
+   cpu_crit: cpu-crit {
+   temperature = <95000>;
+   hysteresis = <2000>;
+   type = "critical";
+   };
+   };
+
+   cooling-maps {
+   map0 {
+   trip = <_alert>;
+   cooling-device =
+   < THERMAL_NO_LIMIT
+   THERMAL_NO_LIMIT>;
+ 

Re: [PATCH 2/4] watchdog: iTCO_wdt: Use device managed resources

2017-01-03 Thread Guenter Roeck

On 01/03/2017 02:41 PM, Andy Shevchenko wrote:

On Tue, Jan 3, 2017 at 4:39 PM, Guenter Roeck  wrote:

Using device managed resources simplifies error handling and cleanup,
and to reduce the likelyhood of errors.

Signed-off-by: Guenter Roeck 


Reviewed-by: Andy Shevchenko 

Does it make sense to convert to dev_err() at some point?



I tried. It gives me

[59886.759774] iTCO_vendor_support: vendor-support=0
[59886.760452] iTCO_wdt: Intel TCO WatchDog Timer Driver v1.11
[59886.760540] iTCO_wdt iTCO_wdt.0.auto: Found a 9 Series TCO device 
(Version=2, TCOBASE=0x1860)
[59886.761849] iTCO_wdt iTCO_wdt.0.auto: initialized. heartbeat=30 sec 
(nowayout=0)

which I don't really like too much. Let's stick with pr_{info,err}.

Thanks,
Guenter



Re: [PATCH 2/2] printk: always report lost messages on serial console

2017-01-03 Thread Sergey Senozhatsky
On (01/03/17 17:53), Petr Mladek wrote:
> On Wed 2017-01-04 00:47:45, Sergey Senozhatsky wrote:
> > On (01/03/17 15:55), Petr Mladek wrote:
> > [..]
> > > This causes the opposite problem. We might print a message that was 
> > > supposed
> > > to be suppressed.
> > 
> > so what? yes, we print a message that otherwise would have been suppressed.
> > not a big deal. at all. we are under high printk load and the best thing
> > we can do is to report "we are losing the messages" straight ahead. the
> > next 'visible' message may be seconds/minutes/forever away. think of a
> > printk() flood of messages with suppressed loglevel coming from CPUA-CPUX,
> > big enough to drain all 'visible' loglevel messages from CPUZ. we are
> > back to problem "a".
> > 
> > thus I want a simple bool flag and a simple rule: we see something - we say 
> > it.
> 
> So, you prefer to print some random debug message instead of an
> emergency one?

yep. because console_unlock() is not the right place to address printk()
abuse, and console_unlock() cannot address it. the only way to fix it is
to reduce the amount of printk() calls (well, there is one more thing
probably and may be but not really *).


> It will always drop a message because you always process only one
> and many new appear in the meantime. While with my solution,
> you should see:
> 
> ** 1324 printk messages dropped ** 
> ** 523 printk messages dropped ** 
> ** 324 printk messages dropped ** 
> ** 345 printk messages dropped ** 

once you started losing the messages because of printk() flood you will
lose them no matter what you do. console_unlock() cannot force printk() CPUs
to add less messages to the logbuf, so we magically can have enough time to
print the message on the consoles. any call to console drivers leads to lost
messages. there is no difference. 

this is from the real serial logs I'm looking at right now. we attach
"bad news" to 'critical' messages only:

...
[   32.941061] bc00: b65dc0d8 b65dc6d0 ae1fbc7c b65c11c5 b563c9c4 0001 
b65dc6d0 b0a62f74
** 150 printk messages dropped ** [   32.941614] cee0: 0081 ae1fcef0 
0038  b5369000 ae1fcf00 ae1fd0f0 
** 75 printk messages dropped ** [   32.941892] d860: 00056608 af203848 
 0004088c 00d0   
** 12 printk messages dropped ** [   32.941940] ..
** 2 printk messages dropped ** [   32.941951] ..
** 10 printk messages dropped ** [   32.941992] ..
** 1 printk messages dropped ** [   32.941999] ..
...

note how many critical messages I lost in consecutive console drivers calls
because console_unlock() was printing other critical messages.
no matter what we filter-out in console_unlock() we are still far-far-far
behind the CPUs that flood the printk buffer. and those CPUs will be happy
to drain any critical/visible message from the logbuf.




* may be can do something like this:

two logbuf buffers. one for `everything' -- both suppressed and visible
loglevels. this one is also used by dmesg. the other one if for messages
that won't be suppressed. we call_console_drivers() on that buffer.
so vprintk_emit() becomes

vprintk_emit()
{
spin_lock logbuf_lock

text = sprintf(...)

log_store(logbuf)
if (!suppress_message_printing(level))
log_store(printing_lofbuf)

spin_ulock logbuf_lock
}

and console_unlock() reads printing_logbuf only. but we still can lose
messages even from filtered printing_logbuf.

-ss


Re: [PATCH 2/4] watchdog: iTCO_wdt: Use device managed resources

2017-01-03 Thread Guenter Roeck

On 01/03/2017 02:41 PM, Andy Shevchenko wrote:

On Tue, Jan 3, 2017 at 4:39 PM, Guenter Roeck  wrote:

Using device managed resources simplifies error handling and cleanup,
and to reduce the likelyhood of errors.

Signed-off-by: Guenter Roeck 


Reviewed-by: Andy Shevchenko 

Does it make sense to convert to dev_err() at some point?



I tried. It gives me

[59886.759774] iTCO_vendor_support: vendor-support=0
[59886.760452] iTCO_wdt: Intel TCO WatchDog Timer Driver v1.11
[59886.760540] iTCO_wdt iTCO_wdt.0.auto: Found a 9 Series TCO device 
(Version=2, TCOBASE=0x1860)
[59886.761849] iTCO_wdt iTCO_wdt.0.auto: initialized. heartbeat=30 sec 
(nowayout=0)

which I don't really like too much. Let's stick with pr_{info,err}.

Thanks,
Guenter



Re: [PATCH 2/2] printk: always report lost messages on serial console

2017-01-03 Thread Sergey Senozhatsky
On (01/03/17 17:53), Petr Mladek wrote:
> On Wed 2017-01-04 00:47:45, Sergey Senozhatsky wrote:
> > On (01/03/17 15:55), Petr Mladek wrote:
> > [..]
> > > This causes the opposite problem. We might print a message that was 
> > > supposed
> > > to be suppressed.
> > 
> > so what? yes, we print a message that otherwise would have been suppressed.
> > not a big deal. at all. we are under high printk load and the best thing
> > we can do is to report "we are losing the messages" straight ahead. the
> > next 'visible' message may be seconds/minutes/forever away. think of a
> > printk() flood of messages with suppressed loglevel coming from CPUA-CPUX,
> > big enough to drain all 'visible' loglevel messages from CPUZ. we are
> > back to problem "a".
> > 
> > thus I want a simple bool flag and a simple rule: we see something - we say 
> > it.
> 
> So, you prefer to print some random debug message instead of an
> emergency one?

yep. because console_unlock() is not the right place to address printk()
abuse, and console_unlock() cannot address it. the only way to fix it is
to reduce the amount of printk() calls (well, there is one more thing
probably and may be but not really *).


> It will always drop a message because you always process only one
> and many new appear in the meantime. While with my solution,
> you should see:
> 
> ** 1324 printk messages dropped ** 
> ** 523 printk messages dropped ** 
> ** 324 printk messages dropped ** 
> ** 345 printk messages dropped ** 

once you started losing the messages because of printk() flood you will
lose them no matter what you do. console_unlock() cannot force printk() CPUs
to add less messages to the logbuf, so we magically can have enough time to
print the message on the consoles. any call to console drivers leads to lost
messages. there is no difference. 

this is from the real serial logs I'm looking at right now. we attach
"bad news" to 'critical' messages only:

...
[   32.941061] bc00: b65dc0d8 b65dc6d0 ae1fbc7c b65c11c5 b563c9c4 0001 
b65dc6d0 b0a62f74
** 150 printk messages dropped ** [   32.941614] cee0: 0081 ae1fcef0 
0038  b5369000 ae1fcf00 ae1fd0f0 
** 75 printk messages dropped ** [   32.941892] d860: 00056608 af203848 
 0004088c 00d0   
** 12 printk messages dropped ** [   32.941940] ..
** 2 printk messages dropped ** [   32.941951] ..
** 10 printk messages dropped ** [   32.941992] ..
** 1 printk messages dropped ** [   32.941999] ..
...

note how many critical messages I lost in consecutive console drivers calls
because console_unlock() was printing other critical messages.
no matter what we filter-out in console_unlock() we are still far-far-far
behind the CPUs that flood the printk buffer. and those CPUs will be happy
to drain any critical/visible message from the logbuf.




* may be can do something like this:

two logbuf buffers. one for `everything' -- both suppressed and visible
loglevels. this one is also used by dmesg. the other one if for messages
that won't be suppressed. we call_console_drivers() on that buffer.
so vprintk_emit() becomes

vprintk_emit()
{
spin_lock logbuf_lock

text = sprintf(...)

log_store(logbuf)
if (!suppress_message_printing(level))
log_store(printing_lofbuf)

spin_ulock logbuf_lock
}

and console_unlock() reads printing_logbuf only. but we still can lose
messages even from filtered printing_logbuf.

-ss


Re: [PATCH 3/5] arm64: dts: sun50i: add MMC nodes

2017-01-03 Thread Chen-Yu Tsai
On Wed, Jan 4, 2017 at 8:22 AM, André Przywara  wrote:
> On 03/01/17 13:28, Chen-Yu Tsai wrote:
>> On Tue, Jan 3, 2017 at 6:48 PM, André Przywara  
>> wrote:
>>> On 03/01/17 02:52, Chen-Yu Tsai wrote:
>
> Hi Chen-Yu,
>
 On Tue, Jan 3, 2017 at 7:03 AM, Andre Przywara  
 wrote:

 A commit message explaining the mmc controllers would be nice.
>>>
>>> OK.
>>>
> Signed-off-by: Andre Przywara 
> ---
>  arch/arm64/boot/dts/allwinner/sun50i-a64.dtsi | 67 
> +++
>  1 file changed, 67 insertions(+)
>
> diff --git a/arch/arm64/boot/dts/allwinner/sun50i-a64.dtsi 
> b/arch/arm64/boot/dts/allwinner/sun50i-a64.dtsi
> index e0dcab8..c680566 100644
> --- a/arch/arm64/boot/dts/allwinner/sun50i-a64.dtsi
> +++ b/arch/arm64/boot/dts/allwinner/sun50i-a64.dtsi
> @@ -150,6 +150,32 @@
> pins = "PB8", "PB9";
> function = "uart0";
> };
> +
> +   mmc0_pins: mmc0@0 {
> +   pins = "PF0", "PF1", "PF2", "PF3", "PF4", 
> "PF5";
> +   function = "mmc0";
> +   drive-strength = <30>;
> +   };
> +
> +   mmc0_default_cd_pin: mmc0_cd_pin@0 {
> +   pins = "PF6";
> +   function = "gpio_in";
> +   bias-pull-up;
> +   };

 We are starting to drop pinmux nodes for gpio usage.
>>>
>>> And replacing them with what?
>>> Or do you mean they go in the individual board .dts files?
>>> In this case I believe having a default pin defined here would help to
>>> define it in every .dts.
>>
>> Nope. I meant dropping them. Pinmux and gpio are orthogonal. One should not
>> need to specify a gpio pinmux to use it as a gpio. We added them because in
>> the past nothing was preventing someone from claiming an already muxed pin
>> as a gpio. On some platforms this is fine. For sunxi, this breaks the system,
>> as the gpio functions are muxed in.
>>
>> The idea moving forward is that these cases should be guarded in the driver.
>
> Ah, OK, you mean just referencing the pin like:
> cd-gpios = < 5 6 GPIO_ACTIVE_HIGH>; /* PF6 */
> in the board DT is enough? And the driver claiming it will (eventually?)
> prevent users from using them via the sysfs interface then?

cd-gpios = 

should be enough to block other devices from claiming a pinmux (pinctrl-N = ...)
on the same pin, or vice versa, whichever claimed it first in kernel space.

Same goes for a pin already claimed by a device through a pinmux. It should
not be claimable as a gpio through the sysfs interface.

>> Of course we would have to deal with existing dtbs, but lets not add any 
>> more.
>>
> +
> +   mmc1_pins: mmc1@0 {
> +   pins = "PG0", "PG1", "PG2", "PG3", "PG4", 
> "PG5";
> +   function = "mmc1";
> +   drive-strength = <30>;
> +   };
> +
> +   mmc2_pins: mmc2@0 {
> +   pins = "PC1", "PC5", "PC6", "PC8", "PC9",
> +  "PC10", "PC11", "PC12", "PC13", 
> "PC14",
> +  "PC15", "PC16";
> +   function = "mmc2";
> +   drive-strength = <30>;
> +   };

 Moreover I think you should split out the pinmux nodes to a separate patch.
>>>
>>> I can surely do, just wondering what's the rationale is behind that?
>>
>> More or less the "do one thing in one patch" rationale. Of course you can
>> claim these are the defaults used in the reference design and pretty much
>> every board out there. Then it makes sense to do them together. :)
>
> Mmmh, probably a misunderstanding, but:
> Those pins are the possible pins that expose the MMC interface. Those
> nodes here name them to allow easy and concise reference by a board DT
> (as in the following patch).
> And in fact in case of the A64 there are _no_ alternative muxes for the
> three MMC controllers: SDC0 is only on PF0-5, SDC1 on PG0-PG5 and SDC2
> on PC1-PC16.
> So it's not a board design question, apart from using a controller or not.
>
> That's why I rather keep them together with the MMC controller nodes.

Cool. Please mention this in the commit log, as in they are the only
possible choice.S ince they are the only choices, please drop the @0
in the node name.

And you might want to rename mmc2_pins to something like
"mmc2_8bit_pins: mmc2_8bit". Who knows, someone might consider using
it for an SD 

Re: [PATCH 3/5] arm64: dts: sun50i: add MMC nodes

2017-01-03 Thread Chen-Yu Tsai
On Wed, Jan 4, 2017 at 8:22 AM, André Przywara  wrote:
> On 03/01/17 13:28, Chen-Yu Tsai wrote:
>> On Tue, Jan 3, 2017 at 6:48 PM, André Przywara  
>> wrote:
>>> On 03/01/17 02:52, Chen-Yu Tsai wrote:
>
> Hi Chen-Yu,
>
 On Tue, Jan 3, 2017 at 7:03 AM, Andre Przywara  
 wrote:

 A commit message explaining the mmc controllers would be nice.
>>>
>>> OK.
>>>
> Signed-off-by: Andre Przywara 
> ---
>  arch/arm64/boot/dts/allwinner/sun50i-a64.dtsi | 67 
> +++
>  1 file changed, 67 insertions(+)
>
> diff --git a/arch/arm64/boot/dts/allwinner/sun50i-a64.dtsi 
> b/arch/arm64/boot/dts/allwinner/sun50i-a64.dtsi
> index e0dcab8..c680566 100644
> --- a/arch/arm64/boot/dts/allwinner/sun50i-a64.dtsi
> +++ b/arch/arm64/boot/dts/allwinner/sun50i-a64.dtsi
> @@ -150,6 +150,32 @@
> pins = "PB8", "PB9";
> function = "uart0";
> };
> +
> +   mmc0_pins: mmc0@0 {
> +   pins = "PF0", "PF1", "PF2", "PF3", "PF4", 
> "PF5";
> +   function = "mmc0";
> +   drive-strength = <30>;
> +   };
> +
> +   mmc0_default_cd_pin: mmc0_cd_pin@0 {
> +   pins = "PF6";
> +   function = "gpio_in";
> +   bias-pull-up;
> +   };

 We are starting to drop pinmux nodes for gpio usage.
>>>
>>> And replacing them with what?
>>> Or do you mean they go in the individual board .dts files?
>>> In this case I believe having a default pin defined here would help to
>>> define it in every .dts.
>>
>> Nope. I meant dropping them. Pinmux and gpio are orthogonal. One should not
>> need to specify a gpio pinmux to use it as a gpio. We added them because in
>> the past nothing was preventing someone from claiming an already muxed pin
>> as a gpio. On some platforms this is fine. For sunxi, this breaks the system,
>> as the gpio functions are muxed in.
>>
>> The idea moving forward is that these cases should be guarded in the driver.
>
> Ah, OK, you mean just referencing the pin like:
> cd-gpios = < 5 6 GPIO_ACTIVE_HIGH>; /* PF6 */
> in the board DT is enough? And the driver claiming it will (eventually?)
> prevent users from using them via the sysfs interface then?

cd-gpios = 

should be enough to block other devices from claiming a pinmux (pinctrl-N = ...)
on the same pin, or vice versa, whichever claimed it first in kernel space.

Same goes for a pin already claimed by a device through a pinmux. It should
not be claimable as a gpio through the sysfs interface.

>> Of course we would have to deal with existing dtbs, but lets not add any 
>> more.
>>
> +
> +   mmc1_pins: mmc1@0 {
> +   pins = "PG0", "PG1", "PG2", "PG3", "PG4", 
> "PG5";
> +   function = "mmc1";
> +   drive-strength = <30>;
> +   };
> +
> +   mmc2_pins: mmc2@0 {
> +   pins = "PC1", "PC5", "PC6", "PC8", "PC9",
> +  "PC10", "PC11", "PC12", "PC13", 
> "PC14",
> +  "PC15", "PC16";
> +   function = "mmc2";
> +   drive-strength = <30>;
> +   };

 Moreover I think you should split out the pinmux nodes to a separate patch.
>>>
>>> I can surely do, just wondering what's the rationale is behind that?
>>
>> More or less the "do one thing in one patch" rationale. Of course you can
>> claim these are the defaults used in the reference design and pretty much
>> every board out there. Then it makes sense to do them together. :)
>
> Mmmh, probably a misunderstanding, but:
> Those pins are the possible pins that expose the MMC interface. Those
> nodes here name them to allow easy and concise reference by a board DT
> (as in the following patch).
> And in fact in case of the A64 there are _no_ alternative muxes for the
> three MMC controllers: SDC0 is only on PF0-5, SDC1 on PG0-PG5 and SDC2
> on PC1-PC16.
> So it's not a board design question, apart from using a controller or not.
>
> That's why I rather keep them together with the MMC controller nodes.

Cool. Please mention this in the commit log, as in they are the only
possible choice.S ince they are the only choices, please drop the @0
in the node name.

And you might want to rename mmc2_pins to something like
"mmc2_8bit_pins: mmc2_8bit". Who knows, someone might consider using
it for an SD card or SDIO, and the remaining pins for something else.


> };
>

[next PATCH v4 2/3] mm: Rename __page_frag functions to __page_frag_cache, drop order from drain

2017-01-03 Thread Alexander Duyck
From: Alexander Duyck 

This patch does two things.

First it goes through and renames the __page_frag prefixed functions to
__page_frag_cache so that we can be clear that we are draining or refilling
the cache, not the frags themselves.

Second we drop the order parameter from __page_frag_cache_drain since we
don't actually need to pass it since all fragments are either order 0 or
must be a compound page.

Signed-off-by: Alexander Duyck 
---

v2: No change
v3: No change (in theory)
v4: Pulled out page_frag_alloc rename bits which had leaked through due to me
trying out reordering some patches in my own queue.

 drivers/net/ethernet/intel/igb/igb_main.c |6 +++---
 include/linux/gfp.h   |3 +--
 mm/page_alloc.c   |   13 +++--
 3 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/drivers/net/ethernet/intel/igb/igb_main.c 
b/drivers/net/ethernet/intel/igb/igb_main.c
index 594604e09f8d..cb08900c9cf2 100644
--- a/drivers/net/ethernet/intel/igb/igb_main.c
+++ b/drivers/net/ethernet/intel/igb/igb_main.c
@@ -3964,8 +3964,8 @@ static void igb_clean_rx_ring(struct igb_ring *rx_ring)
 PAGE_SIZE,
 DMA_FROM_DEVICE,
 DMA_ATTR_SKIP_CPU_SYNC);
-   __page_frag_drain(buffer_info->page, 0,
- buffer_info->pagecnt_bias);
+   __page_frag_cache_drain(buffer_info->page,
+   buffer_info->pagecnt_bias);
 
buffer_info->page = NULL;
}
@@ -6993,7 +6993,7 @@ static struct sk_buff *igb_fetch_rx_buffer(struct 
igb_ring *rx_ring,
dma_unmap_page_attrs(rx_ring->dev, rx_buffer->dma,
 PAGE_SIZE, DMA_FROM_DEVICE,
 DMA_ATTR_SKIP_CPU_SYNC);
-   __page_frag_drain(page, 0, rx_buffer->pagecnt_bias);
+   __page_frag_cache_drain(page, rx_buffer->pagecnt_bias);
}
 
/* clear contents of rx_buffer */
diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index 6238c74e0a01..884080404e24 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -506,8 +506,7 @@ extern struct page *alloc_pages_vma(gfp_t gfp_mask, int 
order,
 extern void free_hot_cold_page_list(struct list_head *list, bool cold);
 
 struct page_frag_cache;
-extern void __page_frag_drain(struct page *page, unsigned int order,
- unsigned int count);
+extern void __page_frag_cache_drain(struct page *page, unsigned int count);
 extern void *page_frag_alloc(struct page_frag_cache *nc,
 unsigned int fragsz, gfp_t gfp_mask);
 extern void page_frag_free(void *addr);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 9534e44308b2..4b0541cd3699 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -3904,8 +3904,8 @@ void free_pages(unsigned long addr, unsigned int order)
  * drivers to provide a backing region of memory for use as either an
  * sk_buff->head, or to be used in the "frags" portion of skb_shared_info.
  */
-static struct page *__page_frag_refill(struct page_frag_cache *nc,
-  gfp_t gfp_mask)
+static struct page *__page_frag_cache_refill(struct page_frag_cache *nc,
+gfp_t gfp_mask)
 {
struct page *page = NULL;
gfp_t gfp = gfp_mask;
@@ -3925,19 +3925,20 @@ static struct page *__page_frag_refill(struct 
page_frag_cache *nc,
return page;
 }
 
-void __page_frag_drain(struct page *page, unsigned int order,
-  unsigned int count)
+void __page_frag_cache_drain(struct page *page, unsigned int count)
 {
VM_BUG_ON_PAGE(page_ref_count(page) == 0, page);
 
if (page_ref_sub_and_test(page, count)) {
+   unsigned int order = compound_order(page);
+
if (order == 0)
free_hot_cold_page(page, false);
else
__free_pages_ok(page, order);
}
 }
-EXPORT_SYMBOL(__page_frag_drain);
+EXPORT_SYMBOL(__page_frag_cache_drain);
 
 void *page_frag_alloc(struct page_frag_cache *nc,
  unsigned int fragsz, gfp_t gfp_mask)
@@ -3948,7 +3949,7 @@ void *page_frag_alloc(struct page_frag_cache *nc,
 
if (unlikely(!nc->va)) {
 refill:
-   page = __page_frag_refill(nc, gfp_mask);
+   page = __page_frag_cache_refill(nc, gfp_mask);
if (!page)
return NULL;
 



[next PATCH v4 2/3] mm: Rename __page_frag functions to __page_frag_cache, drop order from drain

2017-01-03 Thread Alexander Duyck
From: Alexander Duyck 

This patch does two things.

First it goes through and renames the __page_frag prefixed functions to
__page_frag_cache so that we can be clear that we are draining or refilling
the cache, not the frags themselves.

Second we drop the order parameter from __page_frag_cache_drain since we
don't actually need to pass it since all fragments are either order 0 or
must be a compound page.

Signed-off-by: Alexander Duyck 
---

v2: No change
v3: No change (in theory)
v4: Pulled out page_frag_alloc rename bits which had leaked through due to me
trying out reordering some patches in my own queue.

 drivers/net/ethernet/intel/igb/igb_main.c |6 +++---
 include/linux/gfp.h   |3 +--
 mm/page_alloc.c   |   13 +++--
 3 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/drivers/net/ethernet/intel/igb/igb_main.c 
b/drivers/net/ethernet/intel/igb/igb_main.c
index 594604e09f8d..cb08900c9cf2 100644
--- a/drivers/net/ethernet/intel/igb/igb_main.c
+++ b/drivers/net/ethernet/intel/igb/igb_main.c
@@ -3964,8 +3964,8 @@ static void igb_clean_rx_ring(struct igb_ring *rx_ring)
 PAGE_SIZE,
 DMA_FROM_DEVICE,
 DMA_ATTR_SKIP_CPU_SYNC);
-   __page_frag_drain(buffer_info->page, 0,
- buffer_info->pagecnt_bias);
+   __page_frag_cache_drain(buffer_info->page,
+   buffer_info->pagecnt_bias);
 
buffer_info->page = NULL;
}
@@ -6993,7 +6993,7 @@ static struct sk_buff *igb_fetch_rx_buffer(struct 
igb_ring *rx_ring,
dma_unmap_page_attrs(rx_ring->dev, rx_buffer->dma,
 PAGE_SIZE, DMA_FROM_DEVICE,
 DMA_ATTR_SKIP_CPU_SYNC);
-   __page_frag_drain(page, 0, rx_buffer->pagecnt_bias);
+   __page_frag_cache_drain(page, rx_buffer->pagecnt_bias);
}
 
/* clear contents of rx_buffer */
diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index 6238c74e0a01..884080404e24 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -506,8 +506,7 @@ extern struct page *alloc_pages_vma(gfp_t gfp_mask, int 
order,
 extern void free_hot_cold_page_list(struct list_head *list, bool cold);
 
 struct page_frag_cache;
-extern void __page_frag_drain(struct page *page, unsigned int order,
- unsigned int count);
+extern void __page_frag_cache_drain(struct page *page, unsigned int count);
 extern void *page_frag_alloc(struct page_frag_cache *nc,
 unsigned int fragsz, gfp_t gfp_mask);
 extern void page_frag_free(void *addr);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 9534e44308b2..4b0541cd3699 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -3904,8 +3904,8 @@ void free_pages(unsigned long addr, unsigned int order)
  * drivers to provide a backing region of memory for use as either an
  * sk_buff->head, or to be used in the "frags" portion of skb_shared_info.
  */
-static struct page *__page_frag_refill(struct page_frag_cache *nc,
-  gfp_t gfp_mask)
+static struct page *__page_frag_cache_refill(struct page_frag_cache *nc,
+gfp_t gfp_mask)
 {
struct page *page = NULL;
gfp_t gfp = gfp_mask;
@@ -3925,19 +3925,20 @@ static struct page *__page_frag_refill(struct 
page_frag_cache *nc,
return page;
 }
 
-void __page_frag_drain(struct page *page, unsigned int order,
-  unsigned int count)
+void __page_frag_cache_drain(struct page *page, unsigned int count)
 {
VM_BUG_ON_PAGE(page_ref_count(page) == 0, page);
 
if (page_ref_sub_and_test(page, count)) {
+   unsigned int order = compound_order(page);
+
if (order == 0)
free_hot_cold_page(page, false);
else
__free_pages_ok(page, order);
}
 }
-EXPORT_SYMBOL(__page_frag_drain);
+EXPORT_SYMBOL(__page_frag_cache_drain);
 
 void *page_frag_alloc(struct page_frag_cache *nc,
  unsigned int fragsz, gfp_t gfp_mask)
@@ -3948,7 +3949,7 @@ void *page_frag_alloc(struct page_frag_cache *nc,
 
if (unlikely(!nc->va)) {
 refill:
-   page = __page_frag_refill(nc, gfp_mask);
+   page = __page_frag_cache_refill(nc, gfp_mask);
if (!page)
return NULL;
 



[next PATCH v4 3/3] mm: Add documentation for page fragment APIs

2017-01-03 Thread Alexander Duyck
From: Alexander Duyck 

This is a first pass at trying to add documentation for the page_frag APIs.
They may still change over time but for now I thought I would try to get
these documented so that as more network drivers and stack calls make use
of them we have one central spot to document how they are meant to be used.

Signed-off-by: Alexander Duyck 
---

v2,v3: No change

 Documentation/vm/page_frags |   42 ++
 1 file changed, 42 insertions(+)
 create mode 100644 Documentation/vm/page_frags

diff --git a/Documentation/vm/page_frags b/Documentation/vm/page_frags
new file mode 100644
index ..a6714565dbf9
--- /dev/null
+++ b/Documentation/vm/page_frags
@@ -0,0 +1,42 @@
+Page fragments
+--
+
+A page fragment is an arbitrary-length arbitrary-offset area of memory
+which resides within a 0 or higher order compound page.  Multiple
+fragments within that page are individually refcounted, in the page's
+reference counter.
+
+The page_frag functions, page_frag_alloc and page_frag_free, provide a
+simple allocation framework for page fragments.  This is used by the
+network stack and network device drivers to provide a backing region of
+memory for use as either an sk_buff->head, or to be used in the "frags"
+portion of skb_shared_info.
+
+In order to make use of the page fragment APIs a backing page fragment
+cache is needed.  This provides a central point for the fragment allocation
+and tracks allows multiple calls to make use of a cached page.  The
+advantage to doing this is that multiple calls to get_page can be avoided
+which can be expensive at allocation time.  However due to the nature of
+this caching it is required that any calls to the cache be protected by
+either a per-cpu limitation, or a per-cpu limitation and forcing interrupts
+to be disabled when executing the fragment allocation.
+
+The network stack uses two separate caches per CPU to handle fragment
+allocation.  The netdev_alloc_cache is used by callers making use of the
+__netdev_alloc_frag and __netdev_alloc_skb calls.  The napi_alloc_cache is
+used by callers of the __napi_alloc_frag and __napi_alloc_skb calls.  The
+main difference between these two calls is the context in which they may be
+called.  The "netdev" prefixed functions are usable in any context as these
+functions will disable interrupts, while the "napi" prefixed functions are
+only usable within the softirq context.
+
+Many network device drivers use a similar methodology for allocating page
+fragments, but the page fragments are cached at the ring or descriptor
+level.  In order to enable these cases it is necessary to provide a generic
+way of tearing down a page cache.  For this reason __page_frag_cache_drain
+was implemented.  It allows for freeing multiple references from a single
+page via a single call.  The advantage to doing this is that it allows for
+cleaning up the multiple references that were added to a page in order to
+avoid calling get_page per allocation.
+
+Alexander Duyck, Nov 29, 2016.



[next PATCH v4 1/3] mm: Rename __alloc_page_frag to page_frag_alloc and __free_page_frag to page_frag_free

2017-01-03 Thread Alexander Duyck
From: Alexander Duyck 

This patch renames the page frag functions to be more consistent with other
APIs.  Specifically we place the name page_frag first in the name and then
have either an alloc or free call name that we append as the suffix.  This
makes it a bit clearer in terms of naming.

In addition we drop the leading double underscores since we are technically
no longer a backing interface and instead the front end that is called from
the networking APIs.

Signed-off-by: Alexander Duyck 
---

v2: Fixed a comparison between a void* and 0 due to copy/paste from free_pages
v3: Dropped changes to function and make this a rename patch only.
This fixes a small performance regression I saw in some tests.
v4: Added back page_frag_alloc which had leaked through to patch 2 after
I had tried reordering them in my own queue.

 include/linux/gfp.h|6 +++---
 include/linux/skbuff.h |2 +-
 mm/page_alloc.c|   10 +-
 net/core/skbuff.c  |8 
 4 files changed, 13 insertions(+), 13 deletions(-)

diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index 4175dca4ac39..6238c74e0a01 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -508,9 +508,9 @@ extern struct page *alloc_pages_vma(gfp_t gfp_mask, int 
order,
 struct page_frag_cache;
 extern void __page_frag_drain(struct page *page, unsigned int order,
  unsigned int count);
-extern void *__alloc_page_frag(struct page_frag_cache *nc,
-  unsigned int fragsz, gfp_t gfp_mask);
-extern void __free_page_frag(void *addr);
+extern void *page_frag_alloc(struct page_frag_cache *nc,
+unsigned int fragsz, gfp_t gfp_mask);
+extern void page_frag_free(void *addr);
 
 #define __free_page(page) __free_pages((page), 0)
 #define free_page(addr) free_pages((addr), 0)
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index b53c0cfd417e..a410715bbef8 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -2480,7 +2480,7 @@ static inline struct sk_buff 
*netdev_alloc_skb_ip_align(struct net_device *dev,
 
 static inline void skb_free_frag(void *addr)
 {
-   __free_page_frag(addr);
+   page_frag_free(addr);
 }
 
 void *napi_alloc_frag(unsigned int fragsz);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 2c6d5f64feca..9534e44308b2 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -3939,8 +3939,8 @@ void __page_frag_drain(struct page *page, unsigned int 
order,
 }
 EXPORT_SYMBOL(__page_frag_drain);
 
-void *__alloc_page_frag(struct page_frag_cache *nc,
-   unsigned int fragsz, gfp_t gfp_mask)
+void *page_frag_alloc(struct page_frag_cache *nc,
+ unsigned int fragsz, gfp_t gfp_mask)
 {
unsigned int size = PAGE_SIZE;
struct page *page;
@@ -3991,19 +3991,19 @@ void *__alloc_page_frag(struct page_frag_cache *nc,
 
return nc->va + offset;
 }
-EXPORT_SYMBOL(__alloc_page_frag);
+EXPORT_SYMBOL(page_frag_alloc);
 
 /*
  * Frees a page fragment allocated out of either a compound or order 0 page.
  */
-void __free_page_frag(void *addr)
+void page_frag_free(void *addr)
 {
struct page *page = virt_to_head_page(addr);
 
if (unlikely(put_page_testzero(page)))
__free_pages_ok(page, compound_order(page));
 }
-EXPORT_SYMBOL(__free_page_frag);
+EXPORT_SYMBOL(page_frag_free);
 
 static void *make_alloc_exact(unsigned long addr, unsigned int order,
size_t size)
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 5a03730fbc1a..734c71468b01 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -369,7 +369,7 @@ static void *__netdev_alloc_frag(unsigned int fragsz, gfp_t 
gfp_mask)
 
local_irq_save(flags);
nc = this_cpu_ptr(_alloc_cache);
-   data = __alloc_page_frag(nc, fragsz, gfp_mask);
+   data = page_frag_alloc(nc, fragsz, gfp_mask);
local_irq_restore(flags);
return data;
 }
@@ -391,7 +391,7 @@ static void *__napi_alloc_frag(unsigned int fragsz, gfp_t 
gfp_mask)
 {
struct napi_alloc_cache *nc = this_cpu_ptr(_alloc_cache);
 
-   return __alloc_page_frag(>page, fragsz, gfp_mask);
+   return page_frag_alloc(>page, fragsz, gfp_mask);
 }
 
 void *napi_alloc_frag(unsigned int fragsz)
@@ -441,7 +441,7 @@ struct sk_buff *__netdev_alloc_skb(struct net_device *dev, 
unsigned int len,
local_irq_save(flags);
 
nc = this_cpu_ptr(_alloc_cache);
-   data = __alloc_page_frag(nc, len, gfp_mask);
+   data = page_frag_alloc(nc, len, gfp_mask);
pfmemalloc = nc->pfmemalloc;
 
local_irq_restore(flags);
@@ -505,7 +505,7 @@ struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, 
unsigned int len,
if (sk_memalloc_socks())
gfp_mask |= __GFP_MEMALLOC;
 
-   data = __alloc_page_frag(>page, len, gfp_mask);
+   data = page_frag_alloc(>page, len, 

[next PATCH v4 3/3] mm: Add documentation for page fragment APIs

2017-01-03 Thread Alexander Duyck
From: Alexander Duyck 

This is a first pass at trying to add documentation for the page_frag APIs.
They may still change over time but for now I thought I would try to get
these documented so that as more network drivers and stack calls make use
of them we have one central spot to document how they are meant to be used.

Signed-off-by: Alexander Duyck 
---

v2,v3: No change

 Documentation/vm/page_frags |   42 ++
 1 file changed, 42 insertions(+)
 create mode 100644 Documentation/vm/page_frags

diff --git a/Documentation/vm/page_frags b/Documentation/vm/page_frags
new file mode 100644
index ..a6714565dbf9
--- /dev/null
+++ b/Documentation/vm/page_frags
@@ -0,0 +1,42 @@
+Page fragments
+--
+
+A page fragment is an arbitrary-length arbitrary-offset area of memory
+which resides within a 0 or higher order compound page.  Multiple
+fragments within that page are individually refcounted, in the page's
+reference counter.
+
+The page_frag functions, page_frag_alloc and page_frag_free, provide a
+simple allocation framework for page fragments.  This is used by the
+network stack and network device drivers to provide a backing region of
+memory for use as either an sk_buff->head, or to be used in the "frags"
+portion of skb_shared_info.
+
+In order to make use of the page fragment APIs a backing page fragment
+cache is needed.  This provides a central point for the fragment allocation
+and tracks allows multiple calls to make use of a cached page.  The
+advantage to doing this is that multiple calls to get_page can be avoided
+which can be expensive at allocation time.  However due to the nature of
+this caching it is required that any calls to the cache be protected by
+either a per-cpu limitation, or a per-cpu limitation and forcing interrupts
+to be disabled when executing the fragment allocation.
+
+The network stack uses two separate caches per CPU to handle fragment
+allocation.  The netdev_alloc_cache is used by callers making use of the
+__netdev_alloc_frag and __netdev_alloc_skb calls.  The napi_alloc_cache is
+used by callers of the __napi_alloc_frag and __napi_alloc_skb calls.  The
+main difference between these two calls is the context in which they may be
+called.  The "netdev" prefixed functions are usable in any context as these
+functions will disable interrupts, while the "napi" prefixed functions are
+only usable within the softirq context.
+
+Many network device drivers use a similar methodology for allocating page
+fragments, but the page fragments are cached at the ring or descriptor
+level.  In order to enable these cases it is necessary to provide a generic
+way of tearing down a page cache.  For this reason __page_frag_cache_drain
+was implemented.  It allows for freeing multiple references from a single
+page via a single call.  The advantage to doing this is that it allows for
+cleaning up the multiple references that were added to a page in order to
+avoid calling get_page per allocation.
+
+Alexander Duyck, Nov 29, 2016.



[next PATCH v4 1/3] mm: Rename __alloc_page_frag to page_frag_alloc and __free_page_frag to page_frag_free

2017-01-03 Thread Alexander Duyck
From: Alexander Duyck 

This patch renames the page frag functions to be more consistent with other
APIs.  Specifically we place the name page_frag first in the name and then
have either an alloc or free call name that we append as the suffix.  This
makes it a bit clearer in terms of naming.

In addition we drop the leading double underscores since we are technically
no longer a backing interface and instead the front end that is called from
the networking APIs.

Signed-off-by: Alexander Duyck 
---

v2: Fixed a comparison between a void* and 0 due to copy/paste from free_pages
v3: Dropped changes to function and make this a rename patch only.
This fixes a small performance regression I saw in some tests.
v4: Added back page_frag_alloc which had leaked through to patch 2 after
I had tried reordering them in my own queue.

 include/linux/gfp.h|6 +++---
 include/linux/skbuff.h |2 +-
 mm/page_alloc.c|   10 +-
 net/core/skbuff.c  |8 
 4 files changed, 13 insertions(+), 13 deletions(-)

diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index 4175dca4ac39..6238c74e0a01 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -508,9 +508,9 @@ extern struct page *alloc_pages_vma(gfp_t gfp_mask, int 
order,
 struct page_frag_cache;
 extern void __page_frag_drain(struct page *page, unsigned int order,
  unsigned int count);
-extern void *__alloc_page_frag(struct page_frag_cache *nc,
-  unsigned int fragsz, gfp_t gfp_mask);
-extern void __free_page_frag(void *addr);
+extern void *page_frag_alloc(struct page_frag_cache *nc,
+unsigned int fragsz, gfp_t gfp_mask);
+extern void page_frag_free(void *addr);
 
 #define __free_page(page) __free_pages((page), 0)
 #define free_page(addr) free_pages((addr), 0)
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index b53c0cfd417e..a410715bbef8 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -2480,7 +2480,7 @@ static inline struct sk_buff 
*netdev_alloc_skb_ip_align(struct net_device *dev,
 
 static inline void skb_free_frag(void *addr)
 {
-   __free_page_frag(addr);
+   page_frag_free(addr);
 }
 
 void *napi_alloc_frag(unsigned int fragsz);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 2c6d5f64feca..9534e44308b2 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -3939,8 +3939,8 @@ void __page_frag_drain(struct page *page, unsigned int 
order,
 }
 EXPORT_SYMBOL(__page_frag_drain);
 
-void *__alloc_page_frag(struct page_frag_cache *nc,
-   unsigned int fragsz, gfp_t gfp_mask)
+void *page_frag_alloc(struct page_frag_cache *nc,
+ unsigned int fragsz, gfp_t gfp_mask)
 {
unsigned int size = PAGE_SIZE;
struct page *page;
@@ -3991,19 +3991,19 @@ void *__alloc_page_frag(struct page_frag_cache *nc,
 
return nc->va + offset;
 }
-EXPORT_SYMBOL(__alloc_page_frag);
+EXPORT_SYMBOL(page_frag_alloc);
 
 /*
  * Frees a page fragment allocated out of either a compound or order 0 page.
  */
-void __free_page_frag(void *addr)
+void page_frag_free(void *addr)
 {
struct page *page = virt_to_head_page(addr);
 
if (unlikely(put_page_testzero(page)))
__free_pages_ok(page, compound_order(page));
 }
-EXPORT_SYMBOL(__free_page_frag);
+EXPORT_SYMBOL(page_frag_free);
 
 static void *make_alloc_exact(unsigned long addr, unsigned int order,
size_t size)
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 5a03730fbc1a..734c71468b01 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -369,7 +369,7 @@ static void *__netdev_alloc_frag(unsigned int fragsz, gfp_t 
gfp_mask)
 
local_irq_save(flags);
nc = this_cpu_ptr(_alloc_cache);
-   data = __alloc_page_frag(nc, fragsz, gfp_mask);
+   data = page_frag_alloc(nc, fragsz, gfp_mask);
local_irq_restore(flags);
return data;
 }
@@ -391,7 +391,7 @@ static void *__napi_alloc_frag(unsigned int fragsz, gfp_t 
gfp_mask)
 {
struct napi_alloc_cache *nc = this_cpu_ptr(_alloc_cache);
 
-   return __alloc_page_frag(>page, fragsz, gfp_mask);
+   return page_frag_alloc(>page, fragsz, gfp_mask);
 }
 
 void *napi_alloc_frag(unsigned int fragsz)
@@ -441,7 +441,7 @@ struct sk_buff *__netdev_alloc_skb(struct net_device *dev, 
unsigned int len,
local_irq_save(flags);
 
nc = this_cpu_ptr(_alloc_cache);
-   data = __alloc_page_frag(nc, len, gfp_mask);
+   data = page_frag_alloc(nc, len, gfp_mask);
pfmemalloc = nc->pfmemalloc;
 
local_irq_restore(flags);
@@ -505,7 +505,7 @@ struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, 
unsigned int len,
if (sk_memalloc_socks())
gfp_mask |= __GFP_MEMALLOC;
 
-   data = __alloc_page_frag(>page, len, gfp_mask);
+   data = page_frag_alloc(>page, len, gfp_mask);
if (unlikely(!data))
return 

[next PATCH v4 0/3] Page fragment updates

2017-01-03 Thread Alexander Duyck
This patch series takes care of a few cleanups for the page fragments API.

First we do some renames so that things are much more consistent.  First we
move the page_frag_ portion of the name to the front of the functions
names.  Secondly we split out the cache specific functions from the other
page fragment functions by adding the word "cache" to the name.

Finally I added a bit of documentation that will hopefully help to explain
some of this.  I plan to revisit this later as we get things more ironed
out in the near future with the changes planned for the DMA setup to
support eXpress Data Path.

---

v2: Fixed a comparison between a void* and 0 due to copy/paste from free_pages
v3: Updated first rename patch so that it is just a rename and doesn't impact
the actual functionality to avoid performance regression.
v4: Fix mangling that occured due to a bad merge fix when patches 1 and 2
were swapped and then swapped back.

I'm submitting this to Intel Wired Lan and Jeff Kirsher's "next-queue" for
acceptance as I have a series of other patches for igb that are blocked by
by these patches since I had to rename the functionality fo draining extra
references.

This series was going to be accepted for mmotm back when it was v1, however
since then I found a few minor issues that needed to be fixed.

I am hoping to get an Acked-by from Andrew Morton for these patches and
then have them submitted to David Miller as he has said he will accept them
if I get the Acked-by.  In the meantime if these can be applied to
next-queue while waiting on that Acked-by then I can submit the other
patches for igb and ixgbe for testing.

Alexander Duyck (3):
  mm: Rename __alloc_page_frag to page_frag_alloc and __free_page_frag to 
page_frag_free
  mm: Rename __page_frag functions to __page_frag_cache, drop order from 
drain
  mm: Add documentation for page fragment APIs


 Documentation/vm/page_frags   |   42 +
 drivers/net/ethernet/intel/igb/igb_main.c |6 ++--
 include/linux/gfp.h   |9 +++---
 include/linux/skbuff.h|2 +
 mm/page_alloc.c   |   23 
 net/core/skbuff.c |8 +++---
 6 files changed, 66 insertions(+), 24 deletions(-)
 create mode 100644 Documentation/vm/page_frags

--


[next PATCH v4 0/3] Page fragment updates

2017-01-03 Thread Alexander Duyck
This patch series takes care of a few cleanups for the page fragments API.

First we do some renames so that things are much more consistent.  First we
move the page_frag_ portion of the name to the front of the functions
names.  Secondly we split out the cache specific functions from the other
page fragment functions by adding the word "cache" to the name.

Finally I added a bit of documentation that will hopefully help to explain
some of this.  I plan to revisit this later as we get things more ironed
out in the near future with the changes planned for the DMA setup to
support eXpress Data Path.

---

v2: Fixed a comparison between a void* and 0 due to copy/paste from free_pages
v3: Updated first rename patch so that it is just a rename and doesn't impact
the actual functionality to avoid performance regression.
v4: Fix mangling that occured due to a bad merge fix when patches 1 and 2
were swapped and then swapped back.

I'm submitting this to Intel Wired Lan and Jeff Kirsher's "next-queue" for
acceptance as I have a series of other patches for igb that are blocked by
by these patches since I had to rename the functionality fo draining extra
references.

This series was going to be accepted for mmotm back when it was v1, however
since then I found a few minor issues that needed to be fixed.

I am hoping to get an Acked-by from Andrew Morton for these patches and
then have them submitted to David Miller as he has said he will accept them
if I get the Acked-by.  In the meantime if these can be applied to
next-queue while waiting on that Acked-by then I can submit the other
patches for igb and ixgbe for testing.

Alexander Duyck (3):
  mm: Rename __alloc_page_frag to page_frag_alloc and __free_page_frag to 
page_frag_free
  mm: Rename __page_frag functions to __page_frag_cache, drop order from 
drain
  mm: Add documentation for page fragment APIs


 Documentation/vm/page_frags   |   42 +
 drivers/net/ethernet/intel/igb/igb_main.c |6 ++--
 include/linux/gfp.h   |9 +++---
 include/linux/skbuff.h|2 +
 mm/page_alloc.c   |   23 
 net/core/skbuff.c |8 +++---
 6 files changed, 66 insertions(+), 24 deletions(-)
 create mode 100644 Documentation/vm/page_frags

--


Re: [PATCH v4] x86: fix kaslr and memmap collision

2017-01-03 Thread Baoquan He
Hi Dave,

I have several concerns, please see the inline comments.

On 01/03/17 at 01:48pm, Dave Jiang wrote:
> CONFIG_RANDOMIZE_BASE relocates the kernel to a random base address.
> However it does not take into account the memmap= parameter passed in from
> the kernel cmdline. This results in the kernel sometimes being put in
> the middle of the user memmap. Teaching kaslr to not insert the kernel in
~~~
It could be better to be replaced with "user-defined physical RAM map"
or memmap directly because memmap is meaning user-defined physical RAM
map. Please check the boot info printing about them. I was confused at
first glance.
> memmap defined regions. We will support up to 4 memmap regions. Any
> additional regions will cause kaslr to disable. The mem_avoid set has
> been augmented to add up to 4 regions of memmaps provided by the user
  ~~~
4 un-usable memmap region need be cared, usable memory is not
concerned.
> to exclude those regions from the set of valid address range to insert
> the uncompressed kernel image. The nn@ss ranges will be skipped by the
> mem_avoid set since it indicates memory useable.
> 
> Signed-off-by: Dave Jiang 
> ---
>  arch/x86/boot/boot.h |3 +
>  arch/x86/boot/compressed/kaslr.c |  131 
> ++
>  arch/x86/boot/string.c   |   38 +++
>  3 files changed, 172 insertions(+)
> 
> diff --git a/arch/x86/boot/boot.h b/arch/x86/boot/boot.h
> index e5612f3..59c2075 100644
> --- a/arch/x86/boot/boot.h
> +++ b/arch/x86/boot/boot.h
> @@ -332,7 +332,10 @@ int strncmp(const char *cs, const char *ct, size_t 
> count);
>  size_t strnlen(const char *s, size_t maxlen);
>  unsigned int atou(const char *s);
>  unsigned long long simple_strtoull(const char *cp, char **endp, unsigned int 
> base);
> +unsigned long simple_strtoul(const char *cp, char **endp, unsigned int base);
> +long simple_strtol(const char *cp, char **endp, unsigned int base);
>  size_t strlen(const char *s);
> +char *strchr(const char *s, int c);
>  
>  /* tty.c */
>  void puts(const char *);
> diff --git a/arch/x86/boot/compressed/kaslr.c 
> b/arch/x86/boot/compressed/kaslr.c
> index a66854d..f5a1c8d 100644
> --- a/arch/x86/boot/compressed/kaslr.c
> +++ b/arch/x86/boot/compressed/kaslr.c
> @@ -11,6 +11,7 @@
>   */
>  #include "misc.h"
>  #include "error.h"
> +#include "../boot.h"
>  
>  #include 
>  #include 
> @@ -61,9 +62,16 @@ enum mem_avoid_index {
>   MEM_AVOID_INITRD,
>   MEM_AVOID_CMDLINE,
>   MEM_AVOID_BOOTPARAMS,
> + MEM_AVOID_MEMMAP1,
> + MEM_AVOID_MEMMAP2,
> + MEM_AVOID_MEMMAP3,
> + MEM_AVOID_MEMMAP4,

This looks not good. Could it be done like fixed_addresses?
Something like:

MEM_AVOID_MEMMAP_BEGIN,
MEM_AVOID_MEMMAP_BEGIN + MEM_AVOID_MAX -1,

Please point it out to me if there's some existing code in kernel like
your way, I can also accept it.

>   MEM_AVOID_MAX,
>  };
>  
> +/* only supporting at most 4 memmap regions with kaslr */
And here, "Only supporting at most 4 un-usable memmap regions with kaslr"?
Surely this is based on if you will ignore the usable memory and do not
store it as 0. And also the log need be changed too accordingly.
> +#define MAX_MEMMAP_REGIONS   4
> +
>  static struct mem_vector mem_avoid[MEM_AVOID_MAX];
>  
>  static bool mem_overlaps(struct mem_vector *one, struct mem_vector *two)
> @@ -77,6 +85,121 @@ static bool mem_overlaps(struct mem_vector *one, struct 
> mem_vector *two)
>   return true;
>  }
>  
> +/**
> + *   _memparse - parse a string with mem suffixes into a number
> + *   @ptr: Where parse begins
> + *   @retptr: (output) Optional pointer to next char after parse completes
> + *
> + *   Parses a string into a number.  The number stored at @ptr is
> + *   potentially suffixed with K, M, G, T, P, E.
> + */
> +static unsigned long long _memparse(const char *ptr, char **retptr)
> +{
> + char *endptr;   /* local pointer to end of parsed string */
> +
> + unsigned long long ret = simple_strtoull(ptr, , 0);
> +
> + switch (*endptr) {
> + case 'E':
> + case 'e':
> + ret <<= 10;
> + case 'P':
> + case 'p':
> + ret <<= 10;
> + case 'T':
> + case 't':
> + ret <<= 10;
> + case 'G':
> + case 'g':
> + ret <<= 10;
> + case 'M':
> + case 'm':
> + ret <<= 10;
> + case 'K':
> + case 'k':
> + ret <<= 10;
> + endptr++;
> + default:
> + break;
> + }
> +
> + if (retptr)
> + *retptr = endptr;
> +
> + return ret;
> +}
> +
> +static int
> +parse_memmap(char *p, unsigned long long *start, unsigned long long *size)
> +{
> + char *oldp;
> +
> + if (!p)
> + return -EINVAL;
> +
> + /* we don't care about this option here */
> + if (!strncmp(p, "exactmap", 8))
> + return -EINVAL;
> +
> +  

Re: [PATCH v4] x86: fix kaslr and memmap collision

2017-01-03 Thread Baoquan He
Hi Dave,

I have several concerns, please see the inline comments.

On 01/03/17 at 01:48pm, Dave Jiang wrote:
> CONFIG_RANDOMIZE_BASE relocates the kernel to a random base address.
> However it does not take into account the memmap= parameter passed in from
> the kernel cmdline. This results in the kernel sometimes being put in
> the middle of the user memmap. Teaching kaslr to not insert the kernel in
~~~
It could be better to be replaced with "user-defined physical RAM map"
or memmap directly because memmap is meaning user-defined physical RAM
map. Please check the boot info printing about them. I was confused at
first glance.
> memmap defined regions. We will support up to 4 memmap regions. Any
> additional regions will cause kaslr to disable. The mem_avoid set has
> been augmented to add up to 4 regions of memmaps provided by the user
  ~~~
4 un-usable memmap region need be cared, usable memory is not
concerned.
> to exclude those regions from the set of valid address range to insert
> the uncompressed kernel image. The nn@ss ranges will be skipped by the
> mem_avoid set since it indicates memory useable.
> 
> Signed-off-by: Dave Jiang 
> ---
>  arch/x86/boot/boot.h |3 +
>  arch/x86/boot/compressed/kaslr.c |  131 
> ++
>  arch/x86/boot/string.c   |   38 +++
>  3 files changed, 172 insertions(+)
> 
> diff --git a/arch/x86/boot/boot.h b/arch/x86/boot/boot.h
> index e5612f3..59c2075 100644
> --- a/arch/x86/boot/boot.h
> +++ b/arch/x86/boot/boot.h
> @@ -332,7 +332,10 @@ int strncmp(const char *cs, const char *ct, size_t 
> count);
>  size_t strnlen(const char *s, size_t maxlen);
>  unsigned int atou(const char *s);
>  unsigned long long simple_strtoull(const char *cp, char **endp, unsigned int 
> base);
> +unsigned long simple_strtoul(const char *cp, char **endp, unsigned int base);
> +long simple_strtol(const char *cp, char **endp, unsigned int base);
>  size_t strlen(const char *s);
> +char *strchr(const char *s, int c);
>  
>  /* tty.c */
>  void puts(const char *);
> diff --git a/arch/x86/boot/compressed/kaslr.c 
> b/arch/x86/boot/compressed/kaslr.c
> index a66854d..f5a1c8d 100644
> --- a/arch/x86/boot/compressed/kaslr.c
> +++ b/arch/x86/boot/compressed/kaslr.c
> @@ -11,6 +11,7 @@
>   */
>  #include "misc.h"
>  #include "error.h"
> +#include "../boot.h"
>  
>  #include 
>  #include 
> @@ -61,9 +62,16 @@ enum mem_avoid_index {
>   MEM_AVOID_INITRD,
>   MEM_AVOID_CMDLINE,
>   MEM_AVOID_BOOTPARAMS,
> + MEM_AVOID_MEMMAP1,
> + MEM_AVOID_MEMMAP2,
> + MEM_AVOID_MEMMAP3,
> + MEM_AVOID_MEMMAP4,

This looks not good. Could it be done like fixed_addresses?
Something like:

MEM_AVOID_MEMMAP_BEGIN,
MEM_AVOID_MEMMAP_BEGIN + MEM_AVOID_MAX -1,

Please point it out to me if there's some existing code in kernel like
your way, I can also accept it.

>   MEM_AVOID_MAX,
>  };
>  
> +/* only supporting at most 4 memmap regions with kaslr */
And here, "Only supporting at most 4 un-usable memmap regions with kaslr"?
Surely this is based on if you will ignore the usable memory and do not
store it as 0. And also the log need be changed too accordingly.
> +#define MAX_MEMMAP_REGIONS   4
> +
>  static struct mem_vector mem_avoid[MEM_AVOID_MAX];
>  
>  static bool mem_overlaps(struct mem_vector *one, struct mem_vector *two)
> @@ -77,6 +85,121 @@ static bool mem_overlaps(struct mem_vector *one, struct 
> mem_vector *two)
>   return true;
>  }
>  
> +/**
> + *   _memparse - parse a string with mem suffixes into a number
> + *   @ptr: Where parse begins
> + *   @retptr: (output) Optional pointer to next char after parse completes
> + *
> + *   Parses a string into a number.  The number stored at @ptr is
> + *   potentially suffixed with K, M, G, T, P, E.
> + */
> +static unsigned long long _memparse(const char *ptr, char **retptr)
> +{
> + char *endptr;   /* local pointer to end of parsed string */
> +
> + unsigned long long ret = simple_strtoull(ptr, , 0);
> +
> + switch (*endptr) {
> + case 'E':
> + case 'e':
> + ret <<= 10;
> + case 'P':
> + case 'p':
> + ret <<= 10;
> + case 'T':
> + case 't':
> + ret <<= 10;
> + case 'G':
> + case 'g':
> + ret <<= 10;
> + case 'M':
> + case 'm':
> + ret <<= 10;
> + case 'K':
> + case 'k':
> + ret <<= 10;
> + endptr++;
> + default:
> + break;
> + }
> +
> + if (retptr)
> + *retptr = endptr;
> +
> + return ret;
> +}
> +
> +static int
> +parse_memmap(char *p, unsigned long long *start, unsigned long long *size)
> +{
> + char *oldp;
> +
> + if (!p)
> + return -EINVAL;
> +
> + /* we don't care about this option here */
> + if (!strncmp(p, "exactmap", 8))
> + return -EINVAL;
> +
> + oldp = p;
> + 

RE: [PATCH v3 2/3] USB3/DWC3: Add property "snps, incr-burst-type-adjustment" for INCR burst type

2017-01-03 Thread Jerry Huang
Hi, Rob,

> -Original Message-
> From: Rob Herring [mailto:r...@kernel.org]
> Sent: Wednesday, January 04, 2017 5:24 AM
> To: Jerry Huang 
> Cc: ba...@kernel.org; mark.rutl...@arm.com; catalin.mari...@arm.com;
> will.dea...@arm.com; li...@armlinux.org.uk; devicet...@vger.kernel.org;
> linux-...@vger.kernel.org; linux-kernel@vger.kernel.org; linux-arm-
> ker...@lists.infradead.org
> Subject: Re: [PATCH v3 2/3] USB3/DWC3: Add property "snps, incr-burst-
> type-adjustment" for INCR burst type
> 
> On Thu, Dec 22, 2016 at 8:52 PM, Jerry Huang  wrote:
> > Hi, Rob,
> >> -Original Message-
> >> From: Rob Herring [mailto:r...@kernel.org]
> >> Sent: Friday, December 23, 2016 2:45 AM
> >> To: Jerry Huang 
> >> Cc: ba...@kernel.org; mark.rutl...@arm.com; catalin.mari...@arm.com;
> >> will.dea...@arm.com; li...@armlinux.org.uk;
> >> devicet...@vger.kernel.org; linux-...@vger.kernel.org;
> >> linux-kernel@vger.kernel.org; linux-arm- ker...@lists.infradead.org
> >> Subject: Re: [PATCH v3 2/3] USB3/DWC3: Add property "snps,
> >> incr-burst- type-adjustment" for INCR burst type
> >>
> >> On Mon, Dec 19, 2016 at 05:25:53PM +0800, Changming Huang wrote:
> >> > New property "snps,incr-burst-type-adjustment = , " for
> >> > USB3.0
> >> DWC3.
> >> > Field "x": 1/0 - undefined length INCR burst type enable or not;
> >> > Field
> >> > "y": INCR4/INCR8/INCR16/INCR32/INCR64/INCR128/INCR256 burst type.
> >> >
> >> > While enabling undefined length INCR burst type and INCR16 burst
> >> > type, get better write performance on NXP Layerscape platform:
> >> > around 3% improvement (from 364MB/s to 375MB/s).
> >> >
> >> > Signed-off-by: Changming Huang 
> >> > ---
> >> > Changes in v3:
> >> >   - add new property for INCR burst in usb node.
> >> >
> >> >  Documentation/devicetree/bindings/usb/dwc3.txt |5 +
> >> >  arch/arm/boot/dts/ls1021a.dtsi |1 +
> >> >  arch/arm64/boot/dts/freescale/fsl-ls1043a.dtsi |3 +++
> >> >  arch/arm64/boot/dts/freescale/fsl-ls2080a.dtsi |2 ++
> >> >  4 files changed, 11 insertions(+)
> >> >
> >> > diff --git a/Documentation/devicetree/bindings/usb/dwc3.txt
> >> > b/Documentation/devicetree/bindings/usb/dwc3.txt
> >> > index e3e6983..8c405a3 100644
> >> > --- a/Documentation/devicetree/bindings/usb/dwc3.txt
> >> > +++ b/Documentation/devicetree/bindings/usb/dwc3.txt
> >> > @@ -55,6 +55,10 @@ Optional properties:
> >> > fladj_30mhz_sdbnd signal is invalid or incorrect.
> >> >
> >> >   -  tx-fifo-resize: determines if the FIFO *has* to be
> >> reallocated.
> >> > + - snps,incr-burst-type-adjustment: Value for INCR burst type of
> >> GSBUSCFG0
> >> > +   register, undefined length INCR burst type enable and INCRx type.
> >> > +   First field is for undefined length INCR burst type enable or not.
> >> > +   Second field is for largest INCRx type enabled.
> >>
> >> Why do you need the first field? Is the 2nd field used if the 1st is 0?
> >> If not, then just use the presence of the property to enable or not.
> > The first field is one switch.
> > When it is 1, means undefined length INCR burst type enabled, we can use
> any length less than or equal to the largest-enabled burst length of
> INCR4/8/16/32/64/128/256.
> > When it is zero, means INCRx burst mode enabled, we can use one fixed
> burst length of 1/4/8/16/32/64/128/256 byte.
> > So, the 2nd field is used if the 1st is 0, we need to select one largest 
> > burst
> length the USB controller can support.
> > If we don't want to change the value of this register (use the default 
> > value),
> we don't need to add this property to usb node.
> 
> Just make this a single value with 0 meaning INCR and 4/8/16/etc being INCRx.
Maybe, I didn't describe it clearly. 
According to DWC3 spec, the value "0" of field INCRBrstEna means INCRx burst 
mode, 1 means INCR burst mode.
Regardless of the value of INCRBrstEna [bit0], we need to modify the other 
field bit[1,2,3,4,5,6,7] to one INCR burst type  for the platform supported.
Ad you mentioned, if we just use a single value with 0 meaning INCR and 
4/8/16/etc being INCRx.
I understand totally that when it is none-zero, we can use it for INCR burst 
mode. 
Then, when it is 0, how to select the INCRx value?

So, I think we still need two vaue to specify INCRBrstEna and INCRx burst type.


RE: [PATCH v3 2/3] USB3/DWC3: Add property "snps, incr-burst-type-adjustment" for INCR burst type

2017-01-03 Thread Jerry Huang
Hi, Rob,

> -Original Message-
> From: Rob Herring [mailto:r...@kernel.org]
> Sent: Wednesday, January 04, 2017 5:24 AM
> To: Jerry Huang 
> Cc: ba...@kernel.org; mark.rutl...@arm.com; catalin.mari...@arm.com;
> will.dea...@arm.com; li...@armlinux.org.uk; devicet...@vger.kernel.org;
> linux-...@vger.kernel.org; linux-kernel@vger.kernel.org; linux-arm-
> ker...@lists.infradead.org
> Subject: Re: [PATCH v3 2/3] USB3/DWC3: Add property "snps, incr-burst-
> type-adjustment" for INCR burst type
> 
> On Thu, Dec 22, 2016 at 8:52 PM, Jerry Huang  wrote:
> > Hi, Rob,
> >> -Original Message-
> >> From: Rob Herring [mailto:r...@kernel.org]
> >> Sent: Friday, December 23, 2016 2:45 AM
> >> To: Jerry Huang 
> >> Cc: ba...@kernel.org; mark.rutl...@arm.com; catalin.mari...@arm.com;
> >> will.dea...@arm.com; li...@armlinux.org.uk;
> >> devicet...@vger.kernel.org; linux-...@vger.kernel.org;
> >> linux-kernel@vger.kernel.org; linux-arm- ker...@lists.infradead.org
> >> Subject: Re: [PATCH v3 2/3] USB3/DWC3: Add property "snps,
> >> incr-burst- type-adjustment" for INCR burst type
> >>
> >> On Mon, Dec 19, 2016 at 05:25:53PM +0800, Changming Huang wrote:
> >> > New property "snps,incr-burst-type-adjustment = , " for
> >> > USB3.0
> >> DWC3.
> >> > Field "x": 1/0 - undefined length INCR burst type enable or not;
> >> > Field
> >> > "y": INCR4/INCR8/INCR16/INCR32/INCR64/INCR128/INCR256 burst type.
> >> >
> >> > While enabling undefined length INCR burst type and INCR16 burst
> >> > type, get better write performance on NXP Layerscape platform:
> >> > around 3% improvement (from 364MB/s to 375MB/s).
> >> >
> >> > Signed-off-by: Changming Huang 
> >> > ---
> >> > Changes in v3:
> >> >   - add new property for INCR burst in usb node.
> >> >
> >> >  Documentation/devicetree/bindings/usb/dwc3.txt |5 +
> >> >  arch/arm/boot/dts/ls1021a.dtsi |1 +
> >> >  arch/arm64/boot/dts/freescale/fsl-ls1043a.dtsi |3 +++
> >> >  arch/arm64/boot/dts/freescale/fsl-ls2080a.dtsi |2 ++
> >> >  4 files changed, 11 insertions(+)
> >> >
> >> > diff --git a/Documentation/devicetree/bindings/usb/dwc3.txt
> >> > b/Documentation/devicetree/bindings/usb/dwc3.txt
> >> > index e3e6983..8c405a3 100644
> >> > --- a/Documentation/devicetree/bindings/usb/dwc3.txt
> >> > +++ b/Documentation/devicetree/bindings/usb/dwc3.txt
> >> > @@ -55,6 +55,10 @@ Optional properties:
> >> > fladj_30mhz_sdbnd signal is invalid or incorrect.
> >> >
> >> >   -  tx-fifo-resize: determines if the FIFO *has* to be
> >> reallocated.
> >> > + - snps,incr-burst-type-adjustment: Value for INCR burst type of
> >> GSBUSCFG0
> >> > +   register, undefined length INCR burst type enable and INCRx type.
> >> > +   First field is for undefined length INCR burst type enable or not.
> >> > +   Second field is for largest INCRx type enabled.
> >>
> >> Why do you need the first field? Is the 2nd field used if the 1st is 0?
> >> If not, then just use the presence of the property to enable or not.
> > The first field is one switch.
> > When it is 1, means undefined length INCR burst type enabled, we can use
> any length less than or equal to the largest-enabled burst length of
> INCR4/8/16/32/64/128/256.
> > When it is zero, means INCRx burst mode enabled, we can use one fixed
> burst length of 1/4/8/16/32/64/128/256 byte.
> > So, the 2nd field is used if the 1st is 0, we need to select one largest 
> > burst
> length the USB controller can support.
> > If we don't want to change the value of this register (use the default 
> > value),
> we don't need to add this property to usb node.
> 
> Just make this a single value with 0 meaning INCR and 4/8/16/etc being INCRx.
Maybe, I didn't describe it clearly. 
According to DWC3 spec, the value "0" of field INCRBrstEna means INCRx burst 
mode, 1 means INCR burst mode.
Regardless of the value of INCRBrstEna [bit0], we need to modify the other 
field bit[1,2,3,4,5,6,7] to one INCR burst type  for the platform supported.
Ad you mentioned, if we just use a single value with 0 meaning INCR and 
4/8/16/etc being INCRx.
I understand totally that when it is none-zero, we can use it for INCR burst 
mode. 
Then, when it is 0, how to select the INCRx value?

So, I think we still need two vaue to specify INCRBrstEna and INCRx burst type.


Re: [PATCH 0/3] hisi_sas: some CQ processing fixes

2017-01-03 Thread Hanjun Guo

On 2017/1/3 20:24, John Garry wrote:

This patchset fixes some issues related to servicing of the
completion queue interrupt.
The major fix is that sensitive hisi_hba structures need to be
locked when free'ing a slot.
Another modification is that the v2 hw completion queue irq is
now serviced with a tasklet, as too much work was being done in
the ISR.


Tested on v2 based sas hardware and the crashes are gone,

Tested-by: Hanjun Guo 

Thanks
Hanjun


Re: [PATCH 0/3] hisi_sas: some CQ processing fixes

2017-01-03 Thread Hanjun Guo

On 2017/1/3 20:24, John Garry wrote:

This patchset fixes some issues related to servicing of the
completion queue interrupt.
The major fix is that sensitive hisi_hba structures need to be
locked when free'ing a slot.
Another modification is that the v2 hw completion queue irq is
now serviced with a tasklet, as too much work was being done in
the ISR.


Tested on v2 based sas hardware and the crashes are gone,

Tested-by: Hanjun Guo 

Thanks
Hanjun


Re: [PATCH 0/2] Begin auditing SECCOMP_RET_ERRNO return actions

2017-01-03 Thread Tyler Hicks
On 01/04/2017 04:44 AM, Kees Cook wrote:
> On Tue, Jan 3, 2017 at 1:31 PM, Paul Moore  wrote:
>> On Tue, Jan 3, 2017 at 4:21 PM, Kees Cook  wrote:
>>> On Tue, Jan 3, 2017 at 1:13 PM, Paul Moore  wrote:
 On Tue, Jan 3, 2017 at 4:03 PM, Kees Cook  wrote:
> On Tue, Jan 3, 2017 at 12:54 PM, Paul Moore  wrote:
>> On Tue, Jan 3, 2017 at 3:44 PM, Kees Cook  wrote:
>>> I still wonder, though, isn't there a way to use auditctl to get all
>>> the seccomp messages you need?
>>
>> Not all of the seccomp actions are currently logged, that's one of the
>> problems (and the biggest at the moment).
>
> Well... sort of. It all gets passed around, but the logic isn't very
> obvious (or at least I always have to go look it up).

 Last time I checked SECCOMP_RET_ALLOW wasn't logged (as well as at
 least one other action, but I can't remember which off the top of my
 head)?
>>>
>>> Sure, but if you're using audit, you don't need RET_ALLOW to be logged
>>> because you'll get a full syscall log entry. Logging RET_ALLOW is
>>> redundant and provides no new information, it seems to me.
>>
>> I only bring this up as it might be a way to help solve the
>> SECCOMP_RET_AUDIT problem that Tyler mentioned.
> 
> So, I guess I want to understand why something like this doesn't work,
> with no changes at all to the kernel:
> 
> Imaginary "seccomp-audit.c":
> 
> ...
> pid = fork();
> if (pid) {
> char cmd[80];
> 
> sprintf(cmd, "auditctl -a always,exit -S all -F pid=%d", pid);
> system(cmd);
> release...
>  } else {
> wait for release...
> execv(argv[1], argv + 1);
>  }
> ...
> 
> This should dump all syscalls (both RET_ALLOW and RET_ERRNO), as well
> as all seccomp actions of any kind. (Down side is the need for root to
> launch auditctl...)

Hey Kees - Thanks for the suggestion!

Here are some of the reasons that it doesn't quite work:

1) We don't install/run auditd by default and would continue to prefer
not to in some situations where resources are tight.

2) We block a relatively small number of syscalls as compared to what
are allowed so auditing all syscalls is a really heavyweight solution.
That could be fixed with a better -S argument, though.

3) We sometimes only block certain arguments for a given syscall and
auditing all instances of the syscall could still be a heavyweight solution.

4) If the application spawns children processes, that rule doesn't audit
their syscalls. That can be fixed with ppid=%d but then grandchildren
pids are a problem.

5) Cleanup of the audit rule for an old pid, before the pid is reused,
could be difficult.

Tyler

> 
> Perhaps an improvement to this could be enabling audit when seccomp
> syscall is seen? I can't tell if auditctl already has something to do
> this ("start auditing this process and all children when syscall X is
> performed").
> 
> -Kees
> 




signature.asc
Description: OpenPGP digital signature


Re: [PATCH 0/2] Begin auditing SECCOMP_RET_ERRNO return actions

2017-01-03 Thread Tyler Hicks
On 01/04/2017 04:44 AM, Kees Cook wrote:
> On Tue, Jan 3, 2017 at 1:31 PM, Paul Moore  wrote:
>> On Tue, Jan 3, 2017 at 4:21 PM, Kees Cook  wrote:
>>> On Tue, Jan 3, 2017 at 1:13 PM, Paul Moore  wrote:
 On Tue, Jan 3, 2017 at 4:03 PM, Kees Cook  wrote:
> On Tue, Jan 3, 2017 at 12:54 PM, Paul Moore  wrote:
>> On Tue, Jan 3, 2017 at 3:44 PM, Kees Cook  wrote:
>>> I still wonder, though, isn't there a way to use auditctl to get all
>>> the seccomp messages you need?
>>
>> Not all of the seccomp actions are currently logged, that's one of the
>> problems (and the biggest at the moment).
>
> Well... sort of. It all gets passed around, but the logic isn't very
> obvious (or at least I always have to go look it up).

 Last time I checked SECCOMP_RET_ALLOW wasn't logged (as well as at
 least one other action, but I can't remember which off the top of my
 head)?
>>>
>>> Sure, but if you're using audit, you don't need RET_ALLOW to be logged
>>> because you'll get a full syscall log entry. Logging RET_ALLOW is
>>> redundant and provides no new information, it seems to me.
>>
>> I only bring this up as it might be a way to help solve the
>> SECCOMP_RET_AUDIT problem that Tyler mentioned.
> 
> So, I guess I want to understand why something like this doesn't work,
> with no changes at all to the kernel:
> 
> Imaginary "seccomp-audit.c":
> 
> ...
> pid = fork();
> if (pid) {
> char cmd[80];
> 
> sprintf(cmd, "auditctl -a always,exit -S all -F pid=%d", pid);
> system(cmd);
> release...
>  } else {
> wait for release...
> execv(argv[1], argv + 1);
>  }
> ...
> 
> This should dump all syscalls (both RET_ALLOW and RET_ERRNO), as well
> as all seccomp actions of any kind. (Down side is the need for root to
> launch auditctl...)

Hey Kees - Thanks for the suggestion!

Here are some of the reasons that it doesn't quite work:

1) We don't install/run auditd by default and would continue to prefer
not to in some situations where resources are tight.

2) We block a relatively small number of syscalls as compared to what
are allowed so auditing all syscalls is a really heavyweight solution.
That could be fixed with a better -S argument, though.

3) We sometimes only block certain arguments for a given syscall and
auditing all instances of the syscall could still be a heavyweight solution.

4) If the application spawns children processes, that rule doesn't audit
their syscalls. That can be fixed with ppid=%d but then grandchildren
pids are a problem.

5) Cleanup of the audit rule for an old pid, before the pid is reused,
could be difficult.

Tyler

> 
> Perhaps an improvement to this could be enabling audit when seccomp
> syscall is seen? I can't tell if auditctl already has something to do
> this ("start auditing this process and all children when syscall X is
> performed").
> 
> -Kees
> 




signature.asc
Description: OpenPGP digital signature


Re: [PATCH v2 9/9] ARM: sunxi: Convert pinctrl nodes to generic bindings

2017-01-03 Thread André Przywara
On 18/10/16 08:43, Chen-Yu Tsai wrote:

Hi Maxime, Chen-Yu,

I just stumbled over this patch in Maxime's -next tree and think I
missed it before. I guess it's a bit late, but I just wanted to express
my concerns and write out the issues with the current DT approach:

> On Tue, Oct 11, 2016 at 11:46 PM, Maxime Ripard
>  wrote:
>> Now that we can handle the generic pinctrl bindings, convert our DT to it.

Do we really want to do this? This makes the new DTs incompatible to
older kernels, for no good reasons. Or are there any? Does this "lock
muxed pins" functionality (users cannot export GPIOs which are used by a
driver) rely on the new binding?

Anyway ...
I understand that in sunxi world a kernel is always accompanied by a
matching .dtb, but we should really get away from this, as this doesn't
scale and more importantly creates big headaches in arm64 world:

1) Scaling: Providing a board .dtb together with a kernel puts the
burden of board support to each distribution, which means that _every_
distribution needs to ship matching DTs for _every_ board. There are a
lot of Allwinner boards (around 100 these days?), I am not sure every
distribution seriously wants to put every .dtb into some bootloader
accessible directory. Also this kind of defeats the original DT idea,
where the .dtb is provided by firmware and a generic kernel just works
with whatever it gets presented.
Either distributions will not support certain boards because of this or
there is a lot of maintenance churn in distros.
Yes, arm(32) has lived with this for years, but that doesn't mean we
need to keep it this way.

2) Firmware DT: We start to see more and more Allwinner boards with SPI
flash now, which is supposed to hold some firmware bits. This gives the
ultimate opportunity to store the DT in there as well. BUT this relies
on there being _one_ DT (to rule them all). I don't see issues with
older DTs not having full functionality on newer kernels (this would be
a reason for a firmware update), but there should at any given time _one
best_ DT which is a candidate for being burned into the flash.
Otherwise people are stuck with a certain kernel version (range).
As an example in this case: Kernel 4.9 is LTS and will probably be
elected by Debian to be _the_ kernel for the next release and supported
over several years. But it relies on an old style DT (allwinner,pins
properties), newer pinctrl DT nodes will be rejected by the driver.
But for new features (upcoming HDMI support for H3, for instance) to be
usable a newer DT is required, even if an old kernel will ignore those
features. So if the flash provides a new DT, we cannot boot stable
Debian, whereas if the flash provides a legacy DT, we cannot use new
features in newer kernels.
Not changing the DT in a way that breaks forward compatibility can be
done, for instance we just drop this patch here. For new SoCs (like A64
or H5, where there are no backward compatibility issues) we can happily
use the new bindings in the official DTs, but for anything already
supported by older kernels we just stick which what we have. We have to
keep the old support code in anyway.
Yes, providing both forward and backward compatibility is not going to
be a walk in the park, but is inevitable if we want to get those board
out of the toy area. Other platform can cope with this.

3) On EFI setups the generation(!) of the DT is actually done in EFI
firmware (very much like in the original PowerPC OF setup). It is
expected that the generation relies on a static DT scaffold, but
firmware is free to add or remove nodes. Providing different DTs for
each kernel is hard to integrate into this approach.

4) Generic boot images: On the x86 side everybody expects this just to
work: You put in a generic USB stick with one (or two) generic kernels
and it just boots, without this installation having to know each
supported machine beforehand. It even works with machines newer than the
distro. While the last is hard work, it can be done, but only if the DT
is provided by firmware, which requires there to be one best DT for each
board.

5) arm64: Distributions providing support for arm64 machines generally
do not seem to provide DTs, because most platform do not require them
to. It is not clear that every distribution will do so for the sake or
supporting Allwinner boards.

6) Other OSes (or U-Boot) are affected by this as well. Eventually this
will lead them to create their own DTs (and their own bindings?), which
makes this situation really a mess. We can hardly request them to stick
with the Linux bindings if we break them regularly.

7) Dynamic DT alteration (by firmware/U-Boot and using overlays, for
instance), will be much easier if the DT is originally stored in the
firmware and snippets are stored there as well. This is for instance a
use case for the (optional) WiFi module on the Pine64.

8) Those patches also create a lot of potential merge conflicts, since
they touch almost every 

Re: [PATCH 2/2] PCI: lock each enable/disable num_vfs operation in sysfs

2017-01-03 Thread Gavin Shan
On Tue, Jan 03, 2017 at 04:48:31PM -0800, Emil Tantilov wrote:
>Enabling/disabling SRIOV via sysfs by echo-ing multiple values
>simultaneously:
>
>echo 63 > /sys/class/net/ethX/device/sriov_numvfs&
>echo 63 > /sys/class/net/ethX/device/sriov_numvfs
>
>sleep 5
>
>echo 0 > /sys/class/net/ethX/device/sriov_numvfs&
>echo 0 > /sys/class/net/ethX/device/sriov_numvfs
>
>Results in the following bug:
>
>kernel BUG at drivers/pci/iov.c:495!
>invalid opcode:  [#1] SMP
>CPU: 1 PID: 8050 Comm: bash Tainted: G   W   4.9.0-rc7-net-next #2092
>RIP: 0010:[]
> [] pci_iov_release+0x57/0x60
>
>Call Trace:
> [] pci_release_dev+0x26/0x70
> [] device_release+0x3e/0xb0
> [] kobject_cleanup+0x67/0x180
> [] kobject_put+0x2d/0x60
> [] put_device+0x17/0x20
> [] pci_dev_put+0x1a/0x20
> [] pci_get_dev_by_id+0x5b/0x90
> [] pci_get_subsys+0x35/0x40
> [] pci_get_device+0x18/0x20
> [] pci_get_domain_bus_and_slot+0x2b/0x60
> [] pci_iov_remove_virtfn+0x57/0x180
> [] pci_disable_sriov+0x65/0x140
> [] ixgbe_disable_sriov+0xc7/0x1d0 [ixgbe]
> [] ixgbe_pci_sriov_configure+0x3d/0x170 [ixgbe]
> [] sriov_numvfs_store+0xdc/0x130
>...
>RIP  [] pci_iov_release+0x57/0x60
>
>Use the existing mutex lock to protect each enable/disable operation.
>
>CC: Alexander Duyck 
>Signed-off-by: Emil Tantilov 

Emil, It's going to change semantics of pci_enable_sriov() and 
pci_disable_sriov().
They can be invoked when writing to the sysfs entry, or loading PF's driver. 
With
the change applied, the lock (pf->sriov->lock) isn't acquired and released in 
the
PF's driver loading path.

I think the reasonable way would be adding a flag in "struct sriov", to indicate
someone is accessing the IOV capability through sysfs file. With this, the code
returns with "-EBUSY" immediately for contenders. With it, nothing is going to
be changed in PF's driver loading path.

Also, there are some minor comments as below and I guess most of them won't be
applied if you take my suggestion eventually. However, I'm trying to make the
comments complete.

>---
> drivers/pci/pci-sysfs.c |   24 +---
> 1 file changed, 17 insertions(+), 7 deletions(-)
>
>diff --git a/drivers/pci/pci-sysfs.c b/drivers/pci/pci-sysfs.c
>index 0666287..5b54cf5 100644
>--- a/drivers/pci/pci-sysfs.c
>+++ b/drivers/pci/pci-sysfs.c
>@@ -472,7 +472,9 @@ static ssize_t sriov_numvfs_store(struct device *dev,
> const char *buf, size_t count)
> {
>   struct pci_dev *pdev = to_pci_dev(dev);
>+  struct pci_sriov *iov = pdev->sriov;
>   int ret;
>+

Unnecessary change.

>   u16 num_vfs;
>
>   ret = kstrtou16(buf, 0, _vfs);
>@@ -482,38 +484,46 @@ static ssize_t sriov_numvfs_store(struct device *dev,
>   if (num_vfs > pci_sriov_get_totalvfs(pdev))
>   return -ERANGE;
>
>+  mutex_lock(>dev->sriov->lock);
>+
>   if (num_vfs == pdev->sriov->num_VFs)
>-  return count;   /* no change */
>+  goto exit;
>
>   /* is PF driver loaded w/callback */
>   if (!pdev->driver || !pdev->driver->sriov_configure) {
>   dev_info(>dev, "Driver doesn't support SRIOV 
> configuration via sysfs\n");
>-  return -ENOSYS;
>+  ret = -EINVAL;
>+  goto exit;

Why we need change the error code here?

>   }
>
>   if (num_vfs == 0) {
>   /* disable VFs */
>   ret = pdev->driver->sriov_configure(pdev, 0);
>-  if (ret < 0)
>-  return ret;
>-  return count;
>+  goto exit;
>   }
>
>   /* enable VFs */
>   if (pdev->sriov->num_VFs) {
>   dev_warn(>dev, "%d VFs already enabled. Disable before 
> enabling %d VFs\n",
>pdev->sriov->num_VFs, num_vfs);
>-  return -EBUSY;
>+  ret = -EBUSY;
>+  goto exit;
>   }
>
>   ret = pdev->driver->sriov_configure(pdev, num_vfs);
>   if (ret < 0)
>-  return ret;
>+  goto exit;
>
>   if (ret != num_vfs)
>   dev_warn(>dev, "%d VFs requested; only %d enabled\n",
>num_vfs, ret);
>
>+exit:
>+  mutex_unlock(>dev->sriov->lock);
>+
>+  if (ret < 0)
>+  return ret;
>+
>   return count;

The code might be clearer if @ret is returned here. In that case, We need
set it properly in error paths.

> }
>

Thanks,
Gavin



Re: [PATCH v2 9/9] ARM: sunxi: Convert pinctrl nodes to generic bindings

2017-01-03 Thread André Przywara
On 18/10/16 08:43, Chen-Yu Tsai wrote:

Hi Maxime, Chen-Yu,

I just stumbled over this patch in Maxime's -next tree and think I
missed it before. I guess it's a bit late, but I just wanted to express
my concerns and write out the issues with the current DT approach:

> On Tue, Oct 11, 2016 at 11:46 PM, Maxime Ripard
>  wrote:
>> Now that we can handle the generic pinctrl bindings, convert our DT to it.

Do we really want to do this? This makes the new DTs incompatible to
older kernels, for no good reasons. Or are there any? Does this "lock
muxed pins" functionality (users cannot export GPIOs which are used by a
driver) rely on the new binding?

Anyway ...
I understand that in sunxi world a kernel is always accompanied by a
matching .dtb, but we should really get away from this, as this doesn't
scale and more importantly creates big headaches in arm64 world:

1) Scaling: Providing a board .dtb together with a kernel puts the
burden of board support to each distribution, which means that _every_
distribution needs to ship matching DTs for _every_ board. There are a
lot of Allwinner boards (around 100 these days?), I am not sure every
distribution seriously wants to put every .dtb into some bootloader
accessible directory. Also this kind of defeats the original DT idea,
where the .dtb is provided by firmware and a generic kernel just works
with whatever it gets presented.
Either distributions will not support certain boards because of this or
there is a lot of maintenance churn in distros.
Yes, arm(32) has lived with this for years, but that doesn't mean we
need to keep it this way.

2) Firmware DT: We start to see more and more Allwinner boards with SPI
flash now, which is supposed to hold some firmware bits. This gives the
ultimate opportunity to store the DT in there as well. BUT this relies
on there being _one_ DT (to rule them all). I don't see issues with
older DTs not having full functionality on newer kernels (this would be
a reason for a firmware update), but there should at any given time _one
best_ DT which is a candidate for being burned into the flash.
Otherwise people are stuck with a certain kernel version (range).
As an example in this case: Kernel 4.9 is LTS and will probably be
elected by Debian to be _the_ kernel for the next release and supported
over several years. But it relies on an old style DT (allwinner,pins
properties), newer pinctrl DT nodes will be rejected by the driver.
But for new features (upcoming HDMI support for H3, for instance) to be
usable a newer DT is required, even if an old kernel will ignore those
features. So if the flash provides a new DT, we cannot boot stable
Debian, whereas if the flash provides a legacy DT, we cannot use new
features in newer kernels.
Not changing the DT in a way that breaks forward compatibility can be
done, for instance we just drop this patch here. For new SoCs (like A64
or H5, where there are no backward compatibility issues) we can happily
use the new bindings in the official DTs, but for anything already
supported by older kernels we just stick which what we have. We have to
keep the old support code in anyway.
Yes, providing both forward and backward compatibility is not going to
be a walk in the park, but is inevitable if we want to get those board
out of the toy area. Other platform can cope with this.

3) On EFI setups the generation(!) of the DT is actually done in EFI
firmware (very much like in the original PowerPC OF setup). It is
expected that the generation relies on a static DT scaffold, but
firmware is free to add or remove nodes. Providing different DTs for
each kernel is hard to integrate into this approach.

4) Generic boot images: On the x86 side everybody expects this just to
work: You put in a generic USB stick with one (or two) generic kernels
and it just boots, without this installation having to know each
supported machine beforehand. It even works with machines newer than the
distro. While the last is hard work, it can be done, but only if the DT
is provided by firmware, which requires there to be one best DT for each
board.

5) arm64: Distributions providing support for arm64 machines generally
do not seem to provide DTs, because most platform do not require them
to. It is not clear that every distribution will do so for the sake or
supporting Allwinner boards.

6) Other OSes (or U-Boot) are affected by this as well. Eventually this
will lead them to create their own DTs (and their own bindings?), which
makes this situation really a mess. We can hardly request them to stick
with the Linux bindings if we break them regularly.

7) Dynamic DT alteration (by firmware/U-Boot and using overlays, for
instance), will be much easier if the DT is originally stored in the
firmware and snippets are stored there as well. This is for instance a
use case for the (optional) WiFi module on the Pine64.

8) Those patches also create a lot of potential merge conflicts, since
they touch almost every sunxi DT file. They are trivial to 

Re: [PATCH 2/2] PCI: lock each enable/disable num_vfs operation in sysfs

2017-01-03 Thread Gavin Shan
On Tue, Jan 03, 2017 at 04:48:31PM -0800, Emil Tantilov wrote:
>Enabling/disabling SRIOV via sysfs by echo-ing multiple values
>simultaneously:
>
>echo 63 > /sys/class/net/ethX/device/sriov_numvfs&
>echo 63 > /sys/class/net/ethX/device/sriov_numvfs
>
>sleep 5
>
>echo 0 > /sys/class/net/ethX/device/sriov_numvfs&
>echo 0 > /sys/class/net/ethX/device/sriov_numvfs
>
>Results in the following bug:
>
>kernel BUG at drivers/pci/iov.c:495!
>invalid opcode:  [#1] SMP
>CPU: 1 PID: 8050 Comm: bash Tainted: G   W   4.9.0-rc7-net-next #2092
>RIP: 0010:[]
> [] pci_iov_release+0x57/0x60
>
>Call Trace:
> [] pci_release_dev+0x26/0x70
> [] device_release+0x3e/0xb0
> [] kobject_cleanup+0x67/0x180
> [] kobject_put+0x2d/0x60
> [] put_device+0x17/0x20
> [] pci_dev_put+0x1a/0x20
> [] pci_get_dev_by_id+0x5b/0x90
> [] pci_get_subsys+0x35/0x40
> [] pci_get_device+0x18/0x20
> [] pci_get_domain_bus_and_slot+0x2b/0x60
> [] pci_iov_remove_virtfn+0x57/0x180
> [] pci_disable_sriov+0x65/0x140
> [] ixgbe_disable_sriov+0xc7/0x1d0 [ixgbe]
> [] ixgbe_pci_sriov_configure+0x3d/0x170 [ixgbe]
> [] sriov_numvfs_store+0xdc/0x130
>...
>RIP  [] pci_iov_release+0x57/0x60
>
>Use the existing mutex lock to protect each enable/disable operation.
>
>CC: Alexander Duyck 
>Signed-off-by: Emil Tantilov 

Emil, It's going to change semantics of pci_enable_sriov() and 
pci_disable_sriov().
They can be invoked when writing to the sysfs entry, or loading PF's driver. 
With
the change applied, the lock (pf->sriov->lock) isn't acquired and released in 
the
PF's driver loading path.

I think the reasonable way would be adding a flag in "struct sriov", to indicate
someone is accessing the IOV capability through sysfs file. With this, the code
returns with "-EBUSY" immediately for contenders. With it, nothing is going to
be changed in PF's driver loading path.

Also, there are some minor comments as below and I guess most of them won't be
applied if you take my suggestion eventually. However, I'm trying to make the
comments complete.

>---
> drivers/pci/pci-sysfs.c |   24 +---
> 1 file changed, 17 insertions(+), 7 deletions(-)
>
>diff --git a/drivers/pci/pci-sysfs.c b/drivers/pci/pci-sysfs.c
>index 0666287..5b54cf5 100644
>--- a/drivers/pci/pci-sysfs.c
>+++ b/drivers/pci/pci-sysfs.c
>@@ -472,7 +472,9 @@ static ssize_t sriov_numvfs_store(struct device *dev,
> const char *buf, size_t count)
> {
>   struct pci_dev *pdev = to_pci_dev(dev);
>+  struct pci_sriov *iov = pdev->sriov;
>   int ret;
>+

Unnecessary change.

>   u16 num_vfs;
>
>   ret = kstrtou16(buf, 0, _vfs);
>@@ -482,38 +484,46 @@ static ssize_t sriov_numvfs_store(struct device *dev,
>   if (num_vfs > pci_sriov_get_totalvfs(pdev))
>   return -ERANGE;
>
>+  mutex_lock(>dev->sriov->lock);
>+
>   if (num_vfs == pdev->sriov->num_VFs)
>-  return count;   /* no change */
>+  goto exit;
>
>   /* is PF driver loaded w/callback */
>   if (!pdev->driver || !pdev->driver->sriov_configure) {
>   dev_info(>dev, "Driver doesn't support SRIOV 
> configuration via sysfs\n");
>-  return -ENOSYS;
>+  ret = -EINVAL;
>+  goto exit;

Why we need change the error code here?

>   }
>
>   if (num_vfs == 0) {
>   /* disable VFs */
>   ret = pdev->driver->sriov_configure(pdev, 0);
>-  if (ret < 0)
>-  return ret;
>-  return count;
>+  goto exit;
>   }
>
>   /* enable VFs */
>   if (pdev->sriov->num_VFs) {
>   dev_warn(>dev, "%d VFs already enabled. Disable before 
> enabling %d VFs\n",
>pdev->sriov->num_VFs, num_vfs);
>-  return -EBUSY;
>+  ret = -EBUSY;
>+  goto exit;
>   }
>
>   ret = pdev->driver->sriov_configure(pdev, num_vfs);
>   if (ret < 0)
>-  return ret;
>+  goto exit;
>
>   if (ret != num_vfs)
>   dev_warn(>dev, "%d VFs requested; only %d enabled\n",
>num_vfs, ret);
>
>+exit:
>+  mutex_unlock(>dev->sriov->lock);
>+
>+  if (ret < 0)
>+  return ret;
>+
>   return count;

The code might be clearer if @ret is returned here. In that case, We need
set it properly in error paths.

> }
>

Thanks,
Gavin



Re: [RFC] memcpy_nocache() and memcpy_writethrough()

2017-01-03 Thread Dan Williams
On Tue, Jan 3, 2017 at 5:59 PM, Al Viro  wrote:
> On Tue, Jan 03, 2017 at 05:38:54PM -0800, Dan Williams wrote:
>> > 1) memcpy_to_pmem() seems to rely upon the __copy_from_user_nocache()
>> > having only used movnt; it does not attempt clwb at all.
>>
>> Yes, and there was a fix a while back to make sure it always used
>> movnt so clwb after the fact is not required:
>>
>> a82eee742452 x86/uaccess/64: Handle the caching of 4-byte nocache
>> copies properly in __copy_user_nocache()
>>
>> > 2) __copy_from_user_nocache() for short copies does not use movnt at all.
>> > In that case neither sfence nor clwb is issued.
>>
>> For the 32bit case, yes, but the pmem driver should warn about this
>> when it checks platform persistent memory capabilities (i.e. x86 32bit
>> not supported). Ugh, we may have lost that warning for this specific
>> case recently, I'll go double check and fix it up.
>>
>> > 3) it uses movnt only for part of copying in case of misaligned copy;
>> > No clwb is issued, but sfence *is* - at the very end in 64bit case,
>> > between movnt and copying the tail - in 32bit one.  Incidentally,
>> > while 64bit case takes care to align the destination for movnt part,
>> > 32bit one does not.
>> >
>> > How much of the above is broken and what do the callers rely upon?
>>
>> 32bit issues are known, but 64bit path is ok since that fix above.
>
> Bollocks.  That fix above does *NOT* eliminate all cached stores.  Just look
> at the damn function - it still does cached stores for until the target is
> aligned and it does the same for tail when end of destination is not aligned.
> Right there in arch/x86/lib/copy_user_64.S.

No, it does not eliminate all cache stores, but the cases where we use
it have naturally aligned targets.

Yes, it is terrible to then call wrap it in a memcpy_to_pmem() wrapper
which does not document these alignment constraints.


Re: [RFC] memcpy_nocache() and memcpy_writethrough()

2017-01-03 Thread Dan Williams
On Tue, Jan 3, 2017 at 5:59 PM, Al Viro  wrote:
> On Tue, Jan 03, 2017 at 05:38:54PM -0800, Dan Williams wrote:
>> > 1) memcpy_to_pmem() seems to rely upon the __copy_from_user_nocache()
>> > having only used movnt; it does not attempt clwb at all.
>>
>> Yes, and there was a fix a while back to make sure it always used
>> movnt so clwb after the fact is not required:
>>
>> a82eee742452 x86/uaccess/64: Handle the caching of 4-byte nocache
>> copies properly in __copy_user_nocache()
>>
>> > 2) __copy_from_user_nocache() for short copies does not use movnt at all.
>> > In that case neither sfence nor clwb is issued.
>>
>> For the 32bit case, yes, but the pmem driver should warn about this
>> when it checks platform persistent memory capabilities (i.e. x86 32bit
>> not supported). Ugh, we may have lost that warning for this specific
>> case recently, I'll go double check and fix it up.
>>
>> > 3) it uses movnt only for part of copying in case of misaligned copy;
>> > No clwb is issued, but sfence *is* - at the very end in 64bit case,
>> > between movnt and copying the tail - in 32bit one.  Incidentally,
>> > while 64bit case takes care to align the destination for movnt part,
>> > 32bit one does not.
>> >
>> > How much of the above is broken and what do the callers rely upon?
>>
>> 32bit issues are known, but 64bit path is ok since that fix above.
>
> Bollocks.  That fix above does *NOT* eliminate all cached stores.  Just look
> at the damn function - it still does cached stores for until the target is
> aligned and it does the same for tail when end of destination is not aligned.
> Right there in arch/x86/lib/copy_user_64.S.

No, it does not eliminate all cache stores, but the cases where we use
it have naturally aligned targets.

Yes, it is terrible to then call wrap it in a memcpy_to_pmem() wrapper
which does not document these alignment constraints.


DEBUG_LOCKS_WARN_ON(1) / lockdep.c:3134 lockdep_init_map+0x1e8/0x1f0

2017-01-03 Thread Christian Kujau
Hi,

booting v4.10-rc2 on this PowerPC G4 machine prints the following early 
on, but then continues to boot and the machine is running fine so far:


BUG: key ef0ba7d0 not in .data!
DEBUG_LOCKS_WARN_ON(1)
[ cut here ]
WARNING: CPU: 0 PID: 1 at 
/usr/local/src/linux-git/kernel/locking/lockdep.c:3134 
lockdep_init_map+0x1e8/0x1f0
Modules linked in:
CPU: 0 PID: 1 Comm: swapper Not tainted 4.10.0-rc2 #4
task: ef04aa60 task.stack: ef042000
NIP: c005eb78 LR: c005eb78 CTR: 
REGS: ef043d70 TRAP: 0700   Not tainted  (4.10.0-rc2)
MSR: 02029032 
  CR: 4822  XER: 2000
GPR00: c005eb78 ef043e20 ef04aa60 0016 0001 c0068b24  0001 
GPR08:   4ead 00d6 2824  c00047f0  
GPR16:   c08b9280 effedfa0 c078d6ac c107 c078eddc c078ee14 
GPR24: c078ed24 c078ee24 0002 ef085a00  c08b ef0ba7d0 ef0ba7b4 
NIP [c005eb78] lockdep_init_map+0x1e8/0x1f0
LR [c005eb78] lockdep_init_map+0x1e8/0x1f0
Call Trace:
[ef043e20] [c005eb78] lockdep_init_map+0x1e8/0x1f0 (unreliable)
[ef043e40] [c083adb4] kw_i2c_add+0xc0/0x134
[ef043e60] [c083b29c] pmac_i2c_init+0x3b8/0x518
[ef043ea0] [c00040c0] do_one_initcall+0x40/0x174
[ef043f00] [c0834064] kernel_init_freeable+0x134/0x1cc
[ef043f30] [c0004808] kernel_init+0x18/0x110
[ef043f40] [c0010ad8] ret_from_kernel_thread+0x5c/0x64
Instruction dump:
4837259d 2f83 41befec0 3d20c08b 812953a0 2f89 409efeb0 3c60c079 
3c80c07b 3884b48c 3863f9e4 4860d52d <0fe0> 4bfffe94 9421ff70 7c0802a6 
---[ end trace 8a79d8041d87d000 ]---


Full dmesg and .config: http://nerdbynature.de/bits/4.10-rc2/


Thanks for listening,
Christian.
-- 
BOFH excuse #409:

The vulcan-death-grip ping has been applied.


DEBUG_LOCKS_WARN_ON(1) / lockdep.c:3134 lockdep_init_map+0x1e8/0x1f0

2017-01-03 Thread Christian Kujau
Hi,

booting v4.10-rc2 on this PowerPC G4 machine prints the following early 
on, but then continues to boot and the machine is running fine so far:


BUG: key ef0ba7d0 not in .data!
DEBUG_LOCKS_WARN_ON(1)
[ cut here ]
WARNING: CPU: 0 PID: 1 at 
/usr/local/src/linux-git/kernel/locking/lockdep.c:3134 
lockdep_init_map+0x1e8/0x1f0
Modules linked in:
CPU: 0 PID: 1 Comm: swapper Not tainted 4.10.0-rc2 #4
task: ef04aa60 task.stack: ef042000
NIP: c005eb78 LR: c005eb78 CTR: 
REGS: ef043d70 TRAP: 0700   Not tainted  (4.10.0-rc2)
MSR: 02029032 
  CR: 4822  XER: 2000
GPR00: c005eb78 ef043e20 ef04aa60 0016 0001 c0068b24  0001 
GPR08:   4ead 00d6 2824  c00047f0  
GPR16:   c08b9280 effedfa0 c078d6ac c107 c078eddc c078ee14 
GPR24: c078ed24 c078ee24 0002 ef085a00  c08b ef0ba7d0 ef0ba7b4 
NIP [c005eb78] lockdep_init_map+0x1e8/0x1f0
LR [c005eb78] lockdep_init_map+0x1e8/0x1f0
Call Trace:
[ef043e20] [c005eb78] lockdep_init_map+0x1e8/0x1f0 (unreliable)
[ef043e40] [c083adb4] kw_i2c_add+0xc0/0x134
[ef043e60] [c083b29c] pmac_i2c_init+0x3b8/0x518
[ef043ea0] [c00040c0] do_one_initcall+0x40/0x174
[ef043f00] [c0834064] kernel_init_freeable+0x134/0x1cc
[ef043f30] [c0004808] kernel_init+0x18/0x110
[ef043f40] [c0010ad8] ret_from_kernel_thread+0x5c/0x64
Instruction dump:
4837259d 2f83 41befec0 3d20c08b 812953a0 2f89 409efeb0 3c60c079 
3c80c07b 3884b48c 3863f9e4 4860d52d <0fe0> 4bfffe94 9421ff70 7c0802a6 
---[ end trace 8a79d8041d87d000 ]---


Full dmesg and .config: http://nerdbynature.de/bits/4.10-rc2/


Thanks for listening,
Christian.
-- 
BOFH excuse #409:

The vulcan-death-grip ping has been applied.


Re: [PATCH 0/2] Begin auditing SECCOMP_RET_ERRNO return actions

2017-01-03 Thread Tyler Hicks
On 01/04/2017 02:42 AM, Paul Moore wrote:
> On Tue, Jan 3, 2017 at 8:31 AM, Tyler Hicks  wrote:
>> On 01/02/2017 04:47 PM, Paul Moore wrote:
>>> On Mon, Jan 2, 2017 at 11:53 AM, Tyler Hicks  wrote:
 This patch set creates the basis for auditing information specific to a 
 given
 seccomp return action and then starts auditing SECCOMP_RET_ERRNO return
 actions. The audit messages for SECCOMP_RET_ERRNO return actions include 
 the
 errno value that will be returned to userspace.
>>>
>>> I'm replying to this patchset posting because it his my inbox first,
>>> but my comments here apply to both this patchset and the other
>>> seccomp/audit patchset you posted.
>>>
>>> In my experience, we have two or three problems (the count varies
>>> depending on perspective) when it comes to seccomp filter reporting:
>>>
>>> 1. Inability to log all filter actions.
>>> 2. Inability to selectively enable filtering; e.g. devs want noisy
>>> logging, users want relative quiet.
>>> 3. Consistent behavior with audit enabled and disabled.
>>
>> Agreed. Those three logging issues are what have been nagging me the most.
> 
> /me nods
> 
>>> My current thinking - forgive me, this has been kicking around in my
>>> head for the better part of six months (longer?) and I haven't
>>> attempted to code it up - is to create a sysctl knob for a system wide
>>> seccomp logging threshold that would be applied to the high 16-bits of
>>> *every* triggered action: if the action was at/below the threshold a
>>> record would be emitted, otherwise silence.  This should resolve
>>> problems #1 and #2, and the code should be relatively straightforward
>>> and small.
>>
>> I like that idea quite a bit. To be completely honest, for #1, I
>> personally only care about logging SECCOMP_RET_ERRNO actions but this
>> idea solves it in a nice and general way.
> 
> Yeah, I'd much rather solve this problem generally; everybody has
> their favorite action and I'd like to avoid solving the same problem
> multiple times.
> 
> Sooo ... you want to take a whack at coding this up? ;)

Yes, I can do that.

> 
>>> As part of the code above, I expect that all seccomp logging would get
>>> routed through a single logging function (sort of like a better
>>> implementation of the existing audit_seccomp()) that would check the
>>> threshold and trigger the logging if needed.  This function could be
>>> augmented to check for CONFIG_AUDIT and in the case where audit was
>>> not built into the kernel, a simple printk could be used to log the
>>> seccomp event; solving problem #3.
>>
>> That doesn't fully solve #3 for me. In Ubuntu (and I think Debian), we
>> build with CONFIG_AUDIT enabled but don't ship auditd by default so
>> audit_enabled is false. In that default configuration, we still want
>> seccomp audit messages to be printk'ed. I'll need to figure out how to
>> cleanly allow opting into seccomp audit messages when CONFIG_AUDIT is
>> enabled and audit_enabled is false.
> 
> Heh, so you've got audit built into the kernel but you're not using
> it; that sounds "fun".

Users that need full auditing functionality can simply install auditd
but most users don't require it. Ubuntu has done it this way for many
years and the lack of seccomp auditing when auditd isn't running has
been the only problem that I can remember.

> Anyway, I think the logging consolidation could still help you, if for
> no other reason than everything is going through the same function at
> that point.  We could do some other stuff there to handle the case
> where audit is compiled, but auditd is not running ... we already have
> some code in place to handle that for other reasons, check
> kernel/audit.c for more information.  I'd still work on the other
> stuff first and then we can add this in at the end of the patchset.

/me nods. It is easy to continue to distro patch out the recently added
check for audit_enabled until a better solution for all distros can be
identified/upstreamed.

Thanks again!

Tyler

> 
>>> We could also add a SECCOMP_RET_AUDIT, or similar, if we still feel
>>> that is important (I personally waffle on this), but I think that is
>>> independent of the ideas above.
>>
>> I agree that it is independent but SECCOMP_RET_AUDIT would still be
>> important to Ubuntu.
>>
>> Tyler
> 




signature.asc
Description: OpenPGP digital signature


Re: [PATCH 0/2] Begin auditing SECCOMP_RET_ERRNO return actions

2017-01-03 Thread Tyler Hicks
On 01/04/2017 02:42 AM, Paul Moore wrote:
> On Tue, Jan 3, 2017 at 8:31 AM, Tyler Hicks  wrote:
>> On 01/02/2017 04:47 PM, Paul Moore wrote:
>>> On Mon, Jan 2, 2017 at 11:53 AM, Tyler Hicks  wrote:
 This patch set creates the basis for auditing information specific to a 
 given
 seccomp return action and then starts auditing SECCOMP_RET_ERRNO return
 actions. The audit messages for SECCOMP_RET_ERRNO return actions include 
 the
 errno value that will be returned to userspace.
>>>
>>> I'm replying to this patchset posting because it his my inbox first,
>>> but my comments here apply to both this patchset and the other
>>> seccomp/audit patchset you posted.
>>>
>>> In my experience, we have two or three problems (the count varies
>>> depending on perspective) when it comes to seccomp filter reporting:
>>>
>>> 1. Inability to log all filter actions.
>>> 2. Inability to selectively enable filtering; e.g. devs want noisy
>>> logging, users want relative quiet.
>>> 3. Consistent behavior with audit enabled and disabled.
>>
>> Agreed. Those three logging issues are what have been nagging me the most.
> 
> /me nods
> 
>>> My current thinking - forgive me, this has been kicking around in my
>>> head for the better part of six months (longer?) and I haven't
>>> attempted to code it up - is to create a sysctl knob for a system wide
>>> seccomp logging threshold that would be applied to the high 16-bits of
>>> *every* triggered action: if the action was at/below the threshold a
>>> record would be emitted, otherwise silence.  This should resolve
>>> problems #1 and #2, and the code should be relatively straightforward
>>> and small.
>>
>> I like that idea quite a bit. To be completely honest, for #1, I
>> personally only care about logging SECCOMP_RET_ERRNO actions but this
>> idea solves it in a nice and general way.
> 
> Yeah, I'd much rather solve this problem generally; everybody has
> their favorite action and I'd like to avoid solving the same problem
> multiple times.
> 
> Sooo ... you want to take a whack at coding this up? ;)

Yes, I can do that.

> 
>>> As part of the code above, I expect that all seccomp logging would get
>>> routed through a single logging function (sort of like a better
>>> implementation of the existing audit_seccomp()) that would check the
>>> threshold and trigger the logging if needed.  This function could be
>>> augmented to check for CONFIG_AUDIT and in the case where audit was
>>> not built into the kernel, a simple printk could be used to log the
>>> seccomp event; solving problem #3.
>>
>> That doesn't fully solve #3 for me. In Ubuntu (and I think Debian), we
>> build with CONFIG_AUDIT enabled but don't ship auditd by default so
>> audit_enabled is false. In that default configuration, we still want
>> seccomp audit messages to be printk'ed. I'll need to figure out how to
>> cleanly allow opting into seccomp audit messages when CONFIG_AUDIT is
>> enabled and audit_enabled is false.
> 
> Heh, so you've got audit built into the kernel but you're not using
> it; that sounds "fun".

Users that need full auditing functionality can simply install auditd
but most users don't require it. Ubuntu has done it this way for many
years and the lack of seccomp auditing when auditd isn't running has
been the only problem that I can remember.

> Anyway, I think the logging consolidation could still help you, if for
> no other reason than everything is going through the same function at
> that point.  We could do some other stuff there to handle the case
> where audit is compiled, but auditd is not running ... we already have
> some code in place to handle that for other reasons, check
> kernel/audit.c for more information.  I'd still work on the other
> stuff first and then we can add this in at the end of the patchset.

/me nods. It is easy to continue to distro patch out the recently added
check for audit_enabled until a better solution for all distros can be
identified/upstreamed.

Thanks again!

Tyler

> 
>>> We could also add a SECCOMP_RET_AUDIT, or similar, if we still feel
>>> that is important (I personally waffle on this), but I think that is
>>> independent of the ideas above.
>>
>> I agree that it is independent but SECCOMP_RET_AUDIT would still be
>> important to Ubuntu.
>>
>> Tyler
> 




signature.asc
Description: OpenPGP digital signature


Re: [PATCH] f2fs: fix small discards when se->valid_blocks is zero

2017-01-03 Thread Jaegeuk Kim
Hi Yunlong,

On 01/03, Yunlong Song wrote:
> In the small discard case, when se->valid_blocks is zero, the 
> add_discard_addrs
> will directly return without __add_discard_entry. This will cause the file
> delete have no small discard. The case is like this:
> 
> 1. Allocate free 2M segment
> 2. Write a file (size n blocks < 512) in that 2M segment, se->valid_blocks = n
> 3. Delete that file, se->valid_blocks = 0, add_discard_addrs will return 
> without
> sending any discard of that file, and forever due to cur_map[i] ^ ckpt_map[i] 
> =
> 0 after that checkpoint

During this checkpoint, that'll be discarded as a prefree segment, no?
Note that, if this is a current segment, f2fs won't discard it until it is
fully allocated.

Thanks,

> 
> Signed-off-by: Yunlong Song 
> ---
>  fs/f2fs/segment.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
> index 0738f48..8610f14 100644
> --- a/fs/f2fs/segment.c
> +++ b/fs/f2fs/segment.c
> @@ -838,7 +838,7 @@ static void add_discard_addrs(struct f2fs_sb_info *sbi, 
> struct cp_control *cpc)
>   return;
>  
>   if (!force) {
> - if (!test_opt(sbi, DISCARD) || !se->valid_blocks ||
> + if (!test_opt(sbi, DISCARD) ||
>   SM_I(sbi)->nr_discards >= SM_I(sbi)->max_discards)
>   return;
>   }
> -- 
> 1.8.5.2


Re: [PATCH] f2fs: fix small discards when se->valid_blocks is zero

2017-01-03 Thread Jaegeuk Kim
Hi Yunlong,

On 01/03, Yunlong Song wrote:
> In the small discard case, when se->valid_blocks is zero, the 
> add_discard_addrs
> will directly return without __add_discard_entry. This will cause the file
> delete have no small discard. The case is like this:
> 
> 1. Allocate free 2M segment
> 2. Write a file (size n blocks < 512) in that 2M segment, se->valid_blocks = n
> 3. Delete that file, se->valid_blocks = 0, add_discard_addrs will return 
> without
> sending any discard of that file, and forever due to cur_map[i] ^ ckpt_map[i] 
> =
> 0 after that checkpoint

During this checkpoint, that'll be discarded as a prefree segment, no?
Note that, if this is a current segment, f2fs won't discard it until it is
fully allocated.

Thanks,

> 
> Signed-off-by: Yunlong Song 
> ---
>  fs/f2fs/segment.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
> index 0738f48..8610f14 100644
> --- a/fs/f2fs/segment.c
> +++ b/fs/f2fs/segment.c
> @@ -838,7 +838,7 @@ static void add_discard_addrs(struct f2fs_sb_info *sbi, 
> struct cp_control *cpc)
>   return;
>  
>   if (!force) {
> - if (!test_opt(sbi, DISCARD) || !se->valid_blocks ||
> + if (!test_opt(sbi, DISCARD) ||
>   SM_I(sbi)->nr_discards >= SM_I(sbi)->max_discards)
>   return;
>   }
> -- 
> 1.8.5.2


[PATCH] staging:iio:addac Fixes Alignment should match open parenthesis

2017-01-03 Thread Scott Matheina
Fixes style issue where Alignment doesn't match open parenthesis

Signed-off-by: Scott Matheina 
---
 drivers/staging/iio/addac/adt7316-i2c.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/staging/iio/addac/adt7316-i2c.c 
b/drivers/staging/iio/addac/adt7316-i2c.c
index 0ccf192..f66dd3e 100644
--- a/drivers/staging/iio/addac/adt7316-i2c.c
+++ b/drivers/staging/iio/addac/adt7316-i2c.c
@@ -93,7 +93,7 @@ static int adt7316_i2c_multi_write(void *client, u8 reg, u8 
count, u8 *data)
  */
 
 static int adt7316_i2c_probe(struct i2c_client *client,
-   const struct i2c_device_id *id)
+const struct i2c_device_id *id)
 {
struct adt7316_bus bus = {
.client = client,
-- 
2.7.4



[PATCH] staging:iio:addac Fixes Alignment should match open parenthesis

2017-01-03 Thread Scott Matheina
Fixes style issue where Alignment doesn't match open parenthesis

Signed-off-by: Scott Matheina 
---
 drivers/staging/iio/addac/adt7316-i2c.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/staging/iio/addac/adt7316-i2c.c 
b/drivers/staging/iio/addac/adt7316-i2c.c
index 0ccf192..f66dd3e 100644
--- a/drivers/staging/iio/addac/adt7316-i2c.c
+++ b/drivers/staging/iio/addac/adt7316-i2c.c
@@ -93,7 +93,7 @@ static int adt7316_i2c_multi_write(void *client, u8 reg, u8 
count, u8 *data)
  */
 
 static int adt7316_i2c_probe(struct i2c_client *client,
-   const struct i2c_device_id *id)
+const struct i2c_device_id *id)
 {
struct adt7316_bus bus = {
.client = client,
-- 
2.7.4



Re: [RFC] memcpy_nocache() and memcpy_writethrough()

2017-01-03 Thread Al Viro
On Tue, Jan 03, 2017 at 05:38:54PM -0800, Dan Williams wrote:
> > 1) memcpy_to_pmem() seems to rely upon the __copy_from_user_nocache()
> > having only used movnt; it does not attempt clwb at all.
> 
> Yes, and there was a fix a while back to make sure it always used
> movnt so clwb after the fact is not required:
> 
> a82eee742452 x86/uaccess/64: Handle the caching of 4-byte nocache
> copies properly in __copy_user_nocache()
> 
> > 2) __copy_from_user_nocache() for short copies does not use movnt at all.
> > In that case neither sfence nor clwb is issued.
> 
> For the 32bit case, yes, but the pmem driver should warn about this
> when it checks platform persistent memory capabilities (i.e. x86 32bit
> not supported). Ugh, we may have lost that warning for this specific
> case recently, I'll go double check and fix it up.
> 
> > 3) it uses movnt only for part of copying in case of misaligned copy;
> > No clwb is issued, but sfence *is* - at the very end in 64bit case,
> > between movnt and copying the tail - in 32bit one.  Incidentally,
> > while 64bit case takes care to align the destination for movnt part,
> > 32bit one does not.
> >
> > How much of the above is broken and what do the callers rely upon?
> 
> 32bit issues are known, but 64bit path is ok since that fix above.

Bollocks.  That fix above does *NOT* eliminate all cached stores.  Just look
at the damn function - it still does cached stores for until the target is
aligned and it does the same for tail when end of destination is not aligned.
Right there in arch/x86/lib/copy_user_64.S.

> > In particular, is that sfence the right thing for pmem usecases?
> 
> That sfence is not there for pmem purposes. The dax / pmem usage does
> not expect memcpy_to_pmem() to fence as it may have more writes to
> queue up and amortize all the writes with a later fence. This seems to
> be even more evidence for moving this functionality away from the
> uaccess routines to somewhere more pmem specific.


Re: [RFC] memcpy_nocache() and memcpy_writethrough()

2017-01-03 Thread Al Viro
On Tue, Jan 03, 2017 at 05:38:54PM -0800, Dan Williams wrote:
> > 1) memcpy_to_pmem() seems to rely upon the __copy_from_user_nocache()
> > having only used movnt; it does not attempt clwb at all.
> 
> Yes, and there was a fix a while back to make sure it always used
> movnt so clwb after the fact is not required:
> 
> a82eee742452 x86/uaccess/64: Handle the caching of 4-byte nocache
> copies properly in __copy_user_nocache()
> 
> > 2) __copy_from_user_nocache() for short copies does not use movnt at all.
> > In that case neither sfence nor clwb is issued.
> 
> For the 32bit case, yes, but the pmem driver should warn about this
> when it checks platform persistent memory capabilities (i.e. x86 32bit
> not supported). Ugh, we may have lost that warning for this specific
> case recently, I'll go double check and fix it up.
> 
> > 3) it uses movnt only for part of copying in case of misaligned copy;
> > No clwb is issued, but sfence *is* - at the very end in 64bit case,
> > between movnt and copying the tail - in 32bit one.  Incidentally,
> > while 64bit case takes care to align the destination for movnt part,
> > 32bit one does not.
> >
> > How much of the above is broken and what do the callers rely upon?
> 
> 32bit issues are known, but 64bit path is ok since that fix above.

Bollocks.  That fix above does *NOT* eliminate all cached stores.  Just look
at the damn function - it still does cached stores for until the target is
aligned and it does the same for tail when end of destination is not aligned.
Right there in arch/x86/lib/copy_user_64.S.

> > In particular, is that sfence the right thing for pmem usecases?
> 
> That sfence is not there for pmem purposes. The dax / pmem usage does
> not expect memcpy_to_pmem() to fence as it may have more writes to
> queue up and amortize all the writes with a later fence. This seems to
> be even more evidence for moving this functionality away from the
> uaccess routines to somewhere more pmem specific.


Re: [PATCH] x86: fix kaslr and memmap collision

2017-01-03 Thread Baoquan He
On 01/03/17 at 01:15pm, Dave Jiang wrote:
> 
> 
> On 01/03/2017 11:24 AM, Dan Williams wrote:
> > On Tue, Jan 3, 2017 at 12:31 AM, Baoquan He  wrote:
> >> Hi Dan,
> >>
> >> On 11/22/16 at 09:26am, Dan Williams wrote:
> >>> [ replying for Dave since he's offline today and tomorrow ]
> >>>
> >>> On Tue, Nov 22, 2016 at 12:47 AM, Ingo Molnar  wrote:
> 
>  * Dave Jiang  wrote:
> 
> > CONFIG_RANDOMIZE_BASE relocates the kernel to a random base address.
> > However it does not take into account the memmap= parameter passed in 
> > from
> > the kernel commandline.
> 
>  memmap= parameters are often used as a list.
> 
> > [...] This results in the kernel sometimes being put in the middle of 
> > the user
> > memmap. [...]
> 
>  What does this mean? If memmap= is used to re-define the memory map then 
>  the
>  kernel getting in the middle of a RAM area is what we want, isn't it? 
>  What we
>  don't want is for the kernel to get into reserved areas, right?
> >>>
> >>> Right, this is about teaching kaslr to not land the kernel in newly
> >>> defined reserved regions that were not marked reserved in the initial
> >>> e820 map from platform firmware.
> >>
> >> If only tell kaslr to not land kernel in newly defined reserved regions,
> >> memory added by "memmap=nn[KMG]@ss[KMG]" should not be avoided since
> >> it's usable memory. Kernel randomized into this region is also what we
> >> want. Not sure if I understand it right.
> > 
> > You're right, this is supposed to be for memmap=nn!ss cases which
> > defines reserved persistent memory ranges, not memmap=nn@ss which
> > defines usable memory.
> > 
> > We need to fix mem_avoid_memmap() to only skip memmap= statements that
> > specify reserved memory.

Thanks for confirmation, Dan!

> > 
> 
> I think nn@ss is the only one that we should skip over, otherwise
> everything else looks like should be avoided. I'll update.
Hi Dave,

I guess your purpose is to avoid the user defined reserved memory and
pmem which I am not very sure about since kaslr won't stamp on ACPI
region reported by BIOS. Seems OK to avoid them all except of nn@ss.

I have other concerns, will directly comment in your v4 post.

Thanks
Baoquan


Re: [PATCH] f2fs: fix small discards when se->valid_blocks is zero

2017-01-03 Thread Chao Yu
On 2017/1/3 17:01, Yunlong Song wrote:
> In the small discard case, when se->valid_blocks is zero, the 
> add_discard_addrs
> will directly return without __add_discard_entry. This will cause the file
> delete have no small discard. The case is like this:
> 
> 1. Allocate free 2M segment
> 2. Write a file (size n blocks < 512) in that 2M segment, se->valid_blocks = n
> 3. Delete that file, se->valid_blocks = 0, add_discard_addrs will return 
> without
> sending any discard of that file, and forever due to cur_map[i] ^ ckpt_map[i] 
> =
> 0 after that checkpoint

Wouldn't it be discarded as prefree segment?

Thanks,

> 
> Signed-off-by: Yunlong Song 
> ---
>  fs/f2fs/segment.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
> index 0738f48..8610f14 100644
> --- a/fs/f2fs/segment.c
> +++ b/fs/f2fs/segment.c
> @@ -838,7 +838,7 @@ static void add_discard_addrs(struct f2fs_sb_info *sbi, 
> struct cp_control *cpc)
>   return;
>  
>   if (!force) {
> - if (!test_opt(sbi, DISCARD) || !se->valid_blocks ||
> + if (!test_opt(sbi, DISCARD) ||
>   SM_I(sbi)->nr_discards >= SM_I(sbi)->max_discards)
>   return;
>   }
> 



Re: [PATCH] x86: fix kaslr and memmap collision

2017-01-03 Thread Baoquan He
On 01/03/17 at 01:15pm, Dave Jiang wrote:
> 
> 
> On 01/03/2017 11:24 AM, Dan Williams wrote:
> > On Tue, Jan 3, 2017 at 12:31 AM, Baoquan He  wrote:
> >> Hi Dan,
> >>
> >> On 11/22/16 at 09:26am, Dan Williams wrote:
> >>> [ replying for Dave since he's offline today and tomorrow ]
> >>>
> >>> On Tue, Nov 22, 2016 at 12:47 AM, Ingo Molnar  wrote:
> 
>  * Dave Jiang  wrote:
> 
> > CONFIG_RANDOMIZE_BASE relocates the kernel to a random base address.
> > However it does not take into account the memmap= parameter passed in 
> > from
> > the kernel commandline.
> 
>  memmap= parameters are often used as a list.
> 
> > [...] This results in the kernel sometimes being put in the middle of 
> > the user
> > memmap. [...]
> 
>  What does this mean? If memmap= is used to re-define the memory map then 
>  the
>  kernel getting in the middle of a RAM area is what we want, isn't it? 
>  What we
>  don't want is for the kernel to get into reserved areas, right?
> >>>
> >>> Right, this is about teaching kaslr to not land the kernel in newly
> >>> defined reserved regions that were not marked reserved in the initial
> >>> e820 map from platform firmware.
> >>
> >> If only tell kaslr to not land kernel in newly defined reserved regions,
> >> memory added by "memmap=nn[KMG]@ss[KMG]" should not be avoided since
> >> it's usable memory. Kernel randomized into this region is also what we
> >> want. Not sure if I understand it right.
> > 
> > You're right, this is supposed to be for memmap=nn!ss cases which
> > defines reserved persistent memory ranges, not memmap=nn@ss which
> > defines usable memory.
> > 
> > We need to fix mem_avoid_memmap() to only skip memmap= statements that
> > specify reserved memory.

Thanks for confirmation, Dan!

> > 
> 
> I think nn@ss is the only one that we should skip over, otherwise
> everything else looks like should be avoided. I'll update.
Hi Dave,

I guess your purpose is to avoid the user defined reserved memory and
pmem which I am not very sure about since kaslr won't stamp on ACPI
region reported by BIOS. Seems OK to avoid them all except of nn@ss.

I have other concerns, will directly comment in your v4 post.

Thanks
Baoquan


Re: [PATCH] f2fs: fix small discards when se->valid_blocks is zero

2017-01-03 Thread Chao Yu
On 2017/1/3 17:01, Yunlong Song wrote:
> In the small discard case, when se->valid_blocks is zero, the 
> add_discard_addrs
> will directly return without __add_discard_entry. This will cause the file
> delete have no small discard. The case is like this:
> 
> 1. Allocate free 2M segment
> 2. Write a file (size n blocks < 512) in that 2M segment, se->valid_blocks = n
> 3. Delete that file, se->valid_blocks = 0, add_discard_addrs will return 
> without
> sending any discard of that file, and forever due to cur_map[i] ^ ckpt_map[i] 
> =
> 0 after that checkpoint

Wouldn't it be discarded as prefree segment?

Thanks,

> 
> Signed-off-by: Yunlong Song 
> ---
>  fs/f2fs/segment.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
> index 0738f48..8610f14 100644
> --- a/fs/f2fs/segment.c
> +++ b/fs/f2fs/segment.c
> @@ -838,7 +838,7 @@ static void add_discard_addrs(struct f2fs_sb_info *sbi, 
> struct cp_control *cpc)
>   return;
>  
>   if (!force) {
> - if (!test_opt(sbi, DISCARD) || !se->valid_blocks ||
> + if (!test_opt(sbi, DISCARD) ||
>   SM_I(sbi)->nr_discards >= SM_I(sbi)->max_discards)
>   return;
>   }
> 



Re: [PATCH v2 2/2] mtd: nand: Update dependency of IFC for LS1021A

2017-01-03 Thread Marek Vasut
On 01/04/2017 02:46 AM, Alison Wang wrote:
>> On 01/03/2017 03:41 AM, Alison Wang wrote:
>>> As NAND support for Freescale/NXP IFC controller is available on
>>> LS1021A, the dependency for LS1021A is added.
>>
>> Does LS stand for LayerScape ? Yes it does. So why does ARCH_LAYERSCAPE
>> not cover LS1021 ?
> [Alison Wang] LS1021A is an earlier product and is not compatible with later 
> Layerscape architecture. So ARCH_LAYERSCAPE can't cover LS1021A.

Ah ok, I see. That information would be useful in the commit message ;-)

>>
>>> Signed-off-by: Alison Wang 
>>> ---
>>> Changes in v2:
>>> - None
>>>
>>>  drivers/mtd/nand/Kconfig | 2 +-
>>>  1 file changed, 1 insertion(+), 1 deletion(-)
>>>
>>> diff --git a/drivers/mtd/nand/Kconfig b/drivers/mtd/nand/Kconfig
>> index
>>> 353a9dd..85e3860 100644
>>> --- a/drivers/mtd/nand/Kconfig
>>> +++ b/drivers/mtd/nand/Kconfig
>>> @@ -441,7 +441,7 @@ config MTD_NAND_FSL_ELBC
>>>
>>>  config MTD_NAND_FSL_IFC
>>> tristate "NAND support for Freescale IFC controller"
>>> -   depends on FSL_SOC || ARCH_LAYERSCAPE
>>> +   depends on FSL_SOC || ARCH_LAYERSCAPE || SOC_LS1021A
>>> select FSL_IFC
>>> select MEMORY
>>> help
>>>
> 
> Best Regards,
> Alison Wang
> 


-- 
Best regards,
Marek Vasut


Re: [PATCH v2 2/2] mtd: nand: Update dependency of IFC for LS1021A

2017-01-03 Thread Marek Vasut
On 01/04/2017 02:46 AM, Alison Wang wrote:
>> On 01/03/2017 03:41 AM, Alison Wang wrote:
>>> As NAND support for Freescale/NXP IFC controller is available on
>>> LS1021A, the dependency for LS1021A is added.
>>
>> Does LS stand for LayerScape ? Yes it does. So why does ARCH_LAYERSCAPE
>> not cover LS1021 ?
> [Alison Wang] LS1021A is an earlier product and is not compatible with later 
> Layerscape architecture. So ARCH_LAYERSCAPE can't cover LS1021A.

Ah ok, I see. That information would be useful in the commit message ;-)

>>
>>> Signed-off-by: Alison Wang 
>>> ---
>>> Changes in v2:
>>> - None
>>>
>>>  drivers/mtd/nand/Kconfig | 2 +-
>>>  1 file changed, 1 insertion(+), 1 deletion(-)
>>>
>>> diff --git a/drivers/mtd/nand/Kconfig b/drivers/mtd/nand/Kconfig
>> index
>>> 353a9dd..85e3860 100644
>>> --- a/drivers/mtd/nand/Kconfig
>>> +++ b/drivers/mtd/nand/Kconfig
>>> @@ -441,7 +441,7 @@ config MTD_NAND_FSL_ELBC
>>>
>>>  config MTD_NAND_FSL_IFC
>>> tristate "NAND support for Freescale IFC controller"
>>> -   depends on FSL_SOC || ARCH_LAYERSCAPE
>>> +   depends on FSL_SOC || ARCH_LAYERSCAPE || SOC_LS1021A
>>> select FSL_IFC
>>> select MEMORY
>>> help
>>>
> 
> Best Regards,
> Alison Wang
> 


-- 
Best regards,
Marek Vasut


Re: LTP rwtest01 blocks on DAX mountpoint

2017-01-03 Thread Xiong Zhou
On Tue, Jan 03, 2017 at 09:57:10AM -0700, Ross Zwisler wrote:
> On Tue, Jan 03, 2017 at 02:49:22PM +0800, Xiong Zhou wrote:
> > On Mon, Jan 02, 2017 at 02:49:41PM -0700, Ross Zwisler wrote:
> > > On Mon, Jan 02, 2017 at 06:16:17PM +0100, Jan Kara wrote:
> > > > On Fri 30-12-16 17:33:53, Xiong Zhou wrote:
> > > > > On Sat, Dec 24, 2016 at 07:07:14PM +0800, Xiong Zhou wrote:
> > > > > > Hi lists,
> > snip
> > > > I was trying to reproduce this but for me rwtest01 completes just fine 
> > > > on
> > > > dax mountpoint (I've used your reproducer). So can you sample several
> > > > kernel stack traces to get a rough idea where the kernel is running?
> > > > Thanks!
> > > > 
> > > > Honza
> > > 
> > > I'm also unable to reproduce this issue.  I've tried with both the blamed
> > > commit:
> > > 4b4bb46 (HEAD) dax: clear dirty entry tags on cache flush
> > > and with v4.9-rc2.  Both pass the test in my setup.
> > > Perhaps the variable is the size of your PMEM partitions?
> > > # fdisk -l /dev/pmem0
> > > Disk /dev/pmem0: 16 GiB, 17179869184 bytes, 33554432 sectors
> > > Units: sectors of 1 * 512 = 512 bytes
> > > Sector size (logical/physical): 512 bytes / 4096 bytes
> > > I/O size (minimum/optimal): 4096 bytes / 4096 bytes
> > > Disklabel type: dos
> > > Disk identifier: 0xfe50c900
> > > Device   BootStart  End  Sectors Size Id Type
> > > /dev/pmem0p1  4096 25165823 25161728  12G 83 Linux
> > > /dev/pmem0p2  25165824 33550335  8384512   4G 83 Linux
> > > 
> > > What does your setup look like?
> > > I'm using the current tip of the LTP tree:
> > > 8cc4165  waitid02: define _XOPEN_SOURCE 500
> > > Thanks,
> > > - Ross
> > 
> > Thanks all for looking into it.
> > 
> > Turns out the rc2 relative updates fix this issue, so does
> > an old issue i reported a while ago:
> > multi-threads libvmmalloc fork test hang
> > https://lists.01.org/pipermail/linux-nvdimm/2016-October/007602.html
> > 
> > I'm able to reproduce these issues before rc2, now it
> > passes on current Linus tree:
> > c8b4ec8 Merge tag 'fscrypt-for-stable'
> 
> Hmm...I'm able to reproduce the other libvmmalloc issue with both v4.10-rc2
> and with "c8b4ec8 Merge tag 'fscrypt-for-stable'".  I'm debugging that issue
> today.
> 
> It's interesting that both tests started passing for you.  Did you change
> something in your test setup?

Er.. After double-checking, I have reduced nvml check test time on
Dec 30,

-make -C src check -j $NR_CPU
+make -C src check -j $NR_CPU TEST_TIME=1m

Other then this, nothing changed but the kernel code, same machine,
same cmdline, same configs.

I'm going to dig this more, and test your patch.

Thanks for looking into this!
> --
> To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
> the body of a message to majord...@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: LTP rwtest01 blocks on DAX mountpoint

2017-01-03 Thread Xiong Zhou
On Tue, Jan 03, 2017 at 09:57:10AM -0700, Ross Zwisler wrote:
> On Tue, Jan 03, 2017 at 02:49:22PM +0800, Xiong Zhou wrote:
> > On Mon, Jan 02, 2017 at 02:49:41PM -0700, Ross Zwisler wrote:
> > > On Mon, Jan 02, 2017 at 06:16:17PM +0100, Jan Kara wrote:
> > > > On Fri 30-12-16 17:33:53, Xiong Zhou wrote:
> > > > > On Sat, Dec 24, 2016 at 07:07:14PM +0800, Xiong Zhou wrote:
> > > > > > Hi lists,
> > snip
> > > > I was trying to reproduce this but for me rwtest01 completes just fine 
> > > > on
> > > > dax mountpoint (I've used your reproducer). So can you sample several
> > > > kernel stack traces to get a rough idea where the kernel is running?
> > > > Thanks!
> > > > 
> > > > Honza
> > > 
> > > I'm also unable to reproduce this issue.  I've tried with both the blamed
> > > commit:
> > > 4b4bb46 (HEAD) dax: clear dirty entry tags on cache flush
> > > and with v4.9-rc2.  Both pass the test in my setup.
> > > Perhaps the variable is the size of your PMEM partitions?
> > > # fdisk -l /dev/pmem0
> > > Disk /dev/pmem0: 16 GiB, 17179869184 bytes, 33554432 sectors
> > > Units: sectors of 1 * 512 = 512 bytes
> > > Sector size (logical/physical): 512 bytes / 4096 bytes
> > > I/O size (minimum/optimal): 4096 bytes / 4096 bytes
> > > Disklabel type: dos
> > > Disk identifier: 0xfe50c900
> > > Device   BootStart  End  Sectors Size Id Type
> > > /dev/pmem0p1  4096 25165823 25161728  12G 83 Linux
> > > /dev/pmem0p2  25165824 33550335  8384512   4G 83 Linux
> > > 
> > > What does your setup look like?
> > > I'm using the current tip of the LTP tree:
> > > 8cc4165  waitid02: define _XOPEN_SOURCE 500
> > > Thanks,
> > > - Ross
> > 
> > Thanks all for looking into it.
> > 
> > Turns out the rc2 relative updates fix this issue, so does
> > an old issue i reported a while ago:
> > multi-threads libvmmalloc fork test hang
> > https://lists.01.org/pipermail/linux-nvdimm/2016-October/007602.html
> > 
> > I'm able to reproduce these issues before rc2, now it
> > passes on current Linus tree:
> > c8b4ec8 Merge tag 'fscrypt-for-stable'
> 
> Hmm...I'm able to reproduce the other libvmmalloc issue with both v4.10-rc2
> and with "c8b4ec8 Merge tag 'fscrypt-for-stable'".  I'm debugging that issue
> today.
> 
> It's interesting that both tests started passing for you.  Did you change
> something in your test setup?

Er.. After double-checking, I have reduced nvml check test time on
Dec 30,

-make -C src check -j $NR_CPU
+make -C src check -j $NR_CPU TEST_TIME=1m

Other then this, nothing changed but the kernel code, same machine,
same cmdline, same configs.

I'm going to dig this more, and test your patch.

Thanks for looking into this!
> --
> To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
> the body of a message to majord...@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v2 0/3] PM / devfreq: Fix the bug and add reviewer for devfreq support

2017-01-03 Thread Chanwoo Choi
Dear Rafael,

On 2017년 01월 04일 06:58, Rafael J. Wysocki wrote:
> On Tue, Jan 3, 2017 at 12:54 PM, Chanwoo Choi  wrote:
>> Dear Myungjoo,
>>
>> Thanks for your review for patch1.
>> But, patch2/3 is not yet reviewed. Could you please review these patches?
> 
> I queued them up as 4.10 fixes in the meantime.

Thanks for picking up the patches.

-- 
Regards,
Chanwoo Choi


Re: [PATCH v2 0/3] PM / devfreq: Fix the bug and add reviewer for devfreq support

2017-01-03 Thread Chanwoo Choi
Dear Rafael,

On 2017년 01월 04일 06:58, Rafael J. Wysocki wrote:
> On Tue, Jan 3, 2017 at 12:54 PM, Chanwoo Choi  wrote:
>> Dear Myungjoo,
>>
>> Thanks for your review for patch1.
>> But, patch2/3 is not yet reviewed. Could you please review these patches?
> 
> I queued them up as 4.10 fixes in the meantime.

Thanks for picking up the patches.

-- 
Regards,
Chanwoo Choi


Re: 9pfs hangs since 4.7

2017-01-03 Thread Al Viro
On Wed, Jan 04, 2017 at 01:34:50AM +0200, Tuomas Tynkkynen wrote:

> I got these logs from the server & client with 9p tracepoints enabled:
> 
> https://gist.githubusercontent.com/dezgeg/02447100b3182167403099fe7de4d941/raw/3772e408ddf586fb662ac9148fc10943529a6b99/dmesg%2520with%25209p%2520trace
> https://gist.githubusercontent.com/dezgeg/e1e0c7f354042e1d9bdf7e9135934a65/raw/3a0e3b4f7a5229fd0be032c6839b578d47a21ce4/qemu.log
> 

Lovely...

27349:out 8 0001TSTATFS, tag 1
27350:out 12 0001   TLOPEN, tag 1
27351:complete 9 0001   RSTATFS, tag 1
27352:complete 13 0001  RLOPEN, tag 1

27677:out 8 0001TSTATFS, tag 1
27678:out 12 0001   TLOPEN, tag 1
27679:complete 9 0001   RSTATFS, tag 1
27680:complete 13 0001  RLOPEN, tag 1

29667:out 8 0001TSTATFS, tag 1
29668:out 110 0001  TWALK, tag 1
29669:complete 9 0001   RSTATFS, tag 1
29670:complete 7 0001   RLERROR, tag 1

42317:out 110 0001  TWALK, tag 1
42318:out 8 0001TSTATFS, tag 1
42319:complete 9 0001   RSTATFS, tag 1
42320:complete 7 0001   RERROR, tag 1

Those are outright protocol violations: tag can be reused only after either
a reply bearing the same tag has arrived *or* TFLUSH for that tag had been
issued and successful (type == RFLUSH) reply bearing the tag of that TFLUSH
has arrived.  Your log doesn't contain any TFLUSH (108) at all, so it should've
been plain and simple "no reuse until server sends a reply with matching tag".

Otherwise the the dump looks sane.  In the end all issued requests had been
served, so it's not as if the client had been waiting for a free tag or for
a response to be produced by the server.

Interesting...  dmesg doesn't seem to contain the beginning of the 9P trace -
only 89858 out of 108818 in the dump, even though it claims to have lost
only 4445 events.  The last exchanges in the trace are
> P9_TLOPEN tag  90 err 0
> P9_TLOPEN tag  25 err 0
> P9_TLOPEN tag  15 err 0
> P9_TLOPEN tag  104 err 0
> P9_TLOPEN tag  102 err 0
> P9_TLOPEN tag  91 err 0
> P9_TLOPEN tag  12 err 0
< P9_TREADLINK tag  12

which should've been
complete 13 005a
complete 13 0019
complete 13 000f
complete 13 0068
complete 13 0066
complete 13 005b
out 22 000c

and the only plausible match for that is

108784:complete 13 005a
108798:complete 13 000f
108799:complete 13 0019
108800:complete 13 0068
108801:complete 13 0066
108802:complete 13 005b

but there's a bunch of replies in between that hadn't been logged *and* such
a TREADLINK request isn't seen after
108341:out 22 000c

... and sure enough, there's a whole lot of processes stuck in readlink.

Interesting...  Which version of qemu/what arguments are you using there?


Re: 9pfs hangs since 4.7

2017-01-03 Thread Al Viro
On Wed, Jan 04, 2017 at 01:34:50AM +0200, Tuomas Tynkkynen wrote:

> I got these logs from the server & client with 9p tracepoints enabled:
> 
> https://gist.githubusercontent.com/dezgeg/02447100b3182167403099fe7de4d941/raw/3772e408ddf586fb662ac9148fc10943529a6b99/dmesg%2520with%25209p%2520trace
> https://gist.githubusercontent.com/dezgeg/e1e0c7f354042e1d9bdf7e9135934a65/raw/3a0e3b4f7a5229fd0be032c6839b578d47a21ce4/qemu.log
> 

Lovely...

27349:out 8 0001TSTATFS, tag 1
27350:out 12 0001   TLOPEN, tag 1
27351:complete 9 0001   RSTATFS, tag 1
27352:complete 13 0001  RLOPEN, tag 1

27677:out 8 0001TSTATFS, tag 1
27678:out 12 0001   TLOPEN, tag 1
27679:complete 9 0001   RSTATFS, tag 1
27680:complete 13 0001  RLOPEN, tag 1

29667:out 8 0001TSTATFS, tag 1
29668:out 110 0001  TWALK, tag 1
29669:complete 9 0001   RSTATFS, tag 1
29670:complete 7 0001   RLERROR, tag 1

42317:out 110 0001  TWALK, tag 1
42318:out 8 0001TSTATFS, tag 1
42319:complete 9 0001   RSTATFS, tag 1
42320:complete 7 0001   RERROR, tag 1

Those are outright protocol violations: tag can be reused only after either
a reply bearing the same tag has arrived *or* TFLUSH for that tag had been
issued and successful (type == RFLUSH) reply bearing the tag of that TFLUSH
has arrived.  Your log doesn't contain any TFLUSH (108) at all, so it should've
been plain and simple "no reuse until server sends a reply with matching tag".

Otherwise the the dump looks sane.  In the end all issued requests had been
served, so it's not as if the client had been waiting for a free tag or for
a response to be produced by the server.

Interesting...  dmesg doesn't seem to contain the beginning of the 9P trace -
only 89858 out of 108818 in the dump, even though it claims to have lost
only 4445 events.  The last exchanges in the trace are
> P9_TLOPEN tag  90 err 0
> P9_TLOPEN tag  25 err 0
> P9_TLOPEN tag  15 err 0
> P9_TLOPEN tag  104 err 0
> P9_TLOPEN tag  102 err 0
> P9_TLOPEN tag  91 err 0
> P9_TLOPEN tag  12 err 0
< P9_TREADLINK tag  12

which should've been
complete 13 005a
complete 13 0019
complete 13 000f
complete 13 0068
complete 13 0066
complete 13 005b
out 22 000c

and the only plausible match for that is

108784:complete 13 005a
108798:complete 13 000f
108799:complete 13 0019
108800:complete 13 0068
108801:complete 13 0066
108802:complete 13 005b

but there's a bunch of replies in between that hadn't been logged *and* such
a TREADLINK request isn't seen after
108341:out 22 000c

... and sure enough, there's a whole lot of processes stuck in readlink.

Interesting...  Which version of qemu/what arguments are you using there?


RE: [PATCH v2 2/2] mtd: nand: Update dependency of IFC for LS1021A

2017-01-03 Thread Alison Wang
> On 01/03/2017 03:41 AM, Alison Wang wrote:
> > As NAND support for Freescale/NXP IFC controller is available on
> > LS1021A, the dependency for LS1021A is added.
> 
> Does LS stand for LayerScape ? Yes it does. So why does ARCH_LAYERSCAPE
> not cover LS1021 ?
[Alison Wang] LS1021A is an earlier product and is not compatible with later 
Layerscape architecture. So ARCH_LAYERSCAPE can't cover LS1021A.

> 
> > Signed-off-by: Alison Wang 
> > ---
> > Changes in v2:
> > - None
> >
> >  drivers/mtd/nand/Kconfig | 2 +-
> >  1 file changed, 1 insertion(+), 1 deletion(-)
> >
> > diff --git a/drivers/mtd/nand/Kconfig b/drivers/mtd/nand/Kconfig
> index
> > 353a9dd..85e3860 100644
> > --- a/drivers/mtd/nand/Kconfig
> > +++ b/drivers/mtd/nand/Kconfig
> > @@ -441,7 +441,7 @@ config MTD_NAND_FSL_ELBC
> >
> >  config MTD_NAND_FSL_IFC
> > tristate "NAND support for Freescale IFC controller"
> > -   depends on FSL_SOC || ARCH_LAYERSCAPE
> > +   depends on FSL_SOC || ARCH_LAYERSCAPE || SOC_LS1021A
> > select FSL_IFC
> > select MEMORY
> > help
> >

Best Regards,
Alison Wang


RE: [PATCH v2 2/2] mtd: nand: Update dependency of IFC for LS1021A

2017-01-03 Thread Alison Wang
> On 01/03/2017 03:41 AM, Alison Wang wrote:
> > As NAND support for Freescale/NXP IFC controller is available on
> > LS1021A, the dependency for LS1021A is added.
> 
> Does LS stand for LayerScape ? Yes it does. So why does ARCH_LAYERSCAPE
> not cover LS1021 ?
[Alison Wang] LS1021A is an earlier product and is not compatible with later 
Layerscape architecture. So ARCH_LAYERSCAPE can't cover LS1021A.

> 
> > Signed-off-by: Alison Wang 
> > ---
> > Changes in v2:
> > - None
> >
> >  drivers/mtd/nand/Kconfig | 2 +-
> >  1 file changed, 1 insertion(+), 1 deletion(-)
> >
> > diff --git a/drivers/mtd/nand/Kconfig b/drivers/mtd/nand/Kconfig
> index
> > 353a9dd..85e3860 100644
> > --- a/drivers/mtd/nand/Kconfig
> > +++ b/drivers/mtd/nand/Kconfig
> > @@ -441,7 +441,7 @@ config MTD_NAND_FSL_ELBC
> >
> >  config MTD_NAND_FSL_IFC
> > tristate "NAND support for Freescale IFC controller"
> > -   depends on FSL_SOC || ARCH_LAYERSCAPE
> > +   depends on FSL_SOC || ARCH_LAYERSCAPE || SOC_LS1021A
> > select FSL_IFC
> > select MEMORY
> > help
> >

Best Regards,
Alison Wang


Re: [PATCH] Allow userspace control of runtime disabling/enabling of driver probing

2017-01-03 Thread Rafael J. Wysocki
On Wed, Jan 4, 2017 at 12:38 AM, Kees Cook  wrote:
> On Tue, Jan 3, 2017 at 3:34 PM, Rafael J. Wysocki  wrote:
>> On Tue, Jan 3, 2017 at 11:58 PM, Kees Cook  wrote:
>>> From: Matthew Garrett 
>>>
>>> Various attacks are made possible due to the large attack surface of
>>> kernel drivers and the easy availability of hotpluggable hardware that can
>>> be programmed to mimic arbitrary devices. This allows attackers to find a
>>> single vulnerable driver and then produce a device that can exploit it by
>>> plugging into a hotpluggable bus (such as PCI or USB). This violates user
>>> assumptions about unattended systems being secure as long as the screen
>>> is locked.
>>>
>>> The kernel already has support for deferring driver binding in order
>>> to avoid problems over suspend/resume. By exposing this to userspace we
>>> can disable probing when the screen is locked and simply reenable it on
>>> unlock.
>>>
>>> This is not a complete solution - since this still permits device
>>> creation and simply blocks driver binding, it won't stop userspace
>>> drivers from attaching to devices and it won't protect against any kernel
>>> vulnerabilities in the core bus code. However, it should be sufficient to
>>> block attacks like Poisontap (https://samy.pl/poisontap/).
>>
>> It also looks like this may be worked around by tricking the user to
>> unlock the screen while the malicious device is still attached to the
>> system.
>
> It certainly changes the temporal aspect of the attack (i.e. there is
> a delay and must be "silent" in that the local user cannot notice it).

It will be silent until a driver binds to it anyway, won't it?

>> If that really is the case, I wonder if it's worth the extra complexity.
>
> I think so, since it's not that much more complexity (it uses the
> existing deferral mechanism).

But that existing mechanism certainly wasn't designed to be turned on
and off at random and concurrently etc.  The way it is going to be
used now is far more complex IMO.

Thanks,
Rafael


Re: [PATCH] Allow userspace control of runtime disabling/enabling of driver probing

2017-01-03 Thread Rafael J. Wysocki
On Wed, Jan 4, 2017 at 12:38 AM, Kees Cook  wrote:
> On Tue, Jan 3, 2017 at 3:34 PM, Rafael J. Wysocki  wrote:
>> On Tue, Jan 3, 2017 at 11:58 PM, Kees Cook  wrote:
>>> From: Matthew Garrett 
>>>
>>> Various attacks are made possible due to the large attack surface of
>>> kernel drivers and the easy availability of hotpluggable hardware that can
>>> be programmed to mimic arbitrary devices. This allows attackers to find a
>>> single vulnerable driver and then produce a device that can exploit it by
>>> plugging into a hotpluggable bus (such as PCI or USB). This violates user
>>> assumptions about unattended systems being secure as long as the screen
>>> is locked.
>>>
>>> The kernel already has support for deferring driver binding in order
>>> to avoid problems over suspend/resume. By exposing this to userspace we
>>> can disable probing when the screen is locked and simply reenable it on
>>> unlock.
>>>
>>> This is not a complete solution - since this still permits device
>>> creation and simply blocks driver binding, it won't stop userspace
>>> drivers from attaching to devices and it won't protect against any kernel
>>> vulnerabilities in the core bus code. However, it should be sufficient to
>>> block attacks like Poisontap (https://samy.pl/poisontap/).
>>
>> It also looks like this may be worked around by tricking the user to
>> unlock the screen while the malicious device is still attached to the
>> system.
>
> It certainly changes the temporal aspect of the attack (i.e. there is
> a delay and must be "silent" in that the local user cannot notice it).

It will be silent until a driver binds to it anyway, won't it?

>> If that really is the case, I wonder if it's worth the extra complexity.
>
> I think so, since it's not that much more complexity (it uses the
> existing deferral mechanism).

But that existing mechanism certainly wasn't designed to be turned on
and off at random and concurrently etc.  The way it is going to be
used now is far more complex IMO.

Thanks,
Rafael


Re: [PATCH 3/4] watchdog: iTCO_wdt: Use pdev for platform device and pci_dev for pci device

2017-01-03 Thread Guenter Roeck

On 01/03/2017 03:50 PM, Andy Shevchenko wrote:

On Wed, Jan 4, 2017 at 1:48 AM, Andy Shevchenko
 wrote:

On Wed, Jan 4, 2017 at 1:40 AM, Guenter Roeck  wrote:

On Wed, Jan 04, 2017 at 12:39:59AM +0200, Andy Shevchenko wrote:

On Tue, Jan 3, 2017 at 4:39 PM, Guenter Roeck  wrote:

Use pdev for struct platform_device, pcidev for struct pci_dev, and dev
for struct device variables to improve consistency.

Remove 'struct platform_device *dev;' from struct iTCO_wdt_private since
it was unused.


Would pci_dev work?


Sure, or maybe just 'pci'. Any preference ?


Just slightly prefer pci_dev over others (pcidev, pci), but do not
insist. Up to you.


Or even

struct device *dev;
...
struct pci_dev *pci_dev = to_pci_dev(dev);



'dev' would have to be a pointer to the parent device. If I consider using
dev_{info,err}, I'll need 'dev' in struct iTCO_wdt_private. I think I'll
stick with pci_dev.

Thanks,
Guenter



Re: [PATCH 3/4] watchdog: iTCO_wdt: Use pdev for platform device and pci_dev for pci device

2017-01-03 Thread Guenter Roeck

On 01/03/2017 03:50 PM, Andy Shevchenko wrote:

On Wed, Jan 4, 2017 at 1:48 AM, Andy Shevchenko
 wrote:

On Wed, Jan 4, 2017 at 1:40 AM, Guenter Roeck  wrote:

On Wed, Jan 04, 2017 at 12:39:59AM +0200, Andy Shevchenko wrote:

On Tue, Jan 3, 2017 at 4:39 PM, Guenter Roeck  wrote:

Use pdev for struct platform_device, pcidev for struct pci_dev, and dev
for struct device variables to improve consistency.

Remove 'struct platform_device *dev;' from struct iTCO_wdt_private since
it was unused.


Would pci_dev work?


Sure, or maybe just 'pci'. Any preference ?


Just slightly prefer pci_dev over others (pcidev, pci), but do not
insist. Up to you.


Or even

struct device *dev;
...
struct pci_dev *pci_dev = to_pci_dev(dev);



'dev' would have to be a pointer to the parent device. If I consider using
dev_{info,err}, I'll need 'dev' in struct iTCO_wdt_private. I think I'll
stick with pci_dev.

Thanks,
Guenter



Re: [RFC] memcpy_nocache() and memcpy_writethrough()

2017-01-03 Thread Dan Williams
On Tue, Jan 3, 2017 at 3:22 PM, Al Viro  wrote:
> On Tue, Jan 03, 2017 at 01:14:11PM -0800, Dan Williams wrote:
>
>> Robert was describing the overall flow / mechanics, but I think it is
>> easier to visualize the sfence as a flush command sent to a disk
>> device with a volatile cache. In fact, that's how we implemented it in
>> the pmem block device driver. The pmem block device registers itself
>> as requiring REQ_FLUSH to be sent to persist writes. The driver issues
>> sfence on the assumption that all writes to pmem have either bypassed
>> the cache with movnt, or are scheduled for write-back via one of the
>> flush instructions (clflush, clwb, or clflushopt).
>
> *blink*
>
> 1) memcpy_to_pmem() seems to rely upon the __copy_from_user_nocache()
> having only used movnt; it does not attempt clwb at all.

Yes, and there was a fix a while back to make sure it always used
movnt so clwb after the fact is not required:

a82eee742452 x86/uaccess/64: Handle the caching of 4-byte nocache
copies properly in __copy_user_nocache()

> 2) __copy_from_user_nocache() for short copies does not use movnt at all.
> In that case neither sfence nor clwb is issued.

For the 32bit case, yes, but the pmem driver should warn about this
when it checks platform persistent memory capabilities (i.e. x86 32bit
not supported). Ugh, we may have lost that warning for this specific
case recently, I'll go double check and fix it up.

> 3) it uses movnt only for part of copying in case of misaligned copy;
> No clwb is issued, but sfence *is* - at the very end in 64bit case,
> between movnt and copying the tail - in 32bit one.  Incidentally,
> while 64bit case takes care to align the destination for movnt part,
> 32bit one does not.
>
> How much of the above is broken and what do the callers rely upon?

32bit issues are known, but 64bit path is ok since that fix above.

> In particular, is that sfence the right thing for pmem usecases?

That sfence is not there for pmem purposes. The dax / pmem usage does
not expect memcpy_to_pmem() to fence as it may have more writes to
queue up and amortize all the writes with a later fence. This seems to
be even more evidence for moving this functionality away from the
uaccess routines to somewhere more pmem specific.


Re: [RFC] memcpy_nocache() and memcpy_writethrough()

2017-01-03 Thread Dan Williams
On Tue, Jan 3, 2017 at 3:22 PM, Al Viro  wrote:
> On Tue, Jan 03, 2017 at 01:14:11PM -0800, Dan Williams wrote:
>
>> Robert was describing the overall flow / mechanics, but I think it is
>> easier to visualize the sfence as a flush command sent to a disk
>> device with a volatile cache. In fact, that's how we implemented it in
>> the pmem block device driver. The pmem block device registers itself
>> as requiring REQ_FLUSH to be sent to persist writes. The driver issues
>> sfence on the assumption that all writes to pmem have either bypassed
>> the cache with movnt, or are scheduled for write-back via one of the
>> flush instructions (clflush, clwb, or clflushopt).
>
> *blink*
>
> 1) memcpy_to_pmem() seems to rely upon the __copy_from_user_nocache()
> having only used movnt; it does not attempt clwb at all.

Yes, and there was a fix a while back to make sure it always used
movnt so clwb after the fact is not required:

a82eee742452 x86/uaccess/64: Handle the caching of 4-byte nocache
copies properly in __copy_user_nocache()

> 2) __copy_from_user_nocache() for short copies does not use movnt at all.
> In that case neither sfence nor clwb is issued.

For the 32bit case, yes, but the pmem driver should warn about this
when it checks platform persistent memory capabilities (i.e. x86 32bit
not supported). Ugh, we may have lost that warning for this specific
case recently, I'll go double check and fix it up.

> 3) it uses movnt only for part of copying in case of misaligned copy;
> No clwb is issued, but sfence *is* - at the very end in 64bit case,
> between movnt and copying the tail - in 32bit one.  Incidentally,
> while 64bit case takes care to align the destination for movnt part,
> 32bit one does not.
>
> How much of the above is broken and what do the callers rely upon?

32bit issues are known, but 64bit path is ok since that fix above.

> In particular, is that sfence the right thing for pmem usecases?

That sfence is not there for pmem purposes. The dax / pmem usage does
not expect memcpy_to_pmem() to fence as it may have more writes to
queue up and amortize all the writes with a later fence. This seems to
be even more evidence for moving this functionality away from the
uaccess routines to somewhere more pmem specific.


[PATCH] initramfs: Fix spurious rebuilds

2017-01-03 Thread Florian Fainelli
Commit 35e669e1a254 ("initramfs: select builtin initram
compression algorithm on KConfig instead of Makefile") makes suffix_y be
a quote variable, which can be illustrated looking at the build output:

  GEN usr/initramfs_data.cpio".gz"

Make sure that we do strip off double quotes from
CONFIG_INITRAMFS_COMPRESSION, since the Makefile tracks targets with
unquoted suffixes.

Fixes: 35e669e1a254 ("initramfs: select builtin initram compression algorithm 
on KConfig instead of Makefile")
Signed-off-by: Florian Fainelli 
---
This is against v4.10-rc2, thanks!

 usr/Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/usr/Makefile b/usr/Makefile
index 17a513268325..a9ae8b493e2b 100644
--- a/usr/Makefile
+++ b/usr/Makefile
@@ -5,7 +5,7 @@
 klibcdirs:;
 PHONY += klibcdirs
 
-suffix_y = $(CONFIG_INITRAMFS_COMPRESSION)
+suffix_y = $(subst ",,$(CONFIG_INITRAMFS_COMPRESSION))
 AFLAGS_initramfs_data.o += 
-DINITRAMFS_IMAGE="usr/initramfs_data.cpio$(suffix_y)"
 
 # Generate builtin.o based on initramfs_data.o
-- 
2.9.3



[PATCH] initramfs: Fix spurious rebuilds

2017-01-03 Thread Florian Fainelli
Commit 35e669e1a254 ("initramfs: select builtin initram
compression algorithm on KConfig instead of Makefile") makes suffix_y be
a quote variable, which can be illustrated looking at the build output:

  GEN usr/initramfs_data.cpio".gz"

Make sure that we do strip off double quotes from
CONFIG_INITRAMFS_COMPRESSION, since the Makefile tracks targets with
unquoted suffixes.

Fixes: 35e669e1a254 ("initramfs: select builtin initram compression algorithm 
on KConfig instead of Makefile")
Signed-off-by: Florian Fainelli 
---
This is against v4.10-rc2, thanks!

 usr/Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/usr/Makefile b/usr/Makefile
index 17a513268325..a9ae8b493e2b 100644
--- a/usr/Makefile
+++ b/usr/Makefile
@@ -5,7 +5,7 @@
 klibcdirs:;
 PHONY += klibcdirs
 
-suffix_y = $(CONFIG_INITRAMFS_COMPRESSION)
+suffix_y = $(subst ",,$(CONFIG_INITRAMFS_COMPRESSION))
 AFLAGS_initramfs_data.o += 
-DINITRAMFS_IMAGE="usr/initramfs_data.cpio$(suffix_y)"
 
 # Generate builtin.o based on initramfs_data.o
-- 
2.9.3



Re: [v2 3/7] x86/mpx, x86/insn: Relocate insn util functions to a new insn-utils

2017-01-03 Thread Ricardo Neri
On Tue, 2017-01-03 at 08:44 -0800, Dave Hansen wrote:
> On 12/23/2016 05:37 PM, Ricardo Neri wrote:
> > Other kernel submodules can benefit from using the utility functions
> > defined in mpx.c to obtain the addresses and values of operands contained
> > in the general purpose registers. An instance of this is the emulation code
> > used for instructions protected by the Intel User-Mode Instruction
> > Prevention feature.
> 
> I haven't looked at this in detail, but as long as this is pretty much a
> straight code move, I don't see any issues with it from an MPX
> perspective.  I'm glad to see it getting reused.

Yes, this is only a relocation of code.
> 
> Feel free to add my Acked-by on it if you like.

Great! Thanks!
Ricardo




Re: [v2 3/7] x86/mpx, x86/insn: Relocate insn util functions to a new insn-utils

2017-01-03 Thread Ricardo Neri
On Tue, 2017-01-03 at 08:44 -0800, Dave Hansen wrote:
> On 12/23/2016 05:37 PM, Ricardo Neri wrote:
> > Other kernel submodules can benefit from using the utility functions
> > defined in mpx.c to obtain the addresses and values of operands contained
> > in the general purpose registers. An instance of this is the emulation code
> > used for instructions protected by the Intel User-Mode Instruction
> > Prevention feature.
> 
> I haven't looked at this in detail, but as long as this is pretty much a
> straight code move, I don't see any issues with it from an MPX
> perspective.  I'm glad to see it getting reused.

Yes, this is only a relocation of code.
> 
> Feel free to add my Acked-by on it if you like.

Great! Thanks!
Ricardo




Re: [v2 2/7] x86/mpx: Fail when implicit zero-displacement is used along with R/EBP

2017-01-03 Thread Ricardo Neri
On Tue, 2017-01-03 at 08:41 -0800, Dave Hansen wrote:
> On 12/27/2016 02:33 PM, Ricardo Neri wrote:
> >>> diff --git a/arch/x86/mm/mpx.c b/arch/x86/mm/mpx.c
> >>> index 6a75a75..71681d0 100644
> >>> --- a/arch/x86/mm/mpx.c
> >>> +++ b/arch/x86/mm/mpx.c
> >>> @@ -120,6 +120,13 @@ static int get_reg_offset(struct insn *insn, struct 
> >>> pt_regs *regs,
> >>>
> >>> case REG_TYPE_BASE:
> >>> regno = X86_SIB_BASE(insn->sib.value);
> >>> +   if (regno == 5 && X86_MODRM_RM(insn->modrm.value) == 0) {
> >>> +   WARN_ONCE(1, "An explicit displacement is 
> >>> required when %sBP used as SIB base.",
> >>> + (IS_ENABLED(CONFIG_X86_64) && 
> >>> insn->x86_64) ?
> >>> + "R13 or R" : "E");
> >>> +   return -EINVAL;
> >>> +   }
> >>> +
> >> Now that I've read the cover letter, I see what's going on.  This
> >> should not warn -- user code can easily trigger this deliberately.
> > OK, I'll remove it. Are you concerned about the warning printing the
> > calltrace, even only once?
> 
> Yes.  We don't let userspace spam the kernel, even once.  If we have a
> couple thousand "only once" places, then userspace can overwhelm the
> kernel log.

This makes sense. I was not looking at it this way.
> 
> Also, this needs a much better description of what's going on in the
> code.  Could you add a comment explaining what's going on, and why
> regno==5, etc...?

I will add more comments.

Thanks!
Ricardo




Re: [v2 2/7] x86/mpx: Fail when implicit zero-displacement is used along with R/EBP

2017-01-03 Thread Ricardo Neri
On Tue, 2017-01-03 at 08:41 -0800, Dave Hansen wrote:
> On 12/27/2016 02:33 PM, Ricardo Neri wrote:
> >>> diff --git a/arch/x86/mm/mpx.c b/arch/x86/mm/mpx.c
> >>> index 6a75a75..71681d0 100644
> >>> --- a/arch/x86/mm/mpx.c
> >>> +++ b/arch/x86/mm/mpx.c
> >>> @@ -120,6 +120,13 @@ static int get_reg_offset(struct insn *insn, struct 
> >>> pt_regs *regs,
> >>>
> >>> case REG_TYPE_BASE:
> >>> regno = X86_SIB_BASE(insn->sib.value);
> >>> +   if (regno == 5 && X86_MODRM_RM(insn->modrm.value) == 0) {
> >>> +   WARN_ONCE(1, "An explicit displacement is 
> >>> required when %sBP used as SIB base.",
> >>> + (IS_ENABLED(CONFIG_X86_64) && 
> >>> insn->x86_64) ?
> >>> + "R13 or R" : "E");
> >>> +   return -EINVAL;
> >>> +   }
> >>> +
> >> Now that I've read the cover letter, I see what's going on.  This
> >> should not warn -- user code can easily trigger this deliberately.
> > OK, I'll remove it. Are you concerned about the warning printing the
> > calltrace, even only once?
> 
> Yes.  We don't let userspace spam the kernel, even once.  If we have a
> couple thousand "only once" places, then userspace can overwhelm the
> kernel log.

This makes sense. I was not looking at it this way.
> 
> Also, this needs a much better description of what's going on in the
> code.  Could you add a comment explaining what's going on, and why
> regno==5, etc...?

I will add more comments.

Thanks!
Ricardo




Re: [v2 5/7] x86: Add emulation code for UMIP instructions

2017-01-03 Thread Ricardo Neri
On Fri, 2016-12-30 at 18:07 -0800, Andy Lutomirski wrote:
> On Thu, Dec 29, 2016 at 9:23 PM, Ricardo Neri
>  wrote:
> > On Tue, 2016-12-27 at 16:48 -0800, Andy Lutomirski wrote:
> >>
> >> >> > +   if (nr_copied  > 0)
> >> >> > +   return -EFAULT;
> >> >>
> >> >> This should be the only EFAULT case.
> >> > Should this be EFAULT event if the caller cares only about successful
> >> > (return 0) vs failed (return non-0) emulation?
> >>
> >> In theory this particular error would be a page fault not a general
> >> protection fault (in the UMIP off case).  If you were emulating it
> >> extra carefully, you could change the signal accordingly.  But, as I
> >> said, I really doubt this matters.
> >
> > If simple enough and for the sake of accuracy, I could try to issue the
> > page fault. It seems to me that this entitles calling
> > force_sig_info_fault in this particular case as opposed to the
> > force_sig_info(SIGSEGV, SEND_SIG_PRIV, tsk) that do_general_protection
> > calls.
> 
> Sure.  You could even do it by sending the signal in the emulation
> code and returning true.

Will do.

Thanks!
Ricardo
> 
> --Andy




Re: [v2 5/7] x86: Add emulation code for UMIP instructions

2017-01-03 Thread Ricardo Neri
On Fri, 2016-12-30 at 18:07 -0800, Andy Lutomirski wrote:
> On Thu, Dec 29, 2016 at 9:23 PM, Ricardo Neri
>  wrote:
> > On Tue, 2016-12-27 at 16:48 -0800, Andy Lutomirski wrote:
> >>
> >> >> > +   if (nr_copied  > 0)
> >> >> > +   return -EFAULT;
> >> >>
> >> >> This should be the only EFAULT case.
> >> > Should this be EFAULT event if the caller cares only about successful
> >> > (return 0) vs failed (return non-0) emulation?
> >>
> >> In theory this particular error would be a page fault not a general
> >> protection fault (in the UMIP off case).  If you were emulating it
> >> extra carefully, you could change the signal accordingly.  But, as I
> >> said, I really doubt this matters.
> >
> > If simple enough and for the sake of accuracy, I could try to issue the
> > page fault. It seems to me that this entitles calling
> > force_sig_info_fault in this particular case as opposed to the
> > force_sig_info(SIGSEGV, SEND_SIG_PRIV, tsk) that do_general_protection
> > calls.
> 
> Sure.  You could even do it by sending the signal in the emulation
> code and returning true.

Will do.

Thanks!
Ricardo
> 
> --Andy




Re: [PATCH] target/user: Fix use-after-free cmd->se_cmd if the cmd is expired

2017-01-03 Thread Mike Christie
On 01/03/2017 02:46 AM, lixi...@cmss.chinamobile.com wrote:
> From: Xiubo Li 
> 
> This is another use-after-free bug, the crash Call Trace is like:
> [  368.909498] RIP: 0010:[]  []
> memcpy+0x16/0x110
> ..
> [  368.909547] Call Trace:
> [  368.909550]  [] ?gather_data_area+0x109/0x180
> [  368.909552]  [] tcmu_handle_completions+0x2ff/0x450
> [  368.909554]  [] tcmu_irqcontrol+0x15/0x20
> [  368.909555]  [] uio_write+0x7b/0xc0
> [  368.909558]  [] vfs_write+0xbd/0x1e0
> [  368.909559]  [] SyS_write+0x7f/0xe0
> [  368.909562]  [] system_call_fastpath+0x16/0x1b
> 
> Don't free se_cmd of the expired cmds in tcmu_check_expired_cmd(),
> it will be dereferenced by tcmu_handle_completions()--->
> tcmu_handle_completion(), after userspace ever resumes processing.
> 
> It will be freed by tcmu_handle_completion() if userspace ever recovers,
> or tcmu_free_device if not.
> 
> Cc: sta...@vger.kernel.org
> Signed-off-by: Xiubo Li 
> Signed-off-by: Jianfei Hu 
> ---
>  drivers/target/target_core_user.c | 1 -
>  1 file changed, 1 deletion(-)
> 
> diff --git a/drivers/target/target_core_user.c 
> b/drivers/target/target_core_user.c
> index 2e33100..6396581 100644
> --- a/drivers/target/target_core_user.c
> +++ b/drivers/target/target_core_user.c
> @@ -684,7 +684,6 @@ static int tcmu_check_expired_cmd(int id, void *p, void 
> *data)
>  
>   set_bit(TCMU_CMD_BIT_EXPIRED, >flags);
>   target_complete_cmd(cmd->se_cmd, SAM_STAT_CHECK_CONDITION);
> - cmd->se_cmd = NULL;
>  

How did tcmu_handle_completion get to a point it was accessing the
se_cmd if the TCMU_CMD_BIT_EXPIRED bit was set? Were memory accesses out
of order? CPU1 set the TCMU_CMD_BIT_EXPIRED bit then cleared
cmd->se_cmd, but CPU2 copied cmd->se_cmd to se_cmd and saw it was NULL
but did not yet see the TCMU_CMD_BIT_EXPIRED bit set?

It looks like, if you do the above patch, the above function will call
target_complete_cmd and tcmu_handle_completion will call it again, so we
will have a double free issue.


Re: [PATCH] target/user: Fix use-after-free cmd->se_cmd if the cmd is expired

2017-01-03 Thread Mike Christie
On 01/03/2017 02:46 AM, lixi...@cmss.chinamobile.com wrote:
> From: Xiubo Li 
> 
> This is another use-after-free bug, the crash Call Trace is like:
> [  368.909498] RIP: 0010:[]  []
> memcpy+0x16/0x110
> ..
> [  368.909547] Call Trace:
> [  368.909550]  [] ?gather_data_area+0x109/0x180
> [  368.909552]  [] tcmu_handle_completions+0x2ff/0x450
> [  368.909554]  [] tcmu_irqcontrol+0x15/0x20
> [  368.909555]  [] uio_write+0x7b/0xc0
> [  368.909558]  [] vfs_write+0xbd/0x1e0
> [  368.909559]  [] SyS_write+0x7f/0xe0
> [  368.909562]  [] system_call_fastpath+0x16/0x1b
> 
> Don't free se_cmd of the expired cmds in tcmu_check_expired_cmd(),
> it will be dereferenced by tcmu_handle_completions()--->
> tcmu_handle_completion(), after userspace ever resumes processing.
> 
> It will be freed by tcmu_handle_completion() if userspace ever recovers,
> or tcmu_free_device if not.
> 
> Cc: sta...@vger.kernel.org
> Signed-off-by: Xiubo Li 
> Signed-off-by: Jianfei Hu 
> ---
>  drivers/target/target_core_user.c | 1 -
>  1 file changed, 1 deletion(-)
> 
> diff --git a/drivers/target/target_core_user.c 
> b/drivers/target/target_core_user.c
> index 2e33100..6396581 100644
> --- a/drivers/target/target_core_user.c
> +++ b/drivers/target/target_core_user.c
> @@ -684,7 +684,6 @@ static int tcmu_check_expired_cmd(int id, void *p, void 
> *data)
>  
>   set_bit(TCMU_CMD_BIT_EXPIRED, >flags);
>   target_complete_cmd(cmd->se_cmd, SAM_STAT_CHECK_CONDITION);
> - cmd->se_cmd = NULL;
>  

How did tcmu_handle_completion get to a point it was accessing the
se_cmd if the TCMU_CMD_BIT_EXPIRED bit was set? Were memory accesses out
of order? CPU1 set the TCMU_CMD_BIT_EXPIRED bit then cleared
cmd->se_cmd, but CPU2 copied cmd->se_cmd to se_cmd and saw it was NULL
but did not yet see the TCMU_CMD_BIT_EXPIRED bit set?

It looks like, if you do the above patch, the above function will call
target_complete_cmd and tcmu_handle_completion will call it again, so we
will have a double free issue.


Re: [tpmdd-devel] [PATCH RFC 0/4] RFC: in-kernel resource manager

2017-01-03 Thread James Bottomley
On Tue, 2017-01-03 at 21:14 +0200, Jarkko Sakkinen wrote:
> On Tue, Jan 03, 2017 at 08:36:02PM +0200, Jarkko Sakkinen wrote:
> > On Tue, Jan 03, 2017 at 08:14:55AM -0800, James Bottomley wrote:
> > > On Tue, 2017-01-03 at 15:41 +0200, Jarkko Sakkinen wrote:
[...]
> > > > Just thinking how to split up the non-RFC patch set. This was 
> > > > also what Jason suggested if I understood his remark correctly.
> > > 
> > > SUre ... let's get agreement on how we move forward first.  How 
> > > the patch is activated by the user has to be sorted out as well
> > > before it can go in, but it doesn't have to be the first thing we 
> > > do.  I'm happy to continue playing with the interfaces to see 
> > > what works and what doesn't.  My main current feedback is that I 
> > > think separate devices works way better than an ioctl becuase the 
> > > separate devices approach allows differing system policies for 
> > > who accesses the RM backed TPM vs who accesses the raw one.
> > 
> > I think I see your point. I would rather name the device as tpms0 
> > but otherwise I think we could do it in the way you suggest...

I'm not at all wedded to the name tpm0rm, so tpms0 is fine by me.

> I think one more stronger argument for tpms0 is that it keeps tpm0
> intact. Those who don't care about tpms0 don't have to worry about it
> causing regressions. Also it makes it cleaner to put the whole 
> feature under a compilation flag, which would make to me because that 
> gives distributions a choice to not enable in-kernel RM when it first 
> hits the mainline.

I wouldn't go that far: one of the evils we cause for distros is too
many compile options.  In this case, I can't think of a good reason to
have an option to disable this now the feature is segregated on to a
separate device.  If we get a regression, only users of the new device
will notice.  If it's a compile option, this is the same if the distro
enables it and if it's disabled, no user can test out the feature (and
distros eventually get complaints about it not working).

James




Re: [tpmdd-devel] [PATCH RFC 0/4] RFC: in-kernel resource manager

2017-01-03 Thread James Bottomley
On Tue, 2017-01-03 at 21:14 +0200, Jarkko Sakkinen wrote:
> On Tue, Jan 03, 2017 at 08:36:02PM +0200, Jarkko Sakkinen wrote:
> > On Tue, Jan 03, 2017 at 08:14:55AM -0800, James Bottomley wrote:
> > > On Tue, 2017-01-03 at 15:41 +0200, Jarkko Sakkinen wrote:
[...]
> > > > Just thinking how to split up the non-RFC patch set. This was 
> > > > also what Jason suggested if I understood his remark correctly.
> > > 
> > > SUre ... let's get agreement on how we move forward first.  How 
> > > the patch is activated by the user has to be sorted out as well
> > > before it can go in, but it doesn't have to be the first thing we 
> > > do.  I'm happy to continue playing with the interfaces to see 
> > > what works and what doesn't.  My main current feedback is that I 
> > > think separate devices works way better than an ioctl becuase the 
> > > separate devices approach allows differing system policies for 
> > > who accesses the RM backed TPM vs who accesses the raw one.
> > 
> > I think I see your point. I would rather name the device as tpms0 
> > but otherwise I think we could do it in the way you suggest...

I'm not at all wedded to the name tpm0rm, so tpms0 is fine by me.

> I think one more stronger argument for tpms0 is that it keeps tpm0
> intact. Those who don't care about tpms0 don't have to worry about it
> causing regressions. Also it makes it cleaner to put the whole 
> feature under a compilation flag, which would make to me because that 
> gives distributions a choice to not enable in-kernel RM when it first 
> hits the mainline.

I wouldn't go that far: one of the evils we cause for distros is too
many compile options.  In this case, I can't think of a good reason to
have an option to disable this now the feature is segregated on to a
separate device.  If we get a regression, only users of the new device
will notice.  If it's a compile option, this is the same if the distro
enables it and if it's disabled, no user can test out the feature (and
distros eventually get complaints about it not working).

James




<    1   2   3   4   5   6   7   8   9   10   >