[PATCH v2 0/3] net: ethernet: fec: fix refclk enable for SMSC LAN8710/20

2017-11-20 Thread Richard Leitner
From: Richard Leitner 

This patch series fixes the use of the SMSC LAN8710/20 with a Freescale ETH
when the refclk is generated by the FSL.

Changes v2:
- simplify and fix fec_reset_phy function to support multiple calls
- include: linux: phy: harmonize phy_id{,_mask} type
- reset the phy instead of not turning the clock on and off
  (which would have caused a power consumption regression)

Richard Leitner (3):
  net: ethernet: freescale: simplify fec_reset_phy
  include: linux: phy: harmonize phy_id{,_mask} type
  net: ethernet: fec: fix refclk enable for SMSC LAN8710/20

 drivers/net/ethernet/freescale/fec.h  |   4 +
 drivers/net/ethernet/freescale/fec_main.c | 125 --
 include/linux/phy.h   |   2 +-
 3 files changed, 88 insertions(+), 43 deletions(-)

-- 
2.11.0



[PATCH v2 2/3] include: linux: phy: harmonize phy_id{,_mask} type

2017-11-20 Thread Richard Leitner
From: Richard Leitner 

Previously phy_id was u32 and phy_id_mask was unsigned int. As the
phy_id_mask defines the important bits of the phy_id (and is therefore the
same size) these two variables should be the same datatype.

Signed-off-by: Richard Leitner 
---
 include/linux/phy.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/linux/phy.h b/include/linux/phy.h
index dc82a07cb4fd..e00fd9ce3bce 100644
--- a/include/linux/phy.h
+++ b/include/linux/phy.h
@@ -509,7 +509,7 @@ struct phy_driver {
struct mdio_driver_common mdiodrv;
u32 phy_id;
char *name;
-   unsigned int phy_id_mask;
+   u32 phy_id_mask;
u32 features;
u32 flags;
const void *driver_data;
-- 
2.11.0



[PATCH v2 3/3] net: ethernet: fec: fix refclk enable for SMSC LAN8710/20

2017-11-20 Thread Richard Leitner
From: Richard Leitner 

Some PHYs (for example the SMSC LAN8710/LAN8720) doesn't allow turning
the refclk on and off again during operation (according to their
datasheet). Nonetheless exactly this behaviour was introduced for power
saving reasons by commit e8fcfcd5684a ("net: fec: optimize the clock management 
to save power").
Therefore after enabling the refclk we detect if an affected PHY is
attached. If so reset and initialize it again.

For a better understanding here's a outline of the time response of the
clock and reset lines before and after this patch:

  v--fec_probe()  v--fec_enet_open()
  v   v
w/o patch eCLK: ___|
w/o patch nRST: __--
w/o patch CONF: ___XX___

w/  patch eCLK: ___|
w/  patch nRST: __-__---
w/  patch CONF: ___XX_XX
  ^   ^
  ^--fec_probe()  ^--fec_enet_open()

Generally speaking this issue is only relevant if the ref clk for the
PHY is generated by the SoC. In our specific case (PCB) this problem
does occur at about every 10th to 50th POR of an LAN8710 connected to an
i.MX6DL SoC. The typical symptom of this problem is a "swinging"
ethernet link. Similar issues were reported by users of the NXP forum:
https://community.nxp.com/thread/389902
https://community.nxp.com/message/309354
With this patch applied the issue didn't occur for at least a few
hundret PORs of our board.

Fixes: e8fcfcd5684a ("net: fec: optimize the clock management to save power")
Signed-off-by: Richard Leitner 
---
 drivers/net/ethernet/freescale/fec_main.c | 37 +++
 1 file changed, 37 insertions(+)

diff --git a/drivers/net/ethernet/freescale/fec_main.c 
b/drivers/net/ethernet/freescale/fec_main.c
index 06a7caca0cee..52ec9b29a70e 100644
--- a/drivers/net/ethernet/freescale/fec_main.c
+++ b/drivers/net/ethernet/freescale/fec_main.c
@@ -68,6 +68,7 @@
 
 static void set_multicast_list(struct net_device *ndev);
 static void fec_enet_itr_coal_init(struct net_device *ndev);
+static int fec_reset_phy(struct net_device *ndev);
 
 #define DRIVER_NAME"fec"
 
@@ -1833,6 +1834,32 @@ static int fec_enet_mdio_write(struct mii_bus *bus, int 
mii_id, int regnum,
return ret;
 }
 
+static int fec_enet_clk_ref_enable_reset_phy_quirk(struct net_device *ndev)
+{
+   struct phy_device *phy_dev = ndev->phydev;
+   u32 real_phy_id;
+   int ret;
+
+   /* some PHYs need a reset after the refclk was enabled, so we
+* reset them here
+*/
+   if (!phy_dev)
+   return 0;
+   if (!phy_dev->drv)
+   return 0;
+   real_phy_id = phy_dev->drv->phy_id & phy_dev->drv->phy_id_mask;
+   switch (real_phy_id) {
+   case 0x0007c0f0: /* SMSC LAN8710/LAN8720 */
+   ret = fec_reset_phy(ndev);
+   if (ret)
+   return ret;
+   ret = phy_init_hw(phy_dev);
+   if (ret)
+   return ret;
+   }
+   return 0;
+}
+
 static int fec_enet_clk_enable(struct net_device *ndev, bool enable)
 {
struct fec_enet_private *fep = netdev_priv(ndev);
@@ -1862,6 +1889,10 @@ static int fec_enet_clk_enable(struct net_device *ndev, 
bool enable)
ret = clk_prepare_enable(fep->clk_ref);
if (ret)
goto failed_clk_ref;
+
+   ret = fec_enet_clk_ref_enable_reset_phy_quirk(ndev);
+   if (ret)
+   netdev_warn(ndev, "Resetting PHY failed, connection may 
be unstable\n");
} else {
clk_disable_unprepare(fep->clk_ahb);
clk_disable_unprepare(fep->clk_enet_out);
@@ -2860,11 +2891,17 @@ fec_enet_open(struct net_device *ndev)
if (ret)
goto err_enet_mii_probe;
 
+   /* as the PHY is connected now, trigger the reset quirk again */
+   ret = fec_enet_clk_ref_enable_reset_phy_quirk(ndev);
+   if (ret)
+   netdev_warn(ndev, "Resetting PHY failed, connection may be 
unstable\n");
+
if (fep->quirks & FEC_QUIRK_ERR006687)
imx6q_cpuidle_fec_irqs_used();
 
napi_enable(&fep->napi);
phy_start(ndev->phydev);
+
netif_tx_start_all_queues(ndev);
 
device_set_wakeup_enable(&ndev->dev, fep->wol_flag &
-- 
2.11.0



[PATCH v2 1/3] net: ethernet: freescale: simplify fec_reset_phy

2017-11-20 Thread Richard Leitner
From: Richard Leitner 

The fec_reset_phy function allowed only one execution during probeing.
To make it more usable move the dt parsing and gpio allocation to the
probe function. The parameters of the phy reset are added to the
fec_enet_private struct. As a result the fec_reset_phy function may be
called anytime after probe.

One checkpatch.pl warning (too long line) is ignored. This is due to the
fact a string (dt property name) otherwise needs to be split over
multiple lines, which is counterproductive for the readability.

Signed-off-by: Richard Leitner 
---
 drivers/net/ethernet/freescale/fec.h  |  4 ++
 drivers/net/ethernet/freescale/fec_main.c | 88 ---
 2 files changed, 50 insertions(+), 42 deletions(-)

diff --git a/drivers/net/ethernet/freescale/fec.h 
b/drivers/net/ethernet/freescale/fec.h
index 5385074b3b7d..401c4eabf08a 100644
--- a/drivers/net/ethernet/freescale/fec.h
+++ b/drivers/net/ethernet/freescale/fec.h
@@ -539,6 +539,10 @@ struct fec_enet_private {
int pause_flag;
int wol_flag;
u32 quirks;
+   int phy_reset;
+   int phy_reset_duration;
+   int phy_reset_post_delay;
+   boolphy_reset_active_high;
 
struct  napi_struct napi;
int csum_flags;
diff --git a/drivers/net/ethernet/freescale/fec_main.c 
b/drivers/net/ethernet/freescale/fec_main.c
index 610573855213..06a7caca0cee 100644
--- a/drivers/net/ethernet/freescale/fec_main.c
+++ b/drivers/net/ethernet/freescale/fec_main.c
@@ -3212,62 +3212,36 @@ static int fec_enet_init(struct net_device *ndev)
 }
 
 #ifdef CONFIG_OF
-static int fec_reset_phy(struct platform_device *pdev)
+static int fec_reset_phy(struct net_device *ndev)
 {
-   int err, phy_reset;
-   bool active_high = false;
-   int msec = 1, phy_post_delay = 0;
-   struct device_node *np = pdev->dev.of_node;
-
-   if (!np)
-   return 0;
-
-   err = of_property_read_u32(np, "phy-reset-duration", &msec);
-   /* A sane reset duration should not be longer than 1s */
-   if (!err && msec > 1000)
-   msec = 1;
+   struct fec_enet_private *fep = netdev_priv(ndev);
 
-   phy_reset = of_get_named_gpio(np, "phy-reset-gpios", 0);
-   if (phy_reset == -EPROBE_DEFER)
-   return phy_reset;
-   else if (!gpio_is_valid(phy_reset))
+   if (!fep->phy_reset)
return 0;
 
-   err = of_property_read_u32(np, "phy-reset-post-delay", &phy_post_delay);
-   /* valid reset duration should be less than 1s */
-   if (!err && phy_post_delay > 1000)
-   return -EINVAL;
-
-   active_high = of_property_read_bool(np, "phy-reset-active-high");
+   gpio_set_value_cansleep(fep->phy_reset, fep->phy_reset_active_high);
 
-   err = devm_gpio_request_one(&pdev->dev, phy_reset,
-   active_high ? GPIOF_OUT_INIT_HIGH : GPIOF_OUT_INIT_LOW,
-   "phy-reset");
-   if (err) {
-   dev_err(&pdev->dev, "failed to get phy-reset-gpios: %d\n", err);
-   return err;
-   }
-
-   if (msec > 20)
-   msleep(msec);
+   if (fep->phy_reset_duration > 20)
+   msleep(fep->phy_reset_duration);
else
-   usleep_range(msec * 1000, msec * 1000 + 1000);
+   usleep_range(fep->phy_reset_duration * 1000,
+fep->phy_reset_duration * 1000 + 1000);
 
-   gpio_set_value_cansleep(phy_reset, !active_high);
+   gpio_set_value_cansleep(fep->phy_reset, !fep->phy_reset_active_high);
 
-   if (!phy_post_delay)
+   if (!fep->phy_reset_post_delay)
return 0;
 
-   if (phy_post_delay > 20)
-   msleep(phy_post_delay);
+   if (fep->phy_reset_post_delay > 20)
+   msleep(fep->phy_reset_post_delay);
else
-   usleep_range(phy_post_delay * 1000,
-phy_post_delay * 1000 + 1000);
+   usleep_range(fep->phy_reset_post_delay * 1000,
+fep->phy_reset_post_delay * 1000 + 1000);
 
return 0;
 }
 #else /* CONFIG_OF */
-static int fec_reset_phy(struct platform_device *pdev)
+static int fec_reset_phy(struct net_device *ndev)
 {
/*
 * In case of platform probe, the reset has been done
@@ -3400,6 +3374,36 @@ fec_probe(struct platform_device *pdev)
}
fep->phy_node = phy_node;
 
+   fep->phy_reset = of_get_named_gpio(np, "phy-reset-gpios", 0);
+   if (gpio_is_valid(fep->phy_reset)) {
+   ret = of_property_read_u32(np, "phy-reset-duration",
+  &fep->phy_reset_duration);
+   /* A sane reset duration should not be longer than 1s */
+   if (!ret && fep->phy_reset_post_delay > 1000)
+   fep->phy_reset_post_delay = 1;
+
+   ret = of_property_read_u32(np, "phy-reset-post-delay",
+ 

Re: Problem of changing the inline mode or turning on the encapsulation support of Mellanox NIC’s eswitch

2017-11-20 Thread Or Gerlitz
On Mon, Nov 20, 2017 at 5:29 AM, Junxue ZHANG  wrote:
> Hi all,
>
> I encountered a problem when I tried to change the inline mode or turn on the 
> encapsulation support of Mellanox NIC’s eswitch. I wonder if anyone could 
> help me with this. Thanks.
>
> I want to use VXLan with OVS and try to offload the rules in hardware. I 
> don’t know whether it is necessary to turn on the encapsulation support. It 
> would also be of great help if anyone could tell me the correct way to 
> accomplish that. Thanks.

Sounds good.

E-switch vxlan encapsulation offload is supported from ConnectX4-Lx
and onward (CX5, etc)

> ConnectX®-4 EN network interface card, 40GbE dual-port QSFP28, PCIe3.0 x8, 
> tall bracket, ROHS R6

This seems to be CX4 which does not support what you are looking for,
just to be sure, send the output of "lspci -nn | grep -i mellanox"

> Problem:
> —
> I can successfully turned on the switchdev mode of my Mellanox NIC. But when 
> I tried to change the inline-mode or turn on the encapsulation support, it 
> failed.
>
> I can create VFs successfully and use those VFs with switch with link inline 
> mode and encapsulation support off.
>
> How to reproduce:
> —
> Before each of the following steps, I first turn the switchdev mode off and 
> turn on the legacy mode.
>
> 1. Change the inline mode:
>
> $ devlink dev eswitch set pci/:81:00.0 mode switchdev inline-mode network
> devlink answers: Invalid argument

changing inline mode is supported only when all VFs are unbinded


> 2. Turn on the encapsulation:
>
> $ devlink dev eswitch set pci/:81:00.0 mode switchdev encap enable
> devlink answers: Operation not supported

not supported for your HW, as I explained


[PATCH] net: qmi_wwan: add Dell DW5818, DW5819

2017-11-20 Thread Shrirang Bagul
Dell Wireless 5819/5818 devices are re-branded Sierra Wireless MC74
series modems which will by default boot with vid 0x413c and pid's
0x81cf, 0x81d0, 0x81d1,0x81d2. Along with qcserial, these modems support
qmi_wwan on the usb interface #12.

Signed-off-by: Shrirang Bagul 
---
 drivers/net/usb/qmi_wwan.c | 4 
 1 file changed, 4 insertions(+)

diff --git a/drivers/net/usb/qmi_wwan.c b/drivers/net/usb/qmi_wwan.c
index 8d4a6f7cba61..bdf1fae38af2 100644
--- a/drivers/net/usb/qmi_wwan.c
+++ b/drivers/net/usb/qmi_wwan.c
@@ -1234,6 +1234,10 @@ static const struct usb_device_id products[] = {
{QMI_FIXED_INTF(0x413c, 0x81b3, 8)},/* Dell Wireless 5809e Gobi(TM) 
4G LTE Mobile Broadband Card (rev3) */
{QMI_FIXED_INTF(0x413c, 0x81b6, 8)},/* Dell Wireless 5811e */
{QMI_FIXED_INTF(0x413c, 0x81b6, 10)},   /* Dell Wireless 5811e */
+   {QMI_FIXED_INTF(0x413c, 0x81cf, 12)},   /* Dell Wireless 5819 */
+   {QMI_FIXED_INTF(0x413c, 0x81d0, 12)},   /* Dell Wireless 5819 */
+   {QMI_FIXED_INTF(0x413c, 0x81d1, 12)},   /* Dell Wireless 5818 */
+   {QMI_FIXED_INTF(0x413c, 0x81d2, 12)},   /* Dell Wireless 5818 */
{QMI_FIXED_INTF(0x03f0, 0x4e1d, 8)},/* HP lt4111 LTE/EV-DO/HSPA+ 
Gobi 4G Module */
{QMI_FIXED_INTF(0x22de, 0x9061, 3)},/* WeTelecom WPD-600N */
{QMI_FIXED_INTF(0x1e0e, 0x9001, 5)},/* SIMCom 7230E */
-- 
2.14.1



RE: [PATCH v2 1/3] net: ethernet: freescale: simplify fec_reset_phy

2017-11-20 Thread Andy Duan
From: Richard Leitner  Sent: Monday, November 20, 2017 4:34 PM
>The fec_reset_phy function allowed only one execution during probeing.
>To make it more usable move the dt parsing and gpio allocation to the probe
>function. The parameters of the phy reset are added to the fec_enet_private
>struct. As a result the fec_reset_phy function may be called anytime after
>probe.
>
>One checkpatch.pl warning (too long line) is ignored. This is due to the fact a
>string (dt property name) otherwise needs to be split over multiple lines,
>which is counterproductive for the readability.
>
>Signed-off-by: Richard Leitner 
>---

It is better to convert to gpio descriptor, and use dts gpio flag as the gpio 
polarity instead of extra phy_reset_active_high variable.

Regards,
Andy

> drivers/net/ethernet/freescale/fec.h  |  4 ++
> drivers/net/ethernet/freescale/fec_main.c | 88 --
>-
> 2 files changed, 50 insertions(+), 42 deletions(-)
>
>diff --git a/drivers/net/ethernet/freescale/fec.h
>b/drivers/net/ethernet/freescale/fec.h
>index 5385074b3b7d..401c4eabf08a 100644
>--- a/drivers/net/ethernet/freescale/fec.h
>+++ b/drivers/net/ethernet/freescale/fec.h
>@@ -539,6 +539,10 @@ struct fec_enet_private {
>   int pause_flag;
>   int wol_flag;
>   u32 quirks;
>+  int phy_reset;
>+  int phy_reset_duration;
>+  int phy_reset_post_delay;
>+  boolphy_reset_active_high;
>
>   struct  napi_struct napi;
>   int csum_flags;
>diff --git a/drivers/net/ethernet/freescale/fec_main.c
>b/drivers/net/ethernet/freescale/fec_main.c
>index 610573855213..06a7caca0cee 100644
>--- a/drivers/net/ethernet/freescale/fec_main.c
>+++ b/drivers/net/ethernet/freescale/fec_main.c
>@@ -3212,62 +3212,36 @@ static int fec_enet_init(struct net_device *ndev)  }
>
> #ifdef CONFIG_OF
>-static int fec_reset_phy(struct platform_device *pdev)
>+static int fec_reset_phy(struct net_device *ndev)
> {
>-  int err, phy_reset;
>-  bool active_high = false;
>-  int msec = 1, phy_post_delay = 0;
>-  struct device_node *np = pdev->dev.of_node;
>-
>-  if (!np)
>-  return 0;
>-
>-  err = of_property_read_u32(np, "phy-reset-duration", &msec);
>-  /* A sane reset duration should not be longer than 1s */
>-  if (!err && msec > 1000)
>-  msec = 1;
>+  struct fec_enet_private *fep = netdev_priv(ndev);
>
>-  phy_reset = of_get_named_gpio(np, "phy-reset-gpios", 0);
>-  if (phy_reset == -EPROBE_DEFER)
>-  return phy_reset;
>-  else if (!gpio_is_valid(phy_reset))
>+  if (!fep->phy_reset)
>   return 0;
>
>-  err = of_property_read_u32(np, "phy-reset-post-delay",
>&phy_post_delay);
>-  /* valid reset duration should be less than 1s */
>-  if (!err && phy_post_delay > 1000)
>-  return -EINVAL;
>-
>-  active_high = of_property_read_bool(np, "phy-reset-active-high");
>+  gpio_set_value_cansleep(fep->phy_reset, fep-
>>phy_reset_active_high);
>
>-  err = devm_gpio_request_one(&pdev->dev, phy_reset,
>-  active_high ? GPIOF_OUT_INIT_HIGH :
>GPIOF_OUT_INIT_LOW,
>-  "phy-reset");
>-  if (err) {
>-  dev_err(&pdev->dev, "failed to get phy-reset-gpios: %d\n",
>err);
>-  return err;
>-  }
>-
>-  if (msec > 20)
>-  msleep(msec);
>+  if (fep->phy_reset_duration > 20)
>+  msleep(fep->phy_reset_duration);
>   else
>-  usleep_range(msec * 1000, msec * 1000 + 1000);
>+  usleep_range(fep->phy_reset_duration * 1000,
>+   fep->phy_reset_duration * 1000 + 1000);
>
>-  gpio_set_value_cansleep(phy_reset, !active_high);
>+  gpio_set_value_cansleep(fep->phy_reset, !fep-
>>phy_reset_active_high);
>
>-  if (!phy_post_delay)
>+  if (!fep->phy_reset_post_delay)
>   return 0;
>
>-  if (phy_post_delay > 20)
>-  msleep(phy_post_delay);
>+  if (fep->phy_reset_post_delay > 20)
>+  msleep(fep->phy_reset_post_delay);
>   else
>-  usleep_range(phy_post_delay * 1000,
>-   phy_post_delay * 1000 + 1000);
>+  usleep_range(fep->phy_reset_post_delay * 1000,
>+   fep->phy_reset_post_delay * 1000 + 1000);
>
>   return 0;
> }
> #else /* CONFIG_OF */
>-static int fec_reset_phy(struct platform_device *pdev)
>+static int fec_reset_phy(struct net_device *ndev)
> {
>   /*
>* In case of platform probe, the reset has been done @@ -3400,6
>+3374,36 @@ fec_probe(struct platform_device *pdev)
>   }
>   fep->phy_node = phy_node;
>
>+  fep->phy_reset = of_get_named_gpio(np, "phy-reset-gpios", 0);
>+  if (gpio_is_valid(fep->phy_reset)) {
>+  ret = of_property_read_u32(np, "phy-reset-duration",
>+ &fep->phy_reset_duration);
>+  /* A sane r

Re: [PATCH] net: qmi_wwan: add Dell DW5818, DW5819

2017-11-20 Thread Bjørn Mork
Shrirang Bagul  writes:

> Dell Wireless 5819/5818 devices are re-branded Sierra Wireless MC74
> series modems which will by default boot with vid 0x413c and pid's
> 0x81cf, 0x81d0, 0x81d1,0x81d2. Along with qcserial, these modems support
> qmi_wwan on the usb interface #12.

NAK,

Interace #12 is MBIM, as shown by the device descriptors. Please provide
those descriptors and you will see that this interface is clearly a CDC
MBIM class interface.

Yes, I know these modems probe the control protocol so that you can make
QMI work on an MBIM control interface by sending it a QMI request as the
first messsage.  This is still wrong, abusing a quirky firmware
feature.

You need to reconfigure the modem for QMI using the Sierra specific AT
command or QMI request (tunneled in MBIM!) to properly switch it to QMI
mode, which will appear as a vendor specific interface number 8 (and 10
if you enable both QMI functions).





Bjørn


RE: [PATCH v2 3/3] net: ethernet: fec: fix refclk enable for SMSC LAN8710/20

2017-11-20 Thread Andy Duan
From: Richard Leitner  Sent: Monday, November 20, 2017 4:34 PM
>To: f.faine...@gmail.com; Andy Duan ;
>and...@lunn.ch
>Cc: netdev@vger.kernel.org; linux-ker...@vger.kernel.org;
>richard.leit...@skidata.com
>Subject: [PATCH v2 3/3] net: ethernet: fec: fix refclk enable for SMSC
>LAN8710/20
>
>From: Richard Leitner 
>
>Some PHYs (for example the SMSC LAN8710/LAN8720) doesn't allow turning
>the refclk on and off again during operation (according to their datasheet).
>Nonetheless exactly this behaviour was introduced for power saving reasons
>by commit e8fcfcd5684a ("net: fec: optimize the clock management to save
>power").
>Therefore after enabling the refclk we detect if an affected PHY is attached. 
>If
>so reset and initialize it again.
>
>For a better understanding here's a outline of the time response of the clock
>and reset lines before and after this patch:
>
> v--fec_probe()  v--fec_enet_open()
> v   v
>   w/o patch eCLK:
>___|
>   w/o patch nRST: __--
>   w/o patch CONF:
>___XX___
>
>   w/  patch eCLK:
>___|
>   w/  patch nRST: __-__---
>   w/  patch CONF:
>___XX_XX
> ^   ^
> ^--fec_probe()  ^--fec_enet_open()
>
>Generally speaking this issue is only relevant if the ref clk for the PHY is
>generated by the SoC. In our specific case (PCB) this problem does occur at
>about every 10th to 50th POR of an LAN8710 connected to an i.MX6DL SoC.
>The typical symptom of this problem is a "swinging"
>ethernet link. Similar issues were reported by users of the NXP forum:
>   https://emea01.safelinks.protection.outlook.com/?url=https%3A%2F
>%2Fcommunity.nxp.com%2Fthread%2F389902&data=02%7C01%7Cfugang.du
>an%40nxp.com%7C507c7aafbd4744f81ebf08d52ff1acff%7C686ea1d3bc2b4c6f
>a92cd99c5c301635%7C0%7C0%7C636467637399145483&sdata=RNXVGpPrlrcyL
>0SoQl8%2BI0k8Oc8BM0Iwykd1O%2Bjmvcc%3D&reserved=0
>   https://emea01.safelinks.protection.outlook.com/?url=https%3A%2F
>%2Fcommunity.nxp.com%2Fmessage%2F309354&data=02%7C01%7Cfugang.d
>uan%40nxp.com%7C507c7aafbd4744f81ebf08d52ff1acff%7C686ea1d3bc2b4c6
>fa92cd99c5c301635%7C0%7C0%7C636467637399145483&sdata=pjeJEZGuBpb9
>uCMKGr70qa%2FmsNoak6v3nCID2vbNAeg%3D&reserved=0
>With this patch applied the issue didn't occur for at least a few hundret PORs
>of our board.
>
>Fixes: e8fcfcd5684a ("net: fec: optimize the clock management to save
>power")
>Signed-off-by: Richard Leitner 
>---
> drivers/net/ethernet/freescale/fec_main.c | 37
>+++
> 1 file changed, 37 insertions(+)
>
>diff --git a/drivers/net/ethernet/freescale/fec_main.c
>b/drivers/net/ethernet/freescale/fec_main.c
>index 06a7caca0cee..52ec9b29a70e 100644
>--- a/drivers/net/ethernet/freescale/fec_main.c
>+++ b/drivers/net/ethernet/freescale/fec_main.c
>@@ -68,6 +68,7 @@
>
> static void set_multicast_list(struct net_device *ndev);  static void
>fec_enet_itr_coal_init(struct net_device *ndev);
>+static int fec_reset_phy(struct net_device *ndev);
>
> #define DRIVER_NAME   "fec"
>
>@@ -1833,6 +1834,32 @@ static int fec_enet_mdio_write(struct mii_bus *bus,
>int mii_id, int regnum,
>   return ret;
> }
>
>+static int fec_enet_clk_ref_enable_reset_phy_quirk(struct net_device
>+*ndev) {
>+  struct phy_device *phy_dev = ndev->phydev;
>+  u32 real_phy_id;
>+  int ret;
>+
>+  /* some PHYs need a reset after the refclk was enabled, so we
>+   * reset them here
>+   */
>+  if (!phy_dev)
>+  return 0;
>+  if (!phy_dev->drv)
>+  return 0;
>+  real_phy_id = phy_dev->drv->phy_id & phy_dev->drv->phy_id_mask;
>+  switch (real_phy_id) {
>+  case 0x0007c0f0: /* SMSC LAN8710/LAN8720 */

Don't hard code here...
I believe there have many other phys also do such operation, hardcode is 
unacceptable...

And these code can be put into phy_device.c as common interface.

>+  ret = fec_reset_phy(ndev);
>+  if (ret)
>+  return ret;
>+  ret = phy_init_hw(phy_dev);
>+  if (ret)
>+  return ret;
>+  }
>+  return 0;
>+}
>+
> static int fec_enet_clk_enable(struct net_device *ndev, bool enable)  {
>   struct fec_enet_private *fep = netdev_priv(ndev); @@ -1862,6
>+1889,10 @@ static int fec_enet_clk_enable(struct net_device *ndev, bool
>enable)
>   ret = clk_prepare_enable(fep->clk_ref);
>   if (ret)
>   goto failed_clk_ref;
>+
>+  ret = fec_enet_clk_ref_enable_reset_phy_quirk(ndev);
>+  if (ret)
>+  netdev_warn(ndev, "Resetting PHY failed, connect

Re: [PATCH] net: qmi_wwan: add Dell DW5818, DW5819

2017-11-20 Thread Shrirang Bagul
On Mon, 2017-11-20 at 10:41 +0100, Bjørn Mork wrote:
> Shrirang Bagul  writes:
> 
> > Dell Wireless 5819/5818 devices are re-branded Sierra Wireless MC74
> > series modems which will by default boot with vid 0x413c and pid's
> > 0x81cf, 0x81d0, 0x81d1,0x81d2. Along with qcserial, these modems support
> > qmi_wwan on the usb interface #12.
> 
> NAK,
> 
> Interace #12 is MBIM, as shown by the device descriptors. Please provide
> those descriptors and you will see that this interface is clearly a CDC
> MBIM class interface.
> 
> Yes, I know these modems probe the control protocol so that you can make
> QMI work on an MBIM control interface by sending it a QMI request as the
> first messsage.  This is still wrong, abusing a quirky firmware
> feature.
> 
> You need to reconfigure the modem for QMI using the Sierra specific AT
> command or QMI request (tunneled in MBIM!) to properly switch it to QMI
> mode, which will appear as a vendor specific interface number 8 (and 10
> if you enable both QMI functions).
Understood. Needs more work, will resend with fixes.

- Shrirang
> 
> 
> 
> 
> 
> Bjørn

signature.asc
Description: This is a digitally signed message part


Re: [PATCH 1/8] dt-bindings: can: rcar_can: document r8a774[35] can support

2017-11-20 Thread Simon Horman
On Tue, Nov 07, 2017 at 03:10:42PM +, Fabrizio Castro wrote:
> Document "renesas,can-r8a7743" and "renesas,can-r8a7745" compatible
> strings. Since the fallback compatible string ("renesas,rcar-gen2-can")
> activates the right code in the driver, no driver change is needed.
> 
> Signed-off-by: Fabrizio Castro 
> Reviewed-by: Biju Das 

Reviewed-by: Simon Horman 



Re: [PATCH 0/8] Add CAN support to iwg2[02]d

2017-11-20 Thread Simon Horman
On Tue, Nov 07, 2017 at 03:10:41PM +, Fabrizio Castro wrote:
> Hello,
> 
> this series delivers all of the changes necessary to add CAN bus
> support to the:
> * iW-RainboW-G22D SODIMM, and
> * iW-RainboW-G20M-Qseven-RZG1M
> development platforms, including documentation, pinctrl driver, SoC
> specific device trees, and board specific device trees.
> 
> This work has been based and tested on top of:
> renesas-devel-20171106-v4.14-rc8
> 
> Best regards,
> 
> Fabrizio Castro (8):
>   dt-bindings: can: rcar_can: document r8a774[35] can support
>   pinctrl: sh-pfc: r8a7745: Add CAN[01] support
>   ARM: dts: r8a7745: Add CAN[01] SoC support
>   ARM: dts: iwg22d-sodimm: Add can0 support to carrier board
>   ARM: dts: iwg22d-sodimm-dbhd-ca: Add can1 support to HDMI DB
>   ARM: dts: r8a7743: Add CAN[01] SoC support
>   ARM: dts: iwg20d-q7-common: Add can0 support to carrier board
>   ARM: dts: iwg20d-q7-dbcm-ca: Add can1 support to camera DB

Thanks, I have applied the "ARM: dts" patches for inclusion in v4.16.


Re: [PATCH v2 3/3] net: ethernet: fec: fix refclk enable for SMSC LAN8710/20

2017-11-20 Thread Richard Leitner

On 11/20/2017 10:47 AM, Andy Duan wrote:
> From: Richard Leitner  Sent: Monday, November 20, 2017 4:34 
> PM
>> To: f.faine...@gmail.com; Andy Duan ;
>> and...@lunn.ch
>> Cc: netdev@vger.kernel.org; linux-ker...@vger.kernel.org;
>> richard.leit...@skidata.com
>> Subject: [PATCH v2 3/3] net: ethernet: fec: fix refclk enable for SMSC
>> LAN8710/20
>>
>> From: Richard Leitner 
>>
>> Some PHYs (for example the SMSC LAN8710/LAN8720) doesn't allow turning
>> the refclk on and off again during operation (according to their datasheet).
>> Nonetheless exactly this behaviour was introduced for power saving reasons
>> by commit e8fcfcd5684a ("net: fec: optimize the clock management to save
>> power").
>> Therefore after enabling the refclk we detect if an affected PHY is 
>> attached. If
>> so reset and initialize it again.

...

>> +static int fec_enet_clk_ref_enable_reset_phy_quirk(struct net_device
>> +*ndev) {
>> +struct phy_device *phy_dev = ndev->phydev;
>> +u32 real_phy_id;
>> +int ret;
>> +
>> +/* some PHYs need a reset after the refclk was enabled, so we
>> + * reset them here
>> + */
>> +if (!phy_dev)
>> +return 0;
>> +if (!phy_dev->drv)
>> +return 0;
>> +real_phy_id = phy_dev->drv->phy_id & phy_dev->drv->phy_id_mask;
>> +switch (real_phy_id) {
>> +case 0x0007c0f0: /* SMSC LAN8710/LAN8720 */
> 
> Don't hard code here...
> I believe there have many other phys also do such operation, hardcode is 
> unacceptable...
> 
> And these code can be put into phy_device.c as common interface.

Ok. Thank you for the feedback.
So it would be fine to hardcode the affected phy_id's in a common
function in phy_device.c?


Another possible solution that came to my mind is to add a flag called
something like "PHY_RST_AFTER_CLK_EN" to the flags variable in struct
phy_driver. This flag could then be set in the smsc PHY driver for
affected PHYs.

Then instead of comparing the phy_id in the MAC driver this flag could
be checked:

if (phydev->drv->flags & PHY_RST_AFTER_CLK_EN) {
ret = fec_reset_phy(ndev);
...
}

Would checking the flag be OK in fec_main.c?

What would be the "better" approach?

> 
>> +ret = fec_reset_phy(ndev);
>> +if (ret)
>> +return ret;
>> +ret = phy_init_hw(phy_dev);
>> +if (ret)
>> +return ret;
>> +}
>> +return 0;
>> +}
>> +
>> static int fec_enet_clk_enable(struct net_device *ndev, bool enable)  {
>>  struct fec_enet_private *fep = netdev_priv(ndev); @@ -1862,6
>> +1889,10 @@ static int fec_enet_clk_enable(struct net_device *ndev, bool
>> enable)
>>  ret = clk_prepare_enable(fep->clk_ref);
>>  if (ret)
>>  goto failed_clk_ref;
>> +
>> +ret = fec_enet_clk_ref_enable_reset_phy_quirk(ndev);
>> +if (ret)
>> +netdev_warn(ndev, "Resetting PHY failed, connection
>> may be
>> +unstable\n");
>>  } else {
>>  clk_disable_unprepare(fep->clk_ahb);
>>  clk_disable_unprepare(fep->clk_enet_out);
>> @@ -2860,11 +2891,17 @@ fec_enet_open(struct net_device *ndev)
>>  if (ret)
>>  goto err_enet_mii_probe;
>>
>> +/* as the PHY is connected now, trigger the reset quirk again */
>> +ret = fec_enet_clk_ref_enable_reset_phy_quirk(ndev);
>> +if (ret)
>> +netdev_warn(ndev, "Resetting PHY failed, connection may be
>> +unstable\n");
>> +
>>  if (fep->quirks & FEC_QUIRK_ERR006687)
>>  imx6q_cpuidle_fec_irqs_used();
>>
>>  napi_enable(&fep->napi);
>>  phy_start(ndev->phydev);
>> +
> 
> No need blank line here...
>>  netif_tx_start_all_queues(ndev);
>>
>>  device_set_wakeup_enable(&ndev->dev, fep->wol_flag &
>> --
>> 2.11.0


[PATCH ipsec-next] net: xfrm: allow clearing socket xfrm policies.

2017-11-20 Thread Lorenzo Colitti
Currently it is possible to add or update socket policies, but
not clear them. Therefore, once a socket policy has been applied,
the socket cannot be used for unencrypted traffic.

This patch allows (privileged) users to clear socket policies by
passing in a NULL pointer and zero length argument to the
{IP,IPV6}_{IPSEC,XFRM}_POLICY setsockopts. This results in both
the incoming and outgoing policies being cleared.

The simple approach taken in this patch cannot clear socket
policies in only one direction. If desired this could be added
in the future, for example by continuing to pass in a length of
zero (which currently is guaranteed to return EMSGSIZE) and
making the policy be a pointer to an integer that contains one
of the XFRM_POLICY_{IN,OUT} enum values.

An alternative would have been to interpret the length as a
signed integer and use XFRM_POLICY_IN (i.e., 0) to clear the
input policy and -XFRM_POLICY_OUT (i.e., -1) to clear the output
policy.

Tested: https://android-review.googlesource.com/539816
Signed-off-by: Lorenzo Colitti 
---
 net/xfrm/xfrm_policy.c | 2 +-
 net/xfrm/xfrm_state.c  | 7 +++
 2 files changed, 8 insertions(+), 1 deletion(-)

diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index f02b1743b2..d5ef584642 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -1251,7 +1251,7 @@ EXPORT_SYMBOL(xfrm_policy_delete);
 
 int xfrm_sk_policy_insert(struct sock *sk, int dir, struct xfrm_policy *pol)
 {
-   struct net *net = xp_net(pol);
+   struct net *net = sock_net(sk);
struct xfrm_policy *old_pol;
 
 #ifdef CONFIG_XFRM_SUB_POLICY
diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c
index 1f5cee2269..ec0c738c4d 100644
--- a/net/xfrm/xfrm_state.c
+++ b/net/xfrm/xfrm_state.c
@@ -2049,6 +2049,13 @@ int xfrm_user_policy(struct sock *sk, int optname, u8 
__user *optval, int optlen
struct xfrm_mgr *km;
struct xfrm_policy *pol = NULL;
 
+   if (!optval && !optlen) {
+   xfrm_sk_policy_insert(sk, XFRM_POLICY_IN, NULL);
+   xfrm_sk_policy_insert(sk, XFRM_POLICY_OUT, NULL);
+   __sk_dst_reset(sk);
+   return 0;
+   }
+
if (optlen <= 0 || optlen > PAGE_SIZE)
return -EMSGSIZE;
 
-- 
2.15.0.448.gf294e3d99a-goog



Re: [PATCH] net: qmi_wwan: add Dell DW5818, DW5819

2017-11-20 Thread Lars Melin

On 11/20/2017 16:27, Shrirang Bagul wrote:

Dell Wireless 5819/5818 devices are re-branded Sierra Wireless MC74
series modems which will by default boot with vid 0x413c and pid's
0x81cf, 0x81d0, 0x81d1,0x81d2. Along with qcserial, these modems support
qmi_wwan on the usb interface #12.

Signed-off-by: Shrirang Bagul 
---
  drivers/net/usb/qmi_wwan.c | 4 
  1 file changed, 4 insertions(+)

diff --git a/drivers/net/usb/qmi_wwan.c b/drivers/net/usb/qmi_wwan.c
index 8d4a6f7cba61..bdf1fae38af2 100644
--- a/drivers/net/usb/qmi_wwan.c
+++ b/drivers/net/usb/qmi_wwan.c
@@ -1234,6 +1234,10 @@ static const struct usb_device_id products[] = {
{QMI_FIXED_INTF(0x413c, 0x81b3, 8)},/* Dell Wireless 5809e Gobi(TM) 
4G LTE Mobile Broadband Card (rev3) */
{QMI_FIXED_INTF(0x413c, 0x81b6, 8)},/* Dell Wireless 5811e */
{QMI_FIXED_INTF(0x413c, 0x81b6, 10)},   /* Dell Wireless 5811e */
+   {QMI_FIXED_INTF(0x413c, 0x81cf, 12)},   /* Dell Wireless 5819 */
+   {QMI_FIXED_INTF(0x413c, 0x81d0, 12)},   /* Dell Wireless 5819 */
+   {QMI_FIXED_INTF(0x413c, 0x81d1, 12)},   /* Dell Wireless 5818 */
+   {QMI_FIXED_INTF(0x413c, 0x81d2, 12)},   /* Dell Wireless 5818 */
{QMI_FIXED_INTF(0x03f0, 0x4e1d, 8)},/* HP lt4111 LTE/EV-DO/HSPA+ 
Gobi 4G Module */
{QMI_FIXED_INTF(0x22de, 0x9061, 3)},/* WeTelecom WPD-600N */
{QMI_FIXED_INTF(0x1e0e, 0x9001, 5)},/* SIMCom 7230E */




NAK

413c:81cf and 413c:81d1 do not have a net interface, they only have a 
single serial interface (QDL) for firmware update.
Please do not add usb id's for which you have not confirmed the 
interface composition.


br
Lars



RE: [PATCH v2 3/3] net: ethernet: fec: fix refclk enable for SMSC LAN8710/20

2017-11-20 Thread Andy Duan
From: Richard Leitner  Sent: Monday, November 20, 
2017 5:57 PM
>To: Andy Duan ; f.faine...@gmail.com;
>and...@lunn.ch
>Cc: Richard Leitner ; netdev@vger.kernel.org; linux-
>ker...@vger.kernel.org
>Subject: Re: [PATCH v2 3/3] net: ethernet: fec: fix refclk enable for SMSC
>LAN8710/20
>
>
>On 11/20/2017 10:47 AM, Andy Duan wrote:
>> From: Richard Leitner  Sent: Monday, November 20, 2017
>> 4:34 PM
>>> To: f.faine...@gmail.com; Andy Duan ;
>>> and...@lunn.ch
>>> Cc: netdev@vger.kernel.org; linux-ker...@vger.kernel.org;
>>> richard.leit...@skidata.com
>>> Subject: [PATCH v2 3/3] net: ethernet: fec: fix refclk enable for
>>> SMSC
>>> LAN8710/20
>>>
>>> From: Richard Leitner 
>>>
>>> Some PHYs (for example the SMSC LAN8710/LAN8720) doesn't allow
>>> turning the refclk on and off again during operation (according to their
>datasheet).
>>> Nonetheless exactly this behaviour was introduced for power saving
>>> reasons by commit e8fcfcd5684a ("net: fec: optimize the clock
>>> management to save power").
>>> Therefore after enabling the refclk we detect if an affected PHY is
>>> attached. If so reset and initialize it again.
>
>...
>
>>> +static int fec_enet_clk_ref_enable_reset_phy_quirk(struct net_device
>>> +*ndev) {
>>> +   struct phy_device *phy_dev = ndev->phydev;
>>> +   u32 real_phy_id;
>>> +   int ret;
>>> +
>>> +   /* some PHYs need a reset after the refclk was enabled, so we
>>> +* reset them here
>>> +*/
>>> +   if (!phy_dev)
>>> +   return 0;
>>> +   if (!phy_dev->drv)
>>> +   return 0;
>>> +   real_phy_id = phy_dev->drv->phy_id & phy_dev->drv->phy_id_mask;
>>> +   switch (real_phy_id) {
>>> +   case 0x0007c0f0: /* SMSC LAN8710/LAN8720 */
>>
>> Don't hard code here...
>> I believe there have many other phys also do such operation, hardcode is
>unacceptable...
>>
>> And these code can be put into phy_device.c as common interface.
>
>Ok. Thank you for the feedback.
>So it would be fine to hardcode the affected phy_id's in a common function in
>phy_device.c?
>
>
>Another possible solution that came to my mind is to add a flag called
>something like "PHY_RST_AFTER_CLK_EN" to the flags variable in struct
>phy_driver. This flag could then be set in the smsc PHY driver for affected
>PHYs.
>
>Then instead of comparing the phy_id in the MAC driver this flag could be
>checked:
>
>if (phydev->drv->flags & PHY_RST_AFTER_CLK_EN) {
>ret = fec_reset_phy(ndev);
>...
>}
>
>Would checking the flag be OK in fec_main.c?

Yes, it is better than previous solution.
But add new common API in phy_device.c is much better like: 
1. add a flag called "PHY_RST_AFTER_CLK_EN" to the flags variable in struct 
phy_driver,  all phy driver that need reset can set the flag.
2. add new common api interface phy_reset_after_clk_enable() in phy_device.c 
driver
3. add reset gpio descriptor for common phy device driver. 
4. then any mac driver can directly call the common interface 
.phy_reset_after_clk_enable().

That is only my suggestion, maybe there have better idea.
Thanks.

>
>What would be the "better" approach?
>
>>
>>> +   ret = fec_reset_phy(ndev);
>>> +   if (ret)
>>> +   return ret;
>>> +   ret = phy_init_hw(phy_dev);
>>> +   if (ret)
>>> +   return ret;
>>> +   }
>>> +   return 0;
>>> +}
>>> +
>>> static int fec_enet_clk_enable(struct net_device *ndev, bool enable)  {
>>> struct fec_enet_private *fep = netdev_priv(ndev); @@ -1862,6
>>> +1889,10 @@ static int fec_enet_clk_enable(struct net_device *ndev,
>>> +bool
>>> enable)
>>> ret = clk_prepare_enable(fep->clk_ref);
>>> if (ret)
>>> goto failed_clk_ref;
>>> +
>>> +   ret = fec_enet_clk_ref_enable_reset_phy_quirk(ndev);
>>> +   if (ret)
>>> +   netdev_warn(ndev, "Resetting PHY failed, connection
>>> may be
>>> +unstable\n");
>>> } else {
>>> clk_disable_unprepare(fep->clk_ahb);
>>> clk_disable_unprepare(fep->clk_enet_out);
>>> @@ -2860,11 +2891,17 @@ fec_enet_open(struct net_device *ndev)
>>> if (ret)
>>> goto err_enet_mii_probe;
>>>
>>> +   /* as the PHY is connected now, trigger the reset quirk again */
>>> +   ret = fec_enet_clk_ref_enable_reset_phy_quirk(ndev);
>>> +   if (ret)
>>> +   netdev_warn(ndev, "Resetting PHY failed, connection may be
>>> +unstable\n");
>>> +
>>> if (fep->quirks & FEC_QUIRK_ERR006687)
>>> imx6q_cpuidle_fec_irqs_used();
>>>
>>> napi_enable(&fep->napi);
>>> phy_start(ndev->phydev);
>>> +
>>
>> No need blank line here...
>>> netif_tx_start_all_queues(ndev);
>>>
>>> device_set_wakeup_enable(&ndev->dev, fep->wol_flag &
>>> --
>>> 2.11.0


[PATCH] xen-netfront: remove warning when unloading module

2017-11-20 Thread Eduardo Otubo
When unloading module xen_netfront from guest, dmesg would output
warning messages like below:

  [  105.236836] xen:grant_table: WARNING: g.e. 0x903 still in use!
  [  105.236839] deferring g.e. 0x903 (pfn 0x35805)

This problem relies on netfront and netback being out of sync. By the time
netfront revokes the g.e.'s netback didn't have enough time to free all of
them, hence displaying the warnings on dmesg.

The trick here is to make netfront to wait until netback frees all the g.e.'s
and only then continue to cleanup for the module removal, and this is done by
manipulating both device states.

Signed-off-by: Eduardo Otubo 
---
 drivers/net/xen-netfront.c | 11 +++
 1 file changed, 11 insertions(+)

diff --git a/drivers/net/xen-netfront.c b/drivers/net/xen-netfront.c
index 8b8689c6d887..b948e2a1ce40 100644
--- a/drivers/net/xen-netfront.c
+++ b/drivers/net/xen-netfront.c
@@ -2130,6 +2130,17 @@ static int xennet_remove(struct xenbus_device *dev)
 
dev_dbg(&dev->dev, "%s\n", dev->nodename);
 
+   xenbus_switch_state(dev, XenbusStateClosing);
+   while (xenbus_read_driver_state(dev->otherend) != XenbusStateClosing){
+   cpu_relax();
+   schedule();
+   }
+   xenbus_switch_state(dev, XenbusStateClosed);
+   while (dev->xenbus_state != XenbusStateClosed){
+   cpu_relax();
+   schedule();
+   }
+
xennet_disconnect_backend(info);
 
unregister_netdev(info->netdev);
-- 
2.13.6



Re: [PATCH] xen-netfront: remove warning when unloading module

2017-11-20 Thread Wei Liu
CC netfront maintainers.

On Mon, Nov 20, 2017 at 11:41:09AM +0100, Eduardo Otubo wrote:
> When unloading module xen_netfront from guest, dmesg would output
> warning messages like below:
> 
>   [  105.236836] xen:grant_table: WARNING: g.e. 0x903 still in use!
>   [  105.236839] deferring g.e. 0x903 (pfn 0x35805)
> 
> This problem relies on netfront and netback being out of sync. By the time
> netfront revokes the g.e.'s netback didn't have enough time to free all of
> them, hence displaying the warnings on dmesg.
> 
> The trick here is to make netfront to wait until netback frees all the g.e.'s
> and only then continue to cleanup for the module removal, and this is done by
> manipulating both device states.
> 
> Signed-off-by: Eduardo Otubo 
> ---
>  drivers/net/xen-netfront.c | 11 +++
>  1 file changed, 11 insertions(+)
> 
> diff --git a/drivers/net/xen-netfront.c b/drivers/net/xen-netfront.c
> index 8b8689c6d887..b948e2a1ce40 100644
> --- a/drivers/net/xen-netfront.c
> +++ b/drivers/net/xen-netfront.c
> @@ -2130,6 +2130,17 @@ static int xennet_remove(struct xenbus_device *dev)
>  
>   dev_dbg(&dev->dev, "%s\n", dev->nodename);
>  
> + xenbus_switch_state(dev, XenbusStateClosing);
> + while (xenbus_read_driver_state(dev->otherend) != XenbusStateClosing){
> + cpu_relax();
> + schedule();
> + }
> + xenbus_switch_state(dev, XenbusStateClosed);
> + while (dev->xenbus_state != XenbusStateClosed){
> + cpu_relax();
> + schedule();
> + }
> +
>   xennet_disconnect_backend(info);
>  
>   unregister_netdev(info->netdev);
> -- 
> 2.13.6
> 


Re: [PATCH net-next 1/1] net: dsa: microchip: Add Microchip KSZ8895 DSA driver

2017-11-20 Thread Pavel Machek
Hi!

> From: Tristram Ha 
> 
> Add Microchip KSZ8895 DSA driver.
> 
> Signed-off-by: Tristram Ha 
> Reviewed-by: Woojung Huh 

Thanks for patches. I installed whole series on top of net-next.

Hardware is:

root@miro:~# cat /proc/cpuinfo
model name   : ARM926EJ-S rev 5 (v5l)
Hardware  : Freescale MXS (Device Tree)

I added devicetree chunks, and enabled DSA in the config. It seems
switch is detected:

[4.775934] Micrel KSZ8051 dsa-0.0:00: attached PHY driver [Micrel
KSZ8051] (mii_bus:phy_addr=dsa-0.0:00, irq=POLL)
[4.885952] Micrel KSZ8051 dsa-0.0:01: attached PHY driver [Micrel
KSZ8051] (mii_bus:phy_addr=dsa-0.0:01, irq=POLL)
[4.995934] Micrel KSZ8051 dsa-0.0:02: attached PHY driver [Micrel
KSZ8051] (mii_bus:phy_addr=dsa-0.0:02, irq=POLL)
[5.011484] DSA: tree 0 setup

root@miro:~# ifconfig lan3 192.168.20.103 netmask 255.255.0.0 up
[  131.196667] IPv6: ADDRCONF(NETDEV_UP): lan3: link is not ready
root@miro:~# [  132.225863] ksz8895-switch spi2.0 lan3: Link is Up -
100Mbps/Full - flow control rx/tx
[  132.233939] IPv6: ADDRCONF(NETDEV_CHANGE): lan3: link becomes ready

root@miro:~# ping 192.168.1.1
PING 192.168.1.1 (192.168.1.1): 56 data bytes
^C
--- 192.168.1.1 ping statistics ---
7 packets transmitted, 0 packets received, 100% packet loss
root@miro:~# ifconfig [  149.904234] random: crng init done

But packets do not go through, and there is nothing helpful in
dmesg. Dts part is:

spi@0 {
compatible = "microchip,ksz8895";
spi-max-frequency = <2500>;
reg = <0>;
// reset-gpios = <&gpio2 8 0>;
status = "okay";

spi-cpha;
spi-cpol;
   ports {
 #address-cells = <1>;
 #size-cells = <0>;
 port@0 {
reg = <0>;
label = "lan1";
 };
 port@1 {
reg = <1>;
label = "lan2";
 };
 port@2 {
reg = <2>;
label = "lan3";
 };
 port@4 {
reg = <4>;
label = "cpu";
ethernet = <&mac0>;
fixed-link {
   speed = <100>;
   full-duplex;
};
 };
   };
};

I went back to my version of dsa patches, and test above works as
expected.

Pavel
-- 
(english) http://www.livejournal.com/~pavelmachek
(cesky, pictures) 
http://atrey.karlin.mff.cuni.cz/~pavel/picture/horses/blog.html


signature.asc
Description: Digital signature


RE: [PATCH] xen-netfront: remove warning when unloading module

2017-11-20 Thread Paul Durrant
> -Original Message-
> From: Eduardo Otubo [mailto:ot...@redhat.com]
> Sent: 20 November 2017 10:41
> To: xen-de...@lists.xenproject.org
> Cc: netdev@vger.kernel.org; Paul Durrant ; Wei
> Liu ; linux-ker...@vger.kernel.org;
> vkuzn...@redhat.com; cav...@redhat.com; che...@redhat.com;
> mga...@redhat.com; Eduardo Otubo 
> Subject: [PATCH] xen-netfront: remove warning when unloading module
> 
> When unloading module xen_netfront from guest, dmesg would output
> warning messages like below:
> 
>   [  105.236836] xen:grant_table: WARNING: g.e. 0x903 still in use!
>   [  105.236839] deferring g.e. 0x903 (pfn 0x35805)
> 
> This problem relies on netfront and netback being out of sync. By the time
> netfront revokes the g.e.'s netback didn't have enough time to free all of
> them, hence displaying the warnings on dmesg.
> 
> The trick here is to make netfront to wait until netback frees all the g.e.'s
> and only then continue to cleanup for the module removal, and this is done
> by
> manipulating both device states.
> 
> Signed-off-by: Eduardo Otubo 
> ---
>  drivers/net/xen-netfront.c | 11 +++
>  1 file changed, 11 insertions(+)
> 
> diff --git a/drivers/net/xen-netfront.c b/drivers/net/xen-netfront.c
> index 8b8689c6d887..b948e2a1ce40 100644
> --- a/drivers/net/xen-netfront.c
> +++ b/drivers/net/xen-netfront.c
> @@ -2130,6 +2130,17 @@ static int xennet_remove(struct xenbus_device
> *dev)
> 
>   dev_dbg(&dev->dev, "%s\n", dev->nodename);
> 
> + xenbus_switch_state(dev, XenbusStateClosing);
> + while (xenbus_read_driver_state(dev->otherend) !=
> XenbusStateClosing){
> + cpu_relax();
> + schedule();
> + }
> + xenbus_switch_state(dev, XenbusStateClosed);
> + while (dev->xenbus_state != XenbusStateClosed){
> + cpu_relax();
> + schedule();
> + }
> +

Waitiing for closing should be ok but waiting for closed is risky. As soon as a 
backend is in the closed state then a toolstack can completely remove the 
backend xenstore area, resulting a state of XenbusStateUnknown, which would 
cause your second loop to spin forever.

  Paul

>   xennet_disconnect_backend(info);
> 
>   unregister_netdev(info->netdev);
> --
> 2.13.6



Re: [PATCH 2/2] ip6_tunnel: pass tun_dst arg from ip6_tnl_rcv() to __ip6_tnl_rcv()

2017-11-20 Thread Alexey Kodanev
On 11/19/2017 06:22 AM, David Miller wrote:
> From: Alexey Kodanev 
> Date: Fri, 17 Nov 2017 19:16:18 +0300
>
>> Otherwise tun_dst argument is unused there. Currently, ip6_tnl_rcv()
>> invoked with tun_dst set to NULL, so there is no actual functional
>> changes introduced in this patch.
> Oh yes there is a functional change, becaue __ip6_tnl_rcv() is also
> used by ipxip6_rcv() which can pass a non-NULL tnl_dst.

The patch is not changing __ip6_tnl_rcv(), only ip6_tnl_rcv() wrapper.

Thanks,
Alexey



Re: [PATCH] xen-netfront: remove warning when unloading module

2017-11-20 Thread Juergen Gross
On 20/11/17 11:49, Wei Liu wrote:
> CC netfront maintainers.
> 
> On Mon, Nov 20, 2017 at 11:41:09AM +0100, Eduardo Otubo wrote:
>> When unloading module xen_netfront from guest, dmesg would output
>> warning messages like below:
>>
>>   [  105.236836] xen:grant_table: WARNING: g.e. 0x903 still in use!
>>   [  105.236839] deferring g.e. 0x903 (pfn 0x35805)
>>
>> This problem relies on netfront and netback being out of sync. By the time
>> netfront revokes the g.e.'s netback didn't have enough time to free all of
>> them, hence displaying the warnings on dmesg.
>>
>> The trick here is to make netfront to wait until netback frees all the g.e.'s
>> and only then continue to cleanup for the module removal, and this is done by
>> manipulating both device states.
>>
>> Signed-off-by: Eduardo Otubo 
>> ---
>>  drivers/net/xen-netfront.c | 11 +++
>>  1 file changed, 11 insertions(+)
>>
>> diff --git a/drivers/net/xen-netfront.c b/drivers/net/xen-netfront.c
>> index 8b8689c6d887..b948e2a1ce40 100644
>> --- a/drivers/net/xen-netfront.c
>> +++ b/drivers/net/xen-netfront.c
>> @@ -2130,6 +2130,17 @@ static int xennet_remove(struct xenbus_device *dev)
>>  
>>  dev_dbg(&dev->dev, "%s\n", dev->nodename);
>>  
>> +xenbus_switch_state(dev, XenbusStateClosing);
>> +while (xenbus_read_driver_state(dev->otherend) != XenbusStateClosing){
>> +cpu_relax();
>> +schedule();
>> +}
>> +xenbus_switch_state(dev, XenbusStateClosed);
>> +while (dev->xenbus_state != XenbusStateClosed){
>> +cpu_relax();
>> +schedule();
>> +}

I really don't like the busy waits.

Can't you use e.g. a wait queue and wait_event_interruptible() instead?

BTW: what happens if the device is already in closed state if you enter
xennet_remove()? In case this is impossible, please add a comment to
indicate you've thought about that case.

Other than that: you should run ./scripts/checkpatch.p1 against your
patch to avoid common style problems.


Juergen



Re: Bug in socket(7) man page

2017-11-20 Thread Michael Kerrisk (man-pages)
[CC widended]

Tobias,

On 7 August 2017 at 13:53, Tobias Klausmann  wrote:
> Hi!
>
> This bug pertains to the manpage as visible on man7.org right
> now.
>
> The socket(7) man page has this paragraph:
>
>SO_RXQ_OVFL (since Linux 2.6.33)
>   Indicates that an unsigned 32-bit value ancillary message 
> (cmsg) should be attached to
>   received skbs indicating the number of packets dropped by the 
> socket between the  last
>   received packet and this received packet.
>
> The second half is wrong: the counter (internally,
> SOCK_SKB_CB(skb)->dropcount is *not* reset after every packet.
> That is, it is a proper counter, not a gauge, in monitoring
> parlance.
>
> A better version of that paragraph:
>
>SO_RXQ_OVFL (since Linux 2.6.33)
>   Indicates that an unsigned 32-bit value ancillary message 
> (cmsg) should be attached to
>   received skbs indicating the number of packets dropped by the 
> socket since its
>   creation.

Thanks for the report. See also my reply to Petr in just a moment.
I've taken your suggested text change.

Cheers,

Michael


-- 
Michael Kerrisk
Linux man-pages maintainer; http://www.kernel.org/doc/man-pages/
Linux/UNIX System Programming Training: http://man7.org/training/


Re: Incorrect behaviour or documentation problem of SO_RXQ_OVFL

2017-11-20 Thread Michael Kerrisk (man-pages)
[Adding Neil, who wrote the original text. Maybe he has also some
suggested improvement.]

Hello Petr and Tobias,

Thank you both for your reports about the incorrect documentation. See below.

On 15 November 2017 at 16:14, Petr Malat  wrote:
> Hi!
> Generic SO_RXQ_OVFL helpers sock_skb_set_dropcount() and sock_recv_drops()
> implements returning of sk->sk_drops (the total number of dropped packets),
> although the documentation says the number of dropped packets since the
> last received one should be returned (quoting the current socket.7):
>   SO_RXQ_OVFL (since Linux 2.6.33)
>   Indicates that an unsigned 32-bit value ancillary message (cmsg)
>   should be attached to received skbs indicating the number of packets
>   dropped by the socket between the last received packet and this
>   received packet.
>
> I assume the documentation needs to be updated, as fixing this in the
> code could break programs depending on the current behavior, although
> the formerly planned functionality seems to be more usefull.
>
> The problem can be revealed with the following program:
>
> #include 
> #include 
> #include 
> #include 
> #include 
> #include 
> #include 
> #include 
>
> int extract_drop(struct msghdr *msg)
> {
> struct cmsghdr *cmsg;
> int rtn;
>
> for (cmsg = CMSG_FIRSTHDR(msg); cmsg; cmsg = CMSG_NXTHDR(msg,cmsg)) {
> if (cmsg->cmsg_level == SOL_SOCKET &&
> cmsg->cmsg_type == SO_RXQ_OVFL) {
> memcpy(&rtn, CMSG_DATA(cmsg), sizeof rtn);
> return rtn;
> }
> }
> return -1;
> }
>
> int main(int argc, char *argv[])
> {
> struct sockaddr_in addr = { .sin_family = AF_INET };
> char msg[48*1024], cmsgbuf[256];
> struct iovec iov = { .iov_base = msg, .iov_len = sizeof msg };
> int sk1, sk2, i, one = 1;
>
> sk1 = socket(PF_INET, SOCK_DGRAM, IPPROTO_UDP);
> sk2 = socket(PF_INET, SOCK_DGRAM, IPPROTO_UDP);
>
> inet_pton(AF_INET, "127.0.0.1", &addr.sin_addr);
> addr.sin_port = htons(5);
>
> bind(sk1, (struct sockaddr*)&addr, sizeof addr);
> connect(sk2, (struct sockaddr*)&addr, sizeof addr);
>
> // Kernel doubles this limit, but it accounts also the SKB overhead,
> // but it receives as long as there is at least 1 byte free.
> i = sizeof msg;
> setsockopt(sk1, SOL_SOCKET, SO_RCVBUF, &i, sizeof i);
> setsockopt(sk1, SOL_SOCKET, SO_RXQ_OVFL, &one, sizeof one);
>
> for (i = 0; i < 4; i++) {
> int rtn;
>
> send(sk2, msg, sizeof msg, 0);
> send(sk2, msg, sizeof msg, 0);
> send(sk2, msg, sizeof msg, 0);
>
> do {
> struct msghdr msghdr = {
> .msg_iov = &iov, .msg_iovlen = 1,
> .msg_control = &cmsgbuf,
> .msg_controllen = sizeof cmsgbuf };
> rtn = recvmsg(sk1, &msghdr, MSG_DONTWAIT);
> if (rtn > 0) {
> printf("rtn: %d drop %d\n", rtn,
> extract_drop(&msghdr));
> } else {
> printf("rtn: %d\n", rtn);
> }
> } while (rtn > 0);
> }
>
> return 0;
> }
>
> which prints
>   rtn: 49152 drop -1
>   rtn: 49152 drop -1
>   rtn: -1
>   rtn: 49152 drop 1
>   rtn: 49152 drop 1
>   rtn: -1
>   rtn: 49152 drop 2
>   rtn: 49152 drop 2
>   rtn: -1
>   rtn: 49152 drop 3
>   rtn: 49152 drop 3
>   rtn: -1
> although it should print (according to the documentation):
>   rtn: 49152 drop 0
>   rtn: 49152 drop 0
>   rtn: -1
>   rtn: 49152 drop 1
>   rtn: 49152 drop 0
>   rtn: -1
>   rtn: 49152 drop 1
>   rtn: 49152 drop 0
>   rtn: -1
>   rtn: 49152 drop 1
>   rtn: 49152 drop 0
>   rtn: -1
>
> Please keep me on To:/CC: as I'm not on the list.

Thanks for the test program. Tobias reported the same issue, and I've
applied his suggested change to the page. (See below.)

Cheers,

Michael

diff --git a/man7/socket.7 b/man7/socket.7
index 79966a6fd..1a2cfe9cc 100644
--- a/man7/socket.7
+++ b/man7/socket.7
@@ -881,8 +881,7 @@ compete to receive datagrams on the same socket.
 .\" commit 3b885787ea4112eaa80945999ea0901bf742707f
 Indicates that an unsigned 32-bit value ancillary message (cmsg)
 should be attached to received skbs indicating
-the number of packets dropped by the socket between
-the last received packet and this received packet.
+the number of packets dropped by the socket since its creation.
 .TP
 .B SO_SNDBUF
 Sets or gets the maximum socket send buffer in bytes.


-- 
Michael Kerrisk
Linux man-pages maintainer; http://www.kernel.org/doc/man-pages/
Linux/UNIX System Programming Training: http://man7.org/training/


Re: [RFC PATCH net-next 0/2] Configuring PFC stall prevention via ethtool

2017-11-20 Thread Eran Ben Elisha
On Thu, Nov 16, 2017 at 5:42 PM, Andrew Lunn  wrote:
>> I don't like adding another ethtool_ops callback tightly tied to the
>> structures passed via ioctl() but when I started to think what to
>> suggest as an alternative, I started to wonder if it is really necessary
>> to add a new ethtool command at all. Couldn't this be handled as
>> a tunable?
>
> I agree with Michal here.
>
> And as he pointed out, there does not need to be a 1:1 mapping between
> ethtool(1) and the kAPI. I suggest extending the existing -a option,
> and have it make two system calls if needed.
>
> Andrew

Sound good to me. We will follow this suggestion to extend -a using
the tunable op.
In addition, we will come up with new API to use timeouts  and on/off
instead of auto/default.

Eran


Re: JOIN_ANYCAST breakage w. "net: ipv6: put host and anycast routes on device with address"

2017-11-20 Thread Florian Westphal
David Ahern  wrote:
> On 11/14/17 10:36 AM, Florian Westphal wrote:
> > Hi David
> > 
> > This test program no longer works with 4.14
> > (recvfrom: Resource temporarily unavailable)
> > 
> > after reverting commit
> > 4832c30d5458387ff2533ff66fbde26ad8bb5a2d
> > (net: ipv6: put host and anycast routes on device with address)
> > 
> > it will work again ("OK").
> > 
> > Could you please have a look at this?
> > 
> 
> This restores the previous behavior:
> 
> diff --git a/net/ipv6/route.c b/net/ipv6/route.c
> index 05eb7bc36156..1c29d9bcedc3 100644
> --- a/net/ipv6/route.c
> +++ b/net/ipv6/route.c
> @@ -1019,7 +1019,7 @@ static struct net_device
> *ip6_rt_get_dev_rcu(struct rt6_info *rt)
>  {
> struct net_device *dev = rt->dst.dev;
> 
> -   if (rt->rt6i_flags & RTF_LOCAL) {
> +   if (rt->rt6i_flags & (RTF_LOCAL | RTF_ANYCAST)) {
> /* for copies of local routes, dst->dev needs to be the
>  * device if it is a master device, the master device if
>  * device is enslaved, and the loopback as the default

Looks like it, thanks David!


Re: [PATCH net] sctp: report SCTP_ERROR_INV_STRM as cpu endian

2017-11-20 Thread Marcelo Ricardo Leitner
On Sun, Nov 19, 2017 at 02:20:04AM +0800, Xin Long wrote:
> On Fri, Nov 17, 2017 at 11:04 PM, Marcelo Ricardo Leitner
>  wrote:
> > On Fri, Nov 17, 2017 at 02:15:02PM +0800, Xin Long wrote:
> >> rfc6458 demands the send_error in SCTP_SEND_FAILED_EVENT should
> >> be in cpu endian, while SCTP_ERROR_INV_STRM is in big endian.
> >>
> >> This issue is there since very beginning, Eric noticed it by
> >> running 'make C=2 M=net/sctp/'.
> >>
> >> This patch is to convert it before reporting it.
> >
> > Unfortunatelly we can't fix this as this will break UAPI. It will
> > break applications that are currently matching on the current value.
> yes, you're right, seems all other send_errors also come in big endian now.
> 
> what do you think of using:
> sctp_chunk_fail(ch, (__force __u16)SCTP_ERROR_INV_STRM); or
> 
> sctp_chunk_fail(ch, (__force __u32)SCTP_ERROR_INV_STRM);
> 
> to avoid this warning ?

It's good. The last one is more interesting as it matches function
prototype already.

  Marcelo

> 
> >
> >>
> >> Reported-by: Eric Dumazet 
> >> Signed-off-by: Xin Long 
> >> ---
> >>  net/sctp/stream.c | 2 +-
> >>  1 file changed, 1 insertion(+), 1 deletion(-)
> >>
> >> diff --git a/net/sctp/stream.c b/net/sctp/stream.c
> >> index a11db21..f86ceee 100644
> >> --- a/net/sctp/stream.c
> >> +++ b/net/sctp/stream.c
> >> @@ -64,7 +64,7 @@ static void sctp_stream_outq_migrate(struct sctp_stream 
> >> *stream,
> >>*/
> >>
> >>   /* Mark as failed send. */
> >> - sctp_chunk_fail(ch, SCTP_ERROR_INV_STRM);
> >> + sctp_chunk_fail(ch, be16_to_cpu(SCTP_ERROR_INV_STRM));
> >>   if (asoc->peer.prsctp_capable &&
> >>   SCTP_PR_PRIO_ENABLED(ch->sinfo.sinfo_flags))
> >>   asoc->sent_cnt_removable--;
> >> --
> >> 2.1.0
> >>


Re: [PATCH v2 3/3] net: ethernet: fec: fix refclk enable for SMSC LAN8710/20

2017-11-20 Thread Richard Leitner
On 11/20/2017 11:35 AM, Andy Duan wrote:
> From: Richard Leitner  Sent: Monday, November 
> 20, 2017 5:57 PM
>> To: Andy Duan ; f.faine...@gmail.com;
>> and...@lunn.ch
>> Cc: Richard Leitner ; netdev@vger.kernel.org; linux-
>> ker...@vger.kernel.org
>> Subject: Re: [PATCH v2 3/3] net: ethernet: fec: fix refclk enable for SMSC
>> LAN8710/20
>>
>>
>> On 11/20/2017 10:47 AM, Andy Duan wrote:
>>> From: Richard Leitner  Sent: Monday, November 20, 2017
>>> 4:34 PM
 To: f.faine...@gmail.com; Andy Duan ;
 and...@lunn.ch
 Cc: netdev@vger.kernel.org; linux-ker...@vger.kernel.org;
 richard.leit...@skidata.com
 Subject: [PATCH v2 3/3] net: ethernet: fec: fix refclk enable for
 SMSC
 LAN8710/20

 From: Richard Leitner 

 Some PHYs (for example the SMSC LAN8710/LAN8720) doesn't allow
 turning the refclk on and off again during operation (according to their
>> datasheet).
 Nonetheless exactly this behaviour was introduced for power saving
 reasons by commit e8fcfcd5684a ("net: fec: optimize the clock
 management to save power").
 Therefore after enabling the refclk we detect if an affected PHY is
 attached. If so reset and initialize it again.
>> ...
>>
 +static int fec_enet_clk_ref_enable_reset_phy_quirk(struct net_device
 +*ndev) {
 +  struct phy_device *phy_dev = ndev->phydev;
 +  u32 real_phy_id;
 +  int ret;
 +
 +  /* some PHYs need a reset after the refclk was enabled, so we
 +   * reset them here
 +   */
 +  if (!phy_dev)
 +  return 0;
 +  if (!phy_dev->drv)
 +  return 0;
 +  real_phy_id = phy_dev->drv->phy_id & phy_dev->drv->phy_id_mask;
 +  switch (real_phy_id) {
 +  case 0x0007c0f0: /* SMSC LAN8710/LAN8720 */
>>> Don't hard code here...
>>> I believe there have many other phys also do such operation, hardcode is
>> unacceptable...
>>> And these code can be put into phy_device.c as common interface.
>> Ok. Thank you for the feedback.
>> So it would be fine to hardcode the affected phy_id's in a common function in
>> phy_device.c?
>>
>>
>> Another possible solution that came to my mind is to add a flag called
>> something like "PHY_RST_AFTER_CLK_EN" to the flags variable in struct
>> phy_driver. This flag could then be set in the smsc PHY driver for affected
>> PHYs.
>>
>> Then instead of comparing the phy_id in the MAC driver this flag could be
>> checked:
>>
>> if (phydev->drv->flags & PHY_RST_AFTER_CLK_EN) {
>>ret = fec_reset_phy(ndev);
>>...
>> }
>>
>> Would checking the flag be OK in fec_main.c?
> Yes, it is better than previous solution.
> But add new common API in phy_device.c is much better like: 
> 1. add a flag called "PHY_RST_AFTER_CLK_EN" to the flags variable in struct 
> phy_driver,  all phy driver that need reset can set the flag.

OK.

> 2. add new common api interface phy_reset_after_clk_enable() in phy_device.c 
> driver

OK. But see below...

> 3. add reset gpio descriptor for common phy device driver. 

... if I understood it correctly the patch called "Teach phylib
hard-resetting devices" by Geert and Sergei is exactly doing this:
https://patchwork.ozlabs.org/cover/828503/
https://lkml.org/lkml/2017/10/20/166

So I'll implement the phy_reset_after_clk_enable function atop of this
patch-set and add a note that my patch-series depends on it. Would that
be OK?

> 4. then any mac driver can directly call the common interface 
> .phy_reset_after_clk_enable().

Sounds reasonable :-)

> 
> That is only my suggestion, maybe there have better idea.
> Thanks.
> 

Thanks for your quick feedback.

regards
Richard.L


Re: [PATCH] xen-netfront: remove warning when unloading module

2017-11-20 Thread 'Eduardo Otubo'
On Mon, Nov 20, 2017 at 10:55:55AM +, Paul Durrant wrote:
> > -Original Message-
> > From: Eduardo Otubo [mailto:ot...@redhat.com]
> > Sent: 20 November 2017 10:41
> > To: xen-de...@lists.xenproject.org
> > Cc: netdev@vger.kernel.org; Paul Durrant ; Wei
> > Liu ; linux-ker...@vger.kernel.org;
> > vkuzn...@redhat.com; cav...@redhat.com; che...@redhat.com;
> > mga...@redhat.com; Eduardo Otubo 
> > Subject: [PATCH] xen-netfront: remove warning when unloading module
> > 
> > When unloading module xen_netfront from guest, dmesg would output
> > warning messages like below:
> > 
> >   [  105.236836] xen:grant_table: WARNING: g.e. 0x903 still in use!
> >   [  105.236839] deferring g.e. 0x903 (pfn 0x35805)
> > 
> > This problem relies on netfront and netback being out of sync. By the time
> > netfront revokes the g.e.'s netback didn't have enough time to free all of
> > them, hence displaying the warnings on dmesg.
> > 
> > The trick here is to make netfront to wait until netback frees all the 
> > g.e.'s
> > and only then continue to cleanup for the module removal, and this is done
> > by
> > manipulating both device states.
> > 
> > Signed-off-by: Eduardo Otubo 
> > ---
> >  drivers/net/xen-netfront.c | 11 +++
> >  1 file changed, 11 insertions(+)
> > 
> > diff --git a/drivers/net/xen-netfront.c b/drivers/net/xen-netfront.c
> > index 8b8689c6d887..b948e2a1ce40 100644
> > --- a/drivers/net/xen-netfront.c
> > +++ b/drivers/net/xen-netfront.c
> > @@ -2130,6 +2130,17 @@ static int xennet_remove(struct xenbus_device
> > *dev)
> > 
> > dev_dbg(&dev->dev, "%s\n", dev->nodename);
> > 
> > +   xenbus_switch_state(dev, XenbusStateClosing);
> > +   while (xenbus_read_driver_state(dev->otherend) !=
> > XenbusStateClosing){
> > +   cpu_relax();
> > +   schedule();
> > +   }
> > +   xenbus_switch_state(dev, XenbusStateClosed);
> > +   while (dev->xenbus_state != XenbusStateClosed){
> > +   cpu_relax();
> > +   schedule();
> > +   }
> > +
> 
> Waitiing for closing should be ok but waiting for closed is risky. As soon as 
> a backend is in the closed state then a toolstack can completely remove the 
> backend xenstore area, resulting a state of XenbusStateUnknown, which would 
> cause your second loop to spin forever.
> 
>   Paul

Well, that's a scenario I didn't foresee. I'll come up with a solution in order
avoid this problem. Thanks for the review.

> 
> > xennet_disconnect_backend(info);
> > 
> > unregister_netdev(info->netdev);
> > --
> > 2.13.6
> 

-- 
Eduardo Otubo


Re: [PATCH] xen-netfront: remove warning when unloading module

2017-11-20 Thread Eduardo Otubo
On Mon, Nov 20, 2017 at 12:17:11PM +0100, Juergen Gross wrote:
> On 20/11/17 11:49, Wei Liu wrote:
> > CC netfront maintainers.
> > 
> > On Mon, Nov 20, 2017 at 11:41:09AM +0100, Eduardo Otubo wrote:
> >> When unloading module xen_netfront from guest, dmesg would output
> >> warning messages like below:
> >>
> >>   [  105.236836] xen:grant_table: WARNING: g.e. 0x903 still in use!
> >>   [  105.236839] deferring g.e. 0x903 (pfn 0x35805)
> >>
> >> This problem relies on netfront and netback being out of sync. By the time
> >> netfront revokes the g.e.'s netback didn't have enough time to free all of
> >> them, hence displaying the warnings on dmesg.
> >>
> >> The trick here is to make netfront to wait until netback frees all the 
> >> g.e.'s
> >> and only then continue to cleanup for the module removal, and this is done 
> >> by
> >> manipulating both device states.
> >>
> >> Signed-off-by: Eduardo Otubo 
> >> ---
> >>  drivers/net/xen-netfront.c | 11 +++
> >>  1 file changed, 11 insertions(+)
> >>
> >> diff --git a/drivers/net/xen-netfront.c b/drivers/net/xen-netfront.c
> >> index 8b8689c6d887..b948e2a1ce40 100644
> >> --- a/drivers/net/xen-netfront.c
> >> +++ b/drivers/net/xen-netfront.c
> >> @@ -2130,6 +2130,17 @@ static int xennet_remove(struct xenbus_device *dev)
> >>  
> >>dev_dbg(&dev->dev, "%s\n", dev->nodename);
> >>  
> >> +  xenbus_switch_state(dev, XenbusStateClosing);
> >> +  while (xenbus_read_driver_state(dev->otherend) != XenbusStateClosing){
> >> +  cpu_relax();
> >> +  schedule();
> >> +  }
> >> +  xenbus_switch_state(dev, XenbusStateClosed);
> >> +  while (dev->xenbus_state != XenbusStateClosed){
> >> +  cpu_relax();
> >> +  schedule();
> >> +  }
> 
> I really don't like the busy waits.
> 
> Can't you use e.g. a wait queue and wait_event_interruptible() instead?

I thought about using these, but I don't think the busy waits here are much of a
problem because it's just unloading a kernel module, not a very repetitive
action. But yes I can go for this approach on v2.

> 
> BTW: what happens if the device is already in closed state if you enter
> xennet_remove()? In case this is impossible, please add a comment to
> indicate you've thought about that case.

Looks like this is the same problem Paul Durrant mentioned on his comment. I'll
work on this as well on v2.

Thanks for the review and the help on IRC :-)


Re: NETDEV WATCHDOG: eth0 (dwc-eth-dwmac): transmit queue 1 timed out

2017-11-20 Thread Giuseppe CAVALLARO

Hello Bhadram

there are some new patches actually in net/net-next repo that you should 
have; for example:


   [PATCH net-next v2 0/2] net: stmmac: Improvements for multi-queuing 
and for AVB


Let me know if these help you.

Regards
Peppe

On 11/20/2017 7:38 AM, Bhadram Varka wrote:

Hi Joao/Peppe,

Observed this issue more frequently with multi-channel case. Am I missing 
something in DT ?
Please help here to understand the issue.

Thanks,
Bhadram

-Original Message-
From: Bhadram Varka
Sent: Thursday, November 16, 2017 9:41 AM
To: linux-netdev 
Subject: NETDEV WATCHDOG: eth0 (dwc-eth-dwmac): transmit queue 1 timed out

Hi,

I am trying to enable multi-queue in Tegra186 EQOS (which has support for 4 
channels). Observed below netdev watchdog warning. Its easily reproable with 
iperf test.
In normal ping scenario this is not observed. I did not observe any issue if we 
disable TSO. Looks like issue in stmmac_tso_xmit() in multi-channel scenario.

[   88.801672] NETDEV WATCHDOG: eth0 (dwc-eth-dwmac): transmit queue 0 timed out
[   88.808818] [ cut here ]
[   88.813435] WARNING: CPU: 5 PID: 0 at net/sched/sch_generic.c:320 
dev_watchdog+0x2cc/0x2d8
[   88.821681] Modules linked in: dwmac_dwc_qos_eth stmmac_platform crc32_ce 
crct10dif_ce stmmac ip_tables x_tables ipv6
[   88.832290] CPU: 5 PID: 0 Comm: swapper/5 Tainted: G S  
4.14.0-rc7-01956-g9395db5-dirty #21
[   88.841663] Hardware name: NVIDIA Tegra186 P2771- Development Board (DT)
[   88.848697] task: 8001ec8fd400 task.stack: 09e38000
[   88.854606] PC is at dev_watchdog+0x2cc/0x2d8
[   88.858952] LR is at dev_watchdog+0x2cc/0x2d8
[   88.863300] pc : [] lr : [] pstate: 
2145
[   88.870678] sp : 0802bd80
[   88.873983] x29: 0802bd80 x28: 00a0
[   88.879287] x27:  x26: 8001eae2c3b0
[   88.884589] x25: 0005 x24: 8001ecb6be80
[   88.889891] x23: 8001eae2c39c x22: 8001eae2bfb0
[   88.895192] x21: 8001eae2c000 x20: 08fe7000
[   88.900493] x19: 0001 x18: 0010
[   88.905795] x17:  x16: 
[   88.911098] x15:  x14: 756f2064656d6974
[   88.916399] x13: 2031206575657571 x12: 08fe9df0
[   88.921699] x11: 08586180 x10: 642d6874652d6377
[   88.927000] x9 : 0016 x8 : 3a474f4448435441
[   88.932301] x7 : 572056454454454e x6 : 014f
[   88.937602] x5 : 0020 x4 : 
[   88.942902] x3 :  x2 : 08fec4c0
[   88.948203] x1 : 8001ec8fd400 x0 : 0041
[   88.953504] Call trace:
[   88.955944] Exception stack(0x0802bc40 to 0x0802bd80)
[   88.962371] bc40: 0041 8001ec8fd400 08fec4c0 

[   88.970184] bc60:  0020 014f 
572056454454454e
[   88.977998] bc80: 3a474f4448435441 0016 642d6874652d6377 
08586180
[   88.985811] bca0: 08fe9df0 2031206575657571 756f2064656d6974 

[   88.993624] bcc0:   0010 
0001
[   89.001439] bce0: 08fe7000 8001eae2c000 8001eae2bfb0 
8001eae2c39c
[   89.009252] bd00: 8001ecb6be80 0005 8001eae2c3b0 

[   89.017065] bd20: 00a0 0802bd80 0894a76c 
0802bd80
[   89.024879] bd40: 0894a76c 2145 00b67570 
0001
[   89.032693] bd60: 0001 8001ecb6b200 0802bd80 
0894a76c
[   89.040508] [] dev_watchdog+0x2cc/0x2d8
[   89.045900] [] call_timer_fn.isra.5+0x24/0x80
[   89.051809] [] expire_timers+0xa4/0xb0
[   89.057111] [] run_timer_softirq+0x140/0x170
[   89.062933] [] __do_softirq+0x12c/0x228
[   89.068323] [] irq_exit+0xd0/0x108
[   89.073278] [] __handle_domain_irq+0x60/0xb8
[   89.079098] [] gic_handle_irq+0x58/0xa8
[   89.084484] Exception stack(0x09e3be20 to 0x09e3bf60)
[   89.090910] be20:   0001 

[   89.098724] be40:  09e3bf60 8001ecffd000 
0001
[   89.106537] be60: 0002 09e3bee0 0a00 

[   89.114351] be80: 0001  001c3dfbd9959589 
1daf5b7a4860
[   89.122164] bea0: 0825b000  c0311284 
08fc5000
[   89.129978] bec0: 08fe9000 08fe9000 08fd04a0 
08fe9e90
[   89.137792] bee0:   8001ec8fd400 

[   89.145605] bf00:  09e3bf60 0808548c 
09e3bf60
[   89.153418] bf20: 08085490 0145  

[   89.161231] bf40:  081409c4 09e3bf60 
08085490
[   89.169044] [] el1_irq

Re: [PATCH RFC 0/5] Support asynchronous crypto for IPsec GSO.

2017-11-20 Thread David Miller
From: Steffen Klassert 
Date: Mon, 20 Nov 2017 08:37:47 +0100

> This patchset implements asynchronous crypto handling
> in the layer 2 TX path. With this we can allow IPsec
> ESP GSO for software crypto. This also merges the IPsec
> GSO and non-GSO paths to both use validate_xmit_xfrm().
 ...

Code looks generally fine to me.  Only thing of note is that this
adds a new dev_requeue_skb() call site and that might intersect with
John Fastabend's RFC work to make qdiscs lockless.

Also, please adhere to the reverse christmas tree rule for the
ordering of your function local variables.

Thank you.


Re: [PATCH v2 3/3] net: ethernet: fec: fix refclk enable for SMSC LAN8710/20

2017-11-20 Thread Geert Uytterhoeven
Hi Richard,

On Mon, Nov 20, 2017 at 1:55 PM, Richard Leitner
 wrote:
> On 11/20/2017 11:35 AM, Andy Duan wrote:
>> 3. add reset gpio descriptor for common phy device driver.
>
> ... if I understood it correctly the patch called "Teach phylib
> hard-resetting devices" by Geert and Sergei is exactly doing this:
> https://patchwork.ozlabs.org/cover/828503/
> https://lkml.org/lkml/2017/10/20/166
>
> So I'll implement the phy_reset_after_clk_enable function atop of this
> patch-set and add a note that my patch-series depends on it. Would that
> be OK?

I will update and respin that patch series after the merge window has closed.

Gr{oetje,eeting}s,

Geert

--
Geert Uytterhoeven -- There's lots of Linux beyond ia32 -- ge...@linux-m68k.org

In personal conversations with technical people, I call myself a hacker. But
when I'm talking to journalists I just say "programmer" or something like that.
-- Linus Torvalds


Re: [PATCH v2 3/3] net: ethernet: fec: fix refclk enable for SMSC LAN8710/20

2017-11-20 Thread Richard Leitner

On 11/20/2017 02:13 PM, Geert Uytterhoeven wrote:
> Hi Richard,
> 
> On Mon, Nov 20, 2017 at 1:55 PM, Richard Leitner
>  wrote:
>> On 11/20/2017 11:35 AM, Andy Duan wrote:
>>> 3. add reset gpio descriptor for common phy device driver.
>>
>> ... if I understood it correctly the patch called "Teach phylib
>> hard-resetting devices" by Geert and Sergei is exactly doing this:
>> https://patchwork.ozlabs.org/cover/828503/
>> https://lkml.org/lkml/2017/10/20/166
>>
>> So I'll implement the phy_reset_after_clk_enable function atop of this
>> patch-set and add a note that my patch-series depends on it. Would that
>> be OK?
> 
> I will update and respin that patch series after the merge window has closed.

Ok. Thank you for the quick response an this information.

For the Freescale Fast Ethernet Controller (FEC) there are currently (in
addition to the reset gpio) two additional optional dt properties for
the reset:
 - phy-reset-duration : Reset duration in milliseconds.
 - phy-reset-post-delay : Post reset delay in milliseconds.

IMHO it would make sense to include them also in the phylib
implementation. What do you think about it? Should I include it in my
patch-series?

kind regards;
Richard.L

> 
> Gr{oetje,eeting}s,
> 
> Geert
> 
> --
> Geert Uytterhoeven -- There's lots of Linux beyond ia32 -- 
> ge...@linux-m68k.org
> 
> In personal conversations with technical people, I call myself a hacker. But
> when I'm talking to journalists I just say "programmer" or something like 
> that.
> -- Linus Torvalds
> 


Hello Dear

2017-11-20 Thread Mr Sheng Li Hung
Hello Dear

I am Mr.Sheng Li Hung, from china I got your information while search for
a reliable person, I have a very profitable business proposition for you
and i can assure you that you will not regret been part of this mutual
beneficial transaction after completion. Kindly get back to me for more
details on this email id: shenglil...@hotmail.com


Thanks
Sheng Li Hung


Re: [PATCH 3/8] fs: btrfs: remove unused hardirq.h

2017-11-20 Thread David Sterba
On Sat, Nov 18, 2017 at 07:02:16AM +0800, Yang Shi wrote:
> Preempt counter APIs have been split out, currently, hardirq.h just
> includes irq_enter/exit APIs which are not used by btrfs at all.
> 
> So, remove the unused hardirq.h.
> 
> Signed-off-by: Yang Shi 
> Cc: Chris Mason 
> Cc: Josef Bacik 
> Cc: David Sterba 
> Cc: linux-bt...@vger.kernel.org

Acked-by: David Sterba 


Re: len = bpf_probe_read_str(); bpf_perf_event_output(... len) == FAIL

2017-11-20 Thread Arnaldo Carvalho de Melo
Em Tue, Nov 14, 2017 at 09:25:17PM +0100, Daniel Borkmann escreveu:
> On 11/14/2017 07:15 PM, Yonghong Song wrote:
> > On 11/14/17 6:19 AM, Daniel Borkmann wrote:
> >> On 11/14/2017 02:42 PM, Arnaldo Carvalho de Melo wrote:
> >>> Em Tue, Nov 14, 2017 at 02:09:34PM +0100, Daniel Borkmann escreveu:
>  On 11/14/2017 01:58 PM, Arnaldo Carvalho de Melo wrote:
> > Em Tue, Nov 14, 2017 at 01:09:39AM +0100, Daniel Borkmann escreveu:
> >> On 11/13/2017 04:08 PM, Arnaldo Carvalho de Melo wrote:
> >>> libbpf: -- BEGIN DUMP LOG ---
> >>> libbpf:
> >>> 0: (79) r3 = *(u64 *)(r1 +104)
> >>> 1: (b7) r2 = 0
> >>> 2: (bf) r6 = r1
> >>> 3: (bf) r1 = r10
> >>> 4: (07) r1 += -128
> >>> 5: (b7) r2 = 128
> >>> 6: (85) call bpf_probe_read_str#45
> >>> 7: (bf) r1 = r0
> >>> 8: (07) r1 += -1
> >>> 9: (67) r1 <<= 32
> >>> 10: (77) r1 >>= 32
> >>> 11: (25) if r1 > 0x7f goto pc+11
> >>
> >> Right, so the compiler is optimizing the two tests into a single one 
> >> above,
> >> which means lower bound cannot properly be derived again by the 
> >> verifier due
> >> to this and thus you'll get the error. Similar issue was seen recently 
> >> [1].
> >>
> >> Does the below hack work for you?
> >>
> >> int prog([...])
> >> {
> >>  char filename[128];
> >>  int ret = bpf_probe_read_str(filename, sizeof(filename), 
> >> filename_ptr);
> >>  if (ret > 0)
> >>  bpf_perf_event_output(ctx, &__bpf_stdout__, 
> >> BPF_F_CURRENT_CPU, filename,
> >>    ret & (sizeof(filename) - 1));
> >>  return 1;
> >> }
> >>
> >> r0 should keep on tracking bounds here at least:
> >>
> >> prog:
> >>     0:    bf 16 00 00 00 00 00 00 r6 = r1
> >>     1:    bf a1 00 00 00 00 00 00 r1 = r10
> >>     2:    07 01 00 00 80 ff ff ff r1 += -128
> >>     3:    b7 02 00 00 80 00 00 00 r2 = 128
> >>     4:    85 00 00 00 2d 00 00 00 call 45
> >>     5:    67 00 00 00 20 00 00 00 r0 <<= 32
> >>     6:    c7 00 00 00 20 00 00 00 r0 s>>= 32
> >>     7:    b7 01 00 00 01 00 00 00 r1 = 1
> >>     8:    6d 01 0a 00 00 00 00 00 if r1 s> r0 goto 10
> >>     9:    57 00 00 00 7f 00 00 00 r0 &= 127
> >>    10:    bf a4 00 00 00 00 00 00 r4 = r10
> >>    11:    07 04 00 00 80 ff ff ff r4 += -128
> >>    12:    bf 61 00 00 00 00 00 00 r1 = r6
> >>    13:    18 02 00 00 00 00 00 00 00 00 00 00 00 00 00 00 r2 = 
> >> 0ll
> >>    15:    18 03 00 00 ff ff ff ff 00 00 00 00 00 00 00 00 r3 = 
> >> 4294967295ll
> >>    17:    bf 05 00 00 00 00 00 00 r5 = r0
> >>    18:    85 00 00 00 19 00 00 00 call 25
> >>
> >>    [1] 
> >> https://urldefense.proofpoint.com/v2/url?u=http-3A__patchwork.ozlabs.org_project_netdev_list_-3Fseries-3D13211&d=DwIDaQ&c=5VD0RTtNlTh3ycd41b3MUw&r=DA8e1B5r073vIqRrFz7MRA&m=Qp3xFfXEz-CT8rzYtrHeXbow2M6FlsUzwcY32i3_2Q0&s=z0d6b_hxStA845Kh7epJ-JiFwkiWqUH_z3fEadwqAQY&e=
> >
> > Not yet:
> >
> > 6: (85) call bpf_probe_read_str#45
> > 7: (bf) r1 = r0
> > 8: (67) r1 <<= 32
> > 9: (77) r1 >>= 32
> > 10: (15) if r1 == 0x0 goto pc+10
> >   R0=inv(id=0) R1=inv(id=0,umax_value=4294967295,var_off=(0x0; 
> > 0x)) R6=ctx(id=0,off=0,imm=0) R10=fp0
> > 11: (57) r0 &= 127
> > 12: (bf) r4 = r10
> > 13: (07) r4 += -128
> > 14: (bf) r1 = r6
> > 15: (18) r2 = 0x92bfc2aba840u
> > 17: (18) r3 = 0x
> > 19: (bf) r5 = r0
> > 20: (85) call bpf_perf_event_output#25
> > invalid stack type R4 off=-128 access_size=0
> >
> > I'll try updating clang/llvm...
> >
> > Full details:
> >
> > [root@jouet bpf]# cat open.c
> > #include "bpf.h"
> >
> > SEC("prog=do_sys_open filename")
> > int prog(void *ctx, int err, const char __user *filename_ptr)
> > {
> > char filename[128];
> > const unsigned len = bpf_probe_read_str(filename, sizeof(filename), 
> > filename_ptr);
> 
>  Btw, I was using 'int' here above instead of 'unsigned' as 
>  strncpy_from_unsafe()
>  could potentially return errors like -EFAULT.
> >>>
> >>> I changed to int, didn't help
> >>>  
>  Currently having a version compiled from the git tree:
> 
>  # llc --version
>  LLVM 
>  (https://urldefense.proofpoint.com/v2/url?u=http-3A__llvm.org_&d=DwIDaQ&c=5VD0RTtNlTh3ycd41b3MUw&r=DA8e1B5r073vIqRrFz7MRA&m=Qp3xFfXEz-CT8rzYtrHeXbow2M6FlsUzwcY32i3_2Q0&s=BKC_Gu9s1hw0v13OCgCpfsGtAY2hE7dujFqg8LNaK2I&e=):
>     LLVM version 6.0.0git-2d810c2
>     Optimized build.
>     Default target: x86_64-unknown-linux-gnu
>     Host CPU: skylake
> >>>
> >>> [root@jouet bpf]# llc --version
> >>> LLVM 
>

Re: [PATCH v2 3/3] net: ethernet: fec: fix refclk enable for SMSC LAN8710/20

2017-11-20 Thread Geert Uytterhoeven
Hi Richard,

On Mon, Nov 20, 2017 at 2:21 PM, Richard Leitner
 wrote:
> On 11/20/2017 02:13 PM, Geert Uytterhoeven wrote:
>> On Mon, Nov 20, 2017 at 1:55 PM, Richard Leitner
>>  wrote:
>>> On 11/20/2017 11:35 AM, Andy Duan wrote:
 3. add reset gpio descriptor for common phy device driver.
>>>
>>> ... if I understood it correctly the patch called "Teach phylib
>>> hard-resetting devices" by Geert and Sergei is exactly doing this:
>>> https://patchwork.ozlabs.org/cover/828503/
>>> https://lkml.org/lkml/2017/10/20/166
>>>
>>> So I'll implement the phy_reset_after_clk_enable function atop of this
>>> patch-set and add a note that my patch-series depends on it. Would that
>>> be OK?
>>
>> I will update and respin that patch series after the merge window has closed.
>
> Ok. Thank you for the quick response an this information.
>
> For the Freescale Fast Ethernet Controller (FEC) there are currently (in
> addition to the reset gpio) two additional optional dt properties for
> the reset:
>  - phy-reset-duration : Reset duration in milliseconds.
>  - phy-reset-post-delay : Post reset delay in milliseconds.
>
> IMHO it would make sense to include them also in the phylib
> implementation. What do you think about it? Should I include it in my
> patch-series?

Sure, you can always extend phylib, building on top of our simple reset
implementation.

BTW, I think phy-reset-{duration,post-delay} should be moved to the
phy node as well, dropping the "phy-" prefix in the process.

Gr{oetje,eeting}s,

Geert

--
Geert Uytterhoeven -- There's lots of Linux beyond ia32 -- ge...@linux-m68k.org

In personal conversations with technical people, I call myself a hacker. But
when I'm talking to journalists I just say "programmer" or something like that.
-- Linus Torvalds


RE: [PATCH v2 3/3] net: ethernet: fec: fix refclk enable for SMSC LAN8710/20

2017-11-20 Thread Andy Duan
From: Richard Leitner  Sent: Monday, November 20, 
2017 8:55 PM
>On 11/20/2017 11:35 AM, Andy Duan wrote:
>> From: Richard Leitner  Sent: Monday,
>> November 20, 2017 5:57 PM
>>> To: Andy Duan ; f.faine...@gmail.com;
>>> and...@lunn.ch
>>> Cc: Richard Leitner ; netdev@vger.kernel.org; linux-
>>> ker...@vger.kernel.org
>>> Subject: Re: [PATCH v2 3/3] net: ethernet: fec: fix refclk enable for
>>> SMSC
>>> LAN8710/20
>>>
>>>
>>> On 11/20/2017 10:47 AM, Andy Duan wrote:
 From: Richard Leitner  Sent: Monday, November 20,
 2017
 4:34 PM
> To: f.faine...@gmail.com; Andy Duan ;
> and...@lunn.ch
> Cc: netdev@vger.kernel.org; linux-ker...@vger.kernel.org;
> richard.leit...@skidata.com
> Subject: [PATCH v2 3/3] net: ethernet: fec: fix refclk enable for
> SMSC
> LAN8710/20
>
> From: Richard Leitner 
>
> Some PHYs (for example the SMSC LAN8710/LAN8720) doesn't allow
> turning the refclk on and off again during operation (according to
> their
>>> datasheet).
> Nonetheless exactly this behaviour was introduced for power saving
> reasons by commit e8fcfcd5684a ("net: fec: optimize the clock
> management to save power").
> Therefore after enabling the refclk we detect if an affected PHY is
> attached. If so reset and initialize it again.
>>> ...
>>>
> +static int fec_enet_clk_ref_enable_reset_phy_quirk(struct
> +net_device
> +*ndev) {
> + struct phy_device *phy_dev = ndev->phydev;
> + u32 real_phy_id;
> + int ret;
> +
> + /* some PHYs need a reset after the refclk was enabled, so we
> +  * reset them here
> +  */
> + if (!phy_dev)
> + return 0;
> + if (!phy_dev->drv)
> + return 0;
> + real_phy_id = phy_dev->drv->phy_id & phy_dev->drv->phy_id_mask;
> + switch (real_phy_id) {
> + case 0x0007c0f0: /* SMSC LAN8710/LAN8720 */
 Don't hard code here...
 I believe there have many other phys also do such operation,
 hardcode is
>>> unacceptable...
 And these code can be put into phy_device.c as common interface.
>>> Ok. Thank you for the feedback.
>>> So it would be fine to hardcode the affected phy_id's in a common
>>> function in phy_device.c?
>>>
>>>
>>> Another possible solution that came to my mind is to add a flag
>>> called something like "PHY_RST_AFTER_CLK_EN" to the flags variable in
>>> struct phy_driver. This flag could then be set in the smsc PHY driver
>>> for affected PHYs.
>>>
>>> Then instead of comparing the phy_id in the MAC driver this flag
>>> could be
>>> checked:
>>>
>>> if (phydev->drv->flags & PHY_RST_AFTER_CLK_EN) {
>>>ret = fec_reset_phy(ndev);
>>>...
>>> }
>>>
>>> Would checking the flag be OK in fec_main.c?
>> Yes, it is better than previous solution.
>> But add new common API in phy_device.c is much better like:
>> 1. add a flag called "PHY_RST_AFTER_CLK_EN" to the flags variable in struct
>phy_driver,  all phy driver that need reset can set the flag.
>
>OK.
>
>> 2. add new common api interface phy_reset_after_clk_enable() in
>> phy_device.c driver
>
>OK. But see below...
>
>> 3. add reset gpio descriptor for common phy device driver.
>
>... if I understood it correctly the patch called "Teach phylib hard-resetting
>devices" by Geert and Sergei is exactly doing this:
>   https://emea01.safelinks.protection.outlook.com/?url=https%3A%2F
>%2Fpatchwork.ozlabs.org%2Fcover%2F828503%2F&data=02%7C01%7Cfugang
>.duan%40nxp.com%7C087010d8b4784e3c88c608d53015f2ad%7C686ea1d3bc2
>b4c6fa92cd99c5c301635%7C0%7C0%7C636467793180108636&sdata=yY9thX8Q
>CCVteoF5vvUoAYYxGH0gg4wOUq7TQKtkiok%3D&reserved=0
>   https://emea01.safelinks.protection.outlook.com/?url=https%3A%2F
>%2Flkml.org%2Flkml%2F2017%2F10%2F20%2F166&data=02%7C01%7Cfugang.
>duan%40nxp.com%7C087010d8b4784e3c88c608d53015f2ad%7C686ea1d3bc2b
>4c6fa92cd99c5c301635%7C0%7C0%7C636467793180108636&sdata=rxV12dum1
>VmbWLWvSACDuZevFSFbUoWr9AiUtVSsV6w%3D&reserved=0
>
>So I'll implement the phy_reset_after_clk_enable function atop of this patch-
>set and add a note that my patch-series depends on it. Would that be OK?
>
Yes, it is the best solution.

>> 4. then any mac driver can directly call the common
>interface .phy_reset_after_clk_enable().
>
>Sounds reasonable :-)
>
>>
>> That is only my suggestion, maybe there have better idea.
>> Thanks.
>>
>
>Thanks for your quick feedback.
>
>regards
>Richard.L


[PATCH v3] net: qcom/emac: extend DMA mask to 46bits

2017-11-20 Thread Wang Dongsheng
Since PTP doesn't support yet, so extend the DMA address to 46bits.

Signed-off-by: Wang Dongsheng 
---
v3:
- Remove "Dynamic fix TPD_BUFFER_ADDR_H_SET size."
- Add comments for TPD_BUFFER_ADDR_H_SET.
v2:
- Changes PATCH subject.
- Dynamic fix TPD_BUFFER_ADDR_H_SET size.
- Modify DMA MASK to 46bits.
- Add Comments for DMA MASK.
---
 drivers/net/ethernet/qualcomm/emac/emac-mac.h | 3 ++-
 drivers/net/ethernet/qualcomm/emac/emac.c | 7 +--
 2 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/drivers/net/ethernet/qualcomm/emac/emac-mac.h 
b/drivers/net/ethernet/qualcomm/emac/emac-mac.h
index 5028fb4..4beedb8 100644
--- a/drivers/net/ethernet/qualcomm/emac/emac-mac.h
+++ b/drivers/net/ethernet/qualcomm/emac/emac-mac.h
@@ -114,8 +114,9 @@ struct emac_tpd {
 #define TPD_INSTC_SET(tpd, val)BITS_SET((tpd)->word[3], 17, 
17, val)
 /* High-14bit Buffer Address, So, the 64b-bit address is
  * {DESC_CTRL_11_TX_DATA_HIADDR[17:0],(register) BUFFER_ADDR_H, BUFFER_ADDR_L}
+ * Extend TPD_BUFFER_ADDR_H to [31, 18], because we never enable timestamping.
  */
-#define TPD_BUFFER_ADDR_H_SET(tpd, val)BITS_SET((tpd)->word[3], 18, 
30, val)
+#define TPD_BUFFER_ADDR_H_SET(tpd, val)BITS_SET((tpd)->word[3], 18, 
31, val)
 /* Format D. Word offset from the 1st byte of this packet to start to calculate
  * the custom checksum.
  */
diff --git a/drivers/net/ethernet/qualcomm/emac/emac.c 
b/drivers/net/ethernet/qualcomm/emac/emac.c
index 70c92b6..a4b39d1 100644
--- a/drivers/net/ethernet/qualcomm/emac/emac.c
+++ b/drivers/net/ethernet/qualcomm/emac/emac.c
@@ -615,8 +615,11 @@ static int emac_probe(struct platform_device *pdev)
u32 reg;
int ret;
 
-   /* The TPD buffer address is limited to 45 bits. */
-   ret = dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(45));
+   /* The TPD buffer address is limited to:
+* 1. PTP:  45bits. (Driver doesn't support yet.)
+* 2. NON-PTP:  46bits.
+*/
+   ret = dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(46));
if (ret) {
dev_err(&pdev->dev, "could not set DMA mask\n");
return ret;
-- 
2.7.4



[PATCH] can: ti_hecc: Fix napi poll return value for repoll

2017-11-20 Thread Oliver Stäbler
After commit d75b1ade567f ("net: less interrupt masking in NAPI") napi
repoll is done only when work_done == budget.
So we need to return budget if there are still packets to receive.

Signed-off-by: Oliver Stäbler 
---
 drivers/net/can/ti_hecc.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/drivers/net/can/ti_hecc.c b/drivers/net/can/ti_hecc.c
index 4d4941469cfc..db6ea936dc3f 100644
--- a/drivers/net/can/ti_hecc.c
+++ b/drivers/net/can/ti_hecc.c
@@ -637,6 +637,9 @@ static int ti_hecc_rx_poll(struct napi_struct *napi, int 
quota)
mbx_mask = hecc_read(priv, HECC_CANMIM);
mbx_mask |= HECC_TX_MBOX_MASK;
hecc_write(priv, HECC_CANMIM, mbx_mask);
+   } else {
+   /* repoll is done only if whole budget is used */
+   num_pkts = quota;
}
 
return num_pkts;
-- 
2.13.6



RE: NETDEV WATCHDOG: eth0 (dwc-eth-dwmac): transmit queue 1 timed out

2017-11-20 Thread Bhadram Varka
Hi Giuseppe,

Thanks for responding.

Actually I am using net-next tree for making the changes. Below patches already 
present in code base.

a0daae1 net: stmmac: Disable flow ctrl for RX AVB queues and really enable TX 
AVB queues
52a7623 net: stmmac: Use correct values in TQS/RQS fields

Thanks,
Bhadram.

-Original Message-
From: Giuseppe CAVALLARO [mailto:peppe.cavall...@st.com] 
Sent: Monday, November 20, 2017 6:37 PM
To: Bhadram Varka ; joao.pi...@synopsys.com
Cc: linux-netdev 
Subject: Re: NETDEV WATCHDOG: eth0 (dwc-eth-dwmac): transmit queue 1 timed out

Hello Bhadram

there are some new patches actually in net/net-next repo that you should have; 
for example:

[PATCH net-next v2 0/2] net: stmmac: Improvements for multi-queuing and for 
AVB

Let me know if these help you.

Regards
Peppe

On 11/20/2017 7:38 AM, Bhadram Varka wrote:
> Hi Joao/Peppe,
>
> Observed this issue more frequently with multi-channel case. Am I missing 
> something in DT ?
> Please help here to understand the issue.
>
> Thanks,
> Bhadram
>
> -Original Message-
> From: Bhadram Varka
> Sent: Thursday, November 16, 2017 9:41 AM
> To: linux-netdev 
> Subject: NETDEV WATCHDOG: eth0 (dwc-eth-dwmac): transmit queue 1 timed 
> out
>
> Hi,
>
> I am trying to enable multi-queue in Tegra186 EQOS (which has support for 4 
> channels). Observed below netdev watchdog warning. Its easily reproable with 
> iperf test.
> In normal ping scenario this is not observed. I did not observe any issue if 
> we disable TSO. Looks like issue in stmmac_tso_xmit() in multi-channel 
> scenario.
>
> [   88.801672] NETDEV WATCHDOG: eth0 (dwc-eth-dwmac): transmit queue 0 timed 
> out
> [   88.808818] [ cut here ]
> [   88.813435] WARNING: CPU: 5 PID: 0 at net/sched/sch_generic.c:320 
> dev_watchdog+0x2cc/0x2d8
> [   88.821681] Modules linked in: dwmac_dwc_qos_eth stmmac_platform crc32_ce 
> crct10dif_ce stmmac ip_tables x_tables ipv6
> [   88.832290] CPU: 5 PID: 0 Comm: swapper/5 Tainted: G S  
> 4.14.0-rc7-01956-g9395db5-dirty #21
> [   88.841663] Hardware name: NVIDIA Tegra186 P2771- Development Board 
> (DT)
> [   88.848697] task: 8001ec8fd400 task.stack: 09e38000
> [   88.854606] PC is at dev_watchdog+0x2cc/0x2d8
> [   88.858952] LR is at dev_watchdog+0x2cc/0x2d8
> [   88.863300] pc : [] lr : [] pstate: 
> 2145
> [   88.870678] sp : 0802bd80
> [   88.873983] x29: 0802bd80 x28: 00a0
> [   88.879287] x27:  x26: 8001eae2c3b0
> [   88.884589] x25: 0005 x24: 8001ecb6be80
> [   88.889891] x23: 8001eae2c39c x22: 8001eae2bfb0
> [   88.895192] x21: 8001eae2c000 x20: 08fe7000
> [   88.900493] x19: 0001 x18: 0010
> [   88.905795] x17:  x16: 
> [   88.911098] x15:  x14: 756f2064656d6974
> [   88.916399] x13: 2031206575657571 x12: 08fe9df0
> [   88.921699] x11: 08586180 x10: 642d6874652d6377
> [   88.927000] x9 : 0016 x8 : 3a474f4448435441
> [   88.932301] x7 : 572056454454454e x6 : 014f
> [   88.937602] x5 : 0020 x4 : 
> [   88.942902] x3 :  x2 : 08fec4c0
> [   88.948203] x1 : 8001ec8fd400 x0 : 0041
> [   88.953504] Call trace:
> [   88.955944] Exception stack(0x0802bc40 to 0x0802bd80)
> [   88.962371] bc40: 0041 8001ec8fd400 08fec4c0 
> 
> [   88.970184] bc60:  0020 014f 
> 572056454454454e
> [   88.977998] bc80: 3a474f4448435441 0016 642d6874652d6377 
> 08586180
> [   88.985811] bca0: 08fe9df0 2031206575657571 756f2064656d6974 
> 
> [   88.993624] bcc0:   0010 
> 0001
> [   89.001439] bce0: 08fe7000 8001eae2c000 8001eae2bfb0 
> 8001eae2c39c
> [   89.009252] bd00: 8001ecb6be80 0005 8001eae2c3b0 
> 
> [   89.017065] bd20: 00a0 0802bd80 0894a76c 
> 0802bd80
> [   89.024879] bd40: 0894a76c 2145 00b67570 
> 0001
> [   89.032693] bd60: 0001 8001ecb6b200 0802bd80 
> 0894a76c
> [   89.040508] [] dev_watchdog+0x2cc/0x2d8
> [   89.045900] [] call_timer_fn.isra.5+0x24/0x80
> [   89.051809] [] expire_timers+0xa4/0xb0
> [   89.057111] [] run_timer_softirq+0x140/0x170
> [   89.062933] [] __do_softirq+0x12c/0x228
> [   89.068323] [] irq_exit+0xd0/0x108
> [   89.073278] [] __handle_domain_irq+0x60/0xb8
> [   89.079098] [] gic_handle_irq+0x58/0xa8
> [   89.084484] Exception stack(0x09e3be20 to 0x09e3bf60)
> [   89.090910] be20:   0001 
> 
> [   89.098724] be40:  09e3bf60 8001ecffd000 
> 000

RE: [PATCH 3/7] net: core: eliminate dev_alloc_name{,_ns} code duplication

2017-11-20 Thread David Laight
From: Rasmus Villemoes
> Sent: 12 November 2017 23:15
> dev_alloc_name contained a BUG_ON(), which I moved to dev_alloc_name_ns;
> the only other caller of that already has the same BUG_ON.
> 
> Signed-off-by: Rasmus Villemoes 
> ---
>  net/core/dev.c | 12 ++--
>  1 file changed, 2 insertions(+), 10 deletions(-)
> 
> diff --git a/net/core/dev.c b/net/core/dev.c
> index 240ae6bc1097..1077bfe97bde 100644
> --- a/net/core/dev.c
> +++ b/net/core/dev.c
> @@ -1112,6 +1112,7 @@ static int dev_alloc_name_ns(struct net *net,
>   char buf[IFNAMSIZ];
>   int ret;
> 
> + BUG_ON(!net);
>   ret = __dev_alloc_name(net, name, buf);

Just delete it.
The NULL pointer dereference is as easy to debug as the BUG().

David



Re: [PATCH 11/31] nds32: Atomic operations

2017-11-20 Thread Will Deacon
Hi Greentime,

On Wed, Nov 08, 2017 at 01:54:59PM +0800, Greentime Hu wrote:
> From: Greentime Hu 
> 
> Signed-off-by: Vincent Chen 
> Signed-off-by: Greentime Hu 
> ---
>  arch/nds32/include/asm/futex.h|  116 
>  arch/nds32/include/asm/spinlock.h |  178 
> +
>  2 files changed, 294 insertions(+)
>  create mode 100644 arch/nds32/include/asm/futex.h
>  create mode 100644 arch/nds32/include/asm/spinlock.h

[...]

> +static inline int
> +futex_atomic_cmpxchg_inatomic(u32 * uval, u32 __user * uaddr,
> +   u32 oldval, u32 newval)
> +{
> + int ret = 0;
> + u32 val, tmp, flags;
> +
> + if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32)))
> + return -EFAULT;
> +
> + smp_mb();
> + asm volatile ("   movi$ta, #0\n"
> +   "1: llw %1, [%6 + $ta]\n"
> +   "   sub %3, %1, %4\n"
> +   "   cmovz   %2, %5, %3\n"
> +   "   cmovn   %2, %1, %3\n"
> +   "2: scw %2, [%6 + $ta]\n"
> +   "   beqz%2, 1b\n"
> +   "3:\n   " __futex_atomic_ex_table("%7")
> +   :"+&r"(ret), "=&r"(val), "=&r"(tmp), "=&r"(flags)
> +   :"r"(oldval), "r"(newval), "r"(uaddr), "i"(-EFAULT)
> +   :"$ta", "memory");
> + smp_mb();
> +
> + *uval = val;
> + return ret;
> +}

I see you rely on asm-generic/barrier.h for your barrier definitions, which
suggests that you only need to prevent reordering by the compiler because
you're not SMP. Is that right? If so, using smp_mb() is a little weird.

What about DMA transactions? I imagine you might need some extra
instructions for the mandatory barriers there.

Also:

> +static inline void arch_spin_lock(arch_spinlock_t * lock)
> +{
> + unsigned long tmp;
> +
> + __asm__ __volatile__("1:\n"
> +  "\tllw\t%0, [%1]\n"
> +  "\tbnez\t%0, 1b\n"
> +  "\tmovi\t%0, #0x1\n"
> +  "\tscw\t%0, [%1]\n"
> +  "\tbeqz\t%0, 1b\n"
> +  :"=&r"(tmp)
> +  :"r"(&lock->lock)
> +  :"memory");
> +}

Here it looks like you're eliding an explicit barrier here because you
already have a "memory" clobber. Can't you do the same for the futex code
above?

Will


Re: [PATCH net 05/10] net: xdp: don't allow device-bound programs in driver mode

2017-11-20 Thread David Ahern
On 11/19/17 9:55 PM, Jakub Kicinski wrote:
> diff --git a/net/core/dev.c b/net/core/dev.c
> index 09525a27319c..21de2d37a0ba 100644
> --- a/net/core/dev.c
> +++ b/net/core/dev.c
> @@ -7143,6 +7143,13 @@ int dev_change_xdp_fd(struct net_device *dev, struct 
> netlink_ext_ack *extack,
>bpf_op == ops->ndo_bpf);
>   if (IS_ERR(prog))
>   return PTR_ERR(prog);
> +
> + if (!(flags & XDP_FLAGS_HW_MODE) &&
> + bpf_prog_is_dev_bound(prog->aux)) {
> + NL_SET_ERR_MSG_MOD(extack, "using device-bound program 
> without HW_MODE flag not supported");

I don't see dev_change_xdp_fd called by device drivers, so that should
just be NL_SET_ERR_MSG. Also, "is not supported" sounds better to me
than just "not supported".


ksz884x: Why is advertising changed based on speed?

2017-11-20 Thread George McCollister
Does anyone know why ksz884x manipulates advertising based on the
value of speed instead of just passing cmd through to
mii_ethtool_set_link_ksettings() like most other drivers do?

https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/drivers/net/ethernet/micrel/ksz884x.c?id=11dd894e4afa7995d8e4bd6008cbd79840c3a8bd#n5999

I am finding that if I do `ethtool -s enp3s0 autoneg` immediately
after bringing up the interface that it changes the advertised link
modes to 10baseT/Half since the driver apparently is reporting a speed
of 10 and duplex of 0 at that time.

If no one can site a reason for the driver doing this I'll be happy to
send a patch that removes it.

Thanks,
George McCollister


RE: [PATCH net-next] net: thunderbolt: Clear finished Tx frame bus address in tbnet_tx_callback()

2017-11-20 Thread David Laight
From: Mika Westerberg
> Sent: 13 November 2017 10:22
> To: David Miller
> Cc: michael.ja...@intel.com; yehezkel.ber...@intel.com; netdev@vger.kernel.org
> Subject: Re: [PATCH net-next] net: thunderbolt: Clear finished Tx frame bus 
> address in
> tbnet_tx_callback()
> 
> On Sat, Nov 11, 2017 at 07:21:24PM +0900, David Miller wrote:
> > From: Mika Westerberg 
> > Date: Thu,  9 Nov 2017 13:46:28 +0300
> >
> > > When Thunderbolt network interface is disabled or when the cable is
> > > unplugged the driver releases all allocated buffers by calling
> > > tbnet_free_buffers() for each ring. This function then calls
> > > dma_unmap_page() for each buffer it finds where bus address is non-zero.
> > > Now, we only clear this bus address when the Tx buffer is sent to the
> > > hardware so it is possible that the function finds an entry that has
> > > already been unmapped.
> > >
> > > Enabling DMA-API debugging catches this as well:
> > >
> > >   thunderbolt :06:00.0: DMA-API: device driver tries to free DMA
> > > memory it has not allocated [device address=0x68321000] 
> > > [size=4096 bytes]
> > >
> > > Fix this by clearing the bus address of a Tx frame right after we have
> > > unmapped the buffer.
> > >
> > > Signed-off-by: Mika Westerberg 
> >
> > Applied, but assuming zero is a non-valid DMA address is never a good
> > idea.  That's why we have the DMA error code signaling abstracted.
> 
> There does not seem to be a way to mark DMA address invalid in a driver
> so we probably need to add a flag to struct tbnet_frame instead.

Can you use the length?

David



RE: [PATCH] net: mvneta: fix handling of the Tx descriptor counter

2017-11-20 Thread David Laight
From: Simon Guinot
> Sent: 13 November 2017 15:36
> To: David Miller
> Cc: thomas.petazz...@free-electrons.com; netdev@vger.kernel.org; m...@gmx.de;
> andreas.tob...@cloudguard.ch; gregory.clem...@free-electrons.com; 
> antoine.ten...@free-electrons.com;
> m...@semihalf.com; sta...@vger.kernel.org
> Subject: Re: [PATCH] net: mvneta: fix handling of the Tx descriptor counter
> 
> On Mon, Nov 13, 2017 at 11:54:14PM +0900, David Miller wrote:
> > From: Simon Guinot 
> > Date: Mon, 13 Nov 2017 15:51:15 +0100
> >
> > > IIUC the driver stops the queue if a threshold of 316 Tx descriptors is
> > > reached (default and worst value).
> >
> > That's a lot of latency.
> 
> OK, then I'll keep the "tx_pending > 255" flushing condition. But note
> there is no other software mechanism to limit the Tx latency inside the
> mvneta driver. Should we add something ? And is that not rather the job
> of the network stack to keep track of the latency and to limit the txq
> size ?

This is 'first packet transmit latency'.

If the 'doorbell write' is just a PCIe write then, on most systems,
that is cheap and pipelined/posted.
I'd almost be surprised if you see any 'improvement' from not doing
it every packet.

The overall tx queue size is a different issue - usually needs
limiting by BQL if TSO is done.

David



Re: [PATCH net-next] net: thunderbolt: Clear finished Tx frame bus address in tbnet_tx_callback()

2017-11-20 Thread 'Mika Westerberg'
On Mon, Nov 20, 2017 at 02:46:53PM +, David Laight wrote:
> > There does not seem to be a way to mark DMA address invalid in a driver
> > so we probably need to add a flag to struct tbnet_frame instead.
> 
> Can you use the length?

Unfortunately no because size is 12-bit field in struct ring_frame and 0
means 4096 bytes.


Re: Linux ECN Handling

2017-11-20 Thread Neal Cardwell
On Mon, Nov 20, 2017 at 2:31 AM, Steve Ibanez  wrote:
> Hi Folks,
>
> I wanted to check back in on this for another update and to solicit
> some more suggestions. I did a bit more digging to try an isolate the
> problem.

Going back to one of your Oct 19 trace snapshots (attached), AFAICT at
the time of the timeout there is actually almost 64KBytes  (352553398
+ 1448 - 352489686 = 65160) of unacknowledged data. So there really
does seem to be a significant chunk of packets that were in-flight
that were then declared lost.

So here is a possibility: perhaps the combination of CWR+PRR plus
tcp_tso_should_defer() means that PRR can make cwnd so gentle that
tcp_tso_should_defer() thinks we should wait for another ACK to send,
and that ACK doesn't come. Breaking it, down, the potential sequence
would be:

(1) tcp_write_xmit() does not send, because the CWR behavior, using
PRR, does not leave enough cwnd for tcp_tso_should_defer() to think we
should send (PRR was originally designed for recovery, which did not
have TSO deferral)

(2) TLP does not fire, because we are in state CWR, not Open

(3) The only remaining option is an RTO, which fires.

In other words, the possibility is that, at the time of the stall, the
cwnd is reasonably high, but tcp_packets_in_flight() is also quite
high, so either there is (a) literally no unused cwnd left (
tcp_packets_in_flight() == cwnd), or (b) some mechanism like
tcp_tso_should_defer() is deciding that there is not enough available
cwnd for it to make sense to chop off a fraction of a TSO skb to send
now.

One way to test that conjecture would be to disable
tcp_tso_should_defer() by adding a:

   goto send_now;

at the top of tcp_tso_should_defer().

If that doesn't prevent the freezes then I would recommend adding
printks or other instrumentation to  tcp_write_xmit() to log:

- time
- ca_state
- cwnd
- ssthresh
- tcp_packets_in_flight()
- the reason for breaking out of the tcp_write_xmit() loop (tso
deferral, no packets left, tcp_snd_wnd_test, tcp_nagle_test, etc)

cheers,
neal


Re: [PATCH v3] net: qcom/emac: extend DMA mask to 46bits

2017-11-20 Thread Timur Tabi
This is much better.  Can you give me a few days to test it on some 
internal platforms?


Also, this is a candidate for 4.16, so you need to wait until net-next 
is open anyway (http://vger.kernel.org/~davem/net-next.html).


On 11/20/17 7:48 AM, Wang Dongsheng wrote:

Since PTP doesn't support yet, so extend the DMA address to 46bits.


This needs to be longer, since it's not clear that TPD3[31] is bot a 
timestamp bit and an address bit.  How about this:


Bit TPD3[31] is used as a timestamp bit if PTP is enabled,
but it's used as an address bit if PTP is disabled.  Since PTP isn't 
supported by the driver, we can extend the DMA address to 46 bits.




--
Qualcomm Datacenter Technologies, Inc. as an affiliate of Qualcomm
Technologies, Inc.  Qualcomm Technologies, Inc. is a member of the
Code Aurora Forum, a Linux Foundation Collaborative Project.


Re: general protection fault in dst_destroy() - 4.13.9

2017-11-20 Thread Ido Schimmel
On Sun, Nov 19, 2017 at 12:45:41PM +, Anders K. Pedersen | Cohaesio wrote:
> Hello,
> 
> A few days ago, one of our routers (running Linux 4.13.9) crashed due
> to a general protection fault in dst_destroy(). At the time, it had run
> for several weeks without any problems, but then crashed three times in
> a row within a few minutes - all due to a general protection fault at
> dst_destroy()+0x35. Since then, it has run for several days without any
> further problems, so I suspect that this was triggered by a traffic
> pattern in the routed packets, but I don't have a way to reproduce it.
> 
> Disassembly shows that this is in the inlined dev_put(), which does
> this_cpu_dec(*dev->pcpu_refcnt). As far as I can tell there haven't
> been any fixes in this area since 4.13, and a Google search didn't find
> anything recent, so I'm guessing this is not a known problem.
> 
> I have included the kernel output via serial console below as well as
> gdb and objdump information. Please let me know, if I can provide any
> additional information.
> 
> 
> [2024260.461401] general protection fault:  [#1] SMP
> [2024260.467193] Modules linked in:
> [2024260.470897] CPU: 15 PID: 0 Comm: swapper/15 Tainted: GW   
> 4.13.9 #2
> [2024260.479488] Hardware name: Dell Inc. PowerEdge R730/0H21J3, BIOS 2.5.5 
> 08/16/2017
> [2024260.488279] task: 88085b625cc0 task.stack: c90e4000
> [2024260.495277] RIP: 0010:dst_destroy+0x35/0xa0
> [2024260.500277] RSP: 0018:88085f5c3f08 EFLAGS: 00010286
> [2024260.506474] RAX: 88085ac0e880 RBX: 88082cf9fb00 RCX: 
> 0020
> [2024260.514868] RDX: 88082cf9fbc0 RSI:  RDI: 
> 816786c0
> [2024260.523258] RBP:  R08: ff00 R09: 
> 
> [2024260.531649] R10:  R11:  R12: 
> 88085f5da678
> [2024260.540040] R13: 000a R14: 88085b625cc0 R15: 
> 88085b625cc0
> [2024260.548431] FS:  () GS:88085f5c() 
> knlGS:
> [2024260.557924] CS:  0010 DS:  ES:  CR0: 80050033
> [2024260.564719] CR2: 7fc800e48e88 CR3: 01809000 CR4: 
> 001406e0
> [2024260.573112] Call Trace:
> [2024260.576113]  
> [2024260.578618]  ? rcu_process_callbacks+0x18f/0x460
> [2024260.584126]  ? rebalance_domains+0xe2/0x290
> [2024260.589128]  ? __do_softirq+0x100/0x292
> [2024260.593727]  ? irq_exit+0x92/0xa0
> [2024260.597729]  ? smp_apic_timer_interrupt+0x39/0x50
> [2024260.603328]  ? apic_timer_interrupt+0x7c/0x90
> [2024260.608528]  
> [2024260.611134]  ? cpuidle_enter_state+0x14c/0x2b0
> [2024260.616432]  ? cpuidle_enter_state+0x128/0x2b0
> [2024260.621731]  ? do_idle+0xf9/0x190
> [2024260.625733]  ? cpu_startup_entry+0x5f/0x70
> [2024260.630636]  ? start_secondary+0x12a/0x130
> [2024260.635536]  ? secondary_startup_64+0x9f/0x9f
> [2024260.640731] Code: f6 47 60 08 48 8b 6f 18 74 62 48 8b 43 20 48 8b 40 30 
> 48 85 c0 74 05 48
> 89 df ff d0 48 8b 03 48 85 c0 74 0a 48 8b 80 e0 03 00 00 <65> ff 08 f6 43 60 
> 80 74 26 48 8d bb
> e0 00 00 00 e8 e6 7f 01 00
> [2024260.662626] RIP: dst_destroy+0x35/0xa0 RSP: 88085f5c3f08
> [2024260.669333] ---[ end trace 3c1827251806827c ]---
> [2024260.724173] Kernel panic - not syncing: Fatal exception in interrupt
> [2024261.102792] Kernel Offset: disabled
> [2024261.156022] Rebooting in 60 seconds..
> [2024321.167958] ACPI MEMORY or I/O RESET_REG.

This looks very similar to a bug Eric already fixed here:
https://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next.git/commit/?id=222d7dbd258dad4cd5241c43ef818141fad5a87a

I don't see it in v4.13.9 which might explain why you're still hitting
it. Can you please try to reproduce with mentioned patch?

Thanks


Re: [PATCH linux-firmware 0/2] Mellanox: Add new mlxsw_spectrum firmware 13.1530.152

2017-11-20 Thread Ido Schimmel
On Thu, Nov 09, 2017 at 09:15:51AM +0200, Shalom Toledo wrote:
> This set adds a new firmware version 13.1530.152 as well as information about
> the previous firmware version of the mlxsw_spectrum driver

Ping?


Re: [kernel-hardening] [PATCH v4] scripts: add leaking_addresses.pl

2017-11-20 Thread Petr Mladek
On Mon 2017-11-13 11:16:28, kaiwan.billimo...@gmail.com wrote:
> On Mon, 2017-11-13 at 09:21 +1100, Tobin C. Harding wrote:
> > On Fri, Nov 10, 2017 at 07:26:34PM +0530, kaiwan.billimo...@gmail.com
> > >  - it currently hard-codes a global 'PAGE_OFFSET_32BIT=0xc000'
> > > , just
> > >  so I can test quickly; must figure whether to query it or pass it;
> > >  Suggestions?
> > 
> > Perhaps we should have a command line option for this.
> > 
> > --kernel-base-address
> 
> Why not just detect it programatically? We could devise a series of
> fallbacks; something like:
> - if .config exists in the kernel source tree root, grep it for
> PAGE_OFFSET
> - if not, grep the arch-specific (arch//configs/)
> for the same
> - if for some reason we don't have enough info regarding specific
> platform and thus the defconfig filename (could happen for ARM, PPC?),
> we then fail and request the user to pass it as a parameter.

You might also check /proc/config.gz.

Best Regards,
Petr


Re: Linux ECN Handling

2017-11-20 Thread Eric Dumazet
On Mon, Nov 20, 2017 at 7:01 AM, Neal Cardwell  wrote:
> Going back to one of your Oct 19 trace snapshots (attached), AFAICT at the
> time of the timeout there is actually almost 64KBytes  (352553398 + 1448 -
> 352489686 = 65160) of unacknowledged data. So there really does seem to be a
> significant chunk of packets that were in-flight that were then declared
> lost.
>
> So here is a possibility: perhaps the combination of CWR+PRR plus
> tcp_tso_should_defer() means that PRR can make cwnd so gentle that
> tcp_tso_should_defer() thinks we should wait for another ACK to send, and
> that ACK doesn't come. Breaking it, down, the potential sequence would be:
>
> (1) tcp_write_xmit() does not send, because the CWR behavior, using PRR,
> does not leave enough cwnd for tcp_tso_should_defer() to think we should
> send (PRR was originally designed for recovery, which did not have TSO
> deferral)
>
> (2) TLP does not fire, because we are in state CWR, not Open
>
> (3) The only remaining option is an RTO, which fires.
>
> In other words, the possibility is that, at the time of the stall, the cwnd
> is reasonably high, but tcp_packets_in_flight() is also quite high, so
> either there is (a) literally no unused cwnd left ( tcp_packets_in_flight()
> == cwnd), or (b) some mechanism like tcp_tso_should_defer() is deciding that
> there is not enough available cwnd for it to make sense to chop off a
> fraction of a TSO skb to send now.
>
> One way to test that conjecture would be to disable tcp_tso_should_defer()
> by adding a:
>
>goto send_now;
>
> at the top of tcp_tso_should_defer().
>
> If that doesn't prevent the freezes then I would recommend adding printks or
> other instrumentation to  tcp_write_xmit() to log:
>
> - time
> - ca_state
> - cwnd
> - ssthresh
> - tcp_packets_in_flight()
> - the reason for breaking out of the tcp_write_xmit() loop (tso deferral, no
> packets left, tcp_snd_wnd_test, tcp_nagle_test, etc)
>
> cheers,
> neal
>
>
>
> On Mon, Nov 20, 2017 at 2:31 AM, Steve Ibanez  wrote:
>>
>> Hi Folks,
>>
>> I wanted to check back in on this for another update and to solicit
>> some more suggestions. I did a bit more digging to try an isolate the
>> problem.
>>
>> As I explained earlier, the log generated by tcp_probe indicates that
>> the snd_cwnd is set to 1 just before the end host receives an ECN
>> marked ACK and unexpectedly enters a timeout (
>> https://drive.google.com/open?id=1iyt8PvBxQga2jpRpBJ8KdQw3Q_mPTzZF ).
>> I was trying to track down where this is happening, but the only place
>> I could find that might be setting the snd_cwnd to 1 is in the
>> tcp_enter_loss() function. I inserted a printk() call in this function
>> to see when it is being invoked and it looks like it is only called by
>> the tcp_retransmit_timer() function after the timer expires.
>>
>> I decided to try recording the snd_cwnd, ss-thresh, and icsk_ca_state
>> inside the tcp_fastretrans_alert() function whenever it processes an
>> ECN marked ACK (
>> https://drive.google.com/open?id=17GD77lb9lkCSu0_s9p40GZ5r4EU8B4VB )
>> This plot also shows when the tcp_retransmit_timer() and
>> tcp_enter_loss() functions are invoked (red and purple dots
>> respectively). And I see that the ACK state machine is always either
>> in the TCP_CA_Open or TCP_CA_CWR state whenever the
>> tcp_fastretrans_alert() function processes ECN marked ACKs (
>> https://drive.google.com/open?id=1xwuPxjgwriT9DSblFx2uILfQ95Fy-Eqq ).
>> So I'm not sure where the snd_cwnd is being set to 1 (or possibly 0 as
>> Neal suggested) just before entering a timeout. Any suggestions here?
>>
>> In order to do a bit of profiling of the tcp_dctcp code I added
>> support into tcp_probe for recording the dctcp alpha parameter. I see
>> that alpha oscillates around about 0.1 when the flow rates have
>> converged, it goes to zero when the other host enters a timeout, and I
>> don't see any unexpected behavior just before the timeout (
>> https://drive.google.com/open?id=1zPdyS57TrUYZIekbid9p1UNyraLYrdw7 ).
>>
>> So I haven't had much luck yet trying to track down where the problem
>> is. If you have any suggestions that would help me to focus my search
>> efforts, I would appreciate the comments.
>>
>> Thanks!
>> -Steve

Steve, what HZ value your kernel is compiled with ?


Request for queuing to stable: pci id's

2017-11-20 Thread Ganesh Goudar
Hi David,

Can you please queue the below commits to 4.9 stable,
which add pci id's of T5 and T6 cards. These commits apply
as is.

5d071c24f0cb8ce9fb5642c2a65ab5ab7f5ad244
29db39841896de99dcb3b1deaed61a13cb9d8036
12eb070babbcab4b003e060933971089864a6a54
89ff67718c900754d2aa5c8e37efbe607be36154
803d5b6ebfb06a0d2ee3699fea4f1c7593958566
34929cb4d691f7f9e217ba0e3f536978cd56aa6c
acd669a8f67ed47f5edd385741486cc7a259a446
652faa98ec383c25296fb8493f17060a2c7e3438
36bf994a80571aeee2549db1bc93e34342f40c24

Thanks
Ganesh


pull-request: mac80211 2017-11-20

2017-11-20 Thread Johannes Berg
Hi Dave,

Sorry this is coming now, I had a super hectic time after travel
since I had to catch up after two weeks of being away ...

That's not really an excuse, but I'm asking you anyway to pull
Kees's timer conversions with the fixes since he really wants
to make more changes on top and clean this up properly. I've had
those pending, but in the hectic week after coming back didn't
send the mac80211-next pull request that I should have done.

If you don't want that, let me know and I'll respin without it.

Otherwise, please pull and let me know if there's any problem.

Thanks,
johannes



The following changes since commit 32a72bbd5da2411eab591bf9bc2e39349106193a:

  net: vxge: Fix some indentation issues (2017-11-20 11:36:30 +0900)

are available in the git repository at:

  ssh://korg/pub/scm/linux/kernel/git/jberg/mac80211.git 
tags/mac80211-for-davem-2017-11-20

for you to fetch changes up to 33ddd81e2bd5d9970b9f01ab383ba45035fa41ee:

  mac80211: properly free requested-but-not-started TX agg sessions (2017-11-20 
17:01:31 +0100)


A few things:
 * straggler timer conversions from Kees
 * memory leak fix in hwsim
 * fix some fallout from regdb changes if wireless is built-in
 * also free aggregation sessions in startup state when station
   goes away, to avoid crashing the timer


Ben Hutchings (1):
  mac80211_hwsim: Fix memory leak in hwsim_new_radio_nl()

Johannes Berg (3):
  nl80211: don't expose wdev->ssid for most interfaces
  cfg80211: initialize regulatory keys/database later
  mac80211: properly free requested-but-not-started TX agg sessions

Kees Cook (2):
  mac80211: Convert timers to use timer_setup()
  mac80211: aggregation: Convert timers to use timer_setup()

 drivers/net/wireless/mac80211_hwsim.c |  5 +++-
 net/mac80211/agg-rx.c | 41 -
 net/mac80211/agg-tx.c | 49 ---
 net/mac80211/ibss.c   |  7 +++--
 net/mac80211/ieee80211_i.h|  3 ++-
 net/mac80211/led.c| 11 
 net/mac80211/main.c   |  3 +--
 net/mac80211/mesh.c   | 27 +--
 net/mac80211/mesh.h   |  2 +-
 net/mac80211/mesh_hwmp.c  |  4 +--
 net/mac80211/mesh_pathtbl.c   |  3 +--
 net/mac80211/mlme.c   | 32 ++-
 net/mac80211/ocb.c| 10 +++
 net/mac80211/sta_info.c   | 15 +++
 net/mac80211/sta_info.h   | 12 +++--
 net/wireless/nl80211.c| 26 +--
 net/wireless/reg.c| 42 +++---
 17 files changed, 155 insertions(+), 137 deletions(-)


Re: wcn36xx: fix iris child-node lookup

2017-11-20 Thread Kalle Valo
Johan Hovold  wrote:

> Fix child-node lookup during probe, which ended up searching the whole
> device tree depth-first starting at the parent rather than just matching
> on its children.
> 
> To make things worse, the parent mmio node was also prematurely freed.
> 
> Fixes: fd52bdae9ab0 ("wcn36xx: Disable 5GHz for wcn3620")
> Cc: Loic Poulain 
> Signed-off-by: Johan Hovold 
> Signed-off-by: Kalle Valo 

Patch applied to ath-current branch of ath.git, thanks.

1967c12896e0 wcn36xx: fix iris child-node lookup

-- 
https://patchwork.kernel.org/patch/10054441/

https://wireless.wiki.kernel.org/en/developers/documentation/submittingpatches



Re: [PATCH 5/8] crypto: remove unused hardirq.h

2017-11-20 Thread Yang Shi

The email to Herbert is returned, resent it.

Yang


On 11/17/17 3:02 PM, Yang Shi wrote:

Preempt counter APIs have been split out, currently, hardirq.h just
includes irq_enter/exit APIs which are not used by crypto at all.

So, remove the unused hardirq.h.

Signed-off-by: Yang Shi 
Cc: Herbert Xu 
Cc: "David S. Miller" 
Cc: linux-cry...@vger.kernel.org
---
  crypto/ablk_helper.c | 1 -
  crypto/blkcipher.c   | 1 -
  crypto/mcryptd.c | 1 -
  3 files changed, 3 deletions(-)

diff --git a/crypto/ablk_helper.c b/crypto/ablk_helper.c
index 1441f07..ee52660 100644
--- a/crypto/ablk_helper.c
+++ b/crypto/ablk_helper.c
@@ -28,7 +28,6 @@
  #include 
  #include 
  #include 
-#include 
  #include 
  #include 
  #include 
diff --git a/crypto/blkcipher.c b/crypto/blkcipher.c
index 6c43a0a..01c0d4a 100644
--- a/crypto/blkcipher.c
+++ b/crypto/blkcipher.c
@@ -18,7 +18,6 @@
  #include 
  #include 
  #include 
-#include 
  #include 
  #include 
  #include 
diff --git a/crypto/mcryptd.c b/crypto/mcryptd.c
index 4e64726..9fa362c 100644
--- a/crypto/mcryptd.c
+++ b/crypto/mcryptd.c
@@ -26,7 +26,6 @@
  #include 
  #include 
  #include 
-#include 
  
  #define MCRYPTD_MAX_CPU_QLEN 100

  #define MCRYPTD_BATCH 9



Re: [E1000-devel] Questions about crashes and GRO

2017-11-20 Thread Alexander Duyck
Hi Sarah,

I am adding the netdev mailing list as I am not certain this is an
i350 specific issue. The traces themselves aren't anything I recognize
as an existing issue. From what I can tell it looks like you are
running Xen, so would I be correct in assuming you are bridging
between VMs? If so are you using any sort of tunnels on your network,
if so what type? This information would be useful as we may be looking
at a bug in a tunnel offload for GRO.

On Fri, Nov 17, 2017 at 3:28 PM, Sarah Newman  wrote:
> Hi,
>
> I have an X10 supermicro with two I350's that has crashed twice now under 
> v4.9.39 within the last 3 weeks, with no crashes before v4.9.39:

What was the last kernel you tested before v4.9.39? Just wondering as
it will help to rule out certain patches as possibly being the issue.

> $ /sbin/lspci | grep -i ethernet
> 02:00.0 Ethernet controller: Intel Corporation I350 Gigabit Network 
> Connection (rev 01)
> 02:00.1 Ethernet controller: Intel Corporation I350 Gigabit Network 
> Connection (rev 01)
> 04:00.0 Ethernet controller: Intel Corporation I350 Gigabit Network 
> Connection (rev 01)
> 04:00.1 Ethernet controller: Intel Corporation I350 Gigabit Network 
> Connection (rev 01)
>
> And some X9 supermicro's that have not crashed, with a single I350 I believe:
> $ /sbin/lspci | grep -i ethernet
> 06:00.0 Ethernet controller: Intel Corporation I350 Gigabit Network 
> Connection (rev 01)
> 06:00.1 Ethernet controller: Intel Corporation I350 Gigabit Network 
> Connection (rev 01)
> 06:00.2 Ethernet controller: Intel Corporation I350 Gigabit Network 
> Connection (rev 01)
> 06:00.3 Ethernet controller: Intel Corporation I350 Gigabit Network 
> Connection (rev 01)
>
> I see in the release notes 
> https://downloadmirror.intel.com/22919/eng/README.txt " Do Not Use LRO When 
> Routing Packets."
>
> We are bridging traffic, not routing, and the crashes are in the GRO code.
>
> Is it possible there are problems with GRO for bridging in the igb driver 
> now? If I disable GRO can I have some confidence it will fix the issue?

As far as LRO not being used when routing, just so you know LRO and
GRO are two very different things. One of the issues with LRO is that
it wasn't reversible in some cases and so could lead to the packet
being changed if they were rerouted. With GRO that shouldn't be the
case as we should be able to get back out the original packets that
were put into a frame. So there shouldn't be any issues using GRO with
bridging or routing.

GRO isn't in the driver. It is in the network stack of the kernel
itself. The only responsibility of igb is to provide the frames in the
correct format so that they can be assembled by GRO if it is enabled.

> Here are my offload settings:
> Features for eth0:
> rx-checksumming: on
> tx-checksumming: on
> tx-checksum-ipv4: off [fixed]
> tx-checksum-ip-generic: on
> tx-checksum-ipv6: off [fixed]
> tx-checksum-fcoe-crc: off [fixed]
> tx-checksum-sctp: on
> scatter-gather: on
> tx-scatter-gather: on
> tx-scatter-gather-fraglist: off [fixed]
> tcp-segmentation-offload: on
> tx-tcp-segmentation: on
> tx-tcp-ecn-segmentation: off [fixed]
> tx-tcp-mangleid-segmentation: off
> tx-tcp6-segmentation: on
> udp-fragmentation-offload: off [fixed]
> generic-segmentation-offload: on
> generic-receive-offload: on
> large-receive-offload: off [fixed]
> rx-vlan-offload: on
> tx-vlan-offload: on
> ntuple-filters: off
> receive-hashing: on
> highdma: on [fixed]
> rx-vlan-filter: on [fixed]
> vlan-challenged: off [fixed]
> tx-lockless: off [fixed]
> netns-local: off [fixed]
> tx-gso-robust: off [fixed]
> tx-fcoe-segmentation: off [fixed]
> tx-gre-segmentation: on
> tx-gre-csum-segmentation: on
> tx-ipxip4-segmentation: on
> tx-ipxip6-segmentation: on
> tx-udp_tnl-segmentation: on
> tx-udp_tnl-csum-segmentation: on
> tx-gso-partial: on
> tx-sctp-segmentation: off [fixed]
> fcoe-mtu: off [fixed]
> tx-nocache-copy: off
> loopback: off [fixed]
> rx-fcs: off [fixed]
> rx-all: off
> tx-vlan-stag-hw-insert: off [fixed]
> rx-vlan-stag-hw-parse: off [fixed]
> rx-vlan-stag-filter: off [fixed]
> l2-fwd-offload: off [fixed]
> busy-poll: off [fixed]
> hw-tc-offload: off [fixed]
>
> First crash:
>
> [4083386.299221] [ cut here ]
> [4083386.299358] WARNING: CPU: 0 PID: 0 at net/ipv4/af_inet.c:1473 
> inet_gro_complete+0xbb/0xd0
> [4083386.299520] Modules linked in: sb_edac edac_core 8021q mrp garp 
> nf_conntrack_ipv4 nf_defrag_ipv4 xt_state nf_conntrack xt_physdev 
> ip6table_filter
> ip6_tables xen_pciback blktap xen_netback xen_gntdev xen_gnt
> alloc xenfs xen_privcmd xen_evtchn xen_blkback tun sch_htb fuse ext2 ebt_mark 
> ebt_ip ebt_arp ebtable_filter ebtables drbd lru_cache cls_fw
> br_netfilter bridge stp llc iTCO_wdt iTCO_vendor_support pcspkr raid456 
> async_raid6_recov async_pq
>  async_xor xor async_memcpy async_tx raid10 raid6_pq libcrc32c joydev shpchp 
> i2c_i801 i2

Re: [PATCH][v4] uprobes/x86: emulate push insns for uprobe on x86

2017-11-20 Thread Oleg Nesterov
On 11/17, Yonghong Song wrote:
>
> On 11/17/17 9:25 AM, Oleg Nesterov wrote:
> >On 11/15, Yonghong Song wrote:
> >>
> >>v3 -> v4:
> >>   . Revert most of v3 change as 32bit emulation is not really working
> >> on x86_64 platform as among other issues, function emulate_push_stack()
> >> needs to account for 32bit app on 64bit platform.
> >> A separate effort is ongoing to address this issue.
> >
> >Reviewed-by: Oleg Nesterov 
> >
> >
> >
> >Please test your patch with the fix below, in this particular case the
> >TIF_IA32 check should be fine. Although this is not what we really want,
> >we should probably use user_64bit_mode(regs) which checks ->cs. But this
> >needs more changes and doesn't solve other problems (get_unmapped_area)
> >so I still can't decide what should we do right now...
>
> I tested the below change with my patch. On x86_64, both 64bit and 32bit
> program can be uprobe emulated properly.

Good, so your patch is fine.

> On x86_32, however, there is a
> compilation error like below:

Yes, yes, when I said "in this particular case" I meant x86_64 system only.

Sorry for confusion, I asked you to test this additional change just to
ensure that we didn't miss something and your patch has no problems with
32bit tasks on 64bit system, except those we need to fix anyway.

Oleg.



Re: len = bpf_probe_read_str(); bpf_perf_event_output(... len) == FAIL

2017-11-20 Thread Yonghong Song



On 11/20/17 5:31 AM, Arnaldo Carvalho de Melo wrote:

Em Tue, Nov 14, 2017 at 09:25:17PM +0100, Daniel Borkmann escreveu:

On 11/14/2017 07:15 PM, Yonghong Song wrote:

On 11/14/17 6:19 AM, Daniel Borkmann wrote:

On 11/14/2017 02:42 PM, Arnaldo Carvalho de Melo wrote:

Em Tue, Nov 14, 2017 at 02:09:34PM +0100, Daniel Borkmann escreveu:

On 11/14/2017 01:58 PM, Arnaldo Carvalho de Melo wrote:

Em Tue, Nov 14, 2017 at 01:09:39AM +0100, Daniel Borkmann escreveu:

On 11/13/2017 04:08 PM, Arnaldo Carvalho de Melo wrote:

libbpf: -- BEGIN DUMP LOG ---
libbpf:
0: (79) r3 = *(u64 *)(r1 +104)
1: (b7) r2 = 0
2: (bf) r6 = r1
3: (bf) r1 = r10
4: (07) r1 += -128
5: (b7) r2 = 128
6: (85) call bpf_probe_read_str#45
7: (bf) r1 = r0
8: (07) r1 += -1
9: (67) r1 <<= 32
10: (77) r1 >>= 32
11: (25) if r1 > 0x7f goto pc+11


Right, so the compiler is optimizing the two tests into a single one above,
which means lower bound cannot properly be derived again by the verifier due
to this and thus you'll get the error. Similar issue was seen recently [1].

Does the below hack work for you?

int prog([...])
{
  char filename[128];
  int ret = bpf_probe_read_str(filename, sizeof(filename), 
filename_ptr);
  if (ret > 0)
  bpf_perf_event_output(ctx, &__bpf_stdout__, 
BPF_F_CURRENT_CPU, filename,
    ret & (sizeof(filename) - 1));
  return 1;
}

r0 should keep on tracking bounds here at least:

prog:
     0:    bf 16 00 00 00 00 00 00 r6 = r1
     1:    bf a1 00 00 00 00 00 00 r1 = r10
     2:    07 01 00 00 80 ff ff ff r1 += -128
     3:    b7 02 00 00 80 00 00 00 r2 = 128
     4:    85 00 00 00 2d 00 00 00 call 45
     5:    67 00 00 00 20 00 00 00 r0 <<= 32
     6:    c7 00 00 00 20 00 00 00 r0 s>>= 32
     7:    b7 01 00 00 01 00 00 00 r1 = 1
     8:    6d 01 0a 00 00 00 00 00 if r1 s> r0 goto 10
     9:    57 00 00 00 7f 00 00 00 r0 &= 127
    10:    bf a4 00 00 00 00 00 00 r4 = r10
    11:    07 04 00 00 80 ff ff ff r4 += -128
    12:    bf 61 00 00 00 00 00 00 r1 = r6
    13:    18 02 00 00 00 00 00 00 00 00 00 00 00 00 00 00 r2 = 0ll
    15:    18 03 00 00 ff ff ff ff 00 00 00 00 00 00 00 00 r3 = 
4294967295ll
    17:    bf 05 00 00 00 00 00 00 r5 = r0
    18:    85 00 00 00 19 00 00 00 call 25

    [1] 
https://urldefense.proofpoint.com/v2/url?u=http-3A__patchwork.ozlabs.org_project_netdev_list_-3Fseries-3D13211&d=DwIDaQ&c=5VD0RTtNlTh3ycd41b3MUw&r=DA8e1B5r073vIqRrFz7MRA&m=Qp3xFfXEz-CT8rzYtrHeXbow2M6FlsUzwcY32i3_2Q0&s=z0d6b_hxStA845Kh7epJ-JiFwkiWqUH_z3fEadwqAQY&e=


Not yet:

6: (85) call bpf_probe_read_str#45
7: (bf) r1 = r0
8: (67) r1 <<= 32
9: (77) r1 >>= 32
10: (15) if r1 == 0x0 goto pc+10
   R0=inv(id=0) R1=inv(id=0,umax_value=4294967295,var_off=(0x0; 0x)) 
R6=ctx(id=0,off=0,imm=0) R10=fp0
11: (57) r0 &= 127
12: (bf) r4 = r10
13: (07) r4 += -128
14: (bf) r1 = r6
15: (18) r2 = 0x92bfc2aba840u
17: (18) r3 = 0x
19: (bf) r5 = r0
20: (85) call bpf_perf_event_output#25
invalid stack type R4 off=-128 access_size=0

I'll try updating clang/llvm...

Full details:

[root@jouet bpf]# cat open.c
#include "bpf.h"

SEC("prog=do_sys_open filename")
int prog(void *ctx, int err, const char __user *filename_ptr)
{
 char filename[128];
 const unsigned len = bpf_probe_read_str(filename, sizeof(filename), 
filename_ptr);


Btw, I was using 'int' here above instead of 'unsigned' as strncpy_from_unsafe()
could potentially return errors like -EFAULT.


I changed to int, didn't help
  

Currently having a version compiled from the git tree:

# llc --version
LLVM 
(https://urldefense.proofpoint.com/v2/url?u=http-3A__llvm.org_&d=DwIDaQ&c=5VD0RTtNlTh3ycd41b3MUw&r=DA8e1B5r073vIqRrFz7MRA&m=Qp3xFfXEz-CT8rzYtrHeXbow2M6FlsUzwcY32i3_2Q0&s=BKC_Gu9s1hw0v13OCgCpfsGtAY2hE7dujFqg8LNaK2I&e=):
    LLVM version 6.0.0git-2d810c2
    Optimized build.
    Default target: x86_64-unknown-linux-gnu
    Host CPU: skylake


[root@jouet bpf]# llc --version
LLVM 
(https://urldefense.proofpoint.com/v2/url?u=http-3A__llvm.org_&d=DwIDaQ&c=5VD0RTtNlTh3ycd41b3MUw&r=DA8e1B5r073vIqRrFz7MRA&m=Qp3xFfXEz-CT8rzYtrHeXbow2M6FlsUzwcY32i3_2Q0&s=BKC_Gu9s1hw0v13OCgCpfsGtAY2hE7dujFqg8LNaK2I&e=):
    LLVM version 4.0.0svn

Old stuff! ;-) Will change, but improving these messages should be on
the radar, I think :-)


Yep, agree, I think we need a generic, better solution for this type of
issue instead of converting individual helpers to handle 0 min bound and
then only bailing out in such case; need to brainstorm a bit on that.

I think for the above in your case ...

   [...]
    6: (85) call bpf_probe_read_str#45
    7: (bf) r1 = r0
    8: (67) r1 <<= 32
    9: (77) r1 >>= 32
   10: (15) if r1 == 0x0 goto pc+10
    R0=inv(id=0) R1=inv(id=0,umax_value=4294967295,var_off=(0x0; 0x)) 
R6=ctx(id=0,off=0,imm=0) R10=fp0
   11: (57) r0 &= 127
   

Re: [PATCH] net: sched: crash on blocks with goto chain action

2017-11-20 Thread Cong Wang
On Sun, Nov 19, 2017 at 8:17 AM, Roman Kapl  wrote:
> tcf_block_put_ext has assumed that all filters (and thus their goto
> actions) are destroyed in RCU callback and thus can not race with our
> list iteration. However, that is not true during netns cleanup (see
> tcf_exts_get_net comment).
>
> Prevent the user after free by holding the current list element we are
> iterating over (foreach_safe is not enough).

Hmm...

Looks like we need to restore the trick we used previously, that is
holding refcnt for all list entries before this list iteration.


[PATCH net,stable] net: qmi_wwan: add Quectel BG96 2c7c:0296

2017-11-20 Thread Sebastian Sjoholm
Quectel BG96 is an Qualcomm MDM9206 based IoT modem, supporting both 
CAT-M and NB-IoT. Tested hardware is BG96 mounted on Quectel development 
board (EVB). The USB id is added to qmi_wwan.c to allow QMI 
communication with the BG96.

Signed-off-by: Sebastian Sjoholm 

---
 drivers/net/usb/qmi_wwan.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/net/usb/qmi_wwan.c b/drivers/net/usb/qmi_wwan.c
index 720a3a248070..c750cf7c042b 100644
--- a/drivers/net/usb/qmi_wwan.c
+++ b/drivers/net/usb/qmi_wwan.c
@@ -1239,6 +1239,7 @@ static const struct usb_device_id products[] = {
{QMI_FIXED_INTF(0x1e0e, 0x9001, 5)},/* SIMCom 7230E */
{QMI_QUIRK_SET_DTR(0x2c7c, 0x0125, 4)}, /* Quectel EC25, EC20 R2.0  
Mini PCIe */
{QMI_QUIRK_SET_DTR(0x2c7c, 0x0121, 4)}, /* Quectel EC21 Mini PCIe */
+   {QMI_FIXED_INTF(0x2c7c, 0x0296, 4)},/* Quectel BG96 */
 
/* 4. Gobi 1000 devices */
{QMI_GOBI1K_DEVICE(0x05c6, 0x9212)},/* Acer Gobi Modem Device */
-- 
2.11.0 (Apple Git-81)



Re: [PATCH RFC 0/5] Support asynchronous crypto for IPsec GSO.

2017-11-20 Thread John Fastabend
On 11/20/2017 05:09 AM, David Miller wrote:
> From: Steffen Klassert 
> Date: Mon, 20 Nov 2017 08:37:47 +0100
> 
>> This patchset implements asynchronous crypto handling
>> in the layer 2 TX path. With this we can allow IPsec
>> ESP GSO for software crypto. This also merges the IPsec
>> GSO and non-GSO paths to both use validate_xmit_xfrm().
>  ...
> 
> Code looks generally fine to me.  Only thing of note is that this
> adds a new dev_requeue_skb() call site and that might intersect with
> John Fastabend's RFC work to make qdiscs lockless.

Right, fortunately it doesn't appear that the conflicts are too
hard to resolve. It looks like sch_redirect_xmit() will need to be
resolved with an additional check to test if the qdisc lock is
needed. Then assuming my reading is correct validate_xmit_xfrm(),
xfrm_dev_resume(), and xfrm_dev_backlog() can run without qdisc
lock already.

Thanks,
John

> 
> Also, please adhere to the reverse christmas tree rule for the
> ordering of your function local variables.
> 
> Thank you.
> 



Re: [PATCH][v4] uprobes/x86: emulate push insns for uprobe on x86

2017-11-20 Thread Yonghong Song



On 11/20/17 8:41 AM, Oleg Nesterov wrote:

On 11/17, Yonghong Song wrote:


On 11/17/17 9:25 AM, Oleg Nesterov wrote:

On 11/15, Yonghong Song wrote:


v3 -> v4:
   . Revert most of v3 change as 32bit emulation is not really working
 on x86_64 platform as among other issues, function emulate_push_stack()
 needs to account for 32bit app on 64bit platform.
 A separate effort is ongoing to address this issue.


Reviewed-by: Oleg Nesterov 



Please test your patch with the fix below, in this particular case the
TIF_IA32 check should be fine. Although this is not what we really want,
we should probably use user_64bit_mode(regs) which checks ->cs. But this
needs more changes and doesn't solve other problems (get_unmapped_area)
so I still can't decide what should we do right now...


I tested the below change with my patch. On x86_64, both 64bit and 32bit
program can be uprobe emulated properly.


Good, so your patch is fine.


Thanks!




On x86_32, however, there is a
compilation error like below:


Yes, yes, when I said "in this particular case" I meant x86_64 system only.

Sorry for confusion, I asked you to test this additional change just to
ensure that we didn't miss something and your patch has no problems with
32bit tasks on 64bit system, except those we need to fix anyway.


Understood. I actually tried a little to see whether I could have a 
simple way to fix 32bit compilation error without using ugly "#ifdef 
CONFIG_X86_64". Maybe is_64bit_mm is a good choice. But we could defer 
this until you have a comprehensive fix for 32bit app uprobe on 64bit 
systems as there are multiple issues for this.




Oleg.



Re: [PATCH net,stable] net: qmi_wwan: add Quectel BG96 2c7c:0296

2017-11-20 Thread Bjørn Mork
Sebastian Sjoholm  writes:

> Quectel BG96 is an Qualcomm MDM9206 based IoT modem, supporting both 
> CAT-M and NB-IoT. Tested hardware is BG96 mounted on Quectel development 
> board (EVB). The USB id is added to qmi_wwan.c to allow QMI 
> communication with the BG96.
>
> Signed-off-by: Sebastian Sjoholm 

Perfect.  Thanks.

Acked-by: Bjørn Mork 


[PATCH v2 01/31] net: Assign net to net_namespace_list in setup_net()

2017-11-20 Thread Kirill Tkhai
This patch merges two repeating pieces of code in one,
and they will live in setup_net() now.

It acts as cleanup even despite init_net_initialized
assignment is reordered with the linking of net now.
This variable is need for proc_net_init() called from:

start_kernel()->proc_root_init()->proc_net_init(),

which can't race with net_ns_init(), called from
initcall.

Signed-off-by: Kirill Tkhai 
---
 net/core/net_namespace.c |   13 +++--
 1 file changed, 3 insertions(+), 10 deletions(-)

diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
index b797832565d3..7ecf71050ffa 100644
--- a/net/core/net_namespace.c
+++ b/net/core/net_namespace.c
@@ -296,6 +296,9 @@ static __net_init int setup_net(struct net *net, struct 
user_namespace *user_ns)
if (error < 0)
goto out_undo;
}
+   rtnl_lock();
+   list_add_tail_rcu(&net->list, &net_namespace_list);
+   rtnl_unlock();
 out:
return error;
 
@@ -417,11 +420,6 @@ struct net *copy_net_ns(unsigned long flags,
 
net->ucounts = ucounts;
rv = setup_net(net, user_ns);
-   if (rv == 0) {
-   rtnl_lock();
-   list_add_tail_rcu(&net->list, &net_namespace_list);
-   rtnl_unlock();
-   }
mutex_unlock(&net_mutex);
if (rv < 0) {
dec_net_namespaces(ucounts);
@@ -847,11 +845,6 @@ static int __init net_ns_init(void)
panic("Could not setup the initial network namespace");
 
init_net_initialized = true;
-
-   rtnl_lock();
-   list_add_tail_rcu(&init_net.list, &net_namespace_list);
-   rtnl_unlock();
-
mutex_unlock(&net_mutex);
 
register_pernet_subsys(&net_ns_ops);



[PATCH v2 21/31] net: Convert genl_pernet_ops

2017-11-20 Thread Kirill Tkhai
This pernet_operations create and destroy net::genl_sock.
Foreign pernet_operations don't touch it.

Signed-off-by: Kirill Tkhai 
---
 net/netlink/genetlink.c |1 +
 1 file changed, 1 insertion(+)

diff --git a/net/netlink/genetlink.c b/net/netlink/genetlink.c
index d444daf1ac04..a66fad4c5ffa 100644
--- a/net/netlink/genetlink.c
+++ b/net/netlink/genetlink.c
@@ -1035,6 +1035,7 @@ static void __net_exit genl_pernet_exit(struct net *net)
 static struct pernet_operations genl_pernet_ops = {
.init = genl_pernet_init,
.exit = genl_pernet_exit,
+   .async = true,
 };
 
 static int __init genl_init(void)



[PATCH v2 23/31] net: Convert sysctl_core_ops

2017-11-20 Thread Kirill Tkhai
These pernet_operations register and destroy sysctl
directory, and it's not interested for foreign
pernet_operations.

Signed-off-by: Kirill Tkhai 
---
 net/core/sysctl_net_core.c |1 +
 1 file changed, 1 insertion(+)

diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c
index cbc3dde4cfcc..1f8c94d726da 100644
--- a/net/core/sysctl_net_core.c
+++ b/net/core/sysctl_net_core.c
@@ -520,6 +520,7 @@ static __net_exit void sysctl_core_net_exit(struct net *net)
 static __net_initdata struct pernet_operations sysctl_core_ops = {
.init = sysctl_core_net_init,
.exit = sysctl_core_net_exit,
+   .async = true,
 };
 
 static __init int sysctl_core_init(void)



[PATCH v2 27/31] net: Convert ipv4_sysctl_ops

2017-11-20 Thread Kirill Tkhai
These pernet_operations create and destroy sysctl,
which are not touched by anybody else.

Signed-off-by: Kirill Tkhai 
---
 net/ipv4/sysctl_net_ipv4.c |1 +
 1 file changed, 1 insertion(+)

diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 93e172118a94..89683d868b37 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -1219,6 +1219,7 @@ static __net_exit void ipv4_sysctl_exit_net(struct net 
*net)
 static __net_initdata struct pernet_operations ipv4_sysctl_ops = {
.init = ipv4_sysctl_init_net,
.exit = ipv4_sysctl_exit_net,
+   .async = true,
 };
 
 static __init int sysctl_ipv4_init(void)



[PATCH v2 29/31] net: Convert loopback_net_ops

2017-11-20 Thread Kirill Tkhai
These pernet_operations have only init() method. It allocates
memory for net_device, calls register_netdev() and assigns
net::loopback_dev.

register_netdev() is allowed be used without additional locks,
as it's synchronized on rtnl_lock(). There are many examples
of using this functon directly from ioctl().

The only difference, compared to ioctl(), is that net is not
completely alive at this moment. But it looks like, there is
no way for parallel pernet_operations to dereference
the net_device, as the most of struct net_device lists,
where it's linked, are related to net, and the net is not liked.

The exceptions are net_device::unreg_list, close_list, todo_list,
used for unregistration, and ::link_watch_list, where net_device
may be linked to global lists.

Unregistration of loopback_dev obviously can't happen, when
loopback_net_init() is executing, as the net as alive. It occurs
in default_device_ops, which currently requires net_mutex,
and it behaves as a barrier at the moment. It will be considered
in next patch.

Speaking about link_watch_list, it seems, there is no way
for loopback_dev at time of registration to be linked in lweventlist
and be available for another pernet_operations.

Signed-off-by: Kirill Tkhai 
---
 drivers/net/loopback.c |1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/net/loopback.c b/drivers/net/loopback.c
index 30612497643c..b97a907ea5aa 100644
--- a/drivers/net/loopback.c
+++ b/drivers/net/loopback.c
@@ -230,4 +230,5 @@ static __net_init int loopback_net_init(struct net *net)
 /* Registered in net/core/dev.c */
 struct pernet_operations __net_initdata loopback_net_ops = {
.init = loopback_net_init,
+   .async = true,
 };



[PATCH v2 31/31] net: Convert diag_net_ops

2017-11-20 Thread Kirill Tkhai
These pernet operations just create and destroy netlink
socket. The socket is pernet and else operations don't
touch it.

Signed-off-by: Kirill Tkhai 
---
 net/core/sock_diag.c |1 +
 1 file changed, 1 insertion(+)

diff --git a/net/core/sock_diag.c b/net/core/sock_diag.c
index 217f4e3b82f6..220130aee51d 100644
--- a/net/core/sock_diag.c
+++ b/net/core/sock_diag.c
@@ -328,6 +328,7 @@ static void __net_exit diag_net_exit(struct net *net)
 static struct pernet_operations diag_net_ops = {
.init = diag_net_init,
.exit = diag_net_exit,
+   .async = true,
 };
 
 static int __init sock_diag_init(void)



[PATCH v2 30/31] net: Convert default_device_ops

2017-11-20 Thread Kirill Tkhai
These pernet operations consist of exit() and exit_batch() methods.

default_device_exit() moves not-local and virtual devices to init_net.
There is nothing exiting, because this may happen in any time
on a working system, and rtnl_lock() and synchronize_net() protect
us from all cases of external dereference.

The same for default_device_exit_batch(). Similar unregisteration
may happen in any time on a system. Here several lists (like todo_list),
which are accessed under rtnl_lock(). After rtnl_unlock() and
netdev_run_todo() all the devices are flushed.

Signed-off-by: Kirill Tkhai 
---
 net/core/dev.c |1 +
 1 file changed, 1 insertion(+)

diff --git a/net/core/dev.c b/net/core/dev.c
index 41a576a17430..914fdb260aae 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -8757,6 +8757,7 @@ static void __net_exit default_device_exit_batch(struct 
list_head *net_list)
 static struct pernet_operations __net_initdata default_device_ops = {
.exit = default_device_exit,
.exit_batch = default_device_exit_batch,
+   .async = true,
 };
 
 /*



[PATCH v2 28/31] net: Convert addrconf_ops

2017-11-20 Thread Kirill Tkhai
These pernet_operations (un)register sysctl, which
are not touched by anybody else.

So, it's safe to make them async.

Signed-off-by: Kirill Tkhai 
---
 net/ipv6/addrconf.c |1 +
 1 file changed, 1 insertion(+)

diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index a0ae1c9d37df..fb7cf120daa7 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -6523,6 +6523,7 @@ static void __net_exit addrconf_exit_net(struct net *net)
 static struct pernet_operations addrconf_ops = {
.init = addrconf_init_net,
.exit = addrconf_exit_net,
+   .async = true,
 };
 
 static struct rtnl_af_ops inet6_ops __read_mostly = {



[PATCH v2 26/31] net: Convert packet_net_ops

2017-11-20 Thread Kirill Tkhai
These pernet_operations just create and destroy /proc entry,
and another operations do not touch it.

Also, nobody else are interested in foreign net::packet::sklist.

Signed-off-by: Kirill Tkhai 
---
 net/packet/af_packet.c |1 +
 1 file changed, 1 insertion(+)

diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index 737092ca9b4e..700cdf36767b 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -4566,6 +4566,7 @@ static void __net_exit packet_net_exit(struct net *net)
 static struct pernet_operations packet_net_ops = {
.init = packet_net_init,
.exit = packet_net_exit,
+   .async = true,
 };
 
 



[PATCH v2 25/31] net: Convert unix_net_ops

2017-11-20 Thread Kirill Tkhai
These pernet_operations are just create and destroy
/proc and sysctl entries, and are not touched by
foreign pernet_operations.

So, we are able to make them async.

Signed-off-by: Kirill Tkhai 
---
 net/unix/af_unix.c |1 +
 1 file changed, 1 insertion(+)

diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index a9ee634f3c42..1ddf77260849 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -2913,6 +2913,7 @@ static void __net_exit unix_net_exit(struct net *net)
 static struct pernet_operations unix_net_ops = {
.init = unix_net_init,
.exit = unix_net_exit,
+   .async = true,
 };
 
 static int __init af_unix_init(void)



[PATCH v2 24/31] net: Convert pernet_subsys, registered from inet_init()

2017-11-20 Thread Kirill Tkhai
arp_net_ops just addr/removes /proc entry.

devinet_ops allocates and frees duplicate of init_net tables
and (un)registers sysctl entries.

fib_net_ops allocates and frees pernet tables, creates/destroys
netlink socket and (un)initializes /proc entries. Foreign
pernet_operations do not touch them.

ip_rt_proc_ops only modifies pernet /proc entries.

xfrm_net_ops creates/destroys /proc entries, allocates/frees
pernet statistics, hashes and tables, and (un)initializes
sysctl files. These are not touched by foreigh pernet_operations

xfrm4_net_ops allocates/frees private pernet memory, and
configures sysctls.

sysctl_route_ops creates/destroys sysctls.

rt_genid_ops only initializes fields of just allocated net.

ipv4_inetpeer_ops allocated/frees net private memory.

igmp_net_ops just creates/destroys /proc files and socket,
noone else interested in.

tcp_sk_ops seems to be safe, because tcp_sk_init() does not
depend on any other pernet_operations modifications. Iteration
over hash table in inet_twsk_purge() is made under RCU lock,
and it's safe to iterate the table this way. Removing from
the table happen from inet_twsk_deschedule_put(), but this
function is safe without any extern locks, as it's synchronized
inside itself. There are many examples, it's used in different
context. So, it's safe to leave tcp_sk_exit_batch() unlocked.

tcp_net_metrics_ops is synchronized on tcp_metrics_lock and safe.

udplite4_net_ops only creates/destroys pernet /proc file.

icmp_sk_ops creates percpu sockets, not touched by foreign
pernet_operations.

ipmr_net_ops creates/destroys pernet fib tables, (un)registers
fib rules and /proc files. This seem to be safe to execute
in parallel with foreign pernet_operations.

af_inet_ops just sets up default parameters of newly created net.

ipv4_mib_ops creates and destroys pernet percpu statistics.

raw_net_ops, tcp4_net_ops, udp4_net_ops, ping_v4_net_ops
and ip_proc_ops only create/destroy pernet /proc files.

ip4_frags_ops creates and destroys sysctl file.

So, it's safe to make the pernet_operations async.

Signed-off-by: Kirill Tkhai 
---
 net/ipv4/af_inet.c  |2 ++
 net/ipv4/arp.c  |1 +
 net/ipv4/devinet.c  |1 +
 net/ipv4/fib_frontend.c |1 +
 net/ipv4/icmp.c |1 +
 net/ipv4/igmp.c |1 +
 net/ipv4/ip_fragment.c  |1 +
 net/ipv4/ipmr.c |1 +
 net/ipv4/ping.c |1 +
 net/ipv4/proc.c |1 +
 net/ipv4/raw.c  |1 +
 net/ipv4/route.c|4 
 net/ipv4/tcp_ipv4.c |2 ++
 net/ipv4/tcp_metrics.c  |1 +
 net/ipv4/udp.c  |1 +
 net/ipv4/udplite.c  |1 +
 net/ipv4/xfrm4_policy.c |1 +
 net/xfrm/xfrm_policy.c  |1 +
 18 files changed, 23 insertions(+)

diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index ce4aa827be05..d1a2e9afbb50 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -1697,6 +1697,7 @@ static __net_exit void ipv4_mib_exit_net(struct net *net)
 static __net_initdata struct pernet_operations ipv4_mib_ops = {
.init = ipv4_mib_init_net,
.exit = ipv4_mib_exit_net,
+   .async = true,
 };
 
 static int __init init_ipv4_mibs(void)
@@ -1750,6 +1751,7 @@ static __net_exit void inet_exit_net(struct net *net)
 static __net_initdata struct pernet_operations af_inet_ops = {
.init = inet_init_net,
.exit = inet_exit_net,
+   .async = true,
 };
 
 static int __init init_inet_pernet_ops(void)
diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c
index a8d7c5a9fb05..19bcd10a928b 100644
--- a/net/ipv4/arp.c
+++ b/net/ipv4/arp.c
@@ -1443,6 +1443,7 @@ static void __net_exit arp_net_exit(struct net *net)
 static struct pernet_operations arp_net_ops = {
.init = arp_net_init,
.exit = arp_net_exit,
+   .async = true,
 };
 
 static int __init arp_proc_init(void)
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index a4573bccd6da..c359bda18ff5 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -2474,6 +2474,7 @@ static __net_exit void devinet_exit_net(struct net *net)
 static __net_initdata struct pernet_operations devinet_ops = {
.init = devinet_init_net,
.exit = devinet_exit_net,
+   .async = true,
 };
 
 static struct rtnl_af_ops inet_af_ops __read_mostly = {
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index f52d27a422c3..6eb4aa5ee66f 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -1361,6 +1361,7 @@ static void __net_exit fib_net_exit(struct net *net)
 static struct pernet_operations fib_net_ops = {
.init = fib_net_init,
.exit = fib_net_exit,
+   .async = true,
 };
 
 void __init ip_fib_init(void)
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index 1617604c9284..cc56efa64d5c 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -1257,6 +1257,7 @@ static int __net_init icmp_sk_init(struct net *net)
 static struct pernet_operations __net_initdata icmp_sk_ops = {
.init = icmp_sk_init,
.exit =

[PATCH v2 22/31] net: Convert wext_pernet_ops

2017-11-20 Thread Kirill Tkhai
These pernet_operations initialize and purge net::wext_nlevents
queue, and are not touched by foreign pernet_operations.

Mark them async.

Signed-off-by: Kirill Tkhai 
---
 net/wireless/wext-core.c |1 +
 1 file changed, 1 insertion(+)

diff --git a/net/wireless/wext-core.c b/net/wireless/wext-core.c
index 6cdb054484d6..32c9f1c303f9 100644
--- a/net/wireless/wext-core.c
+++ b/net/wireless/wext-core.c
@@ -390,6 +390,7 @@ static void __net_exit wext_pernet_exit(struct net *net)
 static struct pernet_operations wext_pernet_ops = {
.init = wext_pernet_init,
.exit = wext_pernet_exit,
+   .async = true,
 };
 
 static int __init wireless_nlevent_init(void)



[PATCH v2 20/31] net: Convert subsys_initcall() registered pernet_operations from net/sched

2017-11-20 Thread Kirill Tkhai
psched_net_ops only creates and destroyes /proc entry,
and safe to be executed in parallel with any foreigh
pernet_operations.

tcf_action_net_ops initializes and destructs tcf_action_net::egdev_ht,
which is not touched by foreign pernet_operations.

So, make them async.

Signed-off-by: Kirill Tkhai 
---
 net/sched/act_api.c |1 +
 net/sched/sch_api.c |1 +
 2 files changed, 2 insertions(+)

diff --git a/net/sched/act_api.c b/net/sched/act_api.c
index 4d33a50a8a6d..41a26f551dbb 100644
--- a/net/sched/act_api.c
+++ b/net/sched/act_api.c
@@ -1464,6 +1464,7 @@ static struct pernet_operations tcf_action_net_ops = {
.exit = tcf_action_net_exit,
.id = &tcf_action_net_id,
.size = sizeof(struct tcf_action_net),
+   .async = true,
 };
 
 static int __init tc_action_init(void)
diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c
index b6c4f536876b..09d63c83542a 100644
--- a/net/sched/sch_api.c
+++ b/net/sched/sch_api.c
@@ -2002,6 +2002,7 @@ static void __net_exit psched_net_exit(struct net *net)
 static struct pernet_operations psched_net_ops = {
.init = psched_net_init,
.exit = psched_net_exit,
+   .async = true,
 };
 
 static int __init pktsched_init(void)



[PATCH v2 19/31] net: Convert fib_* pernet_operations, registered via subsys_initcall

2017-11-20 Thread Kirill Tkhai
Both of them create and initialize lists, which are not touched
by another foreing pernet_operations.

Signed-off-by: Kirill Tkhai 
---
 net/core/fib_notifier.c |1 +
 net/core/fib_rules.c|1 +
 2 files changed, 2 insertions(+)

diff --git a/net/core/fib_notifier.c b/net/core/fib_notifier.c
index 0c048bdeb016..5ace0705a3f9 100644
--- a/net/core/fib_notifier.c
+++ b/net/core/fib_notifier.c
@@ -171,6 +171,7 @@ static void __net_exit fib_notifier_net_exit(struct net 
*net)
 static struct pernet_operations fib_notifier_net_ops = {
.init = fib_notifier_net_init,
.exit = fib_notifier_net_exit,
+   .async = true,
 };
 
 static int __init fib_notifier_init(void)
diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c
index 98e1066c3d55..cb071b8e8d17 100644
--- a/net/core/fib_rules.c
+++ b/net/core/fib_rules.c
@@ -1030,6 +1030,7 @@ static void __net_exit fib_rules_net_exit(struct net *net)
 static struct pernet_operations fib_rules_net_ops = {
.init = fib_rules_net_init,
.exit = fib_rules_net_exit,
+   .async = true,
 };
 
 static int __init fib_rules_init(void)



[PATCH v2 18/31] net: Convert pernet_subsys ops, registered via net_dev_init()

2017-11-20 Thread Kirill Tkhai
There are:
1)dev_proc_ops and dev_mc_net_ops, which create and destroy
pernet proc file and not interested to another net namespaces;
2)netdev_net_ops, which creates pernet hash, which is not
touched by another pernet_operations.

So, make them async.

Signed-off-by: Kirill Tkhai 
---
 net/core/dev.c|1 +
 net/core/net-procfs.c |2 ++
 2 files changed, 3 insertions(+)

diff --git a/net/core/dev.c b/net/core/dev.c
index 8ee29f4f5fa9..41a576a17430 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -8656,6 +8656,7 @@ static void __net_exit netdev_exit(struct net *net)
 static struct pernet_operations __net_initdata netdev_net_ops = {
.init = netdev_init,
.exit = netdev_exit,
+   .async = true,
 };
 
 static void __net_exit default_device_exit(struct net *net)
diff --git a/net/core/net-procfs.c b/net/core/net-procfs.c
index 615ccab55f38..16b250dd50ed 100644
--- a/net/core/net-procfs.c
+++ b/net/core/net-procfs.c
@@ -352,6 +352,7 @@ static void __net_exit dev_proc_net_exit(struct net *net)
 static struct pernet_operations __net_initdata dev_proc_ops = {
.init = dev_proc_net_init,
.exit = dev_proc_net_exit,
+   .async = true,
 };
 
 static int dev_mc_seq_show(struct seq_file *seq, void *v)
@@ -409,6 +410,7 @@ static void __net_exit dev_mc_net_exit(struct net *net)
 static struct pernet_operations __net_initdata dev_mc_net_ops = {
.init = dev_mc_net_init,
.exit = dev_mc_net_exit,
+   .async = true,
 };
 
 int __init dev_proc_init(void)



[PATCH v2 17/31] net: Convert proto_net_ops

2017-11-20 Thread Kirill Tkhai
This patch starts to convert pernet_subsys, registered
from subsys initcalls.

It seems safe to be executed in parallel with others,
as it's only creates/destoyes proc entry,
which nobody else is not interested in.

Signed-off-by: Kirill Tkhai 
---
 net/core/sock.c |1 +
 1 file changed, 1 insertion(+)

diff --git a/net/core/sock.c b/net/core/sock.c
index f04f5ec87d04..d9c3de4239e6 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -3344,6 +3344,7 @@ static __net_exit void proto_exit_net(struct net *net)
 static __net_initdata struct pernet_operations proto_net_ops = {
.init = proto_init_net,
.exit = proto_exit_net,
+   .async = true,
 };
 
 static int __init proto_init(void)



[PATCH v2 16/31] net: Convert uevent_net_ops

2017-11-20 Thread Kirill Tkhai
uevent_net_init() and uevent_net_exit() create and
destroy netlink socket, and these actions serialized
in netlink code.

Parallel execution with other pernet_operations
makes the socket disappear earlier from uevent_sock_list
on ->exit. As userspace can't be interested in broadcast
messages of dying net, and, as I see, no one in kernel
listen them, we may safely make uevent_net_ops async.

Signed-off-by: Kirill Tkhai 
---
 lib/kobject_uevent.c |1 +
 1 file changed, 1 insertion(+)

diff --git a/lib/kobject_uevent.c b/lib/kobject_uevent.c
index c3e84edc47c9..4a2c39ae1e65 100644
--- a/lib/kobject_uevent.c
+++ b/lib/kobject_uevent.c
@@ -643,6 +643,7 @@ static void uevent_net_exit(struct net *net)
 static struct pernet_operations uevent_net_ops = {
.init   = uevent_net_init,
.exit   = uevent_net_exit,
+   .async  = true,
 };
 
 static int __init kobject_uevent_init(void)



[PATCH v2 15/31] net: Convert audit_net_ops

2017-11-20 Thread Kirill Tkhai
This patch starts to convert pernet_subsys, registered
from postcore initcalls.

audit_net_init() creates netlink socket, while audit_net_exit()
destroys it. The rest of the pernet_list are not interested
in the socket, so we make audit_net_ops async.

Signed-off-by: Kirill Tkhai 
---
 kernel/audit.c |1 +
 1 file changed, 1 insertion(+)

diff --git a/kernel/audit.c b/kernel/audit.c
index 227db99b0f19..5e49b614d0e6 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -1526,6 +1526,7 @@ static struct pernet_operations audit_net_ops 
__net_initdata = {
.exit = audit_net_exit,
.id = &audit_net_id,
.size = sizeof(struct audit_net),
+   .async = true,
 };
 
 /* Initialize audit support at boot time. */



Re: [PATCH iproute2] iproute2: fixes to compile on some systems.

2017-11-20 Thread Stephen Hemminger
On Mon, 20 Nov 2017 12:57:07 +0900
Lorenzo Colitti  wrote:

> 1. Put the declarations of strlcpy and strlcat inside
>an #ifdef NEED_STRLCPY. Their declarations were already in a
>similar #ifdef.
> 2. In bpf_scm.h, include sys/un.h for struct sockaddr_un.
> 3. In utils.h, include time.h for struct timeval.
> 
> Tested: builds on ubuntu 14.04 with "make clean distclean; ./configure && 
> make -j64"
> Tested: 4.14.1 builds on Android with Android-specific #ifndefs for missing 
> library code
> Signed-off-by: Lorenzo Colitti 

Applied. Thanks for not forking


[PATCH v2 14/31] net: Convert rtnetlink_net_ops

2017-11-20 Thread Kirill Tkhai
rtnetlink_net_init() and rtnetlink_net_exit()
create and destroy netlink socket. It looks like,
another pernet_operations are not interested in
foreiner net::rtnl, so rtnetlink_net_ops may be
safely made async.

Signed-off-by: Kirill Tkhai 
---
 net/core/rtnetlink.c |1 +
 1 file changed, 1 insertion(+)

diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index cb06d43c4230..fb3f58cf9351 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -4494,6 +4494,7 @@ static void __net_exit rtnetlink_net_exit(struct net *net)
 static struct pernet_operations rtnetlink_net_ops = {
.init = rtnetlink_net_init,
.exit = rtnetlink_net_exit,
+   .async = true,
 };
 
 void __init rtnetlink_init(void)



[PATCH v2 13/31] net: Convert netlink_net_ops

2017-11-20 Thread Kirill Tkhai
The methods of netlink_net_ops create and destroy "netlink"
file, which are not interesting for foreigh pernet_operations.
So, netlink_net_ops may safely be made async.

Signed-off-by: Kirill Tkhai 
---
 net/netlink/af_netlink.c |1 +
 1 file changed, 1 insertion(+)

diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
index b9e0ee4e22f5..1bb967bce57c 100644
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -2687,6 +2687,7 @@ static void __init netlink_add_usersock_entry(void)
 static struct pernet_operations __net_initdata netlink_net_ops = {
.init = netlink_net_init,
.exit = netlink_net_exit,
+   .async = true,
 };
 
 static inline u32 netlink_hash(const void *data, u32 len, u32 seed)



[PATCH v2 12/31] net: Convert net_defaults_ops

2017-11-20 Thread Kirill Tkhai
net_defaults_ops introduces only net_defaults_init_net method,
and it acts on net::core::sysctl_somaxconn, which
is not interesting for the rest of pernet_subsys and
pernet_device lists. Then, make it async.

Signed-off-by: Kirill Tkhai 
---
 net/core/net_namespace.c |1 +
 1 file changed, 1 insertion(+)

diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
index 757765d62daf..c91b10731498 100644
--- a/net/core/net_namespace.c
+++ b/net/core/net_namespace.c
@@ -332,6 +332,7 @@ static int __net_init net_defaults_init_net(struct net *net)
 
 static struct pernet_operations net_defaults_ops = {
.init = net_defaults_init_net,
+   .async = true,
 };
 
 static __init int net_defaults_init(void)



[PATCH v2 11/31] net: Convert net_inuse_ops

2017-11-20 Thread Kirill Tkhai
net_inuse_ops methods expose statistics in /proc.
No one from the rest of pernet_subsys or pernet_device
lists does not touch net::core::inuse.

So, it's safe to make net_inuse_ops async.

Signed-off-by: Kirill Tkhai 
---
 net/core/sock.c |1 +
 1 file changed, 1 insertion(+)

diff --git a/net/core/sock.c b/net/core/sock.c
index c0b5b2f17412..f04f5ec87d04 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -3075,6 +3075,7 @@ static void __net_exit sock_inuse_exit_net(struct net 
*net)
 static struct pernet_operations net_inuse_ops = {
.init = sock_inuse_init_net,
.exit = sock_inuse_exit_net,
+   .async = true,
 };
 
 static __init int net_inuse_init(void)



[PATCH v2 09/31] net: Convert netfilter_net_ops

2017-11-20 Thread Kirill Tkhai
Methods netfilter_net_init() and netfilter_net_exit()
initialize net::nf::hooks and change net-related proc
directory of net. Another pernet_operations are not
interested in forein net::nf::hooks or proc entries,
so it's safe to be execute them in parallel with
methods of other pernet operations.

Signed-off-by: Kirill Tkhai 
---
 net/netfilter/core.c |1 +
 1 file changed, 1 insertion(+)

diff --git a/net/netfilter/core.c b/net/netfilter/core.c
index 52cd2901a097..bfe2e44244ee 100644
--- a/net/netfilter/core.c
+++ b/net/netfilter/core.c
@@ -600,6 +600,7 @@ static void __net_exit netfilter_net_exit(struct net *net)
 static struct pernet_operations netfilter_net_ops = {
.init = netfilter_net_init,
.exit = netfilter_net_exit,
+   .async = true,
 };
 
 int __init netfilter_init(void)



[PATCH v2 10/31] net: Convert nf_log_net_ops

2017-11-20 Thread Kirill Tkhai
The pernet_operations would have had a problem in parallel
execution with others, if init_net had been able to released.
But it's not, and the rest is safe for that.
There is memory allocation, which nobody else interested in,
and sysctl registration. So, we make it async.

Signed-off-by: Kirill Tkhai 
---
 net/netfilter/nf_log.c |1 +
 1 file changed, 1 insertion(+)

diff --git a/net/netfilter/nf_log.c b/net/netfilter/nf_log.c
index 8bb152a7cca4..6137fb1bce66 100644
--- a/net/netfilter/nf_log.c
+++ b/net/netfilter/nf_log.c
@@ -578,6 +578,7 @@ static void __net_exit nf_log_net_exit(struct net *net)
 static struct pernet_operations nf_log_net_ops = {
.init = nf_log_net_init,
.exit = nf_log_net_exit,
+   .async = true,
 };
 
 int __init netfilter_log_init(void)



[PATCH v2 08/31] net: Convert sysctl_pernet_ops

2017-11-20 Thread Kirill Tkhai
This patch starts to convert pernet_subsys, registered
from core initcalls.

Methods sysctl_net_init() and sysctl_net_exit() initialize
net::sysctls table of a namespace.

pernet_operations::init()/exit() methods from the rest
of the list do not touch net::sysctls of strangers,
so it's safe to execute sysctl_pernet_ops's methods
in parallel with any other pernet_operations.

Signed-off-by: Kirill Tkhai 
---
 net/sysctl_net.c |1 +
 1 file changed, 1 insertion(+)

diff --git a/net/sysctl_net.c b/net/sysctl_net.c
index 9aed6fe1bf1a..f424539829b7 100644
--- a/net/sysctl_net.c
+++ b/net/sysctl_net.c
@@ -89,6 +89,7 @@ static void __net_exit sysctl_net_exit(struct net *net)
 static struct pernet_operations sysctl_pernet_ops = {
.init = sysctl_net_init,
.exit = sysctl_net_exit,
+   .async = true,
 };
 
 static struct ctl_table_header *net_header;



[PATCH v2 05/31] net: Allow pernet_operations to be executed in parallel

2017-11-20 Thread Kirill Tkhai
This adds new pernet_operations::async flag to indicate operations,
which ->init(), ->exit() and ->exit_batch() methods are allowed
to be executed in parallel with the methods of any other pernet_operations.

When there are only asynchronous pernet_operations in the system,
net_mutex won't be taken for a net construction and destruction.

Also, remove BUG_ON(mutex_is_locked()) from net_assign_generic()
without replacing with the equivalent net_sem check, as there is
one more lockdep assert below.

Suggested-by: Eric W. Biederman 
Signed-off-by: Kirill Tkhai 
---
 include/net/net_namespace.h |6 ++
 net/core/net_namespace.c|   29 +++--
 2 files changed, 25 insertions(+), 10 deletions(-)

diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h
index 10f99dafd5ac..db978c4755f7 100644
--- a/include/net/net_namespace.h
+++ b/include/net/net_namespace.h
@@ -303,6 +303,12 @@ struct pernet_operations {
void (*exit_batch)(struct list_head *net_exit_list);
unsigned int *id;
size_t size;
+   /*
+* Indicates above methods are allowe to be executed in parallel
+* with methods of any other pernet_operations, i.e. they are not
+* need synchronization via net_mutex.
+*/
+   bool async;
 };
 
 /*
diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
index c4f7452906bb..550c766f73aa 100644
--- a/net/core/net_namespace.c
+++ b/net/core/net_namespace.c
@@ -41,8 +41,9 @@ struct net init_net = {
 EXPORT_SYMBOL(init_net);
 
 static bool init_net_initialized;
+static unsigned nr_sync_pernet_ops;
 /*
- * net_sem: protects: pernet_list, net_generic_ids,
+ * net_sem: protects: pernet_list, net_generic_ids, nr_sync_pernet_ops,
  * init_net_initialized and first_device pointer.
  */
 DECLARE_RWSEM(net_sem);
@@ -70,11 +71,10 @@ static int net_assign_generic(struct net *net, unsigned int 
id, void *data)
 {
struct net_generic *ng, *old_ng;
 
-   BUG_ON(!mutex_is_locked(&net_mutex));
BUG_ON(id < MIN_PERNET_OPS_ID);
 
old_ng = rcu_dereference_protected(net->gen,
-  lockdep_is_held(&net_mutex));
+  lockdep_is_held(&net_sem));
if (old_ng->s.len > id) {
old_ng->ptr[id] = data;
return 0;
@@ -419,11 +419,14 @@ struct net *copy_net_ns(unsigned long flags,
rv = down_read_killable(&net_sem);
if (rv < 0)
goto put_userns;
-   rv = mutex_lock_killable(&net_mutex);
-   if (rv < 0)
-   goto up_read;
+   if (nr_sync_pernet_ops) {
+   rv = mutex_lock_killable(&net_mutex);
+   if (rv < 0)
+   goto up_read;
+   }
rv = setup_net(net, user_ns);
-   mutex_unlock(&net_mutex);
+   if (nr_sync_pernet_ops)
+   mutex_unlock(&net_mutex);
 up_read:
up_read(&net_sem);
if (rv < 0) {
@@ -453,7 +456,8 @@ static void cleanup_net(struct work_struct *work)
spin_unlock_irq(&cleanup_list_lock);
 
down_read(&net_sem);
-   mutex_lock(&net_mutex);
+   if (nr_sync_pernet_ops)
+   mutex_lock(&net_mutex);
 
/* Don't let anyone else find us. */
rtnl_lock();
@@ -489,7 +493,8 @@ static void cleanup_net(struct work_struct *work)
list_for_each_entry_reverse(ops, &pernet_list, list)
ops_exit_list(ops, &net_exit_list);
 
-   mutex_unlock(&net_mutex);
+   if (nr_sync_pernet_ops)
+   mutex_unlock(&net_mutex);
 
/* Free the net generic variables */
list_for_each_entry_reverse(ops, &pernet_list, list)
@@ -961,6 +966,9 @@ static int register_pernet_operations(struct list_head 
*list,
rcu_barrier();
if (ops->id)
ida_remove(&net_generic_ids, *ops->id);
+   } else if (!ops->async) {
+   pr_info_once("Pernet operations %ps are sync.\n", ops);
+   nr_sync_pernet_ops++;
}
 
return error;
@@ -968,7 +976,8 @@ static int register_pernet_operations(struct list_head 
*list,
 
 static void unregister_pernet_operations(struct pernet_operations *ops)
 {
-   
+   if (!ops->async)
+   BUG_ON(nr_sync_pernet_ops-- == 0);
__unregister_pernet_operations(ops);
rcu_barrier();
if (ops->id)



[PATCH v2 06/31] net: Convert proc_net_ns_ops

2017-11-20 Thread Kirill Tkhai
This patch starts to convert pernet_subsys, registered
from before initcalls.

proc_net_ns_ops::proc_net_ns_init()/proc_net_ns_exit()
register pernet net->proc_net and ->proc_net_stat.

Constructors and destructors of another pernet_operations
are not interested in foreign net's proc_net and proc_net_stat.
Proc filesystem privitives are synchronized on proc_subdir_lock.

So, proc_net_ns_ops methods are able to be executed
in parallel with methods of other pernet operations.

Signed-off-by: Kirill Tkhai 
---
 fs/proc/proc_net.c |1 +
 1 file changed, 1 insertion(+)

diff --git a/fs/proc/proc_net.c b/fs/proc/proc_net.c
index a2bf369c923d..2bf6170204b1 100644
--- a/fs/proc/proc_net.c
+++ b/fs/proc/proc_net.c
@@ -237,6 +237,7 @@ static __net_exit void proc_net_ns_exit(struct net *net)
 static struct pernet_operations __net_initdata proc_net_ns_ops = {
.init = proc_net_ns_init,
.exit = proc_net_ns_exit,
+   .async = true,
 };
 
 int __init proc_net_init(void)



[PATCH v2 04/31] net: Move mutex_unlock() in cleanup_net() up

2017-11-20 Thread Kirill Tkhai
net_sem protects from pernet_list changing, while
ops_free_list() makes simple kfree(), and it can't
race with other pernet_operations callbacks.

So we may release net_mutex earlier then it was.

Signed-off-by: Kirill Tkhai 
---
 net/core/net_namespace.c |3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
index 859dce31e37e..c4f7452906bb 100644
--- a/net/core/net_namespace.c
+++ b/net/core/net_namespace.c
@@ -489,11 +489,12 @@ static void cleanup_net(struct work_struct *work)
list_for_each_entry_reverse(ops, &pernet_list, list)
ops_exit_list(ops, &net_exit_list);
 
+   mutex_unlock(&net_mutex);
+
/* Free the net generic variables */
list_for_each_entry_reverse(ops, &pernet_list, list)
ops_free_list(ops, &net_exit_list);
 
-   mutex_unlock(&net_mutex);
up_read(&net_sem);
 
/* Ensure there are no outstanding rcu callbacks using this



[PATCH v2 03/31] net: Introduce net_sem for protection of pernet_list

2017-11-20 Thread Kirill Tkhai
Curently mutex is used to protect pernet operations list. It makes
cleanup_net() to execute ->exit methods of the same operations set,
which was used on the time of ->init, even after net namespace is
unlinked from net_namespace_list.

But the problem is it's need to synchronize_rcu() after net is removed
from net_namespace_list():

Destroy net_ns:
cleanup_net()
  mutex_lock(&net_mutex)
  list_del_rcu(&net->list)
  synchronize_rcu()  <--- Sleep there for ages
  list_for_each_entry_reverse(ops, &pernet_list, list)
ops_exit_list(ops, &net_exit_list)
  list_for_each_entry_reverse(ops, &pernet_list, list)
ops_free_list(ops, &net_exit_list)
  mutex_unlock(&net_mutex)

This primitive is not fast, especially on the systems with many processors
and/or when preemptible RCU is enabled in config. So, all the time, while
cleanup_net() is waiting for RCU grace period, creation of new net namespaces
is not possible, the tasks, who makes it, are sleeping on the same mutex:

Create net_ns:
copy_net_ns()
  mutex_lock_killable(&net_mutex)<--- Sleep there for ages

I observed 20-30 seconds hangs of "unshare -n" on ordinary 8-cpu laptop
with preemptible RCU enabled.

The solution is to convert net_mutex to the rw_semaphore and add small locks
to really small number of pernet_operations, what really need them. Then,
pernet_operations::init/::exit methods, modifying the net-related data,
will require down_read() locking only, while down_write() will be used
for changing pernet_list.

This gives signify performance increase, after all patch set is applied,
like you may see here:

%for i in {1..1}; do unshare -n bash -c exit; done

*before*
real 1m40,377s
user 0m9,672s
sys 0m19,928s

*after*
real 0m17,007s
user 0m5,311s
sys 0m11,779

(5.8 times faster)

This patch starts replacing net_mutex to net_sem. It adds rw_semaphore,
describes the variables it protects, and makes to use where appropriate.
net_mutex is still present, and next patches will kick it out step-by-step.

Signed-off-by: Kirill Tkhai 
---
 include/linux/rtnetlink.h |1 +
 net/core/net_namespace.c  |   39 ++-
 net/core/rtnetlink.c  |4 ++--
 3 files changed, 29 insertions(+), 15 deletions(-)

diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h
index 2032ce2eb20b..f640fc87fe1d 100644
--- a/include/linux/rtnetlink.h
+++ b/include/linux/rtnetlink.h
@@ -35,6 +35,7 @@ extern int rtnl_is_locked(void);
 
 extern wait_queue_head_t netdev_unregistering_wq;
 extern struct mutex net_mutex;
+extern struct rw_semaphore net_sem;
 
 #ifdef CONFIG_PROVE_LOCKING
 extern bool lockdep_rtnl_is_held(void);
diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
index 2e512965bf42..859dce31e37e 100644
--- a/net/core/net_namespace.c
+++ b/net/core/net_namespace.c
@@ -41,6 +41,11 @@ struct net init_net = {
 EXPORT_SYMBOL(init_net);
 
 static bool init_net_initialized;
+/*
+ * net_sem: protects: pernet_list, net_generic_ids,
+ * init_net_initialized and first_device pointer.
+ */
+DECLARE_RWSEM(net_sem);
 
 #define MIN_PERNET_OPS_ID  \
((sizeof(struct net_generic) + sizeof(void *) - 1) / sizeof(void *))
@@ -279,7 +284,7 @@ struct net *get_net_ns_by_id(struct net *net, int id)
  */
 static __net_init int setup_net(struct net *net, struct user_namespace 
*user_ns)
 {
-   /* Must be called with net_mutex held */
+   /* Must be called with net_sem held */
const struct pernet_operations *ops, *saved_ops;
int error = 0;
LIST_HEAD(net_exit_list);
@@ -411,12 +416,16 @@ struct net *copy_net_ns(unsigned long flags,
net->ucounts = ucounts;
get_user_ns(user_ns);
 
-   rv = mutex_lock_killable(&net_mutex);
+   rv = down_read_killable(&net_sem);
if (rv < 0)
goto put_userns;
-
+   rv = mutex_lock_killable(&net_mutex);
+   if (rv < 0)
+   goto up_read;
rv = setup_net(net, user_ns);
mutex_unlock(&net_mutex);
+up_read:
+   up_read(&net_sem);
if (rv < 0) {
 put_userns:
put_user_ns(user_ns);
@@ -443,6 +452,7 @@ static void cleanup_net(struct work_struct *work)
list_replace_init(&cleanup_list, &net_kill_list);
spin_unlock_irq(&cleanup_list_lock);
 
+   down_read(&net_sem);
mutex_lock(&net_mutex);
 
/* Don't let anyone else find us. */
@@ -484,6 +494,7 @@ static void cleanup_net(struct work_struct *work)
ops_free_list(ops, &net_exit_list);
 
mutex_unlock(&net_mutex);
+   up_read(&net_sem);
 
/* Ensure there are no outstanding rcu callbacks using this
 * network namespace.
@@ -510,8 +521,10 @@ static void cleanup_net(struct work_struct *work)
  */
 void net_ns_barrier(void)
 {
+   down_write(&net_sem);
mutex_lock(&net_mutex);
mutex_unlock(&net_mutex);
+   up_write(&net_sem);
 }
 EXPORT_SYMBOL(net_ns_barrier);
 
@@ -838,12 +851

Re: [PATCH] man: document ip route get mark

2017-11-20 Thread Stephen Hemminger
On Sat, 18 Nov 2017 22:56:49 +0100
Simon Ruderich  wrote:

> Signed-off-by: Simon Ruderich 
> ---
> Hello,
> 
> Just found this in an stackoverflow article from 2015 and it
> really helped. So here as patch.
> 
> Regards
> Simon


Applied man page patches,  thanks


[PATCH v2 07/31] net: Convert net_ns_ops methods

2017-11-20 Thread Kirill Tkhai
This patch starts to convert pernet_subsys, registered
from pure initcalls.

net_ns_ops::net_ns_net_init/net_ns_net_init, methods use only
ida_simple_* functions, which are not need a synchronization.

So, net_ns_ops methods are able to be executed
in parallel with methods of other pernet operations.

Signed-off-by: Kirill Tkhai 
---
 net/core/net_namespace.c |1 +
 1 file changed, 1 insertion(+)

diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
index 550c766f73aa..757765d62daf 100644
--- a/net/core/net_namespace.c
+++ b/net/core/net_namespace.c
@@ -615,6 +615,7 @@ static __net_exit void net_ns_net_exit(struct net *net)
 static struct pernet_operations __net_initdata net_ns_ops = {
.init = net_ns_net_init,
.exit = net_ns_net_exit,
+   .async = true,
 };
 
 static const struct nla_policy rtnl_net_policy[NETNSA_MAX + 1] = {



[PATCH v2 02/31] net: Cleanup copy_net_ns()

2017-11-20 Thread Kirill Tkhai
Line up destructors actions in the revers order
to constructors. Next patches will add more actions,
and this will be comfortable, if there is the such
order.

Signed-off-by: Kirill Tkhai 
---
 net/core/net_namespace.c |   20 +---
 1 file changed, 9 insertions(+), 11 deletions(-)

diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
index 7ecf71050ffa..2e512965bf42 100644
--- a/net/core/net_namespace.c
+++ b/net/core/net_namespace.c
@@ -404,27 +404,25 @@ struct net *copy_net_ns(unsigned long flags,
 
net = net_alloc();
if (!net) {
-   dec_net_namespaces(ucounts);
-   return ERR_PTR(-ENOMEM);
+   rv = -ENOMEM;
+   goto dec_ucounts;
}
-
+   refcount_set(&net->passive, 1);
+   net->ucounts = ucounts;
get_user_ns(user_ns);
 
rv = mutex_lock_killable(&net_mutex);
-   if (rv < 0) {
-   net_free(net);
-   dec_net_namespaces(ucounts);
-   put_user_ns(user_ns);
-   return ERR_PTR(rv);
-   }
+   if (rv < 0)
+   goto put_userns;
 
-   net->ucounts = ucounts;
rv = setup_net(net, user_ns);
mutex_unlock(&net_mutex);
if (rv < 0) {
-   dec_net_namespaces(ucounts);
+put_userns:
put_user_ns(user_ns);
net_drop_ns(net);
+dec_ucounts:
+   dec_net_namespaces(ucounts);
return ERR_PTR(rv);
}
return net;



  1   2   >