[PATCH v6 7/7] arm64: dts: NS2: add AMAC ethernet support

2016-11-03 Thread Jon Mason
Add support for the AMAC ethernet to the Broadcom Northstar2 SoC device
tree

Signed-off-by: Jon Mason 
---
 arch/arm64/boot/dts/broadcom/ns2-svk.dts |  5 +
 arch/arm64/boot/dts/broadcom/ns2.dtsi| 12 
 2 files changed, 17 insertions(+)

diff --git a/arch/arm64/boot/dts/broadcom/ns2-svk.dts 
b/arch/arm64/boot/dts/broadcom/ns2-svk.dts
index b09f3bc..c4d5442 100644
--- a/arch/arm64/boot/dts/broadcom/ns2-svk.dts
+++ b/arch/arm64/boot/dts/broadcom/ns2-svk.dts
@@ -56,6 +56,10 @@
};
 };
 
+ {
+   status = "ok";
+};
+
 _phy0 {
status = "ok";
 };
@@ -174,6 +178,7 @@
 _mux_iproc {
mdio@10 {
gphy0: eth-phy@10 {
+   enet-phy-lane-swap;
reg = <0x10>;
};
};
diff --git a/arch/arm64/boot/dts/broadcom/ns2.dtsi 
b/arch/arm64/boot/dts/broadcom/ns2.dtsi
index d95dc40..773ed59 100644
--- a/arch/arm64/boot/dts/broadcom/ns2.dtsi
+++ b/arch/arm64/boot/dts/broadcom/ns2.dtsi
@@ -191,6 +191,18 @@
 
#include "ns2-clock.dtsi"
 
+   enet: ethernet@6100 {
+   compatible = "brcm,ns2-amac";
+   reg = <0x6100 0x1000>,
+ <0x6109 0x1000>,
+ <0x6103 0x100>;
+   reg-names = "amac_base", "idm_base", "nicpm_base";
+   interrupts = ;
+   phy-handle = <>;
+   phy-mode = "rgmii";
+   status = "disabled";
+   };
+
dma0: dma@6136 {
compatible = "arm,pl330", "arm,primecell";
reg = <0x6136 0x1000>;
-- 
2.7.4



[PATCH v6 3/7] net: phy: broadcom: Add BCM54810 PHY entry

2016-11-03 Thread Jon Mason
The BCM54810 PHY requires some semi-unique configuration, which is
applied in addition to the standard config.
Also, some users of the BCM54810 require the PHY lanes to be swapped.
Since there is no way to detect this, add a device tree query to see if
it is applicable.

Inspired-by: Vikas Soni 
Signed-off-by: Jon Mason 
Reviewed-by: Florian Fainelli 
Reviewed-by: Andrew Lunn 
---
 drivers/net/phy/Kconfig|  2 +-
 drivers/net/phy/broadcom.c | 58 +-
 include/linux/brcmphy.h|  9 +++
 3 files changed, 67 insertions(+), 2 deletions(-)

diff --git a/drivers/net/phy/Kconfig b/drivers/net/phy/Kconfig
index ff31c10..d3fcfd2 100644
--- a/drivers/net/phy/Kconfig
+++ b/drivers/net/phy/Kconfig
@@ -217,7 +217,7 @@ config BROADCOM_PHY
select BCM_NET_PHYLIB
---help---
  Currently supports the BCM5411, BCM5421, BCM5461, BCM54616S, BCM5464,
- BCM5481 and BCM5482 PHYs.
+ BCM5481, BCM54810 and BCM5482 PHYs.
 
 config CICADA_PHY
tristate "Cicada PHYs"
diff --git a/drivers/net/phy/broadcom.c b/drivers/net/phy/broadcom.c
index 3a64b3d..b1e32e9 100644
--- a/drivers/net/phy/broadcom.c
+++ b/drivers/net/phy/broadcom.c
@@ -18,7 +18,7 @@
 #include 
 #include 
 #include 
-
+#include 
 
 #define BRCM_PHY_MODEL(phydev) \
((phydev)->drv->phy_id & (phydev)->drv->phy_id_mask)
@@ -45,6 +45,34 @@ static int bcm54xx_auxctl_write(struct phy_device *phydev, 
u16 regnum, u16 val)
return phy_write(phydev, MII_BCM54XX_AUX_CTL, regnum | val);
 }
 
+static int bcm54810_config(struct phy_device *phydev)
+{
+   int rc, val;
+
+   val = bcm_phy_read_exp(phydev, BCM54810_EXP_BROADREACH_LRE_MISC_CTL);
+   val &= ~BCM54810_EXP_BROADREACH_LRE_MISC_CTL_EN;
+   rc = bcm_phy_write_exp(phydev, BCM54810_EXP_BROADREACH_LRE_MISC_CTL,
+  val);
+   if (rc < 0)
+   return rc;
+
+   val = bcm54xx_auxctl_read(phydev, MII_BCM54XX_AUXCTL_SHDWSEL_MISC);
+   val &= ~MII_BCM54XX_AUXCTL_SHDWSEL_MISC_RGMII_SKEW_EN;
+   val |= MII_BCM54XX_AUXCTL_MISC_WREN;
+   rc = bcm54xx_auxctl_write(phydev, MII_BCM54XX_AUXCTL_SHDWSEL_MISC,
+ val);
+   if (rc < 0)
+   return rc;
+
+   val = bcm_phy_read_shadow(phydev, BCM54810_SHD_CLK_CTL);
+   val &= ~BCM54810_SHD_CLK_CTL_GTXCLK_EN;
+   rc = bcm_phy_write_shadow(phydev, BCM54810_SHD_CLK_CTL, val);
+   if (rc < 0)
+   return rc;
+
+   return 0;
+}
+
 /* Needs SMDSP clock enabled via bcm54xx_phydsp_config() */
 static int bcm50610_a0_workaround(struct phy_device *phydev)
 {
@@ -217,6 +245,12 @@ static int bcm54xx_config_init(struct phy_device *phydev)
(phydev->dev_flags & PHY_BRCM_AUTO_PWRDWN_ENABLE))
bcm54xx_adjust_rxrefclk(phydev);
 
+   if (BRCM_PHY_MODEL(phydev) == PHY_ID_BCM54810) {
+   err = bcm54810_config(phydev);
+   if (err)
+   return err;
+   }
+
bcm54xx_phydsp_config(phydev);
 
return 0;
@@ -314,6 +348,7 @@ static int bcm5482_read_status(struct phy_device *phydev)
 
 static int bcm5481_config_aneg(struct phy_device *phydev)
 {
+   struct device_node *np = phydev->mdio.dev.of_node;
int ret;
 
/* Aneg firsly. */
@@ -344,6 +379,14 @@ static int bcm5481_config_aneg(struct phy_device *phydev)
phy_write(phydev, 0x18, reg);
}
 
+   if (of_property_read_bool(np, "enet-phy-lane-swap")) {
+   /* Lane Swap - Undocumented register...magic! */
+   ret = bcm_phy_write_exp(phydev, MII_BCM54XX_EXP_SEL_ER + 0x9,
+   0x11B);
+   if (ret < 0)
+   return ret;
+   }
+
return ret;
 }
 
@@ -578,6 +621,18 @@ static struct phy_driver broadcom_drivers[] = {
.ack_interrupt  = bcm_phy_ack_intr,
.config_intr= bcm_phy_config_intr,
 }, {
+   .phy_id = PHY_ID_BCM54810,
+   .phy_id_mask= 0xfff0,
+   .name   = "Broadcom BCM54810",
+   .features   = PHY_GBIT_FEATURES |
+ SUPPORTED_Pause | SUPPORTED_Asym_Pause,
+   .flags  = PHY_HAS_MAGICANEG | PHY_HAS_INTERRUPT,
+   .config_init= bcm54xx_config_init,
+   .config_aneg= bcm5481_config_aneg,
+   .read_status= genphy_read_status,
+   .ack_interrupt  = bcm_phy_ack_intr,
+   .config_intr= bcm_phy_config_intr,
+}, {
.phy_id = PHY_ID_BCM5482,
.phy_id_mask= 0xfff0,
.name   = "Broadcom BCM5482",
@@ -661,6 +716,7 @@ static struct mdio_device_id __maybe_unused broadcom_tbl[] 
= {
{ PHY_ID_BCM54616S, 0xfff0 },
{ PHY_ID_BCM5464, 0xfff0 },
{ PHY_ID_BCM5481, 0xfff0 },
+   { 

[PATCH v6 2/7] Documentation: devicetree: add PHY lane swap binding

2016-11-03 Thread Jon Mason
Add the documentation for PHY lane swapping.  This is a boolean entry to
notify the phy device drivers that the TX/RX lanes need to be swapped.

Signed-off-by: Jon Mason 
Reviewed-by: Florian Fainelli 
Reviewed-by: Andrew Lunn 
---
 Documentation/devicetree/bindings/net/phy.txt | 4 
 1 file changed, 4 insertions(+)

diff --git a/Documentation/devicetree/bindings/net/phy.txt 
b/Documentation/devicetree/bindings/net/phy.txt
index bc1c3c8..4627da3 100644
--- a/Documentation/devicetree/bindings/net/phy.txt
+++ b/Documentation/devicetree/bindings/net/phy.txt
@@ -35,6 +35,10 @@ Optional Properties:
 - broken-turn-around: If set, indicates the PHY device does not correctly
   release the turn around line low at the end of a MDIO transaction.
 
+- enet-phy-lane-swap: If set, indicates the PHY will swap the TX/RX lanes to
+  compensate for the board being designed with the lanes swapped.
+
+
 Example:
 
 ethernet-phy@0 {
-- 
2.7.4



[PATCH v6 1/7] net: phy: broadcom: add bcm54xx_auxctl_read

2016-11-03 Thread Jon Mason
Add a helper function to read the AUXCTL register for the BCM54xx.  This
mirrors the bcm54xx_auxctl_write function already present in the code.

Signed-off-by: Jon Mason 
Reviewed-by: Florian Fainelli 
---
 drivers/net/phy/broadcom.c | 10 ++
 include/linux/brcmphy.h|  1 +
 2 files changed, 11 insertions(+)

diff --git a/drivers/net/phy/broadcom.c b/drivers/net/phy/broadcom.c
index 583ef8a..3a64b3d 100644
--- a/drivers/net/phy/broadcom.c
+++ b/drivers/net/phy/broadcom.c
@@ -30,6 +30,16 @@ MODULE_DESCRIPTION("Broadcom PHY driver");
 MODULE_AUTHOR("Maciej W. Rozycki");
 MODULE_LICENSE("GPL");
 
+static int bcm54xx_auxctl_read(struct phy_device *phydev, u16 regnum)
+{
+   /* The register must be written to both the Shadow Register Select and
+* the Shadow Read Register Selector
+*/
+   phy_write(phydev, MII_BCM54XX_AUX_CTL, regnum |
+ regnum << MII_BCM54XX_AUXCTL_SHDWSEL_READ_SHIFT);
+   return phy_read(phydev, MII_BCM54XX_AUX_CTL);
+}
+
 static int bcm54xx_auxctl_write(struct phy_device *phydev, u16 regnum, u16 val)
 {
return phy_write(phydev, MII_BCM54XX_AUX_CTL, regnum | val);
diff --git a/include/linux/brcmphy.h b/include/linux/brcmphy.h
index 60def78..0ed6691 100644
--- a/include/linux/brcmphy.h
+++ b/include/linux/brcmphy.h
@@ -110,6 +110,7 @@
 #define MII_BCM54XX_AUXCTL_MISC_FORCE_AMDIX0x0200
 #define MII_BCM54XX_AUXCTL_MISC_RDSEL_MISC 0x7000
 #define MII_BCM54XX_AUXCTL_SHDWSEL_MISC0x0007
+#define MII_BCM54XX_AUXCTL_SHDWSEL_READ_SHIFT  12
 
 #define MII_BCM54XX_AUXCTL_SHDWSEL_MASK0x0007
 
-- 
2.7.4



[PATCH v6 0/7] add NS2 support to bgmac

2016-11-03 Thread Jon Mason
Changes in v6:
* Use a common bgmac_phy_connect_direct (per Rafal Milecki) 
* Rebased on latest net-next
* Added Reviewed-by to the relevant patches


Changes in v5:
* Change a pr_err to netdev_err (per Scott Branden)
* Reword the lane swap binding documentation (per Andrew Lunn)


Changes in v4:
* Actually send out the lane swap binding doc patch (Per Scott Branden)
* Remove unused #define (Per Andrew Lunn)


Changes in v3:
* Clean-up the bgmac DT binding doc (per Rob Herring)
* Document the lane swap binding and make it generic (Per Andrew Lunn)


Changes in v2:
* Remove the PHY power-on (per Andrew Lunn)
* Misc PHY clean-ups regarding comments and #defines (per Andrew Lunn)
  This results in none of the original PHY code from Vikas being
  present.  So, I'm removing him as an author and giving him
  "Inspired-by" credit.
* Move PHY lane swapping to PHY driver (per Andrew Lunn and Florian
  Fainelli)
* Remove bgmac sleep (per Florian Fainelli)
* Re-add bgmac chip reset (per Florian Fainelli and Ray Jui)
* Rebased on latest net-next
* Added patch for bcm54xx_auxctl_read, which is used in the BCM54810


Jon Mason (7):
  net: phy: broadcom: add bcm54xx_auxctl_read
  Documentation: devicetree: add PHY lane swap binding
  net: phy: broadcom: Add BCM54810 PHY entry
  Documentation: devicetree: net: add NS2 bindings to amac
  net: ethernet: bgmac: device tree phy enablement
  net: ethernet: bgmac: add NS2 support
  arm64: dts: NS2: add AMAC ethernet support

 .../devicetree/bindings/net/brcm,amac.txt  | 16 +++--
 Documentation/devicetree/bindings/net/phy.txt  |  4 ++
 arch/arm64/boot/dts/broadcom/ns2-svk.dts   |  5 ++
 arch/arm64/boot/dts/broadcom/ns2.dtsi  | 12 
 drivers/net/ethernet/broadcom/bgmac-bcma.c | 22 +++
 drivers/net/ethernet/broadcom/bgmac-platform.c | 74 +-
 drivers/net/ethernet/broadcom/bgmac.c  | 32 +++---
 drivers/net/ethernet/broadcom/bgmac.h  |  9 +++
 drivers/net/phy/Kconfig|  2 +-
 drivers/net/phy/broadcom.c | 68 +++-
 include/linux/brcmphy.h| 10 +++
 11 files changed, 222 insertions(+), 32 deletions(-)

-- 
2.7.4



[PATCH v6 6/7] net: ethernet: bgmac: add NS2 support

2016-11-03 Thread Jon Mason
Add support for the variant of amac hardware present in the Broadcom
Northstar2 based SoCs.  Northstar2 requires an additional register to be
configured with the port speed/duplex (NICPM).  This can be added to
the link callback to hide it from the instances that do not use this.
Also, clearing of the pending interrupts on init is required due to
observed issues on some platforms.

Signed-off-by: Jon Mason 
Reviewed-by: Florian Fainelli 
---
 drivers/net/ethernet/broadcom/bgmac-platform.c | 56 +-
 drivers/net/ethernet/broadcom/bgmac.c  |  3 ++
 drivers/net/ethernet/broadcom/bgmac.h  |  1 +
 3 files changed, 58 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/broadcom/bgmac-platform.c 
b/drivers/net/ethernet/broadcom/bgmac-platform.c
index 4642940..6f736c1 100644
--- a/drivers/net/ethernet/broadcom/bgmac-platform.c
+++ b/drivers/net/ethernet/broadcom/bgmac-platform.c
@@ -14,12 +14,21 @@
 #define pr_fmt(fmt)KBUILD_MODNAME ": " fmt
 
 #include 
+#include 
 #include 
 #include 
 #include 
 #include 
 #include "bgmac.h"
 
+#define NICPM_IOMUX_CTRL   0x0008
+
+#define NICPM_IOMUX_CTRL_INIT_VAL  0x3196e000
+#define NICPM_IOMUX_CTRL_SPD_SHIFT 10
+#define NICPM_IOMUX_CTRL_SPD_10M   0
+#define NICPM_IOMUX_CTRL_SPD_100M  1
+#define NICPM_IOMUX_CTRL_SPD_1000M 2
+
 static u32 platform_bgmac_read(struct bgmac *bgmac, u16 offset)
 {
return readl(bgmac->plat.base + offset);
@@ -87,12 +96,46 @@ static void platform_bgmac_cmn_maskset32(struct bgmac 
*bgmac, u16 offset,
WARN_ON(1);
 }
 
+static void bgmac_nicpm_speed_set(struct net_device *net_dev)
+{
+   struct bgmac *bgmac = netdev_priv(net_dev);
+   u32 val;
+
+   if (!bgmac->plat.nicpm_base)
+   return;
+
+   val = NICPM_IOMUX_CTRL_INIT_VAL;
+   switch (bgmac->net_dev->phydev->speed) {
+   default:
+   netdev_err(net_dev, "Unsupported speed. Defaulting to 
1000Mb\n");
+   case SPEED_1000:
+   val |= NICPM_IOMUX_CTRL_SPD_1000M << NICPM_IOMUX_CTRL_SPD_SHIFT;
+   break;
+   case SPEED_100:
+   val |= NICPM_IOMUX_CTRL_SPD_100M << NICPM_IOMUX_CTRL_SPD_SHIFT;
+   break;
+   case SPEED_10:
+   val |= NICPM_IOMUX_CTRL_SPD_10M << NICPM_IOMUX_CTRL_SPD_SHIFT;
+   break;
+   }
+
+   writel(val, bgmac->plat.nicpm_base + NICPM_IOMUX_CTRL);
+
+   bgmac_adjust_link(bgmac->net_dev);
+}
+
 static int platform_phy_connect(struct bgmac *bgmac)
 {
struct phy_device *phy_dev;
 
-   phy_dev = of_phy_get_and_connect(bgmac->net_dev, bgmac->dev->of_node,
-bgmac_adjust_link);
+   if (bgmac->plat.nicpm_base)
+   phy_dev = of_phy_get_and_connect(bgmac->net_dev,
+bgmac->dev->of_node,
+bgmac_nicpm_speed_set);
+   else
+   phy_dev = of_phy_get_and_connect(bgmac->net_dev,
+bgmac->dev->of_node,
+bgmac_adjust_link);
if (!phy_dev) {
dev_err(bgmac->dev, "PHY connection failed\n");
return -ENODEV;
@@ -156,6 +199,14 @@ static int bgmac_probe(struct platform_device *pdev)
if (IS_ERR(bgmac->plat.idm_base))
return PTR_ERR(bgmac->plat.idm_base);
 
+   regs = platform_get_resource_byname(pdev, IORESOURCE_MEM, "nicpm_base");
+   if (regs) {
+   bgmac->plat.nicpm_base = devm_ioremap_resource(>dev,
+  regs);
+   if (IS_ERR(bgmac->plat.nicpm_base))
+   return PTR_ERR(bgmac->plat.nicpm_base);
+   }
+
bgmac->read = platform_bgmac_read;
bgmac->write = platform_bgmac_write;
bgmac->idm_read = platform_bgmac_idm_read;
@@ -187,6 +238,7 @@ static int bgmac_remove(struct platform_device *pdev)
 static const struct of_device_id bgmac_of_enet_match[] = {
{.compatible = "brcm,amac",},
{.compatible = "brcm,nsp-amac",},
+   {.compatible = "brcm,ns2-amac",},
{},
 };
 
diff --git a/drivers/net/ethernet/broadcom/bgmac.c 
b/drivers/net/ethernet/broadcom/bgmac.c
index 7f66ea7..a29787f 100644
--- a/drivers/net/ethernet/broadcom/bgmac.c
+++ b/drivers/net/ethernet/broadcom/bgmac.c
@@ -1082,6 +1082,9 @@ static void bgmac_enable(struct bgmac *bgmac)
 /* http://bcm-v4.sipsolutions.net/mac-gbit/gmac/chipinit */
 static void bgmac_chip_init(struct bgmac *bgmac)
 {
+   /* Clear any erroneously pending interrupts */
+   bgmac_write(bgmac, BGMAC_INT_STATUS, ~0);
+
/* 1 interrupt per received frame */
bgmac_write(bgmac, BGMAC_INT_RECV_LAZY, 1 << BGMAC_IRL_FC_SHIFT);
 
diff --git a/drivers/net/ethernet/broadcom/bgmac.h 

[PATCH v6 4/7] Documentation: devicetree: net: add NS2 bindings to amac

2016-11-03 Thread Jon Mason
Clean-up the documentation to the bgmac-amac driver, per suggestion by
Rob Herring, and add details for NS2 support.

Signed-off-by: Jon Mason 
Reviewed-by: Florian Fainelli 
---
 Documentation/devicetree/bindings/net/brcm,amac.txt | 16 +++-
 1 file changed, 11 insertions(+), 5 deletions(-)

diff --git a/Documentation/devicetree/bindings/net/brcm,amac.txt 
b/Documentation/devicetree/bindings/net/brcm,amac.txt
index ba5ecc1..2fefa1a 100644
--- a/Documentation/devicetree/bindings/net/brcm,amac.txt
+++ b/Documentation/devicetree/bindings/net/brcm,amac.txt
@@ -2,11 +2,17 @@ Broadcom AMAC Ethernet Controller Device Tree Bindings
 -
 
 Required properties:
- - compatible: "brcm,amac" or "brcm,nsp-amac"
- - reg:Address and length of the GMAC registers,
-   Address and length of the GMAC IDM registers
- - reg-names:  Names of the registers.  Must have both "amac_base" and
-   "idm_base"
+ - compatible: "brcm,amac"
+   "brcm,nsp-amac"
+   "brcm,ns2-amac"
+ - reg:Address and length of the register set for the device. 
It
+   contains the information of registers in the same order as
+   described by reg-names
+ - reg-names:  Names of the registers.
+   "amac_base":Address and length of the GMAC registers
+   "idm_base": Address and length of the GMAC IDM registers
+   "nicpm_base":   Address and length of the NIC Port Manager
+   registers (required for Northstar2)
  - interrupts: Interrupt number
 
 Optional properties:
-- 
2.7.4



[PATCH v6 5/7] net: ethernet: bgmac: device tree phy enablement

2016-11-03 Thread Jon Mason
Change the bgmac driver to allow for PHYs defined by the device tree.

Signed-off-by: Jon Mason 
---
 drivers/net/ethernet/broadcom/bgmac-bcma.c | 22 +++
 drivers/net/ethernet/broadcom/bgmac-platform.c | 22 ++-
 drivers/net/ethernet/broadcom/bgmac.c  | 29 +-
 drivers/net/ethernet/broadcom/bgmac.h  |  8 +++
 4 files changed, 56 insertions(+), 25 deletions(-)

diff --git a/drivers/net/ethernet/broadcom/bgmac-bcma.c 
b/drivers/net/ethernet/broadcom/bgmac-bcma.c
index c16ec3a..4a4ffc0 100644
--- a/drivers/net/ethernet/broadcom/bgmac-bcma.c
+++ b/drivers/net/ethernet/broadcom/bgmac-bcma.c
@@ -80,6 +80,24 @@ static void bcma_bgmac_cmn_maskset32(struct bgmac *bgmac, 
u16 offset, u32 mask,
bcma_maskset32(bgmac->bcma.cmn, offset, mask, set);
 }
 
+static int bcma_phy_connect(struct bgmac *bgmac)
+{
+   struct phy_device *phy_dev;
+   char bus_id[MII_BUS_ID_SIZE + 3];
+
+   /* Connect to the PHY */
+   snprintf(bus_id, sizeof(bus_id), PHY_ID_FMT, bgmac->mii_bus->id,
+bgmac->phyaddr);
+   phy_dev = phy_connect(bgmac->net_dev, bus_id, bgmac_adjust_link,
+ PHY_INTERFACE_MODE_MII);
+   if (IS_ERR(phy_dev)) {
+   dev_err(bgmac->dev, "PHY connection failed\n");
+   return PTR_ERR(phy_dev);
+   }
+
+   return 0;
+}
+
 static const struct bcma_device_id bgmac_bcma_tbl[] = {
BCMA_CORE(BCMA_MANUF_BCM, BCMA_CORE_4706_MAC_GBIT,
  BCMA_ANY_REV, BCMA_ANY_CLASS),
@@ -275,6 +293,10 @@ static int bgmac_probe(struct bcma_device *core)
bgmac->cco_ctl_maskset = bcma_bgmac_cco_ctl_maskset;
bgmac->get_bus_clock = bcma_bgmac_get_bus_clock;
bgmac->cmn_maskset32 = bcma_bgmac_cmn_maskset32;
+   if (bgmac->mii_bus)
+   bgmac->phy_connect = bcma_phy_connect;
+   else
+   bgmac->phy_connect = bgmac_phy_connect_direct;
 
err = bgmac_enet_probe(bgmac);
if (err)
diff --git a/drivers/net/ethernet/broadcom/bgmac-platform.c 
b/drivers/net/ethernet/broadcom/bgmac-platform.c
index be52f27..4642940 100644
--- a/drivers/net/ethernet/broadcom/bgmac-platform.c
+++ b/drivers/net/ethernet/broadcom/bgmac-platform.c
@@ -16,6 +16,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include "bgmac.h"
 
@@ -86,6 +87,20 @@ static void platform_bgmac_cmn_maskset32(struct bgmac 
*bgmac, u16 offset,
WARN_ON(1);
 }
 
+static int platform_phy_connect(struct bgmac *bgmac)
+{
+   struct phy_device *phy_dev;
+
+   phy_dev = of_phy_get_and_connect(bgmac->net_dev, bgmac->dev->of_node,
+bgmac_adjust_link);
+   if (!phy_dev) {
+   dev_err(bgmac->dev, "PHY connection failed\n");
+   return -ENODEV;
+   }
+
+   return 0;
+}
+
 static int bgmac_probe(struct platform_device *pdev)
 {
struct device_node *np = pdev->dev.of_node;
@@ -102,7 +117,6 @@ static int bgmac_probe(struct platform_device *pdev)
/* Set the features of the 4707 family */
bgmac->feature_flags |= BGMAC_FEAT_CLKCTLST;
bgmac->feature_flags |= BGMAC_FEAT_NO_RESET;
-   bgmac->feature_flags |= BGMAC_FEAT_FORCE_SPEED_2500;
bgmac->feature_flags |= BGMAC_FEAT_CMDCFG_SR_REV4;
bgmac->feature_flags |= BGMAC_FEAT_TX_MASK_SETUP;
bgmac->feature_flags |= BGMAC_FEAT_RX_MASK_SETUP;
@@ -151,6 +165,12 @@ static int bgmac_probe(struct platform_device *pdev)
bgmac->cco_ctl_maskset = platform_bgmac_cco_ctl_maskset;
bgmac->get_bus_clock = platform_bgmac_get_bus_clock;
bgmac->cmn_maskset32 = platform_bgmac_cmn_maskset32;
+   if (of_parse_phandle(np, "phy-handle", 0)) {
+   bgmac->phy_connect = platform_phy_connect;
+   } else {
+   bgmac->phy_connect = bgmac_phy_connect_direct;
+   bgmac->feature_flags |= BGMAC_FEAT_FORCE_SPEED_2500;
+   }
 
return bgmac_enet_probe(bgmac);
 }
diff --git a/drivers/net/ethernet/broadcom/bgmac.c 
b/drivers/net/ethernet/broadcom/bgmac.c
index 31ca204..7f66ea7 100644
--- a/drivers/net/ethernet/broadcom/bgmac.c
+++ b/drivers/net/ethernet/broadcom/bgmac.c
@@ -1388,7 +1388,7 @@ static const struct ethtool_ops bgmac_ethtool_ops = {
  * MII
  **/
 
-static void bgmac_adjust_link(struct net_device *net_dev)
+void bgmac_adjust_link(struct net_device *net_dev)
 {
struct bgmac *bgmac = netdev_priv(net_dev);
struct phy_device *phy_dev = net_dev->phydev;
@@ -1411,8 +1411,9 @@ static void bgmac_adjust_link(struct net_device *net_dev)
phy_print_status(phy_dev);
}
 }
+EXPORT_SYMBOL_GPL(bgmac_adjust_link);
 
-static int bgmac_phy_connect_direct(struct bgmac *bgmac)
+int bgmac_phy_connect_direct(struct bgmac *bgmac)
 {
struct fixed_phy_status fphy_status = {
.link = 1,
@@ 

Re: [mm PATCH v2 18/26] arch/powerpc: Add option to skip DMA sync as a part of mapping

2016-11-03 Thread Michael Ellerman
Alexander Duyck  writes:

> This change allows us to pass DMA_ATTR_SKIP_CPU_SYNC which allows us to
> avoid invoking cache line invalidation if the driver will just handle it
> via a sync_for_cpu or sync_for_device call.
>
> Cc: Benjamin Herrenschmidt 
> Cc: Paul Mackerras 
> Cc: Michael Ellerman 
> Cc: linuxppc-...@lists.ozlabs.org
> Signed-off-by: Alexander Duyck 
> ---
>  arch/powerpc/kernel/dma.c |9 -
>  1 file changed, 8 insertions(+), 1 deletion(-)

LGTM.

Acked-by: Michael Ellerman  (powerpc)

cheers


[PATCH net 1/1] driver: macvlan: Destroy new macvlan port if macvlan_common_newlink failed.

2016-11-03 Thread fgao
From: Gao Feng 

When there is no existing macvlan port on the lower device, a new macvlan
port is created. But it is not destroyed when something fails later, which
causes a memory leak.

Now add a flag to indicate whether a new macvlan port was created.

Signed-off-by: Gao Feng 
---
 drivers/net/macvlan.c | 31 ++-
 1 file changed, 22 insertions(+), 9 deletions(-)

diff --git a/drivers/net/macvlan.c b/drivers/net/macvlan.c
index 3234fcd..d2d6f12 100644
--- a/drivers/net/macvlan.c
+++ b/drivers/net/macvlan.c
@@ -1278,6 +1278,7 @@ int macvlan_common_newlink(struct net *src_net, struct 
net_device *dev,
struct net_device *lowerdev;
int err;
int macmode;
+   bool create = false;
 
if (!tb[IFLA_LINK])
return -EINVAL;
@@ -1304,12 +1305,18 @@ int macvlan_common_newlink(struct net *src_net, struct 
net_device *dev,
err = macvlan_port_create(lowerdev);
if (err < 0)
return err;
+   create = true;
}
port = macvlan_port_get_rtnl(lowerdev);
 
/* Only 1 macvlan device can be created in passthru mode */
-   if (port->passthru)
-   return -EINVAL;
+   if (port->passthru) {
+   /* The macvlan port must be not created this time,
+* still goto destroy_macvlan_port for readability.
+*/
+   err = -EINVAL;
+   goto destroy_macvlan_port;
+   }
 
vlan->lowerdev = lowerdev;
vlan->dev  = dev;
@@ -1325,24 +1332,28 @@ int macvlan_common_newlink(struct net *src_net, struct 
net_device *dev,
vlan->flags = nla_get_u16(data[IFLA_MACVLAN_FLAGS]);
 
if (vlan->mode == MACVLAN_MODE_PASSTHRU) {
-   if (port->count)
-   return -EINVAL;
+   if (port->count) {
+   err = -EINVAL;
+   goto destroy_macvlan_port;
+   }
port->passthru = true;
eth_hw_addr_inherit(dev, lowerdev);
}
 
if (data && data[IFLA_MACVLAN_MACADDR_MODE]) {
-   if (vlan->mode != MACVLAN_MODE_SOURCE)
-   return -EINVAL;
+   if (vlan->mode != MACVLAN_MODE_SOURCE) {
+   err = -EINVAL;
+   goto destroy_macvlan_port;
+   }
macmode = nla_get_u32(data[IFLA_MACVLAN_MACADDR_MODE]);
err = macvlan_changelink_sources(vlan, macmode, data);
if (err)
-   return err;
+   goto destroy_macvlan_port;
}
 
err = register_netdevice(dev);
if (err < 0)
-   return err;
+   goto destroy_macvlan_port;
 
dev->priv_flags |= IFF_MACVLAN;
err = netdev_upper_dev_link(lowerdev, dev);
@@ -1357,7 +1368,9 @@ int macvlan_common_newlink(struct net *src_net, struct 
net_device *dev,
 
 unregister_netdev:
unregister_netdevice(dev);
-
+destroy_macvlan_port:
+   if (create)
+   macvlan_port_destroy(port->dev);
return err;
 }
 EXPORT_SYMBOL_GPL(macvlan_common_newlink);
-- 
1.9.1




[PATCH net-next v2 06/11] net: dsa: mv88e6xxx: add port 802.1Q mode setter

2016-11-03 Thread Vivien Didelot
Add port functions to set the port 802.1Q mode.

Signed-off-by: Vivien Didelot 
---
 drivers/net/dsa/mv88e6xxx/chip.c | 33 ++---
 drivers/net/dsa/mv88e6xxx/port.c | 32 
 drivers/net/dsa/mv88e6xxx/port.h |  3 +++
 3 files changed, 37 insertions(+), 31 deletions(-)

diff --git a/drivers/net/dsa/mv88e6xxx/chip.c b/drivers/net/dsa/mv88e6xxx/chip.c
index 9c0a028..181d3b9 100644
--- a/drivers/net/dsa/mv88e6xxx/chip.c
+++ b/drivers/net/dsa/mv88e6xxx/chip.c
@@ -1806,48 +1806,19 @@ static int mv88e6xxx_port_check_hw_vlan(struct 
dsa_switch *ds, int port,
return err;
 }
 
-static const char * const mv88e6xxx_port_8021q_mode_names[] = {
-   [PORT_CONTROL_2_8021Q_DISABLED] = "Disabled",
-   [PORT_CONTROL_2_8021Q_FALLBACK] = "Fallback",
-   [PORT_CONTROL_2_8021Q_CHECK] = "Check",
-   [PORT_CONTROL_2_8021Q_SECURE] = "Secure",
-};
-
 static int mv88e6xxx_port_vlan_filtering(struct dsa_switch *ds, int port,
 bool vlan_filtering)
 {
struct mv88e6xxx_chip *chip = ds->priv;
-   u16 old, new = vlan_filtering ? PORT_CONTROL_2_8021Q_SECURE :
+   u16 mode = vlan_filtering ? PORT_CONTROL_2_8021Q_SECURE :
PORT_CONTROL_2_8021Q_DISABLED;
-   u16 reg;
int err;
 
if (!mv88e6xxx_has(chip, MV88E6XXX_FLAG_VTU))
return -EOPNOTSUPP;
 
mutex_lock(>reg_lock);
-
-   err = mv88e6xxx_port_read(chip, port, PORT_CONTROL_2, );
-   if (err)
-   goto unlock;
-
-   old = reg & PORT_CONTROL_2_8021Q_MASK;
-
-   if (new != old) {
-   reg &= ~PORT_CONTROL_2_8021Q_MASK;
-   reg |= new & PORT_CONTROL_2_8021Q_MASK;
-
-   err = mv88e6xxx_port_write(chip, port, PORT_CONTROL_2, reg);
-   if (err)
-   goto unlock;
-
-   netdev_dbg(ds->ports[port].netdev, "802.1Q Mode %s (was %s)\n",
-  mv88e6xxx_port_8021q_mode_names[new],
-  mv88e6xxx_port_8021q_mode_names[old]);
-   }
-
-   err = 0;
-unlock:
+   err = mv88e6xxx_port_set_8021q_mode(chip, port, mode);
mutex_unlock(>reg_lock);
 
return err;
diff --git a/drivers/net/dsa/mv88e6xxx/port.c b/drivers/net/dsa/mv88e6xxx/port.c
index 104fe2d..53d17e6 100644
--- a/drivers/net/dsa/mv88e6xxx/port.c
+++ b/drivers/net/dsa/mv88e6xxx/port.c
@@ -190,3 +190,35 @@ int mv88e6xxx_port_set_pvid(struct mv88e6xxx_chip *chip, 
int port, u16 pvid)
 
return 0;
 }
+
+/* Offset 0x08: Port Control 2 Register */
+
+static const char * const mv88e6xxx_port_8021q_mode_names[] = {
+   [PORT_CONTROL_2_8021Q_DISABLED] = "Disabled",
+   [PORT_CONTROL_2_8021Q_FALLBACK] = "Fallback",
+   [PORT_CONTROL_2_8021Q_CHECK] = "Check",
+   [PORT_CONTROL_2_8021Q_SECURE] = "Secure",
+};
+
+int mv88e6xxx_port_set_8021q_mode(struct mv88e6xxx_chip *chip, int port,
+ u16 mode)
+{
+   u16 reg;
+   int err;
+
+   err = mv88e6xxx_port_read(chip, port, PORT_CONTROL_2, );
+   if (err)
+   return err;
+
+   reg &= ~PORT_CONTROL_2_8021Q_MASK;
+   reg |= mode & PORT_CONTROL_2_8021Q_MASK;
+
+   err = mv88e6xxx_port_write(chip, port, PORT_CONTROL_2, reg);
+   if (err)
+   return err;
+
+   netdev_dbg(chip->ds->ports[port].netdev, "802.1QMode set to %s\n",
+  mv88e6xxx_port_8021q_mode_names[mode]);
+
+   return 0;
+}
diff --git a/drivers/net/dsa/mv88e6xxx/port.h b/drivers/net/dsa/mv88e6xxx/port.h
index 4489d9e..921eecf 100644
--- a/drivers/net/dsa/mv88e6xxx/port.h
+++ b/drivers/net/dsa/mv88e6xxx/port.h
@@ -31,4 +31,7 @@ int mv88e6xxx_port_set_fid(struct mv88e6xxx_chip *chip, int 
port, u16 fid);
 int mv88e6xxx_port_get_pvid(struct mv88e6xxx_chip *chip, int port, u16 *pvid);
 int mv88e6xxx_port_set_pvid(struct mv88e6xxx_chip *chip, int port, u16 pvid);
 
+int mv88e6xxx_port_set_8021q_mode(struct mv88e6xxx_chip *chip, int port,
+ u16 mode);
+
 #endif /* _MV88E6XXX_PORT_H */
-- 
2.10.2



[PATCH net-next v2 07/11] net: dsa: mv88e6xxx: add port link setter

2016-11-03 Thread Vivien Didelot
Most of the chips have port register control bits to force the port's
link up or down, or to let normal link detection occur.

Implement such an operation so it can be used later when setting duplex, etc.

Signed-off-by: Vivien Didelot 
---
 drivers/net/dsa/mv88e6xxx/chip.c  | 17 +++
 drivers/net/dsa/mv88e6xxx/mv88e6xxx.h | 10 +
 drivers/net/dsa/mv88e6xxx/port.c  | 41 +++
 drivers/net/dsa/mv88e6xxx/port.h  |  2 ++
 4 files changed, 70 insertions(+)

diff --git a/drivers/net/dsa/mv88e6xxx/chip.c b/drivers/net/dsa/mv88e6xxx/chip.c
index 181d3b9..cc43e6f 100644
--- a/drivers/net/dsa/mv88e6xxx/chip.c
+++ b/drivers/net/dsa/mv88e6xxx/chip.c
@@ -3160,42 +3160,49 @@ static const struct mv88e6xxx_ops mv88e6085_ops = {
.set_switch_mac = mv88e6xxx_g1_set_switch_mac,
.phy_read = mv88e6xxx_phy_ppu_read,
.phy_write = mv88e6xxx_phy_ppu_write,
+   .port_set_link = mv88e6xxx_port_set_link,
 };
 
 static const struct mv88e6xxx_ops mv88e6095_ops = {
.set_switch_mac = mv88e6xxx_g1_set_switch_mac,
.phy_read = mv88e6xxx_phy_ppu_read,
.phy_write = mv88e6xxx_phy_ppu_write,
+   .port_set_link = mv88e6xxx_port_set_link,
 };
 
 static const struct mv88e6xxx_ops mv88e6123_ops = {
.set_switch_mac = mv88e6xxx_g2_set_switch_mac,
.phy_read = mv88e6xxx_read,
.phy_write = mv88e6xxx_write,
+   .port_set_link = mv88e6xxx_port_set_link,
 };
 
 static const struct mv88e6xxx_ops mv88e6131_ops = {
.set_switch_mac = mv88e6xxx_g1_set_switch_mac,
.phy_read = mv88e6xxx_phy_ppu_read,
.phy_write = mv88e6xxx_phy_ppu_write,
+   .port_set_link = mv88e6xxx_port_set_link,
 };
 
 static const struct mv88e6xxx_ops mv88e6161_ops = {
.set_switch_mac = mv88e6xxx_g2_set_switch_mac,
.phy_read = mv88e6xxx_read,
.phy_write = mv88e6xxx_write,
+   .port_set_link = mv88e6xxx_port_set_link,
 };
 
 static const struct mv88e6xxx_ops mv88e6165_ops = {
.set_switch_mac = mv88e6xxx_g2_set_switch_mac,
.phy_read = mv88e6xxx_read,
.phy_write = mv88e6xxx_write,
+   .port_set_link = mv88e6xxx_port_set_link,
 };
 
 static const struct mv88e6xxx_ops mv88e6171_ops = {
.set_switch_mac = mv88e6xxx_g2_set_switch_mac,
.phy_read = mv88e6xxx_g2_smi_phy_read,
.phy_write = mv88e6xxx_g2_smi_phy_write,
+   .port_set_link = mv88e6xxx_port_set_link,
 };
 
 static const struct mv88e6xxx_ops mv88e6172_ops = {
@@ -3204,12 +3211,14 @@ static const struct mv88e6xxx_ops mv88e6172_ops = {
.set_switch_mac = mv88e6xxx_g2_set_switch_mac,
.phy_read = mv88e6xxx_g2_smi_phy_read,
.phy_write = mv88e6xxx_g2_smi_phy_write,
+   .port_set_link = mv88e6xxx_port_set_link,
 };
 
 static const struct mv88e6xxx_ops mv88e6175_ops = {
.set_switch_mac = mv88e6xxx_g2_set_switch_mac,
.phy_read = mv88e6xxx_g2_smi_phy_read,
.phy_write = mv88e6xxx_g2_smi_phy_write,
+   .port_set_link = mv88e6xxx_port_set_link,
 };
 
 static const struct mv88e6xxx_ops mv88e6176_ops = {
@@ -3218,12 +3227,14 @@ static const struct mv88e6xxx_ops mv88e6176_ops = {
.set_switch_mac = mv88e6xxx_g2_set_switch_mac,
.phy_read = mv88e6xxx_g2_smi_phy_read,
.phy_write = mv88e6xxx_g2_smi_phy_write,
+   .port_set_link = mv88e6xxx_port_set_link,
 };
 
 static const struct mv88e6xxx_ops mv88e6185_ops = {
.set_switch_mac = mv88e6xxx_g1_set_switch_mac,
.phy_read = mv88e6xxx_phy_ppu_read,
.phy_write = mv88e6xxx_phy_ppu_write,
+   .port_set_link = mv88e6xxx_port_set_link,
 };
 
 static const struct mv88e6xxx_ops mv88e6240_ops = {
@@ -3232,6 +3243,7 @@ static const struct mv88e6xxx_ops mv88e6240_ops = {
.set_switch_mac = mv88e6xxx_g2_set_switch_mac,
.phy_read = mv88e6xxx_g2_smi_phy_read,
.phy_write = mv88e6xxx_g2_smi_phy_write,
+   .port_set_link = mv88e6xxx_port_set_link,
 };
 
 static const struct mv88e6xxx_ops mv88e6320_ops = {
@@ -3240,6 +3252,7 @@ static const struct mv88e6xxx_ops mv88e6320_ops = {
.set_switch_mac = mv88e6xxx_g2_set_switch_mac,
.phy_read = mv88e6xxx_g2_smi_phy_read,
.phy_write = mv88e6xxx_g2_smi_phy_write,
+   .port_set_link = mv88e6xxx_port_set_link,
 };
 
 static const struct mv88e6xxx_ops mv88e6321_ops = {
@@ -3248,18 +3261,21 @@ static const struct mv88e6xxx_ops mv88e6321_ops = {
.set_switch_mac = mv88e6xxx_g2_set_switch_mac,
.phy_read = mv88e6xxx_g2_smi_phy_read,
.phy_write = mv88e6xxx_g2_smi_phy_write,
+   .port_set_link = mv88e6xxx_port_set_link,
 };
 
 static const struct mv88e6xxx_ops mv88e6350_ops = {
.set_switch_mac = mv88e6xxx_g2_set_switch_mac,
.phy_read = mv88e6xxx_g2_smi_phy_read,
.phy_write = mv88e6xxx_g2_smi_phy_write,
+   .port_set_link = mv88e6xxx_port_set_link,
 };
 
 static const struct mv88e6xxx_ops mv88e6351_ops = {
 

[PATCH net-next v2 05/11] net: dsa: mv88e6xxx: add port PVID accessors

2016-11-03 Thread Vivien Didelot
Add port functions to access the port's default VID.

Signed-off-by: Vivien Didelot 
---
 drivers/net/dsa/mv88e6xxx/chip.c | 51 
 drivers/net/dsa/mv88e6xxx/port.c | 38 ++
 drivers/net/dsa/mv88e6xxx/port.h |  3 +++
 3 files changed, 45 insertions(+), 47 deletions(-)

diff --git a/drivers/net/dsa/mv88e6xxx/chip.c b/drivers/net/dsa/mv88e6xxx/chip.c
index 20f59f1..9c0a028 100644
--- a/drivers/net/dsa/mv88e6xxx/chip.c
+++ b/drivers/net/dsa/mv88e6xxx/chip.c
@@ -1288,49 +1288,6 @@ static void mv88e6xxx_port_fast_age(struct dsa_switch 
*ds, int port)
netdev_err(ds->ports[port].netdev, "failed to flush ATU\n");
 }
 
-static int _mv88e6xxx_port_pvid(struct mv88e6xxx_chip *chip, int port,
-   u16 *new, u16 *old)
-{
-   struct dsa_switch *ds = chip->ds;
-   u16 pvid, reg;
-   int err;
-
-   err = mv88e6xxx_port_read(chip, port, PORT_DEFAULT_VLAN, );
-   if (err)
-   return err;
-
-   pvid = reg & PORT_DEFAULT_VLAN_MASK;
-
-   if (new) {
-   reg &= ~PORT_DEFAULT_VLAN_MASK;
-   reg |= *new & PORT_DEFAULT_VLAN_MASK;
-
-   err = mv88e6xxx_port_write(chip, port, PORT_DEFAULT_VLAN, reg);
-   if (err)
-   return err;
-
-   netdev_dbg(ds->ports[port].netdev,
-  "DefaultVID %d (was %d)\n", *new, pvid);
-   }
-
-   if (old)
-   *old = pvid;
-
-   return 0;
-}
-
-static int _mv88e6xxx_port_pvid_get(struct mv88e6xxx_chip *chip,
-   int port, u16 *pvid)
-{
-   return _mv88e6xxx_port_pvid(chip, port, NULL, pvid);
-}
-
-static int _mv88e6xxx_port_pvid_set(struct mv88e6xxx_chip *chip,
-   int port, u16 pvid)
-{
-   return _mv88e6xxx_port_pvid(chip, port, , NULL);
-}
-
 static int _mv88e6xxx_vtu_wait(struct mv88e6xxx_chip *chip)
 {
return mv88e6xxx_g1_wait(chip, GLOBAL_VTU_OP, GLOBAL_VTU_OP_BUSY);
@@ -1510,7 +1467,7 @@ static int mv88e6xxx_port_vlan_dump(struct dsa_switch 
*ds, int port,
 
mutex_lock(>reg_lock);
 
-   err = _mv88e6xxx_port_pvid_get(chip, port, );
+   err = mv88e6xxx_port_get_pvid(chip, port, );
if (err)
goto unlock;
 
@@ -1958,7 +1915,7 @@ static void mv88e6xxx_port_vlan_add(struct dsa_switch 
*ds, int port,
   "failed to add VLAN %d%c\n",
   vid, untagged ? 'u' : 't');
 
-   if (pvid && _mv88e6xxx_port_pvid_set(chip, port, vlan->vid_end))
+   if (pvid && mv88e6xxx_port_set_pvid(chip, port, vlan->vid_end))
netdev_err(ds->ports[port].netdev, "failed to set PVID %d\n",
   vlan->vid_end);
 
@@ -2013,7 +1970,7 @@ static int mv88e6xxx_port_vlan_del(struct dsa_switch *ds, 
int port,
 
mutex_lock(>reg_lock);
 
-   err = _mv88e6xxx_port_pvid_get(chip, port, );
+   err = mv88e6xxx_port_get_pvid(chip, port, );
if (err)
goto unlock;
 
@@ -2023,7 +1980,7 @@ static int mv88e6xxx_port_vlan_del(struct dsa_switch *ds, 
int port,
goto unlock;
 
if (vid == pvid) {
-   err = _mv88e6xxx_port_pvid_set(chip, port, 0);
+   err = mv88e6xxx_port_set_pvid(chip, port, 0);
if (err)
goto unlock;
}
diff --git a/drivers/net/dsa/mv88e6xxx/port.c b/drivers/net/dsa/mv88e6xxx/port.c
index a7da812..104fe2d 100644
--- a/drivers/net/dsa/mv88e6xxx/port.c
+++ b/drivers/net/dsa/mv88e6xxx/port.c
@@ -152,3 +152,41 @@ int mv88e6xxx_port_set_fid(struct mv88e6xxx_chip *chip, 
int port, u16 fid)
 
return 0;
 }
+
+/* Offset 0x07: Default Port VLAN ID & Priority */
+
+int mv88e6xxx_port_get_pvid(struct mv88e6xxx_chip *chip, int port, u16 *pvid)
+{
+   u16 reg;
+   int err;
+
+   err = mv88e6xxx_port_read(chip, port, PORT_DEFAULT_VLAN, );
+   if (err)
+   return err;
+
+   *pvid = reg & PORT_DEFAULT_VLAN_MASK;
+
+   return 0;
+}
+
+int mv88e6xxx_port_set_pvid(struct mv88e6xxx_chip *chip, int port, u16 pvid)
+{
+   u16 reg;
+   int err;
+
+   err = mv88e6xxx_port_read(chip, port, PORT_DEFAULT_VLAN, );
+   if (err)
+   return err;
+
+   reg &= ~PORT_DEFAULT_VLAN_MASK;
+   reg |= pvid & PORT_DEFAULT_VLAN_MASK;
+
+   err = mv88e6xxx_port_write(chip, port, PORT_DEFAULT_VLAN, reg);
+   if (err)
+   return err;
+
+   netdev_dbg(chip->ds->ports[port].netdev, "DefaultVID set to %u\n",
+  pvid);
+
+   return 0;
+}
diff --git a/drivers/net/dsa/mv88e6xxx/port.h b/drivers/net/dsa/mv88e6xxx/port.h
index 0df29b9..4489d9e 100644
--- a/drivers/net/dsa/mv88e6xxx/port.h
+++ b/drivers/net/dsa/mv88e6xxx/port.h
@@ -28,4 +28,7 @@ int 

[PATCH net-next v2 03/11] net: dsa: mv88e6xxx: add port vlan map setter

2016-11-03 Thread Vivien Didelot
Add a port function to access the Port Based VLAN Map register.

Signed-off-by: Vivien Didelot 
---
 drivers/net/dsa/mv88e6xxx/chip.c | 14 ++
 drivers/net/dsa/mv88e6xxx/port.c | 25 +
 drivers/net/dsa/mv88e6xxx/port.h |  2 ++
 3 files changed, 29 insertions(+), 12 deletions(-)

diff --git a/drivers/net/dsa/mv88e6xxx/chip.c b/drivers/net/dsa/mv88e6xxx/chip.c
index 12c1175..087 100644
--- a/drivers/net/dsa/mv88e6xxx/chip.c
+++ b/drivers/net/dsa/mv88e6xxx/chip.c
@@ -1218,16 +1218,13 @@ static int _mv88e6xxx_atu_remove(struct mv88e6xxx_chip 
*chip, u16 fid,
 static int _mv88e6xxx_port_based_vlan_map(struct mv88e6xxx_chip *chip, int 
port)
 {
struct net_device *bridge = chip->ports[port].bridge_dev;
-   const u16 mask = (1 << mv88e6xxx_num_ports(chip)) - 1;
struct dsa_switch *ds = chip->ds;
u16 output_ports = 0;
-   u16 reg;
-   int err;
int i;
 
/* allow CPU port or DSA link(s) to send frames to every port */
if (dsa_is_cpu_port(ds, port) || dsa_is_dsa_port(ds, port)) {
-   output_ports = mask;
+   output_ports = ~0;
} else {
for (i = 0; i < mv88e6xxx_num_ports(chip); ++i) {
/* allow sending frames to every group member */
@@ -1243,14 +1240,7 @@ static int _mv88e6xxx_port_based_vlan_map(struct 
mv88e6xxx_chip *chip, int port)
/* prevent frames from going back out of the port they came in on */
output_ports &= ~BIT(port);
 
-   err = mv88e6xxx_port_read(chip, port, PORT_BASE_VLAN, );
-   if (err)
-   return err;
-
-   reg &= ~mask;
-   reg |= output_ports & mask;
-
-   return mv88e6xxx_port_write(chip, port, PORT_BASE_VLAN, reg);
+   return mv88e6xxx_port_set_vlan_map(chip, port, output_ports);
 }
 
 static void mv88e6xxx_port_stp_state_set(struct dsa_switch *ds, int port,
diff --git a/drivers/net/dsa/mv88e6xxx/port.c b/drivers/net/dsa/mv88e6xxx/port.c
index 8d59fe7..c6a22ae 100644
--- a/drivers/net/dsa/mv88e6xxx/port.c
+++ b/drivers/net/dsa/mv88e6xxx/port.c
@@ -60,3 +60,28 @@ int mv88e6xxx_port_set_state(struct mv88e6xxx_chip *chip, 
int port, u8 state)
 
return 0;
 }
+
+/* Offset 0x06: Port Based VLAN Map */
+
+int mv88e6xxx_port_set_vlan_map(struct mv88e6xxx_chip *chip, int port, u16 map)
+{
+   const u16 mask = GENMASK(mv88e6xxx_num_ports(chip) - 1, 0);
+   u16 reg;
+   int err;
+
+   err = mv88e6xxx_port_read(chip, port, PORT_BASE_VLAN, );
+   if (err)
+   return err;
+
+   reg &= ~mask;
+   reg |= map & mask;
+
+   err = mv88e6xxx_port_write(chip, port, PORT_BASE_VLAN, reg);
+   if (err)
+   return err;
+
+   netdev_dbg(chip->ds->ports[port].netdev, "VLANTable set to %.3x\n",
+  map);
+
+   return 0;
+}
diff --git a/drivers/net/dsa/mv88e6xxx/port.h b/drivers/net/dsa/mv88e6xxx/port.h
index ac13988..037d638 100644
--- a/drivers/net/dsa/mv88e6xxx/port.h
+++ b/drivers/net/dsa/mv88e6xxx/port.h
@@ -23,4 +23,6 @@ int mv88e6xxx_port_write(struct mv88e6xxx_chip *chip, int 
port, int reg,
 
 int mv88e6xxx_port_set_state(struct mv88e6xxx_chip *chip, int port, u8 state);
 
+int mv88e6xxx_port_set_vlan_map(struct mv88e6xxx_chip *chip, int port, u16 
map);
+
 #endif /* _MV88E6XXX_PORT_H */
-- 
2.10.2



[PATCH net-next v2 09/11] net: dsa: mv88e6xxx: add port's RGMII delay setter

2016-11-03 Thread Vivien Didelot
Some chips such as 88E6352 and 88E6390 can be programmed to add delays
to RXCLK for IND inputs or to GTXCLK for OUTD outputs when the port is
in RGMII mode.

Add a port function to program such delays according to the provided PHY
interface mode.

Signed-off-by: Vivien Didelot 
---
 drivers/net/dsa/mv88e6xxx/chip.c  |  4 +++
 drivers/net/dsa/mv88e6xxx/mv88e6xxx.h |  6 
 drivers/net/dsa/mv88e6xxx/port.c  | 58 +++
 drivers/net/dsa/mv88e6xxx/port.h  |  5 +++
 4 files changed, 73 insertions(+)

diff --git a/drivers/net/dsa/mv88e6xxx/chip.c b/drivers/net/dsa/mv88e6xxx/chip.c
index 49a6935..bb93d0a 100644
--- a/drivers/net/dsa/mv88e6xxx/chip.c
+++ b/drivers/net/dsa/mv88e6xxx/chip.c
@@ -3220,6 +3220,7 @@ static const struct mv88e6xxx_ops mv88e6172_ops = {
.phy_write = mv88e6xxx_g2_smi_phy_write,
.port_set_link = mv88e6xxx_port_set_link,
.port_set_duplex = mv88e6xxx_port_set_duplex,
+   .port_set_rgmii_delay = mv88e6352_port_set_rgmii_delay,
 };
 
 static const struct mv88e6xxx_ops mv88e6175_ops = {
@@ -3238,6 +3239,7 @@ static const struct mv88e6xxx_ops mv88e6176_ops = {
.phy_write = mv88e6xxx_g2_smi_phy_write,
.port_set_link = mv88e6xxx_port_set_link,
.port_set_duplex = mv88e6xxx_port_set_duplex,
+   .port_set_rgmii_delay = mv88e6352_port_set_rgmii_delay,
 };
 
 static const struct mv88e6xxx_ops mv88e6185_ops = {
@@ -3256,6 +3258,7 @@ static const struct mv88e6xxx_ops mv88e6240_ops = {
.phy_write = mv88e6xxx_g2_smi_phy_write,
.port_set_link = mv88e6xxx_port_set_link,
.port_set_duplex = mv88e6xxx_port_set_duplex,
+   .port_set_rgmii_delay = mv88e6352_port_set_rgmii_delay,
 };
 
 static const struct mv88e6xxx_ops mv88e6320_ops = {
@@ -3302,6 +3305,7 @@ static const struct mv88e6xxx_ops mv88e6352_ops = {
.phy_write = mv88e6xxx_g2_smi_phy_write,
.port_set_link = mv88e6xxx_port_set_link,
.port_set_duplex = mv88e6xxx_port_set_duplex,
+   .port_set_rgmii_delay = mv88e6352_port_set_rgmii_delay,
 };
 
 static const struct mv88e6xxx_info mv88e6xxx_table[] = {
diff --git a/drivers/net/dsa/mv88e6xxx/mv88e6xxx.h 
b/drivers/net/dsa/mv88e6xxx/mv88e6xxx.h
index ab48eb9..c7527c0 100644
--- a/drivers/net/dsa/mv88e6xxx/mv88e6xxx.h
+++ b/drivers/net/dsa/mv88e6xxx/mv88e6xxx.h
@@ -728,6 +728,12 @@ struct mv88e6xxx_ops {
int (*phy_write)(struct mv88e6xxx_chip *chip, int addr, int reg,
 u16 val);
 
+   /* RGMII Receive/Transmit Timing Control
+* Add delay on PHY_INTERFACE_MODE_RGMII_*ID, no delay otherwise.
+*/
+   int (*port_set_rgmii_delay)(struct mv88e6xxx_chip *chip, int port,
+   phy_interface_t mode);
+
 #define LINK_FORCED_DOWN   0
 #define LINK_FORCED_UP 1
 #define LINK_UNFORCED  -2
diff --git a/drivers/net/dsa/mv88e6xxx/port.c b/drivers/net/dsa/mv88e6xxx/port.c
index 17b5444..838068d 100644
--- a/drivers/net/dsa/mv88e6xxx/port.c
+++ b/drivers/net/dsa/mv88e6xxx/port.c
@@ -35,6 +35,64 @@ int mv88e6xxx_port_write(struct mv88e6xxx_chip *chip, int 
port, int reg,
  * Link, Duplex and Flow Control have one force bit, one value bit.
  */
 
+static int mv88e6xxx_port_set_rgmii_delay(struct mv88e6xxx_chip *chip, int 
port,
+ phy_interface_t mode)
+{
+   u16 reg;
+   int err;
+
+   err = mv88e6xxx_port_read(chip, port, PORT_PCS_CTRL, );
+   if (err)
+   return err;
+
+   reg &= ~(PORT_PCS_CTRL_RGMII_DELAY_RXCLK |
+PORT_PCS_CTRL_RGMII_DELAY_TXCLK);
+
+   switch (mode) {
+   case PHY_INTERFACE_MODE_RGMII_RXID:
+   reg |= PORT_PCS_CTRL_RGMII_DELAY_RXCLK;
+   break;
+   case PHY_INTERFACE_MODE_RGMII_TXID:
+   reg |= PORT_PCS_CTRL_RGMII_DELAY_TXCLK;
+   break;
+   case PHY_INTERFACE_MODE_RGMII_ID:
+   reg |= PORT_PCS_CTRL_RGMII_DELAY_RXCLK |
+   PORT_PCS_CTRL_RGMII_DELAY_TXCLK;
+   break;
+   default:
+   /* no delay */
+   break;
+   }
+
+   err = mv88e6xxx_port_write(chip, port, PORT_PCS_CTRL, reg);
+   if (err)
+   return err;
+
+   netdev_dbg(chip->ds->ports[port].netdev, "delay RXCLK %s, TXCLK %s\n",
+  reg & PORT_PCS_CTRL_RGMII_DELAY_RXCLK ? "yes" : "no",
+  reg & PORT_PCS_CTRL_RGMII_DELAY_TXCLK ? "yes" : "no");
+
+   return 0;
+}
+
+int mv88e6352_port_set_rgmii_delay(struct mv88e6xxx_chip *chip, int port,
+  phy_interface_t mode)
+{
+   if (port < 5)
+   return -EOPNOTSUPP;
+
+   return mv88e6xxx_port_set_rgmii_delay(chip, port, mode);
+}
+
+int mv88e6390_port_set_rgmii_delay(struct mv88e6xxx_chip *chip, int port,
+  phy_interface_t mode)
+{
+   if (port != 0)
+ 

[PATCH net-next v2 00/11] net: dsa: mv88e6xxx: refine port operations

2016-11-03 Thread Vivien Didelot
The Marvell chips have one internal SMI device per port, containing a
set of registers used to configure a port's link, STP state, default
VLAN or addresses database, etc.

This patchset creates port files to implement the port operations as
described in datasheets, and extends the chip ops structure with them.

Patches 1 to 6 implement accessors for port's STP state, port based VLAN
map, default FID, default VID, and 802.1Q mode.

Patches 7 to 11 implement the port's MAC setup of link state, duplex
mode, RGMII delay and speed, all accessed through port's register 0x01.

The new port's MAC setup code is used to re-implement the adjust_link
code and correctly force the link down before changing any of the MAC
settings, as requested by the datasheets.

The port's MAC accessors use values compatible with struct phy_device
(e.g. DUPLEX_FULL) and extend them when needed (e.g. SPEED_MAX).

Changes in v2:

  - Strictly use new _UNFORCED values instead of re-using _UNKNOWN ones.

Vivien Didelot (11):
  net: dsa: mv88e6xxx: add port files
  net: dsa: mv88e6xxx: add port state setter
  net: dsa: mv88e6xxx: add port vlan map setter
  net: dsa: mv88e6xxx: add port FID accessors
  net: dsa: mv88e6xxx: add port PVID accessors
  net: dsa: mv88e6xxx: add port 802.1Q mode setter
  net: dsa: mv88e6xxx: add port link setter
  net: dsa: mv88e6xxx: add port duplex setter
  net: dsa: mv88e6xxx: add port's RGMII delay setter
  net: dsa: mv88e6xxx: add port's MAC speed setter
  net: dsa: mv88e6xxx: setup port's MAC

 drivers/net/dsa/mv88e6xxx/Makefile|   1 +
 drivers/net/dsa/mv88e6xxx/chip.c  | 436 +
 drivers/net/dsa/mv88e6xxx/mv88e6xxx.h |  49 +++-
 drivers/net/dsa/mv88e6xxx/port.c  | 497 ++
 drivers/net/dsa/mv88e6xxx/port.h  |  52 
 5 files changed, 727 insertions(+), 308 deletions(-)
 create mode 100644 drivers/net/dsa/mv88e6xxx/port.c
 create mode 100644 drivers/net/dsa/mv88e6xxx/port.h

-- 
2.10.2



[PATCH net-next v2 04/11] net: dsa: mv88e6xxx: add port FID accessors

2016-11-03 Thread Vivien Didelot
Add functions to port files to access the port's default FID.

Signed-off-by: Vivien Didelot 
---
 drivers/net/dsa/mv88e6xxx/chip.c | 77 +++-
 drivers/net/dsa/mv88e6xxx/port.c | 67 ++
 drivers/net/dsa/mv88e6xxx/port.h |  3 ++
 3 files changed, 74 insertions(+), 73 deletions(-)

diff --git a/drivers/net/dsa/mv88e6xxx/chip.c b/drivers/net/dsa/mv88e6xxx/chip.c
index 087..20f59f1 100644
--- a/drivers/net/dsa/mv88e6xxx/chip.c
+++ b/drivers/net/dsa/mv88e6xxx/chip.c
@@ -1674,75 +1674,6 @@ static int _mv88e6xxx_stu_loadpurge(struct 
mv88e6xxx_chip *chip,
return _mv88e6xxx_vtu_cmd(chip, GLOBAL_VTU_OP_STU_LOAD_PURGE);
 }
 
-static int _mv88e6xxx_port_fid(struct mv88e6xxx_chip *chip, int port,
-  u16 *new, u16 *old)
-{
-   struct dsa_switch *ds = chip->ds;
-   u16 upper_mask;
-   u16 fid;
-   u16 reg;
-   int err;
-
-   if (mv88e6xxx_num_databases(chip) == 4096)
-   upper_mask = 0xff;
-   else if (mv88e6xxx_num_databases(chip) == 256)
-   upper_mask = 0xf;
-   else
-   return -EOPNOTSUPP;
-
-   /* Port's default FID bits 3:0 are located in reg 0x06, offset 12 */
-   err = mv88e6xxx_port_read(chip, port, PORT_BASE_VLAN, );
-   if (err)
-   return err;
-
-   fid = (reg & PORT_BASE_VLAN_FID_3_0_MASK) >> 12;
-
-   if (new) {
-   reg &= ~PORT_BASE_VLAN_FID_3_0_MASK;
-   reg |= (*new << 12) & PORT_BASE_VLAN_FID_3_0_MASK;
-
-   err = mv88e6xxx_port_write(chip, port, PORT_BASE_VLAN, reg);
-   if (err)
-   return err;
-   }
-
-   /* Port's default FID bits 11:4 are located in reg 0x05, offset 0 */
-   err = mv88e6xxx_port_read(chip, port, PORT_CONTROL_1, );
-   if (err)
-   return err;
-
-   fid |= (reg & upper_mask) << 4;
-
-   if (new) {
-   reg &= ~upper_mask;
-   reg |= (*new >> 4) & upper_mask;
-
-   err = mv88e6xxx_port_write(chip, port, PORT_CONTROL_1, reg);
-   if (err)
-   return err;
-
-   netdev_dbg(ds->ports[port].netdev,
-  "FID %d (was %d)\n", *new, fid);
-   }
-
-   if (old)
-   *old = fid;
-
-   return 0;
-}
-
-static int _mv88e6xxx_port_fid_get(struct mv88e6xxx_chip *chip,
-  int port, u16 *fid)
-{
-   return _mv88e6xxx_port_fid(chip, port, NULL, fid);
-}
-
-static int _mv88e6xxx_port_fid_set(struct mv88e6xxx_chip *chip,
-  int port, u16 fid)
-{
-   return _mv88e6xxx_port_fid(chip, port, , NULL);
-}
-
 static int _mv88e6xxx_fid_new(struct mv88e6xxx_chip *chip, u16 *fid)
 {
DECLARE_BITMAP(fid_bitmap, MV88E6XXX_N_FID);
@@ -1753,7 +1684,7 @@ static int _mv88e6xxx_fid_new(struct mv88e6xxx_chip 
*chip, u16 *fid)
 
/* Set every FID bit used by the (un)bridged ports */
for (i = 0; i < mv88e6xxx_num_ports(chip); ++i) {
-   err = _mv88e6xxx_port_fid_get(chip, i, fid);
+   err = mv88e6xxx_port_get_fid(chip, i, fid);
if (err)
return err;
 
@@ -2203,7 +2134,7 @@ static int mv88e6xxx_port_db_load_purge(struct 
mv88e6xxx_chip *chip, int port,
 
/* Null VLAN ID corresponds to the port private database */
if (vid == 0)
-   err = _mv88e6xxx_port_fid_get(chip, port, );
+   err = mv88e6xxx_port_get_fid(chip, port, );
else
err = _mv88e6xxx_vtu_get(chip, vid, , false);
if (err)
@@ -2379,7 +2310,7 @@ static int mv88e6xxx_port_db_dump(struct mv88e6xxx_chip 
*chip, int port,
int err;
 
/* Dump port's default Filtering Information Database (VLAN ID 0) */
-   err = _mv88e6xxx_port_fid_get(chip, port, );
+   err = mv88e6xxx_port_get_fid(chip, port, );
if (err)
return err;
 
@@ -2782,7 +2713,7 @@ static int mv88e6xxx_setup_port(struct mv88e6xxx_chip 
*chip, int port)
 * database, and allow bidirectional communication between the
 * CPU and DSA port(s), and the other ports.
 */
-   err = _mv88e6xxx_port_fid_set(chip, port, 0);
+   err = mv88e6xxx_port_set_fid(chip, port, 0);
if (err)
return err;
 
diff --git a/drivers/net/dsa/mv88e6xxx/port.c b/drivers/net/dsa/mv88e6xxx/port.c
index c6a22ae..a7da812 100644
--- a/drivers/net/dsa/mv88e6xxx/port.c
+++ b/drivers/net/dsa/mv88e6xxx/port.c
@@ -61,6 +61,8 @@ int mv88e6xxx_port_set_state(struct mv88e6xxx_chip *chip, int 
port, u8 state)
return 0;
 }
 
+/* Offset 0x05: Port Control 1 */
+
 /* Offset 0x06: Port Based VLAN Map */
 
 int mv88e6xxx_port_set_vlan_map(struct mv88e6xxx_chip *chip, int port, u16 map)
@@ -85,3 +87,68 @@ int mv88e6xxx_port_set_vlan_map(struct mv88e6xxx_chip *chip, 
int 

[PATCH net-next v2 02/11] net: dsa: mv88e6xxx: add port state setter

2016-11-03 Thread Vivien Didelot
Add the port STP state setter to the port files.

Signed-off-by: Vivien Didelot 
---
 drivers/net/dsa/mv88e6xxx/chip.c | 49 
 drivers/net/dsa/mv88e6xxx/port.c | 31 +
 drivers/net/dsa/mv88e6xxx/port.h |  2 ++
 3 files changed, 37 insertions(+), 45 deletions(-)

diff --git a/drivers/net/dsa/mv88e6xxx/chip.c b/drivers/net/dsa/mv88e6xxx/chip.c
index b32b242..12c1175 100644
--- a/drivers/net/dsa/mv88e6xxx/chip.c
+++ b/drivers/net/dsa/mv88e6xxx/chip.c
@@ -1215,41 +1215,6 @@ static int _mv88e6xxx_atu_remove(struct mv88e6xxx_chip 
*chip, u16 fid,
return _mv88e6xxx_atu_move(chip, fid, port, 0x0f, static_too);
 }
 
-static const char * const mv88e6xxx_port_state_names[] = {
-   [PORT_CONTROL_STATE_DISABLED] = "Disabled",
-   [PORT_CONTROL_STATE_BLOCKING] = "Blocking/Listening",
-   [PORT_CONTROL_STATE_LEARNING] = "Learning",
-   [PORT_CONTROL_STATE_FORWARDING] = "Forwarding",
-};
-
-static int _mv88e6xxx_port_state(struct mv88e6xxx_chip *chip, int port,
-u8 state)
-{
-   struct dsa_switch *ds = chip->ds;
-   u16 reg;
-   int err;
-   u8 oldstate;
-
-   err = mv88e6xxx_port_read(chip, port, PORT_CONTROL, );
-   if (err)
-   return err;
-
-   oldstate = reg & PORT_CONTROL_STATE_MASK;
-
-   reg &= ~PORT_CONTROL_STATE_MASK;
-   reg |= state;
-
-   err = mv88e6xxx_port_write(chip, port, PORT_CONTROL, reg);
-   if (err)
-   return err;
-
-   netdev_dbg(ds->ports[port].netdev, "PortState %s (was %s)\n",
-  mv88e6xxx_port_state_names[state],
-  mv88e6xxx_port_state_names[oldstate]);
-
-   return 0;
-}
-
 static int _mv88e6xxx_port_based_vlan_map(struct mv88e6xxx_chip *chip, int 
port)
 {
struct net_device *bridge = chip->ports[port].bridge_dev;
@@ -1313,13 +1278,11 @@ static void mv88e6xxx_port_stp_state_set(struct 
dsa_switch *ds, int port,
}
 
mutex_lock(>reg_lock);
-   err = _mv88e6xxx_port_state(chip, port, stp_state);
+   err = mv88e6xxx_port_set_state(chip, port, stp_state);
mutex_unlock(>reg_lock);
 
if (err)
-   netdev_err(ds->ports[port].netdev,
-  "failed to update state to %s\n",
-  mv88e6xxx_port_state_names[stp_state]);
+   netdev_err(ds->ports[port].netdev, "failed to update state\n");
 }
 
 static void mv88e6xxx_port_fast_age(struct dsa_switch *ds, int port)
@@ -2526,12 +2489,8 @@ static int mv88e6xxx_switch_reset(struct mv88e6xxx_chip 
*chip)
 
/* Set all ports to the disabled state. */
for (i = 0; i < mv88e6xxx_num_ports(chip); i++) {
-   err = mv88e6xxx_port_read(chip, i, PORT_CONTROL, );
-   if (err)
-   return err;
-
-   err = mv88e6xxx_port_write(chip, i, PORT_CONTROL,
-  reg & 0xfffc);
+   err = mv88e6xxx_port_set_state(chip, i,
+  PORT_CONTROL_STATE_DISABLED);
if (err)
return err;
}
diff --git a/drivers/net/dsa/mv88e6xxx/port.c b/drivers/net/dsa/mv88e6xxx/port.c
index 3b36c92..8d59fe7 100644
--- a/drivers/net/dsa/mv88e6xxx/port.c
+++ b/drivers/net/dsa/mv88e6xxx/port.c
@@ -29,3 +29,34 @@ int mv88e6xxx_port_write(struct mv88e6xxx_chip *chip, int 
port, int reg,
 
return mv88e6xxx_write(chip, addr, reg, val);
 }
+
+/* Offset 0x04: Port Control Register */
+
+static const char * const mv88e6xxx_port_state_names[] = {
+   [PORT_CONTROL_STATE_DISABLED] = "Disabled",
+   [PORT_CONTROL_STATE_BLOCKING] = "Blocking/Listening",
+   [PORT_CONTROL_STATE_LEARNING] = "Learning",
+   [PORT_CONTROL_STATE_FORWARDING] = "Forwarding",
+};
+
+int mv88e6xxx_port_set_state(struct mv88e6xxx_chip *chip, int port, u8 state)
+{
+   u16 reg;
+   int err;
+
+   err = mv88e6xxx_port_read(chip, port, PORT_CONTROL, );
+   if (err)
+   return err;
+
+   reg &= ~PORT_CONTROL_STATE_MASK;
+   reg |= state;
+
+   err = mv88e6xxx_port_write(chip, port, PORT_CONTROL, reg);
+   if (err)
+   return err;
+
+   netdev_dbg(chip->ds->ports[port].netdev, "PortState set to %s\n",
+  mv88e6xxx_port_state_names[state]);
+
+   return 0;
+}
diff --git a/drivers/net/dsa/mv88e6xxx/port.h b/drivers/net/dsa/mv88e6xxx/port.h
index ae1ae2b..ac13988 100644
--- a/drivers/net/dsa/mv88e6xxx/port.h
+++ b/drivers/net/dsa/mv88e6xxx/port.h
@@ -21,4 +21,6 @@ int mv88e6xxx_port_read(struct mv88e6xxx_chip *chip, int 
port, int reg,
 int mv88e6xxx_port_write(struct mv88e6xxx_chip *chip, int port, int reg,
 u16 val);
 
+int mv88e6xxx_port_set_state(struct mv88e6xxx_chip *chip, int port, u8 state);
+
 #endif /* _MV88E6XXX_PORT_H */
-- 
2.10.2



[PATCH net-next v2 01/11] net: dsa: mv88e6xxx: add port files

2016-11-03 Thread Vivien Didelot
The Marvell switches contain one internal SMI device per port, called
"Port Registers". Depending on the model, the addresses of these devices
start from 0x0, 0x8 or 0x10.

Start moving Port Registers specific code to their own files.

Signed-off-by: Vivien Didelot 
---
 drivers/net/dsa/mv88e6xxx/Makefile |  1 +
 drivers/net/dsa/mv88e6xxx/chip.c   | 17 +
 drivers/net/dsa/mv88e6xxx/port.c   | 31 +++
 drivers/net/dsa/mv88e6xxx/port.h   | 24 
 4 files changed, 57 insertions(+), 16 deletions(-)
 create mode 100644 drivers/net/dsa/mv88e6xxx/port.c
 create mode 100644 drivers/net/dsa/mv88e6xxx/port.h

diff --git a/drivers/net/dsa/mv88e6xxx/Makefile 
b/drivers/net/dsa/mv88e6xxx/Makefile
index 10ce820..c36be31 100644
--- a/drivers/net/dsa/mv88e6xxx/Makefile
+++ b/drivers/net/dsa/mv88e6xxx/Makefile
@@ -2,3 +2,4 @@ obj-$(CONFIG_NET_DSA_MV88E6XXX) += mv88e6xxx.o
 mv88e6xxx-objs := chip.o
 mv88e6xxx-objs += global1.o
 mv88e6xxx-$(CONFIG_NET_DSA_MV88E6XXX_GLOBAL2) += global2.o
+mv88e6xxx-objs += port.o
diff --git a/drivers/net/dsa/mv88e6xxx/chip.c b/drivers/net/dsa/mv88e6xxx/chip.c
index 72b9dac2..b32b242 100644
--- a/drivers/net/dsa/mv88e6xxx/chip.c
+++ b/drivers/net/dsa/mv88e6xxx/chip.c
@@ -37,6 +37,7 @@
 #include "mv88e6xxx.h"
 #include "global1.h"
 #include "global2.h"
+#include "port.h"
 
 static void assert_reg_lock(struct mv88e6xxx_chip *chip)
 {
@@ -221,22 +222,6 @@ int mv88e6xxx_write(struct mv88e6xxx_chip *chip, int addr, 
int reg, u16 val)
return 0;
 }
 
-static int mv88e6xxx_port_read(struct mv88e6xxx_chip *chip, int port, int reg,
-  u16 *val)
-{
-   int addr = chip->info->port_base_addr + port;
-
-   return mv88e6xxx_read(chip, addr, reg, val);
-}
-
-static int mv88e6xxx_port_write(struct mv88e6xxx_chip *chip, int port, int reg,
-   u16 val)
-{
-   int addr = chip->info->port_base_addr + port;
-
-   return mv88e6xxx_write(chip, addr, reg, val);
-}
-
 static int mv88e6xxx_phy_read(struct mv88e6xxx_chip *chip, int phy,
  int reg, u16 *val)
 {
diff --git a/drivers/net/dsa/mv88e6xxx/port.c b/drivers/net/dsa/mv88e6xxx/port.c
new file mode 100644
index 000..3b36c92
--- /dev/null
+++ b/drivers/net/dsa/mv88e6xxx/port.c
@@ -0,0 +1,31 @@
+/*
+ * Marvell 88E6xxx Switch Port Registers support
+ *
+ * Copyright (c) 2008 Marvell Semiconductor
+ *
+ * Copyright (c) 2016 Vivien Didelot 
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include "mv88e6xxx.h"
+#include "port.h"
+
+int mv88e6xxx_port_read(struct mv88e6xxx_chip *chip, int port, int reg,
+   u16 *val)
+{
+   int addr = chip->info->port_base_addr + port;
+
+   return mv88e6xxx_read(chip, addr, reg, val);
+}
+
+int mv88e6xxx_port_write(struct mv88e6xxx_chip *chip, int port, int reg,
+u16 val)
+{
+   int addr = chip->info->port_base_addr + port;
+
+   return mv88e6xxx_write(chip, addr, reg, val);
+}
diff --git a/drivers/net/dsa/mv88e6xxx/port.h b/drivers/net/dsa/mv88e6xxx/port.h
new file mode 100644
index 000..ae1ae2b
--- /dev/null
+++ b/drivers/net/dsa/mv88e6xxx/port.h
@@ -0,0 +1,24 @@
+/*
+ * Marvell 88E6xxx Switch Port Registers support
+ *
+ * Copyright (c) 2008 Marvell Semiconductor
+ *
+ * Copyright (c) 2016 Vivien Didelot 
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#ifndef _MV88E6XXX_PORT_H
+#define _MV88E6XXX_PORT_H
+
+#include "mv88e6xxx.h"
+
+int mv88e6xxx_port_read(struct mv88e6xxx_chip *chip, int port, int reg,
+   u16 *val);
+int mv88e6xxx_port_write(struct mv88e6xxx_chip *chip, int port, int reg,
+u16 val);
+
+#endif /* _MV88E6XXX_PORT_H */
-- 
2.10.2



[PATCH net-next v2 11/11] net: dsa: mv88e6xxx: setup port's MAC

2016-11-03 Thread Vivien Didelot
Now that we have setters to configure the port's MAC, use them to
refactor the port setup and adjust_link code.

Note that port's MAC speed, duplex or RGMII delay must not be changed
unless the port's link is forced down. So wrap all that in a
mv88e6xxx_port_setup_mac function.

Signed-off-by: Vivien Didelot 
---
 drivers/net/dsa/mv88e6xxx/chip.c | 140 +--
 1 file changed, 60 insertions(+), 80 deletions(-)

diff --git a/drivers/net/dsa/mv88e6xxx/chip.c b/drivers/net/dsa/mv88e6xxx/chip.c
index 23c05e3..c8f824d 100644
--- a/drivers/net/dsa/mv88e6xxx/chip.c
+++ b/drivers/net/dsa/mv88e6xxx/chip.c
@@ -701,6 +701,47 @@ static bool mv88e6xxx_6352_family(struct mv88e6xxx_chip 
*chip)
return chip->info->family == MV88E6XXX_FAMILY_6352;
 }
 
+static int mv88e6xxx_port_setup_mac(struct mv88e6xxx_chip *chip, int port,
+   int link, int speed, int duplex,
+   phy_interface_t mode)
+{
+   int err;
+
+   if (!chip->info->ops->port_set_link)
+   return 0;
+
+   /* Port's MAC control must not be changed unless the link is down */
+   err = chip->info->ops->port_set_link(chip, port, 0);
+   if (err)
+   return err;
+
+   if (chip->info->ops->port_set_speed) {
+   err = chip->info->ops->port_set_speed(chip, port, speed);
+   if (err && err != -EOPNOTSUPP)
+   goto restore_link;
+   }
+
+   if (chip->info->ops->port_set_duplex) {
+   err = chip->info->ops->port_set_duplex(chip, port, duplex);
+   if (err && err != -EOPNOTSUPP)
+   goto restore_link;
+   }
+
+   if (chip->info->ops->port_set_rgmii_delay) {
+   err = chip->info->ops->port_set_rgmii_delay(chip, port, mode);
+   if (err && err != -EOPNOTSUPP)
+   goto restore_link;
+   }
+
+   err = 0;
+restore_link:
+   if (chip->info->ops->port_set_link(chip, port, link))
+   netdev_err(chip->ds->ports[port].netdev,
+  "failed to restore MAC's link\n");
+
+   return err;
+}
+
 /* We expect the switch to perform auto negotiation if there is a real
  * phy. However, in the case of a fixed link phy, we force the port
  * settings from the fixed link settings.
@@ -709,64 +750,18 @@ static void mv88e6xxx_adjust_link(struct dsa_switch *ds, 
int port,
  struct phy_device *phydev)
 {
struct mv88e6xxx_chip *chip = ds->priv;
-   u16 reg;
int err;
 
if (!phy_is_pseudo_fixed_link(phydev))
return;
 
mutex_lock(>reg_lock);
-
-   err = mv88e6xxx_port_read(chip, port, PORT_PCS_CTRL, );
-   if (err)
-   goto out;
-
-   reg &= ~(PORT_PCS_CTRL_LINK_UP |
-PORT_PCS_CTRL_FORCE_LINK |
-PORT_PCS_CTRL_DUPLEX_FULL |
-PORT_PCS_CTRL_FORCE_DUPLEX |
-PORT_PCS_CTRL_SPEED_UNFORCED);
-
-   reg |= PORT_PCS_CTRL_FORCE_LINK;
-   if (phydev->link)
-   reg |= PORT_PCS_CTRL_LINK_UP;
-
-   if (mv88e6xxx_6065_family(chip) && phydev->speed > SPEED_100)
-   goto out;
-
-   switch (phydev->speed) {
-   case SPEED_1000:
-   reg |= PORT_PCS_CTRL_SPEED_1000;
-   break;
-   case SPEED_100:
-   reg |= PORT_PCS_CTRL_SPEED_100;
-   break;
-   case SPEED_10:
-   reg |= PORT_PCS_CTRL_SPEED_10;
-   break;
-   default:
-   pr_info("Unknown speed");
-   goto out;
-   }
-
-   reg |= PORT_PCS_CTRL_FORCE_DUPLEX;
-   if (phydev->duplex == DUPLEX_FULL)
-   reg |= PORT_PCS_CTRL_DUPLEX_FULL;
-
-   if ((mv88e6xxx_6352_family(chip) || mv88e6xxx_6351_family(chip)) &&
-   (port >= mv88e6xxx_num_ports(chip) - 2)) {
-   if (phydev->interface == PHY_INTERFACE_MODE_RGMII_RXID)
-   reg |= PORT_PCS_CTRL_RGMII_DELAY_RXCLK;
-   if (phydev->interface == PHY_INTERFACE_MODE_RGMII_TXID)
-   reg |= PORT_PCS_CTRL_RGMII_DELAY_TXCLK;
-   if (phydev->interface == PHY_INTERFACE_MODE_RGMII_ID)
-   reg |= (PORT_PCS_CTRL_RGMII_DELAY_RXCLK |
-   PORT_PCS_CTRL_RGMII_DELAY_TXCLK);
-   }
-   mv88e6xxx_port_write(chip, port, PORT_PCS_CTRL, reg);
-
-out:
+   err = mv88e6xxx_port_setup_mac(chip, port, phydev->link, phydev->speed,
+  phydev->duplex, phydev->interface);
mutex_unlock(>reg_lock);
+
+   if (err && err != -EOPNOTSUPP)
+   netdev_err(ds->ports[port].netdev, "failed to configure MAC\n");
 }
 
 static int _mv88e6xxx_stats_wait(struct mv88e6xxx_chip *chip)
@@ -2409,35 +2404,20 @@ static int mv88e6xxx_setup_port(struct mv88e6xxx_chip 

[PATCH net-next v2 10/11] net: dsa: mv88e6xxx: add port's MAC speed setter

2016-11-03 Thread Vivien Didelot
While the two bits for link, duplex or RGMII delays are used the same
way on chips supporting the said feature, the two bits for speed have
different meanings for most of the chips out there.

Speed value is stored in bits 1:0, 0x3 means unforce (normal detection).

Some chips reuse values for alternative speeds when bit 12 is set.

Newer chips with speed > 1Gbps reuse value 0x3 thus need a new bit 13.

Here are the values to write in register 0x1 to (un)force speed:

| Speed   | 88E6065 | 88E6185 | 88E6352 | 88E6390 | 88E6390X |
| ------- | ------- | ------- | ------- | ------- | -------- |
| 10      | 0x0000  | 0x0000  | 0x0000  | 0x2000  | 0x2000   |
| 100     | 0x0001  | 0x0001  | 0x0001  | 0x2001  | 0x2001   |
| 200     | 0x0002  | NA      | 0x1001  | 0x3001  | 0x3001   |
| 1000    | NA      | 0x0002  | 0x0002  | 0x2002  | 0x2002   |
| 2500    | NA      | NA      | NA      | 0x3003  | 0x3003   |
| 10000   | NA      | NA      | NA      | NA      | 0x2003   |
| unforce | 0x0003  | 0x0003  | 0x0003  | 0x0000  | 0x0000   |

This patch implements a generic mv88e6xxx_port_set_speed() function used
by chip-specific wrappers to filter supported ports and speeds.

Signed-off-by: Vivien Didelot 
---
 drivers/net/dsa/mv88e6xxx/chip.c  |  33 ++--
 drivers/net/dsa/mv88e6xxx/mv88e6xxx.h |  24 +-
 drivers/net/dsa/mv88e6xxx/port.c  | 138 ++
 drivers/net/dsa/mv88e6xxx/port.h  |   6 ++
 4 files changed, 189 insertions(+), 12 deletions(-)

diff --git a/drivers/net/dsa/mv88e6xxx/chip.c b/drivers/net/dsa/mv88e6xxx/chip.c
index bb93d0a..23c05e3 100644
--- a/drivers/net/dsa/mv88e6xxx/chip.c
+++ b/drivers/net/dsa/mv88e6xxx/chip.c
@@ -725,7 +725,7 @@ static void mv88e6xxx_adjust_link(struct dsa_switch *ds, 
int port,
 PORT_PCS_CTRL_FORCE_LINK |
 PORT_PCS_CTRL_DUPLEX_FULL |
 PORT_PCS_CTRL_FORCE_DUPLEX |
-PORT_PCS_CTRL_UNFORCED);
+PORT_PCS_CTRL_SPEED_UNFORCED);
 
reg |= PORT_PCS_CTRL_FORCE_LINK;
if (phydev->link)
@@ -736,13 +736,13 @@ static void mv88e6xxx_adjust_link(struct dsa_switch *ds, 
int port,
 
switch (phydev->speed) {
case SPEED_1000:
-   reg |= PORT_PCS_CTRL_1000;
+   reg |= PORT_PCS_CTRL_SPEED_1000;
break;
case SPEED_100:
-   reg |= PORT_PCS_CTRL_100;
+   reg |= PORT_PCS_CTRL_SPEED_100;
break;
case SPEED_10:
-   reg |= PORT_PCS_CTRL_10;
+   reg |= PORT_PCS_CTRL_SPEED_10;
break;
default:
pr_info("Unknown speed");
@@ -2421,17 +2421,17 @@ static int mv88e6xxx_setup_port(struct mv88e6xxx_chip 
*chip, int port)
 */
err = mv88e6xxx_port_read(chip, port, PORT_PCS_CTRL, );
if (dsa_is_cpu_port(ds, port) || dsa_is_dsa_port(ds, port)) {
-   reg &= ~PORT_PCS_CTRL_UNFORCED;
+   reg &= ~PORT_PCS_CTRL_SPEED_UNFORCED;
reg |= PORT_PCS_CTRL_FORCE_LINK |
PORT_PCS_CTRL_LINK_UP |
PORT_PCS_CTRL_DUPLEX_FULL |
PORT_PCS_CTRL_FORCE_DUPLEX;
if (mv88e6xxx_6065_family(chip))
-   reg |= PORT_PCS_CTRL_100;
+   reg |= PORT_PCS_CTRL_SPEED_100;
else
-   reg |= PORT_PCS_CTRL_1000;
+   reg |= PORT_PCS_CTRL_SPEED_1000;
} else {
-   reg |= PORT_PCS_CTRL_UNFORCED;
+   reg |= PORT_PCS_CTRL_SPEED_UNFORCED;
}
 
err = mv88e6xxx_port_write(chip, port, PORT_PCS_CTRL, reg);
@@ -3162,6 +3162,7 @@ static const struct mv88e6xxx_ops mv88e6085_ops = {
.phy_write = mv88e6xxx_phy_ppu_write,
.port_set_link = mv88e6xxx_port_set_link,
.port_set_duplex = mv88e6xxx_port_set_duplex,
+   .port_set_speed = mv88e6185_port_set_speed,
 };
 
 static const struct mv88e6xxx_ops mv88e6095_ops = {
@@ -3170,6 +3171,7 @@ static const struct mv88e6xxx_ops mv88e6095_ops = {
.phy_write = mv88e6xxx_phy_ppu_write,
.port_set_link = mv88e6xxx_port_set_link,
.port_set_duplex = mv88e6xxx_port_set_duplex,
+   .port_set_speed = mv88e6185_port_set_speed,
 };
 
 static const struct mv88e6xxx_ops mv88e6123_ops = {
@@ -3178,6 +3180,7 @@ static const struct mv88e6xxx_ops mv88e6123_ops = {
.phy_write = mv88e6xxx_write,
.port_set_link = mv88e6xxx_port_set_link,
.port_set_duplex = mv88e6xxx_port_set_duplex,
+   .port_set_speed = mv88e6185_port_set_speed,
 };
 
 static const struct mv88e6xxx_ops mv88e6131_ops = {
@@ -3186,6 +3189,7 @@ static const struct mv88e6xxx_ops mv88e6131_ops = {
 

Re: [PATCH net-next v1 18/21] net: phy: expose phy_aneg_done API for use by drivers

2016-11-03 Thread kbuild test robot
Hi Tom,

[auto build test ERROR on net-next/master]

url:
https://github.com/0day-ci/linux/commits/Tom-Lendacky/amd-xgbe-AMD-XGBE-driver-updates-2016-11-01/20161103-222344
config: sparc-allyesconfig (attached as .config)
compiler: sparc64-linux-gnu-gcc (Debian 6.1.1-9) 6.1.1 20160705
reproduce:
wget 
https://git.kernel.org/cgit/linux/kernel/git/wfg/lkp-tests.git/plain/sbin/make.cross
 -O ~/bin/make.cross
chmod +x ~/bin/make.cross
# save the attached .config to linux build tree
make.cross ARCH=sparc 

All errors (new ones prefixed by >>):

>> drivers/net/ethernet/aeroflex/greth.c:1293:19: error: static declaration of 
>> 'phy_aneg_done' follows non-static declaration
static inline int phy_aneg_done(struct phy_device *phydev)
  ^
   In file included from include/net/dsa.h:19:0,
from include/linux/netdevice.h:44,
from drivers/net/ethernet/aeroflex/greth.c:29:
   include/linux/phy.h:789:5: note: previous declaration of 'phy_aneg_done' was 
here
int phy_aneg_done(struct phy_device *phydev);
^

vim +/phy_aneg_done +1293 drivers/net/ethernet/aeroflex/greth.c

d4c41139 drivers/net/greth.c Kristoffer Glembo 2010-02-15  1287 
greth->speed = 0;
d4c41139 drivers/net/greth.c Kristoffer Glembo 2010-02-15  1288 
greth->duplex = -1;
d4c41139 drivers/net/greth.c Kristoffer Glembo 2010-02-15  1289  
d4c41139 drivers/net/greth.c Kristoffer Glembo 2010-02-15  1290 return 
0;
d4c41139 drivers/net/greth.c Kristoffer Glembo 2010-02-15  1291  }
d4c41139 drivers/net/greth.c Kristoffer Glembo 2010-02-15  1292  
d4c41139 drivers/net/greth.c Kristoffer Glembo 2010-02-15 @1293  static inline 
int phy_aneg_done(struct phy_device *phydev)
d4c41139 drivers/net/greth.c Kristoffer Glembo 2010-02-15  1294  {
d4c41139 drivers/net/greth.c Kristoffer Glembo 2010-02-15  1295 int 
retval;
d4c41139 drivers/net/greth.c Kristoffer Glembo 2010-02-15  1296  

:: The code at line 1293 was first introduced by commit
:: d4c41139df6e74c6fff0cbac43e51cab782133be net: Add Aeroflex Gaisler 
10/100/1G Ethernet MAC driver

:: TO: Kristoffer Glembo <kristof...@gaisler.com>
:: CC: David S. Miller <da...@davemloft.net>

---
0-DAY kernel test infrastructureOpen Source Technology Center
https://lists.01.org/pipermail/kbuild-all   Intel Corporation


.config.gz
Description: application/gzip


Re: bpf: kernel BUG in htab_elem_free

2016-11-03 Thread Dmitry Vyukov
On Thu, Nov 3, 2016 at 10:36 AM, Daniel Borkmann  wrote:
> On 11/03/2016 03:15 PM, Dmitry Vyukov wrote:
>>
>> On Wed, Nov 2, 2016 at 11:14 PM, Dmitry Vyukov  wrote:
>>>
>>> Here we go.
>>>
>>> The following program triggers kernel BUG in htab_elem_free.
>>> On commit 0c183d92b20b5c84ca655b45ef57b3318b83eb9e (Oct 31).
>>> Run as "while true; do ./a.out; done".
>
>
> This one fixes it for me. Could you check it from your side as well?
> I'll submit an official fix then.

I've seen you mailed the fix already.
If you were able to reproduce it and test the fix, then there is
nothing else I can do.

> Thanks a lot for the catch!
> Daniel
>
> diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
> index 570eeca..ad1bc67 100644
> --- a/kernel/bpf/hashtab.c
> +++ b/kernel/bpf/hashtab.c
> @@ -687,7 +687,8 @@ static void delete_all_elements(struct bpf_htab *htab)
>
> hlist_for_each_entry_safe(l, n, head, hash_node) {
> hlist_del_rcu(>hash_node);
> -   htab_elem_free(htab, l);
> +   if (l->state != HTAB_EXTRA_ELEM_USED)
> +   htab_elem_free(htab, l);
> }
> }
>  }


Re: [PATCH net-next RFC WIP] Patch for XDP support for virtio_net

2016-11-03 Thread Michael S. Tsirkin
On Thu, Nov 03, 2016 at 04:29:22PM -0700, John Fastabend wrote:
> [...]
> 
> >>> - when XDP is attached disable all LRO using 
> >>> VIRTIO_NET_CTRL_GUEST_OFFLOADS_SET
> >>>   (not used by driver so far, designed to allow dynamic LRO control with
> >>>ethtool)
> >>
> >> I see there is a UAPI bit for this but I guess we also need to add
> >> support to vhost as well? Seems otherwise we may just drop a bunch
> >> of packets on the floor out of handle_rx() when recvmsg returns larger
> >> than a page size. Or did I read this wrong...
> > 
> > It's already supported host side. However you might
> > get some packets that were in flight when you attached.
> > 
> 
> Really I must have missed it I don't see any *GUEST_FEATURES* flag in
> ./drivers/vhost/?

It's all done by QEMU catching these commands and calling
ioctls on the tun/macvtap/packet socket.

> >>> - start adding page-sized buffers
> >>
> >> I started to mangle add_recvbuf_big() and receive_big() here and this
> >> didn't seem too bad.
> > 
> > I imagine it won't be ATM but I think we'll need to support
> > mrg buffers with time and then it will be messy.
> > Besides, it's not an architectural thing that receive_big
> > uses page sized buffers, it could use any size.
> > So a separate path just for xdp would be better imho.
> > 
> >>> - do something with non-page-sized buffers added previously - what
> >>>   exactly? copy I guess? What about LRO packets that are too large -
> >>>   can we drop or can we split them up?
> >>
> >> hmm not sure I understand this here. With LRO disabled and mergeable
> >> buffers disabled all packets should fit in a page correct?
> > 
> > > Assuming F_MTU is negotiated and MTU field is small enough, yes.
> > But if you disable mrg buffers dynamically you will get some packets
> > in buffers that were added before the disable.
> > Similarly for disabling LRO dynamically.
> > 
> >> With LRO enabled case I guess to start with we block XDP from being
> >> loaded for the same reason we don't allow jumbo frames on physical
> >> nics.
> > 
> > If you ask that host disables the capability, then yes, it's easy.
> > Let's do that for now, it's a start.
> > 
> > 
> >>>
> >>> I'm fine with disabling XDP for some configurations as the first step,
> >>> and we can add that support later.
> >>>
> >>
> >> In order for this to work though I guess we need to be able to
> >> dynamically disable mergeable buffers at the moment I just commented
> >> it out of the features list and fixed up virtio_has_features so it
> >> won't bug_on.
> > 
> > For now we can just set mrg_rxbuf=off on qemu command line, and
> > fail XDP attach if not there. I think we'll be able to support it
> > long term but you will need host side changes, or fully reset
> > device and reconfigure it.
> 
> see question below. I agree disabling mrg_rxbuff=off lro=off and an
> xdp receive path makes this relatively straight forward and clean with
> the MTU patch noted below as well.
> 
> > 
> >>> Ideas about mergeable buffers (optional):
> >>>
> >>> At the moment mergeable buffers can't be disabled dynamically.
> >>> They do bring a small benefit for XDP if host MTU is large (see below)
> >>> and aren't hard to support:
> >>> - if header is by itself skip 1st page
> >>> - otherwise copy all data into first page
> >>> and it's nicer not to add random limitations that require guest reboot.
> >>> It might make sense to add a command that disables/enables
> >>> mergeable buffers dynamically but that's for newer hosts.
> >>
> >> Yep it seems disabling mergeable buffers solves this but didn't look at
> >> it too closely. I'll look closer tomorrow.
> >>
> >>>
> >>> Spec does not require it but in practice most hosts put all data
> >>> in the 1st page or all in the 2nd page so the copy will be nop
> >>> for these cases.
> >>>
> >>> Large host MTU - newer hosts report the host MTU, older ones don't.
> >>> Using mergeable buffers we can at least detect this case
> >>> (and then what? drop I guess).
> >>>
> >>
> >> The physical nics just refuse to load XDP with large MTU.
> > 
> > So let's do the same for now, unfortunately you don't know
> > the MTU unless _F_MTU is negotiated and QEMU does not
> > implement that yet, but it's easy to add.
> > In fact I suspect Aaron (cc) has an implementation since
> > he posted a patch implementing that.
> > Aaron could you post it pls?
> > 
> 
> Great! Aaron if you want me to review/test at all let me know I have
> a few systems setup running this now so can help if needed.
> 
> >> Any reason
> >> not to negotiate the mtu with the guest so that the guest can force
> >> this?
> > 
> > There are generally many guests and many NICs on the host.
> > A big packet arrives, what do you want to do with it?
> 
> Drop it just like a physical nic would do if packet is larger
> than MTU. Maybe splat a message in the log so user has some
> clue something got misconfigured.
> 
> > We probably want to build propagating MTU across all VMs and NICs
> > but let's get a 

[Patch net] ipvs: use IPVS_CMD_ATTR_MAX for family.maxattr

2016-11-03 Thread Cong Wang
family.maxattr is the max index for policy[], the size of
ops[] is determined with ARRAY_SIZE().

Reported-by: Andrey Konovalov 
Tested-by: Andrey Konovalov 
Cc: Pablo Neira Ayuso 
Signed-off-by: Cong Wang 
---
 net/netfilter/ipvs/ip_vs_ctl.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c
index c3c809b..a6e44ef 100644
--- a/net/netfilter/ipvs/ip_vs_ctl.c
+++ b/net/netfilter/ipvs/ip_vs_ctl.c
@@ -2845,7 +2845,7 @@ static struct genl_family ip_vs_genl_family = {
.hdrsize= 0,
.name   = IPVS_GENL_NAME,
.version= IPVS_GENL_VERSION,
-   .maxattr= IPVS_CMD_MAX,
+   .maxattr= IPVS_CMD_ATTR_MAX,
.netnsok= true, /* Make ipvsadm to work on netns */
 };
 
-- 
2.1.0



[PATCH net] bpf: fix map not being uncharged during map creation failure

2016-11-03 Thread Daniel Borkmann
In map_create(), we first find and create the map, then once that
succeeded, we charge it to the user's RLIMIT_MEMLOCK, and then fetch
a new anon fd through anon_inode_getfd(). The problem is, once the
latter fails f.e. due to RLIMIT_NOFILE limit, then we only destruct
the map via map->ops->map_free(), but without uncharging the previously
locked memory first. That means that the user_struct allocation is
leaked as well as the accounted RLIMIT_MEMLOCK memory not released.
Make the label names in the fix consistent with bpf_prog_load().

Fixes: aaac3ba95e4c ("bpf: charge user for creation of BPF maps and programs")
Signed-off-by: Daniel Borkmann 
Acked-by: Alexei Starovoitov 
---
 kernel/bpf/syscall.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 228f962..237f3d6 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -194,7 +194,7 @@ static int map_create(union bpf_attr *attr)
 
err = bpf_map_charge_memlock(map);
if (err)
-   goto free_map;
+   goto free_map_nouncharge;
 
err = bpf_map_new_fd(map);
if (err < 0)
@@ -204,6 +204,8 @@ static int map_create(union bpf_attr *attr)
return err;
 
 free_map:
+   bpf_map_uncharge_memlock(map);
+free_map_nouncharge:
map->ops->map_free(map);
return err;
 }
-- 
1.9.3



[PATCH net 6/6] net/mlx5: Fix invalid pointer reference when prof_sel parameter is invalid

2016-11-03 Thread Saeed Mahameed
From: Huy Nguyen 

When prof_sel is invalid, mlx5_core_warn is called but the
mlx5_core_dev is not initialized yet. The solution is to move the
prof_sel handling after the dev->pdev assignment.

Fixes: 2974ab6e8bd8 ('net/mlx5: Improve driver log messages')
Signed-off-by: Huy Nguyen 
Signed-off-by: Saeed Mahameed 
---
 drivers/net/ethernet/mellanox/mlx5/core/main.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c 
b/drivers/net/ethernet/mellanox/mlx5/core/main.c
index d5433c4..3eb9315 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c
@@ -1226,6 +1226,9 @@ static int init_one(struct pci_dev *pdev,
 
pci_set_drvdata(pdev, dev);
 
+   dev->pdev = pdev;
+   dev->event = mlx5_core_event;
+
if (prof_sel < 0 || prof_sel >= ARRAY_SIZE(profile)) {
mlx5_core_warn(dev,
   "selected profile out of range, selecting 
default (%d)\n",
@@ -1233,8 +1236,6 @@ static int init_one(struct pci_dev *pdev,
prof_sel = MLX5_DEFAULT_PROF;
}
dev->profile = [prof_sel];
-   dev->pdev = pdev;
-   dev->event = mlx5_core_event;
 
INIT_LIST_HEAD(>ctx_list);
spin_lock_init(>ctx_lock);
-- 
2.7.4



[PATCH net 3/6] net/mlx5e: Disallow changing name-space for VF representors

2016-11-03 Thread Saeed Mahameed
From: Or Gerlitz 

VF reps should all remain in the same NS in which they were created.

Signed-off-by: Or Gerlitz 
Signed-off-by: Saeed Mahameed 
---
 drivers/net/ethernet/mellanox/mlx5/core/en_rep.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c 
b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c
index 7fe6559..bf1c09c 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c
@@ -308,7 +308,7 @@ static void mlx5e_build_rep_netdev(struct net_device 
*netdev)
netdev->switchdev_ops = _rep_switchdev_ops;
 #endif
 
-   netdev->features |= NETIF_F_VLAN_CHALLENGED | NETIF_F_HW_TC;
+   netdev->features |= NETIF_F_VLAN_CHALLENGED | NETIF_F_HW_TC | 
NETIF_F_NETNS_LOCAL;
netdev->hw_features  |= NETIF_F_HW_TC;
 
eth_hw_addr_random(netdev);
-- 
2.7.4



[PATCH net 2/6] net/mlx5e: Re-arrange XDP SQ/CQ creation

2016-11-03 Thread Saeed Mahameed
In mlx5e_open_channel CQs must be created before napi is enabled.
Here we move the XDP CQ creation to satisfy that fact.

mlx5e_close_channel is already working according to the right order.

Fixes: b5503b994ed5 ("net/mlx5e: XDP TX forwarding support")
Signed-off-by: Saeed Mahameed 
Reported-by: Jesper Dangaard Brouer 
---
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 32 +++
 1 file changed, 15 insertions(+), 17 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c 
b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index c83619d..84e8b25 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -1445,6 +1445,7 @@ static int mlx5e_open_channel(struct mlx5e_priv *priv, 
int ix,
c->netdev   = priv->netdev;
c->mkey_be  = cpu_to_be32(priv->mdev->mlx5e_res.mkey.key);
c->num_tc   = priv->params.num_tc;
+   c->xdp  = !!priv->xdp_prog;
 
if (priv->params.rx_am_enabled)
rx_cq_profile = 
mlx5e_am_get_def_profile(priv->params.rx_cq_period_mode);
@@ -1468,6 +1469,12 @@ static int mlx5e_open_channel(struct mlx5e_priv *priv, 
int ix,
if (err)
goto err_close_tx_cqs;
 
+   /* XDP SQ CQ params are same as normal TXQ sq CQ params */
+   err = c->xdp ? mlx5e_open_cq(c, >tx_cq, >xdp_sq.cq,
+priv->params.tx_cq_moderation) : 0;
+   if (err)
+   goto err_close_rx_cq;
+
napi_enable(>napi);
 
err = mlx5e_open_sq(c, 0, >icosq, >icosq);
@@ -1488,21 +1495,10 @@ static int mlx5e_open_channel(struct mlx5e_priv *priv, 
int ix,
}
}
 
-   if (priv->xdp_prog) {
-   /* XDP SQ CQ params are same as normal TXQ sq CQ params */
-   err = mlx5e_open_cq(c, >tx_cq, >xdp_sq.cq,
-   priv->params.tx_cq_moderation);
-   if (err)
-   goto err_close_sqs;
-
-   err = mlx5e_open_sq(c, 0, >xdp_sq, >xdp_sq);
-   if (err) {
-   mlx5e_close_cq(>xdp_sq.cq);
-   goto err_close_sqs;
-   }
-   }
+   err = c->xdp ? mlx5e_open_sq(c, 0, >xdp_sq, >xdp_sq) : 0;
+   if (err)
+   goto err_close_sqs;
 
-   c->xdp = !!priv->xdp_prog;
err = mlx5e_open_rq(c, >rq, >rq);
if (err)
goto err_close_xdp_sq;
@@ -1512,10 +1508,8 @@ static int mlx5e_open_channel(struct mlx5e_priv *priv, 
int ix,
 
return 0;
 err_close_xdp_sq:
-   if (priv->xdp_prog) {
+   if (c->xdp)
mlx5e_close_sq(>xdp_sq);
-   mlx5e_close_cq(>xdp_sq.cq);
-   }
 
 err_close_sqs:
mlx5e_close_sqs(c);
@@ -1525,6 +1519,10 @@ static int mlx5e_open_channel(struct mlx5e_priv *priv, 
int ix,
 
 err_disable_napi:
napi_disable(>napi);
+   if (c->xdp)
+   mlx5e_close_cq(>xdp_sq.cq);
+
+err_close_rx_cq:
mlx5e_close_cq(>rq.cq);
 
 err_close_tx_cqs:
-- 
2.7.4



[PATCH net 1/6] net/mlx5e: Fix XDP error path of mlx5e_open_channel()

2016-11-03 Thread Saeed Mahameed
In case of mlx5e_open_rq fails the error handling will jump to
label err_close_xdp_sq and will try to close the xdp_sq unconditionally.
xdp_sq is valid only in case of XDP use cases, i.e priv->xdp_prog is
not null.

To fix this in this patch we test xdp_sq validity prior to closing it.

In addition we now close the xdp_sq.cq as well.

Fixes: b5503b994ed5 ("net/mlx5e: XDP TX forwarding support")
Signed-off-by: Saeed Mahameed 
Reported-by: Jesper Dangaard Brouer 
---
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 5 -
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c 
b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index f4c687c..c83619d 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -1512,7 +1512,10 @@ static int mlx5e_open_channel(struct mlx5e_priv *priv, 
int ix,
 
return 0;
 err_close_xdp_sq:
-   mlx5e_close_sq(>xdp_sq);
+   if (priv->xdp_prog) {
+   mlx5e_close_sq(>xdp_sq);
+   mlx5e_close_cq(>xdp_sq.cq);
+   }
 
 err_close_sqs:
mlx5e_close_sqs(c);
-- 
2.7.4



[PATCH net 5/6] net/mlx5: E-Switch, Set the actions for offloaded rules properly

2016-11-03 Thread Saeed Mahameed
From: Or Gerlitz 

As for the current generation of the mlx5 HW (CX4/CX4-Lx) per flow vlan
push/pop actions are emulated, we must not program them to the firmware.

Fixes: f5f82476090f ('net/mlx5: E-Switch, Support VLAN actions in the offloads 
mode')
Signed-off-by: Or Gerlitz 
Reported-by: Paul Blakey 
Signed-off-by: Saeed Mahameed 
---
 drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c 
b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
index c55ad8d..d239f5d 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
@@ -57,7 +57,8 @@ mlx5_eswitch_add_offloaded_rule(struct mlx5_eswitch *esw,
if (esw->mode != SRIOV_OFFLOADS)
return ERR_PTR(-EOPNOTSUPP);
 
-   action = attr->action;
+   /* per flow vlan pop/push is emulated, don't set that into the firmware 
*/
+   action = attr->action & ~(MLX5_FLOW_CONTEXT_ACTION_VLAN_PUSH | 
MLX5_FLOW_CONTEXT_ACTION_VLAN_POP);
 
if (action & MLX5_FLOW_CONTEXT_ACTION_FWD_DEST) {
dest.type = MLX5_FLOW_DESTINATION_TYPE_VPORT;
-- 
2.7.4



[PATCH net 0/6] Mellanox 100G mlx5 fixes 2016-11-04

2016-11-03 Thread Saeed Mahameed
Hi Dave,

This series contains six hot fixes of the mlx5 core and mlx5e driver.

Huy fixed an invalid pointer dereference on initialization flow for when
the selected mlx5 load profile is out of range.

Or provided three eswitch offloads related fixes
 - Prevent changing NS of a VF representor. 
 - Handle matching on vlan priority for offloaded TC rules
 - Set the actions for offloaded rules properly

For my part, I addressed the error-flow issues in
mlx5e_open_channel that Jesper reported just this week.

Thanks,
Saeed.

Huy Nguyen (1):
  net/mlx5: Fix invalid pointer reference when prof_sel parameter is
invalid

Or Gerlitz (3):
  net/mlx5e: Disallow changing name-space for VF representors
  net/mlx5e: Handle matching on vlan priority for offloaded TC rules
  net/mlx5: E-Switch, Set the actions for offloaded rules properly

Saeed Mahameed (2):
  net/mlx5e: Fix XDP error path of mlx5e_open_channel()
  net/mlx5e: Re-arrange XDP SQ/CQ creation

 drivers/net/ethernet/mellanox/mlx5/core/en_main.c  | 31 +++---
 drivers/net/ethernet/mellanox/mlx5/core/en_rep.c   |  2 +-
 drivers/net/ethernet/mellanox/mlx5/core/en_tc.c|  5 +++-
 .../ethernet/mellanox/mlx5/core/eswitch_offloads.c |  3 ++-
 drivers/net/ethernet/mellanox/mlx5/core/main.c |  5 ++--
 5 files changed, 26 insertions(+), 20 deletions(-)

-- 
2.7.4



[PATCH net 4/6] net/mlx5e: Handle matching on vlan priority for offloaded TC rules

2016-11-03 Thread Saeed Mahameed
From: Or Gerlitz 

We ignored the vlan priority in offloaded TC rules matching part,
fix that.

Fixes: 095b6cfd69ce ('net/mlx5e: Add TC vlan match parsing')
Signed-off-by: Or Gerlitz 
Reported-by: Paul Blakey 
Signed-off-by: Saeed Mahameed 
---
 drivers/net/ethernet/mellanox/mlx5/core/en_tc.c | 5 -
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c 
b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
index ce8c54d..6bb21b3 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
@@ -237,12 +237,15 @@ static int parse_cls_flower(struct mlx5e_priv *priv, 
struct mlx5_flow_spec *spec
skb_flow_dissector_target(f->dissector,
  FLOW_DISSECTOR_KEY_VLAN,
  f->mask);
-   if (mask->vlan_id) {
+   if (mask->vlan_id || mask->vlan_priority) {
MLX5_SET(fte_match_set_lyr_2_4, headers_c, vlan_tag, 1);
MLX5_SET(fte_match_set_lyr_2_4, headers_v, vlan_tag, 1);
 
MLX5_SET(fte_match_set_lyr_2_4, headers_c, first_vid, 
mask->vlan_id);
MLX5_SET(fte_match_set_lyr_2_4, headers_v, first_vid, 
key->vlan_id);
+
+   MLX5_SET(fte_match_set_lyr_2_4, headers_c, first_prio, 
mask->vlan_priority);
+   MLX5_SET(fte_match_set_lyr_2_4, headers_v, first_prio, 
key->vlan_priority);
}
}
 
-- 
2.7.4



Re: net/netlink: another global-out-of-bounds in genl_family_rcv_msg/validate_nla

2016-11-03 Thread Andrey Konovalov
On Fri, Nov 4, 2016 at 12:22 AM, Cong Wang  wrote:
> On Thu, Nov 3, 2016 at 4:04 PM, Andrey Konovalov  
> wrote:
>> Hi,
>>
>> I've got the following error report while running the syzkaller fuzzer:
>>
>> BUG: KASAN: global-out-of-bounds in validate_nla+0x49b/0x4e0 at addr
>> 84452de0
>> Read of size 2 by task syz-executor/19055
>> Address belongs to variable ip_vs_cmd_policy+0x20/0x40
>
> LOL, seems a typo
>
> diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c
> index c3c809b..a6e44ef 100644
> --- a/net/netfilter/ipvs/ip_vs_ctl.c
> +++ b/net/netfilter/ipvs/ip_vs_ctl.c
> @@ -2845,7 +2845,7 @@ static struct genl_family ip_vs_genl_family = {
> .hdrsize= 0,
> .name   = IPVS_GENL_NAME,
> .version= IPVS_GENL_VERSION,
> -   .maxattr= IPVS_CMD_MAX,
> +   .maxattr= IPVS_CMD_ATTR_MAX,
> .netnsok= true, /* Make ipvsadm to work on netns */
>  };

This fixes the issue for me.

Tested-by: Andrey Konovalov 

Thanks again!


[PATCH] r8169: Module parameter for opt-in of ASPM

2016-11-03 Thread Kast Bernd
This patch adds a module parameter in order to activate ASPM. By that
the CPU can enter deep sleep modes (PC6) and power consumption can be
reduced (for example from 13W to 8W on my notebook with a Haswell CPU).
Basically, it reapplies d64ec841517a25f6d468bde9f67e5b4cffdc67c7, which
was reverted due to delayed link status detection and increased boot
times on some systems. These bugs are avoided by two actions:
1) ASPM is turned off by default to avoid any problems with the
default configuration.
2) Flags for ASPM and clock request are set after ephy_init,
which wasn't respected on the previous patch. Thus ASPM with
this patch could work even with previously failing systems.

Signed-off-by: Kast Bernd 
---
 drivers/net/ethernet/realtek/r8169.c | 100 ---
 1 file changed, 92 insertions(+), 8 deletions(-)

diff --git a/drivers/net/ethernet/realtek/r8169.c 
b/drivers/net/ethernet/realtek/r8169.c
index bf000d8..9d72198 100644
--- a/drivers/net/ethernet/realtek/r8169.c
+++ b/drivers/net/ethernet/realtek/r8169.c
@@ -346,6 +346,7 @@ MODULE_DEVICE_TABLE(pci, rtl8169_pci_tbl);
 
 static int rx_buf_sz = 16383;
 static int use_dac = -1;
+static int use_aspm;
 static struct {
u32 msg_enable;
 } debug = { -1 };
@@ -509,6 +510,7 @@ enum rtl8168_registers {
 #define PWM_EN (1 << 22)
 #define RXDV_GATED_EN  (1 << 19)
 #define EARLY_TALLY_EN (1 << 16)
+#define FORCE_CLK  (1 << 15) /* force clock request */
 };
 
 enum rtl_register_content {
@@ -860,6 +862,8 @@ MODULE_AUTHOR("Realtek and the Linux r8169 crew 
");
 MODULE_DESCRIPTION("RealTek RTL-8169 Gigabit Ethernet driver");
 module_param(use_dac, int, 0);
 MODULE_PARM_DESC(use_dac, "Enable PCI DAC. Unsafe on 32 bit PCI slot.");
+module_param(use_aspm, int, );
+MODULE_PARM_DESC(use_aspm, "Enable ASPM power saving. Unsafe on some systems");
 module_param_named(debug, debug.msg_enable, int, 0);
 MODULE_PARM_DESC(debug, "Debug verbosity level (0=none, ..., 16=all)");
 MODULE_LICENSE("GPL");
@@ -5924,7 +5928,8 @@ static void rtl_hw_start_8168e_2(struct rtl8169_private 
*tp)
 
RTL_W8(MaxTxPacketSize, EarlySize);
 
-   rtl_disable_clock_request(pdev);
+   if (!use_aspm)
+   rtl_disable_clock_request(pdev);
 
RTL_W32(TxConfig, RTL_R32(TxConfig) | TXCFG_AUTO_FIFO);
RTL_W8(MCU, RTL_R8(MCU) & ~NOW_IS_OOB);
@@ -5934,7 +5939,13 @@ static void rtl_hw_start_8168e_2(struct rtl8169_private 
*tp)
 
RTL_W8(DLLPR, RTL_R8(DLLPR) | PFM_EN);
RTL_W32(MISC, RTL_R32(MISC) | PWM_EN);
-   RTL_W8(Config5, RTL_R8(Config5) & ~Spi_en);
+
+   if (use_aspm) {
+   RTL_W8(Config5, (RTL_R8(Config5) & ~Spi_en) | ASPM_en);
+   RTL_W8(Config2, RTL_R8(Config2) | ClkReqEn);
+   } else {
+   RTL_W8(Config5, RTL_R8(Config5) & ~Spi_en);
+   }
 }
 
 static void rtl_hw_start_8168f(struct rtl8169_private *tp)
@@ -5959,13 +5970,21 @@ static void rtl_hw_start_8168f(struct rtl8169_private 
*tp)
 
RTL_W8(MaxTxPacketSize, EarlySize);
 
-   rtl_disable_clock_request(pdev);
+   if (!use_aspm)
+   rtl_disable_clock_request(pdev);
 
RTL_W32(TxConfig, RTL_R32(TxConfig) | TXCFG_AUTO_FIFO);
RTL_W8(MCU, RTL_R8(MCU) & ~NOW_IS_OOB);
RTL_W8(DLLPR, RTL_R8(DLLPR) | PFM_EN);
-   RTL_W32(MISC, RTL_R32(MISC) | PWM_EN);
-   RTL_W8(Config5, RTL_R8(Config5) & ~Spi_en);
+
+   if (use_aspm) {
+   RTL_W32(MISC, RTL_R32(MISC) | PWM_EN | FORCE_CLK);
+   RTL_W8(Config5, (RTL_R8(Config5) & ~Spi_en) | ASPM_en);
+   RTL_W8(Config2, RTL_R8(Config2) | ClkReqEn);
+   } else {
+   RTL_W32(MISC, RTL_R32(MISC) | PWM_EN);
+   RTL_W8(Config5, RTL_R8(Config5) & ~Spi_en);
+   }
 }
 
 static void rtl_hw_start_8168f_1(struct rtl8169_private *tp)
@@ -6056,6 +6075,12 @@ static void rtl_hw_start_8168g_1(struct rtl8169_private 
*tp)
RTL_W8(Config2, RTL_R8(Config2) & ~ClkReqEn);
RTL_W8(Config5, RTL_R8(Config5) & ~ASPM_en);
rtl_ephy_init(tp, e_info_8168g_1, ARRAY_SIZE(e_info_8168g_1));
+
+   if (use_aspm) {
+   RTL_W8(Config5, RTL_R8(Config5) | ASPM_en);
+   RTL_W8(Config2, RTL_R8(Config2) | ClkReqEn);
+   RTL_W32(MISC, RTL_R32(MISC) | FORCE_CLK);
+   }
 }
 
 static void rtl_hw_start_8168g_2(struct rtl8169_private *tp)
@@ -6074,6 +6099,12 @@ static void rtl_hw_start_8168g_2(struct rtl8169_private 
*tp)
RTL_W8(Config2, RTL_R8(Config2) & ~ClkReqEn);
RTL_W8(Config5, RTL_R8(Config5) & ~ASPM_en);
rtl_ephy_init(tp, e_info_8168g_2, ARRAY_SIZE(e_info_8168g_2));
+
+   if (use_aspm) {
+   RTL_W8(Config5, RTL_R8(Config5) | ASPM_en);
+   RTL_W8(Config2, RTL_R8(Config2) | ClkReqEn);
+   RTL_W32(MISC, RTL_R32(MISC) 

Re: [PATCH net-next RFC WIP] Patch for XDP support for virtio_net

2016-11-03 Thread John Fastabend
[...]

>>> - when XDP is attached disable all LRO using 
>>> VIRTIO_NET_CTRL_GUEST_OFFLOADS_SET
>>>   (not used by driver so far, designed to allow dynamic LRO control with
>>>ethtool)
>>
>> I see there is a UAPI bit for this but I guess we also need to add
>> support to vhost as well? Seems otherwise we may just drop a bunch
>> of packets on the floor out of handle_rx() when recvmsg returns larger
>> than a page size. Or did I read this wrong...
> 
> It's already supported host side. However you might
> get some packets that were in flight when you attached.
> 

Really I must have missed it I don't see any *GUEST_FEATURES* flag in
./drivers/vhost/?

>>> - start adding page-sized buffers
>>
>> I started to mangle add_recvbuf_big() and receive_big() here and this
>> didn't seem too bad.
> 
> I imagine it won't be ATM but I think we'll need to support
> mrg buffers with time and then it will be messy.
> Besides, it's not an architectural thing that receive_big
> uses page sized buffers, it could use any size.
> So a separate path just for xdp would be better imho.
> 
>>> - do something with non-page-sized buffers added previously - what
>>>   exactly? copy I guess? What about LRO packets that are too large -
>>>   can we drop or can we split them up?
>>
>> hmm not sure I understand this here. With LRO disabled and mergeable
>> buffers disabled all packets should fit in a page correct?
> 
> Assuming F_MTU is negotiated and MTU field is small enough, yes.
> But if you disable mrg buffers dynamically you will get some packets
> in buffers that were added before the disable.
> Similarly for disabling LRO dynamically.
> 
>> With LRO enabled case I guess to start with we block XDP from being
>> loaded for the same reason we don't allow jumbo frames on physical
>> nics.
> 
> If you ask that host disables the capability, then yes, it's easy.
> Let's do that for now, it's a start.
> 
> 
>>>
>>> I'm fine with disabling XDP for some configurations as the first step,
>>> and we can add that support later.
>>>
>>
>> In order for this to work though I guess we need to be able to
>> dynamically disable mergeable buffers at the moment I just commented
>> it out of the features list and fixed up virtio_has_features so it
>> won't bug_on.
> 
> For now we can just set mrg_rxbuf=off on qemu command line, and
> fail XDP attach if not there. I think we'll be able to support it
> long term but you will need host side changes, or fully reset
> device and reconfigure it.

see question below. I agree disabling mrg_rxbuff=off lro=off and an
xdp receive path makes this relatively straight forward and clean with
the MTU patch noted below as well.

> 
>>> Ideas about mergeable buffers (optional):
>>>
>>> At the moment mergeable buffers can't be disabled dynamically.
>>> They do bring a small benefit for XDP if host MTU is large (see below)
>>> and aren't hard to support:
>>> - if header is by itself skip 1st page
>>> - otherwise copy all data into first page
>>> and it's nicer not to add random limitations that require guest reboot.
>>> It might make sense to add a command that disables/enables
>>> mergeable buffers dynamically but that's for newer hosts.
>>
>> Yep it seems disabling mergeable buffers solves this but didn't look at
>> it too closely. I'll look closer tomorrow.
>>
>>>
>>> Spec does not require it but in practice most hosts put all data
>>> in the 1st page or all in the 2nd page so the copy will be nop
>>> for these cases.
>>>
>>> Large host MTU - newer hosts report the host MTU, older ones don't.
>>> Using mergeable buffers we can at least detect this case
>>> (and then what? drop I guess).
>>>
>>
>> The physical nics just refuse to load XDP with large MTU.
> 
> So let's do the same for now, unfortunately you don't know
> the MTU unless _F_MTU is negotiated and QEMU does not
> implement that yet, but it's easy to add.
> In fact I suspect Aaron (cc) has an implementation since
> he posted a patch implementing that.
> Aaron could you post it pls?
> 

Great! Aaron if you want me to review/test at all let me know I have
a few systems setup running this now so can help if needed.

>> Any reason
>> not to negotiate the mtu with the guest so that the guest can force
>> this?
> 
> There are generally many guests and many NICs on the host.
> A big packet arrives, what do you want to do with it?

Drop it just like a physical nic would do if packet is larger
than MTU. Maybe splat a message in the log so user has some
clue something got misconfigured.

> We probably want to build propagating MTU across all VMs and NICs
> but let's get a basic thing merged first.

That feels like an orchestration/QEMU type problem to me. Just because
some NIC has jumbo frames enabled doesn't necessarily mean they would
ever get to any specific VM based on forwarding configuration.

And if I try to merge the last email I sent out here. In mergeable and
big_packets modes if LRO is off and MTU < PAGE_SIZE it seems we should
always get physically 

[PATCH net-next] netfilter: Update nf_send_reset6 to consider L3 domain

2016-11-03 Thread David Ahern
nf_send_reset6 is not considering the L3 domain and lookups are sent
to the wrong table. For example consider the following output rule:

ip6tables -A OUTPUT -p tcp --dport 12345 -j REJECT --reject-with tcp-reset

using perf to analyze lookups via the fib6_table_lookup tracepoint shows:

swapper 0 [001]   248.787816: fib6:fib6_table_lookup: table 255 oif 0 iif 1 
src 2100:1::3 dst 2100:1:
81439cdc perf_trace_fib6_table_lookup ([kernel.kallsyms])
814c1ce3 trace_fib6_table_lookup ([kernel.kallsyms])
814c3e89 ip6_pol_route ([kernel.kallsyms])
814c40d5 ip6_pol_route_output ([kernel.kallsyms])
814e7b6f fib6_rule_action ([kernel.kallsyms])
81437f60 fib_rules_lookup ([kernel.kallsyms])
814e7c79 fib6_rule_lookup ([kernel.kallsyms])
814c2541 ip6_route_output_flags ([kernel.kallsyms])
 528 nf_send_reset6 ([nf_reject_ipv6])

Update nf_send_reset6 to pull the L3 domain from the dst currently
attached to the skb.

Signed-off-by: David Ahern 
---
 net/ipv6/netfilter/nf_reject_ipv6.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/net/ipv6/netfilter/nf_reject_ipv6.c 
b/net/ipv6/netfilter/nf_reject_ipv6.c
index a5400223fd74..10090400c72f 100644
--- a/net/ipv6/netfilter/nf_reject_ipv6.c
+++ b/net/ipv6/netfilter/nf_reject_ipv6.c
@@ -156,6 +156,7 @@ void nf_send_reset6(struct net *net, struct sk_buff 
*oldskb, int hook)
fl6.daddr = oip6h->saddr;
fl6.fl6_sport = otcph->dest;
fl6.fl6_dport = otcph->source;
+   fl6.flowi6_oif = l3mdev_master_ifindex(skb_dst(oldskb)->dev);
security_skb_classify_flow(oldskb, flowi6_to_flowi(&fl6));
dst = ip6_route_output(net, NULL, &fl6);
if (dst->error) {
-- 
2.1.4



Re: net/netlink: another global-out-of-bounds in genl_family_rcv_msg/validate_nla

2016-11-03 Thread Cong Wang
On Thu, Nov 3, 2016 at 4:04 PM, Andrey Konovalov  wrote:
> Hi,
>
> I've got the following error report while running the syzkaller fuzzer:
>
> BUG: KASAN: global-out-of-bounds in validate_nla+0x49b/0x4e0 at addr
> 84452de0
> Read of size 2 by task syz-executor/19055
> Address belongs to variable ip_vs_cmd_policy+0x20/0x40

LOL, seems a typo

diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c
index c3c809b..a6e44ef 100644
--- a/net/netfilter/ipvs/ip_vs_ctl.c
+++ b/net/netfilter/ipvs/ip_vs_ctl.c
@@ -2845,7 +2845,7 @@ static struct genl_family ip_vs_genl_family = {
.hdrsize= 0,
.name   = IPVS_GENL_NAME,
.version= IPVS_GENL_VERSION,
-   .maxattr= IPVS_CMD_MAX,
+   .maxattr= IPVS_CMD_ATTR_MAX,
.netnsok= true, /* Make ipvsadm to work on netns */
 };


[PATCH] net: icmp6_send should use dst dev to determine L3 domain

2016-11-03 Thread David Ahern
icmp6_send is called in response to some event. The skb may not have
the device set (skb->dev is NULL), but it is expected to have a dst set.
Update icmp6_send to use the dst on the skb to determine L3 domain.

Fixes: ca254490c8dfd ("net: Add VRF support to IPv6 stack")
Signed-off-by: David Ahern 
---
 net/ipv6/icmp.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c
index bd59c343d35f..7370ad2e693a 100644
--- a/net/ipv6/icmp.c
+++ b/net/ipv6/icmp.c
@@ -448,7 +448,7 @@ static void icmp6_send(struct sk_buff *skb, u8 type, u8 
code, __u32 info,
if (__ipv6_addr_needs_scope_id(addr_type))
iif = skb->dev->ifindex;
else
-   iif = l3mdev_master_ifindex(skb->dev);
+   iif = l3mdev_master_ifindex(skb_dst(skb)->dev);
 
/*
 *  Must not send error if the source does not uniquely
-- 
2.1.4



Re: [PATCH net-next] sock: do not set sk_err in sock_dequeue_err_skb

2016-11-03 Thread Hannes Frederic Sowa
[also cc'ed Andy, albeit this doesn't seem to solve his initial problem,
right? ]

On 03.11.2016 23:24, Soheil Hassas Yeganeh wrote:
> From: Soheil Hassas Yeganeh 
> 
> Do not set sk_err when dequeuing errors from the error queue.
> Doing so results in:
> a) Bugs: By overwriting existing sk_err values, it possibly
>hides legitimate errors. It is also incorrect when local
>errors are queued with ip_local_error. That happens in the
>context of a system call, which already returns the error
>code.
> b) Inconsistent behavior: When there are pending errors on
>the error queue, sk_err is sometimes 0 (e.g., for
>the first timestamp on the error queue) and sometimes
>set to an error code (after dequeuing the first
>timestamp).
> c) Suboptimality: Setting sk_err to ENOMSG on simple
>TX timestamps can abort parallel reads and writes.
> 
> Removing this line doesn't break userspace. This is because
> userspace code cannot rely on sk_err for detecting whether
> there is something on the error queue. Except for ICMP messages
> received for UDP and RAW, sk_err is not set at enqueue time,
> and as a result sk_err can be 0 while there are plenty of
> errors on the error queue.
> 
> For ICMP packets in UDP and RAW, sk_err is set when they are
> enqueued on the error queue, but that does not result in aborting
> reads and writes. For such cases, sk_err is only readable via
> getsockopt(SO_ERROR) which will reset the value of sk_err on
> its own. More importantly, prior to this patch,
> recvmsg(MSG_ERRQUEUE) has a race on setting sk_err (i.e.,
> sk_err is set by sock_dequeue_err_skb without atomic ops or
> locks) which can store 0 in sk_err even when we have ICMP
> messages pending. Removing this line from sock_dequeue_err_skb
> eliminates that race.
> 
> Signed-off-by: Soheil Hassas Yeganeh 
> Signed-off-by: Eric Dumazet 
> Signed-off-by: Willem de Bruijn 
> Signed-off-by: Neal Cardwell 

I think it makes sense to remove this given your argumentation.

Acked-by: Hannes Frederic Sowa 



net/netlink: another global-out-of-bounds in genl_family_rcv_msg/validate_nla

2016-11-03 Thread Andrey Konovalov
Hi,

I've got the following error report while running the syzkaller fuzzer:

BUG: KASAN: global-out-of-bounds in validate_nla+0x49b/0x4e0 at addr
84452de0
Read of size 2 by task syz-executor/19055
Address belongs to variable ip_vs_cmd_policy+0x20/0x40
CPU: 1 PID: 19055 Comm: syz-executor Not tainted 4.9.0-rc3+ #350
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Bochs 01/01/2011
 88006b547638 81b46934 88006b5476c8 847a361f
 84452dc0 84452de0 88006b5476b8 8150ac7c
 859bdf80 85f44280 88003df282c0 0292
Call Trace:
 [< inline >] __dump_stack lib/dump_stack.c:15
 [] dump_stack+0xb3/0x10f lib/dump_stack.c:51
 [< inline >] print_address_description mm/kasan/report.c:204
 [] kasan_report_error+0x49c/0x4d0 mm/kasan/report.c:283
 [< inline >] kasan_report mm/kasan/report.c:303
 [] __asan_report_load2_noabort+0x3e/0x40
mm/kasan/report.c:322
 [] validate_nla+0x49b/0x4e0 lib/nlattr.c:41
 [] nla_parse+0x115/0x280 lib/nlattr.c:195
 [< inline >] nlmsg_parse include/net/netlink.h:386
 [] genl_family_rcv_msg+0x543/0xc80
net/netlink/genetlink.c:613
 [] genl_rcv_msg+0x1b6/0x270 net/netlink/genetlink.c:658
 [] netlink_rcv_skb+0x2c0/0x3b0 net/netlink/af_netlink.c:2281
 [] genl_rcv+0x28/0x40 net/netlink/genetlink.c:669
 [< inline >] netlink_unicast_kernel net/netlink/af_netlink.c:1214
 [] netlink_unicast+0x5a9/0x880 net/netlink/af_netlink.c:1240
 [] netlink_sendmsg+0x9b7/0xce0 net/netlink/af_netlink.c:1786
 [< inline >] sock_sendmsg_nosec net/socket.c:606
 [] sock_sendmsg+0xcc/0x110 net/socket.c:616
 [] sock_write_iter+0x221/0x3b0 net/socket.c:814
 [< inline >] new_sync_write fs/read_write.c:499
 [] __vfs_write+0x334/0x570 fs/read_write.c:512
 [] vfs_write+0x17b/0x500 fs/read_write.c:560
 [< inline >] SYSC_write fs/read_write.c:607
 [] SyS_write+0xd4/0x1a0 fs/read_write.c:599
 [] do_syscall_64+0x195/0x490 arch/x86/entry/common.c:280
 [] entry_SYSCALL64_slow_path+0x25/0x25
Memory state around the buggy address:
 84452c80: fa fa fa fa 00 00 00 00 00 00 04 fa fa fa fa fa
 84452d00: 00 00 00 00 00 00 04 fa fa fa fa fa 00 00 00 00
>84452d80: 04 fa fa fa fa fa fa fa 00 00 00 04 fa fa fa fa
   ^
 84452e00: 00 fa fa fa fa fa fa fa 00 00 fa fa fa fa fa fa
 84452e80: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
==

This time the out-of-bounds is on the ip_vs_cmd_policy variable.

On commit 0c183d92b20b5c84ca655b45ef57b3318b83eb9e (Oct 31).

Thanks!


[PATCH net] bpf: fix htab map destruction when extra reserve is in use

2016-11-03 Thread Daniel Borkmann
Commit a6ed3ea65d98 ("bpf: restore behavior of bpf_map_update_elem")
added an extra per-cpu reserve to the hash table map to restore old
behaviour from pre prealloc times. When non-prealloc is in use for a
map, then the problem is that once a hash table extra element has been
linked into the hash-table, and the hash table is destroyed due to
refcount dropping to zero, then htab_map_free() -> delete_all_elements()
will walk the whole hash table and drop all elements via htab_elem_free().
The problem is that the element from the extra reserve is first fed
to the wrong backend allocator and eventually freed twice.

Fixes: a6ed3ea65d98 ("bpf: restore behavior of bpf_map_update_elem")
Reported-by: Dmitry Vyukov 
Signed-off-by: Daniel Borkmann 
Acked-by: Alexei Starovoitov 
---
 kernel/bpf/hashtab.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index 570eeca..ad1bc67 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -687,7 +687,8 @@ static void delete_all_elements(struct bpf_htab *htab)
 
hlist_for_each_entry_safe(l, n, head, hash_node) {
hlist_del_rcu(&l->hash_node);
-   htab_elem_free(htab, l);
+   if (l->state != HTAB_EXTRA_ELEM_USED)
+   htab_elem_free(htab, l);
}
}
 }
-- 
1.9.3



Re: [PATCH net-next RFC WIP] Patch for XDP support for virtio_net

2016-11-03 Thread Michael S. Tsirkin
On Wed, Nov 02, 2016 at 11:44:33PM -0700, John Fastabend wrote:
> On 16-11-02 09:11 PM, Michael S. Tsirkin wrote:
> > On Wed, Nov 02, 2016 at 06:28:34PM -0700, Shrijeet Mukherjee wrote:
> >>> -Original Message-
> >>> From: Jesper Dangaard Brouer [mailto:bro...@redhat.com]
> >>> Sent: Wednesday, November 2, 2016 7:27 AM
> >>> To: Thomas Graf 
> >>> Cc: Shrijeet Mukherjee ; Alexei Starovoitov
> >>> ; Jakub Kicinski ; John
> >>> Fastabend ; David Miller
> >>> ; alexander.du...@gmail.com; m...@redhat.com;
> >>> shrij...@gmail.com; t...@herbertland.com; netdev@vger.kernel.org;
> >>> Roopa Prabhu ; Nikolay Aleksandrov
> >>> ; bro...@redhat.com
> >>> Subject: Re: [PATCH net-next RFC WIP] Patch for XDP support for
> >> virtio_net
> >>>
> >>> On Sat, 29 Oct 2016 13:25:14 +0200
> >>> Thomas Graf  wrote:
> >>>
>  On 10/28/16 at 08:51pm, Shrijeet Mukherjee wrote:
> > Generally agree, but SRIOV nics with multiple queues can end up in a
> > bad spot if each buffer was 4K right ? I see a specific page pool to
> > be used by queues which are enabled for XDP as the easiest to swing
> > solution that way the memory overhead can be restricted to enabled
> > queues and shared access issues can be restricted to skb's using
> >> that
> >>> pool no ?
> >>>
> >>> Yes, that is why that I've been arguing so strongly for having the
> >> flexibility to
> >>> attach a XDP program per RX queue, as this only change the memory model
> >>> for this one queue.
> >>>
> >>>
>  Isn't this clearly a must anyway? I may be missing something
>  fundamental here so please enlighten me :-)
> 
>  If we dedicate a page per packet, that could translate to 14M*4K worth
>  of memory being mapped per second for just a 10G NIC under DoS attack.
>  How can one protect such a system? Is the assumption that we can
>  always drop such packets quickly enough before we start dropping
>  randomly due to memory pressure? If a handshake is required to
>  determine validity of a packet then that is going to be difficult.
> >>>
> >>> Under DoS attacks you don't run out of memory, because a diverse set of
> >>> socket memory limits/accounting avoids that situation.  What does happen
> >>> is the maximum achievable PPS rate is directly dependent on the
> >>> time you spend on each packet.   This use of CPU resources (and
> >>> hitting mem-limits-safe-guards) push-back on the drivers speed to
> >> process
> >>> the RX ring.  In effect, packets are dropped in the NIC HW as RX-ring
> >> queue
> >>> is not emptied fast-enough.
> >>>
> >>> Given you don't control what HW drops, the attacker will "successfully"
> >>> cause your good traffic to be among the dropped packets.
> >>>
> >>> This is where XDP change the picture. If you can express (by eBPF) a
> >> filter
> >>> that can separate "bad" vs "good" traffic, then you can take back
> >> control.
> >>> Almost like controlling what traffic the HW should drop.
> >>> Given the cost of XDP-eBPF filter + serving regular traffic does not use
> >> all of
> >>> your CPU resources, you have overcome the attack.
> >>>
> >>> --
> >> Jesper,  John et al .. to make this a little concrete I am going to spin
> >> up a v2 which has only bigbuffers mode enabled for xdp acceleration, all
> >> other modes will reject the xdp ndo ..
> >>
> >> Do we have agreement on that model ?
> >>
> >> It will need that all vhost implementations will need to start with
> >> mergeable buffers disabled to get xdp goodness, but that sounds like a
> >> safe thing to do for now ..
> > 
> > It's ok for experimentation, but really after speaking with Alexei it's
> > clear to me that xdp should have a separate code path in the driver,
> > e.g. the separation between modes is something that does not
> > make sense for xdp.
> > 
> > The way I imagine it working:
> 
> OK I tried to make some sense out of this and get it working,
> 
> > 
> > - when XDP is attached disable all LRO using 
> > VIRTIO_NET_CTRL_GUEST_OFFLOADS_SET
> >   (not used by driver so far, designed to allow dynamic LRO control with
> >ethtool)
> 
> I see there is a UAPI bit for this but I guess we also need to add
> support to vhost as well? Seems otherwise we may just drop a bunch
> of packets on the floor out of handle_rx() when recvmsg returns larger
> than a page size. Or did I read this wrong...

It's already supported host side. However you might
get some packets that were in flight when you attached.

> > - start adding page-sized buffers
> 
> I started to mangle add_recvbuf_big() and receive_big() here and this
> didn't seem too bad.

I imagine it won't be ATM but I think we'll need to support
mrg buffers with time and then it will be messy.
Besides, it's not an architectural thing that receive_big
uses page sized 

Re: [PATCH for-next V2 00/15][PULL request] Mellanox mlx5 core driver updates 2016-10-25

2016-11-03 Thread Doug Ledford
On 10/30/16 3:35 PM, David Miller wrote:
> From: Saeed Mahameed 
> Date: Sun, 30 Oct 2016 23:21:53 +0200
> 
>> This series contains some updates and fixes of mlx5 core and
>> IB drivers with the addition of two features that demand
>> new low level commands and infrastructure updates.
>>  - SRIOV VF max rate limit support
>>  - mlx5e tc support for FWD rules with counter.
>>
>> Needed for both net and rdma subsystems.
> 
> Pulled, thanks.

Thanks, done here as well.

-- 
Doug Ledford GPG Key ID: 0E572FDD
  Red Hat, Inc.
  100 E. Davie St
  Raleigh, NC 27601 USA



signature.asc
Description: OpenPGP digital signature


[PATCH net-next] sock: do not set sk_err in sock_dequeue_err_skb

2016-11-03 Thread Soheil Hassas Yeganeh
From: Soheil Hassas Yeganeh 

Do not set sk_err when dequeuing errors from the error queue.
Doing so results in:
a) Bugs: By overwriting existing sk_err values, it possibly
   hides legitimate errors. It is also incorrect when local
   errors are queued with ip_local_error. That happens in the
   context of a system call, which already returns the error
   code.
b) Inconsistent behavior: When there are pending errors on
   the error queue, sk_err is sometimes 0 (e.g., for
   the first timestamp on the error queue) and sometimes
   set to an error code (after dequeuing the first
   timestamp).
c) Suboptimality: Setting sk_err to ENOMSG on simple
   TX timestamps can abort parallel reads and writes.

Removing this line doesn't break userspace. This is because
userspace code cannot rely on sk_err for detecting whether
there is something on the error queue. Except for ICMP messages
received for UDP and RAW, sk_err is not set at enqueue time,
and as a result sk_err can be 0 while there are plenty of
errors on the error queue.

For ICMP packets in UDP and RAW, sk_err is set when they are
enqueued on the error queue, but that does not result in aborting
reads and writes. For such cases, sk_err is only readable via
getsockopt(SO_ERROR) which will reset the value of sk_err on
its own. More importantly, prior to this patch,
recvmsg(MSG_ERRQUEUE) has a race on setting sk_err (i.e.,
sk_err is set by sock_dequeue_err_skb without atomic ops or
locks) which can store 0 in sk_err even when we have ICMP
messages pending. Removing this line from sock_dequeue_err_skb
eliminates that race.

Signed-off-by: Soheil Hassas Yeganeh 
Signed-off-by: Eric Dumazet 
Signed-off-by: Willem de Bruijn 
Signed-off-by: Neal Cardwell 
---
 net/core/skbuff.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 1e3e008..0b2a6e9 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -3725,7 +3725,6 @@ struct sk_buff *sock_dequeue_err_skb(struct sock *sk)
err = SKB_EXT_ERR(skb_next)->ee.ee_errno;
spin_unlock_irqrestore(&q->lock, flags);
 
-   sk->sk_err = err;
if (err)
sk->sk_error_report(sk);
 
-- 
2.8.0.rc3.226.g39d4020



Re: [PATCH net-next RFC WIP] Patch for XDP support for virtio_net

2016-11-03 Thread John Fastabend
On 16-11-02 11:44 PM, John Fastabend wrote:
> On 16-11-02 09:11 PM, Michael S. Tsirkin wrote:
>> On Wed, Nov 02, 2016 at 06:28:34PM -0700, Shrijeet Mukherjee wrote:
 -Original Message-
 From: Jesper Dangaard Brouer [mailto:bro...@redhat.com]
 Sent: Wednesday, November 2, 2016 7:27 AM
 To: Thomas Graf 
 Cc: Shrijeet Mukherjee ; Alexei Starovoitov
 ; Jakub Kicinski ; John
 Fastabend ; David Miller
 ; alexander.du...@gmail.com; m...@redhat.com;
 shrij...@gmail.com; t...@herbertland.com; netdev@vger.kernel.org;
 Roopa Prabhu ; Nikolay Aleksandrov
 ; bro...@redhat.com
 Subject: Re: [PATCH net-next RFC WIP] Patch for XDP support for
>>> virtio_net

 On Sat, 29 Oct 2016 13:25:14 +0200
 Thomas Graf  wrote:

> On 10/28/16 at 08:51pm, Shrijeet Mukherjee wrote:
>> Generally agree, but SRIOV nics with multiple queues can end up in a
>> bad spot if each buffer was 4K right ? I see a specific page pool to
>> be used by queues which are enabled for XDP as the easiest to swing
>> solution that way the memory overhead can be restricted to enabled
>> queues and shared access issues can be restricted to skb's using
>>> that
 pool no ?

 Yes, that is why that I've been arguing so strongly for having the
>>> flexibility to
 attach a XDP program per RX queue, as this only change the memory model
 for this one queue.


> Isn't this clearly a must anyway? I may be missing something
> fundamental here so please enlighten me :-)
>
> If we dedicate a page per packet, that could translate to 14M*4K worth
> of memory being mapped per second for just a 10G NIC under DoS attack.
> How can one protect such a system? Is the assumption that we can
> always drop such packets quickly enough before we start dropping
> randomly due to memory pressure? If a handshake is required to
> determine validity of a packet then that is going to be difficult.

 Under DoS attacks you don't run out of memory, because a diverse set of
 socket memory limits/accounting avoids that situation.  What does happen
 is the maximum achievable PPS rate is directly dependent on the
 time you spend on each packet.   This use of CPU resources (and
 hitting mem-limits-safe-guards) push-back on the drivers speed to
>>> process
 the RX ring.  In effect, packets are dropped in the NIC HW as RX-ring
>>> queue
 is not emptied fast-enough.

 Given you don't control what HW drops, the attacker will "successfully"
 cause your good traffic to be among the dropped packets.

 This is where XDP change the picture. If you can express (by eBPF) a
>>> filter
 that can separate "bad" vs "good" traffic, then you can take back
>>> control.
 Almost like controlling what traffic the HW should drop.
 Given the cost of XDP-eBPF filter + serving regular traffic does not use
>>> all of
 your CPU resources, you have overcome the attack.

 --
>>> Jesper,  John et al .. to make this a little concrete I am going to spin
>>> up a v2 which has only bigbuffers mode enabled for xdp acceleration, all
>>> other modes will reject the xdp ndo ..
>>>
>>> Do we have agreement on that model ?
>>>
>>> It will need that all vhost implementations will need to start with
>>> mergeable buffers disabled to get xdp goodness, but that sounds like a
>>> safe thing to do for now ..
>>
>> It's ok for experimentation, but really after speaking with Alexei it's
>> clear to me that xdp should have a separate code path in the driver,
>> e.g. the separation between modes is something that does not
>> make sense for xdp.
>>
>> The way I imagine it working:
> 
> OK I tried to make some sense out of this and get it working,
> 
>>
>> - when XDP is attached disable all LRO using 
>> VIRTIO_NET_CTRL_GUEST_OFFLOADS_SET
>>   (not used by driver so far, designed to allow dynamic LRO control with
>>ethtool)
> 
> I see there is a UAPI bit for this but I guess we also need to add
> support to vhost as well? Seems otherwise we may just drop a bunch
> of packets on the floor out of handle_rx() when recvmsg returns larger
> than a page size. Or did I read this wrong...
> 
>> - start adding page-sized buffers
> 
> I started to mangle add_recvbuf_big() and receive_big() here and this
> didn't seem too bad.
> 
>> - do something with non-page-sized buffers added previously - what
>>   exactly? copy I guess? What about LRO packets that are too large -
>>   can we drop or can we split them up?
> 
> hmm not sure I understand this here. With LRO disabled and mergeable
> buffers disabled all packets should fit in a page correct?
> 
> With LRO enabled case I guess to start with we block XDP from being
> loaded 

[PATCH net-next 2/2] sfc: report 4-tuple UDP hashing to ethtool, if it's enabled

2016-11-03 Thread Edward Cree
Signed-off-by: Edward Cree 
---
 drivers/net/ethernet/sfc/ef10.c   |  6 --
 drivers/net/ethernet/sfc/ethtool.c| 12 
 drivers/net/ethernet/sfc/net_driver.h |  2 ++
 3 files changed, 14 insertions(+), 6 deletions(-)

diff --git a/drivers/net/ethernet/sfc/ef10.c b/drivers/net/ethernet/sfc/ef10.c
index 9f6d769..e61807e 100644
--- a/drivers/net/ethernet/sfc/ef10.c
+++ b/drivers/net/ethernet/sfc/ef10.c
@@ -2319,8 +2319,10 @@ static void efx_ef10_set_rss_flags(struct efx_nic *efx, 
u32 context)
flags |= RSS_MODE_HASH_PORTS << 
MC_CMD_RSS_CONTEXT_GET_FLAGS_OUT_UDP_IPV4_RSS_MODE_LBN;
flags |= RSS_MODE_HASH_PORTS << 
MC_CMD_RSS_CONTEXT_GET_FLAGS_OUT_UDP_IPV6_RSS_MODE_LBN;
MCDI_SET_DWORD(inbuf, RSS_CONTEXT_SET_FLAGS_IN_FLAGS, flags);
-   efx_mcdi_rpc(efx, MC_CMD_RSS_CONTEXT_SET_FLAGS, inbuf, sizeof(inbuf),
-NULL, 0, NULL);
+   if (!efx_mcdi_rpc(efx, MC_CMD_RSS_CONTEXT_SET_FLAGS, inbuf, 
sizeof(inbuf),
+ NULL, 0, NULL))
+   /* Succeeded, so UDP 4-tuple is now enabled */
+   efx->rx_hash_udp_4tuple = true;
 }
 
 static int efx_ef10_alloc_rss_context(struct efx_nic *efx, u32 *context,
diff --git a/drivers/net/ethernet/sfc/ethtool.c 
b/drivers/net/ethernet/sfc/ethtool.c
index 445ccdb..bf126f9 100644
--- a/drivers/net/ethernet/sfc/ethtool.c
+++ b/drivers/net/ethernet/sfc/ethtool.c
@@ -968,20 +968,24 @@ efx_ethtool_get_rxnfc(struct net_device *net_dev,
 
info->data = 0;
switch (info->flow_type) {
+   case UDP_V4_FLOW:
+   if (efx->rx_hash_udp_4tuple)
+   /* fall through */
case TCP_V4_FLOW:
-   info->data |= RXH_L4_B_0_1 | RXH_L4_B_2_3;
+   info->data |= RXH_L4_B_0_1 | RXH_L4_B_2_3;
/* fall through */
-   case UDP_V4_FLOW:
case SCTP_V4_FLOW:
case AH_ESP_V4_FLOW:
case IPV4_FLOW:
info->data |= RXH_IP_SRC | RXH_IP_DST;
min_revision = EFX_REV_FALCON_B0;
break;
+   case UDP_V6_FLOW:
+   if (efx->rx_hash_udp_4tuple)
+   /* fall through */
case TCP_V6_FLOW:
-   info->data |= RXH_L4_B_0_1 | RXH_L4_B_2_3;
+   info->data |= RXH_L4_B_0_1 | RXH_L4_B_2_3;
/* fall through */
-   case UDP_V6_FLOW:
case SCTP_V6_FLOW:
case AH_ESP_V6_FLOW:
case IPV6_FLOW:
diff --git a/drivers/net/ethernet/sfc/net_driver.h 
b/drivers/net/ethernet/sfc/net_driver.h
index 99d8c82..fec51c4 100644
--- a/drivers/net/ethernet/sfc/net_driver.h
+++ b/drivers/net/ethernet/sfc/net_driver.h
@@ -853,6 +853,7 @@ struct vfdi_status;
  * @rx_hash_key: Toeplitz hash key for RSS
  * @rx_indir_table: Indirection table for RSS
  * @rx_scatter: Scatter mode enabled for receives
+ * @rx_hash_udp_4tuple: UDP 4-tuple hashing enabled
  * @int_error_count: Number of internal errors seen recently
  * @int_error_expire: Time at which error count will be expired
  * @irq_soft_enabled: Are IRQs soft-enabled? If not, IRQ handler will
@@ -990,6 +991,7 @@ struct efx_nic {
u8 rx_hash_key[40];
u32 rx_indir_table[128];
bool rx_scatter;
+   bool rx_hash_udp_4tuple;
 
unsigned int_error_count;
unsigned long int_error_expire;


[PATCH net-next 1/2] sfc: enable 4-tuple RSS hashing for UDP

2016-11-03 Thread Edward Cree
This improves UDP spreading, and also slightly improves GRO performance
of encapsulated TCP on 7000 series NICs.

Signed-off-by: Edward Cree 
---
 drivers/net/ethernet/sfc/ef10.c | 82 +
 1 file changed, 82 insertions(+)

diff --git a/drivers/net/ethernet/sfc/ef10.c b/drivers/net/ethernet/sfc/ef10.c
index 00279da..9f6d769 100644
--- a/drivers/net/ethernet/sfc/ef10.c
+++ b/drivers/net/ethernet/sfc/ef10.c
@@ -2245,6 +2245,84 @@ static void efx_ef10_tx_write(struct efx_tx_queue 
*tx_queue)
}
 }
 
+#define RSS_MODE_HASH_ADDRS(1 << RSS_MODE_HASH_SRC_ADDR_LBN |\
+1 << RSS_MODE_HASH_DST_ADDR_LBN)
+#define RSS_MODE_HASH_PORTS(1 << RSS_MODE_HASH_SRC_PORT_LBN |\
+1 << RSS_MODE_HASH_DST_PORT_LBN)
+#define RSS_CONTEXT_FLAGS_DEFAULT  (1 << 
MC_CMD_RSS_CONTEXT_GET_FLAGS_OUT_TOEPLITZ_IPV4_EN_LBN |\
+1 << 
MC_CMD_RSS_CONTEXT_GET_FLAGS_OUT_TOEPLITZ_TCPV4_EN_LBN |\
+1 << 
MC_CMD_RSS_CONTEXT_GET_FLAGS_OUT_TOEPLITZ_IPV6_EN_LBN |\
+1 << 
MC_CMD_RSS_CONTEXT_GET_FLAGS_OUT_TOEPLITZ_TCPV6_EN_LBN |\
+(RSS_MODE_HASH_ADDRS | 
RSS_MODE_HASH_PORTS) << MC_CMD_RSS_CONTEXT_GET_FLAGS_OUT_TCP_IPV4_RSS_MODE_LBN 
|\
+RSS_MODE_HASH_ADDRS << 
MC_CMD_RSS_CONTEXT_GET_FLAGS_OUT_UDP_IPV4_RSS_MODE_LBN |\
+RSS_MODE_HASH_ADDRS << 
MC_CMD_RSS_CONTEXT_GET_FLAGS_OUT_OTHER_IPV4_RSS_MODE_LBN |\
+(RSS_MODE_HASH_ADDRS | 
RSS_MODE_HASH_PORTS) << MC_CMD_RSS_CONTEXT_GET_FLAGS_OUT_TCP_IPV6_RSS_MODE_LBN 
|\
+RSS_MODE_HASH_ADDRS << 
MC_CMD_RSS_CONTEXT_GET_FLAGS_OUT_UDP_IPV6_RSS_MODE_LBN |\
+RSS_MODE_HASH_ADDRS << 
MC_CMD_RSS_CONTEXT_GET_FLAGS_OUT_OTHER_IPV6_RSS_MODE_LBN)
+
+static int efx_ef10_get_rss_flags(struct efx_nic *efx, u32 context, u32 *flags)
+{
+   /* Firmware had a bug (sfc bug 61952) where it would not actually
+* fill in the flags field in the response to 
MC_CMD_RSS_CONTEXT_GET_FLAGS.
+* This meant that it would always contain whatever was previously
+* in the MCDI buffer.  Fortunately, all firmware versions with
+* this bug have the same default flags value for a newly-allocated
+* RSS context, and the only time we want to get the flags is just
+* after allocating.  Moreover, the response has a 32-bit hole
+* where the context ID would be in the request, so we can use an
+* overlength buffer in the request and pre-fill the flags field
+* with what we believe the default to be.  Thus if the firmware
+* has the bug, it will leave our pre-filled value in the flags
+* field of the response, and we will get the right answer.
+*
+* However, this does mean that this function should NOT be used if
+* the RSS context flags might not be their defaults - it is ONLY
+* reliably correct for a newly-allocated RSS context.
+*/
+   MCDI_DECLARE_BUF(inbuf, MC_CMD_RSS_CONTEXT_GET_FLAGS_OUT_LEN);
+   MCDI_DECLARE_BUF(outbuf, MC_CMD_RSS_CONTEXT_GET_FLAGS_OUT_LEN);
+   size_t outlen;
+   int rc;
+
+   /* Check we have a hole for the context ID */
+   BUILD_BUG_ON(MC_CMD_RSS_CONTEXT_GET_FLAGS_IN_LEN != 
MC_CMD_RSS_CONTEXT_GET_FLAGS_OUT_FLAGS_OFST);
+   MCDI_SET_DWORD(inbuf, RSS_CONTEXT_GET_FLAGS_IN_RSS_CONTEXT_ID, context);
+   MCDI_SET_DWORD(inbuf, RSS_CONTEXT_GET_FLAGS_OUT_FLAGS,
+  RSS_CONTEXT_FLAGS_DEFAULT);
+   rc = efx_mcdi_rpc(efx, MC_CMD_RSS_CONTEXT_GET_FLAGS, inbuf,
+ sizeof(inbuf), outbuf, sizeof(outbuf), &outlen);
+   if (rc == 0) {
+   if (outlen < MC_CMD_RSS_CONTEXT_GET_FLAGS_OUT_LEN)
+   rc = -EIO;
+   else
+   *flags = MCDI_DWORD(outbuf, 
RSS_CONTEXT_GET_FLAGS_OUT_FLAGS);
+   }
+   return rc;
+}
+
+/* Attempt to enable 4-tuple UDP hashing on the specified RSS context.
+ * If we fail, we just leave the RSS context at its default hash settings,
+ * which is safe but may slightly reduce performance.
+ * Defaults are 4-tuple for TCP and 2-tuple for UDP and other-IP, so we
+ * just need to set the UDP ports flags (for both IP versions).
+ */
+static void efx_ef10_set_rss_flags(struct efx_nic *efx, u32 context)
+{
+   MCDI_DECLARE_BUF(inbuf, MC_CMD_RSS_CONTEXT_SET_FLAGS_IN_LEN);
+   u32 flags;
+
+   BUILD_BUG_ON(MC_CMD_RSS_CONTEXT_SET_FLAGS_OUT_LEN != 0);
+
+   if (efx_ef10_get_rss_flags(efx, context, &flags) != 0)
+   return;
+   MCDI_SET_DWORD(inbuf, RSS_CONTEXT_SET_FLAGS_IN_RSS_CONTEXT_ID, context);
+   flags |= RSS_MODE_HASH_PORTS << 

[PATCH net-next 0/2] sfc: enable 4-tuple UDP RSS hashing

2016-11-03 Thread Edward Cree
EF10 based NICs have configurable RSS hash fields, and can be made to take the
ports into the hash on UDP (they already do so for TCP).  This patch series
enables this, in order to improve spreading of UDP traffic.

Edward Cree (2):
  sfc: enable 4-tuple RSS hashing for UDP
  sfc: report 4-tuple UDP hashing to ethtool, if it's enabled

 drivers/net/ethernet/sfc/ef10.c   | 84 +++
 drivers/net/ethernet/sfc/ethtool.c| 12 +++--
 drivers/net/ethernet/sfc/net_driver.h |  2 +
 3 files changed, 94 insertions(+), 4 deletions(-)



[PATCH net-next V4 4/9] liquidio CN23XX: mailbox interrupt processing

2016-11-03 Thread Raghu Vatsavayi
Adds support for mailbox interrupt processing of various
commands.

Signed-off-by: Raghu Vatsavayi 
Signed-off-by: Derek Chickles 
Signed-off-by: Satanand Burla 
Signed-off-by: Felix Manlunas 
---
 .../ethernet/cavium/liquidio/cn23xx_pf_device.c| 157 +
 drivers/net/ethernet/cavium/liquidio/lio_main.c|  12 ++
 .../net/ethernet/cavium/liquidio/octeon_device.c   |   1 +
 .../net/ethernet/cavium/liquidio/octeon_device.h   |  21 ++-
 4 files changed, 184 insertions(+), 7 deletions(-)

diff --git a/drivers/net/ethernet/cavium/liquidio/cn23xx_pf_device.c 
b/drivers/net/ethernet/cavium/liquidio/cn23xx_pf_device.c
index 4c93eac..79b8c48 100644
--- a/drivers/net/ethernet/cavium/liquidio/cn23xx_pf_device.c
+++ b/drivers/net/ethernet/cavium/liquidio/cn23xx_pf_device.c
@@ -30,6 +30,7 @@
 #include "octeon_device.h"
 #include "cn23xx_pf_device.h"
 #include "octeon_main.h"
+#include "octeon_mailbox.h"
 
 #define RESET_NOTDONE 0
 #define RESET_DONE 1
@@ -677,6 +678,118 @@ static void cn23xx_setup_oq_regs(struct octeon_device 
*oct, u32 oq_no)
}
 }
 
+static void cn23xx_pf_mbox_thread(struct work_struct *work)
+{
+   struct cavium_wk *wk = (struct cavium_wk *)work;
+   struct octeon_mbox *mbox = (struct octeon_mbox *)wk->ctxptr;
+   struct octeon_device *oct = mbox->oct_dev;
+   u64 mbox_int_val, val64;
+   u32 q_no, i;
+
+   if (oct->rev_id < OCTEON_CN23XX_REV_1_1) {
+   /*read and clear by writing 1*/
+   mbox_int_val = readq(mbox->mbox_int_reg);
+   writeq(mbox_int_val, mbox->mbox_int_reg);
+
+   for (i = 0; i < oct->sriov_info.num_vfs_alloced; i++) {
+   q_no = i * oct->sriov_info.rings_per_vf;
+
+   val64 = readq(oct->mbox[q_no]->mbox_write_reg);
+
+   if (val64 && (val64 != OCTEON_PFVFACK)) {
+   if (octeon_mbox_read(oct->mbox[q_no]))
+   octeon_mbox_process_message(
+   oct->mbox[q_no]);
+   }
+   }
+
+   schedule_delayed_work(&wk->work, msecs_to_jiffies(10));
+   } else {
+   octeon_mbox_process_message(mbox);
+   }
+}
+
+static int cn23xx_setup_pf_mbox(struct octeon_device *oct)
+{
+   struct octeon_mbox *mbox = NULL;
+   u16 mac_no = oct->pcie_port;
+   u16 pf_num = oct->pf_num;
+   u32 q_no, i;
+
+   if (!oct->sriov_info.max_vfs)
+   return 0;
+
+   for (i = 0; i < oct->sriov_info.max_vfs; i++) {
+   q_no = i * oct->sriov_info.rings_per_vf;
+
+   mbox = vmalloc(sizeof(*mbox));
+   if (!mbox)
+   goto free_mbox;
+
+   memset(mbox, 0, sizeof(struct octeon_mbox));
+
+   spin_lock_init(&mbox->lock);
+
+   mbox->oct_dev = oct;
+
+   mbox->q_no = q_no;
+
+   mbox->state = OCTEON_MBOX_STATE_IDLE;
+
+   /* PF mbox interrupt reg */
+   mbox->mbox_int_reg = (u8 *)oct->mmio[0].hw_addr +
+CN23XX_SLI_MAC_PF_MBOX_INT(mac_no, pf_num);
+
+   /* PF writes into SIG0 reg */
+   mbox->mbox_write_reg = (u8 *)oct->mmio[0].hw_addr +
+  CN23XX_SLI_PKT_PF_VF_MBOX_SIG(q_no, 0);
+
+   /* PF reads from SIG1 reg */
+   mbox->mbox_read_reg = (u8 *)oct->mmio[0].hw_addr +
+ CN23XX_SLI_PKT_PF_VF_MBOX_SIG(q_no, 1);
+
+   /*Mail Box Thread creation*/
+   INIT_DELAYED_WORK(&mbox->mbox_poll_wk.work,
+ cn23xx_pf_mbox_thread);
+   mbox->mbox_poll_wk.ctxptr = (void *)mbox;
+
+   oct->mbox[q_no] = mbox;
+
+   writeq(OCTEON_PFVFSIG, mbox->mbox_read_reg);
+   }
+
+   if (oct->rev_id < OCTEON_CN23XX_REV_1_1)
+   schedule_delayed_work(&oct->mbox[0]->mbox_poll_wk.work,
+ msecs_to_jiffies(0));
+
+   return 0;
+
+free_mbox:
+   while (i) {
+   i--;
+   vfree(oct->mbox[i]);
+   }
+
+   return 1;
+}
+
+static int cn23xx_free_pf_mbox(struct octeon_device *oct)
+{
+   u32 q_no, i;
+
+   if (!oct->sriov_info.max_vfs)
+   return 0;
+
+   for (i = 0; i < oct->sriov_info.max_vfs; i++) {
+   q_no = i * oct->sriov_info.rings_per_vf;
+   cancel_delayed_work_sync(
+   &oct->mbox[q_no]->mbox_poll_wk.work);
+   vfree(oct->mbox[q_no]);
+   }
+
+   return 0;
+}
+
 static int cn23xx_enable_io_queues(struct octeon_device *oct)
 {
u64 reg_val;
@@ -871,6 +984,29 @@ static u64 cn23xx_pf_msix_interrupt_handler(void *dev)
return 

[PATCH net-next V4 6/9] liquidio CN23XX: device states

2016-11-03 Thread Raghu Vatsavayi
Cleaned up resource leaks during destroy resources by
introducing more device states.

Signed-off-by: Raghu Vatsavayi 
Signed-off-by: Derek Chickles 
Signed-off-by: Satanand Burla 
Signed-off-by: Felix Manlunas 
---
 drivers/net/ethernet/cavium/liquidio/lio_main.c| 33 --
 .../net/ethernet/cavium/liquidio/octeon_device.c   |  6 +++-
 .../net/ethernet/cavium/liquidio/octeon_device.h   | 29 ++-
 drivers/net/ethernet/cavium/liquidio/octeon_droq.c | 13 +
 drivers/net/ethernet/cavium/liquidio/octeon_main.h |  8 --
 .../net/ethernet/cavium/liquidio/request_manager.c |  6 +++-
 6 files changed, 64 insertions(+), 31 deletions(-)

diff --git a/drivers/net/ethernet/cavium/liquidio/lio_main.c 
b/drivers/net/ethernet/cavium/liquidio/lio_main.c
index b297dae..10b7e9b 100644
--- a/drivers/net/ethernet/cavium/liquidio/lio_main.c
+++ b/drivers/net/ethernet/cavium/liquidio/lio_main.c
@@ -766,6 +766,7 @@ static void delete_glists(struct lio *lio)
}
 
kfree((void *)lio->glist);
+   kfree((void *)lio->glist_lock);
 }
 
 /**
@@ -1325,6 +1326,7 @@ static int liquidio_watchdog(void *param)
complete(&first_stage);
 
if (octeon_device_init(oct_dev)) {
+   complete(&hs->init);
liquidio_remove(pdev);
return -ENOMEM;
}
@@ -1349,7 +1351,15 @@ static int liquidio_watchdog(void *param)
oct_dev->watchdog_task = kthread_create(
liquidio_watchdog, oct_dev,
"liowd/%02hhx:%02hhx.%hhx", bus, device, function);
-   wake_up_process(oct_dev->watchdog_task);
+   if (!IS_ERR(oct_dev->watchdog_task)) {
+   wake_up_process(oct_dev->watchdog_task);
+   } else {
+   oct_dev->watchdog_task = NULL;
+   dev_err(&oct_dev->pci_dev->dev,
+   "failed to create kernel_thread\n");
+   liquidio_remove(pdev);
+   return -1;
+   }
}
}
 
@@ -1413,6 +1423,8 @@ static void octeon_destroy_resources(struct octeon_device 
*oct)
if (lio_wait_for_oq_pkts(oct))
dev_err(&oct->pci_dev->dev, "OQ had pending packets\n");
 
+   /* fallthrough */
+   case OCT_DEV_INTR_SET_DONE:
/* Disable interrupts  */
oct->fn_list.disable_interrupt(oct, OCTEON_ALL_INTR);
 
@@ -1439,6 +1451,8 @@ static void octeon_destroy_resources(struct octeon_device 
*oct)
pci_disable_msi(oct->pci_dev);
}
 
+   /* fallthrough */
+   case OCT_DEV_MSIX_ALLOC_VECTOR_DONE:
if (OCTEON_CN23XX_PF(oct))
octeon_free_ioq_vector(oct);
 
@@ -1502,10 +1516,13 @@ static void octeon_destroy_resources(struct 
octeon_device *oct)
octeon_unmap_pci_barx(oct, 1);
 
/* fallthrough */
-   case OCT_DEV_BEGIN_STATE:
+   case OCT_DEV_PCI_ENABLE_DONE:
+   pci_clear_master(oct->pci_dev);
/* Disable the device, releasing the PCI INT */
pci_disable_device(oct->pci_dev);
 
+   /* fallthrough */
+   case OCT_DEV_BEGIN_STATE:
/* Nothing to be done here either */
break;
}   /* end switch (oct->status) */
@@ -1775,6 +1792,7 @@ static int octeon_pci_os_setup(struct octeon_device *oct)
 
if (dma_set_mask_and_coherent(&oct->pci_dev->dev, DMA_BIT_MASK(64))) {
dev_err(&oct->pci_dev->dev, "Unexpected DMA device 
capability\n");
+   pci_disable_device(oct->pci_dev);
return 1;
}
 
@@ -4429,6 +4447,8 @@ static int octeon_device_init(struct octeon_device 
*octeon_dev)
if (octeon_pci_os_setup(octeon_dev))
return 1;
 
+   atomic_set(&octeon_dev->status, OCT_DEV_PCI_ENABLE_DONE);
+
/* Identify the Octeon type and map the BAR address space. */
if (octeon_chip_specific_setup(octeon_dev)) {
dev_err(&octeon_dev->pci_dev->dev, "Chip specific setup 
failed\n");
@@ -4500,9 +4520,6 @@ static int octeon_device_init(struct octeon_device 
*octeon_dev)
if (octeon_setup_instr_queues(octeon_dev)) {
dev_err(&octeon_dev->pci_dev->dev,
"instruction queue initialization failed\n");
-   /* On error, release any previously allocated queues */
-   for (j = 0; j < octeon_dev->num_iqs; j++)
-   octeon_delete_instr_queue(octeon_dev, j);
return 1;
}
atomic_set(&octeon_dev->status, OCT_DEV_INSTR_QUEUE_INIT_DONE);
@@ -4518,9 

[PATCH net-next V4 9/9] liquidio CN23XX: fix for new check patch errors

2016-11-03 Thread Raghu Vatsavayi
New checkpatch script shows some errors with pre-existing
driver. This patch provides fix for those errors.

Signed-off-by: Raghu Vatsavayi 
Signed-off-by: Derek Chickles 
Signed-off-by: Satanand Burla 
Signed-off-by: Felix Manlunas 
---
 .../net/ethernet/cavium/liquidio/cn23xx_pf_regs.h  |  12 +--
 drivers/net/ethernet/cavium/liquidio/cn66xx_regs.h |  12 +--
 .../net/ethernet/cavium/liquidio/cn68xx_device.c   |   2 +-
 drivers/net/ethernet/cavium/liquidio/lio_ethtool.c |   9 +-
 drivers/net/ethernet/cavium/liquidio/lio_main.c|  15 +--
 .../net/ethernet/cavium/liquidio/liquidio_common.h |  50 -
 .../net/ethernet/cavium/liquidio/octeon_console.c  | 113 ++---
 .../net/ethernet/cavium/liquidio/octeon_device.c   |  28 ++---
 .../net/ethernet/cavium/liquidio/octeon_device.h   |  25 +++--
 drivers/net/ethernet/cavium/liquidio/octeon_droq.c |  40 
 drivers/net/ethernet/cavium/liquidio/octeon_iq.h   |   3 +
 .../net/ethernet/cavium/liquidio/octeon_mem_ops.c  |   2 +-
 .../net/ethernet/cavium/liquidio/octeon_network.h  |   6 +-
 drivers/net/ethernet/cavium/liquidio/octeon_nic.h  |   2 +-
 .../net/ethernet/cavium/liquidio/request_manager.c |  16 ++-
 15 files changed, 155 insertions(+), 180 deletions(-)

diff --git a/drivers/net/ethernet/cavium/liquidio/cn23xx_pf_regs.h 
b/drivers/net/ethernet/cavium/liquidio/cn23xx_pf_regs.h
index 680a405..e6d4ad9 100644
--- a/drivers/net/ethernet/cavium/liquidio/cn23xx_pf_regs.h
+++ b/drivers/net/ethernet/cavium/liquidio/cn23xx_pf_regs.h
@@ -58,7 +58,7 @@
 
 #define CN23XX_CONFIG_SRIOV_BAR_START 0x19C
 #define CN23XX_CONFIG_SRIOV_BARX(i)\
-   (CN23XX_CONFIG_SRIOV_BAR_START + (i * 4))
+   (CN23XX_CONFIG_SRIOV_BAR_START + ((i) * 4))
 #define CN23XX_CONFIG_SRIOV_BAR_PF0x08
 #define CN23XX_CONFIG_SRIOV_BAR_64BIT 0x04
 #define CN23XX_CONFIG_SRIOV_BAR_IO0x01
@@ -508,7 +508,7 @@
 /* 4 Registers (64 - bit) */
 #defineCN23XX_SLI_S2M_PORT_CTL_START 0x23D80
 #defineCN23XX_SLI_S2M_PORTX_CTL(port)  \
-   (CN23XX_SLI_S2M_PORT_CTL_START + (port * 0x10))
+   (CN23XX_SLI_S2M_PORT_CTL_START + ((port) * 0x10))
 
 #defineCN23XX_SLI_MAC_NUMBER 0x20050
 
@@ -549,26 +549,26 @@
  * Provides DMA Engine Queue Enable
  */
 #defineCN23XX_DPI_DMA_ENG0_ENB0x0001df80ULL
-#defineCN23XX_DPI_DMA_ENG_ENB(eng) (CN23XX_DPI_DMA_ENG0_ENB + (eng * 8))
+#defineCN23XX_DPI_DMA_ENG_ENB(eng) (CN23XX_DPI_DMA_ENG0_ENB + ((eng) * 8))
 
 /* 8 register (64-bit) - DPI_DMA(0..7)_REQQ_CTL
  * Provides control bits for transaction on 8 Queues
  */
 #defineCN23XX_DPI_DMA_REQQ0_CTL   0x0001df000180ULL
 #defineCN23XX_DPI_DMA_REQQ_CTL(q_no)   \
-   (CN23XX_DPI_DMA_REQQ0_CTL + (q_no * 8))
+   (CN23XX_DPI_DMA_REQQ0_CTL + ((q_no) * 8))
 
 /* 6 register (64-bit) - DPI_ENG(0..5)_BUF
  * Provides DMA Engine FIFO (Queue) Size
  */
 #defineCN23XX_DPI_DMA_ENG0_BUF0x0001df000880ULL
 #defineCN23XX_DPI_DMA_ENG_BUF(eng)   \
-   (CN23XX_DPI_DMA_ENG0_BUF + (eng * 8))
+   (CN23XX_DPI_DMA_ENG0_BUF + ((eng) * 8))
 
 /* 4 Registers (64-bit) */
 #defineCN23XX_DPI_SLI_PRT_CFG_START   0x0001df000900ULL
 #defineCN23XX_DPI_SLI_PRTX_CFG(port)\
-   (CN23XX_DPI_SLI_PRT_CFG_START + (port * 0x8))
+   (CN23XX_DPI_SLI_PRT_CFG_START + ((port) * 0x8))
 
 /* Masks for DPI_DMA_CONTROL Register */
 #defineCN23XX_DPI_DMA_COMMIT_MODE BIT_ULL(58)
diff --git a/drivers/net/ethernet/cavium/liquidio/cn66xx_regs.h 
b/drivers/net/ethernet/cavium/liquidio/cn66xx_regs.h
index 23152c0..b248966 100644
--- a/drivers/net/ethernet/cavium/liquidio/cn66xx_regs.h
+++ b/drivers/net/ethernet/cavium/liquidio/cn66xx_regs.h
@@ -438,10 +438,10 @@
 #defineCN6XXX_SLI_S2M_PORT0_CTL  0x3D80
 #defineCN6XXX_SLI_S2M_PORT1_CTL  0x3D90
 #defineCN6XXX_SLI_S2M_PORTX_CTL(port)\
-   (CN6XXX_SLI_S2M_PORT0_CTL + (port * 0x10))
+   (CN6XXX_SLI_S2M_PORT0_CTL + ((port) * 0x10))
 
 #defineCN6XXX_SLI_INT_ENB64(port)\
-   (CN6XXX_SLI_INT_ENB64_PORT0 + (port * 0x10))
+   (CN6XXX_SLI_INT_ENB64_PORT0 + ((port) * 0x10))
 
 #defineCN6XXX_SLI_MAC_NUMBER 0x3E00
 
@@ -453,7 +453,7 @@
 #defineCN6XXX_PCI_BAR1_OFFSET  0x8
 
 #defineCN6XXX_BAR1_REG(idx, port) \
-   (CN6XXX_BAR1_INDEX_START + (port * CN6XXX_PEM_OFFSET) + \
+   (CN6XXX_BAR1_INDEX_START + ((port) * CN6XXX_PEM_OFFSET) + \
(CN6XXX_PCI_BAR1_OFFSET * (idx)))
 
 /* DPI #*/
@@ -471,17 +471,17 @@
 #defineCN6XXX_DPI_DMA_ENG0_ENB0x0001df80ULL
 
 

[PATCH net-next V4 2/9] liquidio CN23XX: sysfs VF config support

2016-11-03 Thread Raghu Vatsavayi
Adds sysfs based support for enabling or disabling VFs.

Signed-off-by: Raghu Vatsavayi 
Signed-off-by: Derek Chickles 
Signed-off-by: Satanand Burla 
Signed-off-by: Felix Manlunas 
---
 drivers/net/ethernet/cavium/liquidio/lio_main.c| 98 ++
 .../net/ethernet/cavium/liquidio/octeon_config.h   |  3 +
 .../net/ethernet/cavium/liquidio/octeon_device.h   |  8 ++
 3 files changed, 109 insertions(+)

diff --git a/drivers/net/ethernet/cavium/liquidio/lio_main.c 
b/drivers/net/ethernet/cavium/liquidio/lio_main.c
index 71d01a7..29f5985 100644
--- a/drivers/net/ethernet/cavium/liquidio/lio_main.c
+++ b/drivers/net/ethernet/cavium/liquidio/lio_main.c
@@ -180,6 +180,8 @@ struct octeon_device_priv {
unsigned long napi_mask;
 };
 
+static int liquidio_enable_sriov(struct pci_dev *dev, int num_vfs);
+
 static int octeon_device_init(struct octeon_device *);
 static int liquidio_stop(struct net_device *netdev);
 static void liquidio_remove(struct pci_dev *pdev);
@@ -518,6 +520,7 @@ static int liquidio_resume(struct pci_dev *pdev 
__attribute__((unused)))
.suspend= liquidio_suspend,
.resume = liquidio_resume,
 #endif
+   .sriov_configure = liquidio_enable_sriov,
 };
 
 /**
@@ -1472,6 +1475,8 @@ static void octeon_destroy_resources(struct octeon_device 
*oct)
continue;
octeon_delete_instr_queue(oct, i);
}
+   if (oct->sriov_info.sriov_enabled)
+   pci_disable_sriov(oct->pci_dev);
/* fallthrough */
case OCT_DEV_SC_BUFF_POOL_INIT_DONE:
octeon_free_sc_buffer_pool(oct);
@@ -3990,6 +3995,99 @@ static int setup_nic_devices(struct octeon_device 
*octeon_dev)
return -ENODEV;
 }
 
+static int octeon_enable_sriov(struct octeon_device *oct)
+{
+   unsigned int num_vfs_alloced = oct->sriov_info.num_vfs_alloced;
+   struct pci_dev *vfdev;
+   int err;
+   u32 u;
+
+   if (OCTEON_CN23XX_PF(oct) && num_vfs_alloced) {
+   err = pci_enable_sriov(oct->pci_dev,
+  oct->sriov_info.num_vfs_alloced);
+   if (err) {
+   dev_err(&oct->pci_dev->dev,
+   "OCTEON: Failed to enable PCI sriov: %d\n",
+   err);
+   oct->sriov_info.num_vfs_alloced = 0;
+   return err;
+   }
+   oct->sriov_info.sriov_enabled = 1;
+
+   /* init lookup table that maps DPI ring number to VF pci_dev
+* struct pointer
+*/
+   u = 0;
+   vfdev = pci_get_device(PCI_VENDOR_ID_CAVIUM,
+  OCTEON_CN23XX_VF_VID, NULL);
+   while (vfdev) {
+   if (vfdev->is_virtfn &&
+   (vfdev->physfn == oct->pci_dev)) {
+   oct->sriov_info.dpiring_to_vfpcidev_lut[u] =
+   vfdev;
+   u += oct->sriov_info.rings_per_vf;
+   }
+   vfdev = pci_get_device(PCI_VENDOR_ID_CAVIUM,
+  OCTEON_CN23XX_VF_VID, vfdev);
+   }
+   }
+
+   return num_vfs_alloced;
+}
+
+static int lio_pci_sriov_disable(struct octeon_device *oct)
+{
+   int u;
+
+   if (pci_vfs_assigned(oct->pci_dev)) {
+   dev_err(&oct->pci_dev->dev, "VFs are still assigned to VMs.\n");
+   return -EPERM;
+   }
+
+   pci_disable_sriov(oct->pci_dev);
+
+   u = 0;
+   while (u < MAX_POSSIBLE_VFS) {
+   oct->sriov_info.dpiring_to_vfpcidev_lut[u] = NULL;
+   u += oct->sriov_info.rings_per_vf;
+   }
+
+   oct->sriov_info.num_vfs_alloced = 0;
+   dev_info(&oct->pci_dev->dev, "oct->pf_num:%d disabled VFs\n",
+oct->pf_num);
+
+   return 0;
+}
+
+static int liquidio_enable_sriov(struct pci_dev *dev, int num_vfs)
+{
+   struct octeon_device *oct = pci_get_drvdata(dev);
+   int ret = 0;
+
+   if ((num_vfs == oct->sriov_info.num_vfs_alloced) &&
+   (oct->sriov_info.sriov_enabled)) {
+   dev_info(&oct->pci_dev->dev, "oct->pf_num:%d already enabled 
num_vfs:%d\n",
+oct->pf_num, num_vfs);
+   return 0;
+   }
+
+   if (!num_vfs) {
+   ret = lio_pci_sriov_disable(oct);
+   } else if (num_vfs > oct->sriov_info.max_vfs) {
+   dev_err(&oct->pci_dev->dev,
+   "OCTEON: Max allowed VFs:%d user requested:%d",
+   oct->sriov_info.max_vfs, num_vfs);
+   ret = -EPERM;
+   } else {
+   

[PATCH net-next V4 8/9] liquidio CN23XX: copyrights changes and alignment

2016-11-03 Thread Raghu Vatsavayi
Updated copyrights comments and also changed some other comments
alignments.

Signed-off-by: Raghu Vatsavayi 
Signed-off-by: Derek Chickles 
Signed-off-by: Satanand Burla 
Signed-off-by: Felix Manlunas 
---
 .../ethernet/cavium/liquidio/cn23xx_pf_device.c| 53 ++
 .../ethernet/cavium/liquidio/cn23xx_pf_device.h| 39 +++-
 .../net/ethernet/cavium/liquidio/cn23xx_pf_regs.h  | 39 +++-
 .../net/ethernet/cavium/liquidio/cn66xx_device.c   | 36 +++
 .../net/ethernet/cavium/liquidio/cn66xx_device.h   | 37 +++
 drivers/net/ethernet/cavium/liquidio/cn66xx_regs.h | 37 +++
 .../net/ethernet/cavium/liquidio/cn68xx_device.c   | 36 +++
 .../net/ethernet/cavium/liquidio/cn68xx_device.h   | 37 +++
 drivers/net/ethernet/cavium/liquidio/cn68xx_regs.h | 37 +++
 drivers/net/ethernet/cavium/liquidio/lio_core.c| 36 +++
 drivers/net/ethernet/cavium/liquidio/lio_ethtool.c | 42 -
 drivers/net/ethernet/cavium/liquidio/lio_main.c| 36 +++
 .../net/ethernet/cavium/liquidio/liquidio_common.h | 37 +++
 .../net/ethernet/cavium/liquidio/liquidio_image.h  | 36 +++
 .../net/ethernet/cavium/liquidio/octeon_config.h   | 37 +++
 .../net/ethernet/cavium/liquidio/octeon_console.c  | 43 --
 .../net/ethernet/cavium/liquidio/octeon_device.c   | 36 +++
 .../net/ethernet/cavium/liquidio/octeon_device.h   | 45 --
 drivers/net/ethernet/cavium/liquidio/octeon_droq.c | 36 +++
 drivers/net/ethernet/cavium/liquidio/octeon_droq.h | 17 +++
 drivers/net/ethernet/cavium/liquidio/octeon_iq.h   | 21 -
 drivers/net/ethernet/cavium/liquidio/octeon_main.h | 19 +++-
 .../net/ethernet/cavium/liquidio/octeon_mem_ops.c  |  5 +-
 .../net/ethernet/cavium/liquidio/octeon_mem_ops.h  |  5 +-
 .../net/ethernet/cavium/liquidio/octeon_network.h  |  5 +-
 drivers/net/ethernet/cavium/liquidio/octeon_nic.c  |  5 +-
 drivers/net/ethernet/cavium/liquidio/octeon_nic.h  |  5 +-
 .../net/ethernet/cavium/liquidio/request_manager.c |  5 +-
 .../ethernet/cavium/liquidio/response_manager.c|  5 +-
 .../ethernet/cavium/liquidio/response_manager.h|  5 +-
 30 files changed, 352 insertions(+), 480 deletions(-)

diff --git a/drivers/net/ethernet/cavium/liquidio/cn23xx_pf_device.c 
b/drivers/net/ethernet/cavium/liquidio/cn23xx_pf_device.c
index cbf9667..2644a84 100644
--- a/drivers/net/ethernet/cavium/liquidio/cn23xx_pf_device.c
+++ b/drivers/net/ethernet/cavium/liquidio/cn23xx_pf_device.c
@@ -1,27 +1,21 @@
 /**
-* Author: Cavium, Inc.
-*
-* Contact: supp...@cavium.com
-*  Please include "LiquidIO" in the subject.
-*
-* Copyright (c) 2003-2015 Cavium, Inc.
-*
-* This file is free software; you can redistribute it and/or modify
-* it under the terms of the GNU General Public License, Version 2, as
-* published by the Free Software Foundation.
-*
-* This file is distributed in the hope that it will be useful, but
-* AS-IS and WITHOUT ANY WARRANTY; without even the implied warranty
-* of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE, TITLE, or
-* NONINFRINGEMENT.  See the GNU General Public License for more
-* details.
-*
-* This file may also be available under a different license from Cavium.
-* Contact Cavium, Inc. for more information
-**/
-
+ * Author: Cavium, Inc.
+ *
+ * Contact: supp...@cavium.com
+ *  Please include "LiquidIO" in the subject.
+ *
+ * Copyright (c) 2003-2016 Cavium, Inc.
+ *
+ * This file is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, Version 2, as
+ * published by the Free Software Foundation.
+ *
+ * This file is distributed in the hope that it will be useful, but
+ * AS-IS and WITHOUT ANY WARRANTY; without even the implied warranty
+ * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE, TITLE, or
+ * NONINFRINGEMENT.  See the GNU General Public License for more details.
+ ***/
 #include 
-#include 
 #include 
 #include 
 #include "liquidio_common.h"
@@ -421,10 +415,10 @@ static int cn23xx_pf_setup_global_input_regs(struct 
octeon_device *oct)
return -1;
 
/** Set the MAC_NUM and PVF_NUM in IQ_PKT_CONTROL reg
-   * for all queues.Only PF can set these bits.
-   * bits 29:30 indicate the MAC num.
-   * bits 32:47 indicate the PVF num.
-   */
+* for all queues.Only PF can set these bits.
+* bits 29:30 indicate the MAC num.
+* bits 32:47 indicate the PVF num.
+*/
for (q_no = 0; q_no < 

[PATCH net-next V4 7/9] liquidio CN23XX: code cleanup

2016-11-03 Thread Raghu Vatsavayi
Cleaned up unnecessary comments and added some minor macros.

Signed-off-by: Raghu Vatsavayi 
Signed-off-by: Derek Chickles 
Signed-off-by: Satanand Burla 
Signed-off-by: Felix Manlunas 
---
 drivers/net/ethernet/cavium/liquidio/cn66xx_device.c   | 13 -
 drivers/net/ethernet/cavium/liquidio/cn66xx_device.h   |  4 ++--
 drivers/net/ethernet/cavium/liquidio/lio_ethtool.c | 14 --
 drivers/net/ethernet/cavium/liquidio/lio_main.c| 16 +---
 drivers/net/ethernet/cavium/liquidio/liquidio_common.h |  2 --
 drivers/net/ethernet/cavium/liquidio/octeon_device.c   |  8 
 drivers/net/ethernet/cavium/liquidio/octeon_droq.c |  2 +-
 drivers/net/ethernet/cavium/liquidio/octeon_droq.h |  1 -
 drivers/net/ethernet/cavium/liquidio/octeon_iq.h   |  1 -
 drivers/net/ethernet/cavium/liquidio/octeon_main.h | 18 --
 drivers/net/ethernet/cavium/liquidio/request_manager.c |  7 ++-
 .../net/ethernet/cavium/liquidio/response_manager.c|  6 +-
 .../net/ethernet/cavium/liquidio/response_manager.h|  1 -
 13 files changed, 23 insertions(+), 70 deletions(-)

diff --git a/drivers/net/ethernet/cavium/liquidio/cn66xx_device.c 
b/drivers/net/ethernet/cavium/liquidio/cn66xx_device.c
index e779af8..1ebc225 100644
--- a/drivers/net/ethernet/cavium/liquidio/cn66xx_device.c
+++ b/drivers/net/ethernet/cavium/liquidio/cn66xx_device.c
@@ -275,7 +275,6 @@ void lio_cn6xxx_setup_iq_regs(struct octeon_device *oct, 
u32 iq_no)
 {
struct octeon_instr_queue *iq = oct->instr_queue[iq_no];
 
-   /* Disable Packet-by-Packet mode; No Parse Mode or Skip length */
octeon_write_csr64(oct, CN6XXX_SLI_IQ_PKT_INSTR_HDR64(iq_no), 0);
 
/* Write the start of the input queue's ring and its size  */
@@ -378,7 +377,7 @@ void lio_cn6xxx_disable_io_queues(struct octeon_device *oct)
 
/* Reset the doorbell register for each Input queue. */
for (i = 0; i < MAX_OCTEON_INSTR_QUEUES(oct); i++) {
-   if (!(oct->io_qmask.iq & (1ULL << i)))
+   if (!(oct->io_qmask.iq & BIT_ULL(i)))
continue;
octeon_write_csr(oct, CN6XXX_SLI_IQ_DOORBELL(i), 0x);
d32 = octeon_read_csr(oct, CN6XXX_SLI_IQ_DOORBELL(i));
@@ -400,9 +399,8 @@ void lio_cn6xxx_disable_io_queues(struct octeon_device *oct)
;
 
/* Reset the doorbell register for each Output queue. */
-   /* for (i = 0; i < oct->num_oqs; i++) { */
for (i = 0; i < MAX_OCTEON_OUTPUT_QUEUES(oct); i++) {
-   if (!(oct->io_qmask.oq & (1ULL << i)))
+   if (!(oct->io_qmask.oq & BIT_ULL(i)))
continue;
octeon_write_csr(oct, CN6XXX_SLI_OQ_PKTS_CREDIT(i), 0x);
d32 = octeon_read_csr(oct, CN6XXX_SLI_OQ_PKTS_CREDIT(i));
@@ -537,15 +535,14 @@ static int lio_cn6xxx_process_droq_intr_regs(struct 
octeon_device *oct)
 
oct->droq_intr = 0;
 
-   /* for (oq_no = 0; oq_no < oct->num_oqs; oq_no++) { */
for (oq_no = 0; oq_no < MAX_OCTEON_OUTPUT_QUEUES(oct); oq_no++) {
-   if (!(droq_mask & (1ULL << oq_no)))
+   if (!(droq_mask & BIT_ULL(oq_no)))
continue;
 
droq = oct->droq[oq_no];
pkt_count = octeon_droq_check_hw_for_pkts(droq);
if (pkt_count) {
-   oct->droq_intr |= (1ULL << oq_no);
+   oct->droq_intr |= BIT_ULL(oq_no);
if (droq->ops.poll_mode) {
u32 value;
u32 reg;
@@ -721,8 +718,6 @@ int lio_setup_cn66xx_octeon_device(struct octeon_device 
*oct)
 int lio_validate_cn6xxx_config_info(struct octeon_device *oct,
struct octeon_config *conf6xxx)
 {
-   /* int total_instrs = 0; */
-
if (CFG_GET_IQ_MAX_Q(conf6xxx) > CN6XXX_MAX_INPUT_QUEUES) {
dev_err(&oct->pci_dev->dev, "%s: Num IQ (%d) exceeds Max 
(%d)\n",
__func__, CFG_GET_IQ_MAX_Q(conf6xxx),
diff --git a/drivers/net/ethernet/cavium/liquidio/cn66xx_device.h 
b/drivers/net/ethernet/cavium/liquidio/cn66xx_device.h
index a40a913..32fbbb2 100644
--- a/drivers/net/ethernet/cavium/liquidio/cn66xx_device.h
+++ b/drivers/net/ethernet/cavium/liquidio/cn66xx_device.h
@@ -96,8 +96,8 @@ void lio_cn6xxx_setup_reg_address(struct octeon_device *oct, 
void *chip,
  struct octeon_reg_list *reg_list);
 u32 lio_cn6xxx_coprocessor_clock(struct octeon_device *oct);
 u32 lio_cn6xxx_get_oq_ticks(struct octeon_device *oct, u32 time_intr_in_us);
-int lio_setup_cn66xx_octeon_device(struct octeon_device *);
+int lio_setup_cn66xx_octeon_device(struct octeon_device *oct);
 int 

[PATCH net-next V4 5/9] liquidio CN23XX: VF related operations

2016-11-03 Thread Raghu Vatsavayi
Adds support for VF-related operations such as MAC address, VLAN
and link changes.

Signed-off-by: Raghu Vatsavayi 
Signed-off-by: Derek Chickles 
Signed-off-by: Satanand Burla 
Signed-off-by: Felix Manlunas 
---
 .../ethernet/cavium/liquidio/cn23xx_pf_device.c|  22 +++
 .../ethernet/cavium/liquidio/cn23xx_pf_device.h|   5 +
 drivers/net/ethernet/cavium/liquidio/lio_main.c| 214 +
 .../net/ethernet/cavium/liquidio/liquidio_common.h |   5 +
 .../net/ethernet/cavium/liquidio/octeon_device.h   |   8 +
 5 files changed, 254 insertions(+)

diff --git a/drivers/net/ethernet/cavium/liquidio/cn23xx_pf_device.c 
b/drivers/net/ethernet/cavium/liquidio/cn23xx_pf_device.c
index 79b8c48..cbf9667 100644
--- a/drivers/net/ethernet/cavium/liquidio/cn23xx_pf_device.c
+++ b/drivers/net/ethernet/cavium/liquidio/cn23xx_pf_device.c
@@ -23,6 +23,7 @@
 #include 
 #include 
 #include 
+#include 
 #include "liquidio_common.h"
 #include "octeon_droq.h"
 #include "octeon_iq.h"
@@ -1410,3 +1411,24 @@ int cn23xx_fw_loaded(struct octeon_device *oct)
val = octeon_read_csr64(oct, CN23XX_SLI_SCRATCH1);
return (val >> 1) & 1ULL;
 }
+
+void cn23xx_tell_vf_its_macaddr_changed(struct octeon_device *oct, int vfidx,
+   u8 *mac)
+{
+   if (oct->sriov_info.vf_drv_loaded_mask & BIT_ULL(vfidx)) {
+   struct octeon_mbox_cmd mbox_cmd;
+
+   mbox_cmd.msg.u64 = 0;
+   mbox_cmd.msg.s.type = OCTEON_MBOX_REQUEST;
+   mbox_cmd.msg.s.resp_needed = 0;
+   mbox_cmd.msg.s.cmd = OCTEON_PF_CHANGED_VF_MACADDR;
+   mbox_cmd.msg.s.len = 1;
+   mbox_cmd.recv_len = 0;
+   mbox_cmd.recv_status = 0;
+   mbox_cmd.fn = NULL;
+   mbox_cmd.fn_arg = 0;
+   ether_addr_copy(mbox_cmd.msg.s.params, mac);
+   mbox_cmd.q_no = vfidx * oct->sriov_info.rings_per_vf;
+   octeon_mbox_write(oct, &mbox_cmd);
+   }
+}
diff --git a/drivers/net/ethernet/cavium/liquidio/cn23xx_pf_device.h 
b/drivers/net/ethernet/cavium/liquidio/cn23xx_pf_device.h
index 21b5c90..cee346a 100644
--- a/drivers/net/ethernet/cavium/liquidio/cn23xx_pf_device.h
+++ b/drivers/net/ethernet/cavium/liquidio/cn23xx_pf_device.h
@@ -29,6 +29,8 @@
 
 #include "cn23xx_pf_regs.h"
 
+#define LIO_CMD_WAIT_TM 100
+
 /* Register address and configuration for a CN23XX devices.
  * If device specific changes need to be made then add a struct to include
  * device specific fields as shown in the commented section
@@ -56,4 +58,7 @@ int validate_cn23xx_pf_config_info(struct octeon_device *oct,
 void cn23xx_dump_pf_initialized_regs(struct octeon_device *oct);
 
 int cn23xx_fw_loaded(struct octeon_device *oct);
+
+void cn23xx_tell_vf_its_macaddr_changed(struct octeon_device *oct, int vfidx,
+   u8 *mac);
 #endif
diff --git a/drivers/net/ethernet/cavium/liquidio/lio_main.c 
b/drivers/net/ethernet/cavium/liquidio/lio_main.c
index 438465b..b297dae 100644
--- a/drivers/net/ethernet/cavium/liquidio/lio_main.c
+++ b/drivers/net/ethernet/cavium/liquidio/lio_main.c
@@ -3567,6 +3567,151 @@ static void liquidio_del_vxlan_port(struct net_device 
*netdev,
OCTNET_CMD_VXLAN_PORT_DEL);
 }
 
+static int __liquidio_set_vf_mac(struct net_device *netdev, int vfidx,
+u8 *mac, bool is_admin_assigned)
+{
+   struct lio *lio = GET_LIO(netdev);
+   struct octeon_device *oct = lio->oct_dev;
+   struct octnic_ctrl_pkt nctrl;
+
+   if (!is_valid_ether_addr(mac))
+   return -EINVAL;
+
+   if (vfidx < 0 || vfidx >= oct->sriov_info.max_vfs)
+   return -EINVAL;
+
+   memset(&nctrl, 0, sizeof(struct octnic_ctrl_pkt));
+
+   nctrl.ncmd.u64 = 0;
+   nctrl.ncmd.s.cmd = OCTNET_CMD_CHANGE_MACADDR;
+   /* vfidx is 0 based, but vf_num (param1) is 1 based */
+   nctrl.ncmd.s.param1 = vfidx + 1;
+   nctrl.ncmd.s.param2 = (is_admin_assigned ? 1 : 0);
+   nctrl.ncmd.s.more = 1;
+   nctrl.iq_no = lio->linfo.txpciq[0].s.q_no;
+   nctrl.cb_fn = 0;
+   nctrl.wait_time = LIO_CMD_WAIT_TM;
+
+   nctrl.udd[0] = 0;
+   /* The MAC Address is presented in network byte order. */
+   ether_addr_copy((u8 *)&nctrl.udd[0] + 2, mac);
+
+   oct->sriov_info.vf_macaddr[vfidx] = nctrl.udd[0];
+
+   octnet_send_nic_ctrl_pkt(oct, &nctrl);
+
+   return 0;
+}
+
+static int liquidio_set_vf_mac(struct net_device *netdev, int vfidx, u8 *mac)
+{
+   struct lio *lio = GET_LIO(netdev);
+   struct octeon_device *oct = lio->oct_dev;
+   int retval;
+
+   retval = __liquidio_set_vf_mac(netdev, vfidx, mac, true);
+   if (!retval)
+   cn23xx_tell_vf_its_macaddr_changed(oct, vfidx, mac);
+
+   return retval;
+}
+

[PATCH net-next V4 3/9] liquidio CN23XX: Mailbox support

2016-11-03 Thread Raghu Vatsavayi
Adds support for mailbox communication between PF and VF.

Signed-off-by: Raghu Vatsavayi 
Signed-off-by: Derek Chickles 
Signed-off-by: Satanand Burla 
Signed-off-by: Felix Manlunas 
---
 drivers/net/ethernet/cavium/liquidio/Makefile  |   1 +
 drivers/net/ethernet/cavium/liquidio/lio_core.c|  32 +++
 .../net/ethernet/cavium/liquidio/liquidio_common.h |   6 +-
 .../net/ethernet/cavium/liquidio/octeon_device.h   |   4 +
 .../net/ethernet/cavium/liquidio/octeon_mailbox.c  | 318 +
 .../net/ethernet/cavium/liquidio/octeon_mailbox.h  | 115 
 drivers/net/ethernet/cavium/liquidio/octeon_main.h |   2 +-
 7 files changed, 475 insertions(+), 3 deletions(-)
 create mode 100644 drivers/net/ethernet/cavium/liquidio/octeon_mailbox.c
 create mode 100644 drivers/net/ethernet/cavium/liquidio/octeon_mailbox.h

diff --git a/drivers/net/ethernet/cavium/liquidio/Makefile 
b/drivers/net/ethernet/cavium/liquidio/Makefile
index 5a27b2a..14958de 100644
--- a/drivers/net/ethernet/cavium/liquidio/Makefile
+++ b/drivers/net/ethernet/cavium/liquidio/Makefile
@@ -11,6 +11,7 @@ liquidio-$(CONFIG_LIQUIDIO) += lio_ethtool.o \
cn66xx_device.o\
cn68xx_device.o\
cn23xx_pf_device.o \
+   octeon_mailbox.o   \
octeon_mem_ops.o   \
octeon_droq.o  \
octeon_nic.o
diff --git a/drivers/net/ethernet/cavium/liquidio/lio_core.c 
b/drivers/net/ethernet/cavium/liquidio/lio_core.c
index 201eddb..e6026df 100644
--- a/drivers/net/ethernet/cavium/liquidio/lio_core.c
+++ b/drivers/net/ethernet/cavium/liquidio/lio_core.c
@@ -264,3 +264,35 @@ void liquidio_link_ctrl_cmd_completion(void *nctrl_ptr)
nctrl->ncmd.s.cmd);
}
 }
+
+void octeon_pf_changed_vf_macaddr(struct octeon_device *oct, u8 *mac)
+{
+   bool macaddr_changed = false;
+   struct net_device *netdev;
+   struct lio *lio;
+
+   rtnl_lock();
+
+   netdev = oct->props[0].netdev;
+   lio = GET_LIO(netdev);
+
+   lio->linfo.macaddr_is_admin_asgnd = true;
+
+   if (!ether_addr_equal(netdev->dev_addr, mac)) {
+   macaddr_changed = true;
+   ether_addr_copy(netdev->dev_addr, mac);
+   ether_addr_copy(((u8 *)&lio->linfo.hw_addr) + 2, mac);
+   call_netdevice_notifiers(NETDEV_CHANGEADDR, netdev);
+   }
+
+   rtnl_unlock();
+
+   if (macaddr_changed)
+   dev_info(&oct->pci_dev->dev,
+"PF changed VF's MAC address to 
%02hhx:%02hhx:%02hhx:%02hhx:%02hhx:%02hhx\n",
+mac[0], mac[1], mac[2], mac[3], mac[4], mac[5]);
+
+   /* no need to notify the firmware of the macaddr change because
+* the PF did that already
+*/
+}
diff --git a/drivers/net/ethernet/cavium/liquidio/liquidio_common.h 
b/drivers/net/ethernet/cavium/liquidio/liquidio_common.h
index 0d990ac..caeff9a 100644
--- a/drivers/net/ethernet/cavium/liquidio/liquidio_common.h
+++ b/drivers/net/ethernet/cavium/liquidio/liquidio_common.h
@@ -731,13 +731,15 @@ struct oct_link_info {
 
 #ifdef __BIG_ENDIAN_BITFIELD
u64 gmxport:16;
-   u64 rsvd:32;
+   u64 macaddr_is_admin_asgnd:1;
+   u64 rsvd:31;
u64 num_txpciq:8;
u64 num_rxpciq:8;
 #else
u64 num_rxpciq:8;
u64 num_txpciq:8;
-   u64 rsvd:32;
+   u64 rsvd:31;
+   u64 macaddr_is_admin_asgnd:1;
u64 gmxport:16;
 #endif
 
diff --git a/drivers/net/ethernet/cavium/liquidio/octeon_device.h 
b/drivers/net/ethernet/cavium/liquidio/octeon_device.h
index cfd12ec..77a6eb7 100644
--- a/drivers/net/ethernet/cavium/liquidio/octeon_device.h
+++ b/drivers/net/ethernet/cavium/liquidio/octeon_device.h
@@ -492,6 +492,9 @@ struct octeon_device {
 
int msix_on;
 
+   /** Mail Box details of each octeon queue. */
+   struct octeon_mbox  *mbox[MAX_POSSIBLE_VFS];
+
/** IOq information of it's corresponding MSI-X interrupt. */
struct octeon_ioq_vector*ioq_vector;
 
@@ -511,6 +514,7 @@ struct octeon_device {
 #define  OCTEON_CN6XXX(oct)   ((oct->chip_id == OCTEON_CN66XX) || \
   (oct->chip_id == OCTEON_CN68XX))
 #define  OCTEON_CN23XX_PF(oct)(oct->chip_id == OCTEON_CN23XX_PF_VID)
+#define  OCTEON_CN23XX_VF(oct)((oct)->chip_id == OCTEON_CN23XX_VF_VID)
 #define CHIP_FIELD(oct, TYPE, field) \
(((struct octeon_ ## TYPE  *)(oct->chip))->field)
 
diff --git a/drivers/net/ethernet/cavium/liquidio/octeon_mailbox.c 
b/drivers/net/ethernet/cavium/liquidio/octeon_mailbox.c
new file mode 100644
index 000..5309384
--- /dev/null
+++ b/drivers/net/ethernet/cavium/liquidio/octeon_mailbox.c
@@ -0,0 +1,318 @@

[PATCH net-next V4 1/9] liquidio CN23XX: HW config for VF support

2016-11-03 Thread Raghu Vatsavayi
Adds support for configuring HW for creating VFs.

Signed-off-by: Raghu Vatsavayi 
Signed-off-by: Derek Chickles 
Signed-off-by: Satanand Burla 
Signed-off-by: Felix Manlunas 
---
 .../ethernet/cavium/liquidio/cn23xx_pf_device.c| 84 +-
 .../net/ethernet/cavium/liquidio/octeon_config.h   |  6 ++
 .../net/ethernet/cavium/liquidio/octeon_device.h   | 12 +++-
 3 files changed, 68 insertions(+), 34 deletions(-)

diff --git a/drivers/net/ethernet/cavium/liquidio/cn23xx_pf_device.c 
b/drivers/net/ethernet/cavium/liquidio/cn23xx_pf_device.c
index 380a641..4c93eac 100644
--- a/drivers/net/ethernet/cavium/liquidio/cn23xx_pf_device.c
+++ b/drivers/net/ethernet/cavium/liquidio/cn23xx_pf_device.c
@@ -40,11 +40,6 @@
  */
 #define CN23XX_INPUT_JABBER 64600
 
-#define LIOLUT_RING_DISTRIBUTION 9
-const int liolut_num_vfs_to_rings_per_vf[LIOLUT_RING_DISTRIBUTION] = {
-   0, 8, 4, 2, 2, 2, 1, 1, 1
-};
-
 void cn23xx_dump_pf_initialized_regs(struct octeon_device *oct)
 {
int i = 0;
@@ -309,9 +304,10 @@ u32 cn23xx_pf_get_oq_ticks(struct octeon_device *oct, u32 
time_intr_in_us)
 
 static void cn23xx_setup_global_mac_regs(struct octeon_device *oct)
 {
-   u64 reg_val;
u16 mac_no = oct->pcie_port;
u16 pf_num = oct->pf_num;
+   u64 reg_val;
+   u64 temp;
 
/* programming SRN and TRS for each MAC(0..3)  */
 
@@ -333,6 +329,14 @@ static void cn23xx_setup_global_mac_regs(struct 
octeon_device *oct)
/* setting TRS <23:16> */
reg_val = reg_val |
  (oct->sriov_info.trs << CN23XX_PKT_MAC_CTL_RINFO_TRS_BIT_POS);
+   /* setting RPVF <39:32> */
+   temp = oct->sriov_info.rings_per_vf & 0xff;
+   reg_val |= (temp << CN23XX_PKT_MAC_CTL_RINFO_RPVF_BIT_POS);
+
+   /* setting NVFS <55:48> */
+   temp = oct->sriov_info.max_vfs & 0xff;
+   reg_val |= (temp << CN23XX_PKT_MAC_CTL_RINFO_NVFS_BIT_POS);
+
/* write these settings to MAC register */
octeon_write_csr64(oct, CN23XX_SLI_PKT_MAC_RINFO64(mac_no, pf_num),
   reg_val);
@@ -399,11 +403,12 @@ static int cn23xx_reset_io_queues(struct octeon_device 
*oct)
 
 static int cn23xx_pf_setup_global_input_regs(struct octeon_device *oct)
 {
+   struct octeon_cn23xx_pf *cn23xx = (struct octeon_cn23xx_pf *)oct->chip;
+   struct octeon_instr_queue *iq;
+   u64 intr_threshold, reg_val;
u32 q_no, ern, srn;
u64 pf_num;
-   u64 intr_threshold, reg_val;
-   struct octeon_instr_queue *iq;
-   struct octeon_cn23xx_pf *cn23xx = (struct octeon_cn23xx_pf *)oct->chip;
+   u64 vf_num;
 
pf_num = oct->pf_num;
 
@@ -420,6 +425,16 @@ static int cn23xx_pf_setup_global_input_regs(struct 
octeon_device *oct)
*/
for (q_no = 0; q_no < ern; q_no++) {
reg_val = oct->pcie_port << CN23XX_PKT_INPUT_CTL_MAC_NUM_POS;
+
+   /* for VF assigned queues. */
+   if (q_no < oct->sriov_info.pf_srn) {
+   vf_num = q_no / oct->sriov_info.rings_per_vf;
+   vf_num += 1; /* VF1, VF2, */
+   } else {
+   vf_num = 0;
+   }
+
+   reg_val |= vf_num << CN23XX_PKT_INPUT_CTL_VF_NUM_POS;
reg_val |= pf_num << CN23XX_PKT_INPUT_CTL_PF_NUM_POS;
 
octeon_write_csr64(oct, CN23XX_SLI_IQ_PKT_CONTROL64(q_no),
@@ -1048,50 +1063,53 @@ static void cn23xx_setup_reg_address(struct 
octeon_device *oct)
 
 static int cn23xx_sriov_config(struct octeon_device *oct)
 {
-   u32 total_rings;
struct octeon_cn23xx_pf *cn23xx = (struct octeon_cn23xx_pf *)oct->chip;
-   /* num_vfs is already filled for us */
+   u32 max_rings, total_rings, max_vfs;
u32 pf_srn, num_pf_rings;
+   u32 max_possible_vfs;
 
cn23xx->conf =
-   (struct octeon_config *)oct_get_config_info(oct, LIO_23XX);
+   (struct octeon_config *)oct_get_config_info(oct, LIO_23XX);
switch (oct->rev_id) {
case OCTEON_CN23XX_REV_1_0:
-   total_rings = CN23XX_MAX_RINGS_PER_PF_PASS_1_0;
+   max_rings = CN23XX_MAX_RINGS_PER_PF_PASS_1_0;
+   max_possible_vfs = CN23XX_MAX_VFS_PER_PF_PASS_1_0;
break;
case OCTEON_CN23XX_REV_1_1:
-   total_rings = CN23XX_MAX_RINGS_PER_PF_PASS_1_1;
+   max_rings = CN23XX_MAX_RINGS_PER_PF_PASS_1_1;
+   max_possible_vfs = CN23XX_MAX_VFS_PER_PF_PASS_1_1;
break;
default:
-   total_rings = CN23XX_MAX_RINGS_PER_PF;
+   max_rings = CN23XX_MAX_RINGS_PER_PF;
+   max_possible_vfs = CN23XX_MAX_VFS_PER_PF;
break;
}
-   if (!oct->sriov_info.num_pf_rings) {
-   if (total_rings > num_present_cpus())
-  

[PATCH net-next V4 0/9] liquidio CN23XX VF support

2016-11-03 Thread Raghu Vatsavayi
Dave,

Following is the V4 patch series for adding VF support on
CN23XX devices. This version addressed:
1) Your concern for ordering of local variable declarations
   from longest to shortest line.
2) Removed module parameters max_vfs, num_queues_per_{p,v}f.
3) Minor changes for fixing new checkpatch script related 
   errors on pre-existing driver.

I will post the remaining VF patches soon after this patch series is
applied. Please apply patches in the following order as some of
the patches depend on earlier patches.

Thanks.

Raghu Vatsavayi (9):
  liquidio CN23XX: HW config for VF support
  liquidio CN23XX: sysfs VF config support
  liquidio CN23XX: Mailbox support
  liquidio CN23XX: mailbox interrupt processing
  liquidio CN23XX: VF related operations
  liquidio CN23XX: device states
  liquidio CN23XX: code cleanup
  liquidio CN23XX: copyrights changes and alignment
  liquidio CN23XX: fix for new check patch errors

 drivers/net/ethernet/cavium/liquidio/Makefile  |   1 +
 .../ethernet/cavium/liquidio/cn23xx_pf_device.c| 316 ---
 .../ethernet/cavium/liquidio/cn23xx_pf_device.h|  44 +--
 .../net/ethernet/cavium/liquidio/cn23xx_pf_regs.h  |  51 ++-
 .../net/ethernet/cavium/liquidio/cn66xx_device.c   |  49 +--
 .../net/ethernet/cavium/liquidio/cn66xx_device.h   |  41 +-
 drivers/net/ethernet/cavium/liquidio/cn66xx_regs.h |  49 ++-
 .../net/ethernet/cavium/liquidio/cn68xx_device.c   |  38 +-
 .../net/ethernet/cavium/liquidio/cn68xx_device.h   |  37 +-
 drivers/net/ethernet/cavium/liquidio/cn68xx_regs.h |  37 +-
 drivers/net/ethernet/cavium/liquidio/lio_core.c|  68 +++-
 drivers/net/ethernet/cavium/liquidio/lio_ethtool.c |  65 ++--
 drivers/net/ethernet/cavium/liquidio/lio_main.c| 424 ++---
 .../net/ethernet/cavium/liquidio/liquidio_common.h | 100 +++--
 .../net/ethernet/cavium/liquidio/liquidio_image.h  |  36 +-
 .../net/ethernet/cavium/liquidio/octeon_config.h   |  46 ++-
 .../net/ethernet/cavium/liquidio/octeon_console.c  | 156 
 .../net/ethernet/cavium/liquidio/octeon_device.c   |  79 ++--
 .../net/ethernet/cavium/liquidio/octeon_device.h   | 138 ---
 drivers/net/ethernet/cavium/liquidio/octeon_droq.c |  91 +++--
 drivers/net/ethernet/cavium/liquidio/octeon_droq.h |  18 +-
 drivers/net/ethernet/cavium/liquidio/octeon_iq.h   |  25 +-
 .../net/ethernet/cavium/liquidio/octeon_mailbox.c  | 318 
 .../net/ethernet/cavium/liquidio/octeon_mailbox.h  | 115 ++
 drivers/net/ethernet/cavium/liquidio/octeon_main.h |  47 +--
 .../net/ethernet/cavium/liquidio/octeon_mem_ops.c  |   7 +-
 .../net/ethernet/cavium/liquidio/octeon_mem_ops.h  |   5 +-
 .../net/ethernet/cavium/liquidio/octeon_network.h  |  11 +-
 drivers/net/ethernet/cavium/liquidio/octeon_nic.c  |   5 +-
 drivers/net/ethernet/cavium/liquidio/octeon_nic.h  |   7 +-
 .../net/ethernet/cavium/liquidio/request_manager.c |  34 +-
 .../ethernet/cavium/liquidio/response_manager.c|  11 +-
 .../ethernet/cavium/liquidio/response_manager.h|   6 +-
 33 files changed, 1677 insertions(+), 798 deletions(-)
 create mode 100644 drivers/net/ethernet/cavium/liquidio/octeon_mailbox.c
 create mode 100644 drivers/net/ethernet/cavium/liquidio/octeon_mailbox.h

-- 
1.8.3.1



Re: [PATCH net v3] ipv4: allow local fragmentation in ip_finish_output_gso()

2016-11-03 Thread Hannes Frederic Sowa
On 03.11.2016 22:05, Lance Richardson wrote:
>> From: "Shmulik Ladkani" 
>> To: "Lance Richardson" , f...@strlen.de, 
>> han...@stressinduktion.org
>> Cc: netdev@vger.kernel.org, jtl...@redhat.com
>> Sent: Thursday, November 3, 2016 4:27:51 PM
>> Subject: Re: [PATCH net v3] ipv4: allow local fragmentation in 
>> ip_finish_output_gso()
>>
>> Hi Hannes, Lance,
>>
>> On Wed,  2 Nov 2016 16:36:17 -0400 Lance Richardson 
>> wrote:
>>>  
>>> -   if (skb_iif && !(df & htons(IP_DF))) {
>>> -   /* Arrived from an ingress interface, got encapsulated, with
>>> -* fragmentation of encapulating frames allowed.
>>> -* If skb is gso, the resulting encapsulated network segments
>>> -* may exceed dst mtu.
>>> -* Allow IP Fragmentation of segments.
>>> -*/
>>> -   IPCB(skb)->flags |= IPSKB_FRAG_SEGS;
>>> -   }
>>
>> Thinking this over, I'm concerned of this change.
>>
>> Few months back, we discussed this and got to the conclusion that in the
>> "ingress,tunnel,egress" scenario, segments are allowed to be
>> fragmented if the original inner ip packet does NOT have the DF.



>>
>> See
>>   https://patchwork.ozlabs.org/patch/657132/
>>   https://patchwork.ozlabs.org/patch/661219/
>>
>> I think you expressed that those tunneled skbs already having DF set
>> should go through pmtu discovery.
>>
>> Suggested patch unconditionally calls skb_gso_validate_mtu().
>>
>> Thus we're changing behavior for "ingress,tunnel,egress" scenario of
>> the tunneled packets having DF set in the inner iph.
>>
>> WDYT?
>>
> 
> I'm still digesting the patchwork history, but it seems to me:
> 
>1) If we call skb_gso_validate_mtu() and it returns true, 
> ip_finish_output2() will
>   be called, just as before, so nothing changes.
> 
>2) If we were to avoid calling skb_gso_validate_mtu() when it would have 
> returned
>   false and call ip_finish_output2() without performing fragmentation, we
>   would transmit (or attempt to transmit) a packet that exceeds the 
> configured
>   MTU.
> 
>3) If we do call skb_gso_validate_mtu() and it returns false, 
> ip_finish_output_gso()
>   will call ip_fragment() to perform needed fragmentation. Whether 
> fragmentation
>   actually occurs at this point depends on the value of the DF flag in the
>   IP header (and perhaps skb->ignore_df, frag_max_size, etc.).
> 
> Is the issue you're pointing out about cases in which the inner IP header has 
> DF set
> but the tunnel header does not?

Correct, but we should maybe redefine the code a bit. From my
understanding we can now create an ICMP storm in case every fragment gets.

I think for net this is currently fine and we certainly don't want to
generate oversized datagrams.

I fear that the more special-case logic we add, the sooner it will
bite us again. :/

Right now, I see the problem that we might end up generating lots of error
callbacks for large gso segments. Maybe we can also just abort after
fragmenting the first frame generated an error. Florian? Or just
overoptimize and jump to the last one, which could have a different size. :)

Anyway, the best thing would be that vxlan etc. inherits the inner DF bit.

The problem to solve here is that for some time we generated oversized
packets on the wire, which is absolutely a no-go. If we now start to
break things for people with "wrong" configuration, we could also get
more complaints. Currently I think this is the only way out, unfortunately.

Any other ideas?


Re: [PATCH net v3] ipv4: allow local fragmentation in ip_finish_output_gso()

2016-11-03 Thread Lance Richardson
> From: "Shmulik Ladkani" 
> To: "Lance Richardson" , f...@strlen.de, 
> han...@stressinduktion.org
> Cc: netdev@vger.kernel.org, jtl...@redhat.com
> Sent: Thursday, November 3, 2016 4:27:51 PM
> Subject: Re: [PATCH net v3] ipv4: allow local fragmentation in 
> ip_finish_output_gso()
> 
> Hi Hannes, Lance,
> 
> On Wed,  2 Nov 2016 16:36:17 -0400 Lance Richardson 
> wrote:
> >  
> > -   if (skb_iif && !(df & htons(IP_DF))) {
> > -   /* Arrived from an ingress interface, got encapsulated, with
> > -* fragmentation of encapulating frames allowed.
> > -* If skb is gso, the resulting encapsulated network segments
> > -* may exceed dst mtu.
> > -* Allow IP Fragmentation of segments.
> > -*/
> > -   IPCB(skb)->flags |= IPSKB_FRAG_SEGS;
> > -   }
> 
> Thinking this over, I'm concerned of this change.
> 
> Few months back, we discussed this and got to the conclusion that in the
> "ingress,tunnel,egress" scenario, segments are allowed to be
> fragmented if the original inner ip packet does NOT have the DF.
> 
> See
>   https://patchwork.ozlabs.org/patch/657132/
>   https://patchwork.ozlabs.org/patch/661219/
> 
> I think you expressed that those tunneled skbs already having DF set
> should go through pmtu discovery.
> 
> Suggested patch unconditionally calls skb_gso_validate_mtu().
> 
> Thus we're changing behavior for "ingress,tunnel,egress" scenario of
> the tunneled packets having DF set in the inner iph.
> 
> WDYT?
> 

I'm still digesting the patchwork history, but it seems to me:

   1) If we call skb_gso_validate_mtu() and it returns true, 
ip_finish_output2() will
  be called, just as before, so nothing changes.

   2) If we were to avoid calling skb_gso_validate_mtu() when it would have 
returned
  false and call ip_finish_output2() without performing fragmentation, we
  would transmit (or attempt to transmit) a packet that exceeds the 
configured
  MTU.

   3) If we do call skb_gso_validate_mtu() and it returns false, 
ip_finish_output_gso()
  will call ip_fragment() to perform needed fragmentation. Whether 
fragmentation
  actually occurs at this point depends on the value of the DF flag in the
  IP header (and perhaps skb->ignore_df, frag_max_size, etc.).

Is the issue you're pointing out about cases in which the inner IP header has 
DF set
but the tunnel header does not?

Thanks,

   Lance


Re: Time to revisit LISP?

2016-11-03 Thread David Miller
From: Tom Herbert 
Date: Thu, 3 Nov 2016 13:57:59 -0700

> On Thu, Nov 3, 2016 at 1:37 PM, David Miller  wrote:
>> Userspace resolution of paths in response to data path signalling
>> simply does not scale and is fundamentally an extremely poor design
>> choice.  We're trying to move away from, rather than towards, these
>> kinds of architectures.
> 
> OVS is quite different I think. LISP is a specific resolution protocol
> of identifier to locator as opposed to being some open-ended mechanism to
> resolve some arbitrary definition of flows like OVS. Also, I don't
> think there's any specific requirement in LISP that prevents one from
> implementing the mapping protocol in the kernel, it should just be a
> simple UDP communication.
> 
> Do you see anything in the protocol itself that would be a showstopper?

I'd have to see the code and how it works.  I can't review hypothetical
implementations.


Re: [PATCH net] ipv6: dccp: add missing bind_conflict to dccp_ipv6_mapped

2016-11-03 Thread David Miller
From: Eric Dumazet 
Date: Thu, 03 Nov 2016 08:59:46 -0700

> From: Eric Dumazet 
> 
> While fuzzing kernel with syzkaller, Andrey reported a nasty crash
> in inet6_bind() caused by DCCP lacking a required method.
> 
> Fixes: ab1e0a13d7029 ("[SOCK] proto: Add hashinfo member to struct proto")
> Signed-off-by: Eric Dumazet 
> Reported-by: Andrey Konovalov 
> Tested-by: Andrey Konovalov 

Applied and queued up for -stable, thanks Eric.


Re: Time to revisit LISP?

2016-11-03 Thread Tom Herbert
On Thu, Nov 3, 2016 at 1:37 PM, David Miller  wrote:
> From: Tom Herbert 
> Date: Thu, 3 Nov 2016 12:22:52 -0700
>
>> For instance, one of his questions is:
>>
>> "What is to keep one from having to service a full Map-Request -->
>> Map-Reply cycle for every packet received?"
>>
>> This can be solved by judicious rate limiting, for instance the
>> infrastructure I implemented to rate limit ILA resolver request could
>> be applied here.
>
> All of these things work great if you have tables that are either very
> tiny or change infrequently.
>
> But once you run into anything seriously dynamic, it has the same
> problems that the routing cache had and OVS can have.
>
One way or another we are going to have to deal with this. If we want
Linux to serve as a router for mobility it is going to have to scale
for having bunches of host routes and they will be quite dynamic
because of mobility.

> And for this reason things like the flow cache are on the chopping
> block.  And frankly, I'd remove OVS from the kernel if I could.
>
Quite frankly the DOS issues with OVS were obvious just by looking at
the initial design-- it should have been done better. Yes, if you
upcall every unresolved packet to userspace you're just inviting a DOS
attack.

> Userspace resolution of paths in response to data path signalling
> simply does not scale and is fundamentally an extremely poor design
> choice.  We're trying to move away from, rather than towards, these
> kinds of architectures.

OVS is quite different I think. LISP is a specific resolution protocol
of identifier to locator as opposed to being some open-ended mechanism to
resolve some arbitrary definition of flows like OVS. Also, I don't
think there's any specific requirement in LISP that prevents one from
implementing the mapping protocol in the kernel, it should just be a
simple UDP communication.

Do you see anything in the protocol itself that would be a showstopper?

Tom


Re: [PATCH net v3] ipv4: allow local fragmentation in ip_finish_output_gso()

2016-11-03 Thread David Miller
From: Shmulik Ladkani 
Date: Thu, 3 Nov 2016 22:40:52 +0200

> On Thu, 03 Nov 2016 16:12:44 -0400 (EDT) David Miller  
> wrote:
>> Applied and queued up for -stable.
> 
> Dave, my response lagged your "Applied" by a few minutes ;)
> 
> This seems to deserve some more thought to make sure nothing got broken,
> as expressed last in https://patchwork.ozlabs.org/patch/690594/

Feel free to send a followup or a revert if necessary.


Re: [net-next PATCH 2/3] net/qdisc: IFF_NO_QUEUE drivers should use consistent TX queue len

2016-11-03 Thread Krister Johansen
On Thu, Nov 03, 2016 at 02:56:06PM +0100, Jesper Dangaard Brouer wrote:
> The flag IFF_NO_QUEUE marks virtual device drivers that don't need a
> default qdisc attached, given they will be backed by physical device,
> that already have a qdisc attached for pushback.
> 
> It is still supported to attach a qdisc to a IFF_NO_QUEUE device, as
> this can be useful for different policy reasons (e.g. bandwidth
> limiting containers).  For this to work, the tx_queue_len need to have
> a sane value, because some qdiscs inherit/copy the tx_queue_len
> (namely, pfifo, bfifo, gred, htb, plug and sfb).
> 
> Commit a813104d9233 ("IFF_NO_QUEUE: Fix for drivers not calling
> ether_setup()") caught situations where some drivers didn't initialize
> tx_queue_len.  The problem with the commit was choosing 1 as the
> fallback value.
> 
> A qdisc queue length of 1 causes more harm than good, because it
> creates hard to debug situations for userspace. It gives userspace a
> false sense of a working config after attaching a qdisc.  As low
> volume traffic (that doesn't activate the qdisc policy) works,
> like ping, while traffic that e.g. needs shaping cannot reach the
> configured policy levels, given the queue length is too small.

Thanks for fixing this.  I've run into this in the exact scenario you
describe -- bandwidth limiting containers.  I'm pretty sure my vote
doesn't count, but I'm in favor of this change.

-K


Re: [Patch net] genetlink: fix a memory leak on error path

2016-11-03 Thread David Miller
From: Cong Wang 
Date: Thu,  3 Nov 2016 09:42:35 -0700

> In __genl_register_family(), when genl_validate_assign_mc_groups()
> fails, we forget to free the memory we possibly allocate for
> family->attrbuf.
> 
> Note, some callers call genl_unregister_family() to clean up
> on error path, it doesn't work because the family is inserted
> to the global list in the nearly last step.
> 
> Cc: Jakub Kicinski 
> Cc: Johannes Berg 
> Signed-off-by: Cong Wang 

Applied, thanks Cong.


Re: [RFC] make kmemleak scan __ro_after_init section (was: Re: [PATCH 0/5] genetlink improvements)

2016-11-03 Thread Catalin Marinas
On Wed, Nov 02, 2016 at 11:47:55PM +, Jakub Kicinski wrote:
> I realized that kmemleak is not scanning the __ro_after_init section...
> Following patch solves the false positives but I wonder if it's the
> right/acceptable solution.

Thanks for putting this together. I actually hit a similar issue on
arm64 but didn't get the chance to fix it (also at LPC). With a proper
commit message, feel free to add:

Reviewed-by: Catalin Marinas 


Re: Time to revisit LISP?

2016-11-03 Thread David Miller
From: Tom Herbert 
Date: Thu, 3 Nov 2016 12:22:52 -0700

> For instance, one of his questions is:
> 
> "What is to keep one from having to service a full Map-Request -->
> Map-Reply cycle for every packet received?"
> 
> This can be solved by judicious rate limiting, for instance the
> infrastructure I implemented to rate limit ILA resolver request could
> be applied here.

All of these things work great if you have tables that are either very
tiny or change infrequently.

But once you run into anything seriously dynamic, it has the same
problems that the routing cache had and OVS can have.

And for this reason things like the flow cache are on the chopping
block.  And frankly, I'd remove OVS from the kernel if I could.

Userspace resolution of paths in response to data path signalling
simply does not scale and is fundamentally an extremely poor design
choice.  We're trying to move away from, rather than towards, these
kinds of architectures.


Re: [PATCH net v3] ipv4: allow local fragmentation in ip_finish_output_gso()

2016-11-03 Thread Shmulik Ladkani
On Thu, 03 Nov 2016 16:12:44 -0400 (EDT) David Miller  
wrote:
> Applied and queued up for -stable.

Dave, my response lagged your "Applied" by a few minutes ;)

This seems to deserve some more thought to make sure nothing got broken,
as expressed last in https://patchwork.ozlabs.org/patch/690594/

Best,
Shmulik


Re: [PATCH net v3] ipv4: allow local fragmentation in ip_finish_output_gso()

2016-11-03 Thread Shmulik Ladkani
Hi Hannes, Lance,

On Wed,  2 Nov 2016 16:36:17 -0400 Lance Richardson  wrote:
>  
> - if (skb_iif && !(df & htons(IP_DF))) {
> - /* Arrived from an ingress interface, got encapsulated, with
> -  * fragmentation of encapulating frames allowed.
> -  * If skb is gso, the resulting encapsulated network segments
> -  * may exceed dst mtu.
> -  * Allow IP Fragmentation of segments.
> -  */
> - IPCB(skb)->flags |= IPSKB_FRAG_SEGS;
> - }

Thinking this over, I'm concerned about this change.

A few months back, we discussed this and got to the conclusion that in the
"ingress,tunnel,egress" scenario, segments are allowed to be
fragmented if the original inner ip packet does NOT have the DF.

See 
  https://patchwork.ozlabs.org/patch/657132/
  https://patchwork.ozlabs.org/patch/661219/

I think you expressed that those tunneled skbs already having DF set
should go through pmtu discovery.

Suggested patch unconditionally calls skb_gso_validate_mtu().

Thus we're changing behavior for "ingress,tunnel,egress" scenario of
the tunneled packets having DF set in the inner iph.

WDYT?


Re: [patch net-next] mlxsw: pci: Fix the FW ready mask length

2016-11-03 Thread David Miller
From: Jiri Pirko 
Date: Thu,  3 Nov 2016 09:41:55 +0100

> From: Elad Raz 
> 
> The system-status register is actually 16-bit wide and not 8 bit-wide.
> 
> Fixes: 233fa44bd67ae ("mlxsw: pci: Implement reset done check")
> Signed-off-by: Elad Raz 
> Signed-off-by: Jiri Pirko 

Applied.


Re: [PATCH net-next] net/sched: cls_flower: Support matching on SCTP ports

2016-11-03 Thread David Miller
From: Simon Horman 
Date: Thu,  3 Nov 2016 13:24:21 +0100

> Support matching on SCTP ports in the same way that matching
> on TCP and UDP ports is already supported.
> 
> Example usage:
> 
> tc qdisc add dev eth0 ingress
> 
> tc filter add dev eth0 protocol ip parent : \
> flower indev eth0 ip_proto sctp dst_port 80 \
> action drop
> 
> Signed-off-by: Simon Horman 

Applied, thanks Simon.


Re: [PATCH net] ehea: fix operation state report

2016-11-03 Thread David Miller
From: "Guilherme G. Piccoli" 
Date: Thu,  3 Nov 2016 08:16:20 -0200

> Currently the ehea driver is missing a call to netif_carrier_off()
> before the interface bring-up; this is necessary in order to
> initialize the __LINK_STATE_NOCARRIER bit in the net_device state
> field. Otherwise, we observe state UNKNOWN on "ip address" command
> output.
> 
> This patch adds a call to netif_carrier_off() on ehea's net device
> open callback.
> 
> Reported-by: Xiong Zhou 
> Reference-ID: IBM bz #137702, Red Hat bz #1089134
> Signed-off-by: Guilherme G. Piccoli 

Applied.


Re: [PATCH net] netlink: netlink_diag_dump() runs without locks

2016-11-03 Thread David Miller
From: Eric Dumazet 
Date: Wed, 02 Nov 2016 20:21:20 -0700

> From: Eric Dumazet 
> 
> A recent commit removed locking from netlink_diag_dump() but forgot
> one error case.
 ...
> Fixes: ad202074320c ("netlink: Use rhashtable walk interface in diag dump")
> Signed-off-by: Eric Dumazet 
> Reported-by: Andrey Konovalov 
> Tested-by: Andrey Konovalov 

Applied.


Re: [PATCH net] ipv6: dccp: fix out of bound access in dccp_v6_err()

2016-11-03 Thread David Miller
From: Eric Dumazet 
Date: Wed, 02 Nov 2016 20:30:48 -0700

> From: Eric Dumazet 
> 
> dccp_v6_err() does not use pskb_may_pull() and might access garbage.
> 
> We only need 4 bytes at the beginning of the DCCP header, like TCP,
> so the 8 bytes pulled in icmpv6_notify() are more than enough.
> 
> Signed-off-by: Eric Dumazet 

Applied and queued up for -stable.

Thanks Eric.


Re: [PATCH net] dccp: fix out of bound access in dccp_v4_err()

2016-11-03 Thread David Miller
From: Eric Dumazet 
Date: Wed, 02 Nov 2016 19:00:40 -0700

> From: Eric Dumazet 
> 
> dccp_v4_err() does not use pskb_may_pull() and might access garbage.
> 
> We only need 4 bytes at the beginning of the DCCP header, like TCP,
> so the 8 bytes pulled in icmp_socket_deliver() are more than enough.
> 
> This patch might allow to process more ICMP messages, as some routers
> are still limiting the size of reflected bytes to 28 (RFC 792), instead
> of extended lengths (RFC 1812 4.3.2.3)
> 
> Signed-off-by: Eric Dumazet 

Applied and queued up for -stable.


Re: [PATCH net] dccp: do not release listeners too soon

2016-11-03 Thread David Miller
From: Eric Dumazet 
Date: Wed, 02 Nov 2016 17:14:41 -0700

> From: Eric Dumazet 
> 
> Andrey Konovalov reported following error while fuzzing with syzkaller :
 ...
> It turns out DCCP calls __sk_receive_skb(), and this broke when
> lookups no longer took a reference on listeners.
> 
> Fix this issue by adding a @refcounted parameter to __sk_receive_skb(),
> so that sock_put() is used only when needed.
> 
> Fixes: 3b24d854cb35 ("tcp/dccp: do not touch listener sk_refcnt under 
> synflood")
> Signed-off-by: Eric Dumazet 
> Reported-by: Andrey Konovalov 
> Tested-by: Andrey Konovalov 

Applied and queued up for -stable.


Re: [PATCH net] dccp: do not send reset to already closed sockets

2016-11-03 Thread David Miller
From: Eric Dumazet 
Date: Wed, 02 Nov 2016 18:04:24 -0700

> From: Eric Dumazet 
> 
> Andrey reported following warning while fuzzing with syzkaller
> 
> WARNING: CPU: 1 PID: 21072 at net/dccp/proto.c:83 dccp_set_state+0x229/0x290
> Kernel panic - not syncing: panic_on_warn set ...
 ...
> Fix this the same way we did for TCP in commit 565b7b2d2e63
> ("tcp: do not send reset to already closed sockets")
> 
> Signed-off-by: Eric Dumazet 
> Reported-by: Andrey Konovalov 
> Tested-by: Andrey Konovalov 

Applied and queued up for -stable.


Re: [PATCH net] tcp: fix return value for partial writes

2016-11-03 Thread David Miller
From: Eric Dumazet 
Date: Wed, 02 Nov 2016 14:41:50 -0700

> From: Eric Dumazet 
> 
> After my commit, tcp_sendmsg() might restart its loop after
> processing socket backlog.
> 
> If sk_err is set, we blindly return an error, even though we
> copied data to user space before.
> 
> We should instead return number of bytes that could be copied,
> otherwise user space might resend data and corrupt the stream.
> 
> This might happen if another thread is using recvmsg(MSG_ERRQUEUE)
> to process timestamps.
> 
> Issue was diagnosed by Soheil and Willem, big kudos to them !
> 
> Fixes: d41a69f1d390f ("tcp: make tcp_sendmsg() aware of socket backlog")
> Signed-off-by: Eric Dumazet 

Applied and queued up for -stable.


Re: [PATCH net v3] ipv4: allow local fragmentation in ip_finish_output_gso()

2016-11-03 Thread David Miller
From: Lance Richardson 
Date: Wed,  2 Nov 2016 16:36:17 -0400

> Some configurations (e.g. geneve interface with default
> MTU of 1500 over an ethernet interface with 1500 MTU) result
> in the transmission of packets that exceed the configured MTU.
> While this should be considered to be a "bad" configuration,
> it is still allowed and should not result in the sending
> of packets that exceed the configured MTU.
> 
> Fix by dropping the assumption in ip_finish_output_gso() that
> locally originated gso packets will never need fragmentation.
> Basic testing using iperf (observing CPU usage and bandwidth)
> have shown no measurable performance impact for traffic not
> requiring fragmentation.
> 
> Fixes: c7ba65d7b649 ("net: ip: push gso skb forwarding handling down the 
> stack")
> Reported-by: Jan Tluka 
> Signed-off-by: Lance Richardson 

Applied and queued up for -stable.


Re: [PATCH net-next] net: remove unused argument in checksum unnecessary conversion

2016-11-03 Thread David Miller
From: Willem de Bruijn 
Date: Wed,  2 Nov 2016 16:14:11 -0400

> From: Willem de Bruijn 
> 
> The check argument is never used. This code has not changed since
> the original introduction in d96535a17dbb ("net: Infrastructure for
> checksum unnecessary conversions"). Remove the unused argument and
> update all callers.
> 
> Signed-off-by: Willem de Bruijn 

Applied.


Re: [PATCH] net: tcp: check skb is non-NULL for exact match on lookups

2016-11-03 Thread David Miller
From: David Ahern 
Date: Wed,  2 Nov 2016 12:08:25 -0700

> Andrey reported the following error report while running the syzkaller
> fuzzer:
 ...
> MD5 has a code path that calls __inet_lookup_listener with a null skb,
> so inet{6}_exact_dif_match needs to check skb against null before pulling
> the flag.
> 
> Fixes: a04a480d4392 ("net: Require exact match for TCP socket lookups if
>dif is l3mdev")
> Reported-by: Andrey Konovalov 
> Signed-off-by: David Ahern 
> ---
> Dave: commit a04a480d4392 was queued for stable, so this needs to follow it.

Applied and queued up for -stable, thanks.


Re: [PATCH v5 0/7] add NS2 support to bgmac

2016-11-03 Thread David Miller
From: Jon Mason 
Date: Wed,  2 Nov 2016 13:08:01 -0400

> Add support for the amac found in the Broadcom Northstar2 SoC to the
> bgmac driver.  This necessitates adding support to connect to an
> externally defined phy (as described in the device tree) in the driver.
> These phy changes are in addition to the changes necessary to get NS2
> working.

This does not apply cleanly to the net-next, please respin.


Re: [PATCH net-next v6 02/10] dpaa_eth: add support for DPAA Ethernet

2016-11-03 Thread David Miller
From: Madalin Bucur 
Date: Wed, 2 Nov 2016 22:17:26 +0200

> This introduces the Freescale Data Path Acceleration Architecture
> +static inline size_t bpool_buffer_raw_size(u8 index, u8 cnt)
> +{
> + u8 i;
> + size_t res = DPAA_BP_RAW_SIZE / 2;

Always order local variable declarations from longest to shortest line,
also known as Reverse Christmas Tree Format.

Please audit your entire submission for this problem, it occurs
everywhere.

> + /* we do not want shared skbs on TX */
> + net_dev->priv_flags &= ~IFF_TX_SKB_SHARING;

Why?  By clearing this, you disallow an important fundamental way to do
performance testing, via pktgen.


> + int numstats = sizeof(struct rtnl_link_stats64) / sizeof(u64);
 ...
> + cpustats = (u64 *)_priv->stats;
> +
> + for (j = 0; j < numstats; j++)
> + netstats[j] += cpustats[j];

This is a memcpy() on well-typed datastructures which requires no
casting or special handling whatsoever, so use memcpy instead of
needlessly open coding the operation.

> +static int dpaa_change_mtu(struct net_device *net_dev, int new_mtu)
> +{
> + const int max_mtu = dpaa_get_max_mtu();
> +
> + /* Make sure we don't exceed the Ethernet controller's MAXFRM */
> + if (new_mtu < 68 || new_mtu > max_mtu) {
> + netdev_err(net_dev, "Invalid L3 mtu %d (must be between %d and 
> %d).\n",
> +new_mtu, 68, max_mtu);
> + return -EINVAL;
> + }
> + net_dev->mtu = new_mtu;
> +
> + return 0;
> +}

MTU restrictions are handled in the net-next tree via net_dev->min_mtu and
net_dev->max_mtu.  Use that and do not define this NDO operation as you do
not need it.

> +static int dpaa_set_features(struct net_device *dev, netdev_features_t 
> features)
> +{
> + /* Not much to do here for now */
> + dev->features = features;
> + return 0;
> +}

Do not define unnecessary NDO operations, let the defaults do their job.

> +static netdev_features_t dpaa_fix_features(struct net_device *dev,
> +netdev_features_t features)
> +{
> + netdev_features_t unsupported_features = 0;
> +
> + /* In theory we should never be requested to enable features that
> +  * we didn't set in netdev->features and netdev->hw_features at probe
> +  * time, but double check just to be on the safe side.
> +  */
> + unsupported_features |= NETIF_F_RXCSUM;
> +
> + features &= ~unsupported_features;
> +
> + return features;
> +}

Unless you can show that you need this, do not "guess" by implementing this
NDO operation.  You don't need it.

> +#ifdef CONFIG_FSL_DPAA_ETH_FRIENDLY_IF_NAME
> +static int dpaa_mac_hw_index_get(struct platform_device *pdev)
> +{
> + struct device *dpaa_dev;
> + struct dpaa_eth_data *eth_data;
> +
> + dpaa_dev = &pdev->dev;
> + eth_data = dpaa_dev->platform_data;
> +
> + return eth_data->mac_hw_id;
> +}
> +
> +static int dpaa_mac_fman_index_get(struct platform_device *pdev)
> +{
> + struct device *dpaa_dev;
> + struct dpaa_eth_data *eth_data;
> +
> + dpaa_dev = &pdev->dev;
> + eth_data = dpaa_dev->platform_data;
> +
> + return eth_data->fman_hw_id;
> +}
> +#endif

Do not play network device naming games like this, use the standard name
assignment done by the kernel and have userspace entities do geographic or
device type specific naming.

I want to see this code completely removed.

> +static int dpaa_set_mac_address(struct net_device *net_dev, void *addr)
> +{
> + const struct dpaa_priv  *priv;
> + int err;
> + struct mac_device *mac_dev;
> +
> + priv = netdev_priv(net_dev);
> +
> + err = eth_mac_addr(net_dev, addr);
> + if (err < 0) {
> + netif_err(priv, drv, net_dev, "eth_mac_addr() = %d\n", err);
> + return err;
> + }
> +
> + mac_dev = priv->mac_dev;
> +
> + err = mac_dev->change_addr(mac_dev->fman_mac,
> +(enet_addr_t *)net_dev->dev_addr);
> + if (err < 0) {
> + netif_err(priv, drv, net_dev, "mac_dev->change_addr() = %d\n",
> +   err);
> + return err;
> + }

You MUST NOT return an error at this point without rewinding the state change
performed by eth_mac_addr().  Otherwise device will be left in an inconsistent
state compared to what the software MAC address has recorded.

This driver is enormous, I don't have the time nor the patience to
review it further for what seems to be many fundamental errors like
the ones I have pointed out so far.

Sorry.


Re: [PATCH net-next 0/3] ip: add RECVFRAGSIZE cmsg

2016-11-03 Thread David Miller
From: Willem de Bruijn 
Date: Wed,  2 Nov 2016 11:02:15 -0400

> On IP datagrams and raw sockets, when packets arrive fragmented,
> expose the largest received fragment size through a new cmsg.
> 
> Protocols implemented on top of these sockets may use this, for
> instance, to inform peers to lower MSS on platforms that silently
> allow send calls to exceed PMTU and cause fragmentation.

Looks good, series applied, thanks Willem.


Re: [PATCH net] qede: Correctly map aggregation replacement pages

2016-11-03 Thread David Miller
From: Yuval Mintz 
Date: Wed, 2 Nov 2016 16:36:46 +0200

> Driver allocates replacement buffers before-hand to make
> sure whenever an aggregation begins there would be a replacement
> for the Rx buffers, as we can't release the buffer until
> aggregation is terminated and driver logic assumes the Rx rings
> are always full.
> 
> For every other Rx page that's being allocated [I.e., regular]
> the page is being completely mapped while for the replacement
> buffers only the first portion of the page is being mapped.
> This means that:
>   a. Once replacement buffer replenishes the regular Rx ring,
> assuming there's more than a single packet on page we'd post unmapped
> memory toward HW [assuming mapping is actually done in granularity
> smaller than page].
>   b. Unmaps are being done for the entire page, which is incorrect.
> 
> Fixes: 55482edc25f06 ("qede: Add slowpath/fastpath support and enable 
> hardware GRO")
> Signed-off-by: Yuval Mintz 

Applied.


Re: [PATCH net] tcp: fix potential memory corruption

2016-11-03 Thread David Miller
From: Eric Dumazet 
Date: Wed, 02 Nov 2016 07:53:17 -0700

> From: Eric Dumazet 
> 
> Imagine initial value of max_skb_frags is 17, and last
> skb in write queue has 15 frags.
> 
> Then max_skb_frags is lowered to 14 or smaller value.
> 
> tcp_sendmsg() will then be allowed to add additional page frags
> and eventually go past MAX_SKB_FRAGS, overflowing struct
> skb_shared_info.
> 
> Fixes: 5f74f82ea34c ("net:Add sysctl_max_skb_frags")
> Signed-off-by: Eric Dumazet 

Applied and queued up for -stable, thanks.


Re: [PATCH v3 0/2] net: stmmac: Add OXNAS DWMAC Glue

2016-11-03 Thread David Miller
From: Neil Armstrong 
Date: Wed,  2 Nov 2016 15:02:35 +0100

> This patchset adds support for the Synopsys DWMAC Gigabit Ethernet
> controller Glue layer of the Oxford Semiconductor OX820 SoC.

Series applied to net-next, thanks.


Re: [patch net-next 0/2] Fixes for raw diag sockets handling

2016-11-03 Thread David Miller
From: Cyrill Gorcunov 
Date: Wed, 02 Nov 2016 15:36:30 +0300

> Hi! Here are a few fixes for raw-diag sockets handling: missing
> sock_put call and jump for exiting from nested cycle. I made
> patches for iproute2 as well so will send them out soon.

Series applied, thanks.


Time to revisit LISP?

2016-11-03 Thread Tom Herbert
Hi Chris,

Looking at netdev archives I see that Dave's response to the LISP
patches from June 2014 was:

"Sorry, I'm not too thrilled about LISP and this patch in particular,
from several different angles.  And therefore I'm going to mark this
patch deferred and not apply it at this time."

It seems to me that he didn't close the door on ever accepting
LISP into the kernel! I am wondering if it is time to take another
look at this, I am starting to see that there is some existing
deployment of LISP.

AFAICT, most of the concerns Dave had were along the lines of the
infrastructure not the protocol. We might be able to address these
now.

For instance, one of his questions is:

"What is to keep one from having to service a full Map-Request -->
Map-Reply cycle for every packet received?"

This can be solved by judicious rate limiting, for instance the
infrastructure I implemented to rate limit ILA resolver requests could
be applied here.

Thanks,
Tom


Re: [PATCH net] cxgb4: correct device ID of T6 adapter

2016-11-03 Thread David Miller
From: Hariprasad Shenai 
Date: Wed,  2 Nov 2016 10:52:53 +0530

> Signed-off-by: Hariprasad Shenai 

Applied, thanks.


Re: [Patch net] inet: fix sleeping inside inet_wait_for_connect()

2016-11-03 Thread David Miller
From: Cong Wang 
Date: Tue,  1 Nov 2016 16:04:36 -0700

> Andrey reported this kernel warning:
 ...
> Unlike commit 26cabd31259ba43f68026ce3f62b78094124333f
> ("sched, net: Clean up sk_wait_event() vs. might_sleep()"), the
> sleeping function is called before schedule_timeout(), this is indeed
> a bug. Fix this by moving the wait logic to the new API, it is similar
> to commit ff960a731788a7408b6f66ec4fd772ff18833211
> ("netdev, sched/wait: Fix sleeping inside wait event").
> 
> Reported-by: Andrey Konovalov 
> Cc: Andrey Konovalov 
> Cc: Eric Dumazet 
> Cc: Peter Zijlstra 
> Signed-off-by: Cong Wang 

Applied, thanks.


Re: [PATCH net-next] netfilter: Update ip_route_me_harder to consider L3 domain

2016-11-03 Thread David Ahern
On 11/3/16 11:43 AM, David Ahern wrote:
> ip_route_me_harder is not considering the L3 domain and sending lookups
> to the wrong table. For example consider the following output rule:
> 
> iptables -I OUTPUT -p tcp --dport 12345 -j REJECT --reject-with tcp-reset
> 
> using perf to analyze lookups via the fib_table_lookup tracepoint shows:
> 
> vrf-test  1187 [001] 46887.295927: fib:fib_table_lookup: table 255 oif 0 iif 
> 0 src 0.0.0.0 dst 10.100.1.254 tos 0 scope 0 flags 0
> 8143922c perf_trace_fib_table_lookup ([kernel.kallsyms])
> 81493aac fib_table_lookup ([kernel.kallsyms])
> 8148dda3 __inet_dev_addr_type ([kernel.kallsyms])
> 8148ddf6 inet_addr_type ([kernel.kallsyms])
> 8149e344 ip_route_me_harder ([kernel.kallsyms])
> 
> and
> 
> vrf-test  1187 [001] 46887.295933: fib:fib_table_lookup: table 255 oif 0 iif 
> 1 src 10.100.1.254 dst 10.100.1.2 tos 0 scope 0 flags
> 8143922c perf_trace_fib_table_lookup ([kernel.kallsyms])
> 81493aac fib_table_lookup ([kernel.kallsyms])
> 814998ff fib4_rule_action ([kernel.kallsyms])
> 81437f35 fib_rules_lookup ([kernel.kallsyms])
> 81499758 __fib_lookup ([kernel.kallsyms])
> 8144f010 fib_lookup.constprop.34 ([kernel.kallsyms])
> 8144f759 __ip_route_output_key_hash ([kernel.kallsyms])
> 8144fc6a ip_route_output_flow ([kernel.kallsyms])
> 8149e39b ip_route_me_harder ([kernel.kallsyms])
> 
> Updating both lookups to pull the L3 domain from the dst currently
> attached to the skb directs both lookups to the correct table.
> 
> Signed-off-by: David Ahern 
> ---
> Pablo: from a code review it seems ip_route_me_harder is only called in
>the output path and after skb_dst is set.

Correction: the 'output path' comment is wrong, but dst appears to always be 
set for both rx and tx paths.


[PATCH net] sctp: assign assoc_id earlier in __sctp_connect

2016-11-03 Thread Marcelo Ricardo Leitner
sctp_wait_for_connect() currently already holds the asoc to keep it
alive during the sleep, in case another thread releases it. But Andrey
Konovalov and Dmitry Vyukov reported a use-after-free in such a
situation.

Problem is that __sctp_connect() doesn't get a ref on the asoc and will
do a read on the asoc after calling sctp_wait_for_connect(), but by then
another thread may have closed it and the _put on sctp_wait_for_connect
will actually release it, causing the use-after-free.

Fix is, instead of doing the read after waiting for the connect, do it
before, which avoids this issue as the socket is still locked by then.
There should be no issue in returning the asoc id in case of failure as
the application shouldn't trust that number in such situations
anyway.

This issue doesn't exist in sctp_sendmsg() path.

Reported-by: Dmitry Vyukov 
Reported-by: Andrey Konovalov 
Tested-by: Andrey Konovalov 
Signed-off-by: Marcelo Ricardo Leitner 
---
 net/sctp/socket.c | 7 +--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index 
6cdc61c21438aa9b6dbdad93e70759071a4d6789..be1d9bb98230c9d77f676949db773b2dacd801a4
 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -1214,9 +1214,12 @@ static int __sctp_connect(struct sock *sk,
 
timeo = sock_sndtimeo(sk, f_flags & O_NONBLOCK);
 
-   err = sctp_wait_for_connect(asoc, &timeo);
-   if ((err == 0 || err == -EINPROGRESS) && assoc_id)
+   if (assoc_id)
*assoc_id = asoc->assoc_id;
+   err = sctp_wait_for_connect(asoc, &timeo);
+   /* Note: the asoc may be freed after the return of
+* sctp_wait_for_connect.
+*/
 
/* Don't free association on exit. */
asoc = NULL;
-- 
2.7.4



Re: net/sctp: use-after-free in __sctp_connect

2016-11-03 Thread Andrey Konovalov
On Thu, Nov 3, 2016 at 7:35 PM, Marcelo Ricardo Leitner
 wrote:
> On Thu, Nov 03, 2016 at 07:02:47PM +0100, Andrey Konovalov wrote:
>> On Thu, Nov 3, 2016 at 6:52 PM, Marcelo Ricardo Leitner
>>  wrote:
>> > On Thu, Nov 03, 2016 at 06:11:01PM +0100, Andrey Konovalov wrote:
>> >> On Wed, Nov 2, 2016 at 11:42 PM, Andrey Konovalov  
>> >> wrote:
>> >> > On Wed, Oct 19, 2016 at 6:57 PM, Marcelo Ricardo Leitner
>> >> >  wrote:
>> >> >> On Wed, Oct 19, 2016 at 02:25:24PM +0200, Andrey Konovalov wrote:
>> >> >>> Hi,
>> >> >>>
>> >> >>> I've got the following error report while running the syzkaller 
>> >> >>> fuzzer:
>> >> >>>
>> >> >>> ==
>> >> >>> BUG: KASAN: use-after-free in __sctp_connect+0xabe/0xbf0 at addr
>> >> >>> 88006b1dc610
>> >> >>
>> >> >> Seems this is the same that Dmitry Vyukov had reported back in Jan 
>> >> >> 13th.
>> >> >> So far I couldn't identify the reason.
>> >> >> "Good" to know it's still there, thanks for reporting it.
>> >>
>> >> Hi Marcelo,
>> >>
>> >
>> > Hi
>> >
>> >> So I've looked at the code.
>> >> As far as I understand, the problem is a race condition between
>> >> setsockopt(SCTP_SOCKOPT_CONNECTX) and shutdown on an sctp socket.
>> >> setsockopt() calls sctp_wait_for_connect(), which exits the for loop
>> >> on the sk->sk_shutdown & RCV_SHUTDOWN if clause, and then frees asoc
>> >> with sctp_association_put() and returns err = 0.
>> >> Then __sctp_connect() checks that err == 0 and reads asoc->assoc_id
>> >> from the freed asoc.
>> >
>> > Suddenly this seems familiar. Your description makes sense, thanks for
>> > looking deeper into this, Andrey.
>> >
>> > This fix should do it, can you please try it? I'll post it properly
>> > if it works.
>>
>> Yes, it fixes the issue.
>>
>> Tested-by: Andrey Konovalov 
>>
>> Thanks for the fix!
>
> Ahm this other fix looks better: do the read before calling
> sctp_wait_for_connect() as that id won't change in this call and the
> application shouldn't trust this number if an error is returned, so
> there should be no issues by returning it in such situation.
>
> Can you please confirm this one also works? Thanks!

Sure!

This one also works.

Tested-by: Andrey Konovalov 

>
> ---8<---
>
> diff --git a/net/sctp/socket.c b/net/sctp/socket.c
> index 6cdc61c21438..be1d9bb98230 100644
> --- a/net/sctp/socket.c
> +++ b/net/sctp/socket.c
> @@ -1214,9 +1214,12 @@ static int __sctp_connect(struct sock *sk,
>
> timeo = sock_sndtimeo(sk, f_flags & O_NONBLOCK);
>
> -   err = sctp_wait_for_connect(asoc, );
> -   if ((err == 0 || err == -EINPROGRESS) && assoc_id)
> +   if (assoc_id)
> *assoc_id = asoc->assoc_id;
> +   err = sctp_wait_for_connect(asoc, );
> +   /* Note: the asoc may be freed after the return of
> +* sctp_wait_for_connect.
> +*/
>
> /* Don't free association on exit. */
> asoc = NULL;


Re: net/sctp: use-after-free in __sctp_connect

2016-11-03 Thread Marcelo Ricardo Leitner
On Thu, Nov 03, 2016 at 07:02:47PM +0100, Andrey Konovalov wrote:
> On Thu, Nov 3, 2016 at 6:52 PM, Marcelo Ricardo Leitner
>  wrote:
> > On Thu, Nov 03, 2016 at 06:11:01PM +0100, Andrey Konovalov wrote:
> >> On Wed, Nov 2, 2016 at 11:42 PM, Andrey Konovalov  
> >> wrote:
> >> > On Wed, Oct 19, 2016 at 6:57 PM, Marcelo Ricardo Leitner
> >> >  wrote:
> >> >> On Wed, Oct 19, 2016 at 02:25:24PM +0200, Andrey Konovalov wrote:
> >> >>> Hi,
> >> >>>
> >> >>> I've got the following error report while running the syzkaller fuzzer:
> >> >>>
> >> >>> ==
> >> >>> BUG: KASAN: use-after-free in __sctp_connect+0xabe/0xbf0 at addr
> >> >>> 88006b1dc610
> >> >>
> >> >> Seems this is the same that Dmitry Vyukov had reported back in Jan 13th.
> >> >> So far I couldn't identify the reason.
> >> >> "Good" to know it's still there, thanks for reporting it.
> >>
> >> Hi Marcelo,
> >>
> >
> > Hi
> >
> >> So I've looked at the code.
> >> As far as I understand, the problem is a race condition between
> >> setsockopt(SCTP_SOCKOPT_CONNECTX) and shutdown on an sctp socket.
> >> setsockopt() calls sctp_wait_for_connect(), which exits the for loop
> >> on the sk->sk_shutdown & RCV_SHUTDOWN if clause, and then frees asoc
> >> with sctp_association_put() and returns err = 0.
> >> Then __sctp_connect() checks that err == 0 and reads asoc->assoc_id
> >> from the freed asoc.
> >
> > Suddenly this seems familiar. Your description makes sense, thanks for
> > looking deeper into this, Andrey.
> >
> > This fix should do it, can you please try it? I'll post it properly
> > if it works.
> 
> Yes, it fixes the issue.
> 
> Tested-by: Andrey Konovalov 
> 
> Thanks for the fix!

Ahm this other fix looks better: do the read before calling
sctp_wait_for_connect() as that id won't change in this call and the
application shouldn't trust this number if an error is returned, so
there should be no issues by returning it in such situation.

Can you please confirm this one also works? Thanks!

---8<---

diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index 6cdc61c21438..be1d9bb98230 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -1214,9 +1214,12 @@ static int __sctp_connect(struct sock *sk,
 
timeo = sock_sndtimeo(sk, f_flags & O_NONBLOCK);
 
-   err = sctp_wait_for_connect(asoc, );
-   if ((err == 0 || err == -EINPROGRESS) && assoc_id)
+   if (assoc_id)
*assoc_id = asoc->assoc_id;
+   err = sctp_wait_for_connect(asoc, );
+   /* Note: the asoc may be freed after the return of
+* sctp_wait_for_connect.
+*/
 
/* Don't free association on exit. */
asoc = NULL;


[PATCH net-next v1 08/10] amd-xgbe: Support for 64-bit management counter registers

2016-11-03 Thread Tom Lendacky
Add support for reading all management counter registers as 64-bit
values.  The indication of whether to read the high 32-bits to form
a 64-bit value is indicated in the version data.

Signed-off-by: Tom Lendacky 
---
 drivers/net/ethernet/amd/xgbe/xgbe-dev.c |   36 ++
 drivers/net/ethernet/amd/xgbe/xgbe.h |1 +
 2 files changed, 27 insertions(+), 10 deletions(-)

diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-dev.c 
b/drivers/net/ethernet/amd/xgbe/xgbe-dev.c
index b8a04e7..fbd60ee 100644
--- a/drivers/net/ethernet/amd/xgbe/xgbe-dev.c
+++ b/drivers/net/ethernet/amd/xgbe/xgbe-dev.c
@@ -2588,17 +2588,33 @@ static u64 xgbe_mmc_read(struct xgbe_prv_data *pdata, 
unsigned int reg_lo)
bool read_hi;
u64 val;
 
-   switch (reg_lo) {
-   /* These registers are always 64 bit */
-   case MMC_TXOCTETCOUNT_GB_LO:
-   case MMC_TXOCTETCOUNT_G_LO:
-   case MMC_RXOCTETCOUNT_GB_LO:
-   case MMC_RXOCTETCOUNT_G_LO:
-   read_hi = true;
-   break;
+   if (pdata->vdata->mmc_64bit) {
+   switch (reg_lo) {
+   /* These registers are always 32 bit */
+   case MMC_RXRUNTERROR:
+   case MMC_RXJABBERERROR:
+   case MMC_RXUNDERSIZE_G:
+   case MMC_RXOVERSIZE_G:
+   case MMC_RXWATCHDOGERROR:
+   read_hi = false;
+   break;
 
-   default:
-   read_hi = false;
+   default:
+   read_hi = true;
+   }
+   } else {
+   switch (reg_lo) {
+   /* These registers are always 64 bit */
+   case MMC_TXOCTETCOUNT_GB_LO:
+   case MMC_TXOCTETCOUNT_G_LO:
+   case MMC_RXOCTETCOUNT_GB_LO:
+   case MMC_RXOCTETCOUNT_G_LO:
+   read_hi = true;
+   break;
+
+   default:
+   read_hi = false;
+   }
}
 
val = XGMAC_IOREAD(pdata, reg_lo);
diff --git a/drivers/net/ethernet/amd/xgbe/xgbe.h 
b/drivers/net/ethernet/amd/xgbe/xgbe.h
index 160b498..7cbf91b 100644
--- a/drivers/net/ethernet/amd/xgbe/xgbe.h
+++ b/drivers/net/ethernet/amd/xgbe/xgbe.h
@@ -804,6 +804,7 @@ struct xgbe_hw_features {
 struct xgbe_version_data {
void (*init_function_ptrs_phy_impl)(struct xgbe_phy_if *);
enum xgbe_xpcs_access xpcs_access;
+   unsigned int mmc_64bit;
 };
 
 struct xgbe_prv_data {



[PATCH net-next v1 05/10] amd-xgbe: Prepare for introduction of clause 37 autoneg

2016-11-03 Thread Tom Lendacky
Prepare for the future introduction of clause 37 auto-negotiation by
updating the current auto-negotiation related functions to identify
them as clause 73 functions. Move interrupt enablement to the
enable/disable auto-negotiation functions. Update what will be common
routines to check for the current type of AN and process accordingly.

Signed-off-by: Tom Lendacky 
---
 drivers/net/ethernet/amd/xgbe/xgbe-common.h |5 +
 drivers/net/ethernet/amd/xgbe/xgbe-mdio.c   |  235 ++-
 drivers/net/ethernet/amd/xgbe/xgbe-phy-v1.c |7 +
 drivers/net/ethernet/amd/xgbe/xgbe.h|   14 +-
 4 files changed, 176 insertions(+), 85 deletions(-)

diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-common.h 
b/drivers/net/ethernet/amd/xgbe/xgbe-common.h
index bbef959..695e982 100644
--- a/drivers/net/ethernet/amd/xgbe/xgbe-common.h
+++ b/drivers/net/ethernet/amd/xgbe/xgbe-common.h
@@ -1052,6 +1052,11 @@
 #endif
 
 /* MDIO mask values */
+#define XGBE_AN_CL73_INT_CMPLT BIT(0)
+#define XGBE_AN_CL73_INC_LINK  BIT(1)
+#define XGBE_AN_CL73_PG_RCV    BIT(2)
+#define XGBE_AN_CL73_INT_MASK  0x07
+
 #define XGBE_XNP_MCF_NULL_MESSAGE  0x001
 #define XGBE_XNP_ACK_PROCESSED BIT(12)
 #define XGBE_XNP_MP_FORMATTED  BIT(13)
diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-mdio.c 
b/drivers/net/ethernet/amd/xgbe/xgbe-mdio.c
index 35c302f..d5bfbe4 100644
--- a/drivers/net/ethernet/amd/xgbe/xgbe-mdio.c
+++ b/drivers/net/ethernet/amd/xgbe/xgbe-mdio.c
@@ -125,7 +125,33 @@
 #include "xgbe.h"
 #include "xgbe-common.h"
 
-static void xgbe_an_enable_kr_training(struct xgbe_prv_data *pdata)
+static void xgbe_an73_clear_interrupts(struct xgbe_prv_data *pdata)
+{
+   XMDIO_WRITE(pdata, MDIO_MMD_AN, MDIO_AN_INT, 0);
+}
+
+static void xgbe_an73_disable_interrupts(struct xgbe_prv_data *pdata)
+{
+   XMDIO_WRITE(pdata, MDIO_MMD_AN, MDIO_AN_INTMASK, 0);
+}
+
+static void xgbe_an73_enable_interrupts(struct xgbe_prv_data *pdata)
+{
+   XMDIO_WRITE(pdata, MDIO_MMD_AN, MDIO_AN_INTMASK, XGBE_AN_CL73_INT_MASK);
+}
+
+static void xgbe_an_enable_interrupts(struct xgbe_prv_data *pdata)
+{
+   switch (pdata->an_mode) {
+   case XGBE_AN_MODE_CL73:
+   xgbe_an73_enable_interrupts(pdata);
+   break;
+   default:
+   break;
+   }
+}
+
+static void xgbe_an73_enable_kr_training(struct xgbe_prv_data *pdata)
 {
unsigned int reg;
 
@@ -135,7 +161,7 @@ static void xgbe_an_enable_kr_training(struct xgbe_prv_data 
*pdata)
XMDIO_WRITE(pdata, MDIO_MMD_PMAPMD, MDIO_PMA_10GBR_PMD_CTRL, reg);
 }
 
-static void xgbe_an_disable_kr_training(struct xgbe_prv_data *pdata)
+static void xgbe_an73_disable_kr_training(struct xgbe_prv_data *pdata)
 {
unsigned int reg;
 
@@ -148,7 +174,7 @@ static void xgbe_an_disable_kr_training(struct 
xgbe_prv_data *pdata)
 static void xgbe_kr_mode(struct xgbe_prv_data *pdata)
 {
/* Enable KR training */
-   xgbe_an_enable_kr_training(pdata);
+   xgbe_an73_enable_kr_training(pdata);
 
/* Set MAC to 10G speed */
pdata->hw_if.set_speed(pdata, SPEED_10000);
@@ -160,7 +186,7 @@ static void xgbe_kr_mode(struct xgbe_prv_data *pdata)
 static void xgbe_kx_2500_mode(struct xgbe_prv_data *pdata)
 {
/* Disable KR training */
-   xgbe_an_disable_kr_training(pdata);
+   xgbe_an73_disable_kr_training(pdata);
 
/* Set MAC to 2.5G speed */
pdata->hw_if.set_speed(pdata, SPEED_2500);
@@ -172,7 +198,7 @@ static void xgbe_kx_2500_mode(struct xgbe_prv_data *pdata)
 static void xgbe_kx_1000_mode(struct xgbe_prv_data *pdata)
 {
/* Disable KR training */
-   xgbe_an_disable_kr_training(pdata);
+   xgbe_an73_disable_kr_training(pdata);
 
/* Set MAC to 1G speed */
pdata->hw_if.set_speed(pdata, SPEED_1000);
@@ -232,7 +258,8 @@ static bool xgbe_use_mode(struct xgbe_prv_data *pdata,
return pdata->phy_if.phy_impl.use_mode(pdata, mode);
 }
 
-static void xgbe_set_an(struct xgbe_prv_data *pdata, bool enable, bool restart)
+static void xgbe_an73_set(struct xgbe_prv_data *pdata, bool enable,
+ bool restart)
 {
unsigned int reg;
 
@@ -248,22 +275,46 @@ static void xgbe_set_an(struct xgbe_prv_data *pdata, bool 
enable, bool restart)
XMDIO_WRITE(pdata, MDIO_MMD_AN, MDIO_CTRL1, reg);
 }
 
-static void xgbe_restart_an(struct xgbe_prv_data *pdata)
+static void xgbe_an73_restart(struct xgbe_prv_data *pdata)
 {
-   xgbe_set_an(pdata, true, true);
+   xgbe_an73_enable_interrupts(pdata);
+   xgbe_an73_set(pdata, true, true);
 
-   netif_dbg(pdata, link, pdata->netdev, "AN enabled/restarted\n");
+   netif_dbg(pdata, link, pdata->netdev, "CL73 AN enabled/restarted\n");
 }
 
-static void xgbe_disable_an(struct xgbe_prv_data *pdata)
+static void xgbe_an73_disable(struct xgbe_prv_data *pdata)
 {
-   xgbe_set_an(pdata, false, false);
+   

[PATCH net-next v1 10/10] amd-xgbe: Prepare for supporting PCI devices

2016-11-03 Thread Tom Lendacky
Update the driver framework to separate out platform/ACPI specific code
from general code during device initialization. This will allow for the
introduction of PCI device support.

Signed-off-by: Tom Lendacky 
---
 drivers/net/ethernet/amd/xgbe/Makefile|3 
 drivers/net/ethernet/amd/xgbe/xgbe-dev.c  |   16 -
 drivers/net/ethernet/amd/xgbe/xgbe-drv.c  |   22 -
 drivers/net/ethernet/amd/xgbe/xgbe-main.c |  569 ---
 drivers/net/ethernet/amd/xgbe/xgbe-platform.c |  632 +
 drivers/net/ethernet/amd/xgbe/xgbe.h  |   23 +
 6 files changed, 755 insertions(+), 510 deletions(-)
 create mode 100644 drivers/net/ethernet/amd/xgbe/xgbe-platform.c

diff --git a/drivers/net/ethernet/amd/xgbe/Makefile 
b/drivers/net/ethernet/amd/xgbe/Makefile
index 60b4ae2..217d59e 100644
--- a/drivers/net/ethernet/amd/xgbe/Makefile
+++ b/drivers/net/ethernet/amd/xgbe/Makefile
@@ -3,7 +3,8 @@ obj-$(CONFIG_AMD_XGBE) += amd-xgbe.o
 amd-xgbe-objs := xgbe-main.o xgbe-drv.o xgbe-dev.o \
 xgbe-desc.o xgbe-ethtool.o xgbe-mdio.o \
 xgbe-ptp.o \
-xgbe-phy-v1.o
+xgbe-phy-v1.o \
+xgbe-platform.o
 
 amd-xgbe-$(CONFIG_AMD_XGBE_DCB) += xgbe-dcb.o
 amd-xgbe-$(CONFIG_DEBUG_FS) += xgbe-debugfs.o
diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-dev.c 
b/drivers/net/ethernet/amd/xgbe/xgbe-dev.c
index 0a7ab63..9037319 100644
--- a/drivers/net/ethernet/amd/xgbe/xgbe-dev.c
+++ b/drivers/net/ethernet/amd/xgbe/xgbe-dev.c
@@ -2088,24 +2088,16 @@ static void xgbe_config_flow_control_threshold(struct 
xgbe_prv_data *pdata)
 
 static unsigned int xgbe_get_tx_fifo_size(struct xgbe_prv_data *pdata)
 {
-   unsigned int fifo_size;
-
-   /* Calculate the configured fifo size */
-   fifo_size = 1 << (pdata->hw_feat.tx_fifo_size + 7);
-
/* The configured value may not be the actual amount of fifo RAM */
-   return min_t(unsigned int, XGMAC_FIFO_TX_MAX, fifo_size);
+   return min_t(unsigned int, pdata->tx_max_fifo_size,
+pdata->hw_feat.tx_fifo_size);
 }
 
 static unsigned int xgbe_get_rx_fifo_size(struct xgbe_prv_data *pdata)
 {
-   unsigned int fifo_size;
-
-   /* Calculate the configured fifo size */
-   fifo_size = 1 << (pdata->hw_feat.rx_fifo_size + 7);
-
/* The configured value may not be the actual amount of fifo RAM */
-   return min_t(unsigned int, XGMAC_FIFO_RX_MAX, fifo_size);
+   return min_t(unsigned int, pdata->rx_max_fifo_size,
+pdata->hw_feat.rx_fifo_size);
 }
 
 static void xgbe_calculate_equal_fifo(unsigned int fifo_size,
diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-drv.c 
b/drivers/net/ethernet/amd/xgbe/xgbe-drv.c
index dd166a0..a43e9303 100644
--- a/drivers/net/ethernet/amd/xgbe/xgbe-drv.c
+++ b/drivers/net/ethernet/amd/xgbe/xgbe-drv.c
@@ -114,7 +114,6 @@
  * THE POSSIBILITY OF SUCH DAMAGE.
  */
 
-#include 
 #include 
 #include 
 #include 
@@ -160,18 +159,8 @@ static int xgbe_alloc_channels(struct xgbe_prv_data *pdata)
channel->dma_regs = pdata->xgmac_regs + DMA_CH_BASE +
(DMA_CH_INC * i);
 
-   if (pdata->per_channel_irq) {
-   /* Get the DMA interrupt (offset 1) */
-   ret = platform_get_irq(pdata->pdev, i + 1);
-   if (ret < 0) {
-   netdev_err(pdata->netdev,
-  "platform_get_irq %u failed\n",
-  i + 1);
-   goto err_irq;
-   }
-
-   channel->dma_irq = ret;
-   }
+   if (pdata->per_channel_irq)
+   channel->dma_irq = pdata->channel_irq[i];
 
if (i < pdata->tx_ring_count) {
spin_lock_init(&tx_ring->lock);
@@ -194,9 +183,6 @@ static int xgbe_alloc_channels(struct xgbe_prv_data *pdata)
 
return 0;
 
-err_irq:
-   kfree(rx_ring);
-
 err_rx_ring:
kfree(tx_ring);
 
@@ -590,6 +576,10 @@ void xgbe_get_all_hw_features(struct xgbe_prv_data *pdata)
hw_feat->tx_ch_cnt++;
hw_feat->tc_cnt++;
 
+   /* Translate the fifo sizes into actual numbers */
+   hw_feat->rx_fifo_size = 1 << (hw_feat->rx_fifo_size + 7);
+   hw_feat->tx_fifo_size = 1 << (hw_feat->tx_fifo_size + 7);
+
DBGPR("<--xgbe_get_all_hw_features\n");
 }
 
diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-main.c 
b/drivers/net/ethernet/amd/xgbe/xgbe-main.c
index d9864f0..c7187fc 100644
--- a/drivers/net/ethernet/amd/xgbe/xgbe-main.c
+++ b/drivers/net/ethernet/amd/xgbe/xgbe-main.c
@@ -116,20 +116,10 @@
 
 #include 
 #include 
-#include 
 #include 
 #include 
 #include 
 #include 
-#include 
-#include 
-#include 
-#include 
-#include 
-#include 
-#include 
-#include 
-#include 
 
 #include "xgbe.h"
 

  1   2   3   >