[PATCH v2 net-next] net: phy: improve handling delayed work

2018-09-27 Thread Heiner Kallweit
Using mod_delayed_work() simplifies the handling of delayed work and
removes the need for the sync parameter in phy_trigger_machine().
Also introduce a helper phy_queue_state_machine() to encapsulate the
low-level delayed work calls. No functional change intended.

Signed-off-by: Heiner Kallweit 
---
v2:
- removed inline annotation from phy_queue_state_machine()
---
 drivers/net/phy/phy.c | 29 +++--
 include/linux/phy.h   |  2 +-
 2 files changed, 16 insertions(+), 15 deletions(-)

diff --git a/drivers/net/phy/phy.c b/drivers/net/phy/phy.c
index a1f8e4816..14509a890 100644
--- a/drivers/net/phy/phy.c
+++ b/drivers/net/phy/phy.c
@@ -537,7 +537,7 @@ static int phy_start_aneg_priv(struct phy_device *phydev, 
bool sync)
mutex_unlock(&phydev->lock);
 
if (trigger)
-   phy_trigger_machine(phydev, sync);
+   phy_trigger_machine(phydev);
 
return err;
 }
@@ -635,6 +635,13 @@ int phy_speed_up(struct phy_device *phydev)
 }
 EXPORT_SYMBOL_GPL(phy_speed_up);
 
+static void phy_queue_state_machine(struct phy_device *phydev,
+   unsigned int secs)
+{
+   mod_delayed_work(system_power_efficient_wq, &phydev->state_queue,
+secs * HZ);
+}
+
 /**
  * phy_start_machine - start PHY state machine tracking
  * @phydev: the phy_device struct
@@ -647,7 +654,7 @@ EXPORT_SYMBOL_GPL(phy_speed_up);
  */
 void phy_start_machine(struct phy_device *phydev)
 {
-   queue_delayed_work(system_power_efficient_wq, &phydev->state_queue, HZ);
+   phy_queue_state_machine(phydev, 1);
 }
 EXPORT_SYMBOL_GPL(phy_start_machine);
 
@@ -655,19 +662,14 @@ EXPORT_SYMBOL_GPL(phy_start_machine);
  * phy_trigger_machine - trigger the state machine to run
  *
  * @phydev: the phy_device struct
- * @sync: indicate whether we should wait for the workqueue cancelation
  *
  * Description: There has been a change in state which requires that the
  *   state machine runs.
  */
 
-void phy_trigger_machine(struct phy_device *phydev, bool sync)
+void phy_trigger_machine(struct phy_device *phydev)
 {
-   if (sync)
-   cancel_delayed_work_sync(&phydev->state_queue);
-   else
-   cancel_delayed_work(&phydev->state_queue);
-   queue_delayed_work(system_power_efficient_wq, &phydev->state_queue, 0);
+   phy_queue_state_machine(phydev, 0);
 }
 
 /**
@@ -703,7 +705,7 @@ static void phy_error(struct phy_device *phydev)
phydev->state = PHY_HALTED;
mutex_unlock(&phydev->lock);
 
-   phy_trigger_machine(phydev, false);
+   phy_trigger_machine(phydev);
 }
 
 /**
@@ -745,7 +747,7 @@ static irqreturn_t phy_change(struct phy_device *phydev)
mutex_unlock(&phydev->lock);
 
/* reschedule state queue work to run as soon as possible */
-   phy_trigger_machine(phydev, true);
+   phy_trigger_machine(phydev);
 
if (phy_interrupt_is_valid(phydev) && phy_clear_interrupt(phydev))
goto phy_err;
@@ -911,7 +913,7 @@ void phy_start(struct phy_device *phydev)
}
mutex_unlock(&phydev->lock);
 
-   phy_trigger_machine(phydev, true);
+   phy_trigger_machine(phydev);
 }
 EXPORT_SYMBOL(phy_start);
 
@@ -1130,8 +1132,7 @@ void phy_state_machine(struct work_struct *work)
 * called from phy_disconnect() synchronously.
 */
if (phy_polling_mode(phydev) && old_state != PHY_HALTED)
-   queue_delayed_work(system_power_efficient_wq, 
&phydev->state_queue,
-  PHY_STATE_TIME * HZ);
+   phy_queue_state_machine(phydev, PHY_STATE_TIME);
 }
 
 /**
diff --git a/include/linux/phy.h b/include/linux/phy.h
index 192a1fa0c..15bd074ef 100644
--- a/include/linux/phy.h
+++ b/include/linux/phy.h
@@ -1039,7 +1039,7 @@ void phy_change_work(struct work_struct *work);
 void phy_mac_interrupt(struct phy_device *phydev);
 void phy_start_machine(struct phy_device *phydev);
 void phy_stop_machine(struct phy_device *phydev);
-void phy_trigger_machine(struct phy_device *phydev, bool sync);
+void phy_trigger_machine(struct phy_device *phydev);
 int phy_ethtool_sset(struct phy_device *phydev, struct ethtool_cmd *cmd);
 void phy_ethtool_ksettings_get(struct phy_device *phydev,
   struct ethtool_link_ksettings *cmd);
-- 
2.19.0



[PATCH v3 net-next 6/9] bnxt_en: Use msix_vec_per_pf_max and msix_vec_per_pf_min devlink params.

2018-09-27 Thread Vasundhara Volam
This patch adds support for the following generic permanent mode
devlink parameters. They can be modified using devlink param
commands.

msix_vec_per_pf_max - This param sets the number of MSIX vectors
that the device requests from the host on driver initialization.
This value is set in the device which limits MSIX vectors per PF.

msix_vec_per_pf_min - This param sets the number of minimal MSIX
vectors required for the device initialization. Value 0 indicates
a default value is selected. This value is set in the device which
limits MSIX vectors per PF.

Cc: Michael Chan 
Signed-off-by: Vasundhara Volam 
---
 drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.c | 50 ++-
 drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.h |  5 +++
 2 files changed, 53 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.c 
b/drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.c
index dc566fd..de7e74a 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.c
@@ -26,6 +26,10 @@
 BNXT_NVM_SHARED_CFG, 1},
{DEVLINK_PARAM_GENERIC_ID_IGNORE_ARI, NVM_OFF_IGNORE_ARI,
 BNXT_NVM_SHARED_CFG, 1},
+   {DEVLINK_PARAM_GENERIC_ID_MSIX_VEC_PER_PF_MAX,
+NVM_OFF_MSIX_VEC_PER_PF_MAX, BNXT_NVM_SHARED_CFG, 10},
+   {DEVLINK_PARAM_GENERIC_ID_MSIX_VEC_PER_PF_MIN,
+NVM_OFF_MSIX_VEC_PER_PF_MIN, BNXT_NVM_SHARED_CFG, 7},
 };
 
 static int bnxt_hwrm_nvm_req(struct bnxt *bp, u32 param_id, void *msg,
@@ -57,8 +61,22 @@ static int bnxt_hwrm_nvm_req(struct bnxt *bp, u32 param_id, 
void *msg,
idx = bp->pf.fw_fid - BNXT_FIRST_PF_FID;
 
bytesize = roundup(nvm_param.num_bits, BITS_PER_BYTE) / BITS_PER_BYTE;
-   if (nvm_param.num_bits == 1)
-   buf = &val->vbool;
+   switch (bytesize) {
+   case 1:
+   if (nvm_param.num_bits == 1)
+   buf = &val->vbool;
+   else
+   buf = &val->vu8;
+   break;
+   case 2:
+   buf = &val->vu16;
+   break;
+   case 4:
+   buf = &val->vu32;
+   break;
+   default:
+   return -EFAULT;
+   }
 
data_addr = dma_zalloc_coherent(&bp->pdev->dev, bytesize,
&data_dma_addr, GFP_KERNEL);
@@ -109,6 +127,26 @@ static int bnxt_dl_nvm_param_set(struct devlink *dl, u32 
id,
return bnxt_hwrm_nvm_req(bp, id, &req, sizeof(req), &ctx->val);
 }
 
+static int bnxt_dl_msix_validate(struct devlink *dl, u32 id,
+union devlink_param_value val,
+struct netlink_ext_ack *extack)
+{
+   int max_val;
+
+   if (id == DEVLINK_PARAM_GENERIC_ID_MSIX_VEC_PER_PF_MAX)
+   max_val = BNXT_MSIX_VEC_MAX;
+
+   if (id == DEVLINK_PARAM_GENERIC_ID_MSIX_VEC_PER_PF_MIN)
+   max_val = BNXT_MSIX_VEC_MIN_MAX;
+
+   if (val.vu32 < 0 || val.vu32 > max_val) {
+   NL_SET_ERR_MSG_MOD(extack, "MSIX value is exceeding the range");
+   return -EINVAL;
+   }
+
+   return 0;
+}
+
 static const struct devlink_param bnxt_dl_params[] = {
DEVLINK_PARAM_GENERIC(ENABLE_SRIOV,
  BIT(DEVLINK_PARAM_CMODE_PERMANENT),
@@ -118,6 +156,14 @@ static int bnxt_dl_nvm_param_set(struct devlink *dl, u32 
id,
  BIT(DEVLINK_PARAM_CMODE_PERMANENT),
  bnxt_dl_nvm_param_get, bnxt_dl_nvm_param_set,
  NULL),
+   DEVLINK_PARAM_GENERIC(MSIX_VEC_PER_PF_MAX,
+ BIT(DEVLINK_PARAM_CMODE_PERMANENT),
+ bnxt_dl_nvm_param_get, bnxt_dl_nvm_param_set,
+ bnxt_dl_msix_validate),
+   DEVLINK_PARAM_GENERIC(MSIX_VEC_PER_PF_MIN,
+ BIT(DEVLINK_PARAM_CMODE_PERMANENT),
+ bnxt_dl_nvm_param_get, bnxt_dl_nvm_param_set,
+ bnxt_dl_msix_validate),
 };
 
 int bnxt_dl_register(struct bnxt *bp)
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.h 
b/drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.h
index da146492..0e67c05 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.h
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.h
@@ -33,10 +33,15 @@ static inline void bnxt_link_bp_to_dl(struct bnxt *bp, 
struct devlink *dl)
}
 }
 
+#define NVM_OFF_MSIX_VEC_PER_PF_MAX 108
+#define NVM_OFF_MSIX_VEC_PER_PF_MIN 114
 #define NVM_OFF_IGNORE_ARI 164
 #define NVM_OFF_HW_TC_OFFLOAD  170
 #define NVM_OFF_ENABLE_SRIOV   401
 
+#define BNXT_MSIX_VEC_MAX  1280
+#define BNXT_MSIX_VEC_MIN_MAX  128
+
 enum bnxt_nvm_dir_type {
BNXT_NVM_SHARED_CFG = 40,
BNXT_NVM_PORT_CFG,
-- 
1.8.3.1



[PATCH v3 net-next 8/9] devlink: Add Documentation/networking/devlink-params.txt

2018-09-27 Thread Vasundhara Volam
This patch adds a new file to add information about some of the
generic configuration parameters set via devlink.

Cc: "David S. Miller" 
Cc: Jonathan Corbet 
Cc: linux-...@vger.kernel.org
Cc: Jiri Pirko 
Cc: Michael Chan 
Signed-off-by: Vasundhara Volam 
---
 Documentation/networking/devlink-params.txt | 42 +
 1 file changed, 42 insertions(+)
 create mode 100644 Documentation/networking/devlink-params.txt

diff --git a/Documentation/networking/devlink-params.txt 
b/Documentation/networking/devlink-params.txt
new file mode 100644
index 000..ae444ff
--- /dev/null
+++ b/Documentation/networking/devlink-params.txt
@@ -0,0 +1,42 @@
+Devlink configuration parameters
+
+The following configuration parameters are available via the devlink
+interface. Each parameter is either generic or driver-specific, and all
+are device-level parameters.
+
+Note that the driver-specific files should also list the generic params
+they support, along with the supported config modes.
+
+Each parameter can be set in different configuration modes:
+   runtime - set while driver is running, no reset required.
+   driverinit  - applied while driver initializes, requires restart
+   driver by devlink reload command.
+   permanent   - written to device's non-volatile memory, hard reset
+   required.
+
+Following is the list of parameters:
+
+enable_sriov   [DEVICE, GENERIC]
+   Enable Single Root I/O Virtualisation (SRIOV) in
+   the device.
+   Type: Boolean
+
+ignore_ari [DEVICE, GENERIC]
+   Ignore Alternative Routing-ID Interpretation (ARI)
+   capability. If enabled, the adapter ignores the ARI
+   capability even when the platform has support for it
+   enabled, and creates the same number of partitions as
+   when the platform does not support ARI.
+   Type: Boolean
+
+msix_vec_per_pf_max   [DEVICE, GENERIC]
+   Provides the maximum number of MSIX interrupts that
+   a device can create. Value is same across all
+   physical functions (PFs) in the device.
+   Type: u32
+
+msix_vec_per_pf_min   [DEVICE, GENERIC]
+   Provides the minimum number of MSIX interrupts required
+   for the device initialization. Value is same across all
+   physical functions (PFs) in the device.
+   Type: u32
-- 
1.8.3.1



[PATCH v3 net-next 2/9] devlink: Add generic parameter msix_vec_per_pf_max

2018-09-27 Thread Vasundhara Volam
msix_vec_per_pf_max - This param sets the number of MSIX vectors
that the device requests from the host on driver initialization.
This value is set in the device which is applicable per PF.

Cc: Jiri Pirko 
Cc: Michael Chan 
Signed-off-by: Vasundhara Volam 
---
 include/net/devlink.h | 4 
 net/core/devlink.c| 5 +
 2 files changed, 9 insertions(+)

diff --git a/include/net/devlink.h b/include/net/devlink.h
index 90d8343..59be17b 100644
--- a/include/net/devlink.h
+++ b/include/net/devlink.h
@@ -363,6 +363,7 @@ enum devlink_param_generic_id {
DEVLINK_PARAM_GENERIC_ID_ENABLE_SRIOV,
DEVLINK_PARAM_GENERIC_ID_REGION_SNAPSHOT,
DEVLINK_PARAM_GENERIC_ID_IGNORE_ARI,
+   DEVLINK_PARAM_GENERIC_ID_MSIX_VEC_PER_PF_MAX,
 
/* add new param generic ids above here*/
__DEVLINK_PARAM_GENERIC_ID_MAX,
@@ -384,6 +385,9 @@ enum devlink_param_generic_id {
 #define DEVLINK_PARAM_GENERIC_IGNORE_ARI_NAME "ignore_ari"
 #define DEVLINK_PARAM_GENERIC_IGNORE_ARI_TYPE DEVLINK_PARAM_TYPE_BOOL
 
+#define DEVLINK_PARAM_GENERIC_MSIX_VEC_PER_PF_MAX_NAME "msix_vec_per_pf_max"
+#define DEVLINK_PARAM_GENERIC_MSIX_VEC_PER_PF_MAX_TYPE DEVLINK_PARAM_TYPE_U32
+
 #define DEVLINK_PARAM_GENERIC(_id, _cmodes, _get, _set, _validate) \
 {  \
.id = DEVLINK_PARAM_GENERIC_ID_##_id,   \
diff --git a/net/core/devlink.c b/net/core/devlink.c
index 3349a4d..ce9fe63 100644
--- a/net/core/devlink.c
+++ b/net/core/devlink.c
@@ -2680,6 +2680,11 @@ static int devlink_nl_cmd_reload(struct sk_buff *skb, 
struct genl_info *info)
.name = DEVLINK_PARAM_GENERIC_IGNORE_ARI_NAME,
.type = DEVLINK_PARAM_GENERIC_IGNORE_ARI_TYPE,
},
+   {
+   .id = DEVLINK_PARAM_GENERIC_ID_MSIX_VEC_PER_PF_MAX,
+   .name = DEVLINK_PARAM_GENERIC_MSIX_VEC_PER_PF_MAX_NAME,
+   .type = DEVLINK_PARAM_GENERIC_MSIX_VEC_PER_PF_MAX_TYPE,
+   },
 };
 
 static int devlink_param_generic_verify(const struct devlink_param *param)
-- 
1.8.3.1



[PATCH v3 net-next 4/9] bnxt_en: Use ignore_ari devlink parameter

2018-09-27 Thread Vasundhara Volam
This patch adds support for ignore_ari generic permanent mode
devlink parameter. This parameter is disabled by default. It can be
enabled using devlink param commands.

ignore_ari - If enabled, the device ignores the ARI (Alternative Routing-ID
Interpretation) capability even when the platform supports it, and creates
the same number of partitions as when the platform does not support ARI.

Cc: Michael Chan 
Signed-off-by: Vasundhara Volam 
---
 drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.c | 6 ++
 drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.h | 2 ++
 2 files changed, 8 insertions(+)

diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.c 
b/drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.c
index 790c684..5173881 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.c
@@ -24,6 +24,8 @@
 static const struct bnxt_dl_nvm_param nvm_params[] = {
{DEVLINK_PARAM_GENERIC_ID_ENABLE_SRIOV, NVM_OFF_ENABLE_SRIOV,
 BNXT_NVM_SHARED_CFG, 1},
+   {DEVLINK_PARAM_GENERIC_ID_IGNORE_ARI, NVM_OFF_IGNORE_ARI,
+BNXT_NVM_SHARED_CFG, 1},
 };
 
 static int bnxt_hwrm_nvm_req(struct bnxt *bp, u32 param_id, void *msg,
@@ -108,6 +110,10 @@ static int bnxt_dl_nvm_param_set(struct devlink *dl, u32 
id,
  BIT(DEVLINK_PARAM_CMODE_PERMANENT),
  bnxt_dl_nvm_param_get, bnxt_dl_nvm_param_set,
  NULL),
+   DEVLINK_PARAM_GENERIC(IGNORE_ARI,
+ BIT(DEVLINK_PARAM_CMODE_PERMANENT),
+ bnxt_dl_nvm_param_get, bnxt_dl_nvm_param_set,
+ NULL),
 };
 
 int bnxt_dl_register(struct bnxt *bp)
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.h 
b/drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.h
index 2f68dc0..da146492 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.h
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.h
@@ -33,6 +33,8 @@ static inline void bnxt_link_bp_to_dl(struct bnxt *bp, struct 
devlink *dl)
}
 }
 
+#define NVM_OFF_IGNORE_ARI 164
+#define NVM_OFF_HW_TC_OFFLOAD  170
 #define NVM_OFF_ENABLE_SRIOV   401
 
 enum bnxt_nvm_dir_type {
-- 
1.8.3.1



[PATCH v3 net-next 1/9] devlink: Add generic parameter ignore_ari

2018-09-27 Thread Vasundhara Volam
ignore_ari - The device ignores the ARI (Alternative Routing-ID
Interpretation) capability even when the platform supports it, and creates
the same number of partitions as when the platform does not support ARI.

Cc: Jiri Pirko 
Cc: Michael Chan 
Signed-off-by: Vasundhara Volam 
---
 include/net/devlink.h | 4 
 net/core/devlink.c| 5 +
 2 files changed, 9 insertions(+)

diff --git a/include/net/devlink.h b/include/net/devlink.h
index b9b89d6..90d8343 100644
--- a/include/net/devlink.h
+++ b/include/net/devlink.h
@@ -362,6 +362,7 @@ enum devlink_param_generic_id {
DEVLINK_PARAM_GENERIC_ID_MAX_MACS,
DEVLINK_PARAM_GENERIC_ID_ENABLE_SRIOV,
DEVLINK_PARAM_GENERIC_ID_REGION_SNAPSHOT,
+   DEVLINK_PARAM_GENERIC_ID_IGNORE_ARI,
 
/* add new param generic ids above here*/
__DEVLINK_PARAM_GENERIC_ID_MAX,
@@ -380,6 +381,9 @@ enum devlink_param_generic_id {
 #define DEVLINK_PARAM_GENERIC_REGION_SNAPSHOT_NAME "region_snapshot_enable"
 #define DEVLINK_PARAM_GENERIC_REGION_SNAPSHOT_TYPE DEVLINK_PARAM_TYPE_BOOL
 
+#define DEVLINK_PARAM_GENERIC_IGNORE_ARI_NAME "ignore_ari"
+#define DEVLINK_PARAM_GENERIC_IGNORE_ARI_TYPE DEVLINK_PARAM_TYPE_BOOL
+
 #define DEVLINK_PARAM_GENERIC(_id, _cmodes, _get, _set, _validate) \
 {  \
.id = DEVLINK_PARAM_GENERIC_ID_##_id,   \
diff --git a/net/core/devlink.c b/net/core/devlink.c
index 8c0ed22..3349a4d 100644
--- a/net/core/devlink.c
+++ b/net/core/devlink.c
@@ -2675,6 +2675,11 @@ static int devlink_nl_cmd_reload(struct sk_buff *skb, 
struct genl_info *info)
.name = DEVLINK_PARAM_GENERIC_REGION_SNAPSHOT_NAME,
.type = DEVLINK_PARAM_GENERIC_REGION_SNAPSHOT_TYPE,
},
+   {
+   .id = DEVLINK_PARAM_GENERIC_ID_IGNORE_ARI,
+   .name = DEVLINK_PARAM_GENERIC_IGNORE_ARI_NAME,
+   .type = DEVLINK_PARAM_GENERIC_IGNORE_ARI_TYPE,
+   },
 };
 
 static int devlink_param_generic_verify(const struct devlink_param *param)
-- 
1.8.3.1



[PATCH v3 net-next 3/9] devlink: Add generic parameter msix_vec_per_pf_min

2018-09-27 Thread Vasundhara Volam
msix_vec_per_pf_min - This param sets the number of minimal MSIX
vectors required for the device initialization. This value is set
in the device which limits MSIX vectors per PF.

Cc: Jiri Pirko 
Cc: Michael Chan 
Signed-off-by: Vasundhara Volam 
---
 include/net/devlink.h | 4 
 net/core/devlink.c| 5 +
 2 files changed, 9 insertions(+)

diff --git a/include/net/devlink.h b/include/net/devlink.h
index 59be17b..361f525 100644
--- a/include/net/devlink.h
+++ b/include/net/devlink.h
@@ -364,6 +364,7 @@ enum devlink_param_generic_id {
DEVLINK_PARAM_GENERIC_ID_REGION_SNAPSHOT,
DEVLINK_PARAM_GENERIC_ID_IGNORE_ARI,
DEVLINK_PARAM_GENERIC_ID_MSIX_VEC_PER_PF_MAX,
+   DEVLINK_PARAM_GENERIC_ID_MSIX_VEC_PER_PF_MIN,
 
/* add new param generic ids above here*/
__DEVLINK_PARAM_GENERIC_ID_MAX,
@@ -388,6 +389,9 @@ enum devlink_param_generic_id {
 #define DEVLINK_PARAM_GENERIC_MSIX_VEC_PER_PF_MAX_NAME "msix_vec_per_pf_max"
 #define DEVLINK_PARAM_GENERIC_MSIX_VEC_PER_PF_MAX_TYPE DEVLINK_PARAM_TYPE_U32
 
+#define DEVLINK_PARAM_GENERIC_MSIX_VEC_PER_PF_MIN_NAME "msix_vec_per_pf_min"
+#define DEVLINK_PARAM_GENERIC_MSIX_VEC_PER_PF_MIN_TYPE DEVLINK_PARAM_TYPE_U32
+
 #define DEVLINK_PARAM_GENERIC(_id, _cmodes, _get, _set, _validate) \
 {  \
.id = DEVLINK_PARAM_GENERIC_ID_##_id,   \
diff --git a/net/core/devlink.c b/net/core/devlink.c
index ce9fe63..25d3bfa 100644
--- a/net/core/devlink.c
+++ b/net/core/devlink.c
@@ -2685,6 +2685,11 @@ static int devlink_nl_cmd_reload(struct sk_buff *skb, 
struct genl_info *info)
.name = DEVLINK_PARAM_GENERIC_MSIX_VEC_PER_PF_MAX_NAME,
.type = DEVLINK_PARAM_GENERIC_MSIX_VEC_PER_PF_MAX_TYPE,
},
+   {
+   .id = DEVLINK_PARAM_GENERIC_ID_MSIX_VEC_PER_PF_MIN,
+   .name = DEVLINK_PARAM_GENERIC_MSIX_VEC_PER_PF_MIN_NAME,
+   .type = DEVLINK_PARAM_GENERIC_MSIX_VEC_PER_PF_MIN_TYPE,
+   },
 };
 
 static int devlink_param_generic_verify(const struct devlink_param *param)
-- 
1.8.3.1



[PATCH v3 net-next 9/9] devlink: Add Documentation/networking/devlink-params-bnxt.txt

2018-09-27 Thread Vasundhara Volam
This patch adds a new file to add information about configuration
parameters that are supported by bnxt_en driver via devlink.

Cc: "David S. Miller" 
Cc: Jonathan Corbet 
Cc: linux-...@vger.kernel.org
Cc: Jiri Pirko 
Cc: Michael Chan 
Signed-off-by: Vasundhara Volam 
---
 Documentation/networking/devlink-params-bnxt.txt | 22 ++
 1 file changed, 22 insertions(+)
 create mode 100644 Documentation/networking/devlink-params-bnxt.txt

diff --git a/Documentation/networking/devlink-params-bnxt.txt 
b/Documentation/networking/devlink-params-bnxt.txt
new file mode 100644
index 000..c7bc9d8
--- /dev/null
+++ b/Documentation/networking/devlink-params-bnxt.txt
@@ -0,0 +1,22 @@
+enable_sriov   [DEVICE, GENERIC]
+   Type: Boolean
+   Configuration mode: Permanent
+
+ignore_ari [DEVICE, GENERIC]
+   Type: Boolean
+   Configuration mode: Permanent
+
+msix_vec_per_pf_max   [DEVICE, GENERIC]
+   Type: u32
+   Configuration mode: Permanent
+
+msix_vec_per_pf_min   [DEVICE, GENERIC]
+   Type: u32
+   Configuration mode: Permanent
+
+gre_ver_check  [DEVICE, DRIVER-SPECIFIC]
+   Generic Routing Encapsulation (GRE) version check will
+   be enabled in the device. If disabled, device skips
+   version checking for incoming packets.
+   Type: Boolean
+   Configuration mode: Permanent
-- 
1.8.3.1



[PATCH v3 net-next 7/9] bnxt_en: Add a driver specific gre_ver_check devlink parameter.

2018-09-27 Thread Vasundhara Volam
This patch adds the following driver-specific permanent mode boolean
parameter.

gre_ver_check - Generic Routing Encapsulation(GRE) version check
will be enabled in the device. If disabled, device skips version
checking for GRE packets.

Cc: Michael Chan 
Signed-off-by: Vasundhara Volam 
---
 drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.c | 24 ++-
 drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.h |  1 +
 2 files changed, 24 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.c 
b/drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.c
index de7e74a..8a10e01 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.c
@@ -21,6 +21,11 @@
 #endif /* CONFIG_BNXT_SRIOV */
 };
 
+enum bnxt_dl_param_id {
+   BNXT_DEVLINK_PARAM_ID_BASE = DEVLINK_PARAM_GENERIC_ID_MAX,
+   BNXT_DEVLINK_PARAM_ID_GRE_VER_CHECK,
+};
+
 static const struct bnxt_dl_nvm_param nvm_params[] = {
{DEVLINK_PARAM_GENERIC_ID_ENABLE_SRIOV, NVM_OFF_ENABLE_SRIOV,
 BNXT_NVM_SHARED_CFG, 1},
@@ -30,6 +35,8 @@
 NVM_OFF_MSIX_VEC_PER_PF_MAX, BNXT_NVM_SHARED_CFG, 10},
{DEVLINK_PARAM_GENERIC_ID_MSIX_VEC_PER_PF_MIN,
 NVM_OFF_MSIX_VEC_PER_PF_MIN, BNXT_NVM_SHARED_CFG, 7},
+   {BNXT_DEVLINK_PARAM_ID_GRE_VER_CHECK, NVM_OFF_DIS_GRE_VER_CHECK,
+BNXT_NVM_SHARED_CFG, 1},
 };
 
 static int bnxt_hwrm_nvm_req(struct bnxt *bp, u32 param_id, void *msg,
@@ -112,9 +119,15 @@ static int bnxt_dl_nvm_param_get(struct devlink *dl, u32 
id,
 {
struct hwrm_nvm_get_variable_input req = {0};
struct bnxt *bp = bnxt_get_bp_from_dl(dl);
+   int rc;
 
bnxt_hwrm_cmd_hdr_init(bp, &req, HWRM_NVM_GET_VARIABLE, -1, -1);
-   return bnxt_hwrm_nvm_req(bp, id, &req, sizeof(req), &ctx->val);
+   rc = bnxt_hwrm_nvm_req(bp, id, &req, sizeof(req), &ctx->val);
+   if (!rc)
+   if (id == BNXT_DEVLINK_PARAM_ID_GRE_VER_CHECK)
+   ctx->val.vbool = !ctx->val.vbool;
+
+   return rc;
 }
 
 static int bnxt_dl_nvm_param_set(struct devlink *dl, u32 id,
@@ -124,6 +137,10 @@ static int bnxt_dl_nvm_param_set(struct devlink *dl, u32 
id,
struct bnxt *bp = bnxt_get_bp_from_dl(dl);
 
bnxt_hwrm_cmd_hdr_init(bp, &req, HWRM_NVM_SET_VARIABLE, -1, -1);
+
+   if (id == BNXT_DEVLINK_PARAM_ID_GRE_VER_CHECK)
+   ctx->val.vbool = !ctx->val.vbool;
+
return bnxt_hwrm_nvm_req(bp, id, &req, sizeof(req), &ctx->val);
 }
 
@@ -164,6 +181,11 @@ static int bnxt_dl_msix_validate(struct devlink *dl, u32 
id,
  BIT(DEVLINK_PARAM_CMODE_PERMANENT),
  bnxt_dl_nvm_param_get, bnxt_dl_nvm_param_set,
  bnxt_dl_msix_validate),
+   DEVLINK_PARAM_DRIVER(BNXT_DEVLINK_PARAM_ID_GRE_VER_CHECK,
+"gre_ver_check", DEVLINK_PARAM_TYPE_BOOL,
+BIT(DEVLINK_PARAM_CMODE_PERMANENT),
+bnxt_dl_nvm_param_get, bnxt_dl_nvm_param_set,
+NULL),
 };
 
 int bnxt_dl_register(struct bnxt *bp)
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.h 
b/drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.h
index 0e67c05..e36e41a 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.h
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.h
@@ -37,6 +37,7 @@ static inline void bnxt_link_bp_to_dl(struct bnxt *bp, struct 
devlink *dl)
 #define NVM_OFF_MSIX_VEC_PER_PF_MIN 114
 #define NVM_OFF_IGNORE_ARI 164
 #define NVM_OFF_HW_TC_OFFLOAD  170
+#define NVM_OFF_DIS_GRE_VER_CHECK  171
 #define NVM_OFF_ENABLE_SRIOV   401
 
 #define BNXT_MSIX_VEC_MAX  1280
-- 
1.8.3.1



[PATCH v3 net-next 5/9] bnxt_en: return proper error when FW returns HWRM_ERR_CODE_RESOURCE_ACCESS_DENIED

2018-09-27 Thread Vasundhara Volam
Return proper error code when Firmware returns
HWRM_ERR_CODE_RESOURCE_ACCESS_DENIED for HWRM_NVM_GET/SET_VARIABLE
commands.

Cc: Michael Chan 
Signed-off-by: Vasundhara Volam 
---
 drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.c | 6 +-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.c 
b/drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.c
index 5173881..dc566fd 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.c
@@ -80,8 +80,12 @@ static int bnxt_hwrm_nvm_req(struct bnxt *bp, u32 param_id, 
void *msg,
memcpy(buf, data_addr, bytesize);
 
dma_free_coherent(&bp->pdev->dev, bytesize, data_addr, data_dma_addr);
-   if (rc)
+   if (rc == HWRM_ERR_CODE_RESOURCE_ACCESS_DENIED) {
+   netdev_err(bp->dev, "PF does not have admin privileges to 
modify NVM config\n");
+   return -EACCES;
+   } else if (rc) {
return -EIO;
+   }
return 0;
 }
 
-- 
1.8.3.1



[PATCH v3 net-next 0/9] bnxt_en: devlink param updates

2018-09-27 Thread Vasundhara Volam
This patchset adds support for 3 generic and 1 driver-specific devlink
parameters. Add documentation for these configuration parameters.

Also, this patchset adds support to return proper error code if
HWRM_NVM_GET/SET_VARIABLE commands return error code
HWRM_ERR_CODE_RESOURCE_ACCESS_DENIED.

v2->v3:
-Remove description of generic parameters from devlink-params-bnxt.txt.

v1->v2:
-Remove hw_tc_offload parameter.
-Update all patches with Cc of MAINTAINERS.
-Add more description in commit message for device specific parameter.
-Add a new Documentation/networking/devlink-params.txt with some
generic devlink parameters information.
-Add a new Documentation/networking/devlink-params-bnxt.txt with devlink
parameters information that are supported by bnxt_en driver.

Vasundhara Volam (9):
  devlink: Add generic parameter ignore_ari
  devlink: Add generic parameter msix_vec_per_pf_max
  devlink: Add generic parameter msix_vec_per_pf_min
  bnxt_en: Use ignore_ari devlink parameter
  bnxt_en: return proper error when FW returns
HWRM_ERR_CODE_RESOURCE_ACCESS_DENIED
  bnxt_en: Use msix_vec_per_pf_max and msix_vec_per_pf_min devlink
params.
  bnxt_en: Add a driver specific gre_ver_check devlink parameter.
  devlink: Add Documentation/networking/devlink-params.txt
  devlink: Add Documentation/networking/devlink-params-bnxt.txt

 Documentation/networking/devlink-params-bnxt.txt  | 22 ++
 Documentation/networking/devlink-params.txt   | 42 +++
 drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.c | 86 +--
 drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.h |  8 +++
 include/net/devlink.h | 12 
 net/core/devlink.c| 15 
 6 files changed, 181 insertions(+), 4 deletions(-)
 create mode 100644 Documentation/networking/devlink-params-bnxt.txt
 create mode 100644 Documentation/networking/devlink-params.txt

-- 
1.8.3.1



Re: re iproute2 - don't return error on success fix

2018-09-27 Thread Or Gerlitz
On Thu, Sep 27, 2018 at 3:53 PM Phil Sutter  wrote:

> Hmm, I can't reproduce this. My HEAD is at the commit you mentioned:
>
> | % sudo ./tc/tc filter add dev d0 protocol ip parent ffff: flower skip_sw 
> ip_flags nofirstfrag action drop
> | RTNETLINK answers: Operation not supported
> | We have an error talking to the kernel, -1
> | % echo $?
> | 2
>
> Are you sure you tested the right binary?

I will double check, but we're on weekend + holiday, so I'll get doing
that next week


[PATCH 13/15] octeontx2-af: Add support for CGX link management

2018-09-27 Thread sunil . kovvuri
From: Linu Cherian 

CGX LMAC initialization, link status polling, etc. are done
by low-level secure firmware. For link management, this patch
adds an interface (communication mechanism) between the firmware
and this kernel CGX driver.

- Firmware interface specification is defined in cgx_fw_if.h.
- Support to send/receive commands/events to/from firmware.
- events/commands implemented
  * link up
  * link down
  * reading firmware version

Signed-off-by: Linu Cherian 
Signed-off-by: Nithya Mani 
---
 drivers/net/ethernet/marvell/octeontx2/af/cgx.c| 364 -
 drivers/net/ethernet/marvell/octeontx2/af/cgx.h|  32 ++
 .../net/ethernet/marvell/octeontx2/af/cgx_fw_if.h  | 222 +
 3 files changed, 614 insertions(+), 4 deletions(-)
 create mode 100644 drivers/net/ethernet/marvell/octeontx2/af/cgx_fw_if.h

diff --git a/drivers/net/ethernet/marvell/octeontx2/af/cgx.c 
b/drivers/net/ethernet/marvell/octeontx2/af/cgx.c
index 06fd9fd..b306f57 100644
--- a/drivers/net/ethernet/marvell/octeontx2/af/cgx.c
+++ b/drivers/net/ethernet/marvell/octeontx2/af/cgx.c
@@ -24,16 +24,43 @@
 #define DRV_NAME   "octeontx2-cgx"
 #define DRV_STRING  "Marvell OcteonTX2 CGX/MAC Driver"
 
+/**
+ * struct lmac
+ * @wq_cmd_cmplt:  waitq to keep the process blocked until cmd completion
+ * @cmd_lock:  Lock to serialize the command interface
+ * @resp:  command response
+ * @event_cb:  callback for linkchange events
+ * @cmd_pend:  flag set before new command is started
+ * flag cleared after command response is received
+ * @cgx:   parent cgx port
+ * @lmac_id:   lmac port id
+ * @name:  lmac port name
+ */
+struct lmac {
+   wait_queue_head_t wq_cmd_cmplt;
+   struct mutex cmd_lock;
+   struct cgx_evt_sts resp;
+   struct cgx_event_cb event_cb;
+   bool cmd_pend;
+   struct cgx *cgx;
+   u8 lmac_id;
+   char *name;
+};
+
 struct cgx {
void __iomem*reg_base;
struct pci_dev  *pdev;
u8  cgx_id;
u8  lmac_count;
+   struct lmac *lmac_idmap[MAX_LMAC_PER_CGX];
struct list_headcgx_list;
 };
 
 static LIST_HEAD(cgx_list);
 
+/* CGX PHY management internal APIs */
+static int cgx_fwi_link_change(struct cgx *cgx, int lmac_id, bool en);
+
 /* Supported devices */
 static const struct pci_device_id cgx_id_table[] = {
{ PCI_DEVICE(PCI_VENDOR_ID_CAVIUM, PCI_DEVID_OCTEONTX2_CGX) },
@@ -45,11 +72,24 @@ MODULE_DESCRIPTION(DRV_STRING);
 MODULE_LICENSE("GPL v2");
 MODULE_DEVICE_TABLE(pci, cgx_id_table);
 
+static void cgx_write(struct cgx *cgx, u64 lmac, u64 offset, u64 val)
+{
+   writeq(val, cgx->reg_base + (lmac << 18) + offset);
+}
+
 static u64 cgx_read(struct cgx *cgx, u64 lmac, u64 offset)
 {
return readq(cgx->reg_base + (lmac << 18) + offset);
 }
 
+static inline struct lmac *lmac_pdata(u8 lmac_id, struct cgx *cgx)
+{
+   if (!cgx || lmac_id >= MAX_LMAC_PER_CGX)
+   return NULL;
+
+   return cgx->lmac_idmap[lmac_id];
+}
+
 int cgx_get_cgx_cnt(void)
 {
struct cgx *cgx_dev;
@@ -85,18 +125,318 @@ void *cgx_get_pdata(int cgx_id)
 }
 EXPORT_SYMBOL(cgx_get_pdata);
 
-static void cgx_lmac_init(struct cgx *cgx)
+/* CGX Firmware interface low level support */
+static int cgx_fwi_cmd_send(struct cgx_cmd *cmd, struct cgx_evt_sts *rsp,
+   struct lmac *lmac)
+{
+   struct cgx *cgx = lmac->cgx;
+   union cgx_cmdreg creg;
+   union cgx_evtreg ereg;
+   struct device *dev;
+   int err = 0;
+
+   /* Ensure no other command is in progress */
+   err = mutex_lock_interruptible(&lmac->cmd_lock);
+   if (err)
+   return err;
+
+   /* Ensure command register is free */
+   creg.val = cgx_read(cgx, lmac->lmac_id,  CGX_COMMAND_REG);
+   if (creg.cmd.own != CGX_CMD_OWN_NS) {
+   err = -EBUSY;
+   goto unlock;
+   }
+
+   /* Update ownership in command request */
+   cmd->own = CGX_CMD_OWN_FIRMWARE;
+
+   /* Mark this lmac as pending, before we start */
+   lmac->cmd_pend = true;
+
+   /* Start command in hardware */
+   creg.cmd = *cmd;
+   cgx_write(cgx, lmac->lmac_id, CGX_COMMAND_REG, creg.val);
+   creg.val = cgx_read(cgx, lmac->lmac_id,  CGX_COMMAND_REG);
+
+   /* Ensure command is completed without errors */
+   if (!wait_event_timeout(lmac->wq_cmd_cmplt, !lmac->cmd_pend,
+   msecs_to_jiffies(CGX_CMD_TIMEOUT))) {
+   dev = &cgx->pdev->dev;
+   ereg.val = cgx_read(cgx, lmac->lmac_id,  CGX_EVENT_REG);
+   if (ereg.val) {
+   dev_err(dev, "cgx port %d:%d: No event for response\n",
+   cgx->cgx_id, lmac->lmac_id);
+   /* copy event */
+   lmac->resp = ereg

[PATCH 09/15] octeontx2-af: Configure block LF's MSIX vector offset

2018-09-27 Thread sunil . kovvuri
From: Sunil Goutham 

Firmware configures a certain number of MSIX vectors for each
enabled RVU PF/VF. When a block LF is attached to a PF/VF, the number
of MSIX vectors needed by that LF is set aside (out of the PF/VF's
total MSIX vectors) and the LF's msix_offset is configured in HW.

Also added support for a RVU PF/VF to retrieve that block LF's
MSIX vector offset information from AF via mbox.

Signed-off-by: Sunil Goutham 
---
 drivers/net/ethernet/marvell/octeontx2/af/mbox.h   |  18 ++
 drivers/net/ethernet/marvell/octeontx2/af/rvu.c| 333 -
 drivers/net/ethernet/marvell/octeontx2/af/rvu.h|   7 +
 .../net/ethernet/marvell/octeontx2/af/rvu_struct.h |   2 +
 4 files changed, 357 insertions(+), 3 deletions(-)

diff --git a/drivers/net/ethernet/marvell/octeontx2/af/mbox.h 
b/drivers/net/ethernet/marvell/octeontx2/af/mbox.h
index 7280d49..bedf0ee 100644
--- a/drivers/net/ethernet/marvell/octeontx2/af/mbox.h
+++ b/drivers/net/ethernet/marvell/octeontx2/af/mbox.h
@@ -122,6 +122,7 @@ static inline struct mbox_msghdr 
*otx2_mbox_alloc_msg(struct otx2_mbox *mbox,
 M(READY,   0x001, msg_req, ready_msg_rsp)  \
 M(ATTACH_RESOURCES,0x002, rsrc_attach, msg_rsp)\
 M(DETACH_RESOURCES,0x003, rsrc_detach, msg_rsp)\
+M(MSIX_OFFSET, 0x004, msg_req, msix_offset_rsp)\
 /* CGX mbox IDs (range 0x200 - 0x3FF) */   \
 /* NPA mbox IDs (range 0x400 - 0x5FF) */   \
 /* SSO/SSOW mbox IDs (range 0x600 - 0x7FF) */  \
@@ -190,4 +191,21 @@ struct rsrc_detach {
u8 cptlfs:1;
 };
 
+#define MSIX_VECTOR_INVALID0x
+#define MAX_RVU_BLKLF_CNT  256
+
+struct msix_offset_rsp {
+   struct mbox_msghdr hdr;
+   u16  npa_msixoff;
+   u16  nix_msixoff;
+   u8   sso;
+   u8   ssow;
+   u8   timlfs;
+   u8   cptlfs;
+   u16  sso_msixoff[MAX_RVU_BLKLF_CNT];
+   u16  ssow_msixoff[MAX_RVU_BLKLF_CNT];
+   u16  timlf_msixoff[MAX_RVU_BLKLF_CNT];
+   u16  cptlf_msixoff[MAX_RVU_BLKLF_CNT];
+};
+
 #endif /* MBOX_H */
diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu.c 
b/drivers/net/ethernet/marvell/octeontx2/af/rvu.c
index 23e635c..234d273 100644
--- a/drivers/net/ethernet/marvell/octeontx2/af/rvu.c
+++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu.c
@@ -24,6 +24,11 @@
 
 static int rvu_get_hwvf(struct rvu *rvu, int pcifunc);
 
+static void rvu_set_msix_offset(struct rvu *rvu, struct rvu_pfvf *pfvf,
+   struct rvu_block *block, int lf);
+static void rvu_clear_msix_offset(struct rvu *rvu, struct rvu_pfvf *pfvf,
+ struct rvu_block *block, int lf);
+
 /* Supported devices */
 static const struct pci_device_id rvu_id_table[] = {
{ PCI_DEVICE(PCI_VENDOR_ID_CAVIUM, PCI_DEVID_OCTEONTX2_RVU_AF) },
@@ -75,6 +80,45 @@ int rvu_alloc_rsrc(struct rsrc_bmap *rsrc)
return id;
 }
 
+static int rvu_alloc_rsrc_contig(struct rsrc_bmap *rsrc, int nrsrc)
+{
+   int start;
+
+   if (!rsrc->bmap)
+   return -EINVAL;
+
+   start = bitmap_find_next_zero_area(rsrc->bmap, rsrc->max, 0, nrsrc, 0);
+   if (start >= rsrc->max)
+   return -ENOSPC;
+
+   bitmap_set(rsrc->bmap, start, nrsrc);
+   return start;
+}
+
+static void rvu_free_rsrc_contig(struct rsrc_bmap *rsrc, int nrsrc, int start)
+{
+   if (!rsrc->bmap)
+   return;
+   if (start >= rsrc->max)
+   return;
+
+   bitmap_clear(rsrc->bmap, start, nrsrc);
+}
+
+static bool rvu_rsrc_check_contig(struct rsrc_bmap *rsrc, int nrsrc)
+{
+   int start;
+
+   if (!rsrc->bmap)
+   return false;
+
+   start = bitmap_find_next_zero_area(rsrc->bmap, rsrc->max, 0, nrsrc, 0);
+   if (start >= rsrc->max)
+   return false;
+
+   return true;
+}
+
 void rvu_free_rsrc(struct rsrc_bmap *rsrc, int id)
 {
if (!rsrc->bmap)
@@ -103,6 +147,26 @@ int rvu_alloc_bitmap(struct rsrc_bmap *rsrc)
return 0;
 }
 
+/* Get block LF's HW index from a PF_FUNC's block slot number */
+int rvu_get_lf(struct rvu *rvu, struct rvu_block *block, u16 pcifunc, u16 slot)
+{
+   int lf;
+   u16 match = 0;
+
+   spin_lock(&rvu->rsrc_lock);
+   for (lf = 0; lf < block->lf.max; lf++) {
+   if (block->fn_map[lf] == pcifunc) {
+   if (slot == match) {
+   spin_unlock(&rvu->rsrc_lock);
+   return lf;
+   }
+   match++;
+   }
+   }
+   spin_unlock(&rvu->rsrc_lock);
+   return -ENODEV;
+}
+
 /* Convert BLOCK_TYPE_E to a BLOCK_ADDR_E.
  * Some silicon variants of OcteonTX2 supports
  * multiple blocks of same type.
@@ -237,6 +301,16 @@ inline int rvu_get_pf(u16 pcifunc)
return (pcifunc >> RVU_PFVF_PF_SHIFT) & RVU_PFVF_PF_MASK;
 }
 

[PATCH 05/15] octeontx2-af: Add mailbox IRQ and msg handlers

2018-09-27 Thread sunil . kovvuri
From: Sunil Goutham 

This patch adds support for mailbox interrupt and message
handling. It maps the mailbox region and registers a workqueue
for message handling, enables the mailbox IRQ of RVU PFs
and registers an interrupt handler. When the IRQ is triggered,
work is added to the mbox workqueue so that the msgs get processed.

Signed-off-by: Sunil Goutham 
---
 drivers/net/ethernet/marvell/octeontx2/af/mbox.h   |  14 +-
 drivers/net/ethernet/marvell/octeontx2/af/rvu.c| 254 +
 drivers/net/ethernet/marvell/octeontx2/af/rvu.h|  22 ++
 .../net/ethernet/marvell/octeontx2/af/rvu_struct.h |  22 ++
 4 files changed, 309 insertions(+), 3 deletions(-)

diff --git a/drivers/net/ethernet/marvell/octeontx2/af/mbox.h 
b/drivers/net/ethernet/marvell/octeontx2/af/mbox.h
index 8e205fd..fc593f0 100644
--- a/drivers/net/ethernet/marvell/octeontx2/af/mbox.h
+++ b/drivers/net/ethernet/marvell/octeontx2/af/mbox.h
@@ -33,6 +33,8 @@
 # error "incorrect mailbox area sizes"
 #endif
 
+#define INTR_MASK(pfvfs) ((pfvfs < 64) ? (BIT_ULL(pfvfs) - 1) : (~0ull))
+
 #define MBOX_RSP_TIMEOUT   1000 /* in ms, Time to wait for mbox response */
 
 #define MBOX_MSG_ALIGN 16  /* Align mbox msg start to 16bytes */
@@ -90,8 +92,9 @@ struct mbox_msghdr {
 
 void otx2_mbox_reset(struct otx2_mbox *mbox, int devid);
 void otx2_mbox_destroy(struct otx2_mbox *mbox);
-int otx2_mbox_init(struct otx2_mbox *mbox, void *hwbase, struct pci_dev *pdev,
-  void *reg_base, int direction, int ndevs);
+int otx2_mbox_init(struct otx2_mbox *mbox, void __force *hwbase,
+  struct pci_dev *pdev, void __force *reg_base,
+  int direction, int ndevs);
 void otx2_mbox_msg_send(struct otx2_mbox *mbox, int devid);
 int otx2_mbox_wait_for_rsp(struct otx2_mbox *mbox, int devid);
 int otx2_mbox_busy_poll_for_rsp(struct otx2_mbox *mbox, int devid);
@@ -115,7 +118,7 @@ static inline struct mbox_msghdr 
*otx2_mbox_alloc_msg(struct otx2_mbox *mbox,
 #define MBOX_MSG_MAX   0x
 
 #define MBOX_MESSAGES  \
-M(READY,   0x001, msg_req, msg_rsp)
+M(READY,   0x001, msg_req, ready_msg_rsp)
 
 enum {
 #define M(_name, _id, _1, _2) MBOX_MSG_ ## _name = _id,
@@ -139,4 +142,9 @@ struct msg_rsp {
struct mbox_msghdr hdr;
 };
 
+struct ready_msg_rsp {
+   struct mbox_msghdr hdr;
+   u16sclk_feq;/* SCLK frequency */
+};
+
 #endif /* MBOX_H */
diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu.c 
b/drivers/net/ethernet/marvell/octeontx2/af/rvu.c
index fa5f40b..e795c2f 100644
--- a/drivers/net/ethernet/marvell/octeontx2/af/rvu.c
+++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu.c
@@ -258,6 +258,245 @@ static int rvu_setup_hw_resources(struct rvu *rvu)
return 0;
 }
 
+static int rvu_process_mbox_msg(struct rvu *rvu, int devid,
+   struct mbox_msghdr *req)
+{
+   /* Check if valid, if not reply with a invalid msg */
+   if (req->sig != OTX2_MBOX_REQ_SIG)
+   goto bad_message;
+
+   if (req->id == MBOX_MSG_READY)
+   return 0;
+
+bad_message:
+   otx2_reply_invalid_msg(&rvu->mbox, devid, req->pcifunc,
+  req->id);
+   return -ENODEV;
+}
+
+static void rvu_mbox_handler(struct work_struct *work)
+{
+   struct rvu_work *mwork = container_of(work, struct rvu_work, work);
+   struct rvu *rvu = mwork->rvu;
+   struct otx2_mbox_dev *mdev;
+   struct mbox_hdr *req_hdr;
+   struct mbox_msghdr *msg;
+   struct otx2_mbox *mbox;
+   int offset, id, err;
+   u16 pf;
+
+   mbox = &rvu->mbox;
+   pf = mwork - rvu->mbox_wrk;
+   mdev = &mbox->dev[pf];
+
+   /* Process received mbox messages */
+   req_hdr = (struct mbox_hdr *)(mdev->mbase + mbox->rx_start);
+   if (req_hdr->num_msgs == 0)
+   return;
+
+   offset = mbox->rx_start + ALIGN(sizeof(*req_hdr), MBOX_MSG_ALIGN);
+
+   for (id = 0; id < req_hdr->num_msgs; id++) {
+   msg = (struct mbox_msghdr *)(mdev->mbase + offset);
+
+   /* Set which PF sent this message based on mbox IRQ */
+   msg->pcifunc &= ~(RVU_PFVF_PF_MASK << RVU_PFVF_PF_SHIFT);
+   msg->pcifunc |= (pf << RVU_PFVF_PF_SHIFT);
+   err = rvu_process_mbox_msg(rvu, pf, msg);
+   if (!err) {
+   offset = mbox->rx_start + msg->next_msgoff;
+   continue;
+   }
+
+   if (msg->pcifunc & RVU_PFVF_FUNC_MASK)
+   dev_warn(rvu->dev, "Error %d when processing message %s 
(0x%x) from PF%d:VF%d\n",
+err, otx2_mbox_id2name(msg->id), msg->id, pf,
+(msg->pcifunc & RVU_PFVF_FUNC_MASK) - 1);
+   else
+   dev_warn(rvu->dev, "Error %d when processing message %s 
(0x%x) from PF%d\n",
+ 

[PATCH 07/15] octeontx2-af: Scan blocks for LFs provisioned to PF/VF

2018-09-27 Thread sunil . kovvuri
From: Sunil Goutham 

Scan all RVU blocks to find any 'LF to RVU PF/VF' mappings done by
low level firmware. If any are found, mark them as used in the respective
block's LF bitmap and also save the mapped PF/VF's PF_FUNC info.

This is done to avoid reattaching a block LF to a different RVU PF/VF.

Signed-off-by: Sunil Goutham 
---
 drivers/net/ethernet/marvell/octeontx2/af/rvu.c| 148 -
 drivers/net/ethernet/marvell/octeontx2/af/rvu.h|  16 +++
 .../net/ethernet/marvell/octeontx2/af/rvu_struct.h |  16 +++
 3 files changed, 178 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu.c 
b/drivers/net/ethernet/marvell/octeontx2/af/rvu.c
index 25f79bf..9539ab9 100644
--- a/drivers/net/ethernet/marvell/octeontx2/af/rvu.c
+++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu.c
@@ -22,6 +22,8 @@
 #define DRV_STRING  "Marvell OcteonTX2 RVU Admin Function Driver"
 #define DRV_VERSION"1.0"
 
+static int rvu_get_hwvf(struct rvu *rvu, int pcifunc);
+
 /* Supported devices */
 static const struct pci_device_id rvu_id_table[] = {
{ PCI_DEVICE(PCI_VENDOR_ID_CAVIUM, PCI_DEVID_OCTEONTX2_RVU_AF) },
@@ -66,6 +68,91 @@ int rvu_alloc_bitmap(struct rsrc_bmap *rsrc)
return 0;
 }
 
+static void rvu_update_rsrc_map(struct rvu *rvu, struct rvu_pfvf *pfvf,
+   struct rvu_block *block, u16 pcifunc,
+   u16 lf, bool attach)
+{
+   int devnum, num_lfs = 0;
+   bool is_pf;
+   u64 reg;
+
+   if (lf >= block->lf.max) {
+   dev_err(&rvu->pdev->dev,
+   "%s: FATAL: LF %d is >= %s's max lfs i.e %d\n",
+   __func__, lf, block->name, block->lf.max);
+   return;
+   }
+
+   /* Check if this is for a RVU PF or VF */
+   if (pcifunc & RVU_PFVF_FUNC_MASK) {
+   is_pf = false;
+   devnum = rvu_get_hwvf(rvu, pcifunc);
+   } else {
+   is_pf = true;
+   devnum = rvu_get_pf(pcifunc);
+   }
+
+   block->fn_map[lf] = attach ? pcifunc : 0;
+
+   switch (block->type) {
+   case BLKTYPE_NPA:
+   pfvf->npalf = attach ? true : false;
+   num_lfs = pfvf->npalf;
+   break;
+   case BLKTYPE_NIX:
+   pfvf->nixlf = attach ? true : false;
+   num_lfs = pfvf->nixlf;
+   break;
+   case BLKTYPE_SSO:
+   attach ? pfvf->sso++ : pfvf->sso--;
+   num_lfs = pfvf->sso;
+   break;
+   case BLKTYPE_SSOW:
+   attach ? pfvf->ssow++ : pfvf->ssow--;
+   num_lfs = pfvf->ssow;
+   break;
+   case BLKTYPE_TIM:
+   attach ? pfvf->timlfs++ : pfvf->timlfs--;
+   num_lfs = pfvf->timlfs;
+   break;
+   case BLKTYPE_CPT:
+   attach ? pfvf->cptlfs++ : pfvf->cptlfs--;
+   num_lfs = pfvf->cptlfs;
+   break;
+   }
+
+   reg = is_pf ? block->pf_lfcnt_reg : block->vf_lfcnt_reg;
+   rvu_write64(rvu, BLKADDR_RVUM, reg | (devnum << 16), num_lfs);
+}
+
+inline int rvu_get_pf(u16 pcifunc)
+{
+   return (pcifunc >> RVU_PFVF_PF_SHIFT) & RVU_PFVF_PF_MASK;
+}
+
+static int rvu_get_hwvf(struct rvu *rvu, int pcifunc)
+{
+   int pf, func;
+   u64 cfg;
+
+   pf = rvu_get_pf(pcifunc);
+   func = pcifunc & RVU_PFVF_FUNC_MASK;
+
+   /* Get first HWVF attached to this PF */
+   cfg = rvu_read64(rvu, BLKADDR_RVUM, RVU_PRIV_PFX_CFG(pf));
+
+   return ((cfg & 0xFFF) + func - 1);
+}
+
+struct rvu_pfvf *rvu_get_pfvf(struct rvu *rvu, int pcifunc)
+{
+   /* Check if it is a PF or VF */
+   if (pcifunc & RVU_PFVF_FUNC_MASK)
+   return &rvu->hwvf[rvu_get_hwvf(rvu, pcifunc)];
+   else
+   return &rvu->pf[rvu_get_pf(pcifunc)];
+}
+
 static void rvu_check_block_implemented(struct rvu *rvu)
 {
struct rvu_hwinfo *hw = rvu->hw;
@@ -107,6 +194,28 @@ static void rvu_reset_all_blocks(struct rvu *rvu)
rvu_block_reset(rvu, BLKADDR_NDC2, NDC_AF_BLK_RST);
 }
 
+static void rvu_scan_block(struct rvu *rvu, struct rvu_block *block)
+{
+   struct rvu_pfvf *pfvf;
+   u64 cfg;
+   int lf;
+
+   for (lf = 0; lf < block->lf.max; lf++) {
+   cfg = rvu_read64(rvu, block->addr,
+block->lfcfg_reg | (lf << block->lfshift));
+   if (!(cfg & BIT_ULL(63)))
+   continue;
+
+   /* Set this resource as being used */
+   __set_bit(lf, block->lf.bmap);
+
+   /* Get, to whom this LF is attached */
+   pfvf = rvu_get_pfvf(rvu, (cfg >> 8) & 0x);
+   rvu_update_rsrc_map(rvu, pfvf, block,
+   (cfg >> 8) & 0x, lf, true);
+   }
+}
+
 static void rvu_free_hw_resources(struct rvu *rvu)
 {
struct rvu_hwinfo *hw = rvu->hw;
@@ -124,7 +233,7 @@ static int 

[PATCH 15/15] MAINTAINERS: Add entry for Marvell OcteonTX2 Admin Function driver

2018-09-27 Thread sunil . kovvuri
From: Sunil Goutham 

Added a MAINTAINERS entry for the Marvell OcteonTX2 SoC's RVU
admin function driver.

Signed-off-by: Sunil Goutham 
---
 MAINTAINERS | 9 +
 1 file changed, 9 insertions(+)

diff --git a/MAINTAINERS b/MAINTAINERS
index 7233a9e..4f93114 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -8843,6 +8843,15 @@ S:   Supported
 F: drivers/mmc/host/sdhci-xenon*
 F: Documentation/devicetree/bindings/mmc/marvell,xenon-sdhci.txt
 
+MARVELL OCTEONTX2 RVU ADMIN FUNCTION DRIVER
+M: Sunil Goutham 
+M: Linu Cherian 
+M: Geetha sowjanya 
+M: Jerin Jacob 
+L: netdev@vger.kernel.org
+S: Maintained
+F: drivers/net/ethernet/marvell/octeontx2/af
+
 MATROX FRAMEBUFFER DRIVER
 L: linux-fb...@vger.kernel.org
 S: Orphan
-- 
2.7.4



[PATCH 11/15] octeontx2-af: Add Marvell OcteonTX2 CGX driver

2018-09-27 Thread sunil . kovvuri
From: Sunil Goutham 

This patch adds a basic template for Marvell OcteonTX2's
CGX ethernet interface driver. Just the probe.
The RVU AF driver will use APIs exported by this driver
for various things like PF to physical interface mapping,
loopback mode, interface stats etc. Hence both drivers
are merged into a single module.

Signed-off-by: Sunil Goutham 
---
 drivers/net/ethernet/marvell/octeontx2/af/Makefile |   2 +-
 drivers/net/ethernet/marvell/octeontx2/af/cgx.c| 100 +
 drivers/net/ethernet/marvell/octeontx2/af/cgx.h|  22 +
 drivers/net/ethernet/marvell/octeontx2/af/rvu.c|  14 ++-
 4 files changed, 136 insertions(+), 2 deletions(-)
 create mode 100644 drivers/net/ethernet/marvell/octeontx2/af/cgx.c
 create mode 100644 drivers/net/ethernet/marvell/octeontx2/af/cgx.h

diff --git a/drivers/net/ethernet/marvell/octeontx2/af/Makefile 
b/drivers/net/ethernet/marvell/octeontx2/af/Makefile
index ac17cb9..8646421 100644
--- a/drivers/net/ethernet/marvell/octeontx2/af/Makefile
+++ b/drivers/net/ethernet/marvell/octeontx2/af/Makefile
@@ -7,4 +7,4 @@ obj-$(CONFIG_OCTEONTX2_MBOX) += octeontx2_mbox.o
 obj-$(CONFIG_OCTEONTX2_AF) += octeontx2_af.o
 
 octeontx2_mbox-y := mbox.o
-octeontx2_af-y := rvu.o
+octeontx2_af-y := cgx.o rvu.o
diff --git a/drivers/net/ethernet/marvell/octeontx2/af/cgx.c 
b/drivers/net/ethernet/marvell/octeontx2/af/cgx.c
new file mode 100644
index 000..cfd80d2
--- /dev/null
+++ b/drivers/net/ethernet/marvell/octeontx2/af/cgx.c
@@ -0,0 +1,100 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Marvell OcteonTx2 CGX driver
+ *
+ * Copyright (C) 2018 Marvell International Ltd.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include "cgx.h"
+
+#define DRV_NAME   "octeontx2-cgx"
+#define DRV_STRING  "Marvell OcteonTX2 CGX/MAC Driver"
+
+struct cgx {
+   void __iomem*reg_base;
+   struct pci_dev  *pdev;
+   u8  cgx_id;
+};
+
+/* Supported devices */
+static const struct pci_device_id cgx_id_table[] = {
+   { PCI_DEVICE(PCI_VENDOR_ID_CAVIUM, PCI_DEVID_OCTEONTX2_CGX) },
+   { 0, }  /* end of table */
+};
+
+MODULE_AUTHOR("Marvell International Ltd.");
+MODULE_DESCRIPTION(DRV_STRING);
+MODULE_LICENSE("GPL v2");
+MODULE_DEVICE_TABLE(pci, cgx_id_table);
+
+static int cgx_probe(struct pci_dev *pdev, const struct pci_device_id *id)
+{
+   int err;
+   struct device *dev = &pdev->dev;
+   struct cgx *cgx;
+
+   cgx = devm_kzalloc(dev, sizeof(*cgx), GFP_KERNEL);
+   if (!cgx)
+   return -ENOMEM;
+   cgx->pdev = pdev;
+
+   pci_set_drvdata(pdev, cgx);
+
+   err = pci_enable_device(pdev);
+   if (err) {
+   dev_err(dev, "Failed to enable PCI device\n");
+   pci_set_drvdata(pdev, NULL);
+   return err;
+   }
+
+   err = pci_request_regions(pdev, DRV_NAME);
+   if (err) {
+   dev_err(dev, "PCI request regions failed 0x%x\n", err);
+   goto err_disable_device;
+   }
+
+   /* MAP configuration registers */
+   cgx->reg_base = pcim_iomap(pdev, PCI_CFG_REG_BAR_NUM, 0);
+   if (!cgx->reg_base) {
+   dev_err(dev, "CGX: Cannot map CSR memory space, aborting\n");
+   err = -ENOMEM;
+   goto err_release_regions;
+   }
+
+   return 0;
+
+err_release_regions:
+   pci_release_regions(pdev);
+err_disable_device:
+   pci_disable_device(pdev);
+   pci_set_drvdata(pdev, NULL);
+   return err;
+}
+
+static void cgx_remove(struct pci_dev *pdev)
+{
+   pci_release_regions(pdev);
+   pci_disable_device(pdev);
+   pci_set_drvdata(pdev, NULL);
+}
+
+struct pci_driver cgx_driver = {
+   .name = DRV_NAME,
+   .id_table = cgx_id_table,
+   .probe = cgx_probe,
+   .remove = cgx_remove,
+};
diff --git a/drivers/net/ethernet/marvell/octeontx2/af/cgx.h 
b/drivers/net/ethernet/marvell/octeontx2/af/cgx.h
new file mode 100644
index 000..a7d4b39
--- /dev/null
+++ b/drivers/net/ethernet/marvell/octeontx2/af/cgx.h
@@ -0,0 +1,22 @@
+/* SPDX-License-Identifier: GPL-2.0
+ * Marvell OcteonTx2 CGX driver
+ *
+ * Copyright (C) 2018 Marvell International Ltd.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef CGX_H
+#define CGX_H
+
+ /* PCI device IDs */
+#definePCI_DEVID_OCTEONTX2_CGX 0xA059
+
+/* PCI BAR nos */
+#define PCI_CFG_REG_BAR_NUM0
+
+extern struct pci_driver cgx_driver;
+
+#endif /* CGX_H */
diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu.c 
b/drivers/net/ethe

[PATCH 14/15] octeontx2-af: Register for CGX lmac events

2018-09-27 Thread sunil . kovvuri
From: Linu Cherian 

Added support in the RVU AF driver to register for
CGX LMAC link status change events from firmware
and to manage them. The processing part will be added
in followup patches.

- Introduced an event queue for posting events from cgx lmac.
  The queueing mechanism ensures that events can be posted
  and firmware can be acked immediately, so that event
  reception and processing are decoupled.
- Events get added to the queue by a notification callback.
  The notification callback is expected to be atomic, since it
  is called from interrupt context.
- Events are dequeued and processed in a worker thread.

Signed-off-by: Linu Cherian 
---
 drivers/net/ethernet/marvell/octeontx2/af/rvu.c|   6 +-
 drivers/net/ethernet/marvell/octeontx2/af/rvu.h|   5 +
 .../net/ethernet/marvell/octeontx2/af/rvu_cgx.c| 101 -
 3 files changed, 108 insertions(+), 4 deletions(-)

diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu.c 
b/drivers/net/ethernet/marvell/octeontx2/af/rvu.c
index b363d19..adc7fc6 100644
--- a/drivers/net/ethernet/marvell/octeontx2/af/rvu.c
+++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu.c
@@ -1564,10 +1564,11 @@ static int rvu_probe(struct pci_dev *pdev, const struct 
pci_device_id *id)
 
err = rvu_register_interrupts(rvu);
if (err)
-   goto err_mbox;
+   goto err_cgx;
 
return 0;
-
+err_cgx:
+   rvu_cgx_wq_destroy(rvu);
 err_mbox:
rvu_mbox_destroy(rvu);
 err_hwsetup:
@@ -1589,6 +1590,7 @@ static void rvu_remove(struct pci_dev *pdev)
struct rvu *rvu = pci_get_drvdata(pdev);
 
rvu_unregister_interrupts(rvu);
+   rvu_cgx_wq_destroy(rvu);
rvu_mbox_destroy(rvu);
rvu_reset_all_blocks(rvu);
rvu_free_hw_resources(rvu);
diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu.h 
b/drivers/net/ethernet/marvell/octeontx2/af/rvu.h
index 385f597..d169fa9 100644
--- a/drivers/net/ethernet/marvell/octeontx2/af/rvu.h
+++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu.h
@@ -110,6 +110,10 @@ struct rvu {
  * every cgx lmac port
  */
void**cgx_idmap; /* cgx id to cgx data map table */
+   struct  work_struct cgx_evh_work;
+   struct  workqueue_struct *cgx_evh_wq;
+   spinlock_t  cgx_evq_lock; /* cgx event queue lock */
+   struct list_headcgx_evq_head; /* cgx event queue head */
 };
 
 static inline void rvu_write64(struct rvu *rvu, u64 block, u64 offset, u64 val)
@@ -150,4 +154,5 @@ int rvu_poll_reg(struct rvu *rvu, u64 block, u64 offset, 
u64 mask, bool zero);
 
 /* CGX APIs */
 int rvu_cgx_probe(struct rvu *rvu);
+void rvu_cgx_wq_destroy(struct rvu *rvu);
 #endif /* RVU_H */
diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu_cgx.c 
b/drivers/net/ethernet/marvell/octeontx2/af/rvu_cgx.c
index bf81507..2359806e 100644
--- a/drivers/net/ethernet/marvell/octeontx2/af/rvu_cgx.c
+++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu_cgx.c
@@ -15,6 +15,11 @@
 #include "rvu.h"
 #include "cgx.h"
 
+struct cgx_evq_entry {
+   struct list_head evq_node;
+   struct cgx_link_event link_event;
+};
+
 static inline u8 cgxlmac_id_to_bmap(u8 cgx_id, u8 lmac_id)
 {
return ((cgx_id & 0xF) << 4) | (lmac_id & 0xF);
@@ -72,9 +77,95 @@ static int rvu_map_cgx_lmac_pf(struct rvu *rvu)
return 0;
 }
 
+/* This is called from interrupt context and is expected to be atomic */
+static int cgx_lmac_postevent(struct cgx_link_event *event, void *data)
+{
+   struct rvu *rvu = data;
+   struct cgx_evq_entry *qentry;
+
+   /* post event to the event queue */
+   qentry = kmalloc(sizeof(*qentry), GFP_ATOMIC);
+   if (!qentry)
+   return -ENOMEM;
+   qentry->link_event = *event;
+   spin_lock(&rvu->cgx_evq_lock);
+   list_add_tail(&qentry->evq_node, &rvu->cgx_evq_head);
+   spin_unlock(&rvu->cgx_evq_lock);
+
+   /* start worker to process the events */
+   queue_work(rvu->cgx_evh_wq, &rvu->cgx_evh_work);
+
+   return 0;
+}
+
+static void cgx_evhandler_task(struct work_struct *work)
+{
+   struct rvu *rvu = container_of(work, struct rvu, cgx_evh_work);
+   struct cgx_evq_entry *qentry;
+   struct cgx_link_event *event;
+   unsigned long flags;
+
+   do {
+   /* Dequeue an event */
+   spin_lock_irqsave(&rvu->cgx_evq_lock, flags);
+   qentry = list_first_entry_or_null(&rvu->cgx_evq_head,
+ struct cgx_evq_entry,
+ evq_node);
+   if (qentry)
+   list_del(&qentry->evq_node);
+   spin_unlock_irqrestore(&rvu->cgx_evq_lock, flags);
+   if (!qentry)
+   break; /* nothing more to process */
+
+   event = &qe

[PATCH 10/15] octeontx2-af: Reconfig MSIX base with IOVA

2018-09-27 Thread sunil . kovvuri
From: Geetha sowjanya 

HW interprets the RVU_AF_MSIXTR_BASE address as an IOVA, hence
create an IOMMU mapping for the physical address configured by
firmware and reconfigure RVU_AF_MSIXTR_BASE with the IOVA.

Signed-off-by: Geetha sowjanya 
Signed-off-by: Sunil Goutham 
---
 drivers/net/ethernet/marvell/octeontx2/af/rvu.c | 33 ++---
 drivers/net/ethernet/marvell/octeontx2/af/rvu.h |  1 +
 2 files changed, 31 insertions(+), 3 deletions(-)

diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu.c 
b/drivers/net/ethernet/marvell/octeontx2/af/rvu.c
index 234d273..2a9d2b7 100644
--- a/drivers/net/ethernet/marvell/octeontx2/af/rvu.c
+++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu.c
@@ -442,9 +442,10 @@ static int rvu_setup_msix_resources(struct rvu *rvu)
 {
struct rvu_hwinfo *hw = rvu->hw;
int pf, vf, numvfs, hwvf, err;
+   int nvecs, offset, max_msix;
struct rvu_pfvf *pfvf;
-   int nvecs, offset;
-   u64 cfg;
+   u64 cfg, phy_addr;
+   dma_addr_t iova;
 
for (pf = 0; pf < hw->total_pfs; pf++) {
cfg = rvu_read64(rvu, BLKADDR_RVUM, RVU_PRIV_PFX_CFG(pf));
@@ -523,6 +524,22 @@ static int rvu_setup_msix_resources(struct rvu *rvu)
}
}
 
+   /* HW interprets RVU_AF_MSIXTR_BASE address as an IOVA, hence
+* create a IOMMU mapping for the physcial address configured by
+* firmware and reconfig RVU_AF_MSIXTR_BASE with IOVA.
+*/
+   cfg = rvu_read64(rvu, BLKADDR_RVUM, RVU_PRIV_CONST);
+   max_msix = cfg & 0xF;
+   phy_addr = rvu_read64(rvu, BLKADDR_RVUM, RVU_AF_MSIXTR_BASE);
+   iova = dma_map_single(rvu->dev, (void *)phy_addr,
+ max_msix * PCI_MSIX_ENTRY_SIZE,
+ DMA_BIDIRECTIONAL);
+   if (dma_mapping_error(rvu->dev, iova))
+   return -ENOMEM;
+
+   rvu_write64(rvu, BLKADDR_RVUM, RVU_AF_MSIXTR_BASE, (u64)iova);
+   rvu->msix_base_iova = iova;
+
return 0;
 }
 
@@ -531,7 +548,8 @@ static void rvu_free_hw_resources(struct rvu *rvu)
struct rvu_hwinfo *hw = rvu->hw;
struct rvu_block *block;
struct rvu_pfvf  *pfvf;
-   int id;
+   int id, max_msix;
+   u64 cfg;
 
/* Free block LF bitmaps */
for (id = 0; id < BLK_COUNT; id++) {
@@ -549,6 +567,15 @@ static void rvu_free_hw_resources(struct rvu *rvu)
pfvf = &rvu->hwvf[id];
kfree(pfvf->msix.bmap);
}
+
+   /* Unmap MSIX vector base IOVA mapping */
+   if (!rvu->msix_base_iova)
+   return;
+   cfg = rvu_read64(rvu, BLKADDR_RVUM, RVU_PRIV_CONST);
+   max_msix = cfg & 0xF;
+   dma_unmap_single(rvu->dev, rvu->msix_base_iova,
+max_msix * PCI_MSIX_ENTRY_SIZE,
+DMA_BIDIRECTIONAL);
 }
 
 static int rvu_setup_hw_resources(struct rvu *rvu)
diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu.h 
b/drivers/net/ethernet/marvell/octeontx2/af/rvu.h
index 7435e83..92c2022 100644
--- a/drivers/net/ethernet/marvell/octeontx2/af/rvu.h
+++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu.h
@@ -99,6 +99,7 @@ struct rvu {
u16 num_vec;
char*irq_name;
bool*irq_allocated;
+   dma_addr_t  msix_base_iova;
 };
 
 static inline void rvu_write64(struct rvu *rvu, u64 block, u64 offset, u64 val)
-- 
2.7.4



[PATCH 06/15] octeontx2-af: Convert mbox msg id check to a macro

2018-09-27 Thread sunil . kovvuri
From: Aleksey Makarov 

With tens of mailbox messages expected to be handled in the future,
checking the message id could become a lengthy switch case. Hence
a macro is added to auto-generate the switch case for each msg id.

Signed-off-by: Aleksey Makarov 
---
 drivers/net/ethernet/marvell/octeontx2/af/rvu.c | 44 +
 1 file changed, 38 insertions(+), 6 deletions(-)

diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu.c 
b/drivers/net/ethernet/marvell/octeontx2/af/rvu.c
index e795c2f..25f79bf 100644
--- a/drivers/net/ethernet/marvell/octeontx2/af/rvu.c
+++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu.c
@@ -258,6 +258,12 @@ static int rvu_setup_hw_resources(struct rvu *rvu)
return 0;
 }
 
+static int rvu_mbox_handler_READY(struct rvu *rvu, struct msg_req *req,
+ struct ready_msg_rsp *rsp)
+{
+   return 0;
+}
+
 static int rvu_process_mbox_msg(struct rvu *rvu, int devid,
struct mbox_msghdr *req)
 {
@@ -265,13 +271,39 @@ static int rvu_process_mbox_msg(struct rvu *rvu, int 
devid,
if (req->sig != OTX2_MBOX_REQ_SIG)
goto bad_message;
 
-   if (req->id == MBOX_MSG_READY)
-   return 0;
-
+   switch (req->id) {
+#define M(_name, _id, _req_type, _rsp_type)\
+   case _id: { \
+   struct _rsp_type *rsp;  \
+   int err;\
+   \
+   rsp = (struct _rsp_type *)otx2_mbox_alloc_msg(  \
+   &rvu->mbox, devid,  \
+   sizeof(struct _rsp_type));  \
+   if (rsp) {  \
+   rsp->hdr.id = _id;  \
+   rsp->hdr.sig = OTX2_MBOX_RSP_SIG;   \
+   rsp->hdr.pcifunc = req->pcifunc;\
+   rsp->hdr.rc = 0;\
+   }   \
+   \
+   err = rvu_mbox_handler_ ## _name(rvu,   \
+(struct _req_type *)req, \
+rsp);  \
+   if (rsp && err) \
+   rsp->hdr.rc = err;  \
+   \
+   return rsp ? err : -ENOMEM; \
+   }
+MBOX_MESSAGES
+#undef M
+   break;
 bad_message:
-   otx2_reply_invalid_msg(&rvu->mbox, devid, req->pcifunc,
-  req->id);
-   return -ENODEV;
+   default:
+   otx2_reply_invalid_msg(&rvu->mbox, devid, req->pcifunc,
+  req->id);
+   return -ENODEV;
+   }
 }
 
 static void rvu_mbox_handler(struct work_struct *work)
-- 
2.7.4



[PATCH 08/15] octeontx2-af: Add RVU block LF provisioning support

2018-09-27 Thread sunil . kovvuri
From: Sunil Goutham 

Added support for an RVU PF/VF to request the AF via mailbox
to attach or detach NPA/NIX/SSO/SSOW/TIM/CPT block LFs.
Also supports partial detachment and modifying the current
attached LF count of a certain block type.

Signed-off-by: Sunil Goutham 
---
 drivers/net/ethernet/marvell/octeontx2/af/mbox.h   |  45 +-
 drivers/net/ethernet/marvell/octeontx2/af/rvu.c| 472 -
 drivers/net/ethernet/marvell/octeontx2/af/rvu.h|   8 +-
 .../net/ethernet/marvell/octeontx2/af/rvu_reg.h|   8 +-
 4 files changed, 523 insertions(+), 10 deletions(-)

diff --git a/drivers/net/ethernet/marvell/octeontx2/af/mbox.h 
b/drivers/net/ethernet/marvell/octeontx2/af/mbox.h
index fc593f0..7280d49 100644
--- a/drivers/net/ethernet/marvell/octeontx2/af/mbox.h
+++ b/drivers/net/ethernet/marvell/octeontx2/af/mbox.h
@@ -118,7 +118,17 @@ static inline struct mbox_msghdr 
*otx2_mbox_alloc_msg(struct otx2_mbox *mbox,
 #define MBOX_MSG_MAX   0x
 
 #define MBOX_MESSAGES  \
-M(READY,   0x001, msg_req, ready_msg_rsp)
+/* Generic mbox IDs (range 0x000 - 0x1FF) */   \
+M(READY,   0x001, msg_req, ready_msg_rsp)  \
+M(ATTACH_RESOURCES,0x002, rsrc_attach, msg_rsp)\
+M(DETACH_RESOURCES,0x003, rsrc_detach, msg_rsp)\
+/* CGX mbox IDs (range 0x200 - 0x3FF) */   \
+/* NPA mbox IDs (range 0x400 - 0x5FF) */   \
+/* SSO/SSOW mbox IDs (range 0x600 - 0x7FF) */  \
+/* TIM mbox IDs (range 0x800 - 0x9FF) */   \
+/* CPT mbox IDs (range 0xA00 - 0xBFF) */   \
+/* NPC mbox IDs (range 0x6000 - 0x7FFF) */ \
+/* NIX mbox IDs (range 0x8000 - 0xFFFF) */ \
 
 enum {
 #define M(_name, _id, _1, _2) MBOX_MSG_ ## _name = _id,
@@ -147,4 +157,37 @@ struct ready_msg_rsp {
u16sclk_feq;/* SCLK frequency */
 };
 
+/* Structure for requesting resource provisioning.
+ * 'modify' flag to be used when either requesting more
+ * or to detach part of a certain resource type.
+ * Rest of the fields specify how many of what type to
+ * be attached.
+ */
+struct rsrc_attach {
+   struct mbox_msghdr hdr;
+   u8   modify:1;
+   u8   npalf:1;
+   u8   nixlf:1;
+   u16  sso;
+   u16  ssow;
+   u16  timlfs;
+   u16  cptlfs;
+};
+
+/* Structure for relinquishing resources.
+ * 'partial' flag to be used when relinquishing all resources
+ * but only of a certain type. If not set, all resources of all
+ * types provisioned to the RVU function will be detached.
+ */
+struct rsrc_detach {
+   struct mbox_msghdr hdr;
+   u8 partial:1;
+   u8 npalf:1;
+   u8 nixlf:1;
+   u8 sso:1;
+   u8 ssow:1;
+   u8 timlfs:1;
+   u8 cptlfs:1;
+};
+
 #endif /* MBOX_H */
diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu.c 
b/drivers/net/ethernet/marvell/octeontx2/af/rvu.c
index 9539ab9..23e635c 100644
--- a/drivers/net/ethernet/marvell/octeontx2/af/rvu.c
+++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu.c
@@ -59,6 +59,41 @@ int rvu_poll_reg(struct rvu *rvu, u64 block, u64 offset, u64 
mask, bool zero)
return -EBUSY;
 }
 
+int rvu_alloc_rsrc(struct rsrc_bmap *rsrc)
+{
+   int id;
+
+   if (!rsrc->bmap)
+   return -EINVAL;
+
+   id = find_first_zero_bit(rsrc->bmap, rsrc->max);
+   if (id >= rsrc->max)
+   return -ENOSPC;
+
+   __set_bit(id, rsrc->bmap);
+
+   return id;
+}
+
+void rvu_free_rsrc(struct rsrc_bmap *rsrc, int id)
+{
+   if (!rsrc->bmap)
+   return;
+
+   __clear_bit(id, rsrc->bmap);
+}
+
+int rvu_rsrc_free_count(struct rsrc_bmap *rsrc)
+{
+   int used;
+
+   if (!rsrc->bmap)
+   return 0;
+
+   used = bitmap_weight(rsrc->bmap, rsrc->max);
+   return (rsrc->max - used);
+}
+
 int rvu_alloc_bitmap(struct rsrc_bmap *rsrc)
 {
rsrc->bmap = kcalloc(BITS_TO_LONGS(rsrc->max),
@@ -68,6 +103,78 @@ int rvu_alloc_bitmap(struct rsrc_bmap *rsrc)
return 0;
 }
 
+/* Convert BLOCK_TYPE_E to a BLOCK_ADDR_E.
+ * Some silicon variants of OcteonTX2 supports
+ * multiple blocks of same type.
+ *
+ * @pcifunc has to be zero when no LF is yet attached.
+ */
+int rvu_get_blkaddr(struct rvu *rvu, int blktype, u16 pcifunc)
+{
+   int devnum, blkaddr = -ENODEV;
+   u64 cfg, reg;
+   bool is_pf;
+
+   switch (blktype) {
+   case BLKTYPE_NPA:
+   blkaddr = BLKADDR_NPA;
+   goto exit;
+   case BLKTYPE_NIX:
+   /* For now assume NIX0 */
+   if (!pcifunc) {
+   blkaddr = BLKADDR_NIX0;
+   goto exit;
+   }
+   break;
+   case BLKTYPE_SSO:
+   blkaddr = BLKADDR_SSO;

[PATCH 12/15] octeontx2-af: Set RVU PFs to CGX LMACs mapping

2018-09-27 Thread sunil . kovvuri
From: Linu Cherian 

Each enabled CGX LMAC is considered a physical
interface and RVU PFs are mapped to these. VFs of these
SRIOV PFs will be virtual interfaces and share the CGX LMAC
with their PF.

This mapping info will be used later on for Rx/Tx pkt steering.

Signed-off-by: Linu Cherian 
Signed-off-by: Geetha sowjanya 
---
 drivers/net/ethernet/marvell/octeontx2/af/Makefile |  2 +-
 drivers/net/ethernet/marvell/octeontx2/af/cgx.c| 59 +
 drivers/net/ethernet/marvell/octeontx2/af/cgx.h| 15 +++-
 drivers/net/ethernet/marvell/octeontx2/af/rvu.c|  4 +
 drivers/net/ethernet/marvell/octeontx2/af/rvu.h| 12 +++
 .../net/ethernet/marvell/octeontx2/af/rvu_cgx.c| 97 ++
 6 files changed, 186 insertions(+), 3 deletions(-)
 create mode 100644 drivers/net/ethernet/marvell/octeontx2/af/rvu_cgx.c

diff --git a/drivers/net/ethernet/marvell/octeontx2/af/Makefile 
b/drivers/net/ethernet/marvell/octeontx2/af/Makefile
index 8646421..eaac264 100644
--- a/drivers/net/ethernet/marvell/octeontx2/af/Makefile
+++ b/drivers/net/ethernet/marvell/octeontx2/af/Makefile
@@ -7,4 +7,4 @@ obj-$(CONFIG_OCTEONTX2_MBOX) += octeontx2_mbox.o
 obj-$(CONFIG_OCTEONTX2_AF) += octeontx2_af.o
 
 octeontx2_mbox-y := mbox.o
-octeontx2_af-y := cgx.o rvu.o
+octeontx2_af-y := cgx.o rvu.o rvu_cgx.o
diff --git a/drivers/net/ethernet/marvell/octeontx2/af/cgx.c 
b/drivers/net/ethernet/marvell/octeontx2/af/cgx.c
index cfd80d2..06fd9fd 100644
--- a/drivers/net/ethernet/marvell/octeontx2/af/cgx.c
+++ b/drivers/net/ethernet/marvell/octeontx2/af/cgx.c
@@ -28,8 +28,12 @@ struct cgx {
void __iomem*reg_base;
struct pci_dev  *pdev;
u8  cgx_id;
+   u8  lmac_count;
+   struct list_headcgx_list;
 };
 
+static LIST_HEAD(cgx_list);
+
 /* Supported devices */
 static const struct pci_device_id cgx_id_table[] = {
{ PCI_DEVICE(PCI_VENDOR_ID_CAVIUM, PCI_DEVID_OCTEONTX2_CGX) },
@@ -41,6 +45,53 @@ MODULE_DESCRIPTION(DRV_STRING);
 MODULE_LICENSE("GPL v2");
 MODULE_DEVICE_TABLE(pci, cgx_id_table);
 
+static u64 cgx_read(struct cgx *cgx, u64 lmac, u64 offset)
+{
+   return readq(cgx->reg_base + (lmac << 18) + offset);
+}
+
+int cgx_get_cgx_cnt(void)
+{
+   struct cgx *cgx_dev;
+   int count = 0;
+
+   list_for_each_entry(cgx_dev, &cgx_list, cgx_list)
+   count++;
+
+   return count;
+}
+EXPORT_SYMBOL(cgx_get_cgx_cnt);
+
+int cgx_get_lmac_cnt(void *cgxd)
+{
+   struct cgx *cgx = cgxd;
+
+   if (!cgx)
+   return -ENODEV;
+
+   return cgx->lmac_count;
+}
+EXPORT_SYMBOL(cgx_get_lmac_cnt);
+
+void *cgx_get_pdata(int cgx_id)
+{
+   struct cgx *cgx_dev;
+
+   list_for_each_entry(cgx_dev, &cgx_list, cgx_list) {
+   if (cgx_dev->cgx_id == cgx_id)
+   return cgx_dev;
+   }
+   return NULL;
+}
+EXPORT_SYMBOL(cgx_get_pdata);
+
+static void cgx_lmac_init(struct cgx *cgx)
+{
+   cgx->lmac_count = cgx_read(cgx, 0, CGXX_CMRX_RX_LMACS) & 0x7;
+   if (cgx->lmac_count > MAX_LMAC_PER_CGX)
+   cgx->lmac_count = MAX_LMAC_PER_CGX;
+}
+
 static int cgx_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 {
int err;
@@ -75,9 +126,14 @@ static int cgx_probe(struct pci_dev *pdev, const struct 
pci_device_id *id)
goto err_release_regions;
}
 
+   list_add(&cgx->cgx_list, &cgx_list);
+   cgx->cgx_id = cgx_get_cgx_cnt() - 1;
+   cgx_lmac_init(cgx);
+
return 0;
 
 err_release_regions:
+   list_del(&cgx->cgx_list);
pci_release_regions(pdev);
 err_disable_device:
pci_disable_device(pdev);
@@ -87,6 +143,9 @@ static int cgx_probe(struct pci_dev *pdev, const struct 
pci_device_id *id)
 
 static void cgx_remove(struct pci_dev *pdev)
 {
+   struct cgx *cgx = pci_get_drvdata(pdev);
+
+   list_del(&cgx->cgx_list);
pci_release_regions(pdev);
pci_disable_device(pdev);
pci_set_drvdata(pdev, NULL);
diff --git a/drivers/net/ethernet/marvell/octeontx2/af/cgx.h 
b/drivers/net/ethernet/marvell/octeontx2/af/cgx.h
index a7d4b39..acdc16e 100644
--- a/drivers/net/ethernet/marvell/octeontx2/af/cgx.h
+++ b/drivers/net/ethernet/marvell/octeontx2/af/cgx.h
@@ -12,11 +12,22 @@
 #define CGX_H
 
  /* PCI device IDs */
-#definePCI_DEVID_OCTEONTX2_CGX 0xA059
+#definePCI_DEVID_OCTEONTX2_CGX 0xA059
 
 /* PCI BAR nos */
-#define PCI_CFG_REG_BAR_NUM0
+#define PCI_CFG_REG_BAR_NUM0
+
+#define MAX_CGX3
+#define MAX_LMAC_PER_CGX   4
+#define CGX_OFFSET(x)  ((x) * MAX_LMAC_PER_CGX)
+
+/* Registers */
+#define CGXX_CMRX_RX_ID_MAP0x060
+#define CGXX_CMRX_RX_LMACS 0x128
 
 extern struct pci_driver cgx_driver;
 
+int cgx_get_cgx_cnt(void);
+int cgx_get_lmac_cnt(void *cgxd);
+void *cgx_get_pdata(int cgx_id);
 

[PATCH 03/15] octeontx2-af: Gather RVU blocks HW info

2018-09-27 Thread sunil . kovvuri
From: Sunil Goutham 

This patch gathers the NPA/NIX/SSO/SSOW/TIM/CPT RVU blocks'
HW info, such as the number of LFs. Important register offsets
are saved for later use to avoid code duplication for each block.
A bitmap is allocated for each of the blocks, which will later
be used to allocate an LF for an RVU PF/VF.

Also added RVU NIX/NPA block registers and a few registers
of other blocks.

Signed-off-by: Sunil Goutham 
---
 drivers/net/ethernet/marvell/octeontx2/af/rvu.c| 167 +++
 drivers/net/ethernet/marvell/octeontx2/af/rvu.h|  21 ++
 .../net/ethernet/marvell/octeontx2/af/rvu_reg.h| 333 -
 3 files changed, 517 insertions(+), 4 deletions(-)

diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu.c 
b/drivers/net/ethernet/marvell/octeontx2/af/rvu.c
index d40fabf..fa5f40b 100644
--- a/drivers/net/ethernet/marvell/octeontx2/af/rvu.c
+++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu.c
@@ -57,6 +57,15 @@ int rvu_poll_reg(struct rvu *rvu, u64 block, u64 offset, u64 
mask, bool zero)
return -EBUSY;
 }
 
+int rvu_alloc_bitmap(struct rsrc_bmap *rsrc)
+{
+   rsrc->bmap = kcalloc(BITS_TO_LONGS(rsrc->max),
+sizeof(long), GFP_KERNEL);
+   if (!rsrc->bmap)
+   return -ENOMEM;
+   return 0;
+}
+
 static void rvu_check_block_implemented(struct rvu *rvu)
 {
struct rvu_hwinfo *hw = rvu->hw;
@@ -98,6 +107,157 @@ static void rvu_reset_all_blocks(struct rvu *rvu)
rvu_block_reset(rvu, BLKADDR_NDC2, NDC_AF_BLK_RST);
 }
 
+static void rvu_free_hw_resources(struct rvu *rvu)
+{
+   struct rvu_hwinfo *hw = rvu->hw;
+   struct rvu_block *block;
+   int id;
+
+   /* Free all bitmaps */
+   for (id = 0; id < BLK_COUNT; id++) {
+   block = &hw->block[id];
+   kfree(block->lf.bmap);
+   }
+}
+
+static int rvu_setup_hw_resources(struct rvu *rvu)
+{
+   struct rvu_hwinfo *hw = rvu->hw;
+   struct rvu_block *block;
+   int err;
+   u64 cfg;
+
+   /* Get HW supported max RVU PF & VF count */
+   cfg = rvu_read64(rvu, BLKADDR_RVUM, RVU_PRIV_CONST);
+   hw->total_pfs = (cfg >> 32) & 0xFF;
+   hw->total_vfs = (cfg >> 20) & 0xFFF;
+   hw->max_vfs_per_pf = (cfg >> 40) & 0xFF;
+
+   /* Init NPA LF's bitmap */
+   block = &hw->block[BLKADDR_NPA];
+   if (!block->implemented)
+   goto nix;
+   cfg = rvu_read64(rvu, BLKADDR_NPA, NPA_AF_CONST);
+   block->lf.max = (cfg >> 16) & 0xFFF;
+   block->addr = BLKADDR_NPA;
+   block->lfshift = 8;
+   block->lookup_reg = NPA_AF_RVU_LF_CFG_DEBUG;
+   block->pf_lfcnt_reg = RVU_PRIV_PFX_NPA_CFG;
+   block->vf_lfcnt_reg = RVU_PRIV_HWVFX_NPA_CFG;
+   block->lfcfg_reg = NPA_PRIV_LFX_CFG;
+   block->msixcfg_reg = NPA_PRIV_LFX_INT_CFG;
+   block->lfreset_reg = NPA_AF_LF_RST;
+   sprintf(block->name, "NPA");
+   err = rvu_alloc_bitmap(&block->lf);
+   if (err)
+   return err;
+
+nix:
+   /* Init NIX LF's bitmap */
+   block = &hw->block[BLKADDR_NIX0];
+   if (!block->implemented)
+   goto sso;
+   cfg = rvu_read64(rvu, BLKADDR_NIX0, NIX_AF_CONST2);
+   block->lf.max = cfg & 0xFFF;
+   block->addr = BLKADDR_NIX0;
+   block->lfshift = 8;
+   block->lookup_reg = NIX_AF_RVU_LF_CFG_DEBUG;
+   block->pf_lfcnt_reg = RVU_PRIV_PFX_NIX_CFG;
+   block->vf_lfcnt_reg = RVU_PRIV_HWVFX_NIX_CFG;
+   block->lfcfg_reg = NIX_PRIV_LFX_CFG;
+   block->msixcfg_reg = NIX_PRIV_LFX_INT_CFG;
+   block->lfreset_reg = NIX_AF_LF_RST;
+   sprintf(block->name, "NIX");
+   err = rvu_alloc_bitmap(&block->lf);
+   if (err)
+   return err;
+
+sso:
+   /* Init SSO group's bitmap */
+   block = &hw->block[BLKADDR_SSO];
+   if (!block->implemented)
+   goto ssow;
+   cfg = rvu_read64(rvu, BLKADDR_SSO, SSO_AF_CONST);
+   block->lf.max = cfg & 0x;
+   block->addr = BLKADDR_SSO;
+   block->multislot = true;
+   block->lfshift = 3;
+   block->lookup_reg = SSO_AF_RVU_LF_CFG_DEBUG;
+   block->pf_lfcnt_reg = RVU_PRIV_PFX_SSO_CFG;
+   block->vf_lfcnt_reg = RVU_PRIV_HWVFX_SSO_CFG;
+   block->lfcfg_reg = SSO_PRIV_LFX_HWGRP_CFG;
+   block->msixcfg_reg = SSO_PRIV_LFX_HWGRP_INT_CFG;
+   block->lfreset_reg = SSO_AF_LF_HWGRP_RST;
+   sprintf(block->name, "SSO GROUP");
+   err = rvu_alloc_bitmap(&block->lf);
+   if (err)
+   return err;
+
+ssow:
+   /* Init SSO workslot's bitmap */
+   block = &hw->block[BLKADDR_SSOW];
+   if (!block->implemented)
+   goto tim;
+   block->lf.max = (cfg >> 56) & 0xFF;
+   block->addr = BLKADDR_SSOW;
+   block->multislot = true;
+   block->lfshift = 3;
+   block->lookup_reg = SSOW_AF_RVU_LF_HWS_CFG_DEBUG;
+   block->pf_lfcnt_reg = RVU_PRIV_PFX_SSOW_CFG;
+   block->vf_lfcnt_reg = RVU_PRIV_HWVFX_SSOW_CFG;
+   block->lfcfg_reg = SSO

[PATCH 01/15] octeontx2-af: Add Marvell OcteonTX2 RVU AF driver

2018-09-27 Thread sunil . kovvuri
From: Sunil Goutham 

This patch adds a basic template for Marvell OcteonTX2's
resource virtualization unit (RVU) admin function (AF)
driver: just the driver registration and probe.

Signed-off-by: Sunil Goutham 
---
 drivers/net/ethernet/marvell/Kconfig   |   3 +
 drivers/net/ethernet/marvell/Makefile  |   1 +
 drivers/net/ethernet/marvell/octeontx2/Kconfig |  12 ++
 drivers/net/ethernet/marvell/octeontx2/Makefile|   6 +
 drivers/net/ethernet/marvell/octeontx2/af/Makefile |   8 ++
 drivers/net/ethernet/marvell/octeontx2/af/rvu.c| 126 +
 drivers/net/ethernet/marvell/octeontx2/af/rvu.h|  31 +
 7 files changed, 187 insertions(+)
 create mode 100644 drivers/net/ethernet/marvell/octeontx2/Kconfig
 create mode 100644 drivers/net/ethernet/marvell/octeontx2/Makefile
 create mode 100644 drivers/net/ethernet/marvell/octeontx2/af/Makefile
 create mode 100644 drivers/net/ethernet/marvell/octeontx2/af/rvu.c
 create mode 100644 drivers/net/ethernet/marvell/octeontx2/af/rvu.h

diff --git a/drivers/net/ethernet/marvell/Kconfig 
b/drivers/net/ethernet/marvell/Kconfig
index f33fd22..3238aa7 100644
--- a/drivers/net/ethernet/marvell/Kconfig
+++ b/drivers/net/ethernet/marvell/Kconfig
@@ -167,4 +167,7 @@ config SKY2_DEBUG
 
  If unsure, say N.
 
+
+source "drivers/net/ethernet/marvell/octeontx2/Kconfig"
+
 endif # NET_VENDOR_MARVELL
diff --git a/drivers/net/ethernet/marvell/Makefile 
b/drivers/net/ethernet/marvell/Makefile
index 55d4d10..89dea72 100644
--- a/drivers/net/ethernet/marvell/Makefile
+++ b/drivers/net/ethernet/marvell/Makefile
@@ -11,3 +11,4 @@ obj-$(CONFIG_MVPP2) += mvpp2/
 obj-$(CONFIG_PXA168_ETH) += pxa168_eth.o
 obj-$(CONFIG_SKGE) += skge.o
 obj-$(CONFIG_SKY2) += sky2.o
+obj-y  += octeontx2/
diff --git a/drivers/net/ethernet/marvell/octeontx2/Kconfig 
b/drivers/net/ethernet/marvell/octeontx2/Kconfig
new file mode 100644
index 000..9743502
--- /dev/null
+++ b/drivers/net/ethernet/marvell/octeontx2/Kconfig
@@ -0,0 +1,12 @@
+#
+# Marvell OcteonTX2 drivers configuration
+#
+
+config OCTEONTX2_AF
+   tristate "Marvell OcteonTX2 RVU Admin Function driver"
+   depends on ARM64 && PCI
+   help
+ This driver supports Marvell's OcteonTX2 Resource Virtualization
+ Unit's admin function manager which manages all RVU HW resources
+ and provides a medium to other PF/VFs to configure HW. Should be
+ enabled for other RVU device drivers to work.
diff --git a/drivers/net/ethernet/marvell/octeontx2/Makefile 
b/drivers/net/ethernet/marvell/octeontx2/Makefile
new file mode 100644
index 000..e579dcd
--- /dev/null
+++ b/drivers/net/ethernet/marvell/octeontx2/Makefile
@@ -0,0 +1,6 @@
+# SPDX-License-Identifier: GPL-2.0
+#
+# Makefile for Marvell OcteonTX2 device drivers.
+#
+
+obj-$(CONFIG_OCTEONTX2_AF) += af/
diff --git a/drivers/net/ethernet/marvell/octeontx2/af/Makefile 
b/drivers/net/ethernet/marvell/octeontx2/af/Makefile
new file mode 100644
index 000..dacbd16
--- /dev/null
+++ b/drivers/net/ethernet/marvell/octeontx2/af/Makefile
@@ -0,0 +1,8 @@
+# SPDX-License-Identifier: GPL-2.0
+#
+# Makefile for Marvell's OcteonTX2 RVU Admin Function driver
+#
+
+obj-$(CONFIG_OCTEONTX2_AF) += octeontx2_af.o
+
+octeontx2_af-y := rvu.o
diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu.c 
b/drivers/net/ethernet/marvell/octeontx2/af/rvu.c
new file mode 100644
index 000..5af4da6
--- /dev/null
+++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu.c
@@ -0,0 +1,126 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Marvell OcteonTx2 RVU Admin Function driver
+ *
+ * Copyright (C) 2018 Marvell International Ltd.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include "rvu.h"
+
+#define DRV_NAME   "octeontx2-af"
+#define DRV_STRING  "Marvell OcteonTX2 RVU Admin Function Driver"
+#define DRV_VERSION"1.0"
+
+/* Supported devices */
+static const struct pci_device_id rvu_id_table[] = {
+   { PCI_DEVICE(PCI_VENDOR_ID_CAVIUM, PCI_DEVID_OCTEONTX2_RVU_AF) },
+   { 0, }  /* end of table */
+};
+
+MODULE_AUTHOR("Marvell International Ltd.");
+MODULE_DESCRIPTION(DRV_STRING);
+MODULE_LICENSE("GPL v2");
+MODULE_VERSION(DRV_VERSION);
+MODULE_DEVICE_TABLE(pci, rvu_id_table);
+
+static int rvu_probe(struct pci_dev *pdev, const struct pci_device_id *id)
+{
+   struct device *dev = &pdev->dev;
+   struct rvu *rvu;
+   interr;
+
+   rvu = devm_kzalloc(dev, sizeof(*rvu), GFP_KERNEL);
+   if (!rvu)
+   return -ENOMEM;
+
+   pci_set_drvdata(pdev, rvu);
+   rvu->pdev = pdev;
+   rvu->dev = &pdev->dev;
+
+   err = pci_enable_device(pdev);
+   if (err) {
+   dev_err(dev, "Failed to enable PCI device\n");
+   goto err_free

[PATCH 02/15] octeontx2-af: Reset all RVU blocks

2018-09-27 Thread sunil . kovvuri
From: Sunil Goutham 

Go through all BLKADDRs and check which ones are implemented
on this silicon and do a HW reset of each implemented block.
Also added all RVU AF and PF register offsets.

Signed-off-by: Sunil Goutham 
---
 drivers/net/ethernet/marvell/octeontx2/af/rvu.c|  78 ++
 drivers/net/ethernet/marvell/octeontx2/af/rvu.h|  37 +++
 .../net/ethernet/marvell/octeontx2/af/rvu_reg.h| 112 +
 .../net/ethernet/marvell/octeontx2/af/rvu_struct.h |  34 +++
 4 files changed, 261 insertions(+)
 create mode 100644 drivers/net/ethernet/marvell/octeontx2/af/rvu_reg.h
 create mode 100644 drivers/net/ethernet/marvell/octeontx2/af/rvu_struct.h

diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu.c 
b/drivers/net/ethernet/marvell/octeontx2/af/rvu.c
index 5af4da6..d40fabf 100644
--- a/drivers/net/ethernet/marvell/octeontx2/af/rvu.c
+++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu.c
@@ -16,6 +16,7 @@
 #include 
 
 #include "rvu.h"
+#include "rvu_reg.h"
 
 #define DRV_NAME   "octeontx2-af"
 #define DRV_STRING  "Marvell OcteonTX2 RVU Admin Function Driver"
@@ -33,6 +34,70 @@ MODULE_LICENSE("GPL v2");
 MODULE_VERSION(DRV_VERSION);
 MODULE_DEVICE_TABLE(pci, rvu_id_table);
 
+/* Poll a RVU block's register 'offset', for a 'zero'
+ * or 'nonzero' at bits specified by 'mask'
+ */
+int rvu_poll_reg(struct rvu *rvu, u64 block, u64 offset, u64 mask, bool zero)
+{
+   void __iomem *reg;
+   int timeout = 100;
+   u64 reg_val;
+
+   reg = rvu->afreg_base + ((block << 28) | offset);
+   while (timeout) {
+   reg_val = readq(reg);
+   if (zero && !(reg_val & mask))
+   return 0;
+   if (!zero && (reg_val & mask))
+   return 0;
+   udelay(1);
+   cpu_relax();
+   timeout--;
+   }
+   return -EBUSY;
+}
+
+static void rvu_check_block_implemented(struct rvu *rvu)
+{
+   struct rvu_hwinfo *hw = rvu->hw;
+   struct rvu_block *block;
+   int blkid;
+   u64 cfg;
+
+   /* For each block check if 'implemented' bit is set */
+   for (blkid = 0; blkid < BLK_COUNT; blkid++) {
+   block = &hw->block[blkid];
+   cfg = rvupf_read64(rvu, RVU_PF_BLOCK_ADDRX_DISC(blkid));
+   if (cfg & BIT_ULL(11))
+   block->implemented = true;
+   }
+}
+
+static void rvu_block_reset(struct rvu *rvu, int blkaddr, u64 rst_reg)
+{
+   struct rvu_block *block = &rvu->hw->block[blkaddr];
+
+   if (!block->implemented)
+   return;
+
+   rvu_write64(rvu, blkaddr, rst_reg, BIT_ULL(0));
+   rvu_poll_reg(rvu, blkaddr, rst_reg, BIT_ULL(63), true);
+}
+
+static void rvu_reset_all_blocks(struct rvu *rvu)
+{
+   /* Do a HW reset of all RVU blocks */
+   rvu_block_reset(rvu, BLKADDR_NPA, NPA_AF_BLK_RST);
+   rvu_block_reset(rvu, BLKADDR_NIX0, NIX_AF_BLK_RST);
+   rvu_block_reset(rvu, BLKADDR_NPC, NPC_AF_BLK_RST);
+   rvu_block_reset(rvu, BLKADDR_SSO, SSO_AF_BLK_RST);
+   rvu_block_reset(rvu, BLKADDR_TIM, TIM_AF_BLK_RST);
+   rvu_block_reset(rvu, BLKADDR_CPT0, CPT_AF_BLK_RST);
+   rvu_block_reset(rvu, BLKADDR_NDC0, NDC_AF_BLK_RST);
+   rvu_block_reset(rvu, BLKADDR_NDC1, NDC_AF_BLK_RST);
+   rvu_block_reset(rvu, BLKADDR_NDC2, NDC_AF_BLK_RST);
+}
+
 static int rvu_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 {
struct device *dev = &pdev->dev;
@@ -43,6 +108,12 @@ static int rvu_probe(struct pci_dev *pdev, const struct 
pci_device_id *id)
if (!rvu)
return -ENOMEM;
 
+   rvu->hw = devm_kzalloc(dev, sizeof(struct rvu_hwinfo), GFP_KERNEL);
+   if (!rvu->hw) {
+   devm_kfree(dev, rvu);
+   return -ENOMEM;
+   }
+
pci_set_drvdata(pdev, rvu);
rvu->pdev = pdev;
rvu->dev = &pdev->dev;
@@ -80,6 +151,11 @@ static int rvu_probe(struct pci_dev *pdev, const struct 
pci_device_id *id)
goto err_release_regions;
}
 
+   /* Check which blocks the HW supports */
+   rvu_check_block_implemented(rvu);
+
+   rvu_reset_all_blocks(rvu);
+
return 0;
 
 err_release_regions:
@@ -88,6 +164,7 @@ static int rvu_probe(struct pci_dev *pdev, const struct 
pci_device_id *id)
pci_disable_device(pdev);
 err_freemem:
pci_set_drvdata(pdev, NULL);
+   devm_kfree(&pdev->dev, rvu->hw);
devm_kfree(dev, rvu);
return err;
 }
@@ -100,6 +177,7 @@ static void rvu_remove(struct pci_dev *pdev)
pci_disable_device(pdev);
pci_set_drvdata(pdev, NULL);
 
+   devm_kfree(&pdev->dev, rvu->hw);
devm_kfree(&pdev->dev, rvu);
 }
 
diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu.h 
b/drivers/net/ethernet/marvell/octeontx2/af/rvu.h
index 4a4b0ad..e2c54d0 100644
--- a/drivers/net/ethernet/marvell/octeontx2/af/rvu.h
+++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu.h
@

[PATCH 04/15] octeontx2-af: Add mailbox support infra

2018-09-27 Thread sunil . kovvuri
From: Sunil Goutham 

This patch adds mailbox support infrastructure APIs.
Each RVU device has a dedicated 64KB mailbox region
shared with its peer for communication. RVU AF has
a separate mailbox region shared with each of the RVU PFs,
and an RVU PF has a separate region shared with each of
its VFs.

This set of APIs is used by this driver (RVU AF) and
other RVU PF/VF drivers, e.g. netdev, crypto, etc.

Signed-off-by: Aleksey Makarov 
Signed-off-by: Sunil Goutham 
Signed-off-by: Lukasz Bartosik 
---
 drivers/net/ethernet/marvell/octeontx2/Kconfig |   4 +
 drivers/net/ethernet/marvell/octeontx2/af/Makefile |   2 +
 drivers/net/ethernet/marvell/octeontx2/af/mbox.c   | 303 +
 drivers/net/ethernet/marvell/octeontx2/af/mbox.h   | 142 ++
 .../net/ethernet/marvell/octeontx2/af/rvu_reg.h|   4 +
 5 files changed, 455 insertions(+)
 create mode 100644 drivers/net/ethernet/marvell/octeontx2/af/mbox.c
 create mode 100644 drivers/net/ethernet/marvell/octeontx2/af/mbox.h

diff --git a/drivers/net/ethernet/marvell/octeontx2/Kconfig 
b/drivers/net/ethernet/marvell/octeontx2/Kconfig
index 9743502..8002f9c 100644
--- a/drivers/net/ethernet/marvell/octeontx2/Kconfig
+++ b/drivers/net/ethernet/marvell/octeontx2/Kconfig
@@ -2,8 +2,12 @@
 # Marvell OcteonTX2 drivers configuration
 #
 
+config OCTEONTX2_MBOX
+tristate
+
 config OCTEONTX2_AF
tristate "Marvell OcteonTX2 RVU Admin Function driver"
+   select OCTEONTX2_MBOX
depends on ARM64 && PCI
help
  This driver supports Marvell's OcteonTX2 Resource Virtualization
diff --git a/drivers/net/ethernet/marvell/octeontx2/af/Makefile 
b/drivers/net/ethernet/marvell/octeontx2/af/Makefile
index dacbd16..ac17cb9 100644
--- a/drivers/net/ethernet/marvell/octeontx2/af/Makefile
+++ b/drivers/net/ethernet/marvell/octeontx2/af/Makefile
@@ -3,6 +3,8 @@
 # Makefile for Marvell's OcteonTX2 RVU Admin Function driver
 #
 
+obj-$(CONFIG_OCTEONTX2_MBOX) += octeontx2_mbox.o
 obj-$(CONFIG_OCTEONTX2_AF) += octeontx2_af.o
 
+octeontx2_mbox-y := mbox.o
 octeontx2_af-y := rvu.o
diff --git a/drivers/net/ethernet/marvell/octeontx2/af/mbox.c 
b/drivers/net/ethernet/marvell/octeontx2/af/mbox.c
new file mode 100644
index 000..0722fa4
--- /dev/null
+++ b/drivers/net/ethernet/marvell/octeontx2/af/mbox.c
@@ -0,0 +1,303 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Marvell OcteonTx2 RVU Admin Function driver
+ *
+ * Copyright (C) 2018 Marvell International Ltd.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include 
+#include 
+#include 
+
+#include "rvu_reg.h"
+#include "mbox.h"
+
+static const u16 msgs_offset = ALIGN(sizeof(struct mbox_hdr), MBOX_MSG_ALIGN);
+
+void otx2_mbox_reset(struct otx2_mbox *mbox, int devid)
+{
+   struct otx2_mbox_dev *mdev = &mbox->dev[devid];
+   struct mbox_hdr *tx_hdr =
+   (struct mbox_hdr *)(mdev->mbase  + mbox->tx_start);
+   struct mbox_hdr *rx_hdr =
+   (struct mbox_hdr *)(mdev->mbase  + mbox->rx_start);
+
+   spin_lock(&mdev->mbox_lock);
+   mdev->msg_size = 0;
+   mdev->rsp_size = 0;
+   tx_hdr->num_msgs = 0;
+   rx_hdr->num_msgs = 0;
+   spin_unlock(&mdev->mbox_lock);
+}
+EXPORT_SYMBOL(otx2_mbox_reset);
+
+void otx2_mbox_destroy(struct otx2_mbox *mbox)
+{
+   mbox->reg_base = NULL;
+   mbox->hwbase = NULL;
+
+   kfree(mbox->dev);
+   mbox->dev = NULL;
+}
+EXPORT_SYMBOL(otx2_mbox_destroy);
+
+int otx2_mbox_init(struct otx2_mbox *mbox, void *hwbase, struct pci_dev *pdev,
+  void *reg_base, int direction, int ndevs)
+{
+   int devid;
+   struct otx2_mbox_dev *mdev;
+
+   switch (direction) {
+   case MBOX_DIR_AFPF:
+   case MBOX_DIR_PFVF:
+   mbox->tx_start = MBOX_DOWN_TX_START;
+   mbox->rx_start = MBOX_DOWN_RX_START;
+   mbox->tx_size  = MBOX_DOWN_TX_SIZE;
+   mbox->rx_size  = MBOX_DOWN_RX_SIZE;
+   break;
+   case MBOX_DIR_PFAF:
+   case MBOX_DIR_VFPF:
+   mbox->tx_start = MBOX_DOWN_RX_START;
+   mbox->rx_start = MBOX_DOWN_TX_START;
+   mbox->tx_size  = MBOX_DOWN_RX_SIZE;
+   mbox->rx_size  = MBOX_DOWN_TX_SIZE;
+   break;
+   case MBOX_DIR_AFPF_UP:
+   case MBOX_DIR_PFVF_UP:
+   mbox->tx_start = MBOX_UP_TX_START;
+   mbox->rx_start = MBOX_UP_RX_START;
+   mbox->tx_size  = MBOX_UP_TX_SIZE;
+   mbox->rx_size  = MBOX_UP_RX_SIZE;
+   break;
+   case MBOX_DIR_PFAF_UP:
+   case MBOX_DIR_VFPF_UP:
+   mbox->tx_start = MBOX_UP_RX_START;
+   mbox->rx_start = MBOX_UP_TX_START;
+   mbox->tx_size  = MBOX_UP_RX_SIZE;
+   mbox->rx_size  = MBOX_UP_TX_SIZE;
+   break;
+   default:
+

[PATCH 00/15] octeontx2-af: Add RVU Admin Function driver

2018-09-27 Thread sunil . kovvuri
From: Sunil Goutham 

Resource virtualization unit (RVU) on Marvell's OcteonTX2 SOC maps HW
resources from the network, crypto and other functional blocks into
PCI-compatible physical and virtual functions. Each functional block
again has multiple local functions (LFs) for provisioning to PCI devices.
RVU supports multiple PCIe SRIOV physical functions (PFs) and virtual
functions (VFs). PF0 is called the administrative / admin function (AF)
and has privileges to provision RVU functional block's LFs to each of the
PF/VF.

RVU managed networking functional blocks
 - Network pool allocator (NPA)
 - Network interface controller (NIX)
 - Network parser CAM (NPC)
 - Schedule/Synchronize/Order unit (SSO)

RVU managed non-networking functional blocks
 - Crypto accelerator (CPT)
 - Scheduled timers unit (TIM)
 - Schedule/Synchronize/Order unit (SSO)
   Used for both networking and non networking usecases
 - Compression (upcoming in future variants of the silicons)

Resource provisioning examples
 - A PF/VF with NIX-LF & NPA-LF resources works as a pure network device
 - A PF/VF with CPT-LF resource works as a pure crypto offload device.

This admin function driver neither receives any data nor processes it,
i.e. it does no I/O; it is a configuration-only driver.

PF/VFs communicate with AF via a shared memory region (mailbox). Upon
receiving requests from PF/VF, AF does resource provisioning and other
HW configuration. AF is always attached to host, but PF/VFs may be used
by host kernel itself, or attached to VMs or to userspace applications
like DPDK etc. So AF has to handle provisioning/configuration requests
sent by any device from any domain.

This patch series adds logic for the following
 - RVU AF driver with functional blocks provisioning support.
 - Mailbox infrastructure for communication between AF and PFs.
 - CGX (MAC controller) driver which communicates with firmware for
   managing physical ethernet interfaces. AF collects info from this
   driver and forwards the same to the PF/VFs using these interfaces.

This is the first set of patches out of 80+ patches.

Aleksey Makarov (1):
  octeontx2-af: Convert mbox msg id check to a macro

Geetha sowjanya (1):
  octeontx2-af: Reconfig MSIX base with IOVA

Linu Cherian (3):
  octeontx2-af: Set RVU PFs to CGX LMACs mapping
  octeontx2-af: Add support for CGX link management
  octeontx2-af: Register for CGX lmac events

Sunil Goutham (10):
  octeontx2-af: Add Marvell OcteonTX2 RVU AF driver
  octeontx2-af: Reset all RVU blocks
  octeontx2-af: Gather RVU blocks HW info
  octeontx2-af: Add mailbox support infra
  octeontx2-af: Add mailbox IRQ and msg handlers
  octeontx2-af: Scan blocks for LFs provisioned to PF/VF
  octeontx2-af: Add RVU block LF provisioning support
  octeontx2-af: Configure block LF's MSIX vector offset
  octeontx2-af: Add Marvell OcteonTX2 CGX driver
  MAINTAINERS: Add entry for Marvell OcteonTX2 Admin Function driver

 MAINTAINERS|9 +
 drivers/net/ethernet/marvell/Kconfig   |3 +
 drivers/net/ethernet/marvell/Makefile  |1 +
 drivers/net/ethernet/marvell/octeontx2/Kconfig |   16 +
 drivers/net/ethernet/marvell/octeontx2/Makefile|6 +
 drivers/net/ethernet/marvell/octeontx2/af/Makefile |   10 +
 drivers/net/ethernet/marvell/octeontx2/af/cgx.c|  515 ++
 drivers/net/ethernet/marvell/octeontx2/af/cgx.h|   65 +
 .../net/ethernet/marvell/octeontx2/af/cgx_fw_if.h  |  225 +++
 drivers/net/ethernet/marvell/octeontx2/af/mbox.c   |  303 
 drivers/net/ethernet/marvell/octeontx2/af/mbox.h   |  211 +++
 drivers/net/ethernet/marvell/octeontx2/af/rvu.c| 1637 
 drivers/net/ethernet/marvell/octeontx2/af/rvu.h|  158 ++
 .../net/ethernet/marvell/octeontx2/af/rvu_cgx.c|  194 +++
 .../net/ethernet/marvell/octeontx2/af/rvu_reg.h|  441 ++
 .../net/ethernet/marvell/octeontx2/af/rvu_struct.h |   74 +
 16 files changed, 3868 insertions(+)
 create mode 100644 drivers/net/ethernet/marvell/octeontx2/Kconfig
 create mode 100644 drivers/net/ethernet/marvell/octeontx2/Makefile
 create mode 100644 drivers/net/ethernet/marvell/octeontx2/af/Makefile
 create mode 100644 drivers/net/ethernet/marvell/octeontx2/af/cgx.c
 create mode 100644 drivers/net/ethernet/marvell/octeontx2/af/cgx.h
 create mode 100644 drivers/net/ethernet/marvell/octeontx2/af/cgx_fw_if.h
 create mode 100644 drivers/net/ethernet/marvell/octeontx2/af/mbox.c
 create mode 100644 drivers/net/ethernet/marvell/octeontx2/af/mbox.h
 create mode 100644 drivers/net/ethernet/marvell/octeontx2/af/rvu.c
 create mode 100644 drivers/net/ethernet/marvell/octeontx2/af/rvu.h
 create mode 100644 drivers/net/ethernet/marvell/octeontx2/af/rvu_cgx.c
 create mode 100644 drivers/net/ethernet/marvell/octeontx2/af/rvu_reg.h
 create mode 100644 drivers/net/ethernet/marvell/octeontx2/af/rvu_struct.h

-- 
2.7.4



Re: [Patch net-next] net_sched: fix an extack message in tcf_block_find()

2018-09-27 Thread David Ahern
On 9/27/18 3:36 PM, Cong Wang wrote:
> On Thu, Sep 27, 2018 at 2:16 PM Eric Dumazet  wrote:
>>
>>
>>
>> On 09/27/2018 01:42 PM, Cong Wang wrote:
>>> It is clearly a copy-n-paste.
>>>
>>> Signed-off-by: Cong Wang 
>>> ---
>>>  net/sched/cls_api.c | 2 +-
>>>  1 file changed, 1 insertion(+), 1 deletion(-)
>>>
>>> diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
>>> index 3de47e99b788..8dd7f8af6d54 100644
>>> --- a/net/sched/cls_api.c
>>> +++ b/net/sched/cls_api.c
>>> @@ -655,7 +655,7 @@ static struct tcf_block *tcf_block_find(struct net 
>>> *net, struct Qdisc **q,
>>>
>>>   *q = qdisc_refcount_inc_nz(*q);
>>>   if (!*q) {
>>> - NL_SET_ERR_MSG(extack, "Parent Qdisc doesn't exists");
>>> + NL_SET_ERR_MSG(extack, "Can't increase Qdisc 
>>> refcount");
>>
>>
>> I am not sure it was a copy-n-paste.
> 
> 
> Make sure you knew there is an exactly same extack message
> (with a same English grammar error).
> 
> 
>>
>> Qdisc refcount business is kernel internal.
> 
> Yeah, but the extack message is already there, this patch doesn't add
> any new extack. Or you are suggesting we should remove it?

IMO the message grammar should be fixed, but the content is correct --
ie, parent qdisc does not exist.



Re: [PATCH net] net/ncsi: Extend NC-SI Netlink interface to allow user space to send NC-SI command

2018-09-27 Thread Samuel Mendoza-Jonas
On Thu, 2018-09-27 at 21:08 +, justin.l...@dell.com wrote:
> The new command (NCSI_CMD_SEND_CMD) is added to allow user space application 
> to send NC-SI command to the network card.
> Also, add a new attribute (NCSI_ATTR_DATA) for transferring request and 
> response.
> 
> The work flow is as below. 
> 
> Request:
> User space application -> Netlink interface (msg)
>   -> new Netlink handler - 
> ncsi_send_cmd_nl()
>   -> ncsi_xmit_cmd()
> Response:
> Response received - ncsi_rcv_rsp() -> internal response handler - 
> ncsi_rsp_handler_xxx()
> -> 
> ncsi_rsp_handler_netlink()
> -> 
> ncsi_send_netlink_rsp ()
> -> 
> Netlink interface (msg)
> -> 
> user space application
> Command timeout - ncsi_request_timeout() -> ncsi_send_netlink_timeout ()
>   
>   -> Netlink interface (msg with zero data length)
>   
>   -> user space application
> Error:
> Error detected -> ncsi_send_netlink_err () -> Netlink interface (err msg)
>   
>  -> user space application
> 
> 
> Signed-off-by: Justin Lee 
> 

Hi Justin,

Thanks for posting this on the list! The overall design looks good and so
far looks like it should fit relatively well with the other OEM command
patch. I'll try and run some OEM commands against my machine.
Some comments below:

> 
> ---
>  include/uapi/linux/ncsi.h |   3 +
>  net/ncsi/internal.h   |  12 ++-
>  net/ncsi/ncsi-aen.c   |  10 ++-
>  net/ncsi/ncsi-cmd.c   | 106 
>  net/ncsi/ncsi-manage.c|  74 ++---
>  net/ncsi/ncsi-netlink.c   | 199 
> +-
>  net/ncsi/ncsi-netlink.h   |   4 +
>  net/ncsi/ncsi-rsp.c   |  70 ++--
>  8 files changed, 420 insertions(+), 58 deletions(-)
> 
> diff --git a/include/uapi/linux/ncsi.h b/include/uapi/linux/ncsi.h
> index 4c292ec..4992bfc 100644
> --- a/include/uapi/linux/ncsi.h
> +++ b/include/uapi/linux/ncsi.h
> @@ -30,6 +30,7 @@ enum ncsi_nl_commands {
>   NCSI_CMD_PKG_INFO,
>   NCSI_CMD_SET_INTERFACE,
>   NCSI_CMD_CLEAR_INTERFACE,
> + NCSI_CMD_SEND_CMD,
>  
>   __NCSI_CMD_AFTER_LAST,
>   NCSI_CMD_MAX = __NCSI_CMD_AFTER_LAST - 1
> @@ -43,6 +44,7 @@ enum ncsi_nl_commands {
>   * @NCSI_ATTR_PACKAGE_LIST: nested array of NCSI_PKG_ATTR attributes
>   * @NCSI_ATTR_PACKAGE_ID: package ID
>   * @NCSI_ATTR_CHANNEL_ID: channel ID
> + * @NCSI_ATTR_DATA: command payload
>   * @NCSI_ATTR_MAX: highest attribute number
>   */
>  enum ncsi_nl_attrs {
> @@ -51,6 +53,7 @@ enum ncsi_nl_attrs {
>   NCSI_ATTR_PACKAGE_LIST,
>   NCSI_ATTR_PACKAGE_ID,
>   NCSI_ATTR_CHANNEL_ID,
> + NCSI_ATTR_DATA,
>  
>   __NCSI_ATTR_AFTER_LAST,
>   NCSI_ATTR_MAX = __NCSI_ATTR_AFTER_LAST - 1
> diff --git a/net/ncsi/internal.h b/net/ncsi/internal.h
> index 8055e39..20ce735 100644
> --- a/net/ncsi/internal.h
> +++ b/net/ncsi/internal.h
> @@ -215,12 +215,17 @@ struct ncsi_request {
>   unsigned charid;  /* Request ID - 0 to 255   */
>   bool used;/* Request that has been assigned  */
>   unsigned int flags;   /* NCSI request property   */
> -#define NCSI_REQ_FLAG_EVENT_DRIVEN   1
> +#define NCSI_REQ_FLAG_EVENT_DRIVEN   1
> +#define NCSI_REQ_FLAG_NETLINK_DRIVEN 2
>   struct ncsi_dev_priv *ndp;/* Associated NCSI device  */
>   struct sk_buff   *cmd;/* Associated NCSI command packet  */
>   struct sk_buff   *rsp;/* Associated NCSI response packet */
>   struct timer_listtimer;   /* Timer on waiting for response   */
>   bool enabled; /* Time has been enabled or not*/
> +
> + u32  snd_seq; /* netlink sending sequence number */
> + u32  snd_portid;  /* netlink portid of sender*/
> + struct nlmsghdr  nlhdr;   /* netlink message header  */
>  };
>  
>  enum {
> @@ -301,10 +306,13 @@ struct ncsi_cmd_arg {
>   unsigned short   payload; /* Command packet payload length */
>   unsigned int req_flags;   /* NCSI request properties   */
>   union {
> - unsigned char  bytes[16]; /* Command packet specific data  */
> + unsigned char  bytes[16]; /* Command packet specific data  
> */
>   unsigned short words[8];
>   unsigned int   dwords[4];
> 

Re: [PATCH net-next v6 00/23] WireGuard: Secure Network Tunnel

2018-09-27 Thread Eric Biggers
On Thu, Sep 27, 2018 at 11:35:39PM +0200, Jason A. Donenfeld wrote:
> Hi Eric,
> 
> On Thu, Sep 27, 2018 at 8:29 PM Eric Biggers  wrote:
> > Why is Herbert Xu's existing crypto tree being circumvented, especially for
> > future patches (the initial merge isn't quite as important as that's a 
> > one-time
> > event)?  I like being able to check out cryptodev to test upcoming crypto
> > patches.  And currently, changes to APIs, algorithms, tests, and 
> > implementations
> > all go through cryptodev, which is convenient for crypto developers.
> >
> > Apparently, you're proposing that someone adding a new algorithm will now 
> > have
> > to submit the API portion to one maintainer (Herbert Xu) and the 
> > implementation
> > portion to another maintainer (you), and they'll go through separate git 
> > trees.
> > That's inconvenient for developers, and it seems that in practice you and
> > Herbert will be stepping on each other's toes a lot.
> >
> > Can you please reach some kind of sane agreement with Herbert so that the
> > development process isn't fractured into two?  Perhaps you could review 
> > patches,
> > but Herbert could still apply them?
> 
> I think you're overthinking it a bit. Zinc will have a few software
> implementations of primitives that are useful in cases where it's nice to call
> the primitive directly. Think: various usages of sha2, siphash, the wireguard
> suite (what this patchset includes), other things in lib/, etc. In so much as
> this winds up duplicating things within the crypto API, I'll work with Herbert
> to build one on top of the other -- as I've done in the two commits in this
> series. But beyond that, think of the two initiatives as orthogonal. I'm
> working on curating a few primitives that are maximally useful throughout
> the kernel for various uses, and doing so in a way that I think brings
> about a certain quality. Meanwhile the crypto API is amassing a huge
> collection of primitives for some things, and that will continue to exist,
> and Herbert will continue to maintain that. I expect for the crossover
> to be fairly isolated and manageable, without too much foreseeable tree-
> conflicts and such. Therefore, Samuel Neves and I plan to maintain the
> codebase we've spent quite some time writing, and maintain our own tree for
> it, which we'll be submitting through Greg. In other words, this is not
> a matter of "circumvention" or "stepping on toes", but rather separate
> efforts. I'm quite certain to the extent they overlap we'll be able to work
> out fairly easily.
> 
> Either way, I'll take your suggestion and reach out to Herbert, since at
> least a discussion between the two of us sounds like it could be productive.

So, Zinc will simultaneously replace the current crypto implementations, *and*
be "orthogonal" and "separate" from all the crypto code currently maintained by
Herbert?  You can't have your cake and eat it too...

I'm still concerned you're splitting the community in two.  It will be unclear
where new algorithms and implementations should go.  Some people will choose
Herbert and the current crypto API and conventions, and some people will choose
you and Zinc...  I still don't see clear guidelines for what will go where.  And
yes, you and Herbert will step on each others' toes and duplicate stuff, as the
efforts are *not* separate, as you've even argued yourself.

Please reach out to Herbert to find a sane solution, ideally one that involves
having a single git tree for crypto development and allows people to continue
crypto development without choosing "sides".

> 
> > I'm also wondering about the criteria for making additions and changes to
> > "Zinc".  You mentioned before that one of the "advantages" of Zinc is that 
> > it
> > doesn't include "cipher modes from 90s cryptographers" -- what does that 
> > mean
> > exactly?  You've also indicated before that you don't want people modifying 
> > the
> > Poly1305 implementations as they are too error-prone.  Useful contributions
> > could be blocked or discouraged in the future. Can you please elaborate on
> > your criteria for contributions to Zinc?
> >
> > Also, will you allow algorithms that aren't up to modern security standards 
> > but
> > are needed for compatibility reasons, e.g. MD5, SHA-1, and DES?  There are
> > existing standards, APIs, and data formats that use these "legacy" 
> > algorithms;
> > so implementations of them are often still needed, whether we like it or 
> > not.
> >
> > And does it matter who designed the algorithms, e.g. do algorithms from 
> > Daniel
> > Bernstein get effectively a free pass, while algorithms from certain 
> > countries,
> > governments, or organizations are not allowed?  E.g. wireless driver 
> > developers
> > may need the SM4 block cipher (which is now supported by the crypto API) as 
> > it's
> > specified in a Chinese wireless standard.  Will you allow SM4 in Zinc?  Or 
> > will
> > people have to submit some algorithms to Herbert and some to you 

[PATCH net-next] geneve: fix ttl inherit type

2018-09-27 Thread Hangbin Liu
Phil pointed out that there is a mismatch between vxlan and geneve ttl
inherit. We should define it as a flag and use nla_put_flag to export this
option.

Fixes: 52d0d404d39dd ("geneve: add ttl inherit support")
Reported-by: Phil Sutter 
Signed-off-by: Hangbin Liu 
---
 drivers/net/geneve.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/net/geneve.c b/drivers/net/geneve.c
index 6625fab..09ab2fd 100644
--- a/drivers/net/geneve.c
+++ b/drivers/net/geneve.c
@@ -1100,7 +1100,7 @@ static const struct nla_policy 
geneve_policy[IFLA_GENEVE_MAX + 1] = {
[IFLA_GENEVE_UDP_CSUM]  = { .type = NLA_U8 },
[IFLA_GENEVE_UDP_ZERO_CSUM6_TX] = { .type = NLA_U8 },
[IFLA_GENEVE_UDP_ZERO_CSUM6_RX] = { .type = NLA_U8 },
-   [IFLA_GENEVE_TTL_INHERIT]   = { .type = NLA_U8 },
+   [IFLA_GENEVE_TTL_INHERIT]   = { .type = NLA_FLAG },
 };
 
 static int geneve_validate(struct nlattr *tb[], struct nlattr *data[],
@@ -1582,7 +1582,7 @@ static size_t geneve_get_size(const struct net_device 
*dev)
nla_total_size(sizeof(__u8)) + /* IFLA_GENEVE_UDP_CSUM */
nla_total_size(sizeof(__u8)) + /* IFLA_GENEVE_UDP_ZERO_CSUM6_TX 
*/
nla_total_size(sizeof(__u8)) + /* IFLA_GENEVE_UDP_ZERO_CSUM6_RX 
*/
-   nla_total_size(sizeof(__u8)) + /* IFLA_GENEVE_TTL_INHERIT */
+   nla_total_size(0) + /* IFLA_GENEVE_TTL_INHERIT */
0;
 }
 
@@ -1636,7 +1636,7 @@ static int geneve_fill_info(struct sk_buff *skb, const 
struct net_device *dev)
goto nla_put_failure;
 #endif
 
-   if (nla_put_u8(skb, IFLA_GENEVE_TTL_INHERIT, ttl_inherit))
+   if (ttl_inherit && nla_put_flag(skb, IFLA_GENEVE_TTL_INHERIT))
goto nla_put_failure;
 
return 0;
-- 
2.5.5



Re: [PATCH net-next v6 23/23] net: WireGuard secure network tunnel

2018-09-27 Thread Jason A. Donenfeld
On Fri, Sep 28, 2018 at 12:37 AM Jason A. Donenfeld  wrote:
> Will do. v7 will include the wg_ prefix.

$ nm *.o | while read a b c; do [[ $b == T ]] && echo $c; done | grep -v ^wg_
cleanup_module
init_module

Success.


[PATCH net] vxlan: use nla_put_flag for ttl inherit

2018-09-27 Thread Hangbin Liu
Phil pointed out that there is a mismatch between vxlan and geneve ttl inherit.
We should define it as a flag and use nla_put_flag to export this option.

Fixes: 8fd780698745b ("vxlan: fill ttl inherit info")
Reported-by: Phil Sutter 
Signed-off-by: Hangbin Liu 
---
 drivers/net/vxlan.c | 8 +---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c
index 2b8da2b..479dda4 100644
--- a/drivers/net/vxlan.c
+++ b/drivers/net/vxlan.c
@@ -3539,7 +3539,7 @@ static size_t vxlan_get_size(const struct net_device *dev)
nla_total_size(sizeof(__u32)) + /* IFLA_VXLAN_LINK */
nla_total_size(sizeof(struct in6_addr)) + /* 
IFLA_VXLAN_LOCAL{6} */
nla_total_size(sizeof(__u8)) +  /* IFLA_VXLAN_TTL */
-   nla_total_size(sizeof(__u8)) +  /* IFLA_VXLAN_TTL_INHERIT */
+   nla_total_size(0) + /* IFLA_VXLAN_TTL_INHERIT */
nla_total_size(sizeof(__u8)) +  /* IFLA_VXLAN_TOS */
nla_total_size(sizeof(__be32)) + /* IFLA_VXLAN_LABEL */
nla_total_size(sizeof(__u8)) +  /* IFLA_VXLAN_LEARNING */
@@ -3604,8 +3604,6 @@ static int vxlan_fill_info(struct sk_buff *skb, const 
struct net_device *dev)
}
 
if (nla_put_u8(skb, IFLA_VXLAN_TTL, vxlan->cfg.ttl) ||
-   nla_put_u8(skb, IFLA_VXLAN_TTL_INHERIT,
-  !!(vxlan->cfg.flags & VXLAN_F_TTL_INHERIT)) ||
nla_put_u8(skb, IFLA_VXLAN_TOS, vxlan->cfg.tos) ||
nla_put_be32(skb, IFLA_VXLAN_LABEL, vxlan->cfg.label) ||
nla_put_u8(skb, IFLA_VXLAN_LEARNING,
@@ -3650,6 +3648,10 @@ static int vxlan_fill_info(struct sk_buff *skb, const 
struct net_device *dev)
nla_put_flag(skb, IFLA_VXLAN_REMCSUM_NOPARTIAL))
goto nla_put_failure;
 
+   if (vxlan->cfg.flags & VXLAN_F_TTL_INHERIT &&
+   nla_put_flag(skb, IFLA_VXLAN_TTL_INHERIT))
+   goto nla_put_failure;
+
return 0;
 
 nla_put_failure:
-- 
2.5.5



Re: [PATCH iproute2-next] geneve: fix ttl inherit behavior

2018-09-27 Thread Hangbin Liu
On Thu, Sep 27, 2018 at 11:08:36AM +0200, Phil Sutter wrote:
> On Thu, Sep 27, 2018 at 03:27:37PM +0800, Hangbin Liu wrote:
> > Currently when we add geneve with "ttl inherit", we set ttl to 0, which
> > actually means "use whatever default value" instead of inheriting the inner
> > protocol's ttl value.
> > 
> > To respect compatibility with old behavior and make a difference between
> > ttl inherit and ttl == 0, we add an attribute IFLA_GENEVE_TTL_INHERIT in
> > kernel commit 52d0d404d39dd ("geneve: add ttl inherit support").
> > 
> > Now let's use "ttl inherit" to inherit the inner protocol's ttl, and use
> > "ttl auto" to mean "use whatever default value", the same behavior as
> > ttl == 0.
> > 
> > Reported-by: Jianlin Shi 
> > Signed-off-by: Hangbin Liu 
> 
> Acked-by: Phil Sutter 

Hi Stephen, David,

Please hold off on this patch and let me fix the inherit flag issue first.

Thanks
Hangbin


Re: [PATCH resend] can: rcar_can: convert to SPDX identifiers

2018-09-27 Thread Kuninori Morimoto


Hi Marc

> > From: Kuninori Morimoto 
> > 
> > This patch updates license to use SPDX-License-Identifier
> > instead of verbose license text.
> > 
> > Signed-off-by: Kuninori Morimoto 
> > Reviewed-by: Simon Horman 
> 
> Wolfram Sang has already supplied a similar patch, but not for Makefile
> and Kconfig. I've applied your patch for Makefile and Kconfig and
> adjusted the commit message accordingly.

Thank you very much


Re: WARN_ON in TLP causing RT throttling

2018-09-27 Thread Eric Dumazet



On 09/27/2018 05:16 PM, stran...@codeaurora.org wrote:

> Hi Yuchung,
> 
> Based on the dumps we were able to get, it appears that TFO was not used in 
> this case.
> We also tried some local experiments where we dropped incoming SYN packets 
> after already
> successful TFO connections on the receive side to see if TFO would trigger 
> this scenario, but
> have not been able to reproduce it.
> 
> One other interesting thing we found is that the socket never sent or 
> received any data. It only
> sent/received the packets for the initial handshake and the outgoing FIN.

Just to make sure : Was this some sort of syzkaller (or other fuzzer) run ?


Re: WARN_ON in TLP causing RT throttling

2018-09-27 Thread stranche

On 2018-09-27 13:14, Yuchung Cheng wrote:
On Wed, Sep 26, 2018 at 5:09 PM, Eric Dumazet  
wrote:




On 09/26/2018 04:46 PM, stran...@codeaurora.org wrote:
> Hi Eric,
>
> Someone recently reported a crash to us on the 4.14.62 kernel where excessive
> WARNING prints were spamming the logs and causing watchdog bites. The kernel
> does have the following commit by Soheil:
> bffd168c3fc5 "tcp: clear tp->packets_out when purging write queue"
>
> Before this bug we see over 1 second of continuous WARN_ON prints from
> tcp_send_loss_probe() like so:
>
> 7795.530450:   <2>  tcp_send_loss_probe+0x194/0x1b8
> 7795.534833:   <2>  tcp_write_timer_handler+0xf8/0x1c4
> 7795.539492:   <2>  tcp_write_timer+0x4c/0x74
> 7795.543348:   <2>  call_timer_fn+0xc0/0x1b4
> 7795.547113:   <2>  run_timer_softirq+0x248/0x81c
>
> Specifically, the prints come from the following check:
>
> /* Retransmit last segment. */
> if (WARN_ON(!skb))
> goto rearm_timer;
>
> Since skb is always NULL, we know there's nothing on the write queue or the
> retransmit queue, so we just keep resetting the timer, waiting for more data
> to be queued. However, we were able to determine that the TCP socket is in the
> TCP_FIN_WAIT1 state, so we will no longer be sending any data and these queues
> remain empty.
>
> Would it be appropriate to stop resetting the TLP timer if we detect that the
> connection is starting to close and we have no more data to send the probe 
with,
> or is there some way that this scenario should already be handled?
>
> Unfortunately, we don't have a reproducer for this crash.
>

Something is fishy.

If there is no skb in the queues, then tp->packets_out should be 0,
therefore tcp_rearm_rto() should simply call 
inet_csk_clear_xmit_timer(sk, ICSK_TIME_RETRANS);


I have never seen this report before.

Do you use Fast Open? I am wondering if it's a bug when a TFO server
closes the socket before the handshake finishes...

Either way, it's pretty safe to just stop TLP if write queue is empty
for any unexpected reason.




Hi Yuchung,

Based on the dumps we were able to get, it appears that TFO was not used 
in this case.
We also tried some local experiments where we dropped incoming SYN 
packets after already
successful TFO connections on the receive side to see if TFO would 
trigger this scenario, but

have not been able to reproduce it.

One other interesting thing we found is that the socket never sent or 
received any data. It only
sent/received the packets for the initial handshake and the outgoing 
FIN.


Re: [PATCH net V2] vhost-vsock: fix use after free

2018-09-27 Thread Michael S. Tsirkin
On Fri, Sep 28, 2018 at 07:37:37AM +0800, Jason Wang wrote:
> 
> 
> On 2018年09月28日 01:04, Michael S. Tsirkin wrote:
> > On Thu, Sep 27, 2018 at 08:22:04PM +0800, Jason Wang wrote:
> > > The access of vsock is not protected by vhost_vsock_lock. This may
> > > lead to use after free since vhost_vsock_dev_release() may free the
> > > pointer at the same time.
> > > 
> > > Fix this by holding the lock during the access.
> > > 
> > > Reported-by:syzbot+e3e074963495f92a8...@syzkaller.appspotmail.com
> > > Fixes: 16320f363ae1 ("vhost-vsock: add pkt cancel capability")
> > > Fixes: 433fc58e6bf2 ("VSOCK: Introduce vhost_vsock.ko")
> > > Cc: Stefan Hajnoczi
> > > Signed-off-by: Jason Wang
> > Wow is that really the best we can do?
> 
> For net/stable, probably yes.
> 
> >   A global lock on a data path
> > operation?
> 
> It's already there,

&vhost_vsock_lock? Where is it taken on the data path?

> and the patch only increase the critical section.
> 
> >   Granted use after free is nasty but Stefan said he sees
> > a way to fix it using a per socket refcount. He's on vacation
> > until Oct 4 though ...
> > 
> 
> Stefan has acked the patch, so I think it's ok? We can do optimization for
> -next on top.
> 
> Thanks


Well on high SMP serializing can drop performance as much as x100 so I'm
not sure it's appropriate - seems to fix a bug but can introduce a
regression. Let's see what a proper fix looks like first?

-- 
MST


Re: [PATCH net-next] virtio_net: ethtool tx napi configuration

2018-09-27 Thread Jason Wang




On 2018年09月27日 21:53, Willem de Bruijn wrote:

On Thu, Sep 27, 2018 at 4:51 AM Jason Wang  wrote:



On 2018年09月14日 12:46, Willem de Bruijn wrote:

I'm not sure I get this. If we don't enable tx napi, we tend to delay TX
interrupt if we found the ring is about to full to avoid interrupt
storm, so we're probably ok in this case.

I'm only concerned about the transition state when converting from
napi to no-napi when the queue is stopped and tx interrupt disabled.

With napi mode the interrupt is only disabled if napi is scheduled,
in which case it will eventually reenable the interrupt. But when
switching to no-napi mode in this state no progress will be made.

But it seems this cannot happen. When converting to no-napi
mode, set_coalesce waits for napi to complete in napi_disable.
So the interrupt should always start enabled when transitioning
into no-napi mode.

An update: I hit a hang in napi_disable(), but it's hard to
reproduce. I tend to prefer an easy way like V1 that only allows
switching when the device is down.

I agree.


I will post the patch after a vacation. (or you can post if it was
urgent for you).

If you have time to review and add your signed-off-by, I can post it.
It's a pretty small diff at this point.

But no rush, we can also wait until after your vacation.


Then let me post it after the vacation.



I also need to look at a patch to toggle LRO using ethtool, btw.


Interesting, we've already done something similar during XDP. The 
GUEST_TSO_XXX part may need some private flags I believe.


Thanks


Re: [PATCH net V2] vhost-vsock: fix use after free

2018-09-27 Thread Jason Wang




On 2018年09月28日 01:04, Michael S. Tsirkin wrote:

On Thu, Sep 27, 2018 at 08:22:04PM +0800, Jason Wang wrote:

The access of vsock is not protected by vhost_vsock_lock. This may
lead to use after free since vhost_vsock_dev_release() may free the
pointer at the same time.

Fix this by holding the lock during the access.

Reported-by:syzbot+e3e074963495f92a8...@syzkaller.appspotmail.com
Fixes: 16320f363ae1 ("vhost-vsock: add pkt cancel capability")
Fixes: 433fc58e6bf2 ("VSOCK: Introduce vhost_vsock.ko")
Cc: Stefan Hajnoczi
Signed-off-by: Jason Wang

Wow is that really the best we can do?


For net/stable, probably yes.


  A global lock on a data path
operation?


It's already there, and the patch only increase the critical section.


  Granted use after free is nasty but Stefan said he sees
a way to fix it using a per socket refcount. He's on vacation
until Oct 4 though ...



Stefan has acked the patch, so I think it's ok? We can do optimization 
for -next on top.


Thanks


[PATCHv3 bpf-next 00/12] Add socket lookup support

2018-09-27 Thread Joe Stringer
This series proposes a new helper for the BPF API which allows BPF programs to
perform lookups for sockets in a network namespace. This would allow programs
to determine early on in processing whether the stack is expecting to receive
the packet, and perform some action (eg drop, forward somewhere) based on this
information.

The series is structured roughly into:
* Misc refactor
* Add the socket pointer type
* Add reference tracking to ensure that socket references are freed
* Extend the BPF API to add sk_lookup_xxx() / sk_release() functions
* Add tests/documentation

The helper proposed in this series includes a parameter for a tuple which must
be filled in by the caller to determine the socket to look up. The simplest
case would be filling with the contents of the packet, ie mapping the packet's
5-tuple into the parameter. In common cases, it may alternatively be useful to
reverse the direction of the tuple and perform a lookup, to find the socket
that initiates this connection; and if the BPF program ever performs a form of
IP address translation, it may further be useful to be able to look up
arbitrary tuples that are not based upon the packet, but instead based on state
held in BPF maps or hardcoded in the BPF program.

Currently, access into the socket's fields are limited to those which are
otherwise already accessible, and are restricted to read-only access.

Changes since v2:
* New patch: "selftests/bpf: Generalize dummy program types".
  This enables adding verifier tests for socket lookup with tail calls.
* Define the semantics of the new helpers more clearly in uAPI header.
* Fix release of caller_net when netns is not specified.
* Use skb->sk to find caller net when skb->dev is unavailable.
* Fix build with !CONFIG_NET.
* Replace ptr_id defensive coding when releasing reference state with an
  internal error (-EFAULT).
* Remove flags argument to sk_release().
* Add several new assembly tests suggested by Daniel.
* Add a few new C tests.
* Fix typo in verifier error message.

Changes since v1:
* Limit netns_id field to 32 bits
* Reuse reg_type_mismatch() in more places
* Reduce the number of passes at convert_ctx_access()
* Replace ptr_id defensive coding when releasing reference state with an
  internal error (-EFAULT)
* Rework 'struct bpf_sock_tuple' to allow passing a packet pointer
* Allow direct packet access from helper
* Fix compile error with CONFIG_IPV6 enabled
* Improve commit messages

Changes since RFC:
* Split up sk_lookup() into sk_lookup_tcp(), sk_lookup_udp().
* Only take references on the socket when necessary.
  * Make sk_release() only free the socket reference in this case.
* Fix some runtime reference leaks:
  * Disallow BPF_LD_[ABS|IND] instructions while holding a reference.
  * Disallow bpf_tail_call() while holding a reference.
* Prevent the same instruction being used for reference and other
  pointer type.
* Simplify locating copies of a reference during helper calls by caching
  the pointer id from the caller.
* Fix kbuild compilation warnings with particular configs.
* Improve code comments describing the new verifier pieces.
* Testing courtesy of Nitin

This tree is also available at:
https://github.com/joestringer/linux/commits/submit/sk-lookup-v3

Joe Stringer (12):
  bpf: Add iterator for spilled registers
  bpf: Simplify ptr_min_max_vals adjustment
  bpf: Generalize ptr_or_null regs check
  bpf: Add PTR_TO_SOCKET verifier type
  bpf: Macrofy stack state copy
  bpf: Add reference tracking to verifier
  bpf: Add helper to retrieve socket in BPF
  selftests/bpf: Generalize dummy program types
  selftests/bpf: Add tests for reference tracking
  libbpf: Support loading individual progs
  selftests/bpf: Add C tests for reference tracking
  Documentation: Describe bpf reference tracking

 Documentation/networking/filter.txt   |  64 ++
 include/linux/bpf.h   |  34 +
 include/linux/bpf_verifier.h  |  37 +-
 include/uapi/linux/bpf.h  |  93 ++-
 kernel/bpf/verifier.c | 594 +---
 net/core/filter.c | 181 -
 tools/include/uapi/linux/bpf.h|  93 ++-
 tools/lib/bpf/libbpf.c|   4 +-
 tools/lib/bpf/libbpf.h|   3 +
 tools/testing/selftests/bpf/Makefile  |   2 +-
 tools/testing/selftests/bpf/bpf_helpers.h |  12 +
 tools/testing/selftests/bpf/test_progs.c  |  38 +
 .../selftests/bpf/test_sk_lookup_kern.c   | 180 +
 tools/testing/selftests/bpf/test_verifier.c   | 670 +-
 14 files changed, 1858 insertions(+), 147 deletions(-)
 create mode 100644 tools/testing/selftests/bpf/test_sk_lookup_kern.c

-- 
2.17.1



[PATCHv3 bpf-next 04/12] bpf: Add PTR_TO_SOCKET verifier type

2018-09-27 Thread Joe Stringer
Teach the verifier a little bit about a new type of pointer, a
PTR_TO_SOCKET. This pointer type is accessed from BPF through the
'struct bpf_sock' structure.

Signed-off-by: Joe Stringer 
---
v2: Reuse reg_type_mismatch() in more places
Reduce the number of passes at convert_ctx_access()

v3: Fix build with !CONFIG_NET
---
 include/linux/bpf.h  |  34 ++
 include/linux/bpf_verifier.h |   2 +
 kernel/bpf/verifier.c| 120 +++
 net/core/filter.c|  30 +
 4 files changed, 160 insertions(+), 26 deletions(-)

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 018299a595c8..027697b6a22f 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -154,6 +154,7 @@ enum bpf_arg_type {
 
ARG_PTR_TO_CTX, /* pointer to context */
ARG_ANYTHING,   /* any (initialized) argument is ok */
+   ARG_PTR_TO_SOCKET,  /* pointer to bpf_sock */
 };
 
 /* type of values returned from helper functions */
@@ -162,6 +163,7 @@ enum bpf_return_type {
RET_VOID,   /* function doesn't return anything */
RET_PTR_TO_MAP_VALUE,   /* returns a pointer to map elem value 
*/
RET_PTR_TO_MAP_VALUE_OR_NULL,   /* returns a pointer to map elem value 
or NULL */
+   RET_PTR_TO_SOCKET_OR_NULL,  /* returns a pointer to a socket or 
NULL */
 };
 
 /* eBPF function prototype used by verifier to allow BPF_CALLs from eBPF 
programs
@@ -213,6 +215,8 @@ enum bpf_reg_type {
PTR_TO_PACKET,   /* reg points to skb->data */
PTR_TO_PACKET_END,   /* skb->data + headlen */
PTR_TO_FLOW_KEYS,/* reg points to bpf_flow_keys */
+   PTR_TO_SOCKET,   /* reg points to struct bpf_sock */
+   PTR_TO_SOCKET_OR_NULL,   /* reg points to struct bpf_sock or NULL */
 };
 
 /* The information passed from prog-specific *_is_valid_access
@@ -343,6 +347,11 @@ const struct bpf_func_proto 
*bpf_get_trace_printk_proto(void);
 
 typedef unsigned long (*bpf_ctx_copy_t)(void *dst, const void *src,
unsigned long off, unsigned long len);
+typedef u32 (*bpf_convert_ctx_access_t)(enum bpf_access_type type,
+   const struct bpf_insn *src,
+   struct bpf_insn *dst,
+   struct bpf_prog *prog,
+   u32 *target_size);
 
 u64 bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size,
 void *ctx, u64 ctx_size, bpf_ctx_copy_t ctx_copy);
@@ -836,4 +845,29 @@ extern const struct bpf_func_proto 
bpf_get_local_storage_proto;
 void bpf_user_rnd_init_once(void);
 u64 bpf_user_rnd_u32(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5);
 
+#if defined(CONFIG_NET)
+bool bpf_sock_is_valid_access(int off, int size, enum bpf_access_type type,
+ struct bpf_insn_access_aux *info);
+u32 bpf_sock_convert_ctx_access(enum bpf_access_type type,
+   const struct bpf_insn *si,
+   struct bpf_insn *insn_buf,
+   struct bpf_prog *prog,
+   u32 *target_size);
+#else
+static inline bool bpf_sock_is_valid_access(int off, int size,
+   enum bpf_access_type type,
+   struct bpf_insn_access_aux *info)
+{
+   return false;
+}
+static inline u32 bpf_sock_convert_ctx_access(enum bpf_access_type type,
+ const struct bpf_insn *si,
+ struct bpf_insn *insn_buf,
+ struct bpf_prog *prog,
+ u32 *target_size)
+{
+   return 0;
+}
+#endif
+
 #endif /* _LINUX_BPF_H */
diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index af262b97f586..23a2b17bfd75 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -58,6 +58,8 @@ struct bpf_reg_state {
 * offset, so they can share range knowledge.
 * For PTR_TO_MAP_VALUE_OR_NULL this is used to share which map value we
 * came from, when one is tested for != NULL.
+* For PTR_TO_SOCKET this is used to share which pointers retain the
+* same reference to the socket, to determine proper reference freeing.
 */
u32 id;
/* For scalar types (SCALAR_VALUE), this represents our knowledge of
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index bbb0a812ee81..d4abbf0d5727 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -80,8 +80,8 @@ static const struct bpf_verifier_ops * const 
bpf_verifier_ops[] = {
  * (like pointer plus pointer becomes SCALAR_VALUE type)
  *
  * When verifier sees load or store instructions the type of base registe

[PATCHv3 bpf-next 08/12] selftests/bpf: Generalize dummy program types

2018-09-27 Thread Joe Stringer
Don't hardcode the dummy program types to SOCKET_FILTER type, as this
prevents testing bpf_tail_call in conjunction with other program types.
Instead, use the program type specified in the test case.

Signed-off-by: Joe Stringer 
---
 tools/testing/selftests/bpf/test_verifier.c | 31 +++--
 1 file changed, 17 insertions(+), 14 deletions(-)

diff --git a/tools/testing/selftests/bpf/test_verifier.c 
b/tools/testing/selftests/bpf/test_verifier.c
index a90be44f61e0..020b1467e565 100644
--- a/tools/testing/selftests/bpf/test_verifier.c
+++ b/tools/testing/selftests/bpf/test_verifier.c
@@ -12652,18 +12652,18 @@ static int create_map(uint32_t type, uint32_t 
size_key,
return fd;
 }
 
-static int create_prog_dummy1(void)
+static int create_prog_dummy1(enum bpf_map_type prog_type)
 {
struct bpf_insn prog[] = {
BPF_MOV64_IMM(BPF_REG_0, 42),
BPF_EXIT_INSN(),
};
 
-   return bpf_load_program(BPF_PROG_TYPE_SOCKET_FILTER, prog,
+   return bpf_load_program(prog_type, prog,
ARRAY_SIZE(prog), "GPL", 0, NULL, 0);
 }
 
-static int create_prog_dummy2(int mfd, int idx)
+static int create_prog_dummy2(enum bpf_map_type prog_type, int mfd, int idx)
 {
struct bpf_insn prog[] = {
BPF_MOV64_IMM(BPF_REG_3, idx),
@@ -12674,11 +12674,12 @@ static int create_prog_dummy2(int mfd, int idx)
BPF_EXIT_INSN(),
};
 
-   return bpf_load_program(BPF_PROG_TYPE_SOCKET_FILTER, prog,
+   return bpf_load_program(prog_type, prog,
ARRAY_SIZE(prog), "GPL", 0, NULL, 0);
 }
 
-static int create_prog_array(uint32_t max_elem, int p1key)
+static int create_prog_array(enum bpf_map_type prog_type, uint32_t max_elem,
+int p1key)
 {
int p2key = 1;
int mfd, p1fd, p2fd;
@@ -12690,8 +12691,8 @@ static int create_prog_array(uint32_t max_elem, int 
p1key)
return -1;
}
 
-   p1fd = create_prog_dummy1();
-   p2fd = create_prog_dummy2(mfd, p2key);
+   p1fd = create_prog_dummy1(prog_type);
+   p2fd = create_prog_dummy2(prog_type, mfd, p2key);
if (p1fd < 0 || p2fd < 0)
goto out;
if (bpf_map_update_elem(mfd, &p1key, &p1fd, BPF_ANY) < 0)
@@ -12748,8 +12749,8 @@ static int create_cgroup_storage(bool percpu)
 
 static char bpf_vlog[UINT_MAX >> 8];
 
-static void do_test_fixup(struct bpf_test *test, struct bpf_insn *prog,
- int *map_fds)
+static void do_test_fixup(struct bpf_test *test, enum bpf_map_type prog_type,
+ struct bpf_insn *prog, int *map_fds)
 {
int *fixup_map1 = test->fixup_map1;
int *fixup_map2 = test->fixup_map2;
@@ -12805,7 +12806,7 @@ static void do_test_fixup(struct bpf_test *test, struct 
bpf_insn *prog,
}
 
if (*fixup_prog1) {
-   map_fds[4] = create_prog_array(4, 0);
+   map_fds[4] = create_prog_array(prog_type, 4, 0);
do {
prog[*fixup_prog1].imm = map_fds[4];
fixup_prog1++;
@@ -12813,7 +12814,7 @@ static void do_test_fixup(struct bpf_test *test, struct 
bpf_insn *prog,
}
 
if (*fixup_prog2) {
-   map_fds[5] = create_prog_array(8, 7);
+   map_fds[5] = create_prog_array(prog_type, 8, 7);
do {
prog[*fixup_prog2].imm = map_fds[5];
fixup_prog2++;
@@ -12859,11 +12860,13 @@ static void do_test_single(struct bpf_test *test, 
bool unpriv,
for (i = 0; i < MAX_NR_MAPS; i++)
map_fds[i] = -1;
 
-   do_test_fixup(test, prog, map_fds);
+   if (!prog_type)
+   prog_type = BPF_PROG_TYPE_SOCKET_FILTER;
+   do_test_fixup(test, prog_type, prog, map_fds);
prog_len = probe_filter_length(prog);
 
-   fd_prog = bpf_verify_program(prog_type ? : BPF_PROG_TYPE_SOCKET_FILTER,
-prog, prog_len, test->flags & 
F_LOAD_WITH_STRICT_ALIGNMENT,
+   fd_prog = bpf_verify_program(prog_type, prog, prog_len,
+test->flags & F_LOAD_WITH_STRICT_ALIGNMENT,
 "GPL", 0, bpf_vlog, sizeof(bpf_vlog), 1);
 
expected_ret = unpriv && test->result_unpriv != UNDEF ?
-- 
2.17.1



[PATCHv3 bpf-next 02/12] bpf: Simplify ptr_min_max_vals adjustment

2018-09-27 Thread Joe Stringer
An upcoming commit will add another two pointer types that need very
similar behaviour, so generalise this function now.

Signed-off-by: Joe Stringer 
Acked-by: Alexei Starovoitov 
---
 kernel/bpf/verifier.c   | 22 ++---
 tools/testing/selftests/bpf/test_verifier.c | 14 ++---
 2 files changed, 17 insertions(+), 19 deletions(-)

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 18347de310ad..87b75efc1dc1 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -2669,20 +2669,18 @@ static int adjust_ptr_min_max_vals(struct 
bpf_verifier_env *env,
return -EACCES;
}
 
-   if (ptr_reg->type == PTR_TO_MAP_VALUE_OR_NULL) {
-   verbose(env, "R%d pointer arithmetic on 
PTR_TO_MAP_VALUE_OR_NULL prohibited, null-check it first\n",
-   dst);
-   return -EACCES;
-   }
-   if (ptr_reg->type == CONST_PTR_TO_MAP) {
-   verbose(env, "R%d pointer arithmetic on CONST_PTR_TO_MAP 
prohibited\n",
-   dst);
+   switch (ptr_reg->type) {
+   case PTR_TO_MAP_VALUE_OR_NULL:
+   verbose(env, "R%d pointer arithmetic on %s prohibited, 
null-check it first\n",
+   dst, reg_type_str[ptr_reg->type]);
return -EACCES;
-   }
-   if (ptr_reg->type == PTR_TO_PACKET_END) {
-   verbose(env, "R%d pointer arithmetic on PTR_TO_PACKET_END 
prohibited\n",
-   dst);
+   case CONST_PTR_TO_MAP:
+   case PTR_TO_PACKET_END:
+   verbose(env, "R%d pointer arithmetic on %s prohibited\n",
+   dst, reg_type_str[ptr_reg->type]);
return -EACCES;
+   default:
+   break;
}
 
/* In case of 'scalar += pointer', dst_reg inherits pointer type and id.
diff --git a/tools/testing/selftests/bpf/test_verifier.c 
b/tools/testing/selftests/bpf/test_verifier.c
index c7d25f23baf9..a90be44f61e0 100644
--- a/tools/testing/selftests/bpf/test_verifier.c
+++ b/tools/testing/selftests/bpf/test_verifier.c
@@ -3638,7 +3638,7 @@ static struct bpf_test tests[] = {
BPF_MOV64_IMM(BPF_REG_0, 0),
BPF_EXIT_INSN(),
},
-   .errstr = "R3 pointer arithmetic on PTR_TO_PACKET_END",
+   .errstr = "R3 pointer arithmetic on pkt_end",
.result = REJECT,
.prog_type = BPF_PROG_TYPE_SCHED_CLS,
},
@@ -4896,7 +4896,7 @@ static struct bpf_test tests[] = {
BPF_EXIT_INSN(),
},
.fixup_map1 = { 4 },
-   .errstr = "R4 pointer arithmetic on PTR_TO_MAP_VALUE_OR_NULL",
+   .errstr = "R4 pointer arithmetic on map_value_or_null",
.result = REJECT,
.prog_type = BPF_PROG_TYPE_SCHED_CLS
},
@@ -4917,7 +4917,7 @@ static struct bpf_test tests[] = {
BPF_EXIT_INSN(),
},
.fixup_map1 = { 4 },
-   .errstr = "R4 pointer arithmetic on PTR_TO_MAP_VALUE_OR_NULL",
+   .errstr = "R4 pointer arithmetic on map_value_or_null",
.result = REJECT,
.prog_type = BPF_PROG_TYPE_SCHED_CLS
},
@@ -4938,7 +4938,7 @@ static struct bpf_test tests[] = {
BPF_EXIT_INSN(),
},
.fixup_map1 = { 4 },
-   .errstr = "R4 pointer arithmetic on PTR_TO_MAP_VALUE_OR_NULL",
+   .errstr = "R4 pointer arithmetic on map_value_or_null",
.result = REJECT,
.prog_type = BPF_PROG_TYPE_SCHED_CLS
},
@@ -7253,7 +7253,7 @@ static struct bpf_test tests[] = {
BPF_EXIT_INSN(),
},
.fixup_map_in_map = { 3 },
-   .errstr = "R1 pointer arithmetic on CONST_PTR_TO_MAP 
prohibited",
+   .errstr = "R1 pointer arithmetic on map_ptr prohibited",
.result = REJECT,
},
{
@@ -8927,7 +8927,7 @@ static struct bpf_test tests[] = {
BPF_MOV64_IMM(BPF_REG_0, 0),
BPF_EXIT_INSN(),
},
-   .errstr = "R3 pointer arithmetic on PTR_TO_PACKET_END",
+   .errstr = "R3 pointer arithmetic on pkt_end",
.result = REJECT,
.prog_type = BPF_PROG_TYPE_XDP,
},
@@ -8946,7 +8946,7 @@ static struct bpf_test tests[] = {
BPF_MOV64_IMM(BPF_REG_0, 0),
BPF_EXIT_INSN(),
},
-   .errstr = "R3 pointer arithmetic on PTR_TO_PACKET_END",
+   .errstr = "R3 pointer arithmetic on pkt_end",
.result = REJECT,
.prog_type = BPF_PROG_TYPE_XDP,
},
-- 
2.17.1



[PATCHv3 bpf-next 05/12] bpf: Macrofy stack state copy

2018-09-27 Thread Joe Stringer
An upcoming commit will need very similar copy/realloc boilerplate, so
refactor the existing stack copy/realloc functions into macros to
simplify it.

Signed-off-by: Joe Stringer 
Acked-by: Alexei Starovoitov 
---
 kernel/bpf/verifier.c | 106 --
 1 file changed, 60 insertions(+), 46 deletions(-)

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index d4abbf0d5727..cf8704d137fa 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -388,60 +388,74 @@ static void print_verifier_state(struct bpf_verifier_env 
*env,
verbose(env, "\n");
 }
 
-static int copy_stack_state(struct bpf_func_state *dst,
-   const struct bpf_func_state *src)
-{
-   if (!src->stack)
-   return 0;
-   if (WARN_ON_ONCE(dst->allocated_stack < src->allocated_stack)) {
-   /* internal bug, make state invalid to reject the program */
-   memset(dst, 0, sizeof(*dst));
-   return -EFAULT;
-   }
-   memcpy(dst->stack, src->stack,
-  sizeof(*src->stack) * (src->allocated_stack / BPF_REG_SIZE));
-   return 0;
-}
+#define COPY_STATE_FN(NAME, COUNT, FIELD, SIZE)
\
+static int copy_##NAME##_state(struct bpf_func_state *dst, \
+  const struct bpf_func_state *src)\
+{  \
+   if (!src->FIELD)\
+   return 0;   \
+   if (WARN_ON_ONCE(dst->COUNT < src->COUNT)) {\
+   /* internal bug, make state invalid to reject the program */ \
+   memset(dst, 0, sizeof(*dst));   \
+   return -EFAULT; \
+   }   \
+   memcpy(dst->FIELD, src->FIELD,  \
+  sizeof(*src->FIELD) * (src->COUNT / SIZE));  \
+   return 0;   \
+}
+/* copy_stack_state() */
+COPY_STATE_FN(stack, allocated_stack, stack, BPF_REG_SIZE)
+#undef COPY_STATE_FN
+
+#define REALLOC_STATE_FN(NAME, COUNT, FIELD, SIZE) \
+static int realloc_##NAME##_state(struct bpf_func_state *state, int size, \
+ bool copy_old)\
+{  \
+   u32 old_size = state->COUNT;\
+   struct bpf_##NAME##_state *new_##FIELD; \
+   int slot = size / SIZE; \
+   \
+   if (size <= old_size || !size) {\
+   if (copy_old)   \
+   return 0;   \
+   state->COUNT = slot * SIZE; \
+   if (!size && old_size) {\
+   kfree(state->FIELD);\
+   state->FIELD = NULL;\
+   }   \
+   return 0;   \
+   }   \
+   new_##FIELD = kmalloc_array(slot, sizeof(struct bpf_##NAME##_state), \
+   GFP_KERNEL);\
+   if (!new_##FIELD)   \
+   return -ENOMEM; \
+   if (copy_old) { \
+   if (state->FIELD)   \
+   memcpy(new_##FIELD, state->FIELD,   \
+  sizeof(*new_##FIELD) * (old_size / SIZE)); \
+   memset(new_##FIELD + old_size / SIZE, 0,\
+  sizeof(*new_##FIELD) * (size - old_size) / SIZE); \
+   }   \
+   state->COUNT = slot * SIZE; \
+   kfree(state->FIELD);\
+   state->FIELD = new_##FIELD; \
+   return 0;   \
+}
+/* realloc_stack_state() */
+REALLOC_STATE_FN(stack, allocated_stack, stack, BPF_REG_SIZE)
+#undef REALLOC_STATE_FN
 
 /* do_check() starts with zero-sized stack in struct bpf_verifier_state to
  * make it consume minim

[PATCHv3 bpf-next 01/12] bpf: Add iterator for spilled registers

2018-09-27 Thread Joe Stringer
Add an iterator for spilled registers. It concentrates the details of
how to get the current frame's spilled registers into a single macro
while clarifying the intention of the code which is calling the macro.

Signed-off-by: Joe Stringer 
Acked-by: Alexei Starovoitov 
---
 include/linux/bpf_verifier.h | 11 +++
 kernel/bpf/verifier.c| 16 +++-
 2 files changed, 18 insertions(+), 9 deletions(-)

diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index b42b60a83e19..af262b97f586 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -131,6 +131,17 @@ struct bpf_verifier_state {
u32 curframe;
 };
 
+#define __get_spilled_reg(slot, frame) \
+   (((slot < frame->allocated_stack / BPF_REG_SIZE) && \
+ (frame->stack[slot].slot_type[0] == STACK_SPILL)) \
+? &frame->stack[slot].spilled_ptr : NULL)
+
+/* Iterate over 'frame', setting 'reg' to either NULL or a spilled register. */
+#define for_each_spilled_reg(iter, frame, reg) \
+   for (iter = 0, reg = __get_spilled_reg(iter, frame);\
+iter < frame->allocated_stack / BPF_REG_SIZE;  \
+iter++, reg = __get_spilled_reg(iter, frame))
+
 /* linked list of verifier states used to prune search */
 struct bpf_verifier_state_list {
struct bpf_verifier_state state;
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index a8cc83a970d1..18347de310ad 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -2252,10 +2252,9 @@ static void __clear_all_pkt_pointers(struct 
bpf_verifier_env *env,
if (reg_is_pkt_pointer_any(®s[i]))
mark_reg_unknown(env, regs, i);
 
-   for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) {
-   if (state->stack[i].slot_type[0] != STACK_SPILL)
+   for_each_spilled_reg(i, state, reg) {
+   if (!reg)
continue;
-   reg = &state->stack[i].spilled_ptr;
if (reg_is_pkt_pointer_any(reg))
__mark_reg_unknown(reg);
}
@@ -3395,10 +3394,9 @@ static void find_good_pkt_pointers(struct 
bpf_verifier_state *vstate,
 
for (j = 0; j <= vstate->curframe; j++) {
state = vstate->frame[j];
-   for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) {
-   if (state->stack[i].slot_type[0] != STACK_SPILL)
+   for_each_spilled_reg(i, state, reg) {
+   if (!reg)
continue;
-   reg = &state->stack[i].spilled_ptr;
if (reg->type == type && reg->id == dst_reg->id)
reg->range = max(reg->range, new_range);
}
@@ -3643,7 +3641,7 @@ static void mark_map_regs(struct bpf_verifier_state 
*vstate, u32 regno,
  bool is_null)
 {
struct bpf_func_state *state = vstate->frame[vstate->curframe];
-   struct bpf_reg_state *regs = state->regs;
+   struct bpf_reg_state *reg, *regs = state->regs;
u32 id = regs[regno].id;
int i, j;
 
@@ -3652,8 +3650,8 @@ static void mark_map_regs(struct bpf_verifier_state 
*vstate, u32 regno,
 
for (j = 0; j <= vstate->curframe; j++) {
state = vstate->frame[j];
-   for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) {
-   if (state->stack[i].slot_type[0] != STACK_SPILL)
+   for_each_spilled_reg(i, state, reg) {
+   if (!reg)
continue;
mark_map_reg(&state->stack[i].spilled_ptr, 0, id, 
is_null);
}
-- 
2.17.1



[PATCHv3 bpf-next 11/12] selftests/bpf: Add C tests for reference tracking

2018-09-27 Thread Joe Stringer
Add some tests that demonstrate and test the balanced lookup/free
nature of socket lookup. Section names that start with "fail" represent
programs that are expected to fail verification; all others should
succeed.

Signed-off-by: Joe Stringer 
Acked-by: Alexei Starovoitov 
---
v3: Rebase against flags arg change of bpf_sk_release()
New tests:
* "fail_use_after_free"
* "fail_modify_sk_pointer"
* "fail_modify_sk_or_null_pointer"
---
 tools/testing/selftests/bpf/Makefile  |   2 +-
 tools/testing/selftests/bpf/test_progs.c  |  38 
 .../selftests/bpf/test_sk_lookup_kern.c   | 180 ++
 3 files changed, 219 insertions(+), 1 deletion(-)
 create mode 100644 tools/testing/selftests/bpf/test_sk_lookup_kern.c

diff --git a/tools/testing/selftests/bpf/Makefile 
b/tools/testing/selftests/bpf/Makefile
index f802de526f57..1381ab81099c 100644
--- a/tools/testing/selftests/bpf/Makefile
+++ b/tools/testing/selftests/bpf/Makefile
@@ -36,7 +36,7 @@ TEST_GEN_FILES = test_pkt_access.o test_xdp.o test_l4lb.o 
test_tcp_estats.o test
test_get_stack_rawtp.o test_sockmap_kern.o test_sockhash_kern.o \
test_lwt_seg6local.o sendmsg4_prog.o sendmsg6_prog.o 
test_lirc_mode2_kern.o \
get_cgroup_id_kern.o socket_cookie_prog.o test_select_reuseport_kern.o \
-   test_skb_cgroup_id_kern.o bpf_flow.o netcnt_prog.o
+   test_skb_cgroup_id_kern.o bpf_flow.o netcnt_prog.o test_sk_lookup_kern.o
 
 # Order correspond to 'make run_tests' order
 TEST_PROGS := test_kmod.sh \
diff --git a/tools/testing/selftests/bpf/test_progs.c 
b/tools/testing/selftests/bpf/test_progs.c
index 63a671803ed6..e8becca9c521 100644
--- a/tools/testing/selftests/bpf/test_progs.c
+++ b/tools/testing/selftests/bpf/test_progs.c
@@ -1698,6 +1698,43 @@ static void test_task_fd_query_tp(void)
   "sys_enter_read");
 }
 
+static void test_reference_tracking()
+{
+   const char *file = "./test_sk_lookup_kern.o";
+   struct bpf_object *obj;
+   struct bpf_program *prog;
+   __u32 duration;
+   int err = 0;
+
+   obj = bpf_object__open(file);
+   if (IS_ERR(obj)) {
+   error_cnt++;
+   return;
+   }
+
+   bpf_object__for_each_program(prog, obj) {
+   const char *title;
+
+   /* Ignore .text sections */
+   title = bpf_program__title(prog, false);
+   if (strstr(title, ".text") != NULL)
+   continue;
+
+   bpf_program__set_type(prog, BPF_PROG_TYPE_SCHED_CLS);
+
+   /* Expect verifier failure if test name has 'fail' */
+   if (strstr(title, "fail") != NULL) {
+   libbpf_set_print(NULL, NULL, NULL);
+   err = !bpf_program__load(prog, "GPL", 0);
+   libbpf_set_print(printf, printf, NULL);
+   } else {
+   err = bpf_program__load(prog, "GPL", 0);
+   }
+   CHECK(err, title, "\n");
+   }
+   bpf_object__close(obj);
+}
+
 int main(void)
 {
jit_enabled = is_jit_enabled();
@@ -1719,6 +1756,7 @@ int main(void)
test_get_stack_raw_tp();
test_task_fd_query_rawtp();
test_task_fd_query_tp();
+   test_reference_tracking();
 
printf("Summary: %d PASSED, %d FAILED\n", pass_cnt, error_cnt);
return error_cnt ? EXIT_FAILURE : EXIT_SUCCESS;
diff --git a/tools/testing/selftests/bpf/test_sk_lookup_kern.c 
b/tools/testing/selftests/bpf/test_sk_lookup_kern.c
new file mode 100644
index ..b745bdc08c2b
--- /dev/null
+++ b/tools/testing/selftests/bpf/test_sk_lookup_kern.c
@@ -0,0 +1,180 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+// Copyright (c) 2018 Covalent IO, Inc. http://covalent.io
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include "bpf_helpers.h"
+#include "bpf_endian.h"
+
+int _version SEC("version") = 1;
+char _license[] SEC("license") = "GPL";
+
+/* Fill 'tuple' with L3 info, and attempt to find L4. On fail, return NULL. */
+static struct bpf_sock_tuple *get_tuple(void *data, __u64 nh_off,
+   void *data_end, __u16 eth_proto,
+   bool *ipv4)
+{
+   struct bpf_sock_tuple *result;
+   __u8 proto = 0;
+   __u64 ihl_len;
+
+   if (eth_proto == bpf_htons(ETH_P_IP)) {
+   struct iphdr *iph = (struct iphdr *)(data + nh_off);
+
+   if (iph + 1 > data_end)
+   return NULL;
+   ihl_len = iph->ihl * 4;
+   proto = iph->protocol;
+   *ipv4 = true;
+   result = (struct bpf_sock_tuple *)&iph->saddr;
+   } else if (eth_proto == bpf_htons(ETH_P_IPV6)) {
+   struct ipv6hdr *ip6h = (struct ipv6hdr *)(data + nh_off);
+
+   if (ip6h + 1 > data_end)
+   return NULL

[PATCHv3 bpf-next 12/12] Documentation: Describe bpf reference tracking

2018-09-27 Thread Joe Stringer
Document the new pointer types in the verifier and how the pointer ID
tracking works to ensure that references which are taken are later
released.

Signed-off-by: Joe Stringer 
Acked-by: Alexei Starovoitov 
---
 Documentation/networking/filter.txt | 64 +
 1 file changed, 64 insertions(+)

diff --git a/Documentation/networking/filter.txt 
b/Documentation/networking/filter.txt
index e6b4ebb2b243..4443ce958862 100644
--- a/Documentation/networking/filter.txt
+++ b/Documentation/networking/filter.txt
@@ -1125,6 +1125,14 @@ pointer type.  The types of pointers describe their 
base, as follows:
 PTR_TO_STACKFrame pointer.
 PTR_TO_PACKET   skb->data.
 PTR_TO_PACKET_END   skb->data + headlen; arithmetic forbidden.
+PTR_TO_SOCKET   Pointer to struct bpf_sock_ops, implicitly refcounted.
+PTR_TO_SOCKET_OR_NULL
+Either a pointer to a socket, or NULL; socket lookup
+returns this type, which becomes a PTR_TO_SOCKET when
+checked != NULL. PTR_TO_SOCKET is reference-counted,
+so programs must release the reference through the
+socket release function before the end of the program.
+Arithmetic on these pointers is forbidden.
 However, a pointer may be offset from this base (as a result of pointer
 arithmetic), and this is tracked in two parts: the 'fixed offset' and 'variable
 offset'.  The former is used when an exactly-known value (e.g. an immediate
@@ -1171,6 +1179,13 @@ over the Ethernet header, then reads IHL and addes (IHL 
* 4), the resulting
 pointer will have a variable offset known to be 4n+2 for some n, so adding the 
2
 bytes (NET_IP_ALIGN) gives a 4-byte alignment and so word-sized accesses 
through
 that pointer are safe.
+The 'id' field is also used on PTR_TO_SOCKET and PTR_TO_SOCKET_OR_NULL, common
+to all copies of the pointer returned from a socket lookup. This has similar
+behaviour to the handling for PTR_TO_MAP_VALUE_OR_NULL->PTR_TO_MAP_VALUE, but
+it also handles reference tracking for the pointer. PTR_TO_SOCKET implicitly
+represents a reference to the corresponding 'struct sock'. To ensure that the
+reference is not leaked, it is imperative to NULL-check the reference and, in
+the non-NULL case, pass the valid reference to the socket release function.
 
 Direct packet access
 
@@ -1444,6 +1459,55 @@ Error:
   8: (7a) *(u64 *)(r0 +0) = 1
   R0 invalid mem access 'imm'
 
+Program that performs a socket lookup then sets the pointer to NULL
+without checking it first, leaking the reference acquired by the
+lookup:
+  BPF_MOV64_IMM(BPF_REG_2, 0),
+  BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_2, -8),
+  BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+  BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+  BPF_MOV64_IMM(BPF_REG_3, 4),
+  BPF_MOV64_IMM(BPF_REG_4, 0),
+  BPF_MOV64_IMM(BPF_REG_5, 0),
+  BPF_EMIT_CALL(BPF_FUNC_sk_lookup_tcp),
+  BPF_MOV64_IMM(BPF_REG_0, 0),
+  BPF_EXIT_INSN(),
+Error:
+  0: (b7) r2 = 0
+  1: (63) *(u32 *)(r10 -8) = r2
+  2: (bf) r2 = r10
+  3: (07) r2 += -8
+  4: (b7) r3 = 4
+  5: (b7) r4 = 0
+  6: (b7) r5 = 0
+  7: (85) call bpf_sk_lookup_tcp#65
+  8: (b7) r0 = 0
+  9: (95) exit
+  Unreleased reference id=1, alloc_insn=7
+
+Program that performs a socket lookup but does not NULL-check the returned
+value:
+  BPF_MOV64_IMM(BPF_REG_2, 0),
+  BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_2, -8),
+  BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+  BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+  BPF_MOV64_IMM(BPF_REG_3, 4),
+  BPF_MOV64_IMM(BPF_REG_4, 0),
+  BPF_MOV64_IMM(BPF_REG_5, 0),
+  BPF_EMIT_CALL(BPF_FUNC_sk_lookup_tcp),
+  BPF_EXIT_INSN(),
+Error:
+  0: (b7) r2 = 0
+  1: (63) *(u32 *)(r10 -8) = r2
+  2: (bf) r2 = r10
+  3: (07) r2 += -8
+  4: (b7) r3 = 4
+  5: (b7) r4 = 0
+  6: (b7) r5 = 0
+  7: (85) call bpf_sk_lookup_tcp#65
+  8: (95) exit
+  Unreleased reference id=1, alloc_insn=7
+
 Testing
 ---
 
-- 
2.17.1



[PATCHv3 bpf-next 07/12] bpf: Add helper to retrieve socket in BPF

2018-09-27 Thread Joe Stringer
This patch adds new BPF helper functions, bpf_sk_lookup_tcp() and
bpf_sk_lookup_udp(), which allow BPF programs to find out if there is a
socket listening on this host, and return a socket pointer which the
BPF program can then access to determine, for instance, whether to
forward or drop traffic. bpf_sk_lookup_xxx() may take a reference on the
socket, so when a BPF program makes use of this function, it must
subsequently pass the returned pointer into the newly added
bpf_sk_release() to return the reference.

By way of example, the following pseudocode would filter inbound
connections at XDP if there is no corresponding service listening for
the traffic:

  struct bpf_sock_tuple tuple;
  struct bpf_sock *sk;

  populate_tuple(ctx, &tuple); // Extract the 5tuple from the packet
  sk = bpf_sk_lookup_tcp(ctx, &tuple, sizeof tuple, netns, 0);
  if (!sk) {
// Couldn't find a socket listening for this traffic. Drop.
return TC_ACT_SHOT;
  }
  bpf_sk_release(sk);
  return TC_ACT_OK;

Signed-off-by: Joe Stringer 
---
v2: Rework 'struct bpf_sock_tuple' to allow passing a packet pointer
Limit netns_id field to 32 bits
Fix compile error with CONFIG_IPV6 enabled
Allow direct packet access from helper

v3: Fix release of caller_net when netns is not specified.
Use skb->sk to find caller net when skb->dev is unavailable.
Remove flags argument to sk_release()
Define the semantics of the new helpers more clearly.
---
 include/uapi/linux/bpf.h  |  93 -
 kernel/bpf/verifier.c |   8 +-
 net/core/filter.c | 151 ++
 tools/include/uapi/linux/bpf.h|  93 -
 tools/testing/selftests/bpf/bpf_helpers.h |  12 ++
 5 files changed, 354 insertions(+), 3 deletions(-)

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index e2070d819e04..f9187b41dff6 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -2144,6 +2144,77 @@ union bpf_attr {
  * request in the skb.
  * Return
  * 0 on success, or a negative error in case of failure.
+ *
+ * struct bpf_sock *bpf_sk_lookup_tcp(void *ctx, struct bpf_sock_tuple *tuple, 
u32 tuple_size, u32 netns, u64 flags)
+ * Description
+ * Look for TCP socket matching *tuple*, optionally in a child
+ * network namespace *netns*. The return value must be checked,
+ * and if non-NULL, released via **bpf_sk_release**\ ().
+ *
+ * The *ctx* should point to the context of the program, such as
+ * the skb or socket (depending on the hook in use). This is used
+ * to determine the base network namespace for the lookup.
+ *
+ * *tuple_size* must be one of:
+ *
+ * **sizeof**\ (*tuple*\ **->ipv4**)
+ * Look for an IPv4 socket.
+ * **sizeof**\ (*tuple*\ **->ipv6**)
+ * Look for an IPv6 socket.
+ *
+ * If the *netns* is zero, then the socket lookup table in the
+ * netns associated with the *ctx* will be used. For the TC hooks,
+ * this is the netns of the device in the skb. For socket hooks,
+ * this is the netns of the socket. If *netns* is non-zero, then
+ * it specifies the ID of the netns relative to the netns
+ * associated with the *ctx*.
+ *
+ * All values for *flags* are reserved for future usage, and must
+ * be left at zero.
+ *
+ * This helper is available only if the kernel was compiled with
+ * **CONFIG_NET** configuration option.
+ * Return
+ * Pointer to *struct bpf_sock*, or NULL in case of failure.
+ *
+ * struct bpf_sock *bpf_sk_lookup_udp(void *ctx, struct bpf_sock_tuple *tuple, 
u32 tuple_size, u32 netns, u64 flags)
+ * Description
+ * Look for UDP socket matching *tuple*, optionally in a child
+ * network namespace *netns*. The return value must be checked,
+ * and if non-NULL, released via **bpf_sk_release**\ ().
+ *
+ * The *ctx* should point to the context of the program, such as
+ * the skb or socket (depending on the hook in use). This is used
+ * to determine the base network namespace for the lookup.
+ *
+ * *tuple_size* must be one of:
+ *
+ * **sizeof**\ (*tuple*\ **->ipv4**)
+ * Look for an IPv4 socket.
+ * **sizeof**\ (*tuple*\ **->ipv6**)
+ * Look for an IPv6 socket.
+ *
+ * If the *netns* is zero, then the socket lookup table in the
+ * netns associated with the *ctx* will be used. For the TC hooks,
+ * this is the netns of the device in the skb. For socket hooks,
+ * this is the netns of the socket. If *netns* is non-zero, then
+ * it specifies the ID of the netns relative to the netns
+

[PATCHv3 bpf-next 03/12] bpf: Generalize ptr_or_null regs check

2018-09-27 Thread Joe Stringer
This check will be reused by an upcoming commit for conditional jump
checks for sockets. Refactor it a bit to simplify the later commit.

Signed-off-by: Joe Stringer 
Acked-by: Alexei Starovoitov 
---
 kernel/bpf/verifier.c | 43 +--
 1 file changed, 25 insertions(+), 18 deletions(-)

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 87b75efc1dc1..bbb0a812ee81 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -249,6 +249,11 @@ static bool type_is_pkt_pointer(enum bpf_reg_type type)
   type == PTR_TO_PACKET_META;
 }
 
+static bool reg_type_may_be_null(enum bpf_reg_type type)
+{
+   return type == PTR_TO_MAP_VALUE_OR_NULL;
+}
+
 /* string representation of 'enum bpf_reg_type' */
 static const char * const reg_type_str[] = {
[NOT_INIT]  = "?",
@@ -3600,12 +3605,10 @@ static void reg_combine_min_max(struct bpf_reg_state 
*true_src,
}
 }
 
-static void mark_map_reg(struct bpf_reg_state *regs, u32 regno, u32 id,
-bool is_null)
+static void mark_ptr_or_null_reg(struct bpf_reg_state *reg, u32 id,
+bool is_null)
 {
-   struct bpf_reg_state *reg = ®s[regno];
-
-   if (reg->type == PTR_TO_MAP_VALUE_OR_NULL && reg->id == id) {
+   if (reg_type_may_be_null(reg->type) && reg->id == id) {
/* Old offset (both fixed and variable parts) should
 * have been known-zero, because we don't allow pointer
 * arithmetic on pointers that might be NULL.
@@ -3618,11 +3621,13 @@ static void mark_map_reg(struct bpf_reg_state *regs, 
u32 regno, u32 id,
}
if (is_null) {
reg->type = SCALAR_VALUE;
-   } else if (reg->map_ptr->inner_map_meta) {
-   reg->type = CONST_PTR_TO_MAP;
-   reg->map_ptr = reg->map_ptr->inner_map_meta;
-   } else {
-   reg->type = PTR_TO_MAP_VALUE;
+   } else if (reg->type == PTR_TO_MAP_VALUE_OR_NULL) {
+   if (reg->map_ptr->inner_map_meta) {
+   reg->type = CONST_PTR_TO_MAP;
+   reg->map_ptr = reg->map_ptr->inner_map_meta;
+   } else {
+   reg->type = PTR_TO_MAP_VALUE;
+   }
}
/* We don't need id from this point onwards anymore, thus we
 * should better reset it, so that state pruning has chances
@@ -3635,8 +3640,8 @@ static void mark_map_reg(struct bpf_reg_state *regs, u32 
regno, u32 id,
 /* The logic is similar to find_good_pkt_pointers(), both could eventually
  * be folded together at some point.
  */
-static void mark_map_regs(struct bpf_verifier_state *vstate, u32 regno,
- bool is_null)
+static void mark_ptr_or_null_regs(struct bpf_verifier_state *vstate, u32 regno,
+ bool is_null)
 {
struct bpf_func_state *state = vstate->frame[vstate->curframe];
struct bpf_reg_state *reg, *regs = state->regs;
@@ -3644,14 +3649,14 @@ static void mark_map_regs(struct bpf_verifier_state 
*vstate, u32 regno,
int i, j;
 
for (i = 0; i < MAX_BPF_REG; i++)
-   mark_map_reg(regs, i, id, is_null);
+   mark_ptr_or_null_reg(®s[i], id, is_null);
 
for (j = 0; j <= vstate->curframe; j++) {
state = vstate->frame[j];
for_each_spilled_reg(i, state, reg) {
if (!reg)
continue;
-   mark_map_reg(&state->stack[i].spilled_ptr, 0, id, 
is_null);
+   mark_ptr_or_null_reg(reg, id, is_null);
}
}
 }
@@ -3853,12 +3858,14 @@ static int check_cond_jmp_op(struct bpf_verifier_env 
*env,
/* detect if R == 0 where R is returned from bpf_map_lookup_elem() */
if (BPF_SRC(insn->code) == BPF_K &&
insn->imm == 0 && (opcode == BPF_JEQ || opcode == BPF_JNE) &&
-   dst_reg->type == PTR_TO_MAP_VALUE_OR_NULL) {
-   /* Mark all identical map registers in each branch as either
+   reg_type_may_be_null(dst_reg->type)) {
+   /* Mark all identical registers in each branch as either
 * safe or unknown depending R == 0 or R != 0 conditional.
 */
-   mark_map_regs(this_branch, insn->dst_reg, opcode == BPF_JNE);
-   mark_map_regs(other_branch, insn->dst_reg, opcode == BPF_JEQ);
+   mark_ptr_or_null_regs(this_branch, insn->dst_reg,
+ opcode == BPF_JNE);
+   mark_ptr_or_null_regs(other_branch, insn->dst_reg,
+ opcode == BPF_JEQ);
} else if (!try_match_pkt_pointers(insn, dst_reg, ®s[insn->src_reg],

[PATCHv3 bpf-next 09/12] selftests/bpf: Add tests for reference tracking

2018-09-27 Thread Joe Stringer
reference tracking: leak potential reference
reference tracking: leak potential reference on stack
reference tracking: leak potential reference on stack 2
reference tracking: zero potential reference
reference tracking: copy and zero potential references
reference tracking: release reference without check
reference tracking: release reference
reference tracking: release reference twice
reference tracking: release reference twice inside branch
reference tracking: alloc, check, free in one subbranch
reference tracking: alloc, check, free in both subbranches
reference tracking in call: free reference in subprog
reference tracking in call: free reference in subprog and outside
reference tracking in call: alloc & leak reference in subprog
reference tracking in call: alloc in subprog, release outside
reference tracking in call: sk_ptr leak into caller stack
reference tracking in call: sk_ptr spill into caller stack
reference tracking: allow LD_ABS
reference tracking: forbid LD_ABS while holding reference
reference tracking: allow LD_IND
reference tracking: forbid LD_IND while holding reference
reference tracking: check reference or tail call
reference tracking: release reference then tail call
reference tracking: leak possible reference over tail call
reference tracking: leak checked reference over tail call
reference tracking: mangle and release sock_or_null
reference tracking: mangle and release sock
reference tracking: access member
reference tracking: write to member
reference tracking: invalid 64-bit access of member
reference tracking: access after release
reference tracking: direct access for lookup

Signed-off-by: Joe Stringer 

---
v3: Rebase against bpf_sk_release() flags argument removal.
Removed Alexei's ack since there are many new tests:
* "reference tracking: allow LD_ABS",
* "reference tracking: forbid LD_ABS while holding reference",
* "reference tracking: allow LD_IND",
* "reference tracking: forbid LD_IND while holding reference",
* "reference tracking: check reference or tail call",
* "reference tracking: release reference then tail call",
* "reference tracking: leak possible reference over tail call",
* "reference tracking: leak checked reference over tail call",
* "reference tracking: mangle and release sock_or_null",
* "reference tracking: mangle and release sock",
* "reference tracking: access member",
* "reference tracking: write to member",
* "reference tracking: invalid 64-bit access of member",
* "reference tracking: access after release",
* "reference tracking: direct access for lookup",
---
 tools/testing/selftests/bpf/test_verifier.c | 625 
 1 file changed, 625 insertions(+)

diff --git a/tools/testing/selftests/bpf/test_verifier.c 
b/tools/testing/selftests/bpf/test_verifier.c
index 020b1467e565..9fad54b0bbd0 100644
--- a/tools/testing/selftests/bpf/test_verifier.c
+++ b/tools/testing/selftests/bpf/test_verifier.c
@@ -3,6 +3,7 @@
  *
  * Copyright (c) 2014 PLUMgrid, http://plumgrid.com
  * Copyright (c) 2017 Facebook
+ * Copyright (c) 2018 Covalent IO, Inc. http://covalent.io
  *
  * This program is free software; you can redistribute it and/or
  * modify it under the terms of version 2 of the GNU General Public
@@ -178,6 +179,24 @@ static void bpf_fill_rand_ld_dw(struct bpf_test *self)
self->retval = (uint32_t)res;
 }
 
+/* BPF_SK_LOOKUP contains 13 instructions, if you need to fix up maps */
+#define BPF_SK_LOOKUP  \
+   /* struct bpf_sock_tuple tuple = {} */  \
+   BPF_MOV64_IMM(BPF_REG_2, 0),\
+   BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_2, -8),  \
+   BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_2, -16),\
+   BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_2, -24),\
+   BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_2, -32),\
+   BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_2, -40),\
+   BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_2, -48),\
+   /* sk = sk_lookup_tcp(ctx, &tuple, sizeof tuple, 0, 0) */   \
+   BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),   \
+   BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -48), \
+   BPF_MOV64_IMM(BPF_REG_3, sizeof(struct bpf_sock_tuple)),\
+   BPF_MOV64_IMM(BPF_REG_4, 0),\
+   BPF_MOV64_IMM(BPF_REG_5, 0),\
+   BPF_EMIT_CALL(BPF_FUNC_sk_lookup_tcp)
+
 static struct bpf_test tests[] = {
{
"add+sub+mul",
@@ -12557,6 +12576,214 @@ static struct bpf_test tests[] = {
.prog_type = BPF_PROG_TYPE_SCHED_CLS,
.result = ACCEPT,
},
+   {
+   "reference tracking: leak potential reference",
+   .insns = {
+   BPF_SK

[PATCHv3 bpf-next 06/12] bpf: Add reference tracking to verifier

2018-09-27 Thread Joe Stringer
Allow helper functions to acquire a reference and return it into a
register. Specific pointer types such as the PTR_TO_SOCKET will
implicitly represent such a reference. The verifier must ensure that
these references are released exactly once in each path through the
program.

To achieve this, this commit assigns an id to the pointer and tracks it
in the 'bpf_func_state', then when the function or program exits,
verifies that all of the acquired references have been freed. When the
pointer is passed to a function that frees the reference, it is removed
from the 'bpf_func_state' and all existing copies of the pointer in
registers are marked invalid.

Signed-off-by: Joe Stringer 
Acked-by: Alexei Starovoitov 
---
v2: Replace ptr_id defensive coding when releasing reference state with an
internal error (-EFAULT)

v3: No changes.
---
 include/linux/bpf_verifier.h |  24 ++-
 kernel/bpf/verifier.c| 303 ---
 2 files changed, 306 insertions(+), 21 deletions(-)

diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index 23a2b17bfd75..23f222e0cb0b 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -104,6 +104,17 @@ struct bpf_stack_state {
u8 slot_type[BPF_REG_SIZE];
 };
 
+struct bpf_reference_state {
+   /* Track each reference created with a unique id, even if the same
+* instruction creates the reference multiple times (eg, via CALL).
+*/
+   int id;
+   /* Instruction where the allocation of this reference occurred. This
+* is used purely to inform the user of a reference leak.
+*/
+   int insn_idx;
+};
+
 /* state of the program:
  * type of all registers and stack info
  */
@@ -121,7 +132,9 @@ struct bpf_func_state {
 */
u32 subprogno;
 
-   /* should be second to last. See copy_func_state() */
+   /* The following fields should be last. See copy_func_state() */
+   int acquired_refs;
+   struct bpf_reference_state *refs;
int allocated_stack;
struct bpf_stack_state *stack;
 };
@@ -217,11 +230,16 @@ __printf(2, 0) void bpf_verifier_vlog(struct 
bpf_verifier_log *log,
 __printf(2, 3) void bpf_verifier_log_write(struct bpf_verifier_env *env,
   const char *fmt, ...);
 
-static inline struct bpf_reg_state *cur_regs(struct bpf_verifier_env *env)
+static inline struct bpf_func_state *cur_func(struct bpf_verifier_env *env)
 {
struct bpf_verifier_state *cur = env->cur_state;
 
-   return cur->frame[cur->curframe]->regs;
+   return cur->frame[cur->curframe];
+}
+
+static inline struct bpf_reg_state *cur_regs(struct bpf_verifier_env *env)
+{
+   return cur_func(env)->regs;
 }
 
 int bpf_prog_offload_verifier_prep(struct bpf_verifier_env *env);
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index cf8704d137fa..dcc5e8cab537 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -1,5 +1,6 @@
 /* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
  * Copyright (c) 2016 Facebook
+ * Copyright (c) 2018 Covalent IO, Inc. http://covalent.io
  *
  * This program is free software; you can redistribute it and/or
  * modify it under the terms of version 2 of the GNU General Public
@@ -140,6 +141,18 @@ static const struct bpf_verifier_ops * const 
bpf_verifier_ops[] = {
  *
  * After the call R0 is set to return type of the function and registers R1-R5
  * are set to NOT_INIT to indicate that they are no longer readable.
+ *
+ * The following reference types represent a potential reference to a kernel
+ * resource which, after first being allocated, must be checked and freed by
+ * the BPF program:
+ * - PTR_TO_SOCKET_OR_NULL, PTR_TO_SOCKET
+ *
+ * When the verifier sees a helper call return a reference type, it allocates a
+ * pointer id for the reference and stores it in the current function state.
+ * Similar to the way that PTR_TO_MAP_VALUE_OR_NULL is converted into
+ * PTR_TO_MAP_VALUE, PTR_TO_SOCKET_OR_NULL becomes PTR_TO_SOCKET when the type
+ * passes through a NULL-check conditional. For the branch wherein the state is
+ * changed to CONST_IMM, the verifier releases the reference.
  */
 
 /* verifier_state + insn_idx are pushed to stack when branch is encountered */
@@ -189,6 +202,7 @@ struct bpf_call_arg_meta {
int access_size;
s64 msize_smax_value;
u64 msize_umax_value;
+   int ptr_id;
 };
 
 static DEFINE_MUTEX(bpf_verifier_lock);
@@ -251,7 +265,42 @@ static bool type_is_pkt_pointer(enum bpf_reg_type type)
 
 static bool reg_type_may_be_null(enum bpf_reg_type type)
 {
-   return type == PTR_TO_MAP_VALUE_OR_NULL;
+   return type == PTR_TO_MAP_VALUE_OR_NULL ||
+  type == PTR_TO_SOCKET_OR_NULL;
+}
+
+static bool type_is_refcounted(enum bpf_reg_type type)
+{
+   return type == PTR_TO_SOCKET;
+}
+
+static bool type_is_refcounted_or_null(enum bpf_reg_type type)
+{
+   return type == PTR_TO_SOC

[PATCHv3 bpf-next 10/12] libbpf: Support loading individual progs

2018-09-27 Thread Joe Stringer
Allow the individual program load to be invoked. This will help with
testing, where a single ELF may contain several sections, some of which
denote subprograms that are expected to fail verification, along with
some which are expected to pass verification. By allowing programs to be
iterated and individually loaded, each program can be independently
checked against its expected verification result.

Signed-off-by: Joe Stringer 
Acked-by: Alexei Starovoitov 
---
 tools/lib/bpf/libbpf.c | 4 ++--
 tools/lib/bpf/libbpf.h | 3 +++
 2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
index 425d5ca45c97..9e68fd9fcfca 100644
--- a/tools/lib/bpf/libbpf.c
+++ b/tools/lib/bpf/libbpf.c
@@ -228,7 +228,7 @@ struct bpf_object {
 };
 #define obj_elf_valid(o)   ((o)->efile.elf)
 
-static void bpf_program__unload(struct bpf_program *prog)
+void bpf_program__unload(struct bpf_program *prog)
 {
int i;
 
@@ -1375,7 +1375,7 @@ load_program(enum bpf_prog_type type, enum 
bpf_attach_type expected_attach_type,
return ret;
 }
 
-static int
+int
 bpf_program__load(struct bpf_program *prog,
  char *license, u32 kern_version)
 {
diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h
index 511c1294dcbf..2ed24d3f80b3 100644
--- a/tools/lib/bpf/libbpf.h
+++ b/tools/lib/bpf/libbpf.h
@@ -128,10 +128,13 @@ void bpf_program__set_ifindex(struct bpf_program *prog, 
__u32 ifindex);
 
 const char *bpf_program__title(struct bpf_program *prog, bool needs_copy);
 
+int bpf_program__load(struct bpf_program *prog, char *license,
+ u32 kern_version);
 int bpf_program__fd(struct bpf_program *prog);
 int bpf_program__pin_instance(struct bpf_program *prog, const char *path,
  int instance);
 int bpf_program__pin(struct bpf_program *prog, const char *path);
+void bpf_program__unload(struct bpf_program *prog);
 
 struct bpf_insn;
 
-- 
2.17.1



Re: KMSAN: uninit-value in __dev_mc_add

2018-09-27 Thread Vladis Dronov
Hello, Eric, all,

> I dunno, your patch looks quite not the right fix.

I agree, it looks more like a dirty hack. Unfortunately, I lack the deep
expertise in the network stack subsystem, so I've posted the patch to,
sort of, start a discussion and probably get some hints.
 
> If TUN is able to change dev->type,  how comes it does not set the
> appropriate dev->addr_len at the same time ?

Well,... probably, nobody cared to do so:

[drivers/net/tun.c]
case TUNSETLINK:
...
tun->dev->type = (int) arg; //<--- that's all!
tun_debug(KERN_INFO, tun, "linktype set to %d\n",
  tun->dev->type);
ret = 0;
}
break;

> Really the bug seems to be deeper, and without setting proper
> dev->addr_len, we'll need more 'fixes' like yours.

Absolutely. Unfortunately, I wasn't able to just write such a deeper patch.
Let me share what I have found, and let me hope to get some advice.

- So setting just the tun->dev->type makes the dev struct inconsistent.

- There are more fields to adjust, at least dev->broadcast. Also, there are
  a number of *_ops fields which are all set for the Ethernet type, most
  probably they must be adjusted also.

- There is no get_addr_len_by_link_type() or a simple way to get link layer
  properties by dev->type. Such settings are scattered in *_setup and
  *_init functions, like ipgre_tunnel_init() { ... dev->addr_len = 4; ...}

Having these, I can imagine 2 ways for a proper fix.

1) Destroy the net_device in question and recreate it when changing a link
type. This way all the dev fields are set right. Create it in a similar way
as rtnl_newlink() does. Again, we do not have get_X_by_link_type(), so it
probably will be some large switch()/case:

  $ grep '^#define ARPHRD_' include/uapi/linux/if_arp.h | wc -l
  59

2) Leave tun an Ethernet device, add some tun->pretend_to_be_this_link_type
field and change only it on TUNSETLINK. And use this field in cases for which
TUNSETLINK was invented in the first place. Unfortunately, I do not have such
a list. The initial commit ff4cc3ac93e1 says:

  For use with
  wireless and other networking types it should be possible to set the
  ARP type via an ioctl.

Surely, there can be something else which I do not see. Could anyone offer
some advice on this?

Best regards,
Vladis Dronov | Red Hat, Inc. | Product Security Engineer


Re: [PATCH] netfilter: check if the socket netns is correct.

2018-09-27 Thread Guenter Roeck
On Thu, Sep 27, 2018 at 07:58:24PM -0300, Flavio Leitner wrote:
> On Thu, Sep 27, 2018 at 01:46:29PM -0700, Guenter Roeck wrote:
> > Hi Flavio,
> > 
> > On Wed, Jun 27, 2018 at 10:34:25AM -0300, Flavio Leitner wrote:
> > > Netfilter assumes that if the socket is present in the skb, then
> > > it can be used because that reference is cleaned up while the skb
> > > is crossing netns.
> > > 
> > > We want to change that to preserve the socket reference in a future
> > > patch, so this is a preparation updating netfilter to check if the
> > > socket netns matches before use it.
> > > 
> > > Signed-off-by: Flavio Leitner 
> > > Acked-by: Florian Westphal 
> > > Signed-off-by: David S. Miller 
> > > ---
> > ...
> > > --- a/net/netfilter/xt_socket.c
> > > +++ b/net/netfilter/xt_socket.c
> > > @@ -56,8 +56,12 @@ socket_match(const struct sk_buff *skb, struct 
> > > xt_action_param *par,
> > >   struct sk_buff *pskb = (struct sk_buff *)skb;
> > >   struct sock *sk = skb->sk;
> > >  
> > > + if (!net_eq(xt_net(par), sock_net(sk)))
> > > + sk = NULL;
> > > +
> > 
> > I am having trouble with this code. With CONFIG_NET_NS enabled, it crashes
> > for me in read_pnet() because sk is NULL.
> > 
> > >   if (!sk)
> > >   sk = nf_sk_lookup_slow_v4(xt_net(par), skb, xt_in(par));
> > 
> > The old code seems to suggest that sk == NULL was possible.
> > 
> > I see the problem with the Chrome OS kernel rebased to v4.19-rc5, so I
> > can not guarantee that this really an upstream problem. The change seems
> > odd, though. Are you sure that it is not (or, rather, no longer) necessary
> > to check if sk == NULL before dereferencing it in sock_net() ?
> 
> Oops, it is necessary but if it's not and the netns doesn't match, we need
> do the lookup. So, could you check if this fixes the problem for you?
> 
> From a5f927e7f1368d753f87cb978d630d786d5adb62 Mon Sep 17 00:00:00 2001
> From: Flavio Leitner 
> Date: Thu, 27 Sep 2018 19:36:28 -0300
> Subject: [PATCH] xt_socket: check sk before checking for netns.
> 
> Only check for the network namespace if the socket is available.
> 
> Fixes: f564650106a6 ("netfilter: check if the socket netns is correct.")
> Reported-by: Guenter Roeck 
> Signed-off-by: Flavio Leitner 

This fixes the problem for me.

Tested-by: Guenter Roeck 

Thanks,
Guenter

> ---
>  net/netfilter/xt_socket.c | 4 ++--
>  1 file changed, 2 insertions(+), 2 deletions(-)
> 
> diff --git a/net/netfilter/xt_socket.c b/net/netfilter/xt_socket.c
> index 0472f3472842..ada144e5645b 100644
> --- a/net/netfilter/xt_socket.c
> +++ b/net/netfilter/xt_socket.c
> @@ -56,7 +56,7 @@ socket_match(const struct sk_buff *skb, struct 
> xt_action_param *par,
>   struct sk_buff *pskb = (struct sk_buff *)skb;
>   struct sock *sk = skb->sk;
>  
> - if (!net_eq(xt_net(par), sock_net(sk)))
> + if (sk && !net_eq(xt_net(par), sock_net(sk)))
>   sk = NULL;
>  
>   if (!sk)
> @@ -117,7 +117,7 @@ socket_mt6_v1_v2_v3(const struct sk_buff *skb, struct 
> xt_action_param *par)
>   struct sk_buff *pskb = (struct sk_buff *)skb;
>   struct sock *sk = skb->sk;
>  
> - if (!net_eq(xt_net(par), sock_net(sk)))
> + if (sk && !net_eq(xt_net(par), sock_net(sk)))
>   sk = NULL;
>  
>   if (!sk)
> -- 
> 2.14.4
> 


[PATCH] cfg80211: fix use-after-free in reg_process_hint()

2018-09-27 Thread Yu Zhao
reg_process_hint_country_ie() can free regulatory_request and return
REG_REQ_ALREADY_SET. We shouldn't use regulatory_request after it's
called. A KASAN error was observed when this happens.

BUG: KASAN: use-after-free in reg_process_hint+0x839/0x8aa [cfg80211]
Read of size 4 at addr 8800c430d434 by task kworker/1:3/89

Workqueue: events reg_todo [cfg80211]
Call Trace:
 dump_stack+0xc1/0x10c
 ? _atomic_dec_and_lock+0x1ad/0x1ad
 ? _raw_spin_lock_irqsave+0xa0/0xd2
 print_address_description+0x86/0x26f
 ? reg_process_hint+0x839/0x8aa [cfg80211]
 kasan_report+0x241/0x29b
 reg_process_hint+0x839/0x8aa [cfg80211]
 reg_todo+0x204/0x5b9 [cfg80211]
 process_one_work+0x55f/0x8d0
 ? worker_detach_from_pool+0x1b5/0x1b5
 ? _raw_spin_unlock_irq+0x65/0xdd
 ? _raw_spin_unlock_irqrestore+0xf3/0xf3
 worker_thread+0x5dd/0x841
 ? kthread_parkme+0x1d/0x1d
 kthread+0x270/0x285
 ? pr_cont_work+0xe3/0xe3
 ? rcu_read_unlock_sched_notrace+0xca/0xca
 ret_from_fork+0x22/0x40

Allocated by task 2718:
 set_track+0x63/0xfa
 __kmalloc+0x119/0x1ac
 regulatory_hint_country_ie+0x38/0x329 [cfg80211]
 __cfg80211_connect_result+0x854/0xadd [cfg80211]
 cfg80211_rx_assoc_resp+0x3bc/0x4f0 [cfg80211]
smsc95xx v1.0.6
 ieee80211_sta_rx_queued_mgmt+0x1803/0x7ed5 [mac80211]
 ieee80211_iface_work+0x411/0x696 [mac80211]
 process_one_work+0x55f/0x8d0
 worker_thread+0x5dd/0x841
 kthread+0x270/0x285
 ret_from_fork+0x22/0x40

Freed by task 89:
 set_track+0x63/0xfa
 kasan_slab_free+0x6a/0x87
 kfree+0xdc/0x470
 reg_process_hint+0x31e/0x8aa [cfg80211]
 reg_todo+0x204/0x5b9 [cfg80211]
 process_one_work+0x55f/0x8d0
 worker_thread+0x5dd/0x841
 kthread+0x270/0x285
 ret_from_fork+0x22/0x40


Signed-off-by: Yu Zhao 
---
 net/wireless/reg.c | 7 ---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/net/wireless/reg.c b/net/wireless/reg.c
index 765dedb12361..24cfa2776f50 100644
--- a/net/wireless/reg.c
+++ b/net/wireless/reg.c
@@ -2661,11 +2661,12 @@ static void reg_process_hint(struct regulatory_request 
*reg_request)
 {
struct wiphy *wiphy = NULL;
enum reg_request_treatment treatment;
+   enum nl80211_reg_initiator initiator = reg_request->initiator;
 
if (reg_request->wiphy_idx != WIPHY_IDX_INVALID)
wiphy = wiphy_idx_to_wiphy(reg_request->wiphy_idx);
 
-   switch (reg_request->initiator) {
+   switch (initiator) {
case NL80211_REGDOM_SET_BY_CORE:
treatment = reg_process_hint_core(reg_request);
break;
@@ -2683,7 +2684,7 @@ static void reg_process_hint(struct regulatory_request 
*reg_request)
treatment = reg_process_hint_country_ie(wiphy, reg_request);
break;
default:
-   WARN(1, "invalid initiator %d\n", reg_request->initiator);
+   WARN(1, "invalid initiator %d\n", initiator);
goto out_free;
}
 
@@ -2698,7 +2699,7 @@ static void reg_process_hint(struct regulatory_request 
*reg_request)
 */
if (treatment == REG_REQ_ALREADY_SET && wiphy &&
wiphy->regulatory_flags & REGULATORY_STRICT_REG) {
-   wiphy_update_regulatory(wiphy, reg_request->initiator);
+   wiphy_update_regulatory(wiphy, initiator);
wiphy_all_share_dfs_chan_state(wiphy);
reg_check_channels();
}
-- 
2.19.0.605.g01d371f741-goog



[PATCH 07/11] net: remove 1 always zero parameter from ip6_redirect_no_header()

2018-09-27 Thread Maciej Żenczykowski
From: Maciej Żenczykowski 

(the parameter in question is mark)

Signed-off-by: Maciej Żenczykowski 
---
 include/net/ip6_route.h | 3 +--
 net/ipv6/ndisc.c| 2 +-
 net/ipv6/route.c| 4 +---
 3 files changed, 3 insertions(+), 6 deletions(-)

diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h
index 7b9c82de11cc..cef186dbd2ce 100644
--- a/include/net/ip6_route.h
+++ b/include/net/ip6_route.h
@@ -165,8 +165,7 @@ void ip6_update_pmtu(struct sk_buff *skb, struct net *net, 
__be32 mtu, int oif,
 void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu);
 void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark,
  kuid_t uid);
-void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif,
-   u32 mark);
+void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif);
 void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk);
 
 struct netlink_callback;
diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c
index 0ec273997d1d..51863ada15a4 100644
--- a/net/ipv6/ndisc.c
+++ b/net/ipv6/ndisc.c
@@ -1533,7 +1533,7 @@ static void ndisc_redirect_rcv(struct sk_buff *skb)
 
if (!ndopts.nd_opts_rh) {
ip6_redirect_no_header(skb, dev_net(skb->dev),
-   skb->dev->ifindex, 0);
+   skb->dev->ifindex);
return;
}
 
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index dd8c04f253d5..27f1260e053a 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -2520,8 +2520,7 @@ void ip6_redirect(struct sk_buff *skb, struct net *net, 
int oif, u32 mark,
 }
 EXPORT_SYMBOL_GPL(ip6_redirect);
 
-void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif,
-   u32 mark)
+void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif)
 {
const struct ipv6hdr *iph = ipv6_hdr(skb);
const struct rd_msg *msg = (struct rd_msg *)icmp6_hdr(skb);
@@ -2529,7 +2528,6 @@ void ip6_redirect_no_header(struct sk_buff *skb, struct 
net *net, int oif,
struct flowi6 fl6 = {
.flowi6_iif = LOOPBACK_IFINDEX,
.flowi6_oif = oif,
-   .flowi6_mark = mark,
.daddr = msg->dest,
.saddr = iph->daddr,
.flowi6_uid = sock_net_uid(net, NULL),
-- 
2.19.0.605.g01d371f741-goog



[PATCH 05/11] net: ip6_redirect() - use new style struct initializer instead of memset

2018-09-27 Thread Maciej Żenczykowski
From: Maciej Żenczykowski 

(allows for better compiler optimization)

Signed-off-by: Maciej Żenczykowski 
---
 net/ipv6/route.c | 19 +--
 1 file changed, 9 insertions(+), 10 deletions(-)

diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 9cb024451fc5..e148d197d628 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -2504,16 +2504,15 @@ void ip6_redirect(struct sk_buff *skb, struct net *net, 
int oif, u32 mark,
 {
const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
struct dst_entry *dst;
-   struct flowi6 fl6;
-
-   memset(&fl6, 0, sizeof(fl6));
-   fl6.flowi6_iif = LOOPBACK_IFINDEX;
-   fl6.flowi6_oif = oif;
-   fl6.flowi6_mark = mark;
-   fl6.daddr = iph->daddr;
-   fl6.saddr = iph->saddr;
-   fl6.flowlabel = ip6_flowinfo(iph);
-   fl6.flowi6_uid = uid;
+   struct flowi6 fl6 = {
+   .flowi6_iif = LOOPBACK_IFINDEX,
+   .flowi6_oif = oif,
+   .flowi6_mark = mark,
+   .daddr = iph->daddr,
+   .saddr = iph->saddr,
+   .flowlabel = ip6_flowinfo(iph),
+   .flowi6_uid = uid,
+   };
 
dst = ip6_route_redirect(net, &fl6, skb, &ipv6_hdr(skb)->saddr);
rt6_do_redirect(dst, NULL, skb);
-- 
2.19.0.605.g01d371f741-goog



[PATCH 11/11] net: inet6_rtm_getroute() - use new style struct initializer instead of memset

2018-09-27 Thread Maciej Żenczykowski
From: Maciej Żenczykowski 

Signed-off-by: Maciej Żenczykowski 
---
 net/ipv6/route.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 9aca81772c93..aca6a84de794 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -4819,7 +4819,7 @@ static int inet6_rtm_getroute(struct sk_buff *in_skb, 
struct nlmsghdr *nlh,
struct rt6_info *rt;
struct sk_buff *skb;
struct rtmsg *rtm;
-   struct flowi6 fl6;
+   struct flowi6 fl6 = {};
bool fibmatch;
 
err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy,
@@ -4828,7 +4828,6 @@ static int inet6_rtm_getroute(struct sk_buff *in_skb, 
struct nlmsghdr *nlh,
goto errout;
 
err = -EINVAL;
-   memset(&fl6, 0, sizeof(fl6));
rtm = nlmsg_data(nlh);
fl6.flowlabel = ip6_make_flowinfo(rtm->rtm_tos, 0);
fibmatch = !!(rtm->rtm_flags & RTM_F_FIB_MATCH);
-- 
2.19.0.605.g01d371f741-goog



[PATCH 06/11] net: ip6_redirect_no_header() - use new style struct initializer instead of memset

2018-09-27 Thread Maciej Żenczykowski
From: Maciej Żenczykowski 

(allows for better compiler optimization)

Signed-off-by: Maciej Żenczykowski 
---
 net/ipv6/route.c | 17 -
 1 file changed, 8 insertions(+), 9 deletions(-)

diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index e148d197d628..dd8c04f253d5 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -2526,15 +2526,14 @@ void ip6_redirect_no_header(struct sk_buff *skb, struct 
net *net, int oif,
const struct ipv6hdr *iph = ipv6_hdr(skb);
const struct rd_msg *msg = (struct rd_msg *)icmp6_hdr(skb);
struct dst_entry *dst;
-   struct flowi6 fl6;
-
-   memset(&fl6, 0, sizeof(fl6));
-   fl6.flowi6_iif = LOOPBACK_IFINDEX;
-   fl6.flowi6_oif = oif;
-   fl6.flowi6_mark = mark;
-   fl6.daddr = msg->dest;
-   fl6.saddr = iph->daddr;
-   fl6.flowi6_uid = sock_net_uid(net, NULL);
+   struct flowi6 fl6 = {
+   .flowi6_iif = LOOPBACK_IFINDEX,
+   .flowi6_oif = oif,
+   .flowi6_mark = mark,
+   .daddr = msg->dest,
+   .saddr = iph->daddr,
+   .flowi6_uid = sock_net_uid(net, NULL),
+   };
 
dst = ip6_route_redirect(net, &fl6, skb, &iph->saddr);
rt6_do_redirect(dst, NULL, skb);
-- 
2.19.0.605.g01d371f741-goog



[PATCH 08/11] net: ip6_update_pmtu() - use new style struct initializer instead of memset

2018-09-27 Thread Maciej Żenczykowski
From: Maciej Żenczykowski 

(allows for better compiler optimization)

Signed-off-by: Maciej Żenczykowski 
---
 net/ipv6/route.c | 17 -
 1 file changed, 8 insertions(+), 9 deletions(-)

diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 27f1260e053a..a87b79574a91 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -2345,15 +2345,14 @@ void ip6_update_pmtu(struct sk_buff *skb, struct net 
*net, __be32 mtu,
 {
const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
struct dst_entry *dst;
-   struct flowi6 fl6;
-
-   memset(&fl6, 0, sizeof(fl6));
-   fl6.flowi6_oif = oif;
-   fl6.flowi6_mark = mark ? mark : IP6_REPLY_MARK(net, skb->mark);
-   fl6.daddr = iph->daddr;
-   fl6.saddr = iph->saddr;
-   fl6.flowlabel = ip6_flowinfo(iph);
-   fl6.flowi6_uid = uid;
+   struct flowi6 fl6 = {
+   .flowi6_oif = oif,
+   .flowi6_mark = mark ? mark : IP6_REPLY_MARK(net, skb->mark),
+   .daddr = iph->daddr,
+   .saddr = iph->saddr,
+   .flowlabel = ip6_flowinfo(iph),
+   .flowi6_uid = uid,
+   };
 
dst = ip6_route_output(net, NULL, &fl6);
if (!dst->error)
-- 
2.19.0.605.g01d371f741-goog



[PATCH 09/11] net: rtmsg_to_fib6_config() - use new style struct initializer instead of memset

2018-09-27 Thread Maciej Żenczykowski
From: Maciej Żenczykowski 

(allows for better compiler optimization)

Signed-off-by: Maciej Żenczykowski 
---
 net/ipv6/route.c | 32 
 1 file changed, 16 insertions(+), 16 deletions(-)

diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index a87b79574a91..b8fece1d6021 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -3600,23 +3600,23 @@ static void rtmsg_to_fib6_config(struct net *net,
 struct in6_rtmsg *rtmsg,
 struct fib6_config *cfg)
 {
-   memset(cfg, 0, sizeof(*cfg));
+   *cfg = (struct fib6_config){
+   .fc_table = l3mdev_fib_table_by_index(net, 
rtmsg->rtmsg_ifindex) ?
+: RT6_TABLE_MAIN,
+   .fc_ifindex = rtmsg->rtmsg_ifindex,
+   .fc_metric = rtmsg->rtmsg_metric,
+   .fc_expires = rtmsg->rtmsg_info,
+   .fc_dst_len = rtmsg->rtmsg_dst_len,
+   .fc_src_len = rtmsg->rtmsg_src_len,
+   .fc_flags = rtmsg->rtmsg_flags,
+   .fc_type = rtmsg->rtmsg_type,
+
+   .fc_nlinfo.nl_net = net,
 
-   cfg->fc_table = l3mdev_fib_table_by_index(net, rtmsg->rtmsg_ifindex) ?
-: RT6_TABLE_MAIN;
-   cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
-   cfg->fc_metric = rtmsg->rtmsg_metric;
-   cfg->fc_expires = rtmsg->rtmsg_info;
-   cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
-   cfg->fc_src_len = rtmsg->rtmsg_src_len;
-   cfg->fc_flags = rtmsg->rtmsg_flags;
-   cfg->fc_type = rtmsg->rtmsg_type;
-
-   cfg->fc_nlinfo.nl_net = net;
-
-   cfg->fc_dst = rtmsg->rtmsg_dst;
-   cfg->fc_src = rtmsg->rtmsg_src;
-   cfg->fc_gateway = rtmsg->rtmsg_gateway;
+   .fc_dst = rtmsg->rtmsg_dst,
+   .fc_src = rtmsg->rtmsg_src,
+   .fc_gateway = rtmsg->rtmsg_gateway,
+   };
 }
 
 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
-- 
2.19.0.605.g01d371f741-goog



[PATCH 10/11] net: rtm_to_fib6_config() - use new style struct initializer instead of memset

2018-09-27 Thread Maciej Żenczykowski
From: Maciej Żenczykowski 

(allows for better compiler optimization)

Signed-off-by: Maciej Żenczykowski 
---
 net/ipv6/route.c | 23 ---
 1 file changed, 12 insertions(+), 11 deletions(-)

diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index b8fece1d6021..9aca81772c93 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -4139,14 +4139,19 @@ static int rtm_to_fib6_config(struct sk_buff *skb, 
struct nlmsghdr *nlh,
 
err = -EINVAL;
rtm = nlmsg_data(nlh);
-   memset(cfg, 0, sizeof(*cfg));
 
-   cfg->fc_table = rtm->rtm_table;
-   cfg->fc_dst_len = rtm->rtm_dst_len;
-   cfg->fc_src_len = rtm->rtm_src_len;
-   cfg->fc_flags = RTF_UP;
-   cfg->fc_protocol = rtm->rtm_protocol;
-   cfg->fc_type = rtm->rtm_type;
+   *cfg = (struct fib6_config){
+   .fc_table = rtm->rtm_table,
+   .fc_dst_len = rtm->rtm_dst_len,
+   .fc_src_len = rtm->rtm_src_len,
+   .fc_flags = RTF_UP,
+   .fc_protocol = rtm->rtm_protocol,
+   .fc_type = rtm->rtm_type,
+
+   .fc_nlinfo.portid = NETLINK_CB(skb).portid,
+   .fc_nlinfo.nlh = nlh,
+   .fc_nlinfo.nl_net = sock_net(skb->sk),
+   };
 
if (rtm->rtm_type == RTN_UNREACHABLE ||
rtm->rtm_type == RTN_BLACKHOLE ||
@@ -4162,10 +4167,6 @@ static int rtm_to_fib6_config(struct sk_buff *skb, 
struct nlmsghdr *nlh,
 
cfg->fc_flags |= (rtm->rtm_flags & RTNH_F_ONLINK);
 
-   cfg->fc_nlinfo.portid = NETLINK_CB(skb).portid;
-   cfg->fc_nlinfo.nlh = nlh;
-   cfg->fc_nlinfo.nl_net = sock_net(skb->sk);
-
if (tb[RTA_GATEWAY]) {
cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]);
cfg->fc_flags |= RTF_GATEWAY;
-- 
2.19.0.605.g01d371f741-goog



[PATCH 04/11] net: ip6_multipath_l3_keys() - use new style struct initializer instead of memset

2018-09-27 Thread Maciej Żenczykowski
From: Maciej Żenczykowski 

Signed-off-by: Maciej Żenczykowski 
---
 net/ipv6/route.c | 6 +-
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index d28f83e01593..9cb024451fc5 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -1981,12 +1981,11 @@ static void ip6_multipath_l3_keys(const struct sk_buff 
*skb,
 u32 rt6_multipath_hash(const struct net *net, const struct flowi6 *fl6,
   const struct sk_buff *skb, struct flow_keys *flkeys)
 {
-   struct flow_keys hash_keys;
+   struct flow_keys hash_keys = {};
u32 mhash;
 
switch (ip6_multipath_hash_policy(net)) {
case 0:
-   memset(&hash_keys, 0, sizeof(hash_keys));
hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
if (skb) {
ip6_multipath_l3_keys(skb, &hash_keys, flkeys);
@@ -2006,8 +2005,6 @@ u32 rt6_multipath_hash(const struct net *net, const 
struct flowi6 *fl6,
if (skb->l4_hash)
return skb_get_hash_raw(skb) >> 1;
 
-   memset(&hash_keys, 0, sizeof(hash_keys));
-
 if (!flkeys) {
skb_flow_dissect_flow_keys(skb, &keys, flag);
flkeys = &keys;
@@ -2019,7 +2016,6 @@ u32 rt6_multipath_hash(const struct net *net, const 
struct flowi6 *fl6,
hash_keys.ports.dst = flkeys->ports.dst;
hash_keys.basic.ip_proto = flkeys->basic.ip_proto;
} else {
-   memset(&hash_keys, 0, sizeof(hash_keys));
hash_keys.control.addr_type = 
FLOW_DISSECTOR_KEY_IPV6_ADDRS;
hash_keys.addrs.v6addrs.src = fl6->saddr;
hash_keys.addrs.v6addrs.dst = fl6->daddr;
-- 
2.19.0.605.g01d371f741-goog



[PATCH 03/11] net: fib_multipath_hash() - use new style struct initializer instead of memset

2018-09-27 Thread Maciej Żenczykowski
From: Maciej Żenczykowski 

Signed-off-by: Maciej Żenczykowski 
---
 net/ipv4/route.c | 6 +-
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 048919713f4e..17953a52fbd0 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1821,12 +1821,11 @@ static void ip_multipath_l3_keys(const struct sk_buff 
*skb,
 int fib_multipath_hash(const struct net *net, const struct flowi4 *fl4,
   const struct sk_buff *skb, struct flow_keys *flkeys)
 {
-   struct flow_keys hash_keys;
+   struct flow_keys hash_keys = {};
u32 mhash;
 
switch (net->ipv4.sysctl_fib_multipath_hash_policy) {
case 0:
-   memset(&hash_keys, 0, sizeof(hash_keys));
hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
if (skb) {
ip_multipath_l3_keys(skb, &hash_keys);
@@ -1845,8 +1844,6 @@ int fib_multipath_hash(const struct net *net, const 
struct flowi4 *fl4,
if (skb->l4_hash)
return skb_get_hash_raw(skb) >> 1;
 
-   memset(&hash_keys, 0, sizeof(hash_keys));
-
if (!flkeys) {
skb_flow_dissect_flow_keys(skb, &keys, flag);
flkeys = &keys;
@@ -1859,7 +1856,6 @@ int fib_multipath_hash(const struct net *net, const 
struct flowi4 *fl4,
hash_keys.ports.dst = flkeys->ports.dst;
hash_keys.basic.ip_proto = flkeys->basic.ip_proto;
} else {
-   memset(&hash_keys, 0, sizeof(hash_keys));
hash_keys.control.addr_type = 
FLOW_DISSECTOR_KEY_IPV4_ADDRS;
hash_keys.addrs.v4addrs.src = fl4->saddr;
hash_keys.addrs.v4addrs.dst = fl4->daddr;
-- 
2.19.0.605.g01d371f741-goog



[PATCH 01/11] net: ip_rt_get_source() - use new style struct initializer instead of memset

2018-09-27 Thread Maciej Żenczykowski
From: Maciej Żenczykowski 

(allows for better compiler optimization)

Signed-off-by: Maciej Żenczykowski 
---
 net/ipv4/route.c | 21 +
 1 file changed, 9 insertions(+), 12 deletions(-)

diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index dce2ed66ebe1..02482b71498b 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1217,18 +1217,15 @@ void ip_rt_get_source(u8 *addr, struct sk_buff *skb, 
struct rtable *rt)
src = ip_hdr(skb)->saddr;
else {
struct fib_result res;
-   struct flowi4 fl4;
-   struct iphdr *iph;
-
-   iph = ip_hdr(skb);
-
-   memset(&fl4, 0, sizeof(fl4));
-   fl4.daddr = iph->daddr;
-   fl4.saddr = iph->saddr;
-   fl4.flowi4_tos = RT_TOS(iph->tos);
-   fl4.flowi4_oif = rt->dst.dev->ifindex;
-   fl4.flowi4_iif = skb->dev->ifindex;
-   fl4.flowi4_mark = skb->mark;
+   struct iphdr *iph = ip_hdr(skb);
+   struct flowi4 fl4 = {
+   .daddr = iph->daddr,
+   .saddr = iph->saddr,
+   .flowi4_tos = RT_TOS(iph->tos),
+   .flowi4_oif = rt->dst.dev->ifindex,
+   .flowi4_iif = skb->dev->ifindex,
+   .flowi4_mark = skb->mark,
+   };
 
rcu_read_lock();
if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res, 0) == 0)
-- 
2.19.0.605.g01d371f741-goog



[PATCH 02/11] net: inet_rtm_getroute() - use new style struct initializer instead of memset

2018-09-27 Thread Maciej Żenczykowski
From: Maciej Żenczykowski 

Signed-off-by: Maciej Żenczykowski 
---
 net/ipv4/route.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 02482b71498b..048919713f4e 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -2780,7 +2780,7 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, 
struct nlmsghdr *nlh,
struct rtable *rt = NULL;
struct sk_buff *skb;
struct rtmsg *rtm;
-   struct flowi4 fl4;
+   struct flowi4 fl4 = {};
__be32 dst = 0;
__be32 src = 0;
kuid_t uid;
@@ -2820,7 +2820,6 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, 
struct nlmsghdr *nlh,
if (!skb)
return -ENOBUFS;
 
-   memset(&fl4, 0, sizeof(fl4));
fl4.daddr = dst;
fl4.saddr = src;
fl4.flowi4_tos = rtm->rtm_tos;
-- 
2.19.0.605.g01d371f741-goog



Re: [PATCH] netfilter: check if the socket netns is correct.

2018-09-27 Thread Flavio Leitner
On Thu, Sep 27, 2018 at 01:46:29PM -0700, Guenter Roeck wrote:
> Hi Flavio,
> 
> On Wed, Jun 27, 2018 at 10:34:25AM -0300, Flavio Leitner wrote:
> > Netfilter assumes that if the socket is present in the skb, then
> > it can be used because that reference is cleaned up while the skb
> > is crossing netns.
> > 
> > We want to change that to preserve the socket reference in a future
> > patch, so this is a preparation updating netfilter to check if the
> > socket netns matches before use it.
> > 
> > Signed-off-by: Flavio Leitner 
> > Acked-by: Florian Westphal 
> > Signed-off-by: David S. Miller 
> > ---
> ...
> > --- a/net/netfilter/xt_socket.c
> > +++ b/net/netfilter/xt_socket.c
> > @@ -56,8 +56,12 @@ socket_match(const struct sk_buff *skb, struct 
> > xt_action_param *par,
> > struct sk_buff *pskb = (struct sk_buff *)skb;
> > struct sock *sk = skb->sk;
> >  
> > +   if (!net_eq(xt_net(par), sock_net(sk)))
> > +   sk = NULL;
> > +
> 
> I am having trouble with this code. With CONFIG_NET_NS enabled, it crashes
> for me in read_pnet() because sk is NULL.
> 
> > if (!sk)
> > sk = nf_sk_lookup_slow_v4(xt_net(par), skb, xt_in(par));
> 
> The old code seems to suggest that sk == NULL was possible.
> 
> I see the problem with the Chrome OS kernel rebased to v4.19-rc5, so I
> can not guarantee that this really an upstream problem. The change seems
> odd, though. Are you sure that it is not (or, rather, no longer) necessary
> to check if sk == NULL before dereferencing it in sock_net() ?

Oops, it is necessary, but if sk is not NULL and the netns doesn't match, we
need to do the lookup. So, could you check if this fixes the problem for you?

From a5f927e7f1368d753f87cb978d630d786d5adb62 Mon Sep 17 00:00:00 2001
From: Flavio Leitner 
Date: Thu, 27 Sep 2018 19:36:28 -0300
Subject: [PATCH] xt_socket: check sk before checking for netns.

Only check for the network namespace if the socket is available.

Fixes: f564650106a6 ("netfilter: check if the socket netns is correct.")
Reported-by: Guenter Roeck 
Signed-off-by: Flavio Leitner 
---
 net/netfilter/xt_socket.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/net/netfilter/xt_socket.c b/net/netfilter/xt_socket.c
index 0472f3472842..ada144e5645b 100644
--- a/net/netfilter/xt_socket.c
+++ b/net/netfilter/xt_socket.c
@@ -56,7 +56,7 @@ socket_match(const struct sk_buff *skb, struct 
xt_action_param *par,
struct sk_buff *pskb = (struct sk_buff *)skb;
struct sock *sk = skb->sk;
 
-   if (!net_eq(xt_net(par), sock_net(sk)))
+   if (sk && !net_eq(xt_net(par), sock_net(sk)))
sk = NULL;
 
if (!sk)
@@ -117,7 +117,7 @@ socket_mt6_v1_v2_v3(const struct sk_buff *skb, struct 
xt_action_param *par)
struct sk_buff *pskb = (struct sk_buff *)skb;
struct sock *sk = skb->sk;
 
-   if (!net_eq(xt_net(par), sock_net(sk)))
+   if (sk && !net_eq(xt_net(par), sock_net(sk)))
sk = NULL;
 
if (!sk)
-- 
2.14.4



[PATCH net-next 4/5] net: systemport: Be drop monitor friendly while re-allocating headroom

2018-09-27 Thread Florian Fainelli
During bcm_sysport_insert_tsb() make sure we differentiate a SKB
headroom re-allocation failure from the normal swap and replace path.

Signed-off-by: Florian Fainelli 
---
 drivers/net/ethernet/broadcom/bcmsysport.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/broadcom/bcmsysport.c 
b/drivers/net/ethernet/broadcom/bcmsysport.c
index 977d9dec2fb0..6c40cf6090ab 100644
--- a/drivers/net/ethernet/broadcom/bcmsysport.c
+++ b/drivers/net/ethernet/broadcom/bcmsysport.c
@@ -1230,12 +1230,13 @@ static struct sk_buff *bcm_sysport_insert_tsb(struct 
sk_buff *skb,
/* Re-allocate SKB if needed */
if (unlikely(skb_headroom(skb) < sizeof(*tsb))) {
nskb = skb_realloc_headroom(skb, sizeof(*tsb));
-   dev_kfree_skb(skb);
if (!nskb) {
+   dev_kfree_skb_any(skb);
dev->stats.tx_errors++;
dev->stats.tx_dropped++;
return NULL;
}
+   dev_consume_skb_any(skb);
skb = nskb;
}
 
-- 
2.17.1



[PATCH net-next 2/5] net: systemport: Utilize bcm_sysport_set_features() during resume/open

2018-09-27 Thread Florian Fainelli
During driver resume and open, the HW may have lost its context/state,
utilize bcm_sysport_set_features() to make sure we do restore the
correct set of features that were previously configured.

Signed-off-by: Florian Fainelli 
---
 drivers/net/ethernet/broadcom/bcmsysport.c | 14 +++---
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/drivers/net/ethernet/broadcom/bcmsysport.c 
b/drivers/net/ethernet/broadcom/bcmsysport.c
index 654a07b849c4..3b4cb906a275 100644
--- a/drivers/net/ethernet/broadcom/bcmsysport.c
+++ b/drivers/net/ethernet/broadcom/bcmsysport.c
@@ -1972,6 +1972,11 @@ static int bcm_sysport_open(struct net_device *dev)
else
gib_set_pad_extension(priv);
 
+   /* Apply features again in case we changed them while interface was
+* down
+*/
+   bcm_sysport_set_features(dev, dev->features);
+
/* Set MAC address */
umac_set_hw_addr(priv, dev->dev_addr);
 
@@ -2708,7 +2713,6 @@ static int __maybe_unused bcm_sysport_resume(struct 
device *d)
struct net_device *dev = dev_get_drvdata(d);
struct bcm_sysport_priv *priv = netdev_priv(dev);
unsigned int i;
-   u32 reg;
int ret;
 
if (!netif_running(dev))
@@ -2752,12 +2756,8 @@ static int __maybe_unused bcm_sysport_resume(struct 
device *d)
goto out_free_rx_ring;
}
 
-   /* Enable rxhck */
-   if (priv->rx_chk_en) {
-   reg = rxchk_readl(priv, RXCHK_CONTROL);
-   reg |= RXCHK_EN;
-   rxchk_writel(priv, reg, RXCHK_CONTROL);
-   }
+   /* Restore enabled features */
+   bcm_sysport_set_features(dev, dev->features);
 
rbuf_init(priv);
 
-- 
2.17.1



[PATCH net-next 1/5] net: systemport: Refactor bcm_sysport_set_features()

2018-09-27 Thread Florian Fainelli
In preparation for unconditionally enabling TX and RX checksum offloads,
refactor bcm_sysport_set_features() a bit such that
__netdev_update_features() during register_netdev() can make sure that
features are correctly programmed during network device registration.

Since we can now be called during register_netdev() with clocks gated,
we need to temporarily turn them on/off in order to have a successful
register programming.

We also move the CRC forward setting read into
bcm_sysport_set_features() since priv->crc_fwd matters while turning on
RX checksum offload, that way we are guaranteed they are in sync in case
we ever add support for NETIF_F_RXFCS at some point in the future.

Signed-off-by: Florian Fainelli 
---
 drivers/net/ethernet/broadcom/bcmsysport.c | 38 +-
 1 file changed, 15 insertions(+), 23 deletions(-)

diff --git a/drivers/net/ethernet/broadcom/bcmsysport.c 
b/drivers/net/ethernet/broadcom/bcmsysport.c
index 147045757b10..654a07b849c4 100644
--- a/drivers/net/ethernet/broadcom/bcmsysport.c
+++ b/drivers/net/ethernet/broadcom/bcmsysport.c
@@ -126,8 +126,8 @@ static inline void tdma_port_write_desc_addr(struct 
bcm_sysport_priv *priv,
 }
 
 /* Ethtool operations */
-static int bcm_sysport_set_rx_csum(struct net_device *dev,
-  netdev_features_t wanted)
+static void bcm_sysport_set_rx_csum(struct net_device *dev,
+   netdev_features_t wanted)
 {
struct bcm_sysport_priv *priv = netdev_priv(dev);
u32 reg;
@@ -157,12 +157,10 @@ static int bcm_sysport_set_rx_csum(struct net_device *dev,
reg &= ~RXCHK_BRCM_TAG_EN;
 
rxchk_writel(priv, reg, RXCHK_CONTROL);
-
-   return 0;
 }
 
-static int bcm_sysport_set_tx_csum(struct net_device *dev,
-  netdev_features_t wanted)
+static void bcm_sysport_set_tx_csum(struct net_device *dev,
+   netdev_features_t wanted)
 {
struct bcm_sysport_priv *priv = netdev_priv(dev);
u32 reg;
@@ -177,23 +175,24 @@ static int bcm_sysport_set_tx_csum(struct net_device *dev,
else
reg &= ~tdma_control_bit(priv, TSB_EN);
tdma_writel(priv, reg, TDMA_CONTROL);
-
-   return 0;
 }
 
 static int bcm_sysport_set_features(struct net_device *dev,
netdev_features_t features)
 {
-   netdev_features_t changed = features ^ dev->features;
-   netdev_features_t wanted = dev->wanted_features;
-   int ret = 0;
+   struct bcm_sysport_priv *priv = netdev_priv(dev);
+
+   /* Read CRC forward */
+   if (!priv->is_lite)
+   priv->crc_fwd = !!(umac_readl(priv, UMAC_CMD) & CMD_CRC_FWD);
+   else
+   priv->crc_fwd = !((gib_readl(priv, GIB_CONTROL) &
+ GIB_FCS_STRIP) >> GIB_FCS_STRIP_SHIFT);
 
-   if (changed & NETIF_F_RXCSUM)
-   ret = bcm_sysport_set_rx_csum(dev, wanted);
-   if (changed & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM))
-   ret = bcm_sysport_set_tx_csum(dev, wanted);
+   bcm_sysport_set_rx_csum(dev, features);
+   bcm_sysport_set_tx_csum(dev, features);
 
-   return ret;
+   return 0;
 }
 
 /* Hardware counters must be kept in sync because the order/offset
@@ -1976,13 +1975,6 @@ static int bcm_sysport_open(struct net_device *dev)
/* Set MAC address */
umac_set_hw_addr(priv, dev->dev_addr);
 
-   /* Read CRC forward */
-   if (!priv->is_lite)
-   priv->crc_fwd = !!(umac_readl(priv, UMAC_CMD) & CMD_CRC_FWD);
-   else
-   priv->crc_fwd = !((gib_readl(priv, GIB_CONTROL) &
- GIB_FCS_STRIP) >> GIB_FCS_STRIP_SHIFT);
-
phydev = of_phy_connect(dev, priv->phy_dn, bcm_sysport_adj_link,
0, priv->phy_interface);
if (!phydev) {
-- 
2.17.1



[PATCH net-next 5/5] net: systemport: Add software counters to track reallocations

2018-09-27 Thread Florian Fainelli
When inserting the TSB, keep track of how many times we had to do it and
if there was a failure in doing so, this helps profile the driver for
possibly incorrect headroom settings.

Signed-off-by: Florian Fainelli 
---
 drivers/net/ethernet/broadcom/bcmsysport.c | 5 +
 drivers/net/ethernet/broadcom/bcmsysport.h | 2 ++
 2 files changed, 7 insertions(+)

diff --git a/drivers/net/ethernet/broadcom/bcmsysport.c 
b/drivers/net/ethernet/broadcom/bcmsysport.c
index 6c40cf6090ab..faba55fd656a 100644
--- a/drivers/net/ethernet/broadcom/bcmsysport.c
+++ b/drivers/net/ethernet/broadcom/bcmsysport.c
@@ -284,6 +284,8 @@ static const struct bcm_sysport_stats 
bcm_sysport_gstrings_stats[] = {
STAT_MIB_SOFT("alloc_rx_buff_failed", mib.alloc_rx_buff_failed),
STAT_MIB_SOFT("rx_dma_failed", mib.rx_dma_failed),
STAT_MIB_SOFT("tx_dma_failed", mib.tx_dma_failed),
+   STAT_MIB_SOFT("tx_realloc_tsb", mib.tx_realloc_tsb),
+   STAT_MIB_SOFT("tx_realloc_tsb_failed", mib.tx_realloc_tsb_failed),
/* Per TX-queue statistics are dynamically appended */
 };
 
@@ -1220,6 +1222,7 @@ static void bcm_sysport_poll_controller(struct net_device 
*dev)
 static struct sk_buff *bcm_sysport_insert_tsb(struct sk_buff *skb,
  struct net_device *dev)
 {
+   struct bcm_sysport_priv *priv = netdev_priv(dev);
struct sk_buff *nskb;
struct bcm_tsb *tsb;
u32 csum_info;
@@ -1232,12 +1235,14 @@ static struct sk_buff *bcm_sysport_insert_tsb(struct 
sk_buff *skb,
nskb = skb_realloc_headroom(skb, sizeof(*tsb));
if (!nskb) {
dev_kfree_skb_any(skb);
+   priv->mib.tx_realloc_tsb_failed++;
dev->stats.tx_errors++;
dev->stats.tx_dropped++;
return NULL;
}
dev_consume_skb_any(skb);
skb = nskb;
+   priv->mib.tx_realloc_tsb++;
}
 
tsb = skb_push(skb, sizeof(*tsb));
diff --git a/drivers/net/ethernet/broadcom/bcmsysport.h 
b/drivers/net/ethernet/broadcom/bcmsysport.h
index 046c6c1d97fd..a7a230884a87 100644
--- a/drivers/net/ethernet/broadcom/bcmsysport.h
+++ b/drivers/net/ethernet/broadcom/bcmsysport.h
@@ -607,6 +607,8 @@ struct bcm_sysport_mib {
u32 alloc_rx_buff_failed;
u32 rx_dma_failed;
u32 tx_dma_failed;
+   u32 tx_realloc_tsb;
+   u32 tx_realloc_tsb_failed;
 };
 
 /* HW maintains a large list of counters */
-- 
2.17.1



[PATCH net-next 3/5] net: systemport: Turn on offloads by default

2018-09-27 Thread Florian Fainelli
We can turn on the RX/TX checksum offloads by default and make sure that
those are properly reflected back to e.g: stacked devices such as VLAN
or DSA.

Signed-off-by: Florian Fainelli 
---
 drivers/net/ethernet/broadcom/bcmsysport.c | 7 ---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/drivers/net/ethernet/broadcom/bcmsysport.c 
b/drivers/net/ethernet/broadcom/bcmsysport.c
index 3b4cb906a275..977d9dec2fb0 100644
--- a/drivers/net/ethernet/broadcom/bcmsysport.c
+++ b/drivers/net/ethernet/broadcom/bcmsysport.c
@@ -2508,9 +2508,10 @@ static int bcm_sysport_probe(struct platform_device 
*pdev)
dev->netdev_ops = &bcm_sysport_netdev_ops;
netif_napi_add(dev, &priv->napi, bcm_sysport_poll, 64);
 
-   /* HW supported features, none enabled by default */
-   dev->hw_features |= NETIF_F_RXCSUM | NETIF_F_HIGHDMA |
-   NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM;
+   dev->features |= NETIF_F_RXCSUM | NETIF_F_HIGHDMA |
+NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM;
+   dev->hw_features |= dev->features;
+   dev->vlan_features |= dev->features;
 
/* Request the WOL interrupt and advertise suspend if available */
priv->wol_irq_disabled = 1;
-- 
2.17.1



[PATCH net-next 0/5] net: systemport: Turn on offloads by default

2018-09-27 Thread Florian Fainelli
Hi David,

Up until now, we had added all the code necessary to turn on RX/TX
checksum offloads at runtime, but there is no reason why they have to be
disabled by default given that this gives a slight performance
improvement.

Florian Fainelli (5):
  net: systemport: Refactor bcm_sysport_set_features()
  net: systemport: Utilize bcm_sysport_set_features() during resume/open
  net: systemport: Turn on offloads by default
  net: systemport: Be drop monitor friendly while re-allocating headroom
  net: systemport: Add software counters to track reallocations

 drivers/net/ethernet/broadcom/bcmsysport.c | 67 +++---
 drivers/net/ethernet/broadcom/bcmsysport.h |  2 +
 2 files changed, 35 insertions(+), 34 deletions(-)

-- 
2.17.1



Re: [PATCH net-next v6 23/23] net: WireGuard secure network tunnel

2018-09-27 Thread Jason A. Donenfeld
Hi Andrew,

Thanks for following up with this.

On Thu, Sep 27, 2018 at 3:15 AM Andrew Lunn  wrote:
> I know you have been concentrating on the crypto code, so i'm not
> expecting too many changes at the moment in the network code.

I should be addressing things in parallel, actually, so I'm happy to
work on this.

> WARNING: Avoid crashing the kernel - try using WARN_ON & recovery code rather 
> than BUG() or BUG_ON()
> #2984: FILE: drivers/net/wireguard/noise.c:293:
> +   BUG_ON(first_len > BLAKE2S_HASH_SIZE || second_len > 
> BLAKE2S_HASH_SIZE ||

I was actually going to ask you about this, because it applies
similarly in another context too that I'm trying to refine. The above
function you quote has the following properties:

- Only ever accepts fixed length parameters, so the compiler can
constant fold invocations of it fantastically. Those parameters are
fixed length in the sense that they're enum/macro constants. They
never come from the user or from a packet or something.
- Never produces an incorrect result. For said constants, all inputs
are valid, and so it succeeds in producing an output every time.
- Is a "pure" function, just knocking bytes around, without needing to
interact with fancy kernel-y things; could be implemented on some sort
of WWII-era rotor machine provided you had the patience.

Because of the above, there's never any error to return to the user of
the function. Also, because it only ever takes constant sized inputs,
in theory I should be able to change that BUG_ON() to BUILD_BUG_ON(),
but in practice the optimizer/inliner isn't actually that aggressive.

But what I would like is some way of signaling to the developer using
this function that they've passed it an illegal value, and their code
should not ship until that's fixed, under any circumstances at all  --
that their usage of the function is completely unacceptable and wrong.
Bla bla strong statements.

For this, I figured the notion would come across with the aberrant
behavior of "crash the developer's [in this case, my] QEMU instance"
when "#ifdef DEBUG is true". This is the same kind of place where I'd
have an "assert()" statement in userland. It sounds like what you're
saying is that a WARN_ON is equally as effective instead? Or given the
above, do you think the BUG_ON is actually sensible? Or neither and I
should do something else?

> WARNING: Macros with flow control statements should be avoided
> #5471: FILE: drivers/net/wireguard/selftest/allowedips.h:456:
> +#define init_peer(name) do {   \
> +   name = kzalloc(sizeof(*name), GFP_KERNEL); \
> +   if (unlikely(!name)) { \
> +   pr_info("allowedips self-test: out of memory\n");  \
> +   goto free; \
> +   }  \
> +   kref_init(&name->refcount);\
> +   } while (0)

This is part of a debugging selftest, where I'm initing a bunch of
peers one after another, and this macro helps keep the test clear
while offloading the actual irrelevant coding part to this macro. The
test itself then just has code like:

init_peer(a);
init_peer(b);
init_peer(c);
init_peer(d);
init_peer(e);
init_peer(f);
init_peer(g);
init_peer(h);

insert(4, a, 192, 168, 4, 0, 24);
insert(4, b, 192, 168, 4, 4, 32);
insert(4, c, 192, 168, 0, 0, 16);
insert(4, d, 192, 95, 5, 64, 27);
/* replaces previous entry, and maskself is required */
insert(4, c, 192, 95, 5, 65, 27);
insert(6, d, 0x26075300, 0x60006b00, 0, 0xc05f0543, 128);
insert(6, c, 0x26075300, 0x60006b00, 0, 0, 64);
...

And so forth. I can probably figure out a different way to code this
if you really want, but I thought this would be clear.

> The namespace pollution also needs to be addresses. You have some
> pretty generic named global symbols. I picked out a few examples from
> objdump
>
> 2a94 g F .text  0060 peer_put
> 3484 g F .text  004c timers_stop
> 3520 g F .text  0114 packet_queue_init
> 2640 g F .text  0034 device_uninit
> 26bc g F .text  0288 peer_create
> 90d4 g F .text  01bc ratelimiter_init
>
> Please make use of a prefix for global symbols, e.g. wg_.

Will do. v7 will include the wg_ prefix.

On a slightly related note, out of curiosity, any idea what's up with
the future of LTO in the kernel? It sounds like that'd be nice to have
on a module-by-module basis. IOW, I'd love to LTO all of my .c files
in wireguard together, and then only ever expose mod_init/exit and
whatever I explicitly EXPORT_SYMBOL, and then have the compiler and
linker treat the rest of everything as essentially in one .c file and
optimize the heck out of it, and then strip all the s

Re: KMSAN: uninit-value in __dev_mc_add

2018-09-27 Thread Eric Dumazet
On Thu, Sep 27, 2018 at 2:30 PM Vladis Dronov  wrote:
>
> Hello,
>
> This report is actually for the same bug which was reported in:
>
> https://syzkaller.appspot.com/bug?id=088efeac32fdde781038a777a63e436c0d4d7036
>
> The note there that the bug was fixed by "Commits: net: fix uninit-value in
> __hw_addr_add_ex()" is wrong. A C-reproducer from the 2nd syzkaller report
> can trigger the bug from this one.
>
> I've researched this and a result is a proposed patch, the problem is the tun
> device code allowing to set an arbitrary link type.
>
> https://lkml.org/lkml/2018/9/26/416
> https://lore.kernel.org/lkml/20180926093018.6646-1-vdro...@redhat.com/T/#u
> https://marc.info/?l=linux-netdev&m=153795423320016&w=2
>

I dunno, your patch does not quite look like the right fix.

If TUN is able to change dev->type, how come it does not set the
appropriate dev->addr_len at the same time?

Really the bug seems to be deeper, and without setting proper
dev->addr_len, we'll need more 'fixes' like yours.

Thanks.


Re: [Patch net-next] net_sched: fix an extack message in tcf_block_find()

2018-09-27 Thread Eric Dumazet



On 09/27/2018 02:36 PM, Cong Wang wrote:

> I don't understand what you mean by changing ip command, you must
> mean tc command, but still, I have no idea about how restarting failed
> syscall could be related to my patch and why we need to restart anything
> here. If the refcnt goes to 0, it will never come back, retrying won't help
> anything.
>

Yep, tc command it is.

I was not commenting on your patch in particular (replacing one English
message with another does not seem like a very big deal), but rather on the
fact that the code right there seems to be prepared for parallel changes.

But using RCU lookups in control path will lead to occasional failures
that most user space tools would not expect.

Let's assume two tasks are launching "tc qdisc replace dev eth0 root XXX" in
whatever order/parallelism.

Both should succeed, after/before major RTNL->other_locking_mechanism

Control paths are usually using a mutex or a spinlock so that they never hit a 
0-refcount at all.


[PATCH bpf-next] bpf: permit CGROUP_DEVICE programs accessing helper bpf_get_current_cgroup_id()

2018-09-27 Thread Yonghong Song
Currently, helper bpf_get_current_cgroup_id() is not permitted
for CGROUP_DEVICE type of programs. If the helper is used
in such cases, the verifier will log the following error:

  0: (bf) r6 = r1
  1: (69) r7 = *(u16 *)(r6 +0)
  2: (85) call bpf_get_current_cgroup_id#80
  unknown func bpf_get_current_cgroup_id#80

The bpf_get_current_cgroup_id() is useful for CGROUP_DEVICE
type of programs in order to customize action based on cgroup id.
This patch added such a support.

Cc: Roman Gushchin 
Signed-off-by: Yonghong Song 
---
 kernel/bpf/cgroup.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index 065c3d9ff8eb..00f6ed2e4f9a 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -707,6 +707,8 @@ cgroup_dev_func_proto(enum bpf_func_id func_id, const 
struct bpf_prog *prog)
return &bpf_get_current_uid_gid_proto;
case BPF_FUNC_get_local_storage:
return &bpf_get_local_storage_proto;
+   case BPF_FUNC_get_current_cgroup_id:
+   return &bpf_get_current_cgroup_id_proto;
case BPF_FUNC_trace_printk:
if (capable(CAP_SYS_ADMIN))
return bpf_get_trace_printk_proto();
-- 
2.17.1



Re: [Patch net-next] net_sched: fix an extack message in tcf_block_find()

2018-09-27 Thread Cong Wang
On Thu, Sep 27, 2018 at 2:16 PM Eric Dumazet  wrote:
>
>
>
> On 09/27/2018 01:42 PM, Cong Wang wrote:
> > It is clearly a copy-n-paste.
> >
> > Signed-off-by: Cong Wang 
> > ---
> >  net/sched/cls_api.c | 2 +-
> >  1 file changed, 1 insertion(+), 1 deletion(-)
> >
> > diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
> > index 3de47e99b788..8dd7f8af6d54 100644
> > --- a/net/sched/cls_api.c
> > +++ b/net/sched/cls_api.c
> > @@ -655,7 +655,7 @@ static struct tcf_block *tcf_block_find(struct net 
> > *net, struct Qdisc **q,
> >
> >   *q = qdisc_refcount_inc_nz(*q);
> >   if (!*q) {
> > - NL_SET_ERR_MSG(extack, "Parent Qdisc doesn't exists");
> > + NL_SET_ERR_MSG(extack, "Can't increase Qdisc 
> > refcount");
>
>
> I am not sure it was a copy-n-paste.


Note that there is already an identical extack message elsewhere
(with the same English grammar error).


>
> Qdisc refcount business is kernel internal.

Yeah, but the extack message is already there, this patch doesn't add
any new extack. Or you are suggesting we should remove it?



> If we can not increase the refcount, this is precisely because this qdisc is 
> about
> to be destroyed. Nothing fundamentally different than having this thread 
> delayed a bit
> and qdisc_lookup_rcu() returning NULL in the first place.


qdisc_lookup_rcu() is not always called, it could be dev->qdisc.
I am pretty sure parent exists in dev->qdisc.


>
> This also means that using RCU for control path is problematic, as surely the 
> caller
> of this interface would prefer something that succeeds, even if this means
> waiting a bit in the kernel.

I fail to see how this statement holds. Why would the caller prefer success
when the refcnt has reached 0?


>
> Or are we willing to change ip command and make it restart failed syscalls ?
>

I don't understand what you mean by changing ip command, you must
mean tc command, but still, I have no idea about how restarting failed
syscall could be related to my patch and why we need to restart anything
here. If the refcnt goes to 0, it will never come back, retrying won't help
anything.

BTW:

If you have any other question beyond my patch's scope, isn't it better
that we start a new thread for discussion?

In case you still misunderstand, my patch never intended to address any
problem other than correcting an inaccurate extack message.


Re: [PATCH net-next v6 00/23] WireGuard: Secure Network Tunnel

2018-09-27 Thread Jason A. Donenfeld
Hi Eric,

On Thu, Sep 27, 2018 at 8:29 PM Eric Biggers  wrote:
> Why is Herbert Xu's existing crypto tree being circumvented, especially for
> future patches (the initial merge isn't quite as important as that's a 
> one-time
> event)?  I like being able to check out cryptodev to test upcoming crypto
> patches.  And currently, changes to APIs, algorithms, tests, and 
> implementations
> all go through cryptodev, which is convenient for crypto developers.
>
> Apparently, you're proposing that someone adding a new algorithm will now have
> to submit the API portion to one maintainer (Herbert Xu) and the 
> implementation
> portion to another maintainer (you), and they'll go through separate git 
> trees.
> That's inconvenient for developers, and it seems that in practice you and
> Herbert will be stepping on each other's toes a lot.
>
> Can you please reach some kind of sane agreement with Herbert so that the
> development process isn't fractured into two?  Perhaps you could review 
> patches,
> but Herbert could still apply them?

I think you're overthinking it a bit. Zinc will have a few software
implementations of primitives that are useful in cases where it's nice to call
the primitive directly. Think: various usages of sha2, siphash, the wireguard
suite (what this patchset includes), other things in lib/, etc. Insofar as
this winds up duplicating things within the crypto API, I'll work with Herbert
to build one on top of the other -- as I've done in the two commits in this
series. But beyond that, think of the two initiatives as orthogonal. I'm
working on curating a few primitives that are maximally useful throughout
the kernel for various uses, and doing so in a way that I think brings
about a certain quality. Meanwhile the crypto API is amassing a huge
collection of primitives for some things, and that will continue to exist,
and Herbert will continue to maintain that. I expect for the crossover
to be fairly isolated and manageable, without too much foreseeable tree-
conflicts and such. Therefore, Samuel Neves and I plan to maintain the
codebase we've spent quite some time writing, and maintain our own tree for
it, which we'll be submitting through Greg. In other words, this is not
a matter of "circumvention" or "stepping on toes", but rather separate
efforts. I'm quite certain that, to the extent they overlap, we'll be able to
work it out fairly easily.

Either way, I'll take your suggestion and reach out to Herbert, since at
least a discussion between the two of us sounds like it could be productive.

> I'm also wondering about the criteria for making additions and changes to
> "Zinc".  You mentioned before that one of the "advantages" of Zinc is that it
> doesn't include "cipher modes from 90s cryptographers" -- what does that mean
> exactly?  You've also indicated before that you don't want people modifying 
> the
> Poly1305 implementations as they are too error-prone.  Useful contributions
> could be blocked or discouraged in the future. Can you please elaborate on
> your criteria for contributions to Zinc?
>
> Also, will you allow algorithms that aren't up to modern security standards 
> but
> are needed for compatibility reasons, e.g. MD5, SHA-1, and DES?  There are
> existing standards, APIs, and data formats that use these "legacy" algorithms;
> so implementations of them are often still needed, whether we like it or not.
>
> And does it matter who designed the algorithms, e.g. do algorithms from Daniel
> Bernstein get effectively a free pass, while algorithms from certain 
> countries,
> governments, or organizations are not allowed?  E.g. wireless driver 
> developers
> may need the SM4 block cipher (which is now supported by the crypto API) as 
> it's
> specified in a Chinese wireless standard.  Will you allow SM4 in Zinc?  Or 
> will
> people have to submit some algorithms to Herbert and some to you due to
> disagreements about what algorithms should be included?

Similarly here, I think you're over-politicizing everything. Stable address
generation for IPv6 uses SHA1 -- see net/ipv6/addrconf.c:3203 -- do you think
that this should use, say, the SM3 chinese hash function instead? No, of
course not, for a variety of interesting reasons. Rather, it should use some
simple hash function that's fast in software that we have available in Zinc.
On the other hand, it seems like parts of the kernel that have pretty high-
levels of cipher agility -- such as dmcrypt, ipsec, wifi apparently, and
so on -- will continue to use dynamic-dispatch system like the crypto API,
since that's what it was made to do and is effective at doing. And so, your
example of SM4 seems to fit perfectly into what the crypto API is well-suited
for, and it would fit naturally in there.

In other words, the "political criteria" for what we add to lib/zinc/ will
mostly be the same as for the rest of lib/: are there things using it that
benefit from it being there in a direct and obvious way, and does the
implementation meet certain q

Re: KMSAN: uninit-value in __dev_mc_add

2018-09-27 Thread Vladis Dronov
Hello,

This report is actually for the same bug which was reported in:

https://syzkaller.appspot.com/bug?id=088efeac32fdde781038a777a63e436c0d4d7036

The note there that the bug was fixed by "Commits: net: fix uninit-value in
__hw_addr_add_ex()" is wrong. A C-reproducer from the 2nd syzkaller report
can trigger the bug from this one.

I've researched this and the result is a proposed patch; the problem is that
the tun device code allows setting an arbitrary link type.

https://lkml.org/lkml/2018/9/26/416
https://lore.kernel.org/lkml/20180926093018.6646-1-vdro...@redhat.com/T/#u
https://marc.info/?l=linux-netdev&m=153795423320016&w=2

A simplified reproducer is attached.

Best regards,
Vladis Dronov
#define _GNU_SOURCE
#include 
#include 
#include 
#include 
#include 
#include 
#include 
#include 
#include 
#include 
#include 
#include 
#include 
#include 
#include 
#include 
#include 
#include 
#include 
#include 
#include 

#include 
#include 
#include 
#include 
#include 

/*
 * Simplified reproducer.
 *
 * Steps, in order:
 *   1. create a socket (family 0x11, type 0x10802),
 *   2. open /dev/net/tun,
 *   3. TUNSETIFF on the tun fd for an interface named "igb0",
 *   4. TUNSETLINK on the tun fd (link type 0x30a, or argv[1] if given),
 *   5. SIOCSIFFLAGS on the socket for "igb0",
 * then print the resulting link details via "ip -details link show igb0".
 *
 * Returns 0 on success, or the number (1..5) of the step that failed.
 */
int main(int argc, char **argv)
{
  int ret, sockfd, tunfd;

  /*
   * Map scratch memory at a fixed address; the ioctl argument blocks below
   * are built inside it through hard-coded pointers.
   * NOTE(review): the mapping result is not checked, and the address/length
   * look unusually small for a generated reproducer -- confirm them against
   * the original attachment.
   */
  syscall(__NR_mmap, 0x2000, 0x100, 3, 0x32, -1, 0);

  // socket(AF_PACKET, SOCK_DGRAM|SOCK_NONBLOCK, 0)
  sockfd = syscall(__NR_socket, 0x11, 0x10802, 0);
  if (sockfd < 0) {
    perror("socket()");
    ret = 1;
    goto exit_end;
  }

  memcpy((void*)0x2240, "/dev/net/tun", 13);
  tunfd = open((char *)0x2240, 0);
  if (tunfd < 0) {
    perror("open()");
    ret = 2;
    goto exit_sock_close;
  }

  /* Request block: 16-byte interface name "igb0" followed by a 16-bit flags word. */
  memcpy((void*)0x20c0,
         "\x69\x67\x62\x30\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00", 16);
  *(uint16_t*)0x20d0 = 0x4012;
  // TUNSETIFF _IOW('T', 202, int)
  ret = syscall(__NR_ioctl, tunfd, 0x400454ca, 0x20c0);
  if (ret < 0) {
    perror("ioctl(TUNSETIFF)");
    ret = 3;
    goto exit_tun_close;
  }

  // TUNSETLINK _IOW('T', 205, int) / 0x30a = 778 = ARPHRD_IPGRE
  if (argc < 2)
    ret = syscall(__NR_ioctl, tunfd, 0x400454cd, 0x30a);
  else
    ret = syscall(__NR_ioctl, tunfd, 0x400454cd, atoi(argv[1]));
  if (ret < 0) {
    perror("ioctl(TUNSETLINK)");
    ret = 4;
    goto exit_tun_close;
  }

  /* Same layout as above: interface name "igb0", then the flags to set. */
  memcpy((void*)0x2040,
         "\x69\x67\x62\x30\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00", 16);
  *(uint16_t*)0x2050 = 0xa201;
  // SIOCSIFFLAGS 0x8914
  ret = syscall(__NR_ioctl, sockfd, 0x8914, 0x2040);
  if (ret < 0) {
    perror("ioctl(SIOCSIFFLAGS)");
    ret = 5;
    goto exit_tun_close;
  }

  /* Every step succeeded: report success regardless of the last ioctl's value. */
  ret = 0;

  printf("done:\n");
  system("/usr/sbin/ip -details link show igb0");

exit_tun_close:
  close(tunfd);
exit_sock_close:
  close(sockfd);
exit_end:
  munmap((void *)0x2000, 0x100);
  /* Propagate the failing step number instead of always reporting success. */
  return ret;
}

Re: [Patch net-next] net_sched: fix an extack message in tcf_block_find()

2018-09-27 Thread Eric Dumazet



On 09/27/2018 01:42 PM, Cong Wang wrote:
> It is clearly a copy-n-paste.
> 
> Signed-off-by: Cong Wang 
> ---
>  net/sched/cls_api.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
> index 3de47e99b788..8dd7f8af6d54 100644
> --- a/net/sched/cls_api.c
> +++ b/net/sched/cls_api.c
> @@ -655,7 +655,7 @@ static struct tcf_block *tcf_block_find(struct net *net, 
> struct Qdisc **q,
>  
>   *q = qdisc_refcount_inc_nz(*q);
>   if (!*q) {
> - NL_SET_ERR_MSG(extack, "Parent Qdisc doesn't exists");
> + NL_SET_ERR_MSG(extack, "Can't increase Qdisc refcount");


I am not sure it was a copy-n-paste.

Qdisc refcount business is kernel internal.
If we can not increase the refcount, this is precisely because this qdisc is 
about
to be destroyed. Nothing fundamentally different than having this thread 
delayed a bit
and qdisc_lookup_rcu() returning NULL in the first place.

This also means that using RCU for control path is problematic, as surely the 
caller
of this interface would prefer something that succeeds, even if this means
waiting a bit in the kernel.

Or are we willing to change ip command and make it restart failed syscalls ?



Re: [PATCH v3 bpf-next 00/10] bpf: per-cpu cgroup local storage

2018-09-27 Thread Daniel Borkmann
On 09/26/2018 01:33 PM, Roman Gushchin wrote:
> This patchset implements per-cpu cgroup local storage and provides
> an example how per-cpu and shared cgroup local storage can be used
> for efficient accounting of network traffic.
> 
> v3->v2:
>   1) incorporated Song's feedback
>   2) rebased on top of current bpf-next
> 
> v2->v1:
>   1) added a selftest implementing network counters
>   2) added a missing free() in cgroup local storage selftest
> 
> Roman Gushchin (10):
>   bpf: extend cgroup bpf core to allow multiple cgroup storage types
>   bpf: rework cgroup storage pointer passing
>   bpf: introduce per-cpu cgroup local storage
>   bpf: don't allow create maps of per-cpu cgroup local storages
>   bpf: sync include/uapi/linux/bpf.h to tools/include/uapi/linux/bpf.h
>   bpftool: add support for PERCPU_CGROUP_STORAGE maps
>   selftests/bpf: add verifier per-cpu cgroup storage tests
>   selftests/bpf: extend the storage test to test per-cpu cgroup storage
>   samples/bpf: extend test_cgrp2_attach2 test to use per-cpu cgroup
> storage
>   selftests/bpf: cgroup local storage-based network counters
> 
>  include/linux/bpf-cgroup.h|  55 --
>  include/linux/bpf.h   |  12 +-
>  include/linux/bpf_types.h |   1 +
>  include/uapi/linux/bpf.h  |   1 +
>  kernel/bpf/cgroup.c   |  74 +---
>  kernel/bpf/helpers.c  |  25 ++-
>  kernel/bpf/local_storage.c| 167 +++---
>  kernel/bpf/map_in_map.c   |   3 +-
>  kernel/bpf/syscall.c  |  20 ++-
>  kernel/bpf/verifier.c |  23 ++-
>  net/bpf/test_run.c|  20 ++-
>  samples/bpf/test_cgrp2_attach2.c  |  19 +-
>  tools/bpf/bpftool/map.c   |   4 +-
>  tools/include/uapi/linux/bpf.h|   1 +
>  tools/testing/selftests/bpf/Makefile  |   6 +-
>  tools/testing/selftests/bpf/netcnt_common.h   |  23 +++
>  tools/testing/selftests/bpf/netcnt_prog.c |  71 
>  .../selftests/bpf/test_cgroup_storage.c   |  60 ++-
>  tools/testing/selftests/bpf/test_netcnt.c | 153 
>  tools/testing/selftests/bpf/test_verifier.c   | 139 ++-
>  20 files changed, 778 insertions(+), 99 deletions(-)
>  create mode 100644 tools/testing/selftests/bpf/netcnt_common.h
>  create mode 100644 tools/testing/selftests/bpf/netcnt_prog.c
>  create mode 100644 tools/testing/selftests/bpf/test_netcnt.c
> 

Applied to bpf-next, thanks Roman!


[PATCH net] net/ncsi: Extend NC-SI Netlink interface to allow user space to send NC-SI command

2018-09-27 Thread Justin.Lee1
The new command (NCSI_CMD_SEND_CMD) is added to allow user space application 
to send NC-SI command to the network card.
Also, add a new attribute (NCSI_ATTR_DATA) for transferring request and 
response.

The work flow is as below. 

Request:
User space application -> Netlink interface (msg)
                       -> new Netlink handler - ncsi_send_cmd_nl()
                       -> ncsi_xmit_cmd()
Response:
Response received - ncsi_rcv_rsp() -> internal response handler - ncsi_rsp_handler_xxx()
                                   -> ncsi_rsp_handler_netlink()
                                   -> ncsi_send_netlink_rsp()
                                   -> Netlink interface (msg)
                                   -> user space application
Command timeout - ncsi_request_timeout() -> ncsi_send_netlink_timeout()
                                         -> Netlink interface (msg with zero data length)
                                         -> user space application
Error:
Error detected -> ncsi_send_netlink_err() -> Netlink interface (err msg)
                                          -> user space application


Signed-off-by: Justin Lee 


---
 include/uapi/linux/ncsi.h |   3 +
 net/ncsi/internal.h   |  12 ++-
 net/ncsi/ncsi-aen.c   |  10 ++-
 net/ncsi/ncsi-cmd.c   | 106 
 net/ncsi/ncsi-manage.c|  74 ++---
 net/ncsi/ncsi-netlink.c   | 199 +-
 net/ncsi/ncsi-netlink.h   |   4 +
 net/ncsi/ncsi-rsp.c   |  70 ++--
 8 files changed, 420 insertions(+), 58 deletions(-)

diff --git a/include/uapi/linux/ncsi.h b/include/uapi/linux/ncsi.h
index 4c292ec..4992bfc 100644
--- a/include/uapi/linux/ncsi.h
+++ b/include/uapi/linux/ncsi.h
@@ -30,6 +30,7 @@ enum ncsi_nl_commands {
NCSI_CMD_PKG_INFO,
NCSI_CMD_SET_INTERFACE,
NCSI_CMD_CLEAR_INTERFACE,
+   NCSI_CMD_SEND_CMD,
 
__NCSI_CMD_AFTER_LAST,
NCSI_CMD_MAX = __NCSI_CMD_AFTER_LAST - 1
@@ -43,6 +44,7 @@ enum ncsi_nl_commands {
  * @NCSI_ATTR_PACKAGE_LIST: nested array of NCSI_PKG_ATTR attributes
  * @NCSI_ATTR_PACKAGE_ID: package ID
  * @NCSI_ATTR_CHANNEL_ID: channel ID
+ * @NCSI_ATTR_DATA: command payload
  * @NCSI_ATTR_MAX: highest attribute number
  */
 enum ncsi_nl_attrs {
@@ -51,6 +53,7 @@ enum ncsi_nl_attrs {
NCSI_ATTR_PACKAGE_LIST,
NCSI_ATTR_PACKAGE_ID,
NCSI_ATTR_CHANNEL_ID,
+   NCSI_ATTR_DATA,
 
__NCSI_ATTR_AFTER_LAST,
NCSI_ATTR_MAX = __NCSI_ATTR_AFTER_LAST - 1
diff --git a/net/ncsi/internal.h b/net/ncsi/internal.h
index 8055e39..20ce735 100644
--- a/net/ncsi/internal.h
+++ b/net/ncsi/internal.h
@@ -215,12 +215,17 @@ struct ncsi_request {
unsigned charid;  /* Request ID - 0 to 255   */
bool used;/* Request that has been assigned  */
unsigned int flags;   /* NCSI request property   */
-#define NCSI_REQ_FLAG_EVENT_DRIVEN 1
+#define NCSI_REQ_FLAG_EVENT_DRIVEN 1
+#define NCSI_REQ_FLAG_NETLINK_DRIVEN   2
struct ncsi_dev_priv *ndp;/* Associated NCSI device  */
struct sk_buff   *cmd;/* Associated NCSI command packet  */
struct sk_buff   *rsp;/* Associated NCSI response packet */
struct timer_listtimer;   /* Timer on waiting for response   */
bool enabled; /* Time has been enabled or not*/
+
+   u32  snd_seq; /* netlink sending sequence number */
+   u32  snd_portid;  /* netlink portid of sender*/
+   struct nlmsghdr  nlhdr;   /* netlink message header  */
 };
 
 enum {
@@ -301,10 +306,13 @@ struct ncsi_cmd_arg {
unsigned short   payload; /* Command packet payload length */
unsigned int req_flags;   /* NCSI request properties   */
union {
-   unsigned char  bytes[16]; /* Command packet specific data  */
+   unsigned char  bytes[16]; /* Command packet specific data  
*/
unsigned short words[8];
unsigned int   dwords[4];
};
+
+   unsigned char*data;   /* Netlink data  */
+   struct genl_info *info;   /* Netlink information   */
 };
 
 extern struct list_head ncsi_dev_list;
diff --git a/net/ncsi/ncsi-aen.c b/net/ncsi/ncsi-aen.c
index 25e483e..b5ec193 100644
--- a/net/ncsi/ncsi-aen.c
+++ b/net/ncsi/ncsi-aen.c
@@ -16,6 +16,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include "internal.h"
 #include "ncsi-pkt.h"
@@ 

Re: [PATCH v3] PCI: Reprogram bridge prefetch registers on resume

2018-09-27 Thread Bjorn Helgaas
[+cc LKML]

On Tue, Sep 18, 2018 at 04:32:44PM -0500, Bjorn Helgaas wrote:
> On Thu, Sep 13, 2018 at 11:37:45AM +0800, Daniel Drake wrote:
> > On 38+ Intel-based Asus products, the nvidia GPU becomes unusable
> > after S3 suspend/resume. The affected products include multiple
> > generations of nvidia GPUs and Intel SoCs. After resume, nouveau logs
> > many errors such as:
> > 
> > fifo: fault 00 [READ] at 00555000 engine 00 [GR] client 04
> >   [HUB/FE] reason 4a [] on channel -1 [007fa91000 unknown]
> > DRM: failed to idle channel 0 [DRM]
> > 
> > Similarly, the nvidia proprietary driver also fails after resume
> > (black screen, 100% CPU usage in Xorg process). We shipped a sample
> > to Nvidia for diagnosis, and their response indicated that it's a
> > problem with the parent PCI bridge (on the Intel SoC), not the GPU.
> > 
> > Runtime suspend/resume works fine, only S3 suspend is affected.
> > 
> > We found a workaround: on resume, rewrite the Intel PCI bridge
> > 'Prefetchable Base Upper 32 Bits' register (PCI_PREF_BASE_UPPER32). In
> > the cases that I checked, this register has value 0 and we just have to
> > rewrite that value.
> > 
> > Linux already saves and restores PCI config space during suspend/resume,
> > but this register was being skipped because upon resume, it already
> > has value 0 (the correct, pre-suspend value).
> > 
> > Intel appear to have previously acknowledged this behaviour and the
> > requirement to rewrite this register.
> > https://bugzilla.kernel.org/show_bug.cgi?id=116851#c23
> > 
> > Based on that, rewrite the prefetch register values even when that
> > appears unnecessary.
> > 
> > We have confirmed this solution on all the affected models we have
> > in-hands (X542UQ, UX533FD, X530UN, V272UN).
> > 
> > Additionally, this solves an issue where r8169 MSI-X interrupts were
> > broken after S3 suspend/resume on Asus X441UAR. This issue was recently
> > worked around in commit 7bb05b85bc2d ("r8169: don't use MSI-X on
> > RTL8106e"). It also fixes the same issue on RTL6186evl/8111evl on an
> > Aimfor-tech laptop that we had not yet patched. I suspect it will also
> > fix the issue that was worked around in commit 7c53a722459c ("r8169:
> > don't use MSI-X on RTL8168g").
> > 
> > Thomas Martitz reports that this change also solves an issue where
> > the AMD Radeon Polaris 10 GPU on the HP Zbook 14u G5 is unresponsive
> > after S3 suspend/resume.
> > 
> > Link: https://bugzilla.kernel.org/show_bug.cgi?id=201069
> > Signed-off-by: Daniel Drake 
> 
> Applied with Rafael's and Peter's reviewed-by to pci/enumeration for v4.20.
> Thanks for the huge investigative effort!

Since this looks low-risk and fixes several painful issues, I think
this merits a stable tag and being included in v4.19 (instead of
waiting for v4.20).  

I moved it to for-linus for v4.19.  Let me know if you object.

> > ---
> >  drivers/pci/pci.c | 25 +
> >  1 file changed, 17 insertions(+), 8 deletions(-)
> > 
> > diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
> > index 29ff9619b5fa..5d58220b6997 100644
> > --- a/drivers/pci/pci.c
> > +++ b/drivers/pci/pci.c
> > @@ -1289,12 +1289,12 @@ int pci_save_state(struct pci_dev *dev)
> >  EXPORT_SYMBOL(pci_save_state);
> >  
> >  static void pci_restore_config_dword(struct pci_dev *pdev, int offset,
> > -u32 saved_val, int retry)
> > +u32 saved_val, int retry, bool force)
> >  {
> > u32 val;
> >  
> > pci_read_config_dword(pdev, offset, &val);
> > -   if (val == saved_val)
> > +   if (!force && val == saved_val)
> > return;
> >  
> > for (;;) {
> > @@ -1313,25 +1313,34 @@ static void pci_restore_config_dword(struct pci_dev 
> > *pdev, int offset,
> >  }
> >  
> >  static void pci_restore_config_space_range(struct pci_dev *pdev,
> > -  int start, int end, int retry)
> > +  int start, int end, int retry,
> > +  bool force)
> >  {
> > int index;
> >  
> > for (index = end; index >= start; index--)
> > pci_restore_config_dword(pdev, 4 * index,
> >  pdev->saved_config_space[index],
> > -retry);
> > +retry, force);
> >  }
> >  
> >  static void pci_restore_config_space(struct pci_dev *pdev)
> >  {
> > if (pdev->hdr_type == PCI_HEADER_TYPE_NORMAL) {
> > -   pci_restore_config_space_range(pdev, 10, 15, 0);
> > +   pci_restore_config_space_range(pdev, 10, 15, 0, false);
> > /* Restore BARs before the command register. */
> > -   pci_restore_config_space_range(pdev, 4, 9, 10);
> > -   pci_restore_config_space_range(pdev, 0, 3, 0);
> > +   pci_restore_config_space_range(pdev, 4, 9, 10, false);
> > +   pci_restore_config_space_range(pdev, 0, 3,

Re: [PATCH] netfilter: check if the socket netns is correct.

2018-09-27 Thread Guenter Roeck
Hi Flavio,

On Wed, Jun 27, 2018 at 10:34:25AM -0300, Flavio Leitner wrote:
> Netfilter assumes that if the socket is present in the skb, then
> it can be used because that reference is cleaned up while the skb
> is crossing netns.
> 
> We want to change that to preserve the socket reference in a future
> patch, so this is a preparation updating netfilter to check if the
> socket netns matches before using it.
> 
> Signed-off-by: Flavio Leitner 
> Acked-by: Florian Westphal 
> Signed-off-by: David S. Miller 
> ---
...
> --- a/net/netfilter/xt_socket.c
> +++ b/net/netfilter/xt_socket.c
> @@ -56,8 +56,12 @@ socket_match(const struct sk_buff *skb, struct 
> xt_action_param *par,
>   struct sk_buff *pskb = (struct sk_buff *)skb;
>   struct sock *sk = skb->sk;
>  
> + if (!net_eq(xt_net(par), sock_net(sk)))
> + sk = NULL;
> +

I am having trouble with this code. With CONFIG_NET_NS enabled, it crashes
for me in read_pnet() because sk is NULL.

>   if (!sk)
>   sk = nf_sk_lookup_slow_v4(xt_net(par), skb, xt_in(par));

The old code seems to suggest that sk == NULL was possible.

I see the problem with the Chrome OS kernel rebased to v4.19-rc5, so I
cannot guarantee that this is really an upstream problem. The change seems
odd, though. Are you sure that it is not (or, rather, no longer) necessary
to check if sk == NULL before dereferencing it in sock_net() ?

> +
>   if (sk) {
>   bool wildcard;
>   bool transparent = true;
> @@ -113,8 +117,12 @@ socket_mt6_v1_v2_v3(const struct sk_buff *skb, struct 
> xt_action_param *par)
>   struct sk_buff *pskb = (struct sk_buff *)skb;
>   struct sock *sk = skb->sk;
>  
> + if (!net_eq(xt_net(par), sock_net(sk)))
> + sk = NULL;
> +
Same here.

>   if (!sk)
>   sk = nf_sk_lookup_slow_v6(xt_net(par), skb, xt_in(par));
> +
>   if (sk) {
>   bool wildcard;
>   bool transparent = true;

Thanks,
Guenter


[Patch net-next] net_sched: fix a crash in tc_new_tfilter()

2018-09-27 Thread Cong Wang
When tcf_block_find() fails, it already rolls back the qdisc refcnt,
so its caller doesn't need to clean this up again. Avoid calling
qdisc_put() again by resetting qdisc to NULL for callers.

Reported-by: syzbot+37b8770e6d5a8220a...@syzkaller.appspotmail.com
Fixes: e368fdb61d8e ("net: sched: use Qdisc rcu API instead of relying on rtnl 
lock")
Signed-off-by: Cong Wang 
---
 net/sched/cls_api.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
index 8dd7f8af6d54..a4167ec0a220 100644
--- a/net/sched/cls_api.c
+++ b/net/sched/cls_api.c
@@ -717,8 +717,10 @@ static struct tcf_block *tcf_block_find(struct net *net, 
struct Qdisc **q,
 errout_rcu:
rcu_read_unlock();
 errout_qdisc:
-   if (*q)
+   if (*q) {
qdisc_put(*q);
+   *q = NULL;
+   }
return ERR_PTR(err);
 }
 
-- 
2.14.4



[Patch net-next] net_sched: fix an extack message in tcf_block_find()

2018-09-27 Thread Cong Wang
It is clearly a copy-n-paste.

Signed-off-by: Cong Wang 
---
 net/sched/cls_api.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
index 3de47e99b788..8dd7f8af6d54 100644
--- a/net/sched/cls_api.c
+++ b/net/sched/cls_api.c
@@ -655,7 +655,7 @@ static struct tcf_block *tcf_block_find(struct net *net, 
struct Qdisc **q,
 
*q = qdisc_refcount_inc_nz(*q);
if (!*q) {
-   NL_SET_ERR_MSG(extack, "Parent Qdisc doesn't exists");
+   NL_SET_ERR_MSG(extack, "Can't increase Qdisc refcount");
err = -EINVAL;
goto errout_rcu;
}
-- 
2.14.4



Re: [PATCH v2 07/22] soc/fsl/bman_portals: defer probe after bman's probe

2018-09-27 Thread Li Yang
On Wed, Sep 26, 2018 at 8:26 AM  wrote:
>
> From: Laurentiu Tudor 
>
> A crash in bman portal probing could not be triggered (as is the case
> with qman portals) but it does make calls [1] into the bman driver so
> let's make sure the bman portal probing happens after bman's.
>
> [1]  bman_p_irqsource_add() (in bman) called by:
>init_pcfg() called by:
>  bman_portal_probe()
>
> Signed-off-by: Laurentiu Tudor 

As this is part of a bug fix for v4.19, applied on soc/fsl for fix.

> ---
>  drivers/soc/fsl/qbman/bman_portal.c | 10 +-
>  1 file changed, 9 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/soc/fsl/qbman/bman_portal.c 
> b/drivers/soc/fsl/qbman/bman_portal.c
> index 2f71f7df3465..f9edd28894fd 100644
> --- a/drivers/soc/fsl/qbman/bman_portal.c
> +++ b/drivers/soc/fsl/qbman/bman_portal.c
> @@ -91,7 +91,15 @@ static int bman_portal_probe(struct platform_device *pdev)
> struct device_node *node = dev->of_node;
> struct bm_portal_config *pcfg;
> struct resource *addr_phys[2];
> -   int irq, cpu;
> +   int irq, cpu, err;
> +
> +   err = bman_is_probed();
> +   if (!err)
> +   return -EPROBE_DEFER;
> +   if (err < 0) {
> +   dev_err(&pdev->dev, "failing probe due to bman probe 
> error\n");
> +   return -ENODEV;
> +   }
>
> pcfg = devm_kmalloc(dev, sizeof(*pcfg), GFP_KERNEL);
> if (!pcfg)
> --
> 2.17.1
>


Re: [PATCH v2 06/22] soc/fsl/qman_portals: defer probe after qman's probe

2018-09-27 Thread Li Yang
On Wed, Sep 26, 2018 at 8:26 AM  wrote:
>
> From: Laurentiu Tudor 
>
> Defer probe of qman portals after qman probing. This fixes the crash
> below, seen on NXP LS1043A SoCs:
>
> Unable to handle kernel NULL pointer dereference at virtual address
> 0004
> Mem abort info:
>   ESR = 0x9604
>   Exception class = DABT (current EL), IL = 32 bits
>   SET = 0, FnV = 0
>   EA = 0, S1PTW = 0
> Data abort info:
>   ISV = 0, ISS = 0x0004
>   CM = 0, WnR = 0
> [0004] user address but active_mm is swapper
> Internal error: Oops: 9604 [#1] PREEMPT SMP
> Modules linked in:
> CPU: 0 PID: 1 Comm: swapper/0 Not tainted
> 4.18.0-rc1-next-20180622-00200-g986f5c179185 #9
> Hardware name: LS1043A RDB Board (DT)
> pstate: 8005 (Nzcv daif -PAN -UAO)
> pc : qman_set_sdest+0x74/0xa0
> lr : qman_portal_probe+0x22c/0x470
> sp : 0803bbc0
> x29: 0803bbc0 x28: 
> x27: 090c1b88 x26: 0927cb68
> x25: 0927c000 x24: 0927cb60
> x23:  x22: 
> x21: 090e9000 x20: 800073b5c810
> x19: 800027401298 x18: 
> x17: 0001 x16: 
> x15: 090e96c8 x14: 80002740138a
> x13: 090f2000 x12: 0030
> x11: 08f25000 x10: 
> x9 : 80007bdfd2c0 x8 : 4000
> x7 : 80007393cc18 x6 : 0041
> x5 :  x4 : 
> x3 : 0004 x2 : 0927c900
> x1 :  x0 : 0004
> Process swapper/0 (pid: 1, stack limit = 0x(ptrval))
> Call trace:
>  qman_set_sdest+0x74/0xa0
>  platform_drv_probe+0x50/0xa8
>  driver_probe_device+0x214/0x2f8
>  __driver_attach+0xd8/0xe0
>  bus_for_each_dev+0x68/0xc8
>  driver_attach+0x20/0x28
>  bus_add_driver+0x108/0x228
>  driver_register+0x60/0x110
>  __platform_driver_register+0x40/0x48
>  qman_portal_driver_init+0x20/0x84
>  do_one_initcall+0x58/0x168
>  kernel_init_freeable+0x184/0x22c
>  kernel_init+0x10/0x108
>  ret_from_fork+0x10/0x18
> Code: f9400443 11001000 927e4800 8b63 (b9400063)
> ---[ end trace 4f6d50489ecfb930 ]---
> Kernel panic - not syncing: Attempted to kill init! exitcode=0x000b
>
> Signed-off-by: Laurentiu Tudor 

As this is part of a bug fix for v4.19, applied on soc/fsl for fix.

> ---
>  drivers/soc/fsl/qbman/qman_portal.c | 8 
>  1 file changed, 8 insertions(+)
>
> diff --git a/drivers/soc/fsl/qbman/qman_portal.c 
> b/drivers/soc/fsl/qbman/qman_portal.c
> index 6d9da3b1b5ad..eef93cab84f1 100644
> --- a/drivers/soc/fsl/qbman/qman_portal.c
> +++ b/drivers/soc/fsl/qbman/qman_portal.c
> @@ -229,6 +229,14 @@ static int qman_portal_probe(struct platform_device 
> *pdev)
> int irq, cpu, err;
> u32 val;
>
> +   err = qman_is_probed();
> +   if (!err)
> +   return -EPROBE_DEFER;
> +   if (err < 0) {
> +   dev_err(&pdev->dev, "failing probe due to qman probe 
> error\n");
> +   return -ENODEV;
> +   }
> +
> pcfg = devm_kmalloc(dev, sizeof(*pcfg), GFP_KERNEL);
> if (!pcfg)
> return -ENOMEM;
> --
> 2.17.1
>


Re: [PATCH v2 05/22] soc/fsl/qbman: add APIs to retrieve the probing status

2018-09-27 Thread Li Yang
On Wed, Sep 26, 2018 at 8:26 AM  wrote:
>
> From: Laurentiu Tudor 
>
> Add a couple of new APIs to check the probing status of qman and bman:
>  'int bman_is_probed()' and 'int qman_is_probed()'.
> They return the following values.
>  *  1 if qman/bman were probed correctly
>  *  0 if qman/bman were not yet probed
>  * -1 if probing of qman/bman failed
> Drivers that use qman/bman driver services are required to use these
> APIs before calling any functions exported by qman or bman drivers
> or otherwise they will crash the kernel.
> The APIs will be used in the following couple of qbman portal patches
> and later in the series in the dpaa1 ethernet driver.
>
> Signed-off-by: Laurentiu Tudor 

As this is part of a bug fix for v4.19, applied on soc/fsl for fix.

> ---
>  drivers/soc/fsl/qbman/bman_ccsr.c | 11 +++
>  drivers/soc/fsl/qbman/qman_ccsr.c | 11 +++
>  include/soc/fsl/bman.h|  8 
>  include/soc/fsl/qman.h|  8 
>  4 files changed, 38 insertions(+)
>
> diff --git a/drivers/soc/fsl/qbman/bman_ccsr.c 
> b/drivers/soc/fsl/qbman/bman_ccsr.c
> index d180da003e4a..b209c79511bb 100644
> --- a/drivers/soc/fsl/qbman/bman_ccsr.c
> +++ b/drivers/soc/fsl/qbman/bman_ccsr.c
> @@ -121,6 +121,7 @@ static void bm_set_memory(u64 ba, u32 size)
>   */
>  static dma_addr_t fbpr_a;
>  static size_t fbpr_sz;
> +static int __bman_probed;
>
>  static int bman_fbpr(struct reserved_mem *rmem)
>  {
> @@ -167,6 +168,12 @@ static irqreturn_t bman_isr(int irq, void *ptr)
> return IRQ_HANDLED;
>  }
>
> +int bman_is_probed(void)
> +{
> +   return __bman_probed;
> +}
> +EXPORT_SYMBOL_GPL(bman_is_probed);
> +
>  static int fsl_bman_probe(struct platform_device *pdev)
>  {
> int ret, err_irq;
> @@ -177,6 +184,8 @@ static int fsl_bman_probe(struct platform_device *pdev)
> u16 id, bm_pool_cnt;
> u8 major, minor;
>
> +   __bman_probed = -1;
> +
> res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
> if (!res) {
> dev_err(dev, "Can't get %pOF property 'IORESOURCE_MEM'\n",
> @@ -266,6 +275,8 @@ static int fsl_bman_probe(struct platform_device *pdev)
> return ret;
> }
>
> +   __bman_probed = 1;
> +
> return 0;
>  };
>
> diff --git a/drivers/soc/fsl/qbman/qman_ccsr.c 
> b/drivers/soc/fsl/qbman/qman_ccsr.c
> index 0cfe79f85a66..383a49dcce68 100644
> --- a/drivers/soc/fsl/qbman/qman_ccsr.c
> +++ b/drivers/soc/fsl/qbman/qman_ccsr.c
> @@ -274,6 +274,7 @@ static const struct qman_error_info_mdata error_mdata[] = 
> {
>  static u32 __iomem *qm_ccsr_start;
>  /* A SDQCR mask comprising all the available/visible pool channels */
>  static u32 qm_pools_sdqcr;
> +static int __qman_probed;
>
>  static inline u32 qm_ccsr_in(u32 offset)
>  {
> @@ -689,6 +690,12 @@ static int qman_resource_init(struct device *dev)
> return 0;
>  }
>
> +int qman_is_probed(void)
> +{
> +   return __qman_probed;
> +}
> +EXPORT_SYMBOL_GPL(qman_is_probed);
> +
>  static int fsl_qman_probe(struct platform_device *pdev)
>  {
> struct device *dev = &pdev->dev;
> @@ -699,6 +706,8 @@ static int fsl_qman_probe(struct platform_device *pdev)
> u16 id;
> u8 major, minor;
>
> +   __qman_probed = -1;
> +
> res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
> if (!res) {
> dev_err(dev, "Can't get %pOF property 'IORESOURCE_MEM'\n",
> @@ -845,6 +854,8 @@ static int fsl_qman_probe(struct platform_device *pdev)
> if (ret)
> return ret;
>
> +   __qman_probed = 1;
> +
> return 0;
>  }
>
> diff --git a/include/soc/fsl/bman.h b/include/soc/fsl/bman.h
> index eaaf56df4086..5b99cb2ea5ef 100644
> --- a/include/soc/fsl/bman.h
> +++ b/include/soc/fsl/bman.h
> @@ -126,4 +126,12 @@ int bman_release(struct bman_pool *pool, const struct 
> bm_buffer *bufs, u8 num);
>   */
>  int bman_acquire(struct bman_pool *pool, struct bm_buffer *bufs, u8 num);
>
> +/**
> + * bman_is_probed - Check if bman is probed
> + *
> + * Returns 1 if the bman driver successfully probed, -1 if the bman driver
> + * failed to probe or 0 if the bman driver did not probed yet.
> + */
> +int bman_is_probed(void);
> +
>  #endif /* __FSL_BMAN_H */
> diff --git a/include/soc/fsl/qman.h b/include/soc/fsl/qman.h
> index d4dfefdee6c1..597783b8a3a0 100644
> --- a/include/soc/fsl/qman.h
> +++ b/include/soc/fsl/qman.h
> @@ -1186,4 +1186,12 @@ int qman_alloc_cgrid_range(u32 *result, u32 count);
>   */
>  int qman_release_cgrid(u32 id);
>
> +/**
> + * qman_is_probed - Check if qman is probed
> + *
> + * Returns 1 if the qman driver successfully probed, -1 if the qman driver
> + * failed to probe or 0 if the qman driver did not probed yet.
> + */
> +int qman_is_probed(void);
> +
>  #endif /* __FSL_QMAN_H */
> --
> 2.17.1
>


Re: [PATCH net-next 0/7] rtnetlink: add RTM_GETADDR2

2018-09-27 Thread Christian Brauner
On September 27, 2018 10:24:36 PM GMT+02:00, David Ahern  
wrote:
>On 9/27/18 11:58 AM, Christian Brauner wrote:
>> Various userspace programs (e.g. iproute2) have sent RTM_GETADDR
>> requests with struct ifinfomsg. This is wrong and should have been
>> struct ifaddrmsg all along as mandated by the manpages. However, dump
>> requests so far didn't parse the netlink message that was sent and
>> succeeded even when a wrong struct was passed along.
>
>...
>
>> The correct solution at this point seems to me to introduce a new
>> RTM_GETADDR2 request. This way we can parse the message and fail hard
>if
>> the struct is not struct ifaddrmsg and can safely extend it in the
>> future. Userspace tools that rely on the buggy RTM_GETADDR API will
>> still keep working without even having to see any log messages and
>new
>userspace tools that want to make use of new features can make use
>of
>> the new RTM_GETADDR2 requests.
>
>First, I think this is the wrong precedent when all we need is a single
>bit flag that userspace can use to tell the kernel "I have a clue and I
>am passing in the proper header for this dump request".

That had been NAKed previously but if you have an idea that will be accepted 
all the more power to you.

>
>Second, you are not addressing the problems of the past by requiring
>the
>proper header and checking values passed in it.

I don't follow. RTM_GETADDR requests are absolutely unchanged. The full legacy 
behavior is restored by this patchset.

And requiring that RTM_GETADDR2 requests always pass the correct header is 
absolutely fine. We don't want to build invalid legacy behavior into a new
request type.

>
>I have another idea. I'll send an RFC patch soon.



Re: [PATCH net-next 0/7] rtnetlink: add RTM_GETADDR2

2018-09-27 Thread David Ahern
On 9/27/18 11:58 AM, Christian Brauner wrote:
> Various userspace programs (e.g. iproute2) have sent RTM_GETADDR
> requests with struct ifinfomsg. This is wrong and should have been
> struct ifaddrmsg all along as mandated by the manpages. However, dump
> requests so far didn't parse the netlink message that was sent and
> succeeded even when a wrong struct was passed along.

...

> The correct solution at this point seems to me to introduce a new
> RTM_GETADDR2 request. This way we can parse the message and fail hard if
> the struct is not struct ifaddrmsg and can safely extend it in the
> future. Userspace tools that rely on the buggy RTM_GETADDR API will
> still keep working without even having to see any log messages and new
> userspace tools that want to make use of new features can make use of
> the new RTM_GETADDR2 requests.

First, I think this is the wrong precedent when all we need is a single
bit flag that userspace can use to tell the kernel "I have a clue and I
am passing in the proper header for this dump request".

Second, you are not addressing the problems of the past by requiring the
proper header and checking values passed in it.

I have another idea. I'll send an RFC patch soon.


RE: bug: 'ethtool -m' reports spurious alarm & warning threshold values for QSFP28 transceivers

2018-09-27 Thread Chris Preimesberger
Update for posterity-

Mellanox support provided a work-around of using mlxcables instead of
ethtool to read alarm/warning info for an installed transceiver.

I was told that a couple of their engineers are currently looking into the
discrepancy between threshold reporting by mlxcables and ethtool, and
that they are deciding what to do about it...

Work-around steps:
1. add a cable with "sudo mst cable add".
2. find the cable name with "sudo mlxcables".  The name of my cable is
   01:00.0_cable_0 so I copy that name for insertion into the next command.
3. probe the cable for DDM with "sudo mlxcables -d 01:00.0_cable_0 --DDM".


Example copied/pasted from my CLI here.
All reported thresholds appear to be correct.

tech1@D7:~$ 
tech1@D7:~$ 
tech1@D7:~$ sudo mst cable add
-I- Added 1 cable devices ..
tech1@D7:~$ sudo mlxcables
Querying Cables 

Cable #1:
-
Cable name: 01:00.0_cable_0
>> No FW data to show
 Cable EEPROM 
Identifier: QSFP28 (11h)
Technology: 850 nm VCSEL (00h)
Compliance: Extended Specification Compliance is valid, 100GBASE-SR4 or 
25GBASE-SR
Wavelength: 850 nm
OUI   : 0x00c0f2
Vendor: TRANSITION  
Serial number : TN02000263  
Part number   : TN-QSFP-100G-SR4
Revision  : 02
Temperature   : 34 C
Length: 50 m

tech1@D7:~$ sudo mlxcables -d 01:00.0_cable_0 --DDM
Cable DDM:
--
Temperature: 34C
Voltage: 3.2918V
Channel 1:
RX Power : 0.1695dBm
TX Power : 0.8622dBm
TX Bias  : 7.0720mA
Channel 2:
RX Power : 0.1355dBm
TX Power : 1.1042dBm
TX Bias  : 6.9240mA
Channel 3:
RX Power : -0.1592dBm
TX Power : 0.6547dBm
TX Bias  : 6.9420mA
Channel 4:
RX Power : -0.1300dBm
TX Power : 0.4653dBm
TX Bias  : 6.9120mA
- Thresholds -
Temperature:
High Warning  : 70C
Low  Warning  : 0C
High Alarm: 75C
Low  Alarm: -5C
Warning mask  : 0
Alarm mask: 0
Voltage:
High Warning : 3.4600V
Low  Warning : 3.1300V
High Alarm   : 3.6300V
Low  Alarm   : 2.9700V
Warning mask : 0
Alarm mask   : 0
Channel 1:
RX Power high warn   : 2.4000dBm
RX Power low  warn   : -9.5001dBm
RX Power high alarm  : 5.4103dBm
RX Power low  alarm  : -12.5104dBm
RX Power Warning mask: 0
RX Power Alarm mask  : 0
TX Power high warn   : 2.4000dBm
TX Power low  warn   : -7.6020dBm
TX Power high alarm  : 3.1917dBm
TX Power low  alarm  : -8.5699dBm
TX Power Warning mask: 0
TX Power Alarm mask  : 0
TX Bias high warn: 12.mA
TX Bias low  warn: 2.mA
TX Bias high alarm   : 15.mA
TX Bias low  alarm   : 1.mA
TX Bias Warning mask : 0
TX Bias Alarm mask   : 0
Channel 2:
RX Power high warn   : 2.4000dBm
RX Power low  warn   : -9.5001dBm
RX Power high alarm  : 5.4103dBm
RX Power low  alarm  : -12.5104dBm
RX Power Warning mask: 0
RX Power Alarm mask  : 0
TX Power high warn   : 2.4000dBm
TX Power low  warn   : -7.6020dBm
TX Power high alarm  : 3.1917dBm
TX Power low  alarm  : -8.5699dBm
TX Power Warning mask: 0
TX Power Alarm mask  : 0
TX Bias high warn: 12.mA
TX Bias low  warn: 2.mA
TX Bias high alarm   : 15.mA
TX Bias low  alarm   : 1.mA
TX Bias Warning mask : 0
TX Bias Alarm mask   : 0
Channel 3:
RX Power high warn   : 2.4000dBm
RX Power low  warn   : -9.5001dBm
RX Power high alarm  : 5.4103dBm
RX Power low  alarm  : -12.5104dBm
RX Power Warning mask: 0
RX Power Alarm mask  : 0
TX Power high warn   : 2.4000dBm
TX Power low  warn   : -7.6020dBm
TX Power high alarm  : 3.1917dBm
TX Power low  alarm  : -8.5699dBm
TX Power Warning mask: 0
TX Power Alarm mask  : 0
TX Bias high warn: 12.mA
TX Bias low  warn: 2.mA
TX Bias high alarm   : 15.mA
TX Bias low  alarm   : 1.mA
TX Bias Warning mask : 0
TX Bias Alarm mask   : 0
Channel 4:
RX Power high warn   : 2.4000dBm
RX Power low  warn   : -9.5001dBm
RX Power high alarm  : 5.4103dBm
RX Power low  alarm  : -12.5104dBm
RX Power Warning mask: 0
RX Power Alarm mask  : 0
TX Power high warn   : 2.4000dBm
TX Power low  warn   : -7.6020dBm
TX Power high alarm  : 3.1917dBm
TX Power low  alarm  : -8.5699dBm
TX Power Warning mask: 0
TX Power Alarm mask  : 0
TX Bias high warn: 12.mA
TX Bias low  warn: 2.mA
TX Bias high alarm   : 15.mA
TX Bias low  alarm   : 1.mA
TX Bias Warning mask : 0

Re: [PATCH v2 08/22] soc/fsl/qbman_portals: add APIs to retrieve the probing status

2018-09-27 Thread Li Yang
On Wed, Sep 26, 2018 at 8:26 AM  wrote:
>
> From: Laurentiu Tudor 
>
> Add a couple of new APIs to check the probing status of the required
> cpu bound qman and bman portals:
>  'int bman_portals_probed()' and 'int qman_portals_probed()'.
> They return the following values.
>  *  1 if qman/bman portals were all probed correctly
>  *  0 if qman/bman portals were not yet probed
>  * -1 if probing of qman/bman portals failed
> Drivers that use qman/bman portal driver services are required to use
> these APIs before calling any functions exported by these drivers or
> otherwise they will crash the kernel.
> First user will be the dpaa1 ethernet driver, coming in a subsequent
> patch.
>
> Signed-off-by: Laurentiu Tudor 
> ---
>  drivers/soc/fsl/qbman/bman_portal.c | 10 ++
>  drivers/soc/fsl/qbman/qman_portal.c | 10 ++
>  include/soc/fsl/bman.h  |  8 
>  include/soc/fsl/qman.h  |  9 +
>  4 files changed, 37 insertions(+)
>
> diff --git a/drivers/soc/fsl/qbman/bman_portal.c 
> b/drivers/soc/fsl/qbman/bman_portal.c
> index f9edd28894fd..8048d35de8a2 100644
> --- a/drivers/soc/fsl/qbman/bman_portal.c
> +++ b/drivers/soc/fsl/qbman/bman_portal.c
> @@ -32,6 +32,7 @@
>
>  static struct bman_portal *affine_bportals[NR_CPUS];
>  static struct cpumask portal_cpus;
> +static int __bman_portals_probed;
>  /* protect bman global registers and global data shared among portals */
>  static DEFINE_SPINLOCK(bman_lock);
>
> @@ -85,6 +86,12 @@ static int bman_online_cpu(unsigned int cpu)
> return 0;
>  }
>
> +int bman_portals_probed(void)
> +{
> +   return __bman_portals_probed;
> +}
> +EXPORT_SYMBOL_GPL(bman_portals_probed);
> +
>  static int bman_portal_probe(struct platform_device *pdev)
>  {
> struct device *dev = &pdev->dev;
> @@ -148,6 +155,7 @@ static int bman_portal_probe(struct platform_device *pdev)
> spin_lock(&bman_lock);
> cpu = cpumask_next_zero(-1, &portal_cpus);
> if (cpu >= nr_cpu_ids) {
> +   __bman_portals_probed = 1;

What if the last CPU is not used for portals?  Is there a hard
requirement that all CPUs need to be used for portal?  What happens if
the last CPU is offline?

> /* unassigned portal, skip init */
> spin_unlock(&bman_lock);
> return 0;
> @@ -173,6 +181,8 @@ static int bman_portal_probe(struct platform_device *pdev)
>  err_ioremap2:
> memunmap(pcfg->addr_virt_ce);
>  err_ioremap1:
> +__bman_portals_probed = 1;
> +

There are other error paths that are not covered.

> return -ENXIO;
>  }
>
> diff --git a/drivers/soc/fsl/qbman/qman_portal.c 
> b/drivers/soc/fsl/qbman/qman_portal.c
> index eef93cab84f1..1b2fc981c269 100644
> --- a/drivers/soc/fsl/qbman/qman_portal.c
> +++ b/drivers/soc/fsl/qbman/qman_portal.c
> @@ -39,6 +39,7 @@ EXPORT_SYMBOL(qman_dma_portal);
>  #define CONFIG_FSL_DPA_PIRQ_FAST  1
>
>  static struct cpumask portal_cpus;
> +static int __qman_portals_probed;
>  /* protect qman global registers and global data shared among portals */
>  static DEFINE_SPINLOCK(qman_lock);
>
> @@ -219,6 +220,12 @@ static int qman_online_cpu(unsigned int cpu)
> return 0;
>  }
>
> +int qman_portals_probed(void)
> +{
> +   return __qman_portals_probed;
> +}
> +EXPORT_SYMBOL_GPL(qman_portals_probed);
> +
>  static int qman_portal_probe(struct platform_device *pdev)
>  {
> struct device *dev = &pdev->dev;
> @@ -306,6 +313,7 @@ static int qman_portal_probe(struct platform_device *pdev)
> spin_lock(&qman_lock);
> cpu = cpumask_next_zero(-1, &portal_cpus);
> if (cpu >= nr_cpu_ids) {
> +   __qman_portals_probed = 1;

Ditto.

> /* unassigned portal, skip init */
> spin_unlock(&qman_lock);
> return 0;
> @@ -336,6 +344,8 @@ static int qman_portal_probe(struct platform_device *pdev)
>  err_ioremap2:
> memunmap(pcfg->addr_virt_ce);
>  err_ioremap1:
> +   __qman_portals_probed = -1;
> +

Ditto.

> return -ENXIO;
>  }
>
> diff --git a/include/soc/fsl/bman.h b/include/soc/fsl/bman.h
> index 5b99cb2ea5ef..173e4049d963 100644
> --- a/include/soc/fsl/bman.h
> +++ b/include/soc/fsl/bman.h
> @@ -133,5 +133,13 @@ int bman_acquire(struct bman_pool *pool, struct 
> bm_buffer *bufs, u8 num);
>   * failed to probe or 0 if the bman driver did not probed yet.
>   */
>  int bman_is_probed(void);
> +/**
> + * bman_portals_probed - Check if all cpu bound bman portals are probed
> + *
> + * Returns 1 if all the required cpu bound bman portals successfully probed,
> + * -1 if probe errors appeared or 0 if the bman portals did not yet finished
> + * probing.
> + */
> +int bman_portals_probed(void);
>
>  #endif /* __FSL_BMAN_H */
> diff --git a/include/soc/fsl/qman.h b/include/soc/fsl/qman.h
> index 597783b8a3a0..7732e48081eb 100644
> --- a/include/soc/fsl/qman.h
> +++ b/include/soc/fsl/qman.h
> @@ -1194,4 +1194,13 @@ int qman_release_cgrid(u

[PATCH net-next] net: nixge: Address compiler warnings when building for i386

2018-09-27 Thread Moritz Fischer
Address compiler warning reported by kbuild autobuilders
when building for i386 as a result of dma_addr_t size on
different architectures.

warning: cast to pointer from integer of different size
[-Wint-to-pointer-cast]

Fixes: 7e8d5755be0e ("net: nixge: Add support for 64-bit platforms")
Signed-off-by: Moritz Fischer 
Cc: Arnd Bergmann 
---
 drivers/net/ethernet/ni/nixge.c | 14 +++---
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/drivers/net/ethernet/ni/nixge.c b/drivers/net/ethernet/ni/nixge.c
index 74cf52e3fb09..0611f2335b4a 100644
--- a/drivers/net/ethernet/ni/nixge.c
+++ b/drivers/net/ethernet/ni/nixge.c
@@ -127,8 +127,8 @@ struct nixge_hw_dma_bd {
 #ifdef CONFIG_PHYS_ADDR_T_64BIT
 #define nixge_hw_dma_bd_set_addr(bd, field, addr) \
do { \
-   (bd)->field##_lo = lower_32_bits(((u64)addr)); \
-   (bd)->field##_hi = upper_32_bits(((u64)addr)); \
+   (bd)->field##_lo = lower_32_bits((addr)); \
+   (bd)->field##_hi = upper_32_bits((addr)); \
} while (0)
 #else
 #define nixge_hw_dma_bd_set_addr(bd, field, addr) \
@@ -251,7 +251,7 @@ static void nixge_hw_dma_bd_release(struct net_device *ndev)
 NIXGE_MAX_JUMBO_FRAME_SIZE,
 DMA_FROM_DEVICE);
 
-   skb = (struct sk_buff *)
+   skb = (struct sk_buff *)(uintptr_t)
nixge_hw_dma_bd_get_addr(&priv->rx_bd_v[i],
 sw_id_offset);
dev_kfree_skb(skb);
@@ -323,7 +323,7 @@ static int nixge_hw_dma_bd_init(struct net_device *ndev)
if (!skb)
goto out;
 
-   nixge_hw_dma_bd_set_offset(&priv->rx_bd_v[i], skb);
+   nixge_hw_dma_bd_set_offset(&priv->rx_bd_v[i], (uintptr_t)skb);
phys = dma_map_single(ndev->dev.parent, skb->data,
  NIXGE_MAX_JUMBO_FRAME_SIZE,
  DMA_FROM_DEVICE);
@@ -601,8 +601,8 @@ static int nixge_recv(struct net_device *ndev, int budget)
tail_p = priv->rx_bd_p + sizeof(*priv->rx_bd_v) *
 priv->rx_bd_ci;
 
-   skb = (struct sk_buff *)nixge_hw_dma_bd_get_addr(cur_p,
-sw_id_offset);
+   skb = (struct sk_buff *)(uintptr_t)
+   nixge_hw_dma_bd_get_addr(cur_p, sw_id_offset);
 
length = cur_p->status & XAXIDMA_BD_STS_ACTUAL_LEN_MASK;
if (length > NIXGE_MAX_JUMBO_FRAME_SIZE)
@@ -643,7 +643,7 @@ static int nixge_recv(struct net_device *ndev, int budget)
nixge_hw_dma_bd_set_phys(cur_p, cur_phys);
cur_p->cntrl = NIXGE_MAX_JUMBO_FRAME_SIZE;
cur_p->status = 0;
-   nixge_hw_dma_bd_set_offset(cur_p, new_skb);
+   nixge_hw_dma_bd_set_offset(cur_p, (uintptr_t)new_skb);
 
++priv->rx_bd_ci;
priv->rx_bd_ci %= RX_BD_NUM;
-- 
2.19.0



Re: [PATCH v2 net-next 2/2] dt-bindings: net: add support for Microchip KSZ9131 Ethernet PHY

2018-09-27 Thread Andrew Lunn
On Thu, Sep 27, 2018 at 04:16:55PM -0400, Yuiko Oshino wrote:
> Add support for Microchip Technology KSZ9131 10/100/1000 Ethernet PHY
> 
> Signed-off-by: Yuiko Oshino 
> ---
>  .../devicetree/bindings/net/micrel-ksz90x1.txt | 29 
> +-
>  1 file changed, 28 insertions(+), 1 deletion(-)
> 
> diff --git a/Documentation/devicetree/bindings/net/micrel-ksz90x1.txt 
> b/Documentation/devicetree/bindings/net/micrel-ksz90x1.txt
> index e22d8cf..d23d14a 100644
> --- a/Documentation/devicetree/bindings/net/micrel-ksz90x1.txt
> +++ b/Documentation/devicetree/bindings/net/micrel-ksz90x1.txt
> @@ -1,4 +1,4 @@
> -Micrel KSZ9021/KSZ9031 Gigabit Ethernet PHY
> +Micrel/Microchip KSZ9021/KSZ9031/KSZ9131 Gigabit Ethernet PHY
>  
>  Some boards require special tuning values, particularly when it comes
>  to clock delays. You can specify clock delay values in the PHY OF
> @@ -64,6 +64,33 @@ KSZ9031:
>  Attention: The link partner must be configurable as slave otherwise
>  no link will be established.
>  
> +KSZ9131:
> +
> +  All skew control options are specified in picoseconds. The minimum
> +  value is 0, and the maximum is property-dependent. The increment
> +  step is 100ps. The default value is the neutral setting, so setting
> +  rxc-skew-ps=<0> actually results in -700 picoseconds adjustment.

We also have:

KSZ9021:

  All skew control options are specified in picoseconds. The minimum
  value is 0, the maximum value is 3000, and it is incremented by 200ps
  steps.

and

KSZ9031:

  All skew control options are specified in picoseconds. The minimum
  value is 0, and the maximum is property-dependent. The increment
  step is 60ps. The default value is the neutral setting, so setting
  rxc-skew-ps=<0> actually results in -900 picoseconds adjustment.

So this is the third meaning of 0.

How about making 0 mean 0? Have the ranges be -700 to 1800 and -700
to 800. KSZ9031 and KSZ9131 would then use the same meaning of 0, with 0
actually meaning 0!

   Andrew


Re: [PATCH net-next] tcp: up initial rmem to 128KB and SYN rwin to around 64KB

2018-09-27 Thread Yuchung Cheng
On Thu, Sep 27, 2018 at 11:21 AM, Yuchung Cheng  wrote:
> Previously TCP initial receive buffer is ~87KB by default and
> the initial receive window is ~29KB (20 MSS). This patch changes
> the two numbers to 128KB and ~64KB (rounding down to the multiples
> of MSS) respectively. The patch also simplifies the calculations s.t.
> the two numbers are directly controlled by sysctl tcp_rmem[1]:
>
>   1) Initial receiver buffer budget (sk_rcvbuf): while this should
>  be configured via sysctl tcp_rmem[1], previously tcp_fixup_rcvbuf()
>  always override and set a larger size when a new connection
>  establishes.
>
>   2) Initial receive window in SYN: previously it is set to 20
>  packets if MSS <= 1460. The number 20 was based on the initial
>  congestion window of 10: the receiver needs twice amount to
>  avoid being limited by the receive window upon out-of-order
>  delivery in the first window burst. But since this only
>  applies if the receiving MSS <= 1460, connection using large MTU
>  (e.g. to utilize receiver zero-copy) may be limited by the
>  receive window.
>
> With this patch TCP memory configuration is more straight-forward and
> more properly sized to modern high-speed networks by default. Several
> popular stacks have been announcing 64KB rwin in SYNs as well.
Sorry, please ignore this patch for now.

We need to adjust rbuf autotuning as well; otherwise, with a larger initial
rbuf, it may increase too slowly during slow start. Will submit a v2.

>
> Signed-off-by: Yuchung Cheng 
> Signed-off-by: Wei Wang 
> Signed-off-by: Neal Cardwell 
> Signed-off-by: Eric Dumazet 
> Reviewed-by: Soheil Hassas Yeganeh 
> ---
>  net/ipv4/tcp.c|  4 ++--
>  net/ipv4/tcp_input.c  | 25 ++---
>  net/ipv4/tcp_output.c | 25 -
>  3 files changed, 8 insertions(+), 46 deletions(-)
>
> diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
> index 69c236943f56..dcf51fbf5ec7 100644
> --- a/net/ipv4/tcp.c
> +++ b/net/ipv4/tcp.c
> @@ -3896,8 +3896,8 @@ void __init tcp_init(void)
> init_net.ipv4.sysctl_tcp_wmem[2] = max(64*1024, max_wshare);
>
> init_net.ipv4.sysctl_tcp_rmem[0] = SK_MEM_QUANTUM;
> -   init_net.ipv4.sysctl_tcp_rmem[1] = 87380;
> -   init_net.ipv4.sysctl_tcp_rmem[2] = max(87380, max_rshare);
> +   init_net.ipv4.sysctl_tcp_rmem[1] = 131072;
> +   init_net.ipv4.sysctl_tcp_rmem[2] = max(131072, max_rshare);
>
> pr_info("Hash tables configured (established %u bind %u)\n",
> tcp_hashinfo.ehash_mask + 1, tcp_hashinfo.bhash_size);
> diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
> index d703a0b3b6a2..7a59f6a96212 100644
> --- a/net/ipv4/tcp_input.c
> +++ b/net/ipv4/tcp_input.c
> @@ -426,26 +426,7 @@ static void tcp_grow_window(struct sock *sk, const 
> struct sk_buff *skb)
> }
>  }
>
> -/* 3. Tuning rcvbuf, when connection enters established state. */
> -static void tcp_fixup_rcvbuf(struct sock *sk)
> -{
> -   u32 mss = tcp_sk(sk)->advmss;
> -   int rcvmem;
> -
> -   rcvmem = 2 * SKB_TRUESIZE(mss + MAX_TCP_HEADER) *
> -tcp_default_init_rwnd(mss);
> -
> -   /* Dynamic Right Sizing (DRS) has 2 to 3 RTT latency
> -* Allow enough cushion so that sender is not limited by our window
> -*/
> -   if (sock_net(sk)->ipv4.sysctl_tcp_moderate_rcvbuf)
> -   rcvmem <<= 2;
> -
> -   if (sk->sk_rcvbuf < rcvmem)
> -   sk->sk_rcvbuf = min(rcvmem, 
> sock_net(sk)->ipv4.sysctl_tcp_rmem[2]);
> -}
> -
> -/* 4. Try to fixup all. It is made immediately after connection enters
> +/* 3. Try to fixup all. It is made immediately after connection enters
>   *established state.
>   */
>  void tcp_init_buffer_space(struct sock *sk)
> @@ -454,8 +435,6 @@ void tcp_init_buffer_space(struct sock *sk)
> struct tcp_sock *tp = tcp_sk(sk);
> int maxwin;
>
> -   if (!(sk->sk_userlocks & SOCK_RCVBUF_LOCK))
> -   tcp_fixup_rcvbuf(sk);
> if (!(sk->sk_userlocks & SOCK_SNDBUF_LOCK))
> tcp_sndbuf_expand(sk);
>
> @@ -485,7 +464,7 @@ void tcp_init_buffer_space(struct sock *sk)
> tp->snd_cwnd_stamp = tcp_jiffies32;
>  }
>
> -/* 5. Recalculate window clamp after socket hit its memory bounds. */
> +/* 4. Recalculate window clamp after socket hit its memory bounds. */
>  static void tcp_clamp_window(struct sock *sk)
>  {
> struct tcp_sock *tp = tcp_sk(sk);
> diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
> index fe7855b090e4..059b67af28b1 100644
> --- a/net/ipv4/tcp_output.c
> +++ b/net/ipv4/tcp_output.c
> @@ -195,21 +195,6 @@ static inline void tcp_event_ack_sent(struct sock *sk, 
> unsigned int pkts,
> inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK);
>  }
>
> -
> -u32 tcp_default_init_rwnd(u32 mss)
> -{
> -   /* Initial receive window should be twice of TCP_INIT_CWND to
> -* enable proper sending of new unsent data during fast recovery
> -

Re: kernel 4.18.5 Realtek 8111G network adapter stops responding under high system load

2018-09-27 Thread David Arendt
Hi,

Heiner Kallweit's patch seems to resolve the problem. The machine was
under high disk and network io pressure today and networking was
perfectly stable.

Bye,
David Arendt

On 9/25/18 11:03 PM, Heiner Kallweit wrote:
> On 19.09.2018 06:12, David Arendt wrote:
>> Hi,
>>
>> Thanks for the patch.
>>
>> I just applied it and the TxConfig register now contains 0x4f000f80.
>> The next day will show if it really solves the problem.
>>
>> Thanks in advance,
>> David Arendt
>>
>> On 9/19/18 12:30 AM, Maciej S. Szmigiero wrote:
>>> Hi,
>>>
>>> On 18.09.2018 12:23, David Arendt wrote:
>>>> Hi,
>>>>
>>>> Today I had the network adapter problems again.
>>>> So the patch doesn't seem to change anything regarding this problem.
>>>> This week my time is unfortunately very limited, but I will try to
>>>> find some time next weekend to look a bit more into the issue.
>>> If the problem is caused by missing TXCFG_AUTO_FIFO bit in TxConfig,
>>> as the register difference would suggest, then you can try applying
>>> the following patch (hack) on top of 4.18.8 that is already patched
>>> with commit f74dd480cf4e:
>>> --- a/drivers/net/ethernet/realtek/r8169.c
>>> +++ b/drivers/net/ethernet/realtek/r8169.c
>>> @@ -5043,7 +5043,8 @@
>>>  {
>>> /* Set DMA burst size and Interframe Gap Time */
>>> RTL_W32(tp, TxConfig, (TX_DMA_BURST << TxDMAShift) |
>>> -   (InterFrameGap << TxInterFrameGapShift));
>>> +   (InterFrameGap << TxInterFrameGapShift)
>>> +   | TXCFG_AUTO_FIFO);
>>>  }
>>>  
>>>  static void rtl_set_rx_max_size(struct rtl8169_private *tp)
>>>
>>> This hack will probably only work properly on RTL_GIGA_MAC_VER_40 or
>>> later NICs.
>>>
>>> Before running any tests please verify with "ethtool -d enp3s0" that
>>> TxConfig register now contains 0x4f000f80, as it did in the old,
>>> working driver version.
>>>
>>> If this does not help then a bisection will most likely be needed.
>>>
>>>> Thanks in advance,
>>>> David Arendt
>>> Maciej
>>
>>
> @Gabriel:
> Thanks for the hint, I wasn't fully aware of this thread.
> @Maciej:
> Thanks for the analysis.
>
> It seems that all chip versions from 34 (= RTL8168E-VL) with the
> exception of version 39 (= RTL8106E, first sub-version) need
> bit TXCFG_AUTO_FIFO.
>
> And indeed, due to reordering of calls this bit is overwritten.
> Following patch moves setting the bit from the chip-specific
> hw_start function to rtl_set_tx_config_registers().
>
> Whoever is hit by the issue and has the option to build a kernel,
> could you please test whether the patch fixes the issue for you?
>
> Thanks, Heiner
>
> ---
>  drivers/net/ethernet/realtek/r8169.c | 20 
>  1 file changed, 8 insertions(+), 12 deletions(-)
>
> diff --git a/drivers/net/ethernet/realtek/r8169.c 
> b/drivers/net/ethernet/realtek/r8169.c
> index f882be49f..ae8abe900 100644
> --- a/drivers/net/ethernet/realtek/r8169.c
> +++ b/drivers/net/ethernet/realtek/r8169.c
> @@ -4514,9 +4514,14 @@ static void rtl8169_hw_reset(struct rtl8169_private 
> *tp)
>  
>  static void rtl_set_tx_config_registers(struct rtl8169_private *tp)
>  {
> - /* Set DMA burst size and Interframe Gap Time */
> - RTL_W32(tp, TxConfig, (TX_DMA_BURST << TxDMAShift) |
> - (InterFrameGap << TxInterFrameGapShift));
> + u32 val = TX_DMA_BURST << TxDMAShift |
> +   InterFrameGap << TxInterFrameGapShift;
> +
> + if (tp->mac_version >= RTL_GIGA_MAC_VER_34 &&
> + tp->mac_version != RTL_GIGA_MAC_VER_39)
> + val |= TXCFG_AUTO_FIFO;
> +
> + RTL_W32(tp, TxConfig, val);
>  }
>  
>  static void rtl_set_rx_max_size(struct rtl8169_private *tp)
> @@ -5011,7 +5016,6 @@ static void rtl_hw_start_8168e_2(struct rtl8169_private 
> *tp)
>  
>   rtl_disable_clock_request(tp);
>  
> - RTL_W32(tp, TxConfig, RTL_R32(tp, TxConfig) | TXCFG_AUTO_FIFO);
>   RTL_W8(tp, MCU, RTL_R8(tp, MCU) & ~NOW_IS_OOB);
>  
>   /* Adjust EEE LED frequency */
> @@ -5045,7 +5049,6 @@ static void rtl_hw_start_8168f(struct rtl8169_private 
> *tp)
>  
>   rtl_disable_clock_request(tp);
>  
> - RTL_W32(tp, TxConfig, RTL_R32(tp, TxConfig) | TXCFG_AUTO_FIFO);
>   RTL_W8(tp, MCU, RTL_R8(tp, MCU) & ~NOW_IS_OOB);
>   RTL_W8(tp, DLLPR, RTL_R8(tp, DLLPR) | PFM_EN);
>   RTL_W32(tp, MISC, RTL_R32(tp, MISC) | PWM_EN);
> @@ -5090,8 +5093,6 @@ static void rtl_hw_start_8411(struct rtl8169_private 
> *tp)
>  
>  static void rtl_hw_start_8168g(struct rtl8169_private *tp)
>  {
> - RTL_W32(tp, TxConfig, RTL_R32(tp, TxConfig) | TXCFG_AUTO_FIFO);
> -
>   rtl_eri_write(tp, 0xc8, ERIAR_MASK_0101, 0x080002, ERIAR_EXGMAC);
>   rtl_eri_write(tp, 0xcc, ERIAR_MASK_0001, 0x38, ERIAR_EXGMAC);
>   rtl_eri_write(tp, 0xd0, ERIAR_MASK_0001, 0x48, ERIAR_EXGMAC);
> @@ -5189,8 +5190,6 @@ static void rtl_hw_start_8168h_1(struct rtl8169_private 
> *tp)
>   rtl_hw_aspm_clkreq_enable(tp, false);
>   rtl_ephy_init(tp, e_info_8168h_1, ARRAY_SIZE(e_info_8168h_1));
>  
> - RTL_W3

  1   2   3   >