[PATCH v2 net-next] net: phy: improve handling delayed work
Using mod_delayed_work() allows to simplify handling delayed work and removes the need for the sync parameter in phy_trigger_machine(). Also introduce a helper phy_queue_state_machine() to encapsulate the low-level delayed work calls. No functional change intended. Signed-off-by: Heiner Kallweit --- v2: - removed inline annotation from phy_queue_state_machine() --- drivers/net/phy/phy.c | 29 +++-- include/linux/phy.h | 2 +- 2 files changed, 16 insertions(+), 15 deletions(-) diff --git a/drivers/net/phy/phy.c b/drivers/net/phy/phy.c index a1f8e4816..14509a890 100644 --- a/drivers/net/phy/phy.c +++ b/drivers/net/phy/phy.c @@ -537,7 +537,7 @@ static int phy_start_aneg_priv(struct phy_device *phydev, bool sync) mutex_unlock(&phydev->lock); if (trigger) - phy_trigger_machine(phydev, sync); + phy_trigger_machine(phydev); return err; } @@ -635,6 +635,13 @@ int phy_speed_up(struct phy_device *phydev) } EXPORT_SYMBOL_GPL(phy_speed_up); +static void phy_queue_state_machine(struct phy_device *phydev, + unsigned int secs) +{ + mod_delayed_work(system_power_efficient_wq, &phydev->state_queue, +secs * HZ); +} + /** * phy_start_machine - start PHY state machine tracking * @phydev: the phy_device struct @@ -647,7 +654,7 @@ EXPORT_SYMBOL_GPL(phy_speed_up); */ void phy_start_machine(struct phy_device *phydev) { - queue_delayed_work(system_power_efficient_wq, &phydev->state_queue, HZ); + phy_queue_state_machine(phydev, 1); } EXPORT_SYMBOL_GPL(phy_start_machine); @@ -655,19 +662,14 @@ EXPORT_SYMBOL_GPL(phy_start_machine); * phy_trigger_machine - trigger the state machine to run * * @phydev: the phy_device struct - * @sync: indicate whether we should wait for the workqueue cancelation * * Description: There has been a change in state which requires that the * state machine runs. 
*/ -void phy_trigger_machine(struct phy_device *phydev, bool sync) +void phy_trigger_machine(struct phy_device *phydev) { - if (sync) - cancel_delayed_work_sync(&phydev->state_queue); - else - cancel_delayed_work(&phydev->state_queue); - queue_delayed_work(system_power_efficient_wq, &phydev->state_queue, 0); + phy_queue_state_machine(phydev, 0); } /** @@ -703,7 +705,7 @@ static void phy_error(struct phy_device *phydev) phydev->state = PHY_HALTED; mutex_unlock(&phydev->lock); - phy_trigger_machine(phydev, false); + phy_trigger_machine(phydev); } /** @@ -745,7 +747,7 @@ static irqreturn_t phy_change(struct phy_device *phydev) mutex_unlock(&phydev->lock); /* reschedule state queue work to run as soon as possible */ - phy_trigger_machine(phydev, true); + phy_trigger_machine(phydev); if (phy_interrupt_is_valid(phydev) && phy_clear_interrupt(phydev)) goto phy_err; @@ -911,7 +913,7 @@ void phy_start(struct phy_device *phydev) } mutex_unlock(&phydev->lock); - phy_trigger_machine(phydev, true); + phy_trigger_machine(phydev); } EXPORT_SYMBOL(phy_start); @@ -1130,8 +1132,7 @@ void phy_state_machine(struct work_struct *work) * called from phy_disconnect() synchronously. 
*/ if (phy_polling_mode(phydev) && old_state != PHY_HALTED) - queue_delayed_work(system_power_efficient_wq, &phydev->state_queue, - PHY_STATE_TIME * HZ); + phy_queue_state_machine(phydev, PHY_STATE_TIME); } /** diff --git a/include/linux/phy.h b/include/linux/phy.h index 192a1fa0c..15bd074ef 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -1039,7 +1039,7 @@ void phy_change_work(struct work_struct *work); void phy_mac_interrupt(struct phy_device *phydev); void phy_start_machine(struct phy_device *phydev); void phy_stop_machine(struct phy_device *phydev); -void phy_trigger_machine(struct phy_device *phydev, bool sync); +void phy_trigger_machine(struct phy_device *phydev); int phy_ethtool_sset(struct phy_device *phydev, struct ethtool_cmd *cmd); void phy_ethtool_ksettings_get(struct phy_device *phydev, struct ethtool_link_ksettings *cmd); -- 2.19.0
[PATCH v3 net-next 6/9] bnxt_en: Use msix_vec_per_pf_max and msix_vec_per_pf_min devlink params.
This patch adds support for following generic permanent mode devlink parameters. They can be modified using devlink param commands. msix_vec_per_pf_max - This param sets the number of MSIX vectors that the device requests from the host on driver initialization. This value is set in the device which limits MSIX vectors per PF. msix_vec_per_pf_min - This param sets the number of minimal MSIX vectors required for the device initialization. Value 0 indicates a default value is selected. This value is set in the device which limits MSIX vectors per PF. Cc: Michael Chan Signed-off-by: Vasundhara Volam --- drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.c | 50 ++- drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.h | 5 +++ 2 files changed, 53 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.c index dc566fd..de7e74a 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.c @@ -26,6 +26,10 @@ BNXT_NVM_SHARED_CFG, 1}, {DEVLINK_PARAM_GENERIC_ID_IGNORE_ARI, NVM_OFF_IGNORE_ARI, BNXT_NVM_SHARED_CFG, 1}, + {DEVLINK_PARAM_GENERIC_ID_MSIX_VEC_PER_PF_MAX, +NVM_OFF_MSIX_VEC_PER_PF_MAX, BNXT_NVM_SHARED_CFG, 10}, + {DEVLINK_PARAM_GENERIC_ID_MSIX_VEC_PER_PF_MIN, +NVM_OFF_MSIX_VEC_PER_PF_MIN, BNXT_NVM_SHARED_CFG, 7}, }; static int bnxt_hwrm_nvm_req(struct bnxt *bp, u32 param_id, void *msg, @@ -57,8 +61,22 @@ static int bnxt_hwrm_nvm_req(struct bnxt *bp, u32 param_id, void *msg, idx = bp->pf.fw_fid - BNXT_FIRST_PF_FID; bytesize = roundup(nvm_param.num_bits, BITS_PER_BYTE) / BITS_PER_BYTE; - if (nvm_param.num_bits == 1) - buf = &val->vbool; + switch (bytesize) { + case 1: + if (nvm_param.num_bits == 1) + buf = &val->vbool; + else + buf = &val->vu8; + break; + case 2: + buf = &val->vu16; + break; + case 4: + buf = &val->vu32; + break; + default: + return -EFAULT; + } data_addr = dma_zalloc_coherent(&bp->pdev->dev, bytesize, &data_dma_addr, 
GFP_KERNEL); @@ -109,6 +127,26 @@ static int bnxt_dl_nvm_param_set(struct devlink *dl, u32 id, return bnxt_hwrm_nvm_req(bp, id, &req, sizeof(req), &ctx->val); } +static int bnxt_dl_msix_validate(struct devlink *dl, u32 id, +union devlink_param_value val, +struct netlink_ext_ack *extack) +{ + int max_val; + + if (id == DEVLINK_PARAM_GENERIC_ID_MSIX_VEC_PER_PF_MAX) + max_val = BNXT_MSIX_VEC_MAX; + + if (id == DEVLINK_PARAM_GENERIC_ID_MSIX_VEC_PER_PF_MIN) + max_val = BNXT_MSIX_VEC_MIN_MAX; + + if (val.vu32 < 0 || val.vu32 > max_val) { + NL_SET_ERR_MSG_MOD(extack, "MSIX value is exceeding the range"); + return -EINVAL; + } + + return 0; +} + static const struct devlink_param bnxt_dl_params[] = { DEVLINK_PARAM_GENERIC(ENABLE_SRIOV, BIT(DEVLINK_PARAM_CMODE_PERMANENT), @@ -118,6 +156,14 @@ static int bnxt_dl_nvm_param_set(struct devlink *dl, u32 id, BIT(DEVLINK_PARAM_CMODE_PERMANENT), bnxt_dl_nvm_param_get, bnxt_dl_nvm_param_set, NULL), + DEVLINK_PARAM_GENERIC(MSIX_VEC_PER_PF_MAX, + BIT(DEVLINK_PARAM_CMODE_PERMANENT), + bnxt_dl_nvm_param_get, bnxt_dl_nvm_param_set, + bnxt_dl_msix_validate), + DEVLINK_PARAM_GENERIC(MSIX_VEC_PER_PF_MIN, + BIT(DEVLINK_PARAM_CMODE_PERMANENT), + bnxt_dl_nvm_param_get, bnxt_dl_nvm_param_set, + bnxt_dl_msix_validate), }; int bnxt_dl_register(struct bnxt *bp) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.h b/drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.h index da146492..0e67c05 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.h +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.h @@ -33,10 +33,15 @@ static inline void bnxt_link_bp_to_dl(struct bnxt *bp, struct devlink *dl) } } +#define NVM_OFF_MSIX_VEC_PER_PF_MAX108 +#define NVM_OFF_MSIX_VEC_PER_PF_MIN114 #define NVM_OFF_IGNORE_ARI 164 #define NVM_OFF_HW_TC_OFFLOAD 170 #define NVM_OFF_ENABLE_SRIOV 401 +#define BNXT_MSIX_VEC_MAX 1280 +#define BNXT_MSIX_VEC_MIN_MAX 128 + enum bnxt_nvm_dir_type { BNXT_NVM_SHARED_CFG = 40, BNXT_NVM_PORT_CFG, -- 1.8.3.1
[PATCH v3 net-next 8/9] devlink: Add Documentation/networking/devlink-params.txt
This patch adds a new file to add information about some of the generic configuration parameters set via devlink. Cc: "David S. Miller" Cc: Jonathan Corbet Cc: linux-...@vger.kernel.org Cc: Jiri Pirko Cc: Michael Chan Signed-off-by: Vasundhara Volam --- Documentation/networking/devlink-params.txt | 42 + 1 file changed, 42 insertions(+) create mode 100644 Documentation/networking/devlink-params.txt diff --git a/Documentation/networking/devlink-params.txt b/Documentation/networking/devlink-params.txt new file mode 100644 index 000..ae444ff --- /dev/null +++ b/Documentation/networking/devlink-params.txt @@ -0,0 +1,42 @@ +Devlink configuration parameters + +Following is the list of configuration parameters via devlink interface. +Each parameter can be generic or driver-specific, and all are device-level +parameters. + +Note that the driver-specific files should list the generic params +they support, along with the supported config modes. + +Each parameter can be set in different configuration modes: + runtime - set while driver is running, no reset required. + driverinit - applied while driver initializes, requires restart + driver by devlink reload command. + permanent - written to device's non-volatile memory, hard reset + required. + +Following is the list of parameters: + +enable_sriov [DEVICE, GENERIC] + Enable Single Root I/O Virtualisation (SRIOV) in + the device. + Type: Boolean + +ignore_ari [DEVICE, GENERIC] + Ignore Alternative Routing-ID Interpretation (ARI) + capability. If enabled, adapter will ignore ARI + capability even when the platform has the support + enabled and creates the same number of partitions as when + the platform does not support ARI. + Type: Boolean + +msix_vec_per_pf_max [DEVICE, GENERIC] + Provides the maximum number of MSIX interrupts that + a device can create. Value is same across all + physical functions (PFs) in the device. 
+ Type: u32 + +msix_vec_per_pf_min [DEVICE, GENERIC] + Provides the minimum number of MSIX interrupts required + for the device initialization. Value is same across all + physical functions (PFs) in the device. + Type: u32 -- 1.8.3.1
[PATCH v3 net-next 2/9] devlink: Add generic parameter msix_vec_per_pf_max
msix_vec_per_pf_max - This param sets the number of MSIX vectors that the device requests from the host on driver initialization. This value is set in the device which is applicable per PF. Cc: Jiri Pirko Cc: Michael Chan Signed-off-by: Vasundhara Volam --- include/net/devlink.h | 4 net/core/devlink.c| 5 + 2 files changed, 9 insertions(+) diff --git a/include/net/devlink.h b/include/net/devlink.h index 90d8343..59be17b 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -363,6 +363,7 @@ enum devlink_param_generic_id { DEVLINK_PARAM_GENERIC_ID_ENABLE_SRIOV, DEVLINK_PARAM_GENERIC_ID_REGION_SNAPSHOT, DEVLINK_PARAM_GENERIC_ID_IGNORE_ARI, + DEVLINK_PARAM_GENERIC_ID_MSIX_VEC_PER_PF_MAX, /* add new param generic ids above here*/ __DEVLINK_PARAM_GENERIC_ID_MAX, @@ -384,6 +385,9 @@ enum devlink_param_generic_id { #define DEVLINK_PARAM_GENERIC_IGNORE_ARI_NAME "ignore_ari" #define DEVLINK_PARAM_GENERIC_IGNORE_ARI_TYPE DEVLINK_PARAM_TYPE_BOOL +#define DEVLINK_PARAM_GENERIC_MSIX_VEC_PER_PF_MAX_NAME "msix_vec_per_pf_max" +#define DEVLINK_PARAM_GENERIC_MSIX_VEC_PER_PF_MAX_TYPE DEVLINK_PARAM_TYPE_U32 + #define DEVLINK_PARAM_GENERIC(_id, _cmodes, _get, _set, _validate) \ { \ .id = DEVLINK_PARAM_GENERIC_ID_##_id, \ diff --git a/net/core/devlink.c b/net/core/devlink.c index 3349a4d..ce9fe63 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -2680,6 +2680,11 @@ static int devlink_nl_cmd_reload(struct sk_buff *skb, struct genl_info *info) .name = DEVLINK_PARAM_GENERIC_IGNORE_ARI_NAME, .type = DEVLINK_PARAM_GENERIC_IGNORE_ARI_TYPE, }, + { + .id = DEVLINK_PARAM_GENERIC_ID_MSIX_VEC_PER_PF_MAX, + .name = DEVLINK_PARAM_GENERIC_MSIX_VEC_PER_PF_MAX_NAME, + .type = DEVLINK_PARAM_GENERIC_MSIX_VEC_PER_PF_MAX_TYPE, + }, }; static int devlink_param_generic_verify(const struct devlink_param *param) -- 1.8.3.1
[PATCH v3 net-next 4/9] bnxt_en: Use ignore_ari devlink parameter
This patch adds support for the ignore_ari generic permanent-mode devlink parameter. This parameter is disabled by default. It can be enabled using devlink param commands. ignore_ari - If enabled, the device ignores the ARI (Alternative Routing-ID Interpretation) capability even when the platform supports it, and creates the same number of partitions as when the platform does not support ARI. Cc: Michael Chan Signed-off-by: Vasundhara Volam --- drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.c | 6 ++ drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.h | 2 ++ 2 files changed, 8 insertions(+) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.c index 790c684..5173881 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.c @@ -24,6 +24,8 @@ static const struct bnxt_dl_nvm_param nvm_params[] = { {DEVLINK_PARAM_GENERIC_ID_ENABLE_SRIOV, NVM_OFF_ENABLE_SRIOV, BNXT_NVM_SHARED_CFG, 1}, + {DEVLINK_PARAM_GENERIC_ID_IGNORE_ARI, NVM_OFF_IGNORE_ARI, +BNXT_NVM_SHARED_CFG, 1}, }; static int bnxt_hwrm_nvm_req(struct bnxt *bp, u32 param_id, void *msg, @@ -108,6 +110,10 @@ static int bnxt_dl_nvm_param_set(struct devlink *dl, u32 id, BIT(DEVLINK_PARAM_CMODE_PERMANENT), bnxt_dl_nvm_param_get, bnxt_dl_nvm_param_set, NULL), + DEVLINK_PARAM_GENERIC(IGNORE_ARI, + BIT(DEVLINK_PARAM_CMODE_PERMANENT), + bnxt_dl_nvm_param_get, bnxt_dl_nvm_param_set, + NULL), }; int bnxt_dl_register(struct bnxt *bp) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.h b/drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.h index 2f68dc0..da146492 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.h +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.h @@ -33,6 +33,8 @@ static inline void bnxt_link_bp_to_dl(struct bnxt *bp, struct devlink *dl) } } +#define NVM_OFF_IGNORE_ARI 164 +#define NVM_OFF_HW_TC_OFFLOAD 170 #define NVM_OFF_ENABLE_SRIOV 401 enum bnxt_nvm_dir_type { -- 1.8.3.1
[PATCH v3 net-next 1/9] devlink: Add generic parameter ignore_ari
ignore_ari - The device ignores the ARI (Alternative Routing-ID Interpretation) capability even when the platform supports it, and creates the same number of partitions as when the platform does not support ARI. Cc: Jiri Pirko Cc: Michael Chan Signed-off-by: Vasundhara Volam --- include/net/devlink.h | 4 net/core/devlink.c| 5 + 2 files changed, 9 insertions(+) diff --git a/include/net/devlink.h b/include/net/devlink.h index b9b89d6..90d8343 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -362,6 +362,7 @@ enum devlink_param_generic_id { DEVLINK_PARAM_GENERIC_ID_MAX_MACS, DEVLINK_PARAM_GENERIC_ID_ENABLE_SRIOV, DEVLINK_PARAM_GENERIC_ID_REGION_SNAPSHOT, + DEVLINK_PARAM_GENERIC_ID_IGNORE_ARI, /* add new param generic ids above here*/ __DEVLINK_PARAM_GENERIC_ID_MAX, @@ -380,6 +381,9 @@ enum devlink_param_generic_id { #define DEVLINK_PARAM_GENERIC_REGION_SNAPSHOT_NAME "region_snapshot_enable" #define DEVLINK_PARAM_GENERIC_REGION_SNAPSHOT_TYPE DEVLINK_PARAM_TYPE_BOOL +#define DEVLINK_PARAM_GENERIC_IGNORE_ARI_NAME "ignore_ari" +#define DEVLINK_PARAM_GENERIC_IGNORE_ARI_TYPE DEVLINK_PARAM_TYPE_BOOL + #define DEVLINK_PARAM_GENERIC(_id, _cmodes, _get, _set, _validate) \ { \ .id = DEVLINK_PARAM_GENERIC_ID_##_id, \ diff --git a/net/core/devlink.c b/net/core/devlink.c index 8c0ed22..3349a4d 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -2675,6 +2675,11 @@ static int devlink_nl_cmd_reload(struct sk_buff *skb, struct genl_info *info) .name = DEVLINK_PARAM_GENERIC_REGION_SNAPSHOT_NAME, .type = DEVLINK_PARAM_GENERIC_REGION_SNAPSHOT_TYPE, }, + { + .id = DEVLINK_PARAM_GENERIC_ID_IGNORE_ARI, + .name = DEVLINK_PARAM_GENERIC_IGNORE_ARI_NAME, + .type = DEVLINK_PARAM_GENERIC_IGNORE_ARI_TYPE, + }, }; static int devlink_param_generic_verify(const struct devlink_param *param) -- 1.8.3.1
[PATCH v3 net-next 3/9] devlink: Add generic parameter msix_vec_per_pf_min
msix_vec_per_pf_min - This param sets the number of minimal MSIX vectors required for the device initialization. This value is set in the device which limits MSIX vectors per PF. Cc: Jiri Pirko Cc: Michael Chan Signed-off-by: Vasundhara Volam --- include/net/devlink.h | 4 net/core/devlink.c| 5 + 2 files changed, 9 insertions(+) diff --git a/include/net/devlink.h b/include/net/devlink.h index 59be17b..361f525 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -364,6 +364,7 @@ enum devlink_param_generic_id { DEVLINK_PARAM_GENERIC_ID_REGION_SNAPSHOT, DEVLINK_PARAM_GENERIC_ID_IGNORE_ARI, DEVLINK_PARAM_GENERIC_ID_MSIX_VEC_PER_PF_MAX, + DEVLINK_PARAM_GENERIC_ID_MSIX_VEC_PER_PF_MIN, /* add new param generic ids above here*/ __DEVLINK_PARAM_GENERIC_ID_MAX, @@ -388,6 +389,9 @@ enum devlink_param_generic_id { #define DEVLINK_PARAM_GENERIC_MSIX_VEC_PER_PF_MAX_NAME "msix_vec_per_pf_max" #define DEVLINK_PARAM_GENERIC_MSIX_VEC_PER_PF_MAX_TYPE DEVLINK_PARAM_TYPE_U32 +#define DEVLINK_PARAM_GENERIC_MSIX_VEC_PER_PF_MIN_NAME "msix_vec_per_pf_min" +#define DEVLINK_PARAM_GENERIC_MSIX_VEC_PER_PF_MIN_TYPE DEVLINK_PARAM_TYPE_U32 + #define DEVLINK_PARAM_GENERIC(_id, _cmodes, _get, _set, _validate) \ { \ .id = DEVLINK_PARAM_GENERIC_ID_##_id, \ diff --git a/net/core/devlink.c b/net/core/devlink.c index ce9fe63..25d3bfa 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -2685,6 +2685,11 @@ static int devlink_nl_cmd_reload(struct sk_buff *skb, struct genl_info *info) .name = DEVLINK_PARAM_GENERIC_MSIX_VEC_PER_PF_MAX_NAME, .type = DEVLINK_PARAM_GENERIC_MSIX_VEC_PER_PF_MAX_TYPE, }, + { + .id = DEVLINK_PARAM_GENERIC_ID_MSIX_VEC_PER_PF_MIN, + .name = DEVLINK_PARAM_GENERIC_MSIX_VEC_PER_PF_MIN_NAME, + .type = DEVLINK_PARAM_GENERIC_MSIX_VEC_PER_PF_MIN_TYPE, + }, }; static int devlink_param_generic_verify(const struct devlink_param *param) -- 1.8.3.1
[PATCH v3 net-next 9/9] devlink: Add Documentation/networking/devlink-params-bnxt.txt
This patch adds a new file to add information about configuration parameters that are supported by bnxt_en driver via devlink. Cc: "David S. Miller" Cc: Jonathan Corbet Cc: linux-...@vger.kernel.org Cc: Jiri Pirko Cc: Michael Chan Signed-off-by: Vasundhara Volam --- Documentation/networking/devlink-params-bnxt.txt | 22 ++ 1 file changed, 22 insertions(+) create mode 100644 Documentation/networking/devlink-params-bnxt.txt diff --git a/Documentation/networking/devlink-params-bnxt.txt b/Documentation/networking/devlink-params-bnxt.txt new file mode 100644 index 000..c7bc9d8 --- /dev/null +++ b/Documentation/networking/devlink-params-bnxt.txt @@ -0,0 +1,22 @@ +enable_sriov [DEVICE, GENERIC] + Type: Boolean + Configuration mode: Permanent + +ignore_ari [DEVICE, GENERIC] + Type: Boolean + Configuration mode: Permanent + +msix_vec_per_pf_max[DEVICE, GENERIC] + Type: u32 + Configuration mode: Permanent + +msix_vec_per_pf_min[DEVICE, GENERIC] + Type: u32 + Configuration mode: Permanent + +gre_ver_check [DEVICE, DRIVER-SPECIFIC] + Generic Routing Encapsulation (GRE) version check will + be enabled in the device. If disabled, device skips + version checking for incoming packets. + Type: Boolean + Configuration mode: Permanent -- 1.8.3.1
[PATCH v3 net-next 7/9] bnxt_en: Add a driver specific gre_ver_check devlink parameter.
This patch adds following driver-specific permanent mode boolean parameter. gre_ver_check - Generic Routing Encapsulation(GRE) version check will be enabled in the device. If disabled, device skips version checking for GRE packets. Cc: Michael Chan Signed-off-by: Vasundhara Volam --- drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.c | 24 ++- drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.h | 1 + 2 files changed, 24 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.c index de7e74a..8a10e01 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.c @@ -21,6 +21,11 @@ #endif /* CONFIG_BNXT_SRIOV */ }; +enum bnxt_dl_param_id { + BNXT_DEVLINK_PARAM_ID_BASE = DEVLINK_PARAM_GENERIC_ID_MAX, + BNXT_DEVLINK_PARAM_ID_GRE_VER_CHECK, +}; + static const struct bnxt_dl_nvm_param nvm_params[] = { {DEVLINK_PARAM_GENERIC_ID_ENABLE_SRIOV, NVM_OFF_ENABLE_SRIOV, BNXT_NVM_SHARED_CFG, 1}, @@ -30,6 +35,8 @@ NVM_OFF_MSIX_VEC_PER_PF_MAX, BNXT_NVM_SHARED_CFG, 10}, {DEVLINK_PARAM_GENERIC_ID_MSIX_VEC_PER_PF_MIN, NVM_OFF_MSIX_VEC_PER_PF_MIN, BNXT_NVM_SHARED_CFG, 7}, + {BNXT_DEVLINK_PARAM_ID_GRE_VER_CHECK, NVM_OFF_DIS_GRE_VER_CHECK, +BNXT_NVM_SHARED_CFG, 1}, }; static int bnxt_hwrm_nvm_req(struct bnxt *bp, u32 param_id, void *msg, @@ -112,9 +119,15 @@ static int bnxt_dl_nvm_param_get(struct devlink *dl, u32 id, { struct hwrm_nvm_get_variable_input req = {0}; struct bnxt *bp = bnxt_get_bp_from_dl(dl); + int rc; bnxt_hwrm_cmd_hdr_init(bp, &req, HWRM_NVM_GET_VARIABLE, -1, -1); - return bnxt_hwrm_nvm_req(bp, id, &req, sizeof(req), &ctx->val); + rc = bnxt_hwrm_nvm_req(bp, id, &req, sizeof(req), &ctx->val); + if (!rc) + if (id == BNXT_DEVLINK_PARAM_ID_GRE_VER_CHECK) + ctx->val.vbool = !ctx->val.vbool; + + return rc; } static int bnxt_dl_nvm_param_set(struct devlink *dl, u32 id, @@ -124,6 +137,10 @@ static int bnxt_dl_nvm_param_set(struct devlink *dl, 
u32 id, struct bnxt *bp = bnxt_get_bp_from_dl(dl); bnxt_hwrm_cmd_hdr_init(bp, &req, HWRM_NVM_SET_VARIABLE, -1, -1); + + if (id == BNXT_DEVLINK_PARAM_ID_GRE_VER_CHECK) + ctx->val.vbool = !ctx->val.vbool; + return bnxt_hwrm_nvm_req(bp, id, &req, sizeof(req), &ctx->val); } @@ -164,6 +181,11 @@ static int bnxt_dl_msix_validate(struct devlink *dl, u32 id, BIT(DEVLINK_PARAM_CMODE_PERMANENT), bnxt_dl_nvm_param_get, bnxt_dl_nvm_param_set, bnxt_dl_msix_validate), + DEVLINK_PARAM_DRIVER(BNXT_DEVLINK_PARAM_ID_GRE_VER_CHECK, +"gre_ver_check", DEVLINK_PARAM_TYPE_BOOL, +BIT(DEVLINK_PARAM_CMODE_PERMANENT), +bnxt_dl_nvm_param_get, bnxt_dl_nvm_param_set, +NULL), }; int bnxt_dl_register(struct bnxt *bp) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.h b/drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.h index 0e67c05..e36e41a 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.h +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.h @@ -37,6 +37,7 @@ static inline void bnxt_link_bp_to_dl(struct bnxt *bp, struct devlink *dl) #define NVM_OFF_MSIX_VEC_PER_PF_MIN114 #define NVM_OFF_IGNORE_ARI 164 #define NVM_OFF_HW_TC_OFFLOAD 170 +#define NVM_OFF_DIS_GRE_VER_CHECK 171 #define NVM_OFF_ENABLE_SRIOV 401 #define BNXT_MSIX_VEC_MAX 1280 -- 1.8.3.1
[PATCH v3 net-next 5/9] bnxt_en: return proper error when FW returns HWRM_ERR_CODE_RESOURCE_ACCESS_DENIED
Return proper error code when Firmware returns HWRM_ERR_CODE_RESOURCE_ACCESS_DENIED for HWRM_NVM_GET/SET_VARIABLE commands. Cc: Michael Chan Signed-off-by: Vasundhara Volam --- drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.c | 6 +- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.c index 5173881..dc566fd 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.c @@ -80,8 +80,12 @@ static int bnxt_hwrm_nvm_req(struct bnxt *bp, u32 param_id, void *msg, memcpy(buf, data_addr, bytesize); dma_free_coherent(&bp->pdev->dev, bytesize, data_addr, data_dma_addr); - if (rc) + if (rc == HWRM_ERR_CODE_RESOURCE_ACCESS_DENIED) { + netdev_err(bp->dev, "PF does not have admin privileges to modify NVM config\n"); + return -EACCES; + } else if (rc) { return -EIO; + } return 0; } -- 1.8.3.1
[PATCH v3 net-next 0/9] bnxt_en: devlink param updates
This patchset adds support for 3 generic and 1 driver-specific devlink parameters. Add documentation for these configuration parameters. Also, this patchset adds support to return proper error code if HWRM_NVM_GET/SET_VARIABLE commands return error code HWRM_ERR_CODE_RESOURCE_ACCESS_DENIED. v2->v3: -Remove description of generic parameters from devlink-params-bnxt.txt. v1->v2: -Remove hw_tc_offload parameter. -Update all patches with Cc of MAINTAINERS. -Add more description in commit message for device specific parameter. -Add a new Documentation/networking/devlink-params.txt with some generic devlink parameters information. -Add a new Documentation/networking/devlink-params-bnxt.txt with devlink parameters information that are supported by bnxt_en driver. Vasundhara Volam (9): devlink: Add generic parameter ignore_ari devlink: Add generic parameter msix_vec_per_pf_max devlink: Add generic parameter msix_vec_per_pf_min bnxt_en: Use ignore_ari devlink parameter bnxt_en: return proper error when FW returns HWRM_ERR_CODE_RESOURCE_ACCESS_DENIED bnxt_en: Use msix_vec_per_pf_max and msix_vec_per_pf_min devlink params. bnxt_en: Add a driver specific gre_ver_check devlink parameter. devlink: Add Documentation/networking/devlink-params.txt devlink: Add Documentation/networking/devlink-params-bnxt.txt Documentation/networking/devlink-params-bnxt.txt | 22 ++ Documentation/networking/devlink-params.txt | 42 +++ drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.c | 86 +-- drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.h | 8 +++ include/net/devlink.h | 12 net/core/devlink.c| 15 6 files changed, 181 insertions(+), 4 deletions(-) create mode 100644 Documentation/networking/devlink-params-bnxt.txt create mode 100644 Documentation/networking/devlink-params.txt -- 1.8.3.1
Re: re iproute2 - don't return error on success fix
On Thu, Sep 27, 2018 at 3:53 PM Phil Sutter wrote: > Hmm, I can't reproduce this. My HEAD is at the commit you mentioned: > > | % sudo ./tc/tc filter add dev d0 protocol ip parent : flower skip_sw > ip_flags nofirstfrag action drop > | RTNETLINK answers: Operation not supported > | We have an error talking to the kernel, -1 > | % echo $? > | 2 > > Are you sure you tested the right binary? I will double-check, but we're on a weekend plus holiday here, so I'll get to that next week.
[PATCH 13/15] octeontx2-af: Add support for CGX link management
From: Linu Cherian CGX LMAC initialization, link status polling, etc. are done by low-level secure firmware. For link management this patch adds an interface or communication mechanism between firmware and this kernel CGX driver. - Firmware interface specification is defined in cgx_fw_if.h. - Support to send/receive commands/events to/from firmware. - events/commands implemented * link up * link down * reading firmware version Signed-off-by: Linu Cherian Signed-off-by: Nithya Mani --- drivers/net/ethernet/marvell/octeontx2/af/cgx.c| 364 - drivers/net/ethernet/marvell/octeontx2/af/cgx.h| 32 ++ .../net/ethernet/marvell/octeontx2/af/cgx_fw_if.h | 222 + 3 files changed, 614 insertions(+), 4 deletions(-) create mode 100644 drivers/net/ethernet/marvell/octeontx2/af/cgx_fw_if.h diff --git a/drivers/net/ethernet/marvell/octeontx2/af/cgx.c b/drivers/net/ethernet/marvell/octeontx2/af/cgx.c index 06fd9fd..b306f57 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/cgx.c +++ b/drivers/net/ethernet/marvell/octeontx2/af/cgx.c @@ -24,16 +24,43 @@ #define DRV_NAME "octeontx2-cgx" #define DRV_STRING "Marvell OcteonTX2 CGX/MAC Driver" +/** + * struct lmac + * @wq_cmd_cmplt: waitq to keep the process blocked until cmd completion + * @cmd_lock: Lock to serialize the command interface + * @resp: command response + * @event_cb: callback for linkchange events + * @cmd_pend: flag set before new command is started + * flag cleared after command response is received + * @cgx: parent cgx port + * @lmac_id: lmac port id + * @name: lmac port name + */ +struct lmac { + wait_queue_head_t wq_cmd_cmplt; + struct mutex cmd_lock; + struct cgx_evt_sts resp; + struct cgx_event_cb event_cb; + bool cmd_pend; + struct cgx *cgx; + u8 lmac_id; + char *name; +}; + struct cgx { void __iomem*reg_base; struct pci_dev *pdev; u8 cgx_id; u8 lmac_count; + struct lmac *lmac_idmap[MAX_LMAC_PER_CGX]; struct list_headcgx_list; }; static LIST_HEAD(cgx_list); +/* CGX PHY management internal APIs */ +static int
cgx_fwi_link_change(struct cgx *cgx, int lmac_id, bool en); + /* Supported devices */ static const struct pci_device_id cgx_id_table[] = { { PCI_DEVICE(PCI_VENDOR_ID_CAVIUM, PCI_DEVID_OCTEONTX2_CGX) }, @@ -45,11 +72,24 @@ MODULE_DESCRIPTION(DRV_STRING); MODULE_LICENSE("GPL v2"); MODULE_DEVICE_TABLE(pci, cgx_id_table); +static void cgx_write(struct cgx *cgx, u64 lmac, u64 offset, u64 val) +{ + writeq(val, cgx->reg_base + (lmac << 18) + offset); +} + static u64 cgx_read(struct cgx *cgx, u64 lmac, u64 offset) { return readq(cgx->reg_base + (lmac << 18) + offset); } +static inline struct lmac *lmac_pdata(u8 lmac_id, struct cgx *cgx) +{ + if (!cgx || lmac_id >= MAX_LMAC_PER_CGX) + return NULL; + + return cgx->lmac_idmap[lmac_id]; +} + int cgx_get_cgx_cnt(void) { struct cgx *cgx_dev; @@ -85,18 +125,318 @@ void *cgx_get_pdata(int cgx_id) } EXPORT_SYMBOL(cgx_get_pdata); -static void cgx_lmac_init(struct cgx *cgx) +/* CGX Firmware interface low level support */ +static int cgx_fwi_cmd_send(struct cgx_cmd *cmd, struct cgx_evt_sts *rsp, + struct lmac *lmac) +{ + struct cgx *cgx = lmac->cgx; + union cgx_cmdreg creg; + union cgx_evtreg ereg; + struct device *dev; + int err = 0; + + /* Ensure no other command is in progress */ + err = mutex_lock_interruptible(&lmac->cmd_lock); + if (err) + return err; + + /* Ensure command register is free */ + creg.val = cgx_read(cgx, lmac->lmac_id, CGX_COMMAND_REG); + if (creg.cmd.own != CGX_CMD_OWN_NS) { + err = -EBUSY; + goto unlock; + } + + /* Update ownership in command request */ + cmd->own = CGX_CMD_OWN_FIRMWARE; + + /* Mark this lmac as pending, before we start */ + lmac->cmd_pend = true; + + /* Start command in hardware */ + creg.cmd = *cmd; + cgx_write(cgx, lmac->lmac_id, CGX_COMMAND_REG, creg.val); + creg.val = cgx_read(cgx, lmac->lmac_id, CGX_COMMAND_REG); + + /* Ensure command is completed without errors */ + if (!wait_event_timeout(lmac->wq_cmd_cmplt, !lmac->cmd_pend, + msecs_to_jiffies(CGX_CMD_TIMEOUT))) { + dev = 
&cgx->pdev->dev; + ereg.val = cgx_read(cgx, lmac->lmac_id, CGX_EVENT_REG); + if (ereg.val) { + dev_err(dev, "cgx port %d:%d: No event for response\n", + cgx->cgx_id, lmac->lmac_id); + /* copy event */ + lmac->resp = ereg
[PATCH 09/15] octeontx2-af: Configure block LF's MSIX vector offset
From: Sunil Goutham Firmware configures a certain number of MSIX vectors to each of enabled RVU PF/VF. When a block LF is attached to a PF/VF, number of MSIX vectors needed by that LF are set aside (out of PF/VF's total MSIX vectors) and LF's msix_offset is configured in HW. Also added support for a RVU PF/VF to retrieve that block LF's MSIX vector offset information from AF via mbox. Signed-off-by: Sunil Goutham --- drivers/net/ethernet/marvell/octeontx2/af/mbox.h | 18 ++ drivers/net/ethernet/marvell/octeontx2/af/rvu.c| 333 - drivers/net/ethernet/marvell/octeontx2/af/rvu.h| 7 + .../net/ethernet/marvell/octeontx2/af/rvu_struct.h | 2 + 4 files changed, 357 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/marvell/octeontx2/af/mbox.h b/drivers/net/ethernet/marvell/octeontx2/af/mbox.h index 7280d49..bedf0ee 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/mbox.h +++ b/drivers/net/ethernet/marvell/octeontx2/af/mbox.h @@ -122,6 +122,7 @@ static inline struct mbox_msghdr *otx2_mbox_alloc_msg(struct otx2_mbox *mbox, M(READY, 0x001, msg_req, ready_msg_rsp) \ M(ATTACH_RESOURCES,0x002, rsrc_attach, msg_rsp)\ M(DETACH_RESOURCES,0x003, rsrc_detach, msg_rsp)\ +M(MSIX_OFFSET, 0x004, msg_req, msix_offset_rsp)\ /* CGX mbox IDs (range 0x200 - 0x3FF) */ \ /* NPA mbox IDs (range 0x400 - 0x5FF) */ \ /* SSO/SSOW mbox IDs (range 0x600 - 0x7FF) */ \ @@ -190,4 +191,21 @@ struct rsrc_detach { u8 cptlfs:1; }; +#define MSIX_VECTOR_INVALID0x +#define MAX_RVU_BLKLF_CNT 256 + +struct msix_offset_rsp { + struct mbox_msghdr hdr; + u16 npa_msixoff; + u16 nix_msixoff; + u8 sso; + u8 ssow; + u8 timlfs; + u8 cptlfs; + u16 sso_msixoff[MAX_RVU_BLKLF_CNT]; + u16 ssow_msixoff[MAX_RVU_BLKLF_CNT]; + u16 timlf_msixoff[MAX_RVU_BLKLF_CNT]; + u16 cptlf_msixoff[MAX_RVU_BLKLF_CNT]; +}; + #endif /* MBOX_H */ diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu.c b/drivers/net/ethernet/marvell/octeontx2/af/rvu.c index 23e635c..234d273 100644 --- 
a/drivers/net/ethernet/marvell/octeontx2/af/rvu.c +++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu.c @@ -24,6 +24,11 @@ static int rvu_get_hwvf(struct rvu *rvu, int pcifunc); +static void rvu_set_msix_offset(struct rvu *rvu, struct rvu_pfvf *pfvf, + struct rvu_block *block, int lf); +static void rvu_clear_msix_offset(struct rvu *rvu, struct rvu_pfvf *pfvf, + struct rvu_block *block, int lf); + /* Supported devices */ static const struct pci_device_id rvu_id_table[] = { { PCI_DEVICE(PCI_VENDOR_ID_CAVIUM, PCI_DEVID_OCTEONTX2_RVU_AF) }, @@ -75,6 +80,45 @@ int rvu_alloc_rsrc(struct rsrc_bmap *rsrc) return id; } +static int rvu_alloc_rsrc_contig(struct rsrc_bmap *rsrc, int nrsrc) +{ + int start; + + if (!rsrc->bmap) + return -EINVAL; + + start = bitmap_find_next_zero_area(rsrc->bmap, rsrc->max, 0, nrsrc, 0); + if (start >= rsrc->max) + return -ENOSPC; + + bitmap_set(rsrc->bmap, start, nrsrc); + return start; +} + +static void rvu_free_rsrc_contig(struct rsrc_bmap *rsrc, int nrsrc, int start) +{ + if (!rsrc->bmap) + return; + if (start >= rsrc->max) + return; + + bitmap_clear(rsrc->bmap, start, nrsrc); +} + +static bool rvu_rsrc_check_contig(struct rsrc_bmap *rsrc, int nrsrc) +{ + int start; + + if (!rsrc->bmap) + return false; + + start = bitmap_find_next_zero_area(rsrc->bmap, rsrc->max, 0, nrsrc, 0); + if (start >= rsrc->max) + return false; + + return true; +} + void rvu_free_rsrc(struct rsrc_bmap *rsrc, int id) { if (!rsrc->bmap) @@ -103,6 +147,26 @@ int rvu_alloc_bitmap(struct rsrc_bmap *rsrc) return 0; } +/* Get block LF's HW index from a PF_FUNC's block slot number */ +int rvu_get_lf(struct rvu *rvu, struct rvu_block *block, u16 pcifunc, u16 slot) +{ + int lf; + u16 match = 0; + + spin_lock(&rvu->rsrc_lock); + for (lf = 0; lf < block->lf.max; lf++) { + if (block->fn_map[lf] == pcifunc) { + if (slot == match) { + spin_unlock(&rvu->rsrc_lock); + return lf; + } + match++; + } + } + spin_unlock(&rvu->rsrc_lock); + return -ENODEV; +} + /* Convert BLOCK_TYPE_E to a 
BLOCK_ADDR_E. * Some silicon variants of OcteonTX2 supports * multiple blocks of same type. @@ -237,6 +301,16 @@ inline int rvu_get_pf(u16 pcifunc) return (pcifunc >> RVU_PFVF_PF_SHIFT) & RVU_PFVF_PF_MASK; }
[PATCH 05/15] octeontx2-af: Add mailbox IRQ and msg handlers
From: Sunil Goutham This patch adds support for mailbox interrupt and message handling. Mapped mailbox region and registered a workqueue for message handling. Enabled mailbox IRQ of RVU PFs and registered an interrupt handler. When IRQ is triggered, work is added to the mbox workqueue for msgs to get processed. Signed-off-by: Sunil Goutham --- drivers/net/ethernet/marvell/octeontx2/af/mbox.h | 14 +- drivers/net/ethernet/marvell/octeontx2/af/rvu.c| 254 + drivers/net/ethernet/marvell/octeontx2/af/rvu.h| 22 ++ .../net/ethernet/marvell/octeontx2/af/rvu_struct.h | 22 ++ 4 files changed, 309 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/marvell/octeontx2/af/mbox.h b/drivers/net/ethernet/marvell/octeontx2/af/mbox.h index 8e205fd..fc593f0 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/mbox.h +++ b/drivers/net/ethernet/marvell/octeontx2/af/mbox.h @@ -33,6 +33,8 @@ # error "incorrect mailbox area sizes" #endif +#define INTR_MASK(pfvfs) ((pfvfs < 64) ? (BIT_ULL(pfvfs) - 1) : (~0ull)) + #define MBOX_RSP_TIMEOUT 1000 /* in ms, Time to wait for mbox response */ #define MBOX_MSG_ALIGN 16 /* Align mbox msg start to 16bytes */ @@ -90,8 +92,9 @@ struct mbox_msghdr { void otx2_mbox_reset(struct otx2_mbox *mbox, int devid); void otx2_mbox_destroy(struct otx2_mbox *mbox); -int otx2_mbox_init(struct otx2_mbox *mbox, void *hwbase, struct pci_dev *pdev, - void *reg_base, int direction, int ndevs); +int otx2_mbox_init(struct otx2_mbox *mbox, void __force *hwbase, + struct pci_dev *pdev, void __force *reg_base, + int direction, int ndevs); void otx2_mbox_msg_send(struct otx2_mbox *mbox, int devid); int otx2_mbox_wait_for_rsp(struct otx2_mbox *mbox, int devid); int otx2_mbox_busy_poll_for_rsp(struct otx2_mbox *mbox, int devid); @@ -115,7 +118,7 @@ static inline struct mbox_msghdr *otx2_mbox_alloc_msg(struct otx2_mbox *mbox, #define MBOX_MSG_MAX 0x #define MBOX_MESSAGES \ -M(READY, 0x001, msg_req, msg_rsp) +M(READY, 0x001, msg_req, ready_msg_rsp) enum { #define 
M(_name, _id, _1, _2) MBOX_MSG_ ## _name = _id, @@ -139,4 +142,9 @@ struct msg_rsp { struct mbox_msghdr hdr; }; +struct ready_msg_rsp { + struct mbox_msghdr hdr; + u16sclk_feq;/* SCLK frequency */ +}; + #endif /* MBOX_H */ diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu.c b/drivers/net/ethernet/marvell/octeontx2/af/rvu.c index fa5f40b..e795c2f 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/rvu.c +++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu.c @@ -258,6 +258,245 @@ static int rvu_setup_hw_resources(struct rvu *rvu) return 0; } +static int rvu_process_mbox_msg(struct rvu *rvu, int devid, + struct mbox_msghdr *req) +{ + /* Check if valid, if not reply with a invalid msg */ + if (req->sig != OTX2_MBOX_REQ_SIG) + goto bad_message; + + if (req->id == MBOX_MSG_READY) + return 0; + +bad_message: + otx2_reply_invalid_msg(&rvu->mbox, devid, req->pcifunc, + req->id); + return -ENODEV; +} + +static void rvu_mbox_handler(struct work_struct *work) +{ + struct rvu_work *mwork = container_of(work, struct rvu_work, work); + struct rvu *rvu = mwork->rvu; + struct otx2_mbox_dev *mdev; + struct mbox_hdr *req_hdr; + struct mbox_msghdr *msg; + struct otx2_mbox *mbox; + int offset, id, err; + u16 pf; + + mbox = &rvu->mbox; + pf = mwork - rvu->mbox_wrk; + mdev = &mbox->dev[pf]; + + /* Process received mbox messages */ + req_hdr = (struct mbox_hdr *)(mdev->mbase + mbox->rx_start); + if (req_hdr->num_msgs == 0) + return; + + offset = mbox->rx_start + ALIGN(sizeof(*req_hdr), MBOX_MSG_ALIGN); + + for (id = 0; id < req_hdr->num_msgs; id++) { + msg = (struct mbox_msghdr *)(mdev->mbase + offset); + + /* Set which PF sent this message based on mbox IRQ */ + msg->pcifunc &= ~(RVU_PFVF_PF_MASK << RVU_PFVF_PF_SHIFT); + msg->pcifunc |= (pf << RVU_PFVF_PF_SHIFT); + err = rvu_process_mbox_msg(rvu, pf, msg); + if (!err) { + offset = mbox->rx_start + msg->next_msgoff; + continue; + } + + if (msg->pcifunc & RVU_PFVF_FUNC_MASK) + dev_warn(rvu->dev, "Error %d when processing message 
%s (0x%x) from PF%d:VF%d\n", +err, otx2_mbox_id2name(msg->id), msg->id, pf, +(msg->pcifunc & RVU_PFVF_FUNC_MASK) - 1); + else + dev_warn(rvu->dev, "Error %d when processing message %s (0x%x) from PF%d\n", +
[PATCH 07/15] octeontx2-af: Scan blocks for LFs provisioned to PF/VF
From: Sunil Goutham Scan all RVU blocks to find any 'LF to RVU PF/VF' mapping done by low level firmware. If found any, mark them as used in respective block's LF bitmap and also save mapped PF/VF's PF_FUNC info. This is done to avoid reattaching a block LF to a different RVU PF/VF. Signed-off-by: Sunil Goutham --- drivers/net/ethernet/marvell/octeontx2/af/rvu.c| 148 - drivers/net/ethernet/marvell/octeontx2/af/rvu.h| 16 +++ .../net/ethernet/marvell/octeontx2/af/rvu_struct.h | 16 +++ 3 files changed, 178 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu.c b/drivers/net/ethernet/marvell/octeontx2/af/rvu.c index 25f79bf..9539ab9 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/rvu.c +++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu.c @@ -22,6 +22,8 @@ #define DRV_STRING "Marvell OcteonTX2 RVU Admin Function Driver" #define DRV_VERSION"1.0" +static int rvu_get_hwvf(struct rvu *rvu, int pcifunc); + /* Supported devices */ static const struct pci_device_id rvu_id_table[] = { { PCI_DEVICE(PCI_VENDOR_ID_CAVIUM, PCI_DEVID_OCTEONTX2_RVU_AF) }, @@ -66,6 +68,91 @@ int rvu_alloc_bitmap(struct rsrc_bmap *rsrc) return 0; } +static void rvu_update_rsrc_map(struct rvu *rvu, struct rvu_pfvf *pfvf, + struct rvu_block *block, u16 pcifunc, + u16 lf, bool attach) +{ + int devnum, num_lfs = 0; + bool is_pf; + u64 reg; + + if (lf >= block->lf.max) { + dev_err(&rvu->pdev->dev, + "%s: FATAL: LF %d is >= %s's max lfs i.e %d\n", + __func__, lf, block->name, block->lf.max); + return; + } + + /* Check if this is for a RVU PF or VF */ + if (pcifunc & RVU_PFVF_FUNC_MASK) { + is_pf = false; + devnum = rvu_get_hwvf(rvu, pcifunc); + } else { + is_pf = true; + devnum = rvu_get_pf(pcifunc); + } + + block->fn_map[lf] = attach ? pcifunc : 0; + + switch (block->type) { + case BLKTYPE_NPA: + pfvf->npalf = attach ? true : false; + num_lfs = pfvf->npalf; + break; + case BLKTYPE_NIX: + pfvf->nixlf = attach ? 
true : false; + num_lfs = pfvf->nixlf; + break; + case BLKTYPE_SSO: + attach ? pfvf->sso++ : pfvf->sso--; + num_lfs = pfvf->sso; + break; + case BLKTYPE_SSOW: + attach ? pfvf->ssow++ : pfvf->ssow--; + num_lfs = pfvf->ssow; + break; + case BLKTYPE_TIM: + attach ? pfvf->timlfs++ : pfvf->timlfs--; + num_lfs = pfvf->timlfs; + break; + case BLKTYPE_CPT: + attach ? pfvf->cptlfs++ : pfvf->cptlfs--; + num_lfs = pfvf->cptlfs; + break; + } + + reg = is_pf ? block->pf_lfcnt_reg : block->vf_lfcnt_reg; + rvu_write64(rvu, BLKADDR_RVUM, reg | (devnum << 16), num_lfs); +} + +inline int rvu_get_pf(u16 pcifunc) +{ + return (pcifunc >> RVU_PFVF_PF_SHIFT) & RVU_PFVF_PF_MASK; +} + +static int rvu_get_hwvf(struct rvu *rvu, int pcifunc) +{ + int pf, func; + u64 cfg; + + pf = rvu_get_pf(pcifunc); + func = pcifunc & RVU_PFVF_FUNC_MASK; + + /* Get first HWVF attached to this PF */ + cfg = rvu_read64(rvu, BLKADDR_RVUM, RVU_PRIV_PFX_CFG(pf)); + + return ((cfg & 0xFFF) + func - 1); +} + +struct rvu_pfvf *rvu_get_pfvf(struct rvu *rvu, int pcifunc) +{ + /* Check if it is a PF or VF */ + if (pcifunc & RVU_PFVF_FUNC_MASK) + return &rvu->hwvf[rvu_get_hwvf(rvu, pcifunc)]; + else + return &rvu->pf[rvu_get_pf(pcifunc)]; +} + static void rvu_check_block_implemented(struct rvu *rvu) { struct rvu_hwinfo *hw = rvu->hw; @@ -107,6 +194,28 @@ static void rvu_reset_all_blocks(struct rvu *rvu) rvu_block_reset(rvu, BLKADDR_NDC2, NDC_AF_BLK_RST); } +static void rvu_scan_block(struct rvu *rvu, struct rvu_block *block) +{ + struct rvu_pfvf *pfvf; + u64 cfg; + int lf; + + for (lf = 0; lf < block->lf.max; lf++) { + cfg = rvu_read64(rvu, block->addr, +block->lfcfg_reg | (lf << block->lfshift)); + if (!(cfg & BIT_ULL(63))) + continue; + + /* Set this resource as being used */ + __set_bit(lf, block->lf.bmap); + + /* Get, to whom this LF is attached */ + pfvf = rvu_get_pfvf(rvu, (cfg >> 8) & 0x); + rvu_update_rsrc_map(rvu, pfvf, block, + (cfg >> 8) & 0x, lf, true); + } +} + static void rvu_free_hw_resources(struct rvu 
*rvu) { struct rvu_hwinfo *hw = rvu->hw; @@ -124,7 +233,7 @@ static int
[PATCH 15/15] MAINTAINERS: Add entry for Marvell OcteonTX2 Admin Function driver
From: Sunil Goutham Added maintainers entry for Marvell OcteonTX2 SOC's RVU admin function driver. Signed-off-by: Sunil Goutham --- MAINTAINERS | 9 + 1 file changed, 9 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index 7233a9e..4f93114 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -8843,6 +8843,15 @@ S: Supported F: drivers/mmc/host/sdhci-xenon* F: Documentation/devicetree/bindings/mmc/marvell,xenon-sdhci.txt +MARVELL OCTEONTX2 RVU ADMIN FUNCTION DRIVER +M: Sunil Goutham +M: Linu Cherian +M: Geetha sowjanya +M: Jerin Jacob +L: netdev@vger.kernel.org +S: Maintained +F: drivers/net/ethernet/marvell/octeontx2/af + MATROX FRAMEBUFFER DRIVER L: linux-fb...@vger.kernel.org S: Orphan -- 2.7.4
[PATCH 11/15] octeontx2-af: Add Marvell OcteonTX2 CGX driver
From: Sunil Goutham This patch adds basic template for Marvell OcteonTX2's CGX ethernet interface driver. Just the probe. RVU AF driver will use APIs exported by this driver for various things like PF to physical interface mapping, loopback mode, interface stats etc. Hence merged both drivers into a single module. Signed-off-by: Sunil Goutham --- drivers/net/ethernet/marvell/octeontx2/af/Makefile | 2 +- drivers/net/ethernet/marvell/octeontx2/af/cgx.c| 100 + drivers/net/ethernet/marvell/octeontx2/af/cgx.h| 22 + drivers/net/ethernet/marvell/octeontx2/af/rvu.c| 14 ++- 4 files changed, 136 insertions(+), 2 deletions(-) create mode 100644 drivers/net/ethernet/marvell/octeontx2/af/cgx.c create mode 100644 drivers/net/ethernet/marvell/octeontx2/af/cgx.h diff --git a/drivers/net/ethernet/marvell/octeontx2/af/Makefile b/drivers/net/ethernet/marvell/octeontx2/af/Makefile index ac17cb9..8646421 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/Makefile +++ b/drivers/net/ethernet/marvell/octeontx2/af/Makefile @@ -7,4 +7,4 @@ obj-$(CONFIG_OCTEONTX2_MBOX) += octeontx2_mbox.o obj-$(CONFIG_OCTEONTX2_AF) += octeontx2_af.o octeontx2_mbox-y := mbox.o -octeontx2_af-y := rvu.o +octeontx2_af-y := cgx.o rvu.o diff --git a/drivers/net/ethernet/marvell/octeontx2/af/cgx.c b/drivers/net/ethernet/marvell/octeontx2/af/cgx.c new file mode 100644 index 000..cfd80d2 --- /dev/null +++ b/drivers/net/ethernet/marvell/octeontx2/af/cgx.c @@ -0,0 +1,100 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Marvell OcteonTx2 CGX driver + * + * Copyright (C) 2018 Marvell International Ltd. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "cgx.h" + +#define DRV_NAME "octeontx2-cgx" +#define DRV_STRING "Marvell OcteonTX2 CGX/MAC Driver" + +struct cgx { + void __iomem*reg_base; + struct pci_dev *pdev; + u8 cgx_id; +}; + +/* Supported devices */ +static const struct pci_device_id cgx_id_table[] = { + { PCI_DEVICE(PCI_VENDOR_ID_CAVIUM, PCI_DEVID_OCTEONTX2_CGX) }, + { 0, } /* end of table */ +}; + +MODULE_AUTHOR("Marvell International Ltd."); +MODULE_DESCRIPTION(DRV_STRING); +MODULE_LICENSE("GPL v2"); +MODULE_DEVICE_TABLE(pci, cgx_id_table); + +static int cgx_probe(struct pci_dev *pdev, const struct pci_device_id *id) +{ + int err; + struct device *dev = &pdev->dev; + struct cgx *cgx; + + cgx = devm_kzalloc(dev, sizeof(*cgx), GFP_KERNEL); + if (!cgx) + return -ENOMEM; + cgx->pdev = pdev; + + pci_set_drvdata(pdev, cgx); + + err = pci_enable_device(pdev); + if (err) { + dev_err(dev, "Failed to enable PCI device\n"); + pci_set_drvdata(pdev, NULL); + return err; + } + + err = pci_request_regions(pdev, DRV_NAME); + if (err) { + dev_err(dev, "PCI request regions failed 0x%x\n", err); + goto err_disable_device; + } + + /* MAP configuration registers */ + cgx->reg_base = pcim_iomap(pdev, PCI_CFG_REG_BAR_NUM, 0); + if (!cgx->reg_base) { + dev_err(dev, "CGX: Cannot map CSR memory space, aborting\n"); + err = -ENOMEM; + goto err_release_regions; + } + + return 0; + +err_release_regions: + pci_release_regions(pdev); +err_disable_device: + pci_disable_device(pdev); + pci_set_drvdata(pdev, NULL); + return err; +} + +static void cgx_remove(struct pci_dev *pdev) +{ + pci_release_regions(pdev); + pci_disable_device(pdev); + pci_set_drvdata(pdev, NULL); +} + +struct pci_driver cgx_driver = { + .name = DRV_NAME, + .id_table = cgx_id_table, + .probe = cgx_probe, + .remove = cgx_remove, +}; diff --git a/drivers/net/ethernet/marvell/octeontx2/af/cgx.h b/drivers/net/ethernet/marvell/octeontx2/af/cgx.h new file 
mode 100644 index 000..a7d4b39 --- /dev/null +++ b/drivers/net/ethernet/marvell/octeontx2/af/cgx.h @@ -0,0 +1,22 @@ +/* SPDX-License-Identifier: GPL-2.0 + * Marvell OcteonTx2 CGX driver + * + * Copyright (C) 2018 Marvell International Ltd. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#ifndef CGX_H +#define CGX_H + + /* PCI device IDs */ +#definePCI_DEVID_OCTEONTX2_CGX 0xA059 + +/* PCI BAR nos */ +#define PCI_CFG_REG_BAR_NUM0 + +extern struct pci_driver cgx_driver; + +#endif /* CGX_H */ diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu.c b/drivers/net/ethe
[PATCH 14/15] octeontx2-af: Register for CGX lmac events
From: Linu Cherian Added support in RVU AF driver to register for CGX LMAC link status change events from firmware and manage them. Processing part will be added in followup patches. - Introduced eventqueue for posting events from cgx lmac. Queueing mechanism will ensure that events can be posted and firmware can be acked immediately and hence event reception and processing are decoupled. - Events get added to the queue by notification callback. Notification callback is expected to be atomic, since it is called from interrupt context. - Events are dequeued and processed in a worker thread. Signed-off-by: Linu Cherian --- drivers/net/ethernet/marvell/octeontx2/af/rvu.c| 6 +- drivers/net/ethernet/marvell/octeontx2/af/rvu.h| 5 + .../net/ethernet/marvell/octeontx2/af/rvu_cgx.c| 101 - 3 files changed, 108 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu.c b/drivers/net/ethernet/marvell/octeontx2/af/rvu.c index b363d19..adc7fc6 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/rvu.c +++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu.c @@ -1564,10 +1564,11 @@ static int rvu_probe(struct pci_dev *pdev, const struct pci_device_id *id) err = rvu_register_interrupts(rvu); if (err) - goto err_mbox; + goto err_cgx; return 0; - +err_cgx: + rvu_cgx_wq_destroy(rvu); err_mbox: rvu_mbox_destroy(rvu); err_hwsetup: @@ -1589,6 +1590,7 @@ static void rvu_remove(struct pci_dev *pdev) struct rvu *rvu = pci_get_drvdata(pdev); rvu_unregister_interrupts(rvu); + rvu_cgx_wq_destroy(rvu); rvu_mbox_destroy(rvu); rvu_reset_all_blocks(rvu); rvu_free_hw_resources(rvu); diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu.h b/drivers/net/ethernet/marvell/octeontx2/af/rvu.h index 385f597..d169fa9 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/rvu.h +++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu.h @@ -110,6 +110,10 @@ struct rvu { * every cgx lmac port */ void**cgx_idmap; /* cgx id to cgx data map table */ + struct work_struct 
cgx_evh_work; + struct workqueue_struct *cgx_evh_wq; + spinlock_t cgx_evq_lock; /* cgx event queue lock */ + struct list_headcgx_evq_head; /* cgx event queue head */ }; static inline void rvu_write64(struct rvu *rvu, u64 block, u64 offset, u64 val) @@ -150,4 +154,5 @@ int rvu_poll_reg(struct rvu *rvu, u64 block, u64 offset, u64 mask, bool zero); /* CGX APIs */ int rvu_cgx_probe(struct rvu *rvu); +void rvu_cgx_wq_destroy(struct rvu *rvu); #endif /* RVU_H */ diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu_cgx.c b/drivers/net/ethernet/marvell/octeontx2/af/rvu_cgx.c index bf81507..2359806e 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/rvu_cgx.c +++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu_cgx.c @@ -15,6 +15,11 @@ #include "rvu.h" #include "cgx.h" +struct cgx_evq_entry { + struct list_head evq_node; + struct cgx_link_event link_event; +}; + static inline u8 cgxlmac_id_to_bmap(u8 cgx_id, u8 lmac_id) { return ((cgx_id & 0xF) << 4) | (lmac_id & 0xF); @@ -72,9 +77,95 @@ static int rvu_map_cgx_lmac_pf(struct rvu *rvu) return 0; } +/* This is called from interrupt context and is expected to be atomic */ +static int cgx_lmac_postevent(struct cgx_link_event *event, void *data) +{ + struct rvu *rvu = data; + struct cgx_evq_entry *qentry; + + /* post event to the event queue */ + qentry = kmalloc(sizeof(*qentry), GFP_ATOMIC); + if (!qentry) + return -ENOMEM; + qentry->link_event = *event; + spin_lock(&rvu->cgx_evq_lock); + list_add_tail(&qentry->evq_node, &rvu->cgx_evq_head); + spin_unlock(&rvu->cgx_evq_lock); + + /* start worker to process the events */ + queue_work(rvu->cgx_evh_wq, &rvu->cgx_evh_work); + + return 0; +} + +static void cgx_evhandler_task(struct work_struct *work) +{ + struct rvu *rvu = container_of(work, struct rvu, cgx_evh_work); + struct cgx_evq_entry *qentry; + struct cgx_link_event *event; + unsigned long flags; + + do { + /* Dequeue an event */ + spin_lock_irqsave(&rvu->cgx_evq_lock, flags); + qentry = 
list_first_entry_or_null(&rvu->cgx_evq_head, + struct cgx_evq_entry, + evq_node); + if (qentry) + list_del(&qentry->evq_node); + spin_unlock_irqrestore(&rvu->cgx_evq_lock, flags); + if (!qentry) + break; /* nothing more to process */ + + event = &qe
[PATCH 10/15] octeontx2-af: Reconfig MSIX base with IOVA
From: Geetha sowjanya HW interprets RVU_AF_MSIXTR_BASE address as an IOVA, hence create an IOMMU mapping for the physical address configured by firmware and reconfig RVU_AF_MSIXTR_BASE with IOVA. Signed-off-by: Geetha sowjanya Signed-off-by: Sunil Goutham --- drivers/net/ethernet/marvell/octeontx2/af/rvu.c | 33 ++--- drivers/net/ethernet/marvell/octeontx2/af/rvu.h | 1 + 2 files changed, 31 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu.c b/drivers/net/ethernet/marvell/octeontx2/af/rvu.c index 234d273..2a9d2b7 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/rvu.c +++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu.c @@ -442,9 +442,10 @@ static int rvu_setup_msix_resources(struct rvu *rvu) { struct rvu_hwinfo *hw = rvu->hw; int pf, vf, numvfs, hwvf, err; + int nvecs, offset, max_msix; struct rvu_pfvf *pfvf; - int nvecs, offset; - u64 cfg; + u64 cfg, phy_addr; + dma_addr_t iova; for (pf = 0; pf < hw->total_pfs; pf++) { cfg = rvu_read64(rvu, BLKADDR_RVUM, RVU_PRIV_PFX_CFG(pf)); @@ -523,6 +524,22 @@ static int rvu_setup_msix_resources(struct rvu *rvu) } } + /* HW interprets RVU_AF_MSIXTR_BASE address as an IOVA, hence +* create a IOMMU mapping for the physcial address configured by +* firmware and reconfig RVU_AF_MSIXTR_BASE with IOVA. 
+*/ + cfg = rvu_read64(rvu, BLKADDR_RVUM, RVU_PRIV_CONST); + max_msix = cfg & 0xF; + phy_addr = rvu_read64(rvu, BLKADDR_RVUM, RVU_AF_MSIXTR_BASE); + iova = dma_map_single(rvu->dev, (void *)phy_addr, + max_msix * PCI_MSIX_ENTRY_SIZE, + DMA_BIDIRECTIONAL); + if (dma_mapping_error(rvu->dev, iova)) + return -ENOMEM; + + rvu_write64(rvu, BLKADDR_RVUM, RVU_AF_MSIXTR_BASE, (u64)iova); + rvu->msix_base_iova = iova; + return 0; } @@ -531,7 +548,8 @@ static void rvu_free_hw_resources(struct rvu *rvu) struct rvu_hwinfo *hw = rvu->hw; struct rvu_block *block; struct rvu_pfvf *pfvf; - int id; + int id, max_msix; + u64 cfg; /* Free block LF bitmaps */ for (id = 0; id < BLK_COUNT; id++) { @@ -549,6 +567,15 @@ static void rvu_free_hw_resources(struct rvu *rvu) pfvf = &rvu->hwvf[id]; kfree(pfvf->msix.bmap); } + + /* Unmap MSIX vector base IOVA mapping */ + if (!rvu->msix_base_iova) + return; + cfg = rvu_read64(rvu, BLKADDR_RVUM, RVU_PRIV_CONST); + max_msix = cfg & 0xF; + dma_unmap_single(rvu->dev, rvu->msix_base_iova, +max_msix * PCI_MSIX_ENTRY_SIZE, +DMA_BIDIRECTIONAL); } static int rvu_setup_hw_resources(struct rvu *rvu) diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu.h b/drivers/net/ethernet/marvell/octeontx2/af/rvu.h index 7435e83..92c2022 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/rvu.h +++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu.h @@ -99,6 +99,7 @@ struct rvu { u16 num_vec; char*irq_name; bool*irq_allocated; + dma_addr_t msix_base_iova; }; static inline void rvu_write64(struct rvu *rvu, u64 block, u64 offset, u64 val) -- 2.7.4
[PATCH 06/15] octeontx2-af: Convert mbox msg id check to a macro
From: Aleksey Makarov With 10's of mailbox messages expected to be handled in future, checking for message id could become a lengthy switch case. Hence added a macro to auto generate the switch case for each msg id. Signed-off-by: Aleksey Makarov --- drivers/net/ethernet/marvell/octeontx2/af/rvu.c | 44 + 1 file changed, 38 insertions(+), 6 deletions(-) diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu.c b/drivers/net/ethernet/marvell/octeontx2/af/rvu.c index e795c2f..25f79bf 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/rvu.c +++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu.c @@ -258,6 +258,12 @@ static int rvu_setup_hw_resources(struct rvu *rvu) return 0; } +static int rvu_mbox_handler_READY(struct rvu *rvu, struct msg_req *req, + struct ready_msg_rsp *rsp) +{ + return 0; +} + static int rvu_process_mbox_msg(struct rvu *rvu, int devid, struct mbox_msghdr *req) { @@ -265,13 +271,39 @@ static int rvu_process_mbox_msg(struct rvu *rvu, int devid, if (req->sig != OTX2_MBOX_REQ_SIG) goto bad_message; - if (req->id == MBOX_MSG_READY) - return 0; - + switch (req->id) { +#define M(_name, _id, _req_type, _rsp_type)\ + case _id: { \ + struct _rsp_type *rsp; \ + int err;\ + \ + rsp = (struct _rsp_type *)otx2_mbox_alloc_msg( \ + &rvu->mbox, devid, \ + sizeof(struct _rsp_type)); \ + if (rsp) { \ + rsp->hdr.id = _id; \ + rsp->hdr.sig = OTX2_MBOX_RSP_SIG; \ + rsp->hdr.pcifunc = req->pcifunc;\ + rsp->hdr.rc = 0;\ + } \ + \ + err = rvu_mbox_handler_ ## _name(rvu, \ +(struct _req_type *)req, \ +rsp); \ + if (rsp && err) \ + rsp->hdr.rc = err; \ + \ + return rsp ? err : -ENOMEM; \ + } +MBOX_MESSAGES +#undef M + break; bad_message: - otx2_reply_invalid_msg(&rvu->mbox, devid, req->pcifunc, - req->id); - return -ENODEV; + default: + otx2_reply_invalid_msg(&rvu->mbox, devid, req->pcifunc, + req->id); + return -ENODEV; + } } static void rvu_mbox_handler(struct work_struct *work) -- 2.7.4
[PATCH 08/15] octeontx2-af: Add RVU block LF provisioning support
From: Sunil Goutham Added support for a RVU PF/VF to request AF via mailbox to attach or detach NPA/NIX/SSO/SSOW/TIM/CPT block LFs. Also supports partial detachment and modifying current LF attached count of a certain block type. Signed-off-by: Sunil Goutham --- drivers/net/ethernet/marvell/octeontx2/af/mbox.h | 45 +- drivers/net/ethernet/marvell/octeontx2/af/rvu.c| 472 - drivers/net/ethernet/marvell/octeontx2/af/rvu.h| 8 +- .../net/ethernet/marvell/octeontx2/af/rvu_reg.h| 8 +- 4 files changed, 523 insertions(+), 10 deletions(-) diff --git a/drivers/net/ethernet/marvell/octeontx2/af/mbox.h b/drivers/net/ethernet/marvell/octeontx2/af/mbox.h index fc593f0..7280d49 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/mbox.h +++ b/drivers/net/ethernet/marvell/octeontx2/af/mbox.h @@ -118,7 +118,17 @@ static inline struct mbox_msghdr *otx2_mbox_alloc_msg(struct otx2_mbox *mbox, #define MBOX_MSG_MAX 0x #define MBOX_MESSAGES \ -M(READY, 0x001, msg_req, ready_msg_rsp) +/* Generic mbox IDs (range 0x000 - 0x1FF) */ \ +M(READY, 0x001, msg_req, ready_msg_rsp) \ +M(ATTACH_RESOURCES,0x002, rsrc_attach, msg_rsp)\ +M(DETACH_RESOURCES,0x003, rsrc_detach, msg_rsp)\ +/* CGX mbox IDs (range 0x200 - 0x3FF) */ \ +/* NPA mbox IDs (range 0x400 - 0x5FF) */ \ +/* SSO/SSOW mbox IDs (range 0x600 - 0x7FF) */ \ +/* TIM mbox IDs (range 0x800 - 0x9FF) */ \ +/* CPT mbox IDs (range 0xA00 - 0xBFF) */ \ +/* NPC mbox IDs (range 0x6000 - 0x7FFF) */ \ +/* NIX mbox IDs (range 0x8000 - 0x) */ \ enum { #define M(_name, _id, _1, _2) MBOX_MSG_ ## _name = _id, @@ -147,4 +157,37 @@ struct ready_msg_rsp { u16sclk_feq;/* SCLK frequency */ }; +/* Structure for requesting resource provisioning. + * 'modify' flag to be used when either requesting more + * or to detach partial of a cetain resource type. + * Rest of the fields specify how many of what type to + * be attached. 
+ */ +struct rsrc_attach { + struct mbox_msghdr hdr; + u8 modify:1; + u8 npalf:1; + u8 nixlf:1; + u16 sso; + u16 ssow; + u16 timlfs; + u16 cptlfs; +}; + +/* Structure for relinquishing resources. + * 'partial' flag to be used when relinquishing all resources + * but only of a certain type. If not set, all resources of all + * types provisioned to the RVU function will be detached. + */ +struct rsrc_detach { + struct mbox_msghdr hdr; + u8 partial:1; + u8 npalf:1; + u8 nixlf:1; + u8 sso:1; + u8 ssow:1; + u8 timlfs:1; + u8 cptlfs:1; +}; + #endif /* MBOX_H */ diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu.c b/drivers/net/ethernet/marvell/octeontx2/af/rvu.c index 9539ab9..23e635c 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/rvu.c +++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu.c @@ -59,6 +59,41 @@ int rvu_poll_reg(struct rvu *rvu, u64 block, u64 offset, u64 mask, bool zero) return -EBUSY; } +int rvu_alloc_rsrc(struct rsrc_bmap *rsrc) +{ + int id; + + if (!rsrc->bmap) + return -EINVAL; + + id = find_first_zero_bit(rsrc->bmap, rsrc->max); + if (id >= rsrc->max) + return -ENOSPC; + + __set_bit(id, rsrc->bmap); + + return id; +} + +void rvu_free_rsrc(struct rsrc_bmap *rsrc, int id) +{ + if (!rsrc->bmap) + return; + + __clear_bit(id, rsrc->bmap); +} + +int rvu_rsrc_free_count(struct rsrc_bmap *rsrc) +{ + int used; + + if (!rsrc->bmap) + return 0; + + used = bitmap_weight(rsrc->bmap, rsrc->max); + return (rsrc->max - used); +} + int rvu_alloc_bitmap(struct rsrc_bmap *rsrc) { rsrc->bmap = kcalloc(BITS_TO_LONGS(rsrc->max), @@ -68,6 +103,78 @@ int rvu_alloc_bitmap(struct rsrc_bmap *rsrc) return 0; } +/* Convert BLOCK_TYPE_E to a BLOCK_ADDR_E. + * Some silicon variants of OcteonTX2 supports + * multiple blocks of same type. + * + * @pcifunc has to be zero when no LF is yet attached. 
+ */ +int rvu_get_blkaddr(struct rvu *rvu, int blktype, u16 pcifunc) +{ + int devnum, blkaddr = -ENODEV; + u64 cfg, reg; + bool is_pf; + + switch (blktype) { + case BLKTYPE_NPA: + blkaddr = BLKADDR_NPA; + goto exit; + case BLKTYPE_NIX: + /* For now assume NIX0 */ + if (!pcifunc) { + blkaddr = BLKADDR_NIX0; + goto exit; + } + break; + case BLKTYPE_SSO: + blkaddr = BLKADDR_SSO;
[PATCH 12/15] octeontx2-af: Set RVU PFs to CGX LMACs mapping
From: Linu Cherian Each of the enabled CGX LMAC is considered a physical interface and RVU PFs are mapped to these. VFs of these SRIOV PFs will be virtual interfaces and share CGX LMAC along with PF. This mapping info will be used later on for Rx/Tx pkt steering. Signed-off-by: Linu Cherian Signed-off-by: Geetha sowjanya --- drivers/net/ethernet/marvell/octeontx2/af/Makefile | 2 +- drivers/net/ethernet/marvell/octeontx2/af/cgx.c| 59 + drivers/net/ethernet/marvell/octeontx2/af/cgx.h| 15 +++- drivers/net/ethernet/marvell/octeontx2/af/rvu.c| 4 + drivers/net/ethernet/marvell/octeontx2/af/rvu.h| 12 +++ .../net/ethernet/marvell/octeontx2/af/rvu_cgx.c| 97 ++ 6 files changed, 186 insertions(+), 3 deletions(-) create mode 100644 drivers/net/ethernet/marvell/octeontx2/af/rvu_cgx.c diff --git a/drivers/net/ethernet/marvell/octeontx2/af/Makefile b/drivers/net/ethernet/marvell/octeontx2/af/Makefile index 8646421..eaac264 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/Makefile +++ b/drivers/net/ethernet/marvell/octeontx2/af/Makefile @@ -7,4 +7,4 @@ obj-$(CONFIG_OCTEONTX2_MBOX) += octeontx2_mbox.o obj-$(CONFIG_OCTEONTX2_AF) += octeontx2_af.o octeontx2_mbox-y := mbox.o -octeontx2_af-y := cgx.o rvu.o +octeontx2_af-y := cgx.o rvu.o rvu_cgx.o diff --git a/drivers/net/ethernet/marvell/octeontx2/af/cgx.c b/drivers/net/ethernet/marvell/octeontx2/af/cgx.c index cfd80d2..06fd9fd 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/cgx.c +++ b/drivers/net/ethernet/marvell/octeontx2/af/cgx.c @@ -28,8 +28,12 @@ struct cgx { void __iomem*reg_base; struct pci_dev *pdev; u8 cgx_id; + u8 lmac_count; + struct list_headcgx_list; }; +static LIST_HEAD(cgx_list); + /* Supported devices */ static const struct pci_device_id cgx_id_table[] = { { PCI_DEVICE(PCI_VENDOR_ID_CAVIUM, PCI_DEVID_OCTEONTX2_CGX) }, @@ -41,6 +45,53 @@ MODULE_DESCRIPTION(DRV_STRING); MODULE_LICENSE("GPL v2"); MODULE_DEVICE_TABLE(pci, cgx_id_table); +static u64 cgx_read(struct cgx *cgx, u64 lmac, u64 offset) +{ + return 
readq(cgx->reg_base + (lmac << 18) + offset); +} + +int cgx_get_cgx_cnt(void) +{ + struct cgx *cgx_dev; + int count = 0; + + list_for_each_entry(cgx_dev, &cgx_list, cgx_list) + count++; + + return count; +} +EXPORT_SYMBOL(cgx_get_cgx_cnt); + +int cgx_get_lmac_cnt(void *cgxd) +{ + struct cgx *cgx = cgxd; + + if (!cgx) + return -ENODEV; + + return cgx->lmac_count; +} +EXPORT_SYMBOL(cgx_get_lmac_cnt); + +void *cgx_get_pdata(int cgx_id) +{ + struct cgx *cgx_dev; + + list_for_each_entry(cgx_dev, &cgx_list, cgx_list) { + if (cgx_dev->cgx_id == cgx_id) + return cgx_dev; + } + return NULL; +} +EXPORT_SYMBOL(cgx_get_pdata); + +static void cgx_lmac_init(struct cgx *cgx) +{ + cgx->lmac_count = cgx_read(cgx, 0, CGXX_CMRX_RX_LMACS) & 0x7; + if (cgx->lmac_count > MAX_LMAC_PER_CGX) + cgx->lmac_count = MAX_LMAC_PER_CGX; +} + static int cgx_probe(struct pci_dev *pdev, const struct pci_device_id *id) { int err; @@ -75,9 +126,14 @@ static int cgx_probe(struct pci_dev *pdev, const struct pci_device_id *id) goto err_release_regions; } + list_add(&cgx->cgx_list, &cgx_list); + cgx->cgx_id = cgx_get_cgx_cnt() - 1; + cgx_lmac_init(cgx); + return 0; err_release_regions: + list_del(&cgx->cgx_list); pci_release_regions(pdev); err_disable_device: pci_disable_device(pdev); @@ -87,6 +143,9 @@ static int cgx_probe(struct pci_dev *pdev, const struct pci_device_id *id) static void cgx_remove(struct pci_dev *pdev) { + struct cgx *cgx = pci_get_drvdata(pdev); + + list_del(&cgx->cgx_list); pci_release_regions(pdev); pci_disable_device(pdev); pci_set_drvdata(pdev, NULL); diff --git a/drivers/net/ethernet/marvell/octeontx2/af/cgx.h b/drivers/net/ethernet/marvell/octeontx2/af/cgx.h index a7d4b39..acdc16e 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/cgx.h +++ b/drivers/net/ethernet/marvell/octeontx2/af/cgx.h @@ -12,11 +12,22 @@ #define CGX_H /* PCI device IDs */ -#definePCI_DEVID_OCTEONTX2_CGX 0xA059 +#definePCI_DEVID_OCTEONTX2_CGX 0xA059 /* PCI BAR nos */ -#define PCI_CFG_REG_BAR_NUM0 +#define 
PCI_CFG_REG_BAR_NUM0 + +#define MAX_CGX3 +#define MAX_LMAC_PER_CGX 4 +#define CGX_OFFSET(x) ((x) * MAX_LMAC_PER_CGX) + +/* Registers */ +#define CGXX_CMRX_RX_ID_MAP0x060 +#define CGXX_CMRX_RX_LMACS 0x128 extern struct pci_driver cgx_driver; +int cgx_get_cgx_cnt(void); +int cgx_get_lmac_cnt(void *cgxd); +void *cgx_get_pdata(int cgx_id);
[PATCH 03/15] octeontx2-af: Gather RVU blocks HW info
From: Sunil Goutham This patch gathers NPA/NIX/SSO/SSOW/TIM/CPT RVU blocks's HW info like number of LFs. Important register offsets saved for later use to avoid code duplication for each block. A bitmap is allocated for each of the blocks which later on will be used to allocate a LF for a RVU PF/VF. Also added RVU NIX/NPA block registers and few registers of other blocks. Signed-off-by: Sunil Goutham --- drivers/net/ethernet/marvell/octeontx2/af/rvu.c| 167 +++ drivers/net/ethernet/marvell/octeontx2/af/rvu.h| 21 ++ .../net/ethernet/marvell/octeontx2/af/rvu_reg.h| 333 - 3 files changed, 517 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu.c b/drivers/net/ethernet/marvell/octeontx2/af/rvu.c index d40fabf..fa5f40b 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/rvu.c +++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu.c @@ -57,6 +57,15 @@ int rvu_poll_reg(struct rvu *rvu, u64 block, u64 offset, u64 mask, bool zero) return -EBUSY; } +int rvu_alloc_bitmap(struct rsrc_bmap *rsrc) +{ + rsrc->bmap = kcalloc(BITS_TO_LONGS(rsrc->max), +sizeof(long), GFP_KERNEL); + if (!rsrc->bmap) + return -ENOMEM; + return 0; +} + static void rvu_check_block_implemented(struct rvu *rvu) { struct rvu_hwinfo *hw = rvu->hw; @@ -98,6 +107,157 @@ static void rvu_reset_all_blocks(struct rvu *rvu) rvu_block_reset(rvu, BLKADDR_NDC2, NDC_AF_BLK_RST); } +static void rvu_free_hw_resources(struct rvu *rvu) +{ + struct rvu_hwinfo *hw = rvu->hw; + struct rvu_block *block; + int id; + + /* Free all bitmaps */ + for (id = 0; id < BLK_COUNT; id++) { + block = &hw->block[id]; + kfree(block->lf.bmap); + } +} + +static int rvu_setup_hw_resources(struct rvu *rvu) +{ + struct rvu_hwinfo *hw = rvu->hw; + struct rvu_block *block; + int err; + u64 cfg; + + /* Get HW supported max RVU PF & VF count */ + cfg = rvu_read64(rvu, BLKADDR_RVUM, RVU_PRIV_CONST); + hw->total_pfs = (cfg >> 32) & 0xFF; + hw->total_vfs = (cfg >> 20) & 0xFFF; + hw->max_vfs_per_pf = (cfg >> 40) & 
0xFF; + + /* Init NPA LF's bitmap */ + block = &hw->block[BLKADDR_NPA]; + if (!block->implemented) + goto nix; + cfg = rvu_read64(rvu, BLKADDR_NPA, NPA_AF_CONST); + block->lf.max = (cfg >> 16) & 0xFFF; + block->addr = BLKADDR_NPA; + block->lfshift = 8; + block->lookup_reg = NPA_AF_RVU_LF_CFG_DEBUG; + block->pf_lfcnt_reg = RVU_PRIV_PFX_NPA_CFG; + block->vf_lfcnt_reg = RVU_PRIV_HWVFX_NPA_CFG; + block->lfcfg_reg = NPA_PRIV_LFX_CFG; + block->msixcfg_reg = NPA_PRIV_LFX_INT_CFG; + block->lfreset_reg = NPA_AF_LF_RST; + sprintf(block->name, "NPA"); + err = rvu_alloc_bitmap(&block->lf); + if (err) + return err; + +nix: + /* Init NIX LF's bitmap */ + block = &hw->block[BLKADDR_NIX0]; + if (!block->implemented) + goto sso; + cfg = rvu_read64(rvu, BLKADDR_NIX0, NIX_AF_CONST2); + block->lf.max = cfg & 0xFFF; + block->addr = BLKADDR_NIX0; + block->lfshift = 8; + block->lookup_reg = NIX_AF_RVU_LF_CFG_DEBUG; + block->pf_lfcnt_reg = RVU_PRIV_PFX_NIX_CFG; + block->vf_lfcnt_reg = RVU_PRIV_HWVFX_NIX_CFG; + block->lfcfg_reg = NIX_PRIV_LFX_CFG; + block->msixcfg_reg = NIX_PRIV_LFX_INT_CFG; + block->lfreset_reg = NIX_AF_LF_RST; + sprintf(block->name, "NIX"); + err = rvu_alloc_bitmap(&block->lf); + if (err) + return err; + +sso: + /* Init SSO group's bitmap */ + block = &hw->block[BLKADDR_SSO]; + if (!block->implemented) + goto ssow; + cfg = rvu_read64(rvu, BLKADDR_SSO, SSO_AF_CONST); + block->lf.max = cfg & 0x; + block->addr = BLKADDR_SSO; + block->multislot = true; + block->lfshift = 3; + block->lookup_reg = SSO_AF_RVU_LF_CFG_DEBUG; + block->pf_lfcnt_reg = RVU_PRIV_PFX_SSO_CFG; + block->vf_lfcnt_reg = RVU_PRIV_HWVFX_SSO_CFG; + block->lfcfg_reg = SSO_PRIV_LFX_HWGRP_CFG; + block->msixcfg_reg = SSO_PRIV_LFX_HWGRP_INT_CFG; + block->lfreset_reg = SSO_AF_LF_HWGRP_RST; + sprintf(block->name, "SSO GROUP"); + err = rvu_alloc_bitmap(&block->lf); + if (err) + return err; + +ssow: + /* Init SSO workslot's bitmap */ + block = &hw->block[BLKADDR_SSOW]; + if (!block->implemented) + goto tim; + 
block->lf.max = (cfg >> 56) & 0xFF; + block->addr = BLKADDR_SSOW; + block->multislot = true; + block->lfshift = 3; + block->lookup_reg = SSOW_AF_RVU_LF_HWS_CFG_DEBUG; + block->pf_lfcnt_reg = RVU_PRIV_PFX_SSOW_CFG; + block->vf_lfcnt_reg = RVU_PRIV_HWVFX_SSOW_CFG; + block->lfcfg_reg = SSO
[PATCH 01/15] octeontx2-af: Add Marvell OcteonTX2 RVU AF driver
From: Sunil Goutham This patch adds basic template for Marvell OcteonTX2's resource virtualization unit (RVU) admin function (AF) driver. Just the driver registration and probe. Signed-off-by: Sunil Goutham --- drivers/net/ethernet/marvell/Kconfig | 3 + drivers/net/ethernet/marvell/Makefile | 1 + drivers/net/ethernet/marvell/octeontx2/Kconfig | 12 ++ drivers/net/ethernet/marvell/octeontx2/Makefile| 6 + drivers/net/ethernet/marvell/octeontx2/af/Makefile | 8 ++ drivers/net/ethernet/marvell/octeontx2/af/rvu.c| 126 + drivers/net/ethernet/marvell/octeontx2/af/rvu.h| 31 + 7 files changed, 187 insertions(+) create mode 100644 drivers/net/ethernet/marvell/octeontx2/Kconfig create mode 100644 drivers/net/ethernet/marvell/octeontx2/Makefile create mode 100644 drivers/net/ethernet/marvell/octeontx2/af/Makefile create mode 100644 drivers/net/ethernet/marvell/octeontx2/af/rvu.c create mode 100644 drivers/net/ethernet/marvell/octeontx2/af/rvu.h diff --git a/drivers/net/ethernet/marvell/Kconfig b/drivers/net/ethernet/marvell/Kconfig index f33fd22..3238aa7 100644 --- a/drivers/net/ethernet/marvell/Kconfig +++ b/drivers/net/ethernet/marvell/Kconfig @@ -167,4 +167,7 @@ config SKY2_DEBUG If unsure, say N. 
+ +source "drivers/net/ethernet/marvell/octeontx2/Kconfig" + endif # NET_VENDOR_MARVELL diff --git a/drivers/net/ethernet/marvell/Makefile b/drivers/net/ethernet/marvell/Makefile index 55d4d10..89dea72 100644 --- a/drivers/net/ethernet/marvell/Makefile +++ b/drivers/net/ethernet/marvell/Makefile @@ -11,3 +11,4 @@ obj-$(CONFIG_MVPP2) += mvpp2/ obj-$(CONFIG_PXA168_ETH) += pxa168_eth.o obj-$(CONFIG_SKGE) += skge.o obj-$(CONFIG_SKY2) += sky2.o +obj-y += octeontx2/ diff --git a/drivers/net/ethernet/marvell/octeontx2/Kconfig b/drivers/net/ethernet/marvell/octeontx2/Kconfig new file mode 100644 index 000..9743502 --- /dev/null +++ b/drivers/net/ethernet/marvell/octeontx2/Kconfig @@ -0,0 +1,12 @@ +# +# Marvell OcteonTX2 drivers configuration +# + +config OCTEONTX2_AF + tristate "Marvell OcteonTX2 RVU Admin Function driver" + depends on ARM64 && PCI + help + This driver supports Marvell's OcteonTX2 Resource Virtualization + Unit's admin function manager which manages all RVU HW resources + and provides a medium to other PF/VFs to configure HW. Should be + enabled for other RVU device drivers to work. diff --git a/drivers/net/ethernet/marvell/octeontx2/Makefile b/drivers/net/ethernet/marvell/octeontx2/Makefile new file mode 100644 index 000..e579dcd --- /dev/null +++ b/drivers/net/ethernet/marvell/octeontx2/Makefile @@ -0,0 +1,6 @@ +# SPDX-License-Identifier: GPL-2.0 +# +# Makefile for Marvell OcteonTX2 device drivers. 
+# + +obj-$(CONFIG_OCTEONTX2_AF) += af/ diff --git a/drivers/net/ethernet/marvell/octeontx2/af/Makefile b/drivers/net/ethernet/marvell/octeontx2/af/Makefile new file mode 100644 index 000..dacbd16 --- /dev/null +++ b/drivers/net/ethernet/marvell/octeontx2/af/Makefile @@ -0,0 +1,8 @@ +# SPDX-License-Identifier: GPL-2.0 +# +# Makefile for Marvell's OcteonTX2 RVU Admin Function driver +# + +obj-$(CONFIG_OCTEONTX2_AF) += octeontx2_af.o + +octeontx2_af-y := rvu.o diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu.c b/drivers/net/ethernet/marvell/octeontx2/af/rvu.c new file mode 100644 index 000..5af4da6 --- /dev/null +++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu.c @@ -0,0 +1,126 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Marvell OcteonTx2 RVU Admin Function driver + * + * Copyright (C) 2018 Marvell International Ltd. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ + +#include +#include +#include +#include +#include +#include + +#include "rvu.h" + +#define DRV_NAME "octeontx2-af" +#define DRV_STRING "Marvell OcteonTX2 RVU Admin Function Driver" +#define DRV_VERSION"1.0" + +/* Supported devices */ +static const struct pci_device_id rvu_id_table[] = { + { PCI_DEVICE(PCI_VENDOR_ID_CAVIUM, PCI_DEVID_OCTEONTX2_RVU_AF) }, + { 0, } /* end of table */ +}; + +MODULE_AUTHOR("Marvell International Ltd."); +MODULE_DESCRIPTION(DRV_STRING); +MODULE_LICENSE("GPL v2"); +MODULE_VERSION(DRV_VERSION); +MODULE_DEVICE_TABLE(pci, rvu_id_table); + +static int rvu_probe(struct pci_dev *pdev, const struct pci_device_id *id) +{ + struct device *dev = &pdev->dev; + struct rvu *rvu; + interr; + + rvu = devm_kzalloc(dev, sizeof(*rvu), GFP_KERNEL); + if (!rvu) + return -ENOMEM; + + pci_set_drvdata(pdev, rvu); + rvu->pdev = pdev; + rvu->dev = &pdev->dev; + + err = pci_enable_device(pdev); + if (err) { + dev_err(dev, "Failed to enable PCI device\n"); + goto err_free
[PATCH 02/15] octeontx2-af: Reset all RVU blocks
From: Sunil Goutham Go through all BLKADDRs and check which ones are implemented on this silicon and do a HW reset of each implemented block. Also added all RVU AF and PF register offsets. Signed-off-by: Sunil Goutham --- drivers/net/ethernet/marvell/octeontx2/af/rvu.c| 78 ++ drivers/net/ethernet/marvell/octeontx2/af/rvu.h| 37 +++ .../net/ethernet/marvell/octeontx2/af/rvu_reg.h| 112 + .../net/ethernet/marvell/octeontx2/af/rvu_struct.h | 34 +++ 4 files changed, 261 insertions(+) create mode 100644 drivers/net/ethernet/marvell/octeontx2/af/rvu_reg.h create mode 100644 drivers/net/ethernet/marvell/octeontx2/af/rvu_struct.h diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu.c b/drivers/net/ethernet/marvell/octeontx2/af/rvu.c index 5af4da6..d40fabf 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/rvu.c +++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu.c @@ -16,6 +16,7 @@ #include #include "rvu.h" +#include "rvu_reg.h" #define DRV_NAME "octeontx2-af" #define DRV_STRING "Marvell OcteonTX2 RVU Admin Function Driver" @@ -33,6 +34,70 @@ MODULE_LICENSE("GPL v2"); MODULE_VERSION(DRV_VERSION); MODULE_DEVICE_TABLE(pci, rvu_id_table); +/* Poll a RVU block's register 'offset', for a 'zero' + * or 'nonzero' at bits specified by 'mask' + */ +int rvu_poll_reg(struct rvu *rvu, u64 block, u64 offset, u64 mask, bool zero) +{ + void __iomem *reg; + int timeout = 100; + u64 reg_val; + + reg = rvu->afreg_base + ((block << 28) | offset); + while (timeout) { + reg_val = readq(reg); + if (zero && !(reg_val & mask)) + return 0; + if (!zero && (reg_val & mask)) + return 0; + udelay(1); + cpu_relax(); + timeout--; + } + return -EBUSY; +} + +static void rvu_check_block_implemented(struct rvu *rvu) +{ + struct rvu_hwinfo *hw = rvu->hw; + struct rvu_block *block; + int blkid; + u64 cfg; + + /* For each block check if 'implemented' bit is set */ + for (blkid = 0; blkid < BLK_COUNT; blkid++) { + block = &hw->block[blkid]; + cfg = rvupf_read64(rvu, RVU_PF_BLOCK_ADDRX_DISC(blkid)); + if 
(cfg & BIT_ULL(11)) + block->implemented = true; + } +} + +static void rvu_block_reset(struct rvu *rvu, int blkaddr, u64 rst_reg) +{ + struct rvu_block *block = &rvu->hw->block[blkaddr]; + + if (!block->implemented) + return; + + rvu_write64(rvu, blkaddr, rst_reg, BIT_ULL(0)); + rvu_poll_reg(rvu, blkaddr, rst_reg, BIT_ULL(63), true); +} + +static void rvu_reset_all_blocks(struct rvu *rvu) +{ + /* Do a HW reset of all RVU blocks */ + rvu_block_reset(rvu, BLKADDR_NPA, NPA_AF_BLK_RST); + rvu_block_reset(rvu, BLKADDR_NIX0, NIX_AF_BLK_RST); + rvu_block_reset(rvu, BLKADDR_NPC, NPC_AF_BLK_RST); + rvu_block_reset(rvu, BLKADDR_SSO, SSO_AF_BLK_RST); + rvu_block_reset(rvu, BLKADDR_TIM, TIM_AF_BLK_RST); + rvu_block_reset(rvu, BLKADDR_CPT0, CPT_AF_BLK_RST); + rvu_block_reset(rvu, BLKADDR_NDC0, NDC_AF_BLK_RST); + rvu_block_reset(rvu, BLKADDR_NDC1, NDC_AF_BLK_RST); + rvu_block_reset(rvu, BLKADDR_NDC2, NDC_AF_BLK_RST); +} + static int rvu_probe(struct pci_dev *pdev, const struct pci_device_id *id) { struct device *dev = &pdev->dev; @@ -43,6 +108,12 @@ static int rvu_probe(struct pci_dev *pdev, const struct pci_device_id *id) if (!rvu) return -ENOMEM; + rvu->hw = devm_kzalloc(dev, sizeof(struct rvu_hwinfo), GFP_KERNEL); + if (!rvu->hw) { + devm_kfree(dev, rvu); + return -ENOMEM; + } + pci_set_drvdata(pdev, rvu); rvu->pdev = pdev; rvu->dev = &pdev->dev; @@ -80,6 +151,11 @@ static int rvu_probe(struct pci_dev *pdev, const struct pci_device_id *id) goto err_release_regions; } + /* Check which blocks the HW supports */ + rvu_check_block_implemented(rvu); + + rvu_reset_all_blocks(rvu); + return 0; err_release_regions: @@ -88,6 +164,7 @@ static int rvu_probe(struct pci_dev *pdev, const struct pci_device_id *id) pci_disable_device(pdev); err_freemem: pci_set_drvdata(pdev, NULL); + devm_kfree(&pdev->dev, rvu->hw); devm_kfree(dev, rvu); return err; } @@ -100,6 +177,7 @@ static void rvu_remove(struct pci_dev *pdev) pci_disable_device(pdev); pci_set_drvdata(pdev, NULL); + 
devm_kfree(&pdev->dev, rvu->hw); devm_kfree(&pdev->dev, rvu); } diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu.h b/drivers/net/ethernet/marvell/octeontx2/af/rvu.h index 4a4b0ad..e2c54d0 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/rvu.h +++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu.h @
[PATCH 04/15] octeontx2-af: Add mailbox support infra
From: Sunil Goutham This patch adds mailbox support infrastructure APIs. Each RVU device has a dedicated 64KB mailbox region shared with it's peer for communication. RVU AF has a separate mailbox region shared with each of RVU PFs and a RVU PF has a separate region shared with each of it's VF. These set of APIs are used by this driver (RVU AF) and other RVU PF/VF drivers eg netdev, crypto e.t.c. Signed-off-by: Aleksey Makarov Signed-off-by: Sunil Goutham Signed-off-by: Lukasz Bartosik --- drivers/net/ethernet/marvell/octeontx2/Kconfig | 4 + drivers/net/ethernet/marvell/octeontx2/af/Makefile | 2 + drivers/net/ethernet/marvell/octeontx2/af/mbox.c | 303 + drivers/net/ethernet/marvell/octeontx2/af/mbox.h | 142 ++ .../net/ethernet/marvell/octeontx2/af/rvu_reg.h| 4 + 5 files changed, 455 insertions(+) create mode 100644 drivers/net/ethernet/marvell/octeontx2/af/mbox.c create mode 100644 drivers/net/ethernet/marvell/octeontx2/af/mbox.h diff --git a/drivers/net/ethernet/marvell/octeontx2/Kconfig b/drivers/net/ethernet/marvell/octeontx2/Kconfig index 9743502..8002f9c 100644 --- a/drivers/net/ethernet/marvell/octeontx2/Kconfig +++ b/drivers/net/ethernet/marvell/octeontx2/Kconfig @@ -2,8 +2,12 @@ # Marvell OcteonTX2 drivers configuration # +config OCTEONTX2_MBOX +tristate + config OCTEONTX2_AF tristate "Marvell OcteonTX2 RVU Admin Function driver" + select OCTEONTX2_MBOX depends on ARM64 && PCI help This driver supports Marvell's OcteonTX2 Resource Virtualization diff --git a/drivers/net/ethernet/marvell/octeontx2/af/Makefile b/drivers/net/ethernet/marvell/octeontx2/af/Makefile index dacbd16..ac17cb9 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/Makefile +++ b/drivers/net/ethernet/marvell/octeontx2/af/Makefile @@ -3,6 +3,8 @@ # Makefile for Marvell's OcteonTX2 RVU Admin Function driver # +obj-$(CONFIG_OCTEONTX2_MBOX) += octeontx2_mbox.o obj-$(CONFIG_OCTEONTX2_AF) += octeontx2_af.o +octeontx2_mbox-y := mbox.o octeontx2_af-y := rvu.o diff --git 
a/drivers/net/ethernet/marvell/octeontx2/af/mbox.c b/drivers/net/ethernet/marvell/octeontx2/af/mbox.c new file mode 100644 index 000..0722fa4 --- /dev/null +++ b/drivers/net/ethernet/marvell/octeontx2/af/mbox.c @@ -0,0 +1,303 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Marvell OcteonTx2 RVU Admin Function driver + * + * Copyright (C) 2018 Marvell International Ltd. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include +#include +#include + +#include "rvu_reg.h" +#include "mbox.h" + +static const u16 msgs_offset = ALIGN(sizeof(struct mbox_hdr), MBOX_MSG_ALIGN); + +void otx2_mbox_reset(struct otx2_mbox *mbox, int devid) +{ + struct otx2_mbox_dev *mdev = &mbox->dev[devid]; + struct mbox_hdr *tx_hdr = + (struct mbox_hdr *)(mdev->mbase + mbox->tx_start); + struct mbox_hdr *rx_hdr = + (struct mbox_hdr *)(mdev->mbase + mbox->rx_start); + + spin_lock(&mdev->mbox_lock); + mdev->msg_size = 0; + mdev->rsp_size = 0; + tx_hdr->num_msgs = 0; + rx_hdr->num_msgs = 0; + spin_unlock(&mdev->mbox_lock); +} +EXPORT_SYMBOL(otx2_mbox_reset); + +void otx2_mbox_destroy(struct otx2_mbox *mbox) +{ + mbox->reg_base = NULL; + mbox->hwbase = NULL; + + kfree(mbox->dev); + mbox->dev = NULL; +} +EXPORT_SYMBOL(otx2_mbox_destroy); + +int otx2_mbox_init(struct otx2_mbox *mbox, void *hwbase, struct pci_dev *pdev, + void *reg_base, int direction, int ndevs) +{ + int devid; + struct otx2_mbox_dev *mdev; + + switch (direction) { + case MBOX_DIR_AFPF: + case MBOX_DIR_PFVF: + mbox->tx_start = MBOX_DOWN_TX_START; + mbox->rx_start = MBOX_DOWN_RX_START; + mbox->tx_size = MBOX_DOWN_TX_SIZE; + mbox->rx_size = MBOX_DOWN_RX_SIZE; + break; + case MBOX_DIR_PFAF: + case MBOX_DIR_VFPF: + mbox->tx_start = MBOX_DOWN_RX_START; + mbox->rx_start = MBOX_DOWN_TX_START; + mbox->tx_size = MBOX_DOWN_RX_SIZE; + mbox->rx_size = MBOX_DOWN_TX_SIZE; + break; + case 
MBOX_DIR_AFPF_UP: + case MBOX_DIR_PFVF_UP: + mbox->tx_start = MBOX_UP_TX_START; + mbox->rx_start = MBOX_UP_RX_START; + mbox->tx_size = MBOX_UP_TX_SIZE; + mbox->rx_size = MBOX_UP_RX_SIZE; + break; + case MBOX_DIR_PFAF_UP: + case MBOX_DIR_VFPF_UP: + mbox->tx_start = MBOX_UP_RX_START; + mbox->rx_start = MBOX_UP_TX_START; + mbox->tx_size = MBOX_UP_RX_SIZE; + mbox->rx_size = MBOX_UP_TX_SIZE; + break; + default: +
[PATCH 00/15] octeontx2-af: Add RVU Admin Function driver
From: Sunil Goutham Resource virtualization unit (RVU) on Marvell's OcteonTX2 SOC maps HW resources from the network, crypto and other functional blocks into PCI-compatible physical and virtual functions. Each functional block again has multiple local functions (LFs) for provisioning to PCI devices. RVU supports multiple PCIe SRIOV physical functions (PFs) and virtual functions (VFs). PF0 is called the administrative / admin function (AF) and has privileges to provision RVU functional block's LFs to each of the PF/VF. RVU managed networking functional blocks - Network pool allocator (NPA) - Network interface controller (NIX) - Network parser CAM (NPC) - Schedule/Synchronize/Order unit (SSO) RVU managed non-networking functional blocks - Crypto accelerator (CPT) - Scheduled timers unit (TIM) - Schedule/Synchronize/Order unit (SSO) Used for both networking and non networking usecases - Compression (upcoming in future variants of the silicons) Resource provisioning examples - A PF/VF with NIX-LF & NPA-LF resources works as a pure network device - A PF/VF with CPT-LF resource works as a pure crypto offload device. This admin function driver neither receives any data nor processes it i.e no I/O, a configuration only driver. PF/VFs communicate with AF via a shared memory region (mailbox). Upon receiving requests from PF/VF, AF does resource provisioning and other HW configuration. AF is always attached to host, but PF/VFs may be used by host kernel itself, or attached to VMs or to userspace applications like DPDK etc. So AF has to handle provisioning/configuration requests sent by any device from any domain. This patch series adds logic for the following - RVU AF driver with functional blocks provisioning support. - Mailbox infrastructure for communication between AF and PFs. - CGX (MAC controller) driver which communicates with firmware for managing physical ethernet interfaces. 
AF collects info from this driver and forwards the same to the PF/VFs using these interfaces. This is the first set of patches out of 80+ patches. Aleksey Makarov (1): octeontx2-af: Convert mbox msg id check to a macro Geetha sowjanya (1): octeontx2-af: Reconfig MSIX base with IOVA Linu Cherian (3): octeontx2-af: Set RVU PFs to CGX LMACs mapping octeontx2-af: Add support for CGX link management octeontx2-af: Register for CGX lmac events Sunil Goutham (10): octeontx2-af: Add Marvell OcteonTX2 RVU AF driver octeontx2-af: Reset all RVU blocks octeontx2-af: Gather RVU blocks HW info octeontx2-af: Add mailbox support infra octeontx2-af: Add mailbox IRQ and msg handlers octeontx2-af: Scan blocks for LFs provisioned to PF/VF octeontx2-af: Add RVU block LF provisioning support octeontx2-af: Configure block LF's MSIX vector offset octeontx2-af: Add Marvell OcteonTX2 CGX driver MAINTAINERS: Add entry for Marvell OcteonTX2 Admin Function driver MAINTAINERS|9 + drivers/net/ethernet/marvell/Kconfig |3 + drivers/net/ethernet/marvell/Makefile |1 + drivers/net/ethernet/marvell/octeontx2/Kconfig | 16 + drivers/net/ethernet/marvell/octeontx2/Makefile|6 + drivers/net/ethernet/marvell/octeontx2/af/Makefile | 10 + drivers/net/ethernet/marvell/octeontx2/af/cgx.c| 515 ++ drivers/net/ethernet/marvell/octeontx2/af/cgx.h| 65 + .../net/ethernet/marvell/octeontx2/af/cgx_fw_if.h | 225 +++ drivers/net/ethernet/marvell/octeontx2/af/mbox.c | 303 drivers/net/ethernet/marvell/octeontx2/af/mbox.h | 211 +++ drivers/net/ethernet/marvell/octeontx2/af/rvu.c| 1637 drivers/net/ethernet/marvell/octeontx2/af/rvu.h| 158 ++ .../net/ethernet/marvell/octeontx2/af/rvu_cgx.c| 194 +++ .../net/ethernet/marvell/octeontx2/af/rvu_reg.h| 441 ++ .../net/ethernet/marvell/octeontx2/af/rvu_struct.h | 74 + 16 files changed, 3868 insertions(+) create mode 100644 drivers/net/ethernet/marvell/octeontx2/Kconfig create mode 100644 drivers/net/ethernet/marvell/octeontx2/Makefile create mode 100644 
drivers/net/ethernet/marvell/octeontx2/af/Makefile create mode 100644 drivers/net/ethernet/marvell/octeontx2/af/cgx.c create mode 100644 drivers/net/ethernet/marvell/octeontx2/af/cgx.h create mode 100644 drivers/net/ethernet/marvell/octeontx2/af/cgx_fw_if.h create mode 100644 drivers/net/ethernet/marvell/octeontx2/af/mbox.c create mode 100644 drivers/net/ethernet/marvell/octeontx2/af/mbox.h create mode 100644 drivers/net/ethernet/marvell/octeontx2/af/rvu.c create mode 100644 drivers/net/ethernet/marvell/octeontx2/af/rvu.h create mode 100644 drivers/net/ethernet/marvell/octeontx2/af/rvu_cgx.c create mode 100644 drivers/net/ethernet/marvell/octeontx2/af/rvu_reg.h create mode 100644 drivers/net/ethernet/marvell/octeontx2/af/rvu_struct.h -- 2.7.4
Re: [Patch net-next] net_sched: fix an extack message in tcf_block_find()
On 9/27/18 3:36 PM, Cong Wang wrote: > On Thu, Sep 27, 2018 at 2:16 PM Eric Dumazet wrote: >> >> >> >> On 09/27/2018 01:42 PM, Cong Wang wrote: >>> It is clearly a copy-n-paste. >>> >>> Signed-off-by: Cong Wang >>> --- >>> net/sched/cls_api.c | 2 +- >>> 1 file changed, 1 insertion(+), 1 deletion(-) >>> >>> diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c >>> index 3de47e99b788..8dd7f8af6d54 100644 >>> --- a/net/sched/cls_api.c >>> +++ b/net/sched/cls_api.c >>> @@ -655,7 +655,7 @@ static struct tcf_block *tcf_block_find(struct net >>> *net, struct Qdisc **q, >>> >>> *q = qdisc_refcount_inc_nz(*q); >>> if (!*q) { >>> - NL_SET_ERR_MSG(extack, "Parent Qdisc doesn't exists"); >>> + NL_SET_ERR_MSG(extack, "Can't increase Qdisc >>> refcount"); >> >> >> I am not sure it was a copy-n-paste. > > > Make sure you knew there is an exactly same extack message > (with a same English grammar error). > > >> >> Qdisc refcount business is kernel internal. > > Yeah, but the extack message is already there, this patch doesn't add > any new extack. Or you are suggesting we should remove it? IMO the message grammar should be fixed, but the content is correct -- ie, parent qdisc does not exist.
Re: [PATCH net] net/ncsi: Extend NC-SI Netlink interface to allow user space to send NC-SI command
On Thu, 2018-09-27 at 21:08 +, justin.l...@dell.com wrote: > The new command (NCSI_CMD_SEND_CMD) is added to allow user space application > to send NC-SI command to the network card. > Also, add a new attribute (NCSI_ATTR_DATA) for transferring request and > response. > > The work flow is as below. > > Request: > User space application -> Netlink interface (msg) > -> new Netlink handler - > ncsi_send_cmd_nl() > -> ncsi_xmit_cmd() > Response: > Response received - ncsi_rcv_rsp() -> internal response handler - > ncsi_rsp_handler_xxx() > -> > ncsi_rsp_handler_netlink() > -> > ncsi_send_netlink_rsp () > -> > Netlink interface (msg) > -> > user space application > Command timeout - ncsi_request_timeout() -> ncsi_send_netlink_timeout () > > -> Netlink interface (msg with zero data length) > > -> user space application > Error: > Error detected -> ncsi_send_netlink_err () -> Netlink interface (err msg) > > -> user space application > > > Signed-off-by: Justin Lee > Hi Justin, Thanks for posting this on the list! The overall design looks good and so far looks like it should fit relatively well with the other OEM command patch. I'll try and run some OEM commands against my machine. 
Some comments below: > > --- > include/uapi/linux/ncsi.h | 3 + > net/ncsi/internal.h | 12 ++- > net/ncsi/ncsi-aen.c | 10 ++- > net/ncsi/ncsi-cmd.c | 106 > net/ncsi/ncsi-manage.c| 74 ++--- > net/ncsi/ncsi-netlink.c | 199 > +- > net/ncsi/ncsi-netlink.h | 4 + > net/ncsi/ncsi-rsp.c | 70 ++-- > 8 files changed, 420 insertions(+), 58 deletions(-) > > diff --git a/include/uapi/linux/ncsi.h b/include/uapi/linux/ncsi.h > index 4c292ec..4992bfc 100644 > --- a/include/uapi/linux/ncsi.h > +++ b/include/uapi/linux/ncsi.h > @@ -30,6 +30,7 @@ enum ncsi_nl_commands { > NCSI_CMD_PKG_INFO, > NCSI_CMD_SET_INTERFACE, > NCSI_CMD_CLEAR_INTERFACE, > + NCSI_CMD_SEND_CMD, > > __NCSI_CMD_AFTER_LAST, > NCSI_CMD_MAX = __NCSI_CMD_AFTER_LAST - 1 > @@ -43,6 +44,7 @@ enum ncsi_nl_commands { > * @NCSI_ATTR_PACKAGE_LIST: nested array of NCSI_PKG_ATTR attributes > * @NCSI_ATTR_PACKAGE_ID: package ID > * @NCSI_ATTR_CHANNEL_ID: channel ID > + * @NCSI_ATTR_DATA: command payload > * @NCSI_ATTR_MAX: highest attribute number > */ > enum ncsi_nl_attrs { > @@ -51,6 +53,7 @@ enum ncsi_nl_attrs { > NCSI_ATTR_PACKAGE_LIST, > NCSI_ATTR_PACKAGE_ID, > NCSI_ATTR_CHANNEL_ID, > + NCSI_ATTR_DATA, > > __NCSI_ATTR_AFTER_LAST, > NCSI_ATTR_MAX = __NCSI_ATTR_AFTER_LAST - 1 > diff --git a/net/ncsi/internal.h b/net/ncsi/internal.h > index 8055e39..20ce735 100644 > --- a/net/ncsi/internal.h > +++ b/net/ncsi/internal.h > @@ -215,12 +215,17 @@ struct ncsi_request { > unsigned charid; /* Request ID - 0 to 255 */ > bool used;/* Request that has been assigned */ > unsigned int flags; /* NCSI request property */ > -#define NCSI_REQ_FLAG_EVENT_DRIVEN 1 > +#define NCSI_REQ_FLAG_EVENT_DRIVEN 1 > +#define NCSI_REQ_FLAG_NETLINK_DRIVEN 2 > struct ncsi_dev_priv *ndp;/* Associated NCSI device */ > struct sk_buff *cmd;/* Associated NCSI command packet */ > struct sk_buff *rsp;/* Associated NCSI response packet */ > struct timer_listtimer; /* Timer on waiting for response */ > bool enabled; /* Time has been enabled or not*/ > + > + u32 
snd_seq; /* netlink sending sequence number */ > + u32 snd_portid; /* netlink portid of sender*/ > + struct nlmsghdr nlhdr; /* netlink message header */ > }; > > enum { > @@ -301,10 +306,13 @@ struct ncsi_cmd_arg { > unsigned short payload; /* Command packet payload length */ > unsigned int req_flags; /* NCSI request properties */ > union { > - unsigned char bytes[16]; /* Command packet specific data */ > + unsigned char bytes[16]; /* Command packet specific data > */ > unsigned short words[8]; > unsigned int dwords[4]; >
Re: [PATCH net-next v6 00/23] WireGuard: Secure Network Tunnel
On Thu, Sep 27, 2018 at 11:35:39PM +0200, Jason A. Donenfeld wrote: > Hi Eric, > > On Thu, Sep 27, 2018 at 8:29 PM Eric Biggers wrote: > > Why is Herbert Xu's existing crypto tree being circumvented, especially for > > future patches (the initial merge isn't quite as important as that's a > > one-time > > event)? I like being able to check out cryptodev to test upcoming crypto > > patches. And currently, changes to APIs, algorithms, tests, and > > implementations > > all go through cryptodev, which is convenient for crypto developers. > > > > Apparently, you're proposing that someone adding a new algorithm will now > > have > > to submit the API portion to one maintainer (Herbert Xu) and the > > implementation > > portion to another maintainer (you), and they'll go through separate git > > trees. > > That's inconvenient for developers, and it seems that in practice you and > > Herbert will be stepping on each other's toes a lot. > > > > Can you please reach some kind of sane agreement with Herbert so that the > > development process isn't fractured into two? Perhaps you could review > > patches, > > but Herbert could still apply them? > > I think you're overthinking it a bit. Zinc will have a few software > implementations of primitives that are useful in cases where it's nice to call > the primitive directly. Think: various usages of sha2, siphash, the wireguard > suite (what this patchset includes), other things in lib/, etc. In so much as > this winds up duplicating things within the crypto API, I'll work with Herbert > to build one on top of the other -- as I've done in the two commits in this > series. But beyond that, think of the two initiatives as orthogonal. I'm > working on curating a few primitives that are maximally useful throughout > the kernel for various uses, and doing so in a way that I think brings > about a certain quality. 
Meanwhile the crypto API is amassing a huge > collection of primitives for some things, and that will continue to exist, > and Herbert will continue to maintain that. I expect for the crossover > to be fairly isolated and manageable, without too much foreseeable tree- > conflicts and such. Therefore, Samuel Neves and I plan to maintain the > codebase we've spent quite some time writing, and maintain our own tree for > it, which we'll be submitting through Greg. In other words, this is not > a matter of "circumvention" or "stepping on toes", but rather separate > efforts. I'm quite certain to the extent they overlap we'll be able to work > out fairly easily. > > Either way, I'll take your suggestion and reach out to Herbert, since at > least a discussion between the two of us sounds like it could be productive. So, Zinc will simultaneously replace the current crypto implementations, *and* be "orthogonal" and "separate" from all the crypto code currently maintained by Herbert? You can't have your cake and eat it too... I'm still concerned you're splitting the community in two. It will be unclear where new algorithms and implementations should go. Some people will choose Herbert and the current crypto API and conventions, and some people will choose you and Zinc... I still don't see clear guidelines for what will go where. And yes, you and Herbert will step on each others' toes and duplicate stuff, as the efforts are *not* separate, as you've even argued yourself. Please reach out to Herbert to find a sane solution, ideally one that involves having a single git tree for crypto development and allows people to continue crypto development without choosing "sides". > > > I'm also wondering about the criteria for making additions and changes to > > "Zinc". You mentioned before that one of the "advantages" of Zinc is that > > it > > doesn't include "cipher modes from 90s cryptographers" -- what does that > > mean > > exactly? 
You've also indicated before that you don't want people modifying > > the > > Poly1305 implementations as they are too error-prone. Useful contributions > > could be blocked or discouraged in the future. Can you please elaborate on > > your criteria for contributions to Zinc? > > > > Also, will you allow algorithms that aren't up to modern security standards > > but > > are needed for compatibility reasons, e.g. MD5, SHA-1, and DES? There are > > existing standards, APIs, and data formats that use these "legacy" > > algorithms; > > so implementations of them are often still needed, whether we like it or > > not. > > > > And does it matter who designed the algorithms, e.g. do algorithms from > > Daniel > > Bernstein get effectively a free pass, while algorithms from certain > > countries, > > governments, or organizations are not allowed? E.g. wireless driver > > developers > > may need the SM4 block cipher (which is now supported by the crypto API) as > > it's > > specified in a Chinese wireless standard. Will you allow SM4 in Zinc? Or > > will > > people have to submit some algorithms to Herbert and some to you
[PATCH net-next] geneve: fix ttl inherit type
Phil pointed out that there is a mismatch between vxlan and geneve ttl inherit. We should define it as a flag and use nla_put_flag to export this option. Fixes: 52d0d404d39dd ("geneve: add ttl inherit support") Reported-by: Phil Sutter Signed-off-by: Hangbin Liu --- drivers/net/geneve.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/net/geneve.c b/drivers/net/geneve.c index 6625fab..09ab2fd 100644 --- a/drivers/net/geneve.c +++ b/drivers/net/geneve.c @@ -1100,7 +1100,7 @@ static const struct nla_policy geneve_policy[IFLA_GENEVE_MAX + 1] = { [IFLA_GENEVE_UDP_CSUM] = { .type = NLA_U8 }, [IFLA_GENEVE_UDP_ZERO_CSUM6_TX] = { .type = NLA_U8 }, [IFLA_GENEVE_UDP_ZERO_CSUM6_RX] = { .type = NLA_U8 }, - [IFLA_GENEVE_TTL_INHERIT] = { .type = NLA_U8 }, + [IFLA_GENEVE_TTL_INHERIT] = { .type = NLA_FLAG }, }; static int geneve_validate(struct nlattr *tb[], struct nlattr *data[], @@ -1582,7 +1582,7 @@ static size_t geneve_get_size(const struct net_device *dev) nla_total_size(sizeof(__u8)) + /* IFLA_GENEVE_UDP_CSUM */ nla_total_size(sizeof(__u8)) + /* IFLA_GENEVE_UDP_ZERO_CSUM6_TX */ nla_total_size(sizeof(__u8)) + /* IFLA_GENEVE_UDP_ZERO_CSUM6_RX */ - nla_total_size(sizeof(__u8)) + /* IFLA_GENEVE_TTL_INHERIT */ + nla_total_size(0) + /* IFLA_GENEVE_TTL_INHERIT */ 0; } @@ -1636,7 +1636,7 @@ static int geneve_fill_info(struct sk_buff *skb, const struct net_device *dev) goto nla_put_failure; #endif - if (nla_put_u8(skb, IFLA_GENEVE_TTL_INHERIT, ttl_inherit)) + if (ttl_inherit && nla_put_flag(skb, IFLA_GENEVE_TTL_INHERIT)) goto nla_put_failure; return 0; -- 2.5.5
Re: [PATCH net-next v6 23/23] net: WireGuard secure network tunnel
On Fri, Sep 28, 2018 at 12:37 AM Jason A. Donenfeld wrote: > Will do. v7 will include the wg_ prefix. $ nm *.o | while read a b c; do [[ $b == T ]] && echo $c; done | grep -v ^wg_ cleanup_module init_module Success.
[PATCH net] vxlan: use nla_put_flag for ttl inherit
Phil pointed out that there is a mismatch between vxlan and geneve ttl inherit. We should define it as a flag and use nla_put_flag to export this option. Fixes: 8fd780698745b ("vxlan: fill ttl inherit info") Reported-by: Phil Sutter Signed-off-by: Hangbin Liu --- drivers/net/vxlan.c | 8 +--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c index 2b8da2b..479dda4 100644 --- a/drivers/net/vxlan.c +++ b/drivers/net/vxlan.c @@ -3539,7 +3539,7 @@ static size_t vxlan_get_size(const struct net_device *dev) nla_total_size(sizeof(__u32)) + /* IFLA_VXLAN_LINK */ nla_total_size(sizeof(struct in6_addr)) + /* IFLA_VXLAN_LOCAL{6} */ nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_TTL */ - nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_TTL_INHERIT */ + nla_total_size(0) + /* IFLA_VXLAN_TTL_INHERIT */ nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_TOS */ nla_total_size(sizeof(__be32)) + /* IFLA_VXLAN_LABEL */ nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_LEARNING */ @@ -3604,8 +3604,6 @@ static int vxlan_fill_info(struct sk_buff *skb, const struct net_device *dev) } if (nla_put_u8(skb, IFLA_VXLAN_TTL, vxlan->cfg.ttl) || - nla_put_u8(skb, IFLA_VXLAN_TTL_INHERIT, - !!(vxlan->cfg.flags & VXLAN_F_TTL_INHERIT)) || nla_put_u8(skb, IFLA_VXLAN_TOS, vxlan->cfg.tos) || nla_put_be32(skb, IFLA_VXLAN_LABEL, vxlan->cfg.label) || nla_put_u8(skb, IFLA_VXLAN_LEARNING, @@ -3650,6 +3648,10 @@ static int vxlan_fill_info(struct sk_buff *skb, const struct net_device *dev) nla_put_flag(skb, IFLA_VXLAN_REMCSUM_NOPARTIAL)) goto nla_put_failure; + if (vxlan->cfg.flags & VXLAN_F_TTL_INHERIT && + nla_put_flag(skb, IFLA_VXLAN_TTL_INHERIT)) + goto nla_put_failure; + return 0; nla_put_failure: -- 2.5.5
Re: [PATCH iproute2-next] geneve: fix ttl inherit behavior
On Thu, Sep 27, 2018 at 11:08:36AM +0200, Phil Sutter wrote: > On Thu, Sep 27, 2018 at 03:27:37PM +0800, Hangbin Liu wrote: > > Currently when we add geneve with "ttl inherit", we set ttl to 0, which > > is actually use whatever default value instead of inherit the inner > > protocol's ttl value. > > > > To respect compatibility with old behavior and make a difference between > > ttl inherit and ttl == 0, we add an attribute IFLA_GENEVE_TTL_INHERIT in > > kernel commit 52d0d404d39dd ("geneve: add ttl inherit support"). > > > > Now let's use "ttl inherit" to inherit the inner protocol's ttl, and use > > "ttl auto" to means "use whatever default value", the same behavior with > > ttl == 0. > > > > Reported-by: Jianlin Shi > > Signed-off-by: Hangbin Liu > > Acked-by: Phil Sutter Hi Stephen, David, Please hold off on this patch and let me fix the inherit flag issue first. Thanks Hangbin
Re: [PATCH resend] can: rcar_can: convert to SPDX identifiers
Hi Marc > > From: Kuninori Morimoto > > > > This patch updates license to use SPDX-License-Identifier > > instead of verbose license text. > > > > Signed-off-by: Kuninori Morimoto > > Reviewed-by: Simon Horman > > Wolfram Sang has already supplied a similar patch, but not for Makefile > and Kconfig. I've applied your patch for Makefile and Kconfig and > adjusted the commit message accordingly. Thank you very much
Re: WARN_ON in TLP causing RT throttling
On 09/27/2018 05:16 PM, stran...@codeaurora.org wrote: > Hi Yuchung, > > Based on the dumps we were able to get, it appears that TFO was not used in > this case. > We also tried some local experiments where we dropped incoming SYN packets > after already > successful TFO connections on the receive side to see if TFO would trigger > this scenario, but > have not been able to reproduce it. > > One other interesting thing we found is that the socket never sent or > received any data. It only > sent/received the packets for the initial handshake and the outgoing FIN. Just to make sure : Was this some sort of syzkaller (or other fuzzer) run ?
Re: WARN_ON in TLP causing RT throttling
On 2018-09-27 13:14, Yuchung Cheng wrote: On Wed, Sep 26, 2018 at 5:09 PM, Eric Dumazet wrote: On 09/26/2018 04:46 PM, stran...@codeaurora.org wrote: > Hi Eric, > > Someone recently reported a crash to us on the 4.14.62 kernel where excessive > WARNING prints were spamming the logs and causing watchdog bites. The kernel > does have the following commit by Soheil: > bffd168c3fc5 "tcp: clear tp->packets_out when purging write queue" > > Before this bug we see over 1 second of continuous WARN_ON prints from > tcp_send_loss_probe() like so: > > 7795.530450: <2> tcp_send_loss_probe+0x194/0x1b8 > 7795.534833: <2> tcp_write_timer_handler+0xf8/0x1c4 > 7795.539492: <2> tcp_write_timer+0x4c/0x74 > 7795.543348: <2> call_timer_fn+0xc0/0x1b4 > 7795.547113: <2> run_timer_softirq+0x248/0x81c > > Specifically, the prints come from the following check: > > /* Retransmit last segment. */ > if (WARN_ON(!skb)) > goto rearm_timer; > > Since skb is always NULL, we know there's nothing on the write queue or the > retransmit queue, so we just keep resetting the timer, waiting for more data > to be queued. However, we were able to determine that the TCP socket is in the > TCP_FIN_WAIT1 state, so we will no longer be sending any data and these queues > remain empty. > > Would it be appropriate to stop resetting the TLP timer if we detect that the > connection is starting to close and we have no more data to send the probe with, > or is there some way that this scenario should already be handled? > > Unfortunately, we don't have a reproducer for this crash. > Something is fishy. If there is no skb in the queues, then tp->packets_out should be 0, therefore tcp_rearm_rto() should simply call inet_csk_clear_xmit_timer(sk, ICSK_TIME_RETRANS); I have never seen this report before. Do you use Fast Open? I am wondering if its a bug when a TFO server closes the socket before the handshake finishes... Either way, it's pretty safe to just stop TLP if write queue is empty for any unexpected reason. 
Hi Yuchung, Based on the dumps we were able to get, it appears that TFO was not used in this case. We also tried some local experiments where we dropped incoming SYN packets after already successful TFO connections on the receive side to see if TFO would trigger this scenario, but have not been able to reproduce it. One other interesting thing we found is that the socket never sent or received any data. It only sent/received the packets for the initial handshake and the outgoing FIN.
Re: [PATCH net V2] vhost-vsock: fix use after free
On Fri, Sep 28, 2018 at 07:37:37AM +0800, Jason Wang wrote: > > > On 2018年09月28日 01:04, Michael S. Tsirkin wrote: > > On Thu, Sep 27, 2018 at 08:22:04PM +0800, Jason Wang wrote: > > > The access of vsock is not protected by vhost_vsock_lock. This may > > > lead to use after free since vhost_vsock_dev_release() may free the > > > pointer at the same time. > > > > > > Fix this by holding the lock during the access. > > > > > > Reported-by:syzbot+e3e074963495f92a8...@syzkaller.appspotmail.com > > > Fixes: 16320f363ae1 ("vhost-vsock: add pkt cancel capability") > > > Fixes: 433fc58e6bf2 ("VSOCK: Introduce vhost_vsock.ko") > > > Cc: Stefan Hajnoczi > > > Signed-off-by: Jason Wang > > Wow is that really the best we can do? > > For net/stable, probably yes. > > > A global lock on a data path > > operation? > > It's already there, &vhost_vsock_lock? where is it taken on the data path? > and the patch only increases the critical section. > > > Granted use after free is nasty but Stefan said he sees > > a way to fix it using a per socket refcount. He's on vacation > > until Oct 4 though ... > > > > Stefan has acked the patch, so I think it's ok? We can do optimization for > -next on top. > > Thanks Well on high SMP serializing can drop performance as much as x100 so I'm not sure it's appropriate - seems to fix a bug but can introduce a regression. Let's see what a proper fix looks like first? -- MST
Re: [PATCH net-next] virtio_net: ethtool tx napi configuration
On 2018年09月27日 21:53, Willem de Bruijn wrote: On Thu, Sep 27, 2018 at 4:51 AM Jason Wang wrote: On 2018年09月14日 12:46, Willem de Bruijn wrote: I'm not sure I get this. If we don't enable tx napi, we tend to delay TX interrupt if we found the ring is about to be full to avoid interrupt storm, so we're probably ok in this case. I'm only concerned about the transition state when converting from napi to no-napi when the queue is stopped and tx interrupt disabled. With napi mode the interrupt is only disabled if napi is scheduled, in which case it will eventually reenable the interrupt. But when switching to no-napi mode in this state no progress will be made. But it seems this cannot happen. When converting to no-napi mode, set_coalesce waits for napi to complete in napi_disable. So the interrupt should always start enabled when transitioning into no-napi mode. An update, I hit a hang in napi_disable(). But it's hard to reproduce. I tend to choose an easy way like V1 that only allows the switching when the device is down. I agree. I will post the patch after a vacation. (or you can post if it is urgent for you). If you have time to review and add your signed-off-by, I can post it. It's a pretty small diff at this point. But no rush, we can also wait until after your vacation. Then let me post it after the vacation. I also need to look at a patch to toggle LRO using ethtool, btw. Interesting, we've already done something similar during XDP. The GUEST_TSO_XXX part may need some private flags I believe. Thanks
Re: [PATCH net V2] vhost-vsock: fix use after free
On 2018年09月28日 01:04, Michael S. Tsirkin wrote: On Thu, Sep 27, 2018 at 08:22:04PM +0800, Jason Wang wrote: The access of vsock is not protected by vhost_vsock_lock. This may lead to use after free since vhost_vsock_dev_release() may free the pointer at the same time. Fix this by holding the lock during the access. Reported-by:syzbot+e3e074963495f92a8...@syzkaller.appspotmail.com Fixes: 16320f363ae1 ("vhost-vsock: add pkt cancel capability") Fixes: 433fc58e6bf2 ("VSOCK: Introduce vhost_vsock.ko") Cc: Stefan Hajnoczi Signed-off-by: Jason Wang Wow is that really the best we can do? For net/stable, probably yes. A global lock on a data path operation? It's already there, and the patch only increases the critical section. Granted use after free is nasty but Stefan said he sees a way to fix it using a per socket refcount. He's on vacation until Oct 4 though ... Stefan has acked the patch, so I think it's ok? We can do optimization for -next on top. Thanks
[PATCHv3 bpf-next 00/12] Add socket lookup support
This series proposes a new helper for the BPF API which allows BPF programs to perform lookups for sockets in a network namespace. This would allow programs to determine early on in processing whether the stack is expecting to receive the packet, and perform some action (eg drop, forward somewhere) based on this information. The series is structured roughly into: * Misc refactor * Add the socket pointer type * Add reference tracking to ensure that socket references are freed * Extend the BPF API to add sk_lookup_xxx() / sk_release() functions * Add tests/documentation The helper proposed in this series includes a parameter for a tuple which must be filled in by the caller to determine the socket to look up. The simplest case would be filling with the contents of the packet, ie mapping the packet's 5-tuple into the parameter. In common cases, it may alternatively be useful to reverse the direction of the tuple and perform a lookup, to find the socket that initiates this connection; and if the BPF program ever performs a form of IP address translation, it may further be useful to be able to look up arbitrary tuples that are not based upon the packet, but instead based on state held in BPF maps or hardcoded in the BPF program. Currently, access into the socket's fields are limited to those which are otherwise already accessible, and are restricted to read-only access. Changes since v2: * New patch: "selftests/bpf: Generalize dummy program types". This enables adding verifier tests for socket lookup with tail calls. * Define the semantics of the new helpers more clearly in uAPI header. * Fix release of caller_net when netns is not specified. * Use skb->sk to find caller net when skb->dev is unavailable. * Fix build with !CONFIG_NET. * Replace ptr_id defensive coding when releasing reference state with an internal error (-EFAULT). * Remove flags argument to sk_release(). * Add several new assembly tests suggested by Daniel. * Add a few new C tests. 
* Fix typo in verifier error message. Changes since v1: * Limit netns_id field to 32 bits * Reuse reg_type_mismatch() in more places * Reduce the number of passes at convert_ctx_access() * Replace ptr_id defensive coding when releasing reference state with an internal error (-EFAULT) * Rework 'struct bpf_sock_tuple' to allow passing a packet pointer * Allow direct packet access from helper * Fix compile error with CONFIG_IPV6 enabled * Improve commit messages Changes since RFC: * Split up sk_lookup() into sk_lookup_tcp(), sk_lookup_udp(). * Only take references on the socket when necessary. * Make sk_release() only free the socket reference in this case. * Fix some runtime reference leaks: * Disallow BPF_LD_[ABS|IND] instructions while holding a reference. * Disallow bpf_tail_call() while holding a reference. * Prevent the same instruction being used for reference and other pointer type. * Simplify locating copies of a reference during helper calls by caching the pointer id from the caller. * Fix kbuild compilation warnings with particular configs. * Improve code comments describing the new verifier pieces. 
* Testing courtesy of Nitin This tree is also available at: https://github.com/joestringer/linux/commits/submit/sk-lookup-v3 Joe Stringer (12): bpf: Add iterator for spilled registers bpf: Simplify ptr_min_max_vals adjustment bpf: Generalize ptr_or_null regs check bpf: Add PTR_TO_SOCKET verifier type bpf: Macrofy stack state copy bpf: Add reference tracking to verifier bpf: Add helper to retrieve socket in BPF selftests/bpf: Generalize dummy program types selftests/bpf: Add tests for reference tracking libbpf: Support loading individual progs selftests/bpf: Add C tests for reference tracking Documentation: Describe bpf reference tracking Documentation/networking/filter.txt | 64 ++ include/linux/bpf.h | 34 + include/linux/bpf_verifier.h | 37 +- include/uapi/linux/bpf.h | 93 ++- kernel/bpf/verifier.c | 594 +--- net/core/filter.c | 181 - tools/include/uapi/linux/bpf.h| 93 ++- tools/lib/bpf/libbpf.c| 4 +- tools/lib/bpf/libbpf.h| 3 + tools/testing/selftests/bpf/Makefile | 2 +- tools/testing/selftests/bpf/bpf_helpers.h | 12 + tools/testing/selftests/bpf/test_progs.c | 38 + .../selftests/bpf/test_sk_lookup_kern.c | 180 + tools/testing/selftests/bpf/test_verifier.c | 670 +- 14 files changed, 1858 insertions(+), 147 deletions(-) create mode 100644 tools/testing/selftests/bpf/test_sk_lookup_kern.c -- 2.17.1
[PATCHv3 bpf-next 04/12] bpf: Add PTR_TO_SOCKET verifier type
Teach the verifier a little bit about a new type of pointer, a PTR_TO_SOCKET. This pointer type is accessed from BPF through the 'struct bpf_sock' structure. Signed-off-by: Joe Stringer --- v2: Reuse reg_type_mismatch() in more places Reduce the number of passes at convert_ctx_access() v3: Fix build with !CONFIG_NET --- include/linux/bpf.h | 34 ++ include/linux/bpf_verifier.h | 2 + kernel/bpf/verifier.c| 120 +++ net/core/filter.c| 30 + 4 files changed, 160 insertions(+), 26 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 018299a595c8..027697b6a22f 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -154,6 +154,7 @@ enum bpf_arg_type { ARG_PTR_TO_CTX, /* pointer to context */ ARG_ANYTHING, /* any (initialized) argument is ok */ + ARG_PTR_TO_SOCKET, /* pointer to bpf_sock */ }; /* type of values returned from helper functions */ @@ -162,6 +163,7 @@ enum bpf_return_type { RET_VOID, /* function doesn't return anything */ RET_PTR_TO_MAP_VALUE, /* returns a pointer to map elem value */ RET_PTR_TO_MAP_VALUE_OR_NULL, /* returns a pointer to map elem value or NULL */ + RET_PTR_TO_SOCKET_OR_NULL, /* returns a pointer to a socket or NULL */ }; /* eBPF function prototype used by verifier to allow BPF_CALLs from eBPF programs @@ -213,6 +215,8 @@ enum bpf_reg_type { PTR_TO_PACKET, /* reg points to skb->data */ PTR_TO_PACKET_END, /* skb->data + headlen */ PTR_TO_FLOW_KEYS,/* reg points to bpf_flow_keys */ + PTR_TO_SOCKET, /* reg points to struct bpf_sock */ + PTR_TO_SOCKET_OR_NULL, /* reg points to struct bpf_sock or NULL */ }; /* The information passed from prog-specific *_is_valid_access @@ -343,6 +347,11 @@ const struct bpf_func_proto *bpf_get_trace_printk_proto(void); typedef unsigned long (*bpf_ctx_copy_t)(void *dst, const void *src, unsigned long off, unsigned long len); +typedef u32 (*bpf_convert_ctx_access_t)(enum bpf_access_type type, + const struct bpf_insn *src, + struct bpf_insn *dst, + struct bpf_prog *prog, + u32 *target_size); 
u64 bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size, void *ctx, u64 ctx_size, bpf_ctx_copy_t ctx_copy); @@ -836,4 +845,29 @@ extern const struct bpf_func_proto bpf_get_local_storage_proto; void bpf_user_rnd_init_once(void); u64 bpf_user_rnd_u32(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5); +#if defined(CONFIG_NET) +bool bpf_sock_is_valid_access(int off, int size, enum bpf_access_type type, + struct bpf_insn_access_aux *info); +u32 bpf_sock_convert_ctx_access(enum bpf_access_type type, + const struct bpf_insn *si, + struct bpf_insn *insn_buf, + struct bpf_prog *prog, + u32 *target_size); +#else +static inline bool bpf_sock_is_valid_access(int off, int size, + enum bpf_access_type type, + struct bpf_insn_access_aux *info) +{ + return false; +} +static inline u32 bpf_sock_convert_ctx_access(enum bpf_access_type type, + const struct bpf_insn *si, + struct bpf_insn *insn_buf, + struct bpf_prog *prog, + u32 *target_size) +{ + return 0; +} +#endif + #endif /* _LINUX_BPF_H */ diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index af262b97f586..23a2b17bfd75 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -58,6 +58,8 @@ struct bpf_reg_state { * offset, so they can share range knowledge. * For PTR_TO_MAP_VALUE_OR_NULL this is used to share which map value we * came from, when one is tested for != NULL. +* For PTR_TO_SOCKET this is used to share which pointers retain the +* same reference to the socket, to determine proper reference freeing. */ u32 id; /* For scalar types (SCALAR_VALUE), this represents our knowledge of diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index bbb0a812ee81..d4abbf0d5727 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -80,8 +80,8 @@ static const struct bpf_verifier_ops * const bpf_verifier_ops[] = { * (like pointer plus pointer becomes SCALAR_VALUE type) * * When verifier sees load or store instructions the type of base registe
[PATCHv3 bpf-next 08/12] selftests/bpf: Generalize dummy program types
Don't hardcode the dummy program types to SOCKET_FILTER type, as this prevents testing bpf_tail_call in conjunction with other program types. Instead, use the program type specified in the test case. Signed-off-by: Joe Stringer --- tools/testing/selftests/bpf/test_verifier.c | 31 +++-- 1 file changed, 17 insertions(+), 14 deletions(-) diff --git a/tools/testing/selftests/bpf/test_verifier.c b/tools/testing/selftests/bpf/test_verifier.c index a90be44f61e0..020b1467e565 100644 --- a/tools/testing/selftests/bpf/test_verifier.c +++ b/tools/testing/selftests/bpf/test_verifier.c @@ -12652,18 +12652,18 @@ static int create_map(uint32_t type, uint32_t size_key, return fd; } -static int create_prog_dummy1(void) +static int create_prog_dummy1(enum bpf_map_type prog_type) { struct bpf_insn prog[] = { BPF_MOV64_IMM(BPF_REG_0, 42), BPF_EXIT_INSN(), }; - return bpf_load_program(BPF_PROG_TYPE_SOCKET_FILTER, prog, + return bpf_load_program(prog_type, prog, ARRAY_SIZE(prog), "GPL", 0, NULL, 0); } -static int create_prog_dummy2(int mfd, int idx) +static int create_prog_dummy2(enum bpf_map_type prog_type, int mfd, int idx) { struct bpf_insn prog[] = { BPF_MOV64_IMM(BPF_REG_3, idx), @@ -12674,11 +12674,12 @@ static int create_prog_dummy2(int mfd, int idx) BPF_EXIT_INSN(), }; - return bpf_load_program(BPF_PROG_TYPE_SOCKET_FILTER, prog, + return bpf_load_program(prog_type, prog, ARRAY_SIZE(prog), "GPL", 0, NULL, 0); } -static int create_prog_array(uint32_t max_elem, int p1key) +static int create_prog_array(enum bpf_map_type prog_type, uint32_t max_elem, +int p1key) { int p2key = 1; int mfd, p1fd, p2fd; @@ -12690,8 +12691,8 @@ static int create_prog_array(uint32_t max_elem, int p1key) return -1; } - p1fd = create_prog_dummy1(); - p2fd = create_prog_dummy2(mfd, p2key); + p1fd = create_prog_dummy1(prog_type); + p2fd = create_prog_dummy2(prog_type, mfd, p2key); if (p1fd < 0 || p2fd < 0) goto out; if (bpf_map_update_elem(mfd, &p1key, &p1fd, BPF_ANY) < 0) @@ -12748,8 +12749,8 @@ static int 
create_cgroup_storage(bool percpu) static char bpf_vlog[UINT_MAX >> 8]; -static void do_test_fixup(struct bpf_test *test, struct bpf_insn *prog, - int *map_fds) +static void do_test_fixup(struct bpf_test *test, enum bpf_map_type prog_type, + struct bpf_insn *prog, int *map_fds) { int *fixup_map1 = test->fixup_map1; int *fixup_map2 = test->fixup_map2; @@ -12805,7 +12806,7 @@ static void do_test_fixup(struct bpf_test *test, struct bpf_insn *prog, } if (*fixup_prog1) { - map_fds[4] = create_prog_array(4, 0); + map_fds[4] = create_prog_array(prog_type, 4, 0); do { prog[*fixup_prog1].imm = map_fds[4]; fixup_prog1++; @@ -12813,7 +12814,7 @@ static void do_test_fixup(struct bpf_test *test, struct bpf_insn *prog, } if (*fixup_prog2) { - map_fds[5] = create_prog_array(8, 7); + map_fds[5] = create_prog_array(prog_type, 8, 7); do { prog[*fixup_prog2].imm = map_fds[5]; fixup_prog2++; @@ -12859,11 +12860,13 @@ static void do_test_single(struct bpf_test *test, bool unpriv, for (i = 0; i < MAX_NR_MAPS; i++) map_fds[i] = -1; - do_test_fixup(test, prog, map_fds); + if (!prog_type) + prog_type = BPF_PROG_TYPE_SOCKET_FILTER; + do_test_fixup(test, prog_type, prog, map_fds); prog_len = probe_filter_length(prog); - fd_prog = bpf_verify_program(prog_type ? : BPF_PROG_TYPE_SOCKET_FILTER, -prog, prog_len, test->flags & F_LOAD_WITH_STRICT_ALIGNMENT, + fd_prog = bpf_verify_program(prog_type, prog, prog_len, +test->flags & F_LOAD_WITH_STRICT_ALIGNMENT, "GPL", 0, bpf_vlog, sizeof(bpf_vlog), 1); expected_ret = unpriv && test->result_unpriv != UNDEF ? -- 2.17.1
[PATCHv3 bpf-next 02/12] bpf: Simplify ptr_min_max_vals adjustment
An upcoming commit will add another two pointer types that need very similar behaviour, so generalise this function now. Signed-off-by: Joe Stringer Acked-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 22 ++--- tools/testing/selftests/bpf/test_verifier.c | 14 ++--- 2 files changed, 17 insertions(+), 19 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 18347de310ad..87b75efc1dc1 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -2669,20 +2669,18 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, return -EACCES; } - if (ptr_reg->type == PTR_TO_MAP_VALUE_OR_NULL) { - verbose(env, "R%d pointer arithmetic on PTR_TO_MAP_VALUE_OR_NULL prohibited, null-check it first\n", - dst); - return -EACCES; - } - if (ptr_reg->type == CONST_PTR_TO_MAP) { - verbose(env, "R%d pointer arithmetic on CONST_PTR_TO_MAP prohibited\n", - dst); + switch (ptr_reg->type) { + case PTR_TO_MAP_VALUE_OR_NULL: + verbose(env, "R%d pointer arithmetic on %s prohibited, null-check it first\n", + dst, reg_type_str[ptr_reg->type]); return -EACCES; - } - if (ptr_reg->type == PTR_TO_PACKET_END) { - verbose(env, "R%d pointer arithmetic on PTR_TO_PACKET_END prohibited\n", - dst); + case CONST_PTR_TO_MAP: + case PTR_TO_PACKET_END: + verbose(env, "R%d pointer arithmetic on %s prohibited\n", + dst, reg_type_str[ptr_reg->type]); return -EACCES; + default: + break; } /* In case of 'scalar += pointer', dst_reg inherits pointer type and id. 
diff --git a/tools/testing/selftests/bpf/test_verifier.c b/tools/testing/selftests/bpf/test_verifier.c index c7d25f23baf9..a90be44f61e0 100644 --- a/tools/testing/selftests/bpf/test_verifier.c +++ b/tools/testing/selftests/bpf/test_verifier.c @@ -3638,7 +3638,7 @@ static struct bpf_test tests[] = { BPF_MOV64_IMM(BPF_REG_0, 0), BPF_EXIT_INSN(), }, - .errstr = "R3 pointer arithmetic on PTR_TO_PACKET_END", + .errstr = "R3 pointer arithmetic on pkt_end", .result = REJECT, .prog_type = BPF_PROG_TYPE_SCHED_CLS, }, @@ -4896,7 +4896,7 @@ static struct bpf_test tests[] = { BPF_EXIT_INSN(), }, .fixup_map1 = { 4 }, - .errstr = "R4 pointer arithmetic on PTR_TO_MAP_VALUE_OR_NULL", + .errstr = "R4 pointer arithmetic on map_value_or_null", .result = REJECT, .prog_type = BPF_PROG_TYPE_SCHED_CLS }, @@ -4917,7 +4917,7 @@ static struct bpf_test tests[] = { BPF_EXIT_INSN(), }, .fixup_map1 = { 4 }, - .errstr = "R4 pointer arithmetic on PTR_TO_MAP_VALUE_OR_NULL", + .errstr = "R4 pointer arithmetic on map_value_or_null", .result = REJECT, .prog_type = BPF_PROG_TYPE_SCHED_CLS }, @@ -4938,7 +4938,7 @@ static struct bpf_test tests[] = { BPF_EXIT_INSN(), }, .fixup_map1 = { 4 }, - .errstr = "R4 pointer arithmetic on PTR_TO_MAP_VALUE_OR_NULL", + .errstr = "R4 pointer arithmetic on map_value_or_null", .result = REJECT, .prog_type = BPF_PROG_TYPE_SCHED_CLS }, @@ -7253,7 +7253,7 @@ static struct bpf_test tests[] = { BPF_EXIT_INSN(), }, .fixup_map_in_map = { 3 }, - .errstr = "R1 pointer arithmetic on CONST_PTR_TO_MAP prohibited", + .errstr = "R1 pointer arithmetic on map_ptr prohibited", .result = REJECT, }, { @@ -8927,7 +8927,7 @@ static struct bpf_test tests[] = { BPF_MOV64_IMM(BPF_REG_0, 0), BPF_EXIT_INSN(), }, - .errstr = "R3 pointer arithmetic on PTR_TO_PACKET_END", + .errstr = "R3 pointer arithmetic on pkt_end", .result = REJECT, .prog_type = BPF_PROG_TYPE_XDP, }, @@ -8946,7 +8946,7 @@ static struct bpf_test tests[] = { BPF_MOV64_IMM(BPF_REG_0, 0), BPF_EXIT_INSN(), }, - .errstr = "R3 pointer 
arithmetic on PTR_TO_PACKET_END", + .errstr = "R3 pointer arithmetic on pkt_end", .result = REJECT, .prog_type = BPF_PROG_TYPE_XDP, }, -- 2.17.1
[PATCHv3 bpf-next 05/12] bpf: Macrofy stack state copy
An upcoming commit will need very similar copy/realloc boilerplate, so refactor the existing stack copy/realloc functions into macros to simplify it. Signed-off-by: Joe Stringer Acked-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 106 -- 1 file changed, 60 insertions(+), 46 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index d4abbf0d5727..cf8704d137fa 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -388,60 +388,74 @@ static void print_verifier_state(struct bpf_verifier_env *env, verbose(env, "\n"); } -static int copy_stack_state(struct bpf_func_state *dst, - const struct bpf_func_state *src) -{ - if (!src->stack) - return 0; - if (WARN_ON_ONCE(dst->allocated_stack < src->allocated_stack)) { - /* internal bug, make state invalid to reject the program */ - memset(dst, 0, sizeof(*dst)); - return -EFAULT; - } - memcpy(dst->stack, src->stack, - sizeof(*src->stack) * (src->allocated_stack / BPF_REG_SIZE)); - return 0; -} +#define COPY_STATE_FN(NAME, COUNT, FIELD, SIZE) \ +static int copy_##NAME##_state(struct bpf_func_state *dst, \ + const struct bpf_func_state *src)\ +{ \ + if (!src->FIELD)\ + return 0; \ + if (WARN_ON_ONCE(dst->COUNT < src->COUNT)) {\ + /* internal bug, make state invalid to reject the program */ \ + memset(dst, 0, sizeof(*dst)); \ + return -EFAULT; \ + } \ + memcpy(dst->FIELD, src->FIELD, \ + sizeof(*src->FIELD) * (src->COUNT / SIZE)); \ + return 0; \ +} +/* copy_stack_state() */ +COPY_STATE_FN(stack, allocated_stack, stack, BPF_REG_SIZE) +#undef COPY_STATE_FN + +#define REALLOC_STATE_FN(NAME, COUNT, FIELD, SIZE) \ +static int realloc_##NAME##_state(struct bpf_func_state *state, int size, \ + bool copy_old)\ +{ \ + u32 old_size = state->COUNT;\ + struct bpf_##NAME##_state *new_##FIELD; \ + int slot = size / SIZE; \ + \ + if (size <= old_size || !size) {\ + if (copy_old) \ + return 0; \ + state->COUNT = slot * SIZE; \ + if (!size && old_size) {\ + kfree(state->FIELD);\ + state->FIELD = NULL;\ + } \ + 
return 0; \ + } \ + new_##FIELD = kmalloc_array(slot, sizeof(struct bpf_##NAME##_state), \ + GFP_KERNEL);\ + if (!new_##FIELD) \ + return -ENOMEM; \ + if (copy_old) { \ + if (state->FIELD) \ + memcpy(new_##FIELD, state->FIELD, \ + sizeof(*new_##FIELD) * (old_size / SIZE)); \ + memset(new_##FIELD + old_size / SIZE, 0,\ + sizeof(*new_##FIELD) * (size - old_size) / SIZE); \ + } \ + state->COUNT = slot * SIZE; \ + kfree(state->FIELD);\ + state->FIELD = new_##FIELD; \ + return 0; \ +} +/* realloc_stack_state() */ +REALLOC_STATE_FN(stack, allocated_stack, stack, BPF_REG_SIZE) +#undef REALLOC_STATE_FN /* do_check() starts with zero-sized stack in struct bpf_verifier_state to * make it consume minim
[PATCHv3 bpf-next 01/12] bpf: Add iterator for spilled registers
Add this iterator for spilled registers, it concentrates the details of how to get the current frame's spilled registers into a single macro while clarifying the intention of the code which is calling the macro. Signed-off-by: Joe Stringer Acked-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 11 +++ kernel/bpf/verifier.c| 16 +++- 2 files changed, 18 insertions(+), 9 deletions(-) diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index b42b60a83e19..af262b97f586 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -131,6 +131,17 @@ struct bpf_verifier_state { u32 curframe; }; +#define __get_spilled_reg(slot, frame) \ + (((slot < frame->allocated_stack / BPF_REG_SIZE) && \ + (frame->stack[slot].slot_type[0] == STACK_SPILL)) \ +? &frame->stack[slot].spilled_ptr : NULL) + +/* Iterate over 'frame', setting 'reg' to either NULL or a spilled register. */ +#define for_each_spilled_reg(iter, frame, reg) \ + for (iter = 0, reg = __get_spilled_reg(iter, frame);\ +iter < frame->allocated_stack / BPF_REG_SIZE; \ +iter++, reg = __get_spilled_reg(iter, frame)) + /* linked list of verifier states used to prune search */ struct bpf_verifier_state_list { struct bpf_verifier_state state; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index a8cc83a970d1..18347de310ad 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -2252,10 +2252,9 @@ static void __clear_all_pkt_pointers(struct bpf_verifier_env *env, if (reg_is_pkt_pointer_any(®s[i])) mark_reg_unknown(env, regs, i); - for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) { - if (state->stack[i].slot_type[0] != STACK_SPILL) + for_each_spilled_reg(i, state, reg) { + if (!reg) continue; - reg = &state->stack[i].spilled_ptr; if (reg_is_pkt_pointer_any(reg)) __mark_reg_unknown(reg); } @@ -3395,10 +3394,9 @@ static void find_good_pkt_pointers(struct bpf_verifier_state *vstate, for (j = 0; j <= vstate->curframe; j++) { state = vstate->frame[j]; - 
for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) { - if (state->stack[i].slot_type[0] != STACK_SPILL) + for_each_spilled_reg(i, state, reg) { + if (!reg) continue; - reg = &state->stack[i].spilled_ptr; if (reg->type == type && reg->id == dst_reg->id) reg->range = max(reg->range, new_range); } @@ -3643,7 +3641,7 @@ static void mark_map_regs(struct bpf_verifier_state *vstate, u32 regno, bool is_null) { struct bpf_func_state *state = vstate->frame[vstate->curframe]; - struct bpf_reg_state *regs = state->regs; + struct bpf_reg_state *reg, *regs = state->regs; u32 id = regs[regno].id; int i, j; @@ -3652,8 +3650,8 @@ static void mark_map_regs(struct bpf_verifier_state *vstate, u32 regno, for (j = 0; j <= vstate->curframe; j++) { state = vstate->frame[j]; - for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) { - if (state->stack[i].slot_type[0] != STACK_SPILL) + for_each_spilled_reg(i, state, reg) { + if (!reg) continue; mark_map_reg(&state->stack[i].spilled_ptr, 0, id, is_null); } -- 2.17.1
[PATCHv3 bpf-next 11/12] selftests/bpf: Add C tests for reference tracking
Add some tests that demonstrate and test the balanced lookup/free nature of socket lookup. Section names that start with "fail" represent programs that are expected to fail verification; all others should succeed. Signed-off-by: Joe Stringer Acked-by: Alexei Starovoitov --- v3: Rebase against flags arg change of bpf_sk_release() New tests: * "fail_use_after_free" * "fail_modify_sk_pointer" * "fail_modify_sk_or_null_pointer" --- tools/testing/selftests/bpf/Makefile | 2 +- tools/testing/selftests/bpf/test_progs.c | 38 .../selftests/bpf/test_sk_lookup_kern.c | 180 ++ 3 files changed, 219 insertions(+), 1 deletion(-) create mode 100644 tools/testing/selftests/bpf/test_sk_lookup_kern.c diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index f802de526f57..1381ab81099c 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -36,7 +36,7 @@ TEST_GEN_FILES = test_pkt_access.o test_xdp.o test_l4lb.o test_tcp_estats.o test test_get_stack_rawtp.o test_sockmap_kern.o test_sockhash_kern.o \ test_lwt_seg6local.o sendmsg4_prog.o sendmsg6_prog.o test_lirc_mode2_kern.o \ get_cgroup_id_kern.o socket_cookie_prog.o test_select_reuseport_kern.o \ - test_skb_cgroup_id_kern.o bpf_flow.o netcnt_prog.o + test_skb_cgroup_id_kern.o bpf_flow.o netcnt_prog.o test_sk_lookup_kern.o # Order correspond to 'make run_tests' order TEST_PROGS := test_kmod.sh \ diff --git a/tools/testing/selftests/bpf/test_progs.c b/tools/testing/selftests/bpf/test_progs.c index 63a671803ed6..e8becca9c521 100644 --- a/tools/testing/selftests/bpf/test_progs.c +++ b/tools/testing/selftests/bpf/test_progs.c @@ -1698,6 +1698,43 @@ static void test_task_fd_query_tp(void) "sys_enter_read"); } +static void test_reference_tracking() +{ + const char *file = "./test_sk_lookup_kern.o"; + struct bpf_object *obj; + struct bpf_program *prog; + __u32 duration; + int err = 0; + + obj = bpf_object__open(file); + if (IS_ERR(obj)) { + error_cnt++; + return; + } 
+ + bpf_object__for_each_program(prog, obj) { + const char *title; + + /* Ignore .text sections */ + title = bpf_program__title(prog, false); + if (strstr(title, ".text") != NULL) + continue; + + bpf_program__set_type(prog, BPF_PROG_TYPE_SCHED_CLS); + + /* Expect verifier failure if test name has 'fail' */ + if (strstr(title, "fail") != NULL) { + libbpf_set_print(NULL, NULL, NULL); + err = !bpf_program__load(prog, "GPL", 0); + libbpf_set_print(printf, printf, NULL); + } else { + err = bpf_program__load(prog, "GPL", 0); + } + CHECK(err, title, "\n"); + } + bpf_object__close(obj); +} + int main(void) { jit_enabled = is_jit_enabled(); @@ -1719,6 +1756,7 @@ int main(void) test_get_stack_raw_tp(); test_task_fd_query_rawtp(); test_task_fd_query_tp(); + test_reference_tracking(); printf("Summary: %d PASSED, %d FAILED\n", pass_cnt, error_cnt); return error_cnt ? EXIT_FAILURE : EXIT_SUCCESS; diff --git a/tools/testing/selftests/bpf/test_sk_lookup_kern.c b/tools/testing/selftests/bpf/test_sk_lookup_kern.c new file mode 100644 index ..b745bdc08c2b --- /dev/null +++ b/tools/testing/selftests/bpf/test_sk_lookup_kern.c @@ -0,0 +1,180 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +// Copyright (c) 2018 Covalent IO, Inc. http://covalent.io + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "bpf_helpers.h" +#include "bpf_endian.h" + +int _version SEC("version") = 1; +char _license[] SEC("license") = "GPL"; + +/* Fill 'tuple' with L3 info, and attempt to find L4. On fail, return NULL. 
*/ +static struct bpf_sock_tuple *get_tuple(void *data, __u64 nh_off, + void *data_end, __u16 eth_proto, + bool *ipv4) +{ + struct bpf_sock_tuple *result; + __u8 proto = 0; + __u64 ihl_len; + + if (eth_proto == bpf_htons(ETH_P_IP)) { + struct iphdr *iph = (struct iphdr *)(data + nh_off); + + if (iph + 1 > data_end) + return NULL; + ihl_len = iph->ihl * 4; + proto = iph->protocol; + *ipv4 = true; + result = (struct bpf_sock_tuple *)&iph->saddr; + } else if (eth_proto == bpf_htons(ETH_P_IPV6)) { + struct ipv6hdr *ip6h = (struct ipv6hdr *)(data + nh_off); + + if (ip6h + 1 > data_end) + return NULL
[PATCHv3 bpf-next 12/12] Documentation: Describe bpf reference tracking
Document the new pointer types in the verifier and how the pointer ID tracking works to ensure that references which are taken are later released. Signed-off-by: Joe Stringer Acked-by: Alexei Starovoitov --- Documentation/networking/filter.txt | 64 + 1 file changed, 64 insertions(+) diff --git a/Documentation/networking/filter.txt b/Documentation/networking/filter.txt index e6b4ebb2b243..4443ce958862 100644 --- a/Documentation/networking/filter.txt +++ b/Documentation/networking/filter.txt @@ -1125,6 +1125,14 @@ pointer type. The types of pointers describe their base, as follows: PTR_TO_STACKFrame pointer. PTR_TO_PACKET skb->data. PTR_TO_PACKET_END skb->data + headlen; arithmetic forbidden. +PTR_TO_SOCKET Pointer to struct bpf_sock_ops, implicitly refcounted. +PTR_TO_SOCKET_OR_NULL +Either a pointer to a socket, or NULL; socket lookup +returns this type, which becomes a PTR_TO_SOCKET when +checked != NULL. PTR_TO_SOCKET is reference-counted, +so programs must release the reference through the +socket release function before the end of the program. +Arithmetic on these pointers is forbidden. However, a pointer may be offset from this base (as a result of pointer arithmetic), and this is tracked in two parts: the 'fixed offset' and 'variable offset'. The former is used when an exactly-known value (e.g. an immediate @@ -1171,6 +1179,13 @@ over the Ethernet header, then reads IHL and addes (IHL * 4), the resulting pointer will have a variable offset known to be 4n+2 for some n, so adding the 2 bytes (NET_IP_ALIGN) gives a 4-byte alignment and so word-sized accesses through that pointer are safe. +The 'id' field is also used on PTR_TO_SOCKET and PTR_TO_SOCKET_OR_NULL, common +to all copies of the pointer returned from a socket lookup. This has similar +behaviour to the handling for PTR_TO_MAP_VALUE_OR_NULL->PTR_TO_MAP_VALUE, but +it also handles reference tracking for the pointer. PTR_TO_SOCKET implicitly +represents a reference to the corresponding 'struct sock'. 
To ensure that the +reference is not leaked, it is imperative to NULL-check the reference and, in +the non-NULL case, pass the valid reference to the socket release function. Direct packet access @@ -1444,6 +1459,55 @@ Error: 8: (7a) *(u64 *)(r0 +0) = 1 R0 invalid mem access 'imm' +Program that performs a socket lookup then sets the pointer to NULL without +checking it: +value: + BPF_MOV64_IMM(BPF_REG_2, 0), + BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_2, -8), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_MOV64_IMM(BPF_REG_3, 4), + BPF_MOV64_IMM(BPF_REG_4, 0), + BPF_MOV64_IMM(BPF_REG_5, 0), + BPF_EMIT_CALL(BPF_FUNC_sk_lookup_tcp), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), +Error: + 0: (b7) r2 = 0 + 1: (63) *(u32 *)(r10 -8) = r2 + 2: (bf) r2 = r10 + 3: (07) r2 += -8 + 4: (b7) r3 = 4 + 5: (b7) r4 = 0 + 6: (b7) r5 = 0 + 7: (85) call bpf_sk_lookup_tcp#65 + 8: (b7) r0 = 0 + 9: (95) exit + Unreleased reference id=1, alloc_insn=7 + +Program that performs a socket lookup but does not NULL-check the returned +value: + BPF_MOV64_IMM(BPF_REG_2, 0), + BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_2, -8), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_MOV64_IMM(BPF_REG_3, 4), + BPF_MOV64_IMM(BPF_REG_4, 0), + BPF_MOV64_IMM(BPF_REG_5, 0), + BPF_EMIT_CALL(BPF_FUNC_sk_lookup_tcp), + BPF_EXIT_INSN(), +Error: + 0: (b7) r2 = 0 + 1: (63) *(u32 *)(r10 -8) = r2 + 2: (bf) r2 = r10 + 3: (07) r2 += -8 + 4: (b7) r3 = 4 + 5: (b7) r4 = 0 + 6: (b7) r5 = 0 + 7: (85) call bpf_sk_lookup_tcp#65 + 8: (95) exit + Unreleased reference id=1, alloc_insn=7 + Testing --- -- 2.17.1
[PATCHv3 bpf-next 07/12] bpf: Add helper to retrieve socket in BPF
This patch adds new BPF helper functions, bpf_sk_lookup_tcp() and bpf_sk_lookup_udp() which allows BPF programs to find out if there is a socket listening on this host, and returns a socket pointer which the BPF program can then access to determine, for instance, whether to forward or drop traffic. bpf_sk_lookup_xxx() may take a reference on the socket, so when a BPF program makes use of this function, it must subsequently pass the returned pointer into the newly added sk_release() to return the reference. By way of example, the following pseudocode would filter inbound connections at XDP if there is no corresponding service listening for the traffic: struct bpf_sock_tuple tuple; struct bpf_sock_ops *sk; populate_tuple(ctx, &tuple); // Extract the 5tuple from the packet sk = bpf_sk_lookup_tcp(ctx, &tuple, sizeof tuple, netns, 0); if (!sk) { // Couldn't find a socket listening for this traffic. Drop. return TC_ACT_SHOT; } bpf_sk_release(sk); return TC_ACT_OK; Signed-off-by: Joe Stringer --- v2: Rework 'struct bpf_sock_tuple' to allow passing a packet pointer Limit netns_id field to 32 bits Fix compile error with CONFIG_IPV6 enabled Allow direct packet access from helper v3: Fix release of caller_net when netns is not specified. Use skb->sk to find caller net when skb->dev is unavailable. Remove flags argument to sk_release() Define the semantics of the new helpers more clearly. --- include/uapi/linux/bpf.h | 93 - kernel/bpf/verifier.c | 8 +- net/core/filter.c | 151 ++ tools/include/uapi/linux/bpf.h| 93 - tools/testing/selftests/bpf/bpf_helpers.h | 12 ++ 5 files changed, 354 insertions(+), 3 deletions(-) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index e2070d819e04..f9187b41dff6 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -2144,6 +2144,77 @@ union bpf_attr { * request in the skb. * Return * 0 on success, or a negative error in case of failure. 
+ * + * struct bpf_sock *bpf_sk_lookup_tcp(void *ctx, struct bpf_sock_tuple *tuple, u32 tuple_size, u32 netns, u64 flags) + * Description + * Look for TCP socket matching *tuple*, optionally in a child + * network namespace *netns*. The return value must be checked, + * and if non-NULL, released via **bpf_sk_release**\ (). + * + * The *ctx* should point to the context of the program, such as + * the skb or socket (depending on the hook in use). This is used + * to determine the base network namespace for the lookup. + * + * *tuple_size* must be one of: + * + * **sizeof**\ (*tuple*\ **->ipv4**) + * Look for an IPv4 socket. + * **sizeof**\ (*tuple*\ **->ipv6**) + * Look for an IPv6 socket. + * + * If the *netns* is zero, then the socket lookup table in the + * netns associated with the *ctx* will be used. For the TC hooks, + * this is the netns of the device in the skb. For socket hooks, + * this is the netns of the socket. If *netns* is non-zero, then + * it specifies the ID of the netns relative to the netns + * associated with the *ctx*. + * + * All values for *flags* are reserved for future usage, and must + * be left at zero. + * + * This helper is available only if the kernel was compiled with + * **CONFIG_NET** configuration option. + * Return + * Pointer to *struct bpf_sock*, or NULL in case of failure. + * + * struct bpf_sock *bpf_sk_lookup_udp(void *ctx, struct bpf_sock_tuple *tuple, u32 tuple_size, u32 netns, u64 flags) + * Description + * Look for UDP socket matching *tuple*, optionally in a child + * network namespace *netns*. The return value must be checked, + * and if non-NULL, released via **bpf_sk_release**\ (). + * + * The *ctx* should point to the context of the program, such as + * the skb or socket (depending on the hook in use). This is used + * to determine the base network namespace for the lookup. + * + * *tuple_size* must be one of: + * + * **sizeof**\ (*tuple*\ **->ipv4**) + * Look for an IPv4 socket. 
+ * **sizeof**\ (*tuple*\ **->ipv6**) + * Look for an IPv6 socket. + * + * If the *netns* is zero, then the socket lookup table in the + * netns associated with the *ctx* will be used. For the TC hooks, + * this is the netns of the device in the skb. For socket hooks, + * this is the netns of the socket. If *netns* is non-zero, then + * it specifies the ID of the netns relative to the netns +
[PATCHv3 bpf-next 03/12] bpf: Generalize ptr_or_null regs check
This check will be reused by an upcoming commit for conditional jump checks for sockets. Refactor it a bit to simplify the later commit. Signed-off-by: Joe Stringer Acked-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 43 +-- 1 file changed, 25 insertions(+), 18 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 87b75efc1dc1..bbb0a812ee81 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -249,6 +249,11 @@ static bool type_is_pkt_pointer(enum bpf_reg_type type) type == PTR_TO_PACKET_META; } +static bool reg_type_may_be_null(enum bpf_reg_type type) +{ + return type == PTR_TO_MAP_VALUE_OR_NULL; +} + /* string representation of 'enum bpf_reg_type' */ static const char * const reg_type_str[] = { [NOT_INIT] = "?", @@ -3600,12 +3605,10 @@ static void reg_combine_min_max(struct bpf_reg_state *true_src, } } -static void mark_map_reg(struct bpf_reg_state *regs, u32 regno, u32 id, -bool is_null) +static void mark_ptr_or_null_reg(struct bpf_reg_state *reg, u32 id, +bool is_null) { - struct bpf_reg_state *reg = &regs[regno]; - - if (reg->type == PTR_TO_MAP_VALUE_OR_NULL && reg->id == id) { + if (reg_type_may_be_null(reg->type) && reg->id == id) { /* Old offset (both fixed and variable parts) should * have been known-zero, because we don't allow pointer * arithmetic on pointers that might be NULL. 
@@ -3618,11 +3621,13 @@ static void mark_map_reg(struct bpf_reg_state *regs, u32 regno, u32 id, } if (is_null) { reg->type = SCALAR_VALUE; - } else if (reg->map_ptr->inner_map_meta) { - reg->type = CONST_PTR_TO_MAP; - reg->map_ptr = reg->map_ptr->inner_map_meta; - } else { - reg->type = PTR_TO_MAP_VALUE; + } else if (reg->type == PTR_TO_MAP_VALUE_OR_NULL) { + if (reg->map_ptr->inner_map_meta) { + reg->type = CONST_PTR_TO_MAP; + reg->map_ptr = reg->map_ptr->inner_map_meta; + } else { + reg->type = PTR_TO_MAP_VALUE; + } } /* We don't need id from this point onwards anymore, thus we * should better reset it, so that state pruning has chances @@ -3635,8 +3640,8 @@ static void mark_map_reg(struct bpf_reg_state *regs, u32 regno, u32 id, /* The logic is similar to find_good_pkt_pointers(), both could eventually * be folded together at some point. */ -static void mark_map_regs(struct bpf_verifier_state *vstate, u32 regno, - bool is_null) +static void mark_ptr_or_null_regs(struct bpf_verifier_state *vstate, u32 regno, + bool is_null) { struct bpf_func_state *state = vstate->frame[vstate->curframe]; struct bpf_reg_state *reg, *regs = state->regs; @@ -3644,14 +3649,14 @@ static void mark_map_regs(struct bpf_verifier_state *vstate, u32 regno, int i, j; for (i = 0; i < MAX_BPF_REG; i++) - mark_map_reg(regs, i, id, is_null); + mark_ptr_or_null_reg(&regs[i], id, is_null); for (j = 0; j <= vstate->curframe; j++) { state = vstate->frame[j]; for_each_spilled_reg(i, state, reg) { if (!reg) continue; - mark_map_reg(&state->stack[i].spilled_ptr, 0, id, is_null); + mark_ptr_or_null_reg(reg, id, is_null); } } } @@ -3853,12 +3858,14 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, /* detect if R == 0 where R is returned from bpf_map_lookup_elem() */ if (BPF_SRC(insn->code) == BPF_K && insn->imm == 0 && (opcode == BPF_JEQ || opcode == BPF_JNE) && - dst_reg->type == PTR_TO_MAP_VALUE_OR_NULL) { - /* Mark all identical map registers in each branch as either + 
reg_type_may_be_null(dst_reg->type)) { + /* Mark all identical registers in each branch as either * safe or unknown depending R == 0 or R != 0 conditional. */ - mark_map_regs(this_branch, insn->dst_reg, opcode == BPF_JNE); - mark_map_regs(other_branch, insn->dst_reg, opcode == BPF_JEQ); + mark_ptr_or_null_regs(this_branch, insn->dst_reg, + opcode == BPF_JNE); + mark_ptr_or_null_regs(other_branch, insn->dst_reg, + opcode == BPF_JEQ); } else if (!try_match_pkt_pointers(insn, dst_reg, &regs[insn->src_reg],
[PATCHv3 bpf-next 09/12] selftests/bpf: Add tests for reference tracking
reference tracking: leak potential reference reference tracking: leak potential reference on stack reference tracking: leak potential reference on stack 2 reference tracking: zero potential reference reference tracking: copy and zero potential references reference tracking: release reference without check reference tracking: release reference reference tracking: release reference twice reference tracking: release reference twice inside branch reference tracking: alloc, check, free in one subbranch reference tracking: alloc, check, free in both subbranches reference tracking in call: free reference in subprog reference tracking in call: free reference in subprog and outside reference tracking in call: alloc & leak reference in subprog reference tracking in call: alloc in subprog, release outside reference tracking in call: sk_ptr leak into caller stack reference tracking in call: sk_ptr spill into caller stack reference tracking: allow LD_ABS reference tracking: forbid LD_ABS while holding reference reference tracking: allow LD_IND reference tracking: forbid LD_IND while holding reference reference tracking: check reference or tail call reference tracking: release reference then tail call reference tracking: leak possible reference over tail call reference tracking: leak checked reference over tail call reference tracking: mangle and release sock_or_null reference tracking: mangle and release sock reference tracking: access member reference tracking: write to member reference tracking: invalid 64-bit access of member reference tracking: access after release reference tracking: direct access for lookup Signed-off-by: Joe Stringer --- v3: Rebase against bpf_sk_release() flags argument removal. 
Removed Alexei's ack since there are many new tests: * "reference tracking: allow LD_ABS", * "reference tracking: forbid LD_ABS while holding reference", * "reference tracking: allow LD_IND", * "reference tracking: forbid LD_IND while holding reference", * "reference tracking: check reference or tail call", * "reference tracking: release reference then tail call", * "reference tracking: leak possible reference over tail call", * "reference tracking: leak checked reference over tail call", * "reference tracking: mangle and release sock_or_null", * "reference tracking: mangle and release sock", * "reference tracking: access member", * "reference tracking: write to member", * "reference tracking: invalid 64-bit access of member", * "reference tracking: access after release", * "reference tracking: direct access for lookup", --- tools/testing/selftests/bpf/test_verifier.c | 625 1 file changed, 625 insertions(+) diff --git a/tools/testing/selftests/bpf/test_verifier.c b/tools/testing/selftests/bpf/test_verifier.c index 020b1467e565..9fad54b0bbd0 100644 --- a/tools/testing/selftests/bpf/test_verifier.c +++ b/tools/testing/selftests/bpf/test_verifier.c @@ -3,6 +3,7 @@ * * Copyright (c) 2014 PLUMgrid, http://plumgrid.com * Copyright (c) 2017 Facebook + * Copyright (c) 2018 Covalent IO, Inc. 
http://covalent.io * * This program is free software; you can redistribute it and/or * modify it under the terms of version 2 of the GNU General Public @@ -178,6 +179,24 @@ static void bpf_fill_rand_ld_dw(struct bpf_test *self) self->retval = (uint32_t)res; } +/* BPF_SK_LOOKUP contains 13 instructions, if you need to fix up maps */ +#define BPF_SK_LOOKUP \ + /* struct bpf_sock_tuple tuple = {} */ \ + BPF_MOV64_IMM(BPF_REG_2, 0),\ + BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_2, -8), \ + BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_2, -16),\ + BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_2, -24),\ + BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_2, -32),\ + BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_2, -40),\ + BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_2, -48),\ + /* sk = sk_lookup_tcp(ctx, &tuple, sizeof tuple, 0, 0) */ \ + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), \ + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -48), \ + BPF_MOV64_IMM(BPF_REG_3, sizeof(struct bpf_sock_tuple)),\ + BPF_MOV64_IMM(BPF_REG_4, 0),\ + BPF_MOV64_IMM(BPF_REG_5, 0),\ + BPF_EMIT_CALL(BPF_FUNC_sk_lookup_tcp) + static struct bpf_test tests[] = { { "add+sub+mul", @@ -12557,6 +12576,214 @@ static struct bpf_test tests[] = { .prog_type = BPF_PROG_TYPE_SCHED_CLS, .result = ACCEPT, }, + { + "reference tracking: leak potential reference", + .insns = { + BPF_SK
[PATCHv3 bpf-next 06/12] bpf: Add reference tracking to verifier
Allow helper functions to acquire a reference and return it into a register. Specific pointer types such as the PTR_TO_SOCKET will implicitly represent such a reference. The verifier must ensure that these references are released exactly once in each path through the program. To achieve this, this commit assigns an id to the pointer and tracks it in the 'bpf_func_state', then when the function or program exits, verifies that all of the acquired references have been freed. When the pointer is passed to a function that frees the reference, it is removed from the 'bpf_func_state` and all existing copies of the pointer in registers are marked invalid. Signed-off-by: Joe Stringer Acked-by: Alexei Starovoitov --- v2: Replace ptr_id defensive coding when releasing reference state with an internal error (-EFAULT) v3: No changes. --- include/linux/bpf_verifier.h | 24 ++- kernel/bpf/verifier.c| 303 --- 2 files changed, 306 insertions(+), 21 deletions(-) diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 23a2b17bfd75..23f222e0cb0b 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -104,6 +104,17 @@ struct bpf_stack_state { u8 slot_type[BPF_REG_SIZE]; }; +struct bpf_reference_state { + /* Track each reference created with a unique id, even if the same +* instruction creates the reference multiple times (eg, via CALL). +*/ + int id; + /* Instruction where the allocation of this reference occurred. This +* is used purely to inform the user of a reference leak. +*/ + int insn_idx; +}; + /* state of the program: * type of all registers and stack info */ @@ -121,7 +132,9 @@ struct bpf_func_state { */ u32 subprogno; - /* should be second to last. See copy_func_state() */ + /* The following fields should be last. 
See copy_func_state() */ + int acquired_refs; + struct bpf_reference_state *refs; int allocated_stack; struct bpf_stack_state *stack; }; @@ -217,11 +230,16 @@ __printf(2, 0) void bpf_verifier_vlog(struct bpf_verifier_log *log, __printf(2, 3) void bpf_verifier_log_write(struct bpf_verifier_env *env, const char *fmt, ...); -static inline struct bpf_reg_state *cur_regs(struct bpf_verifier_env *env) +static inline struct bpf_func_state *cur_func(struct bpf_verifier_env *env) { struct bpf_verifier_state *cur = env->cur_state; - return cur->frame[cur->curframe]->regs; + return cur->frame[cur->curframe]; +} + +static inline struct bpf_reg_state *cur_regs(struct bpf_verifier_env *env) +{ + return cur_func(env)->regs; } int bpf_prog_offload_verifier_prep(struct bpf_verifier_env *env); diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index cf8704d137fa..dcc5e8cab537 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1,5 +1,6 @@ /* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com * Copyright (c) 2016 Facebook + * Copyright (c) 2018 Covalent IO, Inc. http://covalent.io * * This program is free software; you can redistribute it and/or * modify it under the terms of version 2 of the GNU General Public @@ -140,6 +141,18 @@ static const struct bpf_verifier_ops * const bpf_verifier_ops[] = { * * After the call R0 is set to return type of the function and registers R1-R5 * are set to NOT_INIT to indicate that they are no longer readable. + * + * The following reference types represent a potential reference to a kernel + * resource which, after first being allocated, must be checked and freed by + * the BPF program: + * - PTR_TO_SOCKET_OR_NULL, PTR_TO_SOCKET + * + * When the verifier sees a helper call return a reference type, it allocates a + * pointer id for the reference and stores it in the current function state. 
+ * Similar to the way that PTR_TO_MAP_VALUE_OR_NULL is converted into + * PTR_TO_MAP_VALUE, PTR_TO_SOCKET_OR_NULL becomes PTR_TO_SOCKET when the type + * passes through a NULL-check conditional. For the branch wherein the state is + * changed to CONST_IMM, the verifier releases the reference. */ /* verifier_state + insn_idx are pushed to stack when branch is encountered */ @@ -189,6 +202,7 @@ struct bpf_call_arg_meta { int access_size; s64 msize_smax_value; u64 msize_umax_value; + int ptr_id; }; static DEFINE_MUTEX(bpf_verifier_lock); @@ -251,7 +265,42 @@ static bool type_is_pkt_pointer(enum bpf_reg_type type) static bool reg_type_may_be_null(enum bpf_reg_type type) { - return type == PTR_TO_MAP_VALUE_OR_NULL; + return type == PTR_TO_MAP_VALUE_OR_NULL || + type == PTR_TO_SOCKET_OR_NULL; +} + +static bool type_is_refcounted(enum bpf_reg_type type) +{ + return type == PTR_TO_SOCKET; +} + +static bool type_is_refcounted_or_null(enum bpf_reg_type type) +{ + return type == PTR_TO_SOC
[PATCHv3 bpf-next 10/12] libbpf: Support loading individual progs
Allow the individual program load to be invoked. This will help with testing, where a single ELF may contain several sections, some of which denote subprograms that are expected to fail verification, along with some which are expected to pass verification. By allowing programs to be iterated and individually loaded, each program can be independently checked against its expected verification result. Signed-off-by: Joe Stringer Acked-by: Alexei Starovoitov --- tools/lib/bpf/libbpf.c | 4 ++-- tools/lib/bpf/libbpf.h | 3 +++ 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 425d5ca45c97..9e68fd9fcfca 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -228,7 +228,7 @@ struct bpf_object { }; #define obj_elf_valid(o) ((o)->efile.elf) -static void bpf_program__unload(struct bpf_program *prog) +void bpf_program__unload(struct bpf_program *prog) { int i; @@ -1375,7 +1375,7 @@ load_program(enum bpf_prog_type type, enum bpf_attach_type expected_attach_type, return ret; } -static int +int bpf_program__load(struct bpf_program *prog, char *license, u32 kern_version) { diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h index 511c1294dcbf..2ed24d3f80b3 100644 --- a/tools/lib/bpf/libbpf.h +++ b/tools/lib/bpf/libbpf.h @@ -128,10 +128,13 @@ void bpf_program__set_ifindex(struct bpf_program *prog, __u32 ifindex); const char *bpf_program__title(struct bpf_program *prog, bool needs_copy); +int bpf_program__load(struct bpf_program *prog, char *license, + u32 kern_version); int bpf_program__fd(struct bpf_program *prog); int bpf_program__pin_instance(struct bpf_program *prog, const char *path, int instance); int bpf_program__pin(struct bpf_program *prog, const char *path); +void bpf_program__unload(struct bpf_program *prog); struct bpf_insn; -- 2.17.1
Re: KMSAN: uninit-value in __dev_mc_add
Hello, Eric, all, > I dunno, your patch looks quite not the right fix. I agree, it looks more like a dirty hack. Unfortunately, I lack the deep expertise in the network stack subsystem, so I've posted the patch to, sort of, start a discussion and probably get some hints. > If TUN is able to change dev->type, how comes it does not set the > appropriate dev->addr_len at the same time ? Well,... probably, nobody cared to do so: [drivers/net/tun.c] case TUNSETLINK: ... tun->dev->type = (int) arg; //<--- that's all! tun_debug(KERN_INFO, tun, "linktype set to %d\n", tun->dev->type); ret = 0; } break; > Really the bug seems to be deeper, and without setting proper > dev->addr_len, we'll need more 'fixes' like yours. Absolutely. Unfortunately, I wasn't able to just write such a deeper patch. Let me share what I have found, in the hope of getting some advice. - So setting just the tun->dev->type makes the dev struct inconsistent. - There are more fields to adjust, at least dev->broadcast. Also, there are a number of *_ops fields which are all set for the Ethernet type, most probably they must be adjusted also. - There is no get_addr_len_by_link_type() or a simple way to get link layer properties by dev->type. Such settings are scattered in *_setup and *_init functions, like ipgre_tunnel_init() { ... dev->addr_len = 4; ...} Having these, I can imagine 2 ways for a proper fix. 1) Destroy the net_device in question and recreate it when changing a link type. This way all the dev fields are set right. Create it in a similar way as rtnl_newlink() does. Again, we do not have get_X_by_link_type(), so it probably will be some large switch()/case: $ grep '^#define ARPHRD_' include/uapi/linux/if_arp.h | wc -l 59 2) Leave tun an Ethernet device, add some tun->pretend_to_be_this_link_type field and change only it on TUNSETLINK. And use this field in cases for which TUNSETLINK was invented in the first place. Unfortunately, I do not have such a list.
The initial commit ff4cc3ac93e1 says: For use with wireless and other networking types it should be possible to set the ARP type via an ioctl. Surely, there can be something else which I do not see. Could anyone offer some advice on this? Best regards, Vladis Dronov | Red Hat, Inc. | Product Security Engineer
Re: [PATCH] netfilter: check if the socket netns is correct.
On Thu, Sep 27, 2018 at 07:58:24PM -0300, Flavio Leitner wrote: > On Thu, Sep 27, 2018 at 01:46:29PM -0700, Guenter Roeck wrote: > > Hi Flavio, > > > > On Wed, Jun 27, 2018 at 10:34:25AM -0300, Flavio Leitner wrote: > > > Netfilter assumes that if the socket is present in the skb, then > > > it can be used because that reference is cleaned up while the skb > > > is crossing netns. > > > > > > We want to change that to preserve the socket reference in a future > > > patch, so this is a preparation updating netfilter to check if the > > > socket netns matches before use it. > > > > > > Signed-off-by: Flavio Leitner > > > Acked-by: Florian Westphal > > > Signed-off-by: David S. Miller > > > --- > > ... > > > --- a/net/netfilter/xt_socket.c > > > +++ b/net/netfilter/xt_socket.c > > > @@ -56,8 +56,12 @@ socket_match(const struct sk_buff *skb, struct > > > xt_action_param *par, > > > struct sk_buff *pskb = (struct sk_buff *)skb; > > > struct sock *sk = skb->sk; > > > > > > + if (!net_eq(xt_net(par), sock_net(sk))) > > > + sk = NULL; > > > + > > > > I am having trouble with this code. With CONFIG_NET_NS enabled, it crashes > > for me in read_pnet() because sk is NULL. > > > > > if (!sk) > > > sk = nf_sk_lookup_slow_v4(xt_net(par), skb, xt_in(par)); > > > > The old code seems to suggest that sk == NULL was possible. > > > > I see the problem with the Chrome OS kernel rebased to v4.19-rc5, so I > > can not guarantee that this really an upstream problem. The change seems > > odd, though. Are you sure that it is not (or, rather, no longer) necessary > > to check if sk == NULL before dereferencing it in sock_net() ? > > Oops, it is necessary but if it's not and the netns doesn't match, we need > do the lookup. So, could you check if this fixes the problem for you? > > From a5f927e7f1368d753f87cb978d630d786d5adb62 Mon Sep 17 00:00:00 2001 > From: Flavio Leitner > Date: Thu, 27 Sep 2018 19:36:28 -0300 > Subject: [PATCH] xt_socket: check sk before checking for netns. 
> > Only check for the network namespace if the socket is available. > > Fixes: f564650106a6 ("netfilter: check if the socket netns is correct.") > Reported-by: Guenter Roeck > Signed-off-by: Flavio Leitner This fixes the problem for me. Tested-by: Guenter Roeck Thanks, Guenter > --- > net/netfilter/xt_socket.c | 4 ++-- > 1 file changed, 2 insertions(+), 2 deletions(-) > > diff --git a/net/netfilter/xt_socket.c b/net/netfilter/xt_socket.c > index 0472f3472842..ada144e5645b 100644 > --- a/net/netfilter/xt_socket.c > +++ b/net/netfilter/xt_socket.c > @@ -56,7 +56,7 @@ socket_match(const struct sk_buff *skb, struct > xt_action_param *par, > struct sk_buff *pskb = (struct sk_buff *)skb; > struct sock *sk = skb->sk; > > - if (!net_eq(xt_net(par), sock_net(sk))) > + if (sk && !net_eq(xt_net(par), sock_net(sk))) > sk = NULL; > > if (!sk) > @@ -117,7 +117,7 @@ socket_mt6_v1_v2_v3(const struct sk_buff *skb, struct > xt_action_param *par) > struct sk_buff *pskb = (struct sk_buff *)skb; > struct sock *sk = skb->sk; > > - if (!net_eq(xt_net(par), sock_net(sk))) > + if (sk && !net_eq(xt_net(par), sock_net(sk))) > sk = NULL; > > if (!sk) > -- > 2.14.4 >
[PATCH] cfg80211: fix use-after-free in reg_process_hint()
reg_process_hint_country_ie() can free regulatory_request and return REG_REQ_ALREADY_SET. We shouldn't use regulatory_request after it's called. KASAN error was observed when this happens. BUG: KASAN: use-after-free in reg_process_hint+0x839/0x8aa [cfg80211] Read of size 4 at addr 8800c430d434 by task kworker/1:3/89 Workqueue: events reg_todo [cfg80211] Call Trace: dump_stack+0xc1/0x10c ? _atomic_dec_and_lock+0x1ad/0x1ad ? _raw_spin_lock_irqsave+0xa0/0xd2 print_address_description+0x86/0x26f ? reg_process_hint+0x839/0x8aa [cfg80211] kasan_report+0x241/0x29b reg_process_hint+0x839/0x8aa [cfg80211] reg_todo+0x204/0x5b9 [cfg80211] process_one_work+0x55f/0x8d0 ? worker_detach_from_pool+0x1b5/0x1b5 ? _raw_spin_unlock_irq+0x65/0xdd ? _raw_spin_unlock_irqrestore+0xf3/0xf3 worker_thread+0x5dd/0x841 ? kthread_parkme+0x1d/0x1d kthread+0x270/0x285 ? pr_cont_work+0xe3/0xe3 ? rcu_read_unlock_sched_notrace+0xca/0xca ret_from_fork+0x22/0x40 Allocated by task 2718: set_track+0x63/0xfa __kmalloc+0x119/0x1ac regulatory_hint_country_ie+0x38/0x329 [cfg80211] __cfg80211_connect_result+0x854/0xadd [cfg80211] cfg80211_rx_assoc_resp+0x3bc/0x4f0 [cfg80211] smsc95xx v1.0.6 ieee80211_sta_rx_queued_mgmt+0x1803/0x7ed5 [mac80211] ieee80211_iface_work+0x411/0x696 [mac80211] process_one_work+0x55f/0x8d0 worker_thread+0x5dd/0x841 kthread+0x270/0x285 ret_from_fork+0x22/0x40 Freed by task 89: set_track+0x63/0xfa kasan_slab_free+0x6a/0x87 kfree+0xdc/0x470 reg_process_hint+0x31e/0x8aa [cfg80211] reg_todo+0x204/0x5b9 [cfg80211] process_one_work+0x55f/0x8d0 worker_thread+0x5dd/0x841 kthread+0x270/0x285 ret_from_fork+0x22/0x40 Signed-off-by: Yu Zhao --- net/wireless/reg.c | 7 --- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/net/wireless/reg.c b/net/wireless/reg.c index 765dedb12361..24cfa2776f50 100644 --- a/net/wireless/reg.c +++ b/net/wireless/reg.c @@ -2661,11 +2661,12 @@ static void reg_process_hint(struct regulatory_request *reg_request) { struct wiphy *wiphy = NULL; enum 
reg_request_treatment treatment; + enum nl80211_reg_initiator initiator = reg_request->initiator; if (reg_request->wiphy_idx != WIPHY_IDX_INVALID) wiphy = wiphy_idx_to_wiphy(reg_request->wiphy_idx); - switch (reg_request->initiator) { + switch (initiator) { case NL80211_REGDOM_SET_BY_CORE: treatment = reg_process_hint_core(reg_request); break; @@ -2683,7 +2684,7 @@ static void reg_process_hint(struct regulatory_request *reg_request) treatment = reg_process_hint_country_ie(wiphy, reg_request); break; default: - WARN(1, "invalid initiator %d\n", reg_request->initiator); + WARN(1, "invalid initiator %d\n", initiator); goto out_free; } @@ -2698,7 +2699,7 @@ static void reg_process_hint(struct regulatory_request *reg_request) */ if (treatment == REG_REQ_ALREADY_SET && wiphy && wiphy->regulatory_flags & REGULATORY_STRICT_REG) { - wiphy_update_regulatory(wiphy, reg_request->initiator); + wiphy_update_regulatory(wiphy, initiator); wiphy_all_share_dfs_chan_state(wiphy); reg_check_channels(); } -- 2.19.0.605.g01d371f741-goog
[PATCH 07/11] net: remove 1 always zero parameter from ip6_redirect_no_header()
From: Maciej Żenczykowski (the parameter in question is mark) Signed-off-by: Maciej Żenczykowski --- include/net/ip6_route.h | 3 +-- net/ipv6/ndisc.c| 2 +- net/ipv6/route.c| 4 +--- 3 files changed, 3 insertions(+), 6 deletions(-) diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h index 7b9c82de11cc..cef186dbd2ce 100644 --- a/include/net/ip6_route.h +++ b/include/net/ip6_route.h @@ -165,8 +165,7 @@ void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu, int oif, void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu); void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark, kuid_t uid); -void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif, - u32 mark); +void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif); void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk); struct netlink_callback; diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index 0ec273997d1d..51863ada15a4 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -1533,7 +1533,7 @@ static void ndisc_redirect_rcv(struct sk_buff *skb) if (!ndopts.nd_opts_rh) { ip6_redirect_no_header(skb, dev_net(skb->dev), - skb->dev->ifindex, 0); + skb->dev->ifindex); return; } diff --git a/net/ipv6/route.c b/net/ipv6/route.c index dd8c04f253d5..27f1260e053a 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -2520,8 +2520,7 @@ void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark, } EXPORT_SYMBOL_GPL(ip6_redirect); -void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif, - u32 mark) +void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif) { const struct ipv6hdr *iph = ipv6_hdr(skb); const struct rd_msg *msg = (struct rd_msg *)icmp6_hdr(skb); @@ -2529,7 +2528,6 @@ void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif, struct flowi6 fl6 = { .flowi6_iif = LOOPBACK_IFINDEX, .flowi6_oif = oif, - .flowi6_mark = mark, .daddr = msg->dest, 
.saddr = iph->daddr, .flowi6_uid = sock_net_uid(net, NULL), -- 2.19.0.605.g01d371f741-goog
[PATCH 05/11] net: ip6_redirect() - use new style struct initializer instead of memset
From: Maciej Żenczykowski (allows for better compiler optimization) Signed-off-by: Maciej Żenczykowski --- net/ipv6/route.c | 19 +-- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 9cb024451fc5..e148d197d628 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -2504,16 +2504,15 @@ void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark, { const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data; struct dst_entry *dst; - struct flowi6 fl6; - - memset(&fl6, 0, sizeof(fl6)); - fl6.flowi6_iif = LOOPBACK_IFINDEX; - fl6.flowi6_oif = oif; - fl6.flowi6_mark = mark; - fl6.daddr = iph->daddr; - fl6.saddr = iph->saddr; - fl6.flowlabel = ip6_flowinfo(iph); - fl6.flowi6_uid = uid; + struct flowi6 fl6 = { + .flowi6_iif = LOOPBACK_IFINDEX, + .flowi6_oif = oif, + .flowi6_mark = mark, + .daddr = iph->daddr, + .saddr = iph->saddr, + .flowlabel = ip6_flowinfo(iph), + .flowi6_uid = uid, + }; dst = ip6_route_redirect(net, &fl6, skb, &ipv6_hdr(skb)->saddr); rt6_do_redirect(dst, NULL, skb); -- 2.19.0.605.g01d371f741-goog
[PATCH 11/11] net: inet6_rtm_getroute() - use new style struct initializer instead of memset
From: Maciej Żenczykowski Signed-off-by: Maciej Żenczykowski --- net/ipv6/route.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 9aca81772c93..aca6a84de794 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -4819,7 +4819,7 @@ static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, struct rt6_info *rt; struct sk_buff *skb; struct rtmsg *rtm; - struct flowi6 fl6; + struct flowi6 fl6 = {}; bool fibmatch; err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy, @@ -4828,7 +4828,6 @@ static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, goto errout; err = -EINVAL; - memset(&fl6, 0, sizeof(fl6)); rtm = nlmsg_data(nlh); fl6.flowlabel = ip6_make_flowinfo(rtm->rtm_tos, 0); fibmatch = !!(rtm->rtm_flags & RTM_F_FIB_MATCH); -- 2.19.0.605.g01d371f741-goog
[PATCH 06/11] net: ip6_redirect_no_header() - use new style struct initializer instead of memset
From: Maciej Żenczykowski (allows for better compiler optimization) Signed-off-by: Maciej Żenczykowski --- net/ipv6/route.c | 17 - 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/net/ipv6/route.c b/net/ipv6/route.c index e148d197d628..dd8c04f253d5 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -2526,15 +2526,14 @@ void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif, const struct ipv6hdr *iph = ipv6_hdr(skb); const struct rd_msg *msg = (struct rd_msg *)icmp6_hdr(skb); struct dst_entry *dst; - struct flowi6 fl6; - - memset(&fl6, 0, sizeof(fl6)); - fl6.flowi6_iif = LOOPBACK_IFINDEX; - fl6.flowi6_oif = oif; - fl6.flowi6_mark = mark; - fl6.daddr = msg->dest; - fl6.saddr = iph->daddr; - fl6.flowi6_uid = sock_net_uid(net, NULL); + struct flowi6 fl6 = { + .flowi6_iif = LOOPBACK_IFINDEX, + .flowi6_oif = oif, + .flowi6_mark = mark, + .daddr = msg->dest, + .saddr = iph->daddr, + .flowi6_uid = sock_net_uid(net, NULL), + }; dst = ip6_route_redirect(net, &fl6, skb, &iph->saddr); rt6_do_redirect(dst, NULL, skb); -- 2.19.0.605.g01d371f741-goog
[PATCH 08/11] net: ip6_update_pmtu() - use new style struct initializer instead of memset
From: Maciej Żenczykowski (allows for better compiler optimization) Signed-off-by: Maciej Żenczykowski --- net/ipv6/route.c | 17 - 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 27f1260e053a..a87b79574a91 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -2345,15 +2345,14 @@ void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu, { const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data; struct dst_entry *dst; - struct flowi6 fl6; - - memset(&fl6, 0, sizeof(fl6)); - fl6.flowi6_oif = oif; - fl6.flowi6_mark = mark ? mark : IP6_REPLY_MARK(net, skb->mark); - fl6.daddr = iph->daddr; - fl6.saddr = iph->saddr; - fl6.flowlabel = ip6_flowinfo(iph); - fl6.flowi6_uid = uid; + struct flowi6 fl6 = { + .flowi6_oif = oif, + .flowi6_mark = mark ? mark : IP6_REPLY_MARK(net, skb->mark), + .daddr = iph->daddr, + .saddr = iph->saddr, + .flowlabel = ip6_flowinfo(iph), + .flowi6_uid = uid, + }; dst = ip6_route_output(net, NULL, &fl6); if (!dst->error) -- 2.19.0.605.g01d371f741-goog
[PATCH 09/11] net: rtmsg_to_fib6_config() - use new style struct initializer instead of memset
From: Maciej Żenczykowski (allows for better compiler optimization) Signed-off-by: Maciej Żenczykowski --- net/ipv6/route.c | 32 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/net/ipv6/route.c b/net/ipv6/route.c index a87b79574a91..b8fece1d6021 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -3600,23 +3600,23 @@ static void rtmsg_to_fib6_config(struct net *net, struct in6_rtmsg *rtmsg, struct fib6_config *cfg) { - memset(cfg, 0, sizeof(*cfg)); + *cfg = (struct fib6_config){ + .fc_table = l3mdev_fib_table_by_index(net, rtmsg->rtmsg_ifindex) ? +: RT6_TABLE_MAIN, + .fc_ifindex = rtmsg->rtmsg_ifindex, + .fc_metric = rtmsg->rtmsg_metric, + .fc_expires = rtmsg->rtmsg_info, + .fc_dst_len = rtmsg->rtmsg_dst_len, + .fc_src_len = rtmsg->rtmsg_src_len, + .fc_flags = rtmsg->rtmsg_flags, + .fc_type = rtmsg->rtmsg_type, + + .fc_nlinfo.nl_net = net, - cfg->fc_table = l3mdev_fib_table_by_index(net, rtmsg->rtmsg_ifindex) ? -: RT6_TABLE_MAIN; - cfg->fc_ifindex = rtmsg->rtmsg_ifindex; - cfg->fc_metric = rtmsg->rtmsg_metric; - cfg->fc_expires = rtmsg->rtmsg_info; - cfg->fc_dst_len = rtmsg->rtmsg_dst_len; - cfg->fc_src_len = rtmsg->rtmsg_src_len; - cfg->fc_flags = rtmsg->rtmsg_flags; - cfg->fc_type = rtmsg->rtmsg_type; - - cfg->fc_nlinfo.nl_net = net; - - cfg->fc_dst = rtmsg->rtmsg_dst; - cfg->fc_src = rtmsg->rtmsg_src; - cfg->fc_gateway = rtmsg->rtmsg_gateway; + .fc_dst = rtmsg->rtmsg_dst, + .fc_src = rtmsg->rtmsg_src, + .fc_gateway = rtmsg->rtmsg_gateway, + }; } int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg) -- 2.19.0.605.g01d371f741-goog
[PATCH 10/11] net: rtm_to_fib6_config() - use new style struct initializer instead of memset
From: Maciej Żenczykowski (allows for better compiler optimization) Signed-off-by: Maciej Żenczykowski --- net/ipv6/route.c | 23 --- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/net/ipv6/route.c b/net/ipv6/route.c index b8fece1d6021..9aca81772c93 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -4139,14 +4139,19 @@ static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh, err = -EINVAL; rtm = nlmsg_data(nlh); - memset(cfg, 0, sizeof(*cfg)); - cfg->fc_table = rtm->rtm_table; - cfg->fc_dst_len = rtm->rtm_dst_len; - cfg->fc_src_len = rtm->rtm_src_len; - cfg->fc_flags = RTF_UP; - cfg->fc_protocol = rtm->rtm_protocol; - cfg->fc_type = rtm->rtm_type; + *cfg = (struct fib6_config){ + .fc_table = rtm->rtm_table, + .fc_dst_len = rtm->rtm_dst_len, + .fc_src_len = rtm->rtm_src_len, + .fc_flags = RTF_UP, + .fc_protocol = rtm->rtm_protocol, + .fc_type = rtm->rtm_type, + + .fc_nlinfo.portid = NETLINK_CB(skb).portid, + .fc_nlinfo.nlh = nlh, + .fc_nlinfo.nl_net = sock_net(skb->sk), + }; if (rtm->rtm_type == RTN_UNREACHABLE || rtm->rtm_type == RTN_BLACKHOLE || @@ -4162,10 +4167,6 @@ static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh, cfg->fc_flags |= (rtm->rtm_flags & RTNH_F_ONLINK); - cfg->fc_nlinfo.portid = NETLINK_CB(skb).portid; - cfg->fc_nlinfo.nlh = nlh; - cfg->fc_nlinfo.nl_net = sock_net(skb->sk); - if (tb[RTA_GATEWAY]) { cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]); cfg->fc_flags |= RTF_GATEWAY; -- 2.19.0.605.g01d371f741-goog
[PATCH 04/11] net: ip6_multipath_l3_keys() - use new style struct initializer instead of memset
From: Maciej Żenczykowski Signed-off-by: Maciej Żenczykowski --- net/ipv6/route.c | 6 +- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/net/ipv6/route.c b/net/ipv6/route.c index d28f83e01593..9cb024451fc5 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -1981,12 +1981,11 @@ static void ip6_multipath_l3_keys(const struct sk_buff *skb, u32 rt6_multipath_hash(const struct net *net, const struct flowi6 *fl6, const struct sk_buff *skb, struct flow_keys *flkeys) { - struct flow_keys hash_keys; + struct flow_keys hash_keys = {}; u32 mhash; switch (ip6_multipath_hash_policy(net)) { case 0: - memset(&hash_keys, 0, sizeof(hash_keys)); hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; if (skb) { ip6_multipath_l3_keys(skb, &hash_keys, flkeys); @@ -2006,8 +2005,6 @@ u32 rt6_multipath_hash(const struct net *net, const struct flowi6 *fl6, if (skb->l4_hash) return skb_get_hash_raw(skb) >> 1; - memset(&hash_keys, 0, sizeof(hash_keys)); - if (!flkeys) { skb_flow_dissect_flow_keys(skb, &keys, flag); flkeys = &keys; @@ -2019,7 +2016,6 @@ u32 rt6_multipath_hash(const struct net *net, const struct flowi6 *fl6, hash_keys.ports.dst = flkeys->ports.dst; hash_keys.basic.ip_proto = flkeys->basic.ip_proto; } else { - memset(&hash_keys, 0, sizeof(hash_keys)); hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; hash_keys.addrs.v6addrs.src = fl6->saddr; hash_keys.addrs.v6addrs.dst = fl6->daddr; -- 2.19.0.605.g01d371f741-goog
[PATCH 03/11] net: fib_multipath_hash() - use new style struct initializer instead of memset
From: Maciej Żenczykowski Signed-off-by: Maciej Żenczykowski --- net/ipv4/route.c | 6 +- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 048919713f4e..17953a52fbd0 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1821,12 +1821,11 @@ static void ip_multipath_l3_keys(const struct sk_buff *skb, int fib_multipath_hash(const struct net *net, const struct flowi4 *fl4, const struct sk_buff *skb, struct flow_keys *flkeys) { - struct flow_keys hash_keys; + struct flow_keys hash_keys = {}; u32 mhash; switch (net->ipv4.sysctl_fib_multipath_hash_policy) { case 0: - memset(&hash_keys, 0, sizeof(hash_keys)); hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; if (skb) { ip_multipath_l3_keys(skb, &hash_keys); @@ -1845,8 +1844,6 @@ int fib_multipath_hash(const struct net *net, const struct flowi4 *fl4, if (skb->l4_hash) return skb_get_hash_raw(skb) >> 1; - memset(&hash_keys, 0, sizeof(hash_keys)); - if (!flkeys) { skb_flow_dissect_flow_keys(skb, &keys, flag); flkeys = &keys; @@ -1859,7 +1856,6 @@ int fib_multipath_hash(const struct net *net, const struct flowi4 *fl4, hash_keys.ports.dst = flkeys->ports.dst; hash_keys.basic.ip_proto = flkeys->basic.ip_proto; } else { - memset(&hash_keys, 0, sizeof(hash_keys)); hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; hash_keys.addrs.v4addrs.src = fl4->saddr; hash_keys.addrs.v4addrs.dst = fl4->daddr; -- 2.19.0.605.g01d371f741-goog
[PATCH 01/11] net: ip_rt_get_source() - use new style struct initializer instead of memset
From: Maciej Żenczykowski (allows for better compiler optimization) Signed-off-by: Maciej Żenczykowski --- net/ipv4/route.c | 21 + 1 file changed, 9 insertions(+), 12 deletions(-) diff --git a/net/ipv4/route.c b/net/ipv4/route.c index dce2ed66ebe1..02482b71498b 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1217,18 +1217,15 @@ void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt) src = ip_hdr(skb)->saddr; else { struct fib_result res; - struct flowi4 fl4; - struct iphdr *iph; - - iph = ip_hdr(skb); - - memset(&fl4, 0, sizeof(fl4)); - fl4.daddr = iph->daddr; - fl4.saddr = iph->saddr; - fl4.flowi4_tos = RT_TOS(iph->tos); - fl4.flowi4_oif = rt->dst.dev->ifindex; - fl4.flowi4_iif = skb->dev->ifindex; - fl4.flowi4_mark = skb->mark; + struct iphdr *iph = ip_hdr(skb); + struct flowi4 fl4 = { + .daddr = iph->daddr, + .saddr = iph->saddr, + .flowi4_tos = RT_TOS(iph->tos), + .flowi4_oif = rt->dst.dev->ifindex, + .flowi4_iif = skb->dev->ifindex, + .flowi4_mark = skb->mark, + }; rcu_read_lock(); if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res, 0) == 0) -- 2.19.0.605.g01d371f741-goog
[PATCH 02/11] net: inet_rtm_getroute() - use new style struct initializer instead of memset
From: Maciej Żenczykowski Signed-off-by: Maciej Żenczykowski --- net/ipv4/route.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 02482b71498b..048919713f4e 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -2780,7 +2780,7 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, struct rtable *rt = NULL; struct sk_buff *skb; struct rtmsg *rtm; - struct flowi4 fl4; + struct flowi4 fl4 = {}; __be32 dst = 0; __be32 src = 0; kuid_t uid; @@ -2820,7 +2820,6 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, if (!skb) return -ENOBUFS; - memset(&fl4, 0, sizeof(fl4)); fl4.daddr = dst; fl4.saddr = src; fl4.flowi4_tos = rtm->rtm_tos; -- 2.19.0.605.g01d371f741-goog
Re: [PATCH] netfilter: check if the socket netns is correct.
On Thu, Sep 27, 2018 at 01:46:29PM -0700, Guenter Roeck wrote: > Hi Flavio, > > On Wed, Jun 27, 2018 at 10:34:25AM -0300, Flavio Leitner wrote: > > Netfilter assumes that if the socket is present in the skb, then > > it can be used because that reference is cleaned up while the skb > > is crossing netns. > > > > We want to change that to preserve the socket reference in a future > > patch, so this is a preparation updating netfilter to check if the > > socket netns matches before use it. > > > > Signed-off-by: Flavio Leitner > > Acked-by: Florian Westphal > > Signed-off-by: David S. Miller > > --- > ... > > --- a/net/netfilter/xt_socket.c > > +++ b/net/netfilter/xt_socket.c > > @@ -56,8 +56,12 @@ socket_match(const struct sk_buff *skb, struct > > xt_action_param *par, > > struct sk_buff *pskb = (struct sk_buff *)skb; > > struct sock *sk = skb->sk; > > > > + if (!net_eq(xt_net(par), sock_net(sk))) > > + sk = NULL; > > + > > I am having trouble with this code. With CONFIG_NET_NS enabled, it crashes > for me in read_pnet() because sk is NULL. > > > if (!sk) > > sk = nf_sk_lookup_slow_v4(xt_net(par), skb, xt_in(par)); > > The old code seems to suggest that sk == NULL was possible. > > I see the problem with the Chrome OS kernel rebased to v4.19-rc5, so I > can not guarantee that this really an upstream problem. The change seems > odd, though. Are you sure that it is not (or, rather, no longer) necessary > to check if sk == NULL before dereferencing it in sock_net() ? Oops, it is necessary, but if it's not NULL and the netns doesn't match, we need to do the lookup. So, could you check if this fixes the problem for you? From a5f927e7f1368d753f87cb978d630d786d5adb62 Mon Sep 17 00:00:00 2001 From: Flavio Leitner Date: Thu, 27 Sep 2018 19:36:28 -0300 Subject: [PATCH] xt_socket: check sk before checking for netns. Only check for the network namespace if the socket is available. 
Fixes: f564650106a6 ("netfilter: check if the socket netns is correct.") Reported-by: Guenter Roeck Signed-off-by: Flavio Leitner --- net/netfilter/xt_socket.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/netfilter/xt_socket.c b/net/netfilter/xt_socket.c index 0472f3472842..ada144e5645b 100644 --- a/net/netfilter/xt_socket.c +++ b/net/netfilter/xt_socket.c @@ -56,7 +56,7 @@ socket_match(const struct sk_buff *skb, struct xt_action_param *par, struct sk_buff *pskb = (struct sk_buff *)skb; struct sock *sk = skb->sk; - if (!net_eq(xt_net(par), sock_net(sk))) + if (sk && !net_eq(xt_net(par), sock_net(sk))) sk = NULL; if (!sk) @@ -117,7 +117,7 @@ socket_mt6_v1_v2_v3(const struct sk_buff *skb, struct xt_action_param *par) struct sk_buff *pskb = (struct sk_buff *)skb; struct sock *sk = skb->sk; - if (!net_eq(xt_net(par), sock_net(sk))) + if (sk && !net_eq(xt_net(par), sock_net(sk))) sk = NULL; if (!sk) -- 2.14.4
[PATCH net-next 4/5] net: systemport: Be drop monitor friendly while re-allocating headroom
During bcm_sysport_insert_tsb() make sure we differentiate a SKB headroom re-allocation failure from the normal swap and replace path. Signed-off-by: Florian Fainelli --- drivers/net/ethernet/broadcom/bcmsysport.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/broadcom/bcmsysport.c b/drivers/net/ethernet/broadcom/bcmsysport.c index 977d9dec2fb0..6c40cf6090ab 100644 --- a/drivers/net/ethernet/broadcom/bcmsysport.c +++ b/drivers/net/ethernet/broadcom/bcmsysport.c @@ -1230,12 +1230,13 @@ static struct sk_buff *bcm_sysport_insert_tsb(struct sk_buff *skb, /* Re-allocate SKB if needed */ if (unlikely(skb_headroom(skb) < sizeof(*tsb))) { nskb = skb_realloc_headroom(skb, sizeof(*tsb)); - dev_kfree_skb(skb); if (!nskb) { + dev_kfree_skb_any(skb); dev->stats.tx_errors++; dev->stats.tx_dropped++; return NULL; } + dev_consume_skb_any(skb); skb = nskb; } -- 2.17.1
[PATCH net-next 2/5] net: systemport: Utilize bcm_sysport_set_features() during resume/open
During driver resume and open, the HW may have lost its context/state, utilize bcm_sysport_set_features() to make sure we do restore the correct set of features that were previously configured. Signed-off-by: Florian Fainelli --- drivers/net/ethernet/broadcom/bcmsysport.c | 14 +++--- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/drivers/net/ethernet/broadcom/bcmsysport.c b/drivers/net/ethernet/broadcom/bcmsysport.c index 654a07b849c4..3b4cb906a275 100644 --- a/drivers/net/ethernet/broadcom/bcmsysport.c +++ b/drivers/net/ethernet/broadcom/bcmsysport.c @@ -1972,6 +1972,11 @@ static int bcm_sysport_open(struct net_device *dev) else gib_set_pad_extension(priv); + /* Apply features again in case we changed them while interface was +* down +*/ + bcm_sysport_set_features(dev, dev->features); + /* Set MAC address */ umac_set_hw_addr(priv, dev->dev_addr); @@ -2708,7 +2713,6 @@ static int __maybe_unused bcm_sysport_resume(struct device *d) struct net_device *dev = dev_get_drvdata(d); struct bcm_sysport_priv *priv = netdev_priv(dev); unsigned int i; - u32 reg; int ret; if (!netif_running(dev)) @@ -2752,12 +2756,8 @@ static int __maybe_unused bcm_sysport_resume(struct device *d) goto out_free_rx_ring; } - /* Enable rxhck */ - if (priv->rx_chk_en) { - reg = rxchk_readl(priv, RXCHK_CONTROL); - reg |= RXCHK_EN; - rxchk_writel(priv, reg, RXCHK_CONTROL); - } + /* Restore enabled features */ + bcm_sysport_set_features(dev, dev->features); rbuf_init(priv); -- 2.17.1
[PATCH net-next 1/5] net: systemport: Refactor bcm_sysport_set_features()
In preparation for unconditionally enabling TX and RX checksum offloads, refactor bcm_sysport_set_features() a bit such that __netdev_update_features() during register_netdev() can make sure that features are correctly programmed during network device registration. Since we can now be called during register_netdev() with clocks gated, we need to temporarily turn them on/off in order to have a successful register programming. We also move the CRC forward setting read into bcm_sysport_set_features() since priv->crc_fwd matters while turning on RX checksum offload, that way we are guaranteed they are in sync in case we ever add support for NETIF_F_RXFCS at some point in the future. Signed-off-by: Florian Fainelli --- drivers/net/ethernet/broadcom/bcmsysport.c | 38 +- 1 file changed, 15 insertions(+), 23 deletions(-) diff --git a/drivers/net/ethernet/broadcom/bcmsysport.c b/drivers/net/ethernet/broadcom/bcmsysport.c index 147045757b10..654a07b849c4 100644 --- a/drivers/net/ethernet/broadcom/bcmsysport.c +++ b/drivers/net/ethernet/broadcom/bcmsysport.c @@ -126,8 +126,8 @@ static inline void tdma_port_write_desc_addr(struct bcm_sysport_priv *priv, } /* Ethtool operations */ -static int bcm_sysport_set_rx_csum(struct net_device *dev, - netdev_features_t wanted) +static void bcm_sysport_set_rx_csum(struct net_device *dev, + netdev_features_t wanted) { struct bcm_sysport_priv *priv = netdev_priv(dev); u32 reg; @@ -157,12 +157,10 @@ static int bcm_sysport_set_rx_csum(struct net_device *dev, reg &= ~RXCHK_BRCM_TAG_EN; rxchk_writel(priv, reg, RXCHK_CONTROL); - - return 0; } -static int bcm_sysport_set_tx_csum(struct net_device *dev, - netdev_features_t wanted) +static void bcm_sysport_set_tx_csum(struct net_device *dev, + netdev_features_t wanted) { struct bcm_sysport_priv *priv = netdev_priv(dev); u32 reg; @@ -177,23 +175,24 @@ static int bcm_sysport_set_tx_csum(struct net_device *dev, else reg &= ~tdma_control_bit(priv, TSB_EN); tdma_writel(priv, reg, TDMA_CONTROL); - - 
return 0; } static int bcm_sysport_set_features(struct net_device *dev, netdev_features_t features) { - netdev_features_t changed = features ^ dev->features; - netdev_features_t wanted = dev->wanted_features; - int ret = 0; + struct bcm_sysport_priv *priv = netdev_priv(dev); + + /* Read CRC forward */ + if (!priv->is_lite) + priv->crc_fwd = !!(umac_readl(priv, UMAC_CMD) & CMD_CRC_FWD); + else + priv->crc_fwd = !((gib_readl(priv, GIB_CONTROL) & + GIB_FCS_STRIP) >> GIB_FCS_STRIP_SHIFT); - if (changed & NETIF_F_RXCSUM) - ret = bcm_sysport_set_rx_csum(dev, wanted); - if (changed & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM)) - ret = bcm_sysport_set_tx_csum(dev, wanted); + bcm_sysport_set_rx_csum(dev, features); + bcm_sysport_set_tx_csum(dev, features); - return ret; + return 0; } /* Hardware counters must be kept in sync because the order/offset @@ -1976,13 +1975,6 @@ static int bcm_sysport_open(struct net_device *dev) /* Set MAC address */ umac_set_hw_addr(priv, dev->dev_addr); - /* Read CRC forward */ - if (!priv->is_lite) - priv->crc_fwd = !!(umac_readl(priv, UMAC_CMD) & CMD_CRC_FWD); - else - priv->crc_fwd = !((gib_readl(priv, GIB_CONTROL) & - GIB_FCS_STRIP) >> GIB_FCS_STRIP_SHIFT); - phydev = of_phy_connect(dev, priv->phy_dn, bcm_sysport_adj_link, 0, priv->phy_interface); if (!phydev) { -- 2.17.1
[PATCH net-next 5/5] net: systemport: Add software counters to track reallocations
When inserting the TSB, keep track of how many times we had to do it and if there was a failure in doing so, this helps profile the driver for possibly incorrect headroom settings. Signed-off-by: Florian Fainelli --- drivers/net/ethernet/broadcom/bcmsysport.c | 5 + drivers/net/ethernet/broadcom/bcmsysport.h | 2 ++ 2 files changed, 7 insertions(+) diff --git a/drivers/net/ethernet/broadcom/bcmsysport.c b/drivers/net/ethernet/broadcom/bcmsysport.c index 6c40cf6090ab..faba55fd656a 100644 --- a/drivers/net/ethernet/broadcom/bcmsysport.c +++ b/drivers/net/ethernet/broadcom/bcmsysport.c @@ -284,6 +284,8 @@ static const struct bcm_sysport_stats bcm_sysport_gstrings_stats[] = { STAT_MIB_SOFT("alloc_rx_buff_failed", mib.alloc_rx_buff_failed), STAT_MIB_SOFT("rx_dma_failed", mib.rx_dma_failed), STAT_MIB_SOFT("tx_dma_failed", mib.tx_dma_failed), + STAT_MIB_SOFT("tx_realloc_tsb", mib.tx_realloc_tsb), + STAT_MIB_SOFT("tx_realloc_tsb_failed", mib.tx_realloc_tsb_failed), /* Per TX-queue statistics are dynamically appended */ }; @@ -1220,6 +1222,7 @@ static void bcm_sysport_poll_controller(struct net_device *dev) static struct sk_buff *bcm_sysport_insert_tsb(struct sk_buff *skb, struct net_device *dev) { + struct bcm_sysport_priv *priv = netdev_priv(dev); struct sk_buff *nskb; struct bcm_tsb *tsb; u32 csum_info; @@ -1232,12 +1235,14 @@ static struct sk_buff *bcm_sysport_insert_tsb(struct sk_buff *skb, nskb = skb_realloc_headroom(skb, sizeof(*tsb)); if (!nskb) { dev_kfree_skb_any(skb); + priv->mib.tx_realloc_tsb_failed++; dev->stats.tx_errors++; dev->stats.tx_dropped++; return NULL; } dev_consume_skb_any(skb); skb = nskb; + priv->mib.tx_realloc_tsb++; } tsb = skb_push(skb, sizeof(*tsb)); diff --git a/drivers/net/ethernet/broadcom/bcmsysport.h b/drivers/net/ethernet/broadcom/bcmsysport.h index 046c6c1d97fd..a7a230884a87 100644 --- a/drivers/net/ethernet/broadcom/bcmsysport.h +++ b/drivers/net/ethernet/broadcom/bcmsysport.h @@ -607,6 +607,8 @@ struct bcm_sysport_mib { u32 
alloc_rx_buff_failed; u32 rx_dma_failed; u32 tx_dma_failed; + u32 tx_realloc_tsb; + u32 tx_realloc_tsb_failed; }; /* HW maintains a large list of counters */ -- 2.17.1
[PATCH net-next 3/5] net: systemport: Turn on offloads by default
We can turn on the RX/TX checksum offloads by default and make sure that those are properly reflected back to e.g: stacked devices such as VLAN or DSA. Signed-off-by: Florian Fainelli --- drivers/net/ethernet/broadcom/bcmsysport.c | 7 --- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/broadcom/bcmsysport.c b/drivers/net/ethernet/broadcom/bcmsysport.c index 3b4cb906a275..977d9dec2fb0 100644 --- a/drivers/net/ethernet/broadcom/bcmsysport.c +++ b/drivers/net/ethernet/broadcom/bcmsysport.c @@ -2508,9 +2508,10 @@ static int bcm_sysport_probe(struct platform_device *pdev) dev->netdev_ops = &bcm_sysport_netdev_ops; netif_napi_add(dev, &priv->napi, bcm_sysport_poll, 64); - /* HW supported features, none enabled by default */ - dev->hw_features |= NETIF_F_RXCSUM | NETIF_F_HIGHDMA | - NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM; + dev->features |= NETIF_F_RXCSUM | NETIF_F_HIGHDMA | +NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM; + dev->hw_features |= dev->features; + dev->vlan_features |= dev->features; /* Request the WOL interrupt and advertise suspend if available */ priv->wol_irq_disabled = 1; -- 2.17.1
[PATCH net-next 0/5] net: systemport: Turn on offloads by default
Hi David, Up until now, we had added all the code necessary to turn on RX/TX checksum offloads at runtime, but there is no reason why they have to be disabled by default given that this gives a slight performance improvement. Florian Fainelli (5): net: systemport: Refactor bcm_sysport_set_features() net: systemport: Utilize bcm_sysport_set_features() during resume/open net: systemport: Turn on offloads by default net: systemport: Be drop monitor friendly while re-allocating headroom net: systemport: Add software counters to track reallocations drivers/net/ethernet/broadcom/bcmsysport.c | 67 +++--- drivers/net/ethernet/broadcom/bcmsysport.h | 2 + 2 files changed, 35 insertions(+), 34 deletions(-) -- 2.17.1
Re: [PATCH net-next v6 23/23] net: WireGuard secure network tunnel
Hi Andrew, Thanks for following up with this. On Thu, Sep 27, 2018 at 3:15 AM Andrew Lunn wrote: > I know you have been concentrating on the crypto code, so i'm not > expecting too many changes at the moment in the network code. I should be addressing things in parallel, actually, so I'm happy to work on this. > WARNING: Avoid crashing the kernel - try using WARN_ON & recovery code rather > than BUG() or BUG_ON() > #2984: FILE: drivers/net/wireguard/noise.c:293: > + BUG_ON(first_len > BLAKE2S_HASH_SIZE || second_len > > BLAKE2S_HASH_SIZE || I was actually going to ask you about this, because it applies similarly in another context too that I'm trying to refine. The above function you quote has the following properties: - Only ever accepts fixed length parameters, so the compiler can constant fold invocations of it fantastically. Those parameters are fixed length in the sense that they're enum/macro constants. They never come from the user or from a packet or something. - Never produces an incorrect result. For said constants, all inputs are valid, and so it succeeds in producing an output every time. - Is a "pure" function, just knocking bytes around, without needing to interact with fancy kernel-y things; could be implemented on some sort of WWII-era rotor machine provided you had the patience. Because of the above, there's never any error to return to the user of the function. Also, because it only ever takes constant sized inputs, in theory I should be able to change that BUG_ON() to BUILD_BUG_ON(), but in practice the optimizer/inliner isn't actually that aggressive. But what I would like is some way of signaling to the developer using this function that they've passed it an illegal value, and their code should not ship until that's fixed, under any circumstances at all -- that their usage of the function is completely unacceptable and wrong. Bla bla strong statements. 
For this, I figured the notion would come across with the aberrant behavior of "crash the developer's [in this case, my] QEMU instance" when "#ifdef DEBUG is true". This is the same kind of place where I'd have an "assert()" statement in userland. It sounds like what you're saying is that a WARN_ON is equally as effective instead? Or given the above, do you think the BUG_ON is actually sensible? Or neither and I should do something else? > WARNING: Macros with flow control statements should be avoided > #5471: FILE: drivers/net/wireguard/selftest/allowedips.h:456: > +#define init_peer(name) do { \ > + name = kzalloc(sizeof(*name), GFP_KERNEL); \ > + if (unlikely(!name)) { \ > + pr_info("allowedips self-test: out of memory\n"); \ > + goto free; \ > + } \ > + kref_init(&name->refcount);\ > + } while (0) This is part of a debugging selftest, where I'm initing a bunch of peers one after another, and this macro helps keep the test clear while offloading the actual irrelevant coding part to this macro. The test itself then just has code like: init_peer(a); init_peer(b); init_peer(c); init_peer(d); init_peer(e); init_peer(f); init_peer(g); init_peer(h); insert(4, a, 192, 168, 4, 0, 24); insert(4, b, 192, 168, 4, 4, 32); insert(4, c, 192, 168, 0, 0, 16); insert(4, d, 192, 95, 5, 64, 27); /* replaces previous entry, and maskself is required */ insert(4, c, 192, 95, 5, 65, 27); insert(6, d, 0x26075300, 0x60006b00, 0, 0xc05f0543, 128); insert(6, c, 0x26075300, 0x60006b00, 0, 0, 64); ... And so forth. I can probably figure out a different way to code this if you really want, but I thought this would be clear. > The namespace pollution also needs to be addresses. You have some > pretty generic named global symbols. 
I picked out a few examples from > objdump > > 2a94 g F .text 0060 peer_put > 3484 g F .text 004c timers_stop > 3520 g F .text 0114 packet_queue_init > 2640 g F .text 0034 device_uninit > 26bc g F .text 0288 peer_create > 90d4 g F .text 01bc ratelimiter_init > > Please make use of a prefix for global symbols, e.g. wg_. Will do. v7 will include the wg_ prefix. On a slightly related note, out of curiosity, any idea what's up with the future of LTO in the kernel? It sounds like that'd be nice to have on a module-by-module basis. IOW, I'd love to LTO all of my .c files in wireguard together, and then only ever expose mod_init/exit and whatever I explicitly EXPORT_SYMBOL, and then have the compiler and linker treat the rest of everything as essentially in one .c file and optimize the heck out of it, and then strip all the s
Re: KMSAN: uninit-value in __dev_mc_add
On Thu, Sep 27, 2018 at 2:30 PM Vladis Dronov wrote: > > Hello, > > This report is actually for the same bug which was reported in: > > https://syzkaller.appspot.com/bug?id=088efeac32fdde781038a777a63e436c0d4d7036 > > The note there that the bug was fixed by "Commits: net: fix uninit-value in > __hw_addr_add_ex()" is wrong. A C-reproducer from the 2nd syzkaller report > can trigger the bug from this one. > > I've researched this and a result is a proposed patch, the problem is the tun > device code allowing to set an arbitrary link type. > > https://lkml.org/lkml/2018/9/26/416 > https://lore.kernel.org/lkml/20180926093018.6646-1-vdro...@redhat.com/T/#u > https://marc.info/?l=linux-netdev&m=153795423320016&w=2 > I dunno, your patch looks quite not the right fix. If TUN is able to change dev->type, how comes it does not set the appropriate dev->addr_len at the same time ? Really the bug seems to be deeper, and without setting proper dev->addr_len, we'll need more 'fixes' like yours. Thanks.
Re: [Patch net-next] net_sched: fix an extack message in tcf_block_find()
On 09/27/2018 02:36 PM, Cong Wang wrote: > I don't understand what you mean by changing ip command, you must > mean tc command, but still, I have no idea about how restarting failed > syscall could be related to my patch and why we need to restart anything > here. If the refcnt goes to 0, it will never come back, retrying won't help > anything. > Yep, tc command it is. I was not especially commenting your patch (replacing an english message by another does not seem very big deal), but the fact that the code right there seems to be prepared for parallel changes. But using RCU lookups in control path will lead to occasional failures that most user space tools would not expect. Lets assume two tasks are launching "tc qdisc replace dev eth0 root XXX" in whatever order/parallelism. Both should succeed, after/before major RTNL->other_locking_mechanism Control paths are usually using a mutex or a spinlock so that they never hit a 0-refcount at all.
[PATCH bpf-next] bpf: permit CGROUP_DEVICE programs accessing helper bpf_get_current_cgroup_id()
Currently, helper bpf_get_current_cgroup_id() is not permitted for CGROUP_DEVICE type of programs. If the helper is used in such cases, the verifier will log the following error: 0: (bf) r6 = r1 1: (69) r7 = *(u16 *)(r6 +0) 2: (85) call bpf_get_current_cgroup_id#80 unknown func bpf_get_current_cgroup_id#80 The bpf_get_current_cgroup_id() is useful for CGROUP_DEVICE type of programs in order to customize action based on cgroup id. This patch added such a support. Cc: Roman Gushchin Signed-off-by: Yonghong Song --- kernel/bpf/cgroup.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 065c3d9ff8eb..00f6ed2e4f9a 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -707,6 +707,8 @@ cgroup_dev_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_get_current_uid_gid_proto; case BPF_FUNC_get_local_storage: return &bpf_get_local_storage_proto; + case BPF_FUNC_get_current_cgroup_id: + return &bpf_get_current_cgroup_id_proto; case BPF_FUNC_trace_printk: if (capable(CAP_SYS_ADMIN)) return bpf_get_trace_printk_proto(); -- 2.17.1
Re: [Patch net-next] net_sched: fix an extack message in tcf_block_find()
On Thu, Sep 27, 2018 at 2:16 PM Eric Dumazet wrote: > > > > On 09/27/2018 01:42 PM, Cong Wang wrote: > > It is clearly a copy-n-paste. > > > > Signed-off-by: Cong Wang > > --- > > net/sched/cls_api.c | 2 +- > > 1 file changed, 1 insertion(+), 1 deletion(-) > > > > diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c > > index 3de47e99b788..8dd7f8af6d54 100644 > > --- a/net/sched/cls_api.c > > +++ b/net/sched/cls_api.c > > @@ -655,7 +655,7 @@ static struct tcf_block *tcf_block_find(struct net > > *net, struct Qdisc **q, > > > > *q = qdisc_refcount_inc_nz(*q); > > if (!*q) { > > - NL_SET_ERR_MSG(extack, "Parent Qdisc doesn't exists"); > > + NL_SET_ERR_MSG(extack, "Can't increase Qdisc > > refcount"); > > > I am not sure it was a copy-n-paste. Make sure you knew there is an exactly same extack message (with a same English grammar error). > > Qdisc refcount business is kernel internal. Yeah, but the extack message is already there, this patch doesn't add any new extack. Or you are suggesting we should remove it? > If we can not increase the refcount, this is precisely because this qdisc is > about > to be destroyed. Nothing fundamentally different than having this thread > delayed a bit > and qdisc_lookup_rcu() returning NULL in the first place. qdisc_lookup_rcu() is not always called, it could be dev->qdisc. I am pretty sure parent exists in dev->qdisc. > > This also means that using RCU for control path is problematic, as surely the > caller > of this interface would prefer something that succeeds, even if this means > waiting a bit in the kernel. I fail to validate this statement, Why it prefers success when refcnt reaches 0? > > Or are we willing to change ip command and make it restart failed syscalls ? > I don't understand what you mean by changing ip command, you must mean tc command, but still, I have no idea about how restarting failed syscall could be related to my patch and why we need to restart anything here. 
If the refcnt goes to 0, it will never come back, retrying won't help anything. BTW: If you have any other question beyond my patch's scope, isn't it better that we start a new thread for discussion? In case you still misunderstand, my patch never intends to address any other problem rather than correcting an inaccurate extack message.
Re: [PATCH net-next v6 00/23] WireGuard: Secure Network Tunnel
Hi Eric, On Thu, Sep 27, 2018 at 8:29 PM Eric Biggers wrote: > Why is Herbert Xu's existing crypto tree being circumvented, especially for > future patches (the initial merge isn't quite as important as that's a > one-time > event)? I like being able to check out cryptodev to test upcoming crypto > patches. And currently, changes to APIs, algorithms, tests, and > implementations > all go through cryptodev, which is convenient for crypto developers. > > Apparently, you're proposing that someone adding a new algorithm will now have > to submit the API portion to one maintainer (Herbert Xu) and the > implementation > portion to another maintainer (you), and they'll go through separate git > trees. > That's inconvenient for developers, and it seems that in practice you and > Herbert will be stepping on each other's toes a lot. > > Can you please reach some kind of sane agreement with Herbert so that the > development process isn't fractured into two? Perhaps you could review > patches, > but Herbert could still apply them? I think you're overthinking it a bit. Zinc will have a few software implementations of primitives that are useful in cases where it's nice to call the primitive directly. Think: various usages of sha2, siphash, the wireguard suite (what this patchset includes), other things in lib/, etc. In so much as this winds up duplicating things within the crypto API, I'll work with Herbert to build one on top of the other -- as I've done in the two commits in this series. But beyond that, think of the two initiatives as orthogonal. I'm working on curating a few primitives that are maximally useful throughout the kernel for various uses, and doing so in a way that I think brings about a certain quality. Meanwhile the crypto API is amassing a huge collection of primitives for some things, and that will continue to exist, and Herbert will continue to maintain that. 
I expect for the crossover to be fairly isolated and manageable, without too much foreseeable tree- conflicts and such. Therefore, Samuel Neves and I plan to maintain the codebase we've spent quite some time writing, and maintain our own tree for it, which we'll be submitting through Greg. In other words, this is not a matter of "circumvention" or "stepping on toes", but rather separate efforts. I'm quite certain to the extent they overlap we'll be able to work out fairly easily. Either way, I'll take your suggestion and reach out to Herbert, since at least a discussion between the two of us sounds like it could be productive. > I'm also wondering about the criteria for making additions and changes to > "Zinc". You mentioned before that one of the "advantages" of Zinc is that it > doesn't include "cipher modes from 90s cryptographers" -- what does that mean > exactly? You've also indicated before that you don't want people modifying > the > Poly1305 implementations as they are too error-prone. Useful contributions > could be blocked or discouraged in the future. Can you please elaborate on > your criteria for contributions to Zinc? > > Also, will you allow algorithms that aren't up to modern security standards > but > are needed for compatibility reasons, e.g. MD5, SHA-1, and DES? There are > existing standards, APIs, and data formats that use these "legacy" algorithms; > so implementations of them are often still needed, whether we like it or not. > > And does it matter who designed the algorithms, e.g. do algorithms from Daniel > Bernstein get effectively a free pass, while algorithms from certain > countries, > governments, or organizations are not allowed? E.g. wireless driver > developers > may need the SM4 block cipher (which is now supported by the crypto API) as > it's > specified in a Chinese wireless standard. Will you allow SM4 in Zinc? 
Or > will > people have to submit some algorithms to Herbert and some to you due to > disagreements about what algorithms should be included? Similarly here, I think you're over-politicizing everything. Stable address generation for IPv6 uses SHA1 -- see net/ipv6/addrconf.c:3203 -- do you think that this should use, say, the SM3 chinese hash function instead? No, of course not, for a variety of interesting reasons. Rather, it should use some simple hash function that's fast in software that we have available in Zinc. On the other hand, it seems like parts of the kernel that have pretty high- levels of cipher agility -- such as dmcrypt, ipsec, wifi apparently, and so on -- will continue to use dynamic-dispatch system like the crypto API, since that's what it was made to do and is effective at doing. And so, your example of SM4 seems to fit perfectly into what the crypto API is well-suited for, and it would fit naturally in there. In other words, the "political criteria" for what we add to lib/zinc/ will mostly be the same as for the rest of lib/: are there things using it that benefit from it being there in a direct and obvious way, and does the implementation meet certain q
Re: KMSAN: uninit-value in __dev_mc_add
Hello, This report is actually for the same bug which was reported in: https://syzkaller.appspot.com/bug?id=088efeac32fdde781038a777a63e436c0d4d7036 The note there that the bug was fixed by "Commits: net: fix uninit-value in __hw_addr_add_ex()" is wrong. A C-reproducer from the 2nd syzkaller report can trigger the bug from this one. I've researched this and a result is a proposed patch, the problem is the tun device code allowing to set an arbitrary link type. https://lkml.org/lkml/2018/9/26/416 https://lore.kernel.org/lkml/20180926093018.6646-1-vdro...@redhat.com/T/#u https://marc.info/?l=linux-netdev&m=153795423320016&w=2 A simplified reproducer is attached. Best regards, Vladis Dronov #define _GNU_SOURCE #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include int main(int argc, char **argv) { int ret, sockfd, tunfd; syscall(__NR_mmap, 0x2000, 0x100, 3, 0x32, -1, 0); // socket(AF_PACKET, SOCK_DGRAM|SOCK_NONBLOCK, 0) sockfd = syscall(__NR_socket, 0x11, 0x10802, 0); if (sockfd < 0) { perror("socket()"); ret = 1; goto exit_end; } memcpy((void*)0x2240, "/dev/net/tun", 13); tunfd = open((char *)0x2240, 0); if (tunfd < 0) { perror("open()"); ret = 2; goto exit_sock_close; } memcpy((void*)0x20c0, "\x69\x67\x62\x30\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00", 16); *(uint16_t*)0x20d0 = 0x4012; ret = syscall(__NR_ioctl, tunfd, 0x400454ca, 0x20c0); // TUNSETIFF _IOW('T', 202, int) if (ret < 0) { perror("ioctl(TUNSETIFF)"); ret = 3; goto exit_tun_close; } // TUNSETLINK _IOW('T', 205, int) / 0x30a = 778 = ARPHRD_IPGRE if (argc < 2) ret = syscall(__NR_ioctl, tunfd, 0x400454cd, 0x30a); else ret = syscall(__NR_ioctl, tunfd, 0x400454cd, atoi(argv[1])); if (ret < 0) { perror("ioctl(TUNSETLINK)"); ret = 4; goto exit_tun_close; } memcpy((void*)0x2040, 
"\x69\x67\x62\x30\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00", 16); *(uint16_t*)0x2050 = 0xa201; ret = syscall(__NR_ioctl, sockfd, 0x8914, 0x2040); // SIOCSIFFLAGS 0x8914 if (ret < 0) { perror("ioctl(SIOCSIFFLAGS)"); ret = 5; goto exit_tun_close; } printf("done:\n"); system("/usr/sbin/ip -details link show igb0"); exit_tun_close: close(tunfd); exit_sock_close: close(sockfd); exit_end: munmap((void *)0x2000, 0x100); return 0; }
Re: [Patch net-next] net_sched: fix an extack message in tcf_block_find()
On 09/27/2018 01:42 PM, Cong Wang wrote: > It is clearly a copy-n-paste. > > Signed-off-by: Cong Wang > --- > net/sched/cls_api.c | 2 +- > 1 file changed, 1 insertion(+), 1 deletion(-) > > diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c > index 3de47e99b788..8dd7f8af6d54 100644 > --- a/net/sched/cls_api.c > +++ b/net/sched/cls_api.c > @@ -655,7 +655,7 @@ static struct tcf_block *tcf_block_find(struct net *net, > struct Qdisc **q, > > *q = qdisc_refcount_inc_nz(*q); > if (!*q) { > - NL_SET_ERR_MSG(extack, "Parent Qdisc doesn't exists"); > + NL_SET_ERR_MSG(extack, "Can't increase Qdisc refcount"); I am not sure it was a copy-n-paste. Qdisc refcount business is kernel internal. If we can not increase the refcount, this is precisely because this qdisc is about to be destroyed. Nothing fundamentally different than having this thread delayed a bit and qdisc_lookup_rcu() returning NULL in the first place. This also means that using RCU for control path is problematic, as surely the caller of this interface would prefer something that succeeds, even if this means waiting a bit in the kernel. Or are we willing to change ip command and make it restart failed syscalls ?
Re: [PATCH v3 bpf-next 00/10] bpf: per-cpu cgroup local storage
On 09/26/2018 01:33 PM, Roman Gushchin wrote: > This patchset implements per-cpu cgroup local storage and provides > an example how per-cpu and shared cgroup local storage can be used > for efficient accounting of network traffic. > > v3->v2: > 1) incorporated Song's feedback > 2) rebased on top of current bpf-next > > v2->v1: > 1) added a selftest implementing network counters > 2) added a missing free() in cgroup local storage selftest > > Roman Gushchin (10): > bpf: extend cgroup bpf core to allow multiple cgroup storage types > bpf: rework cgroup storage pointer passing > bpf: introduce per-cpu cgroup local storage > bpf: don't allow create maps of per-cpu cgroup local storages > bpf: sync include/uapi/linux/bpf.h to tools/include/uapi/linux/bpf.h > bpftool: add support for PERCPU_CGROUP_STORAGE maps > selftests/bpf: add verifier per-cpu cgroup storage tests > selftests/bpf: extend the storage test to test per-cpu cgroup storage > samples/bpf: extend test_cgrp2_attach2 test to use per-cpu cgroup > storage > selftests/bpf: cgroup local storage-based network counters > > include/linux/bpf-cgroup.h| 55 -- > include/linux/bpf.h | 12 +- > include/linux/bpf_types.h | 1 + > include/uapi/linux/bpf.h | 1 + > kernel/bpf/cgroup.c | 74 +--- > kernel/bpf/helpers.c | 25 ++- > kernel/bpf/local_storage.c| 167 +++--- > kernel/bpf/map_in_map.c | 3 +- > kernel/bpf/syscall.c | 20 ++- > kernel/bpf/verifier.c | 23 ++- > net/bpf/test_run.c| 20 ++- > samples/bpf/test_cgrp2_attach2.c | 19 +- > tools/bpf/bpftool/map.c | 4 +- > tools/include/uapi/linux/bpf.h| 1 + > tools/testing/selftests/bpf/Makefile | 6 +- > tools/testing/selftests/bpf/netcnt_common.h | 23 +++ > tools/testing/selftests/bpf/netcnt_prog.c | 71 > .../selftests/bpf/test_cgroup_storage.c | 60 ++- > tools/testing/selftests/bpf/test_netcnt.c | 153 > tools/testing/selftests/bpf/test_verifier.c | 139 ++- > 20 files changed, 778 insertions(+), 99 deletions(-) > create mode 100644 tools/testing/selftests/bpf/netcnt_common.h > 
create mode 100644 tools/testing/selftests/bpf/netcnt_prog.c > create mode 100644 tools/testing/selftests/bpf/test_netcnt.c > Applied to bpf-next, thanks Roman!
[PATCH net] net/ncsi: Extend NC-SI Netlink interface to allow user space to send NC-SI command
The new command (NCSI_CMD_SEND_CMD) is added to allow user space application to send NC-SI command to the network card. Also, add a new attribute (NCSI_ATTR_DATA) for transferring request and response. The work flow is as below. Request: User space application -> Netlink interface (msg) -> new Netlink handler - ncsi_send_cmd_nl() -> ncsi_xmit_cmd() Response: Response received - ncsi_rcv_rsp() -> internal response handler - ncsi_rsp_handler_xxx() -> ncsi_rsp_handler_netlink() -> ncsi_send_netlink_rsp () -> Netlink interface (msg) -> user space application Command timeout - ncsi_request_timeout() -> ncsi_send_netlink_timeout () -> Netlink interface (msg with zero data length) -> user space application Error: Error detected -> ncsi_send_netlink_err () -> Netlink interface (err msg) -> user space application Signed-off-by: Justin Lee --- include/uapi/linux/ncsi.h | 3 + net/ncsi/internal.h | 12 ++- net/ncsi/ncsi-aen.c | 10 ++- net/ncsi/ncsi-cmd.c | 106 net/ncsi/ncsi-manage.c| 74 ++--- net/ncsi/ncsi-netlink.c | 199 +- net/ncsi/ncsi-netlink.h | 4 + net/ncsi/ncsi-rsp.c | 70 ++-- 8 files changed, 420 insertions(+), 58 deletions(-) diff --git a/include/uapi/linux/ncsi.h b/include/uapi/linux/ncsi.h index 4c292ec..4992bfc 100644 --- a/include/uapi/linux/ncsi.h +++ b/include/uapi/linux/ncsi.h @@ -30,6 +30,7 @@ enum ncsi_nl_commands { NCSI_CMD_PKG_INFO, NCSI_CMD_SET_INTERFACE, NCSI_CMD_CLEAR_INTERFACE, + NCSI_CMD_SEND_CMD, __NCSI_CMD_AFTER_LAST, NCSI_CMD_MAX = __NCSI_CMD_AFTER_LAST - 1 @@ -43,6 +44,7 @@ enum ncsi_nl_commands { * @NCSI_ATTR_PACKAGE_LIST: nested array of NCSI_PKG_ATTR attributes * @NCSI_ATTR_PACKAGE_ID: package ID * @NCSI_ATTR_CHANNEL_ID: channel ID + * @NCSI_ATTR_DATA: command payload * @NCSI_ATTR_MAX: highest attribute number */ enum ncsi_nl_attrs { @@ -51,6 +53,7 @@ enum ncsi_nl_attrs { NCSI_ATTR_PACKAGE_LIST, NCSI_ATTR_PACKAGE_ID, NCSI_ATTR_CHANNEL_ID, + NCSI_ATTR_DATA, __NCSI_ATTR_AFTER_LAST, NCSI_ATTR_MAX = __NCSI_ATTR_AFTER_LAST - 1 diff --git 
a/net/ncsi/internal.h b/net/ncsi/internal.h index 8055e39..20ce735 100644 --- a/net/ncsi/internal.h +++ b/net/ncsi/internal.h @@ -215,12 +215,17 @@ struct ncsi_request { unsigned charid; /* Request ID - 0 to 255 */ bool used;/* Request that has been assigned */ unsigned int flags; /* NCSI request property */ -#define NCSI_REQ_FLAG_EVENT_DRIVEN 1 +#define NCSI_REQ_FLAG_EVENT_DRIVEN 1 +#define NCSI_REQ_FLAG_NETLINK_DRIVEN 2 struct ncsi_dev_priv *ndp;/* Associated NCSI device */ struct sk_buff *cmd;/* Associated NCSI command packet */ struct sk_buff *rsp;/* Associated NCSI response packet */ struct timer_listtimer; /* Timer on waiting for response */ bool enabled; /* Time has been enabled or not*/ + + u32 snd_seq; /* netlink sending sequence number */ + u32 snd_portid; /* netlink portid of sender*/ + struct nlmsghdr nlhdr; /* netlink message header */ }; enum { @@ -301,10 +306,13 @@ struct ncsi_cmd_arg { unsigned short payload; /* Command packet payload length */ unsigned int req_flags; /* NCSI request properties */ union { - unsigned char bytes[16]; /* Command packet specific data */ + unsigned char bytes[16]; /* Command packet specific data */ unsigned short words[8]; unsigned int dwords[4]; }; + + unsigned char*data; /* Netlink data */ + struct genl_info *info; /* Netlink information */ }; extern struct list_head ncsi_dev_list; diff --git a/net/ncsi/ncsi-aen.c b/net/ncsi/ncsi-aen.c index 25e483e..b5ec193 100644 --- a/net/ncsi/ncsi-aen.c +++ b/net/ncsi/ncsi-aen.c @@ -16,6 +16,7 @@ #include #include #include +#include #include "internal.h" #include "ncsi-pkt.h" @@
Re: [PATCH v3] PCI: Reprogram bridge prefetch registers on resume
[+cc LKML] On Tue, Sep 18, 2018 at 04:32:44PM -0500, Bjorn Helgaas wrote: > On Thu, Sep 13, 2018 at 11:37:45AM +0800, Daniel Drake wrote: > > On 38+ Intel-based Asus products, the nvidia GPU becomes unusable > > after S3 suspend/resume. The affected products include multiple > > generations of nvidia GPUs and Intel SoCs. After resume, nouveau logs > > many errors such as: > > > > fifo: fault 00 [READ] at 00555000 engine 00 [GR] client 04 > > [HUB/FE] reason 4a [] on channel -1 [007fa91000 unknown] > > DRM: failed to idle channel 0 [DRM] > > > > Similarly, the nvidia proprietary driver also fails after resume > > (black screen, 100% CPU usage in Xorg process). We shipped a sample > > to Nvidia for diagnosis, and their response indicated that it's a > > problem with the parent PCI bridge (on the Intel SoC), not the GPU. > > > > Runtime suspend/resume works fine, only S3 suspend is affected. > > > > We found a workaround: on resume, rewrite the Intel PCI bridge > > 'Prefetchable Base Upper 32 Bits' register (PCI_PREF_BASE_UPPER32). In > > the cases that I checked, this register has value 0 and we just have to > > rewrite that value. > > > > Linux already saves and restores PCI config space during suspend/resume, > > but this register was being skipped because upon resume, it already > > has value 0 (the correct, pre-suspend value). > > > > Intel appear to have previously acknowledged this behaviour and the > > requirement to rewrite this register. > > https://bugzilla.kernel.org/show_bug.cgi?id=116851#c23 > > > > Based on that, rewrite the prefetch register values even when that > > appears unnecessary. > > > > We have confirmed this solution on all the affected models we have > > in-hands (X542UQ, UX533FD, X530UN, V272UN). > > > > Additionally, this solves an issue where r8169 MSI-X interrupts were > > broken after S3 suspend/resume on Asus X441UAR. This issue was recently > > worked around in commit 7bb05b85bc2d ("r8169: don't use MSI-X on > > RTL8106e"). 
It also fixes the same issue on RTL6186evl/8111evl on an > > Aimfor-tech laptop that we had not yet patched. I suspect it will also > > fix the issue that was worked around in commit 7c53a722459c ("r8169: > > don't use MSI-X on RTL8168g"). > > > > Thomas Martitz reports that this change also solves an issue where > > the AMD Radeon Polaris 10 GPU on the HP Zbook 14u G5 is unresponsive > > after S3 suspend/resume. > > > > Link: https://bugzilla.kernel.org/show_bug.cgi?id=201069 > > Signed-off-by: Daniel Drake > > Applied with Rafael's and Peter's reviewed-by to pci/enumeration for v4.20. > Thanks for the the huge investigative effort! Since this looks low-risk and fixes several painful issues, I think this merits a stable tag and being included in v4.19 (instead of waiting for v4.20). I moved it to for-linus for v4.19. Let me know if you object. > > --- > > drivers/pci/pci.c | 25 + > > 1 file changed, 17 insertions(+), 8 deletions(-) > > > > diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c > > index 29ff9619b5fa..5d58220b6997 100644 > > --- a/drivers/pci/pci.c > > +++ b/drivers/pci/pci.c > > @@ -1289,12 +1289,12 @@ int pci_save_state(struct pci_dev *dev) > > EXPORT_SYMBOL(pci_save_state); > > > > static void pci_restore_config_dword(struct pci_dev *pdev, int offset, > > -u32 saved_val, int retry) > > +u32 saved_val, int retry, bool force) > > { > > u32 val; > > > > pci_read_config_dword(pdev, offset, &val); > > - if (val == saved_val) > > + if (!force && val == saved_val) > > return; > > > > for (;;) { > > @@ -1313,25 +1313,34 @@ static void pci_restore_config_dword(struct pci_dev > > *pdev, int offset, > > } > > > > static void pci_restore_config_space_range(struct pci_dev *pdev, > > - int start, int end, int retry) > > + int start, int end, int retry, > > + bool force) > > { > > int index; > > > > for (index = end; index >= start; index--) > > pci_restore_config_dword(pdev, 4 * index, > > pdev->saved_config_space[index], > > -retry); > > +retry, force); > > } > 
> > > static void pci_restore_config_space(struct pci_dev *pdev) > > { > > if (pdev->hdr_type == PCI_HEADER_TYPE_NORMAL) { > > - pci_restore_config_space_range(pdev, 10, 15, 0); > > + pci_restore_config_space_range(pdev, 10, 15, 0, false); > > /* Restore BARs before the command register. */ > > - pci_restore_config_space_range(pdev, 4, 9, 10); > > - pci_restore_config_space_range(pdev, 0, 3, 0); > > + pci_restore_config_space_range(pdev, 4, 9, 10, false); > > + pci_restore_config_space_range(pdev, 0, 3,
Re: [PATCH] netfilter: check if the socket netns is correct.
Hi Flavio, On Wed, Jun 27, 2018 at 10:34:25AM -0300, Flavio Leitner wrote: > Netfilter assumes that if the socket is present in the skb, then > it can be used because that reference is cleaned up while the skb > is crossing netns. > > We want to change that to preserve the socket reference in a future > patch, so this is a preparation updating netfilter to check if the > socket netns matches before use it. > > Signed-off-by: Flavio Leitner > Acked-by: Florian Westphal > Signed-off-by: David S. Miller > --- ... > --- a/net/netfilter/xt_socket.c > +++ b/net/netfilter/xt_socket.c > @@ -56,8 +56,12 @@ socket_match(const struct sk_buff *skb, struct > xt_action_param *par, > struct sk_buff *pskb = (struct sk_buff *)skb; > struct sock *sk = skb->sk; > > + if (!net_eq(xt_net(par), sock_net(sk))) > + sk = NULL; > + I am having trouble with this code. With CONFIG_NET_NS enabled, it crashes for me in read_pnet() because sk is NULL. > if (!sk) > sk = nf_sk_lookup_slow_v4(xt_net(par), skb, xt_in(par)); The old code seems to suggest that sk == NULL was possible. I see the problem with the Chrome OS kernel rebased to v4.19-rc5, so I can not guarantee that this really an upstream problem. The change seems odd, though. Are you sure that it is not (or, rather, no longer) necessary to check if sk == NULL before dereferencing it in sock_net() ? > + > if (sk) { > bool wildcard; > bool transparent = true; > @@ -113,8 +117,12 @@ socket_mt6_v1_v2_v3(const struct sk_buff *skb, struct > xt_action_param *par) > struct sk_buff *pskb = (struct sk_buff *)skb; > struct sock *sk = skb->sk; > > + if (!net_eq(xt_net(par), sock_net(sk))) > + sk = NULL; > + Same here. > if (!sk) > sk = nf_sk_lookup_slow_v6(xt_net(par), skb, xt_in(par)); > + > if (sk) { > bool wildcard; > bool transparent = true; Thanks, Guenter
[Patch net-next] net_sched: fix a crash in tc_new_tfilter()
When tcf_block_find() fails, it already rolls back the qdisc refcnt, so its caller doesn't need to clean this up again. Avoid calling qdisc_put() again by resetting qdisc to NULL for callers. Reported-by: syzbot+37b8770e6d5a8220a...@syzkaller.appspotmail.com Fixes: e368fdb61d8e ("net: sched: use Qdisc rcu API instead of relying on rtnl lock") Signed-off-by: Cong Wang --- net/sched/cls_api.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index 8dd7f8af6d54..a4167ec0a220 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -717,8 +717,10 @@ static struct tcf_block *tcf_block_find(struct net *net, struct Qdisc **q, errout_rcu: rcu_read_unlock(); errout_qdisc: - if (*q) + if (*q) { qdisc_put(*q); + *q = NULL; + } return ERR_PTR(err); } -- 2.14.4
[Patch net-next] net_sched: fix an extack message in tcf_block_find()
It is clearly a copy-n-paste. Signed-off-by: Cong Wang --- net/sched/cls_api.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index 3de47e99b788..8dd7f8af6d54 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -655,7 +655,7 @@ static struct tcf_block *tcf_block_find(struct net *net, struct Qdisc **q, *q = qdisc_refcount_inc_nz(*q); if (!*q) { - NL_SET_ERR_MSG(extack, "Parent Qdisc doesn't exists"); + NL_SET_ERR_MSG(extack, "Can't increase Qdisc refcount"); err = -EINVAL; goto errout_rcu; } -- 2.14.4
Re: [PATCH v2 07/22] soc/fsl/bman_portals: defer probe after bman's probe
On Wed, Sep 26, 2018 at 8:26 AM wrote: > > From: Laurentiu Tudor > > A crash in bman portal probing could not be triggered (as is the case > with qman portals) but it does make calls [1] into the bman driver so > lets make sure the bman portal probing happens after bman's. > > [1] bman_p_irqsource_add() (in bman) called by: >init_pcfg() called by: > bman_portal_probe() > > Signed-off-by: Laurentiu Tudor As this is part of a bug fix for v4.19, applied on soc/fsl for fix. > --- > drivers/soc/fsl/qbman/bman_portal.c | 10 +- > 1 file changed, 9 insertions(+), 1 deletion(-) > > diff --git a/drivers/soc/fsl/qbman/bman_portal.c > b/drivers/soc/fsl/qbman/bman_portal.c > index 2f71f7df3465..f9edd28894fd 100644 > --- a/drivers/soc/fsl/qbman/bman_portal.c > +++ b/drivers/soc/fsl/qbman/bman_portal.c > @@ -91,7 +91,15 @@ static int bman_portal_probe(struct platform_device *pdev) > struct device_node *node = dev->of_node; > struct bm_portal_config *pcfg; > struct resource *addr_phys[2]; > - int irq, cpu; > + int irq, cpu, err; > + > + err = bman_is_probed(); > + if (!err) > + return -EPROBE_DEFER; > + if (err < 0) { > + dev_err(&pdev->dev, "failing probe due to bman probe > error\n"); > + return -ENODEV; > + } > > pcfg = devm_kmalloc(dev, sizeof(*pcfg), GFP_KERNEL); > if (!pcfg) > -- > 2.17.1 >
Re: [PATCH v2 06/22] soc/fsl/qman_portals: defer probe after qman's probe
On Wed, Sep 26, 2018 at 8:26 AM wrote: > > From: Laurentiu Tudor > > Defer probe of qman portals after qman probing. This fixes the crash > below, seen on NXP LS1043A SoCs: > > Unable to handle kernel NULL pointer dereference at virtual address > 0004 > Mem abort info: > ESR = 0x9604 > Exception class = DABT (current EL), IL = 32 bits > SET = 0, FnV = 0 > EA = 0, S1PTW = 0 > Data abort info: > ISV = 0, ISS = 0x0004 > CM = 0, WnR = 0 > [0004] user address but active_mm is swapper > Internal error: Oops: 9604 [#1] PREEMPT SMP > Modules linked in: > CPU: 0 PID: 1 Comm: swapper/0 Not tainted > 4.18.0-rc1-next-20180622-00200-g986f5c179185 #9 > Hardware name: LS1043A RDB Board (DT) > pstate: 8005 (Nzcv daif -PAN -UAO) > pc : qman_set_sdest+0x74/0xa0 > lr : qman_portal_probe+0x22c/0x470 > sp : 0803bbc0 > x29: 0803bbc0 x28: > x27: 090c1b88 x26: 0927cb68 > x25: 0927c000 x24: 0927cb60 > x23: x22: > x21: 090e9000 x20: 800073b5c810 > x19: 800027401298 x18: > x17: 0001 x16: > x15: 090e96c8 x14: 80002740138a > x13: 090f2000 x12: 0030 > x11: 08f25000 x10: > x9 : 80007bdfd2c0 x8 : 4000 > x7 : 80007393cc18 x6 : 0041 > x5 : x4 : > x3 : 0004 x2 : 0927c900 > x1 : x0 : 0004 > Process swapper/0 (pid: 1, stack limit = 0x(ptrval)) > Call trace: > qman_set_sdest+0x74/0xa0 > platform_drv_probe+0x50/0xa8 > driver_probe_device+0x214/0x2f8 > __driver_attach+0xd8/0xe0 > bus_for_each_dev+0x68/0xc8 > driver_attach+0x20/0x28 > bus_add_driver+0x108/0x228 > driver_register+0x60/0x110 > __platform_driver_register+0x40/0x48 > qman_portal_driver_init+0x20/0x84 > do_one_initcall+0x58/0x168 > kernel_init_freeable+0x184/0x22c > kernel_init+0x10/0x108 > ret_from_fork+0x10/0x18 > Code: f9400443 11001000 927e4800 8b63 (b9400063) > ---[ end trace 4f6d50489ecfb930 ]--- > Kernel panic - not syncing: Attempted to kill init! exitcode=0x000b > > Signed-off-by: Laurentiu Tudor As this is part of a bug fix for v4.19, applied on soc/fsl for fix. 
> --- > drivers/soc/fsl/qbman/qman_portal.c | 8 > 1 file changed, 8 insertions(+) > > diff --git a/drivers/soc/fsl/qbman/qman_portal.c > b/drivers/soc/fsl/qbman/qman_portal.c > index 6d9da3b1b5ad..eef93cab84f1 100644 > --- a/drivers/soc/fsl/qbman/qman_portal.c > +++ b/drivers/soc/fsl/qbman/qman_portal.c > @@ -229,6 +229,14 @@ static int qman_portal_probe(struct platform_device > *pdev) > int irq, cpu, err; > u32 val; > > + err = qman_is_probed(); > + if (!err) > + return -EPROBE_DEFER; > + if (err < 0) { > + dev_err(&pdev->dev, "failing probe due to qman probe > error\n"); > + return -ENODEV; > + } > + > pcfg = devm_kmalloc(dev, sizeof(*pcfg), GFP_KERNEL); > if (!pcfg) > return -ENOMEM; > -- > 2.17.1 >
Re: [PATCH v2 05/22] soc/fsl/qbman: add APIs to retrieve the probing status
On Wed, Sep 26, 2018 at 8:26 AM wrote: > > From: Laurentiu Tudor > > Add a couple of new APIs to check the probing status of qman and bman: > 'int bman_is_probed()' and 'int qman_is_probed()'. > They return the following values. > * 1 if qman/bman were probed correctly > * 0 if qman/bman were not yet probed > * -1 if probing of qman/bman failed > Drivers that use qman/bman driver services are required to use these > APIs before calling any functions exported by qman or bman drivers > or otherwise they will crash the kernel. > The APIs will be used in the following couple of qbman portal patches > and later in the series in the dpaa1 ethernet driver. > > Signed-off-by: Laurentiu Tudor As this is part of a bug fix for v4.19, applied on soc/fsl for fix. > --- > drivers/soc/fsl/qbman/bman_ccsr.c | 11 +++ > drivers/soc/fsl/qbman/qman_ccsr.c | 11 +++ > include/soc/fsl/bman.h| 8 > include/soc/fsl/qman.h| 8 > 4 files changed, 38 insertions(+) > > diff --git a/drivers/soc/fsl/qbman/bman_ccsr.c > b/drivers/soc/fsl/qbman/bman_ccsr.c > index d180da003e4a..b209c79511bb 100644 > --- a/drivers/soc/fsl/qbman/bman_ccsr.c > +++ b/drivers/soc/fsl/qbman/bman_ccsr.c > @@ -121,6 +121,7 @@ static void bm_set_memory(u64 ba, u32 size) > */ > static dma_addr_t fbpr_a; > static size_t fbpr_sz; > +static int __bman_probed; > > static int bman_fbpr(struct reserved_mem *rmem) > { > @@ -167,6 +168,12 @@ static irqreturn_t bman_isr(int irq, void *ptr) > return IRQ_HANDLED; > } > > +int bman_is_probed(void) > +{ > + return __bman_probed; > +} > +EXPORT_SYMBOL_GPL(bman_is_probed); > + > static int fsl_bman_probe(struct platform_device *pdev) > { > int ret, err_irq; > @@ -177,6 +184,8 @@ static int fsl_bman_probe(struct platform_device *pdev) > u16 id, bm_pool_cnt; > u8 major, minor; > > + __bman_probed = -1; > + > res = platform_get_resource(pdev, IORESOURCE_MEM, 0); > if (!res) { > dev_err(dev, "Can't get %pOF property 'IORESOURCE_MEM'\n", > @@ -266,6 +275,8 @@ static int fsl_bman_probe(struct 
platform_device *pdev) > return ret; > } > > + __bman_probed = 1; > + > return 0; > }; > > diff --git a/drivers/soc/fsl/qbman/qman_ccsr.c > b/drivers/soc/fsl/qbman/qman_ccsr.c > index 0cfe79f85a66..383a49dcce68 100644 > --- a/drivers/soc/fsl/qbman/qman_ccsr.c > +++ b/drivers/soc/fsl/qbman/qman_ccsr.c > @@ -274,6 +274,7 @@ static const struct qman_error_info_mdata error_mdata[] = > { > static u32 __iomem *qm_ccsr_start; > /* A SDQCR mask comprising all the available/visible pool channels */ > static u32 qm_pools_sdqcr; > +static int __qman_probed; > > static inline u32 qm_ccsr_in(u32 offset) > { > @@ -689,6 +690,12 @@ static int qman_resource_init(struct device *dev) > return 0; > } > > +int qman_is_probed(void) > +{ > + return __qman_probed; > +} > +EXPORT_SYMBOL_GPL(qman_is_probed); > + > static int fsl_qman_probe(struct platform_device *pdev) > { > struct device *dev = &pdev->dev; > @@ -699,6 +706,8 @@ static int fsl_qman_probe(struct platform_device *pdev) > u16 id; > u8 major, minor; > > + __qman_probed = -1; > + > res = platform_get_resource(pdev, IORESOURCE_MEM, 0); > if (!res) { > dev_err(dev, "Can't get %pOF property 'IORESOURCE_MEM'\n", > @@ -845,6 +854,8 @@ static int fsl_qman_probe(struct platform_device *pdev) > if (ret) > return ret; > > + __qman_probed = 1; > + > return 0; > } > > diff --git a/include/soc/fsl/bman.h b/include/soc/fsl/bman.h > index eaaf56df4086..5b99cb2ea5ef 100644 > --- a/include/soc/fsl/bman.h > +++ b/include/soc/fsl/bman.h > @@ -126,4 +126,12 @@ int bman_release(struct bman_pool *pool, const struct > bm_buffer *bufs, u8 num); > */ > int bman_acquire(struct bman_pool *pool, struct bm_buffer *bufs, u8 num); > > +/** > + * bman_is_probed - Check if bman is probed > + * > + * Returns 1 if the bman driver successfully probed, -1 if the bman driver > + * failed to probe or 0 if the bman driver did not probed yet. 
> + */ > +int bman_is_probed(void); > + > #endif /* __FSL_BMAN_H */ > diff --git a/include/soc/fsl/qman.h b/include/soc/fsl/qman.h > index d4dfefdee6c1..597783b8a3a0 100644 > --- a/include/soc/fsl/qman.h > +++ b/include/soc/fsl/qman.h > @@ -1186,4 +1186,12 @@ int qman_alloc_cgrid_range(u32 *result, u32 count); > */ > int qman_release_cgrid(u32 id); > > +/** > + * qman_is_probed - Check if qman is probed > + * > + * Returns 1 if the qman driver successfully probed, -1 if the qman driver > + * failed to probe or 0 if the qman driver did not probed yet. > + */ > +int qman_is_probed(void); > + > #endif /* __FSL_QMAN_H */ > -- > 2.17.1 >
Re: [PATCH net-next 0/7] rtnetlink: add RTM_GETADDR2
On September 27, 2018 10:24:36 PM GMT+02:00, David Ahern wrote: >On 9/27/18 11:58 AM, Christian Brauner wrote: >> Various userspace programs (e.g. iproute2) have sent RTM_GETADDR >> requests with struct ifinfomsg. This is wrong and should have been >> struct ifaddrmsg all along as mandated by the manpages. However, dump >> requests so far didn't parse the netlink message that was sent and >> succeeded even when a wrong struct was passed along. > >... > >> The correct solution at this point seems to me to introduce a new >> RTM_GETADDR2 request. This way we can parse the message and fail hard if >> the struct is not struct ifaddrmsg and can safely extend it in the >> future. Userspace tools that rely on the buggy RTM_GETADDR API will >> still keep working without even having to see any log messages and new >> userspace tools that want to make use of new features can make use of >> the new RTM_GETADDR2 requests. > >First, I think this is the wrong precedent when all we need is a single >bit flag that userspace can use to tell the kernel "I have a clue and I >am passing in the proper header for this dump request". That had been NAKed previously but if you have an idea that will be accepted all the more power to you. > >Second, you are not addressing the problems of the past by requiring the >proper header and checking values passed in it. I don't follow. RTM_GETADDR requests are absolutely unchanged. The full legacy behavior is restored by this patchset. And requiring that RTM_GETADDR2 requests always pass the correct header is absolutely fine. We don't want to build invalid legacy behavior into a new request type. > >I have another idea. I'll send an RFC patch soon.
Re: [PATCH net-next 0/7] rtnetlink: add RTM_GETADDR2
On 9/27/18 11:58 AM, Christian Brauner wrote: > Various userspace programs (e.g. iproute2) have sent RTM_GETADDR > requests with struct ifinfomsg. This is wrong and should have been > struct ifaddrmsg all along as mandated by the manpages. However, dump > requests so far didn't parse the netlink message that was sent and > succeeded even when a wrong struct was passed along. ... > The correct solution at this point seems to me to introduce a new > RTM_GETADDR2 request. This way we can parse the message and fail hard if > the struct is not struct ifaddrmsg and can safely extend it in the > future. Userspace tools that rely on the buggy RTM_GETADDR API will > still keep working without even having to see any log messages and new > userspace tools that want to make use of new features can make use of > the new RTM_GETADDR2 requests. First, I think this is the wrong precedent when all we need is a single bit flag that userspace can use to tell the kernel "I have a clue and I am passing in the proper header for this dump request". Second, you are not addressing the problems of the past by requiring the proper header and checking values passed in it. I have another idea. I'll send an RFC patch soon.
RE: bug: 'ethtool -m' reports spurious alarm & warning threshold values for QSFP28 transceivers
Update for posterity- Mellanox support provided a work-around of using mlxcables instead of ethtool to read alarm/warning info for an installed transceiver. I was told that a couple of their engineers are currently looking into the discrepancy between threshold reporting by mlxcables and ethtool, and that they are deciding what to do about it... Work-around steps: 1. add a cable with "sudo mst cable add". 2. find the cable name with "sudo mlxcables". The name of my cable is 01:00.0_cable_0 so I copy that name for insertion into the next command. 3. probe the cable for DDM with "sudo mlxcables -d 01:00.0_cable_0 --DDM". Example copied/pasted from my CLI here. All reported thresholds appear to be correct. tech1@D7:~$ tech1@D7:~$ tech1@D7:~$ sudo mst cable add -I- Added 1 cable devices .. tech1@D7:~$ sudo mlxcables Querying Cables Cable #1: - Cable name: 01:00.0_cable_0 >> No FW data to show Cable EEPROM Identifier: QSFP28 (11h) Technology: 850 nm VCSEL (00h) Compliance: Extended Specification Compliance is valid, 100GBASE-SR4 or 25GBASE-SR Wavelength: 850 nm OUI : 0x00c0f2 Vendor: TRANSITION Serial number : TN02000263 Part number : TN-QSFP-100G-SR4 Revision : 02 Temperature : 34 C Length: 50 m tech1@D7:~$ sudo mlxcables -d 01:00.0_cable_0 --DDM Cable DDM: -- Temperature: 34C Voltage: 3.2918V Channel 1: RX Power : 0.1695dBm TX Power : 0.8622dBm TX Bias : 7.0720mA Channel 2: RX Power : 0.1355dBm TX Power : 1.1042dBm TX Bias : 6.9240mA Channel 3: RX Power : -0.1592dBm TX Power : 0.6547dBm TX Bias : 6.9420mA Channel 4: RX Power : -0.1300dBm TX Power : 0.4653dBm TX Bias : 6.9120mA - Thresholds - Temperature: High Warning : 70C Low Warning : 0C High Alarm: 75C Low Alarm: -5C Warning mask : 0 Alarm mask: 0 Voltage: High Warning : 3.4600V Low Warning : 3.1300V High Alarm : 3.6300V Low Alarm : 2.9700V Warning mask : 0 Alarm mask : 0 Channel 1: RX Power high warn : 2.4000dBm RX Power low warn : -9.5001dBm RX Power high alarm : 5.4103dBm RX Power low alarm : -12.5104dBm RX 
Power Warning mask: 0 RX Power Alarm mask : 0 TX Power high warn : 2.4000dBm TX Power low warn : -7.6020dBm TX Power high alarm : 3.1917dBm TX Power low alarm : -8.5699dBm TX Power Warning mask: 0 TX Power Alarm mask : 0 TX Bias high warn: 12.mA TX Bias low warn: 2.mA TX Bias high alarm : 15.mA TX Bias low alarm : 1.mA TX Bias Warning mask : 0 TX Bias Alarm mask : 0 Channel 2: RX Power high warn : 2.4000dBm RX Power low warn : -9.5001dBm RX Power high alarm : 5.4103dBm RX Power low alarm : -12.5104dBm RX Power Warning mask: 0 RX Power Alarm mask : 0 TX Power high warn : 2.4000dBm TX Power low warn : -7.6020dBm TX Power high alarm : 3.1917dBm TX Power low alarm : -8.5699dBm TX Power Warning mask: 0 TX Power Alarm mask : 0 TX Bias high warn: 12.mA TX Bias low warn: 2.mA TX Bias high alarm : 15.mA TX Bias low alarm : 1.mA TX Bias Warning mask : 0 TX Bias Alarm mask : 0 Channel 3: RX Power high warn : 2.4000dBm RX Power low warn : -9.5001dBm RX Power high alarm : 5.4103dBm RX Power low alarm : -12.5104dBm RX Power Warning mask: 0 RX Power Alarm mask : 0 TX Power high warn : 2.4000dBm TX Power low warn : -7.6020dBm TX Power high alarm : 3.1917dBm TX Power low alarm : -8.5699dBm TX Power Warning mask: 0 TX Power Alarm mask : 0 TX Bias high warn: 12.mA TX Bias low warn: 2.mA TX Bias high alarm : 15.mA TX Bias low alarm : 1.mA TX Bias Warning mask : 0 TX Bias Alarm mask : 0 Channel 4: RX Power high warn : 2.4000dBm RX Power low warn : -9.5001dBm RX Power high alarm : 5.4103dBm RX Power low alarm : -12.5104dBm RX Power Warning mask: 0 RX Power Alarm mask : 0 TX Power high warn : 2.4000dBm TX Power low warn : -7.6020dBm TX Power high alarm : 3.1917dBm TX Power low alarm : -8.5699dBm TX Power Warning mask: 0 TX Power Alarm mask : 0 TX Bias high warn: 12.mA TX Bias low warn: 2.mA TX Bias high alarm : 15.mA TX Bias low alarm : 1.mA TX Bias Warning mask : 0
Re: [PATCH v2 08/22] soc/fsl/qbman_portals: add APIs to retrieve the probing status
On Wed, Sep 26, 2018 at 8:26 AM wrote: > > From: Laurentiu Tudor > > Add a couple of new APIs to check the probing status of the required > cpu bound qman and bman portals: > 'int bman_portals_probed()' and 'int qman_portals_probed()'. > They return the following values. > * 1 if qman/bman portals were all probed correctly > * 0 if qman/bman portals were not yet probed > * -1 if probing of qman/bman portals failed > Drivers that use qman/bman portal driver services are required to use > these APIs before calling any functions exported by these drivers or > otherwise they will crash the kernel. > First user will be the dpaa1 ethernet driver, coming in a subsequent > patch. > > Signed-off-by: Laurentiu Tudor > --- > drivers/soc/fsl/qbman/bman_portal.c | 10 ++ > drivers/soc/fsl/qbman/qman_portal.c | 10 ++ > include/soc/fsl/bman.h | 8 > include/soc/fsl/qman.h | 9 + > 4 files changed, 37 insertions(+) > > diff --git a/drivers/soc/fsl/qbman/bman_portal.c > b/drivers/soc/fsl/qbman/bman_portal.c > index f9edd28894fd..8048d35de8a2 100644 > --- a/drivers/soc/fsl/qbman/bman_portal.c > +++ b/drivers/soc/fsl/qbman/bman_portal.c > @@ -32,6 +32,7 @@ > > static struct bman_portal *affine_bportals[NR_CPUS]; > static struct cpumask portal_cpus; > +static int __bman_portals_probed; > /* protect bman global registers and global data shared among portals */ > static DEFINE_SPINLOCK(bman_lock); > > @@ -85,6 +86,12 @@ static int bman_online_cpu(unsigned int cpu) > return 0; > } > > +int bman_portals_probed(void) > +{ > + return __bman_portals_probed; > +} > +EXPORT_SYMBOL_GPL(bman_portals_probed); > + > static int bman_portal_probe(struct platform_device *pdev) > { > struct device *dev = &pdev->dev; > @@ -148,6 +155,7 @@ static int bman_portal_probe(struct platform_device *pdev) > spin_lock(&bman_lock); > cpu = cpumask_next_zero(-1, &portal_cpus); > if (cpu >= nr_cpu_ids) { > + __bman_portals_probed = 1; What if the last CPU is not used for portals? 
Is there a hard requirement that all CPUs need to be used for portals? What happens if the last CPU is offline? > /* unassigned portal, skip init */ > spin_unlock(&bman_lock); > return 0; > @@ -173,6 +181,8 @@ static int bman_portal_probe(struct platform_device *pdev) > err_ioremap2: > memunmap(pcfg->addr_virt_ce); > err_ioremap1: > +__bman_portals_probed = 1; > + There are other error paths that are not covered. > return -ENXIO; > } > > diff --git a/drivers/soc/fsl/qbman/qman_portal.c > b/drivers/soc/fsl/qbman/qman_portal.c > index eef93cab84f1..1b2fc981c269 100644 > --- a/drivers/soc/fsl/qbman/qman_portal.c > +++ b/drivers/soc/fsl/qbman/qman_portal.c > @@ -39,6 +39,7 @@ EXPORT_SYMBOL(qman_dma_portal); > #define CONFIG_FSL_DPA_PIRQ_FAST 1 > > static struct cpumask portal_cpus; > +static int __qman_portals_probed; > /* protect qman global registers and global data shared among portals */ > static DEFINE_SPINLOCK(qman_lock); > > @@ -219,6 +220,12 @@ static int qman_online_cpu(unsigned int cpu) > return 0; > } > > +int qman_portals_probed(void) > +{ > + return __qman_portals_probed; > +} > +EXPORT_SYMBOL_GPL(qman_portals_probed); > + > static int qman_portal_probe(struct platform_device *pdev) > { > struct device *dev = &pdev->dev; > @@ -306,6 +313,7 @@ static int qman_portal_probe(struct platform_device *pdev) > spin_lock(&qman_lock); > cpu = cpumask_next_zero(-1, &portal_cpus); > if (cpu >= nr_cpu_ids) { > + __qman_portals_probed = 1; Ditto. > /* unassigned portal, skip init */ > spin_unlock(&qman_lock); > return 0; > @@ -336,6 +344,8 @@ static int qman_portal_probe(struct platform_device *pdev) > err_ioremap2: > memunmap(pcfg->addr_virt_ce); > err_ioremap1: > + __qman_portals_probed = -1; > + Ditto. 
> return -ENXIO; > } > > diff --git a/include/soc/fsl/bman.h b/include/soc/fsl/bman.h > index 5b99cb2ea5ef..173e4049d963 100644 > --- a/include/soc/fsl/bman.h > +++ b/include/soc/fsl/bman.h > @@ -133,5 +133,13 @@ int bman_acquire(struct bman_pool *pool, struct > bm_buffer *bufs, u8 num); > * failed to probe or 0 if the bman driver did not probed yet. > */ > int bman_is_probed(void); > +/** > + * bman_portals_probed - Check if all cpu bound bman portals are probed > + * > + * Returns 1 if all the required cpu bound bman portals successfully probed, > + * -1 if probe errors appeared or 0 if the bman portals did not yet finished > + * probing. > + */ > +int bman_portals_probed(void); > > #endif /* __FSL_BMAN_H */ > diff --git a/include/soc/fsl/qman.h b/include/soc/fsl/qman.h > index 597783b8a3a0..7732e48081eb 100644 > --- a/include/soc/fsl/qman.h > +++ b/include/soc/fsl/qman.h > @@ -1194,4 +1194,13 @@ int qman_release_cgrid(u
[PATCH net-next] net: nixge: Address compiler warnings when building for i386
Address compiler warning reported by kbuild autobuilders when building for i386 as a result of dma_addr_t size on different architectures. warning: cast to pointer from integer of different size [-Wint-to-pointer-cast] Fixes: 7e8d5755be0e ("net: nixge: Add support for 64-bit platforms") Signed-off-by: Moritz Fischer Cc: Arnd Bergmann --- drivers/net/ethernet/ni/nixge.c | 14 +++--- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/drivers/net/ethernet/ni/nixge.c b/drivers/net/ethernet/ni/nixge.c index 74cf52e3fb09..0611f2335b4a 100644 --- a/drivers/net/ethernet/ni/nixge.c +++ b/drivers/net/ethernet/ni/nixge.c @@ -127,8 +127,8 @@ struct nixge_hw_dma_bd { #ifdef CONFIG_PHYS_ADDR_T_64BIT #define nixge_hw_dma_bd_set_addr(bd, field, addr) \ do { \ - (bd)->field##_lo = lower_32_bits(((u64)addr)); \ - (bd)->field##_hi = upper_32_bits(((u64)addr)); \ + (bd)->field##_lo = lower_32_bits((addr)); \ + (bd)->field##_hi = upper_32_bits((addr)); \ } while (0) #else #define nixge_hw_dma_bd_set_addr(bd, field, addr) \ @@ -251,7 +251,7 @@ static void nixge_hw_dma_bd_release(struct net_device *ndev) NIXGE_MAX_JUMBO_FRAME_SIZE, DMA_FROM_DEVICE); - skb = (struct sk_buff *) + skb = (struct sk_buff *)(uintptr_t) nixge_hw_dma_bd_get_addr(&priv->rx_bd_v[i], sw_id_offset); dev_kfree_skb(skb); @@ -323,7 +323,7 @@ static int nixge_hw_dma_bd_init(struct net_device *ndev) if (!skb) goto out; - nixge_hw_dma_bd_set_offset(&priv->rx_bd_v[i], skb); + nixge_hw_dma_bd_set_offset(&priv->rx_bd_v[i], (uintptr_t)skb); phys = dma_map_single(ndev->dev.parent, skb->data, NIXGE_MAX_JUMBO_FRAME_SIZE, DMA_FROM_DEVICE); @@ -601,8 +601,8 @@ static int nixge_recv(struct net_device *ndev, int budget) tail_p = priv->rx_bd_p + sizeof(*priv->rx_bd_v) * priv->rx_bd_ci; - skb = (struct sk_buff *)nixge_hw_dma_bd_get_addr(cur_p, -sw_id_offset); + skb = (struct sk_buff *)(uintptr_t) + nixge_hw_dma_bd_get_addr(cur_p, sw_id_offset); length = cur_p->status & XAXIDMA_BD_STS_ACTUAL_LEN_MASK; if (length > 
NIXGE_MAX_JUMBO_FRAME_SIZE) @@ -643,7 +643,7 @@ static int nixge_recv(struct net_device *ndev, int budget) nixge_hw_dma_bd_set_phys(cur_p, cur_phys); cur_p->cntrl = NIXGE_MAX_JUMBO_FRAME_SIZE; cur_p->status = 0; - nixge_hw_dma_bd_set_offset(cur_p, new_skb); + nixge_hw_dma_bd_set_offset(cur_p, (uintptr_t)new_skb); ++priv->rx_bd_ci; priv->rx_bd_ci %= RX_BD_NUM; -- 2.19.0
Re: [PATCH v2 net-next 2/2] dt-bindings: net: add support for Microchip KSZ9131 Ethernet PHY
On Thu, Sep 27, 2018 at 04:16:55PM -0400, Yuiko Oshino wrote: > Add support for Microchip Technology KSZ9131 10/100/1000 Ethernet PHY > > Signed-off-by: Yuiko Oshino > --- > .../devicetree/bindings/net/micrel-ksz90x1.txt | 29 > +- > 1 file changed, 28 insertions(+), 1 deletion(-) > > diff --git a/Documentation/devicetree/bindings/net/micrel-ksz90x1.txt > b/Documentation/devicetree/bindings/net/micrel-ksz90x1.txt > index e22d8cf..d23d14a 100644 > --- a/Documentation/devicetree/bindings/net/micrel-ksz90x1.txt > +++ b/Documentation/devicetree/bindings/net/micrel-ksz90x1.txt > @@ -1,4 +1,4 @@ > -Micrel KSZ9021/KSZ9031 Gigabit Ethernet PHY > +Micrel/Microchip KSZ9021/KSZ9031/KSZ9131 Gigabit Ethernet PHY > > Some boards require special tuning values, particularly when it comes > to clock delays. You can specify clock delay values in the PHY OF > @@ -64,6 +64,33 @@ KSZ9031: > Attention: The link partner must be configurable as slave otherwise > no link will be established. > > +KSZ9131: > + > + All skew control options are specified in picoseconds. The minimum > + value is 0, and the maximum is property-dependent. The increment > + step is 100ps. The default value is the neutral setting, so setting > + rxc-skew-ps=<0> actually results in -700 picoseconds adjustment. We also have: KSZ9021: All skew control options are specified in picoseconds. The minimum value is 0, the maximum value is 3000, and it is incremented by 200ps steps. and KSZ9031: All skew control options are specified in picoseconds. The minimum value is 0, and the maximum is property-dependent. The increment step is 60ps. The default value is the neutral setting, so setting rxc-skew-ps=<0> actually results in -900 picoseconds adjustment. So this is the third meaning of 0. How about making 0 mean 0. Have the range being -700 to 1800 and -700 to 800? KSZ9031 and KSZ9131 then use the same meaning of 0, with 0 actually meaning 0! Andrew
Re: [PATCH net-next] tcp: up initial rmem to 128KB and SYN rwin to around 64KB
On Thu, Sep 27, 2018 at 11:21 AM, Yuchung Cheng wrote: > Previously TCP initial receive buffer is ~87KB by default and > the initial receive window is ~29KB (20 MSS). This patch changes > the two numbers to 128KB and ~64KB (rounding down to the multiples > of MSS) respectively. The patch also simplifies the calculations s.t. > the two numbers are directly controlled by sysctl tcp_rmem[1]: > > 1) Initial receiver buffer budget (sk_rcvbuf): while this should > be configured via sysctl tcp_rmem[1], previously tcp_fixup_rcvbuf() > always override and set a larger size when a new connection > establishes. > > 2) Initial receive window in SYN: previously it is set to 20 > packets if MSS <= 1460. The number 20 was based on the initial > congestion window of 10: the receiver needs twice amount to > avoid being limited by the receive window upon out-of-order > delivery in the first window burst. But since this only > applies if the receiving MSS <= 1460, connection using large MTU > (e.g. to utilize receiver zero-copy) may be limited by the > receive window. > > With this patch TCP memory configuration is more straight-forward and > more properly sized to modern high-speed networks by default. Several > popular stacks have been announcing 64KB rwin in SYNs as well. Sorry please ignore this patch for now. We need to adjust rbuf autotuning as well otherwise w/ larger init rbuf it may increase too slowly during slow start. 
Will submit a v2 > > Signed-off-by: Yuchung Cheng > Signed-off-by: Wei Wang > Signed-off-by: Neal Cardwell > Signed-off-by: Eric Dumazet > Reviewed-by: Soheil Hassas Yeganeh > --- > net/ipv4/tcp.c| 4 ++-- > net/ipv4/tcp_input.c | 25 ++--- > net/ipv4/tcp_output.c | 25 - > 3 files changed, 8 insertions(+), 46 deletions(-) > > diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c > index 69c236943f56..dcf51fbf5ec7 100644 > --- a/net/ipv4/tcp.c > +++ b/net/ipv4/tcp.c > @@ -3896,8 +3896,8 @@ void __init tcp_init(void) > init_net.ipv4.sysctl_tcp_wmem[2] = max(64*1024, max_wshare); > > init_net.ipv4.sysctl_tcp_rmem[0] = SK_MEM_QUANTUM; > - init_net.ipv4.sysctl_tcp_rmem[1] = 87380; > - init_net.ipv4.sysctl_tcp_rmem[2] = max(87380, max_rshare); > + init_net.ipv4.sysctl_tcp_rmem[1] = 131072; > + init_net.ipv4.sysctl_tcp_rmem[2] = max(131072, max_rshare); > > pr_info("Hash tables configured (established %u bind %u)\n", > tcp_hashinfo.ehash_mask + 1, tcp_hashinfo.bhash_size); > diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c > index d703a0b3b6a2..7a59f6a96212 100644 > --- a/net/ipv4/tcp_input.c > +++ b/net/ipv4/tcp_input.c > @@ -426,26 +426,7 @@ static void tcp_grow_window(struct sock *sk, const > struct sk_buff *skb) > } > } > > -/* 3. Tuning rcvbuf, when connection enters established state. */ > -static void tcp_fixup_rcvbuf(struct sock *sk) > -{ > - u32 mss = tcp_sk(sk)->advmss; > - int rcvmem; > - > - rcvmem = 2 * SKB_TRUESIZE(mss + MAX_TCP_HEADER) * > -tcp_default_init_rwnd(mss); > - > - /* Dynamic Right Sizing (DRS) has 2 to 3 RTT latency > -* Allow enough cushion so that sender is not limited by our window > -*/ > - if (sock_net(sk)->ipv4.sysctl_tcp_moderate_rcvbuf) > - rcvmem <<= 2; > - > - if (sk->sk_rcvbuf < rcvmem) > - sk->sk_rcvbuf = min(rcvmem, > sock_net(sk)->ipv4.sysctl_tcp_rmem[2]); > -} > - > -/* 4. Try to fixup all. It is made immediately after connection enters > +/* 3. Try to fixup all. It is made immediately after connection enters > *established state. 
> */ > void tcp_init_buffer_space(struct sock *sk) > @@ -454,8 +435,6 @@ void tcp_init_buffer_space(struct sock *sk) > struct tcp_sock *tp = tcp_sk(sk); > int maxwin; > > - if (!(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) > - tcp_fixup_rcvbuf(sk); > if (!(sk->sk_userlocks & SOCK_SNDBUF_LOCK)) > tcp_sndbuf_expand(sk); > > @@ -485,7 +464,7 @@ void tcp_init_buffer_space(struct sock *sk) > tp->snd_cwnd_stamp = tcp_jiffies32; > } > > -/* 5. Recalculate window clamp after socket hit its memory bounds. */ > +/* 4. Recalculate window clamp after socket hit its memory bounds. */ > static void tcp_clamp_window(struct sock *sk) > { > struct tcp_sock *tp = tcp_sk(sk); > diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c > index fe7855b090e4..059b67af28b1 100644 > --- a/net/ipv4/tcp_output.c > +++ b/net/ipv4/tcp_output.c > @@ -195,21 +195,6 @@ static inline void tcp_event_ack_sent(struct sock *sk, > unsigned int pkts, > inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK); > } > > - > -u32 tcp_default_init_rwnd(u32 mss) > -{ > - /* Initial receive window should be twice of TCP_INIT_CWND to > -* enable proper sending of new unsent data during fast recovery > -
Re: kernel 4.18.5 Realtek 8111G network adapter stops responding under high system load
Hi, Heiner Kallweit's patch seems to resolve the problem. The machine was under high disk and network io pressure today and networking was perfectly stable. Bye, David Arendt On 9/25/18 11:03 PM, Heiner Kallweit wrote: > On 19.09.2018 06:12, David Arendt wrote: >> Hi, >> >> Thanks for the patch. >> >> I just applied it and the TxConfig register now contains 0x4f000f80. >> The next day will show if it really solves the problem. >> >> Thanks in advance, >> David Arendt >> >> On 9/19/18 12:30 AM, Maciej S. Szmigiero wrote: >>> Hi, >>> >>> On 18.09.2018 12:23, David Arendt wrote: Hi, Today I had the network adapter problems again. So the patch doesn't seem to change anything regarding this problem. This week my time is unfortunately very limited, but I will try to find some time next weekend to look a bit more into the issue. >>> If the problem is caused by missing TXCFG_AUTO_FIFO bit in TxConfig, >>> as the register difference would suggest, then you can try applying >>> the following patch (hack) on top of 4.18.8 that is already patched >>> with commit f74dd480cf4e: >>> --- a/drivers/net/ethernet/realtek/r8169.c >>> +++ b/drivers/net/ethernet/realtek/r8169.c >>> @@ -5043,7 +5043,8 @@ >>> { >>> /* Set DMA burst size and Interframe Gap Time */ >>> RTL_W32(tp, TxConfig, (TX_DMA_BURST << TxDMAShift) | >>> - (InterFrameGap << TxInterFrameGapShift)); >>> + (InterFrameGap << TxInterFrameGapShift) >>> + | TXCFG_AUTO_FIFO); >>> } >>> >>> static void rtl_set_rx_max_size(struct rtl8169_private *tp) >>> >>> This hack will probably only work properly on RTL_GIGA_MAC_VER_40 or >>> later NICs. >>> >>> Before running any tests please verify with "ethtool -d enp3s0" that >>> TxConfig register now contains 0x4f000f80, as it did in the old, >>> working driver version. >>> >>> If this does not help then a bisection will most likely be needed. >>> Thanks in advance, David Arendt >>> Maciej >> >> > @Gabriel: > Thanks for the hint, I wasn't fully aware of this thread. 
> @Maciej: > Thanks for the analysis. > > It seems that all chip versions from 34 (= RTL8168E-VL) with the > exception of version 39 (= RTL8106E, first sub-version) need > the TXCFG_AUTO_FIFO bit. > > And indeed, due to reordering of calls this bit is overwritten. > The following patch moves setting the bit from the chip-specific > hw_start function to rtl_set_tx_config_registers(). > > Whoever is hit by the issue and has the option to build a kernel, > could you please test whether the patch fixes the issue for you? > > Thanks, Heiner > > --- > drivers/net/ethernet/realtek/r8169.c | 20 > 1 file changed, 8 insertions(+), 12 deletions(-) > > diff --git a/drivers/net/ethernet/realtek/r8169.c > b/drivers/net/ethernet/realtek/r8169.c > index f882be49f..ae8abe900 100644 > --- a/drivers/net/ethernet/realtek/r8169.c > +++ b/drivers/net/ethernet/realtek/r8169.c > @@ -4514,9 +4514,14 @@ static void rtl8169_hw_reset(struct rtl8169_private > *tp) > > static void rtl_set_tx_config_registers(struct rtl8169_private *tp) > { > - /* Set DMA burst size and Interframe Gap Time */ > - RTL_W32(tp, TxConfig, (TX_DMA_BURST << TxDMAShift) | > - (InterFrameGap << TxInterFrameGapShift)); > + u32 val = TX_DMA_BURST << TxDMAShift | > + InterFrameGap << TxInterFrameGapShift; > + > + if (tp->mac_version >= RTL_GIGA_MAC_VER_34 && > + tp->mac_version != RTL_GIGA_MAC_VER_39) > + val |= TXCFG_AUTO_FIFO; > + > + RTL_W32(tp, TxConfig, val); > } > > static void rtl_set_rx_max_size(struct rtl8169_private *tp) > @@ -5011,7 +5016,6 @@ static void rtl_hw_start_8168e_2(struct rtl8169_private > *tp) > > rtl_disable_clock_request(tp); > > - RTL_W32(tp, TxConfig, RTL_R32(tp, TxConfig) | TXCFG_AUTO_FIFO); > RTL_W8(tp, MCU, RTL_R8(tp, MCU) & ~NOW_IS_OOB); > > /* Adjust EEE LED frequency */ > @@ -5045,7 +5049,6 @@ static void rtl_hw_start_8168f(struct rtl8169_private > *tp) > > rtl_disable_clock_request(tp); > > - RTL_W32(tp, TxConfig, RTL_R32(tp, TxConfig) | TXCFG_AUTO_FIFO); > RTL_W8(tp, MCU, RTL_R8(tp, MCU) & 
~NOW_IS_OOB); > RTL_W8(tp, DLLPR, RTL_R8(tp, DLLPR) | PFM_EN); > RTL_W32(tp, MISC, RTL_R32(tp, MISC) | PWM_EN); > @@ -5090,8 +5093,6 @@ static void rtl_hw_start_8411(struct rtl8169_private > *tp) > > static void rtl_hw_start_8168g(struct rtl8169_private *tp) > { > - RTL_W32(tp, TxConfig, RTL_R32(tp, TxConfig) | TXCFG_AUTO_FIFO); > - > rtl_eri_write(tp, 0xc8, ERIAR_MASK_0101, 0x080002, ERIAR_EXGMAC); > rtl_eri_write(tp, 0xcc, ERIAR_MASK_0001, 0x38, ERIAR_EXGMAC); > rtl_eri_write(tp, 0xd0, ERIAR_MASK_0001, 0x48, ERIAR_EXGMAC); > @@ -5189,8 +5190,6 @@ static void rtl_hw_start_8168h_1(struct rtl8169_private > *tp) > rtl_hw_aspm_clkreq_enable(tp, false); > rtl_ephy_init(tp, e_info_8168h_1, ARRAY_SIZE(e_info_8168h_1)); > > - RTL_W3